diff --git a/.circleci/config.yml b/.circleci/config.yml index bba65c8aa202..f12de88b2a3b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -16,7 +16,8 @@ commands: - run: name: Install JDK 8 on macos command: | - brew install --cask adoptopenjdk/openjdk/adoptopenjdk8 + HOMEBREW_NO_AUTO_UPDATE=1 brew tap bell-sw/liberica + HOMEBREW_NO_AUTO_UPDATE=1 brew install --cask liberica-jdk8 increase-max-open-files-on-macos: steps: @@ -53,6 +54,7 @@ commands: command: | echo "Installing CMake..." choco install cmake --installargs 'ADD_CMAKE_TO_PATH=System' -y + choco install liberica8jdk -y mkdir $Env:THIRDPARTY_HOME cd $Env:THIRDPARTY_HOME echo "Building Snappy dependency..." @@ -66,9 +68,10 @@ commands: - run: name: "Build RocksDB" command: | + $env:Path = $env:JAVA_HOME + ";" + $env:Path mkdir build cd build - & $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 .. + & $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE="$Env:CMAKE_PORTABLE" -DSNAPPY=1 -DJNI=1 .. cd .. echo "Building with VS version: $Env:CMAKE_GENERATOR" msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64 @@ -77,6 +80,11 @@ commands: shell: powershell.exe command: | build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16 + - run: + name: "Test RocksJava" + command: | + cd build\java + & $Env:CTEST_BIN -C Debug -j 16 pre-steps-macos: steps: - pre-steps @@ -104,6 +112,15 @@ commands: path: /tmp/core_dumps when: on_fail + post-pmd-steps: + steps: + - store_artifacts: + path: /home/circleci/project/java/target/pmd.xml + when: on_fail + - store_artifacts: + path: /home/circleci/project/java/target/site + when: on_fail + upgrade-cmake: steps: - run: @@ -126,6 +143,13 @@ commands: command: | HOMEBREW_NO_AUTO_UPDATE=1 brew install gflags + install-maven: + steps: + - run: + name: Install maven + command: | + sudo apt-get update -y && sudo apt-get install -y maven + setup-folly: steps: - run: @@ -152,15 +176,15 @@ commands: steps: - run: name: "Test low-variance benchmarks" - command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 10000000 + command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 20000000 environment: LD_LIBRARY_PATH: /usr/local/lib # How long to run parts of the test(s) - DURATION_RO: 400 - DURATION_RW: 700 + DURATION_RO: 300 + DURATION_RW: 500 # Keep threads within physical capacity of server (much lower than default) NUM_THREADS: 1 - MAX_BACKGROUND_JOBS: 3 + MAX_BACKGROUND_JOBS: 4 # Don't run a couple of "optional" initial tests CI_TESTS_ONLY: "true" # Reduce configured size of levels to ensure more levels in the leveled compaction LSM tree @@ -170,7 +194,11 @@ commands: # The benchmark host has 32GB memory # The following values are tailored to work with that # Note, tests may not exercise the targeted issues if the memory is increased on new test hosts. 
- + COMPRESSION_TYPE: "none" + CACHE_INDEX_AND_FILTER_BLOCKS: 1 + MIN_LEVEL_TO_COMPRESS: 3 + CACHE_SIZE_MB: 10240 + MB_WRITE_PER_SEC: 2 post-benchmarks: steps: @@ -201,25 +229,28 @@ executors: # $ docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it zjay437/rocksdb:0.5 bash # option `--cap-add=SYS_PTRACE --security-opt seccomp=unconfined` is used to enable gdb to attach an existing process - image: zjay437/rocksdb:0.6 + linux-java-docker: + docker: + - image: evolvedbinary/rocksjava:centos6_x64-be jobs: build-macos: macos: - xcode: 12.5.1 - resource_class: large + xcode: 14.3.1 + resource_class: macos.m1.medium.gen1 environment: ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc cause env_test hang, disable it for now steps: - increase-max-open-files-on-macos - install-gflags-on-macos - pre-steps-macos - - run: ulimit -S -n `ulimit -H -n` && OPT=-DCIRCLECI make V=1 J=32 -j32 all + - run: ulimit -S -n `ulimit -H -n` && OPT=-DCIRCLECI make V=1 J=16 -j16 all - post-steps build-macos-cmake: macos: - xcode: 12.5.1 - resource_class: large + xcode: 14.3.1 + resource_class: macos.m1.medium.gen1 parameters: run_even_tests: description: run even or odd tests, used to split tests to 2 groups @@ -235,20 +266,20 @@ jobs: command: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 .. - run: name: "Build tests" - command: cd build && make V=1 -j32 + command: cd build && make V=1 -j16 - when: condition: << parameters.run_even_tests >> steps: - run: name: "Run even tests" - command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j32 -I 0,,2 + command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 0,,2 - when: condition: not: << parameters.run_even_tests >> steps: - run: name: "Run odd tests" - command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j32 -I 1,,2 + command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j16 -I 1,,2 - post-steps build-linux: @@ -269,12 +300,12 @@ jobs: ./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression - post-steps - build-linux-shared_lib-alt_namespace-status_checked: + build-linux-static_lib-alt_namespace-status_checked: executor: linux-docker resource_class: 2xlarge steps: - pre-steps - - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check + - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check - post-steps build-linux-release: @@ -282,11 +313,21 @@ jobs: resource_class: 2xlarge steps: - checkout # check out the code in the project directory + - run: make V=1 -j32 LIB_MODE=shared release + - run: ls librocksdb.so # ensure shared lib built + - run: ./db_stress --version # ensure with gflags + - run: make clean - run: make V=1 -j32 release + - run: ls librocksdb.a # ensure static lib built - run: ./db_stress --version # ensure with gflags - run: make clean - run: apt-get remove -y libgflags-dev + - run: make V=1 -j32 LIB_MODE=shared release + - run: ls librocksdb.so # ensure shared lib built + - run: if ./db_stress --version; then false; else true; fi # ensure without gflags + - run: make clean - run: make V=1 -j32 release + - run: ls librocksdb.a # ensure static lib built - run: if ./db_stress --version; then false; else true; fi # ensure without gflags - post-steps @@ -302,27 +343,6 @@ jobs: - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib 
tools db_bench - run: if ./db_stress --version; then false; else true; fi # ensure without gflags - build-linux-lite: - executor: linux-docker - resource_class: large - steps: - - pre-steps - - run: LITE=1 make V=1 J=8 -j8 check - - post-steps - - build-linux-lite-release: - executor: linux-docker - resource_class: large - steps: - - checkout # check out the code in the project directory - - run: LITE=1 make V=1 -j8 release - - run: ./db_stress --version # ensure with gflags - - run: make clean - - run: apt-get remove -y libgflags-dev - - run: LITE=1 make V=1 -j8 release - - run: if ./db_stress --version; then false; else true; fi # ensure without gflags - - post-steps - build-linux-clang-no_test_run: executor: linux-docker resource_class: xlarge @@ -427,7 +447,10 @@ jobs: steps: - checkout # check out the code in the project directory - run: apt-get update -y && apt-get install -y libgflags-dev - - run: make V=1 -j8 unity_test + - run: + name: "Unity build" + command: make V=1 -j8 unity_test + no_output_timeout: 20m - run: make V=1 -j8 -k check-headers # could be moved to a different build - post-steps @@ -438,7 +461,7 @@ jobs: - pre-steps - setup-folly - build-folly - - run: USE_FOLLY=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 check + - run: USE_FOLLY=1 LIB_MODE=static CC=gcc-7 CXX=g++-7 V=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures - post-steps build-linux-gcc-7-with-folly-lite-no-test: @@ -484,7 +507,7 @@ jobs: resource_class: 2xlarge steps: - pre-steps - - run: CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench + - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench # TODO: LIB_MODE only to work around unresolved linker failures - post-steps build-linux-clang-13-no_test_run: @@ -503,7 +526,7 @@ jobs: - pre-steps - setup-folly - build-folly - - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check + - run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures - post-steps # This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing. 
@@ -520,7 +543,7 @@ jobs: resource_class: large steps: - pre-steps - - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush + - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000 --use_io_uring=0' blackbox_crash_test_with_atomic_flush - post-steps build-linux-crashtest-tiered-storage-bb: @@ -530,7 +553,7 @@ jobs: - pre-steps - run: name: "run crashtest" - command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 blackbox_crash_test_with_tiered_storage + command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' blackbox_crash_test_with_tiered_storage no_output_timeout: 100m - post-steps @@ -541,10 +564,28 @@ jobs: - pre-steps - run: name: "run crashtest" - command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 whitebox_crash_test_with_tiered_storage + command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' whitebox_crash_test_with_tiered_storage no_output_timeout: 100m - post-steps + build-windows-vs2022-avx2: + executor: + name: win/server-2022 + size: 2xlarge + environment: + THIRDPARTY_HOME: C:/Users/circleci/thirdparty + CMAKE_HOME: C:/Program Files/CMake + CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe + CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe + JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8 + SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8 + SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build + SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib + CMAKE_GENERATOR: Visual Studio 17 2022 + CMAKE_PORTABLE: AVX2 + steps: + - windows-build-steps + build-windows-vs2022: executor: name: win/server-2022 @@ -553,10 +594,13 @@ jobs: THIRDPARTY_HOME: C:/Users/circleci/thirdparty CMAKE_HOME: C:/Program Files/CMake CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe + CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe + JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8 SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8 SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib CMAKE_GENERATOR: Visual Studio 17 2022 + CMAKE_PORTABLE: 1 steps: - windows-build-steps @@ -568,10 +612,13 @@ jobs: THIRDPARTY_HOME: C:/Users/circleci/thirdparty CMAKE_HOME: C:/Program Files/CMake CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe + CTEST_BIN: C:/Program Files/CMake/bin/ctest.exe + JAVA_HOME: C:/Program Files/BellSoft/LibericaJDK-8 SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8 SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib CMAKE_GENERATOR: Visual Studio 16 2019 + CMAKE_PORTABLE: 1 steps: - windows-build-steps @@ -592,8 +639,29 @@ jobs: command: make V=1 J=8 -j8 jtest - post-steps + build-linux-java-pmd: + machine: + image: ubuntu-2004:202111-02 + resource_class: large + environment: + JAVA_HOME: /usr/lib/jvm/java-8-openjdk-amd64 + steps: + - install-maven + - pre-steps + - run: + name: "Set Java Environment" + command: | + echo "JAVA_HOME=${JAVA_HOME}" + echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV + which java && 
java -version + which javac && javac -version + - run: + name: "PMD RocksDBJava" + command: make V=1 J=8 -j8 jpmd + - post-pmd-steps + build-linux-java-static: - executor: linux-docker + executor: linux-java-docker resource_class: large steps: - pre-steps @@ -606,15 +674,15 @@ jobs: which javac && javac -version - run: name: "Build RocksDBJava Static Library" - command: make V=1 J=8 -j8 rocksdbjavastatic + command: scl enable devtoolset-7 'make V=1 J=8 -j8 rocksdbjavastatic' - post-steps build-macos-java: macos: - xcode: 12.5.1 - resource_class: large + xcode: 14.3.1 + resource_class: macos.m1.medium.gen1 environment: - JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + JAVA_HOME: /Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc causes java 8 crash steps: - increase-max-open-files-on-macos @@ -636,10 +704,10 @@ jobs: build-macos-java-static: macos: - xcode: 12.5.1 - resource_class: large + xcode: 14.3.1 + resource_class: macos.m1.medium.gen1 environment: - JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + JAVA_HOME: /Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home steps: - increase-max-open-files-on-macos - install-gflags-on-macos @@ -661,10 +729,10 @@ jobs: build-macos-java-static-universal: macos: - xcode: 12.5.1 - resource_class: large + xcode: 14.3.1 + resource_class: macos.m1.medium.gen1 environment: - JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home + JAVA_HOME: /Library/Java/JavaVirtualMachines/liberica-jdk-8.jdk/Contents/Home steps: - increase-max-open-files-on-macos - install-gflags-on-macos @@ -821,18 +889,16 @@ workflows: - build-linux-cmake-with-folly-coroutines - build-linux-cmake-with-benchmark - build-linux-encrypted_env-no_compression - - build-linux-lite jobs-linux-run-tests-san: jobs: - build-linux-clang10-asan - build-linux-clang10-ubsan - build-linux-clang10-mini-tsan - - build-linux-shared_lib-alt_namespace-status_checked + - build-linux-static_lib-alt_namespace-status_checked jobs-linux-no-test-run: jobs: - build-linux-release - build-linux-release-rtti - - build-linux-lite-release - build-examples - build-fuzzers - build-linux-clang-no_test_run @@ -848,6 +914,7 @@ workflows: - build-linux-mini-crashtest jobs-windows: jobs: + - build-windows-vs2022-avx2 - build-windows-vs2022 - build-windows-vs2019 - build-cmake-mingw @@ -858,6 +925,7 @@ workflows: - build-macos-java - build-macos-java-static - build-macos-java-static-universal + - build-linux-java-pmd jobs-macos: jobs: - build-macos diff --git a/.github/workflows/sanity_check.yml b/.github/workflows/sanity_check.yml index 6ee53ce1b623..efc9d99cf372 100644 --- a/.github/workflows/sanity_check.yml +++ b/.github/workflows/sanity_check.yml @@ -33,9 +33,7 @@ jobs: run: pip install argparse - name: Download clang-format-diff.py - uses: wei/wget@v1 - with: - args: https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py + run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py - name: Check format run: VERBOSE_CHECK=1 make check-format diff --git a/.gitignore b/.gitignore index 098f0ccf6e4e..d884f9aa5dcd 100644 --- a/.gitignore +++ b/.gitignore @@ -87,6 +87,7 @@ fbcode travis-build/ buckifier/*.pyc buckifier/__pycache__ +.arcconfig compile_commands.json .vscode @@ -99,3 +100,5 @@ fuzz/crash-* cmake-build-* third-party/folly/ +.cache +*.sublime-* 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 96708b05f114..892b5c847d09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,33 +261,10 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64") endif(HAS_LOONGARCH64) endif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64") -option(PORTABLE "build a portable binary" OFF) -option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF) -option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF) -option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF) -if(PORTABLE) - add_definitions(-DROCKSDB_PORTABLE) - - # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h - # is available, it is available by default. - if(FORCE_SSE42 AND NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul") - endif() - if(MSVC) - if(FORCE_AVX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX") - endif() - # MSVC automatically enables BMI / lzcnt with AVX2. - if(FORCE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") - endif() - else() - if(FORCE_AVX) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") - endif() - if(FORCE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt") - endif() +set(PORTABLE 0 CACHE STRING "Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU") +if(PORTABLE MATCHES "1|ON|YES|TRUE|Y") + # Usually nothing to do; compiler default is typically the most general + if(NOT MSVC) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") endif() @@ -295,16 +272,27 @@ if(PORTABLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=loongarch64") endif() endif() -else() +elseif(PORTABLE MATCHES "0|OFF|NO|FALSE|N") if(MSVC) + # NOTE: No auto-detection of current CPU, but instead assume some useful + # level of optimization is supported set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") else() + # Require instruction set from current CPU (with some legacy or opt-out + # exceptions) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196") elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") endif() endif() +else() + # Name of a CPU arch spec or feature set to require + if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:${PORTABLE}") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${PORTABLE}") + endif() endif() include(CheckCXXSourceCompiles) @@ -313,25 +301,6 @@ if(NOT MSVC) set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul") endif() -CHECK_CXX_SOURCE_COMPILES(" -#include -#include -#include -int main() { - volatile uint32_t x = _mm_crc32_u32(0, 0); - const auto a = _mm_set_epi64x(0, 0); - const auto b = _mm_set_epi64x(0, 0); - const auto c = _mm_clmulepi64_si128(a, b, 0x00); - auto d = _mm_cvtsi128_si64(c); -} -" HAVE_SSE42) -if(HAVE_SSE42) - add_definitions(-DHAVE_SSE42) - add_definitions(-DHAVE_PCLMUL) -elseif(FORCE_SSE42) - message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled") -endif() - # Check if -latomic is required or not if (NOT MSVC) set(CMAKE_REQUIRED_FLAGS "--std=c++17") @@ -504,12 +473,6 @@ if(CMAKE_COMPILER_IS_GNUCXX) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-memcmp") endif() -option(ROCKSDB_LITE "Build RocksDBLite version" OFF) -if(ROCKSDB_LITE) - add_definitions(-DROCKSDB_LITE) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os") -endif() - if(CMAKE_SYSTEM_NAME MATCHES 
"Cygwin") add_definitions(-fno-builtin-memcmp -DCYGWIN) elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") @@ -592,7 +555,7 @@ if(HAVE_SCHED_GETCPU) add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT) endif() -check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL) +check_cxx_symbol_exists(getauxval "sys/auxv.h" HAVE_AUXV_GETAUXVAL) if(HAVE_AUXV_GETAUXVAL) add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT) endif() @@ -675,7 +638,9 @@ set(SOURCES cache/compressed_secondary_cache.cc cache/lru_cache.cc cache/secondary_cache.cc + cache/secondary_cache_adapter.cc cache/sharded_cache.cc + cache/tiered_secondary_cache.cc db/arena_wrapped_db_iter.cc db/blob/blob_contents.cc db/blob/blob_fetcher.cc @@ -758,9 +723,11 @@ set(SOURCES db/wal_manager.cc db/wide/wide_column_serialization.cc db/wide/wide_columns.cc + db/wide/wide_columns_helper.cc db/write_batch.cc db/write_batch_base.cc db/write_controller.cc + db/write_stall_stats.cc db/write_thread.cc env/composite_env.cc env/env.cc @@ -813,6 +780,7 @@ set(SOURCES options/configurable.cc options/customizable.cc options/db_options.cc + options/offpeak_time_info.cc options/options.cc options/options_helper.cc options/options_parser.cc @@ -852,6 +820,7 @@ set(SOURCES table/get_context.cc table/iterator.cc table/merging_iterator.cc + table/compaction_merging_iterator.cc table/meta_blocks.cc table/persistent_cache_helper.cc table/plain/plain_table_bloom.cc @@ -893,6 +862,7 @@ set(SOURCES util/compression_context_cache.cc util/concurrent_task_limiter_impl.cc util/crc32c.cc + util/data_structure.cc util/dynamic_bloom.cc util/hash.cc util/murmurhash.cc @@ -906,6 +876,8 @@ set(SOURCES util/string_util.cc util/thread_local.cc util/threadpool_imp.cc + util/udt_util.cc + util/write_batch_util.cc util/xxhash.cc utilities/agg_merge/agg_merge.cc utilities/backup/backup_engine.cc @@ -1036,12 +1008,6 @@ if ( ROCKSDB_PLUGINS ) endforeach() endif() -if(HAVE_SSE42 AND NOT MSVC) - set_source_files_properties( - util/crc32c.cc - PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul") -endif() - if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64") list(APPEND SOURCES util/crc32c_ppc.c @@ -1164,11 +1130,15 @@ set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC}) +target_include_directories(${ROCKSDB_STATIC_LIB} PUBLIC + $) target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(ROCKSDB_BUILD_SHARED) add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC}) + target_include_directories(${ROCKSDB_SHARED_LIB} PUBLIC + $) target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) @@ -1307,6 +1277,7 @@ if(WITH_TESTS OR WITH_BENCHMARK_TOOLS) add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest) add_library(testharness STATIC test_util/mock_time_env.cc + test_util/secondary_cache_test_util.cc test_util/testharness.cc) target_link_libraries(testharness gtest) endif() @@ -1327,6 +1298,7 @@ if(WITH_TESTS) cloud/cloud_manifest_test.cc cloud/cloud_scheduler_test.cc cloud/replication_test.cc + cache/tiered_secondary_cache_test.cc db/blob/blob_counting_iterator_test.cc db/blob/blob_file_addition_test.cc db/blob/blob_file_builder_test.cc @@ -1357,6 +1329,7 @@ if(WITH_TESTS) db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc db/db_compaction_test.cc + db/db_clip_test.cc db/db_dynamic_level_test.cc db/db_encryption_test.cc db/db_flush_test.cc @@ -1421,6 +1394,7 @@ if(WITH_TESTS) 
db/wal_edit_test.cc db/wide/db_wide_basic_test.cc db/wide/wide_column_serialization_test.cc + db/wide/wide_columns_helper_test.cc db/write_batch_test.cc db/write_callback_test.cc db/write_controller_test.cc @@ -1487,6 +1461,7 @@ if(WITH_TESTS) util/timer_test.cc util/thread_list_test.cc util/thread_local_test.cc + util/udt_util_test.cc util/work_queue_test.cc utilities/agg_merge/agg_merge_test.cc utilities/backup/backup_engine_test.cc diff --git a/HISTORY.md b/HISTORY.md index 237c25d17eff..cf6b2f857dcc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,14 +1,281 @@ # Rocksdb Change Log -## 7.10.2 (02/10/2023) +> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt` + +## 8.9.1 (12/8/2023) ### Bug Fixes -* Fixed a bug in DB open/recovery from a compressed WAL that was caused due to incorrect handling of certain record fragments with the same offset within a WAL block. +* Avoid destroying the periodic task scheduler's default timer in order to prevent static destruction order issues. + +## 8.9.0 (11/17/2023) ### New Features +* Add GetEntity() and PutEntity() API implementation for Attribute Group support. Through the use of Column Families, AttributeGroup enables users to logically group wide-column entities. + +### Public API Changes +* Added rocksdb_ratelimiter_create_auto_tuned API to create an auto-tuned GenericRateLimiter. +* Added clipColumnFamily() to the Java API to clip the entries in the CF according to the range [begin_key, end_key). +* Make the `EnableFileDeletion` API not default to force enabling. For users that rely on this default behavior and still +want to continue to use force enabling, they need to explicitly pass a `true` to `EnableFileDeletion`. +* Add new Cache APIs GetSecondaryCacheCapacity() and GetSecondaryCachePinnedUsage() to return the configured capacity, and cache reservation charged to the secondary cache. + +### Behavior Changes +* During off-peak hours defined by `daily_offpeak_time_utc`, the compaction picker will select a larger number of files for periodic compaction. This selection will include files that are projected to expire by the next off-peak start time, ensuring that these files are not chosen for periodic compaction outside of off-peak hours. +* If an error occurs when writing to a trace file after `DB::StartTrace()`, the subsequent trace writes are skipped to avoid writing to a file that has previously seen an error. In this case, `DB::EndTrace()` will also return a non-ok status with info about the error that occurred previously in its status message. +* Deleting stale files upon recovery is delegated to SstFileManager if available so they can be rate limited. +* Make RocksDB only call `TablePropertiesCollector::Finish()` once. +* When `WAL_ttl_seconds > 0`, we now process archived WALs for deletion at least every `WAL_ttl_seconds / 2` seconds. Previously it could be less frequent in case of small `WAL_ttl_seconds` values when size-based expiration (`WAL_size_limit_MB > 0 `) was simultaneously enabled. + +### Bug Fixes +* Fixed a crash or assertion failure bug in experimental new HyperClockCache variant, especially when running with a SecondaryCache. +* Fix a race between flush error recovery and db destruction that can lead to db crashing. +* Fixed some bugs in the index builder/reader path for user-defined timestamps in Memtable only feature. + +## 8.8.0 (10/23/2023) ### New Features +* Introduce AttributeGroup by adding the first AttributeGroup support API, MultiGetEntity().
Through the use of Column Families, AttributeGroup enables users to logically group wide-column entities. More APIs to support AttributeGroup will come soon, including GetEntity, PutEntity, and others. +* Added new tickers `rocksdb.fifo.{max.size|ttl}.compactions` to count FIFO compactions that drop files for different reasons +* Add an experimental offpeak duration awareness by setting `DBOptions::daily_offpeak_time_utc` in "HH:mm-HH:mm" format. This information will be used for resource optimization in the future +* Users can now change the max bytes granted in a single refill period (i.e, burst) during runtime by `SetSingleBurstBytes()` for RocksDB rate limiter + +### Public API Changes +* The default value of `DBOptions::fail_if_options_file_error` changed from `false` to `true`. Operations that set in-memory options (e.g., `DB::Open*()`, `DB::SetOptions()`, `DB::CreateColumnFamily*()`, and `DB::DropColumnFamily()`) but fail to persist the change will now return a non-OK `Status` by default. + +### Behavior Changes +* For non direct IO, eliminate the file system prefetching attempt for compaction read when `Options::compaction_readahead_size` is 0 +* During a write stop, writes now block on in-progress recovery attempts + +### Bug Fixes +* Fix a bug in auto_readahead_size where first_internal_key of index blocks wasn't copied properly resulting in corruption error when first_internal_key was used for comparison. +* Fixed a bug where compaction read under non direct IO still falls back to RocksDB internal prefetching after file system's prefetching returns non-OK status other than `Status::NotSupported()` +* Add bounds check in WBWIIteratorImpl and make BaseDeltaIterator, WriteUnpreparedTxn and WritePreparedTxn respect the upper bound and lower bound in ReadOption. See 11680. +* Fixed the handling of wide-column base values in the `max_successive_merges` logic. +* Fixed a rare race bug involving a concurrent combination of Create/DropColumnFamily and/or Set(DB)Options that could lead to inconsistency between (a) the DB's reported options state, (b) the DB options in effect, and (c) the latest persisted OPTIONS file. +* Fixed a possible underflow when computing the compressed secondary cache share of memory reservations while updating the compressed secondary to total block cache ratio. + +### Performance Improvements +* Improved the I/O efficiency of DB::Open a new DB with `create_missing_column_families=true` and many column families. + +## 8.7.0 (09/22/2023) +### New Features +* Added an experimental new "automatic" variant of HyperClockCache that does not require a prior estimate of the average size of cache entries. This variant is activated when HyperClockCacheOptions::estimated\_entry\_charge = 0 and has essentially the same concurrency benefits as the existing HyperClockCache. +* Add a new statistic `COMPACTION_CPU_TOTAL_TIME` that records cumulative compaction cpu time. This ticker is updated regularly while a compaction is running. +* Add `GetEntity()` API for ReadOnly DB and Secondary DB. +* Add a new iterator API `Iterator::Refresh(const Snapshot *)` that allows iterator to be refreshed while using the input snapshot to read. +* Added a new read option `merge_operand_count_threshold`. When the number of merge operands applied during a successful point lookup exceeds this threshold, the query will return a special OK status with a new subcode `kMergeOperandThresholdExceeded`. 
Applications might use this signal to take action to reduce the number of merge operands for the affected key(s), for example by running a compaction. +* For `NewRibbonFilterPolicy()`, made the `bloom_before_level` option mutable through the Configurable interface and the SetOptions API, allowing dynamic switching between all-Bloom and all-Ribbon configurations, and configurations in between. See comments on `NewRibbonFilterPolicy()` +* RocksDB now allows the block cache to be stacked on top of a compressed secondary cache and a non-volatile secondary cache, thus creating a three-tier cache. To set it up, use the `NewTieredCache()` API in rocksdb/cache.h.. +* Added a new wide-column aware full merge API called `FullMergeV3` to `MergeOperator`. `FullMergeV3` supports wide columns both as base value and merge result, which enables the application to perform more general transformations during merges. For backward compatibility, the default implementation implements the earlier logic of applying the merge operation to the default column of any wide-column entities. Specifically, if there is no base value or the base value is a plain key-value, the default implementation falls back to `FullMergeV2`. If the base value is a wide-column entity, the default implementation invokes `FullMergeV2` to perform the merge on the default column, and leaves any other columns unchanged. +* Add wide column support to ldb commands (scan, dump, idump, dump_wal) and sst_dump tool's scan command + +### Public API Changes +* Expose more information about input files used in table creation (if any) in `CompactionFilter::Context`. See `CompactionFilter::Context::input_start_level`,`CompactionFilter::Context::input_table_properties` for more. +* `Options::compaction_readahead_size` 's default value is changed from 0 to 2MB. +* When using LZ4 compression, the `acceleration` parameter is configurable by setting the negated value in `CompressionOptions::level`. For example, `CompressionOptions::level=-10` will set `acceleration=10` +* The `NewTieredCache` API has been changed to take the total cache capacity (inclusive of both the primary and the compressed secondary cache) and the ratio of total capacity to allocate to the compressed cache. These are specified in `TieredCacheOptions`. Any capacity specified in `LRUCacheOptions`, `HyperClockCacheOptions` and `CompressedSecondaryCacheOptions` is ignored. A new API, `UpdateTieredCache` is provided to dynamically update the total capacity, ratio of compressed cache, and admission policy. +* The `NewTieredVolatileCache()` API in rocksdb/cache.h has been renamed to `NewTieredCache()`. + +### Behavior Changes +* Compaction read performance will regress when `Options::compaction_readahead_size` is explicitly set to 0 +* Universal size amp compaction will conditionally exclude some of the newest L0 files when selecting input with a small negative impact to size amp. This is to prevent a large number of L0 files from being locked by a size amp compaction, potentially leading to write stop with a few more flushes. +* Change ldb scan command delimiter from ':' to '==>'. + +### Bug Fixes +* Fix a bug where if there is an error reading from offset 0 of a file from L1+ and that the file is not the first file in the sorted run, data can be lost in compaction and read/scan can return incorrect results. +* Fix a bug where iterator may return incorrect result for DeleteRange() users if there was an error reading from a file. 
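As a usage illustration for the wide-column APIs referenced above (the 8.9.0 AttributeGroup note and the 8.7.0 `GetEntity()` additions), here is a minimal sketch of the single-entity `PutEntity()`/`GetEntity()` calls. The header name and the exact signatures are assumptions based on the public wide-column types, not something specified by this patch.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/wide_columns.h>

// Sketch: write a wide-column entity and read it back (assumed API shape).
void WideColumnRoundTrip(rocksdb::DB* db) {
  rocksdb::WideColumns columns{{"attr1", "v1"}, {"attr2", "v2"}};
  rocksdb::Status s = db->PutEntity(rocksdb::WriteOptions(),
                                    db->DefaultColumnFamily(), "key1", columns);
  if (!s.ok()) return;

  rocksdb::PinnableWideColumns result;
  s = db->GetEntity(rocksdb::ReadOptions(), db->DefaultColumnFamily(), "key1",
                    &result);
  if (s.ok()) {
    for (const rocksdb::WideColumn& col : result.columns()) {
      // col.name() and col.value() are Slices into the pinned result.
    }
  }
}
```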
+* Fix a bug with atomic_flush=true that can cause DB to stuck after a flush fails (#11872). +* Fix a bug where RocksDB (with atomic_flush=false) can delete output SST files of pending flushes when a previous concurrent flush fails (#11865). This can result in DB entering read-only state with error message like `IO error: No such file or directory: While open a file for random read: /tmp/rocksdbtest-501/db_flush_test_87732_4230653031040984171/000013.sst`. +* Fix an assertion fault during seek with async_io when readahead trimming is enabled. +* When the compressed secondary cache capacity is reduced to 0, it should be completely disabled. Before this fix, inserts and lookups would still go to the backing `LRUCache` before returning, thus incurring locking overhead. With this fix, inserts and lookups are no-ops and do not add any overhead. +* Updating the tiered cache (cache allocated using NewTieredCache()) by calling SetCapacity() on it was not working properly. The initial creation would set the primary cache capacity to the combined primary and compressed secondary cache capacity. But SetCapacity() would just set the primary cache capacity. With this fix, the user always specifies the total budget and compressed secondary cache ratio on creation. Subsequently, SetCapacity() will distribute the new capacity across the two caches by the same ratio. +* Fixed a bug in `MultiGet` for cleaning up SuperVersion acquired with locking db mutex. +* Fix a bug where row cache can falsely return kNotFound even though row cache entry is hit. +* Fixed a race condition in `GenericRateLimiter` that could cause it to stop granting requests +* Fix a bug (Issue #10257) where DB can hang after write stall since no compaction is scheduled (#11764). +* Add a fix for async_io where during seek, when reading a block for seeking a target key in a file without any readahead, the iterator aligned the read on a page boundary and reading more than necessary. This increased the storage read bandwidth usage. +* Fix an issue in sst dump tool to handle bounds specified for data with user-defined timestamps. +* When auto_readahead_size is enabled, update readahead upper bound during readahead trimming when reseek changes iterate_upper_bound dynamically. +* Fixed a bug where `rocksdb.file.read.verify.file.checksums.micros` is not populated + +### Performance Improvements +* Added additional improvements in tuning readahead_size during Scans when auto_readahead_size is enabled. However it's not supported with Iterator::Prev operation and will return NotSupported error. +* During async_io, the Seek happens in 2 phases. Phase 1 starts an asynchronous read on a block cache miss, and phase 2 waits for it to complete and finishes the seek. In both phases, it tries to lookup the block cache for the data block first before looking in the prefetch buffer. It's optimized by doing the block cache lookup only in the first phase that would save some CPU. + +## 8.6.0 (08/18/2023) +### New Features +* Added enhanced data integrity checking on SST files with new format_version=6. Performance impact is very small or negligible. Previously if SST data was misplaced or re-arranged by the storage layer, it could pass block checksum with higher than 1 in 4 billion probability. With format_version=6, block checksums depend on what file they are in and location within the file. This way, misplaced SST data is no more likely to pass checksum verification than randomly corrupted data. Also in format_version=6, SST footers are checksum-protected. 
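To make the 8.6.0 `format_version=6` note above concrete, a minimal sketch of opting in via `BlockBasedTableOptions`; treat it as illustrative, since the only authoritative guidance here is the release note itself.

```cpp
#include <rocksdb/options.h>
#include <rocksdb/table.h>

// Sketch: enable the enhanced block/footer checksum protection from 8.6.0.
// Files written this way cannot be read by releases older than 8.6.
rocksdb::Options MakeFormatVersion6Options() {
  rocksdb::BlockBasedTableOptions table_options;
  table_options.format_version = 6;
  rocksdb::Options options;
  options.create_if_missing = true;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
```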
+* Add a new feature to trim readahead_size during scans upto upper_bound when iterate_upper_bound is specified. It's enabled through ReadOptions.auto_readahead_size. Users must also specify ReadOptions.iterate_upper_bound. +* RocksDB will compare the number of input keys to the number of keys processed after each compaction. Compaction will fail and report Corruption status if the verification fails. Option `compaction_verify_record_count` is introduced for this purpose and is enabled by default. +* Add a CF option `bottommost_file_compaction_delay` to allow specifying the delay of bottommost level single-file compactions. +* Add support to allow enabling / disabling user-defined timestamps feature for an existing column family in combination with the in-Memtable only feature. +* Implement a new admission policy for the compressed secondary cache that admits blocks evicted from the primary cache with the hit bit set. This policy can be specified in TieredVolatileCacheOptions by setting the newly added adm_policy option. +* Add a column family option `memtable_max_range_deletions` that limits the number of range deletions in a memtable. RocksDB will try to do an automatic flush after the limit is reached. (#11358) +* Add PutEntity API in sst_file_writer +* Add `timeout` in microsecond option to `WaitForCompactOptions` to allow timely termination of prolonged waiting in scenarios like recurring recoverable errors, such as out-of-space situations and continuous write streams that sustain ongoing flush and compactions +* New statistics `rocksdb.file.read.{get|multiget|db.iterator|verify.checksum|verify.file.checksums}.micros` measure read time of block-based SST tables or blob files during db open, `Get()`, `MultiGet()`, using db iterator, `VerifyFileChecksums()` and `VerifyChecksum()`. They require stats level greater than `StatsLevel::kExceptDetailedTimers`. +* Add close_db option to `WaitForCompactOptions` to call Close() after waiting is done. +* Add a new compression option `CompressionOptions::checksum` for enabling ZSTD's checksum feature to detect corruption during decompression. + +### Public API Changes +* Mark `Options::access_hint_on_compaction_start` related APIs as deprecated. See #11631 for alternative behavior. + +### Behavior Changes +* Statistics `rocksdb.sst.read.micros` now includes time spent on multi read and async read into the file +* For Universal Compaction users, periodic compaction (option `periodic_compaction_seconds`) will be set to 30 days by default if block based table is used. + +### Bug Fixes +* Fix a bug in FileTTLBooster that can cause users with a large number of levels (more than 65) to see errors like "runtime error: shift exponent .. is too large.." (#11673). + +## 8.5.0 (07/21/2023) +### Public API Changes +* Removed recently added APIs `GeneralCache` and `MakeSharedGeneralCache()` as our plan changed to stop exposing a general-purpose cache interface. The old forms of these APIs, `Cache` and `NewLRUCache()`, are still available, although general-purpose caching support will be dropped eventually. + +### Behavior Changes +* Option `periodic_compaction_seconds` no longer supports FIFO compaction: setting it has no effect on FIFO compactions. FIFO compaction users should only set option `ttl` instead. 
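Tying together the `WaitForCompactOptions` items above (the 8.6.0 `timeout` and `close_db` additions and the 8.4.0 `WaitForCompact()` API), a hedged sketch; the field names follow the notes, but the exact types (e.g. `std::chrono::microseconds` for `timeout`) are assumptions to verify against `options.h`.

```cpp
#include <chrono>
#include <rocksdb/db.h>
#include <rocksdb/options.h>

// Sketch: wait for queued and in-progress flushes/compactions, give up after
// roughly 60 seconds, and close the DB once waiting is done (close_db note).
rocksdb::Status WaitThenClose(rocksdb::DB* db) {
  rocksdb::WaitForCompactOptions wait_opts;
  wait_opts.timeout = std::chrono::microseconds(60 * 1000 * 1000);  // assumed type
  wait_opts.close_db = true;
  return db->WaitForCompact(wait_opts);
}
```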
+* Move prefetching responsibility to page cache for compaction read for non direct IO use case + +### Performance Improvements +* In case of direct_io, if buffer passed by callee is already aligned, RandomAccessFileRead::Read will avoid reallocating a new buffer, reducing memcpy and use already passed aligned buffer. +* Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations + +### Bug Fixes +* Fix use_after_free bug in async_io MultiReads when underlying FS enabled kFSBuffer. kFSBuffer is when underlying FS passes its own buffer instead of using RocksDB scratch in FSReadRequest. Right now it's an experimental feature. + +## 8.4.0 (06/26/2023) ### New Features +* Add FSReadRequest::fs_scratch which is a data buffer allocated and provided by underlying FileSystem to RocksDB during reads, when FS wants to provide its own buffer with data instead of using RocksDB provided FSReadRequest::scratch. This can help in CPU optimization by avoiding copy from file system's buffer to RocksDB buffer. More details on how to use/enable it in file_system.h. Right now it's supported only for MultiReads (async + sync) with non direct io. +* Start logging non-zero user-defined timestamp sizes in WAL to signal user key format in subsequent records and use it during recovery. This change will break recovery from WAL files written by early versions that contain user-defined timestamps. The workaround is to ensure there are no WAL files to recover (i.e. by flushing before close) before upgrade. +* Added new property "rocksdb.obsolete-sst-files-size-property" that reports the size of SST files that have become obsolete but have not yet been deleted or scheduled for deletion +* Start to record the value of the flag `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` in the Manifest and table properties for a SST file when it is created. And use the recorded flag when creating a table reader for the SST file. This flag is only explicitly recorded if it's false. +* Add a new option OptimisticTransactionDBOptions::shared_lock_buckets that enables sharing mutexes for validating transactions between DB instances, for better balancing memory efficiency and validation contention across DB instances. Different column families and DBs also now use different hash seeds in this validation, so that the same set of key names will not contend across DBs or column families. +* Add a new ticker `rocksdb.files.marked.trash.deleted` to track the number of trash files deleted by background thread from the trash queue. +* Add an API NewTieredVolatileCache() in include/rocksdb/cache.h to allocate an instance of a block cache with a primary block cache tier and a compressed secondary cache tier. A cache of this type distributes memory reservations against the block cache, such as WriteBufferManager, table reader memory etc., proportionally across both the primary and compressed secondary cache. +* Add `WaitForCompact()` to wait for all flush and compaction jobs to finish. Jobs to wait include the unscheduled (queued, but not scheduled yet). +* Add `WriteBatch::Release()` that releases the batch's serialized data to the caller. + +### Public API Changes +* Add C API `rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio`. +* Change the FileSystem::use_async_io() API to SupportedOps API in order to extend it to various operations supported by underlying FileSystem. Right now it contains FSSupportedOps::kAsyncIO and FSSupportedOps::kFSBuffer.
More details about FSSupportedOps in filesystem.h +* Add new tickers: `rocksdb.error.handler.bg.error.count`, `rocksdb.error.handler.bg.io.error.count`, `rocksdb.error.handler.bg.retryable.io.error.count` to replace the misspelled ones: `rocksdb.error.handler.bg.errro.count`, `rocksdb.error.handler.bg.io.errro.count`, `rocksdb.error.handler.bg.retryable.io.errro.count` ('error' instead of 'errro'). Users should switch to use the new tickers before 9.0 release as the misspelled old tickers will be completely removed then. +* Overload the API CreateColumnFamilyWithImport() to support creating ColumnFamily by importing multiple ColumnFamilies. It requires that CFs should not overlap in user key range. + +### Behavior Changes +* Change the default value for option `level_compaction_dynamic_level_bytes` to true. This affects users who use leveled compaction and do not set this option explicitly. These users may see additional background compactions following DB open. These compactions help to shape the LSM according to `level_compaction_dynamic_level_bytes` such that the size of each level Ln is approximately size of Ln-1 * `max_bytes_for_level_multiplier`. Turning on this option has other benefits too: see more detail in wiki: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#option-level_compaction_dynamic_level_bytes-and-levels-target-size and in option comment in advanced_options.h (#11525). +* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468) +For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similarly to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468) +* RocksDB will try to drop range tombstones during non-bottommost compaction when it is safe to do so. (#11459) +* When a DB is opened with `allow_ingest_behind=true` (currently only Universal compaction is supported), files in the last level, i.e. the ingested files, will not be included in any compaction. (#11489) +* Statistics `rocksdb.sst.read.micros` scope is expanded to all SST reads except for file ingestion and column family import (some compaction reads were previously excluded). + +### Bug Fixes +* Reduced cases of illegally using Env::Default() during static destruction by never destroying the internal PosixEnv itself (except for builds checking for memory leaks). (#11538) +* Fix extra prefetching during seek in async_io when BlockBasedTableOptions.num_file_reads_for_auto_readahead is 1 leading to more reads than required. +* Fix a bug where compactions that are qualified to be run as 2 subcompactions were only run as one subcompaction. +* Fix a use-after-move bug in block.cc. + +## 8.3.0 (05/19/2023) ### New Features +* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287). See the usage sketch below. +* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache. +* Improve the operational safety of publishing a DB or SST files to many hosts by using different block cache hash seeds on different hosts. The exact behavior is controlled by new option `ShardedCacheOptions::hash_seed`, which also documents the solved problem in more detail.
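A minimal usage sketch for the `block_protection_bytes_per_key` option introduced in 8.3.0 above, assuming it is a column-family-level option that takes a small per-key byte budget; check `advanced_options.h` for the accepted values.

```cpp
#include <rocksdb/options.h>

// Sketch: enable per key-value integrity protection for in-memory blocks in
// block cache. 0 disables it; small values such as 1, 2, 4 or 8 are assumed
// to be the supported protection sizes.
rocksdb::ColumnFamilyOptions MakeProtectedCFOptions() {
  rocksdb::ColumnFamilyOptions cf_options;
  cf_options.block_protection_bytes_per_key = 8;
  return cf_options;
}
```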
+* Introduced a new option `CompactionOptionsFIFO::file_temperature_age_thresholds` that allows FIFO compaction to compact files to different temperatures based on key age (#11428). +* Added a new ticker stat to count how many times RocksDB detected a corruption while verifying a block checksum: `BLOCK_CHECKSUM_MISMATCH_COUNT`. +* New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open. +* New statistics tickers for various iterator seek behaviors and relevant filtering, as \*`_LEVEL_SEEK_`\*. (#11460) + +### Public API Changes +* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the key in CF to a certain range. It will physically delete all keys outside the range including tombstones. See the sketch below. +* Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. +* Changed the meaning of various Bloom filter stats (prefix vs. whole key), with iterator-related filtering only being tracked in the new \*`_LEVEL_SEEK_`\*. stats. (#11460) + +### Behavior changes +* For x86, CPU features are no longer detected at runtime nor in build scripts, but in source code using common preprocessor defines. This will likely unlock some small performance improvements on some newer hardware, but could hurt performance of the kCRC32c checksum, which is no longer the default, on some "portable" builds. See PR #11419 for details. + +### Bug Fixes +* Delete an empty WAL file on DB open if the log number is less than the min log number to keep +* Delete temp OPTIONS file on DB open if there is a failure to write it out or rename it + +### Performance Improvements +* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406). + +## 8.2.0 (04/24/2023) ### Public API Changes +* `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. +* Add `multi_get_for_update` to C API. +* Remove unnecessary constructor for CompressionOptions. + +### Behavior changes +* Changed default block cache size from an 8MB to 32MB LRUCache, which increases the default number of cache shards from 16 to 64. This change is intended to minimize cache mutex contention under stress conditions. See https://github.com/facebook/rocksdb/wiki/Block-Cache for more information. +* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes` (#11321). +* User-provided `ReadOptions` take effect for more reads of non-`CacheEntryRole::kDataBlock` blocks. +* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). This together with #11321 makes it automatic to migrate other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have unnecessary levels drained which can help to reduce read and space amp.
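A hedged sketch of the experimental `DB::ClipColumnFamily()` API called out in the 8.3.0 section above (the Java-side `clipColumnFamily()` appears in the 8.9.0 notes); the `(handle, begin, end)` argument order is assumed from the `[begin_key, end_key)` description.

```cpp
#include <rocksdb/db.h>
#include <rocksdb/slice.h>

// Sketch: physically delete everything outside ["key0200", "key0800"),
// including tombstones, in the default column family. EXPERIMENTAL API.
rocksdb::Status ClipToRange(rocksdb::DB* db) {
  return db->ClipColumnFamily(db->DefaultColumnFamily(),
                              rocksdb::Slice("key0200"),
                              rocksdb::Slice("key0800"));
}
```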
+* If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB now will try to do trivial move from L0 to L1 and then do an intra L1 compaction, instead of an L0 to L1 compaction with trivial move disabled (#11375). + +### Bug Fixes +* In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size. +* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet. + +### New Features +* Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called. +* Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()` +* Added statistics tickers BYTES_COMPRESSED_FROM, BYTES_COMPRESSED_TO, BYTES_COMPRESSION_BYPASSED, BYTES_COMPRESSION_REJECTED, NUMBER_BLOCK_COMPRESSION_BYPASSED, and NUMBER_BLOCK_COMPRESSION_REJECTED. Disabled/deprecated histograms BYTES_COMPRESSED and BYTES_DECOMPRESSED, and ticker NUMBER_BLOCK_NOT_COMPRESSED. The new tickers offer more insight into compression ratios, rejected vs. disabled compression, etc. (#11388) +* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction. + +## 8.1.0 (03/18/2023) ### Behavior changes +* Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive PartitionRequest for range tombstone start keys. +* If the async_io ReadOption is specified for MultiGet or NewIterator on a platform that doesn't support IO uring, the option is ignored and synchronous IO is used. + +### Bug Fixes +* Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB. +* Fixed a couple of cases where a Merge operand encountered during iteration wasn't reflected in the `internal_merge_count` PerfContext counter. +* Fixed a bug in CreateColumnFamilyWithImport()/ExportColumnFamily() which did not support range tombstones (#11252). +* Fixed a bug where an excluded column family from an atomic flush contains unflushed data that should've been included in this atomic flush (i.e, data of seqno less than the max seqno of this atomic flush), leading to potential data loss in this excluded column family when `WriteOptions::disableWAL == true` (#11148). + +### New Features +* Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.filter.hits +* Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries. +* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} +* Add support for SecondaryCache with HyperClockCache (`HyperClockCacheOptions` inherits `secondary_cache` option from `ShardedCacheOptions`) +* Add new db properties `rocksdb.cf-write-stall-stats`, `rocksdb.db-write-stall-stats` and APIs to examine them in a structured way. In particular, users of `GetMapProperty()` with property `kCFWriteStallStats`/`kDBWriteStallStats` can now use the functions in `WriteStallStatsMapKeys` to find stats in the map.
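The 8.1.0 note above ends with the new write-stall-stats properties; a small sketch using the property name string given there. The `WriteStallStatsMapKeys` helper functions are deliberately omitted because their signatures are not shown in this diff.

```cpp
#include <map>
#include <string>
#include <rocksdb/db.h>

// Sketch: read the structured per-CF write stall stats added in 8.1.0.
void DumpWriteStallStats(rocksdb::DB* db) {
  std::map<std::string, std::string> stall_stats;
  if (db->GetMapProperty("rocksdb.cf-write-stall-stats", &stall_stats)) {
    for (const auto& entry : stall_stats) {
      // entry.first is the stall cause/condition key, entry.second the count.
    }
  }
}
```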
+ +### Public API Changes +* Changed various functions and features in `Cache` that are mostly relevant to custom implementations or wrappers. Especially, asychronous lookup functionality is moved from `Lookup()` to a new `StartAsyncLookup()` function. + +## 8.0.0 (02/19/2023) +### Behavior changes +* `ReadOptions::verify_checksums=false` disables checksum verification for more reads of non-`CacheEntryRole::kDataBlock` blocks. +* In case of scan with async_io enabled, if posix doesn't support IOUring, Status::NotSupported error will be returned to the users. Initially that error was swallowed and reads were switched to synchronous reads. -## 7.10.1 (02/01/2023) ### Bug Fixes * Fixed a data race on `ColumnFamilyData::flush_reason` caused by concurrent flushes. +* Fixed an issue in `Get` and `MultiGet` when user-defined timestamps is enabled in combination with BlobDB. +* Fixed some atypical behaviors for `LockWAL()` such as allowing concurrent/recursive use and not expecting `UnlockWAL()` after non-OK result. See API comments. +* Fixed a feature interaction bug where for blobs `GetEntity` would expose the blob reference instead of the blob value. * Fixed `DisableManualCompaction()` and `CompactRangeOptions::canceled` to cancel compactions even when they are waiting on conflicting compactions to finish * Fixed a bug in which a successful `GetMergeOperands()` could transiently return `Status::MergeInProgress()` * Return the correct error (Status::NotSupported()) to MultiGet caller when ReadOptions::async_io flag is true and IO uring is not enabled. Previously, Status::Corruption() was being returned when the actual failure was lack of async IO support. +* Fixed a bug in DB open/recovery from a compressed WAL that was caused due to incorrect handling of certain record fragments with the same offset within a WAL block. + +### Feature Removal +* Remove RocksDB Lite. +* The feature block_cache_compressed is removed. Statistics related to it are removed too. +* Remove deprecated Env::LoadEnv(). Use Env::CreateFromString() instead. +* Remove deprecated FileSystem::Load(). Use FileSystem::CreateFromString() instead. +* Removed the deprecated version of these utility functions and the corresponding Java bindings: `LoadOptionsFromFile`, `LoadLatestOptions`, `CheckOptionsCompatibility`. +* Remove the FactoryFunc from the LoadObject method from the Customizable helper methods. + +### Public API Changes +* Moved rarely-needed Cache class definition to new advanced_cache.h, and added a CacheWrapper class to advanced_cache.h. Minor changes to SimCache API definitions. +* Completely removed the following deprecated/obsolete statistics: the tickers `BLOCK_CACHE_INDEX_BYTES_EVICT`, `BLOCK_CACHE_FILTER_BYTES_EVICT`, `BLOOM_FILTER_MICROS`, `NO_FILE_CLOSES`, `STALL_L0_SLOWDOWN_MICROS`, `STALL_MEMTABLE_COMPACTION_MICROS`, `STALL_L0_NUM_FILES_MICROS`, `RATE_LIMIT_DELAY_MILLIS`, `NO_ITERATORS`, `NUMBER_FILTERED_DELETES`, `WRITE_TIMEDOUT`, `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, `BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT` as well as the histograms `STALL_L0_SLOWDOWN_COUNT`, `STALL_MEMTABLE_COMPACTION_COUNT`, `STALL_L0_NUM_FILES_COUNT`, `HARD_RATE_LIMIT_DELAY_COUNT`, `SOFT_RATE_LIMIT_DELAY_COUNT`, `BLOB_DB_GC_MICROS`, and `NUM_DATA_BLOCKS_READ_PER_LEVEL`. Note that as a result, the C++ enum values of the still supported statistics have changed. Developers are advised to not rely on the actual numeric values. 
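Since the 8.0.0 notes above remove `Env::LoadEnv()` in favor of `Env::CreateFromString()`, a hedged migration sketch; the `ConfigOptions` include path and the `"posix"` identifier are assumptions rather than something this patch specifies.

```cpp
#include <rocksdb/convenience.h>  // ConfigOptions (assumed location)
#include <rocksdb/env.h>

// Sketch: resolve an Env by name instead of calling the removed Env::LoadEnv().
rocksdb::Status MakeEnv(rocksdb::Env** env_out) {
  rocksdb::ConfigOptions config_options;
  // "posix" is assumed here; custom Envs registered via the ObjectRegistry
  // would be selected by their registered name or URI instead.
  return rocksdb::Env::CreateFromString(config_options, "posix", env_out);
}
```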
+
+### Build Changes
+* The `make` build now builds a shared library by default instead of a static library. Use `LIB_MODE=static` to override.
+
+### New Features
+* Compaction filters are now supported for wide-column entities by means of the `FilterV3` API. See the comment of the API for more details.
+* Added `do_not_compress_roles` to `CompressedSecondaryCacheOptions` to disable compression on certain kinds of blocks. Filter blocks are now not compressed by CompressedSecondaryCache by default.
+* Added a new `MultiGetEntity` API that enables batched wide-column point lookups. See the API comments for more details.
## 7.10.0 (01/23/2023)
### Behavior changes
diff --git a/INSTALL.md b/INSTALL.md
index 7d3b147796c9..fb4651e4b817 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -17,15 +17,18 @@ There are few options when compiling RocksDB:
* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode.
* `make all` will compile our static library, and all our tools and unit tests. Our tools
-depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
+depend on gflags 2.2.0 or newer. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
use binaries compiled by `make all` in production.
-* By default the binary we produce is optimized for the platform you're compiling on
-(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your
-CPU supports it. To print a warning if your CPU does not support SSE4.2, build with
-`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want
-to build a portable binary, add `PORTABLE=1` before your make commands, like this:
-`PORTABLE=1 make static_lib`.
+* By default the binary we produce is optimized for the CPU you're compiling on
+(`-march=native` or the equivalent). To build a binary compatible with the most
+general architecture supported by your CPU and compiler, set `PORTABLE=1` for
+the build, but performance will suffer as many operations benefit from newer
+and wider instructions. In addition to `PORTABLE=0` (default) and `PORTABLE=1`,
+it can be set to an architecture name recognized by your compiler. For example,
+on 64-bit x86, a reasonable compromise is `PORTABLE=haswell` which supports
+many or most of the available optimizations while still being compatible with
+most processors made since roughly 2013.
## Dependencies
@@ -48,6 +51,11 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
* If you wish to build the RocksJava static target, then cmake is required for building Snappy.
* If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed. +* You can do the following to install Google benchmark. These commands are copied from `./build_tools/ubuntu20_image/Dockerfile`: + +`$ git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark` + +`$ cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install` ## Supported platforms @@ -69,7 +77,7 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi git clone https://github.com/gflags/gflags.git cd gflags - git checkout v2.0 + git checkout v2.2.0 ./configure && make && sudo make install **Notice**: Once installed, please add the include path for gflags to your `CPATH` environment variable and the @@ -178,7 +186,7 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi gmake rocksdbjava * **iOS**: - * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. + * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define an important pre-processing macros: `IOS_CROSS_COMPILE`. * **Windows** (Visual Studio 2017 to up): * Read and follow the instructions at CMakeLists.txt diff --git a/Makefile b/Makefile index 1c7dc7791377..42433fdde980 100644 --- a/Makefile +++ b/Makefile @@ -44,13 +44,6 @@ quoted_perl_command = $(subst ','\'',$(perl_command)) # Set the default DEBUG_LEVEL to 1 DEBUG_LEVEL?=1 -# LIB_MODE says whether or not to use/build "shared" or "static" libraries. -# Mode "static" means to link against static libraries (.a) -# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc) -# -# Set the default LIB_MODE to static -LIB_MODE?=static - # OBJ_DIR is where the object files reside. Default to the current directory OBJ_DIR?=. @@ -81,29 +74,42 @@ else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),) endif endif -$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL}) - -# Lite build flag. -LITE ?= 0 -ifeq ($(LITE), 0) -ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) - # Be backward compatible and support older format where OPT=-DROCKSDB_LITE is - # specified instead of LITE=1 on the command line. - LITE=1 -endif -else ifeq ($(LITE), 1) -ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) - OPT += -DROCKSDB_LITE -endif +# LIB_MODE says whether or not to use/build "shared" or "static" libraries. +# Mode "static" means to link against static libraries (.a) +# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc) +# +ifeq ($(DEBUG_LEVEL), 0) +# For optimized, set the default LIB_MODE to static for code size/efficiency + LIB_MODE?=static +else +# For debug, set the default LIB_MODE to shared for efficient `make check` etc. + LIB_MODE?=shared endif +$(info $$DEBUG_LEVEL is $(DEBUG_LEVEL), $$LIB_MODE is $(LIB_MODE)) + +# Detect what platform we're building on. +# Export some common variables that might have been passed as Make variables +# instead of environment variables. 
+dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ + export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ + export LDFLAGS="$(EXTRA_LDFLAGS)"; \ + export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ + export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ + export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \ + export PORTABLE="$(PORTABLE)"; \ + export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \ + export USE_CLANG="$(USE_CLANG)"; \ + export LIB_MODE="$(LIB_MODE)"; \ + export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \ + export USE_FOLLY="$(USE_FOLLY)"; \ + "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) +# this file is generated by the previous line to set build flags and sources +include make_config.mk + # Figure out optimize level. ifneq ($(DEBUG_LEVEL), 2) -ifeq ($(LITE), 0) OPTIMIZE_LEVEL ?= -O2 -else - OPTIMIZE_LEVEL ?= -Os -endif endif # `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`. # In that case, the compiler default (`-O0` for gcc and clang) will be used. @@ -236,25 +242,6 @@ am__v_AR_1 = AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@ -# Detect what platform we're building on. -# Export some common variables that might have been passed as Make variables -# instead of environment variables. -dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \ - export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \ - export LDFLAGS="$(EXTRA_LDFLAGS)"; \ - export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \ - export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \ - export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \ - export PORTABLE="$(PORTABLE)"; \ - export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \ - export USE_CLANG="$(USE_CLANG)"; \ - export LIB_MODE="$(LIB_MODE)"; \ - export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \ - export USE_FOLLY="$(USE_FOLLY)"; \ - "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) -# this file is generated by the previous line to set build flags and sources -include make_config.mk - ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk) include $(ROCKSDB_PLUGIN_MKS) ROCKSDB_PLUGIN_PROTO =ROCKSDB_NAMESPACE::ObjectLibrary\&, const std::string\& @@ -337,13 +324,6 @@ endif ifeq ($(PLATFORM), OS_SOLARIS) PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99 endif -ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) - # found - CFLAGS += -fno-exceptions - CXXFLAGS += -fno-exceptions - # LUA is not supported under ROCKSDB_LITE - LUA_PATH = -endif ifeq ($(LIB_MODE),shared) # So that binaries are executable from build location, in addition to install location @@ -357,8 +337,8 @@ ifneq ($(MACHINE), arm64) # linking with jemalloc (as it won't be arm64-compatible) and remove some other options # set during platform detection DISABLE_JEMALLOC=1 -PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS)) -PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS)) +PLATFORM_CCFLAGS := $(filter-out -march=native, $(PLATFORM_CCFLAGS)) +PLATFORM_CXXFLAGS := $(filter-out -march=native, $(PLATFORM_CXXFLAGS)) endif endif endif @@ -560,7 +540,7 @@ endif ifdef USE_CLANG # Used by some teams in Facebook - WARNING_FLAGS += -Wshift-sign-overflow + WARNING_FLAGS += -Wshift-sign-overflow 
-Wambiguous-reversed-operator endif ifeq ($(PLATFORM), OS_OPENBSD) @@ -1017,7 +997,7 @@ endif .PHONY: check_0 check_0: - printf '%s\n' '' \ + @printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ { \ @@ -1039,7 +1019,7 @@ valgrind-exclude-regexp = InlineSkipTest.ConcurrentInsert|TransactionStressTest. .PHONY: valgrind_check_0 valgrind_check_0: test_log_prefix := valgrind_ valgrind_check_0: - printf '%s\n' '' \ + @printf '%s\n' '' \ 'To monitor subtest ,' \ ' run "make watch-log" in a separate window' ''; \ { \ @@ -1085,13 +1065,11 @@ check: all rm -rf $(TEST_TMPDIR) ifneq ($(PLATFORM), OS_AIX) $(PYTHON) tools/check_all_python.py -ifeq ($(filter -DROCKSDB_LITE,$(OPT)),) ifndef ASSERT_STATUS_CHECKED # not yet working with these tests $(PYTHON) tools/ldb_test.py sh tools/rocksdb_dump_test.sh endif endif -endif ifndef SKIP_FORMAT_BUCK_CHECKS $(MAKE) check-format $(MAKE) check-buck-targets @@ -1247,9 +1225,9 @@ clean: clean-ext-libraries-all clean-rocks clean-rocksjava clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava clean-rocks: - echo shared=$(ALL_SHARED_LIBS) - echo static=$(ALL_STATIC_LIBS) - rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) $(MICROBENCHS) +# Not practical to exactly match all versions/variants in naming (e.g. debug or not) + rm -f ${LIBNAME}*.so* ${LIBNAME}*.a + rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(MICROBENCHS) rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report $(FIND) . -name "*.[oda]" -exec rm -f {} \; $(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \; @@ -1442,6 +1420,9 @@ thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +udt_util_test: $(OBJ_DIR)/util/udt_util_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1505,6 +1486,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -1919,6 +1903,9 @@ compressed_secondary_cache_test: $(OBJ_DIR)/cache/compressed_secondary_cache_tes lru_cache_test: $(OBJ_DIR)/cache/lru_cache_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +tiered_secondary_cache_test: $(OBJ_DIR)/cache/tiered_secondary_cache_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + range_del_aggregator_test: $(OBJ_DIR)/db/range_del_aggregator_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) @@ -2018,6 +2005,9 @@ cache_reservation_manager_test: $(OBJ_DIR)/cache/cache_reservation_manager_test. 
wide_column_serialization_test: $(OBJ_DIR)/db/wide/wide_column_serialization_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +wide_columns_helper_test: $(OBJ_DIR)/db/wide/wide_columns_helper_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + #------------------------------------------------- # make install related stuff PREFIX ?= /usr/local @@ -2122,8 +2112,8 @@ ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-javadoc.jar ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_JAVA_VERSION)-sources.jar SHA256_CMD = sha256sum -ZLIB_VER ?= 1.2.13 -ZLIB_SHA256 ?= b3a24de97a8fdbc835b9833169501030b8977031bcb54b3b3ac13740f846ab30 +ZLIB_VER ?= 1.3 +ZLIB_SHA256 ?= ff0ba4c292013dbc27530b3a81e1f9a813cd39de01ca5e0f8bf355702efa593e ZLIB_DOWNLOAD_BASE ?= http://zlib.net BZIP2_VER ?= 1.0.8 BZIP2_SHA256 ?= ab5a03176ee106d3f0fa90e381da478ddae405918153cca248e682cd0c4a2269 @@ -2131,11 +2121,11 @@ BZIP2_DOWNLOAD_BASE ?= http://sourceware.org/pub/bzip2 SNAPPY_VER ?= 1.1.8 SNAPPY_SHA256 ?= 16b677f07832a612b0836178db7f374e414f94657c138e6993cbfc5dcc58651f SNAPPY_DOWNLOAD_BASE ?= https://github.com/google/snappy/archive -LZ4_VER ?= 1.9.3 -LZ4_SHA256 ?= 030644df4611007ff7dc962d981f390361e6c97a34e5cbc393ddfbe019ffe2c1 +LZ4_VER ?= 1.9.4 +LZ4_SHA256 ?= 0b0e3aa07c8c063ddf40b082bdf7e37a1562bda40a0ff5272957f3e987e0e54b LZ4_DOWNLOAD_BASE ?= https://github.com/lz4/lz4/archive -ZSTD_VER ?= 1.4.9 -ZSTD_SHA256 ?= acf714d98e3db7b876e5b540cbf6dee298f60eb3c0723104f6d3f065cd60d6a8 +ZSTD_VER ?= 1.5.5 +ZSTD_SHA256 ?= 98e9c3d949d1b924e28e01eccb7deed865eefebf25c2f21c702e5cd5b63b85e1 ZSTD_DOWNLOAD_BASE ?= https://github.com/facebook/zstd/archive CURL_SSL_OPTS ?= --tlsv1 @@ -2447,6 +2437,9 @@ jtest_run: jtest: rocksdbjava cd java;$(MAKE) sample test +jpmd: rocksdbjava rocksdbjavageneratepom + cd java;$(MAKE) pmd + jdb_bench: cd java;$(MAKE) db_bench; @@ -2475,6 +2468,8 @@ checkout_folly: @# NOTE: this hack is required for gcc in some cases perl -pi -e 's/(__has_include..)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h +CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS)) + build_folly: FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \ if [ "$$FOLLY_INST_PATH" ]; then \ @@ -2485,8 +2480,8 @@ build_folly: fi # Restore the original version of Invoke.h with boost dependency cd third-party/folly && ${GIT_COMMAND} checkout folly/functional/Invoke.h - cd third-party/folly && MAYBE_AVX2=`echo $(CXXFLAGS) | grep -o -- -DHAVE_AVX2 | sed 's/-DHAVE_AVX2/-mavx2/g' || true` && \ - CXXFLAGS=" $$MAYBE_AVX2 -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests + cd third-party/folly && \ + CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests # --------------------------------------------------------------------------- # Build size testing @@ -2507,18 +2502,6 @@ build_size: $(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib $$(stat --printf="%s" `readlink -f librocksdb.so`) strip `readlink -f librocksdb.so` $(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`) - # === lite build, static === - $(MAKE) clean - $(MAKE) LITE=1 static_lib - $(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite $$(stat --printf="%s" librocksdb.a) - strip librocksdb.a - $(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite_stripped $$(stat --printf="%s" librocksdb.a) - # === lite build, shared === - $(MAKE) clean - $(MAKE) LITE=1 shared_lib - $(REPORT_BUILD_STATISTIC) 
rocksdb.build_size.shared_lib_lite $$(stat --printf="%s" `readlink -f librocksdb.so`) - strip `readlink -f librocksdb.so` - $(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`) # --------------------------------------------------------------------------- # Platform-specific compilation diff --git a/PLUGINS.md b/PLUGINS.md index 60a1e6590439..37fc68b86a38 100644 --- a/PLUGINS.md +++ b/PLUGINS.md @@ -5,3 +5,5 @@ This is the list of all known third-party plugins for RocksDB. If something is m * [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices * [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo. * [PMEM](https://github.com/pmem/pmem-rocksdb-plugin): a collection of plugins to enable Persistent Memory on RocksDB. +* [IPPCP](https://github.com/intel/ippcp-plugin-rocksdb): a plugin to enable encryption on RocksDB based on Intel optimized open source IPP-Crypto library. +* [encfs](https://github.com/pegasus-kv/encfs): a plugin to enable encryption on RocksDB based on OpenSSL library. \ No newline at end of file diff --git a/ROCKSDB_LITE.md b/ROCKSDB_LITE.md deleted file mode 100644 index 166426c6009f..000000000000 --- a/ROCKSDB_LITE.md +++ /dev/null @@ -1,21 +0,0 @@ -# RocksDBLite - -RocksDBLite is a project focused on mobile use cases, which don't need a lot of fancy things we've built for server workloads and they are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean. - -Some examples of the features disabled by ROCKSDB_LITE: -* compiled-in support for LDB tool -* No backup engine -* No support for replication (which we provide in form of TransactionalIterator) -* No advanced monitoring tools -* No special-purpose memtables that are highly optimized for specific use cases -* No Transactions - -When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if: -* Nobody from mobile really needs your feature, -* Your feature is adding a lot of weight to the binary. - -Don't add ROCKSDB_LITE compile guard if: -* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off. -* Your feature is not adding a lot of weight. - -If unsure, ask. :) diff --git a/TARGETS b/TARGETS index 93bdf75dd95d..a8f4ad0f9930 100644 --- a/TARGETS +++ b/TARGETS @@ -3,8 +3,6 @@ # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. 
-# -# @noautodeps @nocodemods load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper") @@ -19,7 +17,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "cache/compressed_secondary_cache.cc", "cache/lru_cache.cc", "cache/secondary_cache.cc", - "cache/sharded_cache.cc", "cloud/aws/aws_file_system.cc", "cloud/aws/aws_kafka.cc", "cloud/aws/aws_kinesis.cc", @@ -35,6 +32,9 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "cloud/db_cloud_impl.cc", "cloud/manifest_reader.cc", "cloud/purge.cc", + "cache/secondary_cache_adapter.cc", + "cache/sharded_cache.cc", + "cache/tiered_secondary_cache.cc", "db/arena_wrapped_db_iter.cc", "db/blob/blob_contents.cc", "db/blob/blob_fetcher.cc", @@ -119,9 +119,11 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "db/wal_manager.cc", "db/wide/wide_column_serialization.cc", "db/wide/wide_columns.cc", + "db/wide/wide_columns_helper.cc", "db/write_batch.cc", "db/write_batch_base.cc", "db/write_controller.cc", + "db/write_stall_stats.cc", "db/write_thread.cc", "env/composite_env.cc", "env/env.cc", @@ -178,6 +180,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "options/configurable.cc", "options/customizable.cc", "options/db_options.cc", + "options/offpeak_time_info.cc", "options/options.cc", "options/options_helper.cc", "options/options_parser.cc", @@ -217,6 +220,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "table/block_based/reader_common.cc", "table/block_based/uncompression_dict_reader.cc", "table/block_fetcher.cc", + "table/compaction_merging_iterator.cc", "table/cuckoo/cuckoo_table_builder.cc", "table/cuckoo/cuckoo_table_factory.cc", "table/cuckoo/cuckoo_table_reader.cc", @@ -264,6 +268,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "util/concurrent_task_limiter_impl.cc", "util/crc32c.cc", "util/crc32c_arm64.cc", + "util/data_structure.cc", "util/dynamic_bloom.cc", "util/file_checksum_helper.cc", "util/hash.cc", @@ -277,6 +282,8 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "util/string_util.cc", "util/thread_local.cc", "util/threadpool_imp.cc", + "util/udt_util.cc", + "util/write_batch_util.cc", "util/xxhash.cc", "utilities/agg_merge/agg_merge.cc", "utilities/backup/backup_engine.cc", @@ -367,6 +374,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "//folly/synchronization:distributed_mutex", ], headers=None, link_whole=False, extra_test_libs=False) +<<<<<<< HEAD cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "cache/cache.cc", "cache/cache_entry_roles.cc", @@ -726,11 +734,14 @@ cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[ "//folly/synchronization:distributed_mutex", ], headers=None, link_whole=True, extra_test_libs=False) +cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False) + cpp_library_wrapper(name="rocksdb_test_lib", srcs=[ "db/db_test_util.cc", "db/db_with_timestamp_test_util.cc", "table/mock_table.cc", "test_util/mock_time_env.cc", + "test_util/secondary_cache_test_util.cc", "test_util/testharness.cc", "test_util/testutil.cc", "tools/block_cache_analyzer/block_cache_trace_analyzer.cc", @@ -760,7 +771,9 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[ "db_stress_tool/db_stress_stat.cc", "db_stress_tool/db_stress_test_base.cc", "db_stress_tool/db_stress_tool.cc", + "db_stress_tool/db_stress_wide_merge_operator.cc", "db_stress_tool/expected_state.cc", + "db_stress_tool/expected_value.cc", 
"db_stress_tool/multi_ops_txns_stress.cc", "db_stress_tool/no_batched_ops_stress.cc", "test_util/testutil.cc", @@ -771,6 +784,8 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[ cpp_binary_wrapper(name="db_stress", srcs=["db_stress_tool/db_stress.cc"], deps=[":rocksdb_stress_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) +cpp_binary_wrapper(name="cache_bench", srcs=["cache/cache_bench.cc"], deps=[":rocksdb_cache_bench_tools_lib"], extra_preprocessor_flags=[], extra_bench_libs=False) + cpp_binary_wrapper(name="ribbon_bench", srcs=["microbench/ribbon_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True) cpp_binary_wrapper(name="db_basic_bench", srcs=["microbench/db_basic_bench.cc"], deps=[], extra_preprocessor_flags=[], extra_bench_libs=True) @@ -5141,6 +5156,11 @@ cpp_unittest_wrapper(name="db_cloud_test", deps=[":rocksdb_test_lib"], extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_clip_test", + srcs=["db/db_clip_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + cpp_unittest_wrapper(name="db_compaction_filter_test", srcs=["db/db_compaction_filter_test.cc"], @@ -5870,6 +5890,12 @@ cpp_unittest_wrapper(name="tiered_compaction_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="tiered_secondary_cache_test", + srcs=["cache/tiered_secondary_cache_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="timer_queue_test", srcs=["util/timer_queue_test.cc"], deps=[":rocksdb_test_lib"], @@ -5906,6 +5932,12 @@ cpp_unittest_wrapper(name="ttl_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="udt_util_test", + srcs=["util/udt_util_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="util_merge_operators_test", srcs=["utilities/util_merge_operators_test.cc"], deps=[":rocksdb_test_lib"], @@ -5942,6 +5974,12 @@ cpp_unittest_wrapper(name="wide_column_serialization_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="wide_columns_helper_test", + srcs=["db/wide/wide_columns_helper_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="work_queue_test", srcs=["util/work_queue_test.cc"], deps=[":rocksdb_test_lib"], @@ -5995,3 +6033,5 @@ cpp_unittest_wrapper(name="write_unprepared_transaction_test", deps=[":rocksdb_test_lib"], extra_compiler_flags=[]) + +export_file(name = "tools/db_crashtest.py") diff --git a/USERS.md b/USERS.md index f31e2678522e..086cab90d1d9 100644 --- a/USERS.md +++ b/USERS.md @@ -15,6 +15,28 @@ At Facebook, we use RocksDB as storage engines in multiple data management servi [2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/ +## Bilibili +[Bilibili](bilibili.com) [uses](https://www.alluxio.io/blog/when-ai-meets-alluxio-at-bilibili-building-an-efficient-ai-platform-for-data-preprocessing-and-model-training/) Alluxio to speed up its ML training workloads, and Alluxio uses RocksDB to store its filesystem metadata, so Bilibili uses RocksDB. + +Bilibili's [real-time platform](https://www.alibabacloud.com/blog/architecture-and-practices-of-bilibilis-real-time-platform_596676) uses Flink, and uses RocksDB as Flink's state store. + +## TikTok +TikTok, or its parent company ByteDance, uses RocksDB as the storage engine for some storage systems, such as its distributed graph database [ByteGraph](https://vldb.org/pvldb/vol15/p3306-li.pdf). 
+ +Also, TikTok uses [Alluxio](alluxio.io) to [speed up Presto queries](https://www.alluxio.io/resources/videos/improving-presto-performance-with-alluxio-at-tiktok/), and Alluxio stores the files' metadata in RocksDB. + +## FoundationDB +[FoundationDB](https://www.foundationdb.org/) [uses](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp) RocksDB to implement a [key-value store interface](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp#L1127) in its server backend. + +## Apple +Apple [uses](https://opensource.apple.com/projects/foundationdb/) FoundationDB, so it also uses RocksDB. + +## Snowflake +Snowflake [uses](https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/) FoundationDB, so it also uses RocksDB. + +## Microsoft +The Bing search engine from Microsoft uses RocksDB as the storage engine for its web data platform: https://blogs.bing.com/Engineering-Blog/october-2021/RocksDB-in-Microsoft-Bing + ## LinkedIn Two different use cases at Linkedin are using RocksDB as a storage engine: @@ -26,6 +48,9 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu ## Yahoo Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights +## Tencent +[PaxosStore](https://github.com/Tencent/paxosstore) is a distributed database supporting WeChat. It uses RocksDB as its storage engine. + ## Baidu [Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata. @@ -79,9 +104,18 @@ quasardb uses a heavily tuned RocksDB as its persistence layer. ## TiKV [TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer. +## TiDB +[TiDB](https://github.com/pingcap/tidb) uses the TiKV distributed key-value database, so it uses RocksDB. + +## PingCAP +[PingCAP](https://www.pingcap.com/) is the company behind TiDB, its cloud database service uses RocksDB. + ## Apache Spark [Spark Structured Streaming](https://docs.databricks.com/structured-streaming/rocksdb-state-store.html) uses RocksDB as the local state store. +## Databricks +[Databricks](https://www.databricks.com/) [replaces AWS RDS with TiDB](https://www.pingcap.com/case-study/how-databricks-tackles-the-scalability-limit-with-a-mysql-alternative/) for scalability, so it uses RocksDB. + ## Apache Flink [Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine. @@ -118,6 +152,9 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed ## ArangoDB [ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine. +## Qdrant +[Qdrant](https://qdrant.tech/) is an open source vector database, it [uses](https://qdrant.tech/documentation/concepts/storage/) RocksDB as its persistent storage. 
+ ## Milvus [Milvus](https://milvus.io/) is an open source vector database for unstructured data. It uses RocksDB not only as one of the supported kv storage engines, but also as a message queue. @@ -127,5 +164,9 @@ LzLabs is using RocksDB as a storage engine in their multi-database distributed ## Solana Labs [Solana](https://github.com/solana-labs/solana) is a fast, secure, scalable, and decentralized blockchain. It uses RocksDB as the underlying storage for its ledger store. +## Apache Kvrocks + +[Apache Kvrocks](https://github.com/apache/kvrocks) is an open-source distributed key-value NoSQL database built on top of RocksDB. It serves as a cost-saving and capacity-increasing alternative drop-in replacement for Redis. + ## Others More databases using RocksDB can be found at [dbdb.io](https://dbdb.io/browse?embeds=rocksdb). diff --git a/buckifier/buckify_rocksdb.py b/buckifier/buckify_rocksdb.py index ac09c051976e..b56e1a82de46 100755 --- a/buckifier/buckify_rocksdb.py +++ b/buckifier/buckify_rocksdb.py @@ -26,7 +26,7 @@ # $python3 buckifier/buckify_rocksdb.py \ # '{"fake": { # "extra_deps": [":test_dep", "//fakes/module:mock1"], -# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"] +# "extra_compiler_flags": ["-DFOO_BAR", "-Os"] # } # }' # (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB @@ -154,16 +154,9 @@ def generate_targets(repo_path, deps_map): # rocksdb_whole_archive_lib TARGETS.add_library( "rocksdb_whole_archive_lib", - src_mk["LIB_SOURCES"] + - # always add range_tree, it's only excluded on ppc64, which we don't use internally - src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"], + [], deps=[ - "//folly/container:f14_hash", - "//folly/experimental/coro:blocking_wait", - "//folly/experimental/coro:collect", - "//folly/experimental/coro:coroutine", - "//folly/experimental/coro:task", - "//folly/synchronization:distributed_mutex", + ":rocksdb_lib", ], headers=None, extra_external_deps="", @@ -204,6 +197,10 @@ def generate_targets(repo_path, deps_map): TARGETS.add_binary( "db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"] ) + # cache_bench binary + TARGETS.add_binary( + "cache_bench", ["cache/cache_bench.cc"], [":rocksdb_cache_bench_tools_lib"] + ) # bench binaries for src in src_mk.get("MICROBENCH_SOURCES", []): name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0] @@ -306,6 +303,7 @@ def generate_targets(repo_path, deps_map): deps=json.dumps(deps["extra_deps"] + [":rocksdb_test_lib"]), extra_compiler_flags=json.dumps(deps["extra_compiler_flags"]), ) + TARGETS.export_file("tools/db_crashtest.py") print(ColorString.info("Generated TARGETS Summary:")) print(ColorString.info("- %d libs" % TARGETS.total_lib)) diff --git a/buckifier/targets_builder.py b/buckifier/targets_builder.py index 343b2207d6b5..f5d727469b32 100644 --- a/buckifier/targets_builder.py +++ b/buckifier/targets_builder.py @@ -23,7 +23,7 @@ def pretty_list(lst, indent=8): return res -class TARGETSBuilder(object): +class TARGETSBuilder: def __init__(self, path, extra_argv): self.path = path header = targets_cfg.rocksdb_target_header_template.format( @@ -148,3 +148,9 @@ def register_test(self, test_name, src, deps, extra_compiler_flags): ).encode("utf-8") ) self.total_test = self.total_test + 1 + + def export_file(self, name): + with open(self.path, "a") as targets_file: + targets_file.write( + targets_cfg.export_file_template.format(name=name) + ) diff --git a/buckifier/targets_cfg.py b/buckifier/targets_cfg.py index 491c34d6e597..ead6ac51a50e 
100644 --- a/buckifier/targets_cfg.py +++ b/buckifier/targets_cfg.py @@ -6,8 +6,6 @@ # --> DO NOT EDIT MANUALLY <-- # This file is a Facebook-specific integration for buck builds, so can # only be validated by Facebook employees. -# -# @noautodeps @nocodemods load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper") """ @@ -39,3 +37,7 @@ fancy_bench_wrapper(suite_name="{name}", binary_to_bench_to_metric_list_map={bench_config}, slow={slow}, expected_runtime={expected_runtime}, sl_iterations={sl_iterations}, regression_threshold={regression_threshold}) """ + +export_file_template = """ +export_file(name = "{name}") +""" diff --git a/buckifier/util.py b/buckifier/util.py index 8943fed2bdfd..be197efd07b1 100644 --- a/buckifier/util.py +++ b/buckifier/util.py @@ -14,7 +14,7 @@ import time -class ColorString(object): +class ColorString: """Generate colorful strings on terminal""" HEADER = "\033[95m" diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index f93952bf2037..e46736136087 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -63,13 +63,7 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then if [ "$LIB_MODE" == "shared" ]; then PIC_BUILD=1 fi - if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM010" ]; then - source "$PWD/build_tools/fbcode_config_platform010.sh" - elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then - source "$PWD/build_tools/fbcode_config_platform009.sh" - else - source "$PWD/build_tools/fbcode_config_platform009.sh" - fi + source "$PWD/build_tools/fbcode_config_platform010.sh" fi # Delete existing output, if it exists @@ -154,7 +148,7 @@ case "$TARGET_OS" in ;; IOS) PLATFORM=IOS - COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE" + COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE " PLATFORM_SHARED_EXT=dylib PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name " CROSS_COMPILE=true @@ -425,7 +419,7 @@ EOF if ! 
test $ROCKSDB_DISABLE_JEMALLOC; then # Test whether jemalloc is available - if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \ + if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ljemalloc \ 2>/dev/null; then # This will enable some preprocessor identifiers in the Makefile JEMALLOC=1 @@ -434,12 +428,19 @@ EOF WITH_JEMALLOC_FLAG=1 # check for JEMALLOC installed with HomeBrew if [ "$PLATFORM" == "OS_MACOSX" ]; then + if [ "$TARGET_ARCHITECTURE" = "arm64" ]; then + # on M1 Macs, homebrew installs here instead of /usr/local + JEMALLOC_PREFIX="/opt/homebrew" + else + JEMALLOC_PREFIX="/usr/local" + fi if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ') - JEMALLOC_INCLUDE="-I/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/include" - JEMALLOC_LIB="/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $JEMALLOC_LIB" - JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS $JEMALLOC_LIB" + JEMALLOC_INCLUDE="-I${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/include" + JEMALLOC_LIB="${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB" + JAVA_LDFLAGS="$JAVA_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB" + JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB" fi fi fi @@ -650,7 +651,7 @@ if [ "${USE_KAFKA}XXX" = "1XXX" ]; then PLATFORM_LDFLAGS="-lrdkafka++ $PLATFORM_LDFLAGS" fi -if test "0$PORTABLE" -eq 0; then +if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then # Tune for this POWER processor, treating '+' models as base models POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+` @@ -673,37 +674,36 @@ if test "0$PORTABLE" -eq 0; then COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}" elif [ "$TARGET_OS" == "IOS" ]; then COMMON_FLAGS="$COMMON_FLAGS" - elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then - # TODO: Not sure why we don't use -march=native on these OSes - if test "$USE_SSE"; then - TRY_SSE_ETC="1" - fi else COMMON_FLAGS="$COMMON_FLAGS -march=native " fi else - # PORTABLE=1 - if test "$USE_SSE"; then - TRY_SSE_ETC="1" - fi - - if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then - COMMON_FLAGS="$COMMON_FLAGS -march=z196 " - fi - - if test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then - RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-) - COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}" + # PORTABLE specified + if [ "$PORTABLE" == 1 ]; then + if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then + COMMON_FLAGS="$COMMON_FLAGS -march=z196 " + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then + RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-) + COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}" + elif test "$USE_SSE"; then + # USE_SSE is DEPRECATED + # This is a rough approximation of the old USE_SSE behavior + COMMON_FLAGS="$COMMON_FLAGS -march=haswell" + fi + # Other than those cases, not setting -march= here. + else + # Assume PORTABLE is a minimum assumed cpu type, e.g. 
PORTABLE=haswell + COMMON_FLAGS="$COMMON_FLAGS -march=${PORTABLE}" fi if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then - # For portability compile for macOS 10.13 (2017) or newer - COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.13" - PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.13" + # For portability compile for macOS 10.14 (2018) or newer + COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.14" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.14" # -mmacosx-version-min must come first here. - PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.13 $PLATFORM_SHARED_LDFLAGS" - PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13" - JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.13" + PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.14 $PLATFORM_SHARED_LDFLAGS" + PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14" + JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.14" JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS" @@ -727,101 +727,6 @@ EOF fi fi -if test "$TRY_SSE_ETC"; then - # The USE_SSE flag now means "attempt to compile with widely-available - # Intel architecture extensions utilized by specific optimizations in the - # source code." It's a qualifier on PORTABLE=1 that means "mostly portable." - # It doesn't even really check that your current CPU is compatible. - # - # SSE4.2 available since nehalem, ca. 2008-2010 - # Includes POPCNT for BitsSetToOne, BitParity - TRY_SSE42="-msse4.2" - # PCLMUL available since westmere, ca. 2010-2011 - TRY_PCLMUL="-mpclmul" - # AVX2 available since haswell, ca. 2013-2015 - TRY_AVX2="-mavx2" - # BMI available since haswell, ca. 2013-2015 - # Primarily for TZCNT for CountTrailingZeroBits - TRY_BMI="-mbmi" - # LZCNT available since haswell, ca. 2013-2015 - # For FloorLog2 - TRY_LZCNT="-mlzcnt" -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o test.o 2>/dev/null < - #include - int main() { - volatile uint32_t x = _mm_crc32_u32(0, 0); - (void)x; - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o test.o 2>/dev/null < - #include - int main() { - const auto a = _mm_set_epi64x(0, 0); - const auto b = _mm_set_epi64x(0, 0); - const auto c = _mm_clmulepi64_si128(a, b, 0x00); - auto d = _mm_cvtsi128_si64(c); - (void)d; - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_PCLMUL -DHAVE_PCLMUL" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o test.o 2>/dev/null < - #include - int main() { - const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5); - const auto b = _mm256_permutevar8x32_epi32(a, a); - (void)b; - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o test.o 2>/dev/null < - #include - int main(int argc, char *argv[]) { - (void)argv; - return (int)_tzcnt_u64((uint64_t)argc); - } -EOF -if [ "$?" 
= 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2 -fi - -$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null < - #include - int main(int argc, char *argv[]) { - (void)argv; - return (int)_lzcnt_u64((uint64_t)argc); - } -EOF -if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT" -elif test "$USE_SSE"; then - echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2 -fi - $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null < int main() { diff --git a/build_tools/dependencies_platform009.sh b/build_tools/dependencies_platform009.sh deleted file mode 100644 index ce8dd4e06a62..000000000000 --- a/build_tools/dependencies_platform009.sh +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -GCC_BASE=/mnt/gvfs/third-party2/gcc/1795efe5f06778c15a92c8f9a2aba5dc496d9d4d/9.x/centos7-native/3bed279 -CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/7318eaac22659b6ff2fe43918e4b69fd0772a8a7/9.0.0/platform009/651ee30 -LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/4959b39cfbe5965a37c861c4c327fa7c5c759b87/9.x/platform009/9202ce7 -GLIBC_BASE=/mnt/gvfs/third-party2/glibc/45ce3375cdc77ecb2520bbf8f0ecddd3f98efd7a/2.30/platform009/f259413 -SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/be4de3205e029101b18aa8103daa696c2bef3b19/1.1.3/platform009/7f3b187 -ZLIB_BASE=/mnt/gvfs/third-party2/zlib/3c160ac5c67e257501e24c6c1d00ad5e01d73db6/1.2.8/platform009/7f3b187 -BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/73a237ac5bc0a5f5d67b39b8d253cfebaab88684/1.0.6/platform009/7f3b187 -LZ4_BASE=/mnt/gvfs/third-party2/lz4/6ca38d3c390be2774d61a300f151464bbd632d62/1.9.1/platform009/7f3b187 -ZSTD_BASE=/mnt/gvfs/third-party2/zstd/64c58a207d2495e83abc57a500a956df09b79a7c/1.4.x/platform009/ba86d1f -GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/824d0a8a5abb5b121afd1b35fc3896407ea50092/2.2.0/platform009/7f3b187 -JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform009/c305944 -NUMA_BASE=/mnt/gvfs/third-party2/numa/0af65f71e23a67bf65dc91b11f95caa39325c432/2.0.11/platform009/7f3b187 -LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/02486dac347645d31dce116f44e1de3177315be2/1.4/platform009/5191652 -TBB_BASE=/mnt/gvfs/third-party2/tbb/2e0ec671e550bfca347300bf3f789d9c0fff24ad/2018_U5/platform009/7f3b187 -LIBURING_BASE=/mnt/gvfs/third-party2/liburing/70dbd9cfee63a25611417d09433a86d7711b3990/20200729/platform009/7f3b187 -KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/32b8a2407b634df3f8f948ba373fc4acc6a18296/fb/platform009/da39a3e -BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/08634589372fa5f237bfd374e8c644a8364e78c1/2.32/platform009/ba86d1f/ -VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/6ae525939ad02e5e676855082fbbc7828dbafeac/3.15.0/platform009/7f3b187 -LUA_BASE=/mnt/gvfs/third-party2/lua/162efd9561a3d21f6869f4814011e9cf1b3ff4dc/5.3.4/platform009/a6271c4 -BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/30bf49ad6414325e17f3425b0edcb64239427ae3/1.6.1/platform009/7f3b187 -GLOG_BASE=/mnt/gvfs/third-party2/glog/32d751bd5673375b438158717ab6a57c1cc57e3d/0.3.2_fb/platform009/10a364d diff --git a/build_tools/error_filter.py b/build_tools/error_filter.py index c42df1f91e5e..d9cb1099c4c0 100644 --- a/build_tools/error_filter.py +++ b/build_tools/error_filter.py @@ -15,7 +15,7 @@ import sys -class ErrorParserBase(object): +class 
ErrorParserBase: def parse_error(self, line): """Parses a line of test output. If it contains an error, returns a formatted message describing the error; otherwise, returns None. diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index cf3c355b1f18..fa629af97804 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -147,7 +147,7 @@ else fi CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS" diff --git a/build_tools/fbcode_config_platform009.sh b/build_tools/fbcode_config_platform009.sh deleted file mode 100644 index 8c8ba092c68f..000000000000 --- a/build_tools/fbcode_config_platform009.sh +++ /dev/null @@ -1,170 +0,0 @@ -#!/bin/sh -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -# -# Set environment variables so that we can compile rocksdb using -# fbcode settings. It uses the latest g++ and clang compilers and also -# uses jemalloc -# Environment variables that change the behavior of this script: -# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included - - -BASEDIR=`dirname $BASH_SOURCE` -source "$BASEDIR/dependencies_platform009.sh" - -CFLAGS="" - -# libgcc -LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0 -I $LIBGCC_BASE/include/c++/9.3.0/backward" -LIBGCC_LIBS=" -L $LIBGCC_BASE/lib" - -# glibc -GLIBC_INCLUDE="$GLIBC_BASE/include" -GLIBC_LIBS=" -L $GLIBC_BASE/lib" - -if test -z $PIC_BUILD; then - MAYBE_PIC= -else - MAYBE_PIC=_pic -fi - -if ! test $ROCKSDB_DISABLE_SNAPPY; then - # snappy - SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/" - SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a" - CFLAGS+=" -DSNAPPY" -fi - -if ! test $ROCKSDB_DISABLE_ZLIB; then - # location of zlib headers and libraries - ZLIB_INCLUDE=" -I $ZLIB_BASE/include/" - ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a" - CFLAGS+=" -DZLIB" -fi - -if ! test $ROCKSDB_DISABLE_BZIP; then - # location of bzip headers and libraries - BZIP_INCLUDE=" -I $BZIP2_BASE/include/" - BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a" - CFLAGS+=" -DBZIP2" -fi - -if ! test $ROCKSDB_DISABLE_LZ4; then - LZ4_INCLUDE=" -I $LZ4_BASE/include/" - LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a" - CFLAGS+=" -DLZ4" -fi - -if ! 
test $ROCKSDB_DISABLE_ZSTD; then - ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" - ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a" - CFLAGS+=" -DZSTD" -fi - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" -GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a" -CFLAGS+=" -DGFLAGS=gflags" - -BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/" -BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a" - -GLOG_INCLUDE=" -I $GLOG_BASE/include/" -GLOG_LIBS=" $GLOG_BASE/lib/libglog${MAYBE_PIC}.a" - -# location of jemalloc -JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/" -JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a" - -# location of numa -NUMA_INCLUDE=" -I $NUMA_BASE/include/" -NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a" -CFLAGS+=" -DNUMA" - -# location of libunwind -LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a" - -# location of TBB -TBB_INCLUDE=" -isystem $TBB_BASE/include/" -TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a" -CFLAGS+=" -DTBB" - -# location of LIBURING -LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/" -LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a" -CFLAGS+=" -DLIBURING" - -test "$USE_SSE" || USE_SSE=1 -export USE_SSE -test "$PORTABLE" || PORTABLE=1 -export PORTABLE - -BINUTILS="$BINUTILS_BASE/bin" -AR="$BINUTILS/ar" -AS="$BINUTILS/as" - -DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $GLOG_INCLUDE" - -STDLIBS="-L $GCC_BASE/lib64" - -CLANG_BIN="$CLANG_BASE/bin" -CLANG_LIB="$CLANG_BASE/lib" -CLANG_SRC="$CLANG_BASE/../../src" - -CLANG_ANALYZER="$CLANG_BIN/clang++" -CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build" - -if [ -z "$USE_CLANG" ]; then - # gcc - CC="$GCC_BASE/bin/gcc" - CXX="$GCC_BASE/bin/g++" - AR="$GCC_BASE/bin/gcc-ar" - - CFLAGS+=" -B$BINUTILS" - CFLAGS+=" -isystem $LIBGCC_INCLUDE" - CFLAGS+=" -isystem $GLIBC_INCLUDE" - JEMALLOC=1 -else - # clang - CLANG_INCLUDE="$CLANG_LIB/clang/stable/include" - CC="$CLANG_BIN/clang" - CXX="$CLANG_BIN/clang++" - AR="$CLANG_BIN/llvm-ar" - - KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include" - - CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib" - CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x " - CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux " - CFLAGS+=" -isystem $GLIBC_INCLUDE" - CFLAGS+=" -isystem $LIBGCC_INCLUDE" - CFLAGS+=" -isystem $CLANG_INCLUDE" - CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " - CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " - CFLAGS+=" -Wno-expansion-to-defined " - CXXFLAGS="-nostdinc++" -fi - -CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT" -CXXFLAGS+=" $CFLAGS" - -EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS" -EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so" -EXEC_LDFLAGS+=" $LIBUNWIND" -EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib" -EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64" -# required by libtbb -EXEC_LDFLAGS+=" -ldl" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" -PLATFORM_LDFLAGS+=" -B$BINUTILS" - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS 
$BENCHMARK_LIBS" - -VALGRIND_VER="$VALGRIND_BASE/bin/" - -# lua not supported because it's on track for deprecation, I think -LUA_PATH= -LUA_LIB= - -export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB diff --git a/build_tools/fbcode_config_platform010.sh b/build_tools/fbcode_config_platform010.sh index babe92c412c8..25835d091080 100644 --- a/build_tools/fbcode_config_platform010.sh +++ b/build_tools/fbcode_config_platform010.sh @@ -154,7 +154,7 @@ CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_IOURING_PRESENT" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS" diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 0baeca983733..5ecdb1d21571 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -360,7 +360,7 @@ function send_to_ods { echo >&2 "ERROR: Key $key doesn't have a value." return fi - curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ + curl --silent "https://www.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \ --connect-timeout 60 } diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh index a2fdcd0ee4a5..c549e5b6e763 100755 --- a/build_tools/update_dependencies.sh +++ b/build_tools/update_dependencies.sh @@ -104,46 +104,3 @@ get_lib_base valgrind LATEST platform010 get_lib_base lua 5.3.4 platform010 git diff $OUTPUT - - -########################################################### -# platform009 dependencies # -########################################################### - -OUTPUT="$BASEDIR/dependencies_platform009.sh" - -rm -f "$OUTPUT" -touch "$OUTPUT" - -echo "Writing dependencies to $OUTPUT" - -# Compilers locations -GCC_BASE=`readlink -f $TP2_LATEST/gcc/9.x/centos7-native/*/` -CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/9.0.0/platform009/*/` - -log_header -log_variable GCC_BASE -log_variable CLANG_BASE - -# Libraries locations -get_lib_base libgcc 9.x platform009 -get_lib_base glibc 2.30 platform009 -get_lib_base snappy LATEST platform009 -get_lib_base zlib LATEST platform009 -get_lib_base bzip2 LATEST platform009 -get_lib_base lz4 LATEST platform009 -get_lib_base zstd LATEST platform009 -get_lib_base gflags LATEST platform009 -get_lib_base jemalloc LATEST platform009 -get_lib_base numa LATEST platform009 -get_lib_base libunwind LATEST platform009 -get_lib_base tbb 2018_U5 platform009 -get_lib_base liburing LATEST platform009 -get_lib_base benchmark LATEST platform009 - -get_lib_base kernel-headers fb platform009 -get_lib_base binutils LATEST centos7-native -get_lib_base valgrind LATEST platform009 -get_lib_base lua 5.3.4 platform009 - -git diff $OUTPUT diff --git a/cache/cache.cc b/cache/cache.cc index 7d23fb757972..3dbea128e1f8 100644 --- a/cache/cache.cc +++ b/cache/cache.cc @@ -16,7 
+16,8 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE +const Cache::CacheItemHelper kNoopCacheItemHelper{}; + static std::unordered_map lru_cache_options_type_info = { {"capacity", @@ -64,7 +65,41 @@ static std::unordered_map OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, }; -#endif // ROCKSDB_LITE + +namespace { +static void NoopDelete(Cache::ObjectPtr /*obj*/, + MemoryAllocator* /*allocator*/) { + assert(false); +} + +static size_t SliceSize(Cache::ObjectPtr obj) { + return static_cast(obj)->size(); +} + +static Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, + size_t length, char* out) { + const Slice& slice = *static_cast(from_obj); + std::memcpy(out, slice.data() + from_offset, length); + return Status::OK(); +} + +static Status NoopCreate(const Slice& /*data*/, CompressionType /*type*/, + CacheTier /*source*/, Cache::CreateContext* /*ctx*/, + MemoryAllocator* /*allocator*/, + Cache::ObjectPtr* /*out_obj*/, + size_t* /*out_charge*/) { + assert(false); + return Status::NotSupported(); +} + +static Cache::CacheItemHelper kBasicCacheItemHelper(CacheEntryRole::kMisc, + &NoopDelete); +} // namespace + +const Cache::CacheItemHelper kSliceCacheItemHelper{ + CacheEntryRole::kMisc, &NoopDelete, &SliceSize, + &SliceSaveTo, &NoopCreate, &kBasicCacheItemHelper, +}; Status SecondaryCache::CreateFromString( const ConfigOptions& config_options, const std::string& value, @@ -75,7 +110,6 @@ Status SecondaryCache::CreateFromString( Status status; std::shared_ptr sec_cache; -#ifndef ROCKSDB_LITE CompressedSecondaryCacheOptions sec_cache_opts; status = OptionTypeInfo::ParseStruct(config_options, "", &comp_sec_cache_options_type_info, "", @@ -84,19 +118,13 @@ Status SecondaryCache::CreateFromString( sec_cache = NewCompressedSecondaryCache(sec_cache_opts); } -#else - (void)config_options; - status = Status::NotSupported( - "Cannot load compressed secondary cache in LITE mode ", args); -#endif //! ROCKSDB_LITE if (status.ok()) { result->swap(sec_cache); } return status; } else { - return LoadSharedObject(config_options, value, nullptr, - result); + return LoadSharedObject(config_options, value, result); } } @@ -108,7 +136,6 @@ Status Cache::CreateFromString(const ConfigOptions& config_options, if (value.find('=') == std::string::npos) { cache = NewLRUCache(ParseSizeT(value)); } else { -#ifndef ROCKSDB_LITE LRUCacheOptions cache_opts; status = OptionTypeInfo::ParseStruct(config_options, "", &lru_cache_options_type_info, "", @@ -116,14 +143,51 @@ Status Cache::CreateFromString(const ConfigOptions& config_options, if (status.ok()) { cache = NewLRUCache(cache_opts); } -#else - (void)config_options; - status = Status::NotSupported("Cannot load cache in LITE mode ", value); -#endif //! 
ROCKSDB_LITE } if (status.ok()) { result->swap(cache); } return status; } + +bool Cache::AsyncLookupHandle::IsReady() { + return pending_handle == nullptr || pending_handle->IsReady(); +} + +bool Cache::AsyncLookupHandle::IsPending() { return pending_handle != nullptr; } + +Cache::Handle* Cache::AsyncLookupHandle::Result() { + assert(!IsPending()); + return result_handle; +} + +void Cache::StartAsyncLookup(AsyncLookupHandle& async_handle) { + async_handle.found_dummy_entry = false; // in case re-used + assert(!async_handle.IsPending()); + async_handle.result_handle = + Lookup(async_handle.key, async_handle.helper, async_handle.create_context, + async_handle.priority, async_handle.stats); +} + +Cache::Handle* Cache::Wait(AsyncLookupHandle& async_handle) { + WaitAll(&async_handle, 1); + return async_handle.Result(); +} + +void Cache::WaitAll(AsyncLookupHandle* async_handles, size_t count) { + for (size_t i = 0; i < count; ++i) { + if (async_handles[i].IsPending()) { + // If a pending handle gets here, it should be marked at "to be handled + // by a caller" by that caller erasing the pending_cache on it. + assert(async_handles[i].pending_cache == nullptr); + } + } +} + +void Cache::SetEvictionCallback(EvictionCallback&& fn) { + // Overwriting non-empty with non-empty could indicate a bug + assert(!eviction_callback_ || !fn); + eviction_callback_ = std::move(fn); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/cache/cache_bench_tool.cc b/cache/cache_bench_tool.cc index 1dfbfe3c7ff6..89945abf7f0c 100644 --- a/cache/cache_bench_tool.cc +++ b/cache/cache_bench_tool.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "cache_key.h" #ifdef GFLAGS #include #include @@ -13,10 +12,13 @@ #include #include +#include "cache/cache_key.h" +#include "cache/sharded_cache.h" #include "db/db_impl/db_impl.h" #include "monitoring/histogram.h" #include "port/port.h" -#include "rocksdb/cache.h" +#include "port/stack_trace.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -31,6 +33,7 @@ #include "util/hash.h" #include "util/mutexlock.h" #include "util/random.h" +#include "util/stderr_logger.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -43,21 +46,39 @@ static constexpr uint64_t GiB = MiB << 10; DEFINE_uint32(threads, 16, "Number of concurrent threads to run."); DEFINE_uint64(cache_size, 1 * GiB, "Number of bytes to use as a cache of uncompressed data."); -DEFINE_uint32(num_shard_bits, 6, "shard_bits."); +DEFINE_int32(num_shard_bits, -1, + "ShardedCacheOptions::shard_bits. Default = auto"); DEFINE_double(resident_ratio, 0.25, "Ratio of keys fitting in cache to keyspace."); DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread."); DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added."); - -DEFINE_uint32(skew, 5, "Degree of skew in key selection"); +DEFINE_uint32(value_bytes_estimate, 0, + "If > 0, overrides estimated_entry_charge or " + "min_avg_entry_charge depending on cache_type."); + +DEFINE_int32( + degenerate_hash_bits, 0, + "With HCC, fix this many hash bits to increase table hash collisions"); +DEFINE_uint32(skew, 5, "Degree of skew in key selection. 
0 = no skew"); DEFINE_bool(populate_cache, true, "Populate cache before operations"); -DEFINE_uint32(lookup_insert_percent, 87, +DEFINE_double(pinned_ratio, 0.25, + "Keep roughly this portion of entries pinned in cache."); +DEFINE_double( + vary_capacity_ratio, 0.0, + "If greater than 0.0, will periodically vary the capacity between this " + "ratio less than full size and full size. If vary_capacity_ratio + " + "pinned_ratio is close to or exceeds 1.0, the cache might thrash."); + +DEFINE_uint32(lookup_insert_percent, 82, "Ratio of lookup (+ insert on not found) to total workload " "(expressed as a percentage)"); DEFINE_uint32(insert_percent, 2, "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_uint32(blind_insert_percent, 5, + "Ratio of insert without keeping handle to total workload " + "(expressed as a percentage)"); DEFINE_uint32(lookup_percent, 10, "Ratio of lookup to total workload (expressed as a percentage)"); DEFINE_uint32(erase_percent, 1, @@ -71,20 +92,40 @@ DEFINE_uint32( DEFINE_uint32(gather_stats_entries_per_lock, 256, "For Cache::ApplyToAllEntries"); -DEFINE_bool(skewed, false, "If true, skew the key access distribution"); + +DEFINE_uint32(usleep, 0, "Sleep up to this many microseconds after each op."); DEFINE_bool(lean, false, "If true, no additional computation is performed besides cache " "operations."); -#ifndef ROCKSDB_LITE +DEFINE_bool(early_exit, false, + "Exit before deallocating most memory. Good for malloc stats, e.g." + "MALLOC_CONF=\"stats_print:true\""); + +DEFINE_bool(histograms, true, + "Whether to track and print histogram statistics."); + +DEFINE_bool(report_problems, true, "Whether to ReportProblems() at the end."); + +DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random"); + DEFINE_string(secondary_cache_uri, "", "Full URI for creating a custom secondary cache object"); -static class std::shared_ptr secondary_cache; -#endif // ROCKSDB_LITE DEFINE_string(cache_type, "lru_cache", "Type of block cache."); +DEFINE_bool(use_jemalloc_no_dump_allocator, false, + "Whether to use JemallocNoDumpAllocator"); + +DEFINE_uint32(jemalloc_no_dump_allocator_num_arenas, + ROCKSDB_NAMESPACE::JemallocAllocatorOptions().num_arenas, + "JemallocNodumpAllocator::num_arenas"); + +DEFINE_bool(jemalloc_no_dump_allocator_limit_tcache_size, + ROCKSDB_NAMESPACE::JemallocAllocatorOptions().limit_tcache_size, + "JemallocNodumpAllocator::limit_tcache_size"); + // ## BEGIN stress_cache_key sub-tool options ## // See class StressCacheKey below. 
DEFINE_bool(stress_cache_key, false, @@ -147,9 +188,6 @@ class SharedState { public: explicit SharedState(CacheBench* cache_bench) : cv_(&mu_), - num_initialized_(0), - start_(false), - num_done_(0), cache_bench_(cache_bench) {} ~SharedState() {} @@ -172,15 +210,31 @@ class SharedState { bool Started() const { return start_; } + void AddLookupStats(uint64_t hits, uint64_t misses, size_t pinned_count) { + MutexLock l(&mu_); + lookup_count_ += hits + misses; + lookup_hits_ += hits; + pinned_count_ += pinned_count; + } + + double GetLookupHitRatio() const { + return 1.0 * lookup_hits_ / lookup_count_; + } + + size_t GetPinnedCount() const { return pinned_count_; } + private: port::Mutex mu_; port::CondVar cv_; - uint64_t num_initialized_; - bool start_; - uint64_t num_done_; - CacheBench* cache_bench_; + + uint64_t num_initialized_ = 0; + bool start_ = false; + uint64_t num_done_ = 0; + uint64_t lookup_count_ = 0; + uint64_t lookup_hits_ = 0; + size_t pinned_count_ = 0; }; // Per-thread state for concurrent executions of the same benchmark. @@ -192,26 +246,32 @@ struct ThreadState { uint64_t duration_us = 0; ThreadState(uint32_t index, SharedState* _shared) - : tid(index), rnd(1000 + index), shared(_shared) {} + : tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {} }; struct KeyGen { char key_data[27]; - Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) { - uint64_t key = 0; - if (!FLAGS_skewed) { - uint64_t raw = rnd.Next(); - // Skew according to setting - for (uint32_t i = 0; i < FLAGS_skew; ++i) { - raw = std::min(raw, rnd.Next()); - } - key = FastRange64(raw, max_key); - } else { - key = rnd.Skewed(max_log); - if (key > max_key) { - key -= max_key; - } + Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) { + uint64_t raw = rnd.Next(); + // Skew according to setting + for (uint32_t i = 0; i < skew; ++i) { + raw = std::min(raw, rnd.Next()); + } + uint64_t key = FastRange64(raw, max_key); + if (FLAGS_degenerate_hash_bits) { + uint64_t key_hash = + Hash64(reinterpret_cast(&key), sizeof(key)); + // HCC uses the high 64 bits and a lower bit mask for starting probe + // location, so we fix hash bits starting at the bottom of that word. 
+ auto hi_hash = uint64_t{0x9e3779b97f4a7c13U} ^ + (key_hash << 1 << (FLAGS_degenerate_hash_bits - 1)); + uint64_t un_hi, un_lo; + BijectiveUnhash2x64(hi_hash, key_hash, &un_hi, &un_lo); + un_lo ^= BitwiseAnd(FLAGS_seed, INT32_MAX); + EncodeFixed64(key_data, un_lo); + EncodeFixed64(key_data + 8, un_hi); + return Slice(key_data, kCacheKeySize); } // Variable size and alignment size_t off = key % 8; @@ -226,8 +286,8 @@ struct KeyGen { } }; -Cache::ObjectPtr createValue(Random64& rnd) { - char* rv = new char[FLAGS_value_bytes]; +Cache::ObjectPtr createValue(Random64& rnd, MemoryAllocator* alloc) { + char* rv = AllocateBlock(FLAGS_value_bytes, alloc).release(); // Fill with some filler data, and take some CPU time for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) { EncodeFixed64(rv + i, rnd.Next()); @@ -244,7 +304,8 @@ Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/, return Status::OK(); } -Status CreateFn(const Slice& data, Cache::CreateContext* /*context*/, +Status CreateFn(const Slice& data, CompressionType /*type*/, + CacheTier /*source*/, Cache::CreateContext* /*context*/, MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj, size_t* out_charge) { *out_obj = new char[data.size()]; @@ -253,16 +314,41 @@ Status CreateFn(const Slice& data, Cache::CreateContext* /*context*/, return Status::OK(); }; -void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) { - delete[] static_cast(value); +void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* alloc) { + CustomDeleter{alloc}(static_cast(value)); } +Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn); Cache::CacheItemHelper helper1(CacheEntryRole::kDataBlock, DeleteFn, SizeFn, - SaveToFn, CreateFn); + SaveToFn, CreateFn, &helper1_wos); +Cache::CacheItemHelper helper2_wos(CacheEntryRole::kIndexBlock, DeleteFn); Cache::CacheItemHelper helper2(CacheEntryRole::kIndexBlock, DeleteFn, SizeFn, - SaveToFn, CreateFn); + SaveToFn, CreateFn, &helper2_wos); +Cache::CacheItemHelper helper3_wos(CacheEntryRole::kFilterBlock, DeleteFn); Cache::CacheItemHelper helper3(CacheEntryRole::kFilterBlock, DeleteFn, SizeFn, - SaveToFn, CreateFn); + SaveToFn, CreateFn, &helper3_wos); + +void ConfigureSecondaryCache(ShardedCacheOptions& opts) { + if (!FLAGS_secondary_cache_uri.empty()) { + std::shared_ptr secondary_cache; + Status s = SecondaryCache::CreateFromString( + ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf(stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + opts.secondary_cache = secondary_cache; + } +} + +ShardedCacheBase* AsShardedCache(Cache* c) { + if (!FLAGS_secondary_cache_uri.empty()) { + c = static_cast_with_check(c)->GetTarget().get(); + } + return static_cast_with_check(c); +} } // namespace class CacheBench { @@ -277,52 +363,59 @@ class CacheBench { FLAGS_lookup_insert_percent), insert_threshold_(lookup_insert_threshold_ + kHundredthUint64 * FLAGS_insert_percent), - lookup_threshold_(insert_threshold_ + + blind_insert_threshold_(insert_threshold_ + + kHundredthUint64 * FLAGS_blind_insert_percent), + lookup_threshold_(blind_insert_threshold_ + kHundredthUint64 * FLAGS_lookup_percent), erase_threshold_(lookup_threshold_ + - kHundredthUint64 * FLAGS_erase_percent), - skewed_(FLAGS_skewed) { + kHundredthUint64 * FLAGS_erase_percent) { if (erase_threshold_ != 100U * kHundredthUint64) { fprintf(stderr, "Percentages must add to 100.\n"); 
exit(1); } - max_log_ = 0; - if (skewed_) { - uint64_t max_key = max_key_; - while (max_key >>= 1) max_log_++; - if (max_key > (static_cast(1) << max_log_)) max_log_++; + std::shared_ptr allocator; + if (FLAGS_use_jemalloc_no_dump_allocator) { + JemallocAllocatorOptions opts; + opts.num_arenas = FLAGS_jemalloc_no_dump_allocator_num_arenas; + opts.limit_tcache_size = + FLAGS_jemalloc_no_dump_allocator_limit_tcache_size; + Status s = NewJemallocNodumpAllocator(opts, &allocator); + assert(s.ok()); } - if (FLAGS_cache_type == "clock_cache") { fprintf(stderr, "Old clock cache implementation has been removed.\n"); exit(1); - } else if (FLAGS_cache_type == "hyper_clock_cache") { - cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes, - FLAGS_num_shard_bits) - .MakeSharedCache(); + } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) { + HyperClockCacheOptions opts( + FLAGS_cache_size, /*estimated_entry_charge=*/0, FLAGS_num_shard_bits); + opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); + opts.memory_allocator = allocator; + if (FLAGS_cache_type == "fixed_hyper_clock_cache" || + FLAGS_cache_type == "hyper_clock_cache") { + opts.estimated_entry_charge = FLAGS_value_bytes_estimate > 0 + ? FLAGS_value_bytes_estimate + : FLAGS_value_bytes; + } else if (FLAGS_cache_type == "auto_hyper_clock_cache") { + if (FLAGS_value_bytes_estimate > 0) { + opts.min_avg_entry_charge = FLAGS_value_bytes_estimate; + } + } else { + fprintf(stderr, "Cache type not supported.\n"); + exit(1); + } + ConfigureSecondaryCache(opts); + cache_ = opts.MakeSharedCache(); } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits, false /* strict_capacity_limit */, 0.5 /* high_pri_pool_ratio */); -#ifndef ROCKSDB_LITE - if (!FLAGS_secondary_cache_uri.empty()) { - Status s = SecondaryCache::CreateFromString( - ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); - if (secondary_cache == nullptr) { - fprintf( - stderr, - "No secondary cache registered matching string: %s status=%s\n", - FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); - exit(1); - } - opts.secondary_cache = secondary_cache; - } -#endif // ROCKSDB_LITE - + opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); + opts.memory_allocator = allocator; + ConfigureSecondaryCache(opts); cache_ = NewLRUCache(opts); } else { - fprintf(stderr, "Cache type not supported."); + fprintf(stderr, "Cache type not supported.\n"); exit(1); } } @@ -330,13 +423,51 @@ class CacheBench { ~CacheBench() {} void PopulateCache() { - Random64 rnd(1); + Random64 rnd(FLAGS_seed); KeyGen keygen; - for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) { - Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_), - createValue(rnd), &helper1, FLAGS_value_bytes); + size_t max_occ = 0; + size_t inserts_since_max_occ_increase = 0; + size_t keys_since_last_not_found = 0; + + // Avoid redundant insertions by checking Lookup before Insert. + // Loop until insertions consistently fail to increase max occupancy or + // it becomes difficult to find keys not already inserted. 
+ while (inserts_since_max_occ_increase < 100 && + keys_since_last_not_found < 100) { + Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew); + + Cache::Handle* handle = cache_->Lookup(key); + if (handle != nullptr) { + cache_->Release(handle); + ++keys_since_last_not_found; + continue; + } + keys_since_last_not_found = 0; + + Status s = + cache_->Insert(key, createValue(rnd, cache_->memory_allocator()), + &helper1, FLAGS_value_bytes); assert(s.ok()); + + handle = cache_->Lookup(key); + if (!handle) { + fprintf(stderr, "Failed to lookup key just inserted.\n"); + assert(false); + exit(42); + } else { + cache_->Release(handle); + } + + size_t occ = cache_->GetOccupancyCount(); + if (occ > max_occ) { + max_occ = occ; + inserts_since_max_occ_increase = 0; + } else { + ++inserts_since_max_occ_increase; + } } + printf("Population complete (%zu entries, %g average charge)\n", max_occ, + 1.0 * FLAGS_cache_size / max_occ); } bool Run() { @@ -395,19 +526,35 @@ class CacheBench { FLAGS_ops_per_thread / elapsed_secs); printf("Thread ops/sec = %u\n", ops_per_sec); - printf("\nOperation latency (ns):\n"); - HistogramImpl combined; - for (uint32_t i = 0; i < FLAGS_threads; i++) { - combined.Merge(threads[i]->latency_ns_hist); - } - printf("%s", combined.ToString().c_str()); + printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio()); - if (FLAGS_gather_stats) { - printf("\nGather stats latency (us):\n"); - printf("%s", stats_hist.ToString().c_str()); + size_t occ = cache_->GetOccupancyCount(); + size_t slot = cache_->GetTableAddressCount(); + printf("Final load factor: %g (%zu / %zu)\n", 1.0 * occ / slot, occ, slot); + + printf("Final pinned count: %zu\n", shared.GetPinnedCount()); + + if (FLAGS_histograms) { + printf("\nOperation latency (ns):\n"); + HistogramImpl combined; + for (uint32_t i = 0; i < FLAGS_threads; i++) { + combined.Merge(threads[i]->latency_ns_hist); + } + printf("%s", combined.ToString().c_str()); + + if (FLAGS_gather_stats) { + printf("\nGather stats latency (us):\n"); + printf("%s", stats_hist.ToString().c_str()); + } } - printf("\n%s", stats_report.c_str()); + if (FLAGS_report_problems) { + printf("\n"); + std::shared_ptr logger = + std::make_shared(InfoLogLevel::DEBUG_LEVEL); + cache_->ReportProblems(logger); + } + printf("%s", stats_report.c_str()); return true; } @@ -418,10 +565,9 @@ class CacheBench { // Cumulative thresholds in the space of a random uint64_t const uint64_t lookup_insert_threshold_; const uint64_t insert_threshold_; + const uint64_t blind_insert_threshold_; const uint64_t lookup_threshold_; const uint64_t erase_threshold_; - const bool skewed_; - int max_log_; // A benchmark version of gathering stats on an active block cache by // iterating over it. 
The primary purpose is to measure the impact of @@ -454,7 +600,7 @@ class CacheBench { for (;;) { if (shared->AllDone()) { std::ostringstream ostr; - ostr << "Most recent cache entry stats:\n" + ostr << "\nMost recent cache entry stats:\n" << "Number of entries: " << total_entry_count << "\n" << "Table occupancy: " << table_occupancy << " / " << table_size << " = " @@ -491,13 +637,17 @@ class CacheBench { // Something slightly more expensive as in stats by category helpers.insert(helper); }; - timer.Start(); + if (FLAGS_histograms) { + timer.Start(); + } Cache::ApplyToAllEntriesOptions opts; opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock; shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts); table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount(); table_size = shared->GetCacheBench()->cache_->GetTableAddressCount(); - stats_hist->Add(timer.ElapsedNanos() / 1000); + if (FLAGS_histograms) { + stats_hist->Add(timer.ElapsedNanos() / 1000); + } } } @@ -528,62 +678,89 @@ class CacheBench { void OperateCache(ThreadState* thread) { // To use looked-up values uint64_t result = 0; + uint64_t lookup_misses = 0; + uint64_t lookup_hits = 0; // To hold handles for a non-trivial amount of time - Cache::Handle* handle = nullptr; + std::deque pinned; + size_t total_pin_count = static_cast( + (FLAGS_cache_size * FLAGS_pinned_ratio) / FLAGS_value_bytes + 0.999999); + // For this thread. Some round up, some round down, as appropriate + size_t pin_count = (total_pin_count + thread->tid) / FLAGS_threads; + KeyGen gen; const auto clock = SystemClock::Default().get(); uint64_t start_time = clock->NowMicros(); StopWatchNano timer(clock); + auto system_clock = SystemClock::Default(); + size_t steps_to_next_capacity_change = 0; for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { - Slice key = gen.GetRand(thread->rnd, max_key_, max_log_); + Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew); uint64_t random_op = thread->rnd.Next(); - timer.Start(); + if (FLAGS_vary_capacity_ratio > 0.0 && thread->tid == 0) { + if (steps_to_next_capacity_change == 0) { + double cut_ratio = static_cast(thread->rnd.Next()) / + static_cast(UINT64_MAX) * + FLAGS_vary_capacity_ratio; + cache_->SetCapacity(FLAGS_cache_size * (1.0 - cut_ratio)); + steps_to_next_capacity_change = + static_cast(FLAGS_ops_per_thread / 100); + } else { + --steps_to_next_capacity_change; + } + } + + if (FLAGS_histograms) { + timer.Start(); + } if (random_op < lookup_insert_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } // do lookup - handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, - Cache::Priority::LOW, true); + auto handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, + Cache::Priority::LOW); if (handle) { + ++lookup_hits; if (!FLAGS_lean) { // do something with the data result += NPHash64(static_cast(cache_->Value(handle)), FLAGS_value_bytes); } + pinned.push_back(handle); } else { + ++lookup_misses; // do insert - Status s = cache_->Insert(key, createValue(thread->rnd), &helper2, - FLAGS_value_bytes, &handle); + Status s = cache_->Insert( + key, createValue(thread->rnd, cache_->memory_allocator()), + &helper2, FLAGS_value_bytes, &pinned.emplace_back()); assert(s.ok()); } } else if (random_op < insert_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } // do insert - Status s = cache_->Insert(key, createValue(thread->rnd), &helper3, - FLAGS_value_bytes, &handle); + Status s = cache_->Insert( + key, createValue(thread->rnd, 
cache_->memory_allocator()), &helper3, + FLAGS_value_bytes, &pinned.emplace_back()); + assert(s.ok()); + } else if (random_op < blind_insert_threshold_) { + // insert without keeping a handle + Status s = cache_->Insert( + key, createValue(thread->rnd, cache_->memory_allocator()), &helper3, + FLAGS_value_bytes); assert(s.ok()); } else if (random_op < lookup_threshold_) { - if (handle) { - cache_->Release(handle); - handle = nullptr; - } // do lookup - handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, - Cache::Priority::LOW, true); + auto handle = cache_->Lookup(key, &helper2, /*context*/ nullptr, + Cache::Priority::LOW); if (handle) { + ++lookup_hits; if (!FLAGS_lean) { // do something with the data result += NPHash64(static_cast(cache_->Value(handle)), FLAGS_value_bytes); } + pinned.push_back(handle); + } else { + ++lookup_misses; } } else if (random_op < erase_threshold_) { // do erase @@ -592,9 +769,27 @@ class CacheBench { // Should be extremely unlikely (noop) assert(random_op >= kHundredthUint64 * 100U); } - thread->latency_ns_hist.Add(timer.ElapsedNanos()); + if (FLAGS_histograms) { + thread->latency_ns_hist.Add(timer.ElapsedNanos()); + } + if (FLAGS_usleep > 0) { + unsigned us = + static_cast(thread->rnd.Uniform(FLAGS_usleep + 1)); + if (us > 0) { + system_clock->SleepForMicroseconds(us); + } + } + while (pinned.size() > pin_count) { + cache_->Release(pinned.front()); + pinned.pop_front(); + } } - if (handle) { + if (FLAGS_early_exit) { + MutexLock l(thread->shared->GetMutex()); + exit(0); + } + thread->shared->AddLookupStats(lookup_hits, lookup_misses, pinned.size()); + for (auto handle : pinned) { cache_->Release(handle); handle = nullptr; } @@ -614,13 +809,16 @@ class CacheBench { #ifndef NDEBUG printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); #endif + printf("----------------------------\n"); printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Cache impl name : %s\n", cache_->Name()); printf("DMutex impl name : %s\n", DMutex::kName()); printf("Number of threads : %u\n", FLAGS_threads); printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); printf("Cache size : %s\n", BytesToHumanString(FLAGS_cache_size).c_str()); - printf("Num shard bits : %u\n", FLAGS_num_shard_bits); + printf("Num shard bits : %d\n", + AsShardedCache(cache_.get())->GetNumShardBits()); printf("Max key : %" PRIu64 "\n", max_key_); printf("Resident ratio : %g\n", FLAGS_resident_ratio); printf("Skew degree : %u\n", FLAGS_skew); @@ -940,6 +1138,7 @@ class StressCacheKey { }; int cache_bench_tool(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_stress_cache_key) { @@ -953,11 +1152,14 @@ int cache_bench_tool(int argc, char** argv) { exit(1); } + if (FLAGS_seed == 0) { + FLAGS_seed = static_cast(port::GetProcessID()); + printf("Using seed = %" PRIu32 "\n", FLAGS_seed); + } + ROCKSDB_NAMESPACE::CacheBench bench; if (FLAGS_populate_cache) { bench.PopulateCache(); - printf("Population complete\n"); - printf("----------------------------\n"); } if (bench.Run()) { return 0; diff --git a/cache/cache_entry_stats.h b/cache/cache_entry_stats.h index 054304086d15..9968995da95a 100644 --- a/cache/cache_entry_stats.h +++ b/cache/cache_entry_stats.h @@ -143,7 +143,7 @@ class CacheEntryStatsCollector { } } // If we reach here, shared entry is in cache with handle `h`. 
- assert(cache.get()->GetCacheItemHelper(h) == &cache.kBasicHelper); + assert(cache.get()->GetCacheItemHelper(h) == cache.GetBasicHelper()); // Build an aliasing shared_ptr that keeps `ptr` in cache while there // are references. diff --git a/cache/cache_helpers.cc b/cache/cache_helpers.cc index 22597bf6daf0..bceb6f3c0a06 100644 --- a/cache/cache_helpers.cc +++ b/cache/cache_helpers.cc @@ -25,7 +25,8 @@ Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved, assert(helper->create_cb); Cache::ObjectPtr value; size_t charge; - Status st = helper->create_cb(saved, create_context, + Status st = helper->create_cb(saved, CompressionType::kNoCompression, + CacheTier::kVolatileTier, create_context, cache->memory_allocator(), &value, &charge); if (st.ok()) { st = diff --git a/cache/cache_helpers.h b/cache/cache_helpers.h index eb4559dfe98a..2dca6ecb34ee 100644 --- a/cache/cache_helpers.h +++ b/cache/cache_helpers.h @@ -7,7 +7,7 @@ #include -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { diff --git a/cache/cache_key.cc b/cache/cache_key.cc index a7932897242c..addff61d17b0 100644 --- a/cache/cache_key.cc +++ b/cache/cache_key.cc @@ -8,7 +8,7 @@ #include #include -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "table/unique_id_impl.h" #include "util/hash.h" #include "util/math.h" diff --git a/cache/cache_reservation_manager.cc b/cache/cache_reservation_manager.cc index b43bfddc6936..2a4be4204578 100644 --- a/cache/cache_reservation_manager.cc +++ b/cache/cache_reservation_manager.cc @@ -169,7 +169,7 @@ Slice CacheReservationManagerImpl::GetNextCacheKey() { template const Cache::CacheItemHelper* CacheReservationManagerImpl::TEST_GetCacheItemHelperForRole() { - return &CacheInterface::kHelper; + return CacheInterface::GetHelper(); } template class CacheReservationManagerImpl< diff --git a/cache/cache_reservation_manager.h b/cache/cache_reservation_manager.h index 08bf59b00661..a7b06dea2073 100644 --- a/cache/cache_reservation_manager.h +++ b/cache/cache_reservation_manager.h @@ -273,9 +273,10 @@ class ConcurrentCacheReservationManager std::size_t total_mem_used = cache_res_mgr_->GetTotalMemoryUsed(); Status s; if (!increase) { - assert(total_mem_used >= memory_used_delta); - s = cache_res_mgr_->UpdateCacheReservation(total_mem_used - - memory_used_delta); + s = cache_res_mgr_->UpdateCacheReservation( + (total_mem_used > memory_used_delta) + ? 
(total_mem_used - memory_used_delta) + : 0); } else { s = cache_res_mgr_->UpdateCacheReservation(total_mem_used + memory_used_delta); diff --git a/cache/cache_test.cc b/cache/cache_test.cc index 32335f3d2b7b..f21efc47a928 100644 --- a/cache/cache_test.cc +++ b/cache/cache_test.cc @@ -18,8 +18,10 @@ #include "cache/lru_cache.h" #include "cache/typed_cache.h" #include "port/stack_trace.h" +#include "test_util/secondary_cache_test_util.h" #include "test_util/testharness.h" #include "util/coding.h" +#include "util/hash_containers.h" #include "util/string_util.h" // HyperClockCache only supports 16-byte keys, so some of the tests @@ -68,26 +70,16 @@ const Cache::CacheItemHelper kDumbHelper{ CacheEntryRole::kMisc, [](Cache::ObjectPtr /*value*/, MemoryAllocator* /*alloc*/) {}}; -const Cache::CacheItemHelper kEraseOnDeleteHelper1{ +const Cache::CacheItemHelper kInvokeOnDeleteHelper{ CacheEntryRole::kMisc, [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) { - Cache* cache = static_cast(value); - cache->Erase("foo"); + auto& fn = *static_cast*>(value); + fn(); }}; - -const Cache::CacheItemHelper kEraseOnDeleteHelper2{ - CacheEntryRole::kMisc, - [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) { - Cache* cache = static_cast(value); - cache->Erase(EncodeKey16Bytes(1234)); - }}; - -const std::string kLRU = "lru"; -const std::string kHyperClock = "hyper_clock"; - } // anonymous namespace -class CacheTest : public testing::TestWithParam { +class CacheTest : public testing::Test, + public secondary_cache_test_util::WithCacheTypeParam { public: static CacheTest* current_; static std::string type_; @@ -95,8 +87,7 @@ class CacheTest : public testing::TestWithParam { static void Deleter(Cache::ObjectPtr v, MemoryAllocator*) { current_->deleted_values_.push_back(DecodeValue(v)); } - static constexpr Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc, - &Deleter}; + static const Cache::CacheItemHelper kHelper; static const int kCacheSize = 1000; static const int kNumShardBits = 4; @@ -108,8 +99,6 @@ class CacheTest : public testing::TestWithParam { std::shared_ptr cache_; std::shared_ptr cache2_; - size_t estimated_value_size_ = 1; - CacheTest() : cache_(NewCache(kCacheSize, kNumShardBits, false)), cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) { @@ -119,48 +108,12 @@ class CacheTest : public testing::TestWithParam { ~CacheTest() override {} - std::shared_ptr NewCache(size_t capacity) { - auto type = GetParam(); - if (type == kLRU) { - return NewLRUCache(capacity); - } - if (type == kHyperClock) { - return HyperClockCacheOptions( - capacity, estimated_value_size_ /*estimated_value_size*/) - .MakeSharedCache(); - } - return nullptr; - } - - std::shared_ptr NewCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { - auto type = GetParam(); - if (type == kLRU) { - LRUCacheOptions co; - co.capacity = capacity; - co.num_shard_bits = num_shard_bits; - co.strict_capacity_limit = strict_capacity_limit; - co.high_pri_pool_ratio = 0; - co.metadata_charge_policy = charge_policy; - return NewLRUCache(co); - } - if (type == kHyperClock) { - return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/, - num_shard_bits, strict_capacity_limit, - nullptr /*allocator*/, charge_policy) - .MakeSharedCache(); - } - return nullptr; - } - // These functions encode/decode keys in tests cases that use // int keys. 
// Currently, HyperClockCache requires keys to be 16B long, whereas // LRUCache doesn't, so the encoding depends on the cache type. std::string EncodeKey(int k) { - auto type = GetParam(); - if (type == kHyperClock) { + if (IsHyperClock()) { return EncodeKey16Bytes(k); } else { return EncodeKey32Bits(k); @@ -168,8 +121,7 @@ class CacheTest : public testing::TestWithParam { } int DecodeKey(const Slice& k) { - auto type = GetParam(); - if (type == kHyperClock) { + if (IsHyperClock()) { return DecodeKey16Bytes(k); } else { return DecodeKey32Bits(k); @@ -187,8 +139,8 @@ class CacheTest : public testing::TestWithParam { void Insert(std::shared_ptr cache, int key, int value, int charge = 1) { - EXPECT_OK( - cache->Insert(EncodeKey(key), EncodeValue(value), &kHelper, charge)); + EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), &kHelper, + charge, /*handle*/ nullptr, Cache::Priority::HIGH)); } void Erase(std::shared_ptr cache, int key) { @@ -212,21 +164,22 @@ class CacheTest : public testing::TestWithParam { void Erase2(int key) { Erase(cache2_, key); } }; +const Cache::CacheItemHelper CacheTest::kHelper{CacheEntryRole::kMisc, + &CacheTest::Deleter}; + CacheTest* CacheTest::current_; std::string CacheTest::type_; class LRUCacheTest : public CacheTest {}; TEST_P(CacheTest, UsageTest) { - auto type = GetParam(); - // cache is std::shared_ptr and will be automatically cleaned up. const size_t kCapacity = 100000; - auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); + auto cache = NewCache(kCapacity, 6, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 0, false, kFullChargeCacheMetadata); ASSERT_EQ(0, cache->GetUsage()); size_t baseline_meta_usage = precise_cache->GetUsage(); - if (type != kHyperClock) { + if (!IsHyperClock()) { ASSERT_EQ(0, baseline_meta_usage); } @@ -234,20 +187,19 @@ TEST_P(CacheTest, UsageTest) { char value[10] = "abcdef"; // make sure everything will be cached for (int i = 1; i < 100; ++i) { - std::string key; - if (type == kLRU) { - key = std::string(i, 'a'); - } else { - key = EncodeKey(i); - } + std::string key = EncodeKey(i); auto kv_size = key.size() + 5; ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size)); ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size)); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); - if (type == kHyperClock) { + if (GetParam() == kFixedHyperClock) { ASSERT_EQ(baseline_meta_usage + usage, precise_cache->GetUsage()); } else { + // AutoHyperClockCache meta usage grows in proportion to lifetime + // max number of entries. LRUCache in proportion to resident number of + // entries, though there is an untracked component proportional to + // lifetime max number of entries. ASSERT_LT(usage, precise_cache->GetUsage()); } } @@ -255,16 +207,15 @@ TEST_P(CacheTest, UsageTest) { cache->EraseUnRefEntries(); precise_cache->EraseUnRefEntries(); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); + if (GetParam() != kAutoHyperClock) { + // NOTE: AutoHyperClockCache meta usage grows in proportion to lifetime + // max number of entries. 
+ ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); + } // make sure the cache will be overloaded for (size_t i = 1; i < kCapacity; ++i) { - std::string key; - if (type == kLRU) { - key = std::to_string(i); - } else { - key = EncodeKey(static_cast(1000 + i)); - } + std::string key = EncodeKey(static_cast(1000 + i)); ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5)); ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5)); } @@ -273,7 +224,7 @@ TEST_P(CacheTest, UsageTest) { ASSERT_GT(kCapacity, cache->GetUsage()); ASSERT_GT(kCapacity, precise_cache->GetUsage()); ASSERT_LT(kCapacity * 0.95, cache->GetUsage()); - if (type != kHyperClock) { + if (!IsHyperClock()) { ASSERT_LT(kCapacity * 0.95, precise_cache->GetUsage()); } else { // estimated value size of 1 is weird for clock cache, because @@ -284,22 +235,20 @@ TEST_P(CacheTest, UsageTest) { } } -// TODO: This test takes longer than expected on ClockCache. This is -// because the values size estimate at construction is too sloppy. +// TODO: This test takes longer than expected on FixedHyperClockCache. +// This is because the values size estimate at construction is too sloppy. // Fix this. // Why is it so slow? The cache is constructed with an estimate of 1, but // then the charge is claimed to be 21. This will cause the hash table // to be extremely sparse, which in turn means clock needs to scan too // many slots to find victims. TEST_P(CacheTest, PinnedUsageTest) { - auto type = GetParam(); - // cache is std::shared_ptr and will be automatically cleaned up. const size_t kCapacity = 200000; auto cache = NewCache(kCapacity, 8, false, kDontChargeCacheMetadata); auto precise_cache = NewCache(kCapacity, 8, false, kFullChargeCacheMetadata); size_t baseline_meta_usage = precise_cache->GetUsage(); - if (type != kHyperClock) { + if (!IsHyperClock()) { ASSERT_EQ(0, baseline_meta_usage); } @@ -312,12 +261,7 @@ TEST_P(CacheTest, PinnedUsageTest) { // Add entries. Unpin some of them after insertion. Then, pin some of them // again. Check GetPinnedUsage(). for (int i = 1; i < 100; ++i) { - std::string key; - if (type == kLRU) { - key = std::string(i, 'a'); - } else { - key = EncodeKey(i); - } + std::string key = EncodeKey(i); auto kv_size = key.size() + 5; Cache::Handle* handle; Cache::Handle* handle_in_precise_cache; @@ -358,12 +302,7 @@ TEST_P(CacheTest, PinnedUsageTest) { // check that overloading the cache does not change the pinned usage for (size_t i = 1; i < 2 * kCapacity; ++i) { - std::string key; - if (type == kLRU) { - key = std::to_string(i); - } else { - key = EncodeKey(static_cast(1000 + i)); - } + std::string key = EncodeKey(static_cast(1000 + i)); ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5)); ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5)); } @@ -387,7 +326,11 @@ TEST_P(CacheTest, PinnedUsageTest) { cache->EraseUnRefEntries(); precise_cache->EraseUnRefEntries(); ASSERT_EQ(0, cache->GetUsage()); - ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); + if (GetParam() != kAutoHyperClock) { + // NOTE: AutoHyperClockCache meta usage grows in proportion to lifetime + // max number of entries. 
+ ASSERT_EQ(baseline_meta_usage, precise_cache->GetUsage()); + } } TEST_P(CacheTest, HitAndMiss) { @@ -404,7 +347,7 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(300)); Insert(100, 102); - if (GetParam() == kHyperClock) { + if (IsHyperClock()) { // ClockCache usually doesn't overwrite on Insert ASSERT_EQ(101, Lookup(100)); } else { @@ -414,7 +357,7 @@ TEST_P(CacheTest, HitAndMiss) { ASSERT_EQ(-1, Lookup(300)); ASSERT_EQ(1U, deleted_values_.size()); - if (GetParam() == kHyperClock) { + if (IsHyperClock()) { ASSERT_EQ(102, deleted_values_[0]); } else { ASSERT_EQ(101, deleted_values_[0]); @@ -422,7 +365,7 @@ TEST_P(CacheTest, HitAndMiss) { } TEST_P(CacheTest, InsertSameKey) { - if (GetParam() == kHyperClock) { + if (IsHyperClock()) { ROCKSDB_GTEST_BYPASS( "ClockCache doesn't guarantee Insert overwrite same key."); return; @@ -451,7 +394,7 @@ TEST_P(CacheTest, Erase) { } TEST_P(CacheTest, EntriesArePinned) { - if (GetParam() == kHyperClock) { + if (IsHyperClock()) { ROCKSDB_GTEST_BYPASS( "ClockCache doesn't guarantee Insert overwrite same key."); return; @@ -515,7 +458,7 @@ TEST_P(CacheTest, ExternalRefPinsEntries) { Insert(1000 + j, 2000 + j); } // Clock cache is even more stateful and needs more churn to evict - if (GetParam() == kHyperClock) { + if (IsHyperClock()) { for (int j = 0; j < kCacheSize; j++) { Insert(11000 + j, 11000 + j); } @@ -553,20 +496,20 @@ TEST_P(CacheTest, EvictionPolicyRef) { // Check whether the entries inserted in the beginning // are evicted. Ones without extra ref are evicted and // those with are not. - ASSERT_EQ(-1, Lookup(100)); - ASSERT_EQ(-1, Lookup(101)); - ASSERT_EQ(-1, Lookup(102)); - ASSERT_EQ(-1, Lookup(103)); + EXPECT_EQ(-1, Lookup(100)); + EXPECT_EQ(-1, Lookup(101)); + EXPECT_EQ(-1, Lookup(102)); + EXPECT_EQ(-1, Lookup(103)); - ASSERT_EQ(-1, Lookup(300)); - ASSERT_EQ(-1, Lookup(301)); - ASSERT_EQ(-1, Lookup(302)); - ASSERT_EQ(-1, Lookup(303)); + EXPECT_EQ(-1, Lookup(300)); + EXPECT_EQ(-1, Lookup(301)); + EXPECT_EQ(-1, Lookup(302)); + EXPECT_EQ(-1, Lookup(303)); - ASSERT_EQ(101, Lookup(200)); - ASSERT_EQ(102, Lookup(201)); - ASSERT_EQ(103, Lookup(202)); - ASSERT_EQ(104, Lookup(203)); + EXPECT_EQ(101, Lookup(200)); + EXPECT_EQ(102, Lookup(201)); + EXPECT_EQ(103, Lookup(202)); + EXPECT_EQ(104, Lookup(203)); // Cleaning up all the handles cache_->Release(h201); @@ -576,37 +519,22 @@ TEST_P(CacheTest, EvictionPolicyRef) { } TEST_P(CacheTest, EvictEmptyCache) { - auto type = GetParam(); - // Insert item large than capacity to trigger eviction on empty cache. auto cache = NewCache(1, 0, false); - if (type == kLRU) { - ASSERT_OK(cache->Insert("foo", nullptr, &kDumbHelper, 10)); - } else { - ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, &kDumbHelper, 10)); - } + ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, &kDumbHelper, 10)); } TEST_P(CacheTest, EraseFromDeleter) { - auto type = GetParam(); - // Have deleter which will erase item from cache, which will re-enter // the cache at that point. 
std::shared_ptr cache = NewCache(10, 0, false); - std::string foo, bar; - const Cache::CacheItemHelper* erase_helper; - if (type == kLRU) { - foo = "foo"; - bar = "bar"; - erase_helper = &kEraseOnDeleteHelper1; - } else { - foo = EncodeKey(1234); - bar = EncodeKey(5678); - erase_helper = &kEraseOnDeleteHelper2; - } + std::string foo = EncodeKey(1234); + std::string bar = EncodeKey(5678); + + std::function erase_fn = [&]() { cache->Erase(foo); }; ASSERT_OK(cache->Insert(foo, nullptr, &kDumbHelper, 1)); - ASSERT_OK(cache->Insert(bar, cache.get(), erase_helper, 1)); + ASSERT_OK(cache->Insert(bar, &erase_fn, &kInvokeOnDeleteHelper, 1)); cache->Erase(bar); ASSERT_EQ(nullptr, cache->Lookup(foo)); @@ -714,10 +642,10 @@ using TypedHandle = SharedCache::TypedHandle; } // namespace TEST_P(CacheTest, SetCapacity) { - auto type = GetParam(); - if (type == kHyperClock) { + if (IsHyperClock()) { + // TODO: update test & code for limited supoort ROCKSDB_GTEST_BYPASS( - "FastLRUCache and HyperClockCache don't support arbitrary capacity " + "HyperClockCache doesn't support arbitrary capacity " "adjustments."); return; } @@ -847,7 +775,7 @@ TEST_P(CacheTest, OverCapacity) { cache.Release(handles[i]); } - if (GetParam() == kHyperClock) { + if (IsHyperClock()) { // Make sure eviction is triggered. ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, &handles[0])); @@ -959,8 +887,7 @@ TEST_P(CacheTest, DefaultShardBits) { // Prevent excessive allocation (to save time & space) estimated_value_size_ = 100000; // Implementations use different minimum shard sizes - size_t min_shard_size = - (GetParam() == kHyperClock ? 32U * 1024U : 512U) * 1024U; + size_t min_shard_size = (IsHyperClock() ? 32U * 1024U : 512U) * 1024U; std::shared_ptr cache = NewCache(32U * min_shard_size); ShardedCacheBase* sc = dynamic_cast(cache.get()); @@ -992,9 +919,101 @@ TEST_P(CacheTest, GetChargeAndDeleter) { cache_->Release(h1); } +namespace { +bool AreTwoCacheKeysOrdered(Cache* cache) { + std::vector keys; + const auto callback = [&](const Slice& key, Cache::ObjectPtr /*value*/, + size_t /*charge*/, + const Cache::CacheItemHelper* /*helper*/) { + keys.push_back(key.ToString()); + }; + cache->ApplyToAllEntries(callback, /*opts*/ {}); + EXPECT_EQ(keys.size(), 2U); + EXPECT_NE(keys[0], keys[1]); + return keys[0] < keys[1]; +} +} // namespace + +TEST_P(CacheTest, CacheUniqueSeeds) { + // kQuasiRandomHashSeed should generate unique seeds (up to 2 billion before + // repeating) + UnorderedSet seeds_seen; + // Roughly sqrt(number of possible values) for a decent chance at detecting + // a random collision if it's possible (shouldn't be) + uint16_t kSamples = 20000; + seeds_seen.reserve(kSamples); + + // Hash seed should affect ordering of entries in the table, so we should + // have extremely high chance of seeing two entries ordered both ways. 
+ bool seen_forward_order = false; + bool seen_reverse_order = false; + + for (int i = 0; i < kSamples; ++i) { + auto cache = NewCache(2, [=](ShardedCacheOptions& opts) { + opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed; + opts.num_shard_bits = 0; + opts.metadata_charge_policy = kDontChargeCacheMetadata; + }); + auto val = cache->GetHashSeed(); + ASSERT_TRUE(seeds_seen.insert(val).second); + + ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1)); + ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1)); + + if (AreTwoCacheKeysOrdered(cache.get())) { + seen_forward_order = true; + } else { + seen_reverse_order = true; + } + } + + ASSERT_TRUE(seen_forward_order); + ASSERT_TRUE(seen_reverse_order); +} + +TEST_P(CacheTest, CacheHostSeed) { + // kHostHashSeed should generate a consistent seed within this process + // (and other processes on the same host, but not unit testing that). + // And we should be able to use that chosen seed as an explicit option + // (for debugging). + // And we should verify consistent ordering of entries. + uint32_t expected_seed = 0; + bool expected_order = false; + // 10 iterations -> chance of a random seed falsely appearing consistent + // should be low, just 1 in 2^9. + for (int i = 0; i < 10; ++i) { + auto cache = NewCache(2, [=](ShardedCacheOptions& opts) { + if (i != 5) { + opts.hash_seed = LRUCacheOptions::kHostHashSeed; + } else { + // Can be used as explicit seed + opts.hash_seed = static_cast(expected_seed); + ASSERT_GE(opts.hash_seed, 0); + } + opts.num_shard_bits = 0; + opts.metadata_charge_policy = kDontChargeCacheMetadata; + }); + ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1)); + ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1)); + uint32_t val = cache->GetHashSeed(); + bool order = AreTwoCacheKeysOrdered(cache.get()); + if (i != 0) { + ASSERT_EQ(val, expected_seed); + ASSERT_EQ(order, expected_order); + } else { + expected_seed = val; + expected_order = order; + } + } + // Printed for reference in case it's needed to reproduce other unit test + // failures on another host + fprintf(stderr, "kHostHashSeed -> %u\n", (unsigned)expected_seed); +} + INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest, - testing::Values(kLRU, kHyperClock)); -INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU)); + secondary_cache_test_util::GetTestingCacheTypes()); +INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, + testing::Values(secondary_cache_test_util::kLRU)); } // namespace ROCKSDB_NAMESPACE diff --git a/cache/charged_cache.cc b/cache/charged_cache.cc index 3c32fc9611a8..6a21bacfbc0d 100644 --- a/cache/charged_cache.cc +++ b/cache/charged_cache.cc @@ -11,7 +11,7 @@ namespace ROCKSDB_NAMESPACE { ChargedCache::ChargedCache(std::shared_ptr cache, std::shared_ptr block_cache) - : cache_(cache), + : CacheWrapper(cache), cache_res_mgr_(std::make_shared( std::make_shared< CacheReservationManagerImpl>( @@ -19,14 +19,16 @@ ChargedCache::ChargedCache(std::shared_ptr cache, Status ChargedCache::Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, size_t charge, - Handle** handle, Priority priority) { - Status s = cache_->Insert(key, obj, helper, charge, handle, priority); + Handle** handle, Priority priority, + const Slice& compressed_val, CompressionType type) { + Status s = target_->Insert(key, obj, helper, charge, handle, priority, + compressed_val, type); if (s.ok()) { // Insert may cause the cache entry eviction if the cache is 
full. So we // directly call the reservation manager to update the total memory used // in the cache. assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()) .PermitUncheckedError(); } return s; @@ -35,25 +37,33 @@ Status ChargedCache::Insert(const Slice& key, ObjectPtr obj, Cache::Handle* ChargedCache::Lookup(const Slice& key, const CacheItemHelper* helper, CreateContext* create_context, - Priority priority, bool wait, - Statistics* stats) { - auto handle = - cache_->Lookup(key, helper, create_context, priority, wait, stats); + Priority priority, Statistics* stats) { + auto handle = target_->Lookup(key, helper, create_context, priority, stats); // Lookup may promote the KV pair from the secondary cache to the primary // cache. So we directly call the reservation manager to update the total // memory used in the cache. if (helper && helper->create_cb) { assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()) .PermitUncheckedError(); } return handle; } +void ChargedCache::WaitAll(AsyncLookupHandle* async_handles, size_t count) { + target_->WaitAll(async_handles, count); + // In case of any promotions. Although some could finish by return of + // StartAsyncLookup, Wait/WaitAll will generally be used, so simpler to + // update here. + assert(cache_res_mgr_); + cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()) + .PermitUncheckedError(); +} + bool ChargedCache::Release(Cache::Handle* handle, bool useful, bool erase_if_last_ref) { - size_t memory_used_delta = cache_->GetUsage(handle); - bool erased = cache_->Release(handle, useful, erase_if_last_ref); + size_t memory_used_delta = target_->GetUsage(handle); + bool erased = target_->Release(handle, useful, erase_if_last_ref); if (erased) { assert(cache_res_mgr_); cache_res_mgr_ @@ -64,8 +74,8 @@ bool ChargedCache::Release(Cache::Handle* handle, bool useful, } bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) { - size_t memory_used_delta = cache_->GetUsage(handle); - bool erased = cache_->Release(handle, erase_if_last_ref); + size_t memory_used_delta = target_->GetUsage(handle); + bool erased = target_->Release(handle, erase_if_last_ref); if (erased) { assert(cache_res_mgr_); cache_res_mgr_ @@ -76,25 +86,25 @@ bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) { } void ChargedCache::Erase(const Slice& key) { - cache_->Erase(key); + target_->Erase(key); assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()) .PermitUncheckedError(); } void ChargedCache::EraseUnRefEntries() { - cache_->EraseUnRefEntries(); + target_->EraseUnRefEntries(); assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()) .PermitUncheckedError(); } void ChargedCache::SetCapacity(size_t capacity) { - cache_->SetCapacity(capacity); + target_->SetCapacity(capacity); // SetCapacity can result in evictions when the cache capacity is decreased, // so we would want to update the cache reservation here as well. 
assert(cache_res_mgr_); - cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage()) + cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()) .PermitUncheckedError(); } diff --git a/cache/charged_cache.h b/cache/charged_cache.h index 4bbb6675962a..a59c178abe45 100644 --- a/cache/charged_cache.h +++ b/cache/charged_cache.h @@ -8,7 +8,7 @@ #include #include "port/port.h" -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" namespace ROCKSDB_NAMESPACE { @@ -17,21 +17,24 @@ class ConcurrentCacheReservationManager; // A cache interface which wraps around another cache and takes care of // reserving space in block cache towards a single global memory limit, and // forwards all the calls to the underlying cache. -class ChargedCache : public Cache { +class ChargedCache : public CacheWrapper { public: ChargedCache(std::shared_ptr cache, std::shared_ptr block_cache); - ~ChargedCache() override = default; - Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, - size_t charge, Handle** handle = nullptr, - Priority priority = Priority::LOW) override; + Status Insert( + const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW, const Slice& compressed_val = Slice(), + CompressionType type = CompressionType::kNoCompression) override; Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper, CreateContext* create_context, - Priority priority = Priority::LOW, bool wait = true, + Priority priority = Priority::LOW, Statistics* stats = nullptr) override; + void WaitAll(AsyncLookupHandle* async_handles, size_t count) override; + bool Release(Cache::Handle* handle, bool useful, bool erase_if_last_ref = false) override; bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override; @@ -42,66 +45,9 @@ class ChargedCache : public Cache { static const char* kClassName() { return "ChargedCache"; } const char* Name() const override { return kClassName(); } - uint64_t NewId() override { return cache_->NewId(); } - void SetCapacity(size_t capacity) override; - void SetStrictCapacityLimit(bool strict_capacity_limit) override { - cache_->SetStrictCapacityLimit(strict_capacity_limit); - } - - bool HasStrictCapacityLimit() const override { - return cache_->HasStrictCapacityLimit(); - } - - ObjectPtr Value(Cache::Handle* handle) override { - return cache_->Value(handle); - } - - bool IsReady(Cache::Handle* handle) override { - return cache_->IsReady(handle); - } - - void Wait(Cache::Handle* handle) override { cache_->Wait(handle); } - - void WaitAll(std::vector& handles) override { - cache_->WaitAll(handles); - } - - bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); } - - size_t GetCapacity() const override { return cache_->GetCapacity(); } - - size_t GetUsage() const override { return cache_->GetUsage(); } - - size_t GetUsage(Cache::Handle* handle) const override { - return cache_->GetUsage(handle); - } - - size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } - - size_t GetCharge(Cache::Handle* handle) const override { - return cache_->GetCharge(handle); - } - - const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { - return cache_->GetCacheItemHelper(handle); - } - - void ApplyToAllEntries( - const std::function& callback, - const Cache::ApplyToAllEntriesOptions& opts) override { - cache_->ApplyToAllEntries(callback, opts); - } - - std::string GetPrintableOptions() const override { - return 
cache_->GetPrintableOptions(); - } - - void DisownData() override { return cache_->DisownData(); } - - inline Cache* GetCache() const { return cache_.get(); } + inline Cache* GetCache() const { return target_.get(); } inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager() const { @@ -109,7 +55,6 @@ class ChargedCache : public Cache { } private: - std::shared_ptr cache_; std::shared_ptr cache_res_mgr_; }; diff --git a/cache/clock_cache.cc b/cache/clock_cache.cc index 9476dba7a8da..fd330d90d832 100644 --- a/cache/clock_cache.cc +++ b/cache/clock_cache.cc @@ -9,15 +9,26 @@ #include "cache/clock_cache.h" +#include +#include +#include #include +#include +#include +#include #include #include +#include +#include +#include #include "cache/cache_key.h" +#include "cache/secondary_cache_adapter.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/lang.h" +#include "rocksdb/env.h" #include "util/hash.h" #include "util/math.h" #include "util/random.h" @@ -50,6 +61,17 @@ inline uint64_t GetInitialCountdown(Cache::Priority priority) { } } +inline void MarkEmpty(ClockHandle& h) { +#ifndef NDEBUG + // Mark slot as empty, with assertion + uint64_t meta = h.meta.Exchange(0); + assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); +#else + // Mark slot as empty + h.meta.Store(0); +#endif +} + inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) { // NOTE: in theory there's more room for parallelism if we copy the handle // data and delay actions like this until after marking the entry as empty, @@ -57,33 +79,54 @@ inline void FreeDataMarkEmpty(ClockHandle& h, MemoryAllocator* allocator) { // of data. h.FreeData(allocator); -#ifndef NDEBUG - // Mark slot as empty, with assertion - uint64_t meta = h.meta.exchange(0, std::memory_order_release); - assert(meta >> ClockHandle::kStateShift == ClockHandle::kStateConstruction); -#else - // Mark slot as empty - h.meta.store(0, std::memory_order_release); -#endif + MarkEmpty(h); +} + +// Called to undo the effect of referencing an entry for internal purposes, +// so it should not be marked as having been used. +inline void Unref(const ClockHandle& h, uint64_t count = 1) { + // Pretend we never took the reference + // WART: there's a tiny chance we release last ref to invisible + // entry here. If that happens, we let eviction take care of it. + uint64_t old_meta = h.meta.FetchSub(ClockHandle::kAcquireIncrement * count); + assert(GetRefcount(old_meta) != 0); + (void)old_meta; } -inline bool ClockUpdate(ClockHandle& h) { - uint64_t meta = h.meta.load(std::memory_order_relaxed); +inline bool ClockUpdate(ClockHandle& h, bool* purgeable = nullptr) { + uint64_t meta; + if (purgeable) { + assert(*purgeable == false); + // In AutoHCC, our eviction process follows the chain structure, so we + // should ensure that we see the latest state of each entry, at least for + // assertion checking. + meta = h.meta.Load(); + } else { + // In FixedHCC, our eviction process is a simple iteration without regard + // to probing order, displacements, etc., so it doesn't matter if we see + // somewhat stale data. 
+ meta = h.meta.LoadRelaxed(); + } + if (((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit) == + 0) { + // Only clock update Shareable entries + if (purgeable) { + *purgeable = true; + // AutoHCC only: make sure we only attempt to update non-empty slots + assert((meta >> ClockHandle::kStateShift) & + ClockHandle::kStateOccupiedBit); + } + return false; + } uint64_t acquire_count = (meta >> ClockHandle::kAcquireCounterShift) & ClockHandle::kCounterMask; uint64_t release_count = (meta >> ClockHandle::kReleaseCounterShift) & ClockHandle::kCounterMask; - // fprintf(stderr, "ClockUpdate @ %p: %lu %lu %u\n", &h, acquire_count, - // release_count, (unsigned)(meta >> ClockHandle::kStateShift)); if (acquire_count != release_count) { // Only clock update entries with no outstanding refs return false; } - if (!((meta >> ClockHandle::kStateShift) & ClockHandle::kStateShareableBit)) { - // Only clock update Shareable entries - return false; - } if ((meta >> ClockHandle::kStateShift == ClockHandle::kStateVisible) && acquire_count > 0) { // Decrement clock @@ -93,17 +136,17 @@ inline bool ClockUpdate(ClockHandle& h) { // not aggressively uint64_t new_meta = (uint64_t{ClockHandle::kStateVisible} << ClockHandle::kStateShift) | + (meta & ClockHandle::kHitBitMask) | (new_count << ClockHandle::kReleaseCounterShift) | (new_count << ClockHandle::kAcquireCounterShift); - h.meta.compare_exchange_strong(meta, new_meta, std::memory_order_relaxed); + h.meta.CasStrongRelaxed(meta, new_meta); return false; } // Otherwise, remove entry (either unreferenced invisible or // unreferenced and expired visible). - if (h.meta.compare_exchange_strong( - meta, - uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, - std::memory_order_acquire)) { + if (h.meta.CasStrong(meta, (uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift) | + (meta & ClockHandle::kHitBitMask))) { // Took ownership. return true; } else { @@ -113,70 +156,6 @@ inline bool ClockUpdate(ClockHandle& h) { } } -} // namespace - -void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const { - if (helper->del_cb) { - helper->del_cb(value, allocator); - } -} - -HyperClockTable::HyperClockTable( - size_t capacity, bool /*strict_capacity_limit*/, - CacheMetadataChargePolicy metadata_charge_policy, - MemoryAllocator* allocator, const Opts& opts) - : length_bits_(CalcHashBits(capacity, opts.estimated_value_size, - metadata_charge_policy)), - length_bits_mask_((size_t{1} << length_bits_) - 1), - occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * - kStrictLoadFactor)), - array_(new HandleImpl[size_t{1} << length_bits_]), - allocator_(allocator) { - if (metadata_charge_policy == - CacheMetadataChargePolicy::kFullChargeCacheMetadata) { - usage_ += size_t{GetTableSize()} * sizeof(HandleImpl); - } - - static_assert(sizeof(HandleImpl) == 64U, - "Expecting size / alignment with common cache line size"); -} - -HyperClockTable::~HyperClockTable() { - // Assumes there are no references or active operations on any slot/element - // in the table. 
- for (size_t i = 0; i < GetTableSize(); i++) { - HandleImpl& h = array_[i]; - switch (h.meta >> ClockHandle::kStateShift) { - case ClockHandle::kStateEmpty: - // noop - break; - case ClockHandle::kStateInvisible: // rare but possible - case ClockHandle::kStateVisible: - assert(GetRefcount(h.meta) == 0); - h.FreeData(allocator_); -#ifndef NDEBUG - Rollback(h.hashed_key, &h); - ReclaimEntryUsage(h.GetTotalCharge()); -#endif - break; - // otherwise - default: - assert(false); - break; - } - } - -#ifndef NDEBUG - for (size_t i = 0; i < GetTableSize(); i++) { - assert(array_[i].displacements.load() == 0); - } -#endif - - assert(usage_.load() == 0 || - usage_.load() == size_t{GetTableSize()} * sizeof(HandleImpl)); - assert(occupancy_ == 0); -} - // If an entry doesn't receive clock updates but is repeatedly referenced & // released, the acquire and release counters could overflow without some // intervention. This is that intervention, which should be inexpensive @@ -232,7 +211,7 @@ HyperClockTable::~HyperClockTable() { // motivates only checking for release counter in high state, not both in high // state.) inline void CorrectNearOverflow(uint64_t old_meta, - std::atomic& meta) { + AcqRelAtomic& meta) { // We clear both top-most counter bits at the same time. constexpr uint64_t kCounterTopBit = uint64_t{1} << (ClockHandle::kCounterNumBits - 1); @@ -246,28 +225,216 @@ inline void CorrectNearOverflow(uint64_t old_meta, << ClockHandle::kReleaseCounterShift; if (UNLIKELY(old_meta & kCheckBits)) { - meta.fetch_and(~kClearBits, std::memory_order_relaxed); + meta.FetchAndRelaxed(~kClearBits); + } +} + +inline bool BeginSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, + uint64_t initial_countdown, bool* already_matches) { + assert(*already_matches == false); + // Optimistically transition the slot from "empty" to + // "under construction" (no effect on other states) + uint64_t old_meta = h.meta.FetchOr(uint64_t{ClockHandle::kStateOccupiedBit} + << ClockHandle::kStateShift); + uint64_t old_state = old_meta >> ClockHandle::kStateShift; + + if (old_state == ClockHandle::kStateEmpty) { + // We've started inserting into an available slot, and taken + // ownership. + return true; + } else if (old_state != ClockHandle::kStateVisible) { + // Slot not usable / touchable now + return false; + } + // Existing, visible entry, which might be a match. + // But first, we need to acquire a ref to read it. In fact, number of + // refs for initial countdown, so that we boost the clock state if + // this is a match. + old_meta = + h.meta.FetchAdd(ClockHandle::kAcquireIncrement * initial_countdown); + // Like Lookup + if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { + // Acquired a read reference + if (h.hashed_key == proto.hashed_key) { + // Match. Release in a way that boosts the clock state + old_meta = + h.meta.FetchAdd(ClockHandle::kReleaseIncrement * initial_countdown); + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h.meta); + // Insert detached instead (only if return handle needed) + *already_matches = true; + return false; + } else { + // Mismatch. + Unref(h, initial_countdown); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + Unref(h, initial_countdown); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. + // Slot not usable / touchable now. 
+ } + return false; +} + +inline void FinishSlotInsert(const ClockHandleBasicData& proto, ClockHandle& h, + uint64_t initial_countdown, bool keep_ref) { + // Save data fields + ClockHandleBasicData* h_alias = &h; + *h_alias = proto; + + // Transition from "under construction" state to "visible" state + uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} + << ClockHandle::kStateShift; + + // Maybe with an outstanding reference + new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; + new_meta |= (initial_countdown - keep_ref) + << ClockHandle::kReleaseCounterShift; + +#ifndef NDEBUG + // Save the state transition, with assertion + uint64_t old_meta = h.meta.Exchange(new_meta); + assert(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateConstruction); +#else + // Save the state transition + h.meta.Store(new_meta); +#endif +} + +bool TryInsert(const ClockHandleBasicData& proto, ClockHandle& h, + uint64_t initial_countdown, bool keep_ref, + bool* already_matches) { + bool b = BeginSlotInsert(proto, h, initial_countdown, already_matches); + if (b) { + FinishSlotInsert(proto, h, initial_countdown, keep_ref); + } + return b; +} + +// Func must be const HandleImpl& -> void callable +template +void ConstApplyToEntriesRange(const Func& func, const HandleImpl* begin, + const HandleImpl* end, + bool apply_if_will_be_deleted) { + uint64_t check_state_mask = ClockHandle::kStateShareableBit; + if (!apply_if_will_be_deleted) { + check_state_mask |= ClockHandle::kStateVisibleBit; + } + + for (const HandleImpl* h = begin; h < end; ++h) { + // Note: to avoid using compare_exchange, we have to be extra careful. + uint64_t old_meta = h->meta.LoadRelaxed(); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + // Increment acquire counter. Note: it's possible that the entry has + // completely changed since we loaded old_meta, but incrementing acquire + // count is always safe. (Similar to optimistic Lookup here.) + old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); + // Check whether we actually acquired a reference. + if ((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit) { + // Apply func if appropriate + if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { + func(*h); + } + // Pretend we never took the reference + Unref(*h); + // No net change, so don't need to check for overflow + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. 
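// [Editor's note] Illustrative sketch only, not part of this change. It
// distills the optimistic "increment the acquire counter first, verify the
// state afterwards" pattern used by Lookup and ConstApplyToEntriesRange
// above. Bit positions and increments here are assumptions for the example,
// not the real ClockHandle layout.
#include <atomic>
#include <cstdint>

namespace optimistic_ref_sketch {
constexpr int kStateShift = 61;                 // assumed
constexpr uint64_t kStateShareableBit = 0b010;  // assumed
constexpr uint64_t kAcquireIncrement = 1;       // assumed

// Returns true if a read reference was actually acquired.
inline bool TryAcquireRef(std::atomic<uint64_t>& meta) {
  // Bump the acquire counter without first checking the state. For
  // non-shareable states the counter bits carry no meaning, so the bump
  // is harmless there.
  uint64_t old_meta =
      meta.fetch_add(kAcquireIncrement, std::memory_order_acquire);
  if ((old_meta >> kStateShift) & kStateShareableBit) {
    return true;  // the bump now counts as a real read reference
  }
  // Not shareable: deliberately do NOT undo the bump. Without holding a
  // reference, the slot could have been wiped and reused in the meantime,
  // and a late decrement could corrupt an unrelated entry's counter.
  return false;
}
}  // namespace optimistic_ref_sketch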
+ } + } } } -inline Status HyperClockTable::ChargeUsageMaybeEvictStrict( - size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { +} // namespace + +void ClockHandleBasicData::FreeData(MemoryAllocator* allocator) const { + if (helper->del_cb) { + helper->del_cb(value, allocator); + } +} + +template +HandleImpl* BaseClockTable::StandaloneInsert( + const ClockHandleBasicData& proto) { + // Heap allocated separate from table + HandleImpl* h = new HandleImpl(); + ClockHandleBasicData* h_alias = h; + *h_alias = proto; + h->SetStandalone(); + // Single reference (standalone entries only created if returning a refed + // Handle back to user) + uint64_t meta = uint64_t{ClockHandle::kStateInvisible} + << ClockHandle::kStateShift; + meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; + h->meta.Store(meta); + // Keep track of how much of usage is standalone + standalone_usage_.FetchAddRelaxed(proto.GetTotalCharge()); + return h; +} + +template +typename Table::HandleImpl* BaseClockTable::CreateStandalone( + ClockHandleBasicData& proto, size_t capacity, bool strict_capacity_limit, + bool allow_uncharged) { + Table& derived = static_cast(*this); + typename Table::InsertState state; + derived.StartInsert(state); + + const size_t total_charge = proto.GetTotalCharge(); + if (strict_capacity_limit) { + Status s = ChargeUsageMaybeEvictStrict( + total_charge, capacity, + /*need_evict_for_occupancy=*/false, state); + if (!s.ok()) { + if (allow_uncharged) { + proto.total_charge = 0; + } else { + return nullptr; + } + } + } else { + // Case strict_capacity_limit == false + bool success = ChargeUsageMaybeEvictNonStrict
( + total_charge, capacity, + /*need_evict_for_occupancy=*/false, state); + if (!success) { + // Force the issue + usage_.FetchAddRelaxed(total_charge); + } + } + + return StandaloneInsert(proto); +} + +template +Status BaseClockTable::ChargeUsageMaybeEvictStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy, + typename Table::InsertState& state) { if (total_charge > capacity) { return Status::MemoryLimit( "Cache entry too large for a single cache shard: " + std::to_string(total_charge) + " > " + std::to_string(capacity)); } // Grab any available capacity, and free up any more required. - size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t old_usage = usage_.LoadRelaxed(); size_t new_usage; - if (LIKELY(old_usage != capacity)) { - do { - new_usage = std::min(capacity, old_usage + total_charge); - } while (!usage_.compare_exchange_weak(old_usage, new_usage, - std::memory_order_relaxed)); - } else { - new_usage = old_usage; - } + do { + new_usage = std::min(capacity, old_usage + total_charge); + if (new_usage == old_usage) { + // No change needed + break; + } + } while (!usage_.CasWeakRelaxed(old_usage, new_usage)); // How much do we need to evict then? size_t need_evict_charge = old_usage + total_charge - new_usage; size_t request_evict_charge = need_evict_charge; @@ -276,21 +443,18 @@ inline Status HyperClockTable::ChargeUsageMaybeEvictStrict( request_evict_charge = 1; } if (request_evict_charge > 0) { - size_t evicted_charge = 0; - size_t evicted_count = 0; - Evict(request_evict_charge, &evicted_charge, &evicted_count); - occupancy_.fetch_sub(evicted_count, std::memory_order_release); - if (LIKELY(evicted_charge > need_evict_charge)) { - assert(evicted_count > 0); + EvictionData data; + static_cast(this)->Evict(request_evict_charge, state, &data); + occupancy_.FetchSub(data.freed_count); + if (LIKELY(data.freed_charge > need_evict_charge)) { + assert(data.freed_count > 0); // Evicted more than enough - usage_.fetch_sub(evicted_charge - need_evict_charge, - std::memory_order_relaxed); - } else if (evicted_charge < need_evict_charge || - (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0)) { + usage_.FetchSubRelaxed(data.freed_charge - need_evict_charge); + } else if (data.freed_charge < need_evict_charge || + (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0)) { // Roll back to old usage minus evicted - usage_.fetch_sub(evicted_charge + (new_usage - old_usage), - std::memory_order_relaxed); - if (evicted_charge < need_evict_charge) { + usage_.FetchSubRelaxed(data.freed_charge + (new_usage - old_usage)); + if (data.freed_charge < need_evict_charge) { return Status::MemoryLimit( "Insert failed because unable to evict entries to stay within " "capacity limit."); @@ -302,13 +466,15 @@ inline Status HyperClockTable::ChargeUsageMaybeEvictStrict( } // If we needed to evict something and we are proceeding, we must have // evicted something. - assert(evicted_count > 0); + assert(data.freed_count > 0); } return Status::OK(); } -inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict( - size_t total_charge, size_t capacity, bool need_evict_for_occupancy) { +template +inline bool BaseClockTable::ChargeUsageMaybeEvictNonStrict( + size_t total_charge, size_t capacity, bool need_evict_for_occupancy, + typename Table::InsertState& state) { // For simplicity, we consider that either the cache can accept the insert // with no evictions, or we must evict enough to make (at least) enough // space. 
It could lead to unnecessary failures or excessive evictions in @@ -318,7 +484,7 @@ inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict( // charge. Thus, we should evict some extra if it's not a signifcant // portion of the shard capacity. This can have the side benefit of // involving fewer threads in eviction. - size_t old_usage = usage_.load(std::memory_order_relaxed); + size_t old_usage = usage_.LoadRelaxed(); size_t need_evict_charge; // NOTE: if total_charge > old_usage, there isn't yet enough to evict // `total_charge` amount. Even if we only try to evict `old_usage` amount, @@ -342,94 +508,99 @@ inline bool HyperClockTable::ChargeUsageMaybeEvictNonStrict( // deal with occupancy need_evict_charge = 1; } - size_t evicted_charge = 0; - size_t evicted_count = 0; + EvictionData data; if (need_evict_charge > 0) { - Evict(need_evict_charge, &evicted_charge, &evicted_count); + static_cast(this)->Evict(need_evict_charge, state, &data); // Deal with potential occupancy deficit - if (UNLIKELY(need_evict_for_occupancy) && evicted_count == 0) { - assert(evicted_charge == 0); + if (UNLIKELY(need_evict_for_occupancy) && data.freed_count == 0) { + assert(data.freed_charge == 0); // Can't meet occupancy requirement return false; } else { // Update occupancy for evictions - occupancy_.fetch_sub(evicted_count, std::memory_order_release); + occupancy_.FetchSub(data.freed_count); } } // Track new usage even if we weren't able to evict enough - usage_.fetch_add(total_charge - evicted_charge, std::memory_order_relaxed); + usage_.FetchAddRelaxed(total_charge - data.freed_charge); // No underflow - assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); + assert(usage_.LoadRelaxed() < SIZE_MAX / 2); // Success return true; } -inline HyperClockTable::HandleImpl* HyperClockTable::DetachedInsert( - const ClockHandleBasicData& proto) { - // Heap allocated separate from table - HandleImpl* h = new HandleImpl(); - ClockHandleBasicData* h_alias = h; - *h_alias = proto; - h->SetDetached(); - // Single reference (detached entries only created if returning a refed - // Handle back to user) - uint64_t meta = uint64_t{ClockHandle::kStateInvisible} - << ClockHandle::kStateShift; - meta |= uint64_t{1} << ClockHandle::kAcquireCounterShift; - h->meta.store(meta, std::memory_order_release); - // Keep track of how much of usage is detached - detached_usage_.fetch_add(proto.GetTotalCharge(), std::memory_order_relaxed); - return h; +void BaseClockTable::TrackAndReleaseEvictedEntry( + ClockHandle* h, BaseClockTable::EvictionData* data) { + data->freed_charge += h->GetTotalCharge(); + data->freed_count += 1; + + bool took_value_ownership = false; + if (eviction_callback_) { + // For key reconstructed from hash + UniqueId64x2 unhashed; + took_value_ownership = + eviction_callback_(ClockCacheShard::ReverseHash( + h->GetHash(), &unhashed, hash_seed_), + reinterpret_cast(h), + h->meta.LoadRelaxed() & ClockHandle::kHitBitMask); + } + if (!took_value_ownership) { + h->FreeData(allocator_); + } + MarkEmpty(*h); } -Status HyperClockTable::Insert(const ClockHandleBasicData& proto, - HandleImpl** handle, Cache::Priority priority, - size_t capacity, bool strict_capacity_limit) { +template +Status BaseClockTable::Insert(const ClockHandleBasicData& proto, + typename Table::HandleImpl** handle, + Cache::Priority priority, size_t capacity, + bool strict_capacity_limit) { + using HandleImpl = typename Table::HandleImpl; + Table& derived = static_cast(*this); + + typename Table::InsertState state; + derived.StartInsert(state); + 
// Do we have the available occupancy? Optimistically assume we do // and deal with it if we don't. - size_t old_occupancy = occupancy_.fetch_add(1, std::memory_order_acquire); - auto revert_occupancy_fn = [&]() { - occupancy_.fetch_sub(1, std::memory_order_relaxed); - }; + size_t old_occupancy = occupancy_.FetchAdd(1); // Whether we over-committed and need an eviction to make up for it - bool need_evict_for_occupancy = old_occupancy >= occupancy_limit_; + bool need_evict_for_occupancy = + !derived.GrowIfNeeded(old_occupancy + 1, state); // Usage/capacity handling is somewhat different depending on // strict_capacity_limit, but mostly pessimistic. - bool use_detached_insert = false; + bool use_standalone_insert = false; const size_t total_charge = proto.GetTotalCharge(); if (strict_capacity_limit) { - Status s = ChargeUsageMaybeEvictStrict(total_charge, capacity, - need_evict_for_occupancy); + Status s = ChargeUsageMaybeEvictStrict
( + total_charge, capacity, need_evict_for_occupancy, state); if (!s.ok()) { - revert_occupancy_fn(); + // Revert occupancy + occupancy_.FetchSubRelaxed(1); return s; } } else { // Case strict_capacity_limit == false - bool success = ChargeUsageMaybeEvictNonStrict(total_charge, capacity, - need_evict_for_occupancy); + bool success = ChargeUsageMaybeEvictNonStrict
( + total_charge, capacity, need_evict_for_occupancy, state); if (!success) { - revert_occupancy_fn(); + // Revert occupancy + occupancy_.FetchSubRelaxed(1); if (handle == nullptr) { // Don't insert the entry but still return ok, as if the entry // inserted into cache and evicted immediately. proto.FreeData(allocator_); return Status::OK(); } else { - // Need to track usage of fallback detached insert - usage_.fetch_add(total_charge, std::memory_order_relaxed); - use_detached_insert = true; + // Need to track usage of fallback standalone insert + usage_.FetchAddRelaxed(total_charge); + use_standalone_insert = true; } } } - auto revert_usage_fn = [&]() { - usage_.fetch_sub(total_charge, std::memory_order_relaxed); - // No underflow - assert(usage_.load(std::memory_order_relaxed) < SIZE_MAX / 2); - }; - if (!use_detached_insert) { + if (!use_standalone_insert) { // Attempt a table insert, but abort if we find an existing entry for the // key. If we were to overwrite old entries, we would either // * Have to gain ownership over an existing entry to overwrite it, which @@ -441,221 +612,268 @@ Status HyperClockTable::Insert(const ClockHandleBasicData& proto, uint64_t initial_countdown = GetInitialCountdown(priority); assert(initial_countdown > 0); - size_t probe = 0; - HandleImpl* e = FindSlot( - proto.hashed_key, - [&](HandleImpl* h) { - // Optimistically transition the slot from "empty" to - // "under construction" (no effect on other states) - uint64_t old_meta = - h->meta.fetch_or(uint64_t{ClockHandle::kStateOccupiedBit} - << ClockHandle::kStateShift, - std::memory_order_acq_rel); - uint64_t old_state = old_meta >> ClockHandle::kStateShift; - - if (old_state == ClockHandle::kStateEmpty) { - // We've started inserting into an available slot, and taken - // ownership Save data fields - ClockHandleBasicData* h_alias = h; - *h_alias = proto; - - // Transition from "under construction" state to "visible" state - uint64_t new_meta = uint64_t{ClockHandle::kStateVisible} - << ClockHandle::kStateShift; - - // Maybe with an outstanding reference - new_meta |= initial_countdown << ClockHandle::kAcquireCounterShift; - new_meta |= (initial_countdown - (handle != nullptr)) - << ClockHandle::kReleaseCounterShift; + HandleImpl* e = + derived.DoInsert(proto, initial_countdown, handle != nullptr, state); -#ifndef NDEBUG - // Save the state transition, with assertion - old_meta = h->meta.exchange(new_meta, std::memory_order_release); - assert(old_meta >> ClockHandle::kStateShift == - ClockHandle::kStateConstruction); -#else - // Save the state transition - h->meta.store(new_meta, std::memory_order_release); -#endif - return true; - } else if (old_state != ClockHandle::kStateVisible) { - // Slot not usable / touchable now - return false; - } - // Existing, visible entry, which might be a match. - // But first, we need to acquire a ref to read it. In fact, number of - // refs for initial countdown, so that we boost the clock state if - // this is a match. - old_meta = h->meta.fetch_add( - ClockHandle::kAcquireIncrement * initial_countdown, - std::memory_order_acq_rel); - // Like Lookup - if ((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateVisible) { - // Acquired a read reference - if (h->hashed_key == proto.hashed_key) { - // Match. 
Release in a way that boosts the clock state - old_meta = h->meta.fetch_add( - ClockHandle::kReleaseIncrement * initial_countdown, - std::memory_order_acq_rel); - // Correct for possible (but rare) overflow - CorrectNearOverflow(old_meta, h->meta); - // Insert detached instead (only if return handle needed) - use_detached_insert = true; - return true; - } else { - // Mismatch. Pretend we never took the reference - old_meta = h->meta.fetch_sub( - ClockHandle::kAcquireIncrement * initial_countdown, - std::memory_order_acq_rel); - } - } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateInvisible)) { - // Pretend we never took the reference - // WART: there's a tiny chance we release last ref to invisible - // entry here. If that happens, we let eviction take care of it. - old_meta = h->meta.fetch_sub( - ClockHandle::kAcquireIncrement * initial_countdown, - std::memory_order_acq_rel); - } else { - // For other states, incrementing the acquire counter has no effect - // so we don't need to undo it. - // Slot not usable / touchable now. - } - (void)old_meta; - return false; - }, - [&](HandleImpl* /*h*/) { return false; }, - [&](HandleImpl* h) { - h->displacements.fetch_add(1, std::memory_order_relaxed); - }, - probe); - if (e == nullptr) { - // Occupancy check and never abort FindSlot above should generally - // prevent this, except it's theoretically possible for other threads - // to evict and replace entries in the right order to hit every slot - // when it is populated. Assuming random hashing, the chance of that - // should be no higher than pow(kStrictLoadFactor, n) for n slots. - // That should be infeasible for roughly n >= 256, so if this assertion - // fails, that suggests something is going wrong. - assert(GetTableSize() < 256); - use_detached_insert = true; - } - if (!use_detached_insert) { + if (e) { // Successfully inserted if (handle) { *handle = e; } return Status::OK(); } - // Roll back table insertion - Rollback(proto.hashed_key, e); - revert_occupancy_fn(); - // Maybe fall back on detached insert + // Not inserted + // Revert occupancy + occupancy_.FetchSubRelaxed(1); + // Maybe fall back on standalone insert if (handle == nullptr) { - revert_usage_fn(); + // Revert usage + usage_.FetchSubRelaxed(total_charge); + // No underflow + assert(usage_.LoadRelaxed() < SIZE_MAX / 2); // As if unrefed entry immdiately evicted proto.FreeData(allocator_); return Status::OK(); } + + use_standalone_insert = true; } - // Run detached insert - assert(use_detached_insert); + // Run standalone insert + assert(use_standalone_insert); - *handle = DetachedInsert(proto); + *handle = StandaloneInsert(proto); // The OkOverwritten status is used to count "redundant" insertions into // block cache. This implementation doesn't strictly check for redundant // insertions, but we instead are probably interested in how many insertions - // didn't go into the table (instead "detached"), which could be redundant - // Insert or some other reason (use_detached_insert reasons above). + // didn't go into the table (instead "standalone"), which could be redundant + // Insert or some other reason (use_standalone_insert reasons above). 
return Status::OkOverwritten(); } -HyperClockTable::HandleImpl* HyperClockTable::Lookup( - const UniqueId64x2& hashed_key) { - size_t probe = 0; - HandleImpl* e = FindSlot( - hashed_key, - [&](HandleImpl* h) { - // Mostly branch-free version (similar performance) - /* - uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, - std::memory_order_acquire); - bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; - bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; - bool match = (h->key == key) & visible; - h->meta.fetch_sub(static_cast(Shareable & !match) << - ClockHandle::kAcquireCounterShift, std::memory_order_release); return - match; - */ - // Optimistic lookup should pay off when the table is relatively - // sparse. - constexpr bool kOptimisticLookup = true; - uint64_t old_meta; - if (!kOptimisticLookup) { - old_meta = h->meta.load(std::memory_order_acquire); - if ((old_meta >> ClockHandle::kStateShift) != - ClockHandle::kStateVisible) { - return false; - } - } - // (Optimistically) increment acquire counter - old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, - std::memory_order_acquire); - // Check if it's an entry visible to lookups - if ((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateVisible) { - // Acquired a read reference - if (h->hashed_key == hashed_key) { - // Match - return true; - } else { - // Mismatch. Pretend we never took the reference - old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); - } - } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == - ClockHandle::kStateInvisible)) { - // Pretend we never took the reference - // WART: there's a tiny chance we release last ref to invisible - // entry here. If that happens, we let eviction take care of it. - old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); - } else { - // For other states, incrementing the acquire counter has no effect - // so we don't need to undo it. Furthermore, we cannot safely undo - // it because we did not acquire a read reference to lock the - // entry in a Shareable state. - } - (void)old_meta; - return false; - }, - [&](HandleImpl* h) { - return h->displacements.load(std::memory_order_relaxed) == 0; - }, - [&](HandleImpl* /*h*/) {}, probe); +void BaseClockTable::Ref(ClockHandle& h) { + // Increment acquire counter + uint64_t old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement); - return e; + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // Must have already had a reference + assert(GetRefcount(old_meta) > 0); + (void)old_meta; } -bool HyperClockTable::Release(HandleImpl* h, bool useful, - bool erase_if_last_ref) { - // In contrast with LRUCache's Release, this function won't delete the handle - // when the cache is above capacity and the reference is the last one. Space - // is only freed up by EvictFromClock (called by Insert when space is needed) - // and Erase. We do this to avoid an extra atomic read of the variable usage_. 
- +#ifndef NDEBUG +void BaseClockTable::TEST_RefN(ClockHandle& h, size_t n) { + // Increment acquire counter + uint64_t old_meta = h.meta.FetchAdd(n * ClockHandle::kAcquireIncrement); + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; +} + +void BaseClockTable::TEST_ReleaseNMinus1(ClockHandle* h, size_t n) { + assert(n > 0); + + // Like n-1 Releases, but assumes one more will happen in the caller to take + // care of anything like erasing an unreferenced, invisible entry. + uint64_t old_meta = + h->meta.FetchAdd((n - 1) * ClockHandle::kReleaseIncrement); + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + (void)old_meta; +} +#endif + +FixedHyperClockTable::FixedHyperClockTable( + size_t capacity, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, + const Opts& opts) + : BaseClockTable(metadata_charge_policy, allocator, eviction_callback, + hash_seed), + length_bits_(CalcHashBits(capacity, opts.estimated_value_size, + metadata_charge_policy)), + length_bits_mask_((size_t{1} << length_bits_) - 1), + occupancy_limit_(static_cast((uint64_t{1} << length_bits_) * + kStrictLoadFactor)), + array_(new HandleImpl[size_t{1} << length_bits_]) { + if (metadata_charge_policy == + CacheMetadataChargePolicy::kFullChargeCacheMetadata) { + usage_.FetchAddRelaxed(size_t{GetTableSize()} * sizeof(HandleImpl)); + } + + static_assert(sizeof(HandleImpl) == 64U, + "Expecting size / alignment with common cache line size"); +} + +FixedHyperClockTable::~FixedHyperClockTable() { + // Assumes there are no references or active operations on any slot/element + // in the table. + for (size_t i = 0; i < GetTableSize(); i++) { + HandleImpl& h = array_[i]; + switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) { + case ClockHandle::kStateEmpty: + // noop + break; + case ClockHandle::kStateInvisible: // rare but possible + case ClockHandle::kStateVisible: + assert(GetRefcount(h.meta.LoadRelaxed()) == 0); + h.FreeData(allocator_); +#ifndef NDEBUG + Rollback(h.hashed_key, &h); + ReclaimEntryUsage(h.GetTotalCharge()); +#endif + break; + // otherwise + default: + assert(false); + break; + } + } + +#ifndef NDEBUG + for (size_t i = 0; i < GetTableSize(); i++) { + assert(array_[i].displacements.LoadRelaxed() == 0); + } +#endif + + assert(usage_.LoadRelaxed() == 0 || + usage_.LoadRelaxed() == size_t{GetTableSize()} * sizeof(HandleImpl)); + assert(occupancy_.LoadRelaxed() == 0); +} + +void FixedHyperClockTable::StartInsert(InsertState&) {} + +bool FixedHyperClockTable::GrowIfNeeded(size_t new_occupancy, InsertState&) { + return new_occupancy <= occupancy_limit_; +} + +FixedHyperClockTable::HandleImpl* FixedHyperClockTable::DoInsert( + const ClockHandleBasicData& proto, uint64_t initial_countdown, + bool keep_ref, InsertState&) { + bool already_matches = false; + HandleImpl* e = FindSlot( + proto.hashed_key, + [&](HandleImpl* h) { + return TryInsert(proto, *h, initial_countdown, keep_ref, + &already_matches); + }, + [&](HandleImpl* h) { + if (already_matches) { + // Stop searching & roll back displacements + Rollback(proto.hashed_key, h); + return true; + } else { + // Keep going + return false; + } + }, + [&](HandleImpl* h, bool is_last) { + if (is_last) { + // Search is ending. 
Roll back displacements + Rollback(proto.hashed_key, h); + } else { + h->displacements.FetchAddRelaxed(1); + } + }); + if (already_matches) { + // Insertion skipped + return nullptr; + } + if (e != nullptr) { + // Successfully inserted + return e; + } + // Else, no available slot found. Occupancy check should generally prevent + // this, except it's theoretically possible for other threads to evict and + // replace entries in the right order to hit every slot when it is populated. + // Assuming random hashing, the chance of that should be no higher than + // pow(kStrictLoadFactor, n) for n slots. That should be infeasible for + // roughly n >= 256, so if this assertion fails, that suggests something is + // going wrong. + assert(GetTableSize() < 256); + return nullptr; +} + +FixedHyperClockTable::HandleImpl* FixedHyperClockTable::Lookup( + const UniqueId64x2& hashed_key) { + HandleImpl* e = FindSlot( + hashed_key, + [&](HandleImpl* h) { + // Mostly branch-free version (similar performance) + /* + uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement, + std::memory_order_acquire); + bool Shareable = (old_meta >> (ClockHandle::kStateShift + 1)) & 1U; + bool visible = (old_meta >> ClockHandle::kStateShift) & 1U; + bool match = (h->key == key) & visible; + h->meta.FetchSub(static_cast(Shareable & !match) << + ClockHandle::kAcquireCounterShift); return + match; + */ + // Optimistic lookup should pay off when the table is relatively + // sparse. + constexpr bool kOptimisticLookup = true; + uint64_t old_meta; + if (!kOptimisticLookup) { + old_meta = h->meta.Load(); + if ((old_meta >> ClockHandle::kStateShift) != + ClockHandle::kStateVisible) { + return false; + } + } + // (Optimistically) increment acquire counter + old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); + // Check if it's an entry visible to lookups + if ((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateVisible) { + // Acquired a read reference + if (h->hashed_key == hashed_key) { + // Match + // Update the hit bit + if (eviction_callback_) { + h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift); + } + return true; + } else { + // Mismatch. Pretend we never took the reference + Unref(*h); + } + } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == + ClockHandle::kStateInvisible)) { + // Pretend we never took the reference + Unref(*h); + } else { + // For other states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. + } + return false; + }, + [&](HandleImpl* h) { return h->displacements.LoadRelaxed() == 0; }, + [&](HandleImpl* /*h*/, bool /*is_last*/) {}); + + return e; +} + +bool FixedHyperClockTable::Release(HandleImpl* h, bool useful, + bool erase_if_last_ref) { + // In contrast with LRUCache's Release, this function won't delete the handle + // when the cache is above capacity and the reference is the last one. Space + // is only freed up by EvictFromClock (called by Insert when space is needed) + // and Erase. We do this to avoid an extra atomic read of the variable usage_. 
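// [Editor's note] Illustrative sketch only, not part of this change. It
// shows the intent behind Release's two paths below: a "useful" release
// records a completed acquire/release pair (strengthening the entry's clock
// standing), while a "not useful" release simply undoes the earlier acquire
// so the lookup leaves no usage signal. The increment values are
// assumptions, not the real ClockHandle constants.
#include <atomic>
#include <cstdint>

namespace release_sketch {
constexpr uint64_t kAcquireIncrement = uint64_t{1};        // assumed
constexpr uint64_t kReleaseIncrement = uint64_t{1} << 30;  // assumed

inline void Release(std::atomic<uint64_t>& meta, bool useful) {
  if (useful) {
    // Count the release; both counters have now advanced by one for this
    // reference, which the clock sweep later decays.
    meta.fetch_add(kReleaseIncrement, std::memory_order_release);
  } else {
    // Pretend the matching acquire never happened.
    meta.fetch_sub(kAcquireIncrement, std::memory_order_release);
  }
}
}  // namespace release_sketch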
+ uint64_t old_meta; if (useful) { // Increment release counter to indicate was used - old_meta = h->meta.fetch_add(ClockHandle::kReleaseIncrement, - std::memory_order_release); + old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement); } else { // Decrement acquire counter to pretend it never happened - old_meta = h->meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); + old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement); } assert((old_meta >> ClockHandle::kStateShift) & @@ -668,7 +886,10 @@ bool HyperClockTable::Release(HandleImpl* h, bool useful, if (erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == ClockHandle::kStateInvisible)) { - // Update for last fetch_add op + // FIXME: There's a chance here that another thread could replace this + // entry and we end up erasing the wrong one. + + // Update for last FetchAdd op if (useful) { old_meta += ClockHandle::kReleaseIncrement; } else { @@ -690,18 +911,17 @@ bool HyperClockTable::Release(HandleImpl* h, bool useful, // Note that there's a small chance that we release, another thread // replaces this entry with another, reaches zero refs, and then we end // up erasing that other entry. That's an acceptable risk / imprecision. - } while (!h->meta.compare_exchange_weak( - old_meta, - uint64_t{ClockHandle::kStateConstruction} << ClockHandle::kStateShift, - std::memory_order_acquire)); + } while ( + !h->meta.CasWeak(old_meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)); // Took ownership size_t total_charge = h->GetTotalCharge(); - if (UNLIKELY(h->IsDetached())) { + if (UNLIKELY(h->IsStandalone())) { h->FreeData(allocator_); - // Delete detached handle + // Delete standalone handle delete h; - detached_usage_.fetch_sub(total_charge, std::memory_order_relaxed); - usage_.fetch_sub(total_charge, std::memory_order_relaxed); + standalone_usage_.FetchSubRelaxed(total_charge); + usage_.FetchSubRelaxed(total_charge); } else { Rollback(h->hashed_key, h); FreeDataMarkEmpty(*h, allocator_); @@ -715,50 +935,25 @@ bool HyperClockTable::Release(HandleImpl* h, bool useful, } } -void HyperClockTable::Ref(HandleImpl& h) { - // Increment acquire counter - uint64_t old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, - std::memory_order_acquire); - - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); - // Must have already had a reference - assert(GetRefcount(old_meta) > 0); - (void)old_meta; -} - -void HyperClockTable::TEST_RefN(HandleImpl& h, size_t n) { - // Increment acquire counter - uint64_t old_meta = h.meta.fetch_add(n * ClockHandle::kAcquireIncrement, - std::memory_order_acquire); - - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); - (void)old_meta; -} - -void HyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { +#ifndef NDEBUG +void FixedHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { if (n > 0) { - // Split into n - 1 and 1 steps. 
- uint64_t old_meta = h->meta.fetch_add( - (n - 1) * ClockHandle::kReleaseIncrement, std::memory_order_acquire); - assert((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit); - (void)old_meta; + // Do n-1 simple releases first + TEST_ReleaseNMinus1(h, n); + // Then the last release might be more involved Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); } } +#endif -void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { - size_t probe = 0; +void FixedHyperClockTable::Erase(const UniqueId64x2& hashed_key) { (void)FindSlot( hashed_key, [&](HandleImpl* h) { // Could be multiple entries in rare cases. Erase them all. // Optimistically increment acquire counter - uint64_t old_meta = h->meta.fetch_add(ClockHandle::kAcquireIncrement, - std::memory_order_acquire); + uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); // Check if it's an entry visible to lookups if ((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateVisible) { @@ -766,9 +961,8 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { if (h->hashed_key == hashed_key) { // Match. Set invisible. old_meta = - h->meta.fetch_and(~(uint64_t{ClockHandle::kStateVisibleBit} - << ClockHandle::kStateShift), - std::memory_order_acq_rel); + h->meta.FetchAnd(~(uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift)); // Apply update to local copy old_meta &= ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift); @@ -778,14 +972,11 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { if (refcount > 1) { // Not last ref at some point in time during this Erase call // Pretend we never took the reference - h->meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); + Unref(*h); break; - } else if (h->meta.compare_exchange_weak( - old_meta, - uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift, - std::memory_order_acq_rel)) { + } else if (h->meta.CasWeak( + old_meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)) { // Took ownership assert(hashed_key == h->hashed_key); size_t total_charge = h->GetTotalCharge(); @@ -799,81 +990,32 @@ void HyperClockTable::Erase(const UniqueId64x2& hashed_key) { } } else { // Mismatch. Pretend we never took the reference - h->meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); + Unref(*h); } } else if (UNLIKELY((old_meta >> ClockHandle::kStateShift) == ClockHandle::kStateInvisible)) { // Pretend we never took the reference - // WART: there's a tiny chance we release last ref to invisible - // entry here. If that happens, we let eviction take care of it. - h->meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); + Unref(*h); } else { // For other states, incrementing the acquire counter has no effect // so we don't need to undo it. } return false; }, - [&](HandleImpl* h) { - return h->displacements.load(std::memory_order_relaxed) == 0; - }, - [&](HandleImpl* /*h*/) {}, probe); -} - -void HyperClockTable::ConstApplyToEntriesRange( - std::function func, size_t index_begin, - size_t index_end, bool apply_if_will_be_deleted) const { - uint64_t check_state_mask = ClockHandle::kStateShareableBit; - if (!apply_if_will_be_deleted) { - check_state_mask |= ClockHandle::kStateVisibleBit; - } - - for (size_t i = index_begin; i < index_end; i++) { - HandleImpl& h = array_[i]; - - // Note: to avoid using compare_exchange, we have to be extra careful. 
- uint64_t old_meta = h.meta.load(std::memory_order_relaxed); - // Check if it's an entry visible to lookups - if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { - // Increment acquire counter. Note: it's possible that the entry has - // completely changed since we loaded old_meta, but incrementing acquire - // count is always safe. (Similar to optimistic Lookup here.) - old_meta = h.meta.fetch_add(ClockHandle::kAcquireIncrement, - std::memory_order_acquire); - // Check whether we actually acquired a reference. - if ((old_meta >> ClockHandle::kStateShift) & - ClockHandle::kStateShareableBit) { - // Apply func if appropriate - if ((old_meta >> ClockHandle::kStateShift) & check_state_mask) { - func(h); - } - // Pretend we never took the reference - h.meta.fetch_sub(ClockHandle::kAcquireIncrement, - std::memory_order_release); - // No net change, so don't need to check for overflow - } else { - // For other states, incrementing the acquire counter has no effect - // so we don't need to undo it. Furthermore, we cannot safely undo - // it because we did not acquire a read reference to lock the - // entry in a Shareable state. - } - } - } + [&](HandleImpl* h) { return h->displacements.LoadRelaxed() == 0; }, + [&](HandleImpl* /*h*/, bool /*is_last*/) {}); } -void HyperClockTable::EraseUnRefEntries() { +void FixedHyperClockTable::EraseUnRefEntries() { for (size_t i = 0; i <= this->length_bits_mask_; i++) { HandleImpl& h = array_[i]; - uint64_t old_meta = h.meta.load(std::memory_order_relaxed); + uint64_t old_meta = h.meta.LoadRelaxed(); if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} << ClockHandle::kStateShift) && GetRefcount(old_meta) == 0 && - h.meta.compare_exchange_strong(old_meta, - uint64_t{ClockHandle::kStateConstruction} - << ClockHandle::kStateShift, - std::memory_order_acquire)) { + h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)) { // Took ownership size_t total_charge = h.GetTotalCharge(); Rollback(h.hashed_key, &h); @@ -883,10 +1025,10 @@ void HyperClockTable::EraseUnRefEntries() { } } -inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot( - const UniqueId64x2& hashed_key, std::function match_fn, - std::function abort_fn, - std::function update_fn, size_t& probe) { +template +inline FixedHyperClockTable::HandleImpl* FixedHyperClockTable::FindSlot( + const UniqueId64x2& hashed_key, const MatchFn& match_fn, + const AbortFn& abort_fn, const UpdateFn& update_fn) { // NOTE: upper 32 bits of hashed_key[0] is used for sharding // // We use double-hashing probing. Every probe in the sequence is a @@ -900,47 +1042,48 @@ inline HyperClockTable::HandleImpl* HyperClockTable::FindSlot( // TODO: we could also reconsider linear probing, though locality benefits // are limited because each slot is a full cache line size_t increment = static_cast(hashed_key[0]) | 1U; - size_t current = ModTableSize(base + probe * increment); - while (probe <= length_bits_mask_) { + size_t first = ModTableSize(base); + size_t current = first; + bool is_last; + do { HandleImpl* h = &array_[current]; if (match_fn(h)) { - probe++; return h; } if (abort_fn(h)) { return nullptr; } - probe++; - update_fn(h); current = ModTableSize(current + increment); - } + is_last = current == first; + update_fn(h, is_last); + } while (!is_last); // We looped back. 
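// [Editor's note] Illustrative sketch only, not part of this change. It
// demonstrates the double-hashing probe order used by FindSlot above: with
// a power-of-two table and an odd increment, the sequence base, base + inc,
// base + 2*inc, ... (mod table size) visits every slot exactly once before
// returning to the starting slot.
#include <cstddef>
#include <cstdint>
#include <vector>

namespace probe_sketch {
inline std::vector<size_t> ProbeOrder(uint64_t base_hash, uint64_t inc_hash,
                                      int length_bits) {
  const size_t mask = (size_t{1} << length_bits) - 1;
  // Forcing the low bit on makes the increment odd, hence coprime with the
  // power-of-two table size, hence a full cycle over all slots.
  const size_t increment = static_cast<size_t>(inc_hash) | 1U;
  const size_t first = static_cast<size_t>(base_hash) & mask;
  std::vector<size_t> order;
  size_t current = first;
  do {
    order.push_back(current);
    current = (current + increment) & mask;
  } while (current != first);  // looped back, like the is_last check above
  return order;  // order.size() == table size (1 << length_bits)
}
}  // namespace probe_sketch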
return nullptr; } -inline void HyperClockTable::Rollback(const UniqueId64x2& hashed_key, - const HandleImpl* h) { +inline void FixedHyperClockTable::Rollback(const UniqueId64x2& hashed_key, + const HandleImpl* h) { size_t current = ModTableSize(hashed_key[1]); size_t increment = static_cast(hashed_key[0]) | 1U; while (&array_[current] != h) { - array_[current].displacements.fetch_sub(1, std::memory_order_relaxed); + array_[current].displacements.FetchSubRelaxed(1); current = ModTableSize(current + increment); } } -inline void HyperClockTable::ReclaimEntryUsage(size_t total_charge) { - auto old_occupancy = occupancy_.fetch_sub(1U, std::memory_order_release); +inline void FixedHyperClockTable::ReclaimEntryUsage(size_t total_charge) { + auto old_occupancy = occupancy_.FetchSub(1U); (void)old_occupancy; // No underflow assert(old_occupancy > 0); - auto old_usage = usage_.fetch_sub(total_charge, std::memory_order_relaxed); + auto old_usage = usage_.FetchSubRelaxed(total_charge); (void)old_usage; // No underflow assert(old_usage >= total_charge); } -inline void HyperClockTable::Evict(size_t requested_charge, - size_t* freed_charge, size_t* freed_count) { +inline void FixedHyperClockTable::Evict(size_t requested_charge, InsertState&, + EvictionData* data) { // precondition assert(requested_charge > 0); @@ -948,8 +1091,7 @@ inline void HyperClockTable::Evict(size_t requested_charge, constexpr size_t step_size = 4; // First (concurrent) increment clock pointer - uint64_t old_clock_pointer = - clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + uint64_t old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size); // Cap the eviction effort at this thread (along with those operating in // parallel) circling through the whole structure kMaxCountdown times. @@ -965,14 +1107,12 @@ inline void HyperClockTable::Evict(size_t requested_charge, bool evicting = ClockUpdate(h); if (evicting) { Rollback(h.hashed_key, &h); - *freed_charge += h.GetTotalCharge(); - *freed_count += 1; - FreeDataMarkEmpty(h, allocator_); + TrackAndReleaseEvictedEntry(&h, data); } } // Loop exit condition - if (*freed_charge >= requested_charge) { + if (data->freed_charge >= requested_charge) { return; } if (old_clock_pointer >= max_clock_pointer) { @@ -980,8 +1120,7 @@ inline void HyperClockTable::Evict(size_t requested_charge, } // Advance clock pointer (concurrently) - old_clock_pointer = - clock_pointer_.fetch_add(step_size, std::memory_order_relaxed); + old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size); } } @@ -989,14 +1128,17 @@ template ClockCacheShard
::ClockCacheShard( size_t capacity, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy, - MemoryAllocator* allocator, const typename Table::Opts& opts) + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, + const typename Table::Opts& opts) : CacheShardBase(metadata_charge_policy), table_(capacity, strict_capacity_limit, metadata_charge_policy, allocator, - opts), + eviction_callback, hash_seed, opts), capacity_(capacity), strict_capacity_limit_(strict_capacity_limit) { // Initial charge metadata should not exceed capacity - assert(table_.GetUsage() <= capacity_ || capacity_ < sizeof(HandleImpl)); + assert(table_.GetUsage() <= capacity_.LoadRelaxed() || + capacity_.LoadRelaxed() < sizeof(HandleImpl)); } template @@ -1010,37 +1152,35 @@ void ClockCacheShard
::ApplyToSomeEntries( size_t charge, const Cache::CacheItemHelper* helper)>& callback, size_t average_entries_per_lock, size_t* state) { - // The state is essentially going to be the starting hash, which works - // nicely even if we resize between calls because we use upper-most - // hash bits for table indexes. - size_t length_bits = table_.GetLengthBits(); + // The state will be a simple index into the table. Even with a dynamic + // hyper clock cache, entries will generally stay in their existing + // slots, so we don't need to be aware of the high-level organization + // that makes lookup efficient. size_t length = table_.GetTableSize(); assert(average_entries_per_lock > 0); - // Assuming we are called with same average_entries_per_lock repeatedly, - // this simplifies some logic (index_end will not overflow). - assert(average_entries_per_lock < length || *state == 0); - size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits); + size_t index_begin = *state; size_t index_end = index_begin + average_entries_per_lock; if (index_end >= length) { // Going to end. index_end = length; *state = SIZE_MAX; } else { - *state = index_end << (sizeof(size_t) * 8u - length_bits); + *state = index_end; } - table_.ConstApplyToEntriesRange( - [callback](const HandleImpl& h) { + auto hash_seed = table_.GetHashSeed(); + ConstApplyToEntriesRange( + [callback, hash_seed](const HandleImpl& h) { UniqueId64x2 unhashed; - callback(ReverseHash(h.hashed_key, &unhashed), h.value, + callback(ReverseHash(h.hashed_key, &unhashed, hash_seed), h.value, h.GetTotalCharge(), h.helper); }, - index_begin, index_end, false); + table_.HandlePtr(index_begin), table_.HandlePtr(index_end), false); } -int HyperClockTable::CalcHashBits( +int FixedHyperClockTable::CalcHashBits( size_t capacity, size_t estimated_value_size, CacheMetadataChargePolicy metadata_charge_policy) { double average_slot_charge = estimated_value_size * kLoadFactor; @@ -1064,15 +1204,14 @@ int HyperClockTable::CalcHashBits( template void ClockCacheShard
::SetCapacity(size_t capacity) { - capacity_.store(capacity, std::memory_order_relaxed); + capacity_.StoreRelaxed(capacity); // next Insert will take care of any necessary evictions } template void ClockCacheShard
::SetStrictCapacityLimit( bool strict_capacity_limit) { - strict_capacity_limit_.store(strict_capacity_limit, - std::memory_order_relaxed); + strict_capacity_limit_.StoreRelaxed(strict_capacity_limit); // next Insert will take care of any necessary evictions } @@ -1092,10 +1231,26 @@ Status ClockCacheShard
::Insert(const Slice& key, proto.value = value; proto.helper = helper; proto.total_charge = charge; - Status s = table_.Insert( - proto, handle, priority, capacity_.load(std::memory_order_relaxed), - strict_capacity_limit_.load(std::memory_order_relaxed)); - return s; + return table_.template Insert
(proto, handle, priority, + capacity_.LoadRelaxed(), + strict_capacity_limit_.LoadRelaxed()); +} + +template +typename Table::HandleImpl* ClockCacheShard
::CreateStandalone( + const Slice& key, const UniqueId64x2& hashed_key, Cache::ObjectPtr obj, + const Cache::CacheItemHelper* helper, size_t charge, bool allow_uncharged) { + if (UNLIKELY(key.size() != kCacheKeySize)) { + return nullptr; + } + ClockHandleBasicData proto; + proto.hashed_key = hashed_key; + proto.value = obj; + proto.helper = helper; + proto.total_charge = charge; + return table_.template CreateStandalone
( + proto, capacity_.LoadRelaxed(), strict_capacity_limit_.LoadRelaxed(), + allow_uncharged); } template @@ -1125,6 +1280,7 @@ bool ClockCacheShard
::Release(HandleImpl* handle, bool useful, return table_.Release(handle, useful, erase_if_last_ref); } +#ifndef NDEBUG template void ClockCacheShard
::TEST_RefN(HandleImpl* h, size_t n) { table_.TEST_RefN(*h, n); @@ -1134,6 +1290,7 @@ template void ClockCacheShard
::TEST_ReleaseN(HandleImpl* h, size_t n) { table_.TEST_ReleaseN(h, n); } +#endif template bool ClockCacheShard
::Release(HandleImpl* handle, @@ -1156,13 +1313,13 @@ size_t ClockCacheShard
::GetUsage() const { } template -size_t ClockCacheShard
::GetDetachedUsage() const { - return table_.GetDetachedUsage(); +size_t ClockCacheShard
::GetStandaloneUsage() const { + return table_.GetStandaloneUsage(); } template size_t ClockCacheShard
::GetCapacity() const { - return capacity_; + return capacity_.LoadRelaxed(); } template @@ -1176,9 +1333,9 @@ size_t ClockCacheShard
::GetPinnedUsage() const { size_t table_pinned_usage = 0; const bool charge_metadata = metadata_charge_policy_ == kFullChargeCacheMetadata; - table_.ConstApplyToEntriesRange( + ConstApplyToEntriesRange( [&table_pinned_usage, charge_metadata](const HandleImpl& h) { - uint64_t meta = h.meta.load(std::memory_order_relaxed); + uint64_t meta = h.meta.LoadRelaxed(); uint64_t refcount = GetRefcount(meta); // Holding one ref for ConstApplyToEntriesRange assert(refcount > 0); @@ -1189,9 +1346,9 @@ size_t ClockCacheShard
::GetPinnedUsage() const { } } }, - 0, table_.GetTableSize(), true); + table_.HandlePtr(0), table_.HandlePtr(table_.GetTableSize()), true); - return table_pinned_usage + table_.GetDetachedUsage(); + return table_pinned_usage + table_.GetStandaloneUsage(); } template @@ -1210,40 +1367,40 @@ size_t ClockCacheShard
::GetTableAddressCount() const { } // Explicit instantiation -template class ClockCacheShard; +template class ClockCacheShard; +template class ClockCacheShard; -HyperClockCache::HyperClockCache( - size_t capacity, size_t estimated_value_size, int num_shard_bits, - bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy, - std::shared_ptr memory_allocator) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, - std::move(memory_allocator)) { - assert(estimated_value_size > 0 || - metadata_charge_policy != kDontChargeCacheMetadata); +template +BaseHyperClockCache
::BaseHyperClockCache( + const HyperClockCacheOptions& opts) + : ShardedCache>(opts) { // TODO: should not need to go through two levels of pointer indirection to // get to table entries - size_t per_shard = GetPerShardCapacity(); + size_t per_shard = this->GetPerShardCapacity(); MemoryAllocator* alloc = this->memory_allocator(); - InitShards([=](Shard* cs) { - HyperClockTable::Opts opts; - opts.estimated_value_size = estimated_value_size; - new (cs) Shard(per_shard, strict_capacity_limit, metadata_charge_policy, - alloc, opts); + this->InitShards([&](Shard* cs) { + typename Table::Opts table_opts{opts}; + new (cs) Shard(per_shard, opts.strict_capacity_limit, + opts.metadata_charge_policy, alloc, + &this->eviction_callback_, &this->hash_seed_, table_opts); }); } -Cache::ObjectPtr HyperClockCache::Value(Handle* handle) { - return reinterpret_cast(handle)->value; +template +Cache::ObjectPtr BaseHyperClockCache
::Value(Handle* handle) { + return reinterpret_cast(handle)->value; } -size_t HyperClockCache::GetCharge(Handle* handle) const { - return reinterpret_cast(handle)->GetTotalCharge(); +template +size_t BaseHyperClockCache
::GetCharge(Handle* handle) const { + return reinterpret_cast(handle) + ->GetTotalCharge(); } -const Cache::CacheItemHelper* HyperClockCache::GetCacheItemHelper( +template +const Cache::CacheItemHelper* BaseHyperClockCache
::GetCacheItemHelper( Handle* handle) const { - auto h = reinterpret_cast(handle); + auto h = reinterpret_cast(handle); return h->helper; } @@ -1256,10 +1413,10 @@ namespace { // or actual occupancy very close to limit (>95% of limit). // Also, for each shard compute the recommended estimated_entry_charge, // and keep the minimum one for use as overall recommendation. -void AddShardEvaluation(const HyperClockCache::Shard& shard, +void AddShardEvaluation(const FixedHyperClockCache::Shard& shard, std::vector& predicted_load_factors, size_t& min_recommendation) { - size_t usage = shard.GetUsage() - shard.GetDetachedUsage(); + size_t usage = shard.GetUsage() - shard.GetStandaloneUsage(); size_t capacity = shard.GetCapacity(); double usage_ratio = 1.0 * usage / capacity; @@ -1274,7 +1431,7 @@ void AddShardEvaluation(const HyperClockCache::Shard& shard, // If filled to capacity, what would the occupancy ratio be? double ratio = occ_ratio / usage_ratio; // Given max load factor, what that load factor be? - double lf = ratio * kStrictLoadFactor; + double lf = ratio * FixedHyperClockTable::kStrictLoadFactor; predicted_load_factors.push_back(lf); // Update min_recommendation also @@ -1282,17 +1439,90 @@ void AddShardEvaluation(const HyperClockCache::Shard& shard, min_recommendation = std::min(min_recommendation, recommendation); } +bool IsSlotOccupied(const ClockHandle& h) { + return (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) != 0; +} } // namespace -void HyperClockCache::ReportProblems( +// NOTE: GCC might warn about subobject linkage if this is in anon namespace +template +class LoadVarianceStats { + public: + std::string Report() const { + return "Overall " + PercentStr(positive_count_, samples_) + " (" + + std::to_string(positive_count_) + "/" + std::to_string(samples_) + + "), Min/Max/Window = " + PercentStr(min_, N) + "/" + + PercentStr(max_, N) + "/" + std::to_string(N) + + ", MaxRun{Pos/Neg} = " + std::to_string(max_pos_run_) + "/" + + std::to_string(max_neg_run_); + } + + void Add(bool positive) { + recent_[samples_ % N] = positive; + if (positive) { + ++positive_count_; + ++cur_pos_run_; + max_pos_run_ = std::max(max_pos_run_, cur_pos_run_); + cur_neg_run_ = 0; + } else { + ++cur_neg_run_; + max_neg_run_ = std::max(max_neg_run_, cur_neg_run_); + cur_pos_run_ = 0; + } + ++samples_; + if (samples_ >= N) { + size_t count_set = recent_.count(); + max_ = std::max(max_, count_set); + min_ = std::min(min_, count_set); + } + } + + private: + size_t max_ = 0; + size_t min_ = N; + size_t positive_count_ = 0; + size_t samples_ = 0; + size_t max_pos_run_ = 0; + size_t cur_pos_run_ = 0; + size_t max_neg_run_ = 0; + size_t cur_neg_run_ = 0; + std::bitset recent_; + + static std::string PercentStr(size_t a, size_t b) { + if (b == 0) { + return "??%"; + } else { + return std::to_string(uint64_t{100} * a / b) + "%"; + } + } +}; + +template +void BaseHyperClockCache
::ReportProblems( + const std::shared_ptr& info_log) const { + if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { + LoadVarianceStats slot_stats; + this->ForEachShard([&](const BaseHyperClockCache
::Shard* shard) { + size_t count = shard->GetTableAddressCount(); + for (size_t i = 0; i < count; ++i) { + slot_stats.Add(IsSlotOccupied(*shard->GetTable().HandlePtr(i))); + } + }); + ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, + "Slot occupancy stats: %s", slot_stats.Report().c_str()); + } +} + +void FixedHyperClockCache::ReportProblems( const std::shared_ptr& info_log) const { + BaseHyperClockCache::ReportProblems(info_log); + uint32_t shard_count = GetNumShards(); std::vector predicted_load_factors; size_t min_recommendation = SIZE_MAX; - const_cast(this)->ForEachShard( - [&](HyperClockCache::Shard* shard) { - AddShardEvaluation(*shard, predicted_load_factors, min_recommendation); - }); + ForEachShard([&](const FixedHyperClockCache::Shard* shard) { + AddShardEvaluation(*shard, predicted_load_factors, min_recommendation); + }); if (predicted_load_factors.empty()) { // None operating "at limit" -> nothing to report @@ -1313,17 +1543,19 @@ void HyperClockCache::ReportProblems( predicted_load_factors.end(), 0.0) / shard_count; - constexpr double kLowSpecLoadFactor = kLoadFactor / 2; - constexpr double kMidSpecLoadFactor = kLoadFactor / 1.414; - if (average_load_factor > kLoadFactor) { + constexpr double kLowSpecLoadFactor = FixedHyperClockTable::kLoadFactor / 2; + constexpr double kMidSpecLoadFactor = + FixedHyperClockTable::kLoadFactor / 1.414; + if (average_load_factor > FixedHyperClockTable::kLoadFactor) { // Out of spec => Consider reporting load factor too high // Estimate effective overall capacity loss due to enforcing occupancy limit double lost_portion = 0.0; int over_count = 0; for (double lf : predicted_load_factors) { - if (lf > kStrictLoadFactor) { + if (lf > FixedHyperClockTable::kStrictLoadFactor) { ++over_count; - lost_portion += (lf - kStrictLoadFactor) / lf / shard_count; + lost_portion += + (lf - FixedHyperClockTable::kStrictLoadFactor) / lf / shard_count; } } // >= 20% loss -> error @@ -1347,10 +1579,10 @@ void HyperClockCache::ReportProblems( if (report) { ROCKS_LOG_AT_LEVEL( info_log, level, - "HyperClockCache@%p unable to use estimated %.1f%% capacity because " - "of " - "full occupancy in %d/%u cache shards (estimated_entry_charge too " - "high). Recommend estimated_entry_charge=%zu", + "FixedHyperClockCache@%p unable to use estimated %.1f%% capacity " + "because of full occupancy in %d/%u cache shards " + "(estimated_entry_charge too high). " + "Recommend estimated_entry_charge=%zu", this, lost_portion * 100.0, over_count, (unsigned)shard_count, min_recommendation); } @@ -1368,40 +1600,2004 @@ void HyperClockCache::ReportProblems( } ROCKS_LOG_AT_LEVEL( info_log, level, - "HyperClockCache@%p table has low occupancy at full capacity. Higher " - "estimated_entry_charge (about %.1fx) would likely improve " + "FixedHyperClockCache@%p table has low occupancy at full capacity. " + "Higher estimated_entry_charge (about %.1fx) would likely improve " "performance. 
Recommend estimated_entry_charge=%zu", this, kMidSpecLoadFactor / average_load_factor, min_recommendation); } } } -} // namespace clock_cache +// ======================================================================= +// AutoHyperClockCache +// ======================================================================= -// DEPRECATED (see public API) -std::shared_ptr NewClockCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, - /* high_pri_pool_ratio */ 0.5, nullptr, - kDefaultToAdaptiveMutex, metadata_charge_policy, - /* low_pri_pool_ratio */ 0.0); +// See AutoHyperClockTable::length_info_ etc. for how the linear hashing +// metadata is encoded. Here are some example values: +// +// Used length | min shift | threshold | max shift +// 2 | 1 | 0 | 1 +// 3 | 1 | 1 | 2 +// 4 | 2 | 0 | 2 +// 5 | 2 | 1 | 3 +// 6 | 2 | 2 | 3 +// 7 | 2 | 3 | 3 +// 8 | 3 | 0 | 3 +// 9 | 3 | 1 | 4 +// ... +// Note: +// * min shift = floor(log2(used length)) +// * max shift = ceil(log2(used length)) +// * used length == (1 << shift) + threshold +// Also, shift=0 is never used in practice, so is reserved for "unset" + +namespace { + +inline int LengthInfoToMinShift(uint64_t length_info) { + int mask_shift = BitwiseAnd(length_info, int{255}); + assert(mask_shift <= 63); + assert(mask_shift > 0); + return mask_shift; } -std::shared_ptr HyperClockCacheOptions::MakeSharedCache() const { - auto my_num_shard_bits = num_shard_bits; - if (my_num_shard_bits >= 20) { - return nullptr; // The cache cannot be sharded into too many fine pieces. +inline size_t LengthInfoToThreshold(uint64_t length_info) { + return static_cast(length_info >> 8); +} + +inline size_t LengthInfoToUsedLength(uint64_t length_info) { + size_t threshold = LengthInfoToThreshold(length_info); + int shift = LengthInfoToMinShift(length_info); + assert(threshold < (size_t{1} << shift)); + size_t used_length = (size_t{1} << shift) + threshold; + assert(used_length >= 2); + return used_length; +} + +inline uint64_t UsedLengthToLengthInfo(size_t used_length) { + assert(used_length >= 2); + int shift = FloorLog2(used_length); + uint64_t threshold = BottomNBits(used_length, shift); + uint64_t length_info = + (uint64_t{threshold} << 8) + static_cast(shift); + assert(LengthInfoToUsedLength(length_info) == used_length); + assert(LengthInfoToMinShift(length_info) == shift); + assert(LengthInfoToThreshold(length_info) == threshold); + return length_info; +} + +inline size_t GetStartingLength(size_t capacity) { + if (capacity > port::kPageSize) { + // Start with one memory page + return port::kPageSize / sizeof(AutoHyperClockTable::HandleImpl); + } else { + // Mostly to make unit tests happy + return 4; } - if (my_num_shard_bits < 0) { - // Use larger shard size to reduce risk of large entries clustering - // or skewing individual shards. 
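// A minimal standalone sketch of the length_info encoding described in the
// table above: the low 8 bits hold the minimum shift and the remaining bits
// hold the threshold, with used_length == (1 << shift) + threshold. Helper
// names here are illustrative; the diff's own FloorLog2/BitwiseAnd helpers
// are replaced with a GCC/Clang builtin and plain masking.
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t EncodeLengthInfo(size_t used_length) {
  assert(used_length >= 2);
  int shift = 63 - __builtin_clzll(used_length);  // floor(log2(used_length))
  uint64_t threshold = used_length - (uint64_t{1} << shift);
  return (threshold << 8) | static_cast<uint64_t>(shift);
}

static size_t DecodeUsedLength(uint64_t length_info) {
  int shift = static_cast<int>(length_info & 255);
  size_t threshold = static_cast<size_t>(length_info >> 8);
  return (size_t{1} << shift) + threshold;
}

int main() {
  // Reproduces the example rows above, e.g. used length 5 -> shift 2, threshold 1.
  for (size_t len = 2; len <= 9; ++len) {
    uint64_t info = EncodeLengthInfo(len);
    std::printf("used=%zu  min_shift=%d  threshold=%llu\n", len,
                static_cast<int>(info & 255),
                static_cast<unsigned long long>(info >> 8));
    assert(DecodeUsedLength(info) == len);
  }
  return 0;
}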
- constexpr size_t min_shard_size = 32U * 1024U * 1024U; - my_num_shard_bits = GetDefaultCacheShardBits(capacity, min_shard_size); +} + +inline size_t GetHomeIndex(uint64_t hash, int shift) { + return static_cast(BottomNBits(hash, shift)); +} + +inline void GetHomeIndexAndShift(uint64_t length_info, uint64_t hash, + size_t* home, int* shift) { + int min_shift = LengthInfoToMinShift(length_info); + size_t threshold = LengthInfoToThreshold(length_info); + bool extra_shift = GetHomeIndex(hash, min_shift) < threshold; + *home = GetHomeIndex(hash, min_shift + extra_shift); + *shift = min_shift + extra_shift; + assert(*home < LengthInfoToUsedLength(length_info)); +} + +inline int GetShiftFromNextWithShift(uint64_t next_with_shift) { + return BitwiseAnd(next_with_shift, + AutoHyperClockTable::HandleImpl::kShiftMask); +} + +inline size_t GetNextFromNextWithShift(uint64_t next_with_shift) { + return static_cast(next_with_shift >> + AutoHyperClockTable::HandleImpl::kNextShift); +} + +inline uint64_t MakeNextWithShift(size_t next, int shift) { + return (uint64_t{next} << AutoHyperClockTable::HandleImpl::kNextShift) | + static_cast(shift); +} + +inline uint64_t MakeNextWithShiftEnd(size_t head, int shift) { + return AutoHyperClockTable::HandleImpl::kNextEndFlags | + MakeNextWithShift(head, shift); +} + +// Helper function for Lookup +inline bool MatchAndRef(const UniqueId64x2* hashed_key, const ClockHandle& h, + int shift = 0, size_t home = 0, + bool* full_match_or_unknown = nullptr) { + // Must be at least something to match + assert(hashed_key || shift > 0); + + uint64_t old_meta; + // (Optimistically) increment acquire counter. + old_meta = h.meta.FetchAdd(ClockHandle::kAcquireIncrement); + // Check if it's a referencable (sharable) entry + if ((old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift)) == 0) { + // For non-sharable states, incrementing the acquire counter has no effect + // so we don't need to undo it. Furthermore, we cannot safely undo + // it because we did not acquire a read reference to lock the + // entry in a Shareable state. + if (full_match_or_unknown) { + *full_match_or_unknown = true; + } + return false; + } + // Else acquired a read reference + assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0); + if (hashed_key && h.hashed_key == *hashed_key && + LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift))) { + // Match on full key, visible + if (full_match_or_unknown) { + *full_match_or_unknown = true; + } + return true; + } else if (shift > 0 && home == BottomNBits(h.hashed_key[1], shift)) { + // NOTE: upper 32 bits of hashed_key[0] is used for sharding + // Match on home address, possibly invisible + if (full_match_or_unknown) { + *full_match_or_unknown = false; + } + return true; + } else { + // Mismatch. 
Pretend we never took the reference + Unref(h); + if (full_match_or_unknown) { + *full_match_or_unknown = false; + } + return false; + } +} + +// Assumes a chain rewrite lock prevents concurrent modification of +// these chain pointers +void UpgradeShiftsOnRange(AutoHyperClockTable::HandleImpl* arr, + size_t& frontier, uint64_t stop_before_or_new_tail, + int old_shift, int new_shift) { + assert(frontier != SIZE_MAX); + assert(new_shift == old_shift + 1); + (void)old_shift; + (void)new_shift; + using HandleImpl = AutoHyperClockTable::HandleImpl; + for (;;) { + uint64_t next_with_shift = arr[frontier].chain_next_with_shift.Load(); + assert(GetShiftFromNextWithShift(next_with_shift) == old_shift); + if (next_with_shift == stop_before_or_new_tail) { + // Stopping at entry with pointer matching "stop before" + assert(!HandleImpl::IsEnd(next_with_shift)); + return; + } + if (HandleImpl::IsEnd(next_with_shift)) { + // Also update tail to new tail + assert(HandleImpl::IsEnd(stop_before_or_new_tail)); + arr[frontier].chain_next_with_shift.Store(stop_before_or_new_tail); + // Mark nothing left to upgrade + frontier = SIZE_MAX; + return; + } + // Next is another entry to process, so upgrade and advance frontier + arr[frontier].chain_next_with_shift.FetchAdd(1U); + assert(GetShiftFromNextWithShift(next_with_shift + 1) == new_shift); + frontier = GetNextFromNextWithShift(next_with_shift); + } +} + +size_t CalcOccupancyLimit(size_t used_length) { + return static_cast(used_length * AutoHyperClockTable::kMaxLoadFactor + + 0.999); +} + +} // namespace + +// An RAII wrapper for locking a chain of entries (flag bit on the head) +// so that there is only one thread allowed to remove entries from the +// chain, or to rewrite it by splitting for Grow. Without the lock, +// all lookups and insertions at the head can proceed wait-free. +// The class also provides functions for safely manipulating the head pointer +// while holding the lock--or wanting to should it become non-empty. +// +// The flag bits on the head are such that the head cannot be locked if it +// is an empty chain, so that a "blind" FetchOr will try to lock a non-empty +// chain but have no effect on an empty chain. When a potential rewrite +// operation see an empty head pointer, there is no need to lock as the +// operation is a no-op. However, there are some cases such as CAS-update +// where locking might be required after initially not being needed, if the +// operation is forced to revisit the head pointer. +class AutoHyperClockTable::ChainRewriteLock { + public: + using HandleImpl = AutoHyperClockTable::HandleImpl; + + // Acquire lock if head of h is not an end + explicit ChainRewriteLock(HandleImpl* h, RelaxedAtomic& yield_count) + : head_ptr_(&h->head_next_with_shift) { + Acquire(yield_count); + } + + // RAII wrap existing lock held (or end) + explicit ChainRewriteLock(HandleImpl* h, + RelaxedAtomic& /*yield_count*/, + uint64_t already_locked_or_end) + : head_ptr_(&h->head_next_with_shift) { + saved_head_ = already_locked_or_end; + // already locked or end + assert(saved_head_ & HandleImpl::kHeadLocked); + } + + ~ChainRewriteLock() { + if (!IsEnd()) { + // Release lock + uint64_t old = head_ptr_->FetchAnd(~HandleImpl::kHeadLocked); + (void)old; + assert((old & HandleImpl::kNextEndFlags) == HandleImpl::kHeadLocked); + } + } + + void Reset(HandleImpl* h, RelaxedAtomic& yield_count) { + this->~ChainRewriteLock(); + new (this) ChainRewriteLock(h, yield_count); + } + + // Expected current state, assuming no parallel updates. 
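// A much-simplified sketch of the "lock flag in the head pointer" idea used
// by ChainRewriteLock above. The two-bit layout and names are invented for
// this sketch (not the RocksDB encoding); what it mirrors is the property
// that an empty (end) head already carries the lock bit, so a blind fetch_or
// on an empty chain is a harmless no-op rather than a lock acquisition.
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

constexpr uint64_t kLockedBit = uint64_t{1} << 0;
constexpr uint64_t kEndBit = uint64_t{1} << 1;
constexpr uint64_t kEndFlags = kEndBit | kLockedBit;  // empty head stores both

// Returns true if the caller now holds the rewrite lock (chain non-empty).
bool AcquireChainLock(std::atomic<uint64_t>& head) {
  for (;;) {
    uint64_t old_head = head.fetch_or(kLockedBit, std::memory_order_acquire);
    if ((old_head & kEndFlags) == kEndFlags) {
      return false;  // empty chain: nothing to rewrite, no lock taken
    }
    if ((old_head & kEndFlags) == 0) {
      return true;  // the lock bit was clear and we just set it
    }
    std::this_thread::yield();  // another thread holds the lock; retry
  }
}

void ReleaseChainLock(std::atomic<uint64_t>& head) {
  head.fetch_and(~kLockedBit, std::memory_order_release);
}

int main() {
  std::atomic<uint64_t> empty_head{kEndFlags};          // empty chain head
  std::atomic<uint64_t> chain_head{uint64_t{7} << 2};   // non-empty, unlocked
  std::printf("empty chain locked? %d\n", AcquireChainLock(empty_head));
  if (AcquireChainLock(chain_head)) {
    // ... rewrite the chain while holding the lock ...
    ReleaseChainLock(chain_head);
  }
  return 0;
}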
+ uint64_t GetSavedHead() const { return saved_head_; } + + bool CasUpdate(uint64_t next_with_shift, + RelaxedAtomic& yield_count) { + uint64_t new_head = next_with_shift | HandleImpl::kHeadLocked; + uint64_t expected = GetSavedHead(); + bool success = head_ptr_->CasStrong(expected, new_head); + if (success) { + // Ensure IsEnd() is kept up-to-date, including for dtor + saved_head_ = new_head; + } else { + // Parallel update to head, such as Insert() + if (IsEnd()) { + // Didn't previously hold a lock + if (HandleImpl::IsEnd(expected)) { + // Still don't need to + saved_head_ = expected; + } else { + // Need to acquire lock before proceeding + Acquire(yield_count); + } + } else { + // Parallel update must preserve our lock + assert((expected & HandleImpl::kNextEndFlags) == + HandleImpl::kHeadLocked); + saved_head_ = expected; + } + } + return success; + } + + bool IsEnd() const { return HandleImpl::IsEnd(saved_head_); } + + private: + void Acquire(RelaxedAtomic& yield_count) { + for (;;) { + // Acquire removal lock on the chain + uint64_t old_head = head_ptr_->FetchOr(HandleImpl::kHeadLocked); + if ((old_head & HandleImpl::kNextEndFlags) != HandleImpl::kHeadLocked) { + // Either acquired the lock or lock not needed (end) + assert((old_head & HandleImpl::kNextEndFlags) == 0 || + (old_head & HandleImpl::kNextEndFlags) == + HandleImpl::kNextEndFlags); + + saved_head_ = old_head | HandleImpl::kHeadLocked; + break; + } + // NOTE: one of the few yield-wait loops, which is rare enough in practice + // for its performance to be insignificant. (E.g. using C++20 atomic + // wait/notify would likely be worse because of wasted notify costs.) + yield_count.FetchAddRelaxed(1); + std::this_thread::yield(); + } + } + + AcqRelAtomic* head_ptr_; + uint64_t saved_head_; +}; + +AutoHyperClockTable::AutoHyperClockTable( + size_t capacity, bool /*strict_capacity_limit*/, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, const uint32_t* hash_seed, + const Opts& opts) + : BaseClockTable(metadata_charge_policy, allocator, eviction_callback, + hash_seed), + array_(MemMapping::AllocateLazyZeroed( + sizeof(HandleImpl) * CalcMaxUsableLength(capacity, + opts.min_avg_value_size, + metadata_charge_policy))), + length_info_(UsedLengthToLengthInfo(GetStartingLength(capacity))), + occupancy_limit_( + CalcOccupancyLimit(LengthInfoToUsedLength(length_info_.Load()))), + grow_frontier_(GetTableSize()), + clock_pointer_mask_( + BottomNBits(UINT64_MAX, LengthInfoToMinShift(length_info_.Load()))) { + if (metadata_charge_policy == + CacheMetadataChargePolicy::kFullChargeCacheMetadata) { + // NOTE: ignoring page boundaries for simplicity + usage_.FetchAddRelaxed(size_t{GetTableSize()} * sizeof(HandleImpl)); + } + + static_assert(sizeof(HandleImpl) == 64U, + "Expecting size / alignment with common cache line size"); + + // Populate head pointers + uint64_t length_info = length_info_.Load(); + int min_shift = LengthInfoToMinShift(length_info); + int max_shift = min_shift + 1; + size_t major = uint64_t{1} << min_shift; + size_t used_length = GetTableSize(); + + assert(major <= used_length); + assert(used_length <= major * 2); + + // Initialize the initial usable set of slots. This slightly odd iteration + // order makes it easier to get the correct shift amount on each head. 
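// Standalone illustration of the head initialization the loop below performs,
// for a table whose used length is not a power of two. With used_length = 6
// (min_shift = 2, max_shift = 3, major = 4), heads 0, 1, 4, 5 start at the
// longer shift and heads 2, 3 stay at the shorter one. Sketch only; the
// types and names are not the RocksDB ones.
#include <cstddef>
#include <cstdio>

int main() {
  const size_t used_length = 6;                 // example size
  const int min_shift = 2;                      // floor(log2(6))
  const int max_shift = min_shift + 1;
  const size_t major = size_t{1} << min_shift;  // 4
  int shift_for[8] = {};
  for (size_t i = 0; i < major; ++i) {
    if (major + i < used_length) {
      shift_for[i] = max_shift;
      shift_for[major + i] = max_shift;
    } else {
      shift_for[i] = min_shift;
    }
  }
  for (size_t i = 0; i < used_length; ++i) {
    std::printf("head %zu starts with shift %d\n", i, shift_for[i]);
  }
  return 0;
}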
+ for (size_t i = 0; i < major; ++i) { +#ifndef NDEBUG + int shift; + size_t home; +#endif + if (major + i < used_length) { + array_[i].head_next_with_shift.StoreRelaxed( + MakeNextWithShiftEnd(i, max_shift)); + array_[major + i].head_next_with_shift.StoreRelaxed( + MakeNextWithShiftEnd(major + i, max_shift)); +#ifndef NDEBUG // Extra invariant checking + GetHomeIndexAndShift(length_info, i, &home, &shift); + assert(home == i); + assert(shift == max_shift); + GetHomeIndexAndShift(length_info, major + i, &home, &shift); + assert(home == major + i); + assert(shift == max_shift); +#endif + } else { + array_[i].head_next_with_shift.StoreRelaxed( + MakeNextWithShiftEnd(i, min_shift)); +#ifndef NDEBUG // Extra invariant checking + GetHomeIndexAndShift(length_info, i, &home, &shift); + assert(home == i); + assert(shift == min_shift); + GetHomeIndexAndShift(length_info, major + i, &home, &shift); + assert(home == i); + assert(shift == min_shift); +#endif + } + } +} + +AutoHyperClockTable::~AutoHyperClockTable() { + // As usual, destructor assumes there are no references or active operations + // on any slot/element in the table. + + // It's possible that there were not enough Insert() after final concurrent + // Grow to ensure length_info_ (published GetTableSize()) is fully up to + // date. Probe for first unused slot to ensure we see the whole structure. + size_t used_end = GetTableSize(); + while (used_end < array_.Count() && + array_[used_end].head_next_with_shift.LoadRelaxed() != + HandleImpl::kUnusedMarker) { + used_end++; + } +#ifndef NDEBUG + for (size_t i = used_end; i < array_.Count(); i++) { + assert(array_[i].head_next_with_shift.LoadRelaxed() == 0); + assert(array_[i].chain_next_with_shift.LoadRelaxed() == 0); + assert(array_[i].meta.LoadRelaxed() == 0); + } + std::vector was_populated(used_end); + std::vector was_pointed_to(used_end); +#endif + for (size_t i = 0; i < used_end; i++) { + HandleImpl& h = array_[i]; + switch (h.meta.LoadRelaxed() >> ClockHandle::kStateShift) { + case ClockHandle::kStateEmpty: + // noop + break; + case ClockHandle::kStateInvisible: // rare but possible + case ClockHandle::kStateVisible: + assert(GetRefcount(h.meta.LoadRelaxed()) == 0); + h.FreeData(allocator_); +#ifndef NDEBUG // Extra invariant checking + usage_.FetchSubRelaxed(h.total_charge); + occupancy_.FetchSubRelaxed(1U); + was_populated[i] = true; + if (!HandleImpl::IsEnd(h.chain_next_with_shift.LoadRelaxed())) { + assert((h.chain_next_with_shift.LoadRelaxed() & + HandleImpl::kHeadLocked) == 0); + size_t next = + GetNextFromNextWithShift(h.chain_next_with_shift.LoadRelaxed()); + assert(!was_pointed_to[next]); + was_pointed_to[next] = true; + } +#endif + break; + // otherwise + default: + assert(false); + break; + } +#ifndef NDEBUG // Extra invariant checking + if (!HandleImpl::IsEnd(h.head_next_with_shift.LoadRelaxed())) { + size_t next = + GetNextFromNextWithShift(h.head_next_with_shift.LoadRelaxed()); + assert(!was_pointed_to[next]); + was_pointed_to[next] = true; + } +#endif + } +#ifndef NDEBUG // Extra invariant checking + // This check is not perfect, but should detect most reasonable cases + // of abandonned or floating entries, etc. (A floating cycle would not + // be reported as bad.) 
+ for (size_t i = 0; i < used_end; i++) { + if (was_populated[i]) { + assert(was_pointed_to[i]); + } else { + assert(!was_pointed_to[i]); + } + } +#endif + + // Metadata charging only follows the published table size + assert(usage_.LoadRelaxed() == 0 || + usage_.LoadRelaxed() == GetTableSize() * sizeof(HandleImpl)); + assert(occupancy_.LoadRelaxed() == 0); +} + +size_t AutoHyperClockTable::GetTableSize() const { + return LengthInfoToUsedLength(length_info_.Load()); +} + +size_t AutoHyperClockTable::GetOccupancyLimit() const { + return occupancy_limit_.LoadRelaxed(); +} + +void AutoHyperClockTable::StartInsert(InsertState& state) { + state.saved_length_info = length_info_.Load(); +} + +// Because we have linked lists, bugs or even hardware errors can make it +// possible to create a cycle, which would lead to infinite loop. +// Furthermore, when we have retry cases in the code, we want to be sure +// these are not (and do not become) spin-wait loops. Given the assumption +// of quality hashing and the infeasibility of consistently recurring +// concurrent modifications to an entry or chain, we can safely bound the +// number of loop iterations in feasible operation, whether following chain +// pointers or retrying with some backtracking. A smaller limit is used for +// stress testing, to detect potential issues such as cycles or spin-waits, +// and a larger limit is used to break cycles should they occur in production. +#define CHECK_TOO_MANY_ITERATIONS(i) \ + { \ + assert(i < 768); \ + if (UNLIKELY(i >= 4096)) { \ + std::terminate(); \ + } \ + } + +bool AutoHyperClockTable::GrowIfNeeded(size_t new_occupancy, + InsertState& state) { + // new_occupancy has taken into account other threads that are also trying + // to insert, so as soon as we see sufficient *published* usable size, we + // can declare success even if we aren't the one that grows the table. + // However, there's an awkward state where other threads own growing the + // table to sufficient usable size, but the udpated size is not yet + // published. If we wait, then that likely slows the ramp-up cache + // performance. If we unblock ourselves by ensuring we grow by at least one + // slot, we could technically overshoot required size by number of parallel + // threads accessing block cache. On balance considering typical cases and + // the modest consequences of table being slightly too large, the latter + // seems preferable. + // + // So if the published occupancy limit is too small, we unblock ourselves + // by committing to growing the table by at least one slot. Also note that + // we might need to grow more than once to actually increase the occupancy + // limit (due to max load factor < 1.0) + + while (UNLIKELY(new_occupancy > occupancy_limit_.LoadRelaxed())) { + // At this point we commit the thread to growing unless we've reached the + // limit (returns false). + if (!Grow(state)) { + return false; + } + } + // Success (didn't need to grow, or did successfully) + return true; +} + +bool AutoHyperClockTable::Grow(InsertState& state) { + // Allocate the next grow slot + size_t grow_home = grow_frontier_.FetchAddRelaxed(1); + if (grow_home >= array_.Count()) { + // Can't grow any more. 
+ // (Tested by unit test ClockCacheTest/Limits) + // Make sure we don't overflow grow_frontier_ by reaching here repeatedly + grow_frontier_.StoreRelaxed(array_.Count()); + return false; + } +#ifdef COERCE_CONTEXT_SWITCH + // This is useful in reproducing concurrency issues in Grow() + while (Random::GetTLSInstance()->OneIn(2)) { + std::this_thread::yield(); + } +#endif + // Basically, to implement https://en.wikipedia.org/wiki/Linear_hashing + // entries that belong in a new chain starting at grow_home will be + // split off from the chain starting at old_home, which is computed here. + int old_shift = FloorLog2(grow_home); + size_t old_home = BottomNBits(grow_home, old_shift); + assert(old_home + (size_t{1} << old_shift) == grow_home); + + // Wait here to ensure any Grow operations that would directly feed into + // this one are finished, though the full waiting actually completes in + // acquiring the rewrite lock for old_home in SplitForGrow. Here we ensure + // the expected shift amount has been reached, and there we ensure the + // chain rewrite lock has been released. + size_t old_old_home = BottomNBits(grow_home, old_shift - 1); + for (;;) { + uint64_t old_old_head = array_[old_old_home].head_next_with_shift.Load(); + if (GetShiftFromNextWithShift(old_old_head) >= old_shift) { + if ((old_old_head & HandleImpl::kNextEndFlags) != + HandleImpl::kHeadLocked) { + break; + } + } + // NOTE: one of the few yield-wait loops, which is rare enough in practice + // for its performance to be insignificant. + yield_count_.FetchAddRelaxed(1); + std::this_thread::yield(); + } + + // Do the dirty work of splitting the chain, including updating heads and + // chain nexts for new shift amounts. + SplitForGrow(grow_home, old_home, old_shift); + + // length_info_ can be updated any time after the new shift amount is + // published to both heads, potentially before the end of SplitForGrow. + // But we also can't update length_info_ until the previous Grow operation + // (with grow_home := this grow_home - 1) has published the new shift amount + // to both of its heads. However, we don't want to artificially wait here + // on that Grow that is otherwise irrelevant. + // + // We could have each Grow operation advance length_info_ here as far as it + // can without waiting, by checking for updated shift on the corresponding + // old home and also stopping at an empty head value for possible grow_home. + // However, this could increase CPU cache line sharing and in 1/64 cases + // bring in an extra page from our mmap. + // + // Instead, part of the strategy is delegated to DoInsert(): + // * Here we try to bring length_info_ up to date with this grow_home as + // much as we can without waiting. It will fall short if a previous Grow + // is still between reserving the grow slot and making the first big step + // to publish the new shift amount. + // * To avoid length_info_ being perpetually out-of-date (for a small number + // of heads) after our last Grow, we do the same when Insert has to "fall + // forward" due to length_info_ being out-of-date. 
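// The split targets computed near the top of Grow() (old_shift =
// FloorLog2(grow_home), old_home = BottomNBits(grow_home, old_shift)) can be
// illustrated standalone: each newly opened slot takes the entries with the
// extra hash bit set from the home whose index is grow_home with its top set
// bit cleared. Sketch only, using a GCC/Clang builtin in place of FloorLog2.
#include <cstdint>
#include <cstdio>

int main() {
  for (uint64_t grow_home = 2; grow_home < 10; ++grow_home) {
    int old_shift = 63 - __builtin_clzll(grow_home);  // FloorLog2(grow_home)
    uint64_t old_home =
        grow_home & ((uint64_t{1} << old_shift) - 1);  // BottomNBits
    std::printf("grow_home=%llu splits old_home=%llu (shift %d -> %d)\n",
                (unsigned long long)grow_home, (unsigned long long)old_home,
                old_shift, old_shift + 1);
  }
  return 0;
}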
+ CatchUpLengthInfoNoWait(grow_home); + + // See usage in DoInsert() + state.likely_empty_slot = grow_home; + + // Success + return true; +} + +// See call in Grow() +void AutoHyperClockTable::CatchUpLengthInfoNoWait( + size_t known_usable_grow_home) { + uint64_t current_length_info = length_info_.Load(); + size_t published_usable_size = LengthInfoToUsedLength(current_length_info); + while (published_usable_size <= known_usable_grow_home) { + // For when published_usable_size was grow_home + size_t next_usable_size = published_usable_size + 1; + uint64_t next_length_info = UsedLengthToLengthInfo(next_usable_size); + + // known_usable_grow_home is known to be ready for Lookup/Insert with + // the new shift amount, but between that and published usable size, we + // need to check. + if (published_usable_size < known_usable_grow_home) { + int old_shift = FloorLog2(next_usable_size - 1); + size_t old_home = BottomNBits(published_usable_size, old_shift); + int shift = GetShiftFromNextWithShift( + array_[old_home].head_next_with_shift.Load()); + if (shift <= old_shift) { + // Not ready + break; + } + } + // CAS update length_info_. This only moves in one direction, so if CAS + // fails, someone else made progress like we are trying, and we can just + // pick up the new value and keep going as appropriate. + if (length_info_.CasStrong(current_length_info, next_length_info)) { + current_length_info = next_length_info; + // Update usage_ if metadata charge policy calls for it + if (metadata_charge_policy_ == + CacheMetadataChargePolicy::kFullChargeCacheMetadata) { + // NOTE: ignoring page boundaries for simplicity + usage_.FetchAddRelaxed(sizeof(HandleImpl)); + } + } + published_usable_size = LengthInfoToUsedLength(current_length_info); + } + + // After updating lengh_info_ we can update occupancy_limit_, + // allowing for later operations to update it before us. + // Note: there is no AcqRelAtomic max operation, so we have to use a CAS loop + size_t old_occupancy_limit = occupancy_limit_.LoadRelaxed(); + size_t new_occupancy_limit = CalcOccupancyLimit(published_usable_size); + while (old_occupancy_limit < new_occupancy_limit) { + if (occupancy_limit_.CasWeakRelaxed(old_occupancy_limit, + new_occupancy_limit)) { + break; + } + } +} + +void AutoHyperClockTable::SplitForGrow(size_t grow_home, size_t old_home, + int old_shift) { + int new_shift = old_shift + 1; + HandleImpl* const arr = array_.Get(); + + // We implement a somewhat complicated splitting algorithm to ensure that + // entries are always wait-free visible to Lookup, without Lookup needing + // to double-check length_info_ to ensure every potentially relevant + // existing entry is seen. This works step-by-step, carefully sharing + // unmigrated parts of the chain between the source chain and the new + // destination chain. This means that Lookup might see a partially migrated + // chain so has to take that into consideration when checking that it hasn't + // "jumped off" its intended chain (due to a parallel modification to an + // "under (de)construction" entry that was found on the chain but has + // been reassigned). + // + // We use a "rewrite lock" on the source and desination chains to exclude + // removals from those, and we have a prior waiting step that ensures any Grow + // operations feeding into this one have completed. But this process does have + // to gracefully handle concurrent insertions to the head of the source chain, + // and once marked ready, the destination chain. 
+ // + // With those considerations, the migration starts with one "big step," + // potentially with retries to deal with insertions in parallel. Part of the + // big step is to mark the two chain heads as updated with the new shift + // amount, which redirects Lookups to the appropriate new chain. + // + // After that big step that updates the heads, the rewrite lock makes it + // relatively easy to deal with the rest of the migration. Big + // simplifications come from being able to read the hashed_key of each + // entry on the chain without needing to hold a read reference, and + // from never "jumping our to another chain." Concurrent insertions only + // happen at the chain head, which is outside of what is left to migrate. + // + // A series of smaller steps finishes splitting apart the existing chain into + // two distinct chains, followed by some steps to fully commit the result. + // + // Except for trivial cases in which all entries (or remaining entries) + // on the input chain go to one output chain, there is an important invariant + // after each step of migration, including after the initial "big step": + // For each output chain, the "zero chain" (new hash bit is zero) and the + // "one chain" (new hash bit is one) we have a "frontier" entry marking the + // boundary between what has been migrated and what has not. One of the + // frontiers is along the old chain after the other, and all entries between + // them are for the same target chain as the earlier frontier. Thus, the + // chains share linked list tails starting at the latter frontier. All + // pointers from the new head locations to the frontier entries are marked + // with the new shift amount, while all pointers after the frontiers use the + // old shift amount. + // + // And after each step there is a strengthening step to reach a stronger + // invariant: the frontier earlier in the original chain is advanced to be + // immediately before the other frontier. + // + // Consider this original input chain, + // + // OldHome -Old-> A0 -Old-> B0 -Old-> A1 -Old-> C0 -Old-> OldHome(End) + // GrowHome (empty) + // + // == BIG STEP == + // The initial big step finds the first entry that will be on the each + // output chain (in this case A0 and A1). We use brackets ([]) to mark them + // as our prospective frontiers. + // + // OldHome -Old-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End) + // GrowHome (empty) + // + // Next we speculatively update grow_home head to point to the first entry for + // the one chain. This will not be used by Lookup until the head at old_home + // uses the new shift amount. + // + // OldHome -Old-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End) + // GrowHome --------------New------------/ + // + // Observe that if Lookup were to use the new head at GrowHome, it would be + // able to find all relevant entries. Finishing the initial big step + // requires a CAS (compare_exchange) of the OldHome head because there + // might have been parallel insertions there, in which case we roll back + // and try again. (We might need to point GrowHome head differently.) + // + // OldHome -New-> [A0] -Old-> B0 -Old-> [A1] -Old-> C0 -Old-> OldHome(End) + // GrowHome --------------New------------/ + // + // Upgrading the OldHome head pointer with the new shift amount, with a + // compare_exchange, completes the initial big step, with [A0] as zero + // chain frontier and [A1] as one chain frontier. 
Links before the frontiers + // use the new shift amount and links after use the old shift amount. + // == END BIG STEP== + // == STRENGTHENING == + // Zero chain frontier is advanced to [B0] (immediately before other + // frontier) by updating pointers with new shift amounts. + // + // OldHome -New-> A0 -New-> [B0] -Old-> [A1] -Old-> C0 -Old-> OldHome(End) + // GrowHome -------------New-----------/ + // + // == END STRENGTHENING == + // == SMALL STEP #1 == + // From the strong invariant state, we need to find the next entry for + // the new chain with the earlier frontier. In this case, we need to find + // the next entry for the zero chain that comes after [B0], which in this + // case is C0. This will be our next zero chain frontier, at least under + // the weak invariant. To get there, we simply update the link between + // the current two frontiers to skip over the entries irreleveant to the + // ealier frontier chain. In this case, the zero chain skips over A1. As a + // result, he other chain is now the "earlier." + // + // OldHome -New-> A0 -New-> B0 -New-> [C0] -Old-> OldHome(End) + // GrowHome -New-> [A1] ------Old-----/ + // + // == END SMALL STEP #1 == + // + // Repeating the cycle and end handling is not as interesting. + + // Acquire rewrite lock on zero chain (if it's non-empty) + ChainRewriteLock zero_head_lock(&arr[old_home], yield_count_); + + // Used for locking the one chain below + uint64_t saved_one_head; + // One head has not been written to + assert(arr[grow_home].head_next_with_shift.Load() == 0); + + // old_home will also the head of the new "zero chain" -- all entries in the + // "from" chain whose next hash bit is 0. grow_home will be head of the new + // "one chain". + + // For these, SIZE_MAX is like nullptr (unknown) + size_t zero_chain_frontier = SIZE_MAX; + size_t one_chain_frontier = SIZE_MAX; + size_t cur = SIZE_MAX; + + // Set to 0 (zero chain frontier earlier), 1 (one chain), or -1 (unknown) + int chain_frontier_first = -1; + + // Might need to retry initial update of heads + for (int i = 0;; ++i) { + CHECK_TOO_MANY_ITERATIONS(i); + assert(zero_chain_frontier == SIZE_MAX); + assert(one_chain_frontier == SIZE_MAX); + assert(cur == SIZE_MAX); + assert(chain_frontier_first == -1); + + uint64_t next_with_shift = zero_head_lock.GetSavedHead(); + + // Find a single representative for each target chain, or scan the whole + // chain if some target chain has no representative. 
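// A deliberately single-threaded sketch of the end state the migration above
// reaches: entries of one chain are partitioned by the new hash bit into a
// "zero chain" (bit clear, stays at old_home) and a "one chain" (bit set,
// moves to grow_home). All of the frontier bookkeeping in SplitForGrow exists
// to reach this same partition while Lookup keeps running concurrently;
// nothing here is RocksDB code, and the node layout is invented.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Node {
  uint64_t hash;
  Node* next = nullptr;
};

int main() {
  // Chain at old_home = 1 with old_shift = 2; new_shift = 3 splits on bit 2.
  std::vector<Node> nodes = {{0x5}, {0x9}, {0xD}, {0x11}};
  for (size_t i = 0; i + 1 < nodes.size(); ++i) nodes[i].next = &nodes[i + 1];

  Node* zero_chain = nullptr;  // stays at old_home (new bit == 0)
  Node* one_chain = nullptr;   // moves to grow_home (new bit == 1)
  for (Node* n = &nodes[0]; n != nullptr;) {
    Node* next = n->next;
    Node** dst = (n->hash & 0x4) ? &one_chain : &zero_chain;
    n->next = *dst;  // push front, like insertion at the chain head
    *dst = n;
    n = next;
  }
  for (Node* n = zero_chain; n; n = n->next)
    std::printf("zero chain: %llx\n", (unsigned long long)n->hash);
  for (Node* n = one_chain; n; n = n->next)
    std::printf("one chain:  %llx\n", (unsigned long long)n->hash);
  return 0;
}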
+ for (;; ++i) { + CHECK_TOO_MANY_ITERATIONS(i); + + // Loop invariants + assert((chain_frontier_first < 0) == (zero_chain_frontier == SIZE_MAX && + one_chain_frontier == SIZE_MAX)); + assert((cur == SIZE_MAX) == (zero_chain_frontier == SIZE_MAX && + one_chain_frontier == SIZE_MAX)); + + assert(GetShiftFromNextWithShift(next_with_shift) == old_shift); + + // Check for end of original chain + if (HandleImpl::IsEnd(next_with_shift)) { + cur = SIZE_MAX; + break; + } + + // next_with_shift is not End + cur = GetNextFromNextWithShift(next_with_shift); + + if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) { + // Entry for zero chain + if (zero_chain_frontier == SIZE_MAX) { + zero_chain_frontier = cur; + if (one_chain_frontier != SIZE_MAX) { + // Ready to update heads + break; + } + // Nothing yet for one chain + chain_frontier_first = 0; + } + } else { + assert(BottomNBits(arr[cur].hashed_key[1], new_shift) == grow_home); + // Entry for one chain + if (one_chain_frontier == SIZE_MAX) { + one_chain_frontier = cur; + if (zero_chain_frontier != SIZE_MAX) { + // Ready to update heads + break; + } + // Nothing yet for zero chain + chain_frontier_first = 1; + } + } + + next_with_shift = arr[cur].chain_next_with_shift.Load(); + } + + // Try to update heads for initial migration info + // We only reached the end of the migrate-from chain already if one of the + // target chains will be empty. + assert((cur == SIZE_MAX) == + (zero_chain_frontier == SIZE_MAX || one_chain_frontier == SIZE_MAX)); + assert((chain_frontier_first < 0) == + (zero_chain_frontier == SIZE_MAX && one_chain_frontier == SIZE_MAX)); + + // Always update one chain's head first (safe), and mark it as locked + saved_one_head = HandleImpl::kHeadLocked | + (one_chain_frontier != SIZE_MAX + ? MakeNextWithShift(one_chain_frontier, new_shift) + : MakeNextWithShiftEnd(grow_home, new_shift)); + arr[grow_home].head_next_with_shift.Store(saved_one_head); + + // Make sure length_info_ hasn't been updated too early, as we're about + // to make the change that makes it safe to update (e.g. in DoInsert()) + assert(LengthInfoToUsedLength(length_info_.Load()) <= grow_home); + + // Try to set zero's head. + if (zero_head_lock.CasUpdate( + zero_chain_frontier != SIZE_MAX + ? MakeNextWithShift(zero_chain_frontier, new_shift) + : MakeNextWithShiftEnd(old_home, new_shift), + yield_count_)) { + // Both heads successfully updated to new shift + break; + } else { + // Concurrent insertion. This should not happen too many times. + CHECK_TOO_MANY_ITERATIONS(i); + // The easiest solution is to restart. + zero_chain_frontier = SIZE_MAX; + one_chain_frontier = SIZE_MAX; + cur = SIZE_MAX; + chain_frontier_first = -1; + continue; + } + } + + // Create an RAII wrapper for the one chain rewrite lock we are already + // holding (if was not end) and is now "published" after successful CAS on + // zero chain head. + ChainRewriteLock one_head_lock(&arr[grow_home], yield_count_, saved_one_head); + + // Except for trivial cases, we have something like + // AHome -New-> [A0] -Old-> [B0] -Old-> [C0] \ | + // BHome --------------------New------------> [A1] -Old-> ... + // And we need to upgrade as much as we can on the "first" chain + // (the one eventually pointing to the other's frontier). This will + // also finish off any case in which one of the target chains will be empty. + if (chain_frontier_first >= 0) { + size_t& first_frontier = chain_frontier_first == 0 + ? 
/*&*/ zero_chain_frontier + : /*&*/ one_chain_frontier; + size_t& other_frontier = chain_frontier_first != 0 + ? /*&*/ zero_chain_frontier + : /*&*/ one_chain_frontier; + uint64_t stop_before_or_new_tail = + other_frontier != SIZE_MAX + ? /*stop before*/ MakeNextWithShift(other_frontier, old_shift) + : /*new tail*/ MakeNextWithShiftEnd( + chain_frontier_first == 0 ? old_home : grow_home, new_shift); + UpgradeShiftsOnRange(arr, first_frontier, stop_before_or_new_tail, + old_shift, new_shift); + } + + if (zero_chain_frontier == SIZE_MAX) { + // Already finished migrating + assert(one_chain_frontier == SIZE_MAX); + assert(cur == SIZE_MAX); + } else { + // Still need to migrate between two target chains + for (int i = 0;; ++i) { + CHECK_TOO_MANY_ITERATIONS(i); + // Overall loop invariants + assert(zero_chain_frontier != SIZE_MAX); + assert(one_chain_frontier != SIZE_MAX); + assert(cur != SIZE_MAX); + assert(chain_frontier_first >= 0); + size_t& first_frontier = chain_frontier_first == 0 + ? /*&*/ zero_chain_frontier + : /*&*/ one_chain_frontier; + size_t& other_frontier = chain_frontier_first != 0 + ? /*&*/ zero_chain_frontier + : /*&*/ one_chain_frontier; + assert(cur != first_frontier); + assert(GetNextFromNextWithShift( + arr[first_frontier].chain_next_with_shift.Load()) == + other_frontier); + + uint64_t next_with_shift = arr[cur].chain_next_with_shift.Load(); + + // Check for end of original chain + if (HandleImpl::IsEnd(next_with_shift)) { + // Can set upgraded tail on first chain + uint64_t first_new_tail = MakeNextWithShiftEnd( + chain_frontier_first == 0 ? old_home : grow_home, new_shift); + arr[first_frontier].chain_next_with_shift.Store(first_new_tail); + // And upgrade remainder of other chain + uint64_t other_new_tail = MakeNextWithShiftEnd( + chain_frontier_first != 0 ? old_home : grow_home, new_shift); + UpgradeShiftsOnRange(arr, other_frontier, other_new_tail, old_shift, + new_shift); + assert(other_frontier == SIZE_MAX); // Finished + break; + } + + // next_with_shift is not End + cur = GetNextFromNextWithShift(next_with_shift); + + int target_chain; + if (BottomNBits(arr[cur].hashed_key[1], new_shift) == old_home) { + // Entry for zero chain + target_chain = 0; + } else { + assert(BottomNBits(arr[cur].hashed_key[1], new_shift) == grow_home); + // Entry for one chain + target_chain = 1; + } + if (target_chain == chain_frontier_first) { + // Found next entry to skip to on the first chain + uint64_t skip_to = MakeNextWithShift(cur, new_shift); + arr[first_frontier].chain_next_with_shift.Store(skip_to); + first_frontier = cur; + // Upgrade other chain up to entry before that one + UpgradeShiftsOnRange(arr, other_frontier, next_with_shift, old_shift, + new_shift); + // Swap which is marked as first + chain_frontier_first = 1 - chain_frontier_first; + } else { + // Nothing to do yet, as we need to keep old generation pointers in + // place for lookups + } + } + } +} + +// Variant of PurgeImplLocked: Removes all "under (de) construction" entries +// from a chain where already holding a rewrite lock +using PurgeLockedOpData = void; +// Variant of PurgeImplLocked: Clock-updates all entries in a chain, in +// addition to functionality of PurgeLocked, where already holding a rewrite +// lock. (Caller finalizes eviction on entries added to the autovector, in part +// so that we don't hold the rewrite lock while doing potentially expensive +// callback and allocator free.) 
+using ClockUpdateChainLockedOpData = + autovector; + +template +void AutoHyperClockTable::PurgeImplLocked(OpData* op_data, + ChainRewriteLock& rewrite_lock, + size_t home) { + constexpr bool kIsPurge = std::is_same_v; + constexpr bool kIsClockUpdateChain = + std::is_same_v; + + // Exactly one op specified + static_assert(kIsPurge + kIsClockUpdateChain == 1); + + HandleImpl* const arr = array_.Get(); + + uint64_t next_with_shift = rewrite_lock.GetSavedHead(); + assert(!HandleImpl::IsEnd(next_with_shift)); + int home_shift = GetShiftFromNextWithShift(next_with_shift); + (void)home; + (void)home_shift; + size_t next = GetNextFromNextWithShift(next_with_shift); + assert(next < array_.Count()); + HandleImpl* h = &arr[next]; + HandleImpl* prev_to_keep = nullptr; +#ifndef NDEBUG + uint64_t prev_to_keep_next_with_shift = 0; +#endif + // Whether there are entries between h and prev_to_keep that should be + // purged from the chain. + bool pending_purge = false; + + // Walk the chain, and stitch together any entries that are still + // "shareable," possibly after clock update. prev_to_keep tells us where + // the last "stitch back to" location is (nullptr => head). + for (size_t i = 0;; ++i) { + CHECK_TOO_MANY_ITERATIONS(i); + + bool purgeable = false; + // In last iteration, h will be nullptr, to stitch together the tail of + // the chain. + if (h) { + // NOTE: holding a rewrite lock on the chain prevents any "under + // (de)construction" entries in the chain from being marked empty, which + // allows us to access the hashed_keys without holding a read ref. + assert(home == BottomNBits(h->hashed_key[1], home_shift)); + if constexpr (kIsClockUpdateChain) { + // Clock update and/or check for purgeable (under (de)construction) + if (ClockUpdate(*h, &purgeable)) { + // Remember for finishing eviction + op_data->push_back(h); + // Entries for eviction become purgeable + purgeable = true; + assert((h->meta.Load() >> ClockHandle::kStateShift) == + ClockHandle::kStateConstruction); + } + } else { + (void)op_data; + purgeable = ((h->meta.Load() >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit) == 0; + } + } + + if (purgeable) { + assert((h->meta.Load() >> ClockHandle::kStateShift) == + ClockHandle::kStateConstruction); + pending_purge = true; + } else if (pending_purge) { + if (prev_to_keep) { + // Update chain next to skip purgeable entries + assert(prev_to_keep->chain_next_with_shift.Load() == + prev_to_keep_next_with_shift); + prev_to_keep->chain_next_with_shift.Store(next_with_shift); + } else if (rewrite_lock.CasUpdate(next_with_shift, yield_count_)) { + // Managed to update head without any parallel insertions + } else { + // Parallel insertion must have interfered. Need to do a purge + // from updated head to here. Since we have no prev_to_keep, there's + // no risk of duplicate clock updates to entries. Any entries already + // updated must have been evicted (purgeable) and it's OK to clock + // update any new entries just inserted in parallel. + // Can simply restart (GetSavedHead() already updated from CAS failure). 
+ next_with_shift = rewrite_lock.GetSavedHead(); + assert(!HandleImpl::IsEnd(next_with_shift)); + next = GetNextFromNextWithShift(next_with_shift); + assert(next < array_.Count()); + h = &arr[next]; + pending_purge = false; + assert(prev_to_keep == nullptr); + assert(GetShiftFromNextWithShift(next_with_shift) == home_shift); + continue; + } + pending_purge = false; + prev_to_keep = h; + } else { + prev_to_keep = h; + } + + if (h == nullptr) { + // Reached end of the chain + return; + } + + // Read chain pointer + next_with_shift = h->chain_next_with_shift.Load(); +#ifndef NDEBUG + if (prev_to_keep == h) { + prev_to_keep_next_with_shift = next_with_shift; + } +#endif + + assert(GetShiftFromNextWithShift(next_with_shift) == home_shift); + + // Check for end marker + if (HandleImpl::IsEnd(next_with_shift)) { + h = nullptr; + } else { + next = GetNextFromNextWithShift(next_with_shift); + assert(next < array_.Count()); + h = &arr[next]; + assert(h != prev_to_keep); + } + } +} + +// Variant of PurgeImpl: Removes all "under (de) construction" entries in a +// chain, such that any entry with the given key must have been purged. +using PurgeOpData = const UniqueId64x2; +// Variant of PurgeImpl: Clock-updates all entries in a chain, in addition to +// purging as appropriate. (Caller finalizes eviction on entries added to the +// autovector, in part so that we don't hold the rewrite lock while doing +// potentially expensive callback and allocator free.) +using ClockUpdateChainOpData = ClockUpdateChainLockedOpData; + +template +void AutoHyperClockTable::PurgeImpl(OpData* op_data, size_t home) { + // Early efforts to make AutoHCC fully wait-free ran into too many problems + // that needed obscure and potentially inefficient work-arounds to have a + // chance at working. + // + // The implementation settled on "essentially wait-free" which can be + // achieved by locking at the level of each probing chain and only for + // operations that might remove entries from the chain. Because parallel + // clock updates and Grow operations are ordered, contention is very rare. + // However, parallel insertions at any chain head have to be accommodated + // to keep them wait-free. + // + // This function implements Purge and ClockUpdateChain functions (see above + // OpData type definitions) as part of higher-level operations. This function + // ensures the correct chain is (eventually) covered and handles rewrite + // locking the chain. PurgeImplLocked has lower level details. + // + // In general, these operations and Grow are kept simpler by allowing eager + // purging of under (de-)construction entries. For example, an Erase + // operation might find that another thread has purged the entry from the + // chain by the time its own purge operation acquires the rewrite lock and + // proceeds. This is OK, and potentially reduces the number of lock/unlock + // cycles because empty chains are not rewrite-lockable. 
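// Minimal sketch of the compile-time "OpData" dispatch that PurgeImpl and
// PurgeImplLocked use above: one template body serves two operations,
// selected by the pointer type passed in. The stand-in types and names here
// are illustrative, not the RocksDB ones.
#include <cstdio>
#include <type_traits>
#include <vector>

using PurgeOp = const int;               // stand-in for "const UniqueId64x2"
using ClockUpdateOp = std::vector<int>;  // stand-in for the eviction autovector

template <class OpData>
void DoChainOp(OpData* op_data) {
  constexpr bool kIsPurge = std::is_same_v<OpData, PurgeOp>;
  constexpr bool kIsClockUpdate = std::is_same_v<OpData, ClockUpdateOp>;
  static_assert(kIsPurge + kIsClockUpdate == 1, "exactly one op");
  if constexpr (kIsPurge) {
    std::printf("purge entries matching key %d\n", *op_data);
  } else {
    op_data->push_back(42);  // pretend we collected an entry to evict later
    std::printf("clock-updated chain, %zu pending evictions\n",
                op_data->size());
  }
}

int main() {
  const int key = 7;
  DoChainOp(&key);       // instantiates the purge variant
  ClockUpdateOp pending;
  DoChainOp(&pending);   // instantiates the clock-update variant
  return 0;
}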
+ + constexpr bool kIsPurge = std::is_same_v; + constexpr bool kIsClockUpdateChain = + std::is_same_v; + + // Exactly one op specified + static_assert(kIsPurge + kIsClockUpdateChain == 1); + + int home_shift = 0; + if constexpr (kIsPurge) { + // Purge callers leave home unspecified, to be determined from key + assert(home == SIZE_MAX); + GetHomeIndexAndShift(length_info_.Load(), (*op_data)[1], &home, + &home_shift); + assert(home_shift > 0); + } else { + assert(kIsClockUpdateChain); + // Evict callers must specify home + assert(home < SIZE_MAX); + } + + HandleImpl* const arr = array_.Get(); + + // Acquire the RAII rewrite lock (if not an empty chain) + ChainRewriteLock rewrite_lock(&arr[home], yield_count_); + + if constexpr (kIsPurge) { + // Ensure we are at the correct home for the shift in effect for the + // chain head. + for (;;) { + int shift = GetShiftFromNextWithShift(rewrite_lock.GetSavedHead()); + + if (shift > home_shift) { + // Found a newer shift at candidate head, which must apply to us. + // Newer shift might not yet be reflected in length_info_ (an atomicity + // gap in Grow), so operate as if it is. Note that other insertions + // could happen using this shift before length_info_ is updated, and + // it's possible (though unlikely) that multiple generations of Grow + // have occurred. If shift is more than one generation ahead of + // home_shift, it's possible that not all descendent homes have + // reached the `shift` generation. Thus, we need to advance only one + // shift at a time looking for a home+head with a matching shift + // amount. + home_shift++; + home = GetHomeIndex((*op_data)[1], home_shift); + rewrite_lock.Reset(&arr[home], yield_count_); + continue; + } else { + assert(shift == home_shift); + } + break; + } + } + + // If the chain is empty, nothing to do + if (!rewrite_lock.IsEnd()) { + if constexpr (kIsPurge) { + PurgeLockedOpData* locked_op_data{}; + PurgeImplLocked(locked_op_data, rewrite_lock, home); + } else { + PurgeImplLocked(op_data, rewrite_lock, home); + } + } +} + +AutoHyperClockTable::HandleImpl* AutoHyperClockTable::DoInsert( + const ClockHandleBasicData& proto, uint64_t initial_countdown, + bool take_ref, InsertState& state) { + size_t home; + int orig_home_shift; + GetHomeIndexAndShift(state.saved_length_info, proto.hashed_key[1], &home, + &orig_home_shift); + HandleImpl* const arr = array_.Get(); + + // We could go searching through the chain for any duplicate, but that's + // not typically helpful, except for the REDUNDANT block cache stats. + // (Inferior duplicates will age out with eviction.) However, we do skip + // insertion if the home slot (or some other we happen to probe) already + // has a match (already_matches below). This helps to keep better locality + // when we can. + // + // And we can do that as part of searching for an available slot to + // insert the new entry, because our preferred location and first slot + // checked will be the home slot. + // + // As the table initially grows to size, few entries will be in the same + // cache line as the chain head. However, churn in the cache relatively + // quickly improves the proportion of entries sharing that cache line with + // the chain head. 
Data: + // + // Initial population only: (cache_bench with -ops_per_thread=1) + // Entries at home count: 29,202 (out of 129,170 entries in 94,411 chains) + // Approximate average cache lines read to find an existing entry: + // 129.2 / 94.4 [without the heads] + // + (94.4 - 29.2) / 94.4 [the heads not included with entries] + // = 2.06 cache lines + // + // After 10 million ops: (-threads=10 -ops_per_thread=100000) + // Entries at home count: 67,556 (out of 129,359 entries in 94,756 chains) + // That's a majority of entries and more than 2/3rds of chains. + // Approximate average cache lines read to find an existing entry: + // = 1.65 cache lines + + // Even if we aren't saving a ref to this entry (take_ref == false), we need + // to keep a reference while we are inserting the entry into a chain, so that + // it is not erased by another thread while trying to insert it on the chain. + constexpr bool initial_take_ref = true; + + size_t used_length = LengthInfoToUsedLength(state.saved_length_info); + assert(home < used_length); + + size_t idx = home; + bool already_matches = false; + bool already_matches_ignore = false; + if (TryInsert(proto, arr[idx], initial_countdown, initial_take_ref, + &already_matches)) { + assert(idx == home); + } else if (already_matches) { + return nullptr; + // Here we try to populate newly-opened slots in the table, but not + // when we can add something to its home slot. This makes the structure + // more performant more quickly on (initial) growth. We ignore "already + // matches" in this case because it is unlikely and difficult to + // incorporate logic for here cleanly and efficiently. + } else if (UNLIKELY(state.likely_empty_slot > 0) && + TryInsert(proto, arr[state.likely_empty_slot], initial_countdown, + initial_take_ref, &already_matches_ignore)) { + idx = state.likely_empty_slot; + } else { + // We need to search for an available slot outside of the home. + // Linear hashing provides nice resizing but does typically mean + // that some heads (home locations) have (in expectation) twice as + // many entries mapped to them as other heads. For example if the + // usable length is 80, then heads 16-63 are (in expectation) twice + // as loaded as heads 0-15 and 64-79, which are using another hash bit. + // + // This means that if we just use linear probing (by a small constant) + // to find an available slot, part of the structure could easily fill up + // and resort to linear time operations even when the overall load factor + // is only modestly high, like 70%. Even though each slot has its own CPU + // cache line, there appears to be a small locality benefit (e.g. TLB and + // paging) to iterating one by one, as long as we don't afoul of the + // linear hashing imbalance. + // + // In a traditional non-concurrent structure, we could keep a "free list" + // to ensure immediate access to an available slot, but maintaining such + // a structure could require more cross-thread coordination to ensure + // all entries are eventually available to all threads. + // + // The way we solve this problem is to use unit-increment linear probing + // with a small bound, and then fall back on big jumps to have a good + // chance of finding a slot in an under-populated region quickly if that + // doesn't work. 
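// Standalone sketch of the fallback probe sequence described above: a
// per-key increment derived from a golden-ratio constant, mapped into
// [0, used_length) with a 128-bit multiply-high, then added with a cheap
// wrap-around instead of '%'. MulHi here is only a stand-in for the diff's
// FastRange64 helper and relies on a GCC/Clang extension; the example hash
// and table size are arbitrary.
#include <cstdint>
#include <cstdio>

static uint64_t MulHi(uint64_t a, uint64_t b) {
  return static_cast<uint64_t>((static_cast<unsigned __int128>(a) * b) >> 64);
}

int main() {
  const uint64_t hashed_key_hi = 0x123456789ABCDEF0ULL;  // example hash bits
  const uint64_t used_length = 80;
  uint64_t incr_ratio = 0x9E3779B185EBCA87ULL + (hashed_key_hi >> 6);
  uint64_t incr = MulHi(incr_ratio, used_length);  // in [0, used_length)
  uint64_t idx = 17;                               // pretend home slot
  std::printf("incr=%llu\n", (unsigned long long)incr);
  for (int i = 0; i < 8; ++i) {
    idx += incr;
    if (idx >= used_length) idx -= used_length;  // wrap around (faster than %)
    std::printf("probe %d -> slot %llu\n", i, (unsigned long long)idx);
  }
  return 0;
}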
+ size_t i = 0; + constexpr size_t kMaxLinearProbe = 4; + for (; i < kMaxLinearProbe; i++) { + idx++; + if (idx >= used_length) { + idx -= used_length; + } + if (TryInsert(proto, arr[idx], initial_countdown, initial_take_ref, + &already_matches)) { + break; + } + if (already_matches) { + return nullptr; + } + } + if (i == kMaxLinearProbe) { + // Keep searching, but change to a search method that should quickly + // find any under-populated region. Switching to an increment based + // on the golden ratio helps with that, but we also inject some minor + // variation (less than 2%, 1 in 2^6) to avoid clustering effects on + // this larger increment (if it were a fixed value in steady state + // operation). Here we are primarily using upper bits of hashed_key[1] + // while home is based on lowest bits. + uint64_t incr_ratio = 0x9E3779B185EBCA87U + (proto.hashed_key[1] >> 6); + size_t incr = FastRange64(incr_ratio, used_length); + assert(incr > 0); + size_t start = idx; + for (;; i++) { + idx += incr; + if (idx >= used_length) { + // Wrap around (faster than %) + idx -= used_length; + } + if (idx == start) { + // We have just completed a cycle that might not have covered all + // slots. (incr and used_length could have common factors.) + // Increment for the next cycle, which eventually ensures complete + // iteration over the set of slots before repeating. + idx++; + if (idx >= used_length) { + idx -= used_length; + } + start++; + if (start >= used_length) { + start -= used_length; + } + if (i >= used_length) { + used_length = LengthInfoToUsedLength(length_info_.Load()); + if (i >= used_length * 2) { + // Cycling back should not happen unless there is enough random + // churn in parallel that we happen to hit each slot at a time + // that it's occupied, which is really only feasible for small + // structures, though with linear probing to find empty slots, + // "small" here might be larger than for double hashing. + assert(used_length <= 256); + // Fall back on standalone insert in case something goes awry to + // cause this + return nullptr; + } + } + } + if (TryInsert(proto, arr[idx], initial_countdown, initial_take_ref, + &already_matches)) { + break; + } + if (already_matches) { + return nullptr; + } + } + } + } + + // Now insert into chain using head pointer + uint64_t next_with_shift; + int home_shift = orig_home_shift; + + // Might need to retry + for (int i = 0;; ++i) { + CHECK_TOO_MANY_ITERATIONS(i); + next_with_shift = arr[home].head_next_with_shift.Load(); + int shift = GetShiftFromNextWithShift(next_with_shift); + + if (UNLIKELY(shift != home_shift)) { + // NOTE: shift increases with table growth + if (shift > home_shift) { + // Must be grow in progress or completed since reading length_info. + // Pull out one more hash bit. (See Lookup() for why we can't + // safely jump to the shift that was read.) 
+ home_shift++; + uint64_t hash_bit_mask = uint64_t{1} << (home_shift - 1); + assert((home & hash_bit_mask) == 0); + // BEGIN leftover updates to length_info_ for Grow() + size_t grow_home = home + hash_bit_mask; + assert(arr[grow_home].head_next_with_shift.Load() != + HandleImpl::kUnusedMarker); + CatchUpLengthInfoNoWait(grow_home); + // END leftover updates to length_info_ for Grow() + home += proto.hashed_key[1] & hash_bit_mask; + continue; + } else { + // Should not happen because length_info_ is only updated after both + // old and new home heads are marked with new shift + assert(false); + } + } + + // Values to update to + uint64_t head_next_with_shift = MakeNextWithShift(idx, home_shift); + uint64_t chain_next_with_shift = next_with_shift; + + // Preserve the locked state in head, without propagating to chain next + // where it is meaningless (and not allowed) + if (UNLIKELY((next_with_shift & HandleImpl::kNextEndFlags) == + HandleImpl::kHeadLocked)) { + head_next_with_shift |= HandleImpl::kHeadLocked; + chain_next_with_shift &= ~HandleImpl::kHeadLocked; + } + + arr[idx].chain_next_with_shift.Store(chain_next_with_shift); + if (arr[home].head_next_with_shift.CasWeak(next_with_shift, + head_next_with_shift)) { + // Success + if (!take_ref) { + Unref(arr[idx]); + } + return arr + idx; + } + } +} + +AutoHyperClockTable::HandleImpl* AutoHyperClockTable::Lookup( + const UniqueId64x2& hashed_key) { + // Lookups are wait-free with low occurrence of retries, back-tracking, + // and fallback. We do not have the benefit of holding a rewrite lock on + // the chain so must be prepared for many kinds of mayhem, most notably + // "falling off our chain" where a slot that Lookup has identified but + // has not read-referenced is removed from one chain and inserted into + // another. The full algorithm uses the following mitigation strategies to + // ensure every relevant entry inserted before this Lookup, and not yet + // evicted, is seen by Lookup, without excessive backtracking etc.: + // * Keep a known good read ref in the chain for "island hopping." When + // we observe that a concurrent write takes us off to another chain, we + // only need to fall back to our last known good read ref (most recent + // entry on the chain that is not "under construction," which is a transient + // state). We don't want to compound the CPU toil of a long chain with + // operations that might need to retry from scratch, with probability + // in proportion to chain length. + // * Only detect a chain is potentially incomplete because of a Grow in + // progress by looking at shift in the next pointer tags (rather than + // re-checking length_info_). + // * SplitForGrow, Insert, and PurgeImplLocked ensure that there are no + // transient states that might cause this full Lookup algorithm to skip over + // live entries. + + // Reading length_info_ is not strictly required for Lookup, if we were + // to increment shift sizes until we see a shift size match on the + // relevant head pointer. Thus, reading with relaxed memory order gives + // us a safe and almost always up-to-date jump into finding the correct + // home and head. + size_t home; + int home_shift; + GetHomeIndexAndShift(length_info_.LoadRelaxed(), hashed_key[1], &home, + &home_shift); + assert(home_shift > 0); + + // The full Lookup algorithm however is not great for hot path efficiency, + // because of the extra careful tracking described above. Overwhelmingly, + // we can find what we're looking for with a naive linked list traversal + // of the chain. 
Even if we "fall off our chain" to another, we don't + // violate memory safety. We just won't match the key we're looking for. + // And we would eventually reach an end state, possibly even experiencing a + // cycle as an entry is freed and reused during our traversal (though at + // any point in time the structure doesn't have cycles). + // + // So for hot path efficiency, we start with a naive Lookup attempt, and + // then fall back on full Lookup if we don't find the correct entry. To + // cap how much we invest into the naive Lookup, we simply cap the traversal + // length before falling back. Also, when we do fall back on full Lookup, + // we aren't paying much penalty by starting over. Much or most of the cost + // of Lookup is memory latency in following the chain pointers, and the + // naive Lookup has warmed the CPU cache for these entries, using as tight + // of a loop as possible. + + HandleImpl* const arr = array_.Get(); + uint64_t next_with_shift = arr[home].head_next_with_shift.LoadRelaxed(); + for (size_t i = 0; !HandleImpl::IsEnd(next_with_shift) && i < 10; ++i) { + HandleImpl* h = &arr[GetNextFromNextWithShift(next_with_shift)]; + // Attempt cheap key match without acquiring a read ref. This could give a + // false positive, which is re-checked after acquiring read ref, or false + // negative, which is re-checked in the full Lookup. Also, this is a + // technical UB data race according to TSAN, but we don't need to read + // a "correct" value here for correct overall behavior. +#ifdef __SANITIZE_THREAD__ + bool probably_equal = Random::GetTLSInstance()->OneIn(2); +#else + bool probably_equal = h->hashed_key == hashed_key; +#endif + if (probably_equal) { + // Increment acquire counter for definitive check + uint64_t old_meta = h->meta.FetchAdd(ClockHandle::kAcquireIncrement); + // Check if it's a referencable (sharable) entry + if (LIKELY(old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift))) { + assert(GetRefcount(old_meta + ClockHandle::kAcquireIncrement) > 0); + if (LIKELY(h->hashed_key == hashed_key) && + LIKELY(old_meta & (uint64_t{ClockHandle::kStateVisibleBit} + << ClockHandle::kStateShift))) { + return h; + } else { + Unref(*h); + } + } else { + // For non-sharable states, incrementing the acquire counter has no + // effect so we don't need to undo it. Furthermore, we cannot safely + // undo it because we did not acquire a read reference to lock the entry + // in a Shareable state. + } + } + + next_with_shift = h->chain_next_with_shift.LoadRelaxed(); + } + + // If we get here, falling back on full Lookup algorithm. + HandleImpl* h = nullptr; + HandleImpl* read_ref_on_chain = nullptr; + + for (size_t i = 0;; ++i) { + CHECK_TOO_MANY_ITERATIONS(i); + // Read head or chain pointer + next_with_shift = h ? h->chain_next_with_shift.Load() + : arr[home].head_next_with_shift.Load(); + int shift = GetShiftFromNextWithShift(next_with_shift); + + // Make sure it's usable + size_t effective_home = home; + if (UNLIKELY(shift != home_shift)) { + // We have potentially gone awry somehow, but it's possible we're just + // hitting old data that is not yet completed Grow. + // NOTE: shift bits goes up with table growth. + if (shift < home_shift) { + // To avoid waiting on Grow in progress, an old shift amount needs + // to be processed as if we were still using it and (potentially + // different or the same) the old home. + // We can assert it's not too old, because each generation of Grow + // waits on its ancestor in the previous generation. 
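As a side note on the home/shift bookkeeping used throughout this Lookup, here is a minimal, illustrative sketch (not the actual GetHomeIndexAndShift helper) of how a home index and shift can be derived from a length_info-style word, assuming the encoding documented later in clock_cache.h: the low 8 bits hold the minimum shift, and the upper bits hold a threshold below which one extra hash bit is in use.

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for GetHomeIndexAndShift(); the 8-bit min-shift /
// upper-bits threshold encoding follows the length_info_ comment in
// clock_cache.h, but the signature and constants here are illustrative only.
inline void SketchGetHomeIndexAndShift(uint64_t length_info, uint64_t hash,
                                       size_t* home, int* shift) {
  int min_shift = static_cast<int>(length_info & 255U);
  uint64_t threshold = length_info >> 8;
  // Home is the bottom `min_shift` bits of the hash...
  uint64_t h = hash & ((uint64_t{1} << min_shift) - 1);
  // ...unless that value is below the threshold, meaning the table has
  // already grown past it, so one more hash bit is used for that region.
  if (h < threshold) {
    *shift = min_shift + 1;
    *home = static_cast<size_t>(hash & ((uint64_t{1} << *shift) - 1));
  } else {
    *shift = min_shift;
    *home = static_cast<size_t>(h);
  }
}
```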
+ assert(shift + 1 == home_shift); + effective_home = GetHomeIndex(home, shift); + } else if (h == read_ref_on_chain) { + assert(shift > home_shift); + // At head or coming from an entry on our chain where we're holding + // a read reference. Thus, we know the newer shift applies to us. + // Newer shift might not yet be reflected in length_info_ (an atomicity + // gap in Grow), so operate as if it is. Note that other insertions + // could happen using this shift before length_info_ is updated, and + // it's possible (though unlikely) that multiple generations of Grow + // have occurred. If shift is more than one generation ahead of + // home_shift, it's possible that not all descendent homes have + // reached the `shift` generation. Thus, we need to advance only one + // shift at a time looking for a home+head with a matching shift + // amount. + home_shift++; + // Update home in case it has changed + home = GetHomeIndex(hashed_key[1], home_shift); + // This should be rare enough occurrence that it's simplest just + // to restart (TODO: improve in some cases?) + h = nullptr; + if (read_ref_on_chain) { + Unref(*read_ref_on_chain); + read_ref_on_chain = nullptr; + } + // Didn't make progress & retry + continue; + } else { + assert(shift > home_shift); + assert(h != nullptr); + // An "under (de)construction" entry has a new shift amount, which + // means we have either gotten off our chain or our home shift is out + // of date. If we revert back to saved ref, we will get updated info. + h = read_ref_on_chain; + // Didn't make progress & retry + continue; + } + } + + // Check for end marker + if (HandleImpl::IsEnd(next_with_shift)) { + // To ensure we didn't miss anything in the chain, the end marker must + // point back to the correct home. + if (LIKELY(GetNextFromNextWithShift(next_with_shift) == effective_home)) { + // Complete, clean iteration of the chain, not found. + // Clean up. + if (read_ref_on_chain) { + Unref(*read_ref_on_chain); + } + return nullptr; + } else { + // Something went awry. Revert back to a safe point (if we have it) + h = read_ref_on_chain; + // Didn't make progress & retry + continue; + } + } + + // Follow the next and check for full key match, home match, or neither + h = &arr[GetNextFromNextWithShift(next_with_shift)]; + bool full_match_or_unknown = false; + if (MatchAndRef(&hashed_key, *h, shift, effective_home, + &full_match_or_unknown)) { + // Got a read ref on next (h). + // + // There is a very small chance that between getting the next pointer + // (now h) and doing MatchAndRef on it, another thread erased/evicted it + // reinserted it into the same chain, causing us to cycle back in the + // same chain and potentially see some entries again if we keep walking. + // Newly-inserted entries are inserted before older ones, so we are at + // least guaranteed not to miss anything. Here in Lookup, it's just a + // transient, slight hiccup in performance. + + if (full_match_or_unknown) { + // Full match. + // Release old read ref on chain if applicable + if (read_ref_on_chain) { + // Pretend we never took the reference. + Unref(*read_ref_on_chain); + } + // Update the hit bit + if (eviction_callback_) { + h->meta.FetchOrRelaxed(uint64_t{1} << ClockHandle::kHitBitShift); + } + // All done. + return h; + } else if (UNLIKELY(shift != home_shift) && + home != BottomNBits(h->hashed_key[1], home_shift)) { + // This chain is in a Grow operation and we've landed on an entry + // that belongs to the wrong destination chain. 
We can keep going, but + // there's a chance we'll need to backtrack back *before* this entry, + // if the Grow finishes before this Lookup. We cannot save this entry + // for backtracking because it might soon or already be on the wrong + // chain. + // NOTE: if we simply backtrack rather than continuing, we would + // be in a wait loop (not allowed in Lookup!) until the other thread + // finishes its Grow. + Unref(*h); + } else { + // Correct home location, so we are on the right chain. + // With new usable read ref, can release old one (if applicable). + if (read_ref_on_chain) { + // Pretend we never took the reference. + Unref(*read_ref_on_chain); + } + // And keep the new one. + read_ref_on_chain = h; + } + } else { + if (full_match_or_unknown) { + // Must have been an "under construction" entry. Can safely skip it, + // but there's a chance we'll have to backtrack later + } else { + // Home mismatch! Revert back to a safe point (if we have it) + h = read_ref_on_chain; + // Didn't make progress & retry + } + } + } +} + +void AutoHyperClockTable::Remove(HandleImpl* h) { + assert((h->meta.Load() >> ClockHandle::kStateShift) == + ClockHandle::kStateConstruction); + + const HandleImpl& c_h = *h; + PurgeImpl(&c_h.hashed_key); +} + +bool AutoHyperClockTable::TryEraseHandle(HandleImpl* h, bool holding_ref, + bool mark_invisible) { + uint64_t meta; + if (mark_invisible) { + // Set invisible + meta = h->meta.FetchAnd( + ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift)); + // To local variable also + meta &= + ~(uint64_t{ClockHandle::kStateVisibleBit} << ClockHandle::kStateShift); + } else { + meta = h->meta.Load(); + } + + // Take ownership if no other refs + do { + if (GetRefcount(meta) != uint64_t{holding_ref}) { + // Not last ref at some point in time during this call + return false; + } + if ((meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift)) == 0) { + // Someone else took ownership + return false; + } + // Note that if !holding_ref, there's a small chance that we release, + // another thread replaces this entry with another, reaches zero refs, and + // then we end up erasing that other entry. That's an acceptable risk / + // imprecision. + } while (!h->meta.CasWeak(meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)); + // Took ownership + // TODO? Delay freeing? + h->FreeData(allocator_); + size_t total_charge = h->total_charge; + if (UNLIKELY(h->IsStandalone())) { + // Delete detached handle + delete h; + standalone_usage_.FetchSubRelaxed(total_charge); + } else { + Remove(h); + MarkEmpty(*h); + occupancy_.FetchSub(1U); + } + usage_.FetchSubRelaxed(total_charge); + assert(usage_.LoadRelaxed() < SIZE_MAX / 2); + return true; +} + +bool AutoHyperClockTable::Release(HandleImpl* h, bool useful, + bool erase_if_last_ref) { + // In contrast with LRUCache's Release, this function won't delete the handle + // when the cache is above capacity and the reference is the last one. Space + // is only freed up by Evict/PurgeImpl (called by Insert when space + // is needed) and Erase. We do this to avoid an extra atomic read of the + // variable usage_. 
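The ownership transfer in TryEraseHandle above is a read-check-CAS loop on the packed meta word: verify the reference count and shareable state, then compare-and-swap the whole word to the exclusive Construction state, retrying if the word changed underneath. A stripped-down sketch of that pattern, with the refcount simplified to a single field and made-up bit positions standing in for the real ClockHandle constants:

```cpp
#include <atomic>
#include <cstdint>

// Illustrative constants only; the real layout is defined by ClockHandle.
constexpr uint64_t kSketchStateShift = 62;
constexpr uint64_t kSketchShareableBit = uint64_t{1} << 63;
constexpr uint64_t kSketchConstructionState = uint64_t{1} << kSketchStateShift;
constexpr uint64_t kSketchRefcountMask = (uint64_t{1} << 32) - 1;

// Returns true iff this thread took exclusive ownership of the slot.
bool SketchTryTakeOwnership(std::atomic<uint64_t>& meta,
                            uint64_t expected_refs) {
  uint64_t cur = meta.load(std::memory_order_acquire);
  do {
    if ((cur & kSketchRefcountMask) != expected_refs) {
      return false;  // someone else holds a reference
    }
    if ((cur & kSketchShareableBit) == 0) {
      return false;  // someone else already took ownership (or freed it)
    }
    // Weak CAS in a loop, analogous to meta.CasWeak() above.
  } while (!meta.compare_exchange_weak(cur, kSketchConstructionState,
                                       std::memory_order_acq_rel));
  return true;
}
```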
+ + uint64_t old_meta; + if (useful) { + // Increment release counter to indicate was used + old_meta = h->meta.FetchAdd(ClockHandle::kReleaseIncrement); + // Correct for possible (but rare) overflow + CorrectNearOverflow(old_meta, h->meta); + } else { + // Decrement acquire counter to pretend it never happened + old_meta = h->meta.FetchSub(ClockHandle::kAcquireIncrement); + } + + assert((old_meta >> ClockHandle::kStateShift) & + ClockHandle::kStateShareableBit); + // No underflow + assert(((old_meta >> ClockHandle::kAcquireCounterShift) & + ClockHandle::kCounterMask) != + ((old_meta >> ClockHandle::kReleaseCounterShift) & + ClockHandle::kCounterMask)); + + if ((erase_if_last_ref || UNLIKELY(old_meta >> ClockHandle::kStateShift == + ClockHandle::kStateInvisible))) { + // FIXME: There's a chance here that another thread could replace this + // entry and we end up erasing the wrong one. + return TryEraseHandle(h, /*holding_ref=*/false, /*mark_invisible=*/false); + } else { + return false; + } +} + +#ifndef NDEBUG +void AutoHyperClockTable::TEST_ReleaseN(HandleImpl* h, size_t n) { + if (n > 0) { + // Do n-1 simple releases first + TEST_ReleaseNMinus1(h, n); + + // Then the last release might be more involved + Release(h, /*useful*/ true, /*erase_if_last_ref*/ false); + } +} +#endif + +void AutoHyperClockTable::Erase(const UniqueId64x2& hashed_key) { + // Don't need to be efficient. + // Might be one match masking another, so loop. + while (HandleImpl* h = Lookup(hashed_key)) { + bool gone = + TryEraseHandle(h, /*holding_ref=*/true, /*mark_invisible=*/true); + if (!gone) { + // Only marked invisible, which is ok. + // Pretend we never took the reference from Lookup. + Unref(*h); + } + } +} + +void AutoHyperClockTable::EraseUnRefEntries() { + size_t usable_size = GetTableSize(); + for (size_t i = 0; i < usable_size; i++) { + HandleImpl& h = array_[i]; + + uint64_t old_meta = h.meta.LoadRelaxed(); + if (old_meta & (uint64_t{ClockHandle::kStateShareableBit} + << ClockHandle::kStateShift) && + GetRefcount(old_meta) == 0 && + h.meta.CasStrong(old_meta, uint64_t{ClockHandle::kStateConstruction} + << ClockHandle::kStateShift)) { + // Took ownership + h.FreeData(allocator_); + usage_.FetchSubRelaxed(h.total_charge); + // NOTE: could be more efficient with a dedicated variant of + // PurgeImpl, but this is not a common operation + Remove(&h); + MarkEmpty(h); + occupancy_.FetchSub(1U); + } + } +} + +void AutoHyperClockTable::Evict(size_t requested_charge, InsertState& state, + EvictionData* data) { + // precondition + assert(requested_charge > 0); + + // We need the clock pointer to seemlessly "wrap around" at the end of the + // table, and to be reasonably stable under Grow operations. This is + // challenging when the linear hashing progressively opens additional + // most-significant-hash-bits in determining home locations. + + // TODO: make a tuning parameter? + // Up to 2x this number of homes will be evicted per step. In very rare + // cases, possibly more, as homes of an out-of-date generation will be + // resolved to multiple in a newer generation. + constexpr size_t step_size = 4; + + // A clock_pointer_mask_ field separate from length_info_ enables us to use + // the same mask (way of dividing up the space among evicting threads) for + // iterating over the whole structure before considering changing the mask + // at the beginning of each pass. This ensures we do not have a large portion + // of the space that receives redundant or missed clock updates. 
However,
+  // with two variables, for each update to clock_pointer_mask (< 64 ever in
+  // the life of the cache), there will be a brief period where concurrent
+  // eviction threads could use the old mask value, possibly causing redundant
+  // or missed clock updates for a *small* portion of the table.
+  size_t clock_pointer_mask = clock_pointer_mask_.LoadRelaxed();
+
+  uint64_t max_clock_pointer = 0;  // unset
+
+  // TODO: consider updating during a long eviction
+  size_t used_length = LengthInfoToUsedLength(state.saved_length_info);
+
+  autovector<HandleImpl*> to_finish_eviction;
+
+  // Loop until enough freed, or limit reached (see bottom of loop)
+  for (;;) {
+    // First (concurrent) increment clock pointer
+    uint64_t old_clock_pointer = clock_pointer_.FetchAddRelaxed(step_size);
+
+    if (UNLIKELY((old_clock_pointer & clock_pointer_mask) == 0)) {
+      // Back at the beginning. See if clock_pointer_mask should be updated.
+      uint64_t mask = BottomNBits(
+          UINT64_MAX, LengthInfoToMinShift(state.saved_length_info));
+      if (clock_pointer_mask != mask) {
+        clock_pointer_mask = static_cast<size_t>(mask);
+        clock_pointer_mask_.StoreRelaxed(clock_pointer_mask);
+      }
+    }
+
+    size_t major_step = clock_pointer_mask + 1;
+    assert((major_step & clock_pointer_mask) == 0);
+
+    for (size_t base_home = old_clock_pointer & clock_pointer_mask;
+         base_home < used_length; base_home += major_step) {
+      for (size_t i = 0; i < step_size; i++) {
+        size_t home = base_home + i;
+        if (home >= used_length) {
+          break;
+        }
+        PurgeImpl(&to_finish_eviction, home);
+      }
+    }
+
+    for (HandleImpl* h : to_finish_eviction) {
+      TrackAndReleaseEvictedEntry(h, data);
+      // NOTE: setting likely_empty_slot here can cause us to reduce the
+      // portion of "at home" entries, probably because an evicted entry
+      // is more likely to come back than a random new entry and would be
+      // unable to go into its home slot.
+    }
+    to_finish_eviction.clear();
+
+    // Loop exit conditions
+    if (data->freed_charge >= requested_charge) {
+      return;
+    }
+
+    if (max_clock_pointer == 0) {
+      // Cap the eviction effort at this thread (along with those operating in
+      // parallel) circling through the whole structure kMaxCountdown times.
+      // In other words, this eviction run must find something/anything that is
+      // unreferenced at start of and during the eviction run that isn't
+      // reclaimed by a concurrent eviction run.
+      // TODO: Does HyperClockCache need kMaxCountdown + 1?
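The work partitioning in the loop above comes entirely from the shared clock pointer: each evicting thread FetchAdd-s it by step_size, owns that small window of homes, and then strides by major_step so concurrent threads interleave across the table instead of colliding. A hedged, cache-independent sketch of that partitioning (the Visit callback stands in for PurgeImpl):

```cpp
#include <atomic>
#include <cstddef>
#include <cstdint>

// Each caller claims `step_size` consecutive slots per FetchAdd and then
// visits them at `major_step` strides across [0, used_length). The names and
// callback are illustrative; the real code calls PurgeImpl() per home.
template <typename Visit>
void SketchSweepPartition(std::atomic<uint64_t>& clock_pointer,
                          size_t clock_pointer_mask, size_t used_length,
                          size_t step_size, const Visit& visit) {
  uint64_t old_ptr =
      clock_pointer.fetch_add(step_size, std::memory_order_relaxed);
  size_t major_step = clock_pointer_mask + 1;  // a power of two
  for (size_t base = old_ptr & clock_pointer_mask; base < used_length;
       base += major_step) {
    for (size_t i = 0; i < step_size && base + i < used_length; ++i) {
      visit(base + i);
    }
  }
}
```

Two threads calling this concurrently claim disjoint windows (old_ptr values 0 and step_size, say) and therefore touch disjoint homes within a single pass over the table.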
+ max_clock_pointer = + old_clock_pointer + + (uint64_t{ClockHandle::kMaxCountdown + 1} * major_step); + } + + if (old_clock_pointer + step_size >= max_clock_pointer) { + return; + } + } +} + +size_t AutoHyperClockTable::CalcMaxUsableLength( + size_t capacity, size_t min_avg_value_size, + CacheMetadataChargePolicy metadata_charge_policy) { + double min_avg_slot_charge = min_avg_value_size * kMaxLoadFactor; + if (metadata_charge_policy == kFullChargeCacheMetadata) { + min_avg_slot_charge += sizeof(HandleImpl); + } + assert(min_avg_slot_charge > 0.0); + size_t num_slots = + static_cast(capacity / min_avg_slot_charge + 0.999999); + + const size_t slots_per_page = port::kPageSize / sizeof(HandleImpl); + + // Round up to page size + return ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page; +} + +namespace { +bool IsHeadNonempty(const AutoHyperClockTable::HandleImpl& h) { + return !AutoHyperClockTable::HandleImpl::IsEnd( + h.head_next_with_shift.LoadRelaxed()); +} +bool IsEntryAtHome(const AutoHyperClockTable::HandleImpl& h, int shift, + size_t home) { + if (MatchAndRef(nullptr, h, shift, home)) { + Unref(h); + return true; + } else { + return false; + } +} +} // namespace + +void AutoHyperClockCache::ReportProblems( + const std::shared_ptr& info_log) const { + BaseHyperClockCache::ReportProblems(info_log); + + if (info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { + LoadVarianceStats head_stats; + size_t entry_at_home_count = 0; + uint64_t yield_count = 0; + this->ForEachShard([&](const Shard* shard) { + size_t count = shard->GetTableAddressCount(); + uint64_t length_info = UsedLengthToLengthInfo(count); + for (size_t i = 0; i < count; ++i) { + const auto& h = *shard->GetTable().HandlePtr(i); + head_stats.Add(IsHeadNonempty(h)); + int shift; + size_t home; + GetHomeIndexAndShift(length_info, i, &home, &shift); + assert(home == i); + entry_at_home_count += IsEntryAtHome(h, shift, home); + } + yield_count += shard->GetTable().GetYieldCount(); + }); + ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, + "Head occupancy stats: %s", head_stats.Report().c_str()); + ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, + "Entries at home count: %zu", entry_at_home_count); + ROCKS_LOG_AT_LEVEL(info_log, InfoLogLevel::DEBUG_LEVEL, + "Yield count: %" PRIu64, yield_count); + } +} + +} // namespace clock_cache + +// DEPRECATED (see public API) +std::shared_ptr NewClockCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy) { + return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, + /* high_pri_pool_ratio */ 0.5, nullptr, + kDefaultToAdaptiveMutex, metadata_charge_policy, + /* low_pri_pool_ratio */ 0.0); +} + +std::shared_ptr HyperClockCacheOptions::MakeSharedCache() const { + // For sanitized options + HyperClockCacheOptions opts = *this; + if (opts.num_shard_bits >= 20) { + return nullptr; // The cache cannot be sharded into too many fine pieces. + } + if (opts.num_shard_bits < 0) { + // Use larger shard size to reduce risk of large entries clustering + // or skewing individual shards. 
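To make the slot-count arithmetic of CalcMaxUsableLength above concrete, here is a worked example with assumed inputs (1 GiB capacity, 4 KiB minimum average value size, a 64-byte HandleImpl, 4 KiB pages; none of these specific values come from the code):

```cpp
#include <cstddef>
#include <iostream>

int main() {
  // Illustrative inputs only; sizeof(HandleImpl) and port::kPageSize are
  // assumed to be 64 bytes and 4 KiB for the sake of the arithmetic.
  double capacity = 1024.0 * 1024 * 1024;  // 1 GiB
  double min_avg_value_size = 4096.0;      // 4 KiB
  double kMaxLoadFactor = 0.60;
  double handle_size = 64.0;               // assumed sizeof(HandleImpl)
  size_t page_size = 4096;                 // assumed page size

  // kFullChargeCacheMetadata adds the handle size to the per-slot charge.
  double min_avg_slot_charge =
      min_avg_value_size * kMaxLoadFactor + handle_size;  // 2521.6
  size_t num_slots =
      static_cast<size_t>(capacity / min_avg_slot_charge + 0.999999);
  size_t slots_per_page = page_size / static_cast<size_t>(handle_size);  // 64
  size_t usable_length =
      ((num_slots + slots_per_page - 1) / slots_per_page) * slots_per_page;
  // Roughly 425.8K slots, rounded up to the next multiple of 64.
  std::cout << num_slots << " -> " << usable_length << "\n";
}
```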
+ constexpr size_t min_shard_size = 32U * 1024U * 1024U; + opts.num_shard_bits = + GetDefaultCacheShardBits(opts.capacity, min_shard_size); + } + std::shared_ptr cache; + if (opts.estimated_entry_charge == 0) { + cache = std::make_shared(opts); + } else { + cache = std::make_shared(opts); + } + if (opts.secondary_cache) { + cache = std::make_shared(cache, + opts.secondary_cache); } - return std::make_shared( - capacity, estimated_entry_charge, my_num_shard_bits, - strict_capacity_limit, metadata_charge_policy, memory_allocator); + return cache; } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/clock_cache.h b/cache/clock_cache.h index 01185849b6d3..3086e7e972f3 100644 --- a/cache/clock_cache.h +++ b/cache/clock_cache.h @@ -20,16 +20,20 @@ #include "cache/sharded_cache.h" #include "port/lang.h" #include "port/malloc.h" +#include "port/mmap.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/secondary_cache.h" +#include "util/atomic.h" #include "util/autovector.h" +#include "util/math.h" namespace ROCKSDB_NAMESPACE { namespace clock_cache { // Forward declaration of friend class. +template class ClockCacheTest; // HyperClockCache is an alternative to LRUCache specifically tailored for @@ -37,24 +41,31 @@ class ClockCacheTest; // // Benefits // -------- -// * Fully lock free (no waits or spins) for efficiency under high concurrency +// * Lock/wait free (no waits or spins) for efficiency under high concurrency +// * Fixed version (estimated_entry_charge > 0) is fully lock/wait free +// * Automatic version (estimated_entry_charge = 0) has rare waits among +// certain insertion or erase operations that involve the same very small +// set of entries. // * Optimized for hot path reads. For concurrency control, most Lookup() and // essentially all Release() are a single atomic add operation. -// * Eviction on insertion is fully parallel and lock-free. +// * Eviction on insertion is fully parallel. // * Uses a generalized + aging variant of CLOCK eviction that might outperform // LRU in some cases. (For background, see // https://en.wikipedia.org/wiki/Page_replacement_algorithm) // // Costs // ----- -// * Hash table is not resizable (for lock-free efficiency) so capacity is not -// dynamically changeable. Rely on an estimated average value (block) size for +// * FixedHyperClockCache (estimated_entry_charge > 0) - Hash table is not +// resizable (for lock-free efficiency) so capacity is not dynamically +// changeable. Rely on an estimated average value (block) size for // space+time efficiency. (See estimated_entry_charge option details.) +// EXPERIMENTAL - This limitation is fixed in AutoHyperClockCache, activated +// with estimated_entry_charge == 0. // * Insert usually does not (but might) overwrite a previous entry associated -// with a cache key. This is OK for RocksDB uses of Cache. +// with a cache key. This is OK for RocksDB uses of Cache, though it does mess +// up our REDUNDANT block cache insertion statistics. // * Only supports keys of exactly 16 bytes, which is what RocksDB uses for -// block cache (not row cache or table cache). -// * SecondaryCache is not supported. +// block cache (but not row cache or table cache). // * Cache priorities are less aggressively enforced. Unlike LRUCache, enough // transient LOW or BOTTOM priority items can evict HIGH priority entries that // are not referenced recently (or often) enough. @@ -137,7 +148,8 @@ class ClockCacheTest; // * Empty - slot is not in use and unowned. All other metadata and data is // in an undefined state. 
// * Construction - slot is exclusively owned by one thread, the thread -// successfully entering this state, for populating or freeing data. +// successfully entering this state, for populating or freeing data +// (de-construction, same state marker). // * Shareable (group) - slot holds an entry with counted references for // pinning and reading, including // * Visible - slot holds an entry that can be returned by Lookup @@ -145,7 +157,7 @@ class ClockCacheTest; // (erased by user) but can be read by existing references, and ref count // changed by Ref and Release. // -// A special case is "detached" entries, which are heap-allocated handles +// A special case is "standalone" entries, which are heap-allocated handles // not in the table. They are always Invisible and freed on zero refs. // // State transitions: @@ -185,23 +197,27 @@ class ClockCacheTest; // know from our "redundant" stats that overwrites are very rare for the block // cache, so we should not spend much to make them effective. // -// So instead we Insert as soon as we find an empty slot in the probing -// sequence without seeing an existing (visible) entry for the same key. This -// way we only insert if we can improve the probing performance, and we don't -// need to probe beyond our insert position, assuming we are willing to let -// the previous entry for the same key die of old age (eventual eviction from -// not being used). We can reach a similar state with concurrent insertions, -// where one will pass over the other while it is "under construction." -// This temporary duplication is acceptable for RocksDB block cache because -// we know redundant insertion is rare. +// FixedHyperClockCache: Instead we Insert as soon as we find an empty slot in +// the probing sequence without seeing an existing (visible) entry for the same +// key. This way we only insert if we can improve the probing performance, and +// we don't need to probe beyond our insert position, assuming we are willing +// to let the previous entry for the same key die of old age (eventual eviction +// from not being used). We can reach a similar state with concurrent +// insertions, where one will pass over the other while it is "under +// construction." This temporary duplication is acceptable for RocksDB block +// cache because we know redundant insertion is rare. +// AutoHyperClockCache: Similar, except we only notice and return an existing +// match if it is found in the search for a suitable empty slot (starting with +// the same slot as the head pointer), not by following the existing chain of +// entries. Insertions are always made to the head of the chain. // // Another problem to solve is what to return to the caller when we find an // existing entry whose probing position we cannot improve on, or when the // table occupancy limit has been reached. If strict_capacity_limit=false, // we must never fail Insert, and if a Handle* is provided, we have to return // a usable Cache handle on success. The solution to this (typically rare) -// problem is "detached" handles, which are usable by the caller but not -// actually available for Lookup in the Cache. Detached handles are allocated +// problem is "standalone" handles, which are usable by the caller but not +// actually available for Lookup in the Cache. Standalone handles are allocated // independently on the heap and specially marked so that they are freed on // the heap when their last reference is released. 
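The "insertions are always made to the head of the chain" rule above corresponds to a standard lock-free push-front: publish the new entry's next pointer, then CAS the head from the previously observed value to the new entry, retrying on contention. A generic sketch using array indices in place of raw pointers, as the table does (without the shift/lock decoration of the real with_shift fields):

```cpp
#include <atomic>
#include <cstdint>

// Simplified node: no shift or lock bits packed into the pointer, unlike the
// real head_next_with_shift / chain_next_with_shift fields.
struct SketchNode {
  std::atomic<uint64_t> next{0};
};

// Pushes arr[idx] onto the chain anchored at `head`. Indices stand in for
// pointers so that nodes can live in a pre-allocated array.
void SketchPushFront(std::atomic<uint64_t>& head, SketchNode* arr,
                     uint64_t idx) {
  uint64_t old_head = head.load(std::memory_order_acquire);
  do {
    // Publish our next pointer before the head can point at us.
    arr[idx].next.store(old_head, std::memory_order_release);
  } while (!head.compare_exchange_weak(old_head, idx,
                                       std::memory_order_acq_rel));
}
```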
// @@ -281,29 +297,6 @@ class ClockCacheTest; // ----------------------------------------------------------------------- // -// The load factor p is a real number in (0, 1) such that at all -// times at most a fraction p of all slots, without counting tombstones, -// are occupied by elements. This means that the probability that a random -// probe hits an occupied slot is at most p, and thus at most 1/p probes -// are required on average. For example, p = 70% implies that between 1 and 2 -// probes are needed on average (bear in mind that this reasoning doesn't -// consider the effects of clustering over time, which should be negligible -// with double hashing). -// Because the size of the hash table is always rounded up to the next -// power of 2, p is really an upper bound on the actual load factor---the -// actual load factor is anywhere between p/2 and p. This is a bit wasteful, -// but bear in mind that slots only hold metadata, not actual values. -// Since space cost is dominated by the values (the LSM blocks), -// overprovisioning the table with metadata only increases the total cache space -// usage by a tiny fraction. -constexpr double kLoadFactor = 0.7; - -// The user can exceed kLoadFactor if the sizes of the inserted values don't -// match estimated_value_size, or in some rare cases with -// strict_capacity_limit == false. To avoid degenerate performance, we set a -// strict upper bound on the load factor. -constexpr double kStrictLoadFactor = 0.84; - struct ClockHandleBasicData { Cache::ObjectPtr value = nullptr; const Cache::CacheItemHelper* helper = nullptr; @@ -312,12 +305,6 @@ struct ClockHandleBasicData { UniqueId64x2 hashed_key = kNullUniqueId64x2; size_t total_charge = 0; - // For total_charge_and_flags - // "Detached" means the handle is allocated separately from hash table. - static constexpr uint64_t kFlagDetached = uint64_t{1} << 63; - // Extract just the total charge - static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1; - inline size_t GetTotalCharge() const { return total_charge; } // Calls deleter (if non-null) on cache key and value @@ -332,7 +319,7 @@ struct ClockHandle : public ClockHandleBasicData { // state of the handle. The meta word looks like this: // low bits high bits // ----------------------------------------------------------------------- - // | acquire counter | release counter | state marker | + // | acquire counter | release counter | hit bit | state marker | // ----------------------------------------------------------------------- // For reading or updating counters in meta word. @@ -346,8 +333,12 @@ struct ClockHandle : public ClockHandleBasicData { static constexpr uint64_t kReleaseIncrement = uint64_t{1} << kReleaseCounterShift; + // For setting the hit bit + static constexpr uint8_t kHitBitShift = 2U * kCounterNumBits; + static constexpr uint64_t kHitBitMask = uint64_t{1} << kHitBitShift; + // For reading or updating the state marker in meta word - static constexpr uint8_t kStateShift = 2U * kCounterNumBits; + static constexpr uint8_t kStateShift = kHitBitShift + 1; // Bits contribution to state marker. // Occupied means any state other than empty @@ -377,109 +368,254 @@ struct ClockHandle : public ClockHandleBasicData { static constexpr uint8_t kMaxCountdown = kHighCountdown; // TODO: make these coundown values tuning parameters for eviction? - // See above - std::atomic meta{}; - - // Anticipating use for SecondaryCache support - void* reserved_for_future_use = nullptr; + // See above. Mutable for read reference counting. 
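For orientation, a small sketch of unpacking the meta word layout shown above (acquire counter, release counter, hit bit, state marker). The 30-bit counter width is an assumption for illustration; only the relative layout follows the comments, and the acquires-minus-releases reading mirrors how GetRefcount() is used in clock_cache.cc:

```cpp
#include <cstdint>

// Assumed counter width; the real value is ClockHandle::kCounterNumBits.
constexpr int kSketchCounterNumBits = 30;
constexpr uint64_t kSketchCounterMask =
    (uint64_t{1} << kSketchCounterNumBits) - 1;
constexpr int kSketchAcquireShift = 0;
constexpr int kSketchReleaseShift = kSketchCounterNumBits;
constexpr int kSketchHitBitShift = 2 * kSketchCounterNumBits;
constexpr int kSketchStateShift = kSketchHitBitShift + 1;

// Outstanding references = acquires minus releases, modulo the counter width.
inline uint64_t SketchGetRefcount(uint64_t meta) {
  uint64_t acquires = (meta >> kSketchAcquireShift) & kSketchCounterMask;
  uint64_t releases = (meta >> kSketchReleaseShift) & kSketchCounterMask;
  return (acquires - releases) & kSketchCounterMask;
}

inline bool SketchHitBitSet(uint64_t meta) {
  return (meta >> kSketchHitBitShift) & 1;
}

inline uint64_t SketchState(uint64_t meta) { return meta >> kSketchStateShift; }
```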
+ mutable AcqRelAtomic meta{}; }; // struct ClockHandle -class HyperClockTable { +class BaseClockTable { + public: + BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, + const uint32_t* hash_seed) + : metadata_charge_policy_(metadata_charge_policy), + allocator_(allocator), + eviction_callback_(*eviction_callback), + hash_seed_(*hash_seed) {} + + template + typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto, + size_t capacity, + bool strict_capacity_limit, + bool allow_uncharged); + + template + Status Insert(const ClockHandleBasicData& proto, + typename Table::HandleImpl** handle, Cache::Priority priority, + size_t capacity, bool strict_capacity_limit); + + void Ref(ClockHandle& handle); + + size_t GetOccupancy() const { return occupancy_.LoadRelaxed(); } + + size_t GetUsage() const { return usage_.LoadRelaxed(); } + + size_t GetStandaloneUsage() const { return standalone_usage_.LoadRelaxed(); } + + uint32_t GetHashSeed() const { return hash_seed_; } + + uint64_t GetYieldCount() const { return yield_count_.LoadRelaxed(); } + + struct EvictionData { + size_t freed_charge = 0; + size_t freed_count = 0; + }; + + void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data); + +#ifndef NDEBUG + // Acquire N references + void TEST_RefN(ClockHandle& handle, size_t n); + // Helper for TEST_ReleaseN + void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n); +#endif + + private: // fns + // Creates a "standalone" handle for returning from an Insert operation that + // cannot be completed by actually inserting into the table. + // Updates `standalone_usage_` but not `usage_` nor `occupancy_`. + template + HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto); + + // Helper for updating `usage_` for new entry with given `total_charge` + // and evicting if needed under strict_capacity_limit=true rules. This + // means the operation might fail with Status::MemoryLimit. If + // `need_evict_for_occupancy`, then eviction of at least one entry is + // required, and the operation should fail if not possible. + // NOTE: Otherwise, occupancy_ is not managed in this function + template + Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity, + bool need_evict_for_occupancy, + typename Table::InsertState& state); + + // Helper for updating `usage_` for new entry with given `total_charge` + // and evicting if needed under strict_capacity_limit=false rules. This + // means that updating `usage_` always succeeds even if forced to exceed + // capacity. If `need_evict_for_occupancy`, then eviction of at least one + // entry is required, and the operation should return false if such eviction + // is not possible. `usage_` is not updated in that case. Otherwise, returns + // true, indicating success. + // NOTE: occupancy_ is not managed in this function + template + bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity, + bool need_evict_for_occupancy, + typename Table::InsertState& state); + + protected: // data + // We partition the following members into different cache lines + // to avoid false sharing among Lookup, Release, Erase and Insert + // operations in ClockCacheShard. + + // Clock algorithm sweep pointer. + // (Relaxed: only needs to be consistent with itself.) + RelaxedAtomic clock_pointer_{}; + + // Counter for number of times we yield to wait on another thread. + // (Relaxed: a simple stat counter.) 
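A rough sketch of the strict-capacity flow that ChargeUsageMaybeEvictStrict describes above: reserve the new charge, evict until the total fits (or until at least one entry is evicted when required for occupancy), and roll the reservation back with a MemoryLimit-style failure otherwise. This is illustrative pseudologic under assumed semantics, not the actual helper:

```cpp
#include <atomic>
#include <cstddef>

enum class SketchStatus { kOk, kMemoryLimit };

// `evict` is any callable that tries to free at least `n` bytes of charge and
// returns how much it actually freed.
template <typename EvictFn>
SketchStatus SketchChargeStrict(std::atomic<size_t>& usage, size_t capacity,
                                size_t total_charge,
                                bool need_evict_for_occupancy,
                                const EvictFn& evict) {
  size_t new_usage =
      usage.fetch_add(total_charge, std::memory_order_acq_rel) + total_charge;
  size_t over = new_usage > capacity ? new_usage - capacity : size_t{0};
  size_t want = over > 0 ? over : (need_evict_for_occupancy ? size_t{1} : 0);
  size_t freed = want > 0 ? evict(want) : size_t{0};
  // Evicted entries' charges leave the usage total regardless of the outcome.
  usage.fetch_sub(freed, std::memory_order_acq_rel);
  if (freed < over || (need_evict_for_occupancy && freed == 0)) {
    // Could not make room (or could not evict for occupancy): undo our own
    // charge and report a MemoryLimit-style failure.
    usage.fetch_sub(total_charge, std::memory_order_acq_rel);
    return SketchStatus::kMemoryLimit;
  }
  return SketchStatus::kOk;
}
```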
+ RelaxedAtomic yield_count_{}; + + // TODO: is this separation needed if we don't do background evictions? + ALIGN_AS(CACHE_LINE_SIZE) + // Number of elements in the table. + AcqRelAtomic occupancy_{}; + + // Memory usage by entries tracked by the cache (including standalone) + AcqRelAtomic usage_{}; + + // Part of usage by standalone entries (not in table) + AcqRelAtomic standalone_usage_{}; + + ALIGN_AS(CACHE_LINE_SIZE) + const CacheMetadataChargePolicy metadata_charge_policy_; + + // From Cache, for deleter + MemoryAllocator* const allocator_; + + // A reference to Cache::eviction_callback_ + const Cache::EvictionCallback& eviction_callback_; + + // A reference to ShardedCacheBase::hash_seed_ + const uint32_t& hash_seed_; +}; + +// Hash table for cache entries with size determined at creation time. +// Uses open addressing and double hashing. Since entries cannot be moved, +// the "displacements" count ensures probing sequences find entries even when +// entries earlier in the probing sequence have been removed. +class FixedHyperClockTable : public BaseClockTable { public: // Target size to be exactly a common cache line size (see static_assert in // clock_cache.cc) struct ALIGN_AS(64U) HandleImpl : public ClockHandle { // The number of elements that hash to this slot or a lower one, but wind // up in this slot or a higher one. - std::atomic displacements{}; + // (Relaxed: within a Cache op, does not need consistency with entries + // inserted/removed during that op. For example, a Lookup() that + // happens-after an Insert() will see an appropriate displacements value + // for the entry to be in a published state.) + RelaxedAtomic displacements{}; // Whether this is a "deteched" handle that is independently allocated // with `new` (so must be deleted with `delete`). // TODO: ideally this would be packed into some other data field, such // as upper bits of total_charge, but that incurs a measurable performance // regression. - bool detached = false; + bool standalone = false; - inline bool IsDetached() const { return detached; } + inline bool IsStandalone() const { return standalone; } - inline void SetDetached() { detached = true; } + inline void SetStandalone() { standalone = true; } }; // struct HandleImpl struct Opts { + explicit Opts(size_t _estimated_value_size) + : estimated_value_size(_estimated_value_size) {} + explicit Opts(const HyperClockCacheOptions& opts) { + assert(opts.estimated_entry_charge > 0); + estimated_value_size = opts.estimated_entry_charge; + } size_t estimated_value_size; }; - HyperClockTable(size_t capacity, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy, - MemoryAllocator* allocator, const Opts& opts); - ~HyperClockTable(); + FixedHyperClockTable(size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, + const uint32_t* hash_seed, const Opts& opts); + ~FixedHyperClockTable(); + + // For BaseClockTable::Insert + struct InsertState {}; + + void StartInsert(InsertState& state); - Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle, - Cache::Priority priority, size_t capacity, - bool strict_capacity_limit); + // Returns true iff there is room for the proposed number of entries. 
+ bool GrowIfNeeded(size_t new_occupancy, InsertState& state); + + HandleImpl* DoInsert(const ClockHandleBasicData& proto, + uint64_t initial_countdown, bool take_ref, + InsertState& state); + + // Runs the clock eviction algorithm trying to reclaim at least + // requested_charge. Returns how much is evicted, which could be less + // if it appears impossible to evict the requested amount without blocking. + void Evict(size_t requested_charge, InsertState& state, EvictionData* data); HandleImpl* Lookup(const UniqueId64x2& hashed_key); bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref); - void Ref(HandleImpl& handle); - void Erase(const UniqueId64x2& hashed_key); - void ConstApplyToEntriesRange(std::function func, - size_t index_begin, size_t index_end, - bool apply_if_will_be_deleted) const; - void EraseUnRefEntries(); size_t GetTableSize() const { return size_t{1} << length_bits_; } - int GetLengthBits() const { return length_bits_; } - - size_t GetOccupancy() const { - return occupancy_.load(std::memory_order_relaxed); - } - size_t GetOccupancyLimit() const { return occupancy_limit_; } - size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); } + const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; } - size_t GetDetachedUsage() const { - return detached_usage_.load(std::memory_order_relaxed); +#ifndef NDEBUG + size_t& TEST_MutableOccupancyLimit() { + return const_cast(occupancy_limit_); } - // Acquire/release N references - void TEST_RefN(HandleImpl& handle, size_t n); + // Release N references void TEST_ReleaseN(HandleImpl* handle, size_t n); +#endif + + // The load factor p is a real number in (0, 1) such that at all + // times at most a fraction p of all slots, without counting tombstones, + // are occupied by elements. This means that the probability that a random + // probe hits an occupied slot is at most p, and thus at most 1/p probes + // are required on average. For example, p = 70% implies that between 1 and 2 + // probes are needed on average (bear in mind that this reasoning doesn't + // consider the effects of clustering over time, which should be negligible + // with double hashing). + // Because the size of the hash table is always rounded up to the next + // power of 2, p is really an upper bound on the actual load factor---the + // actual load factor is anywhere between p/2 and p. This is a bit wasteful, + // but bear in mind that slots only hold metadata, not actual values. + // Since space cost is dominated by the values (the LSM blocks), + // overprovisioning the table with metadata only increases the total cache + // space usage by a tiny fraction. + static constexpr double kLoadFactor = 0.7; + + // The user can exceed kLoadFactor if the sizes of the inserted values don't + // match estimated_value_size, or in some rare cases with + // strict_capacity_limit == false. To avoid degenerate performance, we set a + // strict upper bound on the load factor. + static constexpr double kStrictLoadFactor = 0.84; private: // functions // Returns x mod 2^{length_bits_}. inline size_t ModTableSize(uint64_t x) { - return static_cast(x) & length_bits_mask_; + return BitwiseAnd(x, length_bits_mask_); } - // Runs the clock eviction algorithm trying to reclaim at least - // requested_charge. Returns how much is evicted, which could be less - // if it appears impossible to evict the requested amount without blocking. 
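FindSlot above walks a double-hashing probe sequence over a power-of-two table. A minimal sketch of generating such a sequence; forcing the increment odd (so it is co-prime with the table size and every slot is eventually visited) is an assumption about the probing details, not something spelled out in this header:

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative only: returns the i-th slot of a double-hashing probe sequence
// over a table of size 2^length_bits, derived from two hash words.
inline size_t SketchProbeSlot(uint64_t h1, uint64_t h2, size_t i,
                              int length_bits) {
  uint64_t mask = (uint64_t{1} << length_bits) - 1;
  uint64_t increment = h2 | 1;  // odd => co-prime with 2^length_bits
  return static_cast<size_t>((h1 + i * increment) & mask);
}
```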
- inline void Evict(size_t requested_charge, size_t* freed_charge, - size_t* freed_count); - - // Returns the first slot in the probe sequence, starting from the given - // probe number, with a handle e such that match(e) is true. At every - // step, the function first tests whether match(e) holds. If this is false, - // it evaluates abort(e) to decide whether the search should be aborted, - // and in the affirmative returns -1. For every handle e probed except - // the last one, the function runs update(e). - // The probe parameter is modified as follows. We say a probe to a handle - // e is aborting if match(e) is false and abort(e) is true. Then the final - // value of probe is one more than the last non-aborting probe during the - // call. This is so that that the variable can be used to keep track of - // progress across consecutive calls to FindSlot. + // Returns the first slot in the probe sequence with a handle e such that + // match_fn(e) is true. At every step, the function first tests whether + // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide + // whether the search should be aborted, and if so, FindSlot immediately + // returns nullptr. For every handle e that is not a match and not aborted, + // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that + // slot will be the last probed because the next would cycle back to the first + // slot probed. This function uses templates instead of std::function to + // minimize the risk of heap-allocated closures being created. + template inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key, - std::function match, - std::function stop, - std::function update, - size_t& probe); + const MatchFn& match_fn, const AbortFn& abort_fn, + const UpdateFn& update_fn); // Re-decrement all displacements in probe path starting from beginning // until (not including) the given handle @@ -492,33 +628,6 @@ class HyperClockTable { // before releasing it so that it can be provided to this function. inline void ReclaimEntryUsage(size_t total_charge); - // Helper for updating `usage_` for new entry with given `total_charge` - // and evicting if needed under strict_capacity_limit=true rules. This - // means the operation might fail with Status::MemoryLimit. If - // `need_evict_for_occupancy`, then eviction of at least one entry is - // required, and the operation should fail if not possible. - // NOTE: Otherwise, occupancy_ is not managed in this function - inline Status ChargeUsageMaybeEvictStrict(size_t total_charge, - size_t capacity, - bool need_evict_for_occupancy); - - // Helper for updating `usage_` for new entry with given `total_charge` - // and evicting if needed under strict_capacity_limit=false rules. This - // means that updating `usage_` always succeeds even if forced to exceed - // capacity. If `need_evict_for_occupancy`, then eviction of at least one - // entry is required, and the operation should return false if such eviction - // is not possible. `usage_` is not updated in that case. Otherwise, returns - // true, indicating success. - // NOTE: occupancy_ is not managed in this function - inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, - size_t capacity, - bool need_evict_for_occupancy); - - // Creates a "detached" handle for returning from an Insert operation that - // cannot be completed by actually inserting into the table. - // Updates `detached_usage_` but not `usage_` nor `occupancy_`. 
- inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto); - MemoryAllocator* GetAllocator() const { return allocator_; } // Returns the number of bits used to hash an element in the hash @@ -539,36 +648,330 @@ class HyperClockTable { // Array of slots comprising the hash table. const std::unique_ptr array_; +}; // class FixedHyperClockTable - // From Cache, for deleter - MemoryAllocator* const allocator_; +// Hash table for cache entries that resizes automatically based on occupancy. +// However, it depends on a contiguous memory region to grow into +// incrementally, using linear hashing, so uses an anonymous mmap so that +// only the used portion of the memory region is mapped to physical memory +// (part of RSS). +// +// This table implementation uses the same "low-level protocol" for managing +// the contens of an entry slot as FixedHyperClockTable does, captured in the +// ClockHandle struct. The provides most of the essential data safety, but +// AutoHyperClockTable is another "high-level protocol" for organizing entries +// into a hash table, with automatic resizing. +// +// This implementation is not fully wait-free but we can call it "essentially +// wait-free," and here's why. First, like FixedHyperClockCache, there is no +// locking nor other forms of waiting at the cache or shard level. Also like +// FixedHCC there is essentially an entry-level read-write lock implemented +// with atomics, but our relaxed atomicity/consistency guarantees (e.g. +// duplicate inserts are possible) mean we do not need to wait for entry +// locking. Lookups, non-erasing Releases, and non-evicting non-growing Inserts +// are all fully wait-free. Of course, these waits are not dependent on any +// external factors such as I/O. +// +// For operations that remove entries from a chain or grow the table by +// splitting a chain, there is a chain-level locking mechanism that we call a +// "rewrite" lock, and the only waits are for these locks. On average, each +// chain lock is relevant to < 2 entries each. (The average would be less than +// one entry each, but we do not lock when there's no entry to remove or +// migrate.) And a given thread can only hold two such chain locks at a time, +// more typically just one. So in that sense alone, the waiting that does exist +// is very localized. +// +// If we look closer at the operations utilizing that locking mechanism, we +// can see why it's "essentially wait-free." +// * Grow operations to increase the size of the table: each operation splits +// an existing chain into two, and chains for splitting are chosen in table +// order. Grow operations are fully parallel except for the chain locking, but +// for one Grow operation to wait on another, it has to be feeding into the +// other, which means the table has doubled in size already from other Grow +// operations without the original one finishing. So Grow operations are very +// low latency (unlike LRUCache doubling the table size in one operation) and +// very parallelizeable. (We use some tricks to break up dependencies in +// updating metadata on the usable size of the table.) And obviously Grow +// operations are very rare after the initial population of the table. +// * Evict operations (part of many Inserts): clock updates and evictions +// sweep through the structure in table order, so like Grow operations, +// parallel Evict can only wait on each other if an Evict has lingered (slept) +// long enough that the clock pointer has wrapped around the entire structure. 
+// * Random erasures (Erase, Release with erase_if_last_ref, etc.): these
+// operations are rare and not really considered performance critical.
+// Currently they're mostly used for removing placeholder cache entries, e.g.
+// for memory tracking, though that could use standalone entries instead to
+// avoid potential contention in table operations. It's possible that future
+// enhancements could pro-actively remove cache entries from obsolete files,
+// but that's not yet implemented.
+class AutoHyperClockTable : public BaseClockTable {
+ public:
+  // Target size to be exactly a common cache line size (see static_assert in
+  // clock_cache.cc)
+  struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
+    // To organize AutoHyperClockTable entries into a hash table while
+    // allowing the table size to grow without existing entries being moved,
+    // a version of chaining is used. Rather than being heap allocated (and
+    // incurring overheads to ensure memory safety) entries must go into
+    // Handles ("slots") in the pre-allocated array. To improve CPU cache
+    // locality, the chain head pointers are interleaved with the entries;
+    // specifically, a Handle contains
+    // * A head pointer for a chain of entries with this "home" location.
+    // * A ClockHandle, for an entry that may or may not be in the chain
+    //   starting from that head (but for performance ideally is on that
+    //   chain).
+    // * A next pointer for the continuation of the chain containing this
+    //   entry.
+    //
+    // The pointers are not raw pointers, but are indices into the array,
+    // and are decorated in two ways to help detect and recover from
+    // relevant concurrent modifications during Lookup, so that Lookup is
+    // fully wait-free:
+    // * Each "with_shift" pointer contains a shift count that indicates
+    //   how many hash bits were used in choosing the home address for the
+    //   chain--specifically the next entry in the chain.
+    // * The end of a chain is given a special "end" marker and refers back
+    //   to the head of the chain.
+    //
+    // Why do we need shift on each pointer? To make Lookup wait-free, we need
+    // to be able to query a chain without missing anything, and preferably
+    // avoid synchronously double-checking the length_info. Without the shifts,
+    // there is a risk that we start down a chain and while paused on an entry
+    // that goes to a new home, we then follow the rest of the
+    // partially-migrated chain to see the shared ending with the old home, but
+    // for a time were following the chain for the new home, missing some
+    // entries for the old home.
+    //
+    // Why do we need the end of the chain to loop back? If Lookup pauses
+    // at an "under construction" entry, and sees that "next" is null after
+    // waking up, we need something to tell whether the "under construction"
+    // entry was freed and reused for another chain. Otherwise, we could
+    // miss entries still on the original chain in the presence of a
+    // concurrent modification. Until an entry is fully erased from a chain,
+    // it is normal to see "under construction" entries on the chain, and it
+    // is not safe to read their hashed key without either a read reference
+    // on the entry or a rewrite lock on the chain.
+
+    // Marker in a "with_shift" head pointer for some thread owning writes
+    // to the chain structure (except for inserts), but only if not an
+    // "end" pointer. Also called the "rewrite lock."
+    static constexpr uint64_t kHeadLocked = uint64_t{1} << 7;
+
+    // Marker in a "with_shift" pointer for the end of a chain.
Must also + // point back to the head of the chain (with end marker removed). + // Also includes the "locked" bit so that attempting to lock an empty + // chain has no effect (not needed, as the lock is only needed for + // removals). + static constexpr uint64_t kNextEndFlags = (uint64_t{1} << 6) | kHeadLocked; + + static inline bool IsEnd(uint64_t next_with_shift) { + // Assuming certain values never used, suffices to check this one bit + constexpr auto kCheckBit = kNextEndFlags ^ kHeadLocked; + return next_with_shift & kCheckBit; + } + + // Bottom bits to right shift away to get an array index from a + // "with_shift" pointer. + static constexpr int kNextShift = 8; + + // A bit mask for the "shift" associated with each "with_shift" pointer. + // Always bottommost bits. + static constexpr int kShiftMask = 63; + + // A marker for head_next_with_shift that indicates this HandleImpl is + // heap allocated (standalone) rather than in the table. + static constexpr uint64_t kStandaloneMarker = UINT64_MAX; + + // A marker for head_next_with_shift indicating the head is not yet part + // of the usable table, or for chain_next_with_shift indicating that the + // entry is not present or is not yet part of a chain (must not be + // "shareable" state). + static constexpr uint64_t kUnusedMarker = 0; + + // See above. The head pointer is logically independent of the rest of + // the entry, including the chain next pointer. + AcqRelAtomic head_next_with_shift{kUnusedMarker}; + AcqRelAtomic chain_next_with_shift{kUnusedMarker}; + + // For supporting CreateStandalone and some fallback cases. + inline bool IsStandalone() const { + return head_next_with_shift.Load() == kStandaloneMarker; + } + + inline void SetStandalone() { + head_next_with_shift.Store(kStandaloneMarker); + } + }; // struct HandleImpl - // We partition the following members into different cache lines - // to avoid false sharing among Lookup, Release, Erase and Insert - // operations in ClockCacheShard. + struct Opts { + explicit Opts(size_t _min_avg_value_size) + : min_avg_value_size(_min_avg_value_size) {} + + explicit Opts(const HyperClockCacheOptions& opts) { + assert(opts.estimated_entry_charge == 0); + min_avg_value_size = opts.min_avg_entry_charge; + } + size_t min_avg_value_size; + }; - ALIGN_AS(CACHE_LINE_SIZE) - // Clock algorithm sweep pointer. - std::atomic clock_pointer_{}; + AutoHyperClockTable(size_t capacity, bool strict_capacity_limit, + CacheMetadataChargePolicy metadata_charge_policy, + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, + const uint32_t* hash_seed, const Opts& opts); + ~AutoHyperClockTable(); + + // For BaseClockTable::Insert + struct InsertState { + uint64_t saved_length_info = 0; + size_t likely_empty_slot = 0; + }; - ALIGN_AS(CACHE_LINE_SIZE) - // Number of elements in the table. - std::atomic occupancy_{}; + void StartInsert(InsertState& state); + + // Does initial check for whether there's hash table room for another + // inserted entry, possibly growing if needed. Returns true iff (after + // the call) there is room for the proposed number of entries. 
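Putting the constants above together: a with_shift pointer keeps the shift in the low 6 bits, the end and lock flags in bits 6-7, and the array index from bit 8 up. A small sketch of the encode/decode arithmetic (the function names echo MakeNextWithShift and friends from clock_cache.cc, but their exact signatures are assumed here):

```cpp
#include <cstddef>
#include <cstdint>

constexpr uint64_t kSketchHeadLocked = uint64_t{1} << 7;
constexpr uint64_t kSketchEndFlags = (uint64_t{1} << 6) | kSketchHeadLocked;
constexpr int kSketchNextShift = 8;
constexpr int kSketchShiftMask = 63;

inline uint64_t SketchMakeNextWithShift(size_t index, int shift) {
  return (static_cast<uint64_t>(index) << kSketchNextShift) |
         static_cast<uint64_t>(shift);
}

inline size_t SketchGetNext(uint64_t next_with_shift) {
  return static_cast<size_t>(next_with_shift >> kSketchNextShift);
}

inline int SketchGetShift(uint64_t next_with_shift) {
  return static_cast<int>(next_with_shift & kSketchShiftMask);
}

// An "end" pointer refers back to the home it belongs to and has the end
// flag (and lock bit) set, so locking an empty chain is a no-op.
inline uint64_t SketchMakeEnd(size_t home, int shift) {
  return SketchMakeNextWithShift(home, shift) | kSketchEndFlags;
}
```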
+ bool GrowIfNeeded(size_t new_occupancy, InsertState& state); - // Memory usage by entries tracked by the cache (including detached) - std::atomic usage_{}; + HandleImpl* DoInsert(const ClockHandleBasicData& proto, + uint64_t initial_countdown, bool take_ref, + InsertState& state); - // Part of usage by detached entries (not in table) - std::atomic detached_usage_{}; -}; // class HyperClockTable + // Runs the clock eviction algorithm trying to reclaim at least + // requested_charge. Returns how much is evicted, which could be less + // if it appears impossible to evict the requested amount without blocking. + void Evict(size_t requested_charge, InsertState& state, EvictionData* data); + + HandleImpl* Lookup(const UniqueId64x2& hashed_key); + + bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref); + + void Erase(const UniqueId64x2& hashed_key); + + void EraseUnRefEntries(); + + size_t GetTableSize() const; + + size_t GetOccupancyLimit() const; + + const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; } + +#ifndef NDEBUG + size_t& TEST_MutableOccupancyLimit() { + return *reinterpret_cast(&occupancy_limit_); + } + + // Release N references + void TEST_ReleaseN(HandleImpl* handle, size_t n); +#endif + + // Maximum ratio of number of occupied slots to number of usable slots. The + // actual load factor should float pretty close to this number, which should + // be a nice space/time trade-off, though large swings in WriteBufferManager + // memory could lead to low (but very much safe) load factors (only after + // seeing high load factors). Linear hashing along with (modified) linear + // probing to find an available slot increases potential risks of high + // load factors, so are disallowed. + static constexpr double kMaxLoadFactor = 0.60; + + private: // functions + // Returns true iff increased usable length. Due to load factor + // considerations, GrowIfNeeded might call this more than once to make room + // for one more entry. + bool Grow(InsertState& state); + + // Operational details of splitting a chain into two for Grow(). + void SplitForGrow(size_t grow_home, size_t old_home, int old_shift); + + // Takes an "under construction" entry and ensures it is no longer connected + // to its home chain (in preparaion for completing erasure and freeing the + // slot). Note that previous operations might have already noticed it being + // "under (de)construction" and removed it from its chain. + void Remove(HandleImpl* h); + + // Try to take ownership of an entry and erase+remove it from the table. + // Returns true if successful. Could fail if + // * There are other references to the entry + // * Some other thread has exclusive ownership or has freed it. + bool TryEraseHandle(HandleImpl* h, bool holding_ref, bool mark_invisible); + + // Calculates the appropriate maximum table size, for creating the memory + // mapping. + static size_t CalcMaxUsableLength( + size_t capacity, size_t min_avg_value_size, + CacheMetadataChargePolicy metadata_charge_policy); + + // Shared helper function that implements removing entries from a chain + // with proper handling to ensure all existing data is seen even in the + // presence of concurrent insertions, etc. (See implementation.) + template + void PurgeImpl(OpData* op_data, size_t home = SIZE_MAX); + + // An RAII wrapper for locking a chain of entries for removals. See + // implementation. + class ChainRewriteLock; + + // Helper function for PurgeImpl while holding a ChainRewriteLock. See + // implementation. 
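ChainRewriteLock, declared above, guards removals by setting the kHeadLocked bit in a chain's head pointer. A hedged sketch of the basic acquire/release idea, ignoring the end-marker special case and the real implementation's yield accounting:

```cpp
#include <atomic>
#include <cstdint>
#include <thread>

constexpr uint64_t kSketchHeadLockedBit = uint64_t{1} << 7;

// Spins until the lock bit is observed clear and we set it atomically.
// The real ChainRewriteLock also handles "end" head pointers (where the bit
// is pre-set so locking an empty chain has no effect) and counts yields.
inline uint64_t SketchLockHead(std::atomic<uint64_t>& head) {
  uint64_t cur = head.load(std::memory_order_acquire);
  for (;;) {
    if ((cur & kSketchHeadLockedBit) == 0 &&
        head.compare_exchange_weak(cur, cur | kSketchHeadLockedBit,
                                   std::memory_order_acq_rel)) {
      return cur;  // caller now owns rewrites to this chain
    }
    std::this_thread::yield();
    cur = head.load(std::memory_order_acquire);
  }
}

inline void SketchUnlockHead(std::atomic<uint64_t>& head) {
  head.fetch_and(~kSketchHeadLockedBit, std::memory_order_acq_rel);
}
```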
+ template + void PurgeImplLocked(OpData* op_data, ChainRewriteLock& rewrite_lock, + size_t home); + + // Update length_info_ as much as possible without waiting, given a known + // usable (ready for inserts and lookups) grow_home. (Previous grow_homes + // might not be usable yet, but we can check if they are by looking at + // the corresponding old home.) + void CatchUpLengthInfoNoWait(size_t known_usable_grow_home); + + private: // data + // mmaped area holding handles + const TypedMemMapping array_; + + // Metadata for table size under linear hashing. + // + // Lowest 8 bits are the minimum number of lowest hash bits to use + // ("min shift"). The upper 56 bits are a threshold. If that minumum number + // of bits taken from a hash value is < this threshold, then one more bit of + // hash value is taken and used. + // + // Other mechanisms (shift amounts on pointers) ensure complete availability + // of data already in the table even if a reader only sees a completely + // out-of-date version of this value. In the worst case, it could take + // log time to find the correct chain, but normally this value enables + // readers to find the correct chain on the first try. + // + // To maximize parallelization of Grow() operations, this field is only + // updated opportunistically after Grow() operations and in DoInsert() where + // it is found to be out-of-date. See CatchUpLengthInfoNoWait(). + AcqRelAtomic length_info_; + + // An already-computed version of the usable length times the max load + // factor. Could be slightly out of date but GrowIfNeeded()/Grow() handle + // that internally. + // (Relaxed: allowed to lag behind length_info_ by a little) + RelaxedAtomic occupancy_limit_; + + // The next index to use from array_ upon the next Grow(). Might be ahead of + // length_info_. + // (Relaxed: self-contained source of truth for next grow home) + RelaxedAtomic grow_frontier_; + + // See explanation in AutoHyperClockTable::Evict + // (Relaxed: allowed to lag behind clock_pointer_ and length_info_ state) + RelaxedAtomic clock_pointer_mask_; +}; // class AutoHyperClockTable // A single shard of sharded cache. -template +template class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { public: + using Table = TableT; ClockCacheShard(size_t capacity, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy, - MemoryAllocator* allocator, const typename Table::Opts& opts); + MemoryAllocator* allocator, + const Cache::EvictionCallback* eviction_callback, + const uint32_t* hash_seed, const typename Table::Opts& opts); // For CacheShard concept using HandleImpl = typename Table::HandleImpl; @@ -578,22 +981,23 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { static inline uint32_t HashPieceForSharding(HashCref hash) { return Upper32of64(hash[0]); } - static inline HashVal ComputeHash(const Slice& key) { + static inline HashVal ComputeHash(const Slice& key, uint32_t seed) { assert(key.size() == kCacheKeySize); HashVal in; HashVal out; // NOTE: endian dependence // TODO: use GetUnaligned? std::memcpy(&in, key.data(), kCacheKeySize); - BijectiveHash2x64(in[1], in[0], &out[1], &out[0]); + BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]); return out; } // For reconstructing key from hashed_key. 
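A minimal sketch of how a reader could turn the length_info_ layout documented above (low 8 bits = minimum shift, upper 56 bits = split threshold) into a home chain index for a hash value; the helper name is hypothetical, and the real lookup additionally tolerates stale length_info_ values as noted in the comment:

#include <cstddef>
#include <cstdint>

// Assumes min_shift < 63 so the bit shifts below stay in range.
inline size_t HomeFromLengthInfoSketch(uint64_t length_info, uint64_t hash) {
  const int min_shift = static_cast<int>(length_info & 255U);
  const uint64_t threshold = length_info >> 8;
  // Candidate home from the minimum number of low hash bits.
  uint64_t home = hash & ((uint64_t{1} << min_shift) - 1);
  // Homes below the threshold have already been split under linear
  // hashing, so one more hash bit selects between old and new home.
  if (home < threshold) {
    home = hash & ((uint64_t{1} << (min_shift + 1)) - 1);
  }
  return static_cast<size_t>(home);
}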
Requires the caller to provide // backing storage for the Slice in `unhashed` static inline Slice ReverseHash(const UniqueId64x2& hashed, - UniqueId64x2* unhashed) { + UniqueId64x2* unhashed, uint32_t seed) { BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]); + (*unhashed)[0] ^= seed; // NOTE: endian dependence return Slice(reinterpret_cast(unhashed), kCacheKeySize); } @@ -609,6 +1013,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { Cache::ObjectPtr value, const Cache::CacheItemHelper* helper, size_t charge, HandleImpl** handle, Cache::Priority priority); + HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key, + Cache::ObjectPtr obj, + const Cache::CacheItemHelper* helper, + size_t charge, bool allow_uncharged); + HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key); bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref); @@ -623,7 +1032,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { size_t GetUsage() const; - size_t GetDetachedUsage() const; + size_t GetStandaloneUsage() const; size_t GetPinnedUsage() const; @@ -646,43 +1055,42 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase { HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key, const Cache::CacheItemHelper* /*helper*/, Cache::CreateContext* /*create_context*/, - Cache::Priority /*priority*/, bool /*wait*/, - Statistics* /*stats*/) { + Cache::Priority /*priority*/, Statistics* /*stats*/) { return Lookup(key, hashed_key); } - bool IsReady(HandleImpl* /*handle*/) { return true; } - - void Wait(HandleImpl* /*handle*/) {} + Table& GetTable() { return table_; } + const Table& GetTable() const { return table_; } +#ifndef NDEBUG + size_t& TEST_MutableOccupancyLimit() { + return table_.TEST_MutableOccupancyLimit(); + } // Acquire/release N references void TEST_RefN(HandleImpl* handle, size_t n); void TEST_ReleaseN(HandleImpl* handle, size_t n); +#endif private: // data Table table_; // Maximum total charge of all elements stored in the table. - std::atomic capacity_; + // (Relaxed: eventual consistency/update is OK) + RelaxedAtomic capacity_; // Whether to reject insertion if cache reaches its full capacity. - std::atomic strict_capacity_limit_; + // (Relaxed: eventual consistency/update is OK) + RelaxedAtomic strict_capacity_limit_; }; // class ClockCacheShard -class HyperClockCache -#ifdef NDEBUG - final -#endif - : public ShardedCache> { +template +class BaseHyperClockCache : public ShardedCache> { public: - using Shard = ClockCacheShard; + using Shard = ClockCacheShard
; + using Handle = Cache::Handle; + using CacheItemHelper = Cache::CacheItemHelper; - HyperClockCache(size_t capacity, size_t estimated_value_size, - int num_shard_bits, bool strict_capacity_limit, - CacheMetadataChargePolicy metadata_charge_policy, - std::shared_ptr memory_allocator); - - const char* Name() const override { return "HyperClockCache"; } + explicit BaseHyperClockCache(const HyperClockCacheOptions& opts); Cache::ObjectPtr Value(Handle* handle) override; @@ -692,7 +1100,35 @@ class HyperClockCache void ReportProblems( const std::shared_ptr& /*info_log*/) const override; -}; // class HyperClockCache +}; + +class FixedHyperClockCache +#ifdef NDEBUG + final +#endif + : public BaseHyperClockCache { + public: + using BaseHyperClockCache::BaseHyperClockCache; + + const char* Name() const override { return "FixedHyperClockCache"; } + + void ReportProblems( + const std::shared_ptr& /*info_log*/) const override; +}; // class FixedHyperClockCache + +class AutoHyperClockCache +#ifdef NDEBUG + final +#endif + : public BaseHyperClockCache { + public: + using BaseHyperClockCache::BaseHyperClockCache; + + const char* Name() const override { return "AutoHyperClockCache"; } + + void ReportProblems( + const std::shared_ptr& /*info_log*/) const override; +}; // class AutoHyperClockCache } // namespace clock_cache diff --git a/cache/compressed_secondary_cache.cc b/cache/compressed_secondary_cache.cc index 23154d4f2a58..b29670b7730f 100644 --- a/cache/compressed_secondary_cache.cc +++ b/cache/compressed_secondary_cache.cc @@ -9,40 +9,40 @@ #include #include -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "monitoring/perf_context_imp.h" +#include "util/coding.h" #include "util/compression.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { CompressedSecondaryCache::CompressedSecondaryCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - CompressionType compression_type, uint32_t compress_format_version, - bool enable_custom_split_merge) - : cache_options_(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, - use_adaptive_mutex, metadata_charge_policy, - compression_type, compress_format_version, - enable_custom_split_merge) { - cache_ = - NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, - metadata_charge_policy, low_pri_pool_ratio); -} + const CompressedSecondaryCacheOptions& opts) + : cache_(opts.LRUCacheOptions::MakeSharedCache()), + cache_options_(opts), + cache_res_mgr_(std::make_shared( + std::make_shared>( + cache_))), + disable_cache_(opts.capacity == 0) {} -CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); } +CompressedSecondaryCache::~CompressedSecondaryCache() {} std::unique_ptr CompressedSecondaryCache::Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase, - bool& is_in_sec_cache) { + bool& kept_in_sec_cache) { assert(helper); + // This is a minor optimization. Its ok to skip it in TSAN in order to + // avoid a false positive. 
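Tying back to the Fixed/Auto split declared just above: the AutoHyperClockTable::Opts constructor earlier in this header asserts estimated_entry_charge == 0 and reads min_avg_entry_charge instead, so presumably a zero estimated_entry_charge in HyperClockCacheOptions selects the automatic table while a nonzero value keeps the fixed one. A usage sketch under that assumption (MakeSharedCache() as the construction path; confirm against the public header):

#include <memory>

#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeHyperClockCacheSketch(
    bool automatic_table) {
  ROCKSDB_NAMESPACE::HyperClockCacheOptions opts(
      /*_capacity=*/size_t{512} << 20,
      // 0 selects automatic table sizing; nonzero keeps the fixed table.
      /*_estimated_entry_charge=*/automatic_table ? size_t{0}
                                                  : size_t{8} << 10);
  return opts.MakeSharedCache();
}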
+#ifndef __SANITIZE_THREAD__ + if (disable_cache_) { + return nullptr; + } +#endif + std::unique_ptr handle; - is_in_sec_cache = false; + kept_in_sec_cache = false; Cache::Handle* lru_handle = cache_->Lookup(key); if (lru_handle == nullptr) { return nullptr; @@ -57,39 +57,65 @@ std::unique_ptr CompressedSecondaryCache::Lookup( CacheAllocationPtr* ptr{nullptr}; CacheAllocationPtr merged_value; size_t handle_value_charge{0}; + const char* data_ptr = nullptr; + CacheTier source = CacheTier::kVolatileCompressedTier; + CompressionType type = cache_options_.compression_type; if (cache_options_.enable_custom_split_merge) { CacheValueChunk* value_chunk_ptr = reinterpret_cast(handle_value); merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge); ptr = &merged_value; + data_ptr = ptr->get(); } else { + uint32_t type_32 = static_cast(type); + uint32_t source_32 = static_cast(source); ptr = reinterpret_cast(handle_value); handle_value_charge = cache_->GetCharge(lru_handle); + data_ptr = ptr->get(); + data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1, + static_cast(&type_32)); + type = static_cast(type_32); + data_ptr = GetVarint32Ptr(data_ptr, data_ptr + 1, + static_cast(&source_32)); + source = static_cast(source_32); + handle_value_charge -= (data_ptr - ptr->get()); } MemoryAllocator* allocator = cache_options_.memory_allocator.get(); Status s; Cache::ObjectPtr value{nullptr}; size_t charge{0}; - if (cache_options_.compression_type == kNoCompression) { - s = helper->create_cb(Slice(ptr->get(), handle_value_charge), - create_context, allocator, &value, &charge); - } else { - UncompressionContext uncompression_context(cache_options_.compression_type); - UncompressionInfo uncompression_info(uncompression_context, - UncompressionDict::GetEmptyDict(), - cache_options_.compression_type); - - size_t uncompressed_size{0}; - CacheAllocationPtr uncompressed = UncompressData( - uncompression_info, (char*)ptr->get(), handle_value_charge, - &uncompressed_size, cache_options_.compress_format_version, allocator); - - if (!uncompressed) { - cache_->Release(lru_handle, /*erase_if_last_ref=*/true); - return nullptr; + if (source == CacheTier::kVolatileCompressedTier) { + if (cache_options_.compression_type == kNoCompression || + cache_options_.do_not_compress_roles.Contains(helper->role)) { + s = helper->create_cb(Slice(data_ptr, handle_value_charge), + kNoCompression, CacheTier::kVolatileTier, + create_context, allocator, &value, &charge); + } else { + UncompressionContext uncompression_context( + cache_options_.compression_type); + UncompressionInfo uncompression_info(uncompression_context, + UncompressionDict::GetEmptyDict(), + cache_options_.compression_type); + + size_t uncompressed_size{0}; + CacheAllocationPtr uncompressed = + UncompressData(uncompression_info, (char*)data_ptr, + handle_value_charge, &uncompressed_size, + cache_options_.compress_format_version, allocator); + + if (!uncompressed) { + cache_->Release(lru_handle, /*erase_if_last_ref=*/true); + return nullptr; + } + s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size), + kNoCompression, CacheTier::kVolatileTier, + create_context, allocator, &value, &charge); } - s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size), + } else { + // The item was not compressed by us. 
Let the helper create_cb + // uncompress it + s = helper->create_cb(Slice(data_ptr, handle_value_charge), type, source, create_context, allocator, &value, &charge); } @@ -107,46 +133,66 @@ std::unique_ptr CompressedSecondaryCache::Lookup( /*charge=*/0) .PermitUncheckedError(); } else { - is_in_sec_cache = true; + kept_in_sec_cache = true; cache_->Release(lru_handle, /*erase_if_last_ref=*/false); } handle.reset(new CompressedSecondaryCacheResultHandle(value, charge)); return handle; } -Status CompressedSecondaryCache::Insert(const Slice& key, - Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper) { - if (value == nullptr) { - return Status::InvalidArgument(); - } - - Cache::Handle* lru_handle = cache_->Lookup(key); +bool CompressedSecondaryCache::MaybeInsertDummy(const Slice& key) { auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge); + Cache::Handle* lru_handle = cache_->Lookup(key); if (lru_handle == nullptr) { PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1); // Insert a dummy handle if the handle is evicted for the first time. - return cache_->Insert(key, /*obj=*/nullptr, internal_helper, - /*charge=*/0); + cache_->Insert(key, /*obj=*/nullptr, internal_helper, /*charge=*/0) + .PermitUncheckedError(); + return true; } else { cache_->Release(lru_handle, /*erase_if_last_ref=*/false); } - size_t size = (*helper->size_cb)(value); + return false; +} + +Status CompressedSecondaryCache::InsertInternal( + const Slice& key, Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, CompressionType type, + CacheTier source) { + if (source != CacheTier::kVolatileCompressedTier && + cache_options_.enable_custom_split_merge) { + // We don't support custom split/merge for the tiered case + return Status::OK(); + } + + auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge); + char header[10]; + char* payload = header; + payload = EncodeVarint32(payload, static_cast(type)); + payload = EncodeVarint32(payload, static_cast(source)); + + size_t header_size = payload - header; + size_t data_size = (*helper->size_cb)(value); + size_t total_size = data_size + header_size; CacheAllocationPtr ptr = - AllocateBlock(size, cache_options_.memory_allocator.get()); + AllocateBlock(total_size, cache_options_.memory_allocator.get()); + char* data_ptr = ptr.get() + header_size; - Status s = (*helper->saveto_cb)(value, 0, size, ptr.get()); + Status s = (*helper->saveto_cb)(value, 0, data_size, data_ptr); if (!s.ok()) { return s; } - Slice val(ptr.get(), size); + Slice val(data_ptr, data_size); std::string compressed_val; - if (cache_options_.compression_type != kNoCompression) { - PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size); + if (cache_options_.compression_type != kNoCompression && + type == kNoCompression && + !cache_options_.do_not_compress_roles.Contains(helper->role)) { + PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, data_size); CompressionOptions compression_opts; - CompressionContext compression_context(cache_options_.compression_type); + CompressionContext compression_context(cache_options_.compression_type, + compression_opts); uint64_t sample_for_compression{0}; CompressionInfo compression_info( compression_opts, compression_context, CompressionDict::GetEmptyDict(), @@ -161,12 +207,14 @@ Status CompressedSecondaryCache::Insert(const Slice& key, } val = Slice(compressed_val); - size = compressed_val.size(); - PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, size); + data_size = compressed_val.size(); + 
total_size = header_size + data_size; + PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, data_size); if (!cache_options_.enable_custom_split_merge) { - ptr = AllocateBlock(size, cache_options_.memory_allocator.get()); - memcpy(ptr.get(), compressed_val.data(), size); + ptr = AllocateBlock(total_size, cache_options_.memory_allocator.get()); + data_ptr = ptr.get() + header_size; + memcpy(data_ptr, compressed_val.data(), data_size); } } @@ -177,9 +225,43 @@ Status CompressedSecondaryCache::Insert(const Slice& key, SplitValueIntoChunks(val, cache_options_.compression_type, charge); return cache_->Insert(key, value_chunks_head, internal_helper, charge); } else { + std::memcpy(ptr.get(), header, header_size); CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr)); - return cache_->Insert(key, buf, internal_helper, size); + return cache_->Insert(key, buf, internal_helper, total_size); + } +} + +Status CompressedSecondaryCache::Insert(const Slice& key, + Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, + bool force_insert) { + if (value == nullptr) { + return Status::InvalidArgument(); + } + + if (!force_insert && MaybeInsertDummy(key)) { + return Status::OK(); + } + + return InsertInternal(key, value, helper, kNoCompression, + CacheTier::kVolatileCompressedTier); +} + +Status CompressedSecondaryCache::InsertSaved( + const Slice& key, const Slice& saved, CompressionType type = kNoCompression, + CacheTier source = CacheTier::kVolatileTier) { + if (type == kNoCompression) { + return Status::OK(); } + + auto slice_helper = &kSliceCacheItemHelper; + if (MaybeInsertDummy(key)) { + return Status::OK(); + } + + return InsertInternal( + key, static_cast(const_cast(&saved)), + slice_helper, type, source); } void CompressedSecondaryCache::Erase(const Slice& key) { cache_->Erase(key); } @@ -188,6 +270,7 @@ Status CompressedSecondaryCache::SetCapacity(size_t capacity) { MutexLock l(&capacity_mutex_); cache_options_.capacity = capacity; cache_->SetCapacity(capacity); + disable_cache_ = capacity == 0; return Status::OK(); } @@ -308,30 +391,17 @@ const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper( } } -std::shared_ptr NewCompressedSecondaryCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - CompressionType compression_type, uint32_t compress_format_version, - bool enable_custom_split_merge) { - return std::make_shared( - capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - low_pri_pool_ratio, memory_allocator, use_adaptive_mutex, - metadata_charge_policy, compression_type, compress_format_version, - enable_custom_split_merge); +std::shared_ptr +CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const { + return std::make_shared(*this); +} + +Status CompressedSecondaryCache::Deflate(size_t decrease) { + return cache_res_mgr_->UpdateCacheReservation(decrease, /*increase=*/true); } -std::shared_ptr NewCompressedSecondaryCache( - const CompressedSecondaryCacheOptions& opts) { - // The secondary_cache is disabled for this LRUCache instance. 
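The two-varint header written by InsertInternal() and parsed again in Lookup() above can be shown with a small round trip using the same util/coding.h helpers (values are arbitrary examples; both enums fit in a single varint byte, which is why the one-byte limit passed to GetVarint32Ptr in Lookup suffices):

#include <cassert>
#include <cstddef>
#include <cstdint>

#include "util/coding.h"  // EncodeVarint32 / GetVarint32Ptr

void HeaderRoundTripSketch() {
  using ROCKSDB_NAMESPACE::EncodeVarint32;
  using ROCKSDB_NAMESPACE::GetVarint32Ptr;

  char header[10];
  char* payload = header;
  // Same layout as InsertInternal(): compression type, then source tier.
  payload = EncodeVarint32(payload, /*type=*/4);    // e.g. LZ4
  payload = EncodeVarint32(payload, /*source=*/1);  // e.g. compressed tier
  const size_t header_size = payload - header;

  // Decode the way Lookup() does, one byte at a time.
  uint32_t type = 0;
  uint32_t source = 0;
  const char* p = header;
  p = GetVarint32Ptr(p, p + 1, &type);
  p = GetVarint32Ptr(p, p + 1, &source);
  assert(type == 4 && source == 1);
  assert(static_cast<size_t>(p - header) == header_size);
}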
- assert(opts.secondary_cache == nullptr); - return NewCompressedSecondaryCache( - opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit, - opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator, - opts.use_adaptive_mutex, opts.metadata_charge_policy, - opts.compression_type, opts.compress_format_version, - opts.enable_custom_split_merge); +Status CompressedSecondaryCache::Inflate(size_t increase) { + return cache_res_mgr_->UpdateCacheReservation(increase, /*increase=*/false); } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/compressed_secondary_cache.h b/cache/compressed_secondary_cache.h index e38a1a861e17..32e6fd0df9b6 100644 --- a/cache/compressed_secondary_cache.h +++ b/cache/compressed_secondary_cache.h @@ -9,8 +9,9 @@ #include #include +#include "cache/cache_reservation_manager.h" #include "cache/lru_cache.h" -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "rocksdb/secondary_cache.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -69,27 +70,23 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle { class CompressedSecondaryCache : public SecondaryCache { public: - CompressedSecondaryCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex, - CacheMetadataChargePolicy metadata_charge_policy = - kDefaultCacheMetadataChargePolicy, - CompressionType compression_type = CompressionType::kLZ4Compression, - uint32_t compress_format_version = 2, - bool enable_custom_split_merge = false); + explicit CompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts); ~CompressedSecondaryCache() override; const char* Name() const override { return "CompressedSecondaryCache"; } Status Insert(const Slice& key, Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper) override; + const Cache::CacheItemHelper* helper, + bool force_insert) override; + + Status InsertSaved(const Slice& key, const Slice& saved, CompressionType type, + CacheTier source) override; std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase, - bool& is_in_sec_cache) override; + bool& kept_in_sec_cache) override; bool SupportForceErase() const override { return true; } @@ -101,10 +98,16 @@ class CompressedSecondaryCache : public SecondaryCache { Status GetCapacity(size_t& capacity) override; + Status Deflate(size_t decrease) override; + + Status Inflate(size_t increase) override; + std::string GetPrintableOptions() const override; + size_t TEST_GetUsage() { return cache_->GetUsage(); } + private: - friend class CompressedSecondaryCacheTest; + friend class CompressedSecondaryCacheTestBase; static constexpr std::array malloc_bin_sizes_{ 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; @@ -130,11 +133,19 @@ class CompressedSecondaryCache : public SecondaryCache { CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head, size_t& charge); + bool MaybeInsertDummy(const Slice& key); + + Status InsertInternal(const Slice& key, Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, + CompressionType type, CacheTier source); + // TODO: clean up to use cleaner interfaces in typed_cache.h const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const; std::shared_ptr cache_; CompressedSecondaryCacheOptions cache_options_; 
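A brief usage sketch for the construction and resizing paths added above (illustrative; Deflate() shrinks the usable secondary capacity by increasing an internal cache reservation and Inflate() releases that reservation again, which is why the boolean arguments to UpdateCacheReservation look inverted, and both are expected to be driven by the tiered-cache machinery rather than by applications):

#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache>
MakeCompressedSecondarySketch() {
  ROCKSDB_NAMESPACE::CompressedSecondaryCacheOptions opts;
  opts.capacity = 64 << 20;
  opts.num_shard_bits = 0;
  // Replaces the removed NewCompressedSecondaryCache(...) overloads.
  std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> sec =
      opts.MakeSharedSecondaryCache();
  // Hand roughly 8MB back to the primary tier, then take it back.
  sec->Deflate(8 << 20).PermitUncheckedError();
  sec->Inflate(8 << 20).PermitUncheckedError();
  return sec;
}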
mutable port::Mutex capacity_mutex_; + std::shared_ptr cache_res_mgr_; + bool disable_cache_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/cache/compressed_secondary_cache_test.cc b/cache/compressed_secondary_cache_test.cc index c13b8b390fb6..d72680b845eb 100644 --- a/cache/compressed_secondary_cache_test.cc +++ b/cache/compressed_secondary_cache_test.cc @@ -5,93 +5,46 @@ #include "cache/compressed_secondary_cache.h" +#include #include #include #include +#include "cache/secondary_cache_adapter.h" #include "memory/jemalloc_nodump_allocator.h" +#include "rocksdb/cache.h" #include "rocksdb/convenience.h" +#include "test_util/secondary_cache_test_util.h" #include "test_util/testharness.h" #include "test_util/testutil.h" +#include "util/cast_util.h" namespace ROCKSDB_NAMESPACE { -class CompressedSecondaryCacheTest : public testing::Test, - public Cache::CreateContext { - public: - CompressedSecondaryCacheTest() : fail_create_(false) {} - ~CompressedSecondaryCacheTest() override = default; - - protected: - class TestItem { - public: - TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { - memcpy(buf_.get(), buf, size); - } - ~TestItem() = default; - - char* Buf() { return buf_.get(); } - [[nodiscard]] size_t Size() const { return size_; } - - private: - std::unique_ptr buf_; - size_t size_; - }; - - static size_t SizeCallback(Cache::ObjectPtr obj) { - return static_cast(obj)->Size(); - } - - static Status SaveToCallback(Cache::ObjectPtr from_obj, size_t from_offset, - size_t length, char* out) { - auto item = static_cast(from_obj); - const char* buf = item->Buf(); - EXPECT_EQ(length, item->Size()); - EXPECT_EQ(from_offset, 0); - memcpy(out, buf, length); - return Status::OK(); - } - - static void DeletionCallback(Cache::ObjectPtr obj, - MemoryAllocator* /*alloc*/) { - delete static_cast(obj); - obj = nullptr; - } - - static Status SaveToCallbackFail(Cache::ObjectPtr /*obj*/, size_t /*offset*/, - size_t /*size*/, char* /*out*/) { - return Status::NotSupported(); - } +using secondary_cache_test_util::GetTestingCacheTypes; +using secondary_cache_test_util::WithCacheType; - static Status CreateCallback(const Slice& data, Cache::CreateContext* context, - MemoryAllocator* /*allocator*/, - Cache::ObjectPtr* out_obj, size_t* out_charge) { - auto t = static_cast(context); - if (t->fail_create_) { - return Status::NotSupported(); - } - *out_obj = new TestItem(data.data(), data.size()); - *out_charge = data.size(); - return Status::OK(); - } - - static constexpr Cache::CacheItemHelper kHelper{ - CacheEntryRole::kMisc, &DeletionCallback, &SizeCallback, &SaveToCallback, - &CreateCallback}; - - static constexpr Cache::CacheItemHelper kHelperFail{ - CacheEntryRole::kMisc, &DeletionCallback, &SizeCallback, - &SaveToCallbackFail, &CreateCallback}; +// 16 bytes for HCC compatibility +const std::string key0 = "____ ____key0"; +const std::string key1 = "____ ____key1"; +const std::string key2 = "____ ____key2"; +const std::string key3 = "____ ____key3"; - void SetFailCreate(bool fail) { fail_create_ = fail; } +class CompressedSecondaryCacheTestBase : public testing::Test, + public WithCacheType { + public: + CompressedSecondaryCacheTestBase() {} + ~CompressedSecondaryCacheTestBase() override = default; + protected: void BasicTestHelper(std::shared_ptr sec_cache, bool sec_cache_is_compressed) { get_perf_context()->Reset(); - bool is_in_sec_cache{true}; + bool kept_in_sec_cache{true}; // Lookup an non-existent key. 
- std::unique_ptr handle0 = sec_cache->Lookup( - "k0", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache); + std::unique_ptr handle0 = + sec_cache->Lookup(key0, GetHelper(), this, true, /*advise_erase=*/true, + kept_in_sec_cache); ASSERT_EQ(handle0, nullptr); Random rnd(301); @@ -99,23 +52,25 @@ class CompressedSecondaryCacheTest : public testing::Test, std::string str1(rnd.RandomString(1000)); TestItem item1(str1.data(), str1.length()); // A dummy handle is inserted if the item is inserted for the first time. - ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper)); + ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1); ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); - std::unique_ptr handle1_1 = sec_cache->Lookup( - "k1", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache); + std::unique_ptr handle1_1 = + sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, + kept_in_sec_cache); ASSERT_EQ(handle1_1, nullptr); // Insert and Lookup the item k1 for the second time and advise erasing it. - ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper)); + ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); - std::unique_ptr handle1_2 = sec_cache->Lookup( - "k1", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache); + std::unique_ptr handle1_2 = + sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, + kept_in_sec_cache); ASSERT_NE(handle1_2, nullptr); - ASSERT_FALSE(is_in_sec_cache); + ASSERT_FALSE(kept_in_sec_cache); if (sec_cache_is_compressed) { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 1000); @@ -132,20 +87,22 @@ class CompressedSecondaryCacheTest : public testing::Test, ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0); // Lookup the item k1 again. - std::unique_ptr handle1_3 = sec_cache->Lookup( - "k1", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache); + std::unique_ptr handle1_3 = + sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true, + kept_in_sec_cache); ASSERT_EQ(handle1_3, nullptr); // Insert and Lookup the item k2. 
std::string str2(rnd.RandomString(1000)); TestItem item2(str2.data(), str2.length()); - ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper)); + ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2); - std::unique_ptr handle2_1 = sec_cache->Lookup( - "k2", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache); + std::unique_ptr handle2_1 = + sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, + kept_in_sec_cache); ASSERT_EQ(handle2_1, nullptr); - ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper)); + ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2); if (sec_cache_is_compressed) { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, @@ -156,8 +113,9 @@ class CompressedSecondaryCacheTest : public testing::Test, ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); } - std::unique_ptr handle2_2 = sec_cache->Lookup( - "k2", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache); + std::unique_ptr handle2_2 = + sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, + kept_in_sec_cache); ASSERT_NE(handle2_2, nullptr); std::unique_ptr val2 = std::unique_ptr(static_cast(handle2_2->Value())); @@ -226,24 +184,26 @@ class CompressedSecondaryCacheTest : public testing::Test, std::string str1(rnd.RandomString(1000)); TestItem item1(str1.data(), str1.length()); // Insert a dummy handle. - ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper)); + ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); // Insert k1. - ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper)); + ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); // Insert and Lookup the second item. std::string str2(rnd.RandomString(200)); TestItem item2(str2.data(), str2.length()); // Insert a dummy handle, k1 is not evicted. - ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper)); - bool is_in_sec_cache{false}; - std::unique_ptr handle1 = sec_cache->Lookup( - "k1", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); + bool kept_in_sec_cache{false}; + std::unique_ptr handle1 = + sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, + kept_in_sec_cache); ASSERT_EQ(handle1, nullptr); // Insert k2 and k1 is evicted. - ASSERT_OK(sec_cache->Insert("k2", &item2, &kHelper)); - std::unique_ptr handle2 = sec_cache->Lookup( - "k2", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache); + ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper(), false)); + std::unique_ptr handle2 = + sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false, + kept_in_sec_cache); ASSERT_NE(handle2, nullptr); std::unique_ptr val2 = std::unique_ptr(static_cast(handle2->Value())); @@ -251,24 +211,26 @@ class CompressedSecondaryCacheTest : public testing::Test, ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0); // Insert k1 again and a dummy handle is inserted. 
- ASSERT_OK(sec_cache->Insert("k1", &item1, &kHelper)); + ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper(), false)); - std::unique_ptr handle1_1 = sec_cache->Lookup( - "k1", &kHelper, this, true, /*advise_erase=*/false, is_in_sec_cache); + std::unique_ptr handle1_1 = + sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false, + kept_in_sec_cache); ASSERT_EQ(handle1_1, nullptr); // Create Fails. SetFailCreate(true); - std::unique_ptr handle2_1 = sec_cache->Lookup( - "k2", &kHelper, this, true, /*advise_erase=*/true, is_in_sec_cache); + std::unique_ptr handle2_1 = + sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/true, + kept_in_sec_cache); ASSERT_EQ(handle2_1, nullptr); // Save Fails. std::string str3 = rnd.RandomString(10); TestItem item3(str3.data(), str3.length()); // The Status is OK because a dummy handle is inserted. - ASSERT_OK(sec_cache->Insert("k3", &item3, &kHelperFail)); - ASSERT_NOK(sec_cache->Insert("k3", &item3, &kHelperFail)); + ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail(), false)); + ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail(), false)); sec_cache.reset(); } @@ -292,26 +254,22 @@ class CompressedSecondaryCacheTest : public testing::Test, secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge; std::shared_ptr secondary_cache = NewCompressedSecondaryCache(secondary_cache_opts); - LRUCacheOptions lru_cache_opts( + std::shared_ptr cache = NewCache( /*_capacity =*/1300, /*_num_shard_bits =*/0, - /*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5, - /*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex, - kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0); - lru_cache_opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(lru_cache_opts); + /*_strict_capacity_limit =*/true, secondary_cache); std::shared_ptr stats = CreateDBStatistics(); get_perf_context()->Reset(); Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1_1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1_1, &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length())); std::string str2 = rnd.RandomString(1012); auto item2_1 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's dummy item. - ASSERT_OK(cache->Insert("k2", item2_1, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key2, item2_1, GetHelper(), str2.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1); ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); @@ -320,19 +278,19 @@ class CompressedSecondaryCacheTest : public testing::Test, auto item3_1 = new TestItem(str3.data(), str3.length()); // After this Insert, primary cache contains k3 and secondary cache contains // k1's dummy item and k2's dummy item. - ASSERT_OK(cache->Insert("k3", item3_1, &kHelper, str3.length())); + ASSERT_OK(cache->Insert(key3, item3_1, GetHelper(), str3.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2); // After this Insert, primary cache contains k1 and secondary cache contains // k1's dummy item, k2's dummy item, and k3's dummy item. 
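The pattern exercised by the tests above (the first Insert of a key records only a dummy entry, and only the second Insert stores the real payload) is the admission gate factored into MaybeInsertDummy() earlier in this patch. A standalone sketch of that gate, with a hypothetical set standing in for the real dummy LRU entries:

#include <string>
#include <unordered_set>

// Hypothetical stand-in: the real code records the dummy as a zero-charge
// entry in the backing LRU cache, but the admission decision is the same.
class DummyAdmissionGateSketch {
 public:
  // Returns true if only a dummy was recorded (skip the real insertion
  // this time); false if the key has been seen before (store it now).
  bool MaybeInsertDummy(const std::string& key) {
    return seen_.insert(key).second;
  }

 private:
  std::unordered_set<std::string> seen_;
};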
auto item1_2 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1_2, &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3); // After this Insert, primary cache contains k2 and secondary cache contains // k1's item, k2's dummy item, and k3's dummy item. auto item2_2 = new TestItem(str2.data(), str2.length()); - ASSERT_OK(cache->Insert("k2", item2_2, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1); if (sec_cache_is_compressed) { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, @@ -347,7 +305,7 @@ class CompressedSecondaryCacheTest : public testing::Test, // After this Insert, primary cache contains k3 and secondary cache contains // k1's item and k2's item. auto item3_2 = new TestItem(str3.data(), str3.length()); - ASSERT_OK(cache->Insert("k3", item3_2, &kHelper, str3.length())); + ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2); if (sec_cache_is_compressed) { ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, @@ -360,7 +318,7 @@ class CompressedSecondaryCacheTest : public testing::Test, } Cache::Handle* handle; - handle = cache->Lookup("k3", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); auto val3 = static_cast(cache->Value(handle)); @@ -369,13 +327,13 @@ class CompressedSecondaryCacheTest : public testing::Test, cache->Release(handle); // Lookup an non-existent key. - handle = cache->Lookup("k0", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key0, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_EQ(handle, nullptr); // This Lookup should just insert a dummy handle in the primary cache // and the k1 is still in the secondary cache. - handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1); @@ -387,7 +345,7 @@ class CompressedSecondaryCacheTest : public testing::Test, // This Lookup should erase k1 from the secondary cache and insert // it into primary cache; then k3 is demoted. // k2 and k3 are in secondary cache. - handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1); @@ -395,7 +353,7 @@ class CompressedSecondaryCacheTest : public testing::Test, cache->Release(handle); // k2 is still in secondary cache. - handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2); @@ -403,7 +361,7 @@ class CompressedSecondaryCacheTest : public testing::Test, // Testing SetCapacity(). 
ASSERT_OK(secondary_cache->SetCapacity(0)); - handle = cache->Lookup("k3", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_EQ(handle, nullptr); @@ -413,30 +371,30 @@ class CompressedSecondaryCacheTest : public testing::Test, ASSERT_EQ(capacity, 7000); auto item1_3 = new TestItem(str1.data(), str1.length()); // After this Insert, primary cache contains k1. - ASSERT_OK(cache->Insert("k1", item1_3, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key1, item1_3, GetHelper(), str2.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4); auto item2_3 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's dummy item. - ASSERT_OK(cache->Insert("k2", item2_3, &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key2, item2_3, GetHelper(), str1.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4); auto item1_4 = new TestItem(str1.data(), str1.length()); // After this Insert, primary cache contains k1 and secondary cache contains // k1's dummy item and k2's dummy item. - ASSERT_OK(cache->Insert("k1", item1_4, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key1, item1_4, GetHelper(), str2.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5); auto item2_4 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's real item and k2's dummy item. - ASSERT_OK(cache->Insert("k2", item2_4, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key2, item2_4, GetHelper(), str2.length())); ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5); // This Lookup should just insert a dummy handle in the primary cache // and the k1 is still in the secondary cache. 
- handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true, + handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); @@ -464,26 +422,31 @@ class CompressedSecondaryCacheTest : public testing::Test, std::shared_ptr secondary_cache = NewCompressedSecondaryCache(secondary_cache_opts); - LRUCacheOptions opts( + std::shared_ptr cache = NewCache( /*_capacity=*/1300, /*_num_shard_bits=*/0, - /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, - /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, - kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + /*_strict_capacity_limit=*/false, secondary_cache); Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1 = std::make_unique(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1.get(), &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key1, item1.get(), GetHelper(), str1.length())); item1.release(); // Appease clang-analyze "potential memory leak" Cache::Handle* handle; - handle = cache->Lookup("k2", nullptr, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key2, nullptr, this, Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); - handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, false); + handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); + Cache::AsyncLookupHandle ah; + ah.key = key2; + ah.helper = GetHelper(); + ah.create_context = this; + ah.priority = Cache::Priority::LOW; + cache->StartAsyncLookup(ah); + cache->Wait(ah); + ASSERT_EQ(ah.Result(), nullptr); + cache.reset(); secondary_cache.reset(); } @@ -506,36 +469,29 @@ class CompressedSecondaryCacheTest : public testing::Test, std::shared_ptr secondary_cache = NewCompressedSecondaryCache(secondary_cache_opts); - LRUCacheOptions opts( + std::shared_ptr cache = NewCache( /*_capacity=*/1300, /*_num_shard_bits=*/0, - /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, - /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, - kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + /*_strict_capacity_limit=*/true, secondary_cache); Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1, &kHelperFail, str1.length())); + ASSERT_OK(cache->Insert(key1, item1, GetHelperFail(), str1.length())); std::string str2 = rnd.RandomString(1002); auto item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to the secondary cache. - ASSERT_OK(cache->Insert("k2", item2, &kHelperFail, str2.length())); + ASSERT_OK(cache->Insert(key2, item2, GetHelperFail(), str2.length())); Cache::Handle* handle; - handle = - cache->Lookup("k2", &kHelperFail, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 demotion would have failed. - handle = - cache->Lookup("k1", &kHelperFail, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key1, GetHelperFail(), this, Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); // Since k1 was not promoted, k2 should still be in cache. 
- handle = - cache->Lookup("k2", &kHelperFail, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -561,34 +517,30 @@ class CompressedSecondaryCacheTest : public testing::Test, std::shared_ptr secondary_cache = NewCompressedSecondaryCache(secondary_cache_opts); - LRUCacheOptions opts( + std::shared_ptr cache = NewCache( /*_capacity=*/1300, /*_num_shard_bits=*/0, - /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, - /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, - kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + /*_strict_capacity_limit=*/true, secondary_cache); Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1, &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key1, item1, GetHelper(), str1.length())); std::string str2 = rnd.RandomString(1002); auto item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to the secondary cache. - ASSERT_OK(cache->Insert("k2", item2, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length())); Cache::Handle* handle; SetFailCreate(true); - handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 creation would have failed - handle = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache - handle = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true); + handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); @@ -614,38 +566,34 @@ class CompressedSecondaryCacheTest : public testing::Test, std::shared_ptr secondary_cache = NewCompressedSecondaryCache(secondary_cache_opts); - LRUCacheOptions opts( + std::shared_ptr cache = NewCache( /*_capacity=*/1300, /*_num_shard_bits=*/0, - /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5, - /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex, - kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + /*_strict_capacity_limit=*/false, secondary_cache); Random rnd(301); std::string str1 = rnd.RandomString(1001); auto item1_1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1_1, &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length())); std::string str2 = rnd.RandomString(1002); std::string str2_clone{str2}; auto item2 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's dummy item. - ASSERT_OK(cache->Insert("k2", item2, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length())); // After this Insert, primary cache contains k1 and secondary cache contains // k1's dummy item and k2's dummy item. 
auto item1_2 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert("k1", item1_2, &kHelper, str1.length())); + ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length())); auto item2_2 = new TestItem(str2.data(), str2.length()); // After this Insert, primary cache contains k2 and secondary cache contains // k1's item and k2's dummy item. - ASSERT_OK(cache->Insert("k2", item2_2, &kHelper, str2.length())); + ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length())); Cache::Handle* handle2; - handle2 = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true); + handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW); ASSERT_NE(handle2, nullptr); cache->Release(handle2); @@ -653,12 +601,12 @@ class CompressedSecondaryCacheTest : public testing::Test, // strict_capacity_limit is true, but the lookup should still succeed. // A k1's dummy item is inserted into primary cache. Cache::Handle* handle1; - handle1 = cache->Lookup("k1", &kHelper, this, Cache::Priority::LOW, true); + handle1 = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW); ASSERT_NE(handle1, nullptr); cache->Release(handle1); // Since k1 didn't get inserted, k2 should still be in cache - handle2 = cache->Lookup("k2", &kHelper, this, Cache::Priority::LOW, true); + handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW); ASSERT_NE(handle2, nullptr); cache->Release(handle2); @@ -681,8 +629,9 @@ class CompressedSecondaryCacheTest : public testing::Test, using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; std::unique_ptr sec_cache = - std::make_unique(1000, 0, true, 0.5, 0.0, - allocator); + std::make_unique( + CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0, + allocator)); Random rnd(301); // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split. size_t str_size{8500}; @@ -733,7 +682,8 @@ class CompressedSecondaryCacheTest : public testing::Test, std::string str = str1 + str2 + str3; std::unique_ptr sec_cache = - std::make_unique(1000, 0, true, 0.5, 0.0); + std::make_unique( + CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0)); size_t charge{0}; CacheAllocationPtr value = sec_cache->MergeChunksIntoValue(chunks_head, charge); @@ -763,8 +713,9 @@ class CompressedSecondaryCacheTest : public testing::Test, using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk; std::unique_ptr sec_cache = - std::make_unique(1000, 0, true, 0.5, 0.0, - allocator); + std::make_unique( + CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0, + allocator)); Random rnd(301); // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split. 
size_t str_size{8500}; @@ -782,19 +733,27 @@ class CompressedSecondaryCacheTest : public testing::Test, sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr); } +}; - private: - bool fail_create_; +class CompressedSecondaryCacheTest + : public CompressedSecondaryCacheTestBase, + public testing::WithParamInterface { + const std::string& Type() const override { return GetParam(); } }; +INSTANTIATE_TEST_CASE_P(CompressedSecondaryCacheTest, + CompressedSecondaryCacheTest, GetTestingCacheTypes()); + class CompressedSecCacheTestWithCompressAndAllocatorParam - : public CompressedSecondaryCacheTest, - public ::testing::WithParamInterface> { + : public CompressedSecondaryCacheTestBase, + public ::testing::WithParamInterface< + std::tuple> { public: CompressedSecCacheTestWithCompressAndAllocatorParam() { sec_cache_is_compressed_ = std::get<0>(GetParam()); use_jemalloc_ = std::get<1>(GetParam()); } + const std::string& Type() const override { return std::get<2>(GetParam()); } bool sec_cache_is_compressed_; bool use_jemalloc_; }; @@ -805,20 +764,20 @@ TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) { INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests, CompressedSecCacheTestWithCompressAndAllocatorParam, - ::testing::Combine(testing::Bool(), testing::Bool())); + ::testing::Combine(testing::Bool(), testing::Bool(), + GetTestingCacheTypes())); class CompressedSecondaryCacheTestWithCompressionParam - : public CompressedSecondaryCacheTest, - public ::testing::WithParamInterface { + : public CompressedSecondaryCacheTestBase, + public ::testing::WithParamInterface> { public: CompressedSecondaryCacheTestWithCompressionParam() { - sec_cache_is_compressed_ = GetParam(); + sec_cache_is_compressed_ = std::get<0>(GetParam()); } + const std::string& Type() const override { return std::get<1>(GetParam()); } bool sec_cache_is_compressed_; }; -#ifndef ROCKSDB_LITE - TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) { std::shared_ptr sec_cache{nullptr}; std::string sec_cache_uri; @@ -882,7 +841,6 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestHelper(sec_cache, sec_cache_is_compressed_); } -#endif // ROCKSDB_LITE TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) { FailsTest(sec_cache_is_compressed_); @@ -908,18 +866,92 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam, IntegrationFullCapacityTest(sec_cache_is_compressed_); } +TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) { + CompressedSecondaryCacheOptions opts; + opts.capacity = 2048; + opts.num_shard_bits = 0; + + if (sec_cache_is_compressed_) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + } else { + opts.compression_type = CompressionType::kNoCompression; + } + + // Select a random subset to include, for fast test + Random& r = *Random::GetTLSInstance(); + CacheEntryRoleSet do_not_compress; + for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) { + // A few included on average, but decent chance of zero + if (r.OneIn(5)) { + do_not_compress.Add(static_cast(i)); + } + } + opts.do_not_compress_roles = do_not_compress; + + std::shared_ptr sec_cache = NewCompressedSecondaryCache(opts); + + // Fixed seed to ensure consistent compressibility (doesn't compress) + std::string junk(Random(301).RandomString(1000)); + + for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) { + CacheEntryRole role = static_cast(i); + + // Uniquify `junk` + junk[0] = static_cast(i); + TestItem item{junk.data(), 
junk.length()}; + Slice ith_key = Slice(junk.data(), 16); + + get_perf_context()->Reset(); + ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role), false)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1U); + + ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role), false)); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1U); + + bool kept_in_sec_cache{true}; + std::unique_ptr handle = + sec_cache->Lookup(ith_key, GetHelper(role), this, true, + /*advise_erase=*/true, kept_in_sec_cache); + ASSERT_NE(handle, nullptr); + + // Lookup returns the right data + std::unique_ptr val = + std::unique_ptr(static_cast(handle->Value())); + ASSERT_NE(val, nullptr); + ASSERT_EQ(memcmp(val->Buf(), item.Buf(), item.Size()), 0); + + bool compressed = + sec_cache_is_compressed_ && !do_not_compress.Contains(role); + if (compressed) { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, + 1000); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, + 1007); + } else { + ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0); + ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0); + } + } +} + INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests, CompressedSecondaryCacheTestWithCompressionParam, - testing::Bool()); + testing::Combine(testing::Bool(), + GetTestingCacheTypes())); class CompressedSecCacheTestWithCompressAndSplitParam - : public CompressedSecondaryCacheTest, - public ::testing::WithParamInterface> { + : public CompressedSecondaryCacheTestBase, + public ::testing::WithParamInterface< + std::tuple> { public: CompressedSecCacheTestWithCompressAndSplitParam() { sec_cache_is_compressed_ = std::get<0>(GetParam()); enable_custom_split_merge_ = std::get<1>(GetParam()); } + const std::string& Type() const override { return std::get<2>(GetParam()); } bool sec_cache_is_compressed_; bool enable_custom_split_merge_; }; @@ -930,20 +962,412 @@ TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) { INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests, CompressedSecCacheTestWithCompressAndSplitParam, - ::testing::Combine(testing::Bool(), testing::Bool())); + ::testing::Combine(testing::Bool(), testing::Bool(), + GetTestingCacheTypes())); -TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) { +TEST_P(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) { SplitValueIntoChunksTest(); } -TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) { +TEST_P(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) { MergeChunksIntoValueTest(); } -TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) { +TEST_P(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) { SplictValueAndMergeChunksTest(); } +using secondary_cache_test_util::WithCacheType; + +class CompressedSecCacheTestWithTiered + : public testing::Test, + public WithCacheType, + public testing::WithParamInterface< + std::tuple> { + public: + using secondary_cache_test_util::WithCacheType::TestItem; + CompressedSecCacheTestWithTiered() { + LRUCacheOptions lru_opts; + HyperClockCacheOptions hcc_opts( + /*_capacity=*/0, + /*_estimated_entry_charge=*/256 << 10, + /*_num_shard_bits=*/0); + TieredCacheOptions opts; + lru_opts.capacity = 0; + lru_opts.num_shard_bits = 0; + lru_opts.high_pri_pool_ratio = 0; + opts.cache_type = std::get<0>(GetParam()); + if (opts.cache_type == PrimaryCacheType::kCacheTypeLRU) { + opts.cache_opts = &lru_opts; + } else { + opts.cache_opts = &hcc_opts; 
+ } + opts.adm_policy = std::get<1>(GetParam()); + ; + opts.comp_cache_opts.capacity = 0; + opts.comp_cache_opts.num_shard_bits = 0; + opts.total_capacity = 100 << 20; + opts.compressed_secondary_ratio = 0.3; + cache_ = NewTieredCache(opts); + cache_res_mgr_ = + std::make_shared>( + cache_); + } + + const std::string& Type() const override { + if (std::get<0>(GetParam()) == PrimaryCacheType::kCacheTypeLRU) { + return lru_str; + } else { + return hcc_str; + } + } + + protected: + CacheReservationManager* cache_res_mgr() { return cache_res_mgr_.get(); } + + std::shared_ptr GetTieredCache() { return cache_; } + + Cache* GetCache() { + return static_cast_with_check( + cache_.get()) + ->TEST_GetCache(); + } + + SecondaryCache* GetSecondaryCache() { + return static_cast_with_check( + cache_.get()) + ->TEST_GetSecondaryCache(); + } + + size_t GetPercent(size_t val, unsigned int percent) { + return static_cast(val * percent / 100); + } + + private: + std::shared_ptr cache_; + std::shared_ptr cache_res_mgr_; + static std::string lru_str; + static std::string hcc_str; +}; + +std::string CompressedSecCacheTestWithTiered::lru_str(WithCacheType::kLRU); +std::string CompressedSecCacheTestWithTiered::hcc_str( + WithCacheType::kFixedHyperClock); + +bool CacheUsageWithinBounds(size_t val1, size_t val2, size_t error) { + return ((val1 < (val2 + error)) && (val1 > (val2 - error))); +} + +TEST_P(CompressedSecCacheTestWithTiered, CacheReservationManager) { + CompressedSecondaryCache* sec_cache = + reinterpret_cast(GetSecondaryCache()); + + // Use EXPECT_PRED3 instead of EXPECT_NEAR to void too many size_t to + // double explicit casts + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + EXPECT_EQ(sec_cache->TEST_GetUsage(), 0); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(10 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20), + GetPercent(37 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(0)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + EXPECT_EQ(sec_cache->TEST_GetUsage(), 0); +} + +TEST_P(CompressedSecCacheTestWithTiered, + CacheReservationManagerMultipleUpdate) { + CompressedSecondaryCache* sec_cache = + reinterpret_cast(GetSecondaryCache()); + + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + EXPECT_EQ(sec_cache->TEST_GetUsage(), 0); + + int i; + for (i = 0; i < 10; ++i) { + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation((1 + i) << 20)); + } + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20), + GetPercent(37 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + + for (i = 10; i > 0; --i) { + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(((i - 1) << 20))); + } + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + EXPECT_EQ(sec_cache->TEST_GetUsage(), 0); +} + +TEST_P(CompressedSecCacheTestWithTiered, AdmissionPolicy) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_BYPASS("This test requires LZ4 support\n"); + return; + } + + Cache* tiered_cache = GetTieredCache().get(); + Cache* cache = GetCache(); + std::vector keys; + std::vector vals; + // Make the item size slightly less than 10MB to ensure we can fit the + // expected number of items in 
the cache + int item_size = (10 << 20) - (1 << 18); + int i; + Random rnd(301); + for (i = 0; i < 14; ++i) { + keys.emplace_back(CacheKey::CreateUniqueForCacheLifetime(cache)); + vals.emplace_back(rnd.RandomString(item_size)); + } + + for (i = 0; i < 7; ++i) { + TestItem* item = new TestItem(vals[i].data(), vals[i].length()); + ASSERT_OK(tiered_cache->Insert(keys[i].AsSlice(), item, GetHelper(), + vals[i].length())); + } + + Cache::Handle* handle1; + handle1 = tiered_cache->Lookup(keys[0].AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_NE(handle1, nullptr); + Cache::Handle* handle2; + handle2 = tiered_cache->Lookup(keys[1].AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_NE(handle2, nullptr); + tiered_cache->Release(handle1); + tiered_cache->Release(handle2); + + // Flush all previous entries out of the primary cache + for (i = 7; i < 14; ++i) { + TestItem* item = new TestItem(vals[i].data(), vals[i].length()); + ASSERT_OK(tiered_cache->Insert(keys[i].AsSlice(), item, GetHelper(), + vals[i].length())); + } + // keys 0 and 1 should be found as they had the hit bit set + handle1 = tiered_cache->Lookup(keys[0].AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_NE(handle1, nullptr); + handle2 = tiered_cache->Lookup(keys[1].AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_NE(handle2, nullptr); + tiered_cache->Release(handle1); + tiered_cache->Release(handle2); + + handle1 = tiered_cache->Lookup(keys[2].AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_EQ(handle1, nullptr); + handle1 = tiered_cache->Lookup(keys[3].AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_EQ(handle1, nullptr); +} + +TEST_P(CompressedSecCacheTestWithTiered, DynamicUpdate) { + CompressedSecondaryCache* sec_cache = + reinterpret_cast(GetSecondaryCache()); + std::shared_ptr tiered_cache = GetTieredCache(); + + // Use EXPECT_PRED3 instead of EXPECT_NEAR to void too many size_t to + // double explicit casts + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + size_t sec_capacity; + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (30 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, 130 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (39 << 20), + GetPercent(39 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (39 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, 70 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (21 << 20), + GetPercent(21 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (21 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, 100 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (30 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.4)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (40 << 20), + GetPercent(40 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (40 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.2)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (20 << 20), + GetPercent(20 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (20 << 20)); + + 
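+  // Pattern in the checks above and below: after
+  // UpdateTieredCache(tiered_cache, -1, ratio), the compressed secondary
+  // capacity becomes total_capacity * ratio (100MB * 0.4 = 40MB,
+  // 100MB * 0.2 = 20MB, and so on), and GetCache()->GetUsage() tracks that
+  // capacity, presumably because the secondary tier's capacity is charged
+  // against the primary cache as a reservation.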
ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 1.0)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (100 << 20), + GetPercent(100 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, 100 << 20); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.0)); + // Only check usage for LRU cache. HCC shows a 64KB usage for some reason + if (std::get<0>(GetParam()) == PrimaryCacheType::kCacheTypeLRU) { + ASSERT_EQ(GetCache()->GetUsage(), 0); + } + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, 0); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.3)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (30 << 20)); +} + +TEST_P(CompressedSecCacheTestWithTiered, DynamicUpdateWithReservation) { + CompressedSecondaryCache* sec_cache = + reinterpret_cast(GetSecondaryCache()); + std::shared_ptr tiered_cache = GetTieredCache(); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(10 << 20)); + // Use EXPECT_PRED3 instead of EXPECT_NEAR to void too many size_t to + // double explicit casts + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20), + GetPercent(37 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + size_t sec_capacity; + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (30 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, 70 << 20)); + // Only check usage for LRU cache. HCC is slightly off for some reason + if (std::get<0>(GetParam()) == PrimaryCacheType::kCacheTypeLRU) { + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (28 << 20), + GetPercent(28 << 20, 1)); + } + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (21 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, 130 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (46 << 20), + GetPercent(46 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (39 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, 100 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20), + GetPercent(37 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (30 << 20)); + + ASSERT_OK(tiered_cache->GetSecondaryCacheCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, 30 << 20); + size_t sec_usage; + ASSERT_OK(tiered_cache->GetSecondaryCachePinnedUsage(sec_usage)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_usage, 3 << 20, + GetPercent(3 << 20, 1)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.39)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (45 << 20), + GetPercent(45 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (4 << 20), + GetPercent(4 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (39 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.2)); + // Only check usage for LRU cache. 
HCC is slightly off for some reason + if (std::get<0>(GetParam()) == PrimaryCacheType::kCacheTypeLRU) { + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (28 << 20), + GetPercent(28 << 20, 1)); + } + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (2 << 20), + GetPercent(2 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (20 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 1.0)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (100 << 20), + GetPercent(100 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (10 << 20), + GetPercent(10 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, 100 << 20); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.0)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (10 << 20), + GetPercent(10 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, 0); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.3)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20), + GetPercent(37 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20), + GetPercent(3 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, 30 << 20); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(0)); +} + +TEST_P(CompressedSecCacheTestWithTiered, ReservationOverCapacity) { + CompressedSecondaryCache* sec_cache = + reinterpret_cast(GetSecondaryCache()); + std::shared_ptr tiered_cache = GetTieredCache(); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(110 << 20)); + // Use EXPECT_PRED3 instead of EXPECT_NEAR to void too many size_t to + // double explicit casts + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (110 << 20), + GetPercent(110 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (30 << 20), + GetPercent(30 << 20, 1)); + size_t sec_capacity; + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (30 << 20)); + + ASSERT_OK(UpdateTieredCache(tiered_cache, -1, 0.39)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (110 << 20), + GetPercent(110 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (39 << 20), + GetPercent(39 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (39 << 20)); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(90 << 20)); + EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (94 << 20), + GetPercent(94 << 20, 1)); + EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (35 << 20), + GetPercent(35 << 20, 1)); + ASSERT_OK(sec_cache->GetCapacity(sec_capacity)); + ASSERT_EQ(sec_capacity, (39 << 20)); + + ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(0)); +} + +INSTANTIATE_TEST_CASE_P( + CompressedSecCacheTests, CompressedSecCacheTestWithTiered, + ::testing::Values( + std::make_tuple(PrimaryCacheType::kCacheTypeLRU, + TieredAdmissionPolicy::kAdmPolicyAllowCacheHits), + std::make_tuple(PrimaryCacheType::kCacheTypeHCC, + TieredAdmissionPolicy::kAdmPolicyAllowCacheHits))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc index 95cd320a7bce..9d169522434f 100644 --- a/cache/lru_cache.cc +++ b/cache/lru_cache.cc @@ -14,22 +14,15 @@ #include #include +#include "cache/secondary_cache_adapter.h" #include "monitoring/perf_context_imp.h" 
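+// (secondary_cache_adapter.h is pulled in because secondary-cache handling
+// no longer lives in LRUCache itself; LRUCacheOptions::MakeSharedCache()
+// below wraps the cache in the adapter, presumably CacheWithSecondaryAdapter,
+// when a secondary cache is configured.)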
-#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/lang.h" #include "util/distributed_mutex.h" namespace ROCKSDB_NAMESPACE { namespace lru_cache { -namespace { -// A distinct pointer value for marking "dummy" cache entries -struct DummyValue { - char val[12] = "kDummyValue"; -}; -DummyValue kDummyValue{}; -} // namespace - LRUHandleTable::LRUHandleTable(int max_upper_hash_bits, MemoryAllocator* allocator) : length_bits_(/* historical starting size*/ 4), @@ -103,7 +96,7 @@ void LRUHandleTable::Resize() { std::unique_ptr new_list { new LRUHandle* [size_t{1} << new_length_bits] {} }; - uint32_t count = 0; + [[maybe_unused]] uint32_t count = 0; for (uint32_t i = 0; i < old_length; i++) { LRUHandle* h = list_[i]; while (h != nullptr) { @@ -127,7 +120,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, CacheMetadataChargePolicy metadata_charge_policy, int max_upper_hash_bits, MemoryAllocator* allocator, - SecondaryCache* secondary_cache) + const Cache::EvictionCallback* eviction_callback) : CacheShardBase(metadata_charge_policy), capacity_(0), high_pri_pool_usage_(0), @@ -141,7 +134,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit, usage_(0), lru_usage_(0), mutex_(use_adaptive_mutex), - secondary_cache_(secondary_cache) { + eviction_callback_(*eviction_callback) { // Make empty circular linked list. lru_.next = &lru_; lru_.prev = &lru_; @@ -341,16 +334,20 @@ void LRUCacheShard::EvictFromLRU(size_t charge, } } -void LRUCacheShard::TryInsertIntoSecondaryCache( - autovector evicted_handles) { - for (auto entry : evicted_handles) { - if (secondary_cache_ && entry->IsSecondaryCacheCompatible() && - !entry->IsInSecondaryCache()) { - secondary_cache_->Insert(entry->key(), entry->value, entry->helper) - .PermitUncheckedError(); +void LRUCacheShard::NotifyEvicted( + const autovector& evicted_handles) { + MemoryAllocator* alloc = table_.GetAllocator(); + for (LRUHandle* entry : evicted_handles) { + if (eviction_callback_ && + eviction_callback_(entry->key(), + reinterpret_cast(entry), + entry->HasHit())) { + // Callback took ownership of obj; just free handle + free(entry); + } else { + // Free the entries here outside of mutex for performance reasons. + entry->Free(alloc); } - // Free the entries here outside of mutex for performance reasons. - entry->Free(table_.GetAllocator()); } } @@ -364,7 +361,7 @@ void LRUCacheShard::SetCapacity(size_t capacity) { EvictFromLRU(0, &last_reference_list); } - TryInsertIntoSecondaryCache(last_reference_list); + NotifyEvicted(last_reference_list); } void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { @@ -372,8 +369,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) { strict_capacity_limit_ = strict_capacity_limit; } -Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle, - bool free_handle_on_fail) { +Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) { Status s = Status::OK(); autovector last_reference_list; @@ -392,10 +388,9 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle, // into cache and get evicted immediately. 
last_reference_list.push_back(e); } else { - if (free_handle_on_fail) { - free(e); - *handle = nullptr; - } + free(e); + e = nullptr; + *handle = nullptr; s = Status::MemoryLimit("Insert failed due to LRU cache being full."); } } else { @@ -427,185 +422,27 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle, } } - TryInsertIntoSecondaryCache(last_reference_list); + NotifyEvicted(last_reference_list); return s; } -void LRUCacheShard::Promote(LRUHandle* e) { - SecondaryCacheResultHandle* secondary_handle = e->sec_handle; - - assert(secondary_handle->IsReady()); - // e is not thread-shared here; OK to modify "immutable" fields as well as - // "mutable" (normally requiring mutex) - e->SetIsPending(false); - e->value = secondary_handle->Value(); - assert(e->total_charge == 0); - size_t value_size = secondary_handle->Size(); - delete secondary_handle; - - if (e->value) { - e->CalcTotalCharge(value_size, metadata_charge_policy_); - Status s; - if (e->IsStandalone()) { - assert(secondary_cache_ && secondary_cache_->SupportForceErase()); - - // Insert a dummy handle and return a standalone handle to caller. - // Charge the standalone handle. - autovector last_reference_list; - bool free_standalone_handle{false}; - { - DMutexLock l(mutex_); - - // Free the space following strict LRU policy until enough space - // is freed or the lru list is empty. - EvictFromLRU(e->total_charge, &last_reference_list); - - if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) { - free_standalone_handle = true; - } else { - usage_ += e->total_charge; - } - } - - TryInsertIntoSecondaryCache(last_reference_list); - if (free_standalone_handle) { - e->Unref(); - e->Free(table_.GetAllocator()); - e = nullptr; - } else { - PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1); - } - - // Insert a dummy handle into the primary cache. This dummy handle is - // not IsSecondaryCacheCompatible(). - // FIXME? This should not overwrite an existing non-dummy entry in the - // rare case that one exists - Cache::Priority priority = - e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW; - s = Insert(e->key(), e->hash, &kDummyValue, &kNoopCacheItemHelper, - /*charge=*/0, - /*handle=*/nullptr, priority); - } else { - e->SetInCache(true); - LRUHandle* handle = e; - // This InsertItem() could fail if the cache is over capacity and - // strict_capacity_limit_ is true. In such a case, we don't want - // InsertItem() to free the handle, since the item is already in memory - // and the caller will most likely just read it from disk if we erase it - // here. - s = InsertItem(e, &handle, /*free_handle_on_fail=*/false); - if (s.ok()) { - PERF_COUNTER_ADD(block_cache_real_handle_count, 1); - } - } - - if (!s.ok()) { - // Item is in memory, but not accounted against the cache capacity. - // When the handle is released, the item should get deleted. - assert(!e->InCache()); - } - } else { - // Secondary cache lookup failed. The caller will take care of detecting - // this and eventually releasing e. 
- assert(!e->value); - assert(!e->InCache()); - } -} - LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash, - const Cache::CacheItemHelper* helper, - Cache::CreateContext* create_context, - Cache::Priority priority, bool wait, - Statistics* stats) { - LRUHandle* e = nullptr; - bool found_dummy_entry{false}; - { - DMutexLock l(mutex_); - e = table_.Lookup(key, hash); - if (e != nullptr) { - assert(e->InCache()); - if (e->value == &kDummyValue) { - // For a dummy handle, if it was retrieved from secondary cache, - // it may still exist in secondary cache. - // If the handle exists in secondary cache, the value should be - // erased from sec cache and be inserted into primary cache. - found_dummy_entry = true; - // Let the dummy entry be overwritten - e = nullptr; - } else { - if (!e->HasRefs()) { - // The entry is in LRU since it's in hash and has no external - // references. - LRU_Remove(e); - } - e->Ref(); - e->SetHit(); - } - } - } - - // If handle table lookup failed or the handle is a dummy one, allocate - // a handle outside the mutex if we re going to lookup in the secondary cache. - // - // When a block is firstly Lookup from CompressedSecondaryCache, we just - // insert a dummy block into the primary cache (charging the actual size of - // the block) and don't erase the block from CompressedSecondaryCache. A - // standalone handle is returned to the caller. Only if the block is hit - // again, we erase it from CompressedSecondaryCache and add it into the - // primary cache. - if (!e && secondary_cache_ && helper && helper->create_cb) { - bool is_in_sec_cache{false}; - std::unique_ptr secondary_handle = - secondary_cache_->Lookup(key, helper, create_context, wait, - found_dummy_entry, is_in_sec_cache); - if (secondary_handle != nullptr) { - e = static_cast(malloc(sizeof(LRUHandle) - 1 + key.size())); - - e->m_flags = 0; - e->im_flags = 0; - e->helper = helper; - e->key_length = key.size(); - e->hash = hash; - e->refs = 0; - e->next = e->prev = nullptr; - e->SetPriority(priority); - memcpy(e->key_data, key.data(), key.size()); - e->value = nullptr; - e->sec_handle = secondary_handle.release(); - e->total_charge = 0; - e->Ref(); - e->SetIsInSecondaryCache(is_in_sec_cache); - e->SetIsStandalone(secondary_cache_->SupportForceErase() && - !found_dummy_entry); - - if (wait) { - Promote(e); - if (e) { - if (!e->value) { - // The secondary cache returned a handle, but the lookup failed. - e->Unref(); - e->Free(table_.GetAllocator()); - e = nullptr; - } else { - PERF_COUNTER_ADD(secondary_cache_hit_count, 1); - RecordTick(stats, SECONDARY_CACHE_HITS); - } - } - } else { - // If wait is false, we always return a handle and let the caller - // release the handle after checking for success or failure. - e->SetIsPending(true); - // This may be slightly inaccurate, if the lookup eventually fails. - // But the probability is very low. - PERF_COUNTER_ADD(secondary_cache_hit_count, 1); - RecordTick(stats, SECONDARY_CACHE_HITS); - } - } else { - // Caller will most likely overwrite the dummy entry with an Insert - // after this Lookup fails - assert(e == nullptr); + const Cache::CacheItemHelper* /*helper*/, + Cache::CreateContext* /*create_context*/, + Cache::Priority /*priority*/, + Statistics* /*stats*/) { + DMutexLock l(mutex_); + LRUHandle* e = table_.Lookup(key, hash); + if (e != nullptr) { + assert(e->InCache()); + if (!e->HasRefs()) { + // The entry is in LRU since it's in hash and has no external + // references. 
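+      // (Externally referenced entries are kept off the LRU list; Release()
+      // puts them back via LRU_Insert() once the last reference is dropped.)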
+ LRU_Remove(e); } + e->Ref(); + e->SetHit(); } return e; } @@ -614,8 +451,6 @@ bool LRUCacheShard::Ref(LRUHandle* e) { DMutexLock l(mutex_); // To create another reference - entry must be already externally referenced. assert(e->HasRefs()); - // Pending handles are not for sharing - assert(!e->IsPending()); e->Ref(); return true; } @@ -639,14 +474,13 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/, if (e == nullptr) { return false; } - bool last_reference = false; - // Must Wait or WaitAll first on pending handles. Otherwise, would leak - // a secondary cache handle. - assert(!e->IsPending()); + bool must_free; + bool was_in_cache; { DMutexLock l(mutex_); - last_reference = e->Unref(); - if (last_reference && e->InCache()) { + must_free = e->Unref(); + was_in_cache = e->InCache(); + if (must_free && was_in_cache) { // The item is still in cache, and nobody else holds a reference to it. if (usage_ > capacity_ || erase_if_last_ref) { // The LRU list must be empty since the cache is full. @@ -657,29 +491,39 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/, } else { // Put the item back on the LRU list, and don't free it. LRU_Insert(e); - last_reference = false; + must_free = false; } } - // If it was the last reference, then decrement the cache usage. - if (last_reference) { + // If about to be freed, then decrement the cache usage. + if (must_free) { assert(usage_ >= e->total_charge); usage_ -= e->total_charge; } } // Free the entry here outside of mutex for performance reasons. - if (last_reference) { - e->Free(table_.GetAllocator()); + if (must_free) { + // Only call eviction callback if we're sure no one requested erasure + // FIXME: disabled because of test churn + if (false && was_in_cache && !erase_if_last_ref && eviction_callback_ && + eviction_callback_(e->key(), reinterpret_cast(e), + e->HasHit())) { + // Callback took ownership of obj; just free handle + free(e); + } else { + e->Free(table_.GetAllocator()); + } } - return last_reference; + return must_free; } -Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, - Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper, - size_t charge, LRUHandle** handle, - Cache::Priority priority) { +LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash, + Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, + size_t charge) { assert(helper); + // value == nullptr is reserved for indicating failure in SecondaryCache + assert(!(helper->IsSecondaryCacheCompatible() && value == nullptr)); // Allocate the memory here outside of the mutex. // If the cache is full, we'll have to release it. 
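+  // (CreateHandle() is the common allocation path; both Insert() and
+  // CreateStandalone() below build their handles through it.)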
@@ -695,16 +539,53 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, e->hash = hash; e->refs = 0; e->next = e->prev = nullptr; - e->SetInCache(true); - e->SetPriority(priority); memcpy(e->key_data, key.data(), key.size()); e->CalcTotalCharge(charge, metadata_charge_policy_); - // value == nullptr is reserved for indicating failure for when secondary - // cache compatible - assert(!(e->IsSecondaryCacheCompatible() && value == nullptr)); + return e; +} + +Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, + Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, + size_t charge, LRUHandle** handle, + Cache::Priority priority) { + LRUHandle* e = CreateHandle(key, hash, value, helper, charge); + e->SetPriority(priority); + e->SetInCache(true); + return InsertItem(e, handle); +} + +LRUHandle* LRUCacheShard::CreateStandalone(const Slice& key, uint32_t hash, + Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, + size_t charge, + bool allow_uncharged) { + LRUHandle* e = CreateHandle(key, hash, value, helper, charge); + e->SetIsStandalone(true); + e->Ref(); + + autovector last_reference_list; + + { + DMutexLock l(mutex_); + + EvictFromLRU(e->total_charge, &last_reference_list); + + if (strict_capacity_limit_ && (usage_ + e->total_charge) > capacity_) { + if (allow_uncharged) { + e->total_charge = 0; + } else { + free(e); + e = nullptr; + } + } else { + usage_ += e->total_charge; + } + } - return InsertItem(e, handle, /* free_handle_on_fail */ true); + NotifyEvicted(last_reference_list); + return e; } void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { @@ -733,16 +614,6 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) { } } -bool LRUCacheShard::IsReady(LRUHandle* e) { - bool ready = true; - if (e->IsPending()) { - assert(secondary_cache_); - assert(e->sec_handle); - ready = e->sec_handle->IsReady(); - } - return ready; -} - size_t LRUCacheShard::GetUsage() const { DMutexLock l(mutex_); return usage_; @@ -777,31 +648,20 @@ void LRUCacheShard::AppendPrintableOptions(std::string& str) const { str.append(buffer); } -LRUCache::LRUCache(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, double high_pri_pool_ratio, - double low_pri_pool_ratio, - std::shared_ptr allocator, - bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - std::shared_ptr _secondary_cache) - : ShardedCache(capacity, num_shard_bits, strict_capacity_limit, - std::move(allocator)), - secondary_cache_(std::move(_secondary_cache)) { +LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) { size_t per_shard = GetPerShardCapacity(); - SecondaryCache* secondary_cache = secondary_cache_.get(); MemoryAllocator* alloc = memory_allocator(); - InitShards([=](LRUCacheShard* cs) { - new (cs) LRUCacheShard( - per_shard, strict_capacity_limit, high_pri_pool_ratio, - low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy, - /* max_upper_hash_bits */ 32 - num_shard_bits, alloc, secondary_cache); + InitShards([&](LRUCacheShard* cs) { + new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit, + opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, + opts.use_adaptive_mutex, opts.metadata_charge_policy, + /* max_upper_hash_bits */ 32 - opts.num_shard_bits, + alloc, &eviction_callback_); }); } Cache::ObjectPtr LRUCache::Value(Handle* handle) { auto h = reinterpret_cast(handle); - assert(!h->IsPending() || h->value == nullptr); - assert(h->value != &kDummyValue); return h->value; } @@ -824,51 +684,9 @@ double 
LRUCache::GetHighPriPoolRatio() { return GetShard(0).GetHighPriPoolRatio(); } -void LRUCache::WaitAll(std::vector& handles) { - if (secondary_cache_) { - std::vector sec_handles; - sec_handles.reserve(handles.size()); - for (Handle* handle : handles) { - if (!handle) { - continue; - } - LRUHandle* lru_handle = reinterpret_cast(handle); - if (!lru_handle->IsPending()) { - continue; - } - sec_handles.emplace_back(lru_handle->sec_handle); - } - secondary_cache_->WaitAll(sec_handles); - for (Handle* handle : handles) { - if (!handle) { - continue; - } - LRUHandle* lru_handle = reinterpret_cast(handle); - if (!lru_handle->IsPending()) { - continue; - } - GetShard(lru_handle->hash).Promote(lru_handle); - } - } -} - -void LRUCache::AppendPrintableOptions(std::string& str) const { - ShardedCache::AppendPrintableOptions(str); // options from shard - if (secondary_cache_) { - str.append(" secondary_cache:\n"); - str.append(secondary_cache_->GetPrintableOptions()); - } -} - } // namespace lru_cache -std::shared_ptr NewLRUCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - const std::shared_ptr& secondary_cache, - double low_pri_pool_ratio) { +std::shared_ptr LRUCacheOptions::MakeSharedCache() const { if (num_shard_bits >= 20) { return nullptr; // The cache cannot be sharded into too many fine pieces. } @@ -884,32 +702,24 @@ std::shared_ptr NewLRUCache( // Invalid high_pri_pool_ratio and low_pri_pool_ratio combination return nullptr; } - if (num_shard_bits < 0) { - num_shard_bits = GetDefaultCacheShardBits(capacity); - } - return std::make_shared( - capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio, - low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex, - metadata_charge_policy, secondary_cache); -} - -std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { - return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, - cache_opts.high_pri_pool_ratio, - cache_opts.memory_allocator, cache_opts.use_adaptive_mutex, - cache_opts.metadata_charge_policy, - cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio); -} - -std::shared_ptr NewLRUCache( - size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, - std::shared_ptr memory_allocator, bool use_adaptive_mutex, - CacheMetadataChargePolicy metadata_charge_policy, - double low_pri_pool_ratio) { - return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit, - high_pri_pool_ratio, memory_allocator, use_adaptive_mutex, - metadata_charge_policy, nullptr, low_pri_pool_ratio); + // For sanitized options + LRUCacheOptions opts = *this; + if (opts.num_shard_bits < 0) { + opts.num_shard_bits = GetDefaultCacheShardBits(capacity); + } + std::shared_ptr cache = std::make_shared(opts); + if (secondary_cache) { + cache = std::make_shared(cache, secondary_cache); + } + return cache; +} + +std::shared_ptr LRUCacheOptions::MakeSharedRowCache() const { + if (secondary_cache) { + // Not allowed for a RowCache + return nullptr; + } + // Works while RowCache is an alias for Cache + return MakeSharedCache(); } } // namespace ROCKSDB_NAMESPACE diff --git a/cache/lru_cache.h b/cache/lru_cache.h index 1edccd0ce2e0..1a9ba044251d 100644 --- a/cache/lru_cache.h +++ b/cache/lru_cache.h @@ -16,7 +16,6 @@ #include "port/likely.h" #include "port/malloc.h" #include "port/port.h" -#include 
"rocksdb/secondary_cache.h" #include "util/autovector.h" #include "util/distributed_mutex.h" @@ -51,12 +50,7 @@ namespace lru_cache { struct LRUHandle { Cache::ObjectPtr value; const Cache::CacheItemHelper* helper; - // An entry is not added to the LRUHandleTable until the secondary cache - // lookup is complete, so its safe to have this union. - union { - LRUHandle* next_hash; - SecondaryCacheResultHandle* sec_handle; - }; + LRUHandle* next_hash; LRUHandle* next; LRUHandle* prev; size_t total_charge; // TODO(opt): Only allow uint32_t? @@ -89,12 +83,8 @@ struct LRUHandle { IM_IS_HIGH_PRI = (1 << 0), // Whether this entry is low priority entry. IM_IS_LOW_PRI = (1 << 1), - // Is the handle still being read from a lower tier. - IM_IS_PENDING = (1 << 2), - // Whether this handle is still in a lower tier - IM_IS_IN_SECONDARY_CACHE = (1 << 3), // Marks result handles that should not be inserted into cache - IM_IS_STANDALONE = (1 << 4), + IM_IS_STANDALONE = (1 << 2), }; // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!) @@ -124,11 +114,6 @@ struct LRUHandle { bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; } bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; } bool HasHit() const { return m_flags & M_HAS_HIT; } - bool IsSecondaryCacheCompatible() const { return helper->size_cb != nullptr; } - bool IsPending() const { return im_flags & IM_IS_PENDING; } - bool IsInSecondaryCache() const { - return im_flags & IM_IS_IN_SECONDARY_CACHE; - } bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; } void SetInCache(bool in_cache) { @@ -170,22 +155,6 @@ struct LRUHandle { void SetHit() { m_flags |= M_HAS_HIT; } - void SetIsPending(bool pending) { - if (pending) { - im_flags |= IM_IS_PENDING; - } else { - im_flags &= ~IM_IS_PENDING; - } - } - - void SetIsInSecondaryCache(bool is_in_secondary_cache) { - if (is_in_secondary_cache) { - im_flags |= IM_IS_IN_SECONDARY_CACHE; - } else { - im_flags &= ~IM_IS_IN_SECONDARY_CACHE; - } - } - void SetIsStandalone(bool is_standalone) { if (is_standalone) { im_flags |= IM_IS_STANDALONE; @@ -196,14 +165,6 @@ struct LRUHandle { void Free(MemoryAllocator* allocator) { assert(refs == 0); - - if (UNLIKELY(IsPending())) { - assert(sec_handle != nullptr); - SecondaryCacheResultHandle* tmp_sec_handle = sec_handle; - tmp_sec_handle->Wait(); - value = tmp_sec_handle->Value(); - delete tmp_sec_handle; - } assert(helper); if (helper->del_cb) { helper->del_cb(value, allocator); @@ -303,12 +264,14 @@ class LRUHandleTable { // A single shard of sharded cache. class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { public: + // NOTE: the eviction_callback ptr is saved, as is it assumed to be kept + // alive in Cache. 
LRUCacheShard(size_t capacity, bool strict_capacity_limit, double high_pri_pool_ratio, double low_pri_pool_ratio, bool use_adaptive_mutex, CacheMetadataChargePolicy metadata_charge_policy, int max_upper_hash_bits, MemoryAllocator* allocator, - SecondaryCache* secondary_cache); + const Cache::EvictionCallback* eviction_callback); public: // Type definitions expected as parameter to ShardedCache using HandleImpl = LRUHandle; @@ -316,8 +279,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { using HashCref = uint32_t; public: // Function definitions expected as parameter to ShardedCache - static inline HashVal ComputeHash(const Slice& key) { - return Lower32of64(GetSliceNPHash64(key)); + static inline HashVal ComputeHash(const Slice& key, uint32_t seed) { + return Lower32of64(GetSliceNPHash64(key, seed)); } // Separate from constructor so caller can easily make an array of LRUCache @@ -339,14 +302,17 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { const Cache::CacheItemHelper* helper, size_t charge, LRUHandle** handle, Cache::Priority priority); + LRUHandle* CreateStandalone(const Slice& key, uint32_t hash, + Cache::ObjectPtr obj, + const Cache::CacheItemHelper* helper, + size_t charge, bool allow_uncharged); + LRUHandle* Lookup(const Slice& key, uint32_t hash, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, - Cache::Priority priority, bool wait, Statistics* stats); + Cache::Priority priority, Statistics* stats); bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref); - bool IsReady(LRUHandle* /*handle*/); - void Wait(LRUHandle* /*handle*/) {} bool Ref(LRUHandle* handle); void Erase(const Slice& key, uint32_t hash); @@ -386,20 +352,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { private: friend class LRUCache; // Insert an item into the hash table and, if handle is null, insert into - // the LRU list. Older items are evicted as necessary. If the cache is full - // and free_handle_on_fail is true, the item is deleted and handle is set to - // nullptr. - Status InsertItem(LRUHandle* item, LRUHandle** handle, - bool free_handle_on_fail); - // Promote an item looked up from the secondary cache to the LRU cache. - // The item may be still in the secondary cache. - // It is only inserted into the hash table and not the LRU list, and only - // if the cache is not at full capacity, as is the case during Insert. The - // caller should hold a reference on the LRUHandle. When the caller releases - // the last reference, the item is added to the LRU list. - // The item is promoted to the high pri or low pri pool as specified by the - // caller in Lookup. - void Promote(LRUHandle* e); + // the LRU list. Older items are evicted as necessary. Frees `item` on + // non-OK status. + Status InsertItem(LRUHandle* item, LRUHandle** handle); + void LRU_Remove(LRUHandle* e); void LRU_Insert(LRUHandle* e); @@ -413,8 +369,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { // holding the mutex_. void EvictFromLRU(size_t charge, autovector* deleted); - // Try to insert the evicted handles into the secondary cache. - void TryInsertIntoSecondaryCache(autovector evicted_handles); + void NotifyEvicted(const autovector& evicted_handles); + + LRUHandle* CreateHandle(const Slice& key, uint32_t hash, + Cache::ObjectPtr value, + const Cache::CacheItemHelper* helper, size_t charge); // Initialized before use. 
size_t capacity_; @@ -477,8 +436,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase { // don't mind mutex_ invoking the non-const actions. mutable DMutex mutex_; - // Owned by LRUCache - SecondaryCache* secondary_cache_; + // A reference to Cache::eviction_callback_ + const Cache::EvictionCallback& eviction_callback_; }; class LRUCache @@ -487,28 +446,16 @@ class LRUCache #endif : public ShardedCache { public: - LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - double high_pri_pool_ratio, double low_pri_pool_ratio, - std::shared_ptr memory_allocator = nullptr, - bool use_adaptive_mutex = kDefaultToAdaptiveMutex, - CacheMetadataChargePolicy metadata_charge_policy = - kDontChargeCacheMetadata, - std::shared_ptr secondary_cache = nullptr); + explicit LRUCache(const LRUCacheOptions& opts); const char* Name() const override { return "LRUCache"; } ObjectPtr Value(Handle* handle) override; size_t GetCharge(Handle* handle) const override; const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override; - void WaitAll(std::vector& handles) override; // Retrieves number of elements in LRU, for unit test purpose only. size_t TEST_GetLRUSize(); // Retrieves high pri pool ratio. double GetHighPriPoolRatio(); - - void AppendPrintableOptions(std::string& str) const override; - - private: - std::shared_ptr secondary_cache_; }; } // namespace lru_cache diff --git a/cache/lru_cache_test.cc b/cache/lru_cache_test.cc index f84312cb3f80..27fd5cc854ba 100644 --- a/cache/lru_cache_test.cc +++ b/cache/lru_cache_test.cc @@ -5,6 +5,7 @@ #include "cache/lru_cache.h" +#include #include #include @@ -19,6 +20,7 @@ #include "rocksdb/io_status.h" #include "rocksdb/sst_file_manager.h" #include "rocksdb/utilities/cache_dump_load.h" +#include "test_util/secondary_cache_test_util.h" #include "test_util/testharness.h" #include "typed_cache.h" #include "util/coding.h" @@ -51,8 +53,7 @@ class LRUCacheTest : public testing::Test { high_pri_pool_ratio, low_pri_pool_ratio, use_adaptive_mutex, kDontChargeCacheMetadata, /*max_upper_hash_bits=*/24, - /*allocator*/ nullptr, - /*secondary_cache=*/nullptr); + /*allocator*/ nullptr, &eviction_callback_); } void Insert(const std::string& key, @@ -68,7 +69,7 @@ class LRUCacheTest : public testing::Test { bool Lookup(const std::string& key) { auto handle = cache_->Lookup(key, 0 /*hash*/, nullptr, nullptr, - Cache::Priority::LOW, true, nullptr); + Cache::Priority::LOW, nullptr); if (handle) { cache_->Release(handle, true /*useful*/, false /*erase*/); return true; @@ -145,6 +146,7 @@ class LRUCacheTest : public testing::Test { private: LRUCacheShard* cache_ = nullptr; + Cache::EvictionCallback eviction_callback_; }; TEST_F(LRUCacheTest, BasicLRU) { @@ -369,11 +371,12 @@ TEST_F(LRUCacheTest, EntriesWithPriority) { namespace clock_cache { +template class ClockCacheTest : public testing::Test { public: - using Shard = HyperClockCache::Shard; - using Table = HyperClockTable; - using HandleImpl = Shard::HandleImpl; + using Shard = typename ClockCache::Shard; + using Table = typename Shard::Table; + using TableOpts = typename Table::Opts; ClockCacheTest() {} ~ClockCacheTest() override { DeleteShard(); } @@ -391,10 +394,10 @@ class ClockCacheTest : public testing::Test { shard_ = reinterpret_cast(port::cacheline_aligned_alloc(sizeof(Shard))); - Table::Opts opts; - opts.estimated_value_size = 1; - new (shard_) Shard(capacity, strict_capacity_limit, - kDontChargeCacheMetadata, /*allocator*/ nullptr, opts); + TableOpts opts{1 
/*value_size*/}; + new (shard_) + Shard(capacity, strict_capacity_limit, kDontChargeCacheMetadata, + /*allocator*/ nullptr, &eviction_callback_, &hash_seed_, opts); } Status Insert(const UniqueId64x2& hashed_key, @@ -449,45 +452,59 @@ class ClockCacheTest : public testing::Test { } Shard* shard_ = nullptr; + + private: + Cache::EvictionCallback eviction_callback_; + uint32_t hash_seed_ = 0; }; -TEST_F(ClockCacheTest, Misc) { - NewShard(3); +using ClockCacheTypes = + ::testing::Types; +TYPED_TEST_CASE(ClockCacheTest, ClockCacheTypes); + +TYPED_TEST(ClockCacheTest, Misc) { + this->NewShard(3); + // NOTE: templated base class prevents simple naming of inherited members, + // so lots of `this->` + auto& shard = *this->shard_; // Key size stuff - EXPECT_OK(InsertWithLen('a', 16)); - EXPECT_NOK(InsertWithLen('b', 15)); - EXPECT_OK(InsertWithLen('b', 16)); - EXPECT_NOK(InsertWithLen('c', 17)); - EXPECT_NOK(InsertWithLen('d', 1000)); - EXPECT_NOK(InsertWithLen('e', 11)); - EXPECT_NOK(InsertWithLen('f', 0)); + EXPECT_OK(this->InsertWithLen('a', 16)); + EXPECT_NOK(this->InsertWithLen('b', 15)); + EXPECT_OK(this->InsertWithLen('b', 16)); + EXPECT_NOK(this->InsertWithLen('c', 17)); + EXPECT_NOK(this->InsertWithLen('d', 1000)); + EXPECT_NOK(this->InsertWithLen('e', 11)); + EXPECT_NOK(this->InsertWithLen('f', 0)); // Some of this is motivated by code coverage std::string wrong_size_key(15, 'x'); - EXPECT_FALSE(Lookup(wrong_size_key, TestHashedKey('x'))); - EXPECT_FALSE(shard_->Ref(nullptr)); - EXPECT_FALSE(shard_->Release(nullptr)); - shard_->Erase(wrong_size_key, TestHashedKey('x')); // no-op + EXPECT_FALSE(this->Lookup(wrong_size_key, this->TestHashedKey('x'))); + EXPECT_FALSE(shard.Ref(nullptr)); + EXPECT_FALSE(shard.Release(nullptr)); + shard.Erase(wrong_size_key, this->TestHashedKey('x')); // no-op } -TEST_F(ClockCacheTest, Limits) { - constexpr size_t kCapacity = 3; - NewShard(kCapacity, false /*strict_capacity_limit*/); +TYPED_TEST(ClockCacheTest, Limits) { + constexpr size_t kCapacity = 64; + this->NewShard(kCapacity, false /*strict_capacity_limit*/); + auto& shard = *this->shard_; + using HandleImpl = typename ClockCacheTest::Shard::HandleImpl; + for (bool strict_capacity_limit : {false, true, false}) { SCOPED_TRACE("strict_capacity_limit = " + std::to_string(strict_capacity_limit)); // Also tests switching between strict limit and not - shard_->SetStrictCapacityLimit(strict_capacity_limit); + shard.SetStrictCapacityLimit(strict_capacity_limit); - UniqueId64x2 hkey = TestHashedKey('x'); + UniqueId64x2 hkey = this->TestHashedKey('x'); // Single entry charge beyond capacity { - Status s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, - &kNoopCacheItemHelper, 5 /*charge*/, - nullptr /*handle*/, Cache::Priority::LOW); + Status s = shard.Insert(this->TestKey(hkey), hkey, nullptr /*value*/, + &kNoopCacheItemHelper, kCapacity + 2 /*charge*/, + nullptr /*handle*/, Cache::Priority::LOW); if (strict_capacity_limit) { EXPECT_TRUE(s.IsMemoryLimit()); } else { @@ -498,11 +515,11 @@ TEST_F(ClockCacheTest, Limits) { // Single entry fills capacity { HandleImpl* h; - ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, - &kNoopCacheItemHelper, 3 /*charge*/, &h, - Cache::Priority::LOW)); + ASSERT_OK(shard.Insert(this->TestKey(hkey), hkey, nullptr /*value*/, + &kNoopCacheItemHelper, kCapacity /*charge*/, &h, + Cache::Priority::LOW)); // Try to insert more - Status s = Insert('a'); + Status s = this->Insert('a'); if (strict_capacity_limit) { EXPECT_TRUE(s.IsMemoryLimit()); } else { @@ -510,22 
+527,22 @@ TEST_F(ClockCacheTest, Limits) { } // Release entry filling capacity. // Cover useful = false case. - shard_->Release(h, false /*useful*/, false /*erase_if_last_ref*/); + shard.Release(h, false /*useful*/, false /*erase_if_last_ref*/); } // Insert more than table size can handle to exceed occupancy limit. // (Cleverly using mostly zero-charge entries, but some non-zero to // verify usage tracking on detached entries.) { - size_t n = shard_->GetTableAddressCount() + 1; + size_t n = kCapacity * 5 + 1; std::unique_ptr ha { new HandleImpl* [n] {} }; Status s; for (size_t i = 0; i < n && s.ok(); ++i) { hkey[1] = i; - s = shard_->Insert(TestKey(hkey), hkey, nullptr /*value*/, - &kNoopCacheItemHelper, - (i + kCapacity < n) ? 0 : 1 /*charge*/, &ha[i], - Cache::Priority::LOW); + s = shard.Insert(this->TestKey(hkey), hkey, nullptr /*value*/, + &kNoopCacheItemHelper, + (i + kCapacity < n) ? 0 : 1 /*charge*/, &ha[i], + Cache::Priority::LOW); if (i == 0) { EXPECT_OK(s); } @@ -536,130 +553,133 @@ TEST_F(ClockCacheTest, Limits) { EXPECT_OK(s); } // Same result if not keeping a reference - s = Insert('a'); + s = this->Insert('a'); if (strict_capacity_limit) { EXPECT_TRUE(s.IsMemoryLimit()); } else { EXPECT_OK(s); } + EXPECT_EQ(shard.GetOccupancyCount(), shard.GetOccupancyLimit()); + // Regardless, we didn't allow table to actually get full - EXPECT_LT(shard_->GetOccupancyCount(), shard_->GetTableAddressCount()); + EXPECT_LT(shard.GetOccupancyCount(), shard.GetTableAddressCount()); // Release handles for (size_t i = 0; i < n; ++i) { if (ha[i]) { - shard_->Release(ha[i]); + shard.Release(ha[i]); } } } } } -TEST_F(ClockCacheTest, ClockEvictionTest) { +TYPED_TEST(ClockCacheTest, ClockEvictionTest) { for (bool strict_capacity_limit : {false, true}) { SCOPED_TRACE("strict_capacity_limit = " + std::to_string(strict_capacity_limit)); - NewShard(6, strict_capacity_limit); - EXPECT_OK(Insert('a', Cache::Priority::BOTTOM)); - EXPECT_OK(Insert('b', Cache::Priority::LOW)); - EXPECT_OK(Insert('c', Cache::Priority::HIGH)); - EXPECT_OK(Insert('d', Cache::Priority::BOTTOM)); - EXPECT_OK(Insert('e', Cache::Priority::LOW)); - EXPECT_OK(Insert('f', Cache::Priority::HIGH)); - - EXPECT_TRUE(Lookup('a', /*use*/ false)); - EXPECT_TRUE(Lookup('b', /*use*/ false)); - EXPECT_TRUE(Lookup('c', /*use*/ false)); - EXPECT_TRUE(Lookup('d', /*use*/ false)); - EXPECT_TRUE(Lookup('e', /*use*/ false)); - EXPECT_TRUE(Lookup('f', /*use*/ false)); + this->NewShard(6, strict_capacity_limit); + auto& shard = *this->shard_; + EXPECT_OK(this->Insert('a', Cache::Priority::BOTTOM)); + EXPECT_OK(this->Insert('b', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('c', Cache::Priority::HIGH)); + EXPECT_OK(this->Insert('d', Cache::Priority::BOTTOM)); + EXPECT_OK(this->Insert('e', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('f', Cache::Priority::HIGH)); + + EXPECT_TRUE(this->Lookup('a', /*use*/ false)); + EXPECT_TRUE(this->Lookup('b', /*use*/ false)); + EXPECT_TRUE(this->Lookup('c', /*use*/ false)); + EXPECT_TRUE(this->Lookup('d', /*use*/ false)); + EXPECT_TRUE(this->Lookup('e', /*use*/ false)); + EXPECT_TRUE(this->Lookup('f', /*use*/ false)); // Ensure bottom are evicted first, even if new entries are low - EXPECT_OK(Insert('g', Cache::Priority::LOW)); - EXPECT_OK(Insert('h', Cache::Priority::LOW)); - - EXPECT_FALSE(Lookup('a', /*use*/ false)); - EXPECT_TRUE(Lookup('b', /*use*/ false)); - EXPECT_TRUE(Lookup('c', /*use*/ false)); - EXPECT_FALSE(Lookup('d', /*use*/ false)); - EXPECT_TRUE(Lookup('e', /*use*/ false)); - EXPECT_TRUE(Lookup('f', 
/*use*/ false)); + EXPECT_OK(this->Insert('g', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('h', Cache::Priority::LOW)); + + EXPECT_FALSE(this->Lookup('a', /*use*/ false)); + EXPECT_TRUE(this->Lookup('b', /*use*/ false)); + EXPECT_TRUE(this->Lookup('c', /*use*/ false)); + EXPECT_FALSE(this->Lookup('d', /*use*/ false)); + EXPECT_TRUE(this->Lookup('e', /*use*/ false)); + EXPECT_TRUE(this->Lookup('f', /*use*/ false)); // Mark g & h useful - EXPECT_TRUE(Lookup('g', /*use*/ true)); - EXPECT_TRUE(Lookup('h', /*use*/ true)); + EXPECT_TRUE(this->Lookup('g', /*use*/ true)); + EXPECT_TRUE(this->Lookup('h', /*use*/ true)); // Then old LOW entries - EXPECT_OK(Insert('i', Cache::Priority::LOW)); - EXPECT_OK(Insert('j', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('i', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('j', Cache::Priority::LOW)); - EXPECT_FALSE(Lookup('b', /*use*/ false)); - EXPECT_TRUE(Lookup('c', /*use*/ false)); - EXPECT_FALSE(Lookup('e', /*use*/ false)); - EXPECT_TRUE(Lookup('f', /*use*/ false)); + EXPECT_FALSE(this->Lookup('b', /*use*/ false)); + EXPECT_TRUE(this->Lookup('c', /*use*/ false)); + EXPECT_FALSE(this->Lookup('e', /*use*/ false)); + EXPECT_TRUE(this->Lookup('f', /*use*/ false)); // Mark g & h useful once again - EXPECT_TRUE(Lookup('g', /*use*/ true)); - EXPECT_TRUE(Lookup('h', /*use*/ true)); - EXPECT_TRUE(Lookup('i', /*use*/ false)); - EXPECT_TRUE(Lookup('j', /*use*/ false)); + EXPECT_TRUE(this->Lookup('g', /*use*/ true)); + EXPECT_TRUE(this->Lookup('h', /*use*/ true)); + EXPECT_TRUE(this->Lookup('i', /*use*/ false)); + EXPECT_TRUE(this->Lookup('j', /*use*/ false)); // Then old HIGH entries - EXPECT_OK(Insert('k', Cache::Priority::LOW)); - EXPECT_OK(Insert('l', Cache::Priority::LOW)); - - EXPECT_FALSE(Lookup('c', /*use*/ false)); - EXPECT_FALSE(Lookup('f', /*use*/ false)); - EXPECT_TRUE(Lookup('g', /*use*/ false)); - EXPECT_TRUE(Lookup('h', /*use*/ false)); - EXPECT_TRUE(Lookup('i', /*use*/ false)); - EXPECT_TRUE(Lookup('j', /*use*/ false)); - EXPECT_TRUE(Lookup('k', /*use*/ false)); - EXPECT_TRUE(Lookup('l', /*use*/ false)); + EXPECT_OK(this->Insert('k', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('l', Cache::Priority::LOW)); + + EXPECT_FALSE(this->Lookup('c', /*use*/ false)); + EXPECT_FALSE(this->Lookup('f', /*use*/ false)); + EXPECT_TRUE(this->Lookup('g', /*use*/ false)); + EXPECT_TRUE(this->Lookup('h', /*use*/ false)); + EXPECT_TRUE(this->Lookup('i', /*use*/ false)); + EXPECT_TRUE(this->Lookup('j', /*use*/ false)); + EXPECT_TRUE(this->Lookup('k', /*use*/ false)); + EXPECT_TRUE(this->Lookup('l', /*use*/ false)); // Then the (roughly) least recently useful - EXPECT_OK(Insert('m', Cache::Priority::HIGH)); - EXPECT_OK(Insert('n', Cache::Priority::HIGH)); + EXPECT_OK(this->Insert('m', Cache::Priority::HIGH)); + EXPECT_OK(this->Insert('n', Cache::Priority::HIGH)); - EXPECT_TRUE(Lookup('g', /*use*/ false)); - EXPECT_TRUE(Lookup('h', /*use*/ false)); - EXPECT_FALSE(Lookup('i', /*use*/ false)); - EXPECT_FALSE(Lookup('j', /*use*/ false)); - EXPECT_TRUE(Lookup('k', /*use*/ false)); - EXPECT_TRUE(Lookup('l', /*use*/ false)); + EXPECT_TRUE(this->Lookup('g', /*use*/ false)); + EXPECT_TRUE(this->Lookup('h', /*use*/ false)); + EXPECT_FALSE(this->Lookup('i', /*use*/ false)); + EXPECT_FALSE(this->Lookup('j', /*use*/ false)); + EXPECT_TRUE(this->Lookup('k', /*use*/ false)); + EXPECT_TRUE(this->Lookup('l', /*use*/ false)); // Now try changing capacity down - shard_->SetCapacity(4); + shard.SetCapacity(4); // Insert to ensure evictions happen - EXPECT_OK(Insert('o', 
Cache::Priority::LOW)); - EXPECT_OK(Insert('p', Cache::Priority::LOW)); - - EXPECT_FALSE(Lookup('g', /*use*/ false)); - EXPECT_FALSE(Lookup('h', /*use*/ false)); - EXPECT_FALSE(Lookup('k', /*use*/ false)); - EXPECT_FALSE(Lookup('l', /*use*/ false)); - EXPECT_TRUE(Lookup('m', /*use*/ false)); - EXPECT_TRUE(Lookup('n', /*use*/ false)); - EXPECT_TRUE(Lookup('o', /*use*/ false)); - EXPECT_TRUE(Lookup('p', /*use*/ false)); + EXPECT_OK(this->Insert('o', Cache::Priority::LOW)); + EXPECT_OK(this->Insert('p', Cache::Priority::LOW)); + + EXPECT_FALSE(this->Lookup('g', /*use*/ false)); + EXPECT_FALSE(this->Lookup('h', /*use*/ false)); + EXPECT_FALSE(this->Lookup('k', /*use*/ false)); + EXPECT_FALSE(this->Lookup('l', /*use*/ false)); + EXPECT_TRUE(this->Lookup('m', /*use*/ false)); + EXPECT_TRUE(this->Lookup('n', /*use*/ false)); + EXPECT_TRUE(this->Lookup('o', /*use*/ false)); + EXPECT_TRUE(this->Lookup('p', /*use*/ false)); // Now try changing capacity up - EXPECT_TRUE(Lookup('m', /*use*/ true)); - EXPECT_TRUE(Lookup('n', /*use*/ true)); - shard_->SetCapacity(6); - EXPECT_OK(Insert('q', Cache::Priority::HIGH)); - EXPECT_OK(Insert('r', Cache::Priority::HIGH)); - EXPECT_OK(Insert('s', Cache::Priority::HIGH)); - EXPECT_OK(Insert('t', Cache::Priority::HIGH)); - - EXPECT_FALSE(Lookup('o', /*use*/ false)); - EXPECT_FALSE(Lookup('p', /*use*/ false)); - EXPECT_TRUE(Lookup('m', /*use*/ false)); - EXPECT_TRUE(Lookup('n', /*use*/ false)); - EXPECT_TRUE(Lookup('q', /*use*/ false)); - EXPECT_TRUE(Lookup('r', /*use*/ false)); - EXPECT_TRUE(Lookup('s', /*use*/ false)); - EXPECT_TRUE(Lookup('t', /*use*/ false)); + EXPECT_TRUE(this->Lookup('m', /*use*/ true)); + EXPECT_TRUE(this->Lookup('n', /*use*/ true)); + shard.SetCapacity(6); + EXPECT_OK(this->Insert('q', Cache::Priority::HIGH)); + EXPECT_OK(this->Insert('r', Cache::Priority::HIGH)); + EXPECT_OK(this->Insert('s', Cache::Priority::HIGH)); + EXPECT_OK(this->Insert('t', Cache::Priority::HIGH)); + + EXPECT_FALSE(this->Lookup('o', /*use*/ false)); + EXPECT_FALSE(this->Lookup('p', /*use*/ false)); + EXPECT_TRUE(this->Lookup('m', /*use*/ false)); + EXPECT_TRUE(this->Lookup('n', /*use*/ false)); + EXPECT_TRUE(this->Lookup('q', /*use*/ false)); + EXPECT_TRUE(this->Lookup('r', /*use*/ false)); + EXPECT_TRUE(this->Lookup('s', /*use*/ false)); + EXPECT_TRUE(this->Lookup('t', /*use*/ false)); } } @@ -675,113 +695,178 @@ const Cache::CacheItemHelper kDeleteCounterHelper{ } // namespace // Testing calls to CorrectNearOverflow in Release -TEST_F(ClockCacheTest, ClockCounterOverflowTest) { - NewShard(6, /*strict_capacity_limit*/ false); +TYPED_TEST(ClockCacheTest, ClockCounterOverflowTest) { + this->NewShard(6, /*strict_capacity_limit*/ false); + auto& shard = *this->shard_; + using HandleImpl = typename ClockCacheTest::Shard::HandleImpl; + HandleImpl* h; DeleteCounter val; - UniqueId64x2 hkey = TestHashedKey('x'); - ASSERT_OK(shard_->Insert(TestKey(hkey), hkey, &val, &kDeleteCounterHelper, 1, - &h, Cache::Priority::HIGH)); + UniqueId64x2 hkey = this->TestHashedKey('x'); + ASSERT_OK(shard.Insert(this->TestKey(hkey), hkey, &val, &kDeleteCounterHelper, + 1, &h, Cache::Priority::HIGH)); // Some large number outstanding - shard_->TEST_RefN(h, 123456789); + shard.TEST_RefN(h, 123456789); // Simulate many lookup/ref + release, plenty to overflow counters for (int i = 0; i < 10000; ++i) { - shard_->TEST_RefN(h, 1234567); - shard_->TEST_ReleaseN(h, 1234567); + shard.TEST_RefN(h, 1234567); + shard.TEST_ReleaseN(h, 1234567); } // Mark it invisible (to reach a different 
CorrectNearOverflow() in Release) - shard_->Erase(TestKey(hkey), hkey); + shard.Erase(this->TestKey(hkey), hkey); // Simulate many more lookup/ref + release (one-by-one would be too // expensive for unit test) for (int i = 0; i < 10000; ++i) { - shard_->TEST_RefN(h, 1234567); - shard_->TEST_ReleaseN(h, 1234567); + shard.TEST_RefN(h, 1234567); + shard.TEST_ReleaseN(h, 1234567); } // Free all but last 1 - shard_->TEST_ReleaseN(h, 123456789); + shard.TEST_ReleaseN(h, 123456789); // Still alive ASSERT_EQ(val.deleted, 0); // Free last ref, which will finalize erasure - shard_->Release(h); + shard.Release(h); // Deleted ASSERT_EQ(val.deleted, 1); } +TYPED_TEST(ClockCacheTest, ClockTableFull) { + // Force clock cache table to fill up (not usually allowed) in order + // to test full probe sequence that is theoretically possible due to + // parallel operations + this->NewShard(6, /*strict_capacity_limit*/ false); + auto& shard = *this->shard_; + using HandleImpl = typename ClockCacheTest::Shard::HandleImpl; + + size_t size = shard.GetTableAddressCount(); + ASSERT_LE(size + 3, 256); // for using char keys + // Modify occupancy and capacity limits to attempt insert on full + shard.TEST_MutableOccupancyLimit() = size + 100; + shard.SetCapacity(size + 100); + + DeleteCounter val; + std::vector handles; + // NOTE: the three extra insertions should create standalone entries + for (size_t i = 0; i < size + 3; ++i) { + UniqueId64x2 hkey = this->TestHashedKey(static_cast(i)); + ASSERT_OK(shard.Insert(this->TestKey(hkey), hkey, &val, + &kDeleteCounterHelper, 1, &handles.emplace_back(), + Cache::Priority::HIGH)); + } + + for (size_t i = 0; i < size + 3; ++i) { + UniqueId64x2 hkey = this->TestHashedKey(static_cast(i)); + HandleImpl* h = shard.Lookup(this->TestKey(hkey), hkey); + if (i < size) { + ASSERT_NE(h, nullptr); + shard.Release(h); + } else { + // Standalone entries not visible by lookup + ASSERT_EQ(h, nullptr); + } + } + + for (size_t i = 0; i < size + 3; ++i) { + ASSERT_NE(handles[i], nullptr); + shard.Release(handles[i]); + if (i < size) { + // Everything still in cache + ASSERT_EQ(val.deleted, 0); + } else { + // Standalone entries freed on release + ASSERT_EQ(val.deleted, i + 1 - size); + } + } + + for (size_t i = size + 3; i > 0; --i) { + UniqueId64x2 hkey = this->TestHashedKey(static_cast(i - 1)); + shard.Erase(this->TestKey(hkey), hkey); + if (i - 1 > size) { + ASSERT_EQ(val.deleted, 3); + } else { + ASSERT_EQ(val.deleted, 3 + size - (i - 1)); + } + } +} + // This test is mostly to exercise some corner case logic, by forcing two // keys to have the same hash, and more -TEST_F(ClockCacheTest, CollidingInsertEraseTest) { - NewShard(6, /*strict_capacity_limit*/ false); +TYPED_TEST(ClockCacheTest, CollidingInsertEraseTest) { + this->NewShard(6, /*strict_capacity_limit*/ false); + auto& shard = *this->shard_; + using HandleImpl = typename ClockCacheTest::Shard::HandleImpl; + DeleteCounter val; - UniqueId64x2 hkey1 = TestHashedKey('x'); - Slice key1 = TestKey(hkey1); - UniqueId64x2 hkey2 = TestHashedKey('y'); - Slice key2 = TestKey(hkey2); - UniqueId64x2 hkey3 = TestHashedKey('z'); - Slice key3 = TestKey(hkey3); + UniqueId64x2 hkey1 = this->TestHashedKey('x'); + Slice key1 = this->TestKey(hkey1); + UniqueId64x2 hkey2 = this->TestHashedKey('y'); + Slice key2 = this->TestKey(hkey2); + UniqueId64x2 hkey3 = this->TestHashedKey('z'); + Slice key3 = this->TestKey(hkey3); HandleImpl* h1; - ASSERT_OK(shard_->Insert(key1, hkey1, &val, &kDeleteCounterHelper, 1, &h1, - Cache::Priority::HIGH)); + 
ASSERT_OK(shard.Insert(key1, hkey1, &val, &kDeleteCounterHelper, 1, &h1, + Cache::Priority::HIGH)); HandleImpl* h2; - ASSERT_OK(shard_->Insert(key2, hkey2, &val, &kDeleteCounterHelper, 1, &h2, - Cache::Priority::HIGH)); + ASSERT_OK(shard.Insert(key2, hkey2, &val, &kDeleteCounterHelper, 1, &h2, + Cache::Priority::HIGH)); HandleImpl* h3; - ASSERT_OK(shard_->Insert(key3, hkey3, &val, &kDeleteCounterHelper, 1, &h3, - Cache::Priority::HIGH)); + ASSERT_OK(shard.Insert(key3, hkey3, &val, &kDeleteCounterHelper, 1, &h3, + Cache::Priority::HIGH)); // Can repeatedly lookup+release despite the hash collision HandleImpl* tmp_h; for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key1, hkey1); + tmp_h = shard.Lookup(key1, hkey1); ASSERT_EQ(h1, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key2, hkey2); + tmp_h = shard.Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key3, hkey3); + tmp_h = shard.Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); } // Make h1 invisible - shard_->Erase(key1, hkey1); + shard.Erase(key1, hkey1); // Redundant erase - shard_->Erase(key1, hkey1); + shard.Erase(key1, hkey1); // All still alive ASSERT_EQ(val.deleted, 0); // Invisible to Lookup - tmp_h = shard_->Lookup(key1, hkey1); + tmp_h = shard.Lookup(key1, hkey1); ASSERT_EQ(nullptr, tmp_h); // Can still find h2, h3 for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key2, hkey2); + tmp_h = shard.Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key3, hkey3); + tmp_h = shard.Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); } // Also Insert with invisible entry there - ASSERT_OK(shard_->Insert(key1, hkey1, &val, &kDeleteCounterHelper, 1, nullptr, - Cache::Priority::HIGH)); - tmp_h = shard_->Lookup(key1, hkey1); + ASSERT_OK(shard.Insert(key1, hkey1, &val, &kDeleteCounterHelper, 1, nullptr, + Cache::Priority::HIGH)); + tmp_h = shard.Lookup(key1, hkey1); // Found but distinct handle ASSERT_NE(nullptr, tmp_h); ASSERT_NE(h1, tmp_h); - ASSERT_TRUE(shard_->Release(tmp_h, /*erase_if_last_ref*/ true)); + ASSERT_TRUE(shard.Release(tmp_h, /*erase_if_last_ref*/ true)); // tmp_h deleted ASSERT_EQ(val.deleted--, 1); // Release last ref on h1 (already invisible) - ASSERT_TRUE(shard_->Release(h1, /*erase_if_last_ref*/ false)); + ASSERT_TRUE(shard.Release(h1, /*erase_if_last_ref*/ false)); // h1 deleted ASSERT_EQ(val.deleted--, 1); @@ -789,57 +874,57 @@ TEST_F(ClockCacheTest, CollidingInsertEraseTest) { // Can still find h2, h3 for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key2, hkey2); + tmp_h = shard.Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); - tmp_h = shard_->Lookup(key3, hkey3); + tmp_h = shard.Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + 
ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); } // Release last ref on h2 - ASSERT_FALSE(shard_->Release(h2, /*erase_if_last_ref*/ false)); + ASSERT_FALSE(shard.Release(h2, /*erase_if_last_ref*/ false)); // h2 still not deleted (unreferenced in cache) ASSERT_EQ(val.deleted, 0); // Can still find it - tmp_h = shard_->Lookup(key2, hkey2); + tmp_h = shard.Lookup(key2, hkey2); ASSERT_EQ(h2, tmp_h); // Release last ref on h2, with erase - ASSERT_TRUE(shard_->Release(h2, /*erase_if_last_ref*/ true)); + ASSERT_TRUE(shard.Release(h2, /*erase_if_last_ref*/ true)); // h2 deleted ASSERT_EQ(val.deleted--, 1); - tmp_h = shard_->Lookup(key2, hkey2); + tmp_h = shard.Lookup(key2, hkey2); ASSERT_EQ(nullptr, tmp_h); // Can still find h3 for (bool erase_if_last_ref : {true, false}) { // but not last ref - tmp_h = shard_->Lookup(key3, hkey3); + tmp_h = shard.Lookup(key3, hkey3); ASSERT_EQ(h3, tmp_h); - ASSERT_FALSE(shard_->Release(tmp_h, erase_if_last_ref)); + ASSERT_FALSE(shard.Release(tmp_h, erase_if_last_ref)); } // Release last ref on h3, without erase - ASSERT_FALSE(shard_->Release(h3, /*erase_if_last_ref*/ false)); + ASSERT_FALSE(shard.Release(h3, /*erase_if_last_ref*/ false)); // h3 still not deleted (unreferenced in cache) ASSERT_EQ(val.deleted, 0); // Explicit erase - shard_->Erase(key3, hkey3); + shard.Erase(key3, hkey3); // h3 deleted ASSERT_EQ(val.deleted--, 1); - tmp_h = shard_->Lookup(key3, hkey3); + tmp_h = shard.Lookup(key3, hkey3); ASSERT_EQ(nullptr, tmp_h); } // This uses the public API to effectively test CalcHashBits etc. -TEST_F(ClockCacheTest, TableSizesTest) { +TYPED_TEST(ClockCacheTest, TableSizesTest) { for (size_t est_val_size : {1U, 5U, 123U, 2345U, 345678U}) { SCOPED_TRACE("est_val_size = " + std::to_string(est_val_size)); for (double est_count : {1.1, 2.2, 511.9, 512.1, 2345.0}) { @@ -852,8 +937,10 @@ TEST_F(ClockCacheTest, TableSizesTest) { /*memory_allocator*/ nullptr, kDontChargeCacheMetadata) .MakeSharedCache(); // Table sizes are currently only powers of two - EXPECT_GE(cache->GetTableAddressCount(), est_count / kLoadFactor); - EXPECT_LE(cache->GetTableAddressCount(), est_count / kLoadFactor * 2.0); + EXPECT_GE(cache->GetTableAddressCount(), + est_count / FixedHyperClockTable::kLoadFactor); + EXPECT_LE(cache->GetTableAddressCount(), + est_count / FixedHyperClockTable::kLoadFactor * 2.0); EXPECT_EQ(cache->GetUsage(), 0); // kFullChargeMetaData @@ -870,9 +957,10 @@ TEST_F(ClockCacheTest, TableSizesTest) { double est_count_after_meta = (capacity - cache->GetUsage()) * 1.0 / est_val_size; EXPECT_GE(cache->GetTableAddressCount(), - est_count_after_meta / kLoadFactor); - EXPECT_LE(cache->GetTableAddressCount(), - est_count_after_meta / kLoadFactor * 2.0); + est_count_after_meta / FixedHyperClockTable::kLoadFactor); + EXPECT_LE( + cache->GetTableAddressCount(), + est_count_after_meta / FixedHyperClockTable::kLoadFactor * 2.0); } } } @@ -895,13 +983,14 @@ class TestSecondaryCache : public SecondaryCache { using ResultMap = std::unordered_map; - explicit TestSecondaryCache(size_t capacity) + explicit TestSecondaryCache(size_t capacity, bool insert_saved = false) : cache_(NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, nullptr, kDefaultToAdaptiveMutex, kDontChargeCacheMetadata)), num_inserts_(0), num_lookups_(0), - inject_failure_(false) {} + inject_failure_(false), + insert_saved_(insert_saved) {} const char* Name() const override { return "TestSecondaryCache"; } @@ -910,7 +999,8 @@ class TestSecondaryCache : public SecondaryCache { void ResetInjectFailure() { 
inject_failure_ = false; } Status Insert(const Slice& key, Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper) override { + const Cache::CacheItemHelper* helper, + bool /*force_insert*/) override { if (inject_failure_) { return Status::Corruption("Insertion Data Corrupted"); } @@ -931,15 +1021,26 @@ class TestSecondaryCache : public SecondaryCache { return cache_.Insert(key, buf, size); } + Status InsertSaved(const Slice& key, const Slice& saved, + CompressionType /*type*/ = kNoCompression, + CacheTier /*source*/ = CacheTier::kVolatileTier) override { + if (insert_saved_) { + return Insert(key, const_cast(&saved), &kSliceCacheItemHelper, + /*force_insert=*/true); + } else { + return Status::OK(); + } + } + std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool /*wait*/, - bool /*advise_erase*/, bool& is_in_sec_cache) override { + bool /*advise_erase*/, bool& kept_in_sec_cache) override { std::string key_str = key.ToString(); TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); std::unique_ptr secondary_handle; - is_in_sec_cache = false; + kept_in_sec_cache = false; ResultType type = ResultType::SUCCESS; auto iter = result_map_.find(key.ToString()); if (iter != result_map_.end()) { @@ -959,13 +1060,14 @@ class TestSecondaryCache : public SecondaryCache { char* ptr = cache_.Value(handle); size_t size = DecodeFixed64(ptr); ptr += sizeof(uint64_t); - s = helper->create_cb(Slice(ptr, size), create_context, + s = helper->create_cb(Slice(ptr, size), kNoCompression, + CacheTier::kVolatileTier, create_context, /*alloc*/ nullptr, &value, &charge); } if (s.ok()) { secondary_handle.reset(new TestSecondaryCacheResultHandle( cache_.get(), handle, value, charge, type)); - is_in_sec_cache = true; + kept_in_sec_cache = true; } else { cache_.Release(handle); } @@ -1048,11 +1150,21 @@ class TestSecondaryCache : public SecondaryCache { uint32_t num_inserts_; uint32_t num_lookups_; bool inject_failure_; + bool insert_saved_; std::string ckey_prefix_; ResultMap result_map_; }; -class DBSecondaryCacheTest : public DBTestBase { +using secondary_cache_test_util::GetTestingCacheTypes; +using secondary_cache_test_util::WithCacheTypeParam; + +class BasicSecondaryCacheTest : public testing::Test, + public WithCacheTypeParam {}; + +INSTANTIATE_TEST_CASE_P(BasicSecondaryCacheTest, BasicSecondaryCacheTest, + GetTestingCacheTypes()); + +class DBSecondaryCacheTest : public DBTestBase, public WithCacheTypeParam { public: DBSecondaryCacheTest() : DBTestBase("db_secondary_cache_test", /*env_do_fsync=*/true) { @@ -1064,98 +1176,15 @@ class DBSecondaryCacheTest : public DBTestBase { std::unique_ptr fault_env_; }; -class LRUCacheSecondaryCacheTest : public LRUCacheTest, - public Cache::CreateContext { - public: - LRUCacheSecondaryCacheTest() : fail_create_(false) {} - ~LRUCacheSecondaryCacheTest() {} - - protected: - class TestItem { - public: - TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { - memcpy(buf_.get(), buf, size); - } - ~TestItem() {} - - char* Buf() { return buf_.get(); } - size_t Size() { return size_; } - std::string ToString() { return std::string(Buf(), Size()); } +INSTANTIATE_TEST_CASE_P(DBSecondaryCacheTest, DBSecondaryCacheTest, + GetTestingCacheTypes()); - private: - std::unique_ptr buf_; - size_t size_; - }; - - static size_t SizeCallback(Cache::ObjectPtr obj) { - return static_cast(obj)->Size(); - } - - static Status SaveToCallback(Cache::ObjectPtr from_obj, size_t from_offset, - size_t 
length, char* out) { - TestItem* item = static_cast(from_obj); - char* buf = item->Buf(); - EXPECT_EQ(length, item->Size()); - EXPECT_EQ(from_offset, 0); - memcpy(out, buf, length); - return Status::OK(); - } - - static void DeletionCallback(Cache::ObjectPtr obj, - MemoryAllocator* /*alloc*/) { - delete static_cast(obj); - } - - static Cache::CacheItemHelper helper_; - - static Status SaveToCallbackFail(Cache::ObjectPtr /*from_obj*/, - size_t /*from_offset*/, size_t /*length*/, - char* /*out*/) { - return Status::NotSupported(); - } - - static Cache::CacheItemHelper helper_fail_; - - static Status CreateCallback(const Slice& data, Cache::CreateContext* context, - MemoryAllocator* /*allocator*/, - Cache::ObjectPtr* out_obj, size_t* out_charge) { - auto t = static_cast(context); - if (t->fail_create_) { - return Status::NotSupported(); - } - *out_obj = new TestItem(data.data(), data.size()); - *out_charge = data.size(); - return Status::OK(); - }; - - void SetFailCreate(bool fail) { fail_create_ = fail; } - - private: - bool fail_create_; -}; - -Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_{ - CacheEntryRole::kMisc, LRUCacheSecondaryCacheTest::DeletionCallback, - LRUCacheSecondaryCacheTest::SizeCallback, - LRUCacheSecondaryCacheTest::SaveToCallback, - LRUCacheSecondaryCacheTest::CreateCallback}; - -Cache::CacheItemHelper LRUCacheSecondaryCacheTest::helper_fail_{ - CacheEntryRole::kMisc, LRUCacheSecondaryCacheTest::DeletionCallback, - LRUCacheSecondaryCacheTest::SizeCallback, - LRUCacheSecondaryCacheTest::SaveToCallbackFail, - LRUCacheSecondaryCacheTest::CreateCallback}; - -TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { - LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(BasicSecondaryCacheTest, BasicTest) { std::shared_ptr secondary_cache = - std::make_shared(4096); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::make_shared(4096, true); + std::shared_ptr cache = + NewCache(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); std::shared_ptr stats = CreateDBStatistics(); CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); @@ -1166,37 +1195,32 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { std::string str3 = rnd.RandomString(1021); ASSERT_OK(secondary_cache->InsertSaved(k3.AsSlice(), str3)); - std::string str1 = rnd.RandomString(1020); + std::string str1 = rnd.RandomString(1021); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert(k1.AsSlice(), item1, - &LRUCacheSecondaryCacheTest::helper_, str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelper(), str1.length())); std::string str2 = rnd.RandomString(1021); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_OK(cache->Insert(k2.AsSlice(), item2, - &LRUCacheSecondaryCacheTest::helper_, str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelper(), str2.length())); get_perf_context()->Reset(); Cache::Handle* handle; - handle = - cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, true, stats.get()); + handle = cache->Lookup(k2.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW, stats.get()); 
ASSERT_NE(handle, nullptr); ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str2.size()); cache->Release(handle); // This lookup should promote k1 and demote k2 - handle = - cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, true, stats.get()); + handle = cache->Lookup(k1.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str1.size()); cache->Release(handle); // This lookup should promote k3 and demote k1 - handle = - cache->Lookup(k3.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, true, stats.get()); + handle = cache->Lookup(k3.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW, stats.get()); ASSERT_NE(handle, nullptr); ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str3.size()); cache->Release(handle); @@ -1212,16 +1236,66 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicTest) { secondary_cache.reset(); } -TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) { - LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(BasicSecondaryCacheTest, StatsTest) { + std::shared_ptr secondary_cache = + std::make_shared(4096, true); + std::shared_ptr cache = + NewCache(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); + std::shared_ptr stats = CreateDBStatistics(); + CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + CacheKey k3 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); + + Random rnd(301); + // Start with warming secondary cache + std::string str1 = rnd.RandomString(1020); + std::string str2 = rnd.RandomString(1020); + std::string str3 = rnd.RandomString(1020); + ASSERT_OK(secondary_cache->InsertSaved(k1.AsSlice(), str1)); + ASSERT_OK(secondary_cache->InsertSaved(k2.AsSlice(), str2)); + ASSERT_OK(secondary_cache->InsertSaved(k3.AsSlice(), str3)); + + get_perf_context()->Reset(); + Cache::Handle* handle; + handle = cache->Lookup(k1.AsSlice(), GetHelper(CacheEntryRole::kFilterBlock), + /*context*/ this, Cache::Priority::LOW, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str1.size()); + cache->Release(handle); + + handle = cache->Lookup(k2.AsSlice(), GetHelper(CacheEntryRole::kIndexBlock), + /*context*/ this, Cache::Priority::LOW, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str2.size()); + cache->Release(handle); + + handle = cache->Lookup(k3.AsSlice(), GetHelper(CacheEntryRole::kDataBlock), + /*context*/ this, Cache::Priority::LOW, stats.get()); + ASSERT_NE(handle, nullptr); + ASSERT_EQ(static_cast(cache->Value(handle))->Size(), str3.size()); + cache->Release(handle); + + ASSERT_EQ(secondary_cache->num_inserts(), 3u); + ASSERT_EQ(secondary_cache->num_lookups(), 3u); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_HITS), + secondary_cache->num_lookups()); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_FILTER_HITS), 1); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_INDEX_HITS), 1); + ASSERT_EQ(stats->getTickerCount(SECONDARY_CACHE_DATA_HITS), 1); + PerfContext perf_ctx = *get_perf_context(); + ASSERT_EQ(perf_ctx.secondary_cache_hit_count, 
secondary_cache->num_lookups()); + + cache.reset(); + secondary_cache.reset(); +} + +TEST_P(BasicSecondaryCacheTest, BasicFailTest) { std::shared_ptr secondary_cache = - std::make_shared(2048); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::make_shared(2048, true); + std::shared_ptr cache = + NewCache(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); @@ -1231,65 +1305,65 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicFailTest) { // NOTE: changed to assert helper != nullptr for efficiency / code size // ASSERT_TRUE(cache->Insert(k1.AsSlice(), item1.get(), nullptr, // str1.length()).IsInvalidArgument()); - ASSERT_OK(cache->Insert(k1.AsSlice(), item1.get(), - &LRUCacheSecondaryCacheTest::helper_, str1.length())); + ASSERT_OK( + cache->Insert(k1.AsSlice(), item1.get(), GetHelper(), str1.length())); item1.release(); // Appease clang-analyze "potential memory leak" Cache::Handle* handle; handle = cache->Lookup(k2.AsSlice(), nullptr, /*context*/ this, - Cache::Priority::LOW, true); + Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); - handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, false); + + handle = cache->Lookup(k2.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); + ASSERT_EQ(handle, nullptr); + + Cache::AsyncLookupHandle async_handle; + async_handle.key = k2.AsSlice(); + async_handle.helper = GetHelper(); + async_handle.create_context = this; + async_handle.priority = Cache::Priority::LOW; + cache->StartAsyncLookup(async_handle); + cache->Wait(async_handle); + handle = async_handle.Result(); ASSERT_EQ(handle, nullptr); cache.reset(); secondary_cache.reset(); } -TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) { - LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(BasicSecondaryCacheTest, SaveFailTest) { std::shared_ptr secondary_cache = - std::make_shared(2048); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::make_shared(2048, true); + std::shared_ptr cache = + NewCache(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert(k1.AsSlice(), item1, - &LRUCacheSecondaryCacheTest::helper_fail_, - str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelperFail(), str1.length())); std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM ASSERT_EQ(secondary_cache->num_inserts(), 0u); - ASSERT_OK(cache->Insert(k2.AsSlice(), item2, - &LRUCacheSecondaryCacheTest::helper_fail_, - str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelperFail(), str2.length())); ASSERT_EQ(secondary_cache->num_inserts(), 1u); Cache::Handle* handle; - handle = - cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, - /*context*/ this, 
Cache::Priority::LOW, true); + handle = cache->Lookup(k2.AsSlice(), GetHelperFail(), + /*context*/ this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 demotion would have failed - handle = - cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, - /*context*/ this, Cache::Priority::LOW, true); + handle = cache->Lookup(k1.AsSlice(), GetHelperFail(), + /*context*/ this, Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache - handle = - cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_fail_, - /*context*/ this, Cache::Priority::LOW, true); + handle = cache->Lookup(k2.AsSlice(), GetHelperFail(), + /*context*/ this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1299,43 +1373,37 @@ TEST_F(LRUCacheSecondaryCacheTest, SaveFailTest) { secondary_cache.reset(); } -TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) { - LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(BasicSecondaryCacheTest, CreateFailTest) { std::shared_ptr secondary_cache = - std::make_shared(2048); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::make_shared(2048, true); + std::shared_ptr cache = + NewCache(1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get()); Random rnd(301); std::string str1 = rnd.RandomString(1020); TestItem* item1 = new TestItem(str1.data(), str1.length()); - ASSERT_OK(cache->Insert(k1.AsSlice(), item1, - &LRUCacheSecondaryCacheTest::helper_, str1.length())); + ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelper(), str1.length())); std::string str2 = rnd.RandomString(1020); TestItem* item2 = new TestItem(str2.data(), str2.length()); // k1 should be demoted to NVM - ASSERT_OK(cache->Insert(k2.AsSlice(), item2, - &LRUCacheSecondaryCacheTest::helper_, str2.length())); + ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelper(), str2.length())); Cache::Handle* handle; SetFailCreate(true); - handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, true); + handle = cache->Lookup(k2.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); // This lookup should fail, since k1 creation would have failed - handle = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, true); + handle = cache->Lookup(k1.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); ASSERT_EQ(handle, nullptr); // Since k1 didn't get promoted, k2 should still be in cache - handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, true); + handle = cache->Lookup(k2.AsSlice(), GetHelper(), + /*context*/ this, Cache::Priority::LOW); ASSERT_NE(handle, nullptr); cache->Release(handle); ASSERT_EQ(secondary_cache->num_inserts(), 1u); @@ -1345,52 +1413,70 @@ TEST_F(LRUCacheSecondaryCacheTest, CreateFailTest) { secondary_cache.reset(); } -TEST_F(LRUCacheSecondaryCacheTest, 
FullCapacityTest) {
- LRUCacheOptions opts(1024 /* capacity */, 0 /* num_shard_bits */,
- true /* strict_capacity_limit */,
- 0.5 /* high_pri_pool_ratio */,
- nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
- kDontChargeCacheMetadata);
- std::shared_ptr<TestSecondaryCache> secondary_cache =
- std::make_shared<TestSecondaryCache>(2048);
- opts.secondary_cache = secondary_cache;
- std::shared_ptr<Cache> cache = NewLRUCache(opts);
- CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
- CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
-
- Random rnd(301);
- std::string str1 = rnd.RandomString(1020);
- TestItem* item1 = new TestItem(str1.data(), str1.length());
- ASSERT_OK(cache->Insert(k1.AsSlice(), item1,
- &LRUCacheSecondaryCacheTest::helper_, str1.length()));
- std::string str2 = rnd.RandomString(1020);
- TestItem* item2 = new TestItem(str2.data(), str2.length());
- // k1 should be demoted to NVM
- ASSERT_OK(cache->Insert(k2.AsSlice(), item2,
- &LRUCacheSecondaryCacheTest::helper_, str2.length()));
-
- Cache::Handle* handle;
- handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
- /*context*/ this, Cache::Priority::LOW, true);
- ASSERT_NE(handle, nullptr);
- // k1 promotion should fail due to the block cache being at capacity,
- // but the lookup should still succeed
- Cache::Handle* handle2;
- handle2 = cache->Lookup(k1.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
- /*context*/ this, Cache::Priority::LOW, true);
- ASSERT_NE(handle2, nullptr);
- // Since k1 didn't get inserted, k2 should still be in cache
- cache->Release(handle);
- cache->Release(handle2);
- handle = cache->Lookup(k2.AsSlice(), &LRUCacheSecondaryCacheTest::helper_,
- /*context*/ this, Cache::Priority::LOW, true);
- ASSERT_NE(handle, nullptr);
- cache->Release(handle);
- ASSERT_EQ(secondary_cache->num_inserts(), 1u);
- ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+TEST_P(BasicSecondaryCacheTest, FullCapacityTest) {
+ for (bool strict_capacity_limit : {false, true}) {
+ std::shared_ptr<TestSecondaryCache> secondary_cache =
+ std::make_shared<TestSecondaryCache>(2048, true);
+ std::shared_ptr<Cache> cache =
+ NewCache(1024 /* capacity */, 0 /* num_shard_bits */,
+ strict_capacity_limit, secondary_cache);
+ CacheKey k1 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+ CacheKey k2 = CacheKey::CreateUniqueForCacheLifetime(cache.get());
+
+ Random rnd(301);
+ std::string str1 = rnd.RandomString(1020);
+ TestItem* item1 = new TestItem(str1.data(), str1.length());
+ ASSERT_OK(cache->Insert(k1.AsSlice(), item1, GetHelper(), str1.length()));
+ std::string str2 = rnd.RandomString(1020);
+ TestItem* item2 = new TestItem(str2.data(), str2.length());
+ // k1 should be demoted to NVM
+ ASSERT_OK(cache->Insert(k2.AsSlice(), item2, GetHelper(), str2.length()));
+
+ Cache::Handle* handle2;
+ handle2 = cache->Lookup(k2.AsSlice(), GetHelper(),
+ /*context*/ this, Cache::Priority::LOW);
+ ASSERT_NE(handle2, nullptr);
+ // k1 lookup fails without secondary cache support
+ Cache::Handle* handle1;
+ handle1 = cache->Lookup(
+ k1.AsSlice(),
+ GetHelper(CacheEntryRole::kDataBlock, /*secondary_compatible=*/false),
+ /*context*/ this, Cache::Priority::LOW);
+ ASSERT_EQ(handle1, nullptr);
+
+ // k1 promotion can fail with strict_capacity_limit=true, but Lookup still
+ // succeeds using a standalone handle
+ handle1 = cache->Lookup(k1.AsSlice(), GetHelper(),
+ /*context*/ this, Cache::Priority::LOW);
+ ASSERT_NE(handle1, nullptr);
+
+ ASSERT_EQ(secondary_cache->num_inserts(), 1u);
+ ASSERT_EQ(secondary_cache->num_lookups(), 1u);
+
+ // Releasing k2's handle first, k2 is
evicted from primary iff k1 promotion + // was charged to the cache (except HCC doesn't erase in Release() over + // capacity) + // FIXME: Insert to secondary from Release disabled + cache->Release(handle2); + cache->Release(handle1); + handle2 = cache->Lookup( + k2.AsSlice(), + GetHelper(CacheEntryRole::kDataBlock, /*secondary_compatible=*/false), + /*context*/ this, Cache::Priority::LOW); + if (strict_capacity_limit || IsHyperClock()) { + ASSERT_NE(handle2, nullptr); + cache->Release(handle2); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + } else { + ASSERT_EQ(handle2, nullptr); + // FIXME: Insert to secondary from Release disabled + // ASSERT_EQ(secondary_cache->num_inserts(), 2u); + ASSERT_EQ(secondary_cache->num_inserts(), 1u); + } - cache.reset(); - secondary_cache.reset(); + cache.reset(); + secondary_cache.reset(); + } } // In this test, the block cache size is set to 4096, after insert 6 KV-pairs @@ -1399,16 +1485,24 @@ TEST_F(LRUCacheSecondaryCacheTest, FullCapacityTest) { // of the meta blocks are about 900 to 1000. Therefore, in any situation, // if we try to insert block_1 to the block cache, it will always fails. Only // block_2 will be successfully inserted into the block cache. -TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { - LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +// CORRECTION: this is not quite right. block_1 can be inserted into the block +// cache because strict_capacity_limit=false, but it is removed from the cache +// in Release() because of being over-capacity, without demoting to secondary +// cache. FixedHyperClockCache doesn't check capacity on release (for +// efficiency) so can demote the over-capacity item to secondary cache. Also, we +// intend to add support for demotion in Release, but that currently causes too +// much unit test churn. +TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { + if (IsHyperClock()) { + // See CORRECTION above + ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); + return; + } std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -1496,16 +1590,16 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness1) { // of the meta blocks are about 900 to 1000. 
Therefore, we can successfully // insert and cache block_1 in the block cache (this is the different place // from TestSecondaryCacheCorrectness1) -TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { - LRUCacheOptions opts(6100 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { + if (IsHyperClock()) { + ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); + return; + } std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(6100 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -1589,16 +1683,12 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheCorrectness2) { // of the meta blocks are about 900 to 1000. Therefore, we can successfully // cache all the blocks in the block cache and there is not secondary cache // insertion. 2 lookup is needed for the blocks. -TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { - LRUCacheOptions opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -1643,16 +1733,12 @@ TEST_F(DBSecondaryCacheTest, NoSecondaryCacheInsertion) { Destroy(options); } -TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { - LRUCacheOptions opts(8 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(8 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -1692,16 +1778,16 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheIntensiveTesting) { // of the meta blocks are about 900 to 1000. Therefore, in any situation, // if we try to insert block_1 to the block cache, it will always fails. Only // block_2 will be successfully inserted into the block cache. 
-TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { - LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, SecondaryCacheFailureTest) { + if (IsHyperClock()) { + ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); + return; + } std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -1784,53 +1870,12 @@ TEST_F(DBSecondaryCacheTest, SecondaryCacheFailureTest) { Destroy(options); } -TEST_F(DBSecondaryCacheTest, TestSecondaryWithCompressedCache) { - if (!Snappy_Supported()) { - ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support"); - return; - } - LRUCacheOptions opts(2000 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); - std::shared_ptr secondary_cache( - new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); - BlockBasedTableOptions table_options; - table_options.block_cache_compressed = cache; - table_options.no_block_cache = true; - table_options.block_size = 1234; - Options options = GetDefaultOptions(); - options.compression = kSnappyCompression; - options.create_if_missing = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - DestroyAndReopen(options); - Random rnd(301); - const int N = 6; - for (int i = 0; i < N; i++) { - // Partly compressible - std::string p_v = rnd.RandomString(507) + std::string(500, ' '); - ASSERT_OK(Put(Key(i), p_v)); - } - ASSERT_OK(Flush()); - for (int i = 0; i < 2 * N; i++) { - std::string v = Get(Key(i % N)); - ASSERT_EQ(1007, v.size()); - } -} - -TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { - LRUCacheOptions opts(1024 /* capacity */, 2 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(BasicSecondaryCacheTest, BasicWaitAllTest) { std::shared_ptr secondary_cache = std::make_shared(32 * 1024); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(1024 /* capacity */, 2 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); const int num_keys = 32; OffsetableCacheKey ock{"foo", "bar", 1}; @@ -1840,12 +1885,19 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { std::string str = rnd.RandomString(1020); values.emplace_back(str); TestItem* item = new TestItem(str.data(), str.length()); - ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item, - &LRUCacheSecondaryCacheTest::helper_, + ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), item, GetHelper(), str.length())); } // Force all entries to be evicted to the secondary cache - cache->SetCapacity(0); + if (IsHyperClock()) { + // HCC doesn't respond immediately to SetCapacity + for (int i = 9000; i < 9030; ++i) { + 
ASSERT_OK(cache->Insert(ock.WithOffset(i).AsSlice(), nullptr, + &kNoopCacheItemHelper, 256)); + } + } else { + cache->SetCapacity(0); + } ASSERT_EQ(secondary_cache->num_inserts(), 32u); cache->SetCapacity(32 * 1024); @@ -1856,24 +1908,31 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { TestSecondaryCache::ResultType::DEFER_AND_FAIL}, {ock.WithOffset(5).AsSlice().ToString(), TestSecondaryCache::ResultType::FAIL}}); - std::vector results; - for (int i = 0; i < 6; ++i) { - results.emplace_back(cache->Lookup( - ock.WithOffset(i).AsSlice(), &LRUCacheSecondaryCacheTest::helper_, - /*context*/ this, Cache::Priority::LOW, false)); - } - cache->WaitAll(results); - for (int i = 0; i < 6; ++i) { - if (i == 4) { - ASSERT_EQ(cache->Value(results[i]), nullptr); - } else if (i == 5) { - ASSERT_EQ(results[i], nullptr); + + std::array async_handles; + std::array cache_keys; + for (size_t i = 0; i < async_handles.size(); ++i) { + auto& ah = async_handles[i]; + cache_keys[i] = ock.WithOffset(i); + ah.key = cache_keys[i].AsSlice(); + ah.helper = GetHelper(); + ah.create_context = this; + ah.priority = Cache::Priority::LOW; + cache->StartAsyncLookup(ah); + } + cache->WaitAll(&async_handles[0], async_handles.size()); + for (size_t i = 0; i < async_handles.size(); ++i) { + SCOPED_TRACE("i = " + std::to_string(i)); + Cache::Handle* result = async_handles[i].Result(); + if (i == 4 || i == 5) { + ASSERT_EQ(result, nullptr); continue; } else { - TestItem* item = static_cast(cache->Value(results[i])); + ASSERT_NE(result, nullptr); + TestItem* item = static_cast(cache->Value(result)); ASSERT_EQ(item->ToString(), values[i]); } - cache->Release(results[i]); + cache->Release(result); } cache.reset(); @@ -1884,16 +1943,16 @@ TEST_F(LRUCacheSecondaryCacheTest, BasicWaitAllTest) { // the cache key associated with each data block (and thus each KV) by using // a sync point callback in TestSecondaryCache::Lookup. We then control the // lookup result by setting the ResultMap. 
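// Illustrative sketch, not part of the patch above: BasicWaitAllTest now uses
// the batched Cache::AsyncLookupHandle API (StartAsyncLookup / WaitAll /
// Result) instead of the old Lookup(..., wait=false) + WaitAll(handles)
// pattern. The helper below only recaps that usage as it appears in the diff;
// it assumes the same includes and fixture types as the test file (Cache,
// CacheKey, CacheItemHelper) and a cache that may have a secondary cache
// attached.
void LookupManyAsyncSketch(const std::shared_ptr<Cache>& cache,
                           const std::vector<CacheKey>& keys,
                           const Cache::CacheItemHelper* helper,
                           Cache::CreateContext* create_context) {
  // One pending handle per key; the fields mirror the Lookup() arguments.
  std::vector<Cache::AsyncLookupHandle> pending(keys.size());
  for (size_t i = 0; i < keys.size(); ++i) {
    pending[i].key = keys[i].AsSlice();
    pending[i].helper = helper;
    pending[i].create_context = create_context;
    pending[i].priority = Cache::Priority::LOW;
    // Starts the lookup; a secondary-cache read may be left pending.
    cache->StartAsyncLookup(pending[i]);
  }
  // Complete all outstanding secondary-cache reads with one batched wait.
  cache->WaitAll(pending.data(), pending.size());
  for (auto& ah : pending) {
    if (Cache::Handle* h = ah.Result()) {
      // Hit in the primary cache or promoted from the secondary cache.
      cache->Release(h);
    }
    // else: miss, or the secondary lookup was deferred and failed.
  }
}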
-TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { - LRUCacheOptions opts(1 << 20 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { + if (IsHyperClock()) { + ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); + return; + } std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(1 << 20 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -1967,37 +2026,26 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheMultiGet) { Destroy(options); } -class LRUCacheWithStat : public LRUCache { +class CacheWithStats : public CacheWrapper { public: - LRUCacheWithStat( - size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, - double _high_pri_pool_ratio, double _low_pri_pool_ratio, - std::shared_ptr _memory_allocator = nullptr, - bool _use_adaptive_mutex = kDefaultToAdaptiveMutex, - CacheMetadataChargePolicy _metadata_charge_policy = - kDontChargeCacheMetadata, - const std::shared_ptr& _secondary_cache = nullptr) - : LRUCache(_capacity, _num_shard_bits, _strict_capacity_limit, - _high_pri_pool_ratio, _low_pri_pool_ratio, _memory_allocator, - _use_adaptive_mutex, _metadata_charge_policy, - _secondary_cache) { - insert_count_ = 0; - lookup_count_ = 0; - } - ~LRUCacheWithStat() {} + using CacheWrapper::CacheWrapper; + + static const char* kClassName() { return "CacheWithStats"; } + const char* Name() const override { return kClassName(); } Status Insert(const Slice& key, Cache::ObjectPtr value, const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { + Handle** handle = nullptr, Priority priority = Priority::LOW, + const Slice& /*compressed*/ = Slice(), + CompressionType /*type*/ = kNoCompression) override { insert_count_++; - return LRUCache::Insert(key, value, helper, charge, handle, priority); + return target_->Insert(key, value, helper, charge, handle, priority); } Handle* Lookup(const Slice& key, const CacheItemHelper* helper, - CreateContext* create_context, Priority priority, bool wait, + CreateContext* create_context, Priority priority, Statistics* stats = nullptr) override { lookup_count_++; - return LRUCache::Lookup(key, helper, create_context, priority, wait, stats); + return target_->Lookup(key, helper, create_context, priority, stats); } uint32_t GetInsertCount() { return insert_count_; } @@ -2008,25 +2056,16 @@ class LRUCacheWithStat : public LRUCache { } private: - uint32_t insert_count_; - uint32_t lookup_count_; + uint32_t insert_count_ = 0; + uint32_t lookup_count_ = 0; }; -#ifndef ROCKSDB_LITE - -TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { - LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, - kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); - LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( - cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.low_pri_pool_ratio, 
cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, - cache_opts.secondary_cache); - std::shared_ptr cache(tmp_cache); +TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { + std::shared_ptr base_cache = + NewCache(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */); + std::shared_ptr cache = + std::make_shared(base_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -2054,15 +2093,15 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { // do th eread for all the key value pairs, so all the blocks should be in // cache - uint32_t start_insert = tmp_cache->GetInsertCount(); - uint32_t start_lookup = tmp_cache->GetLookupcount(); + uint32_t start_insert = cache->GetInsertCount(); + uint32_t start_lookup = cache->GetLookupcount(); std::string v; for (int i = 0; i < N; i++) { v = Get(Key(i)); ASSERT_EQ(v, value[i]); } - uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; - uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + uint32_t dump_insert = cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = cache->GetLookupcount() - start_lookup; ASSERT_EQ(63, static_cast(dump_insert)); // the insert in the block cache ASSERT_EQ(256, @@ -2091,16 +2130,12 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { // we have a new cache it is empty, then, before we do the Get, we do the // dumpload std::shared_ptr secondary_cache = - std::make_shared(2048 * 1024); - cache_opts.secondary_cache = secondary_cache; - tmp_cache = new LRUCacheWithStat( - cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, - cache_opts.secondary_cache); - std::shared_ptr cache_new(tmp_cache); - table_options.block_cache = cache_new; + std::make_shared(2048 * 1024, true); + // This time with secondary cache + base_cache = NewCache(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); + cache = std::make_shared(base_cache); + table_options.block_cache = cache; table_options.block_size = 4 * 1024; options.create_if_missing = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -2131,8 +2166,8 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { // After load, we do the Get again start_insert = secondary_cache->num_inserts(); start_lookup = secondary_cache->num_lookups(); - uint32_t cache_insert = tmp_cache->GetInsertCount(); - uint32_t cache_lookup = tmp_cache->GetLookupcount(); + uint32_t cache_insert = cache->GetInsertCount(); + uint32_t cache_lookup = cache->GetLookupcount(); for (int i = 0; i < N; i++) { v = Get(Key(i)); ASSERT_EQ(v, value[i]); @@ -2143,8 +2178,8 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { ASSERT_EQ(0, static_cast(final_insert)); // lookup the secondary to get all blocks ASSERT_EQ(64, static_cast(final_lookup)); - uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; - uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + uint32_t block_insert = cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = cache->GetLookupcount() - cache_lookup; // Check the new block cache insert and lookup, should be no insert since all // blocks are from the secondary cache. 
ASSERT_EQ(0, static_cast(block_insert)); @@ -2154,19 +2189,12 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadBasic) { Destroy(options); } -TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { - LRUCacheOptions cache_opts(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, - kDefaultToAdaptiveMutex, kDontChargeCacheMetadata); - LRUCacheWithStat* tmp_cache = new LRUCacheWithStat( - cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, - cache_opts.secondary_cache); - std::shared_ptr cache(tmp_cache); +TEST_P(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { + std::shared_ptr base_cache = + NewCache(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */); + std::shared_ptr cache = + std::make_shared(base_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -2216,8 +2244,8 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { // do th eread for all the key value pairs, so all the blocks should be in // cache - uint32_t start_insert = tmp_cache->GetInsertCount(); - uint32_t start_lookup = tmp_cache->GetLookupcount(); + uint32_t start_insert = cache->GetInsertCount(); + uint32_t start_lookup = cache->GetLookupcount(); ReadOptions ro; std::string v; for (int i = 0; i < N; i++) { @@ -2228,8 +2256,8 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { ASSERT_OK(db2->Get(ro, Key(i), &v)); ASSERT_EQ(v, value2[i]); } - uint32_t dump_insert = tmp_cache->GetInsertCount() - start_insert; - uint32_t dump_lookup = tmp_cache->GetLookupcount() - start_lookup; + uint32_t dump_insert = cache->GetInsertCount() - start_insert; + uint32_t dump_lookup = cache->GetLookupcount() - start_lookup; ASSERT_EQ(128, static_cast(dump_insert)); // the insert in the block cache ASSERT_EQ(512, @@ -2258,16 +2286,12 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { // we have a new cache it is empty, then, before we do the Get, we do the // dumpload std::shared_ptr secondary_cache = - std::make_shared(2048 * 1024); - cache_opts.secondary_cache = secondary_cache; - tmp_cache = new LRUCacheWithStat( - cache_opts.capacity, cache_opts.num_shard_bits, - cache_opts.strict_capacity_limit, cache_opts.high_pri_pool_ratio, - cache_opts.low_pri_pool_ratio, cache_opts.memory_allocator, - cache_opts.use_adaptive_mutex, cache_opts.metadata_charge_policy, - cache_opts.secondary_cache); - std::shared_ptr cache_new(tmp_cache); - table_options.block_cache = cache_new; + std::make_shared(2048 * 1024, true); + // This time with secondary_cache + base_cache = NewCache(1024 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); + cache = std::make_shared(base_cache); + table_options.block_cache = cache; table_options.block_size = 4 * 1024; options.create_if_missing = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -2303,8 +2327,8 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { fault_fs_->SetFilesystemActive(false, error_msg); start_insert = secondary_cache->num_inserts(); start_lookup = secondary_cache->num_lookups(); - uint32_t cache_insert = tmp_cache->GetInsertCount(); - uint32_t cache_lookup = tmp_cache->GetLookupcount(); + 
uint32_t cache_insert = cache->GetInsertCount(); + uint32_t cache_lookup = cache->GetLookupcount(); for (int i = 0; i < N; i++) { ASSERT_OK(db1->Get(ro, Key(i), &v)); ASSERT_EQ(v, value1[i]); @@ -2315,8 +2339,8 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { ASSERT_EQ(0, static_cast(final_insert)); // lookup the secondary to get all blocks ASSERT_EQ(64, static_cast(final_lookup)); - uint32_t block_insert = tmp_cache->GetInsertCount() - cache_insert; - uint32_t block_lookup = tmp_cache->GetLookupcount() - cache_lookup; + uint32_t block_insert = cache->GetInsertCount() - cache_insert; + uint32_t block_lookup = cache->GetLookupcount() - cache_lookup; // Check the new block cache insert and lookup, should be no insert since all // blocks are from the secondary cache. ASSERT_EQ(0, static_cast(block_insert)); @@ -2330,16 +2354,12 @@ TEST_F(DBSecondaryCacheTest, LRUCacheDumpLoadWithFilter) { } // Test the option not to use the secondary cache in a certain DB. -TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { - LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -2425,16 +2445,16 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionBasic) { // We disable the secondary cache in DBOptions at first. Close and reopen the DB // with new options, which set the lowest_used_cache_tier to // kNonVolatileBlockTier. So secondary cache will be used. -TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { - LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { + if (IsHyperClock()) { + ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); + return; + } std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -2520,16 +2540,16 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionChange) { // Two DB test. We create 2 DBs sharing the same block cache and secondary // cache. We diable the secondary cache option for DB2. 
-TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { - LRUCacheOptions opts(4 * 1024 /* capacity */, 0 /* num_shard_bits */, - false /* strict_capacity_limit */, - 0.5 /* high_pri_pool_ratio */, - nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, - kDontChargeCacheMetadata); +TEST_P(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { + if (IsHyperClock()) { + ROCKSDB_GTEST_BYPASS("Test depends on LRUCache-specific behaviors"); + return; + } std::shared_ptr secondary_cache( new TestSecondaryCache(2048 * 1024)); - opts.secondary_cache = secondary_cache; - std::shared_ptr cache = NewLRUCache(opts); + std::shared_ptr cache = + NewCache(4 * 1024 /* capacity */, 0 /* num_shard_bits */, + false /* strict_capacity_limit */, secondary_cache); BlockBasedTableOptions table_options; table_options.block_cache = cache; table_options.block_size = 4 * 1024; @@ -2627,8 +2647,6 @@ TEST_F(DBSecondaryCacheTest, TestSecondaryCacheOptionTwoDB) { ASSERT_OK(DestroyDB(dbname2, options)); } -#endif // ROCKSDB_LITE - } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/cache/secondary_cache.cc b/cache/secondary_cache.cc index eb4972f8f016..4439869f19fb 100644 --- a/cache/secondary_cache.cc +++ b/cache/secondary_cache.cc @@ -9,33 +9,4 @@ namespace ROCKSDB_NAMESPACE { -namespace { - -void NoopDelete(Cache::ObjectPtr, MemoryAllocator*) {} - -size_t SliceSize(Cache::ObjectPtr obj) { - return static_cast(obj)->size(); -} - -Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, size_t length, - char* out) { - const Slice& slice = *static_cast(from_obj); - std::memcpy(out, slice.data() + from_offset, length); - return Status::OK(); -} - -Status FailCreate(const Slice&, Cache::CreateContext*, MemoryAllocator*, - Cache::ObjectPtr*, size_t*) { - return Status::NotSupported("Only for dumping data into SecondaryCache"); -} - -} // namespace - -Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) { - static Cache::CacheItemHelper helper{CacheEntryRole::kMisc, &NoopDelete, - &SliceSize, &SliceSaveTo, &FailCreate}; - // NOTE: depends on Insert() being synchronous, not keeping pointer `&saved` - return Insert(key, const_cast(&saved), &helper); -} - } // namespace ROCKSDB_NAMESPACE diff --git a/cache/secondary_cache_adapter.cc b/cache/secondary_cache_adapter.cc new file mode 100644 index 000000000000..b36f3a381b27 --- /dev/null +++ b/cache/secondary_cache_adapter.cc @@ -0,0 +1,737 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/secondary_cache_adapter.h" + +#include + +#include "cache/tiered_secondary_cache.h" +#include "monitoring/perf_context_imp.h" +#include "test_util/sync_point.h" +#include "util/cast_util.h" + +namespace ROCKSDB_NAMESPACE { + +namespace { +// A distinct pointer value for marking "dummy" cache entries +struct Dummy { + char val[7] = "kDummy"; +}; +const Dummy kDummy{}; +Cache::ObjectPtr const kDummyObj = const_cast(&kDummy); +const char* kTieredCacheName = "TieredCache"; +} // namespace + +// When CacheWithSecondaryAdapter is constructed with the distribute_cache_res +// parameter set to true, it manages the entire memory budget across the +// primary and secondary cache. The secondary cache is assumed to be in +// memory, such as the CompressedSecondaryCache. 
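The distribute_cache_res mode described in this comment is normally reached through the NewTieredCache() factory added later in this patch rather than by constructing the adapter directly; the comment continues below with the reservation-distribution details. A rough construction sketch, using only the TieredCacheOptions fields this patch itself exercises (capacities and ratio are illustrative):

  // Sketch: building a tiered block cache whose budget is shared between the
  // primary (uncompressed) tier and a compressed secondary tier.
  LRUCacheOptions primary_opts;  // per-tier tuning for the primary cache
  TieredCacheOptions tiered_opts;
  tiered_opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
  tiered_opts.cache_opts = &primary_opts;
  tiered_opts.total_capacity = 100 << 20;        // combined budget (~100 MiB)
  tiered_opts.compressed_secondary_ratio = 0.3;  // 30% of the budget for the compressed tier
  std::shared_ptr<Cache> block_cache = NewTieredCache(tiered_opts);
  // Internally this yields a CacheWithSecondaryAdapter with distribute_cache_res=true.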
When a placeholder entry +// is inserted by a CacheReservationManager instance to reserve memory, +// the CacheWithSecondaryAdapter ensures that the reservation is distributed +// proportionally across the primary/secondary caches. +// +// The primary block cache is initially sized to the sum of the primary cache +// budget + the secondary cache budget, as follows - +// |--------- Primary Cache Configured Capacity -----------| +// |---Secondary Cache Budget----|----Primary Cache Budget-----| +// +// A ConcurrentCacheReservationManager member in the CacheWithSecondaryAdapter, +// pri_cache_res_, +// is used to help with tracking the distribution of memory reservations. +// Initially, it accounts for the entire secondary cache budget as a +// reservation against the primary cache. This shrinks the usable capacity of +// the primary cache to the budget that the user originally desired. +// +// |--Reservation for Sec Cache--|-Pri Cache Usable Capacity---| +// +// When a reservation placeholder is inserted into the adapter, it is inserted +// directly into the primary cache. This means the entire charge of the +// placeholder is counted against the primary cache. To compensate and count +// a portion of it against the secondary cache, the secondary cache Deflate() +// method is called to shrink it. Since the Deflate() causes the secondary +// actual usage to shrink, it is reflected here by releasing an equal amount +// from the pri_cache_res_ reservation. The Deflate() in the secondary cache +// can be, but is not required to be, implemented using its own cache +// reservation manager. +// +// For example, if the pri/sec ratio is 70/30, and the combined capacity is +// 100MB, the intermediate and final state after inserting a reservation +// placeholder for 10MB would be as follows - +// +// |-Reservation for Sec Cache-|-Pri Cache Usable Capacity-|---R---| +// 1. After inserting the placeholder in primary +// |------- 30MB -------------|------- 60MB -------------|-10MB--| +// 2. After deflating the secondary and adjusting the reservation for +// secondary against the primary +// |------- 27MB -------------|------- 63MB -------------|-10MB--| +// +// Likewise, when the user inserted placeholder is released, the secondary +// cache Inflate() method is called to grow it, and the pri_cache_res_ +// reservation is increased by an equal amount. +// +// Another way of implementing this would have been to simply split the user +// reservation into primary and secondary components. However, this would +// require allocating a structure to track the associated secondary cache +// reservation, which adds some complexity and overhead. +// +CacheWithSecondaryAdapter::CacheWithSecondaryAdapter( + std::shared_ptr target, + std::shared_ptr secondary_cache, + TieredAdmissionPolicy adm_policy, bool distribute_cache_res) + : CacheWrapper(std::move(target)), + secondary_cache_(std::move(secondary_cache)), + adm_policy_(adm_policy), + distribute_cache_res_(distribute_cache_res), + placeholder_usage_(0), + reserved_usage_(0), + sec_reserved_(0) { + target_->SetEvictionCallback( + [this](const Slice& key, Handle* handle, bool was_hit) { + return EvictionHandler(key, handle, was_hit); + }); + if (distribute_cache_res_) { + size_t sec_capacity = 0; + pri_cache_res_ = std::make_shared( + std::make_shared>( + target_)); + Status s = secondary_cache_->GetCapacity(sec_capacity); + assert(s.ok()); + // Initially, the primary cache is sized to uncompressed cache budget plus + // compressed secondary cache budget.
The secondary cache budget is then + // taken away from the primary cache through cache reservations. Later, + // when a placeholder entry is inserted by the caller, its inserted + // into the primary cache and the portion that should be assigned to the + // secondary cache is freed from the reservation. + s = pri_cache_res_->UpdateCacheReservation(sec_capacity); + assert(s.ok()); + sec_cache_res_ratio_ = (double)sec_capacity / target_->GetCapacity(); + } +} + +CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() { + // `*this` will be destroyed before `*target_`, so we have to prevent + // use after free + target_->SetEvictionCallback({}); +#ifndef NDEBUG + if (distribute_cache_res_) { + size_t sec_capacity = 0; + Status s = secondary_cache_->GetCapacity(sec_capacity); + assert(s.ok()); + assert(placeholder_usage_ == 0); + assert(reserved_usage_ == 0); + assert(pri_cache_res_->GetTotalMemoryUsed() == sec_capacity); + } +#endif // NDEBUG +} + +bool CacheWithSecondaryAdapter::EvictionHandler(const Slice& key, + Handle* handle, bool was_hit) { + auto helper = GetCacheItemHelper(handle); + if (helper->IsSecondaryCacheCompatible() && + adm_policy_ != TieredAdmissionPolicy::kAdmPolicyThreeQueue) { + auto obj = target_->Value(handle); + // Ignore dummy entry + if (obj != kDummyObj) { + bool hit = false; + if (adm_policy_ == TieredAdmissionPolicy::kAdmPolicyAllowCacheHits) { + hit = was_hit; + } + // Spill into secondary cache. + secondary_cache_->Insert(key, obj, helper, hit).PermitUncheckedError(); + } + } + // Never takes ownership of obj + return false; +} + +bool CacheWithSecondaryAdapter::ProcessDummyResult(Cache::Handle** handle, + bool erase) { + if (*handle && target_->Value(*handle) == kDummyObj) { + target_->Release(*handle, erase); + *handle = nullptr; + return true; + } else { + return false; + } +} + +void CacheWithSecondaryAdapter::CleanupCacheObject( + ObjectPtr obj, const CacheItemHelper* helper) { + if (helper->del_cb) { + helper->del_cb(obj, memory_allocator()); + } +} + +Cache::Handle* CacheWithSecondaryAdapter::Promote( + std::unique_ptr&& secondary_handle, + const Slice& key, const CacheItemHelper* helper, Priority priority, + Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache) { + assert(secondary_handle->IsReady()); + + ObjectPtr obj = secondary_handle->Value(); + if (!obj) { + // Nothing found. + return nullptr; + } + // Found something. + switch (helper->role) { + case CacheEntryRole::kFilterBlock: + RecordTick(stats, SECONDARY_CACHE_FILTER_HITS); + break; + case CacheEntryRole::kIndexBlock: + RecordTick(stats, SECONDARY_CACHE_INDEX_HITS); + break; + case CacheEntryRole::kDataBlock: + RecordTick(stats, SECONDARY_CACHE_DATA_HITS); + break; + default: + break; + } + PERF_COUNTER_ADD(secondary_cache_hit_count, 1); + RecordTick(stats, SECONDARY_CACHE_HITS); + + // Note: SecondaryCache::Size() is really charge (from the CreateCallback) + size_t charge = secondary_handle->Size(); + Handle* result = nullptr; + // Insert into primary cache, possibly as a standalone+dummy entries. + if (secondary_cache_->SupportForceErase() && !found_dummy_entry) { + // Create standalone and insert dummy + // Allow standalone to be created even if cache is full, to avoid + // reading the entry from storage. 
+ result = + CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true); + assert(result); + PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1); + + // Insert dummy to record recent use + // TODO: try to avoid case where inserting this dummy could overwrite a + // regular entry + Status s = Insert(key, kDummyObj, &kNoopCacheItemHelper, /*charge=*/0, + /*handle=*/nullptr, priority); + s.PermitUncheckedError(); + // Nothing to do or clean up on dummy insertion failure + } else { + // Insert regular entry into primary cache. + // Don't allow it to spill into secondary cache again if it was kept there. + Status s = Insert( + key, obj, kept_in_sec_cache ? helper->without_secondary_compat : helper, + charge, &result, priority); + if (s.ok()) { + assert(result); + PERF_COUNTER_ADD(block_cache_real_handle_count, 1); + } else { + // Create standalone result instead, even if cache is full, to avoid + // reading the entry from storage. + result = + CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true); + assert(result); + PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1); + } + } + return result; +} + +Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value, + const CacheItemHelper* helper, + size_t charge, Handle** handle, + Priority priority, + const Slice& compressed_value, + CompressionType type) { + Status s = target_->Insert(key, value, helper, charge, handle, priority); + if (s.ok() && value == nullptr && distribute_cache_res_ && handle) { + charge = target_->GetCharge(*handle); + + MutexLock l(&cache_res_mutex_); + placeholder_usage_ += charge; + // Check if total placeholder reservation is more than the overall + // cache capacity. If it is, then we don't try to charge the + // secondary cache because we don't want to overcharge it (beyond + // its capacity). + // In order to make this a bit more lightweight, we also check if + // the difference between placeholder_usage_ and reserved_usage_ is + // atleast kReservationChunkSize and avoid any adjustments if not. + if ((placeholder_usage_ <= target_->GetCapacity()) && + ((placeholder_usage_ - reserved_usage_) >= kReservationChunkSize)) { + reserved_usage_ = placeholder_usage_ & ~(kReservationChunkSize - 1); + size_t new_sec_reserved = + static_cast(reserved_usage_ * sec_cache_res_ratio_); + size_t sec_charge = new_sec_reserved - sec_reserved_; + s = secondary_cache_->Deflate(sec_charge); + assert(s.ok()); + s = pri_cache_res_->UpdateCacheReservation(sec_charge, + /*increase=*/false); + assert(s.ok()); + sec_reserved_ += sec_charge; + } + } + // Warm up the secondary cache with the compressed block. The secondary + // cache may choose to ignore it based on the admission policy. 
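To make the chunked reservation bookkeeping in Insert() above concrete, the arithmetic for one placeholder insertion at a 70/30 primary/secondary split looks like this; a standalone sketch of the same math, not the adapter code itself:

  // Illustrative numbers; kReservationChunkSize is 1 << 20 in the header added below.
  constexpr size_t kChunk = size_t{1} << 20;
  size_t placeholder_usage = 3 * kChunk + 123456;             // total placeholder charge so far
  size_t reserved_usage = placeholder_usage & ~(kChunk - 1);  // rounded down to 3 MiB
  double sec_ratio = 0.3;                                     // sec_cache_res_ratio_ for a 70/30 split
  size_t new_sec_reserved =
      static_cast<size_t>(reserved_usage * sec_ratio);        // ~0.9 MiB charged to the secondary tier
  // The adapter then calls Deflate(new_sec_reserved - sec_reserved_) on the secondary
  // cache and releases the same amount from pri_cache_res_, so the placeholder charge
  // ends up split roughly 70/30 between the two tiers.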
+ if (value != nullptr && !compressed_value.empty() && + adm_policy_ == TieredAdmissionPolicy::kAdmPolicyThreeQueue) { + Status status = secondary_cache_->InsertSaved(key, compressed_value, type); + assert(status.ok() || status.IsNotSupported()); + } + + return s; +} + +Cache::Handle* CacheWithSecondaryAdapter::Lookup(const Slice& key, + const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority, + Statistics* stats) { + // NOTE: we could just StartAsyncLookup() and Wait(), but this should be a bit + // more efficient + Handle* result = + target_->Lookup(key, helper, create_context, priority, stats); + bool secondary_compatible = helper && helper->IsSecondaryCacheCompatible(); + bool found_dummy_entry = + ProcessDummyResult(&result, /*erase=*/secondary_compatible); + if (!result && secondary_compatible) { + // Try our secondary cache + bool kept_in_sec_cache = false; + std::unique_ptr secondary_handle = + secondary_cache_->Lookup(key, helper, create_context, /*wait*/ true, + found_dummy_entry, /*out*/ kept_in_sec_cache); + if (secondary_handle) { + result = Promote(std::move(secondary_handle), key, helper, priority, + stats, found_dummy_entry, kept_in_sec_cache); + } + } + return result; +} + +bool CacheWithSecondaryAdapter::Release(Handle* handle, + bool erase_if_last_ref) { + if (erase_if_last_ref) { + ObjectPtr v = target_->Value(handle); + if (v == nullptr && distribute_cache_res_) { + size_t charge = target_->GetCharge(handle); + + MutexLock l(&cache_res_mutex_); + placeholder_usage_ -= charge; + // Check if total placeholder reservation is more than the overall + // cache capacity. If it is, then we do nothing as reserved_usage_ must + // be already maxed out + if ((placeholder_usage_ <= target_->GetCapacity()) && + (placeholder_usage_ < reserved_usage_)) { + // Adjust reserved_usage_ in chunks of kReservationChunkSize, so + // we don't hit this slow path too often. + reserved_usage_ = placeholder_usage_ & ~(kReservationChunkSize - 1); + size_t new_sec_reserved = + static_cast(reserved_usage_ * sec_cache_res_ratio_); + size_t sec_charge = sec_reserved_ - new_sec_reserved; + Status s = secondary_cache_->Inflate(sec_charge); + assert(s.ok()); + s = pri_cache_res_->UpdateCacheReservation(sec_charge, + /*increase=*/true); + assert(s.ok()); + sec_reserved_ -= sec_charge; + } + } + } + return target_->Release(handle, erase_if_last_ref); +} + +Cache::ObjectPtr CacheWithSecondaryAdapter::Value(Handle* handle) { + ObjectPtr v = target_->Value(handle); + // TODO with stacked secondaries: might fail in EvictionHandler + assert(v != kDummyObj); + return v; +} + +void CacheWithSecondaryAdapter::StartAsyncLookupOnMySecondary( + AsyncLookupHandle& async_handle) { + assert(!async_handle.IsPending()); + assert(async_handle.result_handle == nullptr); + + std::unique_ptr secondary_handle = + secondary_cache_->Lookup(async_handle.key, async_handle.helper, + async_handle.create_context, /*wait*/ false, + async_handle.found_dummy_entry, + /*out*/ async_handle.kept_in_sec_cache); + if (secondary_handle) { + // TODO with stacked secondaries: Check & process if already ready? 
+ async_handle.pending_handle = secondary_handle.release(); + async_handle.pending_cache = secondary_cache_.get(); + } +} + +void CacheWithSecondaryAdapter::StartAsyncLookup( + AsyncLookupHandle& async_handle) { + target_->StartAsyncLookup(async_handle); + if (!async_handle.IsPending()) { + bool secondary_compatible = + async_handle.helper && + async_handle.helper->IsSecondaryCacheCompatible(); + async_handle.found_dummy_entry |= ProcessDummyResult( + &async_handle.result_handle, /*erase=*/secondary_compatible); + + if (async_handle.Result() == nullptr && secondary_compatible) { + // Not found and not pending on another secondary cache + StartAsyncLookupOnMySecondary(async_handle); + } + } +} + +void CacheWithSecondaryAdapter::WaitAll(AsyncLookupHandle* async_handles, + size_t count) { + if (count == 0) { + // Nothing to do + return; + } + // Requests that are pending on *my* secondary cache, at the start of this + // function + std::vector my_pending; + // Requests that are pending on an "inner" secondary cache (managed somewhere + // under target_), as of the start of this function + std::vector inner_pending; + + // Initial accounting of pending handles, excluding those already handled + // by "outer" secondary caches. (See cur->pending_cache = nullptr.) + for (size_t i = 0; i < count; ++i) { + AsyncLookupHandle* cur = async_handles + i; + if (cur->pending_cache) { + assert(cur->IsPending()); + assert(cur->helper); + assert(cur->helper->IsSecondaryCacheCompatible()); + if (cur->pending_cache == secondary_cache_.get()) { + my_pending.push_back(cur); + // Mark as "to be handled by this caller" + cur->pending_cache = nullptr; + } else { + // Remember as potentially needing a lookup in my secondary + inner_pending.push_back(cur); + } + } + } + + // Wait on inner-most cache lookups first + // TODO with stacked secondaries: because we are not using proper + // async/await constructs here yet, there is a false synchronization point + // here where all the results at one level are needed before initiating + // any lookups at the next level. Probably not a big deal, but worth noting. + if (!inner_pending.empty()) { + target_->WaitAll(async_handles, count); + } + + // For those that failed to find something, convert to lookup in my + // secondary cache. 
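For orientation, the caller-side pattern that drives StartAsyncLookup()/WaitAll() looks roughly as follows; this is a hedged sketch based on the AsyncLookupHandle fields referenced above (key, helper, create_context, Result()), not code from this patch:

  std::array<Cache::AsyncLookupHandle, 2> handles;
  handles[0].key = key0;
  handles[0].helper = helper;
  handles[0].create_context = create_ctx;
  handles[1].key = key1;
  handles[1].helper = helper;
  handles[1].create_context = create_ctx;
  for (auto& h : handles) {
    cache->StartAsyncLookup(h);  // may hit the primary cache or leave the handle pending
  }
  cache->WaitAll(handles.data(), handles.size());  // drains inner and outer secondary caches
  for (auto& h : handles) {
    Cache::Handle* hit = h.Result();  // nullptr if the block missed in every tier
    // ... use the entry, then cache->Release(hit) ...
  }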
+ for (AsyncLookupHandle* cur : inner_pending) { + if (cur->Result() == nullptr) { + // Not found, try my secondary + StartAsyncLookupOnMySecondary(*cur); + if (cur->IsPending()) { + assert(cur->pending_cache == secondary_cache_.get()); + my_pending.push_back(cur); + // Mark as "to be handled by this caller" + cur->pending_cache = nullptr; + } + } + } + + // Wait on all lookups on my secondary cache + { + std::vector my_secondary_handles; + for (AsyncLookupHandle* cur : my_pending) { + my_secondary_handles.push_back(cur->pending_handle); + } + secondary_cache_->WaitAll(std::move(my_secondary_handles)); + } + + // Process results + for (AsyncLookupHandle* cur : my_pending) { + std::unique_ptr secondary_handle( + cur->pending_handle); + cur->pending_handle = nullptr; + cur->result_handle = Promote( + std::move(secondary_handle), cur->key, cur->helper, cur->priority, + cur->stats, cur->found_dummy_entry, cur->kept_in_sec_cache); + assert(cur->pending_cache == nullptr); + } +} + +std::string CacheWithSecondaryAdapter::GetPrintableOptions() const { + std::string str = target_->GetPrintableOptions(); + str.append(" secondary_cache:\n"); + str.append(secondary_cache_->GetPrintableOptions()); + return str; +} + +const char* CacheWithSecondaryAdapter::Name() const { + if (distribute_cache_res_) { + return kTieredCacheName; + } else { + // To the user, at least for now, configure the underlying cache with + // a secondary cache. So we pretend to be that cache + return target_->Name(); + } +} + +// Update the total cache capacity. If we're distributing cache reservations +// to both primary and secondary, then update the pri_cache_res_reservation +// as well. At the moment, we don't have a good way of handling the case +// where the new capacity < total cache reservations. +void CacheWithSecondaryAdapter::SetCapacity(size_t capacity) { + size_t sec_capacity = static_cast( + capacity * (distribute_cache_res_ ? sec_cache_res_ratio_ : 0.0)); + size_t old_sec_capacity = 0; + + if (distribute_cache_res_) { + MutexLock m(&cache_res_mutex_); + + Status s = secondary_cache_->GetCapacity(old_sec_capacity); + if (!s.ok()) { + return; + } + if (old_sec_capacity > sec_capacity) { + // We're shrinking the cache. We do things in the following order to + // avoid a temporary spike in usage over the configured capacity - + // 1. Lower the secondary cache capacity + // 2. Credit an equal amount (by decreasing pri_cache_res_) to the + // primary cache + // 3. Decrease the primary cache capacity to the total budget + s = secondary_cache_->SetCapacity(sec_capacity); + if (s.ok()) { + if (placeholder_usage_ > capacity) { + // Adjust reserved_usage_ down + reserved_usage_ = capacity & ~(kReservationChunkSize - 1); + } + size_t new_sec_reserved = + static_cast(reserved_usage_ * sec_cache_res_ratio_); + s = pri_cache_res_->UpdateCacheReservation( + (old_sec_capacity - sec_capacity) - + (sec_reserved_ - new_sec_reserved), + /*increase=*/false); + sec_reserved_ = new_sec_reserved; + assert(s.ok()); + target_->SetCapacity(capacity); + } + } else { + // We're expanding the cache. Do it in the following order to avoid + // unnecessary evictions - + // 1. Increase the primary cache capacity to total budget + // 2. Reserve additional memory in primary on behalf of secondary (by + // increasing pri_cache_res_ reservation) + // 3. 
Increase secondary cache capacity + target_->SetCapacity(capacity); + s = pri_cache_res_->UpdateCacheReservation( + sec_capacity - old_sec_capacity, + /*increase=*/true); + assert(s.ok()); + s = secondary_cache_->SetCapacity(sec_capacity); + assert(s.ok()); + } + } else { + // No cache reservation distribution. Just set the primary cache capacity. + target_->SetCapacity(capacity); + } +} + +Status CacheWithSecondaryAdapter::GetSecondaryCacheCapacity( + size_t& size) const { + return secondary_cache_->GetCapacity(size); +} + +Status CacheWithSecondaryAdapter::GetSecondaryCachePinnedUsage( + size_t& size) const { + Status s; + if (distribute_cache_res_) { + MutexLock m(&cache_res_mutex_); + size_t capacity = 0; + s = secondary_cache_->GetCapacity(capacity); + if (s.ok()) { + size = capacity - pri_cache_res_->GetTotalMemoryUsed(); + } else { + size = 0; + } + } else { + size = 0; + } + return s; +} + +// Update the secondary/primary allocation ratio (remember, the primary +// capacity is the total memory budget when distribute_cache_res_ is true). +// When the ratio changes, we may accumulate some error in the calculations +// for secondary cache inflate/deflate and pri_cache_res_ reservations. +// This is due to the rounding of the reservation amount. +// +// We rely on the current pri_cache_res_ total memory used to estimate the +// new secondary cache reservation after the ratio change. For this reason, +// once the ratio is lowered to 0.0 (effectively disabling the secondary +// cache and pri_cache_res_ total mem used going down to 0), we cannot +// increase the ratio and re-enable it, We might remove this limitation +// in the future. +Status CacheWithSecondaryAdapter::UpdateCacheReservationRatio( + double compressed_secondary_ratio) { + if (!distribute_cache_res_) { + return Status::NotSupported(); + } + + MutexLock m(&cache_res_mutex_); + size_t pri_capacity = target_->GetCapacity(); + size_t sec_capacity = + static_cast(pri_capacity * compressed_secondary_ratio); + size_t old_sec_capacity; + Status s = secondary_cache_->GetCapacity(old_sec_capacity); + if (!s.ok()) { + return s; + } + + // Calculate the new secondary cache reservation + // reserved_usage_ will never be > the cache capacity, so we don't + // have to worry about adjusting it here. + sec_cache_res_ratio_ = compressed_secondary_ratio; + size_t new_sec_reserved = + static_cast(reserved_usage_ * sec_cache_res_ratio_); + if (sec_capacity > old_sec_capacity) { + // We're increasing the ratio, thus ending up with a larger secondary + // cache and a smaller usable primary cache capacity. Similar to + // SetCapacity(), we try to avoid a temporary increase in total usage + // beyond the configured capacity - + // 1. A higher secondary cache ratio means it gets a higher share of + // cache reservations. So first account for that by deflating the + // secondary cache + // 2. Increase pri_cache_res_ reservation to reflect the new secondary + // cache utilization (increase in capacity - increase in share of cache + // reservation) + // 3. Increase secondary cache capacity + s = secondary_cache_->Deflate(new_sec_reserved - sec_reserved_); + assert(s.ok()); + s = pri_cache_res_->UpdateCacheReservation( + (sec_capacity - old_sec_capacity) - (new_sec_reserved - sec_reserved_), + /*increase=*/true); + assert(s.ok()); + sec_reserved_ = new_sec_reserved; + s = secondary_cache_->SetCapacity(sec_capacity); + assert(s.ok()); + } else { + // We're shrinking the ratio. Try to avoid unnecessary evictions - + // 1. 
Lower the secondary cache capacity + // 2. Decrease pri_cache_res_ reservation to relect lower secondary + // cache utilization (decrease in capacity - decrease in share of cache + // reservations) + // 3. Inflate the secondary cache to give it back the reduction in its + // share of cache reservations + s = secondary_cache_->SetCapacity(sec_capacity); + if (s.ok()) { + s = pri_cache_res_->UpdateCacheReservation( + (old_sec_capacity - sec_capacity) - + (sec_reserved_ - new_sec_reserved), + /*increase=*/false); + assert(s.ok()); + s = secondary_cache_->Inflate(sec_reserved_ - new_sec_reserved); + assert(s.ok()); + sec_reserved_ = new_sec_reserved; + } + } + + return s; +} + +Status CacheWithSecondaryAdapter::UpdateAdmissionPolicy( + TieredAdmissionPolicy adm_policy) { + adm_policy_ = adm_policy; + return Status::OK(); +} + +std::shared_ptr NewTieredCache(const TieredCacheOptions& _opts) { + if (!_opts.cache_opts) { + return nullptr; + } + + TieredCacheOptions opts = _opts; + { + bool valid_adm_policy = true; + + switch (_opts.adm_policy) { + case TieredAdmissionPolicy::kAdmPolicyAuto: + // Select an appropriate default policy + if (opts.adm_policy == TieredAdmissionPolicy::kAdmPolicyAuto) { + if (opts.nvm_sec_cache) { + opts.adm_policy = TieredAdmissionPolicy::kAdmPolicyThreeQueue; + } else { + opts.adm_policy = TieredAdmissionPolicy::kAdmPolicyPlaceholder; + } + } + break; + case TieredAdmissionPolicy::kAdmPolicyPlaceholder: + case TieredAdmissionPolicy::kAdmPolicyAllowCacheHits: + if (opts.nvm_sec_cache) { + valid_adm_policy = false; + } + break; + case TieredAdmissionPolicy::kAdmPolicyThreeQueue: + if (!opts.nvm_sec_cache) { + valid_adm_policy = false; + } + break; + default: + valid_adm_policy = false; + } + if (!valid_adm_policy) { + return nullptr; + } + } + + std::shared_ptr cache; + if (opts.cache_type == PrimaryCacheType::kCacheTypeLRU) { + LRUCacheOptions cache_opts = + *(static_cast_with_check( + opts.cache_opts)); + cache_opts.capacity = opts.total_capacity; + cache = cache_opts.MakeSharedCache(); + } else if (opts.cache_type == PrimaryCacheType::kCacheTypeHCC) { + HyperClockCacheOptions cache_opts = + *(static_cast_with_check( + opts.cache_opts)); + cache_opts.capacity = opts.total_capacity; + cache = cache_opts.MakeSharedCache(); + } else { + return nullptr; + } + std::shared_ptr sec_cache; + opts.comp_cache_opts.capacity = static_cast( + opts.total_capacity * opts.compressed_secondary_ratio); + sec_cache = NewCompressedSecondaryCache(opts.comp_cache_opts); + + if (opts.nvm_sec_cache) { + if (opts.adm_policy == TieredAdmissionPolicy::kAdmPolicyThreeQueue) { + sec_cache = std::make_shared( + sec_cache, opts.nvm_sec_cache, + TieredAdmissionPolicy::kAdmPolicyThreeQueue); + } else { + return nullptr; + } + } + + return std::make_shared( + cache, sec_cache, opts.adm_policy, /*distribute_cache_res=*/true); +} + +Status UpdateTieredCache(const std::shared_ptr& cache, + int64_t total_capacity, + double compressed_secondary_ratio, + TieredAdmissionPolicy adm_policy) { + if (!cache || strcmp(cache->Name(), kTieredCacheName)) { + return Status::InvalidArgument(); + } + CacheWithSecondaryAdapter* tiered_cache = + static_cast(cache.get()); + + Status s; + if (total_capacity > 0) { + tiered_cache->SetCapacity(total_capacity); + } + if (compressed_secondary_ratio >= 0.0 && compressed_secondary_ratio <= 1.0) { + s = tiered_cache->UpdateCacheReservationRatio(compressed_secondary_ratio); + } + if (adm_policy < TieredAdmissionPolicy::kAdmPolicyMax) { + s = 
tiered_cache->UpdateAdmissionPolicy(adm_policy); + } + return s; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/secondary_cache_adapter.h b/cache/secondary_cache_adapter.h new file mode 100644 index 000000000000..f0a514e78c02 --- /dev/null +++ b/cache/secondary_cache_adapter.h @@ -0,0 +1,103 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "cache/cache_reservation_manager.h" +#include "rocksdb/secondary_cache.h" + +namespace ROCKSDB_NAMESPACE { + +class CacheWithSecondaryAdapter : public CacheWrapper { + public: + explicit CacheWithSecondaryAdapter( + std::shared_ptr target, + std::shared_ptr secondary_cache, + TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyAuto, + bool distribute_cache_res = false); + + ~CacheWithSecondaryAdapter() override; + + Status Insert( + const Slice& key, ObjectPtr value, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW, + const Slice& compressed_value = Slice(), + CompressionType type = CompressionType::kNoCompression) override; + + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, + Statistics* stats = nullptr) override; + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override; + + ObjectPtr Value(Handle* handle) override; + + void StartAsyncLookup(AsyncLookupHandle& async_handle) override; + + void WaitAll(AsyncLookupHandle* async_handles, size_t count) override; + + std::string GetPrintableOptions() const override; + + const char* Name() const override; + + void SetCapacity(size_t capacity) override; + + Status GetSecondaryCacheCapacity(size_t& size) const override; + + Status GetSecondaryCachePinnedUsage(size_t& size) const override; + + Status UpdateCacheReservationRatio(double ratio); + + Status UpdateAdmissionPolicy(TieredAdmissionPolicy adm_policy); + + Cache* TEST_GetCache() { return target_.get(); } + + SecondaryCache* TEST_GetSecondaryCache() { return secondary_cache_.get(); } + + private: + static constexpr size_t kReservationChunkSize = 1 << 20; + + bool EvictionHandler(const Slice& key, Handle* handle, bool was_hit); + + void StartAsyncLookupOnMySecondary(AsyncLookupHandle& async_handle); + + Handle* Promote( + std::unique_ptr&& secondary_handle, + const Slice& key, const CacheItemHelper* helper, Priority priority, + Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache); + + bool ProcessDummyResult(Cache::Handle** handle, bool erase); + + void CleanupCacheObject(ObjectPtr obj, const CacheItemHelper* helper); + + std::shared_ptr secondary_cache_; + TieredAdmissionPolicy adm_policy_; + // Whether to proportionally distribute cache memory reservations, i.e + // placeholder entries with null value and a non-zero charge, across + // the primary and secondary caches. + bool distribute_cache_res_; + // A cache reservation manager to keep track of secondary cache memory + // usage by reserving equivalent capacity against the primary cache + std::shared_ptr pri_cache_res_; + // Fraction of a cache memory reservation to be assigned to the secondary + // cache + double sec_cache_res_ratio_; + // Mutex for use when managing cache memory reservations. 
Should not be used + // for other purposes, as it may risk causing deadlocks. + mutable port::Mutex cache_res_mutex_; + // Total memory reserved by placeholder entries in the cache + size_t placeholder_usage_; + // Total placeholder memory charged to both the primary and secondary + // caches. Will be <= placeholder_usage_. + size_t reserved_usage_; + // Amount of memory reserved in the secondary cache. This should be + // reserved_usage_ * sec_cache_res_ratio_ in steady state. + size_t sec_reserved_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/sharded_cache.cc b/cache/sharded_cache.cc index 9ebca3ba827a..b270df751f02 100644 --- a/cache/sharded_cache.cc +++ b/cache/sharded_cache.cc @@ -13,20 +13,57 @@ #include #include +#include "env/unique_id_gen.h" +#include "rocksdb/env.h" #include "util/hash.h" #include "util/math.h" #include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +namespace { +// The generated seeds must fit in 31 bits so that +// ShardedCacheOptions::hash_seed can be set to it explicitly, for +// diagnostic/debugging purposes. +constexpr uint32_t kSeedMask = 0x7fffffff; +uint32_t DetermineSeed(int32_t hash_seed_option) { + if (hash_seed_option >= 0) { + // User-specified exact seed + return static_cast(hash_seed_option); + } + static SemiStructuredUniqueIdGen gen; + if (hash_seed_option == ShardedCacheOptions::kHostHashSeed) { + std::string hostname; + Status s = Env::Default()->GetHostNameString(&hostname); + if (s.ok()) { + return GetSliceHash(hostname) & kSeedMask; + } else { + // Fall back on something stable within the process. + return BitwiseAnd(gen.GetBaseUpper(), kSeedMask); + } + } else { + // for kQuasiRandomHashSeed and fallback + uint32_t val = gen.GenerateNext() & kSeedMask; + // Perform some 31-bit bijective transformations so that we get + // quasirandom, not just incrementing. (An incrementing seed from a + // random starting point would be fine, but hard to describe in a name.) + // See https://en.wikipedia.org/wiki/Quasirandom and using a murmur-like + // transformation here for our bijection in the lower 31 bits.
+ // See https://en.wikipedia.org/wiki/MurmurHash + val *= /*31-bit prime*/ 1150630961; + val ^= (val & kSeedMask) >> 17; + val *= /*31-bit prime*/ 1320603883; + return val & kSeedMask; + } +} +} // namespace -ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, - std::shared_ptr allocator) - : Cache(std::move(allocator)), +ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts) + : Cache(opts.memory_allocator), last_id_(1), - shard_mask_((uint32_t{1} << num_shard_bits) - 1), - strict_capacity_limit_(strict_capacity_limit), - capacity_(capacity) {} + shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1), + hash_seed_(DetermineSeed(opts.hash_seed)), + strict_capacity_limit_(opts.strict_capacity_limit), + capacity_(opts.capacity) {} size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const { uint32_t num_shards = GetNumShards(); @@ -46,6 +83,16 @@ size_t ShardedCacheBase::GetCapacity() const { return capacity_; } +Status ShardedCacheBase::GetSecondaryCacheCapacity(size_t& size) const { + size = 0; + return Status::OK(); +} + +Status ShardedCacheBase::GetSecondaryCachePinnedUsage(size_t& size) const { + size = 0; + return Status::OK(); +} + bool ShardedCacheBase::HasStrictCapacityLimit() const { MutexLock l(&config_mutex_); return strict_capacity_limit_; diff --git a/cache/sharded_cache.h b/cache/sharded_cache.h index 65764579fea6..b7ef723a1844 100644 --- a/cache/sharded_cache.h +++ b/cache/sharded_cache.h @@ -15,7 +15,7 @@ #include "port/lang.h" #include "port/port.h" -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "util/hash.h" #include "util/mutexlock.h" @@ -34,8 +34,8 @@ class CacheShardBase { std::string GetPrintableOptions() const { return ""; } using HashVal = uint64_t; using HashCref = uint64_t; - static inline HashVal ComputeHash(const Slice& key) { - return GetSliceNPHash64(key); + static inline HashVal ComputeHash(const Slice& key, uint32_t seed) { + return GetSliceNPHash64(key, seed); } static inline uint32_t HashPieceForSharding(HashCref hash) { return Lower32of64(hash); @@ -51,15 +51,17 @@ class CacheShardBase { }; Status Insert(const Slice& key, HashCref hash, Cache::ObjectPtr value, const Cache::CacheItemHelper* helper, size_t charge, - HandleImpl** handle, Cache::Priority priority) = 0; + HandleImpl** handle, Cache::Priority priority, + bool standalone) = 0; + Handle* CreateStandalone(const Slice& key, HashCref hash, ObjectPtr obj, + const CacheItemHelper* helper, + size_t charge, bool allow_uncharged) = 0; HandleImpl* Lookup(const Slice& key, HashCref hash, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, - Cache::Priority priority, bool wait, + Cache::Priority priority, Statistics* stats) = 0; bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0; - bool IsReady(HandleImpl* handle) = 0; - void Wait(HandleImpl* handle) = 0; bool Ref(HandleImpl* handle) = 0; void Erase(const Slice& key, HashCref hash) = 0; void SetCapacity(size_t capacity) = 0; @@ -87,9 +89,7 @@ class CacheShardBase { // Portions of ShardedCache that do not depend on the template parameter class ShardedCacheBase : public Cache { public: - ShardedCacheBase(size_t capacity, int num_shard_bits, - bool strict_capacity_limit, - std::shared_ptr memory_allocator); + explicit ShardedCacheBase(const ShardedCacheOptions& opts); virtual ~ShardedCacheBase() = default; int GetNumShardBits() const; @@ -99,11 +99,15 @@ class ShardedCacheBase : public Cache { bool 
HasStrictCapacityLimit() const override; size_t GetCapacity() const override; + Status GetSecondaryCacheCapacity(size_t& size) const override; + Status GetSecondaryCachePinnedUsage(size_t& size) const override; using Cache::GetUsage; size_t GetUsage(Handle* handle) const override; std::string GetPrintableOptions() const override; + uint32_t GetHashSeed() const override { return hash_seed_; } + protected: // fns virtual void AppendPrintableOptions(std::string& str) const = 0; size_t GetPerShardCapacity() const; @@ -112,6 +116,7 @@ class ShardedCacheBase : public Cache { protected: // data std::atomic last_id_; // For NewId const uint32_t shard_mask_; + const uint32_t hash_seed_; // Dynamic configuration parameters, guarded by config_mutex_ bool strict_capacity_limit_; @@ -132,10 +137,8 @@ class ShardedCache : public ShardedCacheBase { using HashCref = typename CacheShard::HashCref; using HandleImpl = typename CacheShard::HandleImpl; - ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit, - std::shared_ptr allocator) - : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit, - allocator), + explicit ShardedCache(const ShardedCacheOptions& opts) + : ShardedCacheBase(opts), shards_(reinterpret_cast(port::cacheline_aligned_alloc( sizeof(CacheShard) * GetNumShards()))), destroy_shards_in_dtor_(false) {} @@ -169,29 +172,41 @@ class ShardedCache : public ShardedCacheBase { [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); }); } - Status Insert(const Slice& key, ObjectPtr value, - const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { + Status Insert( + const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW, + const Slice& /*compressed_value*/ = Slice(), + CompressionType /*type*/ = CompressionType::kNoCompression) override { assert(helper); - HashVal hash = CacheShard::ComputeHash(key); + HashVal hash = CacheShard::ComputeHash(key, hash_seed_); auto h_out = reinterpret_cast(handle); - return GetShard(hash).Insert(key, hash, value, helper, charge, h_out, + return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out, priority); } + Handle* CreateStandalone(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + bool allow_uncharged) override { + assert(helper); + HashVal hash = CacheShard::ComputeHash(key, hash_seed_); + HandleImpl* result = GetShard(hash).CreateStandalone( + key, hash, obj, helper, charge, allow_uncharged); + return reinterpret_cast(result); + } + Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr, CreateContext* create_context = nullptr, - Priority priority = Priority::LOW, bool wait = true, + Priority priority = Priority::LOW, Statistics* stats = nullptr) override { - HashVal hash = CacheShard::ComputeHash(key); - HandleImpl* result = GetShard(hash).Lookup( - key, hash, helper, create_context, priority, wait, stats); + HashVal hash = CacheShard::ComputeHash(key, hash_seed_); + HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, + create_context, priority, stats); return reinterpret_cast(result); } void Erase(const Slice& key) override { - HashVal hash = CacheShard::ComputeHash(key); + HashVal hash = CacheShard::ComputeHash(key, hash_seed_); GetShard(hash).Erase(key, hash); } @@ -200,14 +215,6 @@ class ShardedCache : public ShardedCacheBase { auto h = reinterpret_cast(handle); return GetShard(h->GetHash()).Release(h, 
useful, erase_if_last_ref); } - bool IsReady(Handle* handle) override { - auto h = reinterpret_cast(handle); - return GetShard(h->GetHash()).IsReady(h); - } - void Wait(Handle* handle) override { - auto h = reinterpret_cast(handle); - GetShard(h->GetHash()).Wait(h); - } bool Ref(Handle* handle) override { auto h = reinterpret_cast(handle); return GetShard(h->GetHash()).Ref(h); @@ -223,7 +230,7 @@ class ShardedCache : public ShardedCacheBase { return SumOverShards2(&CacheShard::GetPinnedUsage); } size_t GetOccupancyCount() const override { - return SumOverShards2(&CacheShard::GetPinnedUsage); + return SumOverShards2(&CacheShard::GetOccupancyCount); } size_t GetTableAddressCount() const override { return SumOverShards2(&CacheShard::GetTableAddressCount); @@ -271,6 +278,14 @@ class ShardedCache : public ShardedCacheBase { } } + inline void ForEachShard( + const std::function& fn) const { + uint32_t num_shards = GetNumShards(); + for (uint32_t i = 0; i < num_shards; i++) { + fn(shards_ + i); + } + } + inline size_t SumOverShards( const std::function& fn) const { uint32_t num_shards = GetNumShards(); diff --git a/cache/tiered_secondary_cache.cc b/cache/tiered_secondary_cache.cc new file mode 100644 index 000000000000..493e695722ba --- /dev/null +++ b/cache/tiered_secondary_cache.cc @@ -0,0 +1,119 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "cache/tiered_secondary_cache.h" + +namespace ROCKSDB_NAMESPACE { + +// Creation callback for use in the lookup path. It calls the upper layer +// create_cb to create the object, and optionally calls the compressed +// secondary cache InsertSaved to save the compressed block. If +// advise_erase is set, it means the primary cache wants the block to be +// erased in the secondary cache, so we skip calling InsertSaved. +// +// For the time being, we assume that all blocks in the nvm tier belong to +// the primary block cache (i.e CacheTier::kVolatileTier). That can be changed +// if we implement demotion from the compressed secondary cache to the nvm +// cache in the future. +Status TieredSecondaryCache::MaybeInsertAndCreate( + const Slice& data, CompressionType type, CacheTier source, + Cache::CreateContext* ctx, MemoryAllocator* allocator, + Cache::ObjectPtr* out_obj, size_t* out_charge) { + TieredSecondaryCache::CreateContext* context = + static_cast(ctx); + assert(source == CacheTier::kVolatileTier); + if (!context->advise_erase && type != kNoCompression) { + // Attempt to insert into compressed secondary cache + // TODO: Don't hardcode the source + context->comp_sec_cache->InsertSaved(*context->key, data, type, source) + .PermitUncheckedError(); + } + // Primary cache will accept the object, so call its helper to create + // the object + return context->helper->create_cb(data, type, source, context->inner_ctx, + allocator, out_obj, out_charge); +} + +// The lookup first looks up in the compressed secondary cache. If its a miss, +// then the nvm cache lookup is called. The cache item helper and create +// context are wrapped in order to intercept the creation callback to make +// the decision on promoting to the compressed secondary cache. 
+std::unique_ptr TieredSecondaryCache::Lookup( + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& kept_in_sec_cache) { + bool dummy = false; + std::unique_ptr result = + target()->Lookup(key, helper, create_context, wait, advise_erase, + /*kept_in_sec_cache=*/dummy); + // We never want the item to spill back into the secondary cache + kept_in_sec_cache = true; + if (result) { + assert(result->IsReady()); + return result; + } + + // If wait is true, then we can be a bit more efficient and avoid a memory + // allocation for the CreateContext. + const Cache::CacheItemHelper* outer_helper = + TieredSecondaryCache::GetHelper(); + if (wait) { + TieredSecondaryCache::CreateContext ctx; + ctx.key = &key; + ctx.advise_erase = advise_erase; + ctx.helper = helper; + ctx.inner_ctx = create_context; + ctx.comp_sec_cache = target(); + + return nvm_sec_cache_->Lookup(key, outer_helper, &ctx, wait, advise_erase, + kept_in_sec_cache); + } + + // If wait is false, i.e. it's an async lookup, we have to allocate a result + // handle for tracking purposes. Embed the CreateContext inside the handle + // so we need only allocate memory once instead of twice. + std::unique_ptr handle(new ResultHandle()); + handle->ctx()->key = &key; + handle->ctx()->advise_erase = advise_erase; + handle->ctx()->helper = helper; + handle->ctx()->inner_ctx = create_context; + handle->ctx()->comp_sec_cache = target(); + handle->SetInnerHandle(nvm_sec_cache_->Lookup( + key, outer_helper, handle->ctx(), wait, advise_erase, kept_in_sec_cache)); + if (!handle->inner_handle()) { + handle.reset(); + } else { + result.reset(handle.release()); + } + + return result; +} + +// Call the nvm cache WaitAll to complete the lookups +void TieredSecondaryCache::WaitAll( + std::vector handles) { + std::vector nvm_handles; + std::vector my_handles; + nvm_handles.reserve(handles.size()); + for (auto handle : handles) { + // The handle could belong to the compressed secondary cache. Skip if + // that's the case. + if (handle->IsReady()) { + continue; + } + ResultHandle* hdl = static_cast(handle); + nvm_handles.push_back(hdl->inner_handle()); + my_handles.push_back(hdl); + } + nvm_sec_cache_->WaitAll(nvm_handles); + for (auto handle : my_handles) { + assert(handle->IsReady()); + auto nvm_handle = handle->inner_handle(); + handle->SetSize(nvm_handle->Size()); + handle->SetValue(nvm_handle->Value()); + } +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/tiered_secondary_cache.h b/cache/tiered_secondary_cache.h new file mode 100644 index 000000000000..6e05364367c3 --- /dev/null +++ b/cache/tiered_secondary_cache.h @@ -0,0 +1,155 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/cache.h" +#include "rocksdb/secondary_cache.h" + +namespace ROCKSDB_NAMESPACE { + +// A SecondaryCache that implements stacking of a compressed secondary cache +// and a non-volatile (local flash) cache. It implements an admission +// policy of warming the bottommost tier (local flash) with compressed +// blocks from the SST on misses, and on hits in the bottommost tier, +// promoting to the compressed and/or primary block cache. The admission +// policies of the primary block cache and compressed secondary cache remain +// unchanged - promote on second access.
There is no demotion ofablocks +// evicted from a tier. They are just discarded. +// +// In order to properly handle compressed blocks directly read from SSTs, and +// to allow writeback of blocks compressed by the compressed secondary +// cache in the future, we make use of the compression type and source +// cache tier arguments in InsertSaved. +class TieredSecondaryCache : public SecondaryCacheWrapper { + public: + TieredSecondaryCache(std::shared_ptr comp_sec_cache, + std::shared_ptr nvm_sec_cache, + TieredAdmissionPolicy adm_policy) + : SecondaryCacheWrapper(comp_sec_cache), nvm_sec_cache_(nvm_sec_cache) { +#ifndef NDEBUG + assert(adm_policy == TieredAdmissionPolicy::kAdmPolicyThreeQueue); +#else + (void)adm_policy; +#endif + } + + ~TieredSecondaryCache() override {} + + const char* Name() const override { return "TieredSecondaryCache"; } + + // This is a no-op as we currently don't allow demotion (i.e + // insertion by the upper layer) of evicted blocks. + virtual Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*obj*/, + const Cache::CacheItemHelper* /*helper*/, + bool /*force_insert*/) override { + return Status::OK(); + } + + // Warm up the nvm tier directly + virtual Status InsertSaved( + const Slice& key, const Slice& saved, + CompressionType type = CompressionType::kNoCompression, + CacheTier source = CacheTier::kVolatileTier) override { + return nvm_sec_cache_->InsertSaved(key, saved, type, source); + } + + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& kept_in_sec_cache) override; + + virtual void WaitAll( + std::vector handles) override; + + private: + struct CreateContext : public Cache::CreateContext { + const Slice* key; + bool advise_erase; + const Cache::CacheItemHelper* helper; + Cache::CreateContext* inner_ctx; + std::shared_ptr inner_handle; + SecondaryCache* comp_sec_cache; + }; + + class ResultHandle : public SecondaryCacheResultHandle { + public: + ~ResultHandle() override {} + + bool IsReady() override { + return !inner_handle_ || inner_handle_->IsReady(); + } + + void Wait() override { + inner_handle_->Wait(); + Complete(); + } + + size_t Size() override { return size_; } + + Cache::ObjectPtr Value() override { return value_; } + + void Complete() { + assert(IsReady()); + size_ = inner_handle_->Size(); + value_ = inner_handle_->Value(); + inner_handle_.reset(); + } + + void SetInnerHandle(std::unique_ptr&& handle) { + inner_handle_ = std::move(handle); + } + + void SetSize(size_t size) { size_ = size; } + + void SetValue(Cache::ObjectPtr val) { value_ = val; } + + CreateContext* ctx() { return &ctx_; } + + SecondaryCacheResultHandle* inner_handle() { return inner_handle_.get(); } + + private: + std::unique_ptr inner_handle_; + CreateContext ctx_; + size_t size_; + Cache::ObjectPtr value_; + }; + + static void NoopDelete(Cache::ObjectPtr /*obj*/, + MemoryAllocator* /*allocator*/) { + assert(false); + } + static size_t ZeroSize(Cache::ObjectPtr /*obj*/) { + assert(false); + return 0; + } + static Status NoopSaveTo(Cache::ObjectPtr /*from_obj*/, + size_t /*from_offset*/, size_t /*length*/, + char* /*out_buf*/) { + assert(false); + return Status::OK(); + } + static Status MaybeInsertAndCreate(const Slice& data, CompressionType type, + CacheTier source, + Cache::CreateContext* ctx, + MemoryAllocator* allocator, + Cache::ObjectPtr* out_obj, + size_t* out_charge); + + static const Cache::CacheItemHelper* GetHelper() { + const static 
Cache::CacheItemHelper basic_helper(CacheEntryRole::kMisc, + &NoopDelete); + const static Cache::CacheItemHelper maybe_insert_and_create_helper{ + CacheEntryRole::kMisc, &NoopDelete, &ZeroSize, + &NoopSaveTo, &MaybeInsertAndCreate, &basic_helper, + }; + return &maybe_insert_and_create_helper; + } + + std::shared_ptr comp_sec_cache_; + std::shared_ptr nvm_sec_cache_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/cache/tiered_secondary_cache_test.cc b/cache/tiered_secondary_cache_test.cc new file mode 100644 index 000000000000..9d8cdf7fb76a --- /dev/null +++ b/cache/tiered_secondary_cache_test.cc @@ -0,0 +1,711 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +#include "cache/compressed_secondary_cache.h" +#include "cache/secondary_cache_adapter.h" +#include "db/db_test_util.h" +#include "rocksdb/cache.h" +#include "rocksdb/secondary_cache.h" +#include "typed_cache.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class TestSecondaryCache : public SecondaryCache { + public: + explicit TestSecondaryCache(size_t capacity) + : cache_(NewLRUCache(capacity, 0, false, 0.5 /* high_pri_pool_ratio */, + nullptr, kDefaultToAdaptiveMutex, + kDontChargeCacheMetadata)), + num_insert_saved_(0), + num_hits_(0), + num_misses_(0) {} + + const char* Name() const override { return "TestSecondaryCache"; } + + Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*value*/, + const Cache::CacheItemHelper* /*helper*/, + bool /*force_insert*/) override { + assert(false); + return Status::NotSupported(); + } + + Status InsertSaved(const Slice& key, const Slice& saved, + CompressionType type = kNoCompression, + CacheTier source = CacheTier::kVolatileTier) override { + CheckCacheKeyCommonPrefix(key); + size_t size; + char* buf; + Status s; + + num_insert_saved_++; + size = saved.size(); + buf = new char[size + sizeof(uint64_t) + 2 * sizeof(uint16_t)]; + EncodeFixed64(buf, size); + buf += sizeof(uint64_t); + EncodeFixed16(buf, type); + buf += sizeof(uint16_t); + EncodeFixed16(buf, (uint16_t)source); + buf += sizeof(uint16_t); + memcpy(buf, saved.data(), size); + buf -= sizeof(uint64_t) + 2 * sizeof(uint16_t); + if (!s.ok()) { + delete[] buf; + return s; + } + return cache_.Insert(key, buf, size); + } + + std::unique_ptr Lookup( + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool /*advise_erase*/, + bool& kept_in_sec_cache) override { + std::string key_str = key.ToString(); + TEST_SYNC_POINT_CALLBACK("TestSecondaryCache::Lookup", &key_str); + + std::unique_ptr secondary_handle; + kept_in_sec_cache = false; + + TypedHandle* handle = cache_.Lookup(key); + if (handle) { + num_hits_++; + Cache::ObjectPtr value = nullptr; + size_t charge = 0; + Status s; + char* ptr = cache_.Value(handle); + CompressionType type; + CacheTier source; + size_t size = DecodeFixed64(ptr); + ptr += sizeof(uint64_t); + type = static_cast(DecodeFixed16(ptr)); + ptr += sizeof(uint16_t); + source = static_cast(DecodeFixed16(ptr)); + assert(source == CacheTier::kVolatileTier); + ptr += sizeof(uint16_t); + s = helper->create_cb(Slice(ptr, size), type, source, create_context, + /*alloc*/ nullptr, &value, &charge); + if (s.ok()) { + secondary_handle.reset(new TestSecondaryCacheResultHandle( + cache_.get(), handle, value, charge, /*ready=*/wait)); + kept_in_sec_cache = 
true; + } else { + cache_.Release(handle); + } + } else { + num_misses_++; + } + return secondary_handle; + } + + bool SupportForceErase() const override { return false; } + + void Erase(const Slice& /*key*/) override {} + + void WaitAll(std::vector handles) override { + for (SecondaryCacheResultHandle* handle : handles) { + TestSecondaryCacheResultHandle* sec_handle = + static_cast(handle); + EXPECT_FALSE(sec_handle->IsReady()); + sec_handle->SetReady(); + } + } + + std::string GetPrintableOptions() const override { return ""; } + + uint32_t num_insert_saved() { return num_insert_saved_; } + + uint32_t num_hits() { return num_hits_; } + + uint32_t num_misses() { return num_misses_; } + + void CheckCacheKeyCommonPrefix(const Slice& key) { + Slice current_prefix(key.data(), OffsetableCacheKey::kCommonPrefixSize); + if (ckey_prefix_.empty()) { + ckey_prefix_ = current_prefix.ToString(); + } else { + EXPECT_EQ(ckey_prefix_, current_prefix.ToString()); + } + } + + private: + class TestSecondaryCacheResultHandle : public SecondaryCacheResultHandle { + public: + TestSecondaryCacheResultHandle(Cache* cache, Cache::Handle* handle, + Cache::ObjectPtr value, size_t size, + bool ready) + : cache_(cache), + handle_(handle), + value_(value), + size_(size), + is_ready_(ready) {} + + ~TestSecondaryCacheResultHandle() override { cache_->Release(handle_); } + + bool IsReady() override { return is_ready_; } + + void Wait() override {} + + Cache::ObjectPtr Value() override { + assert(is_ready_); + return value_; + } + + size_t Size() override { return Value() ? size_ : 0; } + + void SetReady() { is_ready_ = true; } + + private: + Cache* cache_; + Cache::Handle* handle_; + Cache::ObjectPtr value_; + size_t size_; + bool is_ready_; + }; + + using SharedCache = + BasicTypedSharedCacheInterface; + using TypedHandle = SharedCache::TypedHandle; + SharedCache cache_; + uint32_t num_insert_saved_; + uint32_t num_hits_; + uint32_t num_misses_; + std::string ckey_prefix_; +}; + +class DBTieredSecondaryCacheTest : public DBTestBase { + public: + DBTieredSecondaryCacheTest() + : DBTestBase("db_tiered_secondary_cache_test", /*env_do_fsync=*/true) {} + + std::shared_ptr NewCache(size_t pri_capacity, + size_t compressed_capacity, + size_t nvm_capacity, + TieredAdmissionPolicy adm_policy = + TieredAdmissionPolicy::kAdmPolicyAuto) { + LRUCacheOptions lru_opts; + TieredCacheOptions opts; + lru_opts.capacity = 0; + lru_opts.num_shard_bits = 0; + lru_opts.high_pri_pool_ratio = 0; + opts.cache_opts = &lru_opts; + opts.cache_type = PrimaryCacheType::kCacheTypeLRU; + opts.comp_cache_opts.capacity = 0; + opts.comp_cache_opts.num_shard_bits = 0; + opts.total_capacity = pri_capacity + compressed_capacity; + opts.compressed_secondary_ratio = + (double)compressed_capacity / opts.total_capacity; + if (nvm_capacity > 0) { + nvm_sec_cache_.reset(new TestSecondaryCache(nvm_capacity)); + opts.nvm_sec_cache = nvm_sec_cache_; + } + opts.adm_policy = adm_policy; + cache_ = NewTieredCache(opts); + assert(cache_ != nullptr); + + return cache_; + } + + TestSecondaryCache* nvm_sec_cache() { return nvm_sec_cache_.get(); } + + CompressedSecondaryCache* compressed_secondary_cache() { + return static_cast( + static_cast(cache_.get()) + ->TEST_GetSecondaryCache()); + } + + private: + std::shared_ptr cache_; + std::shared_ptr nvm_sec_cache_; +}; + +// In this test, the block size is set to 4096. Each value is 1007 bytes, so +// each data block contains exactly 4 KV pairs. Metadata blocks are not +// cached, so we can accurately estimate the cache usage. 
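Before the test body below, note the two accessors this patch adds for observing the secondary tier at runtime; a small usage sketch (cache is the std::shared_ptr<Cache> returned by NewTieredCache or the fixture's NewCache):

  size_t sec_capacity = 0;
  size_t sec_pinned = 0;
  Status s = cache->GetSecondaryCacheCapacity(sec_capacity);
  assert(s.ok());  // capacity of the compressed secondary tier
  s = cache->GetSecondaryCachePinnedUsage(sec_pinned);
  assert(s.ok());  // part of that capacity not currently covered by the pri_cache_res_ reservation
  // A plain sharded cache with no secondary tier reports 0 for both, per the
  // ShardedCacheBase defaults added in this patch.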
+TEST_F(DBTieredSecondaryCacheTest, BasicTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + // We want a block cache of size 5KB, and a compressed secondary cache of + // size 5KB. However, we specify a block cache size of 256KB here in order + // to take into account the cache reservation in the block cache on + // behalf of the compressed cache. The unit of cache reservation is 256KB. + // The effective block cache capacity will be calculated as 256 + 5 = 261KB, + // and 256KB will be reserved for the compressed cache, leaving 5KB for + // the primary block cache. We only have to worry about this here because + // the cache size is so small. + table_options.block_cache = NewCache(256 * 1024, 5 * 1024, 256 * 1024); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Disable paranoid_file_checks so that flush will not read back the newly + // written file + options.paranoid_file_checks = false; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + // The first 2 Gets, for keys 0 and 5, will load the corresponding data + // blocks as they will be cache misses. The nvm secondary cache will be + // warmed up with the compressed blocks + std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 1u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 1u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + + // At this point, the nvm cache is warmed up with the data blocks for 0 + // and 5. The next Get will lookup the block in nvm and will be a hit. + // It will be created as a standalone entry in memory, and a placeholder + // will be inserted in the primary and compressed caches. + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 1u); + + // For this Get, the primary and compressed only have placeholders for + // the required data block. So we will lookup the nvm cache and find the + // block there. This time, the block will be promoted to the primary + // block cache. No promotion to the compressed secondary cache happens, + // and it will retain the placeholder. + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 2u); + + // This Get will find the data block in the primary cache. + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 2u); + + // We repeat the sequence for key 5. This will end up evicting the block + // for 0 from the in-memory cache. 
+ v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 3u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 4u); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 4u); + + // This Get for key 0 will find the data block in nvm. Since the compressed + // cache still has the placeholder, the block (compressed) will be + // admitted. It is then inserted into the primary as a standalone entry. + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 5u); + + // This Get for key 0 will find the data block in the compressed secondary + // cache. + v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 2u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 5u); + + Destroy(options); +} + +// This test is very similar to BasicTest, except it calls MultiGet rather +// than Get, in order to exercise the async lookup and WaitAll path. +TEST_F(DBTieredSecondaryCacheTest, BasicMultiGetTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + table_options.block_cache = NewCache(260 * 1024, 10 * 1024, 256 * 1024); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + options.paranoid_file_checks = false; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + std::vector keys; + std::vector values; + + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 3u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 3u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + + keys.clear(); + values.clear(); + keys.push_back(Key(12)); + keys.push_back(Key(16)); + keys.push_back(Key(20)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + 
ASSERT_EQ(nvm_sec_cache()->num_hits(), 3u); + + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 6u); + + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 6u); + + keys.clear(); + values.clear(); + keys.push_back(Key(12)); + keys.push_back(Key(16)); + keys.push_back(Key(20)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 9u); + + keys.clear(); + values.clear(); + keys.push_back(Key(12)); + keys.push_back(Key(16)); + keys.push_back(Key(20)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 12u); + + keys.clear(); + values.clear(); + keys.push_back(Key(12)); + keys.push_back(Key(16)); + keys.push_back(Key(20)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 12u); + + Destroy(options); +} + +TEST_F(DBTieredSecondaryCacheTest, WaitAllTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + table_options.block_cache = NewCache(250 * 1024, 20 * 1024, 256 * 1024); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + options.paranoid_file_checks = false; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + std::vector keys; + std::vector values; + + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 3u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 3u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + + keys.clear(); + values.clear(); + keys.push_back(Key(12)); + 
keys.push_back(Key(16)); + keys.push_back(Key(20)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 6u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + + // Insert placeholders for 4 in primary and compressed + std::string val = Get(Key(4)); + + // Force placeholder 4 out of primary + keys.clear(); + values.clear(); + keys.push_back(Key(24)); + keys.push_back(Key(28)); + keys.push_back(Key(32)); + keys.push_back(Key(36)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 10u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 10u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 1u); + + // Now read 4 again. This will create a placeholder in primary, and insert + // in compressed secondary since it already has a placeholder + val = Get(Key(4)); + + // Now read 0, 4 and 8. While 4 is already in the compressed secondary + // cache, 0 and 8 will be read asynchronously from the nvm tier. The + // WaitAll will be called for all 3 blocks. + keys.clear(); + values.clear(); + keys.push_back(Key(0)); + keys.push_back(Key(4)); + keys.push_back(Key(8)); + values = MultiGet(keys, /*snapshot=*/nullptr, /*async=*/true); + ASSERT_EQ(values.size(), keys.size()); + for (auto value : values) { + ASSERT_EQ(1007, value.size()); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 10u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 10u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 4u); + + Destroy(options); +} + +// This test is for iteration. It iterates through a set of keys in two +// passes. First pass loads the compressed blocks into the nvm tier, and +// the second pass should hit all of those blocks. 
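(Aside, not part of the patch: the counts asserted in IterateTest below follow from the layout noted at the top of the test file, i.e. four ~1007-byte values per 4 KB data block, so scanning keys 0..30 touches 8 data blocks. A tiny standalone C++ check of that arithmetic:)

#include <cstdio>

int main() {
  const int keys_scanned = 31;  // IterateTest below walks keys 0..30
  const int kvs_per_block = 4;  // four ~1007-byte values per 4 KB data block
  // Ceiling division: 31 keys span 8 data blocks, matching the 8 nvm
  // misses/inserts on the first pass and the 8 hits on the second pass.
  const int blocks = (keys_scanned + kvs_per_block - 1) / kvs_per_block;
  std::printf("data blocks touched: %d\n", blocks);  // prints 8
  return 0;
}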
+TEST_F(DBTieredSecondaryCacheTest, IterateTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + table_options.block_cache = NewCache(250 * 1024, 10 * 1024, 256 * 1024); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + options.paranoid_file_checks = false; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + ReadOptions ro; + ro.readahead_size = 256 * 1024; + auto iter = dbfull()->NewIterator(ro); + iter->SeekToFirst(); + for (int i = 0; i < 31; ++i) { + ASSERT_EQ(Key(i), iter->key().ToString()); + ASSERT_EQ(1007, iter->value().size()); + iter->Next(); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 0u); + delete iter; + + iter = dbfull()->NewIterator(ro); + iter->SeekToFirst(); + for (int i = 0; i < 31; ++i) { + ASSERT_EQ(Key(i), iter->key().ToString()); + ASSERT_EQ(1007, iter->value().size()); + iter->Next(); + } + ASSERT_EQ(nvm_sec_cache()->num_insert_saved(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_misses(), 8u); + ASSERT_EQ(nvm_sec_cache()->num_hits(), 8u); + delete iter; + + Destroy(options); +} + +class DBTieredAdmPolicyTest + : public DBTieredSecondaryCacheTest, + public testing::WithParamInterface {}; + +TEST_P(DBTieredAdmPolicyTest, CompressedOnlyTest) { + if (!LZ4_Supported()) { + ROCKSDB_GTEST_SKIP("This test requires LZ4 support."); + return; + } + + BlockBasedTableOptions table_options; + // We want a block cache of size 10KB, and a compressed secondary cache of + // size 10KB. However, we specify a block cache size of 256KB here in order + // to take into account the cache reservation in the block cache on + // behalf of the compressed cache. The unit of cache reservation is 256KB. + // The effective block cache capacity will be calculated as 256 + 10 = 266KB, + // and 256KB will be reserved for the compressed cache, leaving 10KB for + // the primary block cache. We only have to worry about this here because + // the cache size is so small. + table_options.block_cache = NewCache(256 * 1024, 10 * 1024, 0, GetParam()); + table_options.block_size = 4 * 1024; + table_options.cache_index_and_filter_blocks = false; + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + size_t comp_cache_usage = compressed_secondary_cache()->TEST_GetUsage(); + // Disable paranoid_file_checks so that flush will not read back the newly + // written file + options.paranoid_file_checks = false; + DestroyAndReopen(options); + Random rnd(301); + const int N = 256; + for (int i = 0; i < N; i++) { + std::string p_v; + test::CompressibleString(&rnd, 0.5, 1007, &p_v); + ASSERT_OK(Put(Key(i), p_v)); + } + + ASSERT_OK(Flush()); + + // The first 2 Gets, for keys 0 and 5, will load the corresponding data + // blocks as they will be cache misses. Since this is a 2-tier cache ( + // primary and compressed), no warm-up should happen with the compressed + // blocks. 
+ std::string v = Get(Key(0)); + ASSERT_EQ(1007, v.size()); + + v = Get(Key(5)); + ASSERT_EQ(1007, v.size()); + + ASSERT_EQ(compressed_secondary_cache()->TEST_GetUsage(), comp_cache_usage); + + Destroy(options); +} + +INSTANTIATE_TEST_CASE_P( + DBTieredAdmPolicyTest, DBTieredAdmPolicyTest, + ::testing::Values(TieredAdmissionPolicy::kAdmPolicyAuto, + TieredAdmissionPolicy::kAdmPolicyPlaceholder, + TieredAdmissionPolicy::kAdmPolicyAllowCacheHits)); + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/cache/typed_cache.h b/cache/typed_cache.h index 76c82b4a05d9..125bfa0f506b 100644 --- a/cache/typed_cache.h +++ b/cache/typed_cache.h @@ -29,8 +29,8 @@ #include #include "cache/cache_helpers.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/advanced_options.h" -#include "rocksdb/cache.h" namespace ROCKSDB_NAMESPACE { @@ -83,11 +83,14 @@ class PlaceholderCacheInterface : public BaseCacheInterface { using BaseCacheInterface::BaseCacheInterface; inline Status Insert(const Slice& key, size_t charge, Handle** handle) { - return this->cache_->Insert(key, /*value=*/nullptr, &kHelper, charge, + return this->cache_->Insert(key, /*value=*/nullptr, GetHelper(), charge, handle); } - static constexpr Cache::CacheItemHelper kHelper{kRole}; + static const Cache::CacheItemHelper* GetHelper() { + static const Cache::CacheItemHelper kHelper{kRole}; + return &kHelper; + } }; template @@ -128,8 +131,11 @@ class BasicTypedCacheHelperFns { template class BasicTypedCacheHelper : public BasicTypedCacheHelperFns { public: - static constexpr Cache::CacheItemHelper kBasicHelper{ - kRole, &BasicTypedCacheHelper::Delete}; + static const Cache::CacheItemHelper* GetBasicHelper() { + static const Cache::CacheItemHelper kHelper{kRole, + &BasicTypedCacheHelper::Delete}; + return &kHelper; + } }; // BasicTypedCacheInterface - Used for primary cache storage of objects of @@ -144,9 +150,14 @@ class BasicTypedCacheInterface : public BaseCacheInterface, CACHE_TYPE_DEFS(); using typename BasicTypedCacheHelperFns::TValuePtr; struct TypedHandle : public Handle {}; - using BasicTypedCacheHelper::kBasicHelper; + using BasicTypedCacheHelper::GetBasicHelper; // ctor using BaseCacheInterface::BaseCacheInterface; + struct TypedAsyncLookupHandle : public Cache::AsyncLookupHandle { + TypedHandle* Result() { + return reinterpret_cast(Cache::AsyncLookupHandle::Result()); + } + }; inline Status Insert(const Slice& key, TValuePtr value, size_t charge, TypedHandle** handle = nullptr, @@ -154,7 +165,7 @@ class BasicTypedCacheInterface : public BaseCacheInterface, auto untyped_handle = reinterpret_cast(handle); return this->cache_->Insert( key, BasicTypedCacheHelperFns::UpCastValue(value), - &kBasicHelper, charge, untyped_handle, priority); + GetBasicHelper(), charge, untyped_handle, priority); } inline TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) { @@ -162,6 +173,11 @@ class BasicTypedCacheInterface : public BaseCacheInterface, this->cache_->BasicLookup(key, stats)); } + inline void StartAsyncLookup(TypedAsyncLookupHandle& async_handle) { + assert(async_handle.helper == nullptr); + this->cache_->StartAsyncLookup(async_handle); + } + inline CacheHandleGuard Guard(TypedHandle* handle) { if (handle) { return CacheHandleGuard(&*this->cache_, handle); @@ -218,15 +234,19 @@ class FullTypedCacheHelperFns : public BasicTypedCacheHelperFns { return Status::OK(); } - static 
Status Create(const Slice& data, CreateContext* context, + static Status Create(const Slice& data, CompressionType type, + CacheTier source, CreateContext* context, MemoryAllocator* allocator, ObjectPtr* out_obj, size_t* out_charge) { std::unique_ptr value = nullptr; + if (source != CacheTier::kVolatileTier) { + return Status::InvalidArgument(); + } if constexpr (sizeof(TCreateContext) > 0) { TCreateContext* tcontext = static_cast(context); - tcontext->Create(&value, out_charge, data, allocator); + tcontext->Create(&value, out_charge, data, type, allocator); } else { - TCreateContext::Create(&value, out_charge, data, allocator); + TCreateContext::Create(&value, out_charge, data, type, allocator); } *out_obj = UpCastValue(value.release()); return Status::OK(); @@ -239,9 +259,16 @@ template class FullTypedCacheHelper : public FullTypedCacheHelperFns { public: - static constexpr Cache::CacheItemHelper kFullHelper{ - kRole, &FullTypedCacheHelper::Delete, &FullTypedCacheHelper::Size, - &FullTypedCacheHelper::SaveTo, &FullTypedCacheHelper::Create}; + static const Cache::CacheItemHelper* GetFullHelper() { + static const Cache::CacheItemHelper kHelper{ + kRole, + &FullTypedCacheHelper::Delete, + &FullTypedCacheHelper::Size, + &FullTypedCacheHelper::SaveTo, + &FullTypedCacheHelper::Create, + BasicTypedCacheHelper::GetBasicHelper()}; + return &kHelper; + } }; // FullTypedCacheHelper - Used for secondary cache compatible storage of @@ -262,9 +289,11 @@ class FullTypedCacheInterface public: CACHE_TYPE_DEFS(); using typename BasicTypedCacheInterface::TypedHandle; + using typename BasicTypedCacheInterface::TypedAsyncLookupHandle; using typename BasicTypedCacheHelperFns::TValuePtr; - using BasicTypedCacheHelper::kBasicHelper; - using FullTypedCacheHelper::kFullHelper; + using BasicTypedCacheHelper::GetBasicHelper; + using FullTypedCacheHelper::GetFullHelper; using BasicTypedCacheHelperFns::UpCastValue; using BasicTypedCacheHelperFns::DownCastValue; // ctor @@ -276,13 +305,15 @@ class FullTypedCacheInterface inline Status InsertFull( const Slice& key, TValuePtr value, size_t charge, TypedHandle** handle = nullptr, Priority priority = Priority::LOW, - CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) { + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier, + const Slice& compressed = Slice(), + CompressionType type = CompressionType::kNoCompression) { auto untyped_handle = reinterpret_cast(handle); - auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier - ? &kFullHelper - : &kBasicHelper; + auto helper = lowest_used_cache_tier > CacheTier::kVolatileTier + ? 
GetFullHelper() + : GetBasicHelper(); return this->cache_->Insert(key, UpCastValue(value), helper, charge, - untyped_handle, priority); + untyped_handle, priority, compressed, type); } // Like SecondaryCache::InsertSaved, with SecondaryCache compatibility @@ -294,9 +325,9 @@ class FullTypedCacheInterface size_t* out_charge = nullptr) { ObjectPtr value; size_t charge; - Status st = kFullHelper.create_cb(data, create_context, - this->cache_->memory_allocator(), &value, - &charge); + Status st = GetFullHelper()->create_cb( + data, kNoCompression, CacheTier::kVolatileTier, create_context, + this->cache_->memory_allocator(), &value, &charge); if (out_charge) { *out_charge = charge; } @@ -304,7 +335,7 @@ class FullTypedCacheInterface st = InsertFull(key, DownCastValue(value), charge, nullptr /*handle*/, priority, lowest_used_cache_tier); } else { - kFullHelper.del_cb(value, this->cache_->memory_allocator()); + GetFullHelper()->del_cb(value, this->cache_->memory_allocator()); } return st; } @@ -313,17 +344,28 @@ class FullTypedCacheInterface // (Basic Lookup() also inherited.) inline TypedHandle* LookupFull( const Slice& key, TCreateContext* create_context = nullptr, - Priority priority = Priority::LOW, bool wait = true, - Statistics* stats = nullptr, + Priority priority = Priority::LOW, Statistics* stats = nullptr, CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) { - if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) { + if (lowest_used_cache_tier > CacheTier::kVolatileTier) { return reinterpret_cast(this->cache_->Lookup( - key, &kFullHelper, create_context, priority, wait, stats)); + key, GetFullHelper(), create_context, priority, stats)); } else { return BasicTypedCacheInterface::Lookup(key, stats); } } + + inline void StartAsyncLookupFull( + TypedAsyncLookupHandle& async_handle, + CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) { + if (lowest_used_cache_tier > CacheTier::kVolatileTier) { + async_handle.helper = GetFullHelper(); + this->cache_->StartAsyncLookup(async_handle); + } else { + BasicTypedCacheInterface::StartAsyncLookup( + async_handle); + } + } }; // FullTypedSharedCacheInterface - Like FullTypedCacheInterface but with a diff --git a/cloud/replication_test.cc b/cloud/replication_test.cc index 7d4a1c89b87b..20041631d28d 100644 --- a/cloud/replication_test.cc +++ b/cloud/replication_test.cc @@ -1103,7 +1103,7 @@ TEST_F(ReplicationTest, NoMemSwitchRecordIfEmpty) { TEST_F(ReplicationTest, EvictObsoleteFiles) { auto leader = openLeader(); - leader->EnableFileDeletions(); + leader->EnableFileDeletions(true); auto followerOptions = leaderOptions(); followerOptions.disable_delete_obsolete_files_on_open = true; auto follower = openFollower(followerOptions); diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index aa5f68c77983..d8d750c93479 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -12,7 +12,7 @@ fi ROOT=".." 
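(Aside, not part of the patch: the typed_cache.h hunks above replace static constexpr Cache::CacheItemHelper members with Get*Helper() accessors that return a pointer to a function-local static. A self-contained C++ sketch of that pattern follows; the names here are illustrative, not the real RocksDB types.)

#include <cstdio>

struct ItemHelper {
  int role;
  const ItemHelper* without_secondary_support;  // analogous to chaining the basic helper
};

template <int kRole>
struct TypedHelper {
  // The function-local static is constructed on first use, has a single
  // address per instantiation, and needs no out-of-line definition, which is
  // the property the GetBasicHelper()/GetFullHelper() accessors rely on.
  static const ItemHelper* GetHelper() {
    static const ItemHelper helper{kRole, nullptr};
    return &helper;
  }
};

int main() {
  const ItemHelper* a = TypedHelper<1>::GetHelper();
  const ItemHelper* b = TypedHelper<1>::GetHelper();
  std::printf("same object: %d role: %d\n", a == b ? 1 : 0, a->role);  // 1, 1
  return 0;
}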
# Fetch right version of gcov if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then - source $ROOT/build_tools/fbcode_config_platform009.sh + source $ROOT/build_tools/fbcode_config_platform010.sh GCOV=$GCC_BASE/bin/gcov else GCOV=$(which gcov) diff --git a/crash_test.mk b/crash_test.mk index 5e8b3573a22d..a71a55c15c73 100644 --- a/crash_test.mk +++ b/crash_test.mk @@ -21,6 +21,8 @@ CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) -- blackbox_crash_test_with_multiops_wp_txn \ crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \ whitebox_crash_test_with_tiered_storage \ + whitebox_crash_test_with_optimistic_txn \ + blackbox_crash_test_with_optimistic_txn \ crash_test: $(DB_STRESS_CMD) # Do not parallelize @@ -37,6 +39,11 @@ crash_test_with_txn: $(DB_STRESS_CMD) $(CRASHTEST_MAKE) whitebox_crash_test_with_txn $(CRASHTEST_MAKE) blackbox_crash_test_with_txn +crash_test_with_optimistic_txn: $(DB_STRESS_CMD) +# Do not parallelize + $(CRASHTEST_MAKE) whitebox_crash_test_with_optimistic_txn + $(CRASHTEST_MAKE) blackbox_crash_test_with_optimistic_txn + crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery crash_test_with_ts: $(DB_STRESS_CMD) @@ -80,6 +87,9 @@ blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD) blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD) $(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS) +blackbox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --optimistic_txn blackbox $(CRASH_TEST_EXT_ARGS) + ifeq ($(CRASH_TEST_KILL_ODD),) CRASH_TEST_KILL_ODD=888887 endif @@ -105,3 +115,7 @@ whitebox_crash_test_with_ts: $(DB_STRESS_CMD) whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD) $(CRASHTEST_PY) --test_tiered_storage whitebox --random_kill_odd \ $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) + +whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD) + $(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \ + $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS) diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc index 607403ccc32b..e6dcb6696206 100644 --- a/db/arena_wrapped_db_iter.cc +++ b/db/arena_wrapped_db_iter.cc @@ -19,6 +19,14 @@ namespace ROCKSDB_NAMESPACE { +inline static SequenceNumber GetSeqNum(const DBImpl* db, const Snapshot* s) { + if (s) { + return s->GetSequenceNumber(); + } else { + return db->GetLatestSequenceNumber(); + } +} + Status ArenaWrappedDBIter::GetProperty(std::string prop_name, std::string* prop) { if (prop_name == "rocksdb.iterator.super-version-number") { @@ -47,9 +55,16 @@ void ArenaWrappedDBIter::Init( read_options_ = read_options; allow_refresh_ = allow_refresh; memtable_range_tombstone_iter_ = nullptr; + + if (!CheckFSFeatureSupport(env->GetFileSystem().get(), + FSSupportedOps::kAsyncIO)) { + read_options_.async_io = false; + } } -Status ArenaWrappedDBIter::Refresh() { +Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); } + +Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) { if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) { return Status::NotSupported("Creating renew iterator is not allowed."); } @@ -58,6 +73,10 @@ Status ArenaWrappedDBIter::Refresh() { // correct behavior. Will be corrected automatically when we take a snapshot // here for the case of WritePreparedTxnDB. uint64_t cur_sv_number = cfd_->GetSuperVersionNumber(); + // If we recreate a new internal iterator below (NewInternalIterator()), + // we will pass in read_options_. 
We need to make sure it + // has the right snapshot. + read_options_.snapshot = snapshot; TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1"); TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2"); auto reinit_internal_iter = [&]() { @@ -67,18 +86,19 @@ Status ArenaWrappedDBIter::Refresh() { new (&arena_) Arena(); SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_); - SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + assert(sv->version_number >= cur_sv_number); + SequenceNumber read_seq = GetSeqNum(db_impl_, snapshot); if (read_callback_) { - read_callback_->Refresh(latest_seq); + read_callback_->Refresh(read_seq); } Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options, - sv->current, latest_seq, + sv->current, read_seq, sv->mutable_cf_options.max_sequential_skip_in_iterations, - cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_, + sv->version_number, read_callback_, db_impl_, cfd_, expose_blob_index_, allow_refresh_); InternalIterator* internal_iter = db_impl_->NewInternalIterator( - read_options_, cfd_, sv, &arena_, latest_seq, + read_options_, cfd_, sv, &arena_, read_seq, /* allow_unprepared_value */ true, /* db_iter */ this); SetIterUnderDBIter(internal_iter); }; @@ -87,13 +107,13 @@ Status ArenaWrappedDBIter::Refresh() { reinit_internal_iter(); break; } else { - SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber(); + SequenceNumber read_seq = GetSeqNum(db_impl_, snapshot); // Refresh range-tombstones in MemTable if (!read_options_.ignore_range_deletions) { SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_); TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr); auto t = sv->mem->NewRangeTombstoneIterator( - read_options_, latest_seq, false /* immutable_memtable */); + read_options_, read_seq, false /* immutable_memtable */); if (!t || t->empty()) { // If memtable_range_tombstone_iter_ points to a non-empty tombstone // iterator, then it means sv->mem is not the memtable that @@ -123,9 +143,6 @@ Status ArenaWrappedDBIter::Refresh() { } db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv); } - // Refresh latest sequence number - db_iter_->set_sequence(latest_seq); - db_iter_->set_valid(false); // Check again if the latest super version number is changed uint64_t latest_sv_number = cfd_->GetSuperVersionNumber(); if (latest_sv_number != cur_sv_number) { @@ -134,6 +151,8 @@ Status ArenaWrappedDBIter::Refresh() { cur_sv_number = latest_sv_number; continue; } + db_iter_->set_sequence(read_seq); + db_iter_->set_valid(false); break; } } diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h index f223be1ec658..d30ee45c3a40 100644 --- a/db/arena_wrapped_db_iter.h +++ b/db/arena_wrapped_db_iter.h @@ -80,6 +80,7 @@ class ArenaWrappedDBIter : public Iterator { Status GetProperty(std::string prop_name, std::string* prop) override; Status Refresh() override; + Status Refresh(const Snapshot*) override; void Init(Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions, diff --git a/db/blob/blob_contents.h b/db/blob/blob_contents.h index 18ed27c69253..40b94d51f960 100644 --- a/db/blob/blob_contents.h +++ b/db/blob/blob_contents.h @@ -7,8 +7,8 @@ #include -#include "memory/memory_allocator.h" -#include "rocksdb/cache.h" +#include "memory/memory_allocator_impl.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -46,7 +46,8 @@ class BlobContents { class BlobContentsCreator : public Cache::CreateContext { 
public: static void Create(std::unique_ptr* out, size_t* out_charge, - const Slice& contents, MemoryAllocator* alloc) { + const Slice& contents, CompressionType /*type*/, + MemoryAllocator* alloc) { auto raw = new BlobContents(AllocateAndCopyBlock(contents, alloc), contents.size()); out->reset(raw); diff --git a/db/blob/blob_counting_iterator.h b/db/blob/blob_counting_iterator.h index de549afa22b1..b21651f66f76 100644 --- a/db/blob/blob_counting_iterator.h +++ b/db/blob/blob_counting_iterator.h @@ -123,6 +123,10 @@ class BlobCountingIterator : public InternalIterator { return iter_->GetProperty(prop_name, prop); } + bool IsDeleteRangeSentinelKey() const override { + return iter_->IsDeleteRangeSentinelKey(); + } + private: void UpdateAndCountBlobIfNeeded() { assert(!iter_->Valid() || iter_->status().ok()); diff --git a/db/blob/blob_file_builder.cc b/db/blob/blob_file_builder.cc index 952a5676bff3..35269fdb509d 100644 --- a/db/blob/blob_file_builder.cc +++ b/db/blob/blob_file_builder.cc @@ -259,8 +259,9 @@ Status BlobFileBuilder::CompressBlobIfNeeded( return Status::OK(); } + // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb CompressionOptions opts; - CompressionContext context(blob_compression_type_); + CompressionContext context(blob_compression_type_, opts); constexpr uint64_t sample_for_compression = 0; CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), diff --git a/db/blob/blob_file_builder_test.cc b/db/blob/blob_file_builder_test.cc index 3a0feee457f1..5882e219fe46 100644 --- a/db/blob/blob_file_builder_test.cc +++ b/db/blob/blob_file_builder_test.cc @@ -406,7 +406,7 @@ TEST_F(BlobFileBuilderTest, Compression) { ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1); CompressionOptions opts; - CompressionContext context(kSnappyCompression); + CompressionContext context(kSnappyCompression, opts); constexpr uint64_t sample_for_compression = 0; CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc index 19757946d6d1..5f340aadf555 100644 --- a/db/blob/blob_file_cache.cc +++ b/db/blob/blob_file_cache.cc @@ -25,7 +25,7 @@ BlobFileCache::BlobFileCache(Cache* cache, HistogramImpl* blob_file_read_hist, const std::shared_ptr& io_tracer) : cache_(cache), - mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr), + mutex_(kNumberOfMutexStripes), immutable_options_(immutable_options), file_options_(file_options), column_family_id_(column_family_id), @@ -37,7 +37,7 @@ BlobFileCache::BlobFileCache(Cache* cache, } Status BlobFileCache::GetBlobFileReader( - uint64_t blob_file_number, + const ReadOptions& read_options, uint64_t blob_file_number, CacheHandleGuard* blob_file_reader) { assert(blob_file_reader); assert(blob_file_reader->IsEmpty()); @@ -55,7 +55,7 @@ Status BlobFileCache::GetBlobFileReader( TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck"); // Check again while holding mutex - MutexLock lock(mutex_.get(key)); + MutexLock lock(&mutex_.Get(key)); handle = cache_.Lookup(key); if (handle) { @@ -73,7 +73,7 @@ Status BlobFileCache::GetBlobFileReader( { assert(file_options_); const Status s = BlobFileReader::Create( - *immutable_options_, *file_options_, column_family_id_, + *immutable_options_, read_options, *file_options_, column_family_id_, blob_file_read_hist_, blob_file_number, io_tracer_, &reader); if (!s.ok()) { RecordTick(statistics, NO_FILE_ERRORS); diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h index 
6281897d6010..740e67ada6cc 100644 --- a/db/blob/blob_file_cache.h +++ b/db/blob/blob_file_cache.h @@ -32,7 +32,8 @@ class BlobFileCache { BlobFileCache(const BlobFileCache&) = delete; BlobFileCache& operator=(const BlobFileCache&) = delete; - Status GetBlobFileReader(uint64_t blob_file_number, + Status GetBlobFileReader(const ReadOptions& read_options, + uint64_t blob_file_number, CacheHandleGuard* blob_file_reader); private: @@ -42,7 +43,7 @@ class BlobFileCache { CacheInterface cache_; // Note: mutex_ below is used to guard against multiple threads racing to open // the same file. - Striped mutex_; + Striped> mutex_; const ImmutableOptions* immutable_options_; const FileOptions* file_options_; uint32_t column_family_id_; diff --git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc index d3a61b3c5af0..8c3c56de9b4c 100644 --- a/db/blob/blob_file_cache_test.cc +++ b/db/blob/blob_file_cache_test.cc @@ -118,7 +118,9 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) { // First try: reader should be opened and put in cache CacheHandleGuard first; - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + const ReadOptions read_options; + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &first)); ASSERT_NE(first.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -126,7 +128,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) { // Second try: reader should be served from cache CacheHandleGuard second; - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &second)); ASSERT_NE(second.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -163,19 +166,21 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { CacheHandleGuard first; CacheHandleGuard second; + const ReadOptions read_options; SyncPoint::GetInstance()->SetCallBack( "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { // Disabling sync points to prevent infinite recursion SyncPoint::GetInstance()->DisableProcessing(); - - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, + blob_file_number, &second)); ASSERT_NE(second.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); }); SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &first)); ASSERT_NE(first.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -213,8 +218,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { CacheHandleGuard reader; + const ReadOptions read_options; ASSERT_TRUE( - blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader) + .IsIOError()); ASSERT_EQ(reader.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); @@ -253,8 +260,10 @@ TEST_F(BlobFileCacheTest, 
GetBlobFileReader_CacheFull) { // strict_capacity_limit is set CacheHandleGuard reader; - ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) - .IsMemoryLimit()); + const ReadOptions read_options; + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader) + .IsMemoryLimit()); ASSERT_EQ(reader.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); diff --git a/db/blob/blob_file_completion_callback.h b/db/blob/blob_file_completion_callback.h index ffe65a0ffa28..91596773155a 100644 --- a/db/blob/blob_file_completion_callback.h +++ b/db/blob/blob_file_completion_callback.h @@ -23,32 +23,19 @@ class BlobFileCompletionCallback { const std::vector>& listeners, const std::string& dbname) : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) { -#ifndef ROCKSDB_LITE sst_file_manager_ = sst_file_manager; mutex_ = mutex; error_handler_ = error_handler; -#else - (void)sst_file_manager; - (void)mutex; - (void)error_handler; -#endif // ROCKSDB_LITE } void OnBlobFileCreationStarted(const std::string& file_name, const std::string& column_family_name, int job_id, BlobFileCreationReason creation_reason) { -#ifndef ROCKSDB_LITE // Notify the listeners. EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_, column_family_name, file_name, job_id, creation_reason); -#else - (void)file_name; - (void)column_family_name; - (void)job_id; - (void)creation_reason; -#endif } Status OnBlobFileCompleted(const std::string& file_name, @@ -61,7 +48,6 @@ class BlobFileCompletionCallback { uint64_t blob_count, uint64_t blob_bytes) { Status s; -#ifndef ROCKSDB_LITE auto sfm = static_cast(sst_file_manager_); if (sfm) { // Report new blob files to SstFileManagerImpl @@ -74,7 +60,6 @@ class BlobFileCompletionCallback { error_handler_->SetBGError(s, BackgroundErrorReason::kFlush); } } -#endif // !ROCKSDB_LITE // Notify the listeners. 
EventHelpers::LogAndNotifyBlobFileCreationFinished( @@ -89,11 +74,9 @@ class BlobFileCompletionCallback { } private: -#ifndef ROCKSDB_LITE SstFileManager* sst_file_manager_; InstrumentedMutex* mutex_; ErrorHandler* error_handler_; -#endif // ROCKSDB_LITE EventLogger* event_logger_; std::vector> listeners_; std::string dbname_; diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index da7f2bb12eb1..0c30efbc119f 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -12,7 +12,7 @@ #include "db/blob/blob_log_format.h" #include "file/file_prefetch_buffer.h" #include "file/filename.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "options/cf_options.h" #include "rocksdb/file_system.h" #include "rocksdb/slice.h" @@ -26,9 +26,10 @@ namespace ROCKSDB_NAMESPACE { Status BlobFileReader::Create( - const ImmutableOptions& immutable_options, const FileOptions& file_options, - uint32_t column_family_id, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, const std::shared_ptr& io_tracer, + const ImmutableOptions& immutable_options, const ReadOptions& read_options, + const FileOptions& file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, std::unique_ptr* blob_file_reader) { assert(blob_file_reader); assert(!*blob_file_reader); @@ -52,15 +53,17 @@ Status BlobFileReader::Create( CompressionType compression_type = kNoCompression; { - const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, - &compression_type); + const Status s = + ReadHeader(file_reader.get(), read_options, column_family_id, + statistics, &compression_type); if (!s.ok()) { return s; } } { - const Status s = ReadFooter(file_reader.get(), file_size, statistics); + const Status s = + ReadFooter(file_reader.get(), read_options, file_size, statistics); if (!s.ok()) { return s; } @@ -134,6 +137,7 @@ Status BlobFileReader::OpenFile( } Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint32_t column_family_id, Statistics* statistics, CompressionType* compression_type) { @@ -150,10 +154,9 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, constexpr uint64_t read_offset = 0; constexpr size_t read_size = BlobLogHeader::kSize; - // TODO: rate limit reading headers from blob files. - const Status s = ReadFromFile(file_reader, read_offset, read_size, - statistics, &header_slice, &buf, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &header_slice, &buf, &aligned_buf); if (!s.ok()) { return s; } @@ -187,6 +190,7 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, } Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t file_size, Statistics* statistics) { assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); assert(file_reader); @@ -201,10 +205,9 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, const uint64_t read_offset = file_size - BlobLogFooter::kSize; constexpr size_t read_size = BlobLogFooter::kSize; - // TODO: rate limit reading footers from blob files. 
- const Status s = ReadFromFile(file_reader, read_offset, read_size, - statistics, &footer_slice, &buf, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &footer_slice, &buf, &aligned_buf); if (!s.ok()) { return s; } @@ -232,10 +235,10 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, } Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t read_offset, size_t read_size, Statistics* statistics, Slice* slice, - Buffer* buf, AlignedBuf* aligned_buf, - Env::IOPriority rate_limiter_priority) { + Buffer* buf, AlignedBuf* aligned_buf) { assert(slice); assert(buf); assert(aligned_buf); @@ -246,17 +249,23 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, Status s; + IOOptions io_options; + s = file_reader->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } + if (file_reader->use_direct_io()) { constexpr char* scratch = nullptr; - s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, - aligned_buf, rate_limiter_priority); + s = file_reader->Read(io_options, read_offset, read_size, slice, scratch, + aligned_buf); } else { buf->reset(new char[read_size]); constexpr AlignedBuf* aligned_scratch = nullptr; - s = file_reader->Read(IOOptions(), read_offset, read_size, slice, - buf->get(), aligned_scratch, rate_limiter_priority); + s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(), + aligned_scratch); } if (!s.ok()) { @@ -324,10 +333,14 @@ Status BlobFileReader::GetBlob( Status s; constexpr bool for_compaction = true; + IOOptions io_options; + s = file_reader_->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } prefetched = prefetch_buffer->TryReadFromCache( - IOOptions(), file_reader_.get(), record_offset, - static_cast(record_size), &record_slice, &s, - read_options.rate_limiter_priority, for_compaction); + io_options, file_reader_.get(), record_offset, + static_cast(record_size), &record_slice, &s, for_compaction); if (!s.ok()) { return s; } @@ -338,10 +351,10 @@ Status BlobFileReader::GetBlob( PERF_COUNTER_ADD(blob_read_count, 1); PERF_COUNTER_ADD(blob_read_byte, record_size); PERF_TIMER_GUARD(blob_read_time); - const Status s = ReadFromFile(file_reader_.get(), record_offset, - static_cast(record_size), statistics_, - &record_slice, &buf, &aligned_buf, - read_options.rate_limiter_priority); + const Status s = + ReadFromFile(file_reader_.get(), read_options, record_offset, + static_cast(record_size), statistics_, + &record_slice, &buf, &aligned_buf); if (!s.ok()) { return s; } @@ -420,11 +433,11 @@ void BlobFileReader::MultiGetBlob( assert(req->offset >= adjustment); adjustments.push_back(adjustment); - FSReadRequest read_req = {}; + FSReadRequest read_req; read_req.offset = req->offset - adjustment; read_req.len = req->len + adjustment; - read_reqs.emplace_back(read_req); total_len += read_req.len; + read_reqs.emplace_back(std::move(read_req)); } RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len); @@ -449,9 +462,12 @@ void BlobFileReader::MultiGetBlob( TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile"); PERF_COUNTER_ADD(blob_read_count, num_blobs); PERF_COUNTER_ADD(blob_read_byte, total_len); - s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(), - direct_io ? 
&aligned_buf : nullptr, - read_options.rate_limiter_priority); + IOOptions opts; + s = file_reader_->PrepareIOOptions(read_options, opts); + if (s.ok()) { + s = file_reader_->MultiRead(opts, read_reqs.data(), read_reqs.size(), + direct_io ? &aligned_buf : nullptr); + } if (!s.ok()) { for (auto& req : read_reqs) { req.status.PermitUncheckedError(); @@ -569,7 +585,8 @@ Status BlobFileReader::UncompressBlobIfNeeded( assert(result); if (compression_type == kNoCompression) { - BlobContentsCreator::Create(result, nullptr, value_slice, allocator); + BlobContentsCreator::Create(result, nullptr, value_slice, kNoCompression, + allocator); return Status::OK(); } diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h index 75b756da157a..fa8aa501d45f 100644 --- a/db/blob/blob_file_reader.h +++ b/db/blob/blob_file_reader.h @@ -29,6 +29,7 @@ class Statistics; class BlobFileReader { public: static Status Create(const ImmutableOptions& immutable_options, + const ReadOptions& read_options, const FileOptions& file_options, uint32_t column_family_id, HistogramImpl* blob_file_read_hist, @@ -74,19 +75,21 @@ class BlobFileReader { std::unique_ptr* file_reader); static Status ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint32_t column_family_id, Statistics* statistics, CompressionType* compression_type); static Status ReadFooter(const RandomAccessFileReader* file_reader, - uint64_t file_size, Statistics* statistics); + const ReadOptions& read_options, uint64_t file_size, + Statistics* statistics); using Buffer = std::unique_ptr; static Status ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t read_offset, size_t read_size, Statistics* statistics, Slice* slice, Buffer* buf, - AlignedBuf* aligned_buf, - Env::IOPriority rate_limiter_priority); + AlignedBuf* aligned_buf); static Status VerifyBlob(const Slice& record_slice, const Slice& user_key, uint64_t value_size); diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index 03458e2b5ba2..b6049d1ef5f2 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -74,7 +74,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, } } else { CompressionOptions opts; - CompressionContext context(compression); + CompressionContext context(compression, opts); constexpr uint64_t sample_for_compression = 0; CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), compression, sample_for_compression); @@ -172,12 +172,12 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { std::unique_ptr reader; + ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification - ReadOptions read_options; read_options.verify_checksums = false; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; @@ -479,11 +479,11 @@ TEST_F(BlobFileReaderTest, Malformed) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + 
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -513,11 +513,11 @@ TEST_F(BlobFileReaderTest, TTL) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -552,11 +552,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -591,11 +591,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -629,9 +629,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { std::unique_ptr reader; constexpr uint32_t incorrect_column_family_id = 2; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - incorrect_column_family_id, + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), incorrect_column_family_id, blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader) .IsCorruption()); @@ -664,10 +664,10 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { @@ -728,13 +728,12 @@ TEST_F(BlobFileReaderTest, Compression) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification - ReadOptions read_options; read_options.verify_checksums = false; constexpr 
FilePrefetchBuffer* prefetch_buffer = nullptr; @@ -803,10 +802,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { @@ -895,10 +894,10 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; const Status s = BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); @@ -983,10 +982,10 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; const Status s = BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; diff --git a/db/blob/blob_log_sequential_reader.cc b/db/blob/blob_log_sequential_reader.cc index 778725189712..579c98e295d3 100644 --- a/db/blob/blob_log_sequential_reader.cc +++ b/db/blob/blob_log_sequential_reader.cc @@ -7,7 +7,7 @@ #include "db/blob/blob_log_sequential_reader.h" #include "file/random_access_file_reader.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "util/stop_watch.h" namespace ROCKSDB_NAMESPACE { @@ -29,9 +29,8 @@ Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice, StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?) 
- Status s = - file_->Read(IOOptions(), next_byte_, static_cast(size), slice, - buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + Status s = file_->Read(IOOptions(), next_byte_, static_cast(size), + slice, buf, nullptr); next_byte_ += size; if (!s.ok()) { return s; diff --git a/db/blob/blob_log_writer.cc b/db/blob/blob_log_writer.cc index 9dbac7f259fc..bf5ef27c1d64 100644 --- a/db/blob/blob_log_writer.cc +++ b/db/blob/blob_log_writer.cc @@ -10,7 +10,7 @@ #include "db/blob/blob_log_format.h" #include "file/writable_file_writer.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/coding.h" diff --git a/db/blob/blob_source.cc b/db/blob/blob_source.cc index 19cfb1f89a1b..b524982e532f 100644 --- a/db/blob/blob_source.cc +++ b/db/blob/blob_source.cc @@ -13,7 +13,7 @@ #include "db/blob/blob_contents.h" #include "db/blob/blob_file_reader.h" #include "db/blob/blob_log_format.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "options/cf_options.h" #include "table/get_context.h" #include "table/multiget_context.h" @@ -30,7 +30,6 @@ BlobSource::BlobSource(const ImmutableOptions* immutable_options, blob_file_cache_(blob_file_cache), blob_cache_(immutable_options->blob_cache), lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) { -#ifndef ROCKSDB_LITE auto bbto = immutable_options->table_factory->GetOptions(); if (bbto && @@ -39,7 +38,6 @@ BlobSource::BlobSource(const ImmutableOptions* immutable_options, blob_cache_ = SharedCacheInterface{std::make_shared( immutable_options->blob_cache, bbto->block_cache)}; } -#endif // ROCKSDB_LITE } BlobSource::~BlobSource() = default; @@ -106,9 +104,9 @@ Status BlobSource::PutBlobIntoCache( } BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const { - return blob_cache_.LookupFull( - key, nullptr /* context */, Cache::Priority::BOTTOM, - true /* wait_for_cache */, statistics_, lowest_used_cache_tier_); + return blob_cache_.LookupFull(key, nullptr /* context */, + Cache::Priority::BOTTOM, statistics_, + lowest_used_cache_tier_); } void BlobSource::PinCachedBlob(CacheHandleGuard* cached_blob, @@ -211,7 +209,8 @@ Status BlobSource::GetBlob(const ReadOptions& read_options, { CacheHandleGuard blob_file_reader; - s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + s = blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); if (!s.ok()) { return s; } @@ -374,8 +373,8 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, } CacheHandleGuard blob_file_reader; - Status s = - blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); if (!s.ok()) { for (size_t i = 0; i < _blob_reqs.size(); ++i) { BlobReadRequest* const req = _blob_reqs[i].first; diff --git a/db/blob/blob_source.h b/db/blob/blob_source.h index cdc218747843..d5e009b54d36 100644 --- a/db/blob/blob_source.h +++ b/db/blob/blob_source.h @@ -95,9 +95,9 @@ class BlobSource { uint64_t* bytes_read); inline Status GetBlobFileReader( - uint64_t blob_file_number, + const ReadOptions& read_options, uint64_t blob_file_number, CacheHandleGuard* blob_file_reader) { - return blob_file_cache_->GetBlobFileReader(blob_file_number, + return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number, blob_file_reader); } diff --git 
a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index 4a1ba84eeabe..c0e1aba6ec04 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -76,7 +76,7 @@ void WriteBlobFile(const ImmutableOptions& immutable_options, } } else { CompressionOptions opts; - CompressionContext context(compression); + CompressionContext context(compression, opts); constexpr uint64_t sample_for_compression = 0; CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), compression, sample_for_compression); @@ -517,7 +517,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) { compression, blob_offsets, blob_sizes); CacheHandleGuard blob_file_reader; - ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader)); + ASSERT_OK(blob_source.GetBlobFileReader(read_options, file_number, + &blob_file_reader)); ASSERT_NE(blob_file_reader.GetValue(), nullptr); const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); @@ -1139,12 +1140,13 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { blob_file_cache.get()); CacheHandleGuard file_reader; - ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader)); + ReadOptions read_options; + ASSERT_OK( + blob_source.GetBlobFileReader(read_options, file_number, &file_reader)); ASSERT_NE(file_reader.GetValue(), nullptr); const uint64_t file_size = file_reader.GetValue()->GetFileSize(); ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression); - ReadOptions read_options; read_options.verify_checksums = true; auto blob_cache = options_.blob_cache; @@ -1214,12 +1216,12 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { ASSERT_EQ(handle0, nullptr); // key0's item should be in the secondary cache. - bool is_in_sec_cache = false; + bool kept_in_sec_cache = false; auto sec_handle0 = secondary_cache->Lookup( - key0, &BlobSource::SharedCacheInterface::kFullHelper, + key0, BlobSource::SharedCacheInterface::GetFullHelper(), /*context*/ nullptr, true, - /*advise_erase=*/true, is_in_sec_cache); - ASSERT_FALSE(is_in_sec_cache); + /*advise_erase=*/true, kept_in_sec_cache); + ASSERT_FALSE(kept_in_sec_cache); ASSERT_NE(sec_handle0, nullptr); ASSERT_TRUE(sec_handle0->IsReady()); auto value = static_cast(sec_handle0->Value()); @@ -1242,12 +1244,12 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { ASSERT_NE(handle1, nullptr); blob_cache->Release(handle1); - bool is_in_sec_cache = false; + bool kept_in_sec_cache = false; auto sec_handle1 = secondary_cache->Lookup( - key1, &BlobSource::SharedCacheInterface::kFullHelper, + key1, BlobSource::SharedCacheInterface::GetFullHelper(), /*context*/ nullptr, true, - /*advise_erase=*/true, is_in_sec_cache); - ASSERT_FALSE(is_in_sec_cache); + /*advise_erase=*/true, kept_in_sec_cache); + ASSERT_FALSE(kept_in_sec_cache); ASSERT_EQ(sec_handle1, nullptr); ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size, @@ -1372,7 +1374,7 @@ class BlobSourceCacheReservationTest : public DBTestBase { static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl< CacheEntryRole::kBlobCache>::GetDummyEntrySize(); - static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry; + static constexpr std::size_t kCacheCapacity = 2 * kSizeDummyEntry; static constexpr int kNumShardBits = 0; // 2^0 shard static constexpr uint32_t kColumnFamilyId = 1; @@ -1391,7 +1393,6 @@ class BlobSourceCacheReservationTest : public DBTestBase { std::string db_session_id_; }; -#ifndef ROCKSDB_LITE TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { 
options_.cf_paths.emplace_back( test::PerThreadDBPath( @@ -1506,11 +1507,10 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) { } } -TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) { +TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) { options_.cf_paths.emplace_back( test::PerThreadDBPath( - env_, - "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"), + env_, "BlobSourceCacheReservationTest_IncreaseCacheReservation"), 0); GenerateKeysAndBlobs(); @@ -1518,7 +1518,7 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) { DestroyAndReopen(options_); ImmutableOptions immutable_options(options_); - constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2); + constexpr size_t blob_size = 24 << 10; // 24KB for (size_t i = 0; i < kNumBlobs; ++i) { blob_file_size_ -= blobs_[i].size(); // old blob size blob_strs_[i].resize(blob_size, '@'); @@ -1576,11 +1576,6 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) { std::vector values(keys_.size()); - // Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we - // can't fit all the blobs in the cache at the same time, which means we - // should observe cache evictions once we reach the cache's capacity. - // Due to the overhead of the cache and the BlobContents objects, as well as - // jemalloc bin sizes, this happens after inserting seven blobs. uint64_t blob_bytes = 0; for (size_t i = 0; i < kNumBlobs; ++i) { ASSERT_OK(blob_source.GetBlob( @@ -1591,22 +1586,21 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) { // Release cache handle values[i].Reset(); - if (i < kNumBlobs / 2 - 1) { - size_t charge = 0; - ASSERT_TRUE(blob_source.TEST_BlobInCache( - kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge)); + size_t charge = 0; + ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_, + blob_offsets[i], &charge)); - blob_bytes += charge; - } + blob_bytes += charge; - ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry); + ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), + (blob_bytes <= kSizeDummyEntry) ? 
kSizeDummyEntry + : (2 * kSizeDummyEntry)); ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes); ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), options_.blob_cache->GetUsage()); } } } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/blob/db_blob_basic_test.cc b/db/blob/db_blob_basic_test.cc index e6832a2ae448..1c0caba93d95 100644 --- a/db/blob/db_blob_basic_test.cc +++ b/db/blob/db_blob_basic_test.cc @@ -11,6 +11,7 @@ #include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/db_test_util.h" +#include "db/db_with_timestamp_test_util.h" #include "port/stack_trace.h" #include "test_util/sync_point.h" #include "utilities/fault_injection_env.h" @@ -167,6 +168,7 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCache) { ASSERT_EQ(iter->value().ToString(), blobs[i]); ++i; } + ASSERT_OK(iter->status()); ASSERT_EQ(i, num_blobs); ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); } @@ -202,6 +204,7 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCache) { ASSERT_EQ(iter->value().ToString(), blobs[i]); ++i; } + ASSERT_OK(iter->status()); ASSERT_EQ(i, num_blobs); ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), num_blobs); @@ -223,6 +226,7 @@ TEST_F(DBBlobBasicTest, IterateBlobsFromCache) { ASSERT_EQ(iter->value().ToString(), blobs[i]); ++i; } + ASSERT_OK(iter->status()); ASSERT_EQ(i, num_blobs); ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0); } @@ -584,7 +588,6 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) { } } -#ifndef ROCKSDB_LITE TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { Options options = GetDefaultOptions(); @@ -773,7 +776,6 @@ TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) { ASSERT_EQ(values[2], second_blob); } } -#endif // !ROCKSDB_LITE TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) { Options options = GetDefaultOptions(); @@ -1062,7 +1064,6 @@ TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) { .IsCorruption()); } -#ifndef ROCKSDB_LITE TEST_F(DBBlobBasicTest, GenerateIOTracing) { Options options = GetDefaultOptions(); options.enable_blob_files = true; @@ -1117,7 +1118,6 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) { ASSERT_GT(blob_files_op_count, 2); } } -#endif // !ROCKSDB_LITE TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) { Options options = GetDefaultOptions(); @@ -1219,7 +1219,6 @@ TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) { ASSERT_EQ(values[2], "v2_0"); } -#ifndef ROCKSDB_LITE TEST_F(DBBlobBasicTest, Properties) { Options options = GetDefaultOptions(); options.enable_blob_files = true; @@ -1382,7 +1381,6 @@ TEST_F(DBBlobBasicTest, PropertiesMultiVersion) { BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) + blob_size + BlobLogFooter::kSize)); } -#endif // !ROCKSDB_LITE class DBBlobBasicIOErrorTest : public DBBlobBasicTest, public testing::WithParamInterface { @@ -1632,7 +1630,6 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) { options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); } -#ifndef ROCKSDB_LITE TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) { Options options = GetDefaultOptions(); @@ -1700,7 +1697,6 @@ TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) { /*end=*/nullptr)); EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD)); } -#endif // !ROCKSDB_LITE TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) { CompressedSecondaryCacheOptions secondary_cache_opts; @@ -1779,6 +1775,466 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) { 1); } 
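// Illustrative sketch only -- not part of this patch. The new tests below
// cover wide-column reads (GetEntity / MultiGetEntity) for values that live
// in blob files; at the API level the call shape is the same as for plain
// values. A minimal application-level sketch, assuming `db` was opened with
// enable_blob_files=true (the helper name is hypothetical):
#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"
using namespace ROCKSDB_NAMESPACE;  // for brevity in this sketch

Status ReadEntity(DB* db, const Slice& key) {
  PinnableWideColumns result;
  Status s =
      db->GetEntity(ReadOptions(), db->DefaultColumnFamily(), key, &result);
  if (!s.ok()) {
    return s;
  }
  for (const WideColumn& column : result.columns()) {
    // A value written with plain Put() surfaces as a single column named
    // kDefaultWideColumnName, which is what the assertions below expect.
    (void)column;  // column.name() / column.value()
  }
  return s;
}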
+TEST_F(DBBlobBasicTest, GetEntityBlob) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = 0; + + Reopen(options); + + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + constexpr char other_key[] = "other_key"; + constexpr char other_blob_value[] = "other_blob_value"; + + ASSERT_OK(Put(key, blob_value)); + ASSERT_OK(Put(other_key, other_blob_value)); + + ASSERT_OK(Flush()); + + WideColumns expected_columns{{kDefaultWideColumnName, blob_value}}; + WideColumns other_expected_columns{ + {kDefaultWideColumnName, other_blob_value}}; + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), key, + &result)); + ASSERT_EQ(result.columns(), expected_columns); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + other_key, &result)); + + ASSERT_EQ(result.columns(), other_expected_columns); + } + + { + constexpr size_t num_keys = 2; + + std::array keys{{key, other_key}}; + std::array results; + std::array statuses; + + db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + &keys[0], &results[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(results[0].columns(), expected_columns); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(results[1].columns(), other_expected_columns); + } +} + +class DBBlobWithTimestampTest : public DBBasicTestWithTimestampBase { + protected: + DBBlobWithTimestampTest() + : DBBasicTestWithTimestampBase("db_blob_with_timestamp_test") {} +}; + +TEST_F(DBBlobWithTimestampTest, GetBlob) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.enable_blob_files = true; + options.min_blob_size = 0; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + DestroyAndReopen(options); + WriteOptions write_opts; + const std::string ts = Timestamp(1, 0); + constexpr char key[] = "key"; + constexpr char blob_value[] = "blob_value"; + + ASSERT_OK(db_->Put(write_opts, key, ts, blob_value)); + + ASSERT_OK(Flush()); + + const std::string read_ts = Timestamp(2, 0); + Slice read_ts_slice(read_ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::string value; + ASSERT_OK(db_->Get(read_opts, key, &value)); + ASSERT_EQ(value, blob_value); +} + +TEST_F(DBBlobWithTimestampTest, MultiGetBlobs) { + constexpr size_t min_blob_size = 6; + + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.min_blob_size = min_blob_size; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + DestroyAndReopen(options); + + // Put then retrieve three key-values. The first value is below the size limit + // and is thus stored inline; the other two are stored separately as blobs. 
+ constexpr size_t num_keys = 3; + + constexpr char first_key[] = "first_key"; + constexpr char first_value[] = "short"; + static_assert(sizeof(first_value) - 1 < min_blob_size, + "first_value too long to be inlined"); + + DestroyAndReopen(options); + WriteOptions write_opts; + const std::string ts = Timestamp(1, 0); + ASSERT_OK(db_->Put(write_opts, first_key, ts, first_value)); + + constexpr char second_key[] = "second_key"; + constexpr char second_value[] = "long_value"; + static_assert(sizeof(second_value) - 1 >= min_blob_size, + "second_value too short to be stored as blob"); + + ASSERT_OK(db_->Put(write_opts, second_key, ts, second_value)); + + constexpr char third_key[] = "third_key"; + constexpr char third_value[] = "other_long_value"; + static_assert(sizeof(third_value) - 1 >= min_blob_size, + "third_value too short to be stored as blob"); + + ASSERT_OK(db_->Put(write_opts, third_key, ts, third_value)); + + ASSERT_OK(Flush()); + + ReadOptions read_options; + const std::string read_ts = Timestamp(2, 0); + Slice read_ts_slice(read_ts); + read_options.timestamp = &read_ts_slice; + std::array keys{{first_key, second_key, third_key}}; + + { + std::array values; + std::array statuses; + + db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_value); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_value); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_value); + } +} + +TEST_F(DBBlobWithTimestampTest, GetMergeBlobWithPut) { + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + DestroyAndReopen(options); + + WriteOptions write_opts; + const std::string ts = Timestamp(1, 0); + ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1")); + ASSERT_OK(Flush()); + ASSERT_OK( + db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v2")); + ASSERT_OK(Flush()); + ASSERT_OK( + db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v3")); + ASSERT_OK(Flush()); + + std::string value; + const std::string read_ts = Timestamp(2, 0); + Slice read_ts_slice(read_ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + ASSERT_OK(db_->Get(read_opts, "Key1", &value)); + ASSERT_EQ(value, "v1,v2,v3"); +} + +TEST_F(DBBlobWithTimestampTest, MultiGetMergeBlobWithPut) { + constexpr size_t num_keys = 3; + + Options options = GetDefaultOptions(); + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + options.enable_blob_files = true; + options.min_blob_size = 0; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + DestroyAndReopen(options); + + WriteOptions write_opts; + const std::string ts = Timestamp(1, 0); + + ASSERT_OK(db_->Put(write_opts, "Key0", ts, "v0_0")); + ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1_0")); + ASSERT_OK(db_->Put(write_opts, "Key2", ts, "v2_0")); + ASSERT_OK(Flush()); + ASSERT_OK( + db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_1")); + ASSERT_OK( + db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v1_1")); + ASSERT_OK(Flush()); + ASSERT_OK( + db_->Merge(write_opts, 
db_->DefaultColumnFamily(), "Key0", ts, "v0_2")); + ASSERT_OK(Flush()); + + const std::string read_ts = Timestamp(2, 0); + Slice read_ts_slice(read_ts); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::array keys{{"Key0", "Key1", "Key2"}}; + std::array values; + std::array statuses; + + db_->MultiGet(read_opts, db_->DefaultColumnFamily(), num_keys, &keys[0], + &values[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], "v0_0,v0_1,v0_2"); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], "v1_0,v1_1"); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], "v2_0"); +} + +TEST_F(DBBlobWithTimestampTest, IterateBlobs) { + Options options = GetDefaultOptions(); + options.enable_blob_files = true; + options.create_if_missing = true; + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + + DestroyAndReopen(options); + + int num_blobs = 5; + std::vector keys; + std::vector blobs; + + WriteOptions write_opts; + std::vector write_timestamps = {Timestamp(1, 0), + Timestamp(2, 0)}; + + // For each key in ["key0", ... "keyi", ...], write two versions: + // Timestamp(1, 0), "blobi0" + // Timestamp(2, 0), "blobi1" + for (int i = 0; i < num_blobs; i++) { + keys.push_back("key" + std::to_string(i)); + blobs.push_back("blob" + std::to_string(i)); + for (size_t j = 0; j < write_timestamps.size(); j++) { + ASSERT_OK(db_->Put(write_opts, keys[i], write_timestamps[j], + blobs[i] + std::to_string(j))); + } + } + ASSERT_OK(Flush()); + + ReadOptions read_options; + std::vector read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)}; + Slice ts_upper_bound(read_timestamps[1]); + read_options.timestamp = &ts_upper_bound; + + auto check_iter_entry = + [](const Iterator* iter, const std::string& expected_key, + const std::string& expected_ts, const std::string& expected_value, + bool key_is_internal = true) { + ASSERT_OK(iter->status()); + if (key_is_internal) { + std::string expected_ukey_and_ts; + expected_ukey_and_ts.assign(expected_key.data(), expected_key.size()); + expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size()); + + ParsedInternalKey parsed_ikey; + ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey, + true /* log_err_key */)); + ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts); + } else { + ASSERT_EQ(iter->key(), expected_key); + } + ASSERT_EQ(iter->timestamp(), expected_ts); + ASSERT_EQ(iter->value(), expected_value); + }; + + // Forward iterating one version of each key, get in this order: + // [("key0", Timestamp(2, 0), "blob01"), + // ("key1", Timestamp(2, 0), "blob11")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = 0; i < num_blobs; i++) { + check_iter_entry(iter.get(), keys[i], write_timestamps[1], + blobs[i] + std::to_string(1), /*key_is_internal*/ false); + iter->Next(); + } + } + + // Forward iteration, then reverse to backward. 
+ { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = 0; i < num_blobs * 2 - 1; i++) { + if (i < num_blobs) { + check_iter_entry(iter.get(), keys[i], write_timestamps[1], + blobs[i] + std::to_string(1), + /*key_is_internal*/ false); + if (i != num_blobs - 1) { + iter->Next(); + } + } else { + if (i != num_blobs) { + check_iter_entry(iter.get(), keys[num_blobs * 2 - 1 - i], + write_timestamps[1], + blobs[num_blobs * 2 - 1 - i] + std::to_string(1), + /*key_is_internal*/ false); + } + iter->Prev(); + } + } + } + + // Backward iterating one versions of each key, get in this order: + // [("key4", Timestamp(2, 0), "blob41"), + // ("key3", Timestamp(2, 0), "blob31")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = 0; i < num_blobs; i++) { + check_iter_entry(iter.get(), keys[num_blobs - 1 - i], write_timestamps[1], + blobs[num_blobs - 1 - i] + std::to_string(1), + /*key_is_internal*/ false); + iter->Prev(); + } + ASSERT_OK(iter->status()); + } + + // Backward iteration, then reverse to forward. + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = 0; i < num_blobs * 2 - 1; i++) { + if (i < num_blobs) { + check_iter_entry(iter.get(), keys[num_blobs - 1 - i], + write_timestamps[1], + blobs[num_blobs - 1 - i] + std::to_string(1), + /*key_is_internal*/ false); + if (i != num_blobs - 1) { + iter->Prev(); + } + } else { + if (i != num_blobs) { + check_iter_entry(iter.get(), keys[i - num_blobs], write_timestamps[1], + blobs[i - num_blobs] + std::to_string(1), + /*key_is_internal*/ false); + } + iter->Next(); + } + } + } + + Slice ts_lower_bound(read_timestamps[0]); + read_options.iter_start_ts = &ts_lower_bound; + // Forward iterating multiple versions of the same key, get in this order: + // [("key0", Timestamp(2, 0), "blob01"), + // ("key0", Timestamp(1, 0), "blob00"), + // ("key1", Timestamp(2, 0), "blob11")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = 0; i < num_blobs; i++) { + for (size_t j = write_timestamps.size(); j > 0; --j) { + check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1], + blobs[i] + std::to_string(j - 1)); + iter->Next(); + } + } + ASSERT_OK(iter->status()); + } + + // Backward iterating multiple versions of the same key, get in this order: + // [("key4", Timestamp(1, 0), "blob00"), + // ("key4", Timestamp(2, 0), "blob01"), + // ("key3", Timestamp(1, 0), "blob10")...] + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = num_blobs; i > 0; i--) { + for (size_t j = 0; j < write_timestamps.size(); j++) { + check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j], + blobs[i - 1] + std::to_string(j)); + iter->Prev(); + } + } + ASSERT_OK(iter->status()); + } + + int upper_bound_idx = num_blobs - 2; + int lower_bound_idx = 1; + Slice upper_bound_slice(keys[upper_bound_idx]); + Slice lower_bound_slice(keys[lower_bound_idx]); + read_options.iterate_upper_bound = &upper_bound_slice; + read_options.iterate_lower_bound = &lower_bound_slice; + + // Forward iteration with upper and lower bound. 
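// Illustrative aside, not part of this patch; the bounded-iteration blocks
// that the comment above introduces continue right after this sketch. The
// multi-version passes above rely on ReadOptions::timestamp acting as the
// newest visible timestamp and ReadOptions::iter_start_ts as the oldest one;
// with both set, the iterator surfaces every version in that window, the key
// comes back in internal-key form (hence the ParseInternalKey calls above),
// and iter->timestamp() distinguishes the versions. A minimal caller-side
// sketch under those assumptions (the function name is hypothetical):
#include <cassert>
#include <memory>
#include "rocksdb/db.h"

void ScanAllVersions(ROCKSDB_NAMESPACE::DB* db,
                     const ROCKSDB_NAMESPACE::Slice& newest_ts,
                     const ROCKSDB_NAMESPACE::Slice& oldest_ts) {
  ROCKSDB_NAMESPACE::ReadOptions read_options;
  read_options.timestamp = &newest_ts;      // upper bound on visibility
  read_options.iter_start_ts = &oldest_ts;  // lower bound on visibility
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> iter(
      db->NewIterator(read_options));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // iter->key() (internal form), iter->timestamp() and iter->value()
    // together describe one version of one user key.
  }
  assert(iter->status().ok());
}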
+ { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToFirst(); + for (int i = lower_bound_idx; i < upper_bound_idx; i++) { + for (size_t j = write_timestamps.size(); j > 0; --j) { + check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1], + blobs[i] + std::to_string(j - 1)); + iter->Next(); + } + } + ASSERT_OK(iter->status()); + } + + // Backward iteration with upper and lower bound. + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + for (int i = upper_bound_idx; i > lower_bound_idx; i--) { + for (size_t j = 0; j < write_timestamps.size(); j++) { + check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j], + blobs[i - 1] + std::to_string(j)); + iter->Prev(); + } + } + ASSERT_OK(iter->status()); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/blob/db_blob_compaction_test.cc b/db/blob/db_blob_compaction_test.cc index f3fe3c03bc36..14a3155e251b 100644 --- a/db/blob/db_blob_compaction_test.cc +++ b/db/blob/db_blob_compaction_test.cc @@ -16,7 +16,6 @@ class DBBlobCompactionTest : public DBTestBase { explicit DBBlobCompactionTest() : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {} -#ifndef ROCKSDB_LITE const std::vector& GetCompactionStats() { VersionSet* const versions = dbfull()->GetVersionSet(); assert(versions); @@ -30,7 +29,6 @@ class DBBlobCompactionTest : public DBTestBase { return internal_stats->TEST_GetCompactionStats(); } -#endif // ROCKSDB_LITE }; namespace { @@ -250,7 +248,6 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) { ASSERT_OK(db_->Get(ReadOptions(), long_key, &value)); ASSERT_EQ("value", value); -#ifndef ROCKSDB_LITE const auto& compaction_stats = GetCompactionStats(); ASSERT_GE(compaction_stats.size(), 2); @@ -258,7 +255,6 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) { // this involves neither reading nor writing blobs ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); -#endif // ROCKSDB_LITE Close(); } @@ -299,7 +295,6 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) { ASSERT_EQ(long_value, value); } -#ifndef ROCKSDB_LITE const auto& compaction_stats = GetCompactionStats(); ASSERT_GE(compaction_stats.size(), 2); @@ -307,12 +302,10 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) { // this involves reading but not writing blobs ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); -#endif // ROCKSDB_LITE Close(); } -#ifndef ROCKSDB_LITE TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) { Options options = GetDefaultOptions(); @@ -388,7 +381,6 @@ TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) { Close(); } -#endif TEST_F(DBBlobCompactionTest, BlindWriteFilter) { Options options = GetDefaultOptions(); @@ -413,7 +405,6 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) { ASSERT_EQ(new_blob_value, Get(key)); } -#ifndef ROCKSDB_LITE const auto& compaction_stats = GetCompactionStats(); ASSERT_GE(compaction_stats.size(), 2); @@ -421,7 +412,6 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) { // this involves writing but not reading blobs ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0); ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); -#endif // ROCKSDB_LITE Close(); } @@ -540,7 +530,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) { ASSERT_EQ(kv.second + std::string(padding), Get(kv.first)); } -#ifndef ROCKSDB_LITE const auto& compaction_stats = 
GetCompactionStats(); ASSERT_GE(compaction_stats.size(), 2); @@ -548,7 +537,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) { // this involves reading and writing blobs ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); ASSERT_GT(compaction_stats[1].bytes_written_blob, 0); -#endif // ROCKSDB_LITE Close(); } @@ -606,7 +594,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { /*end=*/nullptr)); ASSERT_EQ(blob_files, GetBlobFileNumbers()); -#ifndef ROCKSDB_LITE const auto& compaction_stats = GetCompactionStats(); ASSERT_GE(compaction_stats.size(), 2); @@ -614,7 +601,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) { // this involves reading but not writing blobs ASSERT_GT(compaction_stats[1].bytes_read_blob, 0); ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0); -#endif // ROCKSDB_LITE Close(); } diff --git a/db/blob/db_blob_corruption_test.cc b/db/blob/db_blob_corruption_test.cc index 7ac7ce3fc703..694b25b2ecad 100644 --- a/db/blob/db_blob_corruption_test.cc +++ b/db/blob/db_blob_corruption_test.cc @@ -34,7 +34,6 @@ class DBBlobCorruptionTest : public DBTestBase { } }; -#ifndef ROCKSDB_LITE TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { Options options = GetDefaultOptions(); options.enable_blob_files = true; @@ -71,7 +70,6 @@ TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/blob/db_blob_index_test.cc b/db/blob/db_blob_index_test.cc index 64c550894122..e2997603490c 100644 --- a/db/blob/db_blob_index_test.cc +++ b/db/blob/db_blob_index_test.cc @@ -96,9 +96,13 @@ class DBBlobIndexTest : public DBTestBase { } ArenaWrappedDBIter* GetBlobIterator() { - return dbfull()->NewIteratorImpl( - ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(), - nullptr /*read_callback*/, true /*expose_blob_index*/); + ColumnFamilyData* column_family = cfd(); + DBImpl* db_impl = dbfull(); + return db_impl->NewIteratorImpl( + ReadOptions(), column_family, + column_family->GetReferencedSuperVersion(db_impl), + db_impl->GetLatestSequenceNumber(), nullptr /*read_callback*/, + true /*expose_blob_index*/); } Options GetTestOptions() { @@ -131,9 +135,7 @@ class DBBlobIndexTest : public DBTestBase { ASSERT_OK(Flush()); ASSERT_OK( dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE break; } } @@ -459,7 +461,6 @@ TEST_F(DBBlobIndexTest, Iterate) { verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), create_blob_iterator, check_is_blob(false)); -#ifndef ROCKSDB_LITE // Iterator with blob support and using seek. ASSERT_OK(dbfull()->SetOptions( cfh(), {{"max_sequential_skip_in_iterations", "0"}})); @@ -484,7 +485,6 @@ TEST_F(DBBlobIndexTest, Iterate) { create_blob_iterator, check_is_blob(false)); verify(15, Status::kOk, get_value(16, 0), get_value(14, 0), create_blob_iterator, check_is_blob(false)); -#endif // !ROCKSDB_LITE for (auto* snapshot : snapshots) { dbfull()->ReleaseSnapshot(snapshot); @@ -584,12 +584,10 @@ TEST_F(DBBlobIndexTest, IntegratedBlobIterate) { Status expected_status; verify(1, expected_status, expected_value); -#ifndef ROCKSDB_LITE // Test DBIter::FindValueForCurrentKeyUsingSeek flow. 
ASSERT_OK(dbfull()->SetOptions(cfh(), {{"max_sequential_skip_in_iterations", "0"}})); verify(1, expected_status, expected_value); -#endif // !ROCKSDB_LITE } } // namespace ROCKSDB_NAMESPACE diff --git a/db/builder.cc b/db/builder.cc index a84bd5a45f46..d3040ee9e233 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -15,6 +15,7 @@ #include "db/blob/blob_file_builder.h" #include "db/compaction/compaction_iterator.h" +#include "db/dbformat.h" #include "db/event_helpers.h" #include "db/internal_stats.h" #include "db/merge_helper.h" @@ -56,8 +57,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, TableCache* table_cache, - InternalIterator* iter, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -107,11 +108,9 @@ Status BuildTable( std::vector blob_file_paths; std::string file_checksum = kUnknownFileChecksum; std::string file_checksum_func_name = kUnknownFileChecksumFuncName; -#ifndef ROCKSDB_LITE EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname, tboptions.column_family_name, fname, job_id, tboptions.reason); -#endif // !ROCKSDB_LITE Env* env = db_options.env; assert(env); FileSystem* fs = db_options.fs.get(); @@ -204,24 +203,42 @@ Status BuildTable( blob_file_builder.get(), ioptions.allow_data_in_errors, ioptions.enforce_single_del_contracts, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + true /* must_count_input_entries */, /*compaction=*/nullptr, compaction_filter.get(), /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); + const size_t ts_sz = ucmp->timestamp_size(); + const bool strip_timestamp = + ts_sz > 0 && !ioptions.persist_user_defined_timestamps; + + std::string key_after_flush_buf; c_iter.SeekToFirst(); for (; c_iter.Valid(); c_iter.Next()) { const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); const ParsedInternalKey& ikey = c_iter.ikey(); - // Generate a rolling 64-bit hash of the key and values - // Note : - // Here "key" integrates 'sequence_number'+'kType'+'user key'. - s = output_validator.Add(key, value); + Slice key_after_flush = key; + // If user defined timestamps will be stripped from user key after flush, + // the in memory version of the key act logically the same as one with a + // minimum timestamp. We update the timestamp here so file boundary and + // output validator, block builder all see the effect of the stripping. + if (strip_timestamp) { + key_after_flush_buf.clear(); + ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz); + key_after_flush = key_after_flush_buf; + } + + // Generate a rolling 64-bit hash of the key and values + // Note : + // Here "key" integrates 'sequence_number'+'kType'+'user key'. + s = output_validator.Add(key_after_flush, value); if (!s.ok()) { break; } - builder->Add(key, value); + builder->Add(key_after_flush, value); - s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); + s = meta->UpdateBoundaries(key_after_flush, value, ikey.sequence, + ikey.type); if (!s.ok()) { break; } @@ -246,6 +263,7 @@ Status BuildTable( range_del_it->Next()) { auto tombstone = range_del_it->Tombstone(); auto kv = tombstone.Serialize(); + // TODO(yuzhangyu): handle range deletion for UDT in memtables only. 
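// Illustrative sketch only -- not part of this patch. The flush-path change
// above rewrites each key before it reaches the output validator, the file
// boundaries and the table builder, so that when user-defined timestamps are
// not persisted the key behaves as if it carried the minimum timestamp.
// Assuming the usual internal key layout
//   | user key | timestamp (ts_sz bytes) | 8-byte packed (seqno, type) |
// and a minimum timestamp of ts_sz zero bytes, the rewrite amounts to the
// following (ReplaceInternalKeyWithMinTimestamp is the real helper used by
// the patch; this restatement is only for illustration):
#include <cassert>
#include <string>
#include "rocksdb/slice.h"

void RewriteWithMinTimestamp(std::string* buf,
                             const ROCKSDB_NAMESPACE::Slice& internal_key,
                             size_t ts_sz) {
  const size_t kFooterSize = 8;  // packed sequence number + value type
  assert(internal_key.size() >= ts_sz + kFooterSize);
  const size_t user_key_size = internal_key.size() - ts_sz - kFooterSize;
  buf->assign(internal_key.data(), user_key_size);  // user key without ts
  buf->append(ts_sz, '\0');                         // minimum timestamp
  buf->append(internal_key.data() + internal_key.size() - kFooterSize,
              kFooterSize);                         // seqno/type footer
}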
builder->Add(kv.first.Encode(), kv.second); InternalKey tombstone_end = tombstone.SerializeEndKey(); meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_, @@ -257,8 +275,8 @@ Status BuildTable( SizeApproximationOptions approx_opts; approx_opts.files_size_error_margin = 0.1; meta->compensated_range_deletion_size += versions->ApproximateSize( - approx_opts, version, kv.first.Encode(), tombstone_end.Encode(), - 0 /* start_level */, -1 /* end_level */, + approx_opts, read_options, version, kv.first.Encode(), + tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */, TableReaderCaller::kFlush); } last_tombstone_start_user_key = range_del_it->start_key(); @@ -269,18 +287,19 @@ Status BuildTable( TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); const bool empty = builder->IsEmpty(); if (num_input_entries != nullptr) { + assert(c_iter.HasNumInputEntryScanned()); *num_input_entries = - c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; + c_iter.NumInputEntryScanned() + num_unfragmented_tombstones; } if (!s.ok() || empty) { builder->Abandon(); } else { - std::string seqno_time_mapping_str; + std::string seqno_to_time_mapping_str; seqno_to_time_mapping.Encode( - seqno_time_mapping_str, meta->fd.smallest_seqno, + seqno_to_time_mapping_str, meta->fd.smallest_seqno, meta->fd.largest_seqno, meta->file_creation_time); builder->SetSeqnoTimeTableProperties( - seqno_time_mapping_str, + seqno_to_time_mapping_str, ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO ? meta->file_creation_time : meta->oldest_ancester_time); @@ -293,7 +312,10 @@ Status BuildTable( if (s.ok() && !empty) { uint64_t file_size = builder->FileSize(); meta->fd.file_size = file_size; + meta->tail_size = builder->GetTailSize(); meta->marked_for_compaction = builder->NeedCompact(); + meta->user_defined_timestamps_persisted = + ioptions.persist_user_defined_timestamps; assert(meta->fd.GetFileSize() > 0); tp = builder ->GetTableProperties(); // refresh now that builder is finished @@ -353,6 +375,8 @@ Status BuildTable( s = *io_status; } + // TODO(yuzhangyu): handle the key copy in the blob when ts should be + // stripped. if (blob_file_builder) { if (s.ok()) { s = blob_file_builder->Finish(); @@ -371,7 +395,6 @@ Status BuildTable( // here because this is a special case after we finish the table building. // No matter whether use_direct_io_for_flush_and_compaction is true, // the goal is to cache it here for further user reads. 
- ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( read_options, file_options, tboptions.internal_comparator, *meta, nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, @@ -383,7 +406,8 @@ Status BuildTable( MaxFileSizeForL0MetaPin(mutable_cf_options), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key*/ nullptr, - /*allow_unprepared_value*/ false)); + /*allow_unprepared_value*/ false, + mutable_cf_options.block_protection_bytes_per_key)); s = it->status(); if (s.ok() && paranoid_file_checks) { OutputValidator file_validator(tboptions.internal_comparator, diff --git a/db/builder.h b/db/builder.h index 063da5ca9eda..6a6a1866a133 100644 --- a/db/builder.h +++ b/db/builder.h @@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, extern Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, TableCache* table_cache, - InternalIterator* iter, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, diff --git a/db/c.cc b/db/c.cc index 9615791a83c7..5555ae198752 100644 --- a/db/c.cc +++ b/db/c.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE - #include "rocksdb/c.h" #include @@ -17,7 +15,7 @@ #include #include "port/port.h" -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" #include "rocksdb/convenience.h" @@ -47,6 +45,7 @@ #include "rocksdb/utilities/transaction_db.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "rocksdb/write_batch.h" +#include "rocksdb/write_buffer_manager.h" #include "utilities/merge_operators.h" using ROCKSDB_NAMESPACE::BackupEngine; @@ -69,6 +68,7 @@ using ROCKSDB_NAMESPACE::CompactionOptionsFIFO; using ROCKSDB_NAMESPACE::CompactRangeOptions; using ROCKSDB_NAMESPACE::Comparator; using ROCKSDB_NAMESPACE::CompressionType; +using ROCKSDB_NAMESPACE::ConfigOptions; using ROCKSDB_NAMESPACE::CuckooTableOptions; using ROCKSDB_NAMESPACE::DB; using ROCKSDB_NAMESPACE::DBOptions; @@ -78,6 +78,8 @@ using ROCKSDB_NAMESPACE::EnvOptions; using ROCKSDB_NAMESPACE::FileLock; using ROCKSDB_NAMESPACE::FilterPolicy; using ROCKSDB_NAMESPACE::FlushOptions; +using ROCKSDB_NAMESPACE::HistogramData; +using ROCKSDB_NAMESPACE::HyperClockCacheOptions; using ROCKSDB_NAMESPACE::InfoLogLevel; using ROCKSDB_NAMESPACE::IngestExternalFileOptions; using ROCKSDB_NAMESPACE::Iterator; @@ -119,10 +121,12 @@ using ROCKSDB_NAMESPACE::TransactionDB; using ROCKSDB_NAMESPACE::TransactionDBOptions; using ROCKSDB_NAMESPACE::TransactionLogIterator; using ROCKSDB_NAMESPACE::TransactionOptions; +using ROCKSDB_NAMESPACE::WaitForCompactOptions; using ROCKSDB_NAMESPACE::WALRecoveryMode; using ROCKSDB_NAMESPACE::WritableFile; using ROCKSDB_NAMESPACE::WriteBatch; using ROCKSDB_NAMESPACE::WriteBatchWithIndex; +using ROCKSDB_NAMESPACE::WriteBufferManager; using ROCKSDB_NAMESPACE::WriteOptions; using std::unordered_set; @@ -208,12 +212,18 @@ struct rocksdb_logger_t { struct rocksdb_lru_cache_options_t { LRUCacheOptions rep; }; +struct rocksdb_hyper_clock_cache_options_t { + HyperClockCacheOptions rep; +}; struct rocksdb_memory_allocator_t { 
std::shared_ptr rep; }; struct rocksdb_cache_t { std::shared_ptr rep; }; +struct rocksdb_write_buffer_manager_t { + std::shared_ptr rep; +}; struct rocksdb_livefiles_t { std::vector rep; }; @@ -271,11 +281,19 @@ struct rocksdb_optimistictransactiondb_t { struct rocksdb_optimistictransaction_options_t { OptimisticTransactionOptions rep; }; +struct rocksdb_wait_for_compact_options_t { + WaitForCompactOptions rep; +}; struct rocksdb_compactionfiltercontext_t { CompactionFilter::Context rep; }; +struct rocksdb_statistics_histogram_data_t { + rocksdb_statistics_histogram_data_t() : rep() {} + HistogramData rep; +}; + struct rocksdb_compactionfilter_t : public CompactionFilter { void* state_; void (*destructor_)(void*); @@ -1054,6 +1072,36 @@ rocksdb_column_family_handle_t* rocksdb_create_column_family( return handle; } +rocksdb_column_family_handle_t** rocksdb_create_column_families( + rocksdb_t* db, const rocksdb_options_t* column_family_options, + int num_column_families, const char* const* column_family_names, + size_t* lencfs, char** errptr) { + std::vector handles; + std::vector names; + for (int i = 0; i != num_column_families; ++i) { + names.push_back(std::string(column_family_names[i])); + } + SaveError(errptr, db->rep->CreateColumnFamilies( + ColumnFamilyOptions(column_family_options->rep), names, + &handles)); + + *lencfs = handles.size(); + rocksdb_column_family_handle_t** c_handles = + static_cast( + malloc(sizeof(rocksdb_column_family_handle_t*) * handles.size())); + for (size_t i = 0; i != handles.size(); ++i) { + c_handles[i] = new rocksdb_column_family_handle_t; + c_handles[i]->rep = handles[i]; + } + + return c_handles; +} + +void rocksdb_create_column_families_destroy( + rocksdb_column_family_handle_t** list) { + free(list); +} + rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( rocksdb_t* db, const rocksdb_options_t* column_family_options, const char* column_family_name, int ttl, char** errptr) { @@ -1805,6 +1853,17 @@ void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options, SaveError(errptr, db->rep->Flush(options->rep, column_family->rep)); } +void rocksdb_flush_cfs(rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_families, + int num_column_families, char** errptr) { + std::vector column_family_handles; + for (int i = 0; i < num_column_families; i++) { + column_family_handles.push_back(column_families[i]->rep); + } + + SaveError(errptr, db->rep->Flush(options->rep, column_family_handles)); +} + void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) { SaveError(errptr, db->rep->FlushWAL(sync)); } @@ -2498,8 +2557,12 @@ void rocksdb_load_latest_options( rocksdb_options_t*** list_column_family_options, char** errptr) { DBOptions db_opt; std::vector cf_descs; - Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt, - &cf_descs, ignore_unknown_options, &cache->rep); + ConfigOptions config_opts; + config_opts.ignore_unknown_options = ignore_unknown_options; + config_opts.input_strings_escaped = true; + config_opts.env = env->rep; + Status s = LoadLatestOptions(config_opts, std::string(db_path), &db_opt, + &cf_descs, &cache->rep); if (s.ok()) { char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*)); rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc( @@ -2620,14 +2683,6 @@ void rocksdb_block_based_options_set_block_cache( } } -void rocksdb_block_based_options_set_block_cache_compressed( - rocksdb_block_based_table_options_t* options, - 
rocksdb_cache_t* block_cache_compressed) { - if (block_cache_compressed) { - options->rep.block_cache_compressed = block_cache_compressed->rep; - } -} - void rocksdb_block_based_options_set_whole_key_filtering( rocksdb_block_based_table_options_t* options, unsigned char v) { options->rep.whole_key_filtering = v; @@ -2854,6 +2909,16 @@ void rocksdb_options_set_db_paths(rocksdb_options_t* opt, opt->rep.db_paths = db_paths; } +void rocksdb_options_set_cf_paths(rocksdb_options_t* opt, + const rocksdb_dbpath_t** dbpath_values, + size_t num_paths) { + std::vector cf_paths(num_paths); + for (size_t i = 0; i < num_paths; ++i) { + cf_paths[i] = dbpath_values[i]->rep; + } + opt->rep.cf_paths = cf_paths; +} + void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) { opt->rep.env = (env ? env->rep : nullptr); } @@ -2885,6 +2950,11 @@ void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) { opt->rep.write_buffer_size = s; } +void rocksdb_options_set_write_buffer_manager( + rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm) { + opt->rep.write_buffer_manager = wbm->rep; +} + size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) { return opt->rep.write_buffer_size; } @@ -2979,10 +3049,43 @@ void rocksdb_options_set_max_bytes_for_level_multiplier_additional( } } +void rocksdb_options_set_periodic_compaction_seconds(rocksdb_options_t* opt, + uint64_t seconds) { + opt->rep.periodic_compaction_seconds = seconds; +} + +uint64_t rocksdb_options_get_periodic_compaction_seconds( + rocksdb_options_t* opt) { + return opt->rep.periodic_compaction_seconds; +} + void rocksdb_options_enable_statistics(rocksdb_options_t* opt) { opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); } +void rocksdb_options_set_statistics_level(rocksdb_options_t* opt, int level) { + if (!opt->rep.statistics) { + return; + } + + if (level < rocksdb_statistics_level_disable_all) { + level = rocksdb_statistics_level_disable_all; + } + if (level > rocksdb_statistics_level_all) { + level = rocksdb_statistics_level_all; + } + opt->rep.statistics->set_stats_level( + static_cast(level)); +} + +int rocksdb_options_get_statistics_level(rocksdb_options_t* opt) { + if (!opt->rep.statistics) { + return ROCKSDB_NAMESPACE::StatsLevel::kDisableAll; + } + + return static_cast(opt->rep.statistics->get_stats_level()); +} + void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val) { opt->rep.skip_stats_update_on_db_open = val; @@ -3730,16 +3833,21 @@ void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt, ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count)); } -void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt, - uint32_t user_key_len, - int bloom_bits_per_key, - double hash_table_ratio, - size_t index_sparseness) { +void rocksdb_options_set_plain_table_factory( + rocksdb_options_t* opt, uint32_t user_key_len, int bloom_bits_per_key, + double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, + char encoding_type, unsigned char full_scan_mode, + unsigned char store_index_in_file) { ROCKSDB_NAMESPACE::PlainTableOptions options; options.user_key_len = user_key_len; options.bloom_bits_per_key = bloom_bits_per_key; options.hash_table_ratio = hash_table_ratio; options.index_sparseness = index_sparseness; + options.huge_page_tlb_size = huge_page_tlb_size; + options.encoding_type = + static_cast(encoding_type); + options.full_scan_mode = full_scan_mode; + options.store_index_in_file = 
store_index_in_file; ROCKSDB_NAMESPACE::TableFactory* factory = ROCKSDB_NAMESPACE::NewPlainTableFactory(options); @@ -3817,6 +3925,26 @@ char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) { return nullptr; } +uint64_t rocksdb_options_statistics_get_ticker_count(rocksdb_options_t* opt, + uint32_t ticker_type) { + ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get(); + if (statistics) { + return statistics->getTickerCount(ticker_type); + } + return 0; +} + +void rocksdb_options_statistics_get_histogram_data( + rocksdb_options_t* opt, uint32_t type, + rocksdb_statistics_histogram_data_t* const data) { + ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get(); + if (statistics) { + statistics->histogramData(type, &data->rep); + } else { + *data = rocksdb_statistics_histogram_data_t{}; + } +} + void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt, rocksdb_ratelimiter_t* limiter) { if (limiter) { @@ -3859,6 +3987,16 @@ rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(int64_t rate_bytes_per_sec, return rate_limiter; } +rocksdb_ratelimiter_t* rocksdb_ratelimiter_create_auto_tuned( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness) { + rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; + rate_limiter->rep.reset(NewGenericRateLimiter(rate_bytes_per_sec, + refill_period_us, fairness, + RateLimiter::Mode::kWritesOnly, + true)); // auto_tuned + return rate_limiter; +} + void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) { delete limiter; } @@ -3878,6 +4016,15 @@ void rocksdb_options_add_compact_on_deletion_collector_factory( opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); } +void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( + rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger, + double deletion_ratio) { + std::shared_ptr + compact_on_del = NewCompactOnDeletionCollectorFactory( + window_size, num_dels_trigger, deletion_ratio); + opt->rep.table_properties_collector_factories.emplace_back(compact_on_del); +} + void rocksdb_set_perf_level(int v) { PerfLevel level = static_cast(v); SetPerfLevel(level); @@ -4054,6 +4201,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context, return rep->blob_decompress_time; case rocksdb_internal_range_del_reseek_count: return rep->internal_range_del_reseek_count; + case rocksdb_block_read_cpu_time: + return rep->block_read_cpu_time; default: break; } @@ -4480,6 +4629,11 @@ void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt, } } +void rocksdb_readoptions_set_auto_readahead_size(rocksdb_readoptions_t* opt, + unsigned char v) { + opt->rep.auto_readahead_size = v; +} + rocksdb_writeoptions_t* rocksdb_writeoptions_create() { return new rocksdb_writeoptions_t; } @@ -4669,12 +4823,59 @@ rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit( } rocksdb_cache_t* rocksdb_cache_create_lru_opts( - rocksdb_lru_cache_options_t* opt) { + const rocksdb_lru_cache_options_t* opt) { rocksdb_cache_t* c = new rocksdb_cache_t; c->rep = NewLRUCache(opt->rep); return c; } +rocksdb_hyper_clock_cache_options_t* rocksdb_hyper_clock_cache_options_create( + size_t capacity, size_t estimated_entry_charge) { + return new rocksdb_hyper_clock_cache_options_t{ + HyperClockCacheOptions(capacity, estimated_entry_charge)}; +} + +void rocksdb_hyper_clock_cache_options_destroy( + rocksdb_hyper_clock_cache_options_t* opt) { + delete opt; +} + +void 
rocksdb_hyper_clock_cache_options_set_capacity( + rocksdb_hyper_clock_cache_options_t* opts, size_t capacity) { + opts->rep.capacity = capacity; +} + +void rocksdb_hyper_clock_cache_options_set_estimated_entry_charge( + rocksdb_hyper_clock_cache_options_t* opts, size_t estimated_entry_charge) { + opts->rep.estimated_entry_charge = estimated_entry_charge; +} + +void rocksdb_hyper_clock_cache_options_set_num_shard_bits( + rocksdb_hyper_clock_cache_options_t* opts, int num_shard_bits) { + opts->rep.num_shard_bits = num_shard_bits; +} + +void rocksdb_hyper_clock_cache_options_set_memory_allocator( + rocksdb_hyper_clock_cache_options_t* opts, + rocksdb_memory_allocator_t* memory_allocator) { + opts->rep.memory_allocator = memory_allocator->rep; +} + +rocksdb_cache_t* rocksdb_cache_create_hyper_clock( + size_t capacity, size_t estimated_entry_charge) { + HyperClockCacheOptions opts(capacity, estimated_entry_charge); + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = opts.MakeSharedCache(); + return c; +} + +rocksdb_cache_t* rocksdb_cache_create_hyper_clock_opts( + const rocksdb_hyper_clock_cache_options_t* opts) { + rocksdb_cache_t* c = new rocksdb_cache_t; + c->rep = opts->rep.MakeSharedCache(); + return c; +} + void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; } void rocksdb_cache_disown_data(rocksdb_cache_t* cache) { @@ -4685,18 +4886,80 @@ void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) { cache->rep->SetCapacity(capacity); } -size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) { +size_t rocksdb_cache_get_capacity(const rocksdb_cache_t* cache) { return cache->rep->GetCapacity(); } -size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) { +size_t rocksdb_cache_get_usage(const rocksdb_cache_t* cache) { return cache->rep->GetUsage(); } -size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) { +size_t rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache) { return cache->rep->GetPinnedUsage(); } +size_t rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache) { + return cache->rep->GetTableAddressCount(); +} + +size_t rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache) { + return cache->rep->GetOccupancyCount(); +} + +rocksdb_write_buffer_manager_t* rocksdb_write_buffer_manager_create( + size_t buffer_size, bool allow_stall) { + rocksdb_write_buffer_manager_t* wbm = new rocksdb_write_buffer_manager_t; + wbm->rep.reset(new WriteBufferManager(buffer_size, {}, allow_stall)); + return wbm; +} + +rocksdb_write_buffer_manager_t* rocksdb_write_buffer_manager_create_with_cache( + size_t buffer_size, const rocksdb_cache_t* cache, bool allow_stall) { + rocksdb_write_buffer_manager_t* wbm = new rocksdb_write_buffer_manager_t; + wbm->rep.reset(new WriteBufferManager(buffer_size, cache->rep, allow_stall)); + return wbm; +} + +void rocksdb_write_buffer_manager_destroy(rocksdb_write_buffer_manager_t* wbm) { + delete wbm; +} + +bool rocksdb_write_buffer_manager_enabled(rocksdb_write_buffer_manager_t* wbm) { + return wbm->rep->enabled(); +} + +bool rocksdb_write_buffer_manager_cost_to_cache( + rocksdb_write_buffer_manager_t* wbm) { + return wbm->rep->cost_to_cache(); +} + +size_t rocksdb_write_buffer_manager_memory_usage( + rocksdb_write_buffer_manager_t* wbm) { + return wbm->rep->memory_usage(); +} + +size_t rocksdb_write_buffer_manager_mutable_memtable_memory_usage( + rocksdb_write_buffer_manager_t* wbm) { + return wbm->rep->mutable_memtable_memory_usage(); +} + +size_t 
rocksdb_write_buffer_manager_dummy_entries_in_cache_usage( + rocksdb_write_buffer_manager_t* wbm) { + return wbm->rep->dummy_entries_in_cache_usage(); +} +size_t rocksdb_write_buffer_manager_buffer_size( + rocksdb_write_buffer_manager_t* wbm) { + return wbm->rep->buffer_size(); +} +void rocksdb_write_buffer_manager_set_buffer_size( + rocksdb_write_buffer_manager_t* wbm, size_t new_size) { + wbm->rep->SetBufferSize(new_size); +} +ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( + rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall) { + wbm->rep->SetAllowStall(new_allow_stall); +} + rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path, uint64_t target_size) { rocksdb_dbpath_t* result = new rocksdb_dbpath_t; @@ -4915,6 +5178,12 @@ void rocksdb_ingestexternalfileoptions_set_ingest_behind( opt->rep.ingest_behind = ingest_behind; } +void rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char fail_if_not_bottommost_level) { + opt->rep.fail_if_not_bottommost_level = fail_if_not_bottommost_level; +} + void rocksdb_ingestexternalfileoptions_destroy( rocksdb_ingestexternalfileoptions_t* opt) { delete opt; @@ -5076,6 +5345,17 @@ rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() { return result; } +void rocksdb_fifo_compaction_options_set_allow_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts, + unsigned char allow_compaction) { + fifo_opts->rep.allow_compaction = allow_compaction; +} + +unsigned char rocksdb_fifo_compaction_options_get_allow_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts) { + return fifo_opts->rep.allow_compaction; +} + void rocksdb_fifo_compaction_options_set_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) { fifo_opts->rep.max_table_files_size = size; @@ -5278,6 +5558,11 @@ char* rocksdb_sst_file_metadata_get_relative_filename( return strdup(file_meta->rep->relative_filename.c_str()); } +char* rocksdb_sst_file_metadata_get_directory( + rocksdb_sst_file_metadata_t* file_meta) { + return strdup(file_meta->rep->directory.c_str()); +} + uint64_t rocksdb_sst_file_metadata_get_size( rocksdb_sst_file_metadata_t* file_meta) { return file_meta->rep->size; @@ -5496,6 +5781,20 @@ int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db, } } +rocksdb_t* rocksdb_transactiondb_get_base_db(rocksdb_transactiondb_t* txn_db) { + DB* base_db = txn_db->rep->GetBaseDB(); + + if (base_db != nullptr) { + rocksdb_t* result = new rocksdb_t; + result->rep = base_db; + return result; + } + + return nullptr; +} + +void rocksdb_transactiondb_close_base_db(rocksdb_t* base_db) { delete base_db; } + rocksdb_transaction_t* rocksdb_transaction_begin( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* write_options, @@ -5780,6 +6079,35 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn, } } +void rocksdb_transaction_multi_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + } + std::vector values(num_keys); + std::vector statuses = + txn->rep->MultiGetForUpdate(options->rep, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + 
values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + void rocksdb_transaction_multi_get_cf( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, const rocksdb_column_family_handle_t* const* column_families, @@ -5812,6 +6140,38 @@ void rocksdb_transaction_multi_get_cf( } } +void rocksdb_transaction_multi_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs) { + std::vector keys(num_keys); + std::vector cfs(num_keys); + for (size_t i = 0; i < num_keys; i++) { + keys[i] = Slice(keys_list[i], keys_list_sizes[i]); + cfs[i] = column_families[i]->rep; + } + std::vector values(num_keys); + std::vector statuses = + txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values); + for (size_t i = 0; i < num_keys; i++) { + if (statuses[i].ok()) { + values_list[i] = CopyString(values[i]); + values_list_sizes[i] = values[i].size(); + errs[i] = nullptr; + } else { + values_list[i] = nullptr; + values_list_sizes[i] = 0; + if (!statuses[i].IsNotFound()) { + errs[i] = strdup(statuses[i].ToString().c_str()); + } else { + errs[i] = nullptr; + } + } + } +} + // Read a key outside a transaction char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, @@ -6113,6 +6473,18 @@ void rocksdb_transactiondb_flush_cf( SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep)); } +void rocksdb_transactiondb_flush_cfs( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_families, int num_column_families, + char** errptr) { + std::vector column_family_handles; + for (int i = 0; i < num_column_families; i++) { + column_family_handles.push_back(column_families[i]->rep); + } + + SaveError(errptr, txn_db->rep->Flush(options->rep, column_family_handles)); +} + rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create( rocksdb_transactiondb_t* txn_db, char** errptr) { Checkpoint* checkpoint; @@ -6400,6 +6772,114 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) { db->rep->EnableManualCompaction(); } -} // end extern "C" +rocksdb_statistics_histogram_data_t* +rocksdb_statistics_histogram_data_create() { + return new rocksdb_statistics_histogram_data_t{}; +} + +void rocksdb_statistics_histogram_data_destroy( + rocksdb_statistics_histogram_data_t* data) { + delete data; +} + +double rocksdb_statistics_histogram_data_get_median( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.median; +} + +double rocksdb_statistics_histogram_data_get_p95( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.percentile95; +} + +double rocksdb_statistics_histogram_data_get_p99( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.percentile99; +} + +double rocksdb_statistics_histogram_data_get_average( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.average; +} + +double rocksdb_statistics_histogram_data_get_std_dev( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.standard_deviation; +} + +double rocksdb_statistics_histogram_data_get_max( + rocksdb_statistics_histogram_data_t* 
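Editor's note: a caller-side sketch for the new MultiGetForUpdate binding (illustrative only; it mirrors the usage exercised later in c_test.c). Values and per-key error strings come back through caller-provided arrays and are released with free():

    const char* keys[2] = {"foo", "bar"};
    const size_t key_sizes[2] = {3, 3};
    char* vals[2];
    size_t val_sizes[2];
    char* errs[2];
    rocksdb_transaction_multi_get_for_update(txn, roptions, 2, keys, key_sizes,
                                             vals, val_sizes, errs);
    for (size_t i = 0; i < 2; i++) {
      if (errs[i] != NULL) {
        /* per-key failure other than NotFound */
      } else if (vals[i] != NULL) {
        /* use vals[i], whose length is val_sizes[i] */
      }
      free(vals[i]); /* free(NULL) is a no-op */
      free(errs[i]);
    }
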
data) { + return data->rep.max; +} -#endif // !ROCKSDB_LITE +uint64_t rocksdb_statistics_histogram_data_get_count( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.count; +} + +uint64_t rocksdb_statistics_histogram_data_get_sum( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.sum; +} + +double rocksdb_statistics_histogram_data_get_min( + rocksdb_statistics_histogram_data_t* data) { + return data->rep.min; +} + +void rocksdb_wait_for_compact(rocksdb_t* db, + rocksdb_wait_for_compact_options_t* options, + char** errptr) { + SaveError(errptr, db->rep->WaitForCompact(options->rep)); +} + +rocksdb_wait_for_compact_options_t* rocksdb_wait_for_compact_options_create() { + return new rocksdb_wait_for_compact_options_t; +} + +void rocksdb_wait_for_compact_options_destroy( + rocksdb_wait_for_compact_options_t* opt) { + delete opt; +} + +void rocksdb_wait_for_compact_options_set_abort_on_pause( + rocksdb_wait_for_compact_options_t* opt, unsigned char v) { + opt->rep.abort_on_pause = v; +} + +unsigned char rocksdb_wait_for_compact_options_get_abort_on_pause( + rocksdb_wait_for_compact_options_t* opt) { + return opt->rep.abort_on_pause; +} + +void rocksdb_wait_for_compact_options_set_flush( + rocksdb_wait_for_compact_options_t* opt, unsigned char v) { + opt->rep.flush = v; +} + +unsigned char rocksdb_wait_for_compact_options_get_flush( + rocksdb_wait_for_compact_options_t* opt) { + return opt->rep.flush; +} + +void rocksdb_wait_for_compact_options_set_close_db( + rocksdb_wait_for_compact_options_t* opt, unsigned char v) { + opt->rep.close_db = v; +} + +unsigned char rocksdb_wait_for_compact_options_get_close_db( + rocksdb_wait_for_compact_options_t* opt) { + return opt->rep.close_db; +} + +void rocksdb_wait_for_compact_options_set_timeout( + rocksdb_wait_for_compact_options_t* opt, uint64_t microseconds) { + opt->rep.timeout = std::chrono::microseconds(microseconds); +} + +uint64_t rocksdb_wait_for_compact_options_get_timeout( + rocksdb_wait_for_compact_options_t* opt) { + return opt->rep.timeout.count(); +} + +} // end extern "C" diff --git a/db/c_test.c b/db/c_test.c index b6877d46a99a..667220496920 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -3,17 +3,14 @@ found in the LICENSE file. See the AUTHORS file for names of contributors. */ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
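Editor's note: a minimal sketch of driving the new WaitForCompact binding from client code (the timeout is in microseconds, matching the setter above); assumes db is an open rocksdb_t*:

    char* err = NULL;
    rocksdb_wait_for_compact_options_t* wco =
        rocksdb_wait_for_compact_options_create();
    rocksdb_wait_for_compact_options_set_flush(wco, 1);                  /* flush first */
    rocksdb_wait_for_compact_options_set_timeout(wco, 60 * 1000 * 1000); /* 60 s */
    rocksdb_wait_for_compact(db, wco, &err);
    if (err != NULL) {
      /* timed out, aborted on pause, or another error */
      free(err);
    }
    rocksdb_wait_for_compact_options_destroy(wco);
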
-#include - -#ifndef ROCKSDB_LITE // Lite does not support C API +#include "rocksdb/c.h" #include #include +#include #include #include #include - -#include "rocksdb/c.h" #ifndef OS_WIN #include #endif @@ -378,6 +375,11 @@ static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options, // Force compaction rocksdb_compact_range(db, NULL, 0, NULL, 0); + rocksdb_wait_for_compact_options_t* wco; + wco = rocksdb_wait_for_compact_options_create(); + rocksdb_wait_for_compact(db, wco, &err); + CheckNoError(err); + rocksdb_wait_for_compact_options_destroy(wco); // should have filtered bar, but not foo CheckGet(db, roptions, "foo", "foovalue"); CheckGet(db, roptions, "bar", NULL); @@ -490,6 +492,19 @@ static void CheckTxnPinGetCF(rocksdb_transaction_t* txn, rocksdb_pinnableslice_destroy(p); } +static void CheckTxnGetForUpdate(rocksdb_transaction_t* txn, + const rocksdb_readoptions_t* options, + const char* key, const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get_for_update(txn, options, key, strlen(key), + &val_len, true, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, const char* key, const char* expected) { @@ -517,6 +532,20 @@ static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db, Free(&val); } +static void CheckTxnGetForUpdateCF( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + rocksdb_column_family_handle_t* column_family, const char* key, + const char* expected) { + char* err = NULL; + size_t val_len; + char* val; + val = rocksdb_transaction_get_for_update_cf( + txn, options, column_family, key, strlen(key), &val_len, true, &err); + CheckNoError(err); + CheckEqual(expected, val, val_len); + Free(&val); +} + static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, const char* key, const char* expected) { @@ -684,6 +713,11 @@ int main(int argc, char** argv) { rocksdb_options_set_ratelimiter(options, rate_limiter); rocksdb_ratelimiter_destroy(rate_limiter); + rate_limiter = + rocksdb_ratelimiter_create_auto_tuned(1000 * 1024 * 1024, 100 * 1000, 10); + rocksdb_options_set_ratelimiter(options, rate_limiter); + rocksdb_ratelimiter_destroy(rate_limiter); + roptions = rocksdb_readoptions_create(); rocksdb_readoptions_set_verify_checksums(roptions, 1); rocksdb_readoptions_set_fill_cache(roptions, 1); @@ -696,6 +730,8 @@ int main(int argc, char** argv) { rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000, 10001); + rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( + options, 10000, 10001, 0.0); StartPhase("destroy"); rocksdb_destroy_db(options, dbname, &err); @@ -1443,10 +1479,20 @@ int main(int argc, char** argv) { CheckCondition(cflen == 2); rocksdb_list_column_families_destroy(column_fams, cflen); - rocksdb_options_t* cf_options = rocksdb_options_create(); + rocksdb_options_t* cf_options_1 = rocksdb_options_create(); + rocksdb_options_t* cf_options_2 = rocksdb_options_create(); + + // use dbpathname2 as the cf_path for "cf1" + rocksdb_dbpath_t* dbpath2; + char dbpathname2[200]; + snprintf(dbpathname2, sizeof(dbpathname2), "%s/rocksdb_c_test-%d-dbpath2", + GetTempDir(), ((int)geteuid())); + dbpath2 = rocksdb_dbpath_create(dbpathname2, 1024 * 1024); + const rocksdb_dbpath_t* cf_paths[1] = {dbpath2}; + rocksdb_options_set_cf_paths(cf_options_2, cf_paths, 1); const char* cf_names[2] = {"default", 
"cf1"}; - const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options}; + const rocksdb_options_t* cf_opts[2] = {cf_options_1, cf_options_2}; rocksdb_column_family_handle_t* handles[2]; LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2, cf_names, @@ -1474,6 +1520,37 @@ int main(int argc, char** argv) { rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); rocksdb_flushoptions_set_wait(flush_options, 1); rocksdb_flush_cf(db, flush_options, handles[1], &err); + + // make sure all files in "cf1" are under the specified cf path + { + rocksdb_column_family_metadata_t* cf_meta = + rocksdb_get_column_family_metadata_cf(db, handles[1]); + size_t cf_file_count = rocksdb_column_family_metadata_get_size(cf_meta); + assert(cf_file_count > 0); + size_t level_count = + rocksdb_column_family_metadata_get_level_count(cf_meta); + assert(level_count > 0); + for (size_t l = 0; l < level_count; ++l) { + rocksdb_level_metadata_t* level_meta = + rocksdb_column_family_metadata_get_level_metadata(cf_meta, l); + assert(level_meta); + + size_t file_count = rocksdb_level_metadata_get_file_count(level_meta); + for (size_t f = 0; f < file_count; ++f) { + rocksdb_sst_file_metadata_t* file_meta = + rocksdb_level_metadata_get_sst_file_metadata(level_meta, f); + assert(file_meta); + char* file_path = rocksdb_sst_file_metadata_get_directory(file_meta); + assert(strcmp(file_path, dbpathname2) == 0); + Free(&file_path); + rocksdb_sst_file_metadata_destroy(file_meta); + } + rocksdb_level_metadata_destroy(level_meta); + } + + rocksdb_column_family_metadata_destroy(cf_meta); + } + CheckNoError(err) rocksdb_flushoptions_destroy(flush_options); CheckGetCF(db, roptions, handles[1], "foo", "hello"); @@ -1637,7 +1714,9 @@ int main(int argc, char** argv) { } rocksdb_destroy_db(options, dbname, &err); rocksdb_options_destroy(db_options); - rocksdb_options_destroy(cf_options); + rocksdb_options_destroy(cf_options_1); + rocksdb_options_destroy(cf_options_2); + rocksdb_dbpath_destroy(dbpath2); } StartPhase("prefix"); @@ -1647,7 +1726,8 @@ int main(int argc, char** argv) { rocksdb_options_set_prefix_extractor( options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); - rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); + rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16, 0, 0, 0, + 0); rocksdb_options_set_allow_concurrent_memtable_write(options, 0); db = rocksdb_open(options, dbname, &err); @@ -1841,6 +1921,10 @@ int main(int argc, char** argv) { CheckCondition(2.0 == rocksdb_options_get_max_bytes_for_level_multiplier(o)); + rocksdb_options_set_periodic_compaction_seconds(o, 100000); + CheckCondition(100000 == + rocksdb_options_get_periodic_compaction_seconds(o)); + rocksdb_options_set_skip_stats_update_on_db_open(o, 1); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); @@ -2033,6 +2117,15 @@ int main(int argc, char** argv) { CheckCondition(29.0 == rocksdb_options_get_experimental_mempurge_threshold(o)); + CheckCondition(rocksdb_statistics_level_disable_all == + rocksdb_options_get_statistics_level(o)); + rocksdb_options_enable_statistics(o); + CheckCondition(rocksdb_statistics_level_disable_all != + rocksdb_options_get_statistics_level(o)); + rocksdb_options_set_statistics_level(o, rocksdb_statistics_level_all); + CheckCondition(rocksdb_statistics_level_all == + rocksdb_options_get_statistics_level(o)); + /* Blob Options */ rocksdb_options_set_enable_blob_files(o, 1); CheckCondition(1 == 
rocksdb_options_get_enable_blob_files(o)); @@ -2262,6 +2355,12 @@ int main(int argc, char** argv) { CheckCondition(2.0 == rocksdb_options_get_max_bytes_for_level_multiplier(o)); + rocksdb_options_set_periodic_compaction_seconds(copy, 8000); + CheckCondition(8000 == + rocksdb_options_get_periodic_compaction_seconds(copy)); + CheckCondition(100000 == + rocksdb_options_get_periodic_compaction_seconds(o)); + rocksdb_options_set_skip_stats_update_on_db_open(copy, 0); CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy)); CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o)); @@ -3094,6 +3193,17 @@ int main(int argc, char** argv) { CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL); CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL); + // memory usage + rocksdb_t* base_db = rocksdb_transactiondb_get_base_db(txn_db); + rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create(); + rocksdb_memory_consumers_add_db(consumers, base_db); + rocksdb_memory_usage_t* usage = + rocksdb_approximate_memory_usage_create(consumers, &err); + CheckNoError(err); + rocksdb_approximate_memory_usage_destroy(usage); + rocksdb_memory_consumers_destroy(consumers); + rocksdb_transactiondb_close_base_db(base_db); + // flush rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create(); rocksdb_flushoptions_set_wait(flush_options, 1); @@ -3206,6 +3316,120 @@ int main(int argc, char** argv) { rocksdb_transactiondb_options_destroy(txn_db_options); } + StartPhase("transactions_multi_get_for_update"); + { + // open a TransactionDB + txn_db_options = rocksdb_transactiondb_options_create(); + rocksdb_transactiondb_options_set_transaction_lock_timeout(txn_db_options, + 0); + txn_options = rocksdb_transaction_options_create(); + rocksdb_options_set_create_if_missing(options, 1); + txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err); + CheckNoError(err); + + rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err); + CheckNoError(err); + rocksdb_transactiondb_put(txn_db, woptions, "bar", 3, "hello", 5, &err); + CheckNoError(err); + + // begin transactions + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + rocksdb_transaction_t* txn2 = + rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + + // multi get + { + const char* keys[2] = {"foo", "bar"}; + const size_t keys_sizes[2] = {3, 3}; + char* vals[2]; + size_t vals_sizes[2]; + char* errs[2]; + const char* expected[2] = {"hey", "hello"}; + rocksdb_transaction_multi_get_for_update( + txn, roptions, 2, keys, keys_sizes, vals, vals_sizes, errs); + CheckMultiGetValues(2, vals, vals_sizes, errs, expected); + } + + char* conflict_err = NULL; + size_t val_len; + rocksdb_transaction_get_for_update(txn2, roptions, "foo", 3, &val_len, true, + &conflict_err); + // get-for-update conflict + CheckCondition(conflict_err != NULL); + Free(&conflict_err); + + // commit + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + + // should work after first tx is commited + CheckTxnGetForUpdate(txn2, roptions, "foo", "hey"); + + // commit the second one + rocksdb_transaction_commit(txn2, &err); + CheckNoError(err); + + // destroy txns + rocksdb_transaction_destroy(txn); + rocksdb_transaction_destroy(txn2); + + // same for column families + + rocksdb_column_family_handle_t* cfh; + cfh = rocksdb_transactiondb_create_column_family(txn_db, options, + "txn_db_cf", &err); + CheckNoError(err); + + rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 
6, "cf_hello", + 8, &err); + CheckNoError(err); + rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_bar", 6, "cf_hey", + 6, &err); + CheckNoError(err); + + txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + txn2 = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL); + + { + const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh}; + const char* keys[2] = {"cf_foo", "cf_bar"}; + const size_t keys_sizes[2] = {6, 6}; + char* vals[2]; + size_t vals_sizes[2]; + char* errs[2]; + const char* expected[2] = {"cf_hello", "cf_hey"}; + rocksdb_transaction_multi_get_for_update_cf(txn, roptions, get_handles, 2, + keys, keys_sizes, vals, + vals_sizes, errs); + CheckMultiGetValues(2, vals, vals_sizes, errs, expected); + } + + char* conflict_err_cf = NULL; + size_t val_len_cf; + rocksdb_transaction_get_for_update_cf(txn2, roptions, cfh, "cf_foo", 6, + &val_len_cf, true, &conflict_err_cf); + CheckCondition(conflict_err_cf != NULL); + Free(&conflict_err_cf); + + rocksdb_transaction_commit(txn, &err); + CheckNoError(err); + + CheckTxnGetForUpdateCF(txn2, roptions, cfh, "cf_foo", "cf_hello"); + + rocksdb_transaction_commit(txn2, &err); + CheckNoError(err); + + // close and destroy + rocksdb_column_family_handle_destroy(cfh); + rocksdb_transaction_destroy(txn); + rocksdb_transaction_destroy(txn2); + rocksdb_transactiondb_close(txn_db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + rocksdb_transaction_options_destroy(txn_options); + rocksdb_transactiondb_options_destroy(txn_db_options); + } + StartPhase("optimistic_transactions"); { rocksdb_options_t* db_options = rocksdb_options_create(); @@ -3235,8 +3459,19 @@ int main(int argc, char** argv) { rocksdb_put(db, woptions, "key", 3, "value", 5, &err); CheckNoError(err); rocksdb_column_family_handle_t *cfh1, *cfh2; - cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err); - cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err); + char** list_const_cf_names = (char**)malloc(2 * sizeof(char*)); + list_const_cf_names[0] = "txn_db_cf1"; + list_const_cf_names[1] = "txn_db_cf2"; + size_t cflen; + rocksdb_column_family_handle_t** list_cfh = rocksdb_create_column_families( + db, db_options, 2, (const char* const*)list_const_cf_names, &cflen, + &err); + free(list_const_cf_names); + CheckNoError(err); + assert(cflen == 2); + cfh1 = list_cfh[0]; + cfh2 = list_cfh[1]; + rocksdb_create_column_families_destroy(list_cfh); txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options, NULL); rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err); @@ -3450,6 +3685,126 @@ int main(int argc, char** argv) { rocksdb_readoptions_destroy(ropts); } + StartPhase("statistics"); + { + const uint32_t BYTES_WRITTEN_TICKER = 40; + const uint32_t DB_WRITE_HIST = 1; + + rocksdb_statistics_histogram_data_t* hist = + rocksdb_statistics_histogram_data_create(); + { + // zero by default + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist)); + CheckCondition(0.0 == + rocksdb_statistics_histogram_data_get_average(hist)); + CheckCondition(0.0 == + rocksdb_statistics_histogram_data_get_std_dev(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist)); + CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist)); + CheckCondition(0 == 
rocksdb_statistics_histogram_data_get_sum(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_min(hist)); + } + + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_enable_statistics(options); + rocksdb_options_set_statistics_level(options, rocksdb_statistics_level_all); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + CheckCondition(0 == rocksdb_options_statistics_get_ticker_count( + options, BYTES_WRITTEN_TICKER)); + rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_average(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_std_dev(hist)); + CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist)); + CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist)); + CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist)); + + int i; + for (i = 0; i < 10; ++i) { + char key = '0' + (char)i; + rocksdb_put(db, woptions, &key, 1, "", 1, &err); + CheckNoError(err); + } + CheckCondition(0 != rocksdb_options_statistics_get_ticker_count( + options, BYTES_WRITTEN_TICKER)); + rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist); + CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_median(hist)); + CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p95(hist)); + CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p99(hist)); + CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_average(hist)); + CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_std_dev(hist)); + CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_max(hist)); + CheckCondition(0 != rocksdb_statistics_histogram_data_get_count(hist)); + CheckCondition(0 != rocksdb_statistics_histogram_data_get_sum(hist)); + + rocksdb_statistics_histogram_data_destroy(hist); + } + + StartPhase("wait_for_compact_options"); + { + rocksdb_wait_for_compact_options_t* wco; + wco = rocksdb_wait_for_compact_options_create(); + + rocksdb_wait_for_compact_options_set_abort_on_pause(wco, 1); + CheckCondition(1 == + rocksdb_wait_for_compact_options_get_abort_on_pause(wco)); + + rocksdb_wait_for_compact_options_set_flush(wco, 1); + CheckCondition(1 == rocksdb_wait_for_compact_options_get_flush(wco)); + + rocksdb_wait_for_compact_options_set_close_db(wco, 1); + CheckCondition(1 == rocksdb_wait_for_compact_options_get_close_db(wco)); + + rocksdb_wait_for_compact_options_set_timeout(wco, 342); + CheckCondition(342 == rocksdb_wait_for_compact_options_get_timeout(wco)); + + rocksdb_wait_for_compact_options_destroy(wco); + } + StartPhase("wait_for_compact"); + { + rocksdb_wait_for_compact_options_t* wco; + wco = rocksdb_wait_for_compact_options_create(); + rocksdb_wait_for_compact_options_set_flush(wco, 1); + + rocksdb_wait_for_compact(db, wco, &err); + CheckNoError(err); + rocksdb_wait_for_compact_options_destroy(wco); + } + + StartPhase("write_buffer_manager"); + { + rocksdb_cache_t* lru; + lru = rocksdb_cache_create_lru(100); + + rocksdb_write_buffer_manager_t* write_buffer_manager; + write_buffer_manager = + rocksdb_write_buffer_manager_create_with_cache(200, lru, false); + + CheckCondition(true == + 
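Editor's note: a minimal sketch of reading statistics through the new histogram-data struct once statistics are enabled on the options object. BYTES_WRITTEN_TICKER and DB_WRITE_HIST are the illustrative constants declared by this test, not public API names:

    rocksdb_options_enable_statistics(opts);
    rocksdb_options_set_statistics_level(opts, rocksdb_statistics_level_all);
    /* ... open the DB with opts and perform some writes ... */
    uint64_t written =
        rocksdb_options_statistics_get_ticker_count(opts, BYTES_WRITTEN_TICKER);
    rocksdb_statistics_histogram_data_t* hist =
        rocksdb_statistics_histogram_data_create();
    rocksdb_options_statistics_get_histogram_data(opts, DB_WRITE_HIST, hist);
    double p99 = rocksdb_statistics_histogram_data_get_p99(hist);
    uint64_t samples = rocksdb_statistics_histogram_data_get_count(hist);
    /* written, p99 and samples are now available for reporting */
    rocksdb_statistics_histogram_data_destroy(hist);
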
rocksdb_write_buffer_manager_enabled(write_buffer_manager)); + CheckCondition(true == rocksdb_write_buffer_manager_cost_to_cache( + write_buffer_manager)); + CheckCondition( + 200 == rocksdb_write_buffer_manager_buffer_size(write_buffer_manager)); + + rocksdb_write_buffer_manager_set_buffer_size(write_buffer_manager, 300); + CheckCondition( + 300 == rocksdb_write_buffer_manager_buffer_size(write_buffer_manager)); + + rocksdb_write_buffer_manager_destroy(write_buffer_manager); + rocksdb_cache_destroy(lru); + } + StartPhase("cancel_all_background_work"); rocksdb_cancel_all_background_work(db, 1); @@ -3468,12 +3823,3 @@ int main(int argc, char** argv) { fprintf(stderr, "PASS\n"); return 0; } - -#else - -int main(void) { - fprintf(stderr, "SKIPPED\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/column_family.cc b/db/column_family.cc index c6602b688f89..dc74c16d7b33 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -53,11 +53,9 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl( ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { if (cfd_ != nullptr) { -#ifndef ROCKSDB_LITE for (auto& listener : cfd_->ioptions()->listeners) { listener->OnColumnFamilyHandleDeletionStarted(this); } -#endif // ROCKSDB_LITE // Job id == 0 means that this is not our background process, but rather // user thread // Need to hold some shared pointers owned by the initial_cf_options @@ -88,15 +86,10 @@ const std::string& ColumnFamilyHandleImpl::GetName() const { } Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) { -#ifndef ROCKSDB_LITE // accessing mutable cf-options requires db mutex. InstrumentedMutexLock l(mutex_); *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions()); return Status::OK(); -#else - (void)desc; - return Status::NotSupported(); -#endif // !ROCKSDB_LITE } const Comparator* ColumnFamilyHandleImpl::GetComparator() const { @@ -347,7 +340,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.hard_pending_compaction_bytes_limit; } -#ifndef ROCKSDB_LITE // When the DB is stopped, it's possible that there are some .trash files that // were not deleted yet, when we open the DB we will find these .trash files // and schedule them to be deleted (or delete immediately if SstFileManager @@ -359,7 +351,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, result.cf_paths[i].path) .PermitUncheckedError(); } -#endif if (result.cf_paths.empty()) { result.cf_paths = db_options.db_paths; @@ -391,8 +382,9 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; if (result.ttl == kDefaultTtl) { - if (is_block_based_table && - result.compaction_style != kCompactionStyleFIFO) { + if (is_block_based_table) { + // FIFO also requires max_open_files=-1, which is checked in + // ValidateOptions(). result.ttl = kAdjustedTtl; } else { result.ttl = 0; @@ -400,40 +392,35 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, } const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60; - - // Turn on periodic compactions and set them to occur once every 30 days if - // compaction filters are used and periodic_compaction_seconds is set to the - // default value. 
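Editor's note: a standalone sketch of the write-buffer-manager lifecycle exercised above: create one charged against a block cache, adjust its budget at runtime, and destroy it. Attaching the manager to DB options before opening is outside the scope of this snippet:

    rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 * 1024 * 1024);
    rocksdb_write_buffer_manager_t* wbm =
        rocksdb_write_buffer_manager_create_with_cache(32 * 1024 * 1024, cache, false);
    assert(rocksdb_write_buffer_manager_enabled(wbm));
    assert(rocksdb_write_buffer_manager_cost_to_cache(wbm));
    rocksdb_write_buffer_manager_set_buffer_size(wbm, 64 * 1024 * 1024);
    rocksdb_write_buffer_manager_set_allow_stall(wbm, 1);
    /* ... use the DB ... */
    rocksdb_write_buffer_manager_destroy(wbm);
    rocksdb_cache_destroy(cache);
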
- if (result.compaction_style != kCompactionStyleFIFO) { + if (result.compaction_style == kCompactionStyleLevel) { if ((result.compaction_filter != nullptr || result.compaction_filter_factory != nullptr) && result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && is_block_based_table) { result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; } - } else { - // result.compaction_style == kCompactionStyleFIFO - if (result.ttl == 0) { - if (is_block_based_table) { - if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { - result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; - } - result.ttl = result.periodic_compaction_seconds; - } - } else if (result.periodic_compaction_seconds != 0) { - result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); + } else if (result.compaction_style == kCompactionStyleUniversal) { + if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && + is_block_based_table) { + result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; + } + } else if (result.compaction_style == kCompactionStyleFIFO) { + if (result.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { + ROCKS_LOG_WARN( + db_options.info_log.get(), + "periodic_compaction_seconds does not support FIFO compaction. You" + "may want to set option TTL instead."); } } - // TTL compactions would work similar to Periodic Compactions in Universal in - // most of the cases. So, if ttl is set, execute the periodic compaction - // codepath. - if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) { - if (result.periodic_compaction_seconds != 0) { + // For universal compaction, `ttl` and `periodic_compaction_seconds` mean the + // same thing, take the stricter value. + if (result.compaction_style == kCompactionStyleUniversal) { + if (result.periodic_compaction_seconds == 0) { + result.periodic_compaction_seconds = result.ttl; + } else if (result.ttl != 0) { result.periodic_compaction_seconds = std::min(result.ttl, result.periodic_compaction_seconds); - } else { - result.periodic_compaction_seconds = result.ttl; } } @@ -489,6 +476,7 @@ void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, mem = new_mem; imm = new_imm; current = new_current; + full_history_ts_low = cfd->GetFullHistoryTsLow(); cfd->Ref(); mem->Ref(); imm->Ref(); @@ -602,7 +590,6 @@ ColumnFamilyData::ColumnFamilyData( if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); -#ifndef ROCKSDB_LITE } else if (ioptions_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( new UniversalCompactionPicker(ioptions_, &internal_comparator_)); @@ -616,7 +603,6 @@ ColumnFamilyData::ColumnFamilyData( "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", GetName().c_str()); -#endif // !ROCKSDB_LITE } else { ROCKS_LOG_ERROR(ioptions_.logger, "Unable to recognize the specified compaction style %d. 
" @@ -880,7 +866,7 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, } } // anonymous namespace -std::pair +std::pair ColumnFamilyData::GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, uint64_t num_compaction_needed_bytes, @@ -942,7 +928,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( // - `SetOptions()` with `disable_write_stall=true` // - Compaction finishes and calls `InstallSuperVersion` with // `mutable_cf_options(disable_write_stall=true)` - std::pair + std::pair write_stall_condition_and_cause; if (mutable_cf_options_.disable_write_stall) { write_stall_condition_and_cause = {WriteStallCondition::kNormal, @@ -975,7 +961,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( - InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1); + InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION, + 1); } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", @@ -996,7 +983,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped, mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " @@ -1014,11 +1001,11 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS, - 1); + internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( - InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1); + InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION, + 1); } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " @@ -1044,7 +1031,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); internal_stats_->AddCFStats( - InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1); + InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " @@ -1154,7 +1141,7 @@ Compaction* ColumnFamilyData::PickCompaction( GetName(), mutable_options, mutable_db_options, current_->storage_info(), log_buffer); if (result != nullptr) { - result->SetInputVersion(current_); + result->FinalizeInputInfo(current_); } return result; } @@ -1173,6 +1160,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( *overlap = false; // Create an InternalIterator over all unflushed memtables Arena arena; + // TODO: plumb Env::IOActivity ReadOptions read_opts; read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); @@ -1237,7 +1225,7 @@ Compaction* ColumnFamilyData::CompactRange( compact_range_options, begin, end, compaction_end, 
conflict, max_file_num_to_ignore, trim_ts); if (result != nullptr) { - result->SetInputVersion(current_); + result->FinalizeInputInfo(current_); } TEST_SYNC_POINT("ColumnFamilyData::CompactRange:Return"); return result; @@ -1276,30 +1264,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) { // (if no Scrape happens). assert(ptr != SuperVersion::kSVInUse); SuperVersion* sv = static_cast(ptr); - if (sv == SuperVersion::kSVObsolete || - sv->version_number != super_version_number_.load()) { + if (sv == SuperVersion::kSVObsolete) { RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); - SuperVersion* sv_to_delete = nullptr; - - if (sv && sv->Unref()) { - RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); - db->mutex()->Lock(); - // NOTE: underlying resources held by superversion (sst files) might - // not be released until the next background job. - sv->Cleanup(); - if (db->immutable_db_options().avoid_unnecessary_blocking_io) { - db->AddSuperVersionsToFreeQueue(sv); - db->SchedulePurge(); - } else { - sv_to_delete = sv; - } - } else { - db->mutex()->Lock(); - } + db->mutex()->Lock(); sv = super_version_->Ref(); db->mutex()->Unlock(); - - delete sv_to_delete; } assert(sv != nullptr); return sv; @@ -1337,8 +1306,6 @@ void ColumnFamilyData::InstallSuperVersion( new_superversion->Init(this, mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; - ++super_version_number_; - super_version_->version_number = super_version_number_; if (old_superversion == nullptr || old_superversion->current != current() || old_superversion->mem != mem_ || old_superversion->imm != imm_.current() || @@ -1407,6 +1374,8 @@ void ColumnFamilyData::InstallSuperVersion( sv_context->superversions_to_free.push_back(old_superversion); } } + ++super_version_number_; + super_version_->version_number = super_version_number_; } void ColumnFamilyData::ResetThreadLocalSuperVersions() { @@ -1464,6 +1433,33 @@ Status ColumnFamilyData::ValidateOptions( } } + const auto* ucmp = cf_options.comparator; + assert(ucmp); + if (ucmp->timestamp_size() > 0 && + !cf_options.persist_user_defined_timestamps) { + if (db_options.atomic_flush) { + return Status::NotSupported( + "Not persisting user-defined timestamps feature is not supported" + "in combination with atomic flush."); + } + if (db_options.allow_concurrent_memtable_write) { + return Status::NotSupported( + "Not persisting user-defined timestamps feature is not supported" + " in combination with concurrent memtable write."); + } + const char* comparator_name = cf_options.comparator->Name(); + size_t name_size = strlen(comparator_name); + const char* suffix = ".u64ts"; + size_t suffix_size = strlen(suffix); + if (name_size <= suffix_size || + strcmp(comparator_name + name_size - suffix_size, suffix) != 0) { + return Status::NotSupported( + "Not persisting user-defined timestamps" + "feature only support user-defined timestamps formatted as " + "uint64_t."); + } + } + if (cf_options.enable_blob_garbage_collection) { if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || cf_options.blob_garbage_collection_age_cutoff > 1.0) { @@ -1493,10 +1489,40 @@ Status ColumnFamilyData::ValidateOptions( "Memtable per key-value checksum protection only supports 0, 1, 2, 4 " "or 8 bytes per key."); } + if (std::find(supported.begin(), supported.end(), + cf_options.block_protection_bytes_per_key) == supported.end()) { + return Status::NotSupported( + "Block per key-value checksum protection only supports 0, 
1, 2, 4 " + "or 8 bytes per key."); + } + + if (!cf_options.compaction_options_fifo.file_temperature_age_thresholds + .empty()) { + if (cf_options.compaction_style != kCompactionStyleFIFO) { + return Status::NotSupported( + "Option file_temperature_age_thresholds only supports FIFO " + "compaction."); + } else if (cf_options.num_levels > 1) { + return Status::NotSupported( + "Option file_temperature_age_thresholds is only supported when " + "num_levels = 1."); + } else { + const auto& ages = + cf_options.compaction_options_fifo.file_temperature_age_thresholds; + assert(ages.size() >= 1); + // check that age is sorted + for (size_t i = 0; i < ages.size() - 1; ++i) { + if (ages[i].age >= ages[i + 1].age) { + return Status::NotSupported( + "Option file_temperature_age_thresholds requires elements to be " + "sorted in increasing order with respect to `age` field."); + } + } + } + } return s; } -#ifndef ROCKSDB_LITE Status ColumnFamilyData::SetOptions( const DBOptions& db_opts, const std::unordered_map& options_map) { @@ -1524,7 +1550,6 @@ Status ColumnFamilyData::SetOptions( } return s; } -#endif // ROCKSDB_LITE // REQUIRES: DB mutex held Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { @@ -1583,6 +1608,34 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const { return data_dirs_[path_id].get(); } +bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT( + uint64_t max_memtable_id) { + const Comparator* ucmp = user_comparator(); + const size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0 || ioptions_.persist_user_defined_timestamps) { + return false; + } + // If users set the `persist_user_defined_timestamps` flag to false, they + // should also set the `full_history_ts_low` flag to indicate the range of + // user-defined timestamps to retain in memory. Otherwise, we do not + // explicitly postpone flush to retain UDTs. + const std::string& full_history_ts_low = GetFullHistoryTsLow(); + if (full_history_ts_low.empty()) { + return false; + } + for (const Slice& table_newest_udt : + imm()->GetTablesNewestUDT(max_memtable_id)) { + assert(table_newest_udt.size() == full_history_ts_low.size()); + // Checking the newest UDT contained in MemTable with ascending ID up to + // `max_memtable_id`. Return immediately on finding the first MemTable that + // needs postponing. 
+ if (ucmp->CompareTimestamp(table_newest_udt, full_history_ts_low) >= 0) { + return true; + } + } + return false; +} + void ColumnFamilyData::RecoverEpochNumbers() { assert(current_); auto* vstorage = current_->storage_info(); @@ -1685,6 +1738,13 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( db_id_, db_session_id_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); + auto ucmp = new_cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + running_ts_sz_.insert({id, ts_sz}); + if (ts_sz > 0) { + ts_sz_for_record_.insert({id, ts_sz}); + } max_column_family_ = std::max(max_column_family_, id); // add to linked list new_cfd->next_ = dummy_cfd_; @@ -1700,10 +1760,13 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( // under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { - auto cfd_iter = column_family_data_.find(cfd->GetID()); + uint32_t cf_id = cfd->GetID(); + auto cfd_iter = column_family_data_.find(cf_id); assert(cfd_iter != column_family_data_.end()); column_family_data_.erase(cfd_iter); column_families_.erase(cfd->GetName()); + running_ts_sz_.erase(cf_id); + ts_sz_for_record_.erase(cf_id); } // under a DB mutex OR from a write thread @@ -1750,4 +1813,20 @@ const Comparator* GetColumnFamilyUserComparator( return nullptr; } +const ImmutableOptions& GetImmutableOptions(ColumnFamilyHandle* column_family) { + assert(column_family); + + ColumnFamilyHandleImpl* const handle = + static_cast_with_check(column_family); + assert(handle); + + const ColumnFamilyData* const cfd = handle->cfd(); + assert(cfd); + + const ImmutableOptions* ioptions = cfd->ioptions(); + assert(ioptions); + + return *ioptions; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/column_family.h b/db/column_family.h index 0d5ebc454598..3a78ae875a73 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -211,6 +211,12 @@ struct SuperVersion { // Version number of the current SuperVersion uint64_t version_number; WriteStallCondition write_stall_condition; + // Each time `full_history_ts_low` collapses history, a new SuperVersion is + // installed. This field tracks the effective `full_history_ts_low` for that + // SuperVersion, to be used by read APIs for sanity checks. This field is + // immutable once SuperVersion is installed. For column family that doesn't + // enable UDT feature, this is an empty string. + std::string full_history_ts_low; // should be called outside the mutex SuperVersion() = default; @@ -335,12 +341,10 @@ class ColumnFamilyData { // Validate CF options against DB options static Status ValidateOptions(const DBOptions& db_options, const ColumnFamilyOptions& cf_options); -#ifndef ROCKSDB_LITE // REQUIRES: DB mutex held Status SetOptions( const DBOptions& db_options, const std::unordered_map& options_map); -#endif // ROCKSDB_LITE InternalStats* internal_stats() { return internal_stats_.get(); } @@ -465,12 +469,6 @@ class ColumnFamilyData { bool queued_for_flush() { return queued_for_flush_; } bool queued_for_compaction() { return queued_for_compaction_; } - enum class WriteStallCause { - kNone, - kMemtableLimit, - kL0FileCountLimit, - kPendingCompactionBytes, - }; static std::pair GetWriteStallConditionAndCause( int num_unflushed_memtables, int num_l0_files, @@ -515,6 +513,12 @@ class ColumnFamilyData { return full_history_ts_low_; } + // REQUIRES: DB mutex held. 
+ // Return true if flushing up to MemTables with ID `max_memtable_id` + // should be postponed to retain user-defined timestamps according to the + // user's setting. Called by background flush job. + bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id); + ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } std::shared_ptr @@ -522,8 +526,6 @@ class ColumnFamilyData { return file_metadata_cache_res_mgr_; } - SequenceNumber GetFirstMemtableSequenceNumber() const; - static const uint32_t kDummyColumnFamilyDataId; // Keep track of whether the mempurge feature was ever used. @@ -714,6 +716,16 @@ class ColumnFamilySet { Version* dummy_version, const ColumnFamilyOptions& options); + const UnorderedMap& GetRunningColumnFamiliesTimestampSize() + const { + return running_ts_sz_; + } + + const UnorderedMap& + GetColumnFamiliesTimestampSizeForRecord() const { + return ts_sz_for_record_; + } + iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } @@ -739,6 +751,15 @@ class ColumnFamilySet { UnorderedMap column_families_; UnorderedMap column_family_data_; + // Mutating / reading `running_ts_sz_` and `ts_sz_for_record_` follow + // the same requirements as `column_families_` and `column_family_data_`. + // Mapping from column family id to user-defined timestamp size for all + // running column families. + UnorderedMap running_ts_sz_; + // Mapping from column family id to user-defined timestamp size for + // column families with non-zero user-defined timestamp size. + UnorderedMap ts_sz_for_record_; + uint32_t max_column_family_; const FileOptions file_options_; @@ -857,4 +878,7 @@ extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); extern const Comparator* GetColumnFamilyUserComparator( ColumnFamilyHandle* column_family); +extern const ImmutableOptions& GetImmutableOptions( + ColumnFamilyHandle* column_family); + } // namespace ROCKSDB_NAMESPACE diff --git a/db/column_family_test.cc b/db/column_family_test.cc index d33cbe50a775..25bc0b36f61d 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include +#include #include #include #include @@ -17,6 +18,7 @@ #include "options/options_parser.h" #include "port/port.h" #include "port/stack_trace.h" +#include "rocksdb/comparator.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -26,6 +28,7 @@ #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/coding.h" +#include "util/defer.h" #include "util/string_util.h" #include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" @@ -63,6 +66,9 @@ class ColumnFamilyTestBase : public testing::Test { db_options_.create_if_missing = true; db_options_.fail_if_options_file_error = true; db_options_.env = env_; + } + + void SetUp() override { EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_))); } @@ -71,11 +77,7 @@ class ColumnFamilyTestBase : public testing::Test { for (auto h : handles_) { ColumnFamilyDescriptor cfdescriptor; Status s = h->GetDescriptor(&cfdescriptor); -#ifdef ROCKSDB_LITE - EXPECT_TRUE(s.IsNotSupported()); -#else EXPECT_OK(s); -#endif // ROCKSDB_LITE column_families.push_back(cfdescriptor); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -197,12 +199,10 @@ class ColumnFamilyTestBase : public testing::Test { &db_); } -#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported void AssertOpenReadOnly(std::vector cf, std::vector options = {}) { ASSERT_OK(OpenReadOnly(cf, options)); } -#endif // !ROCKSDB_LITE void Open(std::vector cf, std::vector options = {}) { @@ -224,27 +224,16 @@ class ColumnFamilyTestBase : public testing::Test { } bool IsDbWriteStopped() { -#ifndef ROCKSDB_LITE uint64_t v; EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v)); return (v == 1); -#else - return dbfull()->TEST_write_controler().IsStopped(); -#endif // !ROCKSDB_LITE } uint64_t GetDbDelayedWriteRate() { -#ifndef ROCKSDB_LITE uint64_t v; EXPECT_TRUE( dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)); return v; -#else - if (!dbfull()->TEST_write_controler().NeedsDelay()) { - return 0; - } - return dbfull()->TEST_write_controler().delayed_write_rate(); -#endif // !ROCKSDB_LITE } void Destroy(const std::vector& column_families = @@ -267,7 +256,6 @@ class ColumnFamilyTestBase : public testing::Test { db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi])); names_[cfi] = cfs[i]; -#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor // Verify the CF options of the returned CF handle. 
ColumnFamilyDescriptor desc; ASSERT_OK(handles_[cfi]->GetDescriptor(&desc)); @@ -276,7 +264,6 @@ class ColumnFamilyTestBase : public testing::Test { ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( ConfigOptions(), desc.options, SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt))); -#endif // !ROCKSDB_LITE cfi++; } } @@ -325,7 +312,6 @@ class ColumnFamilyTestBase : public testing::Test { ASSERT_OK(db_->FlushWAL(/*sync=*/false)); } -#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite void WaitForFlush(int cf) { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf])); } @@ -339,7 +325,6 @@ class ColumnFamilyTestBase : public testing::Test { void AssertMaxTotalInMemoryState(uint64_t value) { ASSERT_EQ(value, MaxTotalInMemoryState()); } -#endif // !ROCKSDB_LITE Status Put(int cf, const std::string& key, const std::string& value) { return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value)); @@ -377,7 +362,6 @@ class ColumnFamilyTestBase : public testing::Test { "rocksdb.num-files-at-level" + std::to_string(level)); } -#ifndef ROCKSDB_LITE // Return spread of files per level std::string FilesPerLevel(int cf) { std::string result; @@ -394,31 +378,19 @@ class ColumnFamilyTestBase : public testing::Test { result.resize(last_non_zero_offset); return result; } -#endif void AssertFilesPerLevel(const std::string& value, int cf) { -#ifndef ROCKSDB_LITE ASSERT_EQ(value, FilesPerLevel(cf)); -#else - (void)value; - (void)cf; -#endif } -#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported int CountLiveFiles() { std::vector metadata; db_->GetLiveFilesMetaData(&metadata); return static_cast(metadata.size()); } -#endif // !ROCKSDB_LITE void AssertCountLiveFiles(int expected_value) { -#ifndef ROCKSDB_LITE ASSERT_EQ(expected_value, CountLiveFiles()); -#else - (void)expected_value; -#endif } // Do n memtable flushes, each of which produces an sstable @@ -432,7 +404,6 @@ class ColumnFamilyTestBase : public testing::Test { } } -#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported int CountLiveLogFiles() { int micros_wait_for_log_deletion = 20000; env_->SleepForMicroseconds(micros_wait_for_log_deletion); @@ -461,25 +432,18 @@ class ColumnFamilyTestBase : public testing::Test { return ret; return 0; } -#endif // !ROCKSDB_LITE void AssertCountLiveLogFiles(int value) { -#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported ASSERT_EQ(value, CountLiveLogFiles()); -#else - (void)value; -#endif // !ROCKSDB_LITE } void AssertNumberOfImmutableMemtables(std::vector num_per_cf) { assert(num_per_cf.size() == handles_.size()); -#ifndef ROCKSDB_LITE // GetProperty is not supported in lite for (size_t i = 0; i < num_per_cf.size(); ++i) { ASSERT_EQ(num_per_cf[i], GetProperty(static_cast(i), "rocksdb.num-immutable-mem-table")); } -#endif // !ROCKSDB_LITE } void CopyFile(const std::string& source, const std::string& destination, @@ -575,7 +539,6 @@ TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) { } } -#ifndef ROCKSDB_LITE TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { Open(); @@ -598,7 +561,6 @@ TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // !ROCKSDB_LITE class FlushEmptyCFTestWithParam : public ColumnFamilyTestBase, @@ -942,7 +904,6 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) { } } -#ifndef ROCKSDB_LITE // TEST functions used are not supported TEST_P(ColumnFamilyTest, FlushTest) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); @@ -1057,7 +1018,6 
@@ TEST_P(ColumnFamilyTest, LogDeletionTest) { AssertCountLiveLogFiles(4); Close(); } -#endif // !ROCKSDB_LITE TEST_P(ColumnFamilyTest, CrashAfterFlush) { std::unique_ptr fault_env( @@ -1097,7 +1057,6 @@ TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) { ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument()); } -#ifndef ROCKSDB_LITE // WaitForFlush() is not supported // Makes sure that obsolete log files get deleted TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { // disable flushing stale column families @@ -1205,14 +1164,12 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { AssertCountLiveLogFiles(7); Close(); } -#endif // !ROCKSDB_LITE // The test is commented out because we want to test that snapshot is // not created for memtables not supported it, but There isn't a memtable // that doesn't support snapshot right now. If we have one later, we can // re-enable the test. // -// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite // TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) { // db_options_.allow_concurrent_memtable_write = false; // Open(); @@ -1232,7 +1189,6 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) { // {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr); // Close(); // } -// #endif // !ROCKSDB_LITE class TestComparator : public Comparator { int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/, @@ -1299,13 +1255,13 @@ TEST_P(ColumnFamilyTest, DifferentMergeOperators) { Close(); } -#ifndef ROCKSDB_LITE // WaitForFlush() is not supported TEST_P(ColumnFamilyTest, DifferentCompactionStyles) { Open(); CreateColumnFamilies({"one", "two"}); ColumnFamilyOptions default_cf, one, two; db_options_.max_open_files = 20; // only 10 files in file cache + default_cf.level_compaction_dynamic_level_bytes = false; default_cf.compaction_style = kCompactionStyleLevel; default_cf.num_levels = 3; default_cf.write_buffer_size = 64 << 10; // 64KB @@ -1323,6 +1279,7 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) { one.level0_file_num_compaction_trigger = 4; one.write_buffer_size = 120000; + two.level_compaction_dynamic_level_bytes = false; two.compaction_style = kCompactionStyleLevel; two.num_levels = 4; two.level0_file_num_compaction_trigger = 3; @@ -1367,9 +1324,7 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) { Close(); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // Sync points not supported in RocksDB Lite TEST_P(ColumnFamilyTest, MultipleManualCompactions) { @@ -1379,6 +1334,7 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) { db_options_.max_open_files = 20; // only 10 files in file cache db_options_.max_background_compactions = 3; + default_cf.level_compaction_dynamic_level_bytes = false; default_cf.compaction_style = kCompactionStyleLevel; default_cf.num_levels = 3; default_cf.write_buffer_size = 64 << 10; // 64KB @@ -1395,6 +1351,7 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) { one.level0_file_num_compaction_trigger = 4; one.write_buffer_size = 120000; + two.level_compaction_dynamic_level_bytes = false; two.compaction_style = kCompactionStyleLevel; two.num_levels = 4; two.level0_file_num_compaction_trigger = 3; @@ -1477,13 +1434,14 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) { db_options_.max_open_files = 20; // only 10 files in file cache db_options_.max_background_compactions = 3; + default_cf.level_compaction_dynamic_level_bytes = false; default_cf.compaction_style = kCompactionStyleLevel; default_cf.num_levels = 3; default_cf.write_buffer_size = 64 << 10; // 64KB 
default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - ; + table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1494,6 +1452,7 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) { one.level0_file_num_compaction_trigger = 4; one.write_buffer_size = 120000; + two.level_compaction_dynamic_level_bytes = false; two.compaction_style = kCompactionStyleLevel; two.num_levels = 4; two.level0_file_num_compaction_trigger = 3; @@ -1572,13 +1531,14 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) { db_options_.max_open_files = 20; // only 10 files in file cache db_options_.max_background_compactions = 3; + default_cf.level_compaction_dynamic_level_bytes = false; default_cf.compaction_style = kCompactionStyleLevel; default_cf.num_levels = 3; default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.target_file_size_base = 30 << 10; default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100; BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); - ; + table_options.no_block_cache = true; default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1589,6 +1549,7 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) { one.level0_file_num_compaction_trigger = 4; one.write_buffer_size = 120000; + two.level_compaction_dynamic_level_bytes = false; two.compaction_style = kCompactionStyleLevel; two.num_levels = 4; two.level0_file_num_compaction_trigger = 3; @@ -2033,9 +1994,7 @@ TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // Tailing iterator not supported namespace { std::string IterStatus(Iterator* iter) { std::string result; @@ -2093,9 +2052,7 @@ TEST_P(ColumnFamilyTest, NewIteratorsTest) { Destroy(); } } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported TEST_P(ColumnFamilyTest, ReadOnlyDBTest) { Open(); CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); @@ -2144,9 +2101,7 @@ TEST_P(ColumnFamilyTest, ReadOnlyDBTest) { s = OpenReadOnly({"one", "four"}); ASSERT_TRUE(!s.ok()); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite TEST_P(ColumnFamilyTest, DontRollEmptyLogs) { Open(); CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); @@ -2168,9 +2123,7 @@ TEST_P(ColumnFamilyTest, DontRollEmptyLogs) { ASSERT_EQ(static_cast(total_new_writable_files), handles_.size() + 1); Close(); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { Open(); CreateColumnFamilies({"one", "two"}); @@ -2217,15 +2170,58 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) { ASSERT_EQ(0, dbfull()->TEST_total_log_size()); Close(); } -#endif // !ROCKSDB_LITE + +namespace { +struct CountOptionsFilesFs : public FileSystemWrapper { + explicit CountOptionsFilesFs(const std::shared_ptr& t) + : FileSystemWrapper(t) {} + const char* Name() const override { return "CountOptionsFilesFs"; } + + IOStatus NewWritableFile(const std::string& f, const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + if (f.find("OPTIONS-") != std::string::npos) { + 
options_files_created.fetch_add(1, std::memory_order_relaxed); + } + return FileSystemWrapper::NewWritableFile(f, file_opts, r, dbg); + } + + std::atomic options_files_created{}; +}; +} // namespace TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) { - Status s = TryOpen({"one", "two"}); - ASSERT_TRUE(!s.ok()); + // Can't accidentally add CFs to an existing DB + Open(); + Close(); + ASSERT_FALSE(db_options_.create_missing_column_families); + ASSERT_NOK(TryOpen({"one", "two"})); + + // Nor accidentally create in a new DB + Destroy(); + db_options_.create_if_missing = true; + ASSERT_NOK(TryOpen({"one", "two"})); + + // Only with the option (new DB case) db_options_.create_missing_column_families = true; - s = TryOpen({"default", "one", "two"}); - ASSERT_TRUE(s.ok()); + // Also setup to count number of options files created (see check below) + auto my_fs = + std::make_shared(db_options_.env->GetFileSystem()); + auto my_env = std::make_unique(db_options_.env, my_fs); + SaveAndRestore save_restore_env(&db_options_.env, my_env.get()); + + ASSERT_OK(TryOpen({"default", "one", "two"})); Close(); + + // An older version would write an updated options file for each column + // family created under create_missing_column_families, which would be + // quadratic I/O in the number of column families. + ASSERT_EQ(my_fs->options_files_created.load(), 1); + + // Add to existing DB case + ASSERT_OK(TryOpen({"default", "one", "two", "three", "four"})); + Close(); + ASSERT_EQ(my_fs->options_files_created.load(), 2); } TEST_P(ColumnFamilyTest, SanitizeOptions) { @@ -2457,8 +2453,6 @@ TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) { Destroy(); } -#ifndef ROCKSDB_LITE -// skipped as persisting options is not supported in ROCKSDB_LITE namespace { std::atomic test_stage(0); std::atomic ordered_by_writethread(false); @@ -2479,7 +2473,10 @@ void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id, } } // anonymous namespace -TEST_P(ColumnFamilyTest, CreateAndDropRace) { +// This test attempts to set up a race condition in a way that is no longer +// possible, causing the test to hang. If DBImpl::options_mutex_ is removed +// in the future, this test might become relevant again. +TEST_P(ColumnFamilyTest, DISABLED_CreateAndDropRace) { const int kCfCount = 5; std::vector cf_opts; std::vector comparators; @@ -2540,7 +2537,53 @@ TEST_P(ColumnFamilyTest, CreateAndDropRace) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // !ROCKSDB_LITE + +TEST_P(ColumnFamilyTest, CreateAndDropPeriodicRace) { + // This is a mini-stress test looking for inconsistency between the set of + // CFs in the DB, particularly whether any use preserve_internal_time_seconds, + // and whether that is accurately reflected in the periodic task setup. + constexpr size_t kNumThreads = 12; + std::vector threads; + bool last_cf_on = Random::GetTLSInstance()->OneIn(2); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::RegisterRecordSeqnoTimeWorker:BeforePeriodicTaskType", + [&](void* /*arg*/) { std::this_thread::yield(); }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_EQ(column_family_options_.preserve_internal_time_seconds, 0U); + ColumnFamilyOptions other_opts = column_family_options_; + ColumnFamilyOptions last_opts = column_family_options_; + (last_cf_on ? 
last_opts : other_opts).preserve_internal_time_seconds = + 1000000; + Open(); + + for (size_t i = 0; i < kNumThreads; i++) { + threads.emplace_back([this, &other_opts, i]() { + ColumnFamilyHandle* cfh; + ASSERT_OK(db_->CreateColumnFamily(other_opts, std::to_string(i), &cfh)); + ASSERT_OK(db_->DropColumnFamily(cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); + }); + } + + ColumnFamilyHandle* last_cfh; + ASSERT_OK(db_->CreateColumnFamily(last_opts, "last", &last_cfh)); + + for (auto& t : threads) { + t.join(); + } + + bool task_enabled = dbfull()->TEST_GetPeriodicTaskScheduler().TEST_HasTask( + PeriodicTaskType::kRecordSeqnoTime); + ASSERT_EQ(last_cf_on, task_enabled); + + ASSERT_OK(db_->DropColumnFamily(last_cfh)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(last_cfh)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) { const uint64_t kBaseRate = 800000u; @@ -2950,7 +2993,6 @@ TEST_P(ColumnFamilyTest, CreateDropAndDestroy) { ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh)); } -#ifndef ROCKSDB_LITE TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) { ColumnFamilyHandle* cfh; Open(); @@ -3005,9 +3047,7 @@ TEST_P(ColumnFamilyTest, FlushCloseWALFiles) { db_options_.env = env_; Close(); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // WaitForFlush() is not supported TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) { SpecialEnv env(Env::Default()); db_options_.env = &env; @@ -3114,9 +3154,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) { db_options_.env = env_; Close(); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // TEST functions are not supported in lite TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { SpecialEnv env(Env::Default()); // Allow both of flush and purge job to schedule. @@ -3192,7 +3230,6 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) { db_options_.env = env_; Close(); } -#endif // !ROCKSDB_LITE // Disable on windows because SyncWAL requires env->IsSyncThreadSafe() // to return true which is not so in unbuffered mode. @@ -3443,6 +3480,205 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) { ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options)); } +// Tests the flushing behavior of a column family to retain user-defined +// timestamp when `persist_user_defined_timestamp` is false. 
+class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase { + public: + ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {} + + void SetUp() override { + db_options_.allow_concurrent_memtable_write = false; + column_family_options_.comparator = + test::BytewiseComparatorWithU64TsWrapper(); + column_family_options_.persist_user_defined_timestamps = false; + ColumnFamilyTestBase::SetUp(); + } + + Status Put(int cf, const std::string& key, const std::string& ts, + const std::string& value) { + return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(ts), + Slice(value)); + } +}; + +class TestTsComparator : public Comparator { + public: + TestTsComparator() : Comparator(8 /*ts_sz*/) {} + + int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/, + const ROCKSDB_NAMESPACE::Slice& /*b*/) const override { + return 0; + } + const char* Name() const override { return "TestTs"; } + void FindShortestSeparator( + std::string* /*start*/, + const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {} + void FindShortSuccessor(std::string* /*key*/) const override {} +}; + +TEST_F(ColumnFamilyRetainUDTTest, SanityCheck) { + Open(); + ColumnFamilyOptions cf_options; + cf_options.persist_user_defined_timestamps = false; + TestTsComparator test_comparator; + cf_options.comparator = &test_comparator; + ColumnFamilyHandle* handle; + // Not persisting user-defined timestamps feature only supports user-defined + // timestamps formatted as uint64_t. + ASSERT_TRUE( + db_->CreateColumnFamily(cf_options, "pikachu", &handle).IsNotSupported()); + + Destroy(); + // Not persisting user-defined timestamps feature doesn't work in combination + // with atomic flush. + db_options_.atomic_flush = true; + ASSERT_TRUE(TryOpen({"default"}).IsNotSupported()); + + // Not persisting user-defined timestamps feature doesn't work in combination + // with concurrent memtable write. + db_options_.atomic_flush = false; + db_options_.allow_concurrent_memtable_write = true; + ASSERT_TRUE(TryOpen({"default"}).IsNotSupported()); + Close(); +} + +TEST_F(ColumnFamilyRetainUDTTest, FullHistoryTsLowNotSet) { + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto reschedule_count = *static_cast(arg); + ASSERT_EQ(1, reschedule_count); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + Open(); + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(Put(0, "foo", write_ts, "v1")); + // No `full_history_ts_low` explicitly set by user, flush is continued + // without checking if its UDTs expired. 
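The fixture above exercises user-defined timestamps (UDTs) that are kept in memtables but stripped before data is persisted to SST files. A hedged sketch of the corresponding user-facing configuration, using the public 64-bit timestamp comparator (the path, key, and values are illustrative):

#include <cassert>
#include <cstdint>
#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // 64-bit user-defined timestamp appended to every key.
  options.comparator = rocksdb::BytewiseComparatorWithU64Ts();
  // Keep timestamps in memtables only; strip them when flushing to SSTs.
  options.persist_user_defined_timestamps = false;
  // As the SanityCheck test above shows, this mode currently requires
  // disabling concurrent memtable writes.
  options.allow_concurrent_memtable_write = false;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/udt_example_db", &db);
  assert(s.ok());

  // Fixed-width little-endian encoding of the timestamp, as PutFixed64 does.
  auto encode_ts = [](uint64_t t) {
    std::string out(8, '\0');
    for (int i = 0; i < 8; ++i) {
      out[i] = static_cast<char>((t >> (8 * i)) & 0xff);
    }
    return out;
  };

  s = db->Put(rocksdb::WriteOptions(), db->DefaultColumnFamily(), "foo",
              encode_ts(1), "v1");
  assert(s.ok());
  delete db;
  return 0;
}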
+ ASSERT_OK(Flush(0)); + + // After flush, `full_history_ts_low` should be automatically advanced to + // the effective cutoff timestamp: write_ts + 1 + std::string cutoff_ts; + PutFixed64(&cutoff_ts, 2); + std::string effective_full_history_ts_low; + ASSERT_OK( + db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low)); + ASSERT_EQ(cutoff_ts, effective_full_history_ts_low); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(ColumnFamilyRetainUDTTest, AllKeysExpired) { + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto reschedule_count = *static_cast(arg); + ASSERT_EQ(1, reschedule_count); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + Open(); + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(Put(0, "foo", write_ts, "v1")); + std::string cutoff_ts; + PutFixed64(&cutoff_ts, 3); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts)); + // All keys expired w.r.t the configured `full_history_ts_low`, flush continue + // without the need for a re-schedule. + ASSERT_OK(Flush(0)); + + // `full_history_ts_low` stays unchanged after flush. + std::string effective_full_history_ts_low; + ASSERT_OK( + db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low)); + ASSERT_EQ(cutoff_ts, effective_full_history_ts_low); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} +TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushToAvoidWriteStall) { + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto reschedule_count = *static_cast(arg); + ASSERT_EQ(1, reschedule_count); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + Open(); + std::string cutoff_ts; + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(Put(0, "foo", write_ts, "v1")); + PutFixed64(&cutoff_ts, 1); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts)); + ASSERT_OK(db_->SetOptions(handles_[0], {{"max_write_buffer_number", "1"}})); + // Not all keys expired, but flush is continued without a re-schedule because + // of risk of write stall. 
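The knob these tests drive is the column family's `full_history_ts_low`: timestamps at or above it must be preserved, so flushing memtables that still contain live UDTs may be deferred. A rough sketch of raising and reading the cutoff from application code (the local encoder stands in for the internal PutFixed64 helper used in the tests; names are illustrative):

#include <cstdint>
#include <string>
#include "rocksdb/db.h"

// Fixed-width little-endian encoding of a uint64_t timestamp.
std::string EncodeU64Ts(uint64_t ts) {
  std::string out(8, '\0');
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<char>((ts >> (8 * i)) & 0xff);
  }
  return out;
}

// Raise the cutoff so history older than `ts` becomes collectible, then read
// back the effective value (a flush may advance it further on its own).
rocksdb::Status AdvanceCutoff(rocksdb::DB* db, uint64_t ts) {
  rocksdb::ColumnFamilyHandle* cf = db->DefaultColumnFamily();
  rocksdb::Status s = db->IncreaseFullHistoryTsLow(cf, EncodeU64Ts(ts));
  if (!s.ok()) {
    return s;
  }
  std::string effective;
  return db->GetFullHistoryTsLow(cf, &effective);
}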
+ ASSERT_OK(Flush(0)); + + // After flush, `full_history_ts_low` should be automatically advanced to + // the effective cutoff timestamp: write_ts + 1 + std::string effective_full_history_ts_low; + ASSERT_OK( + db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low)); + + cutoff_ts.clear(); + PutFixed64(&cutoff_ts, 2); + ASSERT_EQ(cutoff_ts, effective_full_history_ts_low); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushRescheduled) { + std::string cutoff_ts; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::AfterRetainUDTReschedule:cb", [&](void* /*arg*/) { + // Increasing full_history_ts_low so all keys expired after the initial + // FlushRequest is rescheduled + cutoff_ts.clear(); + PutFixed64(&cutoff_ts, 3); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts)); + }); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) { + ASSERT_NE(nullptr, arg); + auto reschedule_count = *static_cast(arg); + ASSERT_EQ(2, reschedule_count); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Open(); + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(Put(0, "foo", write_ts, "v1")); + PutFixed64(&cutoff_ts, 1); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts)); + // Not all keys expired, and there is no risk of write stall. Flush is + // rescheduled. The actual flush happens after `full_history_ts_low` is + // increased to mark all keys expired. + ASSERT_OK(Flush(0)); + + std::string effective_full_history_ts_low; + ASSERT_OK( + db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low)); + // `full_history_ts_low` stays unchanged. + ASSERT_EQ(cutoff_ts, effective_full_history_ts_low); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index ef38946f7e23..2d53f2b992dc 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include #include @@ -67,6 +66,7 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { const int kWriteBufferSize = 10000; const int kLevel0Trigger = 2; options.create_if_missing = true; + options.level_compaction_dynamic_level_bytes = false; options.compaction_style = kCompactionStyleLevel; // Small slowdown and stop trigger for experimental purpose. 
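All of the tests above follow the same sync-point discipline: install a callback on an internal hook, enable processing, run the scenario, then disable and clear so later tests are unaffected. The internal, test-only pattern reduced to its skeleton (the hook name is the one used by these tests):

#include "test_util/sync_point.h"

void RunScenarioWithHook() {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  sp->SetCallBack("DBImpl::BackgroundFlush:CheckFlushRequest:cb",
                  [](void* arg) {
                    // Inspect or mutate *arg while execution is paused here.
                    (void)arg;
                  });
  sp->EnableProcessing();
  // ... open the DB and exercise the code path under test ...
  sp->DisableProcessing();
  sp->ClearAllCallBacks();
}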
options.level0_slowdown_writes_trigger = 20; @@ -121,7 +121,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { TEST_F(CompactFilesTest, MultipleLevel) { Options options; options.create_if_missing = true; - options.level_compaction_dynamic_level_bytes = true; + // Otherwise background compaction can happen to + // drain unnecessary level + options.level_compaction_dynamic_level_bytes = false; options.num_levels = 6; // Add listener FlushedFileCollector* collector = new FlushedFileCollector(); @@ -182,7 +184,6 @@ TEST_F(CompactFilesTest, MultipleLevel) { for (int invalid_output_level = 0; invalid_output_level < 5; invalid_output_level++) { s = db->CompactFiles(CompactionOptions(), files, invalid_output_level); - std::cout << s.ToString() << std::endl; ASSERT_TRUE(s.IsInvalidArgument()); } @@ -344,7 +345,7 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { return true; } std::string res; - db_->Get(ReadOptions(), "", &res); + EXPECT_TRUE(db_->Get(ReadOptions(), "", &res).IsNotFound()); return true; } @@ -359,6 +360,7 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) { std::shared_ptr cf(new FilterWithGet()); Options options; + options.level_compaction_dynamic_level_bytes = false; options.create_if_missing = true; options.compaction_filter = cf.get(); @@ -401,6 +403,7 @@ TEST_F(CompactFilesTest, SentinelCompressionType) { CompactionStyle::kCompactionStyleNone}) { ASSERT_OK(DestroyDB(db_name_, Options())); Options options; + options.level_compaction_dynamic_level_bytes = false; options.compaction_style = compaction_style; // L0: Snappy, L1: ZSTD, L2: Snappy options.compression_per_level = {CompressionType::kSnappyCompression, @@ -490,13 +493,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/compaction/clipping_iterator.h b/db/compaction/clipping_iterator.h index 1ed465c2c804..3f50cdd9ddfd 100644 --- a/db/compaction/clipping_iterator.h +++ b/db/compaction/clipping_iterator.h @@ -188,6 +188,11 @@ class ClippingIterator : public InternalIterator { return iter_->GetProperty(prop_name, prop); } + bool IsDeleteRangeSentinelKey() const override { + assert(valid_); + return iter_->IsDeleteRangeSentinelKey(); + } + private: void UpdateValid() { assert(!iter_->Valid() || iter_->status().ok()); diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index d7d57bbf519b..bbab8f79fb56 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -13,6 +13,7 @@ #include #include "db/column_family.h" +#include "logging/logging.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/sst_partitioner.h" #include "test_util/sync_point.h" @@ -23,14 +24,13 @@ namespace ROCKSDB_NAMESPACE { const uint64_t kRangeTombstoneSentinel = PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion); -int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, - const InternalKey& b) { - auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key()); +int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) { + auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b)); if (c != 0) { return c; } - auto a_footer = ExtractInternalKeyFooter(a.Encode()); - auto b_footer = ExtractInternalKeyFooter(b.Encode()); + auto a_footer = ExtractInternalKeyFooter(a); + auto b_footer = ExtractInternalKeyFooter(b); if 
(a_footer == kRangeTombstoneSentinel) { if (b_footer != kRangeTombstoneSentinel) { return -1; @@ -115,6 +115,42 @@ void Compaction::GetBoundaryKeys( } } +void Compaction::GetBoundaryInternalKeys( + VersionStorageInfo* vstorage, + const std::vector& inputs, InternalKey* smallest_key, + InternalKey* largest_key, int exclude_level) { + bool initialized = false; + const InternalKeyComparator* icmp = vstorage->InternalComparator(); + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].files.empty() || inputs[i].level == exclude_level) { + continue; + } + if (inputs[i].level == 0) { + // we need to consider all files on level 0 + for (const auto* f : inputs[i].files) { + if (!initialized || icmp->Compare(f->smallest, *smallest_key) < 0) { + *smallest_key = f->smallest; + } + if (!initialized || icmp->Compare(f->largest, *largest_key) > 0) { + *largest_key = f->largest; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + if (!initialized || + icmp->Compare(inputs[i].files[0]->smallest, *smallest_key) < 0) { + *smallest_key = inputs[i].files[0]->smallest; + } + if (!initialized || + icmp->Compare(inputs[i].files.back()->largest, *largest_key) > 0) { + *largest_key = inputs[i].files.back()->largest; + } + initialized = true; + } + } +} + std::vector Compaction::PopulateWithAtomicBoundaries( VersionStorageInfo* vstorage, std::vector inputs) { const Comparator* ucmp = vstorage->InternalComparator()->user_comparator(); @@ -204,6 +240,38 @@ bool Compaction::IsFullCompaction( return num_files_in_compaction == total_num_files; } +Status Compaction::InitInputTableProperties() { + if (!input_table_properties_.empty()) { + return Status::OK(); + } + + Status s; + const ReadOptions read_options(Env::IOActivity::kCompaction); + assert(input_version_); + for (size_t i = 0; i < num_input_levels(); ++i) { + for (const FileMetaData* fmd : *(this->inputs(i))) { + std::shared_ptr tp; + std::string file_name = + TableFileName(immutable_options_.cf_paths, fmd->fd.GetNumber(), + fmd->fd.GetPathId()); + s = input_version_->GetTableProperties(read_options, &tp, fmd, + &file_name); + if (s.ok()) { + input_table_properties_[file_name] = tp; + } else { + ROCKS_LOG_ERROR(immutable_options_.info_log, + "Unable to load table properties for file %" PRIu64 + " --- %s\n", + fmd->fd.GetNumber(), s.ToString().c_str()); + input_table_properties_.clear(); + return s; + } + } + } + + return s; +} + Compaction::Compaction( VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, const MutableCFOptions& _mutable_cf_options, @@ -367,9 +435,14 @@ void Compaction::PopulatePenultimateLevelOutputRange() { } } - GetBoundaryKeys(input_vstorage_, inputs_, - &penultimate_level_smallest_user_key_, - &penultimate_level_largest_user_key_, exclude_level); + // FIXME: should make use of `penultimate_output_range_type_`. + // FIXME: when last level's input range does not overlap with + // penultimate level, and penultimate level input is empty, + // this call will not set penultimate_level_smallest_ or + // penultimate_level_largest_. No keys will be compacted up. + GetBoundaryInternalKeys(input_vstorage_, inputs_, + &penultimate_level_smallest_, + &penultimate_level_largest_, exclude_level); } Compaction::~Compaction() { @@ -394,33 +467,40 @@ bool Compaction::OverlapPenultimateLevelOutputRange( if (!SupportsPerKeyPlacement()) { return false; } + + // See FIXME in Compaction::PopulatePenultimateLevelOutputRange(). + // We do not compact any key up in this case. 
+ if (penultimate_level_smallest_.size() == 0 || + penultimate_level_largest_.size() == 0) { + return false; + } + const Comparator* ucmp = input_vstorage_->InternalComparator()->user_comparator(); return ucmp->CompareWithoutTimestamp( - smallest_key, penultimate_level_largest_user_key_) <= 0 && + smallest_key, penultimate_level_largest_.user_key()) <= 0 && ucmp->CompareWithoutTimestamp( - largest_key, penultimate_level_smallest_user_key_) >= 0; + largest_key, penultimate_level_smallest_.user_key()) >= 0; } // key includes timestamp if user-defined timestamp is enabled. -bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const { +bool Compaction::WithinPenultimateLevelOutputRange( + const ParsedInternalKey& ikey) const { if (!SupportsPerKeyPlacement()) { return false; } - if (penultimate_level_smallest_user_key_.empty() || - penultimate_level_largest_user_key_.empty()) { + if (penultimate_level_smallest_.size() == 0 || + penultimate_level_largest_.size() == 0) { return false; } - const Comparator* ucmp = - input_vstorage_->InternalComparator()->user_comparator(); + const InternalKeyComparator* icmp = input_vstorage_->InternalComparator(); - return ucmp->CompareWithoutTimestamp( - key, penultimate_level_smallest_user_key_) >= 0 && - ucmp->CompareWithoutTimestamp( - key, penultimate_level_largest_user_key_) <= 0; + // op_type of a key can change during compaction, e.g. Merge -> Put. + return icmp->CompareKeySeq(ikey, penultimate_level_smallest_.Encode()) >= 0 && + icmp->CompareKeySeq(ikey, penultimate_level_largest_.Encode()) <= 0; } bool Compaction::InputCompressionMatchesOutput() const { @@ -465,6 +545,11 @@ bool Compaction::IsTrivialMove() const { return false; } + if (compaction_reason_ == CompactionReason::kChangeTemperature) { + // Changing temperature usually requires rewriting the file. 
+ return false; + } + // Used in universal compaction, where trivial move can be done if the // input files are non overlapping if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) && @@ -481,26 +566,25 @@ bool Compaction::IsTrivialMove() const { // assert inputs_.size() == 1 - std::unique_ptr partitioner = CreateSstPartitioner(); - - for (const auto& file : inputs_.front().files) { - std::vector file_grand_parents; - if (output_level_ + 1 >= number_levels_) { - continue; - } - input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, - &file->largest, &file_grand_parents); - const auto compaction_size = - file->fd.GetFileSize() + TotalFileSize(file_grand_parents); - if (compaction_size > max_compaction_bytes_) { - return false; - } - - if (partitioner.get() != nullptr) { - if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), - file->largest.user_key())) { + if (output_level_ + 1 < number_levels_) { + std::unique_ptr partitioner = CreateSstPartitioner(); + for (const auto& file : inputs_.front().files) { + std::vector file_grand_parents; + input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest, + &file->largest, + &file_grand_parents); + const auto compaction_size = + file->fd.GetFileSize() + TotalFileSize(file_grand_parents); + if (compaction_size > max_compaction_bytes_) { return false; } + + if (partitioner.get() != nullptr) { + if (!partitioner->CanDoTrivialMove(file->smallest.user_key(), + file->largest.user_key())) { + return false; + } + } } } @@ -558,6 +642,49 @@ bool Compaction::KeyNotExistsBeyondOutputLevel( return false; } +bool Compaction::KeyRangeNotExistsBeyondOutputLevel( + const Slice& begin_key, const Slice& end_key, + std::vector* level_ptrs) const { + assert(input_version_ != nullptr); + assert(level_ptrs != nullptr); + assert(level_ptrs->size() == static_cast(number_levels_)); + assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) < + 0); + if (bottommost_level_) { + return true /* does not overlap */; + } else if (output_level_ != 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + const Comparator* user_cmp = cfd_->user_comparator(); + for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { + const std::vector& files = + input_vstorage_->LevelFiles(lvl); + for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) { + auto* f = files[level_ptrs->at(lvl)]; + // Advance until the first file with begin_key <= f->largest.user_key() + if (user_cmp->CompareWithoutTimestamp(begin_key, + f->largest.user_key()) > 0) { + continue; + } + // We know that the previous file prev_f, if exists, has + // prev_f->largest.user_key() < begin_key. 
+ if (user_cmp->CompareWithoutTimestamp(end_key, + f->smallest.user_key()) <= 0) { + // not overlapping with this level + break; + } else { + // We have: + // - begin_key < end_key, + // - begin_key <= f->largest.user_key(), and + // - end_key > f->smallest.user_key() + return false /* overlap */; + } + } + } + return true /* does not overlap */; + } + return false /* overlaps */; +}; + // Mark (or clear) each file that is being compacted void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { for (size_t i = 0; i < num_input_levels(); i++) { @@ -699,8 +826,17 @@ std::unique_ptr Compaction::CreateCompactionFilter() const { CompactionFilter::Context context; context.is_full_compaction = is_full_compaction_; context.is_manual_compaction = is_manual_compaction_; + context.input_start_level = start_level_; context.column_family_id = cfd_->GetID(); context.reason = TableFileCreationReason::kCompaction; + context.input_table_properties = GetInputTableProperties(); + if (context.input_table_properties.empty()) { + ROCKS_LOG_WARN( + immutable_options_.info_log, + "Unable to set `input_table_properties` of `CompactionFilter::Context` " + "for compaction."); + } + return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter( context); } diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index ee863960146a..50c75f70b22c 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -31,8 +31,19 @@ namespace ROCKSDB_NAMESPACE { // that key never appears in the database. We don't want adjacent sstables to // be considered overlapping if they are separated by the range tombstone // sentinel. -int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, - const InternalKey& b); +int sstableKeyCompare(const Comparator* user_cmp, const Slice&, const Slice&); +inline int sstableKeyCompare(const Comparator* user_cmp, const Slice& a, + const InternalKey& b) { + return sstableKeyCompare(user_cmp, a, b.Encode()); +} +inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const Slice& b) { + return sstableKeyCompare(user_cmp, a.Encode(), b); +} +inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, + const InternalKey& b) { + return sstableKeyCompare(user_cmp, a.Encode(), b.Encode()); +} int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a, const InternalKey& b); int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a, @@ -203,10 +214,18 @@ class Compaction { void AddInputDeletions(VersionEdit* edit); // Returns true if the available information we have guarantees that - // the input "user_key" does not exist in any level beyond "output_level()". + // the input "user_key" does not exist in any level beyond `output_level()`. bool KeyNotExistsBeyondOutputLevel(const Slice& user_key, std::vector* level_ptrs) const; + // Returns true if the user key range [begin_key, end_key) does not exist + // in any level beyond `output_level()`. + // Used for checking range tombstones, so we assume begin_key < end_key. + // begin_key and end_key should include timestamp if enabled. + bool KeyRangeNotExistsBeyondOutputLevel( + const Slice& begin_key, const Slice& end_key, + std::vector* level_ptrs) const; + // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. // @@ -270,7 +289,14 @@ class Compaction { // is the sum of all input file sizes. 
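KeyRangeNotExistsBeyondOutputLevel() walks the sorted levels below the output level and applies the usual half-open-interval overlap test per file: the user-key range [begin_key, end_key) overlaps a file [smallest, largest] exactly when begin_key <= largest and end_key > smallest. Reduced to plain strings (purely illustrative, not RocksDB code):

#include <cassert>
#include <string>

bool RangeOverlapsFile(const std::string& begin, const std::string& end,
                       const std::string& smallest,
                       const std::string& largest) {
  return begin <= largest && end > smallest;
}

int main() {
  assert(RangeOverlapsFile("b", "d", "c", "f"));   // "c" lies inside [b, d)
  assert(!RangeOverlapsFile("b", "c", "c", "f"));  // end bound is exclusive
  assert(!RangeOverlapsFile("g", "h", "c", "f"));  // disjoint ranges
  return 0;
}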
uint64_t OutputFilePreallocationSize() const; - void SetInputVersion(Version* input_version); + // TODO(hx235): eventually we should consider `InitInputTableProperties()`'s + // status and fail the compaction if needed + // TODO(hx235): consider making this function part of the construction so we + // don't forget to call it + void FinalizeInputInfo(Version* input_version) { + SetInputVersion(input_version); + InitInputTableProperties().PermitUncheckedError(); + } struct InputLevelSummaryBuffer { char buffer[128]; @@ -307,26 +333,26 @@ class Compaction { int output_level, VersionStorageInfo* vstorage, const std::vector& inputs); - TablePropertiesCollection GetOutputTableProperties() const { - return output_table_properties_; + const TablePropertiesCollection& GetInputTableProperties() const { + return input_table_properties_; + } + + // TODO(hx235): consider making this function symmetric to + // InitInputTableProperties() + void SetOutputTableProperties( + const std::string& file_name, + const std::shared_ptr& tp) { + output_table_properties_[file_name] = tp; } - void SetOutputTableProperties(TablePropertiesCollection tp) { - output_table_properties_ = std::move(tp); + const TablePropertiesCollection& GetOutputTableProperties() const { + return output_table_properties_; } Slice GetSmallestUserKey() const { return smallest_user_key_; } Slice GetLargestUserKey() const { return largest_user_key_; } - Slice GetPenultimateLevelSmallestUserKey() const { - return penultimate_level_smallest_user_key_; - } - - Slice GetPenultimateLevelLargestUserKey() const { - return penultimate_level_largest_user_key_; - } - PenultimateOutputRangeType GetPenultimateOutputRangeType() const { return penultimate_output_range_type_; } @@ -349,10 +375,8 @@ class Compaction { // per_key_placement feature, which is safe to place the key to the // penultimate level. different compaction strategy has different rules. // If per_key_placement is not supported, always return false. - // TODO: currently it doesn't support moving data from the last level to the - // penultimate level // key includes timestamp if user-defined timestamp is enabled. - bool WithinPenultimateLevelOutputRange(const Slice& key) const; + bool WithinPenultimateLevelOutputRange(const ParsedInternalKey& ikey) const; CompactionReason compaction_reason() const { return compaction_reason_; } @@ -409,6 +433,10 @@ class Compaction { const int output_level); private: + void SetInputVersion(Version* input_version); + + Status InitInputTableProperties(); + // mark (or clear) all files that are being compacted void MarkFilesBeingCompacted(bool mark_as_compacted); @@ -418,6 +446,13 @@ class Compaction { Slice* smallest_key, Slice* largest_key, int exclude_level = -1); + // get the smallest and largest internal key present in files to be compacted + static void GetBoundaryInternalKeys( + VersionStorageInfo* vstorage, + const std::vector& inputs, + InternalKey* smallest_key, InternalKey* largest_key, + int exclude_level = -1); + // populate penultimate level output range, which will be used to determine if // a key is safe to output to the penultimate level (details see // `Compaction::WithinPenultimateLevelOutputRange()`. @@ -499,7 +534,7 @@ class Compaction { // Does input compression match the output compression? 
bool InputCompressionMatchesOutput() const; - // table properties of output files + TablePropertiesCollection input_table_properties_; TablePropertiesCollection output_table_properties_; // smallest user keys in compaction @@ -530,8 +565,8 @@ class Compaction { // Key range for penultimate level output // includes timestamp if user-defined timestamp is enabled. // penultimate_output_range_type_ shows the range type - Slice penultimate_level_smallest_user_key_; - Slice penultimate_level_largest_user_key_; + InternalKey penultimate_level_smallest_; + InternalKey penultimate_level_largest_; PenultimateOutputRangeType penultimate_output_range_type_ = PenultimateOutputRangeType::kNotSupported; }; @@ -546,13 +581,16 @@ struct PerKeyPlacementContext { const Slice value; const SequenceNumber seq_num; - bool output_to_penultimate_level; + bool& output_to_penultimate_level; PerKeyPlacementContext(int _level, Slice _key, Slice _value, - SequenceNumber _seq_num) - : level(_level), key(_key), value(_value), seq_num(_seq_num) { - output_to_penultimate_level = false; - } + SequenceNumber _seq_num, + bool& _output_to_penultimate_level) + : level(_level), + key(_key), + value(_value), + seq_num(_seq_num), + output_to_penultimate_level(_output_to_penultimate_level) {} }; #endif /* !NDEBUG */ diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index e1bdddcb750c..85d1c039bd30 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -13,6 +13,8 @@ #include "db/blob/blob_index.h" #include "db/blob/prefetch_buffer_collection.h" #include "db/snapshot_checker.h" +#include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "logging/logging.h" #include "port/likely.h" #include "rocksdb/listener.h" @@ -30,7 +32,8 @@ CompactionIterator::CompactionIterator( BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, bool enforce_single_del_contracts, const std::atomic& manual_compaction_canceled, - const Compaction* compaction, const CompactionFilter* compaction_filter, + bool must_count_input_entries, const Compaction* compaction, + const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const std::shared_ptr info_log, const std::string* full_history_ts_low, @@ -44,8 +47,9 @@ CompactionIterator::CompactionIterator( manual_compaction_canceled, std::unique_ptr( compaction ? 
new RealCompaction(compaction) : nullptr), - compaction_filter, shutting_down, info_log, full_history_ts_low, - preserve_time_min_seqno, preclude_last_level_min_seqno) {} + must_count_input_entries, compaction_filter, shutting_down, info_log, + full_history_ts_low, preserve_time_min_seqno, + preclude_last_level_min_seqno) {} CompactionIterator::CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, @@ -57,15 +61,14 @@ CompactionIterator::CompactionIterator( BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, bool enforce_single_del_contracts, const std::atomic& manual_compaction_canceled, - std::unique_ptr compaction, + std::unique_ptr compaction, bool must_count_input_entries, const CompactionFilter* compaction_filter, const std::atomic* shutting_down, const std::shared_ptr info_log, const std::string* full_history_ts_low, const SequenceNumber preserve_time_min_seqno, const SequenceNumber preclude_last_level_min_seqno) - : input_(input, cmp, - !compaction || compaction->DoesInputReferenceBlobFiles()), + : input_(input, cmp, must_count_input_entries), cmp_(cmp), merge_helper_(merge_helper), snapshots_(snapshots), @@ -221,39 +224,47 @@ void CompactionIterator::Next() { bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until) { - // TODO: support compaction filter for wide-column entities - if (!compaction_filter_ || - (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) { + if (!compaction_filter_) { return true; } - bool error = false; - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. If the return value of the compaction filter is true, - // replace the entry with a deletion marker. - CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined; - compaction_filter_value_.clear(); - compaction_filter_skip_until_.Clear(); + + if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex && + ikey_.type != kTypeWideColumnEntity) { + return true; + } + + CompactionFilter::Decision decision = + CompactionFilter::Decision::kUndetermined; CompactionFilter::ValueType value_type = ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue - : CompactionFilter::ValueType::kBlobIndex; + : ikey_.type == kTypeBlobIndex + ? CompactionFilter::ValueType::kBlobIndex + : CompactionFilter::ValueType::kWideColumnEntity; + // Hack: pass internal key to BlobIndexCompactionFilter since it needs // to get sequence number. assert(compaction_filter_); - Slice& filter_key = - (ikey_.type == kTypeValue || + const Slice& filter_key = + (ikey_.type != kTypeBlobIndex || !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) ? 
ikey_.user_key : key_; + + compaction_filter_value_.clear(); + compaction_filter_skip_until_.Clear(); + + std::vector> new_columns; + { StopWatchNano timer(clock_, report_detailed_time_); - if (kTypeBlobIndex == ikey_.type) { - filter = compaction_filter_->FilterBlobByKey( + + if (ikey_.type == kTypeBlobIndex) { + decision = compaction_filter_->FilterBlobByKey( level_, filter_key, &compaction_filter_value_, compaction_filter_skip_until_.rep()); - if (CompactionFilter::Decision::kUndetermined == filter && + if (decision == CompactionFilter::Decision::kUndetermined && !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { - if (compaction_ == nullptr) { + if (!compaction_) { status_ = Status::Corruption("Unexpected blob index outside of compaction"); validity_info_.Invalidate(); @@ -299,33 +310,61 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, value_type = CompactionFilter::ValueType::kValue; } } - if (CompactionFilter::Decision::kUndetermined == filter) { - filter = compaction_filter_->FilterV2( - level_, filter_key, value_type, - blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_, + + if (decision == CompactionFilter::Decision::kUndetermined) { + const Slice* existing_val = nullptr; + const WideColumns* existing_col = nullptr; + + WideColumns existing_columns; + + if (ikey_.type != kTypeWideColumnEntity) { + if (!blob_value_.empty()) { + existing_val = &blob_value_; + } else { + existing_val = &value_; + } + } else { + Slice value_copy = value_; + const Status s = + WideColumnSerialization::Deserialize(value_copy, existing_columns); + + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + + existing_col = &existing_columns; + } + + decision = compaction_filter_->FilterV3( + level_, filter_key, value_type, existing_val, existing_col, + &compaction_filter_value_, &new_columns, compaction_filter_skip_until_.rep()); } + iter_stats_.total_filter_time += env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0; } - if (CompactionFilter::Decision::kUndetermined == filter) { - // Should not reach here, since FilterV2 should never return kUndetermined. - status_ = - Status::NotSupported("FilterV2() should never return kUndetermined"); + if (decision == CompactionFilter::Decision::kUndetermined) { + // Should not reach here, since FilterV2/FilterV3 should never return + // kUndetermined. + status_ = Status::NotSupported( + "FilterV2/FilterV3 should never return kUndetermined"); validity_info_.Invalidate(); return false; } - if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil && + if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil && cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <= 0) { // Can't skip to a key smaller than the current one. - // Keep the key as per FilterV2 documentation. - filter = CompactionFilter::Decision::kKeep; + // Keep the key as per FilterV2/FilterV3 documentation. 
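With this branch, compaction filters can observe and rewrite wide-column entities through FilterV3. A hedged sketch of a filter that strips one column from every entity, written against the FilterV3 interface this code calls into (the class and column name are invented for illustration; it would be installed via ColumnFamilyOptions::compaction_filter as usual):

#include <string>
#include <utility>
#include <vector>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/wide_columns.h"

class DropDebugColumnFilter : public rocksdb::CompactionFilter {
 public:
  const char* Name() const override { return "DropDebugColumnFilter"; }

  Decision FilterV3(
      int /*level*/, const rocksdb::Slice& /*key*/, ValueType value_type,
      const rocksdb::Slice* /*existing_value*/,
      const rocksdb::WideColumns* existing_columns, std::string* /*new_value*/,
      std::vector<std::pair<std::string, std::string>>* new_columns,
      std::string* /*skip_until*/) const override {
    if (value_type != ValueType::kWideColumnEntity || !existing_columns) {
      return Decision::kKeep;
    }
    bool changed = false;
    for (const auto& col : *existing_columns) {
      if (col.name() == "debug") {  // hypothetical column to drop
        changed = true;
        continue;
      }
      new_columns->emplace_back(col.name().ToString(), col.value().ToString());
    }
    return changed ? Decision::kChangeWideColumnEntity : Decision::kKeep;
  }
};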
+ decision = CompactionFilter::Decision::kKeep; } - if (filter == CompactionFilter::Decision::kRemove) { + if (decision == CompactionFilter::Decision::kRemove) { // convert the current key to a delete; key_ is pointing into // current_key_ at this point, so updating current_key_ updates key() ikey_.type = kTypeDeletion; @@ -333,7 +372,7 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, // no value associated with delete value_.clear(); iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kPurge) { + } else if (decision == CompactionFilter::Decision::kPurge) { // convert the current key to a single delete; key_ is pointing into // current_key_ at this point, so updating current_key_ updates key() ikey_.type = kTypeSingleDeletion; @@ -341,19 +380,19 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, // no value associated with single delete value_.clear(); iter_stats_.num_record_drop_user++; - } else if (filter == CompactionFilter::Decision::kChangeValue) { - if (ikey_.type == kTypeBlobIndex) { - // value transfer from blob file to inlined data + } else if (decision == CompactionFilter::Decision::kChangeValue) { + if (ikey_.type != kTypeValue) { ikey_.type = kTypeValue; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + current_key_.UpdateInternalKey(ikey_.sequence, kTypeValue); } + value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) { + } else if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil) { *need_skip = true; compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, kValueTypeForSeek); *skip_until = compaction_filter_skip_until_.Encode(); - } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) { + } else if (decision == CompactionFilter::Decision::kChangeBlobIndex) { // Only the StackableDB-based BlobDB impl's compaction filter should return // kChangeBlobIndex. 
Decision about rewriting blob and changing blob index // in the integrated BlobDB impl is made in subsequent call to @@ -365,23 +404,53 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip, validity_info_.Invalidate(); return false; } - if (ikey_.type == kTypeValue) { - // value transfer from inlined data to blob file + + if (ikey_.type != kTypeBlobIndex) { ikey_.type = kTypeBlobIndex; - current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type); + current_key_.UpdateInternalKey(ikey_.sequence, kTypeBlobIndex); } + value_ = compaction_filter_value_; - } else if (filter == CompactionFilter::Decision::kIOError) { + } else if (decision == CompactionFilter::Decision::kIOError) { if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) { status_ = Status::NotSupported( "CompactionFilter for integrated BlobDB should not return kIOError"); validity_info_.Invalidate(); return false; } + status_ = Status::IOError("Failed to access blob during compaction filter"); - error = true; + validity_info_.Invalidate(); + return false; + } else if (decision == CompactionFilter::Decision::kChangeWideColumnEntity) { + WideColumns sorted_columns; + sorted_columns.reserve(new_columns.size()); + + for (const auto& column : new_columns) { + sorted_columns.emplace_back(column.first, column.second); + } + + WideColumnsHelper::SortColumns(sorted_columns); + + { + const Status s = WideColumnSerialization::Serialize( + sorted_columns, compaction_filter_value_); + if (!s.ok()) { + status_ = s; + validity_info_.Invalidate(); + return false; + } + } + + if (ikey_.type != kTypeWideColumnEntity) { + ikey_.type = kTypeWideColumnEntity; + current_key_.UpdateInternalKey(ikey_.sequence, kTypeWideColumnEntity); + } + + value_ = compaction_filter_value_; } - return !error; + + return true; } void CompactionIterator::NextFromInput() { @@ -394,6 +463,7 @@ void CompactionIterator::NextFromInput() { value_ = input_.value(); blob_value_.Reset(); iter_stats_.num_input_records++; + is_range_del_ = input_.IsDeleteRangeSentinelKey(); Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_); if (!pik_status.ok()) { @@ -413,7 +483,10 @@ void CompactionIterator::NextFromInput() { break; } TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_); - + if (is_range_del_) { + validity_info_.SetValid(kRangeDeletion); + break; + } // Update input statistics if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion || ikey_.type == kTypeDeletionWithTimestamp) { @@ -635,6 +708,14 @@ void CompactionIterator::NextFromInput() { ParsedInternalKey next_ikey; AdvanceInputIter(); + while (input_.Valid() && input_.IsDeleteRangeSentinelKey() && + ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) + .ok() && + cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { + // skip range tombstone start keys with the same user key + // since they are not "real" point keys. + AdvanceInputIter(); + } // Check whether the next key exists, is not corrupt, and is the same key // as the single delete. @@ -642,6 +723,7 @@ void CompactionIterator::NextFromInput() { ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) .ok() && cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) { + assert(!input_.IsDeleteRangeSentinelKey()); #ifndef NDEBUG const Compaction* c = compaction_ ? 
compaction_->real_compaction() : nullptr; @@ -866,12 +948,14 @@ void CompactionIterator::NextFromInput() { // Note that a deletion marker of type kTypeDeletionWithTimestamp will be // considered to have a different user key unless the timestamp is older // than *full_history_ts_low_. + // + // Range tombstone start keys are skipped as they are not "real" keys. while (!IsPausingManualCompaction() && !IsShuttingDown() && input_.Valid() && (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_) .ok()) && cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) && - (prev_snapshot == 0 || + (prev_snapshot == 0 || input_.IsDeleteRangeSentinelKey() || DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) { AdvanceInputIter(); } @@ -1116,17 +1200,7 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() { void CompactionIterator::DecideOutputLevel() { assert(compaction_->SupportsPerKeyPlacement()); -#ifndef NDEBUG - // Could be overridden by unittest - PerKeyPlacementContext context(level_, ikey_.user_key, value_, - ikey_.sequence); - TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", - &context); - output_to_penultimate_level_ = context.output_to_penultimate_level; -#else output_to_penultimate_level_ = false; -#endif // NDEBUG - // if the key is newer than the cutoff sequence or within the earliest // snapshot, it should output to the penultimate level. if (ikey_.sequence > preclude_last_level_min_seqno_ || @@ -1134,6 +1208,17 @@ void CompactionIterator::DecideOutputLevel() { output_to_penultimate_level_ = true; } +#ifndef NDEBUG + // Could be overridden by unittest + PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence, + output_to_penultimate_level_); + TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context", + &context); + if (ikey_.sequence > earliest_snapshot_) { + output_to_penultimate_level_ = true; + } +#endif // NDEBUG + if (output_to_penultimate_level_) { // If it's decided to output to the penultimate level, but unsafe to do so, // still output to the last level. For example, moving the data from a lower @@ -1142,7 +1227,7 @@ void CompactionIterator::DecideOutputLevel() { // not from this compaction. // TODO: add statistic for declined output_to_penultimate_level bool safe_to_penultimate_level = - compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key); + compaction_->WithinPenultimateLevelOutputRange(ikey_); if (!safe_to_penultimate_level) { output_to_penultimate_level_ = false; // It could happen when disable/enable `last_level_temperature` while @@ -1165,14 +1250,19 @@ void CompactionIterator::DecideOutputLevel() { void CompactionIterator::PrepareOutput() { if (Valid()) { - if (ikey_.type == kTypeValue) { - ExtractLargeValueIfNeeded(); - } else if (ikey_.type == kTypeBlobIndex) { - GarbageCollectBlobIfNeeded(); - } + if (LIKELY(!is_range_del_)) { + if (ikey_.type == kTypeValue) { + ExtractLargeValueIfNeeded(); + } else if (ikey_.type == kTypeBlobIndex) { + GarbageCollectBlobIfNeeded(); + } - if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) { - DecideOutputLevel(); + // For range del sentinel, we don't use it to cut files for bottommost + // compaction. So it should not make a difference which output level we + // decide. + if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) { + DecideOutputLevel(); + } } // Zeroing out the sequence number leads to better compression. 
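DecideOutputLevel() only runs when the compaction supports per-key placement, which is driven by the tiering options on the column family. A hedged sketch of the option combination that enables this path (the duration is illustrative; the feature is currently exercised mainly with universal compaction):

#include "rocksdb/options.h"

rocksdb::Options MakeTieredOptions() {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  // Keep data written within the last hour out of the last (coldest) level.
  options.preclude_last_level_data_seconds = 3600;
  // Tag last-level SST files so the FileSystem can place them on cold storage.
  options.last_level_temperature = rocksdb::Temperature::kCold;
  return options;
}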
@@ -1191,7 +1281,7 @@ void CompactionIterator::PrepareOutput() { DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) && ikey_.type != kTypeMerge && current_key_committed_ && !output_to_penultimate_level_ && - ikey_.sequence < preserve_time_min_seqno_) { + ikey_.sequence < preserve_time_min_seqno_ && !is_range_del_) { if (ikey_.type == kTypeDeletion || (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) { ROCKS_LOG_FATAL( @@ -1324,6 +1414,7 @@ std::unique_ptr CompactionIterator::CreateBlobFetcherIfNeeded( } ReadOptions read_options; + read_options.io_activity = Env::IOActivity::kCompaction; read_options.fill_cache = false; return std::unique_ptr(new BlobFetcher(version, read_options)); diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h index a224a8e0e299..1ff9c8869246 100644 --- a/db/compaction/compaction_iterator.h +++ b/db/compaction/compaction_iterator.h @@ -38,15 +38,18 @@ class SequenceIterWrapper : public InternalIterator { bool Valid() const override { return inner_iter_->Valid(); } Status status() const override { return inner_iter_->status(); } void Next() override { - num_itered_++; + if (!inner_iter_->IsDeleteRangeSentinelKey()) { + num_itered_++; + } inner_iter_->Next(); } void Seek(const Slice& target) override { if (!need_count_entries_) { + has_num_itered_ = false; inner_iter_->Seek(target); } else { - // For flush cases, we need to count total number of entries, so we - // do Next() rather than Seek(). + // Need to count total number of entries, + // so we do Next() rather than Seek(). while (inner_iter_->Valid() && icmp_.Compare(inner_iter_->key(), target) < 0) { Next(); @@ -62,13 +65,19 @@ class SequenceIterWrapper : public InternalIterator { void SeekForPrev(const Slice& /* target */) override { assert(false); } void SeekToLast() override { assert(false); } - uint64_t num_itered() const { return num_itered_; } + uint64_t NumItered() const { return num_itered_; } + bool HasNumItered() const { return has_num_itered_; } + bool IsDeleteRangeSentinelKey() const override { + assert(Valid()); + return inner_iter_->IsDeleteRangeSentinelKey(); + } private: InternalKeyComparator icmp_; InternalIterator* inner_iter_; // not owned uint64_t num_itered_ = 0; bool need_count_entries_; + bool has_num_itered_ = true; }; class CompactionIterator { @@ -110,7 +119,8 @@ class CompactionIterator { virtual bool SupportsPerKeyPlacement() const = 0; // `key` includes timestamp if user-defined timestamp is enabled. - virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0; + virtual bool WithinPenultimateLevelOutputRange( + const ParsedInternalKey&) const = 0; }; class RealCompaction : public CompactionProxy { @@ -177,14 +187,19 @@ class CompactionIterator { // Check if key is within penultimate level output range, to see if it's // safe to output to the penultimate level for per_key_placement feature. // `key` includes timestamp if user-defined timestamp is enabled. - bool WithinPenultimateLevelOutputRange(const Slice& key) const override { - return compaction_->WithinPenultimateLevelOutputRange(key); + bool WithinPenultimateLevelOutputRange( + const ParsedInternalKey& ikey) const override { + return compaction_->WithinPenultimateLevelOutputRange(ikey); } private: const Compaction* compaction_; }; + // @param must_count_input_entries if true, `NumInputEntryScanned()` will + // return the number of input keys scanned. If false, `NumInputEntryScanned()` + // will return this number if no Seek was called on `input`. 
User should call + // `HasNumInputEntryScanned()` first in this case. CompactionIterator( InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, SequenceNumber last_sequence, std::vector* snapshots, @@ -195,7 +210,7 @@ class CompactionIterator { BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, bool enforce_single_del_contracts, const std::atomic& manual_compaction_canceled, - const Compaction* compaction = nullptr, + bool must_count_input_entries, const Compaction* compaction = nullptr, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, const std::shared_ptr info_log = nullptr, @@ -215,6 +230,7 @@ class CompactionIterator { bool enforce_single_del_contracts, const std::atomic& manual_compaction_canceled, std::unique_ptr compaction, + bool must_count_input_entries, const CompactionFilter* compaction_filter = nullptr, const std::atomic* shutting_down = nullptr, const std::shared_ptr info_log = nullptr, @@ -242,9 +258,15 @@ class CompactionIterator { const Status& status() const { return status_; } const ParsedInternalKey& ikey() const { return ikey_; } inline bool Valid() const { return validity_info_.IsValid(); } - const Slice& user_key() const { return current_user_key_; } + const Slice& user_key() const { + if (UNLIKELY(is_range_del_)) { + return ikey_.user_key; + } + return current_user_key_; + } const CompactionIterationStats& iter_stats() const { return iter_stats_; } - uint64_t num_input_entry_scanned() const { return input_.num_itered(); } + bool HasNumInputEntryScanned() const { return input_.HasNumItered(); } + uint64_t NumInputEntryScanned() const { return input_.NumItered(); } // If the current key should be placed on penultimate level, only valid if // per_key_placement is supported bool output_to_penultimate_level() const { @@ -252,6 +274,8 @@ class CompactionIterator { } Status InputStatus() const { return input_.status(); } + bool IsDeleteRangeSentinelKey() const { return is_range_del_; } + private: // Processes the input stream to find the next output void NextFromInput(); @@ -385,6 +409,7 @@ class CompactionIterator { kKeepSD = 8, kKeepDel = 9, kNewUserKey = 10, + kRangeDeletion = 11, }; struct ValidityInfo { @@ -493,6 +518,10 @@ class CompactionIterator { // This is a best-effort facility, so memory_order_relaxed is sufficient. return manual_compaction_canceled_.load(std::memory_order_relaxed); } + + // Stores whether the current compaction iterator output + // is a range tombstone start key. 
+ bool is_range_del_{false}; }; inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq, diff --git a/db/compaction/compaction_iterator_test.cc b/db/compaction/compaction_iterator_test.cc index 81362d792159..699e629693df 100644 --- a/db/compaction/compaction_iterator_test.cc +++ b/db/compaction/compaction_iterator_test.cc @@ -184,8 +184,9 @@ class FakeCompaction : public CompactionIterator::CompactionProxy { return supports_per_key_placement; } - bool WithinPenultimateLevelOutputRange(const Slice& key) const override { - return (!key.starts_with("unsafe_pb")); + bool WithinPenultimateLevelOutputRange( + const ParsedInternalKey& key) const override { + return (!key.user_key.starts_with("unsafe_pb")); } bool key_not_exists_beyond_output_level = false; @@ -293,8 +294,8 @@ class CompactionIteratorTest : public testing::TestWithParam { nullptr /* blob_file_builder */, true /*allow_data_in_errors*/, true /*enforce_single_del_contracts*/, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_, - std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr, - full_history_ts_low)); + std::move(compaction), /*must_count_input_entries=*/false, filter, + &shutting_down_, /*info_log=*/nullptr, full_history_ts_low)); } void AddSnapshot(SequenceNumber snapshot, diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index cd14d6f79530..d0ff1d14566b 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -192,8 +192,8 @@ CompactionJob::CompactionJob( assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); - ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, - db_options_.enable_thread_tracking); + ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(cfd); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); ReportStartedCompaction(compaction); } @@ -204,10 +204,6 @@ CompactionJob::~CompactionJob() { } void CompactionJob::ReportStartedCompaction(Compaction* compaction) { - const auto* cfd = compact_->compaction->column_family_data(); - ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, - db_options_.enable_thread_tracking); - ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, job_id_); @@ -264,7 +260,7 @@ void CompactionJob::Prepare() { StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); GenSubcompactionBoundaries(); } - if (boundaries_.size() > 1) { + if (boundaries_.size() >= 1) { for (size_t i = 0; i <= boundaries_.size(); i++) { compact_->sub_compact_states.emplace_back( c, (i != 0) ? 
std::optional(boundaries_[i - 1]) : std::nullopt, @@ -291,22 +287,24 @@ void CompactionJob::Prepare() { c->immutable_options()->preclude_last_level_data_seconds); if (preserve_time_duration > 0) { - // setup seqno_time_mapping_ - seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration); + const ReadOptions read_options(Env::IOActivity::kCompaction); + // setup seqno_to_time_mapping_ + seqno_to_time_mapping_.SetMaxTimeDuration(preserve_time_duration); for (const auto& each_level : *c->inputs()) { for (const auto& fmd : each_level.files) { std::shared_ptr tp; - Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr); + Status s = + cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr); if (s.ok()) { - seqno_time_mapping_.Add(tp->seqno_to_time_mapping) + seqno_to_time_mapping_.Add(tp->seqno_to_time_mapping) .PermitUncheckedError(); - seqno_time_mapping_.Add(fmd->fd.smallest_seqno, - fmd->oldest_ancester_time); + seqno_to_time_mapping_.Add(fmd->fd.smallest_seqno, + fmd->oldest_ancester_time); } } } - auto status = seqno_time_mapping_.Sort(); + auto status = seqno_to_time_mapping_.Sort(); if (!status.ok()) { ROCKS_LOG_WARN(db_options_.info_log, "Invalid sequence number to time mapping: Status: %s", @@ -322,13 +320,17 @@ void CompactionJob::Prepare() { preserve_time_min_seqno_ = 0; preclude_last_level_min_seqno_ = 0; } else { - seqno_time_mapping_.TruncateOldEntries(_current_time); + seqno_to_time_mapping_.TruncateOldEntries(_current_time); uint64_t preserve_time = static_cast(_current_time) > preserve_time_duration ? _current_time - preserve_time_duration : 0; + // GetProximalSeqnoBeforeTime tells us the last seqno known to have been + // written at or before the given time. + 1 to get the minimum we should + // preserve without excluding anything that might have been written on or + // after the given time. preserve_time_min_seqno_ = - seqno_time_mapping_.GetOldestSequenceNum(preserve_time); + seqno_to_time_mapping_.GetProximalSeqnoBeforeTime(preserve_time) + 1; if (c->immutable_options()->preclude_last_level_data_seconds > 0) { uint64_t preclude_last_level_time = static_cast(_current_time) > @@ -337,7 +339,9 @@ void CompactionJob::Prepare() { c->immutable_options()->preclude_last_level_data_seconds : 0; preclude_last_level_min_seqno_ = - seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time); + seqno_to_time_mapping_.GetProximalSeqnoBeforeTime( + preclude_last_level_time) + + 1; } } } @@ -472,7 +476,7 @@ void CompactionJob::GenSubcompactionBoundaries() { // overlap with N-1 other ranges. Since we requested a relatively large number // (128) of ranges from each input files, even N range overlapping would // cause relatively small inaccuracy. 
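As a side note on the GetProximalSeqnoBeforeTime(...) + 1 arithmetic documented in the hunk above, the following standalone toy sketch (hypothetical ToySeqnoTimeMapping type, not the real SeqnoToTimeMapping API) shows why the +1 yields the minimum seqno whose time information must be preserved:

#include <cstdint>
#include <iostream>
#include <map>

using SequenceNumber = uint64_t;

// Hypothetical, simplified stand-in for the mapping: time -> newest seqno
// known to have been written at or before that time.
struct ToySeqnoTimeMapping {
  std::map<uint64_t, SequenceNumber> time_to_seqno;

  SequenceNumber GetProximalSeqnoBeforeTime(uint64_t time) const {
    SequenceNumber result = 0;
    for (const auto& [t, seqno] : time_to_seqno) {
      if (t > time) break;
      result = seqno;
    }
    return result;
  }
};

int main() {
  ToySeqnoTimeMapping m;
  m.time_to_seqno = {{100, 10}, {200, 25}, {300, 40}};
  const uint64_t preserve_time = 200;
  // Seqno 25 is the last one known to be written at or before t=200. Anything
  // newer than 25 may have been written at or after t=200, so the minimum
  // seqno whose time information must be preserved is 25 + 1 = 26.
  SequenceNumber preserve_time_min_seqno =
      m.GetProximalSeqnoBeforeTime(preserve_time) + 1;
  std::cout << preserve_time_min_seqno << "\n";  // prints 26
  return 0;
}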
- + const ReadOptions read_options(Env::IOActivity::kCompaction); auto* c = compact_->compaction; if (c->max_subcompactions() <= 1 && !(c->immutable_options()->compaction_pri == kRoundRobin && @@ -506,7 +510,9 @@ void CompactionJob::GenSubcompactionBoundaries() { FileMetaData* f = flevel->files[i].file_metadata; std::vector my_anchors; Status s = cfd->table_cache()->ApproximateKeyAnchors( - ReadOptions(), icomp, *f, my_anchors); + read_options, icomp, *f, + c->mutable_cf_options()->block_protection_bytes_per_key, + my_anchors); if (!s.ok() || my_anchors.empty()) { my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); } @@ -722,11 +728,12 @@ Status CompactionJob::Run() { // use_direct_io_for_flush_and_compaction is true, we will regard this // verification as user reads since the goal is to cache it here for // further user reads - ReadOptions read_options; + const ReadOptions verify_table_read_options( + Env::IOActivity::kCompaction); InternalIterator* iter = cfd->table_cache()->NewIterator( - read_options, file_options_, cfd->internal_comparator(), - files_output[file_idx]->meta, /*range_del_agg=*/nullptr, - prefix_extractor, + verify_table_read_options, file_options_, + cfd->internal_comparator(), files_output[file_idx]->meta, + /*range_del_agg=*/nullptr, prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ -736,7 +743,9 @@ Status CompactionJob::Run() { *compact_->compaction->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + compact_->compaction->mutable_cf_options() + ->block_protection_bytes_per_key); auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { @@ -793,24 +802,56 @@ Status CompactionJob::Run() { auto fn = TableFileName(state.compaction->immutable_options()->cf_paths, output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); - tp[fn] = output.table_properties; + compact_->compaction->SetOutputTableProperties(fn, + output.table_properties); } } - compact_->compaction->SetOutputTableProperties(std::move(tp)); - // Finish up all book-keeping to unify the subcompaction results + // Finish up all bookkeeping to unify the subcompaction results. compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); - UpdateCompactionStats(); - + uint64_t num_input_range_del = 0; + bool ok = UpdateCompactionStats(&num_input_range_del); + // (Sub)compactions returned ok, do sanity check on the number of input keys. + if (status.ok() && ok && compaction_job_stats_->has_num_input_records) { + size_t ts_sz = compact_->compaction->column_family_data() + ->user_comparator() + ->timestamp_size(); + // When trim_ts_ is non-empty, CompactionIterator takes + // HistoryTrimmingIterator as input iterator and sees a trimmed view of + // input keys. So the number of keys it processed is not suitable for + // verification here. + // TODO: support verification when trim_ts_ is non-empty. + if (!(ts_sz > 0 && !trim_ts_.empty()) && + db_options_.compaction_verify_record_count) { + assert(compaction_stats_.stats.num_input_records > 0); + // TODO: verify the number of range deletion entries. 
+ uint64_t expected = + compaction_stats_.stats.num_input_records - num_input_range_del; + uint64_t actual = compaction_job_stats_->num_input_records; + if (expected != actual) { + std::string msg = + "Total number of input records: " + std::to_string(expected) + + ", but processed " + std::to_string(actual) + " records."; + ROCKS_LOG_WARN( + db_options_.info_log, "[%s] [JOB %d] Compaction %s", + compact_->compaction->column_family_data()->GetName().c_str(), + job_context_->job_id, msg.c_str()); + status = Status::Corruption( + "Compaction number of input keys does not match number of keys " + "processed."); + } + } + } RecordCompactionIOStats(); LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); - compact_->status = status; + TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status); return status; } -Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { +Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options, + bool* compaction_released) { assert(compact_); AutoThreadOperationStageUpdater stage_updater( @@ -826,7 +867,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { compaction_stats_); if (status.ok()) { - status = InstallCompactionResults(mutable_cf_options); + status = InstallCompactionResults(mutable_cf_options, compaction_released); } if (!versions_->io_status().ok()) { io_status_ = versions_->io_status(); @@ -978,7 +1019,6 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { void CompactionJob::NotifyOnSubcompactionBegin( SubcompactionState* sub_compact) { -#ifndef ROCKSDB_LITE Compaction* c = compact_->compaction; if (db_options_.listeners.empty()) { @@ -1004,14 +1044,10 @@ void CompactionJob::NotifyOnSubcompactionBegin( } info.status.PermitUncheckedError(); -#else - (void)sub_compact; -#endif // ROCKSDB_LITE } void CompactionJob::NotifyOnSubcompactionCompleted( SubcompactionState* sub_compact) { -#ifndef ROCKSDB_LITE if (db_options_.listeners.empty()) { return; @@ -1032,16 +1068,11 @@ void CompactionJob::NotifyOnSubcompactionCompleted( for (const auto& listener : db_options_.listeners) { listener->OnSubcompactionCompleted(info); } -#else - (void)sub_compact; -#endif // ROCKSDB_LITE } void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact); assert(sub_compact->compaction); - -#ifndef ROCKSDB_LITE if (db_options_.compaction_service) { CompactionServiceJobStatus comp_status = ProcessKeyValueCompactionWithCompactionService(sub_compact); @@ -1052,7 +1083,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // fallback to local compaction assert(comp_status == CompactionServiceJobStatus::kUseLocal); } -#endif // !ROCKSDB_LITE uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); @@ -1093,6 +1123,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { read_options.verify_checksums = true; read_options.fill_cache = false; read_options.rate_limiter_priority = GetRateLimiterPriority(); + read_options.io_activity = Env::IOActivity::kCompaction; // Compaction iterators shouldn't be confined to a single prefix. // Compactions use Seek() for // (a) concurrent compactions, @@ -1103,17 +1134,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // GenSubcompactionBoundaries doesn't strip away the timestamp. 
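To make the timestamp handling for the iterate bounds concrete, here is a minimal standalone sketch, assuming only that the user-defined timestamp is the trailing ts_sz bytes of the user key (the helper name below is made up for illustration):

#include <cassert>
#include <cstddef>
#include <iostream>
#include <string>
#include <string_view>

// Assumes the user-defined timestamp occupies the trailing ts_sz bytes of the
// user key, so stripping it is just dropping that suffix.
std::string_view StripTimestampFromUserKeySketch(std::string_view user_key,
                                                 size_t ts_sz) {
  assert(user_key.size() >= ts_sz);
  return user_key.substr(0, user_key.size() - ts_sz);
}

int main() {
  std::string key_with_ts = "foo";
  key_with_ts.append(8, '\0');  // pretend an 8-byte timestamp follows the key
  std::string_view bound =
      StripTimestampFromUserKeySketch(key_with_ts, /*ts_sz=*/8);
  std::cout << bound << "\n";  // prints "foo": this stripped form is what the
                               // iterate bounds point at when ts_sz > 0
  return 0;
}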
size_t ts_sz = cfd->user_comparator()->timestamp_size(); if (start.has_value()) { - read_options.iterate_lower_bound = &start.value(); + read_options.iterate_lower_bound = &(*start); if (ts_sz > 0) { - start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz); - read_options.iterate_lower_bound = &start_without_ts.value(); + start_without_ts = StripTimestampFromUserKey(*start, ts_sz); + read_options.iterate_lower_bound = &(*start_without_ts); } } if (end.has_value()) { - read_options.iterate_upper_bound = &end.value(); + read_options.iterate_upper_bound = &(*end); if (ts_sz > 0) { - end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz); - read_options.iterate_upper_bound = &end_without_ts.value(); + end_without_ts = StripTimestampFromUserKey(*end, ts_sz); + read_options.iterate_upper_bound = &(*end_without_ts); } } @@ -1128,6 +1159,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { IterKey end_ikey; Slice start_slice; Slice end_slice; + Slice start_user_key{}; + Slice end_user_key{}; static constexpr char kMaxTs[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"; @@ -1143,21 +1176,22 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } if (start.has_value()) { - start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber, - kValueTypeForSeek); + start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek); if (ts_sz > 0) { start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, &ts_slice); } start_slice = start_ikey.GetInternalKey(); + start_user_key = start_ikey.GetUserKey(); } if (end.has_value()) { - end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek); + end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek); if (ts_sz > 0) { end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek, &ts_slice); } end_slice = end_ikey.GetInternalKey(); + end_user_key = end_ikey.GetUserKey(); } std::unique_ptr clip; @@ -1257,6 +1291,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { /*expect_valid_internal_key=*/true, range_del_agg.get(), blob_file_builder.get(), db_options_.allow_data_in_errors, db_options_.enforce_single_del_contracts, manual_compaction_canceled_, + sub_compact->compaction + ->DoesInputReferenceBlobFiles() /* must_count_input_entries */, sub_compact->compaction, compaction_filter, shutting_down_, db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_, preclude_last_level_min_seqno_); @@ -1274,11 +1310,15 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { [this, sub_compact](CompactionOutputs& outputs) { return this->OpenCompactionOutputFile(sub_compact, outputs); }; + const CompactionFileCloseFunc close_file_func = - [this, sub_compact](CompactionOutputs& outputs, const Status& status, - const Slice& next_table_min_key) { - return this->FinishCompactionOutputFile(status, sub_compact, outputs, - next_table_min_key); + [this, sub_compact, start_user_key, end_user_key]( + CompactionOutputs& outputs, const Status& status, + const Slice& next_table_min_key) { + return this->FinishCompactionOutputFile( + status, sub_compact, outputs, next_table_min_key, + sub_compact->start.has_value() ? &start_user_key : nullptr, + sub_compact->end.has_value() ? 
&end_user_key : nullptr); }; Status status; @@ -1286,18 +1326,24 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { "CompactionJob::ProcessKeyValueCompaction()::Processing", reinterpret_cast( const_cast(sub_compact->compaction))); + uint64_t last_cpu_micros = prev_cpu_micros; while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) { // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid() // returns true. - - assert(!end.has_value() || cfd->user_comparator()->Compare( - c_iter->user_key(), end.value()) < 0); + assert(!end.has_value() || + cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0); if (c_iter_stats.num_input_records % kRecordStatsEvery == kRecordStatsEvery - 1) { RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats); c_iter->ResetRecordCounts(); RecordCompactionIOStats(); + + uint64_t cur_cpu_micros = db_options_.clock->CPUMicros(); + assert(cur_cpu_micros >= last_cpu_micros); + RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME, + cur_cpu_micros - last_cpu_micros); + last_cpu_micros = cur_cpu_micros; } // Add current compaction_iterator key to target compaction output, if the @@ -1318,8 +1364,25 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { if (c_iter->status().IsManualCompactionPaused()) { break; } + +#ifndef NDEBUG + bool stop = false; + TEST_SYNC_POINT_CALLBACK("CompactionJob::ProcessKeyValueCompaction()::stop", + static_cast(&stop)); + if (stop) { + break; + } +#endif // NDEBUG } + // This number may not be accurate when CompactionIterator was created + // with `must_count_input_entries=false`. + assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() || + c_iter->HasNumInputEntryScanned()); + sub_compact->compaction_job_stats.has_num_input_records = + c_iter->HasNumInputEntryScanned(); + sub_compact->compaction_job_stats.num_input_records = + c_iter->NumInputEntryScanned(); sub_compact->compaction_job_stats.num_blobs_read = c_iter_stats.num_blobs_read; sub_compact->compaction_job_stats.total_blob_bytes_read = @@ -1388,8 +1451,11 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { sub_compact->Current().UpdateBlobStats(); } + uint64_t cur_cpu_micros = db_options_.clock->CPUMicros(); sub_compact->compaction_job_stats.cpu_micros = - db_options_.clock->CPUMicros() - prev_cpu_micros; + cur_cpu_micros - prev_cpu_micros; + RecordTick(stats_, COMPACTION_CPU_TOTAL_TIME, + cur_cpu_micros - last_cpu_micros); if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += @@ -1469,7 +1535,8 @@ void CompactionJob::RecordDroppedKeys( Status CompactionJob::FinishCompactionOutputFile( const Status& input_status, SubcompactionState* sub_compact, - CompactionOutputs& outputs, const Slice& next_table_min_key) { + CompactionOutputs& outputs, const Slice& next_table_min_key, + const Slice* comp_start_user_key, const Slice* comp_end_user_key) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_SYNC_FILE); assert(sub_compact != nullptr); @@ -1499,12 +1566,10 @@ Status CompactionJob::FinishCompactionOutputFile( // output_to_penultimate_level compaction here, as it's only used to decide // if range dels could be dropped. if (outputs.HasRangeDel()) { - s = outputs.AddRangeDels( - sub_compact->start.has_value() ? &(sub_compact->start.value()) - : nullptr, - sub_compact->end.has_value() ? 
&(sub_compact->end.value()) : nullptr, - range_del_out_stats, bottommost_level_, cfd->internal_comparator(), - earliest_snapshot, next_table_min_key, full_history_ts_low_); + s = outputs.AddRangeDels(comp_start_user_key, comp_end_user_key, + range_del_out_stats, bottommost_level_, + cfd->internal_comparator(), earliest_snapshot, + next_table_min_key, full_history_ts_low_); } RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats); TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1"); @@ -1512,7 +1577,7 @@ Status CompactionJob::FinishCompactionOutputFile( const uint64_t current_entries = outputs.NumEntries(); - s = outputs.Finish(s, seqno_time_mapping_); + s = outputs.Finish(s, seqno_to_time_mapping_); if (s.ok()) { // With accurate smallest and largest key, we can get a slightly more @@ -1617,7 +1682,6 @@ Status CompactionJob::FinishCompactionOutputFile( TableFileCreationReason::kCompaction, status_for_listener, file_checksum, file_checksum_func_name); -#ifndef ROCKSDB_LITE // Report new file to SstFileManagerImpl auto sfm = static_cast(db_options_.sst_file_manager.get()); @@ -1636,18 +1700,18 @@ Status CompactionJob::FinishCompactionOutputFile( db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction); } } -#endif outputs.ResetBuilder(); return s; } Status CompactionJob::InstallCompactionResults( - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, bool* compaction_released) { assert(compact_); db_mutex_->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kCompaction); auto* compaction = compact_->compaction; assert(compaction); @@ -1724,9 +1788,15 @@ Status CompactionJob::InstallCompactionResults( } } - return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, edit, db_mutex_, - db_directory_); + auto manifest_wcb = [&compaction, &compaction_released](const Status& s) { + compaction->ReleaseCompactionFiles(s); + *compaction_released = true; + }; + + return versions_->LogAndApply( + compaction->column_family_data(), mutable_cf_options, read_options, edit, + db_mutex_, db_directory_, /*new_descriptor_log=*/false, + /*column_family_options=*/nullptr, manifest_wcb); } void CompactionJob::RecordCompactionIOStats() { @@ -1761,11 +1831,9 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, std::string fname = GetTableFileName(file_number); // Fire events. ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); -#ifndef ROCKSDB_LITE EventHelpers::NotifyTableFileCreationStarted( cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_, TableFileCreationReason::kCompaction); -#endif // !ROCKSDB_LITE // Make the output file std::unique_ptr writable_file; #ifndef NDEBUG @@ -1824,16 +1892,18 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, uint64_t current_time = static_cast(temp_current_time); InternalKey tmp_start, tmp_end; if (sub_compact->start.has_value()) { - tmp_start.SetMinPossibleForUserKey(sub_compact->start.value()); + tmp_start.SetMinPossibleForUserKey(*(sub_compact->start)); } if (sub_compact->end.has_value()) { - tmp_end.SetMinPossibleForUserKey(sub_compact->end.value()); + tmp_end.SetMinPossibleForUserKey(*(sub_compact->end)); } uint64_t oldest_ancester_time = sub_compact->compaction->MinInputFileOldestAncesterTime( sub_compact->start.has_value() ? &tmp_start : nullptr, sub_compact->end.has_value() ? 
&tmp_end : nullptr); if (oldest_ancester_time == std::numeric_limits::max()) { + // TODO: fix DBSSTTest.GetTotalSstFilesSize and use + // kUnknownOldestAncesterTime oldest_ancester_time = current_time; } @@ -1878,6 +1948,7 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(), tmp_set.Contains(FileType::kTableFile), false)); + // TODO(hx235): pass in the correct `oldest_key_time` instead of `0` TableBuilderOptions tboptions( *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()), cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(), @@ -1902,7 +1973,6 @@ void CompactionJob::CleanupCompaction() { compact_ = nullptr; } -#ifndef ROCKSDB_LITE namespace { void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { assert(prefix_length > 0); @@ -1911,25 +1981,53 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) { } } // namespace -#endif // !ROCKSDB_LITE - -void CompactionJob::UpdateCompactionStats() { +bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) { assert(compact_); Compaction* compaction = compact_->compaction; compaction_stats_.stats.num_input_files_in_non_output_levels = 0; compaction_stats_.stats.num_input_files_in_output_level = 0; + + bool has_error = false; + const ReadOptions read_options(Env::IOActivity::kCompaction); + const auto& input_table_properties = compaction->GetInputTableProperties(); for (int input_level = 0; input_level < static_cast(compaction->num_input_levels()); ++input_level) { + size_t num_input_files = compaction->num_input_files(input_level); + uint64_t* bytes_read; if (compaction->level(input_level) != compaction->output_level()) { - UpdateCompactionInputStatsHelper( - &compaction_stats_.stats.num_input_files_in_non_output_levels, - &compaction_stats_.stats.bytes_read_non_output_levels, input_level); + compaction_stats_.stats.num_input_files_in_non_output_levels += + static_cast(num_input_files); + bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels; } else { - UpdateCompactionInputStatsHelper( - &compaction_stats_.stats.num_input_files_in_output_level, - &compaction_stats_.stats.bytes_read_output_level, input_level); + compaction_stats_.stats.num_input_files_in_output_level += + static_cast(num_input_files); + bytes_read = &compaction_stats_.stats.bytes_read_output_level; + } + for (size_t i = 0; i < num_input_files; ++i) { + const FileMetaData* file_meta = compaction->input(input_level, i); + *bytes_read += file_meta->fd.GetFileSize(); + uint64_t file_input_entries = file_meta->num_entries; + uint64_t file_num_range_del = file_meta->num_range_deletions; + if (file_input_entries == 0) { + uint64_t file_number = file_meta->fd.GetNumber(); + // Try getting info from table property + std::string fn = + TableFileName(compaction->immutable_options()->cf_paths, + file_number, file_meta->fd.GetPathId()); + const auto& tp = input_table_properties.find(fn); + if (tp != input_table_properties.end()) { + file_input_entries = tp->second->num_entries; + file_num_range_del = tp->second->num_range_deletions; + } else { + has_error = true; + } + } + compaction_stats_.stats.num_input_records += file_input_entries; + if (num_input_range_del) { + *num_input_range_del += file_num_range_del; + } } } @@ -1939,26 +2037,11 @@ void CompactionJob::UpdateCompactionStats() { compaction_stats_.stats.num_dropped_records = compaction_stats_.DroppedRecords(); -} - -void 
CompactionJob::UpdateCompactionInputStatsHelper(int* num_files, - uint64_t* bytes_read, - int input_level) { - const Compaction* compaction = compact_->compaction; - auto num_input_files = compaction->num_input_files(input_level); - *num_files += static_cast(num_input_files); - - for (size_t i = 0; i < num_input_files; ++i) { - const auto* file_meta = compaction->input(input_level, i); - *bytes_read += file_meta->fd.GetFileSize(); - compaction_stats_.stats.num_input_records += - static_cast(file_meta->num_entries); - } + return !has_error; } void CompactionJob::UpdateCompactionJobStats( const InternalStats::CompactionStats& stats) const { -#ifndef ROCKSDB_LITE compaction_job_stats_->elapsed_micros = stats.micros; // input information @@ -1985,9 +2068,6 @@ void CompactionJob::UpdateCompactionJobStats( CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength, &compaction_job_stats_->largest_output_key_prefix); } -#else - (void)stats; -#endif // !ROCKSDB_LITE } void CompactionJob::LogCompaction() { diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index 2f8cb08da551..e812cfc72a30 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -186,13 +186,30 @@ class CompactionJob { // REQUIRED: mutex held // Add compaction input/output to the current version - Status Install(const MutableCFOptions& mutable_cf_options); + // Releases compaction file through Compaction::ReleaseCompactionFiles(). + // Sets *compaction_released to true if compaction is released. + Status Install(const MutableCFOptions& mutable_cf_options, + bool* compaction_released); // Return the IO status IOStatus io_status() const { return io_status_; } protected: - void UpdateCompactionStats(); + // Update the following stats in compaction_stats_.stats + // - num_input_files_in_non_output_levels + // - num_input_files_in_output_level + // - bytes_read_non_output_levels + // - bytes_read_output_level + // - num_input_records + // - bytes_read_blob + // - num_dropped_records + // + // @param num_input_range_del if non-null, will be set to the number of range + // deletion entries in this compaction input. + // + // Returns true iff compaction_stats_.stats.num_input_records and + // num_input_range_del are calculated successfully. 
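A compressed, self-contained sketch of the record-count verification contract described above (hypothetical InputFileMeta type and return-a-string error reporting; the real code works on FileMetaData/TableProperties and reports a mismatch as Status::Corruption):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical per-input-file metadata with the two fields the accumulation
// above reads (num_entries and num_range_deletions).
struct InputFileMeta {
  uint64_t num_entries;
  uint64_t num_range_deletions;
};

// Returns an error message, or an empty string when the counts agree.
std::string VerifyInputRecordCount(const std::vector<InputFileMeta>& inputs,
                                   uint64_t num_records_scanned_by_iterator) {
  uint64_t num_input_records = 0;
  uint64_t num_input_range_del = 0;
  for (const auto& f : inputs) {
    num_input_records += f.num_entries;
    num_input_range_del += f.num_range_deletions;
  }
  // Range deletion entries are excluded from the comparison (verifying them
  // is left as a TODO in the patch).
  const uint64_t expected = num_input_records - num_input_range_del;
  if (expected != num_records_scanned_by_iterator) {
    return "Total number of input records: " + std::to_string(expected) +
           ", but processed " +
           std::to_string(num_records_scanned_by_iterator) + " records.";
  }
  return "";
}

int main() {
  std::vector<InputFileMeta> inputs = {{100, 2}, {50, 0}};
  std::cout << VerifyInputRecordCount(inputs, 148) << "\n";  // counts agree
  std::cout << VerifyInputRecordCount(inputs, 147) << "\n";  // mismatch message
  return 0;
}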
+ bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr); void LogCompaction(); virtual void RecordCompactionIOStats(); void CleanupCompaction(); @@ -256,8 +273,11 @@ class CompactionJob { Status FinishCompactionOutputFile(const Status& input_status, SubcompactionState* sub_compact, CompactionOutputs& outputs, - const Slice& next_table_min_key); - Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); + const Slice& next_table_min_key, + const Slice* comp_start_user_key, + const Slice* comp_end_user_key); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options, + bool* compaction_released); Status OpenCompactionOutputFile(SubcompactionState* sub_compact, CompactionOutputs& outputs); void UpdateCompactionJobStats( @@ -265,9 +285,6 @@ class CompactionJob { void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, CompactionJobStats* compaction_job_stats = nullptr); - void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read, - int input_level); - void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact); void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact); @@ -333,7 +350,7 @@ class CompactionJob { // Stores the sequence number to time mapping gathered from all input files // it also collects the smallest_seqno -> oldest_ancester_time from the SST. - SeqnoToTimeMapping seqno_time_mapping_; + SeqnoToTimeMapping seqno_to_time_mapping_; // Minimal sequence number for preserving the time information. The time info // older than this sequence number won't be preserved after the compaction and diff --git a/db/compaction/compaction_job_stats_test.cc b/db/compaction/compaction_job_stats_test.cc index 9302707780b1..56fc51d0582f 100644 --- a/db/compaction/compaction_job_stats_test.cc +++ b/db/compaction/compaction_job_stats_test.cc @@ -24,7 +24,7 @@ #include "db/write_batch_internal.h" #include "env/mock_env.h" #include "file/filename.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "monitoring/thread_status_util.h" #include "port/stack_trace.h" #include "rocksdb/cache.h" @@ -54,12 +54,11 @@ #include "util/compression.h" #include "util/hash.h" #include "util/mutexlock.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" #include "util/string_util.h" #include "utilities/merge_operators.h" #if !defined(IOS_CROSS_COMPILE) -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { static std::string RandomString(Random* rnd, int len, double ratio) { @@ -617,6 +616,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) { // via AddExpectedStats(). auto* stats_checker = new CompactionJobStatsChecker(); Options options; + options.level_compaction_dynamic_level_bytes = false; options.listeners.emplace_back(stats_checker); options.create_if_missing = true; // just enough setting to hold off auto-compaction. @@ -816,6 +816,7 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) { // what we expect. 
auto* stats_checker = new CompactionJobDeletionStatsChecker(); Options options; + options.level_compaction_dynamic_level_bytes = false; options.listeners.emplace_back(stats_checker); options.create_if_missing = true; options.level0_file_num_compaction_trigger = kTestScale + 1; @@ -959,15 +960,6 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE #else diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 008b3d2d1fd5..a16891110020 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/compaction/compaction_job.h" @@ -216,7 +215,9 @@ class CompactionJobTestBase : public testing::Test { dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")), + /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", + /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)), shutting_down_(false), mock_table_factory_(new mock::MockTableFactory()), error_handler_(nullptr, db_options_, &mutex_), @@ -387,12 +388,13 @@ class CompactionJobTestBase : public testing::Test { kUnknownFileCreationTime, versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(), kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + /*compensated_range_deletion_size=*/0, /*tail_size=*/0, + /*user_defined_timestamps_persisted=*/true); mutex_.Lock(); - EXPECT_OK( - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr)); + EXPECT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr)); mutex_.Unlock(); } @@ -455,7 +457,8 @@ class CompactionJobTestBase : public testing::Test { Status s = cf_options_.table_factory->NewTableReader( read_opts, TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(), - cfd_->internal_comparator()), + cfd_->internal_comparator(), + 0 /* block_protection_bytes_per_key */), std::move(freader), file_size, &table_reader, false); ASSERT_OK(s); assert(table_reader); @@ -539,11 +542,12 @@ class CompactionJobTestBase : public testing::Test { ASSERT_OK(s); db_options_.info_log = info_log; - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + versions_.reset(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); compaction_job_stats_.Reset(); ASSERT_OK(SetIdentityFile(env_, dbname_)); @@ -643,7 +647,7 @@ class CompactionJobTestBase : public testing::Test { mutable_cf_options_.max_compaction_bytes, 0, kNoCompression, cfd->GetLatestMutableCFOptions()->compression_opts, Temperature::kUnknown, max_subcompactions, grandparents, true); - compaction.SetInputVersion(cfd->current()); + 
compaction.FinalizeInputInfo(cfd->current()); assert(db_options_.info_log); LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); @@ -654,11 +658,12 @@ class CompactionJobTestBase : public testing::Test { ASSERT_TRUE(full_history_ts_low_.empty() || ucmp_->timestamp_size() == full_history_ts_low_.size()); const std::atomic kManualCompactionCanceledFalse{false}; + JobContext job_context(1, false /* create_superversion */); CompactionJob compaction_job( 0, &compaction, db_options_, mutable_db_options_, env_options_, versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr, nullptr, nullptr, &mutex_, &error_handler_, snapshots, - earliest_write_conflict_snapshot, snapshot_checker, nullptr, + earliest_write_conflict_snapshot, snapshot_checker, &job_context, table_cache_, &event_logger, false, false, dbname_, &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, @@ -672,7 +677,9 @@ class CompactionJobTestBase : public testing::Test { ASSERT_OK(s); ASSERT_OK(compaction_job.io_status()); mutex_.Lock(); - ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); + bool compaction_released = false; + ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions(), + &compaction_released)); ASSERT_OK(compaction_job.io_status()); mutex_.Unlock(); log_buffer.FlushBufferToLog(); @@ -728,6 +735,7 @@ class CompactionJobTestBase : public testing::Test { ColumnFamilyOptions cf_options_; MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; + const ReadOptions read_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1519,13 +1527,15 @@ TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) { {files0, files1, files2, files3}, input_levels, /*verify_func=*/[&](Compaction& comp) { for (char c = 'a'; c <= 'z'; c++) { - std::string c_str; - c_str = c; - const Slice key(c_str); if (c == 'a') { - ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key)); + ParsedInternalKey pik("a", 0U, kTypeValue); + ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(pik)); } else { - ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key)); + std::string c_str{c}; + // WithinPenultimateLevelOutputRange checks internal key range. + // 'z' is the last key, so set seqno properly. + ParsedInternalKey pik(c_str, c == 'z' ? 
12U : 0U, kTypeValue); + ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(pik)); } } }); @@ -2441,14 +2451,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index 598bffb242f0..eb76cd849a9c 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -18,16 +18,18 @@ void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { builder_.reset(NewTableBuilder(tboptions, file_writer_.get())); } -Status CompactionOutputs::Finish(const Status& intput_status, - const SeqnoToTimeMapping& seqno_time_mapping) { +Status CompactionOutputs::Finish( + const Status& intput_status, + const SeqnoToTimeMapping& seqno_to_time_mapping) { FileMetaData* meta = GetMetaData(); assert(meta != nullptr); Status s = intput_status; if (s.ok()) { - std::string seqno_time_mapping_str; - seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno, - meta->fd.largest_seqno, meta->file_creation_time); - builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str, + std::string seqno_to_time_mapping_str; + seqno_to_time_mapping.Encode( + seqno_to_time_mapping_str, meta->fd.smallest_seqno, + meta->fd.largest_seqno, meta->file_creation_time); + builder_->SetSeqnoTimeTableProperties(seqno_to_time_mapping_str, meta->oldest_ancester_time); s = builder_->Finish(); @@ -43,7 +45,10 @@ Status CompactionOutputs::Finish(const Status& intput_status, const uint64_t current_bytes = builder_->FileSize(); if (s.ok()) { meta->fd.file_size = current_bytes; + meta->tail_size = builder_->GetTailSize(); meta->marked_for_compaction = builder_->NeedCompact(); + meta->user_defined_timestamps_persisted = static_cast( + builder_->GetTableProperties().user_defined_timestamps_persisted); } current_output().finished = true; stats_.bytes_written += current_bytes; @@ -124,11 +129,6 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( if (grandparents.empty()) { return curr_key_boundary_switched_num; } - assert(!internal_key.empty()); - InternalKey ikey; - ikey.DecodeFrom(internal_key); - assert(ikey.Valid()); - const Comparator* ucmp = compaction_->column_family_data()->user_comparator(); // Move the grandparent_index_ to the file containing the current user_key. @@ -136,7 +136,7 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( // index points to the last file containing the key. while (grandparent_index_ < grandparents.size()) { if (being_grandparent_gap_) { - if (sstableKeyCompare(ucmp, ikey, + if (sstableKeyCompare(ucmp, internal_key, grandparents[grandparent_index_]->smallest) < 0) { break; } @@ -149,13 +149,13 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo( being_grandparent_gap_ = false; } else { int cmp_result = sstableKeyCompare( - ucmp, ikey, grandparents[grandparent_index_]->largest); + ucmp, internal_key, grandparents[grandparent_index_]->largest); // If it's same key, make sure grandparent_index_ is pointing to the last // one. 
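The grandparent_index_/being_grandparent_gap_ walk referenced above can be pictured with a toy state machine over int keys; this is only an analogue (no InternalKey, no sstableKeyCompare, and it omits the same-user-key tie-breaking the comment mentions):

#include <cstddef>
#include <iostream>
#include <vector>

// Toy grandparent file: a closed key range, using ints instead of keys.
struct ToyFile {
  int smallest;
  int largest;
};

// Advances `index`/`in_gap` so they describe where `key` falls relative to
// the sorted, non-overlapping grandparent ranges.
void Advance(const std::vector<ToyFile>& grandparents, int key, size_t& index,
             bool& in_gap) {
  while (index < grandparents.size()) {
    if (in_gap) {
      if (key < grandparents[index].smallest) {
        break;  // key is inside the gap before file `index`
      }
      in_gap = false;  // key reached (at least) file `index`
    } else {
      if (key <= grandparents[index].largest) {
        break;  // key is inside file `index`
      }
      in_gap = true;  // key is past file `index`, move into the next gap
      ++index;
    }
  }
}

int main() {
  std::vector<ToyFile> gp = {{10, 20}, {30, 40}};
  size_t index = 0;
  bool in_gap = true;  // start in the gap before the first file
  for (int key : {5, 15, 25, 35, 50}) {
    Advance(gp, key, index, in_gap);
    std::cout << "key " << key << " -> index " << index
              << (in_gap ? " (gap)" : " (file)") << "\n";
  }
  return 0;
}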
if (cmp_result < 0 || (cmp_result == 0 && (grandparent_index_ == grandparents.size() - 1 || - sstableKeyCompare(ucmp, ikey, + sstableKeyCompare(ucmp, internal_key, grandparents[grandparent_index_ + 1]->smallest) < 0))) { break; @@ -226,6 +226,15 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes( bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) { assert(c_iter.Valid()); const Slice& internal_key = c_iter.key(); +#ifndef NDEBUG + bool should_stop = false; + std::pair p{&should_stop, internal_key}; + TEST_SYNC_POINT_CALLBACK( + "CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p); + if (should_stop) { + return true; + } +#endif // NDEBUG const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_; const InternalKeyComparator* icmp = &compaction_->column_family_data()->internal_comparator(); @@ -347,8 +356,14 @@ Status CompactionOutputs::AddToOutput( const CompactionFileOpenFunc& open_file_func, const CompactionFileCloseFunc& close_file_func) { Status s; + bool is_range_del = c_iter.IsDeleteRangeSentinelKey(); + if (is_range_del && compaction_->bottommost_level()) { + // We don't consider range tombstone for bottommost level since: + // 1. there is no grandparent and hence no overlap to consider + // 2. range tombstone may be dropped at bottommost level. + return s; + } const Slice& key = c_iter.key(); - if (ShouldStopBefore(c_iter) && HasBuilder()) { s = close_file_func(*this, c_iter.InputStatus(), key); if (!s.ok()) { @@ -358,6 +373,13 @@ Status CompactionOutputs::AddToOutput( grandparent_boundary_switched_num_ = 0; grandparent_overlapped_bytes_ = GetCurrentKeyGrandparentOverlappedBytes(key); + if (UNLIKELY(is_range_del)) { + // lower bound for this new output file, this is needed as the lower bound + // does not come from the smallest point key in this case. 
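A tiny illustrative sketch of that point, using hypothetical names: when a file is cut at a range tombstone start key, the file's lower bound comes from that remembered sentinel rather than from the first point key added later.

#include <iostream>
#include <optional>
#include <string>

// Toy output-file bookkeeping; the real code keeps the sentinel in
// range_tombstone_lower_bound_ and the boundaries in FileMetaData.
struct ToyOutputFile {
  std::optional<std::string> range_tombstone_lower_bound;
  std::optional<std::string> first_point_key;

  std::string LowerBound() const {
    if (range_tombstone_lower_bound) {
      return *range_tombstone_lower_bound;  // file was cut at a sentinel key
    }
    return first_point_key.value_or("");
  }
};

int main() {
  ToyOutputFile cut_at_point_key{std::nullopt, std::string("d")};
  ToyOutputFile cut_at_range_del{std::string("b"), std::string("d")};
  std::cout << cut_at_point_key.LowerBound() << "\n";  // "d"
  std::cout << cut_at_range_del.LowerBound() << "\n";  // "b": the tombstone
                                                       // start, not "d"
  return 0;
}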
+ range_tombstone_lower_bound_.DecodeFrom(key); + } else { + range_tombstone_lower_bound_.Clear(); + } } // Open output file if necessary @@ -368,6 +390,17 @@ Status CompactionOutputs::AddToOutput( } } + // c_iter may emit range deletion keys, so update `last_key_for_partitioner_` + // here before returning below when `is_range_del` is true + if (partitioner_) { + last_key_for_partitioner_.assign(c_iter.user_key().data_, + c_iter.user_key().size_); + } + + if (UNLIKELY(is_range_del)) { + return s; + } + assert(builder_ != nullptr); const Slice& value = c_iter.value(); s = current_output().validator.Add(key, value); @@ -391,28 +424,33 @@ Status CompactionOutputs::AddToOutput( s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence, ikey.type); - if (partitioner_) { - last_key_for_partitioner_.assign(c_iter.user_key().data_, - c_iter.user_key().size_); - } - return s; } +namespace { +void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key, + const size_t ts_sz) { + if (ts_sz) { + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; + if (ts_sz <= strlen(kTsMax)) { + internal_key = InternalKey(user_key, kMaxSequenceNumber, + kTypeRangeDeletion, Slice(kTsMax, ts_sz)); + } else { + internal_key = + InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion, + std::string(ts_sz, '\xff')); + } + } else { + internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion); + } +} +} // namespace + Status CompactionOutputs::AddRangeDels( const Slice* comp_start_user_key, const Slice* comp_end_user_key, CompactionIterationStats& range_del_out_stats, bool bottommost_level, const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot, const Slice& next_table_min_key, const std::string& full_history_ts_low) { - assert(HasRangeDel()); - FileMetaData& meta = current_output().meta; - const Comparator* ucmp = icmp.user_comparator(); - - Slice lower_bound_guard, upper_bound_guard; - std::string smallest_user_key; - const Slice *lower_bound, *upper_bound; - bool lower_bound_from_sub_compact = false; - // The following example does not happen since // CompactionOutput::ShouldStopBefore() always return false for the first // point key. But we should consider removing this dependency. Suppose for the @@ -424,108 +462,147 @@ Status CompactionOutputs::AddRangeDels( // Then meta.smallest will be set to comp_start_user_key@seqno // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber // which violates the assumption that meta.smallest should be <= meta.largest. + assert(HasRangeDel()); + FileMetaData& meta = current_output().meta; + const Comparator* ucmp = icmp.user_comparator(); + InternalKey lower_bound_buf, upper_bound_buf; + Slice lower_bound_guard, upper_bound_guard; + std::string smallest_user_key; + const Slice *lower_bound, *upper_bound; + + // We first determine the internal key lower_bound and upper_bound for + // this output file. All and only range tombstones that overlap with + // [lower_bound, upper_bound] should be added to this file. File + // boundaries (meta.smallest/largest) should be updated accordingly when + // extended by range tombstones. size_t output_size = outputs_.size(); if (output_size == 1) { - // For the first output table, include range tombstones before the min - // key but after the subcompaction boundary. - lower_bound = comp_start_user_key; - lower_bound_from_sub_compact = true; - } else if (meta.smallest.size() > 0) { + // This is the first file in the subcompaction. 
+ // + // When outputting a range tombstone that spans a subcompaction boundary, + // the files on either side of that boundary need to include that + // boundary's user key. Otherwise, the spanning range tombstone would lose + // coverage. + // + // To achieve this while preventing files from overlapping in internal key + // (an LSM invariant violation), we allow the earlier file to include the + // boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The + // later file can begin at the boundary user key at the newest key version + // it contains. At this point that version number is unknown since we have + // not processed the range tombstones yet, so permit any version. Same story + // applies to timestamp, and a non-nullptr `comp_start_user_key` should have + // `kMaxTs` here, which similarly permits any timestamp. + if (comp_start_user_key) { + lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber, + kTypeRangeDeletion); + lower_bound_guard = lower_bound_buf.Encode(); + lower_bound = &lower_bound_guard; + } else { + lower_bound = nullptr; + } + } else { // For subsequent output tables, only include range tombstones from min // key onwards since the previous file was extended to contain range // tombstones falling before min key. - smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/); - lower_bound_guard = Slice(smallest_user_key); - lower_bound = &lower_bound_guard; - } else { - lower_bound = nullptr; - } - if (!next_table_min_key.empty()) { - // This may be the last file in the subcompaction in some cases, so we - // need to compare the end key of subcompaction with the next file start - // key. When the end key is chosen by the subcompaction, we know that - // it must be the biggest key in output file. Therefore, it is safe to - // use the smaller key as the upper bound of the output file, to ensure - // that there is no overlapping between different output files. - upper_bound_guard = ExtractUserKey(next_table_min_key); - if (comp_end_user_key != nullptr && - ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >= - 0) { - upper_bound = comp_end_user_key; + if (range_tombstone_lower_bound_.size() > 0) { + assert(meta.smallest.size() == 0 || + icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0); + lower_bound_guard = range_tombstone_lower_bound_.Encode(); } else { + assert(meta.smallest.size() > 0); + lower_bound_guard = meta.smallest.Encode(); + } + lower_bound = &lower_bound_guard; + } + + const size_t ts_sz = ucmp->timestamp_size(); + if (next_table_min_key.empty()) { + // Last file of the subcompaction. + if (comp_end_user_key) { + upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber, + kTypeRangeDeletion); + upper_bound_guard = upper_bound_buf.Encode(); upper_bound = &upper_bound_guard; + } else { + upper_bound = nullptr; } } else { - // This is the last file in the subcompaction, so extend until the - // subcompaction ends. - upper_bound = comp_end_user_key; - } - bool has_overlapping_endpoints; - if (upper_bound != nullptr && meta.largest.size() > 0) { - has_overlapping_endpoints = ucmp->CompareWithoutTimestamp( - meta.largest.user_key(), *upper_bound) == 0; - } else { - has_overlapping_endpoints = false; + // There is another file coming whose coverage will begin at + // `next_table_min_key`. The current file needs to extend range tombstone + // coverage through its own keys (through `meta.largest`) and through user + // keys preceding `next_table_min_key`'s user key. 
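The boundary-key trick described above can be checked with a toy internal-key comparator; ToyInternalKey and Before() below are illustrative stand-ins (the real ordering also involves the packed value type):

#include <cstdint>
#include <iostream>
#include <string>

constexpr uint64_t kMaxSequenceNumber = ~uint64_t{0} >> 8;

// Toy internal key: (user_key, seqno), ignoring the value type for brevity.
struct ToyInternalKey {
  std::string user_key;
  uint64_t seqno;
};

// Internal-key order: user key ascending, then sequence number DESCENDING,
// so the newest version of a user key sorts first.
bool Before(const ToyInternalKey& a, const ToyInternalKey& b) {
  if (a.user_key != b.user_key) return a.user_key < b.user_key;
  return a.seqno > b.seqno;
}

int main() {
  // Subcompaction boundary at user key "m". The earlier output file may
  // extend a spanning range tombstone up to m@kMaxSequenceNumber; the later
  // file starts at m with whatever (smaller) seqno it actually contains.
  ToyInternalKey earlier_file_largest{"m", kMaxSequenceNumber};
  ToyInternalKey later_file_smallest{"m", 42};
  std::cout << std::boolalpha
            << Before(earlier_file_largest, later_file_smallest) << "\n";
  // true: the two files share the boundary user key, yet their internal-key
  // ranges do not overlap, which is the invariant the comment above relies on.
  return 0;
}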
+ ParsedInternalKey next_table_min_key_parsed; + ParseInternalKey(next_table_min_key, &next_table_min_key_parsed, + false /* log_err_key */) + .PermitUncheckedError(); + assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber); + assert(meta.largest.size() == 0 || + icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0); + assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0); + if (meta.largest.size() > 0 && + ucmp->EqualWithoutTimestamp(meta.largest.user_key(), + next_table_min_key_parsed.user_key)) { + // Caution: this assumes meta.largest.Encode() lives longer than + // upper_bound, which is only true if meta.largest is never updated. + // This just happens to be the case here since meta.largest serves + // as the upper_bound. + upper_bound_guard = meta.largest.Encode(); + } else { + SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key, + ts_sz); + upper_bound_guard = upper_bound_buf.Encode(); + } + upper_bound = &upper_bound_guard; + } + if (lower_bound && upper_bound && + icmp.Compare(*lower_bound, *upper_bound) > 0) { + assert(meta.smallest.size() == 0 && + ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound), + ExtractUserKey(*upper_bound))); + // This can only happen when lower_bound have the same user key as + // next_table_min_key and that there is no point key in the current + // compaction output file. + return Status::OK(); } - // The end key of the subcompaction must be bigger or equal to the upper // bound. If the end of subcompaction is null or the upper bound is null, // it means that this file is the last file in the compaction. So there // will be no overlapping between this file and others. assert(comp_end_user_key == nullptr || upper_bound == nullptr || - ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0); - auto it = range_del_agg_->NewIterator(lower_bound, upper_bound, - has_overlapping_endpoints); - // Position the range tombstone output iterator. There may be tombstone - // fragments that are entirely out of range, so make sure that we do not - // include those. - if (lower_bound != nullptr) { - it->Seek(*lower_bound); - } else { - it->SeekToFirst(); - } + ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound), + *comp_end_user_key) <= 0); + auto it = range_del_agg_->NewIterator(lower_bound, upper_bound); Slice last_tombstone_start_user_key{}; - for (; it->Valid(); it->Next()) { + bool reached_lower_bound = false; + const ReadOptions read_options(Env::IOActivity::kCompaction); + for (it->SeekToFirst(); it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); - if (upper_bound != nullptr) { - int cmp = - ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_); - // Tombstones starting after upper_bound only need to be included in - // the next table. - // If the current SST ends before upper_bound, i.e., - // `has_overlapping_endpoints == false`, we can also skip over range - // tombstones that start exactly at upper_bound. Such range - // tombstones will be included in the next file and are not relevant - // to the point keys or endpoints of the current file. - // If the current SST ends at the same user key at upper_bound, - // i.e., `has_overlapping_endpoints == true`, AND the tombstone has - // the same start key as upper_bound, i.e., cmp == 0, then - // the tombstone is relevant only if the tombstone's sequence number - // is no larger than this file's largest key's sequence number. 
This - // is because the upper bound to truncate this file's range tombstone - // will be meta.largest in this case, and any tombstone that starts after - // it will not be relevant. - if (cmp < 0) { - break; - } else if (cmp == 0) { - if (!has_overlapping_endpoints || - tombstone.seq_ < GetInternalKeySeqno(meta.largest.Encode())) { - break; - } - } + auto kv = tombstone.Serialize(); + InternalKey tombstone_end = tombstone.SerializeEndKey(); + // TODO: the underlying iterator should support clamping the bounds. + // tombstone_end.Encode is of form user_key@kMaxSeqno + // if it is equal to lower_bound, there is no need to include + // such range tombstone. + if (!reached_lower_bound && lower_bound && + icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) { + continue; } + assert(!lower_bound || + icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0); + reached_lower_bound = true; - const size_t ts_sz = ucmp->timestamp_size(); // Garbage collection for range tombstones. // If user-defined timestamp is enabled, range tombstones are dropped if // they are at bottommost_level, below full_history_ts_low and not visible // in any snapshot. trim_ts_ is passed to the constructor for // range_del_agg_, and range_del_agg_ internally drops tombstones above // trim_ts_. - if (bottommost_level && tombstone.seq_ <= earliest_snapshot && + bool consider_drop = + tombstone.seq_ <= earliest_snapshot && (ts_sz == 0 || (!full_history_ts_low.empty() && - ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) { + ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0)); + if (consider_drop && bottommost_level) { // TODO(andrewkr): tombstones that span multiple output files are // counted for each compaction output file, so lots of double // counting. @@ -534,83 +611,100 @@ Status CompactionOutputs::AddRangeDels( continue; } - auto kv = tombstone.Serialize(); assert(lower_bound == nullptr || - ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0); - // Range tombstone is not supported by output validator yet. - builder_->Add(kv.first.Encode(), kv.second); - InternalKey tombstone_start = std::move(kv.first); - InternalKey smallest_candidate{tombstone_start}; - if (lower_bound != nullptr && - ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(), - *lower_bound) <= 0) { - // Pretend the smallest key has the same user key as lower_bound - // (the max key in the previous table or subcompaction) in order for - // files to appear key-space partitioned. - if (lower_bound_from_sub_compact) { - // When lower_bound is chosen by a subcompaction - // (lower_bound_from_sub_compact), we know that subcompactions over - // smaller keys cannot contain any keys at lower_bound. We also know - // that smaller subcompactions exist, because otherwise the - // subcompaction woud be unbounded on the left. As a result, we know - // that no other files on the output level will contain actual keys at - // lower_bound (an output file may have a largest key of - // lower_bound@kMaxSequenceNumber, but this only indicates a large range - // tombstone was truncated). Therefore, it is safe to use the - // tombstone's sequence number, to ensure that keys at lower_bound at - // lower levels are covered by truncated tombstones. 
- if (ts_sz) { - assert(tombstone.ts_.size() == ts_sz); - smallest_candidate = InternalKey(*lower_bound, tombstone.seq_, - kTypeRangeDeletion, tombstone.ts_); - } else { - smallest_candidate = - InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion); - } - } else { - // If lower_bound was chosen by the smallest data key in the file, - // choose lowest seqnum so this file's smallest internal key comes - // after the previous file's largest. The fake seqnum is OK because - // the read path's file-picking code only considers user key. - smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion); - } + ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound), + kv.second) < 0); + InternalKey tombstone_start = kv.first; + if (lower_bound && + ucmp->CompareWithoutTimestamp(tombstone_start.user_key(), + ExtractUserKey(*lower_bound)) < 0) { + // This just updates the non-timestamp portion of `tombstone_start`'s user + // key. Ideally there would be a simpler API usage + ParsedInternalKey tombstone_start_parsed; + ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed, + false /* log_err_key */) + .PermitUncheckedError(); + // timestamp should be from where sequence number is from, which is from + // tombstone in this case + std::string ts = + tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size()) + .ToString(); + tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound); + tombstone_start.SetFrom(tombstone_start_parsed, ts); } - InternalKey tombstone_end = tombstone.SerializeEndKey(); - InternalKey largest_candidate{tombstone_end}; if (upper_bound != nullptr && - ucmp->CompareWithoutTimestamp(*upper_bound, - largest_candidate.user_key()) <= 0) { - // Pretend the largest key has the same user key as upper_bound (the - // min key in the following table or subcompaction) in order for files - // to appear key-space partitioned. - // - // Choose highest seqnum so this file's largest internal key comes - // before the next file's/subcompaction's smallest. The fake seqnum is - // OK because the read path's file-picking code only considers the - // user key portion. - // - // Note Seek() also creates InternalKey with (user_key, - // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of - // kTypeRangeDeletion (0xF), so the range tombstone comes before the - // Seek() key in InternalKey's ordering. 
So Seek() will look in the - // next file for the user key - if (ts_sz) { - static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff"; - if (ts_sz <= strlen(kTsMax)) { - largest_candidate = - InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion, - Slice(kTsMax, ts_sz)); - } else { - largest_candidate = - InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion, - std::string(ts_sz, '\xff')); - } - } else { - largest_candidate = - InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion); - } + icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) { + break; + } + if (lower_bound && + icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) { + tombstone_start.DecodeFrom(*lower_bound); } - meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate, + if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) { + tombstone_end.DecodeFrom(*upper_bound); + } + if (consider_drop && compaction_->KeyRangeNotExistsBeyondOutputLevel( + tombstone_start.user_key(), + tombstone_end.user_key(), &level_ptrs_)) { + range_del_out_stats.num_range_del_drop_obsolete++; + range_del_out_stats.num_record_drop_obsolete++; + continue; + } + // Here we show that *only* range tombstones that overlap with + // [lower_bound, upper_bound] are added to the current file, and + // sanity checking invariants that should hold: + // - [tombstone_start, tombstone_end] overlaps with [lower_bound, + // upper_bound] + // - meta.smallest <= meta.largest + // Corresponding assertions are made, the proof is broken is any of them + // fails. + // TODO: show that *all* range tombstones that overlap with + // [lower_bound, upper_bound] are added. + // TODO: some invariant about boundaries are correctly updated. + // + // Note that `tombstone_start` is updated in the if condition above, we use + // tombstone_start to refer to its initial value, i.e., + // it->Tombstone().first, and use tombstone_start* to refer to its value + // after the update. + // + // To show [lower_bound, upper_bound] overlaps with [tombstone_start, + // tombstone_end]: + // lower_bound <= upper_bound from the if condition right after all + // bounds are initialized. We assume each tombstone fragment has + // start_key.user_key < end_key.user_key, so + // tombstone_start < tombstone_end by + // FragmentedTombstoneIterator::Tombstone(). So these two ranges are both + // non-emtpy. The flag `reached_lower_bound` and the if logic before it + // ensures lower_bound <= tombstone_end. tombstone_start is only updated + // if it has a smaller user_key than lower_bound user_key, so + // tombstone_start <= tombstone_start*. The above if condition implies + // tombstone_start* <= upper_bound. So we have + // tombstone_start <= upper_bound and lower_bound <= tombstone_end + // and the two ranges overlap. + // + // To show meta.smallest <= meta.largest: + // From the implementation of UpdateBoundariesForRange(), it suffices to + // prove that when it is first called in this function, its parameters + // satisfy `start <= end`, where start = max(tombstone_start*, lower_bound) + // and end = min(tombstone_end, upper_bound). From the above proof we have + // lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need + // to show that tombstone_start* <= min(tombstone_end, upper_bound). + // Note that tombstone_start*.user_key = max(tombstone_start.user_key, + // lower_bound.user_key). Assuming tombstone_end always has + // kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber. 
+ // Since lower_bound <= tombstone_end and lower_bound.seqno < + // tombstone_end.seqno (in absolute number order, not internal key order), + // lower_bound.user_key < tombstone_end.user_key. + // Since lower_bound.user_key < tombstone_end.user_key and + // tombstone_start.user_key < tombstone_end.user_key, tombstone_start* < + // tombstone_end. Since tombstone_start* <= upper_bound from the above proof + // and tombstone_start* < tombstone_end, tombstone_start* <= + // min(tombstone_end, upper_bound), so the two ranges overlap. + + // Range tombstone is not supported by output validator yet. + builder_->Add(kv.first.Encode(), kv.second); + assert(icmp.Compare(tombstone_start, tombstone_end) <= 0); + meta.UpdateBoundariesForRange(tombstone_start, tombstone_end, tombstone.seq_, icmp); if (!bottommost_level) { bool start_user_key_changed = @@ -618,17 +712,8 @@ Status CompactionOutputs::AddRangeDels( ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key, it->start_key()) < 0; last_tombstone_start_user_key = it->start_key(); - // Range tombstones are truncated at file boundaries - if (icmp.Compare(tombstone_start, meta.smallest) < 0) { - tombstone_start = meta.smallest; - } - if (icmp.Compare(tombstone_end, meta.largest) > 0) { - tombstone_end = meta.largest; - } - // this assertion validates invariant (2) in the comment below. - assert(icmp.Compare(tombstone_start, tombstone_end) <= 0); if (start_user_key_changed) { - // if tombstone_start >= tombstone_end, then either no key range is + // If tombstone_start >= tombstone_end, then either no key range is // covered, or that they have the same user key. If they have the same // user key, then the internal key range should only be within this // level, and no keys from older levels is covered. @@ -638,7 +723,7 @@ Status CompactionOutputs::AddRangeDels( approx_opts.files_size_error_margin = 0.1; auto approximate_covered_size = compaction_->input_version()->version_set()->ApproximateSize( - approx_opts, compaction_->input_version(), + approx_opts, read_options, compaction_->input_version(), tombstone_start.Encode(), tombstone_end.Encode(), compaction_->output_level() + 1 /* start_level */, -1 /* end_level */, kCompaction); @@ -646,138 +731,6 @@ Status CompactionOutputs::AddRangeDels( } } } - // TODO: show invariants that ensure all necessary range tombstones are - // added - // and that file boundaries ensure no coverage is lost. - // Each range tombstone with internal key range [tombstone_start, - // tombstone_end] is being added to the current compaction output file here. - // The range tombstone is going to be truncated at range [meta.smallest, - // meta.largest] during reading/scanning. We should maintain invariants - // (1) meta.smallest <= meta.largest and, - // (2) [tombstone_start, tombstone_end] and [meta.smallest, meta.largest] - // overlaps, as there is no point adding range tombstone with a range - // outside the file's range. - // Since `tombstone_end` is always some user_key@kMaxSeqno, it is okay to - // use either open or closed range. Using closed range here to make - // reasoning easier, and it is more consistent with an ongoing work that - // tries to simplify this method. - // - // There are two cases: - // Case 1. Output file has no point key: - // First we show this case only happens when the entire compaction output - // is range tombstone only. This is true if CompactionIterator does not - // emit any point key. Suppose CompactionIterator emits some point key. 
- // Based on the assumption that CompactionOutputs::ShouldStopBefore() - // always return false for the first point key, the first compaction - // output file always contains a point key. Each new compaction output - // file is created if there is a point key for which ShouldStopBefore() - // returns true, and the point key would be added to the new compaction - // output file. So each new compaction file always contains a point key. - // So Case 1 only happens when CompactionIterator does not emit any - // point key. - // - // To show (1) meta.smallest <= meta.largest: - // Since the compaction output is range tombstone only, `lower_bound` and - // `upper_bound` are either null or comp_start/end_user_key respectively. - // According to how UpdateBoundariesForRange() is implemented, it blindly - // updates meta.smallest and meta.largest to smallest_candidate and - // largest_candidate the first time it is called. Subsequently, it - // compares input parameter with meta.smallest and meta.largest and only - // updates them when input is smaller/larger. So we only need to show - // smallest_candidate <= largest_candidate the first time - // UpdateBoundariesForRange() is called. Here we show something stronger - // that smallest_candidate.user_key < largest_candidate.user_key always - // hold for Case 1. - // We assume comp_start_user_key < comp_end_user_key, if provided. We - // assume that tombstone_start < tombstone_end. This assumption is based - // on that each fragment in FragmentedTombstoneList has - // start_key < end_key (user_key) and that - // FragmentedTombstoneIterator::Tombstone() returns the pair - // (start_key@tombstone_seqno with op_type kTypeRangeDeletion, end_key). - // The logic in this loop sets smallest_candidate to - // max(tombstone_start.user_key, comp_start_user_key)@tombstone.seq_ with - // op_type kTypeRangeDeletion, largest_candidate to - // min(tombstone_end.user_key, comp_end_user_key)@kMaxSequenceNumber with - // op_type kTypeRangeDeletion. When a bound is null, there is no - // truncation on that end. To show that smallest_candidate.user_key < - // largest_candidate.user_key, it suffices to show - // tombstone_start.user_key < comp_end_user_key (if not null) AND - // comp_start_user_key (if not null) < tombstone_end.user_key. - // Since the file has no point key, `has_overlapping_endpoints` is false. - // In the first sanity check of this for-loop, we compare - // tombstone_start.user_key against upper_bound = comp_end_user_key, - // and only proceed if tombstone_start.user_key < comp_end_user_key. - // We assume FragmentedTombstoneIterator::Seek(k) lands - // on a tombstone with end_key > k. So the call it->Seek(*lower_bound) - // above implies compact_start_user_key < tombstone_end.user_key. - // - // To show (2) [tombstone_start, tombstone_end] and [meta.smallest, - // meta.largest] overlaps (after the call to UpdateBoundariesForRange()): - // In the proof for (1) we have shown that - // smallest_candidate <= largest_candidate. Since tombstone_start <= - // smallest_candidate <= largest_candidate <= tombstone_end, for (2) to - // hold, it suffices to show that [smallest_candidate, largest_candidate] - // overlaps with [meta.smallest, meta.largest]. too. - // Given meta.smallest <= meta.largest shown above, we need to show - // that it is impossible to have largest_candidate < meta.smallest or - // meta.largest < smallest_candidate. 
If the above - // meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate) - // updates meta.largest or meta.smallest, then the two ranges overlap. - // So we assume meta.UpdateBoundariesForRange(smallest_candidate, - // largest_candidate) did not update meta.smallest nor meta.largest, which - // means meta.smallest < smallest_candidate and largest_candidate < - // meta.largest. - // - // Case 2. Output file has >= 1 point key. This means meta.smallest and - // meta.largest are not empty when AddRangeDels() is called. - // To show (1) meta.smallest <= meta.largest: - // Assume meta.smallest <= meta.largest when AddRangeDels() is called, - // this follow from how UpdateBoundariesForRange() is implemented where it - // takes min or max to update meta.smallest or meta.largest. - // - // To show (2) [tombstone_start, tombstone_end] and [meta.smallest, - // meta.largest] overlaps (after the call to UpdateBoundariesForRange()): - // When smallest_candidate <= largest_candidate, the proof in Case 1 - // applies, so we only need to show (2) holds when smallest_candidate > - // largest_candidate. When both bounds are either null or from - // subcompaction boundary, the proof in Case 1 applies, so we only need to - // show (2) holds when at least one bound is from a point key (either - // meta.smallest for lower bound or next_table_min_key for upper bound). - // - // Suppose lower bound is meta.smallest.user_key. The call - // it->Seek(*lower_bound) implies tombstone_end.user_key > - // meta.smallest.user_key. We have smallest_candidate.user_key = - // max(tombstone_start.user_key, meta.smallest.user_key). For - // smallest_candidate to be > largest_candidate, we need - // largest_candidate.user_key = upper_bound = smallest_candidate.user_key, - // where tombstone_end is truncated to largest_candidate. - // Subcase 1: - // Suppose largest_candidate.user_key = comp_end_user_key (there is no - // next point key). Subcompaction ensures any point key from this - // subcompaction has a user_key < comp_end_user_key, so 1) - // meta.smallest.user_key < comp_end_user_key, 2) - // `has_overlapping_endpoints` is false, and the first if condition in - // this for-loop ensures tombstone_start.user_key < comp_end_user_key. So - // smallest_candidate.user_key < largest_candidate.user_key. This case - // cannot happen when smallest > largest_candidate. - // Subcase 2: - // Suppose largest_candidate.user_key = next_table_min_key.user_key. - // The first if condition in this for-loop together with - // smallest_candidate.user_key = next_table_min_key.user_key = - // upper_bound implies `has_overlapping_endpoints` is true (so meta - // largest.user_key = upper_bound) and - // tombstone.seq_ < meta.largest.seqno. So - // tombstone_start < meta.largest < tombstone_end. - // - // Suppose lower bound is comp_start_user_key and upper_bound is - // next_table_min_key. The call it->Seek(*lower_bound) implies we have - // tombstone_end_key.user_key > comp_start_user_key. So - // tombstone_end_key.user_key > smallest_candidate.user_key. For - // smallest_candidate to be > largest_candidate, we need - // tombstone_start.user_key = largest_candidate.user_key = upper_bound = - // next_table_min_key.user_key. This means `has_overlapping_endpoints` is - // true (so meta.largest.user_key = upper_bound) and tombstone.seq_ < - // meta.largest.seqno. So tombstone_start < meta.largest < tombstone_end. 
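The truncation logic added above can be summarized with a small standalone sketch. This is not the RocksDB implementation: SimpleInternalKey, Compare(), and ClampTombstoneToOutputBounds() are simplified, hypothetical stand-ins (plain string user keys, uint64_t seqnos), but the order of checks mirrors the added code: skip a tombstone that starts past upper_bound, raise tombstone_start to lower_bound, lower tombstone_end to upper_bound, and assert start <= end before the file's boundaries are extended.

#include <cassert>
#include <cstdint>
#include <optional>
#include <string>

struct SimpleInternalKey {
  std::string user_key;
  uint64_t seqno;  // larger seqno sorts first among equal user keys
};

// Returns <0, 0, >0, ordered like RocksDB internal keys: ascending user key,
// descending sequence number.
int Compare(const SimpleInternalKey& a, const SimpleInternalKey& b) {
  int c = a.user_key.compare(b.user_key);
  if (c != 0) return c;
  if (a.seqno == b.seqno) return 0;
  return a.seqno > b.seqno ? -1 : 1;  // higher seqno == smaller internal key
}

// Clamp a tombstone's internal key range to the current output file's
// [lower_bound, upper_bound]; returns false if the tombstone lies entirely
// past upper_bound (the caller would stop adding tombstones to this file).
bool ClampTombstoneToOutputBounds(
    const std::optional<SimpleInternalKey>& lower_bound,
    const std::optional<SimpleInternalKey>& upper_bound,
    SimpleInternalKey& tombstone_start, SimpleInternalKey& tombstone_end) {
  if (upper_bound && Compare(*upper_bound, tombstone_start) < 0) {
    return false;  // tombstone starts after this file's range
  }
  if (lower_bound && Compare(tombstone_start, *lower_bound) < 0) {
    tombstone_start = *lower_bound;
  }
  if (upper_bound && Compare(*upper_bound, tombstone_end) < 0) {
    tombstone_end = *upper_bound;
  }
  // The invariant the proof above establishes before the file's
  // smallest/largest keys are updated for this tombstone.
  assert(Compare(tombstone_start, tombstone_end) <= 0);
  return true;
}

int main() {
  const uint64_t kMaxSeq = UINT64_MAX;
  SimpleInternalKey start{"b", 42};
  SimpleInternalKey end{"z", kMaxSeq};
  std::optional<SimpleInternalKey> lower = SimpleInternalKey{"c", 7};
  std::optional<SimpleInternalKey> upper = SimpleInternalKey{"m", kMaxSeq};
  if (ClampTombstoneToOutputBounds(lower, upper, start, end)) {
    // start is now ("c", 7) and end is ("m", kMaxSeq): the clamped range
    // still overlaps the original tombstone and stays inside the file bounds.
  }
  return 0;
}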
} return Status::OK(); } @@ -834,6 +787,8 @@ CompactionOutputs::CompactionOutputs(const Compaction* compaction, if (compaction->output_level() != 0) { FillFilesToCutForTtl(); } + + level_ptrs_ = std::vector(compaction_->number_levels(), 0); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 52233917f0f8..18246cf2faa8 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -107,7 +107,7 @@ class CompactionOutputs { // Finish the current output file Status Finish(const Status& intput_status, - const SeqnoToTimeMapping& seqno_time_mapping); + const SeqnoToTimeMapping& seqno_to_time_mapping); // Update output table properties from table builder void UpdateTableProperties() { @@ -167,9 +167,15 @@ class CompactionOutputs { current_output_file_size_ = 0; } - // Add range-dels from the aggregator to the current output file + // Add range deletions from the range_del_agg_ to the current output file. + // Input parameters, `range_tombstone_lower_bound_` and current output's + // metadata determine the bounds on range deletions to add. Updates output + // file metadata boundary if extended by range tombstones. + // // @param comp_start_user_key and comp_end_user_key include timestamp if - // user-defined timestamp is enabled. + // user-defined timestamp is enabled. Their timestamp should be max timestamp. + // @param next_table_min_key internal key lower bound for the next compaction + // output. // @param full_history_ts_low used for range tombstone garbage collection. Status AddRangeDels(const Slice* comp_start_user_key, const Slice* comp_end_user_key, @@ -200,10 +206,10 @@ class CompactionOutputs { // We may only split the output when the cursor is in the range. Split if ((!end.has_value() || icmp->user_comparator()->Compare( - ExtractUserKey(output_split_key->Encode()), end.value()) < 0) && - (!start.has_value() || icmp->user_comparator()->Compare( - ExtractUserKey(output_split_key->Encode()), - start.value()) > 0)) { + ExtractUserKey(output_split_key->Encode()), *end) < 0) && + (!start.has_value() || + icmp->user_comparator()->Compare( + ExtractUserKey(output_split_key->Encode()), *start) > 0)) { local_output_split_key_ = output_split_key; } } @@ -314,6 +320,7 @@ class CompactionOutputs { std::unique_ptr partitioner_; // A flag determines if this subcompaction has been split by the cursor + // for RoundRobin compaction bool is_split_ = false; // We also maintain the output split key for each subcompaction to avoid @@ -345,6 +352,19 @@ class CompactionOutputs { // for the current output file, how many file boundaries has it crossed, // basically number of files overlapped * 2 size_t grandparent_boundary_switched_num_ = 0; + + // The smallest key of the current output file, this is set when current + // output file's smallest key is a range tombstone start key. + InternalKey range_tombstone_lower_bound_; + + // Used for calls to compaction->KeyRangeNotExistsBeyondOutputLevel() in + // CompactionOutputs::AddRangeDels(). + // level_ptrs_[i] holds index of the file that was checked during the last + // call to compaction->KeyRangeNotExistsBeyondOutputLevel(). This allows + // future calls to the function to pick up where it left off, since each + // range tombstone added to output file within each subcompaction is in + // increasing key range. 
+ std::vector level_ptrs_; }; // helper struct to concatenate the last level and penultimate level outputs diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index 5fe058b56d19..4d40ab503417 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -20,7 +20,7 @@ #include "file/filename.h" #include "logging/log_buffer.h" #include "logging/logging.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" @@ -611,23 +611,21 @@ Compaction* CompactionPicker::CompactRange( // Universal compaction with more than one level always compacts all the // files together to the last level. assert(vstorage->num_levels() > 1); + int max_output_level = + vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind); // DBImpl::CompactRange() set output level to be the last level - if (ioptions_.allow_ingest_behind) { - assert(output_level == vstorage->num_levels() - 2); - } else { - assert(output_level == vstorage->num_levels() - 1); - } + assert(output_level == max_output_level); // DBImpl::RunManualCompaction will make full range for universal compaction assert(begin == nullptr); assert(end == nullptr); *compaction_end = nullptr; int start_level = 0; - for (; start_level < vstorage->num_levels() && + for (; start_level <= max_output_level && vstorage->NumLevelFiles(start_level) == 0; start_level++) { } - if (start_level == vstorage->num_levels()) { + if (start_level > max_output_level) { return nullptr; } @@ -637,9 +635,9 @@ Compaction* CompactionPicker::CompactRange( return nullptr; } - std::vector inputs(vstorage->num_levels() - + std::vector inputs(max_output_level + 1 - start_level); - for (int level = start_level; level < vstorage->num_levels(); level++) { + for (int level = start_level; level <= max_output_level; level++) { inputs[level - start_level].level = level; auto& files = inputs[level - start_level].files; for (FileMetaData* f : vstorage->LevelFiles(level)) { @@ -753,8 +751,10 @@ Compaction* CompactionPicker::CompactRange( // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out // files that are created during the current compaction. - if (compact_range_options.bottommost_level_compaction == - BottommostLevelCompaction::kForceOptimized && + if ((compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized || + compact_range_options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter) && max_file_num_to_ignore != std::numeric_limits::max()) { assert(input_level == output_level); // inputs_shrunk holds a continuous subset of input files which were all @@ -877,7 +877,6 @@ Compaction* CompactionPicker::CompactRange( return compaction; } -#ifndef ROCKSDB_LITE namespace { // Test whether two files have overlapping key-ranges. 
bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a, @@ -1116,7 +1115,6 @@ Status CompactionPicker::SanitizeCompactionInputFiles( return Status::OK(); } -#endif // !ROCKSDB_LITE void CompactionPicker::RegisterCompaction(Compaction* c) { if (c == nullptr) { diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index d98af851bfe0..0556e992754e 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -93,11 +93,9 @@ class CompactionPicker { // into a valid one by adding more files, the function will return a // non-ok status with specific reason. // -#ifndef ROCKSDB_LITE Status SanitizeCompactionInputFiles(std::unordered_set* input_files, const ColumnFamilyMetaData& cf_meta, const int output_level) const; -#endif // ROCKSDB_LITE // Free up the files that participated in a compaction // @@ -229,11 +227,9 @@ class CompactionPicker { // A helper function to SanitizeCompactionInputFiles() that // sanitizes "input_files" by adding necessary files. -#ifndef ROCKSDB_LITE virtual Status SanitizeCompactionInputFilesForAllLevels( std::unordered_set* input_files, const ColumnFamilyMetaData& cf_meta, const int output_level) const; -#endif // ROCKSDB_LITE // Keeps track of all compactions that are running on Level0. // Protected by DB mutex @@ -246,7 +242,6 @@ class CompactionPicker { const InternalKeyComparator* const icmp_; }; -#ifndef ROCKSDB_LITE // A dummy compaction that never triggers any automatic // compaction. class NullCompactionPicker : public CompactionPicker { @@ -287,7 +282,6 @@ class NullCompactionPicker : public CompactionPicker { return false; } }; -#endif // !ROCKSDB_LITE // Attempts to find an intra L0 compaction conforming to the given parameters. // diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index 362e64e16f21..505297770286 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/compaction/compaction_picker_fifo.h" -#ifndef ROCKSDB_LITE #include #include @@ -17,6 +16,10 @@ #include "db/column_family.h" #include "logging/log_buffer.h" #include "logging/logging.h" +#include "options/options_helper.h" +#include "rocksdb/listener.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -285,31 +288,36 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction( return c; } -Compaction* FIFOCompactionPicker::PickCompactionToWarm( +Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, LogBuffer* log_buffer) { - if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) { + const std::vector& ages = + mutable_cf_options.compaction_options_fifo + .file_temperature_age_thresholds; + if (ages.empty()) { return nullptr; } - // PickCompactionToWarm is only triggered if there is no non-L0 files. - for (int level = 1; level < vstorage->num_levels(); ++level) { - if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) { - return nullptr; - } + // Does not apply to multi-level FIFO. 
+ if (vstorage->num_levels() > 1) { + return nullptr; } const int kLevel0 = 0; const std::vector& level_files = vstorage->LevelFiles(kLevel0); + if (level_files.empty()) { + return nullptr; + } int64_t _current_time; auto status = ioptions_.clock->GetCurrentTime(&_current_time); if (!status.ok()) { - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: Couldn't get current time: %s. " - "Not doing compactions based on warm threshold. ", - cf_name.c_str(), status.ToString().c_str()); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: Couldn't get current time: %s. " + "Not doing compactions based on file temperature-age threshold. ", + cf_name.c_str(), status.ToString().c_str()); return nullptr; } const uint64_t current_time = static_cast(_current_time); @@ -328,56 +336,77 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm( inputs[0].level = 0; // avoid underflow - if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) { - uint64_t create_time_threshold = - current_time - mutable_cf_options.compaction_options_fifo.age_for_warm; + uint64_t min_age = ages[0].age; + // kLastTemperature means target temperature is to be determined. + Temperature compaction_target_temp = Temperature::kLastTemperature; + if (current_time > min_age) { + uint64_t create_time_threshold = current_time - min_age; uint64_t compaction_size = 0; - // We will ideally identify a file qualifying for warm tier by knowing - // the timestamp for the youngest entry in the file. However, right now - // we don't have the information. We infer it by looking at timestamp - // of the next file's (which is just younger) oldest entry's timestamp. - FileMetaData* prev_file = nullptr; - for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { - FileMetaData* f = *ritr; - assert(f); - if (f->being_compacted) { - // Right now this probably won't happen as we never try to schedule - // two compactions in parallel, so here we just simply don't schedule - // anything. + // We will ideally identify a file qualifying for temperature change by + // knowing the timestamp for the youngest entry in the file. However, right + // now we don't have the information. We infer it by looking at timestamp of + // the previous file's (which is just younger) oldest entry's timestamp. + Temperature cur_target_temp; + // avoid index underflow + assert(level_files.size() >= 1); + for (size_t index = level_files.size() - 1; index >= 1; --index) { + // Try to add cur_file to compaction inputs. + FileMetaData* cur_file = level_files[index]; + // prev_file is just younger than cur_file + FileMetaData* prev_file = level_files[index - 1]; + if (cur_file->being_compacted) { + // Should not happen since we check for + // `level0_compactions_in_progress_` above. Here we simply just don't + // schedule anything. return nullptr; } - uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime(); - if (oldest_ancester_time == kUnknownOldestAncesterTime) { + uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime(); + if (oldest_ancestor_time == kUnknownOldestAncesterTime) { // Older files might not have enough information. It is possible to // handle these files by looking at newer files, but maintaining the // logic isn't worth it. break; } - if (oldest_ancester_time > create_time_threshold) { - // The previous file (which has slightly older data) doesn't qualify - // for warm tier. 
+ if (oldest_ancestor_time > create_time_threshold) { + // cur_file is too fresh break; } - if (prev_file != nullptr) { - compaction_size += prev_file->fd.GetFileSize(); - if (compaction_size > mutable_cf_options.max_compaction_bytes) { + cur_target_temp = ages[0].temperature; + for (size_t i = 1; i < ages.size(); ++i) { + if (current_time >= ages[i].age && + oldest_ancestor_time <= current_time - ages[i].age) { + cur_target_temp = ages[i].temperature; + } + } + if (cur_file->temperature == cur_target_temp) { + if (inputs[0].empty()) { + continue; + } else { break; } - inputs[0].files.push_back(prev_file); - ROCKS_LOG_BUFFER(log_buffer, - "[%s] FIFO compaction: picking file %" PRIu64 - " with next file's oldest time %" PRIu64 " for warm", - cf_name.c_str(), prev_file->fd.GetNumber(), - oldest_ancester_time); } - if (f->temperature == Temperature::kUnknown || - f->temperature == Temperature::kHot) { - prev_file = f; - } else if (!inputs[0].files.empty()) { - // A warm file newer than files picked. + + // cur_file needs to change temperature + if (compaction_target_temp == Temperature::kLastTemperature) { + assert(inputs[0].empty()); + compaction_target_temp = cur_target_temp; + } else if (cur_target_temp != compaction_target_temp) { + assert(!inputs[0].empty()); + break; + } + if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <= + mutable_cf_options.max_compaction_bytes) { + inputs[0].files.push_back(cur_file); + compaction_size += cur_file->fd.GetFileSize(); + ROCKS_LOG_BUFFER( + log_buffer, + "[%s] FIFO compaction: picking file %" PRIu64 + " with next file's oldest time %" PRIu64 " for temperature %s.", + cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time, + temperature_to_string[cur_target_temp].c_str()); + } + if (compaction_size > mutable_cf_options.max_compaction_bytes) { break; - } else { - assert(prev_file == nullptr); } } } @@ -391,7 +420,7 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm( std::move(inputs), 0, 0 /* output file size limit */, 0 /* max compaction bytes, not applicable */, 0 /* output path ID */, mutable_cf_options.compression, mutable_cf_options.compression_opts, - Temperature::kWarm, + compaction_target_temp, /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "", vstorage->CompactionScore(0), /* is deletion compaction */ false, /* l0_files_might_overlap */ true, @@ -413,8 +442,8 @@ Compaction* FIFOCompactionPicker::PickCompaction( vstorage, log_buffer); } if (c == nullptr) { - c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options, - vstorage, log_buffer); + c = PickTemperatureChangeCompaction( + cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer); } RegisterCompaction(c); return c; @@ -443,4 +472,3 @@ Compaction* FIFOCompactionPicker::CompactRange( } } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index 1db760185dea..df21a1bde0f2 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
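As a rough illustration of how the loop over `ages` above maps a file onto a target temperature, here is a hedged standalone sketch. Temp, AgeThreshold, and PickTargetTemperature() are illustrative names only; the age of a file's newest data is approximated, as in the comment above, by the just-younger neighbor's oldest ancestor time, and the grouping of adjacent files and the max_compaction_bytes cap handled by the real code are omitted. The sketch only captures the "last threshold whose age has been exceeded wins" selection.

#include <cstdint>
#include <vector>

enum class Temp { kUnknown, kHot, kWarm, kCold };

struct AgeThreshold {
  Temp temperature;
  uint64_t age;  // seconds; assumed sorted by ascending age, like the option
};

// Return the temperature of the last threshold whose age the file's
// (approximate) newest-data age has exceeded, or kUnknown if none has.
Temp PickTargetTemperature(uint64_t approx_newest_data_age,
                           const std::vector<AgeThreshold>& thresholds) {
  Temp target = Temp::kUnknown;
  for (const AgeThreshold& t : thresholds) {
    if (approx_newest_data_age >= t.age) {
      target = t.temperature;  // later (larger-age) entries take precedence
    }
  }
  return target;
}

int main() {
  // Thresholds similar to the FIFOToColdAndWarm test later in this patch:
  // data older than 2000s targets kHot, older than 10000s targets kWarm.
  std::vector<AgeThreshold> thresholds = {{Temp::kHot, 2000},
                                          {Temp::kWarm, 10000}};
  Temp a = PickTargetTemperature(2500, thresholds);   // kHot
  Temp b = PickTargetTemperature(15000, thresholds);  // kWarm
  Temp c = PickTargetTemperature(100, thresholds);    // kUnknown: too fresh
  (void)a;
  (void)b;
  (void)c;
  return 0;
}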
#pragma once -#ifndef ROCKSDB_LITE #include "db/compaction/compaction_picker.h" @@ -53,11 +52,9 @@ class FIFOCompactionPicker : public CompactionPicker { VersionStorageInfo* version, LogBuffer* log_buffer); - Compaction* PickCompactionToWarm(const std::string& cf_name, - const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, - VersionStorageInfo* version, - LogBuffer* log_buffer); + Compaction* PickTemperatureChangeCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, + LogBuffer* log_buffer); }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 2162d30a30a1..c436689bb65b 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -83,7 +83,7 @@ class LevelCompactionBuilder { Compaction* GetCompaction(); - // For the specfied level, pick a file that we want to compact. + // From `start_level_`, pick files to compact to `output_level_`. // Returns false if there is no file to compact. // If it returns true, inputs->files.size() will be exactly one for // all compaction priorities except round-robin. For round-robin, @@ -107,8 +107,9 @@ class LevelCompactionBuilder { bool PickIntraL0Compaction(); // Return true if TrivialMove is extended. `start_index` is the index of - // the intiial file picked, which should already be in `start_level_inputs_`. - bool TryExtendNonL0TrivialMove(int start_index); + // the initial file picked, which should already be in `start_level_inputs_`. + bool TryExtendNonL0TrivialMove(int start_index, + bool only_expand_right = false); // Picks a file from level_files to compact. // level_files is a vector of (level, file metadata) in ascending order of @@ -355,7 +356,8 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() { vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, &output_level_inputs.files); if (output_level_inputs.empty()) { - if (TryExtendNonL0TrivialMove((int)start_index)) { + if (TryExtendNonL0TrivialMove((int)start_index, + true /* only_expand_right */)) { return; } } @@ -501,6 +503,16 @@ Compaction* LevelCompactionBuilder::PickCompaction() { } Compaction* LevelCompactionBuilder::GetCompaction() { + // TryPickL0TrivialMove() does not apply to the case when compacting L0 to an + // empty output level. So L0 files is picked in PickFileToCompact() by + // compaction score. We may still be able to do trivial move when this file + // does not overlap with other L0s. This happens when + // compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not + // pull in more L0s. 
+ assert(!compaction_inputs_.empty()); + bool l0_files_might_overlap = + start_level_ == 0 && !is_l0_trivial_move_ && + (compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1); auto c = new Compaction( vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_, std::move(compaction_inputs_), output_level_, @@ -515,8 +527,7 @@ Compaction* LevelCompactionBuilder::GetCompaction() { Temperature::kUnknown, /* max_subcompactions */ 0, std::move(grandparents_), is_manual_, /* trim_ts */ "", start_level_score_, false /* deletion_compaction */, - /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_, - compaction_reason_); + l0_files_might_overlap, compaction_reason_); // If it's level 0 compaction, make sure we don't execute any other level 0 // compactions in parallel @@ -653,7 +664,8 @@ bool LevelCompactionBuilder::TryPickL0TrivialMove() { return false; } -bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) { +bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index, + bool only_expand_right) { if (start_level_inputs_.size() == 1 && (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) && (mutable_cf_options_.compression_per_level.empty())) { @@ -670,6 +682,7 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) { size_t total_size = initial_file->fd.GetFileSize(); CompactionInputFiles output_level_inputs; output_level_inputs.level = output_level_; + // Expand towards right for (int i = start_index + 1; i < static_cast(level_files.size()) && start_level_inputs_.size() < kMaxMultiTrivialMove; @@ -702,6 +715,37 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) { } start_level_inputs_.files.push_back(next_file); } + // Expand towards left + if (!only_expand_right) { + for (int i = start_index - 1; + i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) { + FileMetaData* next_file = level_files[i]; + if (next_file->being_compacted) { + break; + } + vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest), + &(initial_file->largest), + &output_level_inputs.files); + if (!output_level_inputs.empty()) { + break; + } + if (i > 0 && compaction_picker_->icmp() + ->user_comparator() + ->CompareWithoutTimestamp( + next_file->smallest.user_key(), + level_files[i - 1]->largest.user_key()) == 0) { + // Not a clean up after adding the next file. Skip. 
+ break; + } + total_size += next_file->fd.GetFileSize(); + if (total_size > mutable_cf_options_.max_compaction_bytes) { + break; + } + // keep `files` sorted in increasing order by key range + start_level_inputs_.files.insert(start_level_inputs_.files.begin(), + next_file); + } + } return start_level_inputs_.size() > 1; } return false; @@ -785,7 +829,10 @@ bool LevelCompactionBuilder::PickFileToCompact() { vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, &output_level_inputs.files); if (output_level_inputs.empty()) { - if (TryExtendNonL0TrivialMove(index)) { + if (start_level_ > 0 && + TryExtendNonL0TrivialMove(index, + ioptions_.compaction_pri == + kRoundRobin /* only_expand_right */)) { break; } } else { diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 865518cb2007..3241d034d314 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -70,6 +70,11 @@ class CompactionPickerTestBase : public testing::Test { mutable_cf_options_.RefreshDerivedOptions(ioptions_); ioptions_.cf_paths.emplace_back("dummy", std::numeric_limits::max()); + // When the default value of this option is true, universal compaction + // tests can encounter assertion failure since SanitizeOption() is + // not run to set this option to false. So we do the sanitization + // here. Tests that test this option set this option to true explicitly. + ioptions_.level_compaction_dynamic_level_bytes = false; } ~CompactionPickerTestBase() override {} @@ -79,7 +84,9 @@ class CompactionPickerTestBase : public testing::Test { options_.num_levels = num_levels; vstorage_.reset(new VersionStorageInfo( &icmp_, ucmp_, options_.num_levels, style, nullptr, false, - EpochNumberRequirement::kMustPresent)); + EpochNumberRequirement::kMustPresent, ioptions_.clock, + options_.bottommost_file_compaction_delay, + OffpeakTimeOption(mutable_db_options_.daily_offpeak_time_utc))); vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_); } @@ -88,7 +95,9 @@ class CompactionPickerTestBase : public testing::Test { void AddVersionStorage() { temp_vstorage_.reset(new VersionStorageInfo( &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style, - vstorage_.get(), false, EpochNumberRequirement::kMustPresent)); + vstorage_.get(), false, EpochNumberRequirement::kMustPresent, + ioptions_.clock, options_.bottommost_file_compaction_delay, + OffpeakTimeOption(mutable_db_options_.daily_offpeak_time_utc))); } void DeleteVersionStorage() { @@ -148,7 +157,8 @@ class CompactionPickerTestBase : public testing::Test { smallest_seq, largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + true /* user_defined_timestamps_persisted */); f->compensated_file_size = (compensated_file_size != 0) ? 
compensated_file_size : file_size; f->oldest_ancester_time = oldest_ancestor_time; @@ -482,8 +492,6 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) { ASSERT_EQ(num_levels - 1, compaction->output_level()); } -// Universal and FIFO Compactions are not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { NewVersionStorage(1, kCompactionStyleUniversal); UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); @@ -507,7 +515,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) { TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { const uint64_t kFileSize = 100000; - NewVersionStorage(1, kCompactionStyleUniversal); + NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal); ioptions_.allow_ingest_behind = true; ioptions_.num_levels = 3; UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); @@ -534,6 +542,14 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) { // output level should be the one above the bottom-most ASSERT_EQ(1, compaction->output_level()); + + // input should not include the reserved level + const std::vector* inputs = compaction->inputs(); + for (const auto& compaction_input : *inputs) { + if (!compaction_input.empty()) { + ASSERT_LT(compaction_input.level, 2); + } + } } // Tests if the files can be trivially moved in multi level // universal compaction when allow_trivial_move option is set @@ -980,6 +996,61 @@ TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) { ASSERT_EQ(13, compaction->num_input_files(1)); } +TEST_F(CompactionPickerTest, + PartiallyExcludeL0ToReduceWriteStopForSizeAmpCompaction) { + const uint64_t kFileSize = 100000; + const uint64_t kL0FileCount = 30; + const uint64_t kLastLevelFileCount = 1; + const uint64_t kNumLevels = 5; + + for (const uint64_t test_no_exclusion : {false, true}) { + const uint64_t kExpectedNumExcludedL0 = + test_no_exclusion ? 0 : kL0FileCount * 1 / 10; + + mutable_cf_options_.level0_stop_writes_trigger = 36; + mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent = 1; + mutable_cf_options_.compaction_options_universal.max_merge_width = + test_no_exclusion + // In universal compaction, sorted runs from non L0 levels are + // counted toward `level0_stop_writes_trigger`. Therefore we need to + // subtract the total number of sorted runs picked originally for + // this compaction (i.e, kL0FileCount + kLastLevelFileCount) from + // `level0_stop_writes_trigger` to calculate `max_merge_width` that + // results in no L0 exclusion for testing purpose. + ? mutable_cf_options_.level0_stop_writes_trigger - + (kL0FileCount + kLastLevelFileCount) + : UINT_MAX; + + UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); + NewVersionStorage(kNumLevels, kCompactionStyleUniversal); + + for (uint64_t i = 1; i <= kL0FileCount + kLastLevelFileCount; ++i) { + Add(i <= kL0FileCount ? 
0 : kNumLevels - 1, static_cast(i), + std::to_string((i + 100) * 1000).c_str(), + std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100, + i * 100 + 99); + } + + UpdateVersionStorageInfo(); + + ASSERT_TRUE(universal_compaction_picker.NeedsCompaction(vstorage_.get())); + std::unique_ptr compaction( + universal_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kUniversalSizeAmplification); + ASSERT_EQ(compaction->num_input_files(0), + kL0FileCount - kExpectedNumExcludedL0); + ASSERT_EQ(compaction->num_input_files(kNumLevels - 1), kLastLevelFileCount); + for (uint64_t level = 1; level <= kNumLevels - 2; level++) { + ASSERT_EQ(compaction->num_input_files(level), 0); + } + } +} + TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { NewVersionStorage(1, kCompactionStyleFIFO); const int kFileCount = @@ -1007,29 +1078,28 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { } } -TEST_F(CompactionPickerTest, FIFOToWarm1) { +TEST_F(CompactionPickerTest, FIFOToCold1) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kWarmThreshold = 2000; + uint64_t kColdThreshold = 2000; fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.age_for_warm = kWarmThreshold; + fifo_options_.file_temperature_age_thresholds = { + {Temperature::kCold, kColdThreshold}}; mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); uint64_t threshold_time = - static_cast(current_time) - kWarmThreshold; - Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, - Temperature::kUnknown, static_cast(current_time) - 100); - Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, - Temperature::kUnknown, threshold_time + 100); - Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, - Temperature::kUnknown, threshold_time - 2000); + static_cast(current_time) - kColdThreshold; + Add(0 /* level */, 4U /* file_number */, "260", "300", 1 * kFileSize, 0, 2500, + 2600, 0, true, Temperature::kUnknown, + threshold_time - 2000 /* oldest_ancestor_time */); + // Qualifies for compaction to kCold. 
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, Temperature::kUnknown, threshold_time - 3000); UpdateVersionStorageInfo(); @@ -1039,33 +1109,36 @@ TEST_F(CompactionPickerTest, FIFOToWarm1) { cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kChangeTemperature); + ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); ASSERT_EQ(1U, compaction->num_input_files(0)); ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToWarm2) { +TEST_F(CompactionPickerTest, FIFOToCold2) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kWarmThreshold = 2000; + uint64_t kColdThreshold = 2000; fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.age_for_warm = kWarmThreshold; + fifo_options_.file_temperature_age_thresholds = { + {Temperature::kCold, kColdThreshold}}; mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); uint64_t threshold_time = - static_cast(current_time) - kWarmThreshold; + static_cast(current_time) - kColdThreshold; Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, Temperature::kUnknown, static_cast(current_time) - 100); - Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, - Temperature::kUnknown, threshold_time + 100); Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, - Temperature::kUnknown, threshold_time - 2000); + Temperature::kUnknown, threshold_time); + // The following two files qualify for compaction to kCold. 
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, Temperature::kUnknown, threshold_time - 3000); Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, @@ -1077,34 +1150,40 @@ TEST_F(CompactionPickerTest, FIFOToWarm2) { cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kChangeTemperature); + ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) { +TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kWarmThreshold = 2000; + uint64_t kColdThreshold = 2000; fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.age_for_warm = kWarmThreshold; + fifo_options_.file_temperature_age_thresholds = { + {Temperature::kCold, kColdThreshold}}; mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 9; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); uint64_t threshold_time = - static_cast(current_time) - kWarmThreshold; + static_cast(current_time) - kColdThreshold; Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, Temperature::kUnknown, static_cast(current_time) - 100); Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, Temperature::kUnknown, threshold_time + 100); Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, Temperature::kUnknown, threshold_time - 2000); + // The following two files qualify for compaction to kCold. + // But only the last two should be included to respect `max_compaction_bytes`. 
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, Temperature::kUnknown, threshold_time - 3000); Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, @@ -1118,40 +1197,45 @@ TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) { cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kChangeTemperature); + ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) { +TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kWarmThreshold = 2000; + uint64_t kColdThreshold = 2000; fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.age_for_warm = kWarmThreshold; + fifo_options_.file_temperature_age_thresholds = { + {Temperature::kCold, kColdThreshold}}; mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); uint64_t threshold_time = - static_cast(current_time) - kWarmThreshold; + static_cast(current_time) - kColdThreshold; Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, Temperature::kUnknown, static_cast(current_time) - 100); Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, Temperature::kUnknown, threshold_time + 100); Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, Temperature::kUnknown, threshold_time - 2000); + // The following two files qualify for compaction to kCold. 
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, Temperature::kUnknown, threshold_time - 3000); Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, Temperature::kUnknown, threshold_time - 4000); Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, - Temperature::kWarm, threshold_time - 5000); + Temperature::kCold, threshold_time - 5000); UpdateVersionStorageInfo(); ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); @@ -1159,28 +1243,32 @@ TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) { cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kChangeTemperature); + ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->num_input_files(0)); ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) { +TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kWarmThreshold = 2000; + uint64_t kColdThreshold = 2000; fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.age_for_warm = kWarmThreshold; + fifo_options_.file_temperature_age_thresholds = { + {Temperature::kCold, kColdThreshold}}; mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); uint64_t threshold_time = - static_cast(current_time) - kWarmThreshold; + static_cast(current_time) - kColdThreshold; Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, Temperature::kUnknown, static_cast(current_time) - 100); Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, @@ -1188,65 +1276,78 @@ TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) { Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, Temperature::kUnknown, threshold_time - 2000); Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, - Temperature::kUnknown, threshold_time - 3000); + Temperature::kCold, threshold_time - 3000); + // Qualifies for compaction to kCold. 
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, Temperature::kUnknown, threshold_time - 4000); Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, - Temperature::kWarm, threshold_time - 5000); - file_map_[2].first->being_compacted = true; + Temperature::kCold, threshold_time - 5000); UpdateVersionStorageInfo(); ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); - // Stop if a file is being compacted - ASSERT_TRUE(compaction.get() == nullptr); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kChangeTemperature); + ASSERT_EQ(compaction->output_temperature(), Temperature::kCold); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber()); } -TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) { +TEST_F(CompactionPickerTest, FIFOToColdAndWarm) { NewVersionStorage(1, kCompactionStyleFIFO); const uint64_t kFileSize = 100000; const uint64_t kMaxSize = kFileSize * 100000; - uint64_t kWarmThreshold = 2000; + uint64_t kWarmThreshold = 10000; + uint64_t kHotThreshold = 2000; fifo_options_.max_table_files_size = kMaxSize; - fifo_options_.age_for_warm = kWarmThreshold; + // Test that multiple threshold works. + fifo_options_.file_temperature_age_thresholds = { + {Temperature::kHot, kHotThreshold}, {Temperature::kWarm, kWarmThreshold}}; mutable_cf_options_.compaction_options_fifo = fifo_options_; - mutable_cf_options_.level0_file_num_compaction_trigger = 2; + mutable_cf_options_.level0_file_num_compaction_trigger = 100; mutable_cf_options_.max_compaction_bytes = kFileSize * 100; FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); int64_t current_time = 0; ASSERT_OK(Env::Default()->GetCurrentTime(¤t_time)); - uint64_t threshold_time = + uint64_t hot_threshold_time = + static_cast(current_time) - kHotThreshold; + uint64_t warm_threshold_time = static_cast(current_time) - kWarmThreshold; Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true, Temperature::kUnknown, static_cast(current_time) - 100); Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true, - Temperature::kUnknown, threshold_time + 100); + Temperature::kUnknown, hot_threshold_time + 100); Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true, - Temperature::kUnknown, threshold_time - 2000); + Temperature::kUnknown, hot_threshold_time - 200); + // Qualifies for Hot Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true, - Temperature::kWarm, threshold_time - 3000); + Temperature::kUnknown, warm_threshold_time - 100); + // Qualifies for Warm Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true, - Temperature::kUnknown, threshold_time - 4000); + Temperature::kUnknown, warm_threshold_time - 4000); Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true, - Temperature::kWarm, threshold_time - 5000); + Temperature::kUnknown, warm_threshold_time - 5000); UpdateVersionStorageInfo(); ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true); std::unique_ptr compaction(fifo_compaction_picker.PickCompaction( cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), &log_buffer_)); - // Stop if a file is being compacted ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1U, compaction->num_input_files(0)); - ASSERT_EQ(2U, compaction->input(0, 
0)->fd.GetNumber()); + ASSERT_EQ(compaction->compaction_reason(), + CompactionReason::kChangeTemperature); + // Assumes compaction picker picks older files first. + ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); } -#endif // ROCKSDB_LITE - TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { NewVersionStorage(6, kCompactionStyleLevel); ioptions_.compaction_pri = kMinOverlappingRatio; @@ -1926,6 +2027,15 @@ TEST_F(CompactionPickerTest, OverlappingUserKeys11) { ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber()); } +TEST_F(CompactionPickerTest, FileTtlBoosterLargeNumLevels) { + const uint64_t kCurrentTime = 1000000; + FileTtlBooster booster(kCurrentTime, /*ttl=*/2048, + /*num_non_empty_levels=*/100, /*level=*/1); + FileMetaData meta; + meta.oldest_ancester_time = kCurrentTime - 1023; + ASSERT_EQ(1, booster.GetBoostScore(&meta)); +} + TEST_F(CompactionPickerTest, FileTtlBooster) { // Set TTL to 2048 // TTL boosting for all levels starts at 1024, @@ -2523,6 +2633,61 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) { ASSERT_TRUE(compaction->IsTrivialMove()); } +TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) { + mutable_cf_options_.max_bytes_for_level_base = 5000; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + Add(1, 1U, "300", "350", 3000U, 0, 710, 800, 3000U); + Add(1, 2U, "600", "651", 3001U, 0, 610, 700, 3001U); + Add(1, 3U, "700", "750", 3000U, 0, 500, 550, 3000U); + Add(2, 4U, "800", "850", 4000U, 0, 150, 200, 4000U); + + UpdateVersionStorageInfo(); + // File #2 should be picked first, and expand both directions to include + // files #1 and #3. + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(3, compaction->num_input_files(0)); + ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2, compaction->input(0, 1)->fd.GetNumber()); + ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + +TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) { + mutable_cf_options_.max_bytes_for_level_base = 5000; + mutable_cf_options_.level0_file_num_compaction_trigger = 4; + mutable_cf_options_.max_compaction_bytes = 10000000u; + ioptions_.level_compaction_dynamic_level_bytes = false; + NewVersionStorage(6, kCompactionStyleLevel); + + // File 2 will be picked first, which by itself is trivial movable. + // There was a bug before where compaction also picks file 3 and 4, + // (and then file 1 since it overlaps with the key range), + // which makes the compaction not trivial movable. 
+ Add(0, 1U, "450", "599", 3000U, 0, 710, 800, 3000U); + Add(0, 2U, "600", "651", 3001U, 0, 610, 700, 3001U); + Add(0, 3U, "300", "350", 3000U, 0, 500, 550, 3000U); + Add(0, 4U, "500", "550", 2999U, 0, 300, 350, 2999U); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), + &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1, compaction->num_input_levels()); + ASSERT_EQ(1, compaction->num_input_files(0)); + ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_TRUE(compaction->IsTrivialMove()); +} + TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) { mutable_cf_options_.max_bytes_for_level_base = 10000u; mutable_cf_options_.max_compaction_bytes = 10001u; @@ -2873,7 +3038,6 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { ASSERT_EQ(0, compaction->output_level()); } -#ifndef ROCKSDB_LITE TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { const uint64_t kFileSize = 100000; @@ -3273,6 +3437,9 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) { ioptions_.preclude_last_level_data_seconds = 1000; mutable_cf_options_.compaction_options_universal .max_size_amplification_percent = 200; + // To avoid any L0 file exclusion in size amp compaction intended for reducing + // write stop + mutable_cf_options_.compaction_options_universal.max_merge_width = 2; UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); NewVersionStorage(kNumLevels, kCompactionStyleUniversal); @@ -3346,6 +3513,9 @@ TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) { ioptions_.preclude_last_level_data_seconds = 1000; mutable_cf_options_.compaction_options_universal .max_size_amplification_percent = 200; + // To avoid any L0 file exclusion in size amp compaction intended for reducing + // write stop + mutable_cf_options_.compaction_options_universal.max_merge_width = 2; UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_); NewVersionStorage(kNumLevels, kCompactionStyleUniversal); @@ -3982,7 +4152,6 @@ TEST_P(PerKeyPlacementCompactionPickerTest, INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest, PerKeyPlacementCompactionPickerTest, ::testing::Bool()); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 3ef4e70b3a68..6d9ff43cd544 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -8,9 +8,8 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "db/compaction/compaction_picker_universal.h" -#ifndef ROCKSDB_LITE -#include +#include #include #include #include @@ -20,7 +19,7 @@ #include "file/filename.h" #include "logging/log_buffer.h" #include "logging/logging.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "test_util/sync_point.h" #include "util/random.h" #include "util/string_util.h" @@ -115,6 +114,13 @@ class UniversalCompactionBuilder { // because some files are being compacted. 
Compaction* PickPeriodicCompaction();
+  bool ShouldSkipLastSortedRunForSizeAmpCompaction() const {
+    assert(!sorted_runs_.empty());
+    return ioptions_.preclude_last_level_data_seconds > 0 &&
+           ioptions_.num_levels > 2 &&
+           sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+           sorted_runs_.size() > 1;
+  }
 // Used in universal compaction when the allow_trivial_move
 // option is set. Checks whether there are any overlapping files
 // in the input. Returns true if the input files are non
@@ -123,6 +129,100 @@ class UniversalCompactionBuilder {
   uint64_t GetMaxOverlappingBytes() const;
+  // Conditionally excludes some of the newest L0 files from a size amp
+  // compaction. This prevents a size amp compaction from locking up a large
+  // number of L0 files, which could otherwise lead to a write stop after just
+  // a few more flushes.
+  //
+  // The exclusion is based on `num_l0_input_pre_exclusion`,
+  // `level0_stop_writes_trigger`, `max/min_merge_width` and the pre-exclusion
+  // compaction score. Note that it will not disqualify the size amp compaction
+  // of interest from running as a size amp compaction, as long as its
+  // pre-exclusion compaction score satisfies the condition to run.
+  //
+  // @param `num_l0_input_pre_exclusion` Number of L0 input files prior to
+  // exclusion
+  // @param `end_index` Index of the last sorted run selected as compaction
+  // input. Will not be affected by this exclusion.
+  // @param `start_index` Index of the first input sorted run prior to
+  // exclusion. Will be modified as output based on the exclusion.
+  // @param `candidate_size` Total size of all input sorted runs except the
+  // last one, prior to exclusion. Will be modified as output based on the
+  // exclusion.
+  //
+  // @return Number of L0 files to exclude. `start_index` and
+  // `candidate_size` will be modified accordingly.
+  std::size_t MightExcludeNewL0sToReduceWriteStop(
+      std::size_t num_l0_input_pre_exclusion, std::size_t end_index,
+      std::size_t& start_index, uint64_t& candidate_size) const {
+    if (num_l0_input_pre_exclusion == 0) {
+      return 0;
+    }
+
+    assert(start_index <= end_index && sorted_runs_.size() > end_index);
+    assert(mutable_cf_options_.level0_stop_writes_trigger > 0);
+    const std::size_t level0_stop_writes_trigger = static_cast<std::size_t>(
+        mutable_cf_options_.level0_stop_writes_trigger);
+    const std::size_t max_merge_width = static_cast<std::size_t>(
+        mutable_cf_options_.compaction_options_universal.max_merge_width);
+    const std::size_t min_merge_width = static_cast<std::size_t>(
+        mutable_cf_options_.compaction_options_universal.min_merge_width);
+    const uint64_t max_size_amplification_percent =
+        mutable_cf_options_.compaction_options_universal
+            .max_size_amplification_percent;
+    const uint64_t base_sr_size = sorted_runs_[end_index].size;
+
+    // Leave at least 1 L0 file and 2 input sorted runs after exclusion
+    const std::size_t max_num_l0_to_exclude =
+        std::min(num_l0_input_pre_exclusion - 1, end_index - start_index - 1);
+    // In universal compaction, sorted runs from non-L0 levels are counted
+    // toward `level0_stop_writes_trigger`.
+    // Therefore, we need to subtract the total number of sorted runs
+    // originally picked for this compaction from `level0_stop_writes_trigger`
+    // to calculate `num_extra_l0_before_write_stop`.
+    const std::size_t num_extra_l0_before_write_stop =
+        level0_stop_writes_trigger -
+        std::min(level0_stop_writes_trigger, end_index - start_index + 1);
+    const std::size_t num_l0_to_exclude_for_max_merge_width =
+        std::min(max_merge_width -
+                     std::min(max_merge_width, num_extra_l0_before_write_stop),
+                 max_num_l0_to_exclude);
+    const std::size_t num_l0_to_exclude_for_min_merge_width =
+        std::min(min_merge_width -
+                     std::min(min_merge_width, num_extra_l0_before_write_stop),
+                 max_num_l0_to_exclude);
+
+    std::size_t num_l0_to_exclude = 0;
+    uint64_t candidate_size_post_exclusion = candidate_size;
+
+    for (std::size_t possible_num_l0_to_exclude =
+             num_l0_to_exclude_for_min_merge_width;
+         possible_num_l0_to_exclude <= num_l0_to_exclude_for_max_merge_width;
+         ++possible_num_l0_to_exclude) {
+      uint64_t current_candidate_size = candidate_size_post_exclusion;
+      for (std::size_t j = num_l0_to_exclude; j < possible_num_l0_to_exclude;
+           ++j) {
+        current_candidate_size -=
+            sorted_runs_.at(start_index + j).compensated_file_size;
+      }
+
+      // Keep the compaction score before and after exclusion similar, so the
+      // exclusion does not disqualify the size amp compaction of interest
+      // from running as a size amp compaction, as long as its pre-exclusion
+      // compaction score satisfies the condition to run.
+      if (current_candidate_size * 100 <
+              max_size_amplification_percent * base_sr_size ||
+          current_candidate_size < candidate_size * 9 / 10) {
+        break;
+      }
+      num_l0_to_exclude = possible_num_l0_to_exclude;
+      candidate_size_post_exclusion = current_candidate_size;
+    }
+
+    start_index += num_l0_to_exclude;
+    candidate_size = candidate_size_post_exclusion;
+    return num_l0_to_exclude;
+  }
+
 const ImmutableOptions& ioptions_;
 const InternalKeyComparator* icmp_;
 double score_;
@@ -134,8 +234,8 @@ class UniversalCompactionBuilder {
 UniversalCompactionPicker* picker_;
 LogBuffer* log_buffer_;
-  static std::vector<SortedRun> CalculateSortedRuns(
-      const VersionStorageInfo& vstorage);
+  static std::vector<SortedRun> CalculateSortedRuns(
+      const VersionStorageInfo& vstorage, int last_level);
 // Pick a path ID to place a newly generated file, with its estimated file
 // size.
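The interaction of the three bounds computed in `MightExcludeNewL0sToReduceWriteStop()` is easier to see with concrete numbers. The following standalone sketch reproduces the same arithmetic with assumed option values; the numbers are illustrative and are not taken from this patch.

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed option values, for illustration only.
  const std::size_t level0_stop_writes_trigger = 20;
  const std::size_t min_merge_width = 2;
  const std::size_t max_merge_width = 8;
  // Assumed compaction input: 18 sorted runs picked, 16 of them from L0.
  const std::size_t num_input_sorted_runs = 18;  // end_index - start_index + 1
  const std::size_t num_l0_input_pre_exclusion = 16;

  // Leave at least 1 L0 file and 2 input sorted runs after exclusion.
  const std::size_t max_num_l0_to_exclude =
      std::min(num_l0_input_pre_exclusion - 1, num_input_sorted_runs - 2);
  // L0 flushes that can still happen before `level0_stop_writes_trigger`
  // fires, given the runs already locked by this compaction: 20 - 18 = 2.
  const std::size_t num_extra_l0_before_write_stop =
      level0_stop_writes_trigger -
      std::min(level0_stop_writes_trigger, num_input_sorted_runs);
  // Upper bound: min(8 - 2, 15) = 6; lower bound: min(2 - 2, 15) = 0.
  const std::size_t upper =
      std::min(max_merge_width -
                   std::min(max_merge_width, num_extra_l0_before_write_stop),
               max_num_l0_to_exclude);
  const std::size_t lower =
      std::min(min_merge_width -
                   std::min(min_merge_width, num_extra_l0_before_write_stop),
               max_num_l0_to_exclude);
  // The loop in MightExcludeNewL0sToReduceWriteStop() then walks from `lower`
  // to `upper` and keeps the largest exclusion that still satisfies the
  // size-amp trigger condition and retains at least 90% of the candidate size.
  std::printf("may exclude between %zu and %zu newest L0 files\n", lower, upper);
  return 0;
}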
@@ -340,13 +440,13 @@ void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
 std::vector<UniversalCompactionBuilder::SortedRun>
 UniversalCompactionBuilder::CalculateSortedRuns(
-    const VersionStorageInfo& vstorage) {
+    const VersionStorageInfo& vstorage, int last_level) {
   std::vector<SortedRun> ret;
   for (FileMetaData* f : vstorage.LevelFiles(0)) {
     ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
                      f->being_compacted);
   }
-  for (int level = 1; level < vstorage.num_levels(); level++) {
+  for (int level = 1; level <= last_level; level++) {
     uint64_t total_compensated_size = 0U;
     uint64_t total_size = 0U;
     bool being_compacted = false;
@@ -375,7 +475,9 @@ UniversalCompactionBuilder::CalculateSortedRuns(
 Compaction* UniversalCompactionBuilder::PickCompaction() {
   const int kLevel0 = 0;
   score_ = vstorage_->CompactionScore(kLevel0);
-  sorted_runs_ = CalculateSortedRuns(*vstorage_);
+  int max_output_level =
+      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  sorted_runs_ = CalculateSortedRuns(*vstorage_, max_output_level);
   if (sorted_runs_.size() == 0 ||
       (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
@@ -472,6 +574,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
         "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
     return nullptr;
   }
+  assert(c->output_level() <=
+         vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
   if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
           true &&
@@ -699,22 +803,18 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
       GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
   int start_level = sorted_runs_[start_index].level;
   int output_level;
+  // last level is reserved for the files ingested behind
+  int max_output_level =
+      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
   if (first_index_after == sorted_runs_.size()) {
-    output_level = vstorage_->num_levels() - 1;
+    output_level = max_output_level;
   } else if (sorted_runs_[first_index_after].level == 0) {
     output_level = 0;
   } else {
     output_level = sorted_runs_[first_index_after].level - 1;
   }
-  // last level is reserved for the files ingested behind
-  if (ioptions_.allow_ingest_behind &&
-      (output_level == vstorage_->num_levels() - 1)) {
-    assert(output_level > 1);
-    output_level--;
-  }
-
-  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  std::vector<CompactionInputFiles> inputs(max_output_level + 1);
   for (size_t i = 0; i < inputs.size(); ++i) {
     inputs[i].level = start_level + static_cast<int>(i);
   }
@@ -779,85 +879,67 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
 // Look at overall size amplification. If size amplification
 // exceeds the configured value, then do a compaction
-// of the candidate files all the way upto the earliest
-// base file (overrides configured values of file-size ratios,
-// min_merge_width and max_merge_width).
-//
+// on the longest span of candidate files that does not conflict with other
+// compactions, ending at the earliest base file (overriding configured values
+// of file-size ratios, min_merge_width and max_merge_width).
Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { - // percentage flexibility while reducing size amplification - uint64_t ratio = mutable_cf_options_.compaction_options_universal - .max_size_amplification_percent; - - unsigned int candidate_count = 0; - uint64_t candidate_size = 0; - size_t start_index = 0; - const SortedRun* sr = nullptr; - assert(!sorted_runs_.empty()); - if (sorted_runs_.back().being_compacted) { + + const size_t end_index = ShouldSkipLastSortedRunForSizeAmpCompaction() + ? sorted_runs_.size() - 2 + : sorted_runs_.size() - 1; + if (sorted_runs_[end_index].being_compacted) { return nullptr; } + const uint64_t base_sr_size = sorted_runs_[end_index].size; + size_t start_index = end_index; + uint64_t candidate_size = 0; + size_t num_l0_files = 0; - // Skip files that are already being compacted - for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) { - sr = &sorted_runs_[loop]; - if (!sr->being_compacted) { - start_index = loop; // Consider this as the first candidate. + // Get longest span (i.e, [start_index, end_index]) of available sorted runs + while (start_index > 0) { + const SortedRun* sr = &sorted_runs_[start_index - 1]; + if (sr->being_compacted) { + char file_num_buf[kFormatFileNumberBufSize]; + sr->Dump(file_num_buf, sizeof(file_num_buf), true); + ROCKS_LOG_BUFFER( + log_buffer_, + "[%s] Universal: stopping at sorted run undergoing compaction: " + "%s[%" ROCKSDB_PRIszt "]", + cf_name_.c_str(), file_num_buf, start_index - 1); break; } - char file_num_buf[kFormatFileNumberBufSize]; - sr->Dump(file_num_buf, sizeof(file_num_buf), true); - ROCKS_LOG_BUFFER(log_buffer_, - "[%s] Universal: skipping %s[%d] compacted %s", - cf_name_.c_str(), file_num_buf, loop, - " cannot be a candidate to reduce size amp.\n"); - sr = nullptr; + candidate_size += sr->compensated_file_size; + num_l0_files += sr->level == 0 ? 1 : 0; + --start_index; } - if (sr == nullptr) { - return nullptr; // no candidate files + if (start_index == end_index) { + return nullptr; } + + { + const size_t num_l0_to_exclude = MightExcludeNewL0sToReduceWriteStop( + num_l0_files, end_index, start_index, candidate_size); + ROCKS_LOG_BUFFER(log_buffer_, + "[%s] Universal: Excluding %" ROCKSDB_PRIszt + " latest L0 files to reduce potential write stop " + "triggered by `level0_stop_writes_trigger`", + cf_name_.c_str(), num_l0_to_exclude); + } + { char file_num_buf[kFormatFileNumberBufSize]; - sr->Dump(file_num_buf, sizeof(file_num_buf), true); + sorted_runs_[start_index].Dump(file_num_buf, sizeof(file_num_buf), true); ROCKS_LOG_BUFFER( log_buffer_, "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s", cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); } - // size of the base sorted run for size amp calculation - uint64_t base_sr_size = sorted_runs_.back().size; - size_t sr_end_idx = sorted_runs_.size() - 1; - // If tiered compaction is enabled and the last sorted run is the last level - if (ioptions_.preclude_last_level_data_seconds > 0 && - ioptions_.num_levels > 2 && - sorted_runs_.back().level == ioptions_.num_levels - 1 && - sorted_runs_.size() > 1) { - sr_end_idx = sorted_runs_.size() - 2; - base_sr_size = sorted_runs_[sr_end_idx].size; - } - - // keep adding up all the remaining files - for (size_t loop = start_index; loop < sr_end_idx; loop++) { - sr = &sorted_runs_[loop]; - if (sr->being_compacted) { - // TODO with incremental compaction is supported, we might want to - // schedule some incremental compactions in parallel if needed. 
- char file_num_buf[kFormatFileNumberBufSize]; - sr->Dump(file_num_buf, sizeof(file_num_buf), true); - ROCKS_LOG_BUFFER( - log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s", - cf_name_.c_str(), file_num_buf, start_index, - " is already being compacted. No size amp reduction possible.\n"); - return nullptr; - } - candidate_size += sr->compensated_file_size; - candidate_count++; - } - if (candidate_count == 0) { - return nullptr; - } + // percentage flexibility while reducing size amplification + const uint64_t ratio = mutable_cf_options_.compaction_options_universal + .max_size_amplification_percent; // size amplification = percentage of additional size if (candidate_size * 100 < ratio * base_sr_size) { @@ -894,7 +976,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() { } } return PickCompactionWithSortedRunRange( - start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification); + start_index, end_index, CompactionReason::kUniversalSizeAmplification); } Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp( @@ -1193,8 +1275,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { return nullptr; } + int max_output_level = + vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind); // Pick the first non-empty level after the start_level - for (output_level = start_level + 1; output_level < vstorage_->num_levels(); + for (output_level = start_level + 1; output_level <= max_output_level; output_level++) { if (vstorage_->NumLevelFiles(output_level) != 0) { break; @@ -1202,9 +1286,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { } // If all higher levels are empty, pick the highest level as output level - if (output_level == vstorage_->num_levels()) { + if (output_level > max_output_level) { if (start_level == 0) { - output_level = vstorage_->num_levels() - 1; + output_level = max_output_level; } else { // If start level is non-zero and all higher levels are empty, this // compaction will translate into a trivial move. Since the idea is @@ -1213,11 +1297,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() { return nullptr; } } - if (ioptions_.allow_ingest_behind && - output_level == vstorage_->num_levels() - 1) { - assert(output_level > 1); - output_level--; - } + assert(output_level <= max_output_level); if (output_level != 0) { if (start_level == 0) { @@ -1294,8 +1374,9 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( uint32_t path_id = GetPathId(ioptions_, mutable_cf_options_, estimated_total_size); int start_level = sorted_runs_[start_index].level; - - std::vector inputs(vstorage_->num_levels()); + int max_output_level = + vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind); + std::vector inputs(max_output_level + 1); for (size_t i = 0; i < inputs.size(); ++i) { inputs[i].level = start_level + static_cast(i); } @@ -1332,13 +1413,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange( int output_level; if (end_index == sorted_runs_.size() - 1) { - // output files at the last level, unless it's reserved - output_level = vstorage_->num_levels() - 1; - // last level is reserved for the files ingested behind - if (ioptions_.allow_ingest_behind) { - assert(output_level > 1); - output_level--; - } + output_level = max_output_level; } else { // if it's not including all sorted_runs, it can only output to the level // above the `end_index + 1` sorted_run. 
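The repeated `allow_ingest_behind` special-casing removed in the hunks above is now centralized in `VersionStorageInfo::MaxOutputLevel()`, whose definition is not part of this diff. The sketch below shows its presumed behavior, inferred from these call sites and the removed code; treat it as an assumption, not the actual implementation.

// Sketch only: inferred from the call sites in this diff.
// With ingest-behind enabled, the bottommost level stays reserved for
// ingested files, so compactions may output at most to num_levels() - 2.
int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
  if (allow_ingest_behind) {
    assert(num_levels() > 1);
    return num_levels() - 2;
  }
  return num_levels() - 1;
}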
@@ -1450,5 +1525,3 @@ uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const { } } } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index 558733195d85..cb1605969295 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#ifndef ROCKSDB_LITE #include "db/compaction/compaction_picker.h" @@ -29,4 +28,3 @@ class UniversalCompactionPicker : public CompactionPicker { const VersionStorageInfo* vstorage) const override; }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index 1f6c0b710108..3149bb500258 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -16,7 +16,6 @@ #include "options/options_helper.h" #include "rocksdb/utilities/options_type.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { class SubcompactionState; @@ -832,4 +831,3 @@ bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other, #endif // NDEBUG } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/db/compaction/compaction_service_test.cc b/db/compaction/compaction_service_test.cc index c475c4e3be03..7c87f88d1be0 100644 --- a/db/compaction/compaction_service_test.cc +++ b/db/compaction/compaction_service_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/db_test_util.h" #include "port/stack_trace.h" @@ -929,7 +928,7 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props)); @@ -953,14 +952,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/db/compaction/file_pri.h b/db/compaction/file_pri.h index 82dddcf9384a..e60d73e88d4e 100644 --- a/db/compaction/file_pri.h +++ b/db/compaction/file_pri.h @@ -53,8 +53,10 @@ class FileTtlBooster { enabled_ = true; uint64_t all_boost_start_age = ttl / 2; uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age; + // TODO(cbi): more reasonable algorithm that gives different values + // when num_non_empty_levels - level - 1 > 63. uint64_t boost_age_range = - all_boost_age_range >> (num_non_empty_levels - level - 1); + all_boost_age_range >> std::min(63, num_non_empty_levels - level - 1); boost_age_start_ = all_boost_start_age + boost_age_range; const uint64_t kBoostRatio = 16; // prevent 0 value to avoid divide 0 error. 
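The file_pri.h change above fixes an undefined shift: in C++, shifting a 64-bit value by 64 or more bits is undefined behavior, and the old expression used a shift count of `num_non_empty_levels - level - 1`, which exceeds 63 for very deep LSM trees. The new CompactionPickerTest.FileTtlBoosterLargeNumLevels test covers exactly that case with 100 non-empty levels. A minimal sketch of the clamped computation, using a hypothetical helper name for illustration:

#include <algorithm>
#include <cstdint>

// Clamp the shift count to 63 so very deep trees (e.g. 100 non-empty levels)
// no longer trigger undefined behavior; the boost age range simply collapses
// toward zero for levels that are far from the bottom of the tree.
uint64_t BoostAgeRange(uint64_t all_boost_age_range, int num_non_empty_levels,
                       int level) {
  return all_boost_age_range >> std::min(63, num_non_empty_levels - level - 1);
}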
diff --git a/db/compaction/sst_partitioner.cc b/db/compaction/sst_partitioner.cc index 9e7f9fa89252..2f4d87935724 100644 --- a/db/compaction/sst_partitioner.cc +++ b/db/compaction/sst_partitioner.cc @@ -15,11 +15,9 @@ namespace ROCKSDB_NAMESPACE { static std::unordered_map sst_fixed_prefix_type_info = { -#ifndef ROCKSDB_LITE {"length", {0, OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len) @@ -58,7 +56,6 @@ std::shared_ptr NewSstPartitionerFixedPrefixFactory( return std::make_shared(prefix_len); } -#ifndef ROCKSDB_LITE namespace { static int RegisterSstPartitionerFactories(ObjectLibrary& library, const std::string& /*arg*/) { @@ -73,18 +70,14 @@ static int RegisterSstPartitionerFactories(ObjectLibrary& library, return 1; } } // namespace -#endif // ROCKSDB_LITE Status SstPartitionerFactory::CreateFromString( const ConfigOptions& options, const std::string& value, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE - return LoadSharedObject(options, value, nullptr, - result); + return LoadSharedObject(options, value, result); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/compaction/subcompaction_state.h b/db/compaction/subcompaction_state.h index c748be31bb53..b933a62a51fa 100644 --- a/db/compaction/subcompaction_state.h +++ b/db/compaction/subcompaction_state.h @@ -84,6 +84,11 @@ class SubcompactionState { // Assign range dels aggregator, for each range_del, it can only be assigned // to one output level, for per_key_placement, it's going to be the // penultimate level. + // TODO: This does not work for per_key_placement + user-defined timestamp + + // DeleteRange() combo. If user-defined timestamp is enabled, + // it is possible for a range tombstone to belong to bottommost level ( + // seqno < earliest snapshot) without being dropped (garbage collection + // for user-defined timestamp). void AssignRangeDelAggregator( std::unique_ptr&& range_del_agg) { if (compaction->SupportsPerKeyPlacement()) { @@ -99,7 +104,6 @@ class SubcompactionState { penultimate_level_outputs_.RemoveLastEmptyOutput(); } -#ifndef ROCKSDB_LITE void BuildSubcompactionJobInfo( SubcompactionJobInfo& subcompaction_job_info) const { const Compaction* c = compaction; @@ -113,7 +117,6 @@ class SubcompactionState { subcompaction_job_info.output_level = c->output_level(); subcompaction_job_info.stats = compaction_job_stats; } -#endif // !ROCKSDB_LITE SubcompactionState() = delete; SubcompactionState(const SubcompactionState&) = delete; diff --git a/db/compaction/tiered_compaction_test.cc b/db/compaction/tiered_compaction_test.cc index f4837dcf9b3b..779b980d825d 100644 --- a/db/compaction/tiered_compaction_test.cc +++ b/db/compaction/tiered_compaction_test.cc @@ -17,7 +17,6 @@ namespace ROCKSDB_NAMESPACE { -#if !defined(ROCKSDB_LITE) class TieredCompactionTest : public DBTestBase, public testing::WithParamInterface { @@ -210,7 +209,7 @@ TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) { seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); expect_stats[0].Add(kBasicFlushStats); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // the penultimate level file temperature is not cold, all data are output to // the penultimate level. 
@@ -375,7 +374,7 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) { ASSERT_OK(Flush()); expect_stats[0].Add(kBasicFlushStats); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); @@ -446,8 +445,8 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact( - true)); // make sure the compaction is able to finish + // make sure the compaction is able to finish + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); @@ -912,7 +911,7 @@ TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_OK(Flush()); expect_stats[0].Add(kBasicFlushStats); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // non last level is hot ASSERT_EQ("0,1", FilesPerLevel()); @@ -955,7 +954,7 @@ TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_OK(Flush()); seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); @@ -1006,7 +1005,7 @@ TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) { ASSERT_OK(Flush()); seq_history.emplace_back(dbfull()->GetLatestSequenceNumber()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); latest_cold_seq = seq_history[0]; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -1112,6 +1111,7 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { options.num_levels = kNumLevels; options.statistics = CreateDBStatistics(); options.max_subcompactions = 10; + options.preclude_last_level_data_seconds = 10000; DestroyAndReopen(options); auto cmp = options.comparator; @@ -1135,7 +1135,7 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); @@ -1202,20 +1202,126 @@ TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) { ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0); - ASSERT_EQ( options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE), 1); + + // Tests that we only compact keys up to penultimate level + // that are within penultimate level input's internal key range. + { + MutexLock l(&mutex); + hot_start = Key(0); + hot_end = Key(100); + } + const Snapshot* temp_snap = db_->GetSnapshot(); + // Key(0) and Key(1) here are inserted with higher sequence number + // than Key(0) and Key(1) inserted above. + // Only Key(0) in last level will be compacted up, not Key(1). 
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0))); + ASSERT_OK(Put(Key(1), "value" + std::to_string(100))); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel()); + { + std::vector metas; + db_->GetLiveFilesMetaData(&metas); + for (const auto& f : metas) { + if (f.temperature == Temperature::kUnknown) { + // Expect Key(0), Key(0), Key(1) + ASSERT_EQ(f.num_entries, 3); + ASSERT_EQ(f.smallestkey, Key(0)); + ASSERT_EQ(f.largestkey, Key(1)); + } else { + ASSERT_EQ(f.temperature, Temperature::kCold); + // Key(2)-Key(49) and Key(100). + ASSERT_EQ(f.num_entries, 50); + } + } + } + db_->ReleaseSnapshot(temp_snap); } INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest, testing::Bool()); +TEST_P(TieredCompactionTest, CheckInternalKeyRange) { + // When compacting keys from the last level to penultimate level, + // output to penultimate level should be within internal key range + // of input files from penultimate level. + // Set up: + // L5: + // File 1: DeleteRange[1, 3)@4, File 2: [3@5, 100@6] + // L6: + // File 3: [2@1, 3@2], File 4: [50@3] + // + // When File 1 and File 3 are being compacted, + // Key(3) cannot be compacted up, otherwise it causes + // inconsistency where File 3's Key(3) has a lower sequence number + // than File 2's Key(3). + const int kNumLevels = 7; + auto options = CurrentOptions(); + SetColdTemperature(options); + options.level_compaction_dynamic_level_bytes = true; + options.num_levels = kNumLevels; + options.statistics = CreateDBStatistics(); + options.max_subcompactions = 10; + options.preclude_last_level_data_seconds = 10000; + DestroyAndReopen(options); + auto cmp = options.comparator; + + std::string hot_start = Key(0); + std::string hot_end = Key(0); + SyncPoint::GetInstance()->SetCallBack( + "CompactionIterator::PrepareOutput.context", [&](void* arg) { + auto context = static_cast(arg); + context->output_to_penultimate_level = + cmp->Compare(context->key, hot_start) >= 0 && + cmp->Compare(context->key, hot_end) < 0; + }); + SyncPoint::GetInstance()->EnableProcessing(); + // File 1 + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Put(Key(3), "val3")); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + // File 2 + ASSERT_OK(Put(Key(50), "val50")); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + const Snapshot* snapshot = db_->GetSnapshot(); + hot_end = Key(100); + std::string start = Key(1); + std::string end = Key(3); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end)); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // File 3 + ASSERT_OK(Put(Key(3), "vall")); + ASSERT_OK(Put(Key(100), "val100")); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // Try to compact keys up + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + start = Key(1); + end = Key(2); + Slice begin_slice(start); + Slice end_slice(end); + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + // Without internal key range checking, we get the following error: + // Corruption: force_consistency_checks(DEBUG): VersionBuilder: L5 has + // overlapping ranges: file #18 largest key: '6B6579303030303033' seq:102, + // type:1 vs. 
file #15 smallest key: '6B6579303030303033' seq:104, type:1 + db_->ReleaseSnapshot(snapshot); +} + class PrecludeLastLevelTest : public DBTestBase { public: PrecludeLastLevelTest() : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) { mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_clock_->SetCurrentTime(kMockStartTime); mock_env_ = std::make_unique(env_, mock_clock_); } @@ -1223,6 +1329,10 @@ class PrecludeLastLevelTest : public DBTestBase { std::unique_ptr mock_env_; std::shared_ptr mock_clock_; + // Sufficient starting time that preserve time doesn't under-flow into + // pre-history + static constexpr uint32_t kMockStartTime = 10000000; + void SetUp() override { mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( @@ -1231,7 +1341,7 @@ class PrecludeLastLevelTest : public DBTestBase { reinterpret_cast(arg); periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); }); - mock_clock_->SetCurrentTime(0); + mock_clock_->SetCurrentTime(kMockStartTime); } }; @@ -1249,11 +1359,6 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); - int sst_num = 0; // Write files that are overlap and enough to trigger compaction for (; sst_num < kNumTrigger; sst_num++) { @@ -1265,7 +1370,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // all data is pushed to the last level ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -1311,11 +1416,6 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); - int sst_num = 0; // Write files that are overlap and enough to trigger compaction for (; sst_num < kNumTrigger; sst_num++) { @@ -1327,7 +1427,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // all data is pushed to the last level ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -1360,7 +1460,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) { }); } ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // all data is moved up to the penultimate level @@ -1387,11 +1487,6 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); - int sst_num = 0; // Write files that are overlap and enough to trigger compaction for (; sst_num < kNumTrigger; sst_num++) { @@ -1403,7 
+1498,7 @@ TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // all data is pushed to the last level ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -1514,11 +1609,6 @@ TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); - int sst_num = 0; // Write files that are overlap and enough to trigger compaction for (; sst_num < kNumTrigger; sst_num++) { @@ -1530,7 +1620,7 @@ TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // all data is pushed to the last level ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -1592,11 +1682,6 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); - Random rnd(301); int sst_num = 0; // Write files that are overlap and enough to trigger compaction @@ -1609,7 +1694,7 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // all data is pushed to the last level ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); @@ -1705,7 +1790,7 @@ TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) { manual_compaction_thread.join(); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (enable_preclude_last_level) { ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel()); @@ -1841,7 +1926,7 @@ TEST_P(PrecludeLastLevelTestWithParms, PeriodicCompactionToPenultimateLevel) { } ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); stop_token.reset(); @@ -1906,11 +1991,6 @@ TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); - Random rnd(301); for (int i = 0; i < 300; i++) { @@ -1940,7 +2020,7 @@ TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) { ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // L5: [0,19] [20,39] [40,299] // L6: [0, 299] @@ -2017,12 +2097,6 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { options.target_file_size_base = kFileBytes; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun([&] { - 
mock_clock_->MockSleepForSeconds(static_cast(kSecondsPerKey)); - }); - // Flush an L0 file with the following contents (new to old): // // Range deletions [4, 6) [7, 8) [9, 11) @@ -2106,7 +2180,7 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { Slice begin_key(begin_key_buf), end_key(end_key_buf); ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key, &end_key)); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel()); ASSERT_EQ(1, per_key_comp_num); verify_db(); @@ -2116,7 +2190,7 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { db_->ReleaseSnapshot(snap2); ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key, &end_key)); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,3,3", FilesPerLevel()); ASSERT_EQ(2, per_key_comp_num); verify_db(); @@ -2126,7 +2200,7 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { db_->ReleaseSnapshot(snap1); ASSERT_OK(db_->SuggestCompactRange(db_->DefaultColumnFamily(), &begin_key, &end_key)); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel()); ASSERT_EQ(3, per_key_comp_num); verify_db(); @@ -2139,18 +2213,10 @@ TEST_F(PrecludeLastLevelTest, RangeDelsCauseFileEndpointsToOverlap) { Close(); } -#endif // !defined(ROCKSDB_LITE) - } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { -#if !defined(ROCKSDB_LITE) ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - (void)argc; - (void)argv; - return 0; -#endif } diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index e5e3493b3e6b..0bf79bef1917 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -77,7 +77,7 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, for (int i = 0; i < num_writes; i++) { if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) { - db->Flush(FlushOptions()); + ASSERT_OK(db->Flush(FlushOptions())); } int type = rnd->Uniform(2); @@ -156,6 +156,7 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, if (map.find(key) == map.end()) { ASSERT_TRUE(status.IsNotFound()); } else { + ASSERT_OK(status); ASSERT_EQ(map[key], result); } break; @@ -164,6 +165,7 @@ void DoRandomIteraratorTest(DB* db, std::vector source_strings, AssertItersEqual(iter.get(), result_iter.get()); is_valid = iter->Valid(); } + ASSERT_OK(iter->status()); } class DoubleComparator : public Comparator { diff --git a/db/convenience.cc b/db/convenience.cc index 6344d356df35..08bddc8e8f62 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -4,10 +4,10 @@ // (found in the LICENSE.Apache file in the root directory). 
// -#ifndef ROCKSDB_LITE #include "rocksdb/convenience.h" +#include "db/convenience_impl.h" #include "db/db_impl/db_impl.h" #include "util/cast_util.h" @@ -34,13 +34,31 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { - return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + return VerifySstFileChecksum(options, env_options, read_options, file_path); } Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, - const ReadOptions& read_options, + const ReadOptions& _read_options, const std::string& file_path, const SequenceNumber& largest_seqno) { + if (_read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Can only call VerifySstFileChecksum with `ReadOptions::io_activity` " + "is " + "`Env::IOActivity::kUnknown`"); + } + ReadOptions read_options(_read_options); + return VerifySstFileChecksumInternal(options, env_options, read_options, + file_path, largest_seqno); +} + +Status VerifySstFileChecksumInternal(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path, + const SequenceNumber& largest_seqno) { std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); @@ -57,17 +75,18 @@ Status VerifySstFileChecksum(const Options& options, std::unique_ptr file_reader( new RandomAccessFileReader( std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */, - nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - ioptions.rate_limiter.get())); + ioptions.stats /* stats */, + Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, ioptions.rate_limiter.get())); const bool kImmortal = true; auto reader_options = TableReaderOptions( ioptions, options.prefix_extractor, env_options, internal_comparator, - false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */, - -1 /* level */); + options.block_protection_bytes_per_key, false /* skip_filters */, + !kImmortal, false /* force_direct_prefetch */, -1 /* level */); reader_options.largest_seqno = largest_seqno; s = ioptions.table_factory->NewTableReader( - reader_options, std::move(file_reader), file_size, &table_reader, - false /* prefetch_index_and_filter_in_cache */); + read_options, reader_options, std::move(file_reader), file_size, + &table_reader, false /* prefetch_index_and_filter_in_cache */); if (!s.ok()) { return s; } @@ -77,5 +96,3 @@ Status VerifySstFileChecksum(const Options& options, } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/db/convenience_impl.h b/db/convenience_impl.h new file mode 100644 index 000000000000..32f4476bde99 --- /dev/null +++ b/db/convenience_impl.h @@ -0,0 +1,15 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#pragma once +#include "rocksdb/db.h" + +namespace ROCKSDB_NAMESPACE { +Status VerifySstFileChecksumInternal(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path, + const SequenceNumber& largest_seqno = 0); +} // namespace ROCKSDB_NAMESPACE diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 7544d098c440..d1cb022588f9 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #include "rocksdb/options.h" -#ifndef ROCKSDB_LITE #include #include @@ -43,33 +42,36 @@ namespace ROCKSDB_NAMESPACE { static constexpr int kValueSize = 1000; namespace { // A wrapper that allows injection of errors. -class ErrorEnv : public EnvWrapper { +class ErrorFS : public FileSystemWrapper { public: bool writable_file_error_; int num_writable_file_errors_; - explicit ErrorEnv(Env* _target) - : EnvWrapper(_target), + explicit ErrorFS(const std::shared_ptr& _target) + : FileSystemWrapper(_target), writable_file_error_(false), num_writable_file_errors_(0) {} const char* Name() const override { return "ErrorEnv"; } - virtual Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& soptions) override { + virtual IOStatus NewWritableFile(const std::string& fname, + const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { result->reset(); if (writable_file_error_) { ++num_writable_file_errors_; - return Status::IOError(fname, "fake error"); + return IOStatus::IOError(fname, "fake error"); } - return target()->NewWritableFile(fname, result, soptions); + return target()->NewWritableFile(fname, opts, result, dbg); } }; } // anonymous namespace class CorruptionTest : public testing::Test { public: std::shared_ptr env_guard_; - ErrorEnv* env_; + std::shared_ptr fs_; + std::unique_ptr env_; + Env* base_env_; std::string dbname_; std::shared_ptr tiny_cache_; Options options_; @@ -80,14 +82,15 @@ class CorruptionTest : public testing::Test { // set it to 0), test SequenceNumberRecovery will fail, likely because of a // bug in recovery code. Keep it 4 for now to make the test passes. tiny_cache_ = NewLRUCache(100, 4); - Env* base_env = Env::Default(); + base_env_ = Env::Default(); EXPECT_OK( - test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_)); - EXPECT_NE(base_env, nullptr); - env_ = new ErrorEnv(base_env); + test::CreateEnvFromSystem(ConfigOptions(), &base_env_, &env_guard_)); + EXPECT_NE(base_env_, nullptr); + fs_.reset(new ErrorFS(base_env_->GetFileSystem())); + env_ = NewCompositeEnv(fs_); options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; - options_.env = env_; - dbname_ = test::PerThreadDBPath(env_, "corruption_test"); + options_.env = env_.get(); + dbname_ = test::PerThreadDBPath(env_.get(), "corruption_test"); Status s = DestroyDB(dbname_, options_); EXPECT_OK(s); @@ -110,10 +113,9 @@ class CorruptionTest : public testing::Test { fprintf(stdout, "db is still at %s\n", dbname_.c_str()); } else { Options opts; - opts.env = env_->target(); + opts.env = base_env_; EXPECT_OK(DestroyDB(dbname_, opts)); } - delete env_; } void CloseDb() { @@ -128,7 +130,7 @@ class CorruptionTest : public testing::Test { if (opt.env == Options().env) { // If env is not overridden, replace it with ErrorEnv. // Otherwise, the test already uses a non-default Env. 
- opt.env = env_; + opt.env = env_.get(); } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; @@ -165,6 +167,10 @@ class CorruptionTest : public testing::Test { void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); } void Check(int min_expected, int max_expected) { + Check(min_expected, max_expected, ReadOptions(false, true)); + } + + void Check(int min_expected, int max_expected, ReadOptions read_options) { uint64_t next_expected = 0; uint64_t missed = 0; int bad_keys = 0; @@ -176,7 +182,7 @@ class CorruptionTest : public testing::Test { // Instead, we want the reads to be successful and this test // will detect whether the appropriate corruptions have // occurred. - Iterator* iter = db_->NewIterator(ReadOptions(false, true)); + Iterator* iter = db_->NewIterator(read_options); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); uint64_t key; @@ -223,7 +229,7 @@ class CorruptionTest : public testing::Test { } ASSERT_TRUE(!fname.empty()) << filetype; - ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt, + ASSERT_OK(test::CorruptFile(env_.get(), fname, offset, bytes_to_corrupt, /*verify_checksum*/ filetype == kTableFile)); } @@ -234,7 +240,7 @@ class CorruptionTest : public testing::Test { db_->GetLiveFilesMetaData(&metadata); for (const auto& m : metadata) { if (m.level == level) { - ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset, + ASSERT_OK(test::CorruptFile(env_.get(), dbname_ + "/" + m.name, offset, bytes_to_corrupt)); return; } @@ -308,7 +314,7 @@ class CorruptionTest : public testing::Test { if (bytes_to_truncate == 0) { new_size = 0; } - ASSERT_OK(test::TruncateFile(env_, path, new_size)); + ASSERT_OK(test::TruncateFile(env_.get(), path, new_size)); } }; @@ -402,14 +408,14 @@ TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) { } TEST_F(CorruptionTest, RecoverWriteError) { - env_->writable_file_error_ = true; + fs_->writable_file_error_ = true; Status s = TryReopen(); ASSERT_TRUE(!s.ok()); } TEST_F(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction - env_->writable_file_error_ = true; + fs_->writable_file_error_ = true; const int num = static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; @@ -425,8 +431,8 @@ TEST_F(CorruptionTest, NewFileErrorDuringWrite) { ASSERT_TRUE(!failed || !s.ok()); } ASSERT_TRUE(!s.ok()); - ASSERT_GE(env_->num_writable_file_errors_, 1); - env_->writable_file_error_ = false; + ASSERT_GE(fs_->num_writable_file_errors_, 1); + fs_->writable_file_error_ = false; Reopen(); } @@ -444,7 +450,8 @@ TEST_F(CorruptionTest, TableFile) { TEST_F(CorruptionTest, VerifyChecksumReadahead) { Options options; - SpecialEnv senv(env_->target()); + options.level_compaction_dynamic_level_bytes = false; + SpecialEnv senv(base_env_); options.env = &senv; // Disable block cache as we are going to check checksum for // the same file twice and measure number of reads. 
@@ -497,6 +504,7 @@ TEST_F(CorruptionTest, VerifyChecksumReadahead) { TEST_F(CorruptionTest, TableFileIndexData) { Options options; + options.level_compaction_dynamic_level_bytes = false; // very big, we'll trigger flushes manually options.write_buffer_size = 100 * 1024 * 1024; Reopen(&options); @@ -512,7 +520,7 @@ TEST_F(CorruptionTest, TableFileIndexData) { dbi = static_cast_with_check(db_); // one full file may be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted - Check(0, 5000); + Check(0, 5000, ReadOptions(true, true)); ASSERT_NOK(dbi->VerifyChecksum()); // In paranoid mode, the db cannot be opened due to the corrupted file. @@ -587,8 +595,8 @@ TEST_F(CorruptionTest, TableFileWrongSize) { // Make the file smaller with truncation. // First leaving a partial footer, and then completely removing footer. for (size_t bytes_lost : {8, 100}) { - ASSERT_OK( - test::TruncateFile(env_, filename, metadata[0].size - bytes_lost)); + ASSERT_OK(test::TruncateFile(env_.get(), filename, + metadata[0].size - bytes_lost)); // Reported well with paranoid checks options_.paranoid_checks = true; @@ -653,7 +661,8 @@ TEST_F(CorruptionTest, CorruptedDescriptor) { TEST_F(CorruptionTest, CompactionInputError) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); Reopen(&options); Build(10); DBImpl* dbi = static_cast_with_check(db_); @@ -674,7 +683,8 @@ TEST_F(CorruptionTest, CompactionInputError) { TEST_F(CorruptionTest, CompactionInputErrorParanoid) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; @@ -756,12 +766,14 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) { fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; - ASSERT_OK(FindMetaBlockInFile( - file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle)); + const ReadOptions read_options; + ASSERT_OK(FindMetaBlockInFile(file_reader.get(), file_size, + kBlockBasedTableMagicNumber, + ImmutableOptions(options_), read_options, + kRangeDelBlockName, &range_del_handle)); ASSERT_OK(TryReopen()); - ASSERT_OK(test::CorruptFile(env_, filename, + ASSERT_OK(test::CorruptFile(env_.get(), filename, static_cast(range_del_handle.offset()), 1)); ASSERT_TRUE(TryReopen().IsCorruption()); } @@ -769,7 +781,8 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) { TEST_F(CorruptionTest, FileSystemStateCorrupted) { for (int iter = 0; iter < 2; ++iter) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.paranoid_checks = true; options.create_if_missing = true; Reopen(&options); @@ -808,7 +821,8 @@ static const auto& corruption_modes = { TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; @@ -836,7 +850,8 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) { TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); 
options.paranoid_file_checks = true; options.create_if_missing = true; options.check_flush_compaction_key_order = false; @@ -869,7 +884,8 @@ TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) { TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; @@ -905,7 +921,8 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) { TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; @@ -944,7 +961,8 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) { TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.check_flush_compaction_key_order = false; options.paranoid_file_checks = true; options.create_if_missing = true; @@ -980,7 +998,8 @@ TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) { TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.create_if_missing = true; options.allow_data_in_errors = true; auto mode = mock::MockTableFactory::kCorruptKey; @@ -1009,7 +1028,8 @@ TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) { TEST_F(CorruptionTest, CompactionKeyOrderCheck) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.paranoid_file_checks = false; options.create_if_missing = true; options.check_flush_compaction_key_order = false; @@ -1036,7 +1056,8 @@ TEST_F(CorruptionTest, CompactionKeyOrderCheck) { TEST_F(CorruptionTest, FlushKeyOrderCheck) { Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); options.paranoid_file_checks = false; options.create_if_missing = true; ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}})); @@ -1089,7 +1110,8 @@ TEST_F(CorruptionTest, DisableKeyOrderCheck) { TEST_F(CorruptionTest, VerifyWholeTableChecksum) { CloseDb(); Options options; - options.env = env_; + options.level_compaction_dynamic_level_bytes = false; + options.env = env_.get(); ASSERT_OK(DestroyDB(dbname_, options)); options.create_if_missing = true; options.file_checksum_gen_factory = @@ -1174,11 +1196,12 @@ INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest, TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { CloseDb(); Options options; + options.level_compaction_dynamic_level_bytes = false; options.track_and_verify_wals_in_manifest = track_and_verify_wals_in_manifest_; options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; options.avoid_flush_during_recovery = false; - options.env = env_; + options.env = env_.get(); ASSERT_OK(DestroyDB(dbname_, options)); options.create_if_missing = true; options.max_write_buffer_number = 8; @@ -1346,11 +1369,12 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) { 
TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { CloseDb(); Options options; + options.level_compaction_dynamic_level_bytes = false; options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; options.track_and_verify_wals_in_manifest = track_and_verify_wals_in_manifest_; options.avoid_flush_during_recovery = false; - options.env = env_; + options.env = env_.get(); ASSERT_OK(DestroyDB(dbname_, options)); options.create_if_missing = true; options.max_write_buffer_number = 3; @@ -1543,9 +1567,10 @@ TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) { TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) { CloseDb(); Options options; + options.level_compaction_dynamic_level_bytes = false; options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; options.avoid_flush_during_recovery = false; - options.env = env_; + options.env = env_.get(); options.create_if_missing = true; ASSERT_OK(DestroyDB(dbname_, options)); @@ -1660,13 +1685,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 868b798ea58f..dec5c05a335d 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" @@ -40,6 +39,7 @@ class CuckooTableDBTest : public testing::Test { Options CurrentOptions() { Options options; + options.level_compaction_dynamic_level_bytes = false; options.table_factory.reset(NewCuckooTableFactory()); options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true)); options.allow_mmap_reads = true; @@ -350,12 +350,3 @@ int main(int argc, char** argv) { } } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index f180d3ff9cb6..ba2609473805 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -20,9 +20,7 @@ #include "rocksdb/utilities/debug.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" -#endif #include "util/file_checksum_helper.h" #include "util/random.h" #include "utilities/counted_fs.h" @@ -98,7 +96,6 @@ TEST_F(DBBasicTest, UniqueSession) { EXPECT_MATCHES_REGEX(sid2, expected); EXPECT_MATCHES_REGEX(sid3, expected); -#ifndef ROCKSDB_LITE Close(); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_OK(db_->GetDbSessionId(sid1)); @@ -113,7 +110,6 @@ TEST_F(DBBasicTest, UniqueSession) { ASSERT_NE(sid1, sid2); ASSERT_EQ(sid2, sid3); -#endif // ROCKSDB_LITE CreateAndReopenWithCF({"goku"}, options); ASSERT_OK(db_->GetDbSessionId(sid1)); @@ -130,7 +126,6 @@ TEST_F(DBBasicTest, UniqueSession) { ASSERT_NE(sid1, sid4); } -#ifndef ROCKSDB_LITE TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); @@ -143,6 +138,7 @@ TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(iter->status()); ++count; } + ASSERT_OK(iter->status()); // Always expect two keys: "foo" and "bar" 
ASSERT_EQ(count, 2); }; @@ -367,7 +363,6 @@ TEST_F(DBBasicTest, LevelLimitReopen) { options.max_bytes_for_level_multiplier_additional.resize(10, 1); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } -#endif // ROCKSDB_LITE TEST_F(DBBasicTest, PutDeleteGet) { do { @@ -429,7 +424,6 @@ TEST_F(DBBasicTest, GetFromVersions) { } while (ChangeOptions()); } -#ifndef ROCKSDB_LITE TEST_F(DBBasicTest, GetSnapshot) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; @@ -450,7 +444,6 @@ TEST_F(DBBasicTest, GetSnapshot) { } } while (ChangeOptions()); } -#endif // ROCKSDB_LITE TEST_F(DBBasicTest, CheckLock) { do { @@ -682,7 +675,27 @@ TEST_F(DBBasicTest, IdentityAcrossRestarts) { } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE +TEST_F(DBBasicTest, LockFileRecovery) { + Options options = CurrentOptions(); + // Regardless of best_efforts_recovery + for (bool ber : {false, true}) { + options.best_efforts_recovery = ber; + DestroyAndReopen(options); + std::string id1, id2; + ASSERT_OK(db_->GetDbIdentity(id1)); + Close(); + + // Should be OK to re-open DB after lock file deleted + std::string lockfilename = LockFileName(dbname_); + ASSERT_OK(env_->DeleteFile(lockfilename)); + Reopen(options); + + // Should be same DB as before + ASSERT_OK(db_->GetDbIdentity(id2)); + ASSERT_EQ(id1, id2); + } +} + TEST_F(DBBasicTest, Snapshot) { env_->SetMockSleep(); anon::OptionsOverride options_override; @@ -754,7 +767,6 @@ TEST_F(DBBasicTest, Snapshot) { } while (ChangeOptions()); } -#endif // ROCKSDB_LITE class DBBasicMultiConfigs : public DBBasicTest, public ::testing::WithParamInterface { @@ -1193,9 +1205,23 @@ TEST_F(DBBasicTest, DBClose) { delete db; ASSERT_EQ(env->GetCloseCount(), 2); + // close by WaitForCompact() with close_db option + options.create_if_missing = false; + s = DB::Open(options, dbname, &db); + ASSERT_OK(s); + ASSERT_TRUE(db != nullptr); + WaitForCompactOptions wait_for_compact_options = WaitForCompactOptions(); + wait_for_compact_options.close_db = true; + s = db->WaitForCompact(wait_for_compact_options); + ASSERT_EQ(env->GetCloseCount(), 3); + // see TestLogger::CloseHelper() + ASSERT_EQ(s, Status::IOError()); + + delete db; + ASSERT_EQ(env->GetCloseCount(), 3); + // Provide our own logger and ensure DB::Close() does not close it options.info_log.reset(new TestEnv::TestLogger(env)); - options.create_if_missing = false; s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -1203,9 +1229,9 @@ TEST_F(DBBasicTest, DBClose) { s = db->Close(); ASSERT_EQ(s, Status::OK()); delete db; - ASSERT_EQ(env->GetCloseCount(), 2); - options.info_log.reset(); ASSERT_EQ(env->GetCloseCount(), 3); + options.info_log.reset(); + ASSERT_EQ(env->GetCloseCount(), 4); } TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) { @@ -1402,10 +1428,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { int retries = 0; bool last_try = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) { - last_try = true; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - }); + "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) { last_try = true; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) { if (last_try) { @@ -1422,8 +1445,28 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { } } }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"DBImpl::MultiGet::AfterLastTryRefSV", + 
"DBMultiGetTestWithParam::MultiGetMultiCFMutex:BeforeCreateSV"}, + {"DBMultiGetTestWithParam::MultiGetMultiCFMutex:AfterCreateSV", + "DBImpl::MultiGet::BeforeLastTryUnRefSV"}, + }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + port::Thread create_sv_thread([this]() { + TEST_SYNC_POINT( + "DBMultiGetTestWithParam::MultiGetMultiCFMutex:BeforeCreateSV"); + // Create a new SuperVersion for each column family after last_try + // of MultiGet ref SuperVersion and before unref it. + for (int i = 0; i < 8; ++i) { + ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", + "cf" + std::to_string(i) + "_val_after_last_try")); + ASSERT_OK(Flush(i)); + } + TEST_SYNC_POINT( + "DBMultiGetTestWithParam::MultiGetMultiCFMutex:AfterCreateSV"); + }); + std::vector cfs; std::vector keys; std::vector values; @@ -1435,6 +1478,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()), std::get<1>(GetParam())); + create_sv_thread.join(); ASSERT_TRUE(last_try); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { @@ -1448,6 +1492,7 @@ TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { @@ -2291,9 +2336,7 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { ASSERT_EQ(multiget_io_batch_size.count, 3); } #else // ROCKSDB_IOURING_PRESENT - if (GetParam()) { - ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); - } + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); #endif // ROCKSDB_IOURING_PRESENT } @@ -2332,11 +2375,15 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); +#ifdef ROCKSDB_IOURING_PRESENT // A batch of 3 async IOs is expected, one for each overlapping file in L1 ASSERT_EQ(multiget_io_batch_size.count, 1); ASSERT_EQ(multiget_io_batch_size.max, 3); #endif // ROCKSDB_IOURING_PRESENT ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); +#else // ROCKSDB_IOURING_PRESENT + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); +#endif // ROCKSDB_IOURING_PRESENT } #ifdef ROCKSDB_IOURING_PRESENT @@ -2520,8 +2567,12 @@ TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) { ASSERT_EQ(values[0], "val_l2_" + std::to_string(19)); ASSERT_EQ(values[1], "val_l2_" + std::to_string(26)); +#ifdef ROCKSDB_IOURING_PRESENT // Bloom filters in L0/L1 will avoid the coroutine calls in those levels ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2); +#else // ROCKSDB_IOURING_PRESENT + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); +#endif // ROCKSDB_IOURING_PRESENT } #ifdef ROCKSDB_IOURING_PRESENT @@ -2612,18 +2663,17 @@ TEST_P(DBMultiGetAsyncIOTest, GetNoIOUring) { dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 3); - ASSERT_EQ(statuses[0], Status::NotSupported()); - ASSERT_EQ(statuses[1], Status::NotSupported()); - ASSERT_EQ(statuses[2], Status::NotSupported()); + ASSERT_EQ(statuses[0], Status::OK()); + ASSERT_EQ(statuses[1], Status::OK()); + ASSERT_EQ(statuses[2], Status::OK()); - HistogramData multiget_io_batch_size; + HistogramData async_read_bytes; - statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); + 
statistics()->histogramData(ASYNC_READ_BYTES, &async_read_bytes); // A batch of 3 async IOs is expected, one for each overlapping file in L1 - ASSERT_EQ(multiget_io_batch_size.count, 1); - ASSERT_EQ(multiget_io_batch_size.max, 3); - ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); + ASSERT_EQ(async_read_bytes.count, 0); + ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); } INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest, @@ -2794,7 +2844,6 @@ TEST_P(MultiGetPrefixExtractorTest, Batched) { INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest, ::testing::Bool()); -#ifndef ROCKSDB_LITE class DBMultiGetRowCacheTest : public DBBasicTest, public ::testing::WithParamInterface {}; @@ -2949,7 +2998,6 @@ TEST_F(DBBasicTest, ValueTypeString) { ASSERT_TRUE(key_version.GetTypeName() != "Invalid"); } } -#endif // !ROCKSDB_LITE TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { Options options = CurrentOptions(); @@ -3050,7 +3098,6 @@ TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { SyncPoint::GetInstance()->ClearAllCallBacks(); } -#ifndef ROCKSDB_LITE namespace { class TableFileListener : public EventListener { public: @@ -3325,7 +3372,6 @@ TEST_F(DBBasicTest, DisableTrackWal) { ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); Close(); } -#endif // !ROCKSDB_LITE TEST_F(DBBasicTest, ManifestChecksumMismatch) { Options options = CurrentOptions(); @@ -3369,7 +3415,6 @@ TEST_F(DBBasicTest, ConcurrentlyCloseDB) { } } -#ifndef ROCKSDB_LITE class DBBasicTestTrackWal : public DBTestBase, public testing::WithParamInterface { public: @@ -3425,21 +3470,16 @@ TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal, testing::Bool()); -#endif // ROCKSDB_LITE class DBBasicTestMultiGet : public DBTestBase { public: - DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache, + DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool uncompressed_cache, bool _compression_enabled, bool _fill_cache, uint32_t compression_parallel_threads) : DBTestBase(test_dir, /*env_do_fsync=*/false) { compression_enabled_ = _compression_enabled; fill_cache_ = _fill_cache; - if (compressed_cache) { - std::shared_ptr cache = NewLRUCache(1048576); - compressed_cache_ = std::make_shared(cache); - } if (uncompressed_cache) { std::shared_ptr cache = NewLRUCache(1048576); uncompressed_cache_ = std::make_shared(cache); @@ -3451,7 +3491,6 @@ class DBBasicTestMultiGet : public DBTestBase { Random rnd(301); BlockBasedTableOptions table_options; -#ifndef ROCKSDB_LITE if (compression_enabled_) { std::vector compression_types; compression_types = GetSupportedCompressions(); @@ -3470,12 +3509,6 @@ class DBBasicTestMultiGet : public DBTestBase { compression_enabled_ = false; } } -#else - // GetSupportedCompressions() is not available in LITE build - if (!Snappy_Supported()) { - compression_enabled_ = false; - } -#endif // ROCKSDB_LITE table_options.block_cache = uncompressed_cache_; if (table_options.block_cache == nullptr) { @@ -3483,7 +3516,6 @@ class DBBasicTestMultiGet : public DBTestBase { } else { table_options.pin_l0_filter_and_index_blocks_in_cache = true; } - table_options.block_cache_compressed = compressed_cache_; table_options.flush_block_policy_factory.reset( new MyFlushBlockPolicyFactory()); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -3625,19 +3657,21 @@ class DBBasicTestMultiGet : public DBTestBase { Status 
Insert(const Slice& key, Cache::ObjectPtr value, const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { + Handle** handle = nullptr, Priority priority = Priority::LOW, + const Slice& compressed = Slice(), + CompressionType type = kNoCompression) override { num_inserts_++; - return target_->Insert(key, value, helper, charge, handle, priority); + return target_->Insert(key, value, helper, charge, handle, priority, + compressed, type); } Handle* Lookup(const Slice& key, const CacheItemHelper* helper, CreateContext* create_context, - Priority priority = Priority::LOW, bool wait = true, + Priority priority = Priority::LOW, Statistics* stats = nullptr) override { num_lookups_++; Handle* handle = - target_->Lookup(key, helper, create_context, priority, wait, stats); + target_->Lookup(key, helper, create_context, priority, stats); if (handle != nullptr) { num_found_++; } @@ -3666,16 +3700,14 @@ class DBBasicTestMultiGet : public DBTestBase { std::vector cf_names_; }; -class DBBasicTestWithParallelIO - : public DBBasicTestMultiGet, - public testing::WithParamInterface< - std::tuple> { +class DBBasicTestWithParallelIO : public DBBasicTestMultiGet, + public testing::WithParamInterface< + std::tuple> { public: DBBasicTestWithParallelIO() : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1, std::get<0>(GetParam()), std::get<1>(GetParam()), - std::get<2>(GetParam()), std::get<3>(GetParam()), - std::get<4>(GetParam())) {} + std::get<2>(GetParam()), std::get<3>(GetParam())) {} }; TEST_P(DBBasicTestWithParallelIO, MultiGet) { @@ -3801,7 +3833,6 @@ TEST_P(DBBasicTestWithParallelIO, MultiGet) { } } -#ifndef ROCKSDB_LITE TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { class FakeDirectIOEnv : public EnvWrapper { class FakeDirectIOSequentialFile; @@ -3918,7 +3949,6 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { } Close(); } -#endif // ROCKSDB_LITE TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { std::vector key_data(10); @@ -4006,13 +4036,12 @@ TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) { INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO, // Params are as follows - - // Param 0 - Compressed cache enabled - // Param 1 - Uncompressed cache enabled - // Param 2 - Data compression enabled - // Param 3 - ReadOptions::fill_cache - // Param 4 - CompressionOptions::parallel_threads + // Param 0 - Uncompressed cache enabled + // Param 1 - Data compression enabled + // Param 2 - ReadOptions::fill_cache + // Param 3 - CompressionOptions::parallel_threads ::testing::Combine(::testing::Bool(), ::testing::Bool(), - ::testing::Bool(), ::testing::Bool(), + ::testing::Bool(), ::testing::Values(1, 4))); // Forward declaration @@ -4223,9 +4252,8 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet, DBBasicTestMultiGetDeadline() : DBBasicTestMultiGet( "db_basic_test_multiget_deadline" /*Test dir*/, - 10 /*# of column families*/, false /*compressed cache enabled*/, - true /*uncompressed cache enabled*/, true /*compression enabled*/, - true /*ReadOptions.fill_cache*/, + 10 /*# of column families*/, true /*uncompressed cache enabled*/, + true /*compression enabled*/, true /*ReadOptions.fill_cache*/, 1 /*# of parallel compression threads*/) {} inline void CheckStatus(std::vector& statuses, size_t num_ok) { @@ -4389,6 +4417,8 @@ TEST_F(DBBasicTest, ManifestWriteFailure) { options.create_if_missing = true; options.disable_auto_compactions = true; options.env = env_; + 
options.enable_blob_files = true; + options.blob_file_size = 0; DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); @@ -4409,6 +4439,11 @@ TEST_F(DBBasicTest, ManifestWriteFailure) { SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); + // The IO error was a mocked one from the `AfterSyncManifest` callback. The + // Flush's VersionEdit actually made it into the Manifest. So these keys can + // be read back. Read them to check all live sst files and blob files. + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("value", Get("key")); } TEST_F(DBBasicTest, DestroyDefaultCfHandle) { @@ -4461,7 +4496,6 @@ TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) { SyncPoint::GetInstance()->ClearAllCallBacks(); } -#ifndef ROCKSDB_LITE TEST_F(DBBasicTest, VerifyFileChecksums) { Options options = GetDefaultOptions(); options.create_if_missing = true; @@ -4505,6 +4539,63 @@ TEST_F(DBBasicTest, VerifyFileChecksums) { ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } +TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.env = env_; + options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); + DestroyAndReopen(options); + + Random rnd(301); + int alignment = 256 * 1024; + for (int i = 0; i < 16; ++i) { + ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(alignment))); + } + ASSERT_OK(Flush()); + + std::vector filenames; + int sst_cnt = 0; + std::string sst_name; + uint64_t sst_size; + uint64_t number; + FileType type; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + for (auto name : filenames) { + if (ParseFileName(name, &number, &type)) { + if (type == kTableFile) { + sst_cnt++; + sst_name = name; + } + } + } + ASSERT_EQ(sst_cnt, 1); + ASSERT_OK(env_->GetFileSize(dbname_ + '/' + sst_name, &sst_size)); + + bool last_read = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "GenerateOneFileChecksum::Chunk:0", [&](void* /*arg*/) { + if (env_->random_read_bytes_counter_.load() == sst_size) { + EXPECT_FALSE(last_read); + last_read = true; + } else { + ASSERT_EQ(env_->random_read_bytes_counter_.load() & (alignment - 1), + 0); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + env_->count_random_reads_ = true; + env_->random_read_bytes_counter_ = 0; + env_->random_read_counter_.Reset(); + + ReadOptions ro; + ro.readahead_size = alignment; + ASSERT_OK(db_->VerifyFileChecksums(ro)); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ASSERT_TRUE(last_read); + ASSERT_EQ(env_->random_read_counter_.Read(), + (sst_size + alignment - 1) / (alignment)); +} + // TODO: re-enable after we provide finer-grained control for WAL tracking to // meet the needs of different use cases, durability levels and recovery modes. TEST_F(DBBasicTest, DISABLED_ManualWalSync) { @@ -4527,7 +4618,6 @@ TEST_F(DBBasicTest, DISABLED_ManualWalSync) { ASSERT_TRUE(TryReopen(options).IsCorruption()); } -#endif // !ROCKSDB_LITE // A test class for intercepting random reads and injecting artificial // delays. 
Used for testing the deadline/timeout feature diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 1c45a8aabfe2..4acdc64b2221 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -44,10 +44,6 @@ class DBBlockCacheTest : public DBTestBase { size_t compression_dict_miss_count_ = 0; size_t compression_dict_hit_count_ = 0; size_t compression_dict_insert_count_ = 0; - size_t compressed_miss_count_ = 0; - size_t compressed_hit_count_ = 0; - size_t compressed_insert_count_ = 0; - size_t compressed_failure_count_ = 0; public: const size_t kNumBlocks = 10; @@ -85,14 +81,6 @@ class DBBlockCacheTest : public DBTestBase { hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT); insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD); failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); - compressed_miss_count_ = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); - compressed_hit_count_ = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); - compressed_insert_count_ = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); - compressed_failure_count_ = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); } void RecordCacheCountersForCompressionDict(const Options& options) { @@ -144,30 +132,6 @@ class DBBlockCacheTest : public DBTestBase { compression_dict_insert_count_ = new_compression_dict_insert_count; } - void CheckCompressedCacheCounters(const Options& options, - size_t expected_misses, - size_t expected_hits, - size_t expected_inserts, - size_t expected_failures) { - size_t new_miss_count = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); - size_t new_hit_count = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); - size_t new_insert_count = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); - size_t new_failure_count = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count); - ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count); - ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count); - ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count); - compressed_miss_count_ = new_miss_count; - compressed_hit_count_ = new_hit_count; - compressed_insert_count_ = new_insert_count; - compressed_failure_count_ = new_failure_count; - } - -#ifndef ROCKSDB_LITE const std::array GetCacheEntryRoleCountsBg() { // Verify in cache entry role stats std::array cache_entry_role_counts; @@ -181,7 +145,6 @@ class DBBlockCacheTest : public DBTestBase { } return cache_entry_role_counts; } -#endif // ROCKSDB_LITE }; TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) { @@ -274,84 +237,6 @@ TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { } #ifdef SNAPPY -TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.block_cache_compressed = nullptr; - table_options.block_size = 1; - table_options.filter_policy.reset(NewBloomFilterPolicy(20)); - table_options.cache_index_and_filter_blocks = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.compression = CompressionType::kSnappyCompression; - - DestroyAndReopen(options); - - std::string value(kValueSize, 'a'); - for (size_t i = 0; i < kNumBlocks; 
i++) { - ASSERT_OK(Put(std::to_string(i), value)); - ASSERT_OK(Flush()); - } - - ReadOptions read_options; - std::shared_ptr compressed_cache = NewLRUCache(1 << 25, 0, false); - LRUCacheOptions co; - co.capacity = 0; - co.num_shard_bits = 0; - co.strict_capacity_limit = false; - // Needed not to count entry stats collector - co.metadata_charge_policy = kDontChargeCacheMetadata; - std::shared_ptr cache = NewLRUCache(co); - table_options.block_cache = cache; - table_options.no_block_cache = false; - table_options.block_cache_compressed = compressed_cache; - table_options.max_auto_readahead_size = 0; - table_options.cache_index_and_filter_blocks = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(options); - RecordCacheCounters(options); - - // Load blocks into cache. - for (size_t i = 0; i < kNumBlocks - 1; i++) { - ASSERT_EQ(value, Get(std::to_string(i))); - CheckCacheCounters(options, 1, 0, 1, 0); - CheckCompressedCacheCounters(options, 1, 0, 1, 0); - } - - size_t usage = cache->GetUsage(); - ASSERT_EQ(0, usage); - ASSERT_EQ(usage, cache->GetPinnedUsage()); - size_t compressed_usage = compressed_cache->GetUsage(); - ASSERT_LT(0, compressed_usage); - // Compressed block cache cannot be pinned. - ASSERT_EQ(0, compressed_cache->GetPinnedUsage()); - - // Set strict capacity limit flag. Now block will only load into compressed - // block cache. - cache->SetCapacity(usage); - cache->SetStrictCapacityLimit(true); - ASSERT_EQ(usage, cache->GetPinnedUsage()); - - // Load last key block. - ASSERT_EQ( - "Operation aborted: Memory limit reached: Insert failed due to LRU cache " - "being full.", - Get(std::to_string(kNumBlocks - 1))); - // Failure will also record the miss counter. - CheckCacheCounters(options, 1, 0, 0, 1); - CheckCompressedCacheCounters(options, 1, 0, 1, 0); - - // Clear strict capacity limit flag. This time we shall hit compressed block - // cache and load into block cache. - cache->SetStrictCapacityLimit(false); - // Load last key block. 
- ASSERT_EQ(value, Get(std::to_string(kNumBlocks - 1))); - CheckCacheCounters(options, 1, 0, 1, 0); - CheckCompressedCacheCounters(options, 0, 1, 0, 0); -} namespace { class PersistentCacheFromCache : public PersistentCache { @@ -402,94 +287,23 @@ class PersistentCacheFromCache : public PersistentCache { }; class ReadOnlyCacheWrapper : public CacheWrapper { + public: using CacheWrapper::CacheWrapper; - using Cache::Insert; + const char* Name() const override { return "ReadOnlyCacheWrapper"; } + Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*value*/, const CacheItemHelper* /*helper*/, size_t /*charge*/, - Handle** /*handle*/, Priority /*priority*/) override { + Handle** /*handle*/, Priority /*priority*/, + const Slice& /*compressed*/, + CompressionType /*type*/) override { return Status::NotSupported(); } }; } // anonymous namespace - -TEST_F(DBBlockCacheTest, TestWithSameCompressed) { - auto table_options = GetTableOptions(); - auto options = GetOptions(table_options); - InitTable(options); - - std::shared_ptr rw_cache{NewLRUCache(1000000)}; - std::shared_ptr rw_pcache{ - new PersistentCacheFromCache(rw_cache, /*read_only*/ false)}; - // Exercise some obscure behavior with read-only wrappers - std::shared_ptr ro_cache{new ReadOnlyCacheWrapper(rw_cache)}; - std::shared_ptr ro_pcache{ - new PersistentCacheFromCache(rw_cache, /*read_only*/ true)}; - - // Simple same pointer - table_options.block_cache = rw_cache; - table_options.block_cache_compressed = rw_cache; - table_options.persistent_cache.reset(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: block_cache same as block_cache_compressed not " - "currently supported, and would be bad for performance anyway"); - - // Other cases - table_options.block_cache = ro_cache; - table_options.block_cache_compressed = rw_cache; - table_options.persistent_cache.reset(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: block_cache and block_cache_compressed share " - "the same key space, which is not supported"); - - table_options.block_cache = rw_cache; - table_options.block_cache_compressed = ro_cache; - table_options.persistent_cache.reset(); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: block_cache_compressed and block_cache share " - "the same key space, which is not supported"); - - table_options.block_cache = ro_cache; - table_options.block_cache_compressed.reset(); - table_options.persistent_cache = rw_pcache; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: block_cache and persistent_cache share the same " - "key space, which is not supported"); - - table_options.block_cache = rw_cache; - table_options.block_cache_compressed.reset(); - table_options.persistent_cache = ro_pcache; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: persistent_cache and block_cache share the same " - "key space, which is not supported"); - - table_options.block_cache.reset(); - table_options.no_block_cache = true; - table_options.block_cache_compressed = ro_cache; - table_options.persistent_cache = rw_pcache; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - 
ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: block_cache_compressed and persistent_cache " - "share the same key space, which is not supported"); - - table_options.block_cache.reset(); - table_options.no_block_cache = true; - table_options.block_cache_compressed = rw_cache; - table_options.persistent_cache = ro_pcache; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_EQ(TryReopen(options).ToString(), - "Invalid argument: persistent_cache and block_cache_compressed " - "share the same key space, which is not supported"); -} #endif // SNAPPY -#ifndef ROCKSDB_LITE // Make sure that when options.block_cache is set, after a new table is // created its index/filter blocks are added to block cache. @@ -575,6 +389,7 @@ TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) { while (iter->Valid()) { iter->Next(); } + ASSERT_OK(iter->status()); delete iter; iter = nullptr; } @@ -609,11 +424,6 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert); // set the cache capacity to the current usage cache->SetCapacity(index_bytes_insert + filter_bytes_insert); - // The index and filter eviction statistics were broken by the refactoring - // that moved the readers out of the block cache. Disabling these until we can - // bring the stats back. - // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0); - // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0); // Note that the second key needs to be no longer than the first one. // Otherwise the second index block may not fit in cache. ASSERT_OK(Put(1, "key", "val")); @@ -624,13 +434,6 @@ TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) { index_bytes_insert); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT), filter_bytes_insert); - // The index and filter eviction statistics were broken by the refactoring - // that moved the readers out of the block cache. Disabling these until we can - // bring the stats back. 
- // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), - // index_bytes_insert); - // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), - // filter_bytes_insert); } #if (defined OS_LINUX || defined OS_WIN) @@ -820,21 +623,23 @@ class MockCache : public LRUCache { static uint32_t low_pri_insert_count; MockCache() - : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/, - false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/, - 0.0 /*low_pri_pool_ratio*/) {} + : LRUCache(LRUCacheOptions( + size_t{1} << 25 /*capacity*/, 0 /*num_shard_bits*/, + false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/)) {} using ShardedCache::Insert; Status Insert(const Slice& key, Cache::ObjectPtr value, const Cache::CacheItemHelper* helper, size_t charge, - Handle** handle, Priority priority) override { + Handle** handle, Priority priority, const Slice& compressed, + CompressionType type) override { if (priority == Priority::LOW) { low_pri_insert_count++; } else { high_pri_insert_count++; } - return LRUCache::Insert(key, value, helper, charge, handle, priority); + return LRUCache::Insert(key, value, helper, charge, handle, priority, + compressed, type); } }; @@ -913,10 +718,11 @@ class LookupLiarCache : public CacheWrapper { explicit LookupLiarCache(std::shared_ptr target) : CacheWrapper(std::move(target)) {} - using Cache::Lookup; + const char* Name() const override { return "LookupLiarCache"; } + Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr, CreateContext* create_context = nullptr, - Priority priority = Priority::LOW, bool wait = true, + Priority priority = Priority::LOW, Statistics* stats = nullptr) override { if (nth_lookup_not_found_ == 1) { nth_lookup_not_found_ = 0; @@ -925,8 +731,7 @@ class LookupLiarCache : public CacheWrapper { if (nth_lookup_not_found_ > 1) { --nth_lookup_not_found_; } - return CacheWrapper::Lookup(key, helper, create_context, priority, wait, - stats); + return CacheWrapper::Lookup(key, helper, create_context, priority, stats); } // 1 == next lookup, 2 == after next, etc. @@ -941,10 +746,15 @@ TEST_F(DBBlockCacheTest, AddRedundantStats) { int iterations_tested = 0; for (std::shared_ptr base_cache : {NewLRUCache(capacity, num_shard_bits), + // FixedHyperClockCache HyperClockCacheOptions( capacity, BlockBasedTableOptions().block_size /*estimated_value_size*/, num_shard_bits) + .MakeSharedCache(), + // AutoHyperClockCache + HyperClockCacheOptions(capacity, 0 /*estimated_value_size*/, + num_shard_bits) .MakeSharedCache()}) { if (!base_cache) { // Skip clock cache when not supported @@ -1086,124 +896,6 @@ TEST_F(DBBlockCacheTest, ParanoidFileChecks) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); } -TEST_F(DBBlockCacheTest, CompressedCache) { - if (!Snappy_Supported()) { - return; - } - int num_iter = 80; - - // Run this test three iterations. 
- // Iteration 1: only a uncompressed block cache - // Iteration 2: only a compressed block cache - // Iteration 3: both block cache and compressed cache - // Iteration 4: both block cache and compressed cache, but DB is not - // compressed - for (int iter = 0; iter < 4; iter++) { - Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; // small write buffer - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - - BlockBasedTableOptions table_options; - switch (iter) { - case 0: - // only uncompressed block cache - table_options.block_cache = NewLRUCache(8 * 1024); - table_options.block_cache_compressed = nullptr; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 1: - // no block cache, only compressed cache - table_options.no_block_cache = true; - table_options.block_cache = nullptr; - table_options.block_cache_compressed = NewLRUCache(8 * 1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 2: - // both compressed and uncompressed block cache - table_options.block_cache = NewLRUCache(1024); - table_options.block_cache_compressed = NewLRUCache(8 * 1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 3: - // both block cache and compressed cache, but DB is not compressed - // also, make block cache sizes bigger, to trigger block cache hits - table_options.block_cache = NewLRUCache(1024 * 1024); - table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.compression = kNoCompression; - break; - default: - FAIL(); - } - CreateAndReopenWithCF({"pikachu"}, options); - // default column family doesn't have block cache - Options no_block_cache_opts; - no_block_cache_opts.statistics = options.statistics; - no_block_cache_opts = CurrentOptions(no_block_cache_opts); - BlockBasedTableOptions table_options_no_bc; - table_options_no_bc.no_block_cache = true; - no_block_cache_opts.table_factory.reset( - NewBlockBasedTableFactory(table_options_no_bc)); - ReopenWithColumnFamilies( - {"default", "pikachu"}, - std::vector({no_block_cache_opts, options})); - - Random rnd(301); - - // Write 8MB (80 values, each 100K) - ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); - std::vector values; - std::string str; - for (int i = 0; i < num_iter; i++) { - if (i % 4 == 0) { // high compression ratio - str = rnd.RandomString(1000); - } - values.push_back(str); - ASSERT_OK(Put(1, Key(i), values[i])); - } - - // flush all data from memtable so that reads are from block cache - ASSERT_OK(Flush(1)); - - for (int i = 0; i < num_iter; i++) { - ASSERT_EQ(Get(1, Key(i)), values[i]); - } - - // check that we triggered the appropriate code paths in the cache - switch (iter) { - case 0: - // only uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 1: - // no block cache, only compressed cache - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 2: - // both compressed and uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - break; - case 3: - // both compressed and uncompressed block cache - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); - 
ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0); - ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); - // compressed doesn't have any hits since blocks are not compressed on - // storage - ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0); - break; - default: - FAIL(); - } - - options.create_if_missing = true; - DestroyAndReopen(options); - } -} - TEST_F(DBBlockCacheTest, CacheCompressionDict) { const int kNumFiles = 4; const int kNumEntriesPerFile = 128; @@ -1297,12 +989,14 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { const size_t capacity = size_t{1} << 25; int iterations_tested = 0; for (bool partition : {false, true}) { + SCOPED_TRACE("Partition? " + std::to_string(partition)); for (std::shared_ptr cache : {NewLRUCache(capacity), HyperClockCacheOptions( capacity, BlockBasedTableOptions().block_size /*estimated_value_size*/) .MakeSharedCache()}) { + SCOPED_TRACE(std::string("Cache: ") + cache->Name()); ++iterations_tested; Options options = CurrentOptions(); @@ -1596,6 +1290,7 @@ TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) { HyperClockCacheOptions hcc_opts{capacity, value_size_est}; hcc_opts.num_shard_bits = 2; // 4 shards hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata; + hcc_opts.hash_seed = 0; // deterministic hashing std::shared_ptr cache = hcc_opts.MakeSharedCache(); std::shared_ptr logger = std::make_shared(); @@ -1652,8 +1347,6 @@ TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) { EXPECT_EQ(logger->PopCounts(), (std::array{{0, 1, 0}})); } -#endif // ROCKSDB_LITE - class DBBlockCacheKeyTest : public DBTestBase, public testing::WithParamInterface> { @@ -1708,31 +1401,16 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { uint64_t expected_stat = 0; std::function verify_stats; - if (use_compressed_cache_) { - if (!Snappy_Supported()) { - ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support"); - return; - } - options.compression = CompressionType::kSnappyCompression; - table_options.no_block_cache = true; - table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false); - verify_stats = [&options, &expected_stat] { - // One for ordinary SST file and one for external SST file - ASSERT_EQ(expected_stat, - options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD)); - }; - } else { - table_options.cache_index_and_filter_blocks = true; - table_options.block_cache = NewLRUCache(1 << 25, 0, false); - verify_stats = [&options, &expected_stat] { - ASSERT_EQ(expected_stat, - options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); - ASSERT_EQ(expected_stat, - options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); - ASSERT_EQ(expected_stat, - options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); - }; - } + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options, &expected_stat] { + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(expected_stat, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + }; table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); @@ -1774,7 +1452,6 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { ++key_count; } -#ifndef ROCKSDB_LITE // Save an export of those ordinary SST files for later std::string export_files_dir = dbname_ + 
"/exported"; ExportImportFilesMetaData* metadata_ptr_ = nullptr; @@ -1803,13 +1480,6 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts)); } - if (exclude_file_numbers_) { - // FIXME(peterd): figure out where these extra ADDs are coming from - options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD, - uint64_t{0} - uint64_t{2}); - } -#endif - perform_gets(); verify_stats(); @@ -1822,7 +1492,6 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { // Make sure we can cache hit even on a full copy of the DB. Using // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link. // (Checkpoint not available in LITE mode to test this.) -#ifndef ROCKSDB_LITE auto db_copy_name = dbname_ + "-copy"; ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name)); @@ -1861,7 +1530,6 @@ TEST_P(DBBlockCacheKeyTest, StableCacheKeys) { perform_gets(); verify_stats(); -#endif // !ROCKSDB_LITE Close(); Destroy(options); diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc index 0d8329496e73..abe7f2476109 100644 --- a/db/db_bloom_filter_test.cc +++ b/db/db_bloom_filter_test.cc @@ -43,6 +43,16 @@ const std::string kStandard128Ribbon = test::Standard128RibbonFilterPolicy::kClassName(); const std::string kAutoBloom = BloomFilterPolicy::kClassName(); const std::string kAutoRibbon = RibbonFilterPolicy::kClassName(); + +template +T Pop(T& var) { + auto rv = var; + var = 0; + return rv; +} +PerfContextByLevel& GetLevelPerfContext(uint32_t level) { + return (*(get_perf_context()->level_to_perf_context))[level]; +} } // anonymous namespace // DB tests related to bloom filter. @@ -209,59 +219,43 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) { ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ( - 0, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ( - 0, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); - ASSERT_EQ( - 0, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); - ASSERT_EQ( - 1, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); - ASSERT_EQ( - 2, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); ro.total_order_seek = true; // NOTE: total_order_seek no longer affects Get() 
ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ( - 3, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); // No bloom on extractor changed -#ifndef ROCKSDB_LITE ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ( - 3, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); -#endif // ROCKSDB_LITE + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); // No bloom on extractor changed, after re-open options.prefix_extractor.reset(NewCappedPrefixTransform(10)); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ( - 3, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); get_perf_context()->Reset(); } @@ -296,35 +290,32 @@ TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) { ASSERT_OK(dbfull()->Flush(fo)); ASSERT_EQ("foo", Get("barbarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ("foo2", Get("barbarbar2")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); ro.total_order_seek = true; // NOTE: total_order_seek no longer affects Get() ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ( - 3, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 1); // No bloom on extractor changed -#ifndef ROCKSDB_LITE ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}})); ASSERT_EQ("NOT_FOUND", Get("foobarbar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); - ASSERT_EQ( - 3, - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful); -#endif // ROCKSDB_LITE + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(Pop(GetLevelPerfContext(0).bloom_filter_useful), 0); get_perf_context()->Reset(); } @@ -361,12 +352,17 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { ASSERT_OK(dbfull()->Flush(fo)); Reopen(options); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + 
EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // Reopen with whole key filtering enabled and prefix extractor // NULL. Bloom filter should be off for both of whole key and @@ -376,13 +372,17 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { options.prefix_extractor.reset(); Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // Write DB with only full key filtering. ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); // Needs insert some keys to make sure files are not filtered out by key @@ -398,13 +398,17 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // Try to create a DB with mixed files: ASSERT_OK(dbfull()->Put(wo, "foobar", "foo")); @@ -428,61 +432,81 @@ TEST_F(DBBloomFilterTest, WholeKeyFilterProp) { ASSERT_OK(Flush()); // Now we have two files: - // File 1: An older file with prefix bloom. + // File 1: An older file with prefix bloom (disabled) // File 2: A newer file with whole bloom filter. 
- ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // Reopen with the same setting: only whole key is used Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // Restart with both filters are allowed options.prefix_extractor.reset(NewFixedPrefixTransform(3)); bbto.whole_key_filtering = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // File 1 will has it filtered out. // File 2 will not, as prefix `foo` exists in the file. 
ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); // Restart with only prefix bloom is allowed. options.prefix_extractor.reset(NewFixedPrefixTransform(3)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("foo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("NOT_FOUND", Get("bar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("foo", Get("foobar")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); ASSERT_EQ("bar", Get("barfoo")); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 0); uint64_t bloom_filter_useful_all_levels = 0; for (auto& kv : (*(get_perf_context()->level_to_perf_context))) { if (kv.second.bloom_filter_useful > 0) { @@ -564,7 +588,6 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { ASSERT_LE(reads, 3 * N / 100); } -#ifndef ROCKSDB_LITE // Sanity check some table properties std::map props; ASSERT_TRUE(db_->GetMapProperty( @@ -583,7 +606,6 @@ TEST_P(DBBloomFilterTestWithParam, BloomFilter) { uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]); EXPECT_EQ(num_filter_entries, nkeys); -#endif // ROCKSDB_LITE env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); @@ -632,7 +654,7 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { for (i = 0; i < maxKey; i++) { ASSERT_OK(Put(Key(i), Key(i))); } - Flush(); + ASSERT_OK(Flush()); }; auto GetFn = [&]() { int i; @@ -649,10 +671,8 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { PutFn(); GetFn(); }; -#ifndef ROCKSDB_LITE std::map props; const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties; -#endif // ROCKSDB_LITE Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -675,11 +695,9 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); 
EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); -#ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_EQ(props["filter_size"], "0"); -#endif // ROCKSDB_LITE // Test 2: use custom API to skip filters -> no filter constructed // or read. @@ -693,11 +711,9 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); -#ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_EQ(props["filter_size"], "0"); -#endif // ROCKSDB_LITE // Control test: using an actual filter with 100% FP rate -> the filter // is constructed and checked on read. @@ -708,16 +724,11 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { PutAndGetFn(); // Verify filter is accessed (and constructed) - EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), - maxKey * 2); - EXPECT_EQ( - TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), - maxKey); -#ifndef ROCKSDB_LITE + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_FULL_POSITIVE), maxKey * 2); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), maxKey); props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_NE(props["filter_size"], "0"); -#endif // ROCKSDB_LITE // Test 3 (options test): Able to read existing filters with longstanding // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter` @@ -729,11 +740,8 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { GetFn(); // Verify filter is accessed - EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), - maxKey * 2); - EXPECT_EQ( - TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), - maxKey); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_FULL_POSITIVE), maxKey * 2); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), maxKey); // But new filters are not generated (configuration details unknown) DestroyAndReopen(options); @@ -743,11 +751,9 @@ TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) { EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0); EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0); -#ifndef ROCKSDB_LITE props.clear(); ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props)); EXPECT_EQ(props["filter_size"], "0"); -#endif // ROCKSDB_LITE } #if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) @@ -786,7 +792,7 @@ TEST_F(DBBloomFilterTest, BloomFilterRate) { } // Add a large key to make the file contain wide range ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); + ASSERT_OK(Flush(1)); // Check if they can be found for (int i = 0; i < maxKey; i++) { @@ -799,9 +805,7 @@ TEST_F(DBBloomFilterTest, BloomFilterRate) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); } ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98); - ASSERT_GE( - (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful, - maxKey * 0.98); + ASSERT_GE(GetLevelPerfContext(0).bloom_filter_useful, maxKey * 0.98); get_perf_context()->Reset(); } } @@ -880,9 +884,8 @@ TEST_F(DBBloomFilterTest, BloomFilterCompatibility) { ASSERT_EQ("val", Get(prefix + "Z")); // Filter positive // Filter negative, with high probability ASSERT_EQ("NOT_FOUND", Get(prefix + "Q")); - 
EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), - 2); - EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_FULL_POSITIVE), 2); + EXPECT_EQ(PopTicker(options, BLOOM_FILTER_USEFUL), 1); } } } @@ -1503,7 +1506,6 @@ TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) { } // RocksDB lite does not support dynamic options -#ifndef ROCKSDB_LITE TEST_P(DBFilterConstructionCorruptionTestWithParam, DynamicallyTurnOnAndOffDetectConstructCorruption) { Options options = CurrentOptions(); @@ -1587,7 +1589,6 @@ TEST_P(DBFilterConstructionCorruptionTestWithParam, db_->GetOptions().table_factory->GetOptions(); EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption); } -#endif // ROCKSDB_LITE namespace { // NOTE: This class is referenced by HISTORY.md as a model for a wrapper @@ -1695,7 +1696,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { table_options.format_version = 5; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - TryReopen(options); + ASSERT_OK(TryReopen(options)); CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options); const int maxKey = 10000; @@ -1704,7 +1705,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { } // Add a large key to make the file contain wide range ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); - Flush(1); + ASSERT_OK(Flush(1)); EXPECT_EQ(policy->DumpTestReport(), fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n" : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); @@ -1712,7 +1713,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { for (int i = maxKey / 2; i < maxKey; i++) { ASSERT_OK(Put(1, Key(i), Key(i))); } - Flush(1); + ASSERT_OK(Flush(1)); EXPECT_EQ(policy->DumpTestReport(), fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n" : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n"); @@ -1723,7 +1724,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { } // Since we have two tables / two filters, we might have Bloom checks on // our queries, but no more than one "useful" per query on a found key. - EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey); + EXPECT_LE(PopTicker(options, BLOOM_FILTER_USEFUL), maxKey); // Check that we have two filters, each about // fifo: 0.12% FP rate (15 bits per key) @@ -1732,8 +1733,7 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); } { - auto useful_count = - TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + auto useful_count = PopTicker(options, BLOOM_FILTER_USEFUL); EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975)); EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 
0.9995 : 0.98)); } @@ -1750,13 +1750,11 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333))); } { - auto useful_count = - TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); + auto useful_count = PopTicker(options, BLOOM_FILTER_USEFUL); EXPECT_GE(useful_count, maxKey * 0.90); EXPECT_LE(useful_count, maxKey * 0.91); } } else { -#ifndef ROCKSDB_LITE // Also try external SST file { std::string file_path = dbname_ + "/external.sst"; @@ -1768,7 +1766,6 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { // Note: kCompactionStyleLevel is default, ignored if num_levels == -1 EXPECT_EQ(policy->DumpTestReport(), "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n"); -#endif } // Destroy @@ -1778,6 +1775,64 @@ TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) { } } +TEST_F(DBBloomFilterTest, MutatingRibbonFilterPolicy) { + // Test that RibbonFilterPolicy has a mutable bloom_before_level fields that + // can be updated through SetOptions + + Options options = CurrentOptions(); + options.statistics = CreateDBStatistics(); + auto& stats = *options.statistics; + BlockBasedTableOptions table_options; + // First config forces Bloom filter, to establish a baseline before + // SetOptions(). + table_options.filter_policy.reset(NewRibbonFilterPolicy(10, INT_MAX)); + double expected_bpk = 10.0; + // Other configs to try, with approx expected bits per key + std::vector> configs = {{"-1", 7.0}, + {"0", 10.0}}; + + table_options.cache_index_and_filter_blocks = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + ASSERT_OK(TryReopen(options)); + + char v[] = "a"; + + for (;; ++(v[0])) { + const int maxKey = 8000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(Key(i), v)); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Get(Key(i)), v); + } + + uint64_t filter_bytes = + stats.getAndResetTickerCount(BLOCK_CACHE_FILTER_BYTES_INSERT); + + EXPECT_NEAR(filter_bytes * 8.0 / maxKey, expected_bpk, 0.3); + + if (configs.empty()) { + break; + } + + ASSERT_OK( + db_->SetOptions({{"table_factory.filter_policy.bloom_before_level", + configs.back().first}})); + + // Ensure original object is mutated + std::string val; + ASSERT_OK( + table_options.filter_policy->GetOption({}, "bloom_before_level", &val)); + ASSERT_EQ(configs.back().first, val); + + expected_bpk = configs.back().second; + configs.pop_back(); + } +} + class SliceTransformLimitedDomain : public SliceTransform { const char* Name() const override { return "SliceTransformLimitedDomain"; } @@ -1850,6 +1905,7 @@ TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter2) { for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) { iter_res.emplace_back(iter->value().ToString()); } + ASSERT_OK(iter->status()); std::vector expected_res = {"val1", "val2", "val3", "val4"}; ASSERT_EQ(iter_res, expected_res); @@ -2036,9 +2092,7 @@ TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { ASSERT_OK(Put(UKey(i), UKey(i))); } ASSERT_OK(Flush()); -#ifndef ROCKSDB_LITE ASSERT_EQ(TotalTableFiles(), 1); -#endif constexpr uint32_t Q = 29; // MultiGet In @@ -2049,13 +2103,14 @@ TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { std::array statuses; std::array values; - TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT); - TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); - TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL); - TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL); - 
TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED); - TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE); - TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PopTicker(options, BLOCK_CACHE_FILTER_HIT); + PopTicker(options, BLOCK_CACHE_FILTER_MISS); + PopTicker(options, BLOOM_FILTER_PREFIX_USEFUL); + PopTicker(options, BLOOM_FILTER_USEFUL); + PopTicker(options, BLOOM_FILTER_PREFIX_CHECKED); + PopTicker(options, BLOOM_FILTER_FULL_POSITIVE); + PopTicker(options, BLOOM_FILTER_FULL_TRUE_POSITIVE); + PopTicker(options, BLOOM_FILTER_PREFIX_TRUE_POSITIVE); // Check that initial clump of keys only loads one partition filter from // block cache. @@ -2085,26 +2140,22 @@ TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { } // Confirm correct Bloom stats (no FPs) - uint64_t filter_useful = TestGetAndResetTickerCount( - options, - use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); + uint64_t filter_useful = + PopTicker(options, use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL + : BLOOM_FILTER_USEFUL); uint64_t filter_checked = - TestGetAndResetTickerCount(options, use_prefix_ - ? BLOOM_FILTER_PREFIX_CHECKED - : BLOOM_FILTER_FULL_POSITIVE) + + PopTicker(options, use_prefix_ ? BLOOM_FILTER_PREFIX_CHECKED + : BLOOM_FILTER_FULL_POSITIVE) + (use_prefix_ ? 0 : filter_useful); EXPECT_EQ(filter_useful, number_not_found); EXPECT_EQ(filter_checked, Q); - if (!use_prefix_) { - EXPECT_EQ( - TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), - Q - number_not_found); - } + EXPECT_EQ(PopTicker(options, use_prefix_ ? BLOOM_FILTER_PREFIX_TRUE_POSITIVE + : BLOOM_FILTER_FULL_TRUE_POSITIVE), + Q - number_not_found); // Confirm no duplicate loading same filter partition - uint64_t filter_accesses = - TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + - TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t filter_accesses = PopTicker(options, BLOCK_CACHE_FILTER_HIT) + + PopTicker(options, BLOCK_CACHE_FILTER_MISS); if (stride == 1) { EXPECT_EQ(filter_accesses, 1); } else { @@ -2140,26 +2191,22 @@ TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) { } // Confirm correct Bloom stats (might see some FPs) - uint64_t filter_useful = TestGetAndResetTickerCount( - options, - use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL); + uint64_t filter_useful = + PopTicker(options, use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL + : BLOOM_FILTER_USEFUL); uint64_t filter_checked = - TestGetAndResetTickerCount(options, use_prefix_ - ? BLOOM_FILTER_PREFIX_CHECKED - : BLOOM_FILTER_FULL_POSITIVE) + + PopTicker(options, use_prefix_ ? BLOOM_FILTER_PREFIX_CHECKED + : BLOOM_FILTER_FULL_POSITIVE) + (use_prefix_ ? 0 : filter_useful); EXPECT_GE(filter_useful, number_not_found - 2); // possible FP EXPECT_EQ(filter_checked, Q); - if (!use_prefix_) { - EXPECT_EQ( - TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), - Q - number_not_found); - } + EXPECT_EQ(PopTicker(options, use_prefix_ ? BLOOM_FILTER_PREFIX_TRUE_POSITIVE + : BLOOM_FILTER_FULL_TRUE_POSITIVE), + Q - number_not_found); // Confirm no duplicate loading of same filter partition - uint64_t filter_accesses = - TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) + - TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t filter_accesses = PopTicker(options, BLOCK_CACHE_FILTER_HIT) + + PopTicker(options, BLOCK_CACHE_FILTER_MISS); if (filter_accesses == 2) { // Spanned across partitions. 
++found_spanning; @@ -2190,7 +2237,6 @@ INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer, std::make_tuple(true, 3), std::make_tuple(true, 4), std::make_tuple(true, 5))); -#ifndef ROCKSDB_LITE namespace { static const std::string kPlainTable = "test_PlainTableBloom"; } // anonymous namespace @@ -2274,7 +2320,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTest) { ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count); - Flush(); + ASSERT_OK(Flush()); // sanity checks ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count); @@ -2324,7 +2370,7 @@ TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) { ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count); ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count); - Flush(); + ASSERT_OK(Flush()); iter.reset(dbfull()->NewIterator(ReadOptions())); @@ -2392,7 +2438,7 @@ void PrefixScanInit(DBBloomFilterTest* dbtest) { snprintf(buf, sizeof(buf), "%02d______:end", i + 1); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); + ASSERT_OK(dbtest->Flush()); } // GROUP 2 @@ -2403,7 +2449,7 @@ void PrefixScanInit(DBBloomFilterTest* dbtest) { snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); - dbtest->Flush(); + ASSERT_OK(dbtest->Flush()); } } } // anonymous namespace @@ -2460,9 +2506,11 @@ TEST_F(DBBloomFilterTest, PrefixScan) { } TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { + const int kNumKeysPerFlush = 1000; + Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; - options.arena_block_size = 4 * 1024; + options.memtable_factory.reset( + test::NewSpecialSkipListFactory(kNumKeysPerFlush)); options.target_file_size_base = 64 * 1024; options.level0_file_num_compaction_trigger = 2; options.level0_slowdown_writes_trigger = 2; @@ -2498,8 +2546,13 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { int num_inserted = 0; for (int key : keys) { ASSERT_OK(Put(1, Key(key), "val")); - if (++num_inserted % 1000 == 0) { - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + num_inserted++; + // The write after each `kNumKeysPerFlush` keys triggers a flush. Always + // wait for that flush and any follow-on compactions for deterministic LSM + // shape. 
+ if (num_inserted > kNumKeysPerFlush && + num_inserted % kNumKeysPerFlush == 1) { + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } } @@ -2620,7 +2673,7 @@ TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) { compact_options.target_level = 7; ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); - ASSERT_EQ(trivial_move, 1); + ASSERT_GE(trivial_move, 1); ASSERT_EQ(non_trivial_move, 0); prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); @@ -2702,6 +2755,9 @@ int CountIter(std::unique_ptr& iter, const Slice& key) { int count = 0; for (iter->Seek(key); iter->Valid(); iter->Next()) { count++; + // Access key & value as if we were using them + (void)iter->key(); + (void)iter->value(); } EXPECT_OK(iter->status()); return count; @@ -2741,6 +2797,12 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcd0000"), 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 1); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + ASSERT_EQ(TestGetTickerCount( + options, NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH), + 1); } { Slice upper_bound("abcdzzzz"); @@ -2749,8 +2811,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcd0000"), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 2); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), @@ -2764,8 +2827,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcdxx00"), 4); // should check bloom filter since upper bound meets requirement - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 3); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } { // [abcdxx01, abcey) is not valid bound since upper bound is too long for @@ -2777,8 +2841,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abcdxx01"), 4); // should skip bloom filter since upper bound is too long - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 3); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } { // [abcdxx02, abcdy) is a valid bound since the prefix is the same @@ -2790,8 +2855,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { ASSERT_EQ(CountIter(iter, "abcdxx02"), 4); // should check bloom filter since upper bound matches transformed seek // key - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); - 
ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } { // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the @@ -2803,8 +2869,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0); // should skip bloom filter since mismatch is found - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}})); { @@ -2816,8 +2883,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } // Same with re-open options.prefix_extractor.reset(NewFixedPrefixTransform(3)); @@ -2829,8 +2897,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); } // Set back to capped:4 and verify BF is always read options.prefix_extractor.reset(NewCappedPrefixTransform(4)); @@ -2842,8 +2911,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 1); } // Same if there's a problem initally loading prefix transform SyncPoint::GetInstance()->SetCallBack( @@ -2858,8 +2928,9 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "abc"), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), + 4); + ASSERT_EQ(TestGetTickerCount(options, NON_LAST_LEVEL_SEEK_FILTERED), 2); } SyncPoint::GetInstance()->DisableProcessing(); } @@ -2891,10 +2962,11 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foq1", "bar1")); ASSERT_OK(Put("fpa", "0")); - 
dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); std::unique_ptr iter_old(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_old, "foo"), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 1); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), @@ -2902,10 +2974,11 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { read_options.iterate_upper_bound = &upper_bound; std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 2); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 1); ASSERT_EQ(CountIter(iter, "gpk"), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); // second SST with capped:3 BF ASSERT_OK(Put("foo3", "bar3")); @@ -2917,13 +2990,13 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { // BF is cappped:3 now std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 2); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // both counters are incremented because BF is "not changed" for 1 of the // 2 SST files, so filter is checked once and found no match. 
- ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); @@ -2940,33 +3013,34 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { std::unique_ptr iter_tmp(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_tmp, "foo"), 9); // the first and last BF are checked - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 7); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 2); ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0); // only last BF is checked and not found - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 8); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); } - // iter_old can only see the first SST, so checked plus 1 + // iter_old can only see the first SST ASSERT_EQ(CountIter(iter_old, "foo"), 4); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); - // iter was created after the first setoptions call so only full filter - // will check the filter + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 1); + // same with iter, but different prefix extractor ASSERT_EQ(CountIter(iter, "foo"), 2); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 10); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 1); { // keys in all three SSTs are visible to iterator // The range of [foo, foz90000] is compatible with (fixed:1) and (fixed:2) - // so +2 for checked counter std::unique_ptr iter_all(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_all, "foo"), 9); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 2); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 13); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + // FIXME? isn't seek key out of SST range? 
+ EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); } ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), @@ -2976,11 +3050,12 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) { ASSERT_EQ(CountIter(iter_all, "foo"), 6); // all three SST are checked because the current options has the same as // the remaining SST (capped:3) - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 16); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 3); ASSERT_EQ(CountIter(iter_all, "gpk"), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 17); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4); + // FIXME? isn't seek key out of SST range? + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 1); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); } // TODO(Zhongyi): Maybe also need to add Get calls to test point look up? } @@ -3015,7 +3090,7 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) { ASSERT_OK(Put(2, "foo5", "bar5")); ASSERT_OK(Put(2, "foq6", "bar6")); ASSERT_OK(Put(2, "fpq7", "bar7")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); { std::unique_ptr iter( db_->NewIterator(read_options, handles_[2])); @@ -3065,30 +3140,30 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("foo1", "bar1")); ASSERT_OK(Put("fpa", "0")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("foo3", "bar3")); ASSERT_OK(Put("foo4", "bar4")); ASSERT_OK(Put("foo5", "bar5")); ASSERT_OK(Put("fpb", "1")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); ASSERT_OK(Put("foo6", "bar6")); ASSERT_OK(Put("foo7", "bar7")); ASSERT_OK(Put("foo8", "bar8")); ASSERT_OK(Put("fpc", "2")); - dbfull()->Flush(FlushOptions()); + ASSERT_OK(dbfull()->Flush(FlushOptions())); ReadOptions read_options; read_options.prefix_same_as_start = true; { std::unique_ptr iter(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter, "foo"), 12); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 3); } std::unique_ptr iter_old(db_->NewIterator(read_options)); ASSERT_EQ(CountIter(iter_old, "foo"), 12); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 3); ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}})); ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(), @@ -3097,17 +3172,18 @@ TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) { std::unique_ptr iter(db_->NewIterator(read_options)); // "fp*" should be skipped ASSERT_EQ(CountIter(iter, "foo"), 9); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + 
EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); } // iterator created before should not be affected and see all keys ASSERT_EQ(CountIter(iter_old, "foo"), 12); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 0); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 3); ASSERT_EQ(CountIter(iter_old, "abc"), 0); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12); - ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3); + // FIXME? isn't seek key out of SST range? + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTERED), 3); + EXPECT_EQ(PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0); } } @@ -3202,19 +3278,19 @@ class FixedSuffix4Transform : public SliceTransform { std::pair GetBloomStat(const Options& options, bool sst) { if (sst) { - return { - options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED), - options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)}; + return {options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTER_MATCH), + options.statistics->getAndResetTickerCount( + NON_LAST_LEVEL_SEEK_FILTERED)}; } else { auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0); auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0); - return {hit + miss, miss}; + return {hit, miss}; } } -std::pair CheckedAndUseful(uint64_t checked, - uint64_t useful) { - return {checked, useful}; +std::pair HitAndMiss(uint64_t hits, uint64_t misses) { + return {hits, misses}; } } // anonymous namespace @@ -3252,27 +3328,27 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) { if (flushed) { // TODO: support auto_prefix_mode in memtable? 
read_options.auto_prefix_mode = true; } - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { Slice ub("999aaaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa"), 3); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { Slice ub("999abaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "abaa"), 1); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { Slice ub("999acaa"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "acaa"), 0); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 1)); } { Slice ub("zzzz"); @@ -3280,7 +3356,7 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) { std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "baa"), 3); if (flushed) { // TODO: fix memtable case - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); } } } @@ -3326,13 +3402,13 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { get_perf_context()->bloom_memtable_hit_count = 0; get_perf_context()->bloom_memtable_miss_count = 0; } - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { Slice ub("aaaa000"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa999"), 3); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { // Note: prefix does work as upper bound @@ -3340,7 +3416,7 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa999"), 3); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { // Note: prefix does not work here as seek key @@ -3348,28 +3424,28 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa"), 0); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { Slice ub("aaba000"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaba999"), 1); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { Slice ub("aaca000"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaca999"), 0); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 1)); } { Slice ub("aaaz"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, 
"zzz"), 5); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); } { // Note: prefix does work here as seek key, but only finds key equal @@ -3379,7 +3455,7 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) { read_options.prefix_same_as_start = true; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "qqqq"), 1); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } } } @@ -3470,13 +3546,13 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { get_perf_context()->bloom_memtable_hit_count = 0; get_perf_context()->bloom_memtable_miss_count = 0; } - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); { Slice ub("aaaa999"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa000"), 3); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { // Note: prefix as seek key is not bloom-optimized @@ -3486,28 +3562,28 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaaa"), 3); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); } { Slice ub("aaba9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaba0"), 1); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { Slice ub("aaca9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aaca0"), 0); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 1)); } { Slice ub("qqqq9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "qqqq0"), 1); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(1, 0)); } { // Note: prefix as seek key is not bloom-optimized @@ -3515,7 +3591,7 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); } { // Note: prefix as seek key is not bloom-optimized @@ -3523,20 +3599,19 @@ TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) { read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); } { Slice ub("zzzz9"); read_options.iterate_upper_bound = &ub; std::unique_ptr iter(db_->NewIterator(read_options)); EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 
6 : 5); - EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0)); + EXPECT_EQ(GetBloomStat(options, flushed), HitAndMiss(0, 0)); } } } } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_clip_test.cc b/db/db_clip_test.cc new file mode 100644 index 000000000000..fd0bb57170fd --- /dev/null +++ b/db/db_clip_test.cc @@ -0,0 +1,142 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBClipTest : public DBTestBase { + public: + DBClipTest() : DBTestBase("db_clip_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBClipTest, TestClipRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 3; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... [900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + auto begin_key = Key(251), end_key = Key(751); + ASSERT_OK( + db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key, end_key)); + + for (auto i = 0; i < 251; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 251; i < 751; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 751; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata; + db_->GetLiveFilesMetaData(&all_metadata); + for (auto& md : all_metadata) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + + if (options.comparator->Compare(begin_key, md.smallestkey) <= 0 && + options.comparator->Compare(end_key, md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } + + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + for (auto i = 0; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,3", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,3", FilesPerLevel(0)); + + for (auto i = 1; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,5,3", FilesPerLevel(0)); + + auto begin_key_2 = Key(222), end_key_2 = Key(888); + + ASSERT_OK(db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key_2, + end_key_2)); + + for (auto i = 0; i < 222; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + 
ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 222; i < 888; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 888; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector<LiveFileMetaData> all_metadata_2; + db_->GetLiveFilesMetaData(&all_metadata_2); + for (auto& md : all_metadata_2) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + if (begin_key_2.compare(md.smallestkey) <= 0 && + end_key_2.compare(md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index be863d4f66b9..44c406c4965e 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -166,9 +166,12 @@ class ChangeFilter : public CompactionFilter { class KeepFilterFactory : public CompactionFilterFactory { public: explicit KeepFilterFactory(bool check_context = false, - bool check_context_cf_id = false) + bool check_context_cf_id = false, + bool check_context_input_table_properties = false) : check_context_(check_context), check_context_cf_id_(check_context_cf_id), + check_context_input_table_properties_( + check_context_input_table_properties), compaction_filter_created_(false) {} std::unique_ptr<CompactionFilter> CreateCompactionFilter( @@ -176,6 +179,11 @@ class KeepFilterFactory : public CompactionFilterFactory { if (check_context_) { EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction); EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction); + EXPECT_EQ(expect_input_start_level_.load(), context.input_start_level); + } + if (check_context_input_table_properties_) { + EXPECT_TRUE(expect_input_table_properties_ == + context.input_table_properties); } if (check_context_cf_id_) { EXPECT_EQ(expect_cf_id_.load(), context.column_family_id); @@ -189,9 +197,15 @@ class KeepFilterFactory : public CompactionFilterFactory { const char* Name() const override { return "KeepFilterFactory"; } bool check_context_; bool check_context_cf_id_; + // `check_context_input_table_properties_` can be true only when access to + // `expect_input_table_properties_` is synchronized since we can't have + // std::atomic<TablePropertiesCollection> unfortunately + bool check_context_input_table_properties_; std::atomic_bool expect_full_compaction_; std::atomic_bool expect_manual_compaction_; std::atomic<uint32_t> expect_cf_id_; + std::atomic<int> expect_input_start_level_; + TablePropertiesCollection expect_input_table_properties_; bool compaction_filter_created_; }; @@ -285,7 +299,6 @@ class ChangeFilterFactory : public CompactionFilterFactory { const char* Name() const override { return "ChangeFilterFactory"; } }; -#ifndef ROCKSDB_LITE TEST_F(DBTestCompactionFilter, CompactionFilter) { Options options = CurrentOptions(); options.max_open_files = -1; @@ -469,7 +482,6 @@ TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) { delete itr; } -#endif // ROCKSDB_LITE TEST_F(DBTestCompactionFilter, CompactionFilterFlush) { // Tests a `CompactionFilterFactory` that filters when table file is created @@ -655,9 +667,10 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) { ASSERT_EQ(newvalue, four); } -#ifndef ROCKSDB_LITE TEST_F(DBTestCompactionFilter,
CompactionFilterContextManual) { - KeepFilterFactory* filter = new KeepFilterFactory(true, true); + KeepFilterFactory* filter = new KeepFilterFactory( + true /* check_context */, true /* check_context_cf_id */, + true /* check_context_input_table_properties */); Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; @@ -665,8 +678,9 @@ options.compression = kNoCompression; options.level0_file_num_compaction_trigger = 8; Reopen(options); + const int kNumFiles = 3; int num_keys_per_file = 400; - for (int j = 0; j < 3; j++) { + for (int j = 0; j < kNumFiles; j++) { // Write several keys. const std::string value(10, 'x'); for (int i = 0; i < num_keys_per_file; i++) { @@ -686,6 +700,11 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { filter->expect_manual_compaction_.store(true); filter->expect_full_compaction_.store(true); filter->expect_cf_id_.store(0); + filter->expect_input_start_level_.store(0); + ASSERT_OK(dbfull()->GetPropertiesOfAllTables( + &filter->expect_input_table_properties_)); + ASSERT_TRUE(filter->expect_input_table_properties_.size() == kNumFiles); + ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(cfilter_count, 700); ASSERT_EQ(NumSortedRuns(0), 1); @@ -715,7 +734,6 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) { ASSERT_EQ(count, 0); } } -#endif // ROCKSDB_LITE TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { KeepFilterFactory* filter = new KeepFilterFactory(false, true); @@ -746,8 +764,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) { ASSERT_TRUE(filter->compaction_filter_created()); } -#ifndef ROCKSDB_LITE -// Compaction filters aplies to all records, regardless snapshots. +// Compaction filters apply to all records, regardless of snapshots. TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { std::string five = std::to_string(5); Options options = CurrentOptions(); @@ -788,6 +805,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { count++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 6); read_options.snapshot = nullptr; std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options)); @@ -798,6 +816,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { count++; iter1->Next(); } + ASSERT_OK(iter1->status()); // We have deleted 10 keys from 40 using the compaction filter // Keys 6-9 before the snapshot and 100-105 after the snapshot ASSERT_EQ(count, 30); @@ -807,7 +826,6 @@ TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) { // removed. db_->ReleaseSnapshot(snapshot); } -#endif // ROCKSDB_LITE TEST_F(DBTestCompactionFilter, SkipUntil) { Options options = CurrentOptions(); diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 41ab69b85afc..2d71231173b3 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -19,7 +19,7 @@ #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/experimental.h" #include "rocksdb/sst_file_writer.h" -#include "rocksdb/utilities/convenience.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/concurrent_task_limiter_impl.h" @@ -30,7 +30,6 @@ namespace ROCKSDB_NAMESPACE { // SYNC_POINT is not supported in released Windows mode.
-#if !defined(ROCKSDB_LITE) class CompactionStatsCollector : public EventListener { public: @@ -137,11 +136,12 @@ class DBCompactionTestWithParam class DBCompactionTestWithBottommostParam : public DBTestBase, - public testing::WithParamInterface { + public testing::WithParamInterface< + std::tuple> { public: DBCompactionTestWithBottommostParam() : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { - bottommost_level_compaction_ = GetParam(); + bottommost_level_compaction_ = std::get<0>(GetParam()); } BottommostLevelCompaction bottommost_level_compaction_; @@ -153,6 +153,56 @@ class DBCompactionDirectIOTest : public DBCompactionTest, DBCompactionDirectIOTest() : DBCompactionTest() {} }; +// Params: See WaitForCompactOptions for details +class DBCompactionWaitForCompactTest + : public DBTestBase, + public testing::WithParamInterface< + std::tuple> { + public: + DBCompactionWaitForCompactTest() + : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) { + abort_on_pause_ = std::get<0>(GetParam()); + flush_ = std::get<1>(GetParam()); + close_db_ = std::get<2>(GetParam()); + timeout_ = std::get<3>(GetParam()); + } + bool abort_on_pause_; + bool flush_; + bool close_db_; + std::chrono::microseconds timeout_; + Options options_; + WaitForCompactOptions wait_for_compact_options_; + + void SetUp() override { + // This test sets up a scenario that one more L0 file will trigger a + // compaction + const int kNumKeysPerFile = 4; + const int kNumFiles = 2; + + options_ = CurrentOptions(); + options_.level0_file_num_compaction_trigger = kNumFiles + 1; + + wait_for_compact_options_ = WaitForCompactOptions(); + wait_for_compact_options_.abort_on_pause = abort_on_pause_; + wait_for_compact_options_.flush = flush_; + wait_for_compact_options_.close_db = close_db_; + wait_for_compact_options_.timeout = timeout_; + + DestroyAndReopen(options_); + + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(100 /* len */))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("2", FilesPerLevel()); + } +}; + // Param = true : target level is non-empty // Param = false: level between target level and source level // is not empty. 
@@ -1105,7 +1155,7 @@ TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { ASSERT_OK(Put("bbbb1", "B")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); std::vector files; dbfull()->GetLiveFilesMetaData(&files); @@ -2623,6 +2673,7 @@ TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); } + ASSERT_OK(iter->status()); delete iter; std::string expected_keys; @@ -3285,6 +3336,304 @@ TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) { TEST_SYNC_POINT( "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2"); ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +INSTANTIATE_TEST_CASE_P( + DBCompactionWaitForCompactTest, DBCompactionWaitForCompactTest, + ::testing::Combine( + testing::Bool() /* abort_on_pause */, testing::Bool() /* flush */, + testing::Bool() /* close_db */, + testing::Values( + std::chrono::microseconds::zero(), + std::chrono::microseconds{ + 60 * 60 * + 1000000ULL} /* timeout */))); // 1 hour (long enough to + // make sure that tests + // don't fail unexpectedly + // when running slow) + +TEST_P(DBCompactionWaitForCompactTest, + WaitForCompactWaitsOnCompactionToFinish) { + // Triggers a compaction. Before the compaction finishes, test + // closes the DB Upon reopen, wait for the compaction to finish and checks for + // the number of compaction finished + + int compaction_finished = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::Run():EndStatusSet", [&](void* arg) { + auto status = static_cast(arg); + if (status->ok()) { + compaction_finished++; + } + }); + // To make sure there's a flush/compaction debt + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MaybeScheduleFlushOrCompaction:BeforeSchedule", [&](void* arg) { + auto unscheduled_flushes = *static_cast(arg); + ASSERT_GT(unscheduled_flushes, 0); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTest::WaitForCompactWaitsOnCompactionToFinish", + "DBImpl::MaybeScheduleFlushOrCompaction:BeforeSchedule"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // create compaction debt by adding one more L0 file then closing + Random rnd(123); + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_EQ(0, compaction_finished); + Close(); + TEST_SYNC_POINT("DBCompactionTest::WaitForCompactWaitsOnCompactionToFinish"); + ASSERT_EQ(0, compaction_finished); + + // Reopen the db and we expect the compaction to be triggered. + Reopen(options_); + + // Wait for compaction to finish + ASSERT_OK(dbfull()->WaitForCompact(wait_for_compact_options_)); + ASSERT_GT(compaction_finished, 0); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBCompactionWaitForCompactTest, WaitForCompactAbortOnPause) { + // Triggers a compaction. Before the compaction finishes, test + // pauses the compaction. Calling WaitForCompact() with option + // abort_on_pause=true should return Status::Aborted Or + // ContinueBackgroundWork() must be called + + // Now trigger L0 compaction by adding a file + Random rnd(123); + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + + // Pause the background jobs. 
+ ASSERT_OK(dbfull()->PauseBackgroundWork()); + + // If not abort_on_pause_ continue the background jobs. + if (!abort_on_pause_) { + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + } + + Status s = dbfull()->WaitForCompact(wait_for_compact_options_); + if (abort_on_pause_) { + ASSERT_NOK(s); + ASSERT_TRUE(s.IsAborted()); + } else { + ASSERT_OK(s); + } +} + +TEST_P(DBCompactionWaitForCompactTest, WaitForCompactShutdownWhileWaiting) { + // Triggers a compaction. Before the compaction finishes, db + // shuts down (by calling CancelAllBackgroundWork()). Calling WaitForCompact() + // should return Status::IsShutdownInProgress() + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ + {"CompactionJob::Run():Start", + "DBCompactionTest::WaitForCompactShutdownWhileWaiting:0"}, + {"DBImpl::WaitForCompact:StartWaiting", + "DBCompactionTest::WaitForCompactShutdownWhileWaiting:1"}, + {"DBImpl::~DBImpl:WaitJob", "CompactionJob::Run():End"}, + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Now trigger L0 compaction by adding a file + Random rnd(123); + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + // Wait for compaction to start + TEST_SYNC_POINT("DBCompactionTest::WaitForCompactShutdownWhileWaiting:0"); + + // Wait for Compaction in another thread + auto waiting_for_compaction_thread = port::Thread([this]() { + Status s = dbfull()->WaitForCompact(wait_for_compact_options_); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsShutdownInProgress()); + }); + TEST_SYNC_POINT("DBCompactionTest::WaitForCompactShutdownWhileWaiting:1"); + // Shutdown after wait started, but before the compaction finishes + auto closing_thread = port::Thread([this]() { ASSERT_OK(db_->Close()); }); + + waiting_for_compaction_thread.join(); + closing_thread.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBCompactionWaitForCompactTest, WaitForCompactWithOptionToFlush) { + // After creating enough L0 files that one more file will trigger the + // compaction, write some data in memtable. Calls WaitForCompact with option + // to flush. This will flush the memtable to a new L0 file which will trigger + // compaction. Lastly check for expected number of files, closing + reopening + // DB won't trigger any flush or compaction + + int compaction_finished = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:AfterCompaction", + [&](void*) { compaction_finished++; }); + + int flush_finished = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::End", [&](void*) { flush_finished++; }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // write to memtable (overlapping key with first L0 file), but no flush is + // needed at this point. + ASSERT_OK(Put(Key(0), "some random string")); + ASSERT_EQ(0, compaction_finished); + ASSERT_EQ(0, flush_finished); + ASSERT_EQ("2", FilesPerLevel()); + + ASSERT_OK(dbfull()->WaitForCompact(wait_for_compact_options_)); + ASSERT_EQ(flush_, compaction_finished); + ASSERT_EQ(flush_, flush_finished); + + if (!close_db_) { + std::string expected_files_per_level = flush_ ? 
"1,2" : "2"; + ASSERT_EQ(expected_files_per_level, FilesPerLevel()); + } + + compaction_finished = 0; + flush_finished = 0; + if (!close_db_) { + Close(); + } + Reopen(options_); + + ASSERT_EQ(0, flush_finished); + if (flush_) { + // if flushed already prior to close and reopen, expect there's no + // additional compaction needed + ASSERT_EQ(0, compaction_finished); + } else { + // if not flushed prior to close and reopen, expect L0 file creation from + // WAL when reopening which will trigger the compaction. + ASSERT_OK(dbfull()->WaitForCompact(wait_for_compact_options_)); + ASSERT_EQ(1, compaction_finished); + } + + if (!close_db_) { + ASSERT_EQ("1,2", FilesPerLevel()); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBCompactionWaitForCompactTest, + WaitForCompactWithOptionToFlushAndCloseDB) { + // After creating enough L0 files that one more file will trigger the + // compaction, write some data in memtable (WAL disabled). Calls + // WaitForCompact. If flush option is true, WaitForCompact will flush the + // memtable to a new L0 file which will trigger compaction. We expect the + // no-op second flush upon closing because WAL is disabled + // (has_unpersisted_data_ true) Check to make sure there's no extra L0 file + // created from WAL. Re-opening DB won't trigger any flush or compaction + + std::atomic_int compaction_finished = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:Finish", + [&](void*) { compaction_finished++; }); + + std::atomic_int flush_finished = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FlushJob::End", [&](void*) { flush_finished++; }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_FALSE(options_.avoid_flush_during_shutdown); + + // write to memtable, but no flush is needed at this point. + WriteOptions write_without_wal; + write_without_wal.disableWAL = true; + ASSERT_OK(Put(Key(0), "some random string", write_without_wal)); + ASSERT_EQ(0, compaction_finished); + ASSERT_EQ(0, flush_finished); + ASSERT_EQ("2", FilesPerLevel()); + + ASSERT_OK(dbfull()->WaitForCompact(wait_for_compact_options_)); + + int expected_flush_count = flush_ || close_db_; + ASSERT_EQ(expected_flush_count, flush_finished); + + if (!close_db_) { + // During CancelAllBackgroundWork(), a flush can be initiated due to + // unpersisted data (data that's still in the memtable when WAL is off). + // This results in an additional L0 file which can trigger a compaction. + // However, the compaction may not complete if the background thread's + // execution is slow enough for the front thread to set the 'shutting_down_' + // flag to true before the compaction job even starts. + ASSERT_EQ(expected_flush_count, compaction_finished); + Close(); + } + + // Because we had has_unpersisted_data_ = true, flush must have been triggered + // upon closing regardless of WaitForCompact. Reopen should have no flush + // debt. + flush_finished = 0; + Reopen(options_); + ASSERT_EQ(0, flush_finished); + + // However, if db was closed directly by calling Close(), instead + // of WaitForCompact with close_db option or we are in the scenario commented + // above, it's possible that the last compaction triggered by flushing + // unpersisted data was cancelled. 
Call WaitForCompact() here again to finish + // the compaction + if (compaction_finished == 0) { + ASSERT_OK(dbfull()->WaitForCompact(wait_for_compact_options_)); + } + ASSERT_EQ(1, compaction_finished); + if (!close_db_) { + ASSERT_EQ("1,2", FilesPerLevel()); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +TEST_P(DBCompactionWaitForCompactTest, WaitForCompactToTimeout) { + // When timeout is set, this test makes CompactionJob hang forever + // using sync point. This test also sets the timeout to be 1 ms for + // WaitForCompact to time out early. WaitForCompact() is expected to return + // Status::TimedOut. + // When timeout is not set, we expect WaitForCompact() to wait indefinitely. + // We don't want the test to hang forever. When timeout = 0, this test is not + // much different from WaitForCompactWaitsOnCompactionToFinish. + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"DBCompactionTest::WaitForCompactToTimeout", + "CompactionJob::Run():Start"}}); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Now trigger L0 compaction by adding a file + Random rnd(123); + GenerateNewRandomFile(&rnd, /* nowait */ true); + ASSERT_OK(Flush()); + + if (wait_for_compact_options_.timeout.count()) { + // Make timeout shorter to finish test early + wait_for_compact_options_.timeout = std::chrono::microseconds{1000}; + } else { + // if timeout is not set, WaitForCompact() will wait forever. We don't + // want the test to hang forever. Just let compaction go through + TEST_SYNC_POINT("DBCompactionTest::WaitForCompactToTimeout"); + } + Status s = dbfull()->WaitForCompact(wait_for_compact_options_); + if (wait_for_compact_options_.timeout.count()) { + ASSERT_NOK(s); + ASSERT_TRUE(s.IsTimedOut()); + } else { + ASSERT_OK(s); + } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } static std::string ShortKey(int i) { @@ -3614,7 +3963,10 @@ TEST_F(DBCompactionTest, CancelCompactionWaitingOnConflict) { Random rnd(301); for (int i = 0; i < kNumSortedRuns; ++i) { int key_idx = 0; - GenerateNewFile(&rnd, &key_idx, true /* nowait */); + // We hold the compaction from happening, so when generating the last SST + // file, we cannot wait. Otherwise, we'll hit a deadlock. + GenerateNewFile(&rnd, &key_idx, + (i == kNumSortedRuns - 1) ? true : false /* nowait */); } auto_compaction_sleeping_task.WaitUntilSleeping(); @@ -3827,11 +4179,6 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { // files does not need to be preserved in case of a future snapshot. ASSERT_OK(Put(Key(0), "val")); ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); - // release snapshot and wait for compactions to finish. Single-file - // compactions should be triggered, which reduce the size of each bottom-level - // file without changing file count. - db_->ReleaseSnapshot(snapshot); - ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { Compaction* compaction = reinterpret_cast<Compaction*>(arg); @@ -3839,6 +4186,11 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { CompactionReason::kBottommostFiles); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // release snapshot and wait for compactions to finish.
Single-file + // compactions should be triggered, which reduce the size of each bottom-level + // file without changing file count. + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetLiveFilesMetaData(&post_release_metadata); ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size()); @@ -3855,6 +4207,78 @@ TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } +TEST_F(DBCompactionTest, DelayCompactBottomLevelFilesWithDeletions) { + // bottom-level files may contain deletions due to snapshots protecting the + // deleted keys. Once the snapshot is released and the files are old enough, + // we should see them undergo single-file compactions. + Options options = CurrentOptions(); + env_->SetMockSleep(); + options.bottommost_file_compaction_delay = 3600; + DestroyAndReopen(options); + CreateColumnFamilies({"one"}, options); + const int kNumKey = 100; + const int kValLen = 100; + + Random rnd(301); + for (int i = 0; i < kNumKey; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValLen))); + } + const Snapshot* snapshot = db_->GetSnapshot(); + for (int i = 0; i < kNumKey; i += 2) { + ASSERT_OK(Delete(Key(i))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + std::vector pre_release_metadata; + db_->GetLiveFilesMetaData(&pre_release_metadata); + ASSERT_EQ(1, pre_release_metadata.size()); + std::atomic_int compaction_count = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kBottommostFiles); + compaction_count++; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST + // files does not need to be preserved in case of a future snapshot. + ASSERT_OK(Put(Key(0), "val")); + ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); + // release snapshot will not trigger compaction. + db_->ReleaseSnapshot(snapshot); + ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, compaction_count); + // Now the file is old enough for compaction. + env_->MockSleepForSeconds(3600); + // Another flush will trigger re-computation of the compaction score + // to find out that the file is qualified for compaction. + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(1, compaction_count); + + std::vector post_release_metadata; + db_->GetLiveFilesMetaData(&post_release_metadata); + ASSERT_EQ(2, post_release_metadata.size()); + + const auto& pre_file = pre_release_metadata[0]; + // Get the L1 (bottommost level) file. + const auto& post_file = post_release_metadata[0].level == 0 + ? post_release_metadata[1] + : post_release_metadata[0]; + + ASSERT_EQ(1, pre_file.level); + ASSERT_EQ(1, post_file.level); + // the file is smaller than it was before as it was rewritten without + // deletion markers/deleted keys. 
+ ASSERT_LT(post_file.size, pre_file.size); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) { // bottom-level files may contain deletions due to snapshots protecting the // deleted keys. Once the snapshot is released, we should see files with many @@ -4370,11 +4794,12 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) { if (if_restart && if_open_all_files) { - std::string* encoded_fieled = static_cast(arg); - *encoded_fieled = ""; - PutVarint64(encoded_fieled, 0); + std::string* encoded_field = static_cast(arg); + *encoded_field = ""; + PutVarint64(encoded_field, 0); } }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); options.env = env_; @@ -4391,7 +4816,6 @@ TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) { ttl_compactions++; } }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Add two L6 files with key ranges: [1 .. 100], [101 .. 200]. Random rnd(301); @@ -4518,11 +4942,12 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { if (if_restart && if_open_all_files) { - std::string* encoded_fieled = static_cast(arg); - *encoded_fieled = ""; - PutVarint64(encoded_fieled, 0); + std::string* encoded_field = static_cast(arg); + *encoded_field = ""; + PutVarint64(encoded_field, 0); } }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); options.env = env_; @@ -4539,7 +4964,6 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { periodic_compactions++; } }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Random rnd(301); for (int i = 0; i < kNumLevelFiles; ++i) { @@ -4596,6 +5020,106 @@ TEST_F(DBCompactionTest, LevelPeriodicCompaction) { } } +TEST_F(DBCompactionTest, LevelPeriodicCompactionOffpeak) { + // This test simply checks if offpeak adjustment works in Leveled + // Compactions. For testing offpeak periodic compactions in various + // scenarios, please refer to + // DBTestUniversalCompaction2::PeriodicCompactionOffpeak + constexpr int kNumKeysPerFile = 32; + constexpr int kNumLevelFiles = 2; + constexpr int kValueSize = 100; + constexpr int kSecondsPerDay = 86400; + constexpr int kSecondsPerHour = 3600; + constexpr int kSecondsPerMinute = 60; + + for (bool if_restart : {false, true}) { + SCOPED_TRACE("if_restart=" + std::to_string(if_restart)); + Options options = CurrentOptions(); + options.ttl = 0; + options.periodic_compaction_seconds = 5 * kSecondsPerDay; // 5 days + // In the case where all files are opened and doing DB restart + // forcing the file creation time in manifest file to be 0 to + // simulate the case of reading from an old version. 
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) { + if (if_restart) { + std::string* encoded_field = static_cast<std::string*>(arg); + *encoded_field = ""; + PutVarint64(encoded_field, 0); + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Just to add some extra random days to current time + Random rnd(test::RandomSeed()); + int days = rnd.Uniform(100); + + int periodic_compactions = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) { + Compaction* compaction = static_cast<Compaction*>(arg); + auto compaction_reason = compaction->compaction_reason(); + if (compaction_reason == CompactionReason::kPeriodicCompaction) { + periodic_compactions++; + } + }); + + // Starting at 12:15AM + int now_hour = 0; + int now_minute = 15; + auto mock_clock = std::make_shared(env_->GetSystemClock()); + auto mock_env = std::make_unique(env_, mock_clock); + options.env = mock_env.get(); + mock_clock->SetCurrentTime(days * kSecondsPerDay + + now_hour * kSecondsPerHour + + now_minute * kSecondsPerMinute); + // Offpeak is set from 12:30AM to 4:30AM + options.daily_offpeak_time_utc = "00:30-04:30"; + Reopen(options); + + for (int i = 0; i < kNumLevelFiles; ++i) { + for (int j = 0; j < kNumKeysPerFile; ++j) { + ASSERT_OK( + Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("2", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + + // Move clock forward by 1 hour. Now at 1:15AM Day 0. No compaction. + mock_clock->MockSleepForSeconds(1 * kSecondsPerHour); + ASSERT_OK(Put("a", "1")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + // Assert that the files stay in the same level + ASSERT_EQ("3", FilesPerLevel()); + ASSERT_EQ(0, periodic_compactions); + MoveFilesToLevel(1); + ASSERT_EQ("0,3", FilesPerLevel()); + + // Move clock forward by 4 days and check if it triggers periodic + // compaction at 1:15AM Day 4. Files created on Day 0 at 12:15AM are + // expected to expire before the offpeak starts next day at 12:30AM. + mock_clock->MockSleepForSeconds(4 * kSecondsPerDay); + ASSERT_OK(Put("b", "2")); + if (if_restart) { + Reopen(options); + } else { + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ("1,3", FilesPerLevel()); + // The two old files go through the periodic compaction process + ASSERT_EQ(2, periodic_compactions); + + Destroy(options); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } +} + TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) { // This test makes sure that periodic compactions are working with a DB // where file_creation_time of some files is 0. @@ -5006,6 +5530,12 @@ TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) { } auto manual_compaction_thread = port::Thread([this]() { + // Write something to make the current Memtable non-empty, so an extra + // immutable Memtable will be created upon manual flush requested by + // CompactRange, triggering a write stall mode to be entered because of + // accumulation of write buffers due to manual flush.
+ Random compact_rnd(301); + ASSERT_OK(Put(Key(0), compact_rnd.RandomString(1024))); CompactRangeOptions cro; cro.allow_write_stall = false; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); @@ -5072,7 +5602,11 @@ TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) { manual_compaction_thread.join(); TEST_SYNC_POINT( "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual"); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + if (i == 0) { + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } else { + ASSERT_NOK(dbfull()->TEST_WaitForCompact()); + } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } } @@ -5507,8 +6041,9 @@ TEST_F(DBCompactionTest, CompactionLimiter) { for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) { for (unsigned int cf = 0; cf < cf_count; cf++) { + // All L0s should overlap with each other for (int i = 0; i < kNumKeysPerFile; i++) { - ASSERT_OK(Put(cf, Key(keyIndex++), "")); + ASSERT_OK(Put(cf, Key(i), "")); } // put extra key to trigger flush ASSERT_OK(Put(cf, "", "")); @@ -5591,23 +6126,18 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) { options.use_direct_io_for_flush_and_compaction = GetParam(); options.env = MockEnv::Create(Env::Default()); Reopen(options); - bool readahead = false; SyncPoint::GetInstance()->SetCallBack( "CompactionJob::OpenCompactionOutputFile", [&](void* arg) { bool* use_direct_writes = static_cast(arg); ASSERT_EQ(*use_direct_writes, options.use_direct_io_for_flush_and_compaction); }); - if (options.use_direct_io_for_flush_and_compaction) { - SyncPoint::GetInstance()->SetCallBack( - "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; }); - } SyncPoint::GetInstance()->EnableProcessing(); CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); ASSERT_EQ("1,1,1", FilesPerLevel(1)); Compact(1, "p", "q"); - ASSERT_EQ(readahead, options.use_direct_reads); + ASSERT_EQ(false, options.use_direct_reads); ASSERT_EQ("0,0,1", FilesPerLevel(1)); Destroy(options); delete options.env; @@ -5814,7 +6344,7 @@ TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) { } TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2"); - ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(num_planned_subcompactions_verified); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -5889,7 +6419,7 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) { "CompactionJob::ReleaseSubcompactionResources:1"}}); SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0"); TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1"); @@ -5905,7 +6435,7 @@ TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) { total_low_pri_threads_ - 1, env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW)); TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4"); - ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(num_planned_subcompactions_verified); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -5977,11 +6507,11 @@ TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) { "BackgroundCallCompaction:0"}}); 
SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()})); TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0"); - ASSERT_OK(dbfull()->WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_TRUE(num_planned_subcompactions_verified); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); @@ -6736,10 +7266,8 @@ class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest { if (compaction_path_to_test == "FindIntraL0Compaction" || compaction_path_to_test == "CompactRange") { fifo_options.allow_compaction = true; - fifo_options.age_for_warm = 0; } else if (compaction_path_to_test == "CompactFile") { fifo_options.allow_compaction = false; - fifo_options.age_for_warm = 0; } options_.compaction_options_fifo = fifo_options; } @@ -7336,46 +7864,116 @@ TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam, Destroy(options_); } -TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { - constexpr int kSstNum = 10; +TEST_F(DBCompactionTest, SingleLevelUniveresal) { + // Tests that manual compaction works with single level universal compaction. Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleUniversal; options.disable_auto_compactions = true; + options.num_levels = 1; DestroyAndReopen(options); - // Generate some sst files on level 0 with sequence keys (no overlap) - for (int i = 0; i < kSstNum; i++) { - for (int j = 1; j < UCHAR_MAX; j++) { - auto key = std::string(kSstNum, '\0'); - key[kSstNum - i] += static_cast(j); - ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); + Random rnd(31); + for (int i = 0; i < 10; ++i) { + for (int j = 0; j < 50; ++j) { + ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(50))); } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(NumTableFilesAtLevel(0), 10); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); +} + +TEST_F(DBCompactionTest, SingleOverlappingNonL0BottommostManualCompaction) { + // Tests that manual compact will rewrite bottommost level + // when there is only a single non-L0 level that overlaps with + // manual compaction range. 
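A minimal caller-side sketch of the knob these manual-compaction tests exercise, CompactRangeOptions::bottommost_level_compaction; the wrapper function name is hypothetical and only the option usage is taken from the tests:

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Ask CompactRange() to rewrite the bottommost level even when no file
// strictly needs to move; kSkip would instead leave trivially-moved files
// untouched.
ROCKSDB_NAMESPACE::Status ForceRewriteBottommost(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::CompactRangeOptions cro;
  cro.bottommost_level_compaction =
      ROCKSDB_NAMESPACE::BottommostLevelCompaction::kForceOptimized;
  // nullptr begin/end compacts the whole key range.
  return db->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr);
}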
+ constexpr int kSstNum = 10; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + for (auto b : {BottommostLevelCompaction::kForce, + BottommostLevelCompaction::kForceOptimized}) { + DestroyAndReopen(options); + + // Generate some sst files on level 0 with sequence keys (no overlap) + for (int i = 0; i < kSstNum; i++) { + for (int j = 1; j < UCHAR_MAX; j++) { + auto key = std::string(kSstNum, '\0'); + key[kSstNum - i] += static_cast(j); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); + } + ASSERT_OK(Flush()); + } + MoveFilesToLevel(4); + ASSERT_EQ(NumTableFilesAtLevel(4), kSstNum); + CompactRangeOptions cro; + cro.bottommost_level_compaction = b; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(4), 1); + } +} + +TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) { + constexpr int kSstNum = 10; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = 7; + const bool dynamic_level = std::get<1>(GetParam()); + options.level_compaction_dynamic_level_bytes = dynamic_level; + DestroyAndReopen(options); + + // Generate some sst files on level 0 with sequence keys (no overlap) + for (int i = 0; i < kSstNum; i++) { + for (int j = 1; j < UCHAR_MAX; j++) { + auto key = std::string(kSstNum, '\0'); + key[kSstNum - i] += static_cast(j); + ASSERT_OK(Put(key, std::string(i % 1000, 'A'))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0)); auto cro = CompactRangeOptions(); cro.bottommost_level_compaction = bottommost_level_compaction_; + bool trivial_moved = false; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:TrivialMove", + [&](void* /*arg*/) { trivial_moved = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // All bottommost_level_compaction options should allow l0 -> l1 trivial move. 
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_TRUE(trivial_moved); if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce || bottommost_level_compaction_ == BottommostLevelCompaction::kForceOptimized) { - // Real compaction to compact all sst files from level 0 to 1 file on level - // 1 - ASSERT_EQ("0,1", FilesPerLevel(0)); + // bottommost level should go through intra-level compaction + // and has only 1 file + if (dynamic_level) { + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel(0)); + } else { + ASSERT_EQ("0,1", FilesPerLevel(0)); + } } else { - // Just trivial move from level 0 -> 1 - ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0)); + // Just trivial move from level 0 -> 1/base + if (dynamic_level) { + ASSERT_EQ("0,0,0,0,0,0," + std::to_string(kSstNum), FilesPerLevel(0)); + } else { + ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0)); + } } } INSTANTIATE_TEST_CASE_P( DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam, - ::testing::Values(BottommostLevelCompaction::kSkip, - BottommostLevelCompaction::kIfHaveCompactionFilter, - BottommostLevelCompaction::kForce, - BottommostLevelCompaction::kForceOptimized)); + ::testing::Combine( + ::testing::Values(BottommostLevelCompaction::kSkip, + BottommostLevelCompaction::kIfHaveCompactionFilter, + BottommostLevelCompaction::kForce, + BottommostLevelCompaction::kForceOptimized), + ::testing::Bool())); TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) { Options options = CurrentOptions(); @@ -7668,26 +8266,14 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { auto start_idx = key_idx; GenerateNewFile(&rnd, &key_idx); GenerateNewFile(&rnd, &key_idx); - auto end_idx = key_idx - 1; ASSERT_EQ("1,1,2", FilesPerLevel(0)); - // Next two CompactRange() calls are used to test exercise error paths within - // RefitLevel() before triggering a valid RefitLevel() call - - // Trigger a refit to L1 first - { - std::string begin_string = Key(start_idx); - std::string end_string = Key(end_idx); - Slice begin(begin_string); - Slice end(end_string); - - CompactRangeOptions cro; - cro.change_level = true; - cro.target_level = 1; - ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end)); - } - ASSERT_EQ("0,3,2", FilesPerLevel(0)); + MoveFilesToLevel(1); + ASSERT_EQ("0,2,2", FilesPerLevel(0)); + // The next CompactRange() call is used to test exercise error paths within + // RefitLevel() before triggering a valid RefitLevel() call + // // Try a refit from L2->L1 - this should fail and exercise error paths in // RefitLevel() { @@ -7702,7 +8288,7 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { cro.target_level = 1; ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end)); } - ASSERT_EQ("0,3,2", FilesPerLevel(0)); + ASSERT_EQ("0,2,2", FilesPerLevel(0)); // Try a valid Refit request to ensure, the path is still working { @@ -7715,10 +8301,8 @@ TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) { } TEST_F(DBCompactionTest, CompactionWithBlob) { - Options options; - options.env = env_; + Options options = CurrentOptions(); options.disable_auto_compactions = true; - Reopen(options); constexpr char first_key[] = "first_key"; @@ -7810,10 +8394,8 @@ INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError, "BlobFileBuilder::WriteBlobToFile:AppendFooter"})); TEST_P(DBCompactionTestBlobError, CompactionError) { - Options options; + Options options = CurrentOptions(); options.disable_auto_compactions = true; - options.env = env_; - Reopen(options); constexpr char first_key[] = 
"first_key"; @@ -7979,8 +8561,7 @@ TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) { } TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) { - Options options; - options.env = env_; + Options options = CurrentOptions(); options.disable_auto_compactions = true; options.enable_blob_files = true; options.blob_file_size = 32; // one blob per file @@ -8519,7 +9100,7 @@ TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) { Destroy(options); } -TEST_F(DBCompactionTest, FIFOWarm) { +TEST_F(DBCompactionTest, FIFOChangeTemperature) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleFIFO; options.num_levels = 1; @@ -8527,18 +9108,18 @@ TEST_F(DBCompactionTest, FIFOWarm) { options.level0_file_num_compaction_trigger = 2; options.create_if_missing = true; CompactionOptionsFIFO fifo_options; - fifo_options.age_for_warm = 1000; + fifo_options.file_temperature_age_thresholds = {{Temperature::kCold, 1000}}; fifo_options.max_table_files_size = 100000000; options.compaction_options_fifo = fifo_options; env_->SetMockSleep(); Reopen(options); - int total_warm = 0; + int total_cold = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "NewWritableFile::FileOptions.temperature", [&](void* arg) { Temperature temperature = *(static_cast(arg)); - if (temperature == Temperature::kWarm) { - total_warm++; + if (temperature == Temperature::kCold) { + total_cold++; } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -8575,9 +9156,9 @@ TEST_F(DBCompactionTest, FIFOWarm) { ASSERT_EQ(4, metadata.file_count); ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature); - ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature); - ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature); - ASSERT_EQ(2, total_warm); + ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[2].temperature); + ASSERT_EQ(Temperature::kCold, metadata.levels[0].files[3].temperature); + ASSERT_EQ(2, total_cold); Destroy(options); } @@ -8642,7 +9223,7 @@ TEST_F(DBCompactionTest, DisableMultiManualCompaction) { sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) { @@ -8774,7 +9355,7 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) { sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); } @@ -9109,18 +9690,641 @@ TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) { // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */)); } -#endif // !defined(ROCKSDB_LITE) +TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytes) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.allow_ingest_behind = false; + options.level_compaction_dynamic_level_bytes = false; + options.num_levels = 6; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 1 << 20; + options.max_bytes_for_level_multiplier = 10; + DestroyAndReopen(options); + + // put files in L0, L1 and L2 + WriteOptions write_opts; + ASSERT_OK(db_->Put(write_opts, Key(1), "val1")); + Random rnd(33); + // Fill L2 with size larger than max_bytes_for_level_base, + // so the level 
above it won't be drained. + for (int i = 2; i <= (1 << 10); ++i) { + ASSERT_OK(db_->Put(write_opts, Key(i), rnd.RandomString(2 << 10))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_OK(db_->Put(write_opts, Key(2), "val2")); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_OK(db_->Put(write_opts, Key(1), "new_val1")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_OK(db_->Put(write_opts, Key(3), "val3")); + ASSERT_OK(Flush()); + ASSERT_EQ("1,1,2", FilesPerLevel()); + auto verify_db = [&]() { + ASSERT_EQ(Get(Key(1)), "new_val1"); + ASSERT_EQ(Get(Key(2)), "val2"); + ASSERT_EQ(Get(Key(3)), "val3"); + }; + verify_db(); + + options.level_compaction_dynamic_level_bytes = true; + Reopen(options); + // except for L0, files should be pushed down as much as possible + ASSERT_EQ("1,0,0,0,1,2", FilesPerLevel()); + verify_db(); + + // turning the options on and off should be safe + options.level_compaction_dynamic_level_bytes = false; + Reopen(options); + MoveFilesToLevel(1); + ASSERT_EQ("0,1,0,0,1,2", FilesPerLevel()); + verify_db(); + + // newly flushed file is also pushed down + options.level_compaction_dynamic_level_bytes = true; + Reopen(options); + // Files in L1 should be trivially moved down during DB opening. + // The file should be moved to L3, and then may be drained and compacted to + // L4. So we just check L1 and L2 here. + ASSERT_EQ(0, NumTableFilesAtLevel(1)); + ASSERT_EQ(0, NumTableFilesAtLevel(2)); + verify_db(); +} + +TEST_F(DBCompactionTest, TurnOnLevelCompactionDynamicLevelBytesUCToLC) { + // Basic test for migrating from UC to LC. + // DB has non-empty L1 that should be pushed down to last level (L49). + Options options = CurrentOptions(); + options.compaction_style = CompactionStyle::kCompactionStyleUniversal; + options.allow_ingest_behind = false; + options.level_compaction_dynamic_level_bytes = false; + options.num_levels = 50; + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(33); + for (int f = 0; f < 10; ++f) { + ASSERT_OK(Put(1, Key(f), rnd.RandomString(1000))); + ASSERT_OK(Flush(1)); + } + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 1; + ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); + ASSERT_EQ("0,1", FilesPerLevel(1)); + + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.level_compaction_dynamic_level_bytes = true; + ReopenWithColumnFamilies({"default", "pikachu"}, options); + std::string expected_lsm = ""; + for (int i = 0; i < 49; ++i) { + expected_lsm += "0,"; + } + expected_lsm += "1"; + ASSERT_EQ(expected_lsm, FilesPerLevel(1)); + + // Tests that entries for trial move in MANIFEST should be valid + ReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_EQ(expected_lsm, FilesPerLevel(1)); +} + +TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterMultiplierChanged) { + // When the level size multiplier increases such that fewer levels become + // necessary, unnecessary levels should to be drained. 
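The test below reconfigures the level size multiplier on a live DB through the string-based SetOptions() API; a minimal sketch of that call (the helper name is hypothetical, the value illustrative):

#include <string>
#include <unordered_map>
#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status WidenLevelMultiplier(ROCKSDB_NAMESPACE::DB* db) {
  // Mutable options are passed as text; RocksDB re-parses them, recomputes
  // per-level size targets, and background compactions then drain levels
  // that are no longer needed.
  std::unordered_map<std::string, std::string> changes = {
      {"max_bytes_for_level_multiplier", "10"}};
  return db->SetOptions(db->DefaultColumnFamily(), changes);
}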
+ const int kBaseLevelBytes = 256 << 10; // 256KB + const int kFileBytes = 64 << 10; // 64KB + const int kInitMultiplier = 2, kChangedMultiplier = 10; + const int kNumFiles = 32; + const int kNumLevels = 5; + const int kValueBytes = 1 << 10; // 1KB + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kInitMultiplier; + options.num_levels = kNumLevels; + Reopen(options); + + // Initially we setup the LSM to look roughly as follows: + // + // L0: empty + // L1: 256KB + // ... + // L4: 1MB + Random rnd(301); + for (int file = 0; file < kNumFiles; ++file) { + for (int i = 0; i < kFileBytes / kValueBytes; ++i) { + ASSERT_OK(Put(Key(file * kFileBytes / kValueBytes + i), + rnd.RandomString(kValueBytes))); + } + ASSERT_OK(Flush()); + } + + int init_num_nonempty = 0; + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++init_num_nonempty; + } + } + + // After increasing the multiplier and running compaction fewer levels are + // needed to hold all the data. Unnecessary levels should be drained. + ASSERT_OK(db_->SetOptions({{"max_bytes_for_level_multiplier", + std::to_string(kChangedMultiplier)}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + int final_num_nonempty = 0; + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++final_num_nonempty; + } + } + ASSERT_GT(init_num_nonempty, final_num_nonempty); +} + +TEST_F(DBCompactionTest, DrainUnnecessaryLevelsAfterDBBecomesSmall) { + // When the DB size is smaller, e.g., large chunk of data deleted by + // DeleteRange(), unnecessary levels should to be drained. + const int kBaseLevelBytes = 256 << 10; // 256KB + const int kFileBytes = 64 << 10; // 64KB + const int kMultiplier = 2; + const int kNumFiles = 32; + const int kNumLevels = 5; + const int kValueBytes = 1 << 10; // 1KB + const int kDeleteFileNum = 8; + + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.level_compaction_dynamic_level_bytes = true; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kMultiplier; + options.num_levels = kNumLevels; + Reopen(options); + + // Initially we setup the LSM to look roughly as follows: + // + // L0: empty + // L1: 256KB + // ... + // L4: 1MB + Random rnd(301); + for (int file = 0; file < kNumFiles; ++file) { + for (int i = 0; i < kFileBytes / kValueBytes; ++i) { + ASSERT_OK(Put(Key(file * kFileBytes / kValueBytes + i), + rnd.RandomString(kValueBytes))); + } + ASSERT_OK(Flush()); + if (file == kDeleteFileNum) { + // Ensure the DeleteRange() call below only delete data from last level + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(NumTableFilesAtLevel(kNumLevels - 1), kDeleteFileNum + 1); + } + } + + int init_num_nonempty = 0; + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++init_num_nonempty; + } + } + + // Disable auto compaction CompactRange() below + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}})); + // Delete keys within first (kDeleteFileNum + 1) files' key ranges. + // This should reduce DB size enough such that there is now + // an unneeded level. 
+ std::string begin = Key(0); + std::string end = Key(kDeleteFileNum * kFileBytes / kValueBytes); + ASSERT_OK( + db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), begin, end)); + Slice begin_slice = begin; + Slice end_slice = end; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_slice, &end_slice)); + int after_delete_range_nonempty = 0; + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++after_delete_range_nonempty; + } + } + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + int final_num_nonempty = 0; + for (int level = 1; level < kNumLevels; ++level) { + if (NumTableFilesAtLevel(level) > 0) { + ++final_num_nonempty; + } + } + ASSERT_GE(init_num_nonempty, after_delete_range_nonempty); + ASSERT_GT(after_delete_range_nonempty, final_num_nonempty); +} + +TEST_F(DBCompactionTest, ManualCompactionCompactAllKeysInRange) { + // CompactRange() used to pre-compute target level to compact to + // before running compactions. However, the files at target level + // could be trivially moved down by some background compaction. This means + // some keys in the manual compaction key range may not be compacted + // during the manual compaction. This unit test tests this scenario. + // A fix has been applied for this scenario to always compact + // to the bottommost level. + const int kBaseLevelBytes = 8 << 20; // 8MB + const int kMultiplier = 2; + Options options = CurrentOptions(); + options.num_levels = 7; + options.level_compaction_dynamic_level_bytes = false; + options.compaction_style = kCompactionStyleLevel; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kMultiplier; + options.compression = kNoCompression; + options.target_file_size_base = 2 * kBaseLevelBytes; + + DestroyAndReopen(options); + Random rnd(301); + // Populate L2 so that manual compaction will compact to at least L2. + // Otherwise, there is still a possibility of race condition where + // the manual compaction thread believes that max non-empty level is L1 + // while there is some auto compaction that moves some files from L1 to L2. + ASSERT_OK(db_->Put(WriteOptions(), Key(1000), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + // one file in L1: [Key(5), Key(6)] + ASSERT_OK( + db_->Put(WriteOptions(), Key(5), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK( + db_->Put(WriteOptions(), Key(6), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + ASSERT_OK( + db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 2))); + // We now do manual compaction for key range [Key(1), Key(6)]. + // First it compacts file [Key(1)] to L1. + // L1 will have two files [Key(1)], and [Key(5), Key(6)]. + // After L0 -> L1 manual compaction, an automatic compaction will trivially + // move both files from L1 to L2. Here the dependency makes manual compaction + // wait for auto-compaction to pick a compaction before proceeding. Manual + // compaction should not stop at L1 and keep compacting L2. With kForce + // specified, expected output is that manual compaction compacts to L2 and L2 + // will contain 2 files: one for Key(1000) and one for Key(1), Key(5) and + // Key(6). 
+ SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction():AfterPickCompaction", + "DBImpl::RunManualCompaction()::1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + std::string begin_str = Key(1); + std::string end_str = Key(6); + Slice begin_slice = begin_str; + Slice end_slice = end_str; + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + + ASSERT_EQ(NumTableFilesAtLevel(2), 2); +} + +TEST_F(DBCompactionTest, + ManualCompactionCompactAllKeysInRangeDynamicLevelBytes) { + // Similar to the test above (ManualCompactionCompactAllKeysInRange), but with + // level_compaction_dynamic_level_bytes = true. + const int kBaseLevelBytes = 8 << 20; // 8MB + const int kMultiplier = 2; + Options options = CurrentOptions(); + options.num_levels = 7; + options.level_compaction_dynamic_level_bytes = true; + options.compaction_style = kCompactionStyleLevel; + options.max_bytes_for_level_base = kBaseLevelBytes; + options.max_bytes_for_level_multiplier = kMultiplier; + options.compression = kNoCompression; + options.target_file_size_base = 2 * kBaseLevelBytes; + DestroyAndReopen(options); + + Random rnd(301); + ASSERT_OK(db_->Put(WriteOptions(), Key(5), + rnd.RandomString(3 * kBaseLevelBytes / 2))); + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + // L6 now has one file with size ~ 3/2 * kBaseLevelBytes. + // L5 is the new base level, with target size ~ 3/4 * kBaseLevelBytes. + + ASSERT_OK( + db_->Put(WriteOptions(), Key(3), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK( + db_->Put(WriteOptions(), Key(4), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK(Flush()); + + MoveFilesToLevel(5); + ASSERT_EQ(1, NumTableFilesAtLevel(5)); + // L5 now has one file with size ~ 2/3 * kBaseLevelBytes, which is below its + // target size. + + ASSERT_OK( + db_->Put(WriteOptions(), Key(1), rnd.RandomString(kBaseLevelBytes / 3))); + ASSERT_OK( + db_->Put(WriteOptions(), Key(2), rnd.RandomString(kBaseLevelBytes / 3))); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::BackgroundCompaction():AfterPickCompaction", + "DBImpl::RunManualCompaction()::1"}}); + SyncPoint::GetInstance()->EnableProcessing(); + // After compacting the file with [Key(1), Key(2)] to L5, + // L5 has size ~ 4/3 * kBaseLevelBytes > its target size. + // We let manual compaction wait for an auto-compaction to pick + // a compaction before proceeding. The auto-compaction would + // trivially move both files in L5 down to L6. If manual compaction + // works correctly with kForce specified, it should rewrite the two files in + // L6 into a single file. + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + std::string begin_str = Key(1); + std::string end_str = Key(4); + Slice begin_slice = begin_str; + Slice end_slice = end_str; + ASSERT_OK(db_->CompactRange(cro, &begin_slice, &end_slice)); + ASSERT_EQ(2, NumTableFilesAtLevel(6)); + ASSERT_EQ(0, NumTableFilesAtLevel(5)); +} + +TEST_F(DBCompactionTest, NumberOfSubcompactions) { + // Tests that expected number of subcompactions are created. 
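The next test observes subcompactions through an EventListener; on the configuration side the relevant knobs are sketched below (hypothetical helper, illustrative values):

#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Options MakeSubcompactionFriendlyOptions() {
  ROCKSDB_NAMESPACE::Options options;
  // Allow a single compaction job to be split across up to 4 subcompactions.
  options.max_subcompactions = 4;
  // The number actually used also depends on how much data the job covers
  // relative to the output file size, so keep target files small here.
  options.target_file_size_base = 100 << 10;  // 100 KB
  // A listener registered via options.listeners (e.g. one overriding
  // EventListener::OnSubcompactionCompleted, like the class defined below)
  // can then count how many subcompactions actually ran.
  return options;
}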
+ class SubCompactionEventListener : public EventListener { + public: + void OnSubcompactionCompleted(const SubcompactionJobInfo&) override { + sub_compaction_finished_++; + } + void OnCompactionCompleted(DB*, const CompactionJobInfo&) override { + compaction_finished_++; + } + std::atomic sub_compaction_finished_{0}; + std::atomic compaction_finished_{0}; + }; + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.compression = kNoCompression; + const int kFileSize = 100 << 10; // 100KB + options.target_file_size_base = kFileSize; + const int kLevel0CompactTrigger = 2; + options.level0_file_num_compaction_trigger = kLevel0CompactTrigger; + Destroy(options); + Random rnd(301); + + // Exposing internal implementation detail here where the + // number of subcompactions depends on the size of data + // being compacted. In particular, to enable x subcompactions, + // we need to compact at least x * target file size amount + // of data. + // + // Will write two files below to avoid trivial move. + // Size written in total: 500 * 1000 * 2 ~ 10MB ~ 100 * target file size. + const int kValueSize = 500; + const int kNumKeyPerFile = 1000; + for (int i = 1; i <= 8; ++i) { + options.max_subcompactions = i; + SubCompactionEventListener* listener = new SubCompactionEventListener(); + options.listeners.clear(); + options.listeners.emplace_back(listener); + ASSERT_OK(TryReopen(options)); + + for (int file = 0; file < kLevel0CompactTrigger; ++file) { + for (int key = file; key < 2 * kNumKeyPerFile; key += 2) { + ASSERT_OK(Put(Key(key), rnd.RandomString(kValueSize))); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(listener->compaction_finished_, 1); + EXPECT_EQ(listener->sub_compaction_finished_, i); + Destroy(options); + } +} + +TEST_F(DBCompactionTest, VerifyRecordCount) { + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 3; + options.compaction_verify_record_count = true; + DestroyAndReopen(options); + Random rnd(301); + + // Create 2 overlapping L0 files + for (int i = 1; i < 20; i += 2) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + for (int i = 0; i < 20; i += 2) { + ASSERT_OK(Put(Key(i), rnd.RandomString(100))); + } + ASSERT_OK(Flush()); + + // Only iterator through 10 keys and force compaction to finish. + int num_iter = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::ProcessKeyValueCompaction()::stop", [&](void* stop_ptr) { + num_iter++; + if (num_iter == 10) { + *(bool*)stop_ptr = true; + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_TRUE(s.IsCorruption()); + const char* expect = + "Compaction number of input keys does not match number of keys " + "processed."; + ASSERT_TRUE(std::strstr(s.getState(), expect)); +} + +TEST_F(DBCompactionTest, ErrorWhenReadFileHead) { + // This is to test a bug that is fixed in + // https://github.com/facebook/rocksdb/pull/11782. + // + // Ingest error when reading from a file with offset = 0, + // See if compaction handles it correctly. 
+ Options opts = CurrentOptions(); + opts.num_levels = 7; + opts.compression = kNoCompression; + DestroyAndReopen(opts); + + // Set up LSM + // L5: F1 [key0, key99], F2 [key100, key199] + // L6: F3 [key50, key149] + Random rnd(301); + const int kValLen = 100; + for (int error_file = 1; error_file <= 3; ++error_file) { + for (int i = 50; i < 150; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValLen))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + std::vector values; + for (int i = 0; i < 100; ++i) { + values.emplace_back(rnd.RandomString(kValLen)); + ASSERT_OK(Put(Key(i), values.back())); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + + for (int i = 100; i < 200; ++i) { + values.emplace_back(rnd.RandomString(kValLen)); + ASSERT_OK(Put(Key(i), values.back())); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + + ASSERT_EQ(2, NumTableFilesAtLevel(5)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + std::atomic_int count = 0; + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::Read::BeforeReturn", + [&count, &error_file](void* pair_ptr) { + auto p = + reinterpret_cast*>(pair_ptr); + int cur = ++count; + if (cur == error_file) { + IOStatus* io_s = p->second; + *io_s = IOStatus::IOError(); + io_s->SetRetryable(true); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + // Failed compaction should not lose data. + PinnableSlice slice; + for (int i = 0; i < 200; ++i) { + ASSERT_OK(Get(Key(i), &slice)); + ASSERT_EQ(slice, values[i]); + } + ASSERT_NOK(s); + ASSERT_TRUE(s.IsIOError()); + s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ASSERT_OK(s); + for (int i = 0; i < 200; ++i) { + ASSERT_OK(Get(Key(i), &slice)); + ASSERT_EQ(slice, values[i]); + } + SyncPoint::GetInstance()->DisableProcessing(); + DestroyAndReopen(opts); + } +} + +TEST_F(DBCompactionTest, ReleaseCompactionDuringManifestWrite) { + // Tests the fix for issue #10257. + // Compactions are released in LogAndApply() so that picking a compaction + // from the new Version won't see these compactions as registered. + Options options = CurrentOptions(); + options.compaction_style = kCompactionStyleLevel; + // Make sure we can run multiple compactions at the same time. + env_->SetBackgroundThreads(3, Env::Priority::LOW); + env_->SetBackgroundThreads(3, Env::Priority::BOTTOM); + options.max_background_compactions = 3; + options.num_levels = 4; + DestroyAndReopen(options); + Random rnd(301); + + // Construct the following LSM + // L2: [K1-K2] [K10-K11] [k100-k101] + // L3: [K1] [K10] [k100] + // We will have 3 threads to run 3 manual compactions. + // The first thread that writes to MANIFEST will not finish + // until the next two threads enters LogAndApply() and form + // a write group. + // We check that compactions are all released after the first + // thread from the write group finishes writing to MANIFEST. 
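The thread ordering described above is built from RocksDB's test-only SyncPoint facility, used throughout this patch. A minimal sketch of the pattern; the point names here are made up purely for illustration:

#include "test_util/sync_point.h"

void IllustrateSyncPointOrdering() {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  // "PointA happens-before PointB": a thread reaching PointB blocks until
  // some thread has passed PointA.
  sp->LoadDependency({{"Illustration:PointA", "Illustration:PointB"}});
  // Callbacks fire whenever the named point is hit while processing is on.
  sp->SetCallBack("Illustration:PointB", [](void* /*arg*/) { /* observe */ });
  sp->EnableProcessing();
  // Code under test marks the points with TEST_SYNC_POINT(...).
  TEST_SYNC_POINT("Illustration:PointA");
  TEST_SYNC_POINT("Illustration:PointB");
  sp->DisableProcessing();
  sp->ClearAllCallBacks();
}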
+ + // L3 + ASSERT_OK(Put(Key(1), rnd.RandomString(20))); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Put(Key(10), rnd.RandomString(20))); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_OK(Put(Key(100), rnd.RandomString(20))); + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + // L2 + ASSERT_OK(Put(Key(100), rnd.RandomString(20))); + ASSERT_OK(Put(Key(101), rnd.RandomString(20))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_OK(Put(Key(1), rnd.RandomString(20))); + ASSERT_OK(Put(Key(2), rnd.RandomString(20))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + ASSERT_OK(Put(Key(10), rnd.RandomString(20))); + ASSERT_OK(Put(Key(11), rnd.RandomString(20))); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 3); + ASSERT_EQ(NumTableFilesAtLevel(3), 3); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + std::atomic_int count = 0; + SyncPoint::GetInstance()->SetCallBack( + "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void*) { + int c = count.fetch_add(1); + if (c == 2) { + TEST_SYNC_POINT("all threads to enter LogAndApply"); + } + }); + SyncPoint::GetInstance()->LoadDependency( + {{"all threads to enter LogAndApply", + "VersionSet::LogAndApply:WriteManifestStart"}}); + // Verify that compactions are released after writing to MANIFEST + std::atomic_int after_compact_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* ptr) { + int c = after_compact_count.fetch_add(1); + if (c > 0) { + ColumnFamilyData* cfd = (ColumnFamilyData*)(ptr); + ASSERT_TRUE( + cfd->compaction_picker()->compactions_in_progress()->empty()); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + std::vector threads; + threads.emplace_back(std::thread([&]() { + std::string k1_str = Key(1); + std::string k2_str = Key(2); + Slice k1 = k1_str; + Slice k2 = k2_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &k1, &k2)); + })); + threads.emplace_back(std::thread([&]() { + std::string k10_str = Key(10); + std::string k11_str = Key(11); + Slice k10 = k10_str; + Slice k11 = k11_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &k10, &k11)); + })); + std::string k100_str = Key(100); + std::string k101_str = Key(101); + Slice k100 = k100_str; + Slice k101 = k101_str; + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &k100, &k101)); + + for (auto& thread : threads) { + thread.join(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { -#if !defined(ROCKSDB_LITE) ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - (void)argc; - (void)argv; - return 0; -#endif } diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index 17fa67cb2001..a1c2fa943a3c 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -10,7 +10,6 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" #include "port/port.h" @@ -492,16 +491,9 @@ TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) { } } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !defined(ROCKSDB_LITE) ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - (void)argc; - (void)argv; - return 0; -#endif } diff --git a/db/db_encryption_test.cc b/db/db_encryption_test.cc index 73e89d158bd4..fc8be5b6948f 100644 --- a/db/db_encryption_test.cc +++ b/db/db_encryption_test.cc @@ -6,9 +6,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" #include "rocksdb/perf_context.h" -#if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" -#endif #include #include @@ -27,7 +25,6 @@ class DBEncryptionTest : public DBTestBase { } }; -#ifndef ROCKSDB_LITE TEST_F(DBEncryptionTest, CheckEncrypted) { ASSERT_OK(Put("foo567", "v1.fetdq")); @@ -119,7 +116,6 @@ TEST_F(DBEncryptionTest, ReadEmptyFile) { ASSERT_TRUE(data.empty()); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index aa9bd738a51e..40e7ac15548f 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include #include @@ -30,38 +29,8 @@ namespace ROCKSDB_NAMESPACE { Status DBImpl::FlushForGetLiveFiles() { - mutex_.AssertHeld(); - - // flush all dirty data to disk. - Status status; - if (immutable_db_options_.atomic_flush) { - autovector cfds; - SelectColumnFamiliesForAtomicFlush(&cfds); - mutex_.Unlock(); - status = - AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles); - if (status.IsColumnFamilyDropped()) { - status = Status::OK(); - } - mutex_.Lock(); - } else { - for (auto cfd : versions_->GetRefedColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - mutex_.Unlock(); - status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); - TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); - mutex_.Lock(); - if (!status.ok() && !status.IsColumnFamilyDropped()) { - break; - } else if (status.IsColumnFamilyDropped()) { - status = Status::OK(); - } - } - } - return status; + return DBImpl::FlushAllColumnFamilies(FlushOptions(), + FlushReason::kGetLiveFiles); } Status DBImpl::GetLiveFiles(std::vector& ret, @@ -152,7 +121,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { // DisableFileDeletions / EnableFileDeletions not supported in read-only DB if (deletions_disabled.ok()) { - Status s2 = EnableFileDeletions(/*force*/ false); + Status s2 = EnableFileDeletions(/*force=*/false); assert(s2.ok()); s2.PermitUncheckedError(); } else { @@ -438,5 +407,3 @@ Status DBImpl::GetLiveFilesStorageInfo( } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc index 5804cd3de189..515d24f13d48 100644 --- a/db/db_flush_test.cc +++ b/db/db_flush_test.cc @@ -74,9 +74,7 @@ TEST_F(DBFlushTest, FlushWhileWritingManifest) { ASSERT_OK(dbfull()->Flush(no_wait)); // If the issue is hit we will wait here forever. ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); -#ifndef ROCKSDB_LITE ASSERT_EQ(2, TotalTableFiles()); -#endif // ROCKSDB_LITE } // Disable this test temporarily on Travis as it fails intermittently. 
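GetSortedWalFiles() in the db_filesnapshot.cc hunk above keeps obsolete files alive by pairing DisableFileDeletions() with EnableFileDeletions(). A minimal sketch of that guard pattern for callers (hypothetical helper, enumeration elided):

#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status EnumerateFilesSafely(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::Status s = db->DisableFileDeletions();
  if (!s.ok()) {
    return s;
  }
  // ... collect the file list here (e.g. GetLiveFiles / GetSortedWalFiles)
  // while obsolete files are guaranteed to remain on disk ...
  // force=false only re-enables deletions once every DisableFileDeletions()
  // call has been matched by an EnableFileDeletions() call.
  return db->EnableFileDeletions(/*force=*/false);
}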
@@ -106,9 +104,7 @@ TEST_F(DBFlushTest, SyncFail) { // Now the background job will do the flush; wait for it. // Returns the IO error happend during flush. ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); -#ifndef ROCKSDB_LITE ASSERT_EQ("", FilesPerLevel()); // flush failed. -#endif // ROCKSDB_LITE Destroy(options); } @@ -227,7 +223,7 @@ TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) { sleeping_task_low.WaitUntilDone(); ASSERT_EQ(0, num_flushes); - TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options)); ASSERT_OK(Put(0, "key3", DummyString(8192))); ASSERT_OK(Flush(0)); ASSERT_EQ(1, num_flushes); @@ -665,7 +661,6 @@ TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) { Close(); } -#ifndef ROCKSDB_LITE // This simple Listener can only handle one flush at a time. class TestFlushListener : public EventListener { public: @@ -745,10 +740,98 @@ class TestFlushListener : public EventListener { Env* env_; DBFlushTest* test_; }; -#endif // !ROCKSDB_LITE -// RocksDB lite does not support GetLiveFiles() -#ifndef ROCKSDB_LITE +TEST_F( + DBFlushTest, + FixUnrecoverableWriteDuringAtomicFlushWaitUntilFlushWouldNotStallWrites) { + Options options = CurrentOptions(); + options.atomic_flush = true; + + // To simulate a real-life crash where we can't flush during db's shutdown + options.avoid_flush_during_shutdown = true; + + // Set 3 low thresholds (while `disable_auto_compactions=false`) here so flush + // adding one more L0 file during `GetLiveFiles()` will have to wait till such + // flush will not stall writes + options.level0_stop_writes_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + // Disable level-0 compaction triggered by number of files to avoid + // stalling check being skipped (resulting in the flush mentioned above didn't + // wait) + options.level0_file_num_compaction_trigger = -1; + + CreateAndReopenWithCF({"cf1"}, options); + + // Manually pause compaction thread to ensure enough L0 files as + // `disable_auto_compactions=false`is needed, in order to meet the 3 low + // thresholds above + std::unique_ptr sleeping_task_; + sleeping_task_.reset(new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::LOW); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task_.get(), Env::Priority::LOW); + sleeping_task_->WaitUntilSleeping(); + + // Create some initial file to help meet the 3 low thresholds above + ASSERT_OK(Put(1, "dontcare", "dontcare")); + ASSERT_OK(Flush(1)); + + // Insert some initial data so we have something to atomic-flush later + // triggered by `GetLiveFiles()` + WriteOptions write_opts; + write_opts.disableWAL = true; + ASSERT_OK(Put(1, "k1", "v1", write_opts)); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({{ + "DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait", + "DBFlushTest::" + "UnrecoverableWriteInAtomicFlushWaitUntilFlushWouldNotStallWrites::Write", + }}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Write to db when atomic flush releases the lock to wait on write stall + // condition to be gone in `WaitUntilFlushWouldNotStallWrites()` + port::Thread write_thread([&] { + TEST_SYNC_POINT( + "DBFlushTest::" + "UnrecoverableWriteInAtomicFlushWaitUntilFlushWouldNotStallWrites::" + "Write"); + // Before the fix, the empty default CF would've been prematurely excluded + // from this atomic flush. 
The following two writes together make default CF + // later contain data that should've been included in the atomic flush. + ASSERT_OK(Put(0, "k2", "v2", write_opts)); + // The following write increases the max seqno of this atomic flush to be 3, + // which is greater than the seqno of default CF's data. This then violates + // the invariant that all entries of seqno less than the max seqno + // of this atomic flush should've been flushed by the time of this atomic + // flush finishes. + ASSERT_OK(Put(1, "k3", "v3", write_opts)); + + // Resume compaction threads and reduce L0 files so `GetLiveFiles()` can + // resume from the wait + sleeping_task_->WakeUp(); + sleeping_task_->WaitUntilDone(); + MoveFilesToLevel(1, 1); + }); + + // Trigger an atomic flush by `GetLiveFiles()` + std::vector files; + uint64_t manifest_file_size; + ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true)); + + write_thread.join(); + + ReopenWithColumnFamilies({"default", "cf1"}, options); + + ASSERT_EQ(Get(1, "k3"), "v3"); + // Prior to the fix, `Get()` will return `NotFound as "k2" entry in default CF + // can't be recovered from a crash right after the atomic flush finishes, + // resulting in a "recovery hole" as "k3" can be recovered. It's due to the + // invariant violation described above. + ASSERT_EQ(Get(0, "k2"), "v2"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) { Options options = CurrentOptions(); options.atomic_flush = true; @@ -803,7 +886,6 @@ TEST_F(DBFlushTest, FixFlushReasonRaceFromConcurrentFlushes) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // !ROCKSDB_LITE TEST_F(DBFlushTest, MemPurgeBasic) { Options options = CurrentOptions(); @@ -836,24 +918,16 @@ TEST_F(DBFlushTest, MemPurgeBasic) { // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). options.write_buffer_size = 1 << 20; -#ifndef ROCKSDB_LITE // Initially deactivate the MemPurge prototype. options.experimental_mempurge_threshold = 0.0; TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); -#else - // Activate directly the MemPurge prototype. - // (RocksDB lite does not support dynamic options) - options.experimental_mempurge_threshold = 1.0; -#endif // !ROCKSDB_LITE ASSERT_OK(TryReopen(options)); // RocksDB lite does not support dynamic options -#ifndef ROCKSDB_LITE // Dynamically activate the MemPurge prototype without restarting the DB. ColumnFamilyHandle* cfh = db_->DefaultColumnFamily(); ASSERT_OK(db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "1.0"}})); -#endif std::atomic mempurge_count{0}; std::atomic sst_count{0}; @@ -986,7 +1060,6 @@ TEST_F(DBFlushTest, MemPurgeBasic) { } // RocksDB lite does not support dynamic options -#ifndef ROCKSDB_LITE TEST_F(DBFlushTest, MemPurgeBasicToggle) { Options options = CurrentOptions(); @@ -1099,12 +1172,10 @@ TEST_F(DBFlushTest, MemPurgeBasicToggle) { Close(); } -// Closes the "#ifndef ROCKSDB_LITE" // End of MemPurgeBasicToggle, which is not // supported with RocksDB LITE because it // relies on dynamically changing the option // flag experimental_mempurge_threshold. -#endif // At the moment, MemPurge feature is deactivated // when atomic_flush is enabled. 
This is because the level @@ -1222,10 +1293,8 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { options.compression = kNoCompression; options.inplace_update_support = false; options.allow_concurrent_memtable_write = true; -#ifndef ROCKSDB_LITE TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); -#endif // !ROCKSDB_LITE // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes). options.write_buffer_size = 1 << 20; // Activate the MemPurge prototype. @@ -1309,6 +1378,7 @@ TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) { ASSERT_EQ(value, NOT_FOUND); count++; } + ASSERT_OK(iter->status()); // Expected count here is 3: KEY3, KEY4, KEY5. ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP); @@ -1423,10 +1493,8 @@ TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) { options.compression = kNoCompression; options.inplace_update_support = false; options.allow_concurrent_memtable_write = true; -#ifndef ROCKSDB_LITE TestFlushListener* listener = new TestFlushListener(options.env, this); options.listeners.emplace_back(listener); -#endif // !ROCKSDB_LITE // Create a ConditionalUpdate compaction filter // that will update all the values of the KV pairs // where the keys are "lower" than KEY4. @@ -1879,12 +1947,10 @@ TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) { ASSERT_OK(db_->ContinueBackgroundWork()); // We ingested the error to env, so the returned status is not OK. ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); -#ifndef ROCKSDB_LITE uint64_t num_bg_errors; ASSERT_TRUE( db_->GetIntProperty(DB::Properties::kBackgroundErrors, &num_bg_errors)); ASSERT_GT(num_bg_errors, 0); -#endif // ROCKSDB_LITE // In the bug scenario, triggering another flush would cause the second flush // to hang forever. After the fix we expect it to return an error. 
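With the LITE guards gone, TestFlushListener above is always compiled, and the MemPurge tests register it through Options::listeners. For readers unfamiliar with that hook, here is a minimal, self-contained sketch against the public listener API; CountingFlushListener and the /tmp path are illustrative and not part of this patch:

#include <atomic>
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/listener.h"
#include "rocksdb/options.h"

// Counts completed flushes via the public EventListener hook.
class CountingFlushListener : public rocksdb::EventListener {
 public:
  void OnFlushCompleted(rocksdb::DB* /*db*/,
                        const rocksdb::FlushJobInfo& /*info*/) override {
    flushes_.fetch_add(1, std::memory_order_relaxed);
  }
  int flushes() const { return flushes_.load(std::memory_order_relaxed); }

 private:
  std::atomic<int> flushes_{0};
};

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  auto listener = std::make_shared<CountingFlushListener>();
  options.listeners.emplace_back(listener);

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/listener_demo", &db);
  assert(s.ok());
  assert(db->Put(rocksdb::WriteOptions(), "k", "v").ok());
  assert(db->Flush(rocksdb::FlushOptions()).ok());  // fires OnFlushCompleted
  assert(listener->flushes() >= 1);
  delete db;
  return 0;
}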
@@ -1926,7 +1992,6 @@ TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) { SyncPoint::GetInstance()->DisableProcessing(); } -#ifndef ROCKSDB_LITE TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { class TestListener : public EventListener { public: @@ -2017,7 +2082,6 @@ TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // !ROCKSDB_LITE TEST_F(DBFlushTest, FlushWithBlob) { constexpr uint64_t min_blob_size = 10; @@ -2079,7 +2143,6 @@ TEST_F(DBFlushTest, FlushWithBlob) { ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); -#ifndef ROCKSDB_LITE const InternalStats* const internal_stats = cfd->internal_stats(); assert(internal_stats); @@ -2095,7 +2158,6 @@ TEST_F(DBFlushTest, FlushWithBlob) { ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], compaction_stats[0].bytes_written + compaction_stats[0].bytes_written_blob); -#endif // ROCKSDB_LITE } TEST_F(DBFlushTest, FlushWithChecksumHandoff1) { @@ -2409,7 +2471,6 @@ TEST_P(DBFlushTestBlobError, FlushError) { ASSERT_NE(type, kBlobFile); } -#ifndef ROCKSDB_LITE const InternalStats* const internal_stats = cfd->internal_stats(); assert(internal_stats); @@ -2433,10 +2494,8 @@ TEST_P(DBFlushTestBlobError, FlushError) { ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], compaction_stats[0].bytes_written + compaction_stats[0].bytes_written_blob); -#endif // ROCKSDB_LITE } -#ifndef ROCKSDB_LITE TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) { class SimpleTestFlushListener : public EventListener { public: @@ -2617,7 +2676,6 @@ TEST_P(DBAtomicFlushTest, ManualAtomicFlush) { ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty()); } } -#endif // ROCKSDB_LITE TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) { Options options = CurrentOptions(); @@ -3367,7 +3425,6 @@ TEST_F(DBFlushTest, AutoCompactionBeforeEnablingFlush) { options.num_levels = 3; options.level0_file_num_compaction_trigger = 3; - options.info_log = info_log_; CreateAndReopenWithCF({"pikachu"}, options); auto cfd = static_cast_with_check(handles_[1])->cfd(); @@ -3516,6 +3573,279 @@ INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest, INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool()); +TEST_F(DBFlushTest, NonAtomicFlushRollbackPendingFlushes) { + // Fix a bug in when atomic_flush=false. + // The bug can happen as follows: + // Start Flush0 for memtable M0 to SST0 + // Start Flush1 for memtable M1 to SST1 + // Flush1 returns OK, but don't install to MANIFEST and let whoever flushes + // M0 to take care of it + // Flush0 finishes with a retryable IOError + // - It rollbacks M0, (incorrectly) not M1 + // - Deletes SST1 and SST2 + // + // Auto-recovery will start Flush2 for M0, it does not pick up M1 since it + // thinks that M1 is flushed + // Flush2 writes SST3 and finishes OK, tries to install SST3 and SST2 + // Error opening SST2 since it's already deleted + // + // The fix is to let Flush0 also rollback M1. 
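This test and the three that follow it (AbortNonAtomicFlushWhenBGError, NonAtomicNormalFlushAbortWhenBGError, DBStuckAfterAtomicFlushError) are built almost entirely out of sync points: a SetCallBack on "FlushJob::WriteLevel0Table:s" overwrites the flush status with a retryable IOError, and LoadDependency pairs order the foreground writes against the background flushes. As a reminder of the primitive, a dependency pair {P, S} makes any thread reaching point S block until some thread has passed point P. A minimal sketch of the idiom, with made-up point names (sync points are compiled only into debug/test builds; TEST_SYNC_POINT is a no-op under NDEBUG):

#include "port/port.h"             // ROCKSDB_NAMESPACE::port::Thread
#include "test_util/sync_point.h"  // SyncPoint, TEST_SYNC_POINT

using ROCKSDB_NAMESPACE::SyncPoint;

void SyncPointOrderingDemo() {
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  // {predecessor, successor}: "Demo:AfterWrite" blocks until some thread
  // has passed "Demo:WriteDone".
  SyncPoint::GetInstance()->LoadDependency(
      {{"Demo:WriteDone", "Demo:AfterWrite"}});
  SyncPoint::GetInstance()->EnableProcessing();

  ROCKSDB_NAMESPACE::port::Thread writer([] {
    // ... the "background" half of the scenario would run here ...
    TEST_SYNC_POINT("Demo:WriteDone");
  });

  TEST_SYNC_POINT("Demo:AfterWrite");  // waits for the writer thread

  writer.join();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}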
+ Options opts = CurrentOptions(); + opts.atomic_flush = false; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + opts.max_write_buffer_number = 64; + opts.max_background_flushes = 4; + env_->SetBackgroundThreads(4, Env::HIGH); + DestroyAndReopen(opts); + std::atomic_int flush_count = 0; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) { + int c = flush_count.fetch_add(1); + if (c == 0) { + Status* s = (Status*)(s_ptr); + IOStatus io_error = IOStatus::IOError("injected foobar"); + io_error.SetRetryable(true); + *s = io_error; + TEST_SYNC_POINT("Let mem1 flush start"); + TEST_SYNC_POINT("Wait for mem1 flush to finish"); + } + }); + SyncPoint::GetInstance()->LoadDependency( + {{"Let mem1 flush start", "Mem1 flush starts"}, + {"DBImpl::BGWorkFlush:done", "Wait for mem1 flush to finish"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "Wait for error recover"}}); + // Need first flush to wait for the second flush to finish + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "val1")); + // trigger bg flush mem0 + ASSERT_OK(Put(Key(2), "val2")); + TEST_SYNC_POINT("Mem1 flush starts"); + // trigger bg flush mem1 + ASSERT_OK(Put(Key(3), "val3")); + + TEST_SYNC_POINT("Wait for error recover"); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBFlushTest, AbortNonAtomicFlushWhenBGError) { + // Fix a bug in when atomic_flush=false. + // The bug can happen as follows: + // Start Flush0 for memtable M0 to SST0 + // Start Flush1 for memtable M1 to SST1 + // Flush1 returns OK, but doesn't install output MANIFEST and let whoever + // flushes M0 to take care of it + // Start Flush2 for memtable M2 to SST2 + // Flush0 finishes with a retryable IOError + // - It rollbacks M0 AND M1 + // - Deletes SST1 and SST2 + // Flush2 finishes, does not rollback M2, + // - releases the pending file number that keeps SST2 alive + // - deletes SST2 + // + // Then auto-recovery starts, error opening SST2 when try to install + // flush result + // + // The fix is to let Flush2 rollback M2 if it finds that + // there is a background error. 
+ Options opts = CurrentOptions(); + opts.atomic_flush = false; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + opts.max_write_buffer_number = 64; + opts.max_background_flushes = 4; + env_->SetBackgroundThreads(4, Env::HIGH); + DestroyAndReopen(opts); + std::atomic_int flush_count = 0; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) { + int c = flush_count.fetch_add(1); + if (c == 0) { + Status* s = (Status*)(s_ptr); + IOStatus io_error = IOStatus::IOError("injected foobar"); + io_error.SetRetryable(true); + *s = io_error; + TEST_SYNC_POINT("Let mem1 flush start"); + TEST_SYNC_POINT("Wait for mem1 flush to finish"); + + TEST_SYNC_POINT("Let mem2 flush start"); + TEST_SYNC_POINT("Wait for mem2 to start writing table"); + } + }); + + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table", [&](void* mems) { + autovector* mems_ptr = (autovector*)mems; + if ((*mems_ptr)[0]->GetID() == 3) { + TEST_SYNC_POINT("Mem2 flush starts writing table"); + TEST_SYNC_POINT("Mem2 flush waits until rollback"); + } + }); + SyncPoint::GetInstance()->LoadDependency( + {{"Let mem1 flush start", "Mem1 flush starts"}, + {"DBImpl::BGWorkFlush:done", "Wait for mem1 flush to finish"}, + {"Let mem2 flush start", "Mem2 flush starts"}, + {"Mem2 flush starts writing table", + "Wait for mem2 to start writing table"}, + {"RollbackMemtableFlush", "Mem2 flush waits until rollback"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "Wait for error recover"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(Put(Key(1), "val1")); + // trigger bg flush mem0 + ASSERT_OK(Put(Key(2), "val2")); + TEST_SYNC_POINT("Mem1 flush starts"); + // trigger bg flush mem1 + ASSERT_OK(Put(Key(3), "val3")); + + TEST_SYNC_POINT("Mem2 flush starts"); + ASSERT_OK(Put(Key(4), "val4")); + + TEST_SYNC_POINT("Wait for error recover"); + // Recovery flush writes 3 memtables together into 1 file. + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBFlushTest, NonAtomicNormalFlushAbortWhenBGError) { + Options opts = CurrentOptions(); + opts.atomic_flush = false; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + opts.max_write_buffer_number = 64; + opts.max_background_flushes = 1; + env_->SetBackgroundThreads(2, Env::HIGH); + DestroyAndReopen(opts); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + std::atomic_int flush_write_table_count = 0; + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) { + int c = flush_write_table_count.fetch_add(1); + if (c == 0) { + Status* s = (Status*)(s_ptr); + IOStatus io_error = IOStatus::IOError("injected foobar"); + io_error.SetRetryable(true); + *s = io_error; + } + }); + + SyncPoint::GetInstance()->EnableProcessing(); + SyncPoint::GetInstance()->LoadDependency( + {{"Let error recovery start", + "RecoverFromRetryableBGIOError:BeforeStart"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "Wait for error recover"}}); + + ASSERT_OK(Put(Key(1), "val1")); + // trigger bg flush0 for mem0 + ASSERT_OK(Put(Key(2), "val2")); + // Not checking status since this wait can finish before flush starts. 
+ dbfull()->TEST_WaitForFlushMemTable().PermitUncheckedError(); + + // trigger bg flush1 for mem1, should see bg error and abort + // before picking a memtable to flush + ASSERT_OK(Put(Key(3), "val3")); + ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + TEST_SYNC_POINT("Let error recovery start"); + TEST_SYNC_POINT("Wait for error recover"); + // Recovery flush writes 2 memtables together into 1 file. + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + // 1 for flush 0 and 1 for recovery flush + ASSERT_EQ(2, flush_write_table_count); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_F(DBFlushTest, DBStuckAfterAtomicFlushError) { + // Test for a bug with atomic flush where DB can become stuck + // after a flush error. A repro timeline: + // + // Start Flush0 for mem0 + // Start Flush1 for mem1 + // Now Flush1 will wait for Flush0 to install mem0 + // Flush0 finishes with retryable IOError, rollbacks mem0 + // Resume starts and waits for background job to finish, i.e., Flush1 + // Fill memtable again, trigger Flush2 for mem0 + // Flush2 will get error status, and not rollback mem0, see code in + // https://github.com/facebook/rocksdb/blob/b927ba5936216861c2c35ab68f50ba4a78e65747/db/db_impl/db_impl_compaction_flush.cc#L725 + // + // DB is stuck since mem0 can never be picked now + // + // The fix is to rollback mem0 in Flush2, and let Flush1 also abort upon + // background error besides waiting for older memtables to be installed. + // The recovery flush in this case should pick up all memtables + // and write them to a single L0 file. + Options opts = CurrentOptions(); + opts.atomic_flush = true; + opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1)); + opts.max_write_buffer_number = 64; + opts.max_background_flushes = 4; + env_->SetBackgroundThreads(4, Env::HIGH); + DestroyAndReopen(opts); + + std::atomic_int flush_count = 0; + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->SetCallBack( + "FlushJob::WriteLevel0Table:s", [&](void* s_ptr) { + int c = flush_count.fetch_add(1); + if (c == 0) { + Status* s = (Status*)(s_ptr); + IOStatus io_error = IOStatus::IOError("injected foobar"); + io_error.SetRetryable(true); + *s = io_error; + TEST_SYNC_POINT("Let flush for mem1 start"); + // Wait for Flush1 to start waiting to install flush result + TEST_SYNC_POINT("Wait for flush for mem1"); + } + }); + SyncPoint::GetInstance()->LoadDependency( + {{"Let flush for mem1 start", "Flush for mem1"}, + {"DBImpl::AtomicFlushMemTablesToOutputFiles:WaitCV", + "Wait for flush for mem1"}, + {"RecoverFromRetryableBGIOError:BeforeStart", + "Wait for resume to start"}, + {"Recovery should continue here", + "RecoverFromRetryableBGIOError:BeforeStart2"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "Wait for error recover"}}); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Put(Key(1), "val1")); + // trigger Flush0 for mem0 + ASSERT_OK(Put(Key(2), "val2")); + + // trigger Flush1 for mem1 + TEST_SYNC_POINT("Flush for mem1"); + ASSERT_OK(Put(Key(3), "val3")); + + // Wait until resume started to schedule another flush + TEST_SYNC_POINT("Wait for resume to start"); + // This flush should not be scheduled due to bg error + ASSERT_OK(Put(Key(4), "val4")); + + // TEST_WaitForBackgroundWork() returns background error + // after all background work is done. 
+ ASSERT_NOK(dbfull()->TEST_WaitForBackgroundWork()); + // Flush should abort and not writing any table + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + + // Wait until this flush is done. + TEST_SYNC_POINT("Recovery should continue here"); + TEST_SYNC_POINT("Wait for error recover"); + // error recovery can schedule new flushes, but should not + // encounter error + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_impl/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc index f18ee0d72395..3b665ea26b3e 100644 --- a/db/db_impl/compacted_db_impl.cc +++ b/db/db_impl/compacted_db_impl.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/db_impl/compacted_db_impl.h" #include "db/db_impl/db_impl.h" @@ -44,16 +43,34 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, /*timestamp*/ nullptr); } -Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, - const Slice& key, PinnableSlice* value, - std::string* timestamp) { +Status CompactedDBImpl::Get(const ReadOptions& _read_options, + ColumnFamilyHandle*, const Slice& key, + PinnableSlice* value, std::string* timestamp) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + assert(user_comparator_); - if (options.timestamp) { - const Status s = FailIfTsMismatchCf( - DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true); + if (read_options.timestamp) { + Status s = + FailIfTsMismatchCf(DefaultColumnFamily(), *(read_options.timestamp)); if (!s.ok()) { return s; } + if (read_options.timestamp->size() > 0) { + s = FailIfReadCollapsedHistory(cfd_, cfd_->GetSuperVersion(), + *(read_options.timestamp)); + if (!s.ok()) { + return s; + } + } } else { const Status s = FailIfCfHasTs(DefaultColumnFamily()); if (!s.ok()) { @@ -70,7 +87,7 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, GetWithTimestampReadCallback read_cb(kMaxSequenceNumber); std::string* ts = user_comparator_->timestamp_size() > 0 ? 
timestamp : nullptr; - LookupKey lkey(key, kMaxSequenceNumber, options.timestamp); + LookupKey lkey(key, kMaxSequenceNumber, read_options.timestamp); GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, lkey.user_key(), value, /*columns=*/nullptr, ts, nullptr, nullptr, true, @@ -84,8 +101,8 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, /*b_has_ts=*/false) < 0) { return Status::NotFound(); } - Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context, - nullptr); + Status s = f.fd.table_reader->Get(read_options, lkey.internal_key(), + &get_context, nullptr); if (!s.ok() && !s.IsNotFound()) { return s; } @@ -102,18 +119,37 @@ std::vector CompactedDBImpl::MultiGet( } std::vector CompactedDBImpl::MultiGet( - const ReadOptions& options, const std::vector&, + const ReadOptions& _read_options, const std::vector&, const std::vector& keys, std::vector* values, std::vector* timestamps) { assert(user_comparator_); size_t num_keys = keys.size(); + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + return std::vector(num_keys, s); + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } - if (options.timestamp) { - Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp), - /*ts_for_read=*/true); + if (read_options.timestamp) { + Status s = + FailIfTsMismatchCf(DefaultColumnFamily(), *(read_options.timestamp)); if (!s.ok()) { return std::vector(num_keys, s); } + if (read_options.timestamp->size() > 0) { + s = FailIfReadCollapsedHistory(cfd_, cfd_->GetSuperVersion(), + *(read_options.timestamp)); + if (!s.ok()) { + return std::vector(num_keys, s); + } + } } else { Status s = FailIfCfHasTs(DefaultColumnFamily()); if (!s.ok()) { @@ -132,7 +168,7 @@ std::vector CompactedDBImpl::MultiGet( GetWithTimestampReadCallback read_cb(kMaxSequenceNumber); autovector reader_list; for (const auto& key : keys) { - LookupKey lkey(key, kMaxSequenceNumber, options.timestamp); + LookupKey lkey(key, kMaxSequenceNumber, read_options.timestamp); const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())]; if (user_comparator_->CompareWithoutTimestamp( key, /*a_has_ts=*/false, @@ -155,14 +191,15 @@ std::vector CompactedDBImpl::MultiGet( if (r != nullptr) { PinnableSlice pinnable_val; std::string& value = (*values)[idx]; - LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp); + LookupKey lkey(keys[idx], kMaxSequenceNumber, read_options.timestamp); std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr; GetContext get_context( user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound, lkey.user_key(), &pinnable_val, /*columns=*/nullptr, user_comparator_->timestamp_size() > 0 ? 
timestamp : nullptr, nullptr, nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb); - Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr); + Status s = + r->Get(read_options, lkey.internal_key(), &get_context, nullptr); assert(static_cast(idx) < statuses.size()); if (!s.ok() && !s.IsNotFound()) { statuses[idx] = s; @@ -254,4 +291,3 @@ Status CompactedDBImpl::Open(const Options& options, const std::string& dbname, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index eb458b85d4e2..e1c605e420b2 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -31,9 +30,9 @@ class CompactedDBImpl : public DBImpl { ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - std::string* timestamp) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) override; using DB::MultiGet; // Note that CompactedDBImpl::MultiGet is not the optimized version of @@ -44,7 +43,7 @@ class CompactedDBImpl : public DBImpl { const std::vector& keys, std::vector* values) override; - std::vector MultiGet(const ReadOptions& options, + std::vector MultiGet(const ReadOptions& _read_options, const std::vector&, const std::vector& keys, std::vector* values, @@ -119,26 +118,32 @@ class CompactedDBImpl : public DBImpl { const IngestExternalFileOptions& /*ingestion_options*/) override { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::CreateColumnFamilyWithImport; virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& /*options*/, const std::string& /*column_family_name*/, const ImportColumnFamilyOptions& /*import_options*/, - const ExportImportFilesMetaData& /*metadata*/, + const std::vector& /*metadatas*/, ColumnFamilyHandle** /*handle*/) override { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + // FIXME: some missing overrides for more "write" functions // Share with DBImplReadOnly? 
protected: -#ifndef ROCKSDB_LITE Status FlushForGetLiveFiles() override { // No-op for read-only DB return Status::OK(); } -#endif // !ROCKSDB_LITE private: friend class DB; @@ -151,4 +156,3 @@ class CompactedDBImpl : public DBImpl { LevelFilesBrief files_; }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 86c01367edbd..b74f3e59e477 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -13,7 +13,6 @@ #include #endif -#include #include #include #include @@ -28,6 +27,7 @@ #include "db/arena_wrapped_db_iter.h" #include "db/builder.h" #include "db/compaction/compaction_job.h" +#include "db/convenience_impl.h" #include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -75,7 +75,6 @@ #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" -#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -107,6 +106,7 @@ #include "util/mutexlock.h" #include "util/stop_watch.h" #include "util/string_util.h" +#include "util/udt_util.h" #include "utilities/trace/replayer_impl.h" namespace ROCKSDB_NAMESPACE { @@ -184,6 +184,7 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, batch_per_txn_(batch_per_txn), next_job_id_(1), shutting_down_(false), + reject_new_background_jobs_(false), db_lock_(nullptr), manual_compaction_paused_(false), bg_cv_(&mutex_), @@ -211,21 +212,16 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, immutable_db_options_.disable_delete_obsolete_files_on_open)), pending_purge_obsolete_files_(0), delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()), - last_stats_dump_time_microsec_(0), has_unpersisted_data_(false), unable_to_release_oldest_log_(false), num_running_ingest_file_(0), -#ifndef ROCKSDB_LITE wal_manager_(immutable_db_options_, file_options_, io_tracer_, seq_per_batch), -#endif // ROCKSDB_LITE bg_work_paused_(0), bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false), -#ifndef ROCKSDB_LITE periodic_task_scheduler_(), -#endif // ROCKSDB_LITE two_write_queues_(options.two_write_queues), manual_wal_flush_(options.manual_wal_flush), // last_sequencee_ is always maintained by the main queue that also writes @@ -249,7 +245,8 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, atomic_flush_install_cv_(&mutex_), blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_, &error_handler_, &event_logger_, - immutable_db_options_.listeners, dbname_) { + immutable_db_options_.listeners, dbname_), + lock_wal_count_(0) { // !batch_per_trx_ implies seq_per_batch_ because it is only unset for // WriteUnprepared, which should use seq_per_batch_. 
assert(batch_per_txn_ || seq_per_batch_); @@ -263,11 +260,13 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, co.capacity = table_cache_size; co.num_shard_bits = immutable_db_options_.table_cache_numshardbits; co.metadata_charge_policy = kDontChargeCacheMetadata; + // TODO: Consider a non-fixed seed once test fallout (prefetch_test) is + // dealt with + co.hash_seed = 0; table_cache_ = NewLRUCache(co); SetDbSessionId(); assert(!db_session_id_.empty()); -#ifndef ROCKSDB_LITE periodic_task_functions_.emplace(PeriodicTaskType::kDumpStats, [this]() { this->DumpStats(); }); periodic_task_functions_.emplace(PeriodicTaskType::kPersistStats, @@ -275,14 +274,15 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname, periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog, [this]() { this->FlushInfoLog(); }); periodic_task_functions_.emplace( - PeriodicTaskType::kRecordSeqnoTime, - [this]() { this->RecordSeqnoToTimeMapping(); }); -#endif // ROCKSDB_LITE - - versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_, - table_cache_.get(), write_buffer_manager_, - &write_controller_, &block_cache_tracer_, - io_tracer_, db_id_, db_session_id_)); + PeriodicTaskType::kRecordSeqnoTime, [this]() { + this->RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0); + }); + + versions_.reset(new VersionSet( + dbname_, &immutable_db_options_, file_options_, table_cache_.get(), + write_buffer_manager_, &write_controller_, &block_cache_tracer_, + io_tracer_, db_id_, db_session_id_, options.daily_offpeak_time_utc, + &error_handler_)); column_family_memtables_.reset( new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); @@ -334,6 +334,9 @@ Status DBImpl::Resume() { // means a new super version wouldn't have been installed Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; WaitForBackgroundWork(); Status s; @@ -354,16 +357,13 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { } // Make sure the IO Status stored in version set is set to OK. - bool file_deletion_disabled = !IsFileDeletionsEnabled(); if (s.ok()) { IOStatus io_s = versions_->io_status(); if (io_s.IsIOError()) { // If resuming from IOError resulted from MANIFEST write, then assert // that we must have already set the MANIFEST writer to nullptr during - // clean-up phase MANIFEST writing. We must have also disabled file - // deletions. + // clean-up phase MANIFEST writing. assert(!versions_->descriptor_log_); - assert(file_deletion_disabled); // Since we are trying to recover from MANIFEST write error, we need to // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted. // Therefore, force writing a dummy version edit because we do not know @@ -375,7 +375,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { assert(cfh); ColumnFamilyData* cfd = cfh->cfd(); const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); - s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + s = versions_->LogAndApply(cfd, cf_opts, read_options, &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { io_s = versions_->io_status(); @@ -387,29 +387,17 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { } } - // We cannot guarantee consistency of the WAL. So force flush Memtables of - // all the column families if (s.ok()) { - FlushOptions flush_opts; - // We allow flush to stall write since we are trying to resume from error. 
- flush_opts.allow_write_stall = true; - if (immutable_db_options_.atomic_flush) { - autovector cfds; - SelectColumnFamiliesForAtomicFlush(&cfds); - mutex_.Unlock(); - s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason); - mutex_.Lock(); + if (context.flush_reason == FlushReason::kErrorRecoveryRetryFlush) { + s = RetryFlushesForErrorRecovery(FlushReason::kErrorRecoveryRetryFlush, + true /* wait */); } else { - for (auto cfd : versions_->GetRefedColumnFamilySet()) { - if (cfd->IsDropped()) { - continue; - } - InstrumentedMutexUnlock u(&mutex_); - s = FlushMemTable(cfd, flush_opts, context.flush_reason); - if (!s.ok()) { - break; - } - } + // We cannot guarantee consistency of the WAL. So force flush Memtables of + // all the column families + FlushOptions flush_opts; + // We allow flush to stall write since we are trying to resume from error. + flush_opts.allow_write_stall = true; + s = FlushAllColumnFamilies(flush_opts, context.flush_reason); } if (!s.ok()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, @@ -418,34 +406,6 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { } } - JobContext job_context(0); - FindObsoleteFiles(&job_context, true); - mutex_.Unlock(); - - job_context.manifest_file_number = 1; - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); - } - job_context.Clean(); - - if (s.ok()) { - assert(versions_->io_status().ok()); - // If we reach here, we should re-enable file deletions if it was disabled - // during previous error handling. - if (file_deletion_disabled) { - // Always return ok - s = EnableFileDeletions(/*force=*/true); - if (!s.ok()) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "DB resume requested but could not enable file deletions [%s]", - s.ToString().c_str()); - assert(false); - } - } - } - - mutex_.Lock(); if (s.ok()) { // This will notify and unblock threads waiting for error recovery to // finish. Those previouly waiting threads can now proceed, which may @@ -458,6 +418,15 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { error_handler_.GetRecoveryError().PermitUncheckedError(); } + JobContext job_context(0); + FindObsoleteFiles(&job_context, true); + mutex_.Unlock(); + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + if (s.ok()) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB"); } else { @@ -465,11 +434,31 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { s.ToString().c_str()); } + mutex_.Lock(); // Check for shutdown again before scheduling further compactions, // since we released and re-acquired the lock above if (shutdown_initiated_) { s = Status::ShutdownInProgress(); } + if (s.ok() && context.flush_after_recovery) { + // Since we drop all non-recovery flush requests during recovery, + // and new memtable may fill up during recovery, + // schedule one more round of flush. + Status status = RetryFlushesForErrorRecovery( + FlushReason::kCatchUpAfterErrorRecovery, false /* wait */); + if (!status.ok()) { + // FlushAllColumnFamilies internally should take care of setting + // background error if needed. + ROCKS_LOG_INFO(immutable_db_options_.info_log, + "The catch up flush after successful recovery failed [%s]", + s.ToString().c_str()); + } + // FlushAllColumnFamilies releases and re-acquires mutex. 
+ if (shutdown_initiated_) { + s = Status::ShutdownInProgress(); + } + } + if (s.ok()) { for (auto cfd : *versions_->GetColumnFamilySet()) { SchedulePendingCompaction(cfd); @@ -497,41 +486,15 @@ void DBImpl::WaitForBackgroundWork() { void DBImpl::CancelAllBackgroundWork(bool wait) { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown: canceling all background work"); - -#ifndef ROCKSDB_LITE - for (uint8_t task_type = 0; - task_type < static_cast(PeriodicTaskType::kMax); task_type++) { - Status s = periodic_task_scheduler_.Unregister( - static_cast(task_type)); - if (!s.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Failed to unregister periodic task %d, status: %s", - task_type, s.ToString().c_str()); - } - } -#endif // !ROCKSDB_LITE + Status s = CancelPeriodicTaskScheduler(); + s.PermitUncheckedError(); InstrumentedMutexLock l(&mutex_); if (!shutting_down_.load(std::memory_order_acquire) && has_unpersisted_data_.load(std::memory_order_relaxed) && !mutable_db_options_.avoid_flush_during_shutdown) { - if (immutable_db_options_.atomic_flush) { - autovector cfds; - SelectColumnFamiliesForAtomicFlush(&cfds); - mutex_.Unlock(); - Status s = - AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown); - s.PermitUncheckedError(); //**TODO: What to do on error? - mutex_.Lock(); - } else { - for (auto cfd : versions_->GetRefedColumnFamilySet()) { - if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { - InstrumentedMutexUnlock u(&mutex_); - Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); - s.PermitUncheckedError(); //**TODO: What to do on error? - } - } - } + s = DBImpl::FlushAllColumnFamilies(FlushOptions(), FlushReason::kShutDown); + s.PermitUncheckedError(); //**TODO: What to do on error? } shutting_down_.store(true, std::memory_order_release); @@ -706,7 +669,6 @@ Status DBImpl::CloseHelper() { ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete"); LogFlush(immutable_db_options_.info_log); -#ifndef ROCKSDB_LITE // If the sst_file_manager was allocated by us during DB::Open(), ccall // Close() on it before closing the info_log. Otherwise, background thread // in SstFileManagerImpl might try to log something @@ -715,7 +677,6 @@ Status DBImpl::CloseHelper() { immutable_db_options_.sst_file_manager.get()); sfm->Close(); } -#endif // ROCKSDB_LITE if (immutable_db_options_.info_log && own_info_log_) { Status s = immutable_db_options_.info_log->Close(); @@ -793,8 +754,6 @@ void DBImpl::PrintStatistics() { } Status DBImpl::StartPeriodicTaskScheduler() { -#ifndef ROCKSDB_LITE - #ifndef NDEBUG // It only used by test to disable scheduler bool disable_scheduler = false; @@ -836,48 +795,109 @@ Status DBImpl::StartPeriodicTaskScheduler() { periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog)); return s; -#else - return Status::OK(); -#endif // !ROCKSDB_LITE } -Status DBImpl::RegisterRecordSeqnoTimeWorker() { -#ifndef ROCKSDB_LITE - uint64_t min_time_duration = std::numeric_limits::max(); - uint64_t max_time_duration = std::numeric_limits::min(); +Status DBImpl::RegisterRecordSeqnoTimeWorker(bool is_new_db) { + options_mutex_.AssertHeld(); + + uint64_t min_preserve_seconds = std::numeric_limits::max(); + uint64_t max_preserve_seconds = std::numeric_limits::min(); + bool mapping_was_empty = false; { InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { // preserve time is the max of 2 options. 
- uint64_t preserve_time_duration = + uint64_t preserve_seconds = std::max(cfd->ioptions()->preserve_internal_time_seconds, cfd->ioptions()->preclude_last_level_data_seconds); - if (!cfd->IsDropped() && preserve_time_duration > 0) { - min_time_duration = std::min(preserve_time_duration, min_time_duration); - max_time_duration = std::max(preserve_time_duration, max_time_duration); + if (!cfd->IsDropped() && preserve_seconds > 0) { + min_preserve_seconds = std::min(preserve_seconds, min_preserve_seconds); + max_preserve_seconds = std::max(preserve_seconds, max_preserve_seconds); } } - if (min_time_duration == std::numeric_limits::max()) { - seqno_time_mapping_.Resize(0, 0); + if (min_preserve_seconds == std::numeric_limits::max()) { + seqno_to_time_mapping_.Resize(0, 0); } else { - seqno_time_mapping_.Resize(min_time_duration, max_time_duration); + seqno_to_time_mapping_.Resize(min_preserve_seconds, max_preserve_seconds); } + mapping_was_empty = seqno_to_time_mapping_.Empty(); } uint64_t seqno_time_cadence = 0; - if (min_time_duration != std::numeric_limits::max()) { + if (min_preserve_seconds != std::numeric_limits::max()) { // round up to 1 when the time_duration is smaller than // kMaxSeqnoTimePairsPerCF - seqno_time_cadence = - (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) / - SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF; + seqno_time_cadence = (min_preserve_seconds + + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) / + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF; } + TEST_SYNC_POINT_CALLBACK( + "DBImpl::RegisterRecordSeqnoTimeWorker:BeforePeriodicTaskType", nullptr); + Status s; if (seqno_time_cadence == 0) { s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime); } else { + // Before registering the periodic task, we need to be sure to fulfill two + // promises: + // 1) Any DB created with preserve/preclude options set from the beginning + // will get pre-allocated seqnos with pre-populated time mappings back to + // the times we are interested in. (This will enable future import of data + // while preserving rough write time. We can only do this reliably from + // DB::Open, as otherwise there could be a race between CreateColumnFamily + // and the first Write to the DB, and seqno-to-time mappings need to be + // monotonic. + // 2) In any DB, any data written after setting preserve/preclude options + // must have a reasonable time estimate (so that we can accurately place + // the data), which means at least one entry in seqno_to_time_mapping_. + // + // FIXME: We don't currently guarantee that if the first column family with + // that setting is added or configured after initial DB::Open but before + // the first user Write. Fixing this causes complications with the crash + // test because if DB starts without preserve/preclude option, does some + // user writes but all those writes are lost in crash, then re-opens with + // preserve/preclude option, it sees seqno==1 which looks like one of the + // user writes was recovered, when actually it was not. 
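For the cadence arithmetic above: the intent is to spread roughly SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF samples across the smallest preserve window among the live column families, using a ceiling division. A tiny standalone illustration; the constant's real value is defined in the SeqnoToTimeMapping header, and 100 is only an assumption here:

#include <cstdint>
#include <cstdio>

int main() {
  // Assumed value for illustration; see SeqnoToTimeMapping for the real one.
  constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
  // e.g. the only live CF sets preserve_internal_time_seconds to 3 days.
  const uint64_t min_preserve_seconds = 3 * 24 * 60 * 60;  // 259200
  // Ceiling division, as in the hunk above: record one seqno/time pair
  // roughly every (window / kMaxSeqnoTimePairsPerCF) seconds.
  const uint64_t cadence =
      (min_preserve_seconds + kMaxSeqnoTimePairsPerCF - 1) /
      kMaxSeqnoTimePairsPerCF;
  std::printf("seqno_time_cadence = %llu seconds\n",
              static_cast<unsigned long long>(cadence));  // prints 2592
  return 0;
}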
+ bool last_seqno_zero = GetLatestSequenceNumber() == 0; + assert(!is_new_db || last_seqno_zero); + if (is_new_db && last_seqno_zero) { + // Pre-allocate seqnos and pre-populate historical mapping + assert(mapping_was_empty); + + // We can simply modify these, before writes are allowed + constexpr uint64_t kMax = SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST; + versions_->SetLastAllocatedSequence(kMax); + versions_->SetLastPublishedSequence(kMax); + versions_->SetLastSequence(kMax); + + // And record in manifest, to avoid going backwards in seqno on re-open + // (potentially with different options). Concurrency is simple because we + // are in DB::Open + { + InstrumentedMutexLock l(&mutex_); + VersionEdit edit; + edit.SetLastSequence(kMax); + s = versions_->LogAndApplyToDefaultColumnFamily( + {}, &edit, &mutex_, directories_.GetDbDir()); + if (!s.ok() && versions_->io_status().IsIOError()) { + s = error_handler_.SetBGError(versions_->io_status(), + BackgroundErrorReason::kManifestWrite); + } + } + + // Pre-populate mappings for reserved sequence numbers. + RecordSeqnoToTimeMapping(max_preserve_seconds); + } else if (mapping_was_empty) { + if (!last_seqno_zero) { + // Ensure at least one mapping (or log a warning) + RecordSeqnoToTimeMapping(/*populate_historical_seconds=*/0); + } else { + // FIXME (see limitation described above) + } + } + s = periodic_task_scheduler_.Register( PeriodicTaskType::kRecordSeqnoTime, periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime), @@ -885,13 +905,26 @@ Status DBImpl::RegisterRecordSeqnoTimeWorker() { } return s; -#else - return Status::OK(); -#endif // !ROCKSDB_LITE +} + +Status DBImpl::CancelPeriodicTaskScheduler() { + Status s = Status::OK(); + for (uint8_t task_type = 0; + task_type < static_cast(PeriodicTaskType::kMax); task_type++) { + s = periodic_task_scheduler_.Unregister( + static_cast(task_type)); + if (!s.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Failed to unregister periodic task %d, status: %s", + task_type, s.ToString().c_str()); + } + } + return s; } // esitmate the total size of stats_history_ size_t DBImpl::EstimateInMemoryStatsHistorySize() const { + stats_history_mutex_.AssertHeld(); size_t size_total = sizeof(std::map>); if (stats_history_.size() == 0) return size_total; @@ -908,7 +941,6 @@ size_t DBImpl::EstimateInMemoryStatsHistorySize() const { void DBImpl::PersistStats() { TEST_SYNC_POINT("DBImpl::PersistStats:Entry"); -#ifndef ROCKSDB_LITE if (shutdown_initiated_) { return; } @@ -989,7 +1021,7 @@ void DBImpl::PersistStats() { "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 " to in-memory stats history", stats_slice_.size(), now_seconds); - stats_history_[now_seconds] = stats_delta; + stats_history_[now_seconds] = std::move(stats_delta); } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); @@ -1013,7 +1045,6 @@ void DBImpl::PersistStats() { stats_history_size, stats_history_.size()); } TEST_SYNC_POINT("DBImpl::PersistStats:End"); -#endif // !ROCKSDB_LITE } bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time, @@ -1055,7 +1086,6 @@ Status DBImpl::GetStatsHistory( void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:1"); -#ifndef ROCKSDB_LITE std::string stats; if (shutdown_initiated_) { return; @@ -1120,7 +1150,6 @@ void DBImpl::DumpStats() { ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str()); } } -#endif // !ROCKSDB_LITE PrintStatistics(); } @@ -1447,8 +1476,8 @@ Status DBImpl::ApplyReplicationLogRecord(ReplicationLogRecord record, if 
(!s.ok()) { break; } - s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, - &mutex_, directories_.GetDbDir(), + s = versions_->LogAndApply(cfds, mutable_cf_options_list, ReadOptions(), + edit_lists, &mutex_, directories_.GetDbDir(), false /* new_descriptor_log */, &*cf_options); if (!s.ok()) { @@ -1633,11 +1662,8 @@ Status DBImpl::GetManifestUpdateSequence(uint64_t* out) { Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { -#ifdef ROCKSDB_LITE - (void)column_family; - (void)options_map; - return Status::NotSupported("Not supported in ROCKSDB LITE"); -#else + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto* cfd = static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { @@ -1653,6 +1679,7 @@ Status DBImpl::SetOptions( only_set_disable_write_stall = true; } + InstrumentedMutexLock ol(&options_mutex_); MutableCFOptions new_options; Status s; Status persist_options_status; @@ -1668,16 +1695,15 @@ Status DBImpl::SetOptions( if (!only_set_disable_write_stall) { // Append new version to recompute compaction score. VersionEdit dummy_edit; - s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); } // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options); - persist_options_status = WriteOptionsFile( - false /*need_mutex_lock*/, true /*need_enter_write_thread*/); + persist_options_status = WriteOptionsFile(true /*db_mutex_already_held*/); bg_cv_.SignalAll(); } } @@ -1705,25 +1731,22 @@ Status DBImpl::SetOptions( } LogFlush(immutable_db_options_.info_log); return s; -#endif // ROCKSDB_LITE } Status DBImpl::SetDBOptions( const std::unordered_map& options_map) { -#ifdef ROCKSDB_LITE - (void)options_map; - return Status::NotSupported("Not supported in ROCKSDB LITE"); -#else if (options_map.empty()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions(), empty input."); return Status::InvalidArgument("empty input"); } + InstrumentedMutexLock ol(&options_mutex_); MutableDBOptions new_options; Status s; Status persist_options_status = Status::OK(); - bool wal_changed = false; + bool wal_size_option_changed = false; + bool wal_other_option_changed = false; WriteContext write_context; { InstrumentedMutexLock l(&mutex_); @@ -1774,17 +1797,24 @@ Status DBImpl::SetDBOptions( const bool max_compactions_increased = new_bg_job_limits.max_compactions > current_bg_job_limits.max_compactions; + const bool offpeak_time_changed = + versions_->offpeak_time_option().daily_offpeak_time_utc != + new_db_options.daily_offpeak_time_utc; - if (max_flushes_increased || max_compactions_increased) { + if (max_flushes_increased || max_compactions_increased || + offpeak_time_changed) { if (max_flushes_increased) { env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes, Env::Priority::HIGH); } - if (max_compactions_increased) { env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions, Env::Priority::LOW); } + if (offpeak_time_changed) { + versions_->ChangeOffpeakTimeOption( + new_db_options.daily_offpeak_time_utc); + } MaybeScheduleFlushOrCompaction(); } @@ -1824,8 +1854,10 @@ Status DBImpl::SetDBOptions( table_cache_.get()->SetCapacity(new_options.max_open_files == -1 ? 
TableCache::kInfiniteCapacity : new_options.max_open_files - 10); - wal_changed = mutable_db_options_.wal_bytes_per_sync != - new_options.wal_bytes_per_sync; + wal_other_option_changed = mutable_db_options_.wal_bytes_per_sync != + new_options.wal_bytes_per_sync; + wal_size_option_changed = mutable_db_options_.max_total_wal_size != + new_options.max_total_wal_size; mutable_db_options_ = new_options; file_options_for_compaction_ = FileOptions(new_db_options); file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite( @@ -1836,19 +1868,21 @@ Status DBImpl::SetDBOptions( file_options_for_compaction_, immutable_db_options_); file_options_for_compaction_.compaction_readahead_size = mutable_db_options_.compaction_readahead_size; - WriteThread::Writer w; - write_thread_.EnterUnbatched(&w, &mutex_); - if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) { - Status purge_wal_status = SwitchWAL(&write_context); - if (!purge_wal_status.ok()) { - ROCKS_LOG_WARN(immutable_db_options_.info_log, - "Unable to purge WAL files in SetDBOptions() -- %s", - purge_wal_status.ToString().c_str()); + if (wal_other_option_changed || wal_size_option_changed) { + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + if (wal_other_option_changed || + total_log_size_ > GetMaxTotalWalSize()) { + Status purge_wal_status = SwitchWAL(&write_context); + if (!purge_wal_status.ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to purge WAL files in SetDBOptions() -- %s", + purge_wal_status.ToString().c_str()); + } } + write_thread_.ExitUnbatched(&w); } - persist_options_status = WriteOptionsFile( - false /*need_mutex_lock*/, false /*need_enter_write_thread*/); - write_thread_.ExitUnbatched(&w); + persist_options_status = WriteOptionsFile(true /*db_mutex_already_held*/); } else { // To get here, we must have had invalid options and will not attempt to // persist the options, which means the status is "OK/Uninitialized. @@ -1878,7 +1912,6 @@ Status DBImpl::SetDBOptions( } LogFlush(immutable_db_options_.info_log); return s; -#endif // ROCKSDB_LITE } // return the same level if it cannot be moved @@ -1932,15 +1965,10 @@ Status DBImpl::FlushWAL(bool sync) { return SyncWAL(); } -bool DBImpl::WALBufferIsEmpty(bool lock) { - if (lock) { - log_write_mutex_.Lock(); - } +bool DBImpl::WALBufferIsEmpty() { + InstrumentedMutexLock l(&log_write_mutex_); log::Writer* cur_log_writer = logs_.back().writer; auto res = cur_log_writer->BufferIsEmpty(); - if (lock) { - log_write_mutex_.Unlock(); - } return res; } @@ -2019,7 +2047,9 @@ Status DBImpl::SyncWAL() { } if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -2027,11 +2057,13 @@ Status DBImpl::SyncWAL() { return status; } -Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) { +Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options, + VersionEdit* synced_wals) { // not empty, write to MANIFEST. 
mutex_.AssertHeld(); + Status status = versions_->LogAndApplyToDefaultColumnFamily( - synced_wals, &mutex_, directories_.GetDbDir()); + read_options, synced_wals, &mutex_, directories_.GetDbDir()); if (!status.ok() && versions_->io_status().IsIOError()) { status = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -2042,29 +2074,83 @@ Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) { Status DBImpl::LockWAL() { { InstrumentedMutexLock lock(&mutex_); - WriteThread::Writer w; - write_thread_.EnterUnbatched(&w, &mutex_); - WriteThread::Writer nonmem_w; - if (two_write_queues_) { - nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); - } + if (lock_wal_count_ > 0) { + assert(lock_wal_write_token_); + ++lock_wal_count_; + } else { + // NOTE: this will "unnecessarily" wait for other non-LockWAL() write + // stalls to clear before LockWAL returns, however fixing that would + // not be simple because if we notice the primary queue is already + // stalled, that stall might clear while we release DB mutex in + // EnterUnbatched() for the nonmem queue. And if we work around that in + // the naive way, we could deadlock by locking the two queues in different + // orders. - lock_wal_write_token_ = write_controller_.GetStopToken(); + WriteThread::Writer w; + write_thread_.EnterUnbatched(&w, &mutex_); + WriteThread::Writer nonmem_w; + if (two_write_queues_) { + nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_); + } - if (two_write_queues_) { - nonmem_write_thread_.ExitUnbatched(&nonmem_w); + // NOTE: releasing mutex in EnterUnbatched might mean we are actually + // now lock_wal_count > 0 + if (lock_wal_count_ == 0) { + assert(!lock_wal_write_token_); + lock_wal_write_token_ = write_controller_.GetStopToken(); + } + ++lock_wal_count_; + + if (two_write_queues_) { + nonmem_write_thread_.ExitUnbatched(&nonmem_w); + } + write_thread_.ExitUnbatched(&w); } - write_thread_.ExitUnbatched(&w); } - return FlushWAL(/*sync=*/false); + // NOTE: avoid I/O holding DB mutex + Status s = FlushWAL(/*sync=*/false); + if (!s.ok()) { + // Non-OK return should not be in locked state + UnlockWAL().PermitUncheckedError(); + } + return s; } Status DBImpl::UnlockWAL() { + bool signal = false; + uint64_t maybe_stall_begun_count = 0; + uint64_t nonmem_maybe_stall_begun_count = 0; { InstrumentedMutexLock lock(&mutex_); - lock_wal_write_token_.reset(); + if (lock_wal_count_ == 0) { + return Status::Aborted("No LockWAL() in effect"); + } + --lock_wal_count_; + if (lock_wal_count_ == 0) { + lock_wal_write_token_.reset(); + signal = true; + // For the last UnlockWAL, we don't want to return from UnlockWAL() + // until the thread(s) that called BeginWriteStall() have had a chance to + // call EndWriteStall(), so that no_slowdown writes after UnlockWAL() are + // guaranteed to succeed if there's no other source of stall. 
+ maybe_stall_begun_count = write_thread_.GetBegunCountOfOutstandingStall(); + if (two_write_queues_) { + nonmem_maybe_stall_begun_count = + nonmem_write_thread_.GetBegunCountOfOutstandingStall(); + } + } + } + if (signal) { + // SignalAll outside of mutex for efficiency + bg_cv_.SignalAll(); + } + // Ensure stalls have cleared + if (maybe_stall_begun_count) { + write_thread_.WaitForStallEndedCount(maybe_stall_begun_count); + } + if (nonmem_maybe_stall_begun_count) { + nonmem_write_thread_.WaitForStallEndedCount(nonmem_maybe_stall_begun_count); } - bg_cv_.SignalAll(); return Status::OK(); } @@ -2141,7 +2227,8 @@ Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family, } InstrumentedMutexLock l(&mutex_); *ts_low = cfd->GetFullHistoryTsLow(); - assert(cfd->user_comparator()->timestamp_size() == ts_low->size()); + assert(ts_low->empty() || + cfd->user_comparator()->timestamp_size() == ts_low->size()); return Status::OK(); } @@ -2383,32 +2470,68 @@ Status DBImpl::Get(const ReadOptions& read_options, return Get(read_options, column_family, key, value, /*timestamp=*/nullptr); } -Status DBImpl::Get(const ReadOptions& read_options, +Status DBImpl::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) { + return GetImpl(read_options, column_family, key, value, + /*timestamp=*/nullptr); +} + +Status DBImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, std::string* timestamp) { assert(value != nullptr); value->Reset(); + + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + + Status s = GetImpl(read_options, column_family, key, value, timestamp); + return s; +} + +Status DBImpl::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp) { GetImplOptions get_impl_options; get_impl_options.column_family = column_family; get_impl_options.value = value; get_impl_options.timestamp = timestamp; + Status s = GetImpl(read_options, key, get_impl_options); return s; } -Status DBImpl::GetEntity(const ReadOptions& read_options, +Status DBImpl::GetEntity(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableWideColumns* columns) { if (!column_family) { return Status::InvalidArgument( "Cannot call GetEntity without a column family handle"); } - if (!columns) { return Status::InvalidArgument( "Cannot call GetEntity without a PinnableWideColumns object"); } - + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGetEntity) { + return Status::InvalidArgument( + "Cannot call GetEntity with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGetEntity`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGetEntity; + } columns->Reset(); GetImplOptions get_impl_options; @@ -2418,6 +2541,73 @@ Status DBImpl::GetEntity(const ReadOptions& read_options, return 
GetImpl(read_options, key, get_impl_options); } +Status DBImpl::GetEntity(const ReadOptions& _read_options, const Slice& key, + PinnableAttributeGroups* result) { + if (!result) { + return Status::InvalidArgument( + "Cannot call GetEntity without PinnableAttributeGroups object"); + } + Status s; + const size_t num_column_families = result->size(); + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGetEntity) { + s = Status::InvalidArgument( + "Cannot call GetEntity with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGetEntity`"); + for (size_t i = 0; i < num_column_families; ++i) { + (*result)[i].SetStatus(s); + } + return s; + } + // return early if no CF was passed in + if (num_column_families == 0) { + return s; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGetEntity; + } + std::vector keys; + std::vector column_families; + for (size_t i = 0; i < num_column_families; ++i) { + // If any of the CFH is null, break early since the entire query will fail + if (!(*result)[i].column_family()) { + s = Status::InvalidArgument( + "DB failed to query because one or more group(s) have null column " + "family handle"); + (*result)[i].SetStatus( + Status::InvalidArgument("Column family handle cannot be null")); + break; + } + // Adding the same key slice for different CFs + keys.emplace_back(key); + column_families.emplace_back((*result)[i].column_family()); + } + if (!s.ok()) { + for (size_t i = 0; i < num_column_families; ++i) { + if ((*result)[i].status().ok()) { + (*result)[i].SetStatus( + Status::Incomplete("DB not queried due to invalid argument(s) in " + "one or more of the attribute groups")); + } + } + return s; + } + std::vector columns(num_column_families); + std::vector statuses(num_column_families); + MultiGetCommon( + read_options, num_column_families, column_families.data(), keys.data(), + /* values */ nullptr, columns.data(), + /* timestamps */ nullptr, statuses.data(), /* sorted_input */ false); + // Set results + for (size_t i = 0; i < num_column_families; ++i) { + (*result)[i].Reset(); + (*result)[i].SetStatus(statuses[i]); + (*result)[i].SetColumns(std::move(columns[i])); + } + return s; +} + bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) { // If both thresholds are reached, a function returning merge operands as // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or @@ -2456,8 +2646,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, if (read_options.timestamp) { const Status s = FailIfTsMismatchCf(get_impl_options.column_family, - *(read_options.timestamp), - /*ts_for_read=*/true); + *(read_options.timestamp)); if (!s.ok()) { return s; } @@ -2516,7 +2705,16 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, SuperVersion* sv = super_snapshot ? 
super_snapshot->sv() : GetAndRefSuperVersion(cfd); // RocksDB-Cloud contribution end + if (read_options.timestamp && read_options.timestamp->size() > 0) { + const Status s = + FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); + if (!s.ok()) { + ReturnAndCleanupSuperVersion(cfd, sv); + return s; + } + } + TEST_SYNC_POINT_CALLBACK("DBImpl::GetImpl:AfterAcquireSv", nullptr); TEST_SYNC_POINT("DBImpl::GetImpl:1"); TEST_SYNC_POINT("DBImpl::GetImpl:2"); @@ -2669,6 +2867,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, RecordTick(stats_, NUMBER_KEYS_READ); size_t size = 0; if (s.ok()) { + const auto& merge_threshold = read_options.merge_operand_count_threshold; + if (merge_threshold.has_value() && + merge_context.GetNumOperands() > merge_threshold.value()) { + s = Status::OkMergeOperandThresholdExceeded(); + } + if (get_impl_options.get_value) { if (get_impl_options.value) { size = get_impl_options.value->size(); @@ -2764,7 +2968,7 @@ std::vector DBImpl::MultiGet( } std::vector DBImpl::MultiGet( - const ReadOptions& read_options, + const ReadOptions& _read_options, const std::vector& column_family, const std::vector& keys, std::vector* values, std::vector* timestamps) { @@ -2778,15 +2982,32 @@ std::vector DBImpl::MultiGet( // RocksDB-Cloud contribution begin auto super_snapshot = - dynamic_cast(read_options.snapshot); + dynamic_cast(_read_options.snapshot); // RocksDB-Cloud contribution end + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = s; + } + return stat_list; + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + bool should_fail = false; for (size_t i = 0; i < num_keys; ++i) { assert(column_family[i]); if (read_options.timestamp) { - stat_list[i] = FailIfTsMismatchCf( - column_family[i], *(read_options.timestamp), /*ts_for_read=*/true); + stat_list[i] = + FailIfTsMismatchCf(column_family[i], *(read_options.timestamp)); if (!stat_list[i].ok()) { should_fail = true; } @@ -2830,8 +3051,6 @@ std::vector DBImpl::MultiGet( } } - SequenceNumber consistent_seqnum; - UnorderedMap multiget_cf_data( column_family.size()); for (auto cf : column_family) { @@ -2849,10 +3068,21 @@ std::vector DBImpl::MultiGet( [](UnorderedMap::iterator& cf_iter) { return &cf_iter->second; }; - bool unref_only = + SequenceNumber consistent_seqnum; + bool sv_from_thread_local; + Status status = MultiCFSnapshot>( read_options, nullptr, iter_deref_lambda, &multiget_cf_data, - &consistent_seqnum); + &consistent_seqnum, &sv_from_thread_local); + + if (!status.ok()) { + for (auto& s : stat_list) { + if (s.ok()) { + s = status; + } + } + return stat_list; + } TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1"); TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2"); @@ -2933,8 +3163,15 @@ std::vector DBImpl::MultiGet( } if (s.ok()) { + const auto& merge_threshold = read_options.merge_operand_count_threshold; + if (merge_threshold.has_value() && + merge_context.GetNumOperands() > merge_threshold.value()) { + s = Status::OkMergeOperandThresholdExceeded(); + } + bytes_read += value->size(); num_found++; + curr_value_size += value->size(); if (curr_value_size > 
read_options.value_size_soft_limit) { while (++keys_read < num_keys) { @@ -2962,7 +3199,6 @@ std::vector DBImpl::MultiGet( // Post processing (decrement reference counts and record statistics) PERF_TIMER_GUARD(get_post_process_time); - autovector superversions_to_delete; // Only cleanup the super versions if we don't have super snapshot, which // brought its own superversion. @@ -2971,13 +3207,15 @@ std::vector DBImpl::MultiGet( // RocksDB-Cloud contribution end for (auto mgd_iter : multiget_cf_data) { auto mgd = mgd_iter.second; - if (!unref_only) { + if (sv_from_thread_local) { ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version); } else { - mgd.cfd->GetSuperVersion()->Unref(); + TEST_SYNC_POINT("DBImpl::MultiGet::BeforeLastTryUnRefSV"); + CleanupSuperVersion(mgd.super_version); } } } + RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found); @@ -2990,16 +3228,40 @@ std::vector DBImpl::MultiGet( } template -bool DBImpl::MultiCFSnapshot( +Status DBImpl::MultiCFSnapshot( const ReadOptions& read_options, ReadCallback* callback, std::function& iter_deref_func, - T* cf_list, SequenceNumber* snapshot) { + T* cf_list, SequenceNumber* snapshot, bool* sv_from_thread_local) { PERF_TIMER_GUARD(get_snapshot_time); + assert(sv_from_thread_local); + *sv_from_thread_local = true; + Status s = Status::OK(); + const bool check_read_ts = + read_options.timestamp && read_options.timestamp->size() > 0; + // sv_from_thread_local set to false means the SuperVersion to be cleaned up + // is acquired directly via ColumnFamilyData instead of thread local. + const auto sv_cleanup_func = [&]() -> void { + for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); + ++cf_iter) { + auto node = iter_deref_func(cf_iter); + SuperVersion* super_version = node->super_version; + ColumnFamilyData* cfd = node->cfd; + if (super_version != nullptr) { + if (*sv_from_thread_local) { + ReturnAndCleanupSuperVersion(cfd, super_version); + } else { + CleanupSuperVersion(super_version); + } + } + node->super_version = nullptr; + } + }; + bool last_try = false; if (cf_list->size() == 1) { - // Fast path for a single column family. We can simply get the thread loca + // Fast path for a single column family. We can simply get the thread local // super version auto cf_iter = cf_list->begin(); auto node = iter_deref_func(cf_iter); @@ -3009,7 +3271,11 @@ bool DBImpl::MultiCFSnapshot( node->super_version = super_snapshot ? super_snapshot->sv() : GetAndRefSuperVersion(node->cfd); // RocksDB-Cloud contribution end - if (read_options.snapshot != nullptr) { + if (check_read_ts) { + s = FailIfReadCollapsedHistory(node->cfd, node->super_version, + *(read_options.timestamp)); + } + if (s.ok() && read_options.snapshot != nullptr) { // Note: In WritePrepared txns this is not necessary but not harmful // either. 
Because prep_seq > snapshot => commit_seq > snapshot so if // a snapshot is specified we should be fine with skipping seq numbers @@ -3023,7 +3289,7 @@ bool DBImpl::MultiCFSnapshot( if (callback) { *snapshot = std::max(*snapshot, callback->max_visible_seq()); } - } else { + } else if (s.ok()) { // Since we get and reference the super version before getting // the snapshot number, without a mutex protection, it is possible // that a memtable switch happened in the middle and not all the @@ -3041,26 +3307,17 @@ bool DBImpl::MultiCFSnapshot( // MultiGet across column families is not supported with super snapshot assert(!dynamic_cast(read_options.snapshot)); // RocksDB-Cloud contribution end - // If we end up with the same issue of memtable geting sealed during 2 + // If we end up with the same issue of memtable getting sealed during 2 // consecutive retries, it means the write rate is very high. In that case - // its probably ok to take the mutex on the 3rd try so we can succeed for - // sure + // it's probably ok to take the mutex on the 3rd try so we can succeed for + // sure. constexpr int num_retries = 3; for (int i = 0; i < num_retries; ++i) { last_try = (i == num_retries - 1); bool retry = false; if (i > 0) { - for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end(); - ++cf_iter) { - auto node = iter_deref_func(cf_iter); - SuperVersion* super_version = node->super_version; - ColumnFamilyData* cfd = node->cfd; - if (super_version != nullptr) { - ReturnAndCleanupSuperVersion(cfd, super_version); - } - node->super_version = nullptr; - } + sv_cleanup_func(); } if (read_options.snapshot == nullptr) { if (last_try) { @@ -3084,6 +3341,19 @@ bool DBImpl::MultiCFSnapshot( node->super_version = node->cfd->GetSuperVersion()->Ref(); } TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV"); + if (check_read_ts) { + s = FailIfReadCollapsedHistory(node->cfd, node->super_version, + *(read_options.timestamp)); + if (!s.ok()) { + // If read timestamp check failed, a.k.a ReadOptions.timestamp < + // super_version.full_history_ts_low. There is no need to continue + // because this check will keep failing for the same and newer + // SuperVersions, instead we fail fast and ask user to provide + // a higher read timestamp. 
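Hypothetical caller-side view of the fail-fast check added above: a read whose timestamp is below full_history_ts_low now fails with InvalidArgument instead of silently observing collapsed history. The sketch is not part of this patch; it assumes a column family configured with a u64 user-defined-timestamp comparator and the EncodeU64Ts() helper.

    #include <string>

    #include "rocksdb/comparator.h"
    #include "rocksdb/db.h"

    rocksdb::Status ReadAtTimestamp(rocksdb::DB* db,
                                    rocksdb::ColumnFamilyHandle* cf,
                                    uint64_t ts, std::string* out) {
      std::string ts_buf;
      rocksdb::Slice ts_slice = rocksdb::EncodeU64Ts(ts, &ts_buf);
      rocksdb::ReadOptions ro;
      ro.timestamp = &ts_slice;
      rocksdb::PinnableSlice value;
      rocksdb::Status s = db->Get(ro, cf, "key", &value);
      if (s.IsInvalidArgument()) {
        // Likely ts < full_history_ts_low; retry with a newer read timestamp.
        return s;
      }
      if (s.ok()) {
        out->assign(value.data(), value.size());
      }
      return s;
    }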
+ retry = false; + break; + } + } if (read_options.snapshot != nullptr || last_try) { // If user passed a snapshot, then we don't care if a memtable is // sealed or compaction happens because the snapshot would ensure @@ -3107,6 +3377,7 @@ bool DBImpl::MultiCFSnapshot( if (!retry) { if (last_try) { mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::MultiGet::AfterLastTryRefSV"); } break; } @@ -3115,22 +3386,52 @@ bool DBImpl::MultiCFSnapshot( // Keep track of bytes that we read for statistics-recording later PERF_TIMER_STOP(get_snapshot_time); - - return last_try; + *sv_from_thread_local = !last_try; + if (!s.ok()) { + sv_cleanup_func(); + } + return s; } void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { - return MultiGet(read_options, num_keys, column_families, keys, values, - /*timestamps=*/nullptr, statuses, sorted_input); + MultiGet(read_options, num_keys, column_families, keys, values, + /* timestamps */ nullptr, statuses, sorted_input); } -void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, +void DBImpl::MultiGet(const ReadOptions& _read_options, const size_t num_keys, ColumnFamilyHandle** column_families, const Slice* keys, PinnableSlice* values, std::string* timestamps, Status* statuses, const bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + MultiGetCommon(read_options, num_keys, column_families, keys, values, + /* columns */ nullptr, timestamps, statuses, sorted_input); +} + +void DBImpl::MultiGetCommon(const ReadOptions& read_options, + const size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableSlice* values, + PinnableWideColumns* columns, + std::string* timestamps, Status* statuses, + const bool sorted_input) { if (num_keys == 0) { return; } @@ -3148,10 +3449,8 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, bool should_fail = false; for (size_t i = 0; i < num_keys; ++i) { ColumnFamilyHandle* cfh = column_families[i]; - assert(cfh); if (read_options.timestamp) { - statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp), - /*ts_for_read=*/true); + statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp)); if (!statuses[i].ok()) { should_fail = true; } @@ -3186,8 +3485,20 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, autovector sorted_keys; sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - values[i].Reset(); - key_context.emplace_back(column_families[i], keys[i], &values[i], + PinnableSlice* val = nullptr; + PinnableWideColumns* col = nullptr; + + if (values) { + val = &values[i]; + val->Reset(); + } else { + assert(columns); + + col = &columns[i]; + col->Reset(); + } + + key_context.emplace_back(column_families[i], keys[i], val, col, timestamps ? 
×tamps[i] : nullptr, &statuses[i]); } @@ -3222,10 +3533,20 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, }; SequenceNumber consistent_seqnum; - bool unref_only = MultiCFSnapshot< + bool sv_from_thread_local; + Status s = MultiCFSnapshot< autovector>( read_options, nullptr, iter_deref_lambda, &multiget_cf_data, - &consistent_seqnum); + &consistent_seqnum, &sv_from_thread_local); + + if (!s.ok()) { + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } GetWithTimestampReadCallback timestamp_read_callback(0); ReadCallback* read_callback = nullptr; @@ -3234,7 +3555,6 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, read_callback = ×tamp_read_callback; } - Status s; auto cf_iter = multiget_cf_data.begin(); for (; cf_iter != multiget_cf_data.end(); ++cf_iter) { s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys, @@ -3255,10 +3575,11 @@ void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, } for (const auto& iter : multiget_cf_data) { - if (!unref_only) { + if (sv_from_thread_local) { ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version); } else { - iter.cfd->GetSuperVersion()->Unref(); + TEST_SYNC_POINT("DBImpl::MultiGet::BeforeLastTryUnRefSV"); + CleanupSuperVersion(iter.super_version); } } } @@ -3311,15 +3632,42 @@ void DBImpl::MultiGet(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { - return MultiGet(read_options, column_family, num_keys, keys, values, - /*timestamp=*/nullptr, statuses, sorted_input); + MultiGet(read_options, column_family, num_keys, keys, values, + /* timestamps */ nullptr, statuses, sorted_input); } -void DBImpl::MultiGet(const ReadOptions& read_options, +void DBImpl::MultiGet(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, std::string* timestamps, Status* statuses, const bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + MultiGetCommon(read_options, column_family, num_keys, keys, values, + /* columns */ nullptr, timestamps, statuses, sorted_input); +} + +void DBImpl::MultiGetCommon(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, + PinnableSlice* values, PinnableWideColumns* columns, + std::string* timestamps, Status* statuses, + bool sorted_input) { if (tracer_) { // TODO: This mutex should be removed later, to improve performance when // tracing is enabled. 
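Condensed sketch of the ReadOptions::io_activity handling that the Get/MultiGet/GetEntity/NewIterator overloads in this patch repeat: reject a mismatched activity, otherwise copy the options and stamp the expected activity before delegating to the *Impl/*Common function. The helper name is hypothetical; the patch open-codes this logic at each call site.

    #include "rocksdb/env.h"
    #include "rocksdb/options.h"
    #include "rocksdb/status.h"

    rocksdb::Status ValidateAndStampIOActivity(const rocksdb::ReadOptions& in,
                                               rocksdb::Env::IOActivity expected,
                                               rocksdb::ReadOptions* out) {
      if (in.io_activity != rocksdb::Env::IOActivity::kUnknown &&
          in.io_activity != expected) {
        return rocksdb::Status::InvalidArgument(
            "ReadOptions::io_activity does not match the API being called");
      }
      *out = in;
      if (out->io_activity == rocksdb::Env::IOActivity::kUnknown) {
        out->io_activity = expected;
      }
      return rocksdb::Status::OK();
    }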
@@ -3333,8 +3681,20 @@ void DBImpl::MultiGet(const ReadOptions& read_options, autovector sorted_keys; sorted_keys.resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - values[i].Reset(); - key_context.emplace_back(column_family, keys[i], &values[i], + PinnableSlice* val = nullptr; + PinnableWideColumns* col = nullptr; + + if (values) { + val = &values[i]; + val->Reset(); + } else { + assert(columns); + + col = &columns[i]; + col->Reset(); + } + + key_context.emplace_back(column_family, keys[i], val, col, timestamps ? ×tamps[i] : nullptr, &statuses[i]); } @@ -3342,10 +3702,27 @@ void DBImpl::MultiGet(const ReadOptions& read_options, sorted_keys[i] = &key_context[i]; } PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys); - MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys); + MultiGetWithCallbackImpl(read_options, column_family, nullptr, &sorted_keys); } void DBImpl::MultiGetWithCallback( + const ReadOptions& _read_options, ColumnFamilyHandle* column_family, + ReadCallback* callback, + autovector* sorted_keys) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + assert(false); + return; + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + MultiGetWithCallbackImpl(read_options, column_family, callback, sorted_keys); +} + +void DBImpl::MultiGetWithCallbackImpl( const ReadOptions& read_options, ColumnFamilyHandle* column_family, ReadCallback* callback, autovector* sorted_keys) { @@ -3360,14 +3737,18 @@ void DBImpl::MultiGetWithCallback( size_t num_keys = sorted_keys->size(); SequenceNumber consistent_seqnum; - bool unref_only = MultiCFSnapshot>( + bool sv_from_thread_local; + Status s = MultiCFSnapshot>( read_options, callback, iter_deref_lambda, &multiget_cf_data, - &consistent_seqnum); + &consistent_seqnum, &sv_from_thread_local); + if (!s.ok()) { + return; + } #ifndef NDEBUG - assert(!unref_only); + assert(sv_from_thread_local); #else // Silence unused variable warning - (void)unref_only; + (void)sv_from_thread_local; #endif // NDEBUG if (callback && read_options.snapshot == nullptr) { @@ -3397,9 +3778,9 @@ void DBImpl::MultiGetWithCallback( read_callback = ×tamp_read_callback; } - Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, - multiget_cf_data[0].super_version, consistent_seqnum, - read_callback); + s = MultiGetImpl(read_options, 0, num_keys, sorted_keys, + multiget_cf_data[0].super_version, consistent_seqnum, + read_callback); assert(s.ok() || s.IsTimedOut() || s.IsAborted()); // RocksDB-Cloud contribution begin if (!dynamic_cast(read_options.snapshot)) { @@ -3459,7 +3840,7 @@ Status DBImpl::MultiGetImpl( stats_); MultiGetRange range = ctx.GetMultiGetRange(); range.AddValueSize(curr_value_size); - bool lookup_current = false; + bool lookup_current = true; keys_left -= batch_size; for (auto mget_iter = range.begin(); mget_iter != range.end(); @@ -3478,9 +3859,10 @@ Status DBImpl::MultiGetImpl( super_version->imm->MultiGet(read_options, &range, callback); } if (!range.empty()) { - lookup_current = true; uint64_t left = range.KeysLeft(); RecordTick(stats_, MEMTABLE_MISS, left); + } else { + lookup_current = false; } } if (lookup_current) { @@ -3500,8 +3882,23 @@ Status DBImpl::MultiGetImpl( uint64_t bytes_read = 0; for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) { KeyContext* key = (*sorted_keys)[i]; + assert(key); + 
assert(key->s); + if (key->s->ok()) { - bytes_read += key->value->size(); + const auto& merge_threshold = read_options.merge_operand_count_threshold; + if (merge_threshold.has_value() && + key->merge_context.GetNumOperands() > merge_threshold) { + *(key->s) = Status::OkMergeOperandThresholdExceeded(); + } + + if (key->value) { + bytes_read += key->value->size(); + } else { + assert(key->columns); + bytes_read += key->columns->serialized_size(); + } + num_found++; } } @@ -3525,14 +3922,136 @@ Status DBImpl::MultiGetImpl( return s; } +void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys, + ColumnFamilyHandle** column_families, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGetEntity) { + Status s = Status::InvalidArgument( + "Can only call MultiGetEntity with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGetEntity`"); + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGetEntity; + } + MultiGetCommon(read_options, num_keys, column_families, keys, + /* values */ nullptr, results, /* timestamps */ nullptr, + statuses, sorted_input); +} + +void DBImpl::MultiGetEntity(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, size_t num_keys, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGetEntity) { + Status s = Status::InvalidArgument( + "Can only call MultiGetEntity with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGetEntity`"); + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGetEntity; + } + MultiGetCommon(read_options, column_family, num_keys, keys, + /* values */ nullptr, results, /* timestamps */ nullptr, + statuses, sorted_input); +} + +void DBImpl::MultiGetEntity(const ReadOptions& _read_options, size_t num_keys, + const Slice* keys, + PinnableAttributeGroups* results) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGetEntity) { + Status s = Status::InvalidArgument( + "Can only call MultiGetEntity with ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGetEntity`"); + for (size_t i = 0; i < num_keys; ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + results[i][j].SetStatus(s); + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGetEntity; + } + + std::vector column_families; + std::vector all_keys; + size_t total_count = 0; + + for (size_t i = 0; i < num_keys; ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + // Adding the same key slice for different CFs + all_keys.emplace_back(keys[i]); + column_families.emplace_back(results[i][j].column_family()); + 
++total_count; + } + } + std::vector statuses(total_count); + std::vector columns(total_count); + MultiGetCommon(read_options, total_count, column_families.data(), + all_keys.data(), + /* values */ nullptr, columns.data(), + /* timestamps */ nullptr, statuses.data(), + /* sorted_input */ false); + + // Set results + size_t index = 0; + for (size_t i = 0; i < num_keys; ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + results[i][j].Reset(); + results[i][j].SetStatus(std::move(statuses[index])); + results[i][j].SetColumns(std::move(columns[index])); + ++index; + } + } +} + +Status DBImpl::WrapUpCreateColumnFamilies( + const std::vector& cf_options) { + // NOTE: this function is skipped for create_missing_column_families and + // DB::Open, so new functionality here might need to go into Open also. + bool register_worker = false; + for (auto* opts_ptr : cf_options) { + if (opts_ptr->preserve_internal_time_seconds > 0 || + opts_ptr->preclude_last_level_data_seconds > 0) { + register_worker = true; + break; + } + } + // Attempt both follow-up actions even if one fails + Status s = WriteOptionsFile(false /*db_mutex_already_held*/); + if (register_worker) { + s.UpdateIfOk(RegisterRecordSeqnoTimeWorker(/*from_db_open=*/false)); + } + return s; +} + Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family, ColumnFamilyHandle** handle) { assert(handle != nullptr); + InstrumentedMutexLock ol(&options_mutex_); Status s = CreateColumnFamilyImpl(cf_options, column_family, handle); if (s.ok()) { - s = WriteOptionsFile(true /*need_mutex_lock*/, - true /*need_enter_write_thread*/); + s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); } return s; } @@ -3542,6 +4061,7 @@ Status DBImpl::CreateColumnFamilies( const std::vector& column_family_names, std::vector* handles) { assert(handles != nullptr); + InstrumentedMutexLock ol(&options_mutex_); handles->clear(); size_t num_cf = column_family_names.size(); Status s; @@ -3556,11 +4076,7 @@ Status DBImpl::CreateColumnFamilies( success_once = true; } if (success_once) { - Status persist_options_status = WriteOptionsFile( - true /*need_mutex_lock*/, true /*need_enter_write_thread*/); - if (s.ok() && !persist_options_status.ok()) { - s = persist_options_status; - } + s.UpdateIfOk(WrapUpCreateColumnFamilies({&cf_options})); } return s; } @@ -3569,10 +4085,13 @@ Status DBImpl::CreateColumnFamilies( const std::vector& column_families, std::vector* handles) { assert(handles != nullptr); + InstrumentedMutexLock ol(&options_mutex_); handles->clear(); size_t num_cf = column_families.size(); Status s; bool success_once = false; + std::vector cf_opts; + cf_opts.reserve(num_cf); for (size_t i = 0; i < num_cf; i++) { ColumnFamilyHandle* handle; s = CreateColumnFamilyImpl(column_families[i].options, @@ -3582,13 +4101,10 @@ Status DBImpl::CreateColumnFamilies( } handles->push_back(handle); success_once = true; + cf_opts.push_back(&column_families[i].options); } if (success_once) { - Status persist_options_status = WriteOptionsFile( - true /*need_mutex_lock*/, true /*need_enter_write_thread*/); - if (s.ok() && !persist_options_status.ok()) { - s = persist_options_status; - } + s.UpdateIfOk(WrapUpCreateColumnFamilies(cf_opts)); } return s; } @@ -3596,6 +4112,9 @@ Status DBImpl::CreateColumnFamilies( Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { + options_mutex_.AssertHeld(); + // TODO: plumb Env::IOActivity + const 
ReadOptions read_options; Status s; *handle = nullptr; @@ -3628,6 +4147,8 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, edit.SetColumnFamily(new_id); edit.SetLogNumber(logfile_number_); edit.SetComparatorName(cf_options.comparator->Name()); + edit.SetPersistUserDefinedTimestamps( + cf_options.persist_user_defined_timestamps); // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object @@ -3636,9 +4157,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, write_thread_.EnterUnbatched(&w, &mutex_); // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object - s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit, - &mutex_, directories_.GetDbDir(), false, - &cf_options); + s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), + read_options, &edit, &mutex_, + directories_.GetDbDir(), false, &cf_options); write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ -3672,10 +4193,6 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, } } // InstrumentedMutexLock l(&mutex_) - if (cf_options.preserve_internal_time_seconds > 0 || - cf_options.preclude_last_level_data_seconds > 0) { - s = RegisterRecordSeqnoTimeWorker(); - } sv_context.Clean(); // this is outside the mutex if (s.ok()) { @@ -3687,16 +4204,17 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { assert(column_family != nullptr); + InstrumentedMutexLock ol(&options_mutex_); Status s = DropColumnFamilyImpl(column_family); if (s.ok()) { - s = WriteOptionsFile(true /*need_mutex_lock*/, - true /*need_enter_write_thread*/); + s = WriteOptionsFile(false /*db_mutex_already_held*/); } return s; } Status DBImpl::DropColumnFamilies( const std::vector& column_families) { + InstrumentedMutexLock ol(&options_mutex_); Status s; bool success_once = false; for (auto* handle : column_families) { @@ -3707,8 +4225,8 @@ Status DBImpl::DropColumnFamilies( success_once = true; } if (success_once) { - Status persist_options_status = WriteOptionsFile( - true /*need_mutex_lock*/, true /*need_enter_write_thread*/); + Status persist_options_status = + WriteOptionsFile(false /*db_mutex_already_held*/); if (s.ok() && !persist_options_status.ok()) { s = persist_options_status; } @@ -3717,6 +4235,8 @@ Status DBImpl::DropColumnFamilies( } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { @@ -3739,8 +4259,9 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { // we drop column family from a single write thread WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); - s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ -3766,7 +4287,7 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { if (cfd->ioptions()->preserve_internal_time_seconds > 0 || cfd->ioptions()->preclude_last_level_data_seconds > 0) { - s = RegisterRecordSeqnoTimeWorker(); + s = RegisterRecordSeqnoTimeWorker(/*from_db_open=*/false); } if (s.ok()) { @@ 
-3791,10 +4312,13 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, std::string* value, std::string* timestamp, bool* value_found) { assert(value != nullptr); + assert(read_options.io_activity == Env::IOActivity::kUnknown); + if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value *value_found = true; } + // TODO: plumb Env::IOActivity ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; @@ -3812,8 +4336,19 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, return s.ok() || s.IsIncomplete(); } -Iterator* DBImpl::NewIterator(const ReadOptions& read_options, +Iterator* DBImpl::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } + if (read_options.managed) { return NewErrorIterator( Status::NotSupported("Managed iterator is not supported anymore.")); @@ -3823,12 +4358,11 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); } - assert(column_family); if (read_options.timestamp) { - const Status s = FailIfTsMismatchCf( - column_family, *(read_options.timestamp), /*ts_for_read=*/true); + const Status s = + FailIfTsMismatchCf(column_family, *(read_options.timestamp)); if (!s.ok()) { return NewErrorIterator(s); } @@ -3843,20 +4377,30 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ColumnFamilyData* cfd = cfh->cfd(); assert(cfd != nullptr); ReadCallback* read_callback = nullptr; // No read callback provided. - if (read_options.tailing) { -#ifdef ROCKSDB_LITE - // not supported in lite version - result = nullptr; + // RocksDB-Cloud contribution begin + auto super_snapshot = + dynamic_cast(read_options.snapshot); + SuperVersion* sv = super_snapshot ? 
super_snapshot->sv() + : cfd->GetReferencedSuperVersion(this); -#else + if (read_options.timestamp && read_options.timestamp->size() > 0) { + const Status s = + FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); + if (!s.ok()) { + if (!super_snapshot) { + CleanupSuperVersion(sv); + } + return NewErrorIterator(s); + } + } + if (read_options.tailing) { // RocksDB-Cloud contribution begin - if (dynamic_cast(read_options.snapshot)) { + if (super_snapshot) { return NewErrorIterator(Status::NotSupported( "Tailing iterator not supported with super snapshot")); } // RocksDB-Cloud contribution end - SuperVersion* sv = cfd->GetReferencedSuperVersion(this); auto iter = new ForwardIterator(this, read_options, cfd, sv, /* allow_unprepared_value */ true); result = NewDBIterator( @@ -3864,11 +4408,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd); -#endif } else { // RocksDB-Cloud contribution begin - auto super_snapshot = - dynamic_cast(read_options.snapshot); if (super_snapshot && cfd->GetID() != super_snapshot->cfd()->GetID()) { std::ostringstream oss; oss << "SuperSnapshot column family " << super_snapshot->cfd()->GetName() @@ -3882,7 +4423,7 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, // Note: no need to consider the special case of // last_seq_same_as_publish_seq_==false since NewIterator is overridden in // WritePreparedTxnDB - result = NewIteratorImpl(read_options, cfd, + result = NewIteratorImpl(read_options, cfd, sv, (read_options.snapshot != nullptr) ? read_options.snapshot->GetSequenceNumber() : kMaxSequenceNumber, @@ -3891,21 +4432,10 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, return result; } -ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, - ColumnFamilyData* cfd, - SequenceNumber snapshot, - ReadCallback* read_callback, - bool expose_blob_index, - bool allow_refresh) { - // RocksDB-Cloud contribution begin - auto super_snapshot = - dynamic_cast(read_options.snapshot); - - // Acquire SuperVersion - SuperVersion* sv = super_snapshot ? super_snapshot->sv() - : cfd->GetReferencedSuperVersion(this); - // RocksDB-Cloud contribution end - +ArenaWrappedDBIter* DBImpl::NewIteratorImpl( + const ReadOptions& read_options, ColumnFamilyData* cfd, SuperVersion* sv, + SequenceNumber snapshot, ReadCallback* read_callback, + bool expose_blob_index, bool allow_refresh) { TEST_SYNC_POINT("DBImpl::NewIterator:1"); TEST_SYNC_POINT("DBImpl::NewIterator:2"); @@ -3970,7 +4500,7 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, sv->version_number, read_callback, this, cfd, expose_blob_index, - read_options.snapshot != nullptr ? 
false : allow_refresh); + allow_refresh); InternalIterator* internal_iter = NewInternalIterator( db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot, @@ -3981,9 +4511,19 @@ ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options, } Status DBImpl::NewIterators( - const ReadOptions& read_options, + const ReadOptions& _read_options, const std::vector& column_families, std::vector* iterators) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return Status::InvalidArgument( + "Can only call NewIterators with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } if (read_options.managed) { return Status::NotSupported("Managed iterator is not supported anymore."); } @@ -3995,8 +4535,7 @@ Status DBImpl::NewIterators( if (read_options.timestamp) { for (auto* cf : column_families) { assert(cf); - const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), - /*ts_for_read=*/true); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp)); if (!s.ok()) { return s; } @@ -4014,18 +4553,47 @@ Status DBImpl::NewIterators( ReadCallback* read_callback = nullptr; // No read callback provided. iterators->clear(); iterators->reserve(column_families.size()); + autovector> cfd_to_sv; + const bool check_read_ts = + read_options.timestamp && read_options.timestamp->size() > 0; + for (auto cfh : column_families) { + auto cfd = static_cast_with_check(cfh)->cfd(); + + // RocksDB-Cloud contribution begin + auto super_snapshot = + dynamic_cast(read_options.snapshot); + if (super_snapshot && cfd->GetID() != super_snapshot->cfd()->GetID()) { + std::ostringstream oss; + oss << "SuperSnapshot column family " << super_snapshot->cfd()->GetName() + << " doesn't match provided column family " << cfd->GetName(); + // We do a check here instead of in NewIteratorImpl because + // NewIteratorImpl returns ArenaWrappedDBIter, which ErrorIterator does + // not subclass + return Status::InvalidArgument(oss.str()); + } + SuperVersion* sv = super_snapshot ? 
super_snapshot->sv() + : cfd->GetReferencedSuperVersion(this); + // RocksDB-Cloud contribution end + + cfd_to_sv.emplace_back(cfd, sv); + if (check_read_ts) { + const Status s = + FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); + if (!s.ok()) { + for (auto prev_entry : cfd_to_sv) { + CleanupSuperVersion(std::get<1>(prev_entry)); + } + return s; + } + } + } + assert(cfd_to_sv.size() == column_families.size()); if (read_options.tailing) { -#ifdef ROCKSDB_LITE - return Status::InvalidArgument( - "Tailing iterator not supported in RocksDB lite"); -#else if (dynamic_cast(read_options.snapshot)) { return Status::NotSupported( "Tailing iterator not supported with super snapshot"); } - for (auto cfh : column_families) { - auto cfd = static_cast_with_check(cfh)->cfd(); - SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + for (auto [cfd, sv] : cfd_to_sv) { auto iter = new ForwardIterator(this, read_options, cfd, sv, /* allow_unprepared_value */ true); iterators->push_back(NewDBIterator( @@ -4034,7 +4602,6 @@ Status DBImpl::NewIterators( sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback, this, cfd)); } -#endif } else { // Note: no need to consider the special case of // last_seq_same_as_publish_seq_==false since NewIterators is overridden in @@ -4042,24 +4609,9 @@ Status DBImpl::NewIterators( auto snapshot = read_options.snapshot != nullptr ? read_options.snapshot->GetSequenceNumber() : versions_->LastSequence(); - for (size_t i = 0; i < column_families.size(); ++i) { - auto* cfd = - static_cast_with_check(column_families[i]) - ->cfd(); - // RocksDB-Cloud contribution begin - auto super_snapshot = - dynamic_cast(read_options.snapshot); - if (super_snapshot && cfd->GetID() != super_snapshot->cfd()->GetID()) { - std::ostringstream oss; - oss << "SuperSnapshot column family " << super_snapshot->cfd()->GetName() - << " doesn't match provided column family " << cfd->GetName(); - // We do a check here instead of in NewIteratorImpl because - // NewIteratorImpl returns ArenaWrappedDBIter, which ErrorIterator does - // not subclass - return Status::InvalidArgument(oss.str()); - } + for (auto [cfd, sv] : cfd_to_sv) { iterators->push_back( - NewIteratorImpl(read_options, cfd, snapshot, read_callback)); + NewIteratorImpl(read_options, cfd, sv, snapshot, read_callback)); } } @@ -4113,11 +4665,9 @@ Status DBImpl::GetSuperSnapshots( } // RocksDB-Cloud contribution end -#ifndef ROCKSDB_LITE const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() { return GetSnapshotImpl(true); } -#endif // ROCKSDB_LITE std::pair> DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) { @@ -4340,7 +4890,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { CfdList cf_scheduled; for (auto* cfd : *versions_->GetColumnFamilySet()) { if (!cfd->ioptions()->allow_ingest_behind) { - cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); + cfd->current()->storage_info()->UpdateOldestSnapshot( + oldest_snapshot, /*allow_ingest_behind=*/false); if (!cfd->current() ->storage_info() ->BottommostFilesMarkedForCompaction() @@ -4371,7 +4922,6 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { delete casted_s; } -#ifndef ROCKSDB_LITE Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { auto cfh = static_cast_with_check(column_family); @@ -4383,7 +4933,9 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - auto s = 
version->GetPropertiesOfAllTables(props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfAllTables(read_options, props); // Decrement the ref count mutex_.Lock(); @@ -4405,7 +4957,9 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - auto s = version->GetPropertiesOfTablesInRange(range, n, props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfTablesInRange(read_options, range, n, props); // Decrement the ref count mutex_.Lock(); @@ -4415,8 +4969,6 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, return s; } -#endif // ROCKSDB_LITE - const std::string& DBImpl::GetName() const { return dbname_; } Env* DBImpl::GetEnv() const { return env_; } @@ -4434,8 +4986,6 @@ SystemClock* DBImpl::GetSystemClock() const { return immutable_db_options_.clock; } -#ifndef ROCKSDB_LITE - Status DBImpl::StartIOTrace(const TraceOptions& trace_options, std::unique_ptr&& trace_writer) { assert(trace_writer != nullptr); @@ -4448,8 +4998,6 @@ Status DBImpl::EndIOTrace() { return Status::OK(); } -#endif // ROCKSDB_LITE - Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const { InstrumentedMutexLock l(&mutex_); auto cfh = static_cast_with_check(column_family); @@ -4577,7 +5125,6 @@ bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) { return true; } -#ifndef ROCKSDB_LITE Status DBImpl::ResetStats() { InstrumentedMutexLock l(&mutex_); for (auto* cfd : *versions_->GetColumnFamilySet()) { @@ -4587,7 +5134,6 @@ Status DBImpl::ResetStats() { } return Status::OK(); } -#endif // ROCKSDB_LITE bool DBImpl::GetAggregatedIntProperty(const Slice& property, uint64_t* aggregated_value) { @@ -4713,9 +5259,19 @@ void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, ColumnFamilyData* cfd = cfh->cfd(); SuperVersion* sv = GetAndRefSuperVersion(cfd); + const Comparator* const ucmp = column_family->GetComparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + + // Add timestamp if needed + std::string start_with_ts, limit_with_ts; + auto [start, limit] = MaybeAddTimestampsToRange( + &range.start, &range.limit, ts_sz, &start_with_ts, &limit_with_ts); + assert(start.has_value()); + assert(limit.has_value()); // Convert user_key into a corresponding internal key. 
- InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek); MemTable::MemTableStats memStats = sv->mem->ApproximateStats(k1.Encode(), k2.Encode()); MemTable::MemTableStats immStats = @@ -4743,28 +5299,23 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; for (int i = 0; i < n; i++) { - Slice start = range[i].start; - Slice limit = range[i].limit; - // Add timestamp if needed std::string start_with_ts, limit_with_ts; - if (ts_sz > 0) { - // Maximum timestamp means including all key with any timestamp - AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz); - // Append a maximum timestamp as the range limit is exclusive: - // [start, limit) - AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz); - start = start_with_ts; - limit = limit_with_ts; - } + auto [start, limit] = + MaybeAddTimestampsToRange(&range[i].start, &range[i].limit, ts_sz, + &start_with_ts, &limit_with_ts); + assert(start.has_value()); + assert(limit.has_value()); // Convert user_key into a corresponding internal key. - InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek); - InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k1(start.value(), kMaxSequenceNumber, kValueTypeForSeek); + InternalKey k2(limit.value(), kMaxSequenceNumber, kValueTypeForSeek); sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( - options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + options, read_options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } if (options.include_memtables) { @@ -4796,7 +5347,6 @@ void DBImpl::ReleaseFileNumberFromPendingOutputs( } } -#ifndef ROCKSDB_LITE Status DBImpl::GetUpdatesSince( SequenceNumber seq, std::unique_ptr* iter, const TransactionLogIterator::ReadOptions& read_options) { @@ -4813,6 +5363,8 @@ Status DBImpl::GetUpdatesSince( } Status DBImpl::DeleteFile(std::string name) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; uint64_t number; FileType type; WalFileType log_type; @@ -4892,7 +5444,8 @@ Status DBImpl::DeleteFile(std::string name) { edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -4914,6 +5467,8 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); @@ -4969,17 +5524,20 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, deleted_files.insert(level_file); level_file->being_compacted = true; } - vstorage->ComputeCompactionScore(*cfd->ioptions(), - *cfd->GetLatestMutableCFOptions()); } } + if (!deleted_files.empty()) { + vstorage->ComputeCompactionScore(*cfd->ioptions(), 
+ *cfd->GetLatestMutableCFOptions()); + } if (edit.GetDeletedFiles().empty()) { job_context.Clean(); return status; } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -5045,8 +5603,6 @@ void DBImpl::GetAllColumnFamilyMetaData( } } -#endif // ROCKSDB_LITE - Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; @@ -5394,21 +5950,13 @@ Status DestroyDB(const std::string& dbname, const Options& options, return result; } -Status DBImpl::WriteOptionsFile(bool need_mutex_lock, - bool need_enter_write_thread) { -#ifndef ROCKSDB_LITE - if (!immutable_db_options_.use_options_file) { - return Status::OK(); - } +Status DBImpl::WriteOptionsFile(bool db_mutex_already_held) { + options_mutex_.AssertHeld(); - WriteThread::Writer w; - if (need_mutex_lock) { - mutex_.Lock(); - } else { + if (db_mutex_already_held) { mutex_.AssertHeld(); - } - if (need_enter_write_thread) { - write_thread_.EnterUnbatched(&w, &mutex_); + } else { + mutex_.Lock(); } std::vector cf_names; @@ -5423,10 +5971,10 @@ Status DBImpl::WriteOptionsFile(bool need_mutex_lock, cf_opts.push_back(cfd->GetLatestCFOptions()); } - // Unlock during expensive operations. New writes cannot get here - // because the single write thread ensures all new writes get queued. DBOptions db_options = BuildDBOptions(immutable_db_options_, mutable_db_options_); + + // Unlock during expensive operations. mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1"); @@ -5442,29 +5990,33 @@ Status DBImpl::WriteOptionsFile(bool need_mutex_lock, if (s.ok()) { s = RenameTempFileToOptionsFile(file_name); } - // restore lock - if (!need_mutex_lock) { - mutex_.Lock(); - } - if (need_enter_write_thread) { - write_thread_.ExitUnbatched(&w); + + if (!s.ok() && GetEnv()->FileExists(file_name).ok()) { + if (!GetEnv()->DeleteFile(file_name).ok()) { + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "Unable to delete temp options file %s", + file_name.c_str()); + } } + if (!s.ok()) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Unnable to persist options -- %s", s.ToString().c_str()); if (immutable_db_options_.fail_if_options_file_error) { - return Status::IOError("Unable to persist options.", - s.ToString().c_str()); + s = Status::IOError("Unable to persist options.", s.ToString().c_str()); + } else { + // Ignore error + s = Status::OK(); } } -#else - (void)need_mutex_lock; - (void)need_enter_write_thread; -#endif // !ROCKSDB_LITE - return Status::OK(); + + // Restore lock if appropriate + if (db_mutex_already_held) { + mutex_.Lock(); + } + return s; } -#ifndef ROCKSDB_LITE namespace { void DeleteOptionsFilesHelper(const std::map& filenames, const size_t num_files_to_keep, @@ -5482,10 +6034,8 @@ void DeleteOptionsFilesHelper(const std::map& filenames, } } } // namespace -#endif // !ROCKSDB_LITE Status DBImpl::DeleteObsoleteOptionsFiles() { -#ifndef ROCKSDB_LITE std::vector filenames; // use ordered map to store keep the filenames sorted from the newest // to the oldest. 
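Simplified model (not RocksDB code) of the locking contract the rewritten WriteOptionsFile() above follows: options_mutex_ is always held by the caller, the DB mutex is taken only if the caller did not already hold it, it is always dropped around the file I/O, and it is re-acquired only when the caller expects to still hold it on return. The model assumes the calling thread owns db_mutex whenever db_mutex_already_held is true.

    #include <mutex>

    void WriteOptionsFileModel(std::mutex& db_mutex, bool db_mutex_already_held) {
      if (!db_mutex_already_held) {
        db_mutex.lock();
      }
      // ... collect CF names and options while the DB mutex is held ...
      db_mutex.unlock();  // never do file I/O under the DB mutex
      // ... write the temp options file, rename it, delete it on failure ...
      if (db_mutex_already_held) {
        db_mutex.lock();  // hand the mutex back in the state it was received
      }
    }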
@@ -5513,13 +6063,9 @@ Status DBImpl::DeleteObsoleteOptionsFiles() { DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept, immutable_db_options_.info_log, GetEnv()); return Status::OK(); -#else - return Status::OK(); -#endif // !ROCKSDB_LITE } Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { -#ifndef ROCKSDB_LITE Status s; uint64_t options_file_number = versions_->NewFileNumber(); @@ -5563,10 +6109,6 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { DeleteObsoleteOptionsFiles().PermitUncheckedError(); } return s; -#else - (void)file_name; - return Status::OK(); -#endif // !ROCKSDB_LITE } #ifdef ROCKSDB_USING_THREAD_STATUS @@ -5614,7 +6156,6 @@ void DumpRocksDBBuildVersion(Logger* log) { } } -#ifndef ROCKSDB_LITE SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv, bool include_history) { // Find the earliest sequence number that we know we can rely on reading @@ -5637,6 +6178,7 @@ Status DBImpl::GetLatestSequenceForKey( MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; + // TODO: plumb Env::IOActivity ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); @@ -5792,6 +6334,8 @@ Status DBImpl::IngestExternalFile( Status DBImpl::IngestExternalFiles( const std::vector& args) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; if (args.empty()) { return Status::InvalidArgument("ingestion arg list is empty"); } @@ -5948,12 +6492,10 @@ Status DBImpl::IngestExternalFiles( FlushOptions flush_opts; flush_opts.allow_write_stall = true; if (immutable_db_options_.atomic_flush) { - autovector cfds_to_flush; - SelectColumnFamiliesForAtomicFlush(&cfds_to_flush); mutex_.Unlock(); - status = AtomicFlushMemTables(cfds_to_flush, flush_opts, - FlushReason::kExternalFileIngestion, - true /* entered_write_thread */); + status = AtomicFlushMemTables( + flush_opts, FlushReason::kExternalFileIngestion, + {} /* provided_candidate_cfds */, true /* entered_write_thread */); mutex_.Lock(); } else { for (size_t i = 0; i != num_cfs; ++i) { @@ -6011,9 +6553,9 @@ Status DBImpl::IngestExternalFiles( } assert(0 == num_entries); } - status = - versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, - edit_lists, &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, + read_options, edit_lists, &mutex_, + directories_.GetDbDir()); // It is safe to update VersionSet last seqno here after LogAndApply since // LogAndApply persists last sequence number from VersionEdits, // which are from file's largest seqno and not from VersionSet. 
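Hypothetical caller-side use of the multi-column-family ingestion path whose internals are touched above: one IngestExternalFileArg per column family, applied as a single atomic ingestion. The handles and file paths are placeholders, not values from this change.

    #include <string>
    #include <vector>

    #include "rocksdb/db.h"

    rocksdb::Status IngestIntoTwoCfs(rocksdb::DB* db,
                                     rocksdb::ColumnFamilyHandle* cf1,
                                     rocksdb::ColumnFamilyHandle* cf2,
                                     const std::string& sst1,
                                     const std::string& sst2) {
      rocksdb::IngestExternalFileArg arg1;
      arg1.column_family = cf1;
      arg1.external_files = {sst1};

      rocksdb::IngestExternalFileArg arg2;
      arg2.column_family = cf2;
      arg2.external_files = {sst2};

      return db->IngestExternalFiles({arg1, arg2});
    }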
@@ -6111,12 +6653,24 @@ Status DBImpl::IngestExternalFiles( Status DBImpl::CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, - const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { + const std::vector& metadatas, + ColumnFamilyHandle** handle) { assert(handle != nullptr); assert(*handle == nullptr); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::string cf_comparator_name = options.comparator->Name(); - if (cf_comparator_name != metadata.db_comparator_name) { - return Status::InvalidArgument("Comparator name mismatch"); + + size_t total_file_num = 0; + std::vector> metadata_files(metadatas.size()); + for (size_t i = 0; i < metadatas.size(); i++) { + if (cf_comparator_name != metadatas[i]->db_comparator_name) { + return Status::InvalidArgument("Comparator name mismatch"); + } + for (auto& file : metadatas[i]->files) { + metadata_files[i].push_back((LiveFileMetaData*)&file); + } + total_file_num += metadatas[i]->files.size(); } // Create column family. @@ -6130,7 +6684,7 @@ Status DBImpl::CreateColumnFamilyWithImport( auto cfd = cfh->cfd(); ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_, file_options_, import_options, - metadata.files, io_tracer_); + metadata_files, io_tracer_); SuperVersionContext dummy_sv_ctx(/* create_superversion */ true); VersionEdit dummy_edit; @@ -6153,10 +6707,11 @@ Status DBImpl::CreateColumnFamilyWithImport( // reuse the file number that has already assigned to the internal file, // and this will overwrite the external file. To protect the external // file, we have to make sure the file number will never being reused. - next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); + next_file_number = versions_->FetchAddFileNumber(total_file_num); auto cf_options = cfd->GetLatestMutableCFOptions(); - status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + status = + versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -6192,8 +6747,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // Install job edit [Mutex will be unlocked here] if (status.ok()) { auto cf_options = cfd->GetLatestMutableCFOptions(); - status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply(cfd, *cf_options, read_options, + import_job.edit(), &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); } @@ -6236,12 +6792,110 @@ Status DBImpl::CreateColumnFamilyWithImport( return status; } -Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { - return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); +Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + assert(column_family); + Status status; + // Flush memtable + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (immutable_db_options_.atomic_flush) { + status = AtomicFlushMemTables(flush_opts, FlushReason::kDeleteFiles, + {} /* provided_candidate_cfds */, + false /* entered_write_thread */); + } else { + status = FlushMemTable(cfd, flush_opts, 
FlushReason::kDeleteFiles, + false /* entered_write_thread */); + } + + if (status.ok()) { + // DeleteFilesInRanges: delete the files that do not overlap the clip range, except L0 + std::vector ranges; + ranges.push_back(RangePtr(nullptr, &begin_key)); + ranges.push_back(RangePtr(&end_key, nullptr)); + status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); + } + + // DeleteRange the remaining overlapping keys + bool empty_after_delete = false; + if (status.ok()) { + Slice smallest_user_key, largest_user_key; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + cfd->current()->GetSstFilesBoundaryKeys(&smallest_user_key, + &largest_user_key); + } + // all the files have been deleted after DeleteFilesInRanges; + if (smallest_user_key.empty() && largest_user_key.empty()) { + empty_after_delete = true; + } else { + const Comparator* const ucmp = column_family->GetComparator(); + WriteOptions wo; + // Delete [smallest_user_key, clip_begin_key) + if (ucmp->Compare(smallest_user_key, begin_key) < 0) { + status = DeleteRange(wo, column_family, smallest_user_key, begin_key); + } + + if (status.ok()) { + // Delete [clip_end_key, largest_user_key] + if (ucmp->Compare(end_key, largest_user_key) <= 0) { + status = DeleteRange(wo, column_family, end_key, largest_user_key); + if (status.ok()) { + status = Delete(wo, column_family, largest_user_key); + } + } + } + } + } + + if (status.ok() && !empty_after_delete) { + // CompactRange deletes all the tombstones + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = true; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + // We could just compact the ranges [null, clip_begin_key] and + // [clip_end_key, null]. But due to how manual compaction calculates the + // last level to compact to and that range tombstones are not dropped + // during non-bottommost compactions, calling CompactRange() on these two + // ranges may not clear all range tombstones.
+ status = CompactRange(compact_options, nullptr, nullptr); + } + return status; } -Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { - return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false); +Status DBImpl::VerifyFileChecksums(const ReadOptions& _read_options) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kVerifyFileChecksums) { + return Status::InvalidArgument( + "Can only call VerifyFileChecksums with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or " + "`Env::IOActivity::kVerifyFileChecksums`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kVerifyFileChecksums; + } + return VerifyChecksumInternal(read_options, + /*use_file_checksum=*/true); +} + +Status DBImpl::VerifyChecksum(const ReadOptions& _read_options) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kVerifyDBChecksum) { + return Status::InvalidArgument( + "Can only call VerifyChecksum with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kVerifyDBChecksum`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kVerifyDBChecksum; + } + return VerifyChecksumInternal(read_options, + /*use_file_checksum=*/false); } Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, @@ -6263,6 +6917,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, return s; } } + // FIXME? What does it mean if read_options.verify_checksums == false? // TODO: simplify using GetRefedColumnFamilySet? std::vector cfd_list; @@ -6303,7 +6958,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, fmeta->file_checksum_func_name, fname, read_options); } else { - s = ROCKSDB_NAMESPACE::VerifySstFileChecksum( + s = ROCKSDB_NAMESPACE::VerifySstFileChecksumInternal( opts, file_options_, read_options, fname, fd.largest_seqno); } RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES, @@ -6376,8 +7031,8 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(), func_name_expected, &file_checksum, &func_name, read_options.readahead_size, immutable_db_options_.allow_mmap_reads, - io_tracer_, immutable_db_options_.rate_limiter.get(), - read_options.rate_limiter_priority); + io_tracer_, immutable_db_options_.rate_limiter.get(), read_options, + immutable_db_options_.stats, immutable_db_options_.clock); if (s.ok()) { assert(func_name_expected == func_name); if (file_checksum != file_checksum_expected) { @@ -6498,6 +7153,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); assert(nullptr != next_file_number); @@ -6515,8 +7172,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( // reuse the file number that has already assigned to the internal file, // and this will overwrite the external file. To protect the external // file, we have to make sure the file number will never being reused. 
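The new DBImpl::ClipColumnFamily() above is built from existing primitives: a flush, DeleteFilesInRanges() for files entirely outside the clip range, DeleteRange()/Delete() for the leftover overlapping keys, and a full-range CompactRange() to drop the resulting tombstones. From the caller's side it keeps only [begin_key, end_key) in a column family; a minimal sketch, assuming an open DB* db and a ColumnFamilyHandle* cfh (illustrative only, not part of this patch):

    // Drop every key outside ["b", "f"); keys inside the range are preserved.
    rocksdb::Status s =
        db->ClipColumnFamily(cfh, rocksdb::Slice("b"), rocksdb::Slice("f"));

The bottommost_level_compaction = kForceOptimized setting in the final compaction is what guarantees the DeleteRange tombstones are actually removed rather than merely pushed down.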
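The VerifyFileChecksums()/VerifyChecksum() changes in the same hunk introduce a validate-then-stamp pattern for ReadOptions::io_activity: a caller may leave it at kUnknown (the default), in which case the DB stamps the matching activity internally, but passing any other activity is rejected. A short sketch of the observable behaviour, assuming an open DB* db (illustrative only):

    rocksdb::ReadOptions ro;  // io_activity defaults to Env::IOActivity::kUnknown
    rocksdb::Status ok = db->VerifyChecksum(ro);  // stamped as kVerifyDBChecksum internally

    ro.io_activity = rocksdb::Env::IOActivity::kFlush;
    rocksdb::Status bad = db->VerifyChecksum(ro);  // InvalidArgument under the new check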
- s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -6552,26 +7209,57 @@ Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) { } } -void DBImpl::RecordSeqnoToTimeMapping() { - // Get time first then sequence number, so the actual time of seqno is <= - // unix_time recorded - int64_t unix_time = 0; - immutable_db_options_.clock->GetCurrentTime(&unix_time) - .PermitUncheckedError(); // Ignore error +void DBImpl::RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds) { + // TECHNICALITY: Sample last sequence number *before* time, as prescribed + // for SeqnoToTimeMapping. We don't know how long it has been since the last + // sequence number was written, so we at least have a one-sided bound by + // sampling in this order. SequenceNumber seqno = GetLatestSequenceNumber(); + int64_t unix_time_signed = 0; + immutable_db_options_.clock->GetCurrentTime(&unix_time_signed) + .PermitUncheckedError(); // Ignore error + uint64_t unix_time = static_cast(unix_time_signed); bool appended = false; { InstrumentedMutexLock l(&mutex_); - appended = seqno_time_mapping_.Append(seqno, unix_time); + if (populate_historical_seconds > 0) { + if (seqno > 1 && unix_time > populate_historical_seconds) { + // seqno=0 is reserved + SequenceNumber from_seqno = 1; + appended = seqno_to_time_mapping_.PrePopulate( + from_seqno, seqno, unix_time - populate_historical_seconds, + unix_time); + } else { + // One of these will fail + assert(seqno > 1); + assert(unix_time > populate_historical_seconds); + } + } else { + // FIXME: assert(seqno > 0); + appended = seqno_to_time_mapping_.Append(seqno, unix_time); + } } - if (!appended) { + if (populate_historical_seconds > 0) { + if (appended) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "Pre-populated sequence number to time entries: [1,%" PRIu64 + "] -> [%" PRIu64 ",%" PRIu64 "]", + seqno, unix_time - populate_historical_seconds, unix_time); + } else { + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "Failed to pre-populate sequence number to time entries: [1,%" PRIu64 + "] -> [%" PRIu64 ",%" PRIu64 "]", + seqno, unix_time - populate_historical_seconds, unix_time); + } + } else if (!appended) { ROCKS_LOG_WARN(immutable_db_options_.info_log, "Failed to insert sequence number to time entry: %" PRIu64 " -> %" PRIu64, seqno, unix_time); } } -#endif // ROCKSDB_LITE ColumnFamilyData* DBImpl::GetAnyCFWithAutoFlushDisabled() const { for (auto cfd: *versions_->GetColumnFamilySet()) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 8bf9949865d5..c9870ee5b726 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -55,13 +55,9 @@ #include "rocksdb/memtablerep.h" #include "rocksdb/pre_release_callback.h" #include "rocksdb/status.h" -#ifndef ROCKSDB_LITE #include "rocksdb/trace_reader_writer.h" -#endif // ROCKSDB_LITE #include "rocksdb/transaction_log.h" -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/replayer.h" -#endif // ROCKSDB_LITE #include "rocksdb/write_buffer_manager.h" #include "table/merging_iterator.h" #include "table/scoped_arena_iterator.h" @@ -203,6 +199,8 @@ class DBImpl : public DB { Status PutEntity(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, const WideColumns& columns) override; + Status PutEntity(const WriteOptions& options, const 
Slice& key, + const AttributeGroups& attribute_groups) override; using DB::Merge; Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family, @@ -240,7 +238,7 @@ class DBImpl : public DB { virtual Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; - virtual Status Get(const ReadOptions& options, + virtual Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, std::string* timestamp) override; @@ -248,6 +246,8 @@ class DBImpl : public DB { Status GetEntity(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableWideColumns* columns) override; + Status GetEntity(const ReadOptions& options, const Slice& key, + PinnableAttributeGroups* result) override; using DB::GetMergeOperands; Status GetMergeOperands(const ReadOptions& options, @@ -271,7 +271,7 @@ class DBImpl : public DB { const std::vector& keys, std::vector* values) override; virtual std::vector MultiGet( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_family, const std::vector& keys, std::vector* values, std::vector* timestamps) override; @@ -283,33 +283,44 @@ class DBImpl : public DB { // The values and statuses parameters are arrays with number of elements // equal to keys.size(). This allows the storage for those to be alloacted // by the caller on the stack for small batches - virtual void MultiGet(const ReadOptions& options, - ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input = false) override; - virtual void MultiGet(const ReadOptions& options, - ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, - PinnableSlice* values, std::string* timestamps, - Status* statuses, - const bool sorted_input = false) override; - - virtual void MultiGet(const ReadOptions& options, const size_t num_keys, - ColumnFamilyHandle** column_families, const Slice* keys, - PinnableSlice* values, Status* statuses, - const bool sorted_input = false) override; - virtual void MultiGet(const ReadOptions& options, const size_t num_keys, - ColumnFamilyHandle** column_families, const Slice* keys, - PinnableSlice* values, std::string* timestamps, - Status* statuses, - const bool sorted_input = false) override; - - virtual void MultiGetWithCallback( - const ReadOptions& options, ColumnFamilyHandle* column_family, + void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, + const size_t num_keys, const Slice* keys, PinnableSlice* values, + Status* statuses, const bool sorted_input = false) override; + void MultiGet(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + std::string* timestamps, Status* statuses, + const bool sorted_input = false) override; + + void MultiGet(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; + void MultiGet(const ReadOptions& _read_options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, std::string* timestamps, + Status* statuses, const bool sorted_input = false) override; + + void MultiGetWithCallback( + const ReadOptions& _read_options, ColumnFamilyHandle* column_family, 
ReadCallback* callback, autovector* sorted_keys); + using DB::MultiGetEntity; + + void MultiGetEntity(const ReadOptions& options, + ColumnFamilyHandle* column_family, size_t num_keys, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) override; + + void MultiGetEntity(const ReadOptions& options, size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableWideColumns* results, Status* statuses, + bool sorted_input) override; + void MultiGetEntity(const ReadOptions& options, size_t num_keys, + const Slice* keys, + PinnableAttributeGroups* results) override; + virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family, ColumnFamilyHandle** handle) override; @@ -335,10 +346,10 @@ class DBImpl : public DB { bool* value_found = nullptr) override; using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& options, + virtual Iterator* NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) override; virtual Status NewIterators( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_families, std::vector* iterators) override; @@ -454,7 +465,7 @@ class DBImpl : public DB { const FlushOptions& options, const std::vector& column_families) override; virtual Status FlushWAL(bool sync) override; - bool WALBufferIsEmpty(bool lock = true); + bool WALBufferIsEmpty(); virtual Status SyncWAL() override; virtual Status LockWAL() override; virtual Status UnlockWAL() override; @@ -493,7 +504,6 @@ class DBImpl : public DB { uint64_t start_time, uint64_t end_time, std::unique_ptr* stats_iterator) override; -#ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; // All the returned filenames start with "/" @@ -553,9 +563,14 @@ class DBImpl : public DB { virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, - const ExportImportFilesMetaData& metadata, + const std::vector& metadatas, ColumnFamilyHandle** handle) override; + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override; + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_options) override; @@ -621,7 +636,6 @@ class DBImpl : public DB { ColumnFamilyHandle* column_family, const Range* range, std::size_t n, TablePropertiesCollection* props) override; -#endif // ROCKSDB_LITE // ---- End of implementations of the DB interface ---- SystemClock* GetSystemClock() const; @@ -645,6 +659,14 @@ class DBImpl : public DB { int* number_of_operands = nullptr; }; + Status GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value); + + Status GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, std::string* timestamp); + // Function that Get and KeyMayExist call with no_io true or false // Note: 'value_found' from KeyMayExist propagates here // This function is also called by GetMergeOperands @@ -652,12 +674,12 @@ class DBImpl : public DB { // get_impl_options.key via get_impl_options.value // If get_impl_options.get_value = false get merge operands associated with // get_impl_options.key via get_impl_options.merge_operands - Status GetImpl(const ReadOptions& options, const Slice& key, - 
GetImplOptions& get_impl_options); + virtual Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions& get_impl_options); // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file. ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options, - ColumnFamilyData* cfd, + ColumnFamilyData* cfd, SuperVersion* sv, SequenceNumber snapshot, ReadCallback* read_callback, bool expose_blob_index = false, @@ -679,7 +701,6 @@ class DBImpl : public DB { // depends also on data written to the WAL but not to the memtable. SequenceNumber TEST_GetLastVisibleSequence() const; -#ifndef ROCKSDB_LITE // Similar to Write() but will call the callback once on the single write // thread to determine whether it is safe to perform the write. virtual Status WriteWithCallback(const WriteOptions& write_options, @@ -744,7 +765,6 @@ class DBImpl : public DB { Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key, const Slice& lower_bound, const Slice upper_bound); -#endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot // will be used for transaction write-conflict checking. The DB can then @@ -759,13 +779,17 @@ class DBImpl : public DB { // max_file_num_to_ignore allows bottom level compaction to filter out newly // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will // disable the filtering + // If `final_output_level` is not nullptr, it is set to manual compaction's + // output level if returned status is OK, and it may or may not be set to + // manual compaction's output level if returned status is not OK. Status RunManualCompaction(ColumnFamilyData* cfd, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const Slice* begin, const Slice* end, bool exclusive, bool disallow_trivial_move, uint64_t max_file_num_to_ignore, - const std::string& trim_ts); + const std::string& trim_ts, + int* final_output_level = nullptr); // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). @@ -827,6 +851,8 @@ class DBImpl : public DB { // being deleted. uint64_t MinObsoleteSstNumberToKeep(); + uint64_t GetObsoleteSstFilesSize(); + // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than @@ -1074,10 +1100,8 @@ class DBImpl : public DB { VersionSet* GetVersionSet() const { return versions_.get(); } - // Wait for any compaction - // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this - // is only for the special test of CancelledCompactions - Status WaitForCompact(bool waitUnscheduled = false); + Status WaitForCompact( + const WaitForCompactOptions& wait_for_compact_options) override; void NewManifestOnNextUpdate() override { versions_->NewManifestOnNextUpdate(); @@ -1114,8 +1138,9 @@ class DBImpl : public DB { // is because in certain cases, we can flush column families, wait for the // flush to complete, but delete the column family handle before the wait // finishes. For example in CompactRange. - Status TEST_AtomicFlushMemTables(const autovector& cfds, - const FlushOptions& flush_opts); + Status TEST_AtomicFlushMemTables( + const autovector& provided_candidate_cfds, + const FlushOptions& flush_opts); // Wait for background threads to complete scheduled work. 
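The header hunk above adds attribute-group overloads of PutEntity()/GetEntity() and array-based MultiGetEntity() overloads alongside the reworked MultiGet() declarations. A rough caller-side sketch of the per-column-family MultiGetEntity() form declared here, assuming an open DB* db and a ColumnFamilyHandle* cfh (illustrative only, not part of this patch):

    constexpr size_t kNum = 2;
    rocksdb::Slice keys[kNum] = {"k1", "k2"};
    rocksdb::PinnableWideColumns results[kNum];
    rocksdb::Status statuses[kNum];
    db->MultiGetEntity(rocksdb::ReadOptions(), cfh, kNum, keys, results,
                       statuses, /*sorted_input=*/false);
    for (size_t i = 0; i < kNum; ++i) {
      if (statuses[i].ok()) {
        for (const auto& col : results[i].columns()) {
          // col.name() / col.value() hold one wide column of the entity.
        }
      }
    }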
Status TEST_WaitForBackgroundWork(); @@ -1123,10 +1148,9 @@ class DBImpl : public DB { // Wait for memtable compaction Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr); - // Wait for any compaction - // We add a bool parameter to wait for unscheduledCompactions_ == 0, but this - // is only for the special test of CancelledCompactions - Status TEST_WaitForCompact(bool waitUnscheduled = false); + Status TEST_WaitForCompact(); + Status TEST_WaitForCompact( + const WaitForCompactOptions& wait_for_compact_options); // Wait until all scheduled compactions are done Status TEST_WaitForScheduledCompaction(); @@ -1160,6 +1184,8 @@ class DBImpl : public DB { void TEST_UnlockMutex(); + void TEST_SignalAllBgCv(); + // REQUIRES: mutex locked void* TEST_BeginWrite(); @@ -1199,6 +1225,7 @@ class DBImpl : public DB { size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const; void TEST_WaitForPeriodicTaskRun(std::function callback) const; SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const; + const autovector& TEST_GetFilesToQuarantine() const; size_t TEST_EstimateInMemoryStatsHistorySize() const; uint64_t TEST_GetCurrentLogNumber() const { @@ -1211,9 +1238,11 @@ class DBImpl : public DB { return files_grabbed_for_purge_; } -#ifndef ROCKSDB_LITE const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const; -#endif // !ROCKSDB_LITE + + static Status TEST_ValidateOptions(const DBOptions& db_options) { + return ValidateOptions(db_options); + } #endif // NDEBUG @@ -1226,8 +1255,11 @@ class DBImpl : public DB { // flush LOG out of application buffer void FlushInfoLog(); - // record current sequence number to time mapping - void RecordSeqnoToTimeMapping(); + // record current sequence number to time mapping. If + // populate_historical_seconds > 0 then pre-populate all the + // sequence numbers from [1, last] to map to [now minus + // populate_historical_seconds, now]. + void RecordSeqnoToTimeMapping(uint64_t populate_historical_seconds); // Interface to block and signal the DB in case of stalling writes by // WriteBufferManager. Each DBImpl object contains ptr to WBMStallInterface. @@ -1367,6 +1399,11 @@ class DBImpl : public DB { std::atomic shutting_down_; + // No new background jobs can be queued if true. This is used to prevent new + // background jobs from being queued after WaitForCompact() completes waiting + // all background jobs then attempts to close when close_db_ option is true. + bool reject_new_background_jobs_; + // RecoveryContext struct stores the context about version edits along // with corresponding column_family_data and column_family_options. class RecoveryContext { @@ -1396,15 +1433,15 @@ class DBImpl : public DB { autovector cfds_; autovector mutable_cf_opts_; autovector> edit_lists_; - // files_to_delete_ contains sst files - std::unordered_set files_to_delete_; + // Stale SST files to delete found upon recovery. This stores a mapping from + // such a file's absolute path to its parent directory. + std::unordered_map files_to_delete_; + bool is_new_db_ = false; }; - // Except in DB::Open(), WriteOptionsFile can only be called when: - // Persist options to options file. - // If need_mutex_lock = false, the method will lock DB mutex. - // If need_enter_write_thread = false, the method will enter write thread. - Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread); + // Persist options to options file. Must be holding options_mutex_. + // Will lock DB mutex if !db_mutex_already_held. 
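WaitForCompact() now takes a WaitForCompactOptions struct rather than a bare bool, and the reject_new_background_jobs_ flag introduced above exists to support its close-on-completion mode. A hedged usage sketch; the option field names below come from the public WaitForCompactOptions struct and are assumptions of this illustration, not part of this hunk:

    rocksdb::WaitForCompactOptions wfc_opts;
    wfc_opts.flush = true;            // assumed field: flush memtables before waiting
    wfc_opts.abort_on_pause = false;  // assumed field: keep waiting across manual pauses
    rocksdb::Status s = db->WaitForCompact(wfc_opts);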
+ Status WriteOptionsFile(bool db_mutex_already_held); Status CompactRangeInternal(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, @@ -1436,12 +1473,13 @@ class DBImpl : public DB { void NotifyOnMemTableSealed(ColumnFamilyData* cfd, const MemTableInfo& mem_table_info); -#ifndef ROCKSDB_LITE void NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job); + Status FlushAllColumnFamilies(const FlushOptions& flush_options, + FlushReason flush_reason); + virtual Status FlushForGetLiveFiles(); -#endif // !ROCKSDB_LITE void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; @@ -1555,8 +1593,18 @@ class DBImpl : public DB { void SetDbSessionId(); Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const; - Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts, - bool ts_for_read) const; + Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, + const Slice& ts) const; + + // Check that the read timestamp `ts` is at or above the `full_history_ts_low` + // timestamp in a `SuperVersion`. It's necessary to do this check after + // grabbing the SuperVersion. If the check passed, the referenced SuperVersion + // this read holds on to can ensure the read won't be affected if + // `full_history_ts_low` is increased concurrently, and this achieves that + // without explicitly locking by piggybacking the SuperVersion. + Status FailIfReadCollapsedHistory(const ColumnFamilyData* cfd, + const SuperVersion* sv, + const Slice& ts) const; // recovery_ctx stores the context about version edits and // LogAndApplyForRecovery persist all those edits to new Manifest after @@ -1589,17 +1637,17 @@ class DBImpl : public DB { friend class WriteUnpreparedTxnDB; friend class WriteUnpreparedTxn; -#ifndef ROCKSDB_LITE friend class ForwardIterator; -#endif friend struct SuperVersion; friend class CompactedDBImpl; +#ifndef NDEBUG friend class DBTest_ConcurrentFlushWAL_Test; friend class DBTest_MixedSlowdownOptionsStop_Test; friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test; friend class DBCompactionTest_CompactionDuringShutdown_Test; + friend class DBCompactionTest_DelayCompactBottomLevelFilesWithDeletions_Test; + friend class DBCompactionTest_DisableCompactBottomLevelFiles_Test; friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test; -#ifndef NDEBUG friend class DBTest2_ReadCallbackTest_Test; friend class WriteCallbackPTest_WriteWithCallbackTest_Test; friend class XFTransactionWriteHandler; @@ -1821,10 +1869,15 @@ class DBImpl : public DB { const Status CreateArchivalDirectory(); + // Create a column family, without some of the follow-up work yet Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, const std::string& cf_name, ColumnFamilyHandle** handle); + // Follow-up work to user creating a column family or (families) + Status WrapUpCreateColumnFamilies( + const std::vector& cf_options); + Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family); // Delete any unneeded files and stale in-memory entries. @@ -1854,7 +1907,8 @@ class DBImpl : public DB { void ReleaseFileNumberFromPendingOutputs( std::unique_ptr::iterator>& v); - IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals); + IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals, + bool error_recovery_in_prog); // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. 
Then @@ -1942,18 +1996,31 @@ class DBImpl : public DB { Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context); - void SelectColumnFamiliesForAtomicFlush(autovector* cfds); + // Select and output column families qualified for atomic flush in + // `selected_cfds`. If `provided_candidate_cfds` is non-empty, it will be used + // as candidate CFs to select qualified ones from. Otherwise, all column + // families are used as candidates to select from. + // + // REQUIRES: mutex held + void SelectColumnFamiliesForAtomicFlush( + autovector* selected_cfds, + const autovector& provided_candidate_cfds = {}); // Force current memtable contents to be flushed. Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options, FlushReason flush_reason, bool entered_write_thread = false); + // Atomic-flush memtables from qualified CFs among `provided_candidate_cfds` + // (if non-empty) or among all column families and atomically record the + // result to the MANIFEST. Status AtomicFlushMemTables( - const autovector& column_family_datas, const FlushOptions& options, FlushReason flush_reason, + const autovector& provided_candidate_cfds = {}, bool entered_write_thread = false); + Status RetryFlushesForErrorRecovery(FlushReason flush_reason, bool wait); + // Wait until flushing this column family won't stall writes Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, bool* flush_needed); @@ -2074,7 +2141,6 @@ class DBImpl : public DB { // Used by WriteImpl to update bg_error_ in case of memtable insert error. void MemTableInsertStatusCheck(const Status& memtable_insert_status); -#ifndef ROCKSDB_LITE Status CompactFilesImpl(const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, @@ -2082,7 +2148,6 @@ class DBImpl : public DB { const int output_level, int output_path_id, JobContext* job_context, LogBuffer* log_buffer, CompactionJobInfo* compaction_job_info); -#endif // ROCKSDB_LITE ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); @@ -2098,8 +2163,18 @@ class DBImpl : public DB { // flush is considered complete. std::unordered_map cfd_to_max_mem_id_to_persist; + +#ifndef NDEBUG + int reschedule_count = 1; +#endif /* !NDEBUG */ }; + // In case of atomic flush, generates a `FlushRequest` for the latest atomic + // cuts for these `cfds`. Atomic cuts are recorded in + // `AssignAtomicFlushSeq()`. For each entry in `cfds`, all CFDs sharing the + // same latest atomic cut must also be present. + // + // REQUIRES: mutex held void GenerateFlushRequest(const autovector& cfds, FlushReason flush_reason, FlushRequest* req); @@ -2126,6 +2201,7 @@ class DBImpl : public DB { Env::Priority thread_pri); Status BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer, FlushReason* reason, + bool* flush_rescheduled_to_retain_udt, Env::Priority thread_pri); bool EnoughRoomForCompaction(ColumnFamilyData* cfd, @@ -2138,10 +2214,19 @@ class DBImpl : public DB { std::unique_ptr* token, LogBuffer* log_buffer); + // Return true if the `FlushRequest` can be rescheduled to retain the UDT. + // Only true if the involved MemTables contain user-defined timestamps newer + // than the cutoff timestamp `full_history_ts_low`, and postponing the flush + // will not cause the DB to enter write stall mode.
+ bool ShouldRescheduleFlushRequestToRetainUDT(const FlushRequest& flush_req); + // Schedule background tasks Status StartPeriodicTaskScheduler(); - Status RegisterRecordSeqnoTimeWorker(); + // Cancel scheduled periodic tasks + Status CancelPeriodicTaskScheduler(); + + Status RegisterRecordSeqnoTimeWorker(bool is_new_db); void PrintStatistics(); @@ -2169,7 +2254,7 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); - Status ApplyWALToManifest(VersionEdit* edit); + Status ApplyWALToManifest(const ReadOptions& read_options, VersionEdit* edit); // WALs with log number up to up_to are not synced successfully. void MarkLogsNotSynced(uint64_t up_to); @@ -2221,11 +2306,11 @@ class DBImpl : public DB { bool ShouldntRunManualCompaction(ManualCompactionState* m); bool HaveManualCompaction(ColumnFamilyData* cfd); bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1); -#ifndef ROCKSDB_LITE + void UpdateDeletionCompactionStats(const std::unique_ptr& c); void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, const Status& st, const CompactionJobStats& compaction_job_stats, - const int job_id, const Version* current, + const int job_id, CompactionJobInfo* compaction_job_info) const; // Reserve the next 'num' file numbers for to-be-ingested external SST files, // and return the current file_number in 'next_file_number'. @@ -2234,7 +2319,6 @@ class DBImpl : public DB { ColumnFamilyData* cfd, uint64_t num, std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number); -#endif //! ROCKSDB_LITE bool ShouldPurge(uint64_t file_number) const; void MarkAsGrabbedForPurge(uint64_t file_number); @@ -2258,6 +2342,18 @@ class DBImpl : public DB { const size_t num_keys, bool sorted, autovector* key_ptrs); + void MultiGetCommon(const ReadOptions& options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, + PinnableWideColumns* columns, std::string* timestamps, + Status* statuses, bool sorted_input); + + void MultiGetCommon(const ReadOptions& options, const size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableSlice* values, PinnableWideColumns* columns, + std::string* timestamps, Status* statuses, + bool sorted_input); + // A structure to hold the information required to process MultiGet of keys // belonging to one column family. For a multi column family MultiGet, there // will be a container of these objects. @@ -2307,15 +2403,18 @@ class DBImpl : public DB { // If callback is non-null, the callback is refreshed with the snapshot // sequence number // - // A return value of true indicates that the SuperVersions were obtained - // from the ColumnFamilyData, whereas false indicates they are thread - // local + // `sv_from_thread_local` being set to false indicates that the SuperVersion + // obtained from the ColumnFamilyData, whereas true indicates they are thread + // local. + // A non-OK status will be returned if for a column family that enables + // user-defined timestamp feature, the specified `ReadOptions.timestamp` + // attemps to read collapsed history. template - bool MultiCFSnapshot( + Status MultiCFSnapshot( const ReadOptions& read_options, ReadCallback* callback, std::function& iter_deref_func, - T* cf_list, SequenceNumber* snapshot); + T* cf_list, SequenceNumber* snapshot, bool* sv_from_thread_local); // The actual implementation of the batching MultiGet. 
The caller is expected // to have acquired the SuperVersion and pass in a snapshot sequence number @@ -2326,6 +2425,11 @@ class DBImpl : public DB { autovector* sorted_keys, SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback); + void MultiGetWithCallbackImpl( + const ReadOptions& read_options, ColumnFamilyHandle* column_family, + ReadCallback* callback, + autovector* sorted_keys); + Status DisableFileDeletionsWithLock(); Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, @@ -2341,13 +2445,23 @@ class DBImpl : public DB { // Lock over the persistent DB state. Non-nullptr iff successfully acquired. FileLock* db_lock_; - // In addition to mutex_, log_write_mutex_ protected writes to stats_history_ + // Guards changes to DB and CF options to ensure consistency between + // * In-memory options objects + // * Settings in effect + // * Options file contents + // while allowing the DB mutex to be released during slow operations like + // persisting options file or modifying global periodic task timer. + // Always acquired *before* DB mutex when this one is applicable. + InstrumentedMutex options_mutex_; + + // Guards reads and writes to in-memory stats_history_. InstrumentedMutex stats_history_mutex_; - // In addition to mutex_, log_write_mutex_ protected writes to logs_ and + + // In addition to mutex_, log_write_mutex_ protects writes to logs_ and // logfile_number_. With two_write_queues it also protects alive_log_files_, // and log_empty_. Refer to the definition of each variable below for more // details. - // Note: to avoid dealock, if needed to acquire both log_write_mutex_ and + // Note: to avoid deadlock, if needed to acquire both log_write_mutex_ and // mutex_, the order should be first mutex_ and then log_write_mutex_. InstrumentedMutex log_write_mutex_; @@ -2620,9 +2734,6 @@ class DBImpl : public DB { // initialized with startup time. uint64_t delete_obsolete_files_last_run_; - // last time stats were dumped to LOG - std::atomic last_stats_dump_time_microsec_; - // The thread that wants to switch memtable, can wait on this cv until the // pending writes to memtable finishes. std::condition_variable switch_cv_; @@ -2649,9 +2760,7 @@ class DBImpl : public DB { // REQUIRES: mutex held int num_running_ingest_file_; -#ifndef ROCKSDB_LITE WalManager wal_manager_; -#endif // ROCKSDB_LITE // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; @@ -2679,14 +2788,12 @@ class DBImpl : public DB { // Only to be set during initialization std::unique_ptr recoverable_state_pre_release_callback_; -#ifndef ROCKSDB_LITE // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog(). // Currently, internally it has a global timer instance for running the tasks. PeriodicTaskScheduler periodic_task_scheduler_; // It contains the implementations for each periodic task. std::map periodic_task_functions_; -#endif // When set, we use a separate queue for writes that don't write to memtable. // In 2PC these are the writes at Prepare phase. @@ -2737,13 +2844,18 @@ class DBImpl : public DB { // Pointer to WriteBufferManager stalling interface. std::unique_ptr wbm_stall_; - // seqno_time_mapping_ stores the sequence number to time mapping, it's not + // seqno_to_time_mapping_ stores the sequence number to time mapping, it's not // thread safe, both read and write need db mutex hold. - SeqnoToTimeMapping seqno_time_mapping_; + SeqnoToTimeMapping seqno_to_time_mapping_; - // stop write token that is acquired when LockWal() is called. 
Destructed - // when UnlockWal() is called. + // Stop write token that is acquired when first LockWAL() is called. + // Destroyed when last UnlockWAL() is called. Controlled by DB mutex. + // See lock_wal_count_ std::unique_ptr lock_wal_write_token_; + + // The number of LockWAL called without matching UnlockWAL call. + // See also lock_wal_write_token_ + uint32_t lock_wal_count_; }; class GetWithTimestampReadCallback : public ReadCallback { @@ -2814,7 +2926,9 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) { inline Status DBImpl::FailIfCfHasTs( const ColumnFamilyHandle* column_family) const { - column_family = column_family ? column_family : DefaultColumnFamily(); + if (!column_family) { + return Status::InvalidArgument("column family handle cannot be null"); + } assert(column_family); const Comparator* const ucmp = column_family->GetComparator(); assert(ucmp); @@ -2828,8 +2942,7 @@ inline Status DBImpl::FailIfCfHasTs( } inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family, - const Slice& ts, - bool ts_for_read) const { + const Slice& ts) const { if (!column_family) { return Status::InvalidArgument("column family handle cannot be null"); } @@ -2849,20 +2962,28 @@ inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family, << ts_sz << " given"; return Status::InvalidArgument(oss.str()); } - if (ts_for_read) { - auto cfh = static_cast_with_check(column_family); - auto cfd = cfh->cfd(); - std::string current_ts_low = cfd->GetFullHistoryTsLow(); - if (!current_ts_low.empty() && - ucmp->CompareTimestamp(ts, current_ts_low) < 0) { - std::stringstream oss; - oss << "Read timestamp: " << ts.ToString(true) - << " is smaller than full_history_ts_low: " - << Slice(current_ts_low).ToString(true) << std::endl; - return Status::InvalidArgument(oss.str()); - } - } return Status::OK(); } +inline Status DBImpl::FailIfReadCollapsedHistory(const ColumnFamilyData* cfd, + const SuperVersion* sv, + const Slice& ts) const { + // Reaching to this point means the timestamp size matching sanity check in + // `DBImpl::FailIfTsMismatchCf` already passed. So we skip that and assume + // column family has the same user-defined timestamp format as `ts`. 
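FailIfTsMismatchCf() now only validates the timestamp size and format; the comparison against full_history_ts_low moves into the new FailIfReadCollapsedHistory(), which runs against the referenced SuperVersion so the cutoff cannot advance underneath the read. The caller-visible behaviour is unchanged: a timestamped read below the cutoff still fails. A rough sketch, assuming a column family created with a timestamp-aware comparator such as BytewiseComparatorWithU64Ts(), an open DB* db and a ColumnFamilyHandle* cfh (illustrative only, not part of this patch):

    rocksdb::ReadOptions ro;
    std::string ts_buf(8, '\0');  // encoded u64 timestamp 0, i.e. the oldest possible time
    rocksdb::Slice ts(ts_buf);
    ro.timestamp = &ts;
    std::string value;
    rocksdb::Status s = db->Get(ro, cfh, "key", &value);
    // Once full_history_ts_low has been raised above `ts`, e.g. via
    // db->IncreaseFullHistoryTsLow(cfh, ts_low), s.IsInvalidArgument() is true.

Piggybacking the check on the SuperVersion means no extra locking is needed to keep the read consistent with a concurrent IncreaseFullHistoryTsLow().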
+ const Comparator* const ucmp = cfd->user_comparator(); + assert(ucmp); + const std::string& full_history_ts_low = sv->full_history_ts_low; + assert(full_history_ts_low.empty() || + full_history_ts_low.size() == ts.size()); + if (!full_history_ts_low.empty() && + ucmp->CompareTimestamp(ts, full_history_ts_low) < 0) { + std::stringstream oss; + oss << "Read timestamp: " << ts.ToString(true) + << " is smaller than full_history_ts_low: " + << Slice(full_history_ts_low).ToString(true) << std::endl; + return Status::InvalidArgument(oss.str()); + } + return Status::OK(); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 5d3ce1889c26..5b616b48728e 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -22,7 +22,9 @@ #include "monitoring/thread_status_util.h" #include "test_util/sync_point.h" #include "util/cast_util.h" +#include "util/coding.h" #include "util/concurrent_task_limiter_impl.h" +#include "util/udt_util.h" namespace ROCKSDB_NAMESPACE { @@ -31,7 +33,6 @@ bool DBImpl::EnoughRoomForCompaction( bool* sfm_reserved_compact_space, LogBuffer* log_buffer) { // Check if we have enough room to do the compaction bool enough_room = true; -#ifndef ROCKSDB_LITE auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm) { @@ -46,11 +47,6 @@ bool DBImpl::EnoughRoomForCompaction( *sfm_reserved_compact_space = true; } } -#else - (void)cfd; - (void)inputs; - (void)sfm_reserved_compact_space; -#endif // ROCKSDB_LITE if (!enough_room) { // Just in case tests want to change the value of enough_room TEST_SYNC_POINT_CALLBACK( @@ -83,8 +79,43 @@ bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force, return false; } +bool DBImpl::ShouldRescheduleFlushRequestToRetainUDT( + const FlushRequest& flush_req) { + mutex_.AssertHeld(); + assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* cfd = flush_req.cfd_to_max_mem_id_to_persist.begin()->first; + uint64_t max_memtable_id = + flush_req.cfd_to_max_mem_id_to_persist.begin()->second; + if (cfd->IsDropped() || + !cfd->ShouldPostponeFlushToRetainUDT(max_memtable_id)) { + return false; + } + // Check if holding on the flush will cause entering write stall mode. + // Write stall entered because of the accumulation of write buffers can be + // alleviated if we continue with the flush instead of postponing it. + const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions(); + + // Taking the status of the active Memtable into consideration so that we are + // not just checking if DB is currently already in write stall mode. + int mem_to_flush = cfd->mem()->ApproximateMemoryUsageFast() >= + cfd->mem()->write_buffer_size() / 2 + ? 
1 + : 0; + WriteStallCondition write_stall = + ColumnFamilyData::GetWriteStallConditionAndCause( + cfd->imm()->NumNotFlushed() + mem_to_flush, /*num_l0_files=*/0, + /*num_compaction_needed_bytes=*/0, mutable_cf_options, + *cfd->ioptions()) + .first; + if (write_stall != WriteStallCondition::kNormal) { + return false; + } + return true; +} + IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, - VersionEdit* synced_wals) { + VersionEdit* synced_wals, + bool error_recovery_in_prog) { TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start"); InstrumentedMutexLock l(&log_write_mutex_); autovector logs_to_sync; @@ -110,7 +141,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, ROCKS_LOG_INFO(immutable_db_options_.info_log, "[JOB %d] Syncing log #%" PRIu64, job_context->job_id, log->get_log_number()); - if (error_handler_.IsRecoveryInProgress()) { + if (error_recovery_in_prog) { log->file()->reset_seen_error(); } io_s = log->file()->Sync(immutable_db_options_.use_fsync); @@ -119,7 +150,7 @@ IOStatus DBImpl::SyncClosedLogs(JobContext* job_context, } if (immutable_db_options_.recycle_log_file_num > 0) { - if (error_handler_.IsRecoveryInProgress()) { + if (error_recovery_in_prog) { log->file()->reset_seen_error(); } io_s = log->Close(); @@ -193,9 +224,10 @@ Status DBImpl::FlushMemTableToOutputFile( // `snapshot_seqs` has already been computed before this function starts. // Recording the max memtable ID ensures that the flush job does not flush // a memtable without knowing such snapshot(s). - uint64_t max_memtable_id = needs_to_sync_closed_wals - ? cfd->imm()->GetLatestMemTableID() - : std::numeric_limits::max(); + uint64_t max_memtable_id = + needs_to_sync_closed_wals + ? cfd->imm()->GetLatestMemTableID(false /* for_atomic_flush */) + : std::numeric_limits::max(); // If needs_to_sync_closed_wals is false, then the flush job will pick ALL // existing memtables of the column family when PickMemTable() is called @@ -204,7 +236,7 @@ Status DBImpl::FlushMemTableToOutputFile( // releases and re-acquires the db mutex. In the meantime, the application // can still insert into the memtables and increase the db's sequence number. // The application can take a snapshot, hoping that the latest visible state - // to this snapshto is preserved. This is hard to guarantee since db mutex + // to this snapshot is preserved. This is hard to guarantee since db mutex // not held. This newly-created snapshot is not included in `snapshot_seqs` // and the flush job is unaware of its presence. Consequently, the flush job // may drop certain keys when generating the L0, causing incorrect data to be @@ -221,7 +253,7 @@ Status DBImpl::FlushMemTableToOutputFile( GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, true /* sync_output_directory */, true /* write_manifest */, thread_pri, - io_tracer_, seqno_time_mapping_, db_id_, db_session_id_, + io_tracer_, seqno_to_time_mapping_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), &blob_callback_); FileMetaData file_meta; @@ -232,11 +264,15 @@ Status DBImpl::FlushMemTableToOutputFile( // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple // times. 
VersionEdit synced_wals; + bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); mutex_.Unlock(); - log_io_s = SyncClosedLogs(job_context, &synced_wals); + log_io_s = + SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + const ReadOptions read_options(Env::IOActivity::kFlush); + log_io_s = + status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1", nullptr); } @@ -253,6 +289,24 @@ Status DBImpl::FlushMemTableToOutputFile( // If the log sync failed, we do not need to pick memtable. Otherwise, // num_flush_not_started_ needs to be rollback. TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables"); + // Exit a flush due to bg error should not set bg error again. + bool skip_set_bg_error = false; + if (s.ok() && !error_handler_.GetBGError().ok() && + error_handler_.IsBGWorkStopped() && + flush_reason != FlushReason::kErrorRecovery && + flush_reason != FlushReason::kErrorRecoveryRetryFlush) { + // Error recovery in progress, should not pick memtable which excludes + // them from being picked up by recovery flush. + // This ensures that when bg error is set, no new flush can pick + // memtables. + skip_set_bg_error = true; + s = error_handler_.GetBGError(); + assert(!s.ok()); + ROCKS_LOG_BUFFER(log_buffer, + "[JOB %d] Skip flush due to background error %s", + job_context->job_id, s.ToString().c_str()); + } + if (s.ok()) { flush_job.PickMemTable(); need_cancel = true; @@ -260,11 +314,9 @@ Status DBImpl::FlushMemTableToOutputFile( TEST_SYNC_POINT_CALLBACK( "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job); -#ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id, flush_reason); -#endif // ROCKSDB_LITE bool switched_to_mempurge = false; // Within flush_job.Run, rocksdb may call event listener to notify @@ -275,7 +327,8 @@ Status DBImpl::FlushMemTableToOutputFile( // is unlocked by the current thread. if (s.ok()) { s = flush_job.Run(&logs_with_prep_tracker_, &file_meta, - &switched_to_mempurge); + &switched_to_mempurge, &skip_set_bg_error, + &error_handler_); need_cancel = false; } @@ -316,7 +369,8 @@ Status DBImpl::FlushMemTableToOutputFile( } } - if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) { + if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && + !skip_set_bg_error) { if (log_io_s.ok()) { // Error while writing to MANIFEST. // In fact, versions_->io_status() can also be the result of renaming @@ -345,7 +399,6 @@ Status DBImpl::FlushMemTableToOutputFile( // If flush ran smoothly and no mempurge happened // install new SST file path. if (s.ok() && (!switched_to_mempurge)) { -#ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. 
NotifyOnFlushCompleted(cfd, mutable_cf_options, flush_job.GetCommittedFlushJobsInfo()); @@ -368,7 +421,6 @@ Status DBImpl::FlushMemTableToOutputFile( error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush); } } -#endif // ROCKSDB_LITE } TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish"); return s; @@ -425,7 +477,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( assert(cfd->imm()->NumNotFlushed() != 0); assert(cfd->imm()->IsFlushPending()); } - for (const auto bg_flush_arg : bg_flush_args) { + for (const auto& bg_flush_arg : bg_flush_args) { assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_); } #endif /* !NDEBUG */ @@ -475,7 +527,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_, &event_logger_, mutable_cf_options.report_bg_io_stats, false /* sync_output_directory */, false /* write_manifest */, - thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_, + thread_pri, io_tracer_, seqno_to_time_mapping_, db_id_, db_session_id_, cfd->GetFullHistoryTsLow(), &blob_callback_)); } @@ -487,7 +539,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( IOStatus log_io_s = IOStatus::OK(); assert(num_cfs == static_cast(jobs.size())); -#ifndef ROCKSDB_LITE for (int i = 0; i != num_cfs; ++i) { const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i); // may temporarily unlock and lock the mutex. @@ -495,17 +546,20 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options, job_context->job_id, flush_reason); } -#endif /* !ROCKSDB_LITE */ if (logfile_number_ > 0) { // TODO (yanqin) investigate whether we should sync the closed logs for // single column family case. VersionEdit synced_wals; + bool error_recovery_in_prog = error_handler_.IsRecoveryInProgress(); mutex_.Unlock(); - log_io_s = SyncClosedLogs(job_context, &synced_wals); + log_io_s = + SyncClosedLogs(job_context, &synced_wals, error_recovery_in_prog); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + const ReadOptions read_options(Env::IOActivity::kFlush); + log_io_s = + status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); } if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && @@ -530,6 +584,21 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( pick_status.push_back(false); } + bool flush_for_recovery = + bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery || + bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecoveryRetryFlush; + bool skip_set_bg_error = false; + + if (s.ok() && !error_handler_.GetBGError().ok() && + error_handler_.IsBGWorkStopped() && !flush_for_recovery) { + s = error_handler_.GetBGError(); + skip_set_bg_error = true; + assert(!s.ok()); + ROCKS_LOG_BUFFER(log_buffer, + "[JOB %d] Skip flush due to background error %s", + job_context->job_id, s.ToString().c_str()); + } + if (s.ok()) { for (int i = 0; i != num_cfs; ++i) { jobs[i]->PickMemTable(); @@ -594,7 +663,10 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } } } - } else { + } else if (!skip_set_bg_error) { + // When `skip_set_bg_error` is true, no memtable is picked so + // there is no need to call Cancel() or RollbackMemtableFlush(). + // // Need to undo atomic flush if something went wrong, i.e. s is not OK and // it is not because of CF drop. 
// Have to cancel the flush jobs that have NOT executed because we need to @@ -607,8 +679,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( for (int i = 0; i != num_cfs; ++i) { if (exec_status[i].second.ok() && exec_status[i].first) { auto& mems = jobs[i]->GetMemTables(); - cfds[i]->imm()->RollbackMemtableFlush(mems, - file_meta[i].fd.GetNumber()); + cfds[i]->imm()->RollbackMemtableFlush( + mems, /*rollback_succeeding_memtables=*/false); } } } @@ -650,10 +722,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( }; bool resuming_from_bg_err = - error_handler_.IsDBStopped() || - (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery || - bg_flush_args[0].flush_reason_ == - FlushReason::kErrorRecoveryRetryFlush); + error_handler_.IsDBStopped() || flush_for_recovery; while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) { std::pair res = wait_to_install_func(); @@ -664,15 +733,27 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( s = res.first; break; } else if (!res.second) { + // we are the oldest immutable memtable + break; + } + // We are not the oldest immutable memtable + TEST_SYNC_POINT_CALLBACK( + "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitCV", &res); + // + // If bg work is stopped, recovery thread first calls + // WaitForBackgroundWork() before proceeding to flush for recovery. This + // flush can block WaitForBackgroundWork() while waiting for recovery + // flush to install result. To avoid this deadlock, we should abort here + // if there is background error. + if (!flush_for_recovery && error_handler_.IsBGWorkStopped() && + !error_handler_.GetBGError().ok()) { + s = error_handler_.GetBGError(); + assert(!s.ok()); break; } atomic_flush_install_cv_.Wait(); - resuming_from_bg_err = - error_handler_.IsDBStopped() || - (bg_flush_args[0].flush_reason_ == FlushReason::kErrorRecovery || - bg_flush_args[0].flush_reason_ == - FlushReason::kErrorRecoveryRetryFlush); + resuming_from_bg_err = error_handler_.IsDBStopped() || flush_for_recovery; } if (!resuming_from_bg_err) { @@ -688,6 +769,17 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( // installation. s = error_handler_.GetRecoveryError(); } + // Since we are not installing these memtables, need to rollback + // to allow future flush job to pick up these memtables. + if (!s.ok()) { + for (int i = 0; i != num_cfs; ++i) { + assert(exec_status[i].first); + assert(exec_status[i].second.ok()); + auto& mems = jobs[i]->GetMemTables(); + cfds[i]->imm()->RollbackMemtableFlush( + mems, /*rollback_succeeding_memtables=*/false); + } + } } if (s.ok()) { @@ -704,10 +796,8 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( mems_list.emplace_back(&mems); mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]); tmp_file_meta.emplace_back(&file_meta[i]); -#ifndef ROCKSDB_LITE committed_flush_jobs_info.emplace_back( jobs[i]->GetCommittedFlushJobsInfo()); -#endif //! ROCKSDB_LITE } } @@ -759,7 +849,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( if (made_progress) { *made_progress = true; } -#ifndef ROCKSDB_LITE auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); assert(all_mutable_cf_options.size() == static_cast(num_cfs)); @@ -790,12 +879,11 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( } } } -#endif // ROCKSDB_LITE } // Need to undo atomic flush if something went wrong, i.e. s is not OK and // it is not because of CF drop. 
- if (!s.ok() && !s.IsColumnFamilyDropped()) { + if (!s.ok() && !s.IsColumnFamilyDropped() && !skip_set_bg_error) { if (log_io_s.ok()) { // Error while writing to MANIFEST. // In fact, versions_->io_status() can also be the result of renaming @@ -828,7 +916,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, const MutableCFOptions& mutable_cf_options, int job_id, FlushReason flush_reason) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -866,21 +953,13 @@ void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta, } } mutex_.Lock(); -// no need to signal bg_cv_ as it will be signaled at the end of the -// flush process. -#else - (void)cfd; - (void)file_meta; - (void)mutable_cf_options; - (void)job_id; - (void)flush_reason; -#endif // ROCKSDB_LITE + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. } void DBImpl::NotifyOnFlushCompleted( ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, std::list>* flush_jobs_info) { -#ifndef ROCKSDB_LITE assert(flush_jobs_info != nullptr); if (immutable_db_options_.listeners.size() == 0U) { return; @@ -912,11 +991,6 @@ void DBImpl::NotifyOnFlushCompleted( mutex_.Lock(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. -#else - (void)cfd; - (void)mutable_cf_options; - (void)flush_jobs_info; -#endif // ROCKSDB_LITE } Status DBImpl::CompactRange(const CompactRangeOptions& options, @@ -939,26 +1013,14 @@ Status DBImpl::CompactRange(const CompactRangeOptions& options, end_without_ts, "" /*trim_ts*/); } - std::string begin_str; - std::string end_str; + std::string begin_str, end_str; + auto [begin, end] = + MaybeAddTimestampsToRange(begin_without_ts, end_without_ts, ts_sz, + &begin_str, &end_str, false /*exclusive_end*/); - // CompactRange compact all keys: [begin, end] inclusively. Add maximum - // timestamp to include all `begin` keys, and add minimal timestamp to include - // all `end` keys. - if (begin_without_ts != nullptr) { - AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz); - } - if (end_without_ts != nullptr) { - AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz); - } - Slice begin(begin_str); - Slice end(end_str); - - Slice* begin_with_ts = begin_without_ts ? &begin : nullptr; - Slice* end_with_ts = end_without_ts ? &end : nullptr; - - return CompactRangeInternal(options, column_family, begin_with_ts, - end_with_ts, "" /*trim_ts*/); + return CompactRangeInternal( + options, column_family, begin.has_value() ? &begin.value() : nullptr, + end.has_value() ? 
&end.value() : nullptr, "" /*trim_ts*/); } Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family, @@ -987,6 +1049,9 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); edit.SetFullHistoryTsLow(ts_low); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit", &edit); @@ -1000,7 +1065,8 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, } Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (!s.ok()) { return s; } @@ -1062,15 +1128,9 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, FlushOptions fo; fo.allow_write_stall = options.allow_write_stall; if (immutable_db_options_.atomic_flush) { - autovector cfds; - mutex_.Lock(); - SelectColumnFamiliesForAtomicFlush(&cfds); - mutex_.Unlock(); - s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction, - false /* entered_write_thread */); + s = AtomicFlushMemTables(fo, FlushReason::kManualCompaction); } else { - s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction, - false /* entered_write_thread */); + s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction); } if (!s.ok()) { LogFlush(immutable_db_options_.info_log); @@ -1091,11 +1151,10 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, } s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels, final_output_level, options, begin, end, exclusive, - false, std::numeric_limits::max(), - trim_ts); + false /* disable_trivial_move */, + std::numeric_limits::max(), trim_ts); } else { int first_overlapped_level = kInvalidLevel; - int max_overlapped_level = kInvalidLevel; { SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); Version* current_version = super_version->current; @@ -1117,6 +1176,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kCompaction; bool overlap; for (int level = 0; level < current_version->storage_info()->num_non_empty_levels(); @@ -1170,83 +1230,104 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, begin, end); } if (overlap) { - if (first_overlapped_level == kInvalidLevel) { - first_overlapped_level = level; - } - max_overlapped_level = level; + first_overlapped_level = level; + break; } } CleanupSuperVersion(super_version); } if (s.ok() && first_overlapped_level != kInvalidLevel) { - // max_file_num_to_ignore can be used to filter out newly created SST - // files, useful for bottom level compaction in a manual compaction - uint64_t max_file_num_to_ignore = std::numeric_limits::max(); - uint64_t next_file_number = versions_->current_next_file_number(); - final_output_level = max_overlapped_level; - int output_level; - for (int level = first_overlapped_level; level <= max_overlapped_level; - level++) { - bool disallow_trivial_move = false; - // in case the compaction is universal or if we're compacting the - // bottom-most level, the output level will be the same as input one. - // level 0 can never be the bottommost level (i.e. 
if all files are in - // level 0, we will compact to level 1) - if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || - cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { - output_level = level; - } else if (level == max_overlapped_level && level > 0) { - if (options.bottommost_level_compaction == - BottommostLevelCompaction::kSkip) { - // Skip bottommost level compaction - continue; - } else if (options.bottommost_level_compaction == - BottommostLevelCompaction::kIfHaveCompactionFilter && - cfd->ioptions()->compaction_filter == nullptr && - cfd->ioptions()->compaction_filter_factory == nullptr) { - // Skip bottommost level compaction since we don't have a compaction - // filter - continue; + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + assert(first_overlapped_level == 0); + s = RunManualCompaction( + cfd, first_overlapped_level, first_overlapped_level, options, begin, + end, exclusive, true /* disallow_trivial_move */, + std::numeric_limits::max() /* max_file_num_to_ignore */, + trim_ts); + final_output_level = first_overlapped_level; + } else { + assert(cfd->ioptions()->compaction_style == kCompactionStyleLevel); + uint64_t next_file_number = versions_->current_next_file_number(); + // Start compaction from `first_overlapped_level`, one level down at a + // time, until output level >= max_overlapped_level. + // When max_overlapped_level == 0, we will still compact from L0 -> L1 + // (or LBase), and followed by a bottommost level intra-level compaction + // at L1 (or LBase), if applicable. + int level = first_overlapped_level; + final_output_level = level; + int output_level = 0, base_level = 0; + for (;;) { + // Always allow L0 -> L1 compaction + if (level > 0) { + if (cfd->ioptions()->level_compaction_dynamic_level_bytes) { + assert(final_output_level < cfd->ioptions()->num_levels); + if (final_output_level + 1 == cfd->ioptions()->num_levels) { + break; + } + } else { + // TODO(cbi): there is still a race condition here where + // if a background compaction compacts some file beyond + // current()->storage_info()->num_non_empty_levels() right after + // the check here.This should happen very infrequently and should + // not happen once a user populates the last level of the LSM. + InstrumentedMutexLock l(&mutex_); + // num_non_empty_levels may be lower after a compaction, so + // we check for >= here. + if (final_output_level + 1 >= + cfd->current()->storage_info()->num_non_empty_levels()) { + break; + } + } } - output_level = level; - // update max_file_num_to_ignore only for bottom level compaction - // because data in newly compacted files in middle levels may still - // need to be pushed down - max_file_num_to_ignore = next_file_number; - } else { output_level = level + 1; - if (cfd->ioptions()->compaction_style == kCompactionStyleLevel && - cfd->ioptions()->level_compaction_dynamic_level_bytes && + if (cfd->ioptions()->level_compaction_dynamic_level_bytes && level == 0) { output_level = ColumnFamilyData::kCompactToBaseLevel; } - // if it's a BottommostLevel compaction and `kForce*` compaction is - // set, disallow trivial move - if (level == max_overlapped_level && - (options.bottommost_level_compaction == - BottommostLevelCompaction::kForce || - options.bottommost_level_compaction == - BottommostLevelCompaction::kForceOptimized)) { - disallow_trivial_move = true; + // Use max value for `max_file_num_to_ignore` to always compact + // files down. 
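// A minimal sketch, not taken from this patch, of the level-by-level driver
// that CompactRangeInternal() now uses for leveled compaction: start at the
// first level overlapping the range and keep compacting one level into the
// next until the output reaches the last level. It assumes
// level_compaction_dynamic_level_bytes, so L0 compacts into the base level.
// kCompactToBaseLevel, DriveManualCompaction and the `compact` callback are
// hypothetical stand-ins for the real RunManualCompaction() plumbing.
#include <functional>

constexpr int kCompactToBaseLevel = -2;

// `compact(input_level, output_level)` performs one manual compaction and
// returns the resolved output level (the base level when kCompactToBaseLevel).
int DriveManualCompaction(int first_overlapped_level, int num_levels,
                          const std::function<int(int, int)>& compact) {
  int level = first_overlapped_level;
  int final_output_level = level;
  for (;;) {
    // Stop once the previous round already wrote into the last level.
    if (level > 0 && final_output_level + 1 == num_levels) {
      break;
    }
    const int output_level = (level == 0) ? kCompactToBaseLevel : level + 1;
    level = compact(level, output_level);  // next round starts from the output
    final_output_level = level;
  }
  return final_output_level;  // a bottommost intra-level pass may follow
}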
+ s = RunManualCompaction( + cfd, level, output_level, options, begin, end, exclusive, + !trim_ts.empty() /* disallow_trivial_move */, + std::numeric_limits::max() /* max_file_num_to_ignore */, + trim_ts, + output_level == ColumnFamilyData::kCompactToBaseLevel + ? &base_level + : nullptr); + if (!s.ok()) { + break; } + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + assert(base_level > 0); + level = base_level; + } else { + ++level; + } + final_output_level = level; + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); + TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); } - // trim_ts need real compaction to remove latest record - if (!trim_ts.empty()) { - disallow_trivial_move = true; - } - s = RunManualCompaction(cfd, level, output_level, options, begin, end, - exclusive, disallow_trivial_move, - max_file_num_to_ignore, trim_ts); - if (!s.ok()) { - break; - } - if (output_level == ColumnFamilyData::kCompactToBaseLevel) { - final_output_level = cfd->NumberLevels() - 1; - } else if (output_level > final_output_level) { - final_output_level = output_level; + if (s.ok()) { + assert(final_output_level > 0); + // bottommost level intra-level compaction + if ((options.bottommost_level_compaction == + BottommostLevelCompaction::kIfHaveCompactionFilter && + (cfd->ioptions()->compaction_filter != nullptr || + cfd->ioptions()->compaction_filter_factory != nullptr)) || + options.bottommost_level_compaction == + BottommostLevelCompaction::kForceOptimized || + options.bottommost_level_compaction == + BottommostLevelCompaction::kForce) { + // Use `next_file_number` as `max_file_num_to_ignore` to avoid + // rewriting newly compacted files when it is kForceOptimized + // or kIfHaveCompactionFilter with compaction filter set. + s = RunManualCompaction( + cfd, final_output_level, final_output_level, options, begin, + end, exclusive, true /* disallow_trivial_move */, + next_file_number /* max_file_num_to_ignore */, trim_ts); + } } - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1"); - TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2"); } } } @@ -1299,17 +1380,6 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, const int output_level, const int output_path_id, std::vector* const output_file_names, CompactionJobInfo* compaction_job_info) { -#ifdef ROCKSDB_LITE - (void)compact_options; - (void)column_family; - (void)input_file_names; - (void)output_level; - (void)output_path_id; - (void)output_file_names; - (void)compaction_job_info; - // not supported in lite version - return Status::NotSupported("Not supported in ROCKSDB LITE"); -#else if (column_family == nullptr) { return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); } @@ -1368,10 +1438,8 @@ Status DBImpl::CompactFiles(const CompactionOptions& compact_options, } return s; -#endif // ROCKSDB_LITE } -#ifndef ROCKSDB_LITE Status DBImpl::CompactFilesImpl( const CompactionOptions& compact_options, ColumnFamilyData* cfd, Version* version, const std::vector& input_file_names, @@ -1407,6 +1475,14 @@ Status DBImpl::CompactFilesImpl( } } + if (cfd->ioptions()->allow_ingest_behind && + output_level >= cfd->ioptions()->num_levels - 1) { + return Status::InvalidArgument( + "Exceed the maximum output level defined by " + "the current compaction algorithm with ingest_behind --- " + + std::to_string(cfd->ioptions()->num_levels - 1)); + } + Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( &input_set, cf_meta, output_level); 
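// A minimal sketch, not taken from this patch, of the decision
// CompactRangeInternal() makes above: whether the manual compaction finishes
// with an extra intra-level compaction of the bottommost level. The enum
// mirrors BottommostLevelCompaction; `has_compaction_filter` stands in for
// "compaction_filter != nullptr || compaction_filter_factory != nullptr".
enum class BottommostLevelCompaction {
  kSkip,
  kIfHaveCompactionFilter,
  kForce,
  kForceOptimized
};

bool NeedsBottommostRecompaction(BottommostLevelCompaction mode,
                                 bool has_compaction_filter) {
  switch (mode) {
    case BottommostLevelCompaction::kForce:
    case BottommostLevelCompaction::kForceOptimized:
      return true;  // always rewrite the bottommost level
    case BottommostLevelCompaction::kIfHaveCompactionFilter:
      return has_compaction_filter;  // only if a filter could drop or rewrite keys
    case BottommostLevelCompaction::kSkip:
    default:
      return false;
  }
}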
TEST_SYNC_POINT("DBImpl::CompactFilesImpl::PostSanitizeCompactionInputFiles"); @@ -1451,7 +1527,8 @@ Status DBImpl::CompactFilesImpl( // without releasing the lock, so we're guaranteed a compaction can be formed. assert(c != nullptr); - c->SetInputVersion(version); + c->FinalizeInputInfo(version); + // deletion compaction currently not allowed in CompactFiles. assert(!c->deletion_compaction()); @@ -1501,7 +1578,12 @@ Status DBImpl::CompactFilesImpl( TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); - Status status = compaction_job.Install(*c->mutable_cf_options()); + bool compaction_released = false; + Status status = + compaction_job.Install(*c->mutable_cf_options(), &compaction_released); + if (!compaction_released) { + c->ReleaseCompactionFiles(s); + } if (status.ok()) { assert(compaction_job.io_status().ok()); InstallSuperVersionAndScheduleWork(c->column_family_data(), @@ -1512,21 +1594,18 @@ Status DBImpl::CompactFilesImpl( // not check compaction_job.io_status() explicitly if we're not calling // SetBGError compaction_job.io_status().PermitUncheckedError(); - c->ReleaseCompactionFiles(s); -#ifndef ROCKSDB_LITE // Need to make sure SstFileManager does its bookkeeping auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm && sfm_reserved_compact_space) { sfm->OnCompactionCompletion(c.get()); } -#endif // ROCKSDB_LITE ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); if (compaction_job_info != nullptr) { BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats, - job_context->job_id, version, compaction_job_info); + job_context->job_id, compaction_job_info); } if (status.ok()) { @@ -1577,7 +1656,6 @@ Status DBImpl::CompactFilesImpl( return status; } -#endif // ROCKSDB_LITE Status DBImpl::PauseBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); @@ -1611,7 +1689,6 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, const Status& st, const CompactionJobStats& job_stats, int job_id) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.empty()) { return; } @@ -1625,34 +1702,23 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c, } c->SetNotifyOnCompactionCompleted(); - Version* current = cfd->current(); - current->Ref(); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); { CompactionJobInfo info{}; - BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info); + BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info); for (auto listener : immutable_db_options_.listeners) { listener->OnCompactionBegin(this, info); } info.status.PermitUncheckedError(); } mutex_.Lock(); - current->Unref(); -#else - (void)cfd; - (void)c; - (void)st; - (void)job_stats; - (void)job_id; -#endif // ROCKSDB_LITE } void DBImpl::NotifyOnCompactionCompleted( ColumnFamilyData* cfd, Compaction* c, const Status& st, const CompactionJobStats& compaction_job_stats, const int job_id) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.listeners.size() == 0U) { return; } @@ -1665,30 +1731,19 @@ void DBImpl::NotifyOnCompactionCompleted( return; } - Version* current = cfd->current(); - current->Ref(); // release lock while notifying events mutex_.Unlock(); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); { CompactionJobInfo info{}; - BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, - &info); + BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info); for (auto listener : 
immutable_db_options_.listeners) { listener->OnCompactionCompleted(this, info); } } mutex_.Lock(); - current->Unref(); // no need to signal bg_cv_ as it will be signaled at the end of the // flush process. -#else - (void)cfd; - (void)c; - (void)st; - (void)compaction_job_stats; - (void)job_id; -#endif // ROCKSDB_LITE } // REQUIREMENT: block all background work by calling PauseBackgroundWork() @@ -1699,6 +1754,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { return Status::InvalidArgument("Target level exceeds number of levels"); } + const ReadOptions read_options(Env::IOActivity::kCompaction); + SuperVersionContext sv_context(/* create_superversion */ true); InstrumentedMutexLock guard_lock(&mutex_); @@ -1807,14 +1864,19 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size, + f->user_defined_timestamps_persisted); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, - &mutex_, directories_.GetDbDir()); + Status status = + versions_->LogAndApply(cfd, mutable_cf_options, read_options, &edit, + &mutex_, directories_.GetDbDir()); + + cfd->compaction_picker()->UnregisterCompaction(c.get()); + c.reset(); cfd->compaction_picker()->UnregisterCompaction(c.get()); c.reset(); @@ -1856,6 +1918,37 @@ int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { ->mutable_cf_options.level0_stop_writes_trigger; } +Status DBImpl::FlushAllColumnFamilies(const FlushOptions& flush_options, + FlushReason flush_reason) { + mutex_.AssertHeld(); + Status status; + if (immutable_db_options_.atomic_flush) { + mutex_.Unlock(); + status = AtomicFlushMemTables(flush_options, flush_reason); + if (status.IsColumnFamilyDropped()) { + status = Status::OK(); + } + mutex_.Lock(); + } else { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { + if (cfd->IsDropped()) { + continue; + } + mutex_.Unlock(); + status = FlushMemTable(cfd, flush_options, flush_reason); + TEST_SYNC_POINT("DBImpl::FlushAllColumnFamilies:1"); + TEST_SYNC_POINT("DBImpl::FlushAllColumnFamilies:2"); + mutex_.Lock(); + if (!status.ok() && !status.IsColumnFamilyDropped()) { + break; + } else if (status.IsColumnFamilyDropped()) { + status = Status::OK(); + } + } + } + return status; +} + Status DBImpl::Flush(const FlushOptions& flush_options, ColumnFamilyHandle* column_family) { auto cfh = static_cast_with_check(column_family); @@ -1863,8 +1956,8 @@ Status DBImpl::Flush(const FlushOptions& flush_options, cfh->GetName().c_str()); Status s; if (immutable_db_options_.atomic_flush) { - s = AtomicFlushMemTables({cfh->cfd()}, flush_options, - FlushReason::kManualFlush); + s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, + {cfh->cfd()}); } else { s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush); } @@ -1902,7 +1995,7 @@ Status DBImpl::Flush(const FlushOptions& flush_options, auto cfh = static_cast(elem); cfds.emplace_back(cfh->cfd()); }); - s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush); + s = AtomicFlushMemTables(flush_options, FlushReason::kManualFlush, cfds); 
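// A minimal sketch, not taken from this patch, of the per-column-family loop
// inside the new FlushAllColumnFamilies(): the DB mutex is dropped around each
// potentially slow flush and re-acquired afterwards, and a failure caused only
// by the column family being dropped is treated as success. FlushOneCf, the
// FlushResult enum and the integer cf ids are hypothetical simplifications.
#include <functional>
#include <mutex>
#include <vector>

enum class FlushResult { kOk, kColumnFamilyDropped, kError };

FlushResult FlushAll(std::mutex& db_mutex, const std::vector<int>& cf_ids,
                     const std::function<FlushResult(int)>& FlushOneCf) {
  // The real code runs with the DB mutex already held; the sketch takes it here.
  std::unique_lock<std::mutex> lock(db_mutex);
  FlushResult result = FlushResult::kOk;
  for (int cf : cf_ids) {
    lock.unlock();  // never hold the DB mutex across a flush
    const FlushResult r = FlushOneCf(cf);
    lock.lock();
    if (r == FlushResult::kError) {
      result = r;  // real errors stop the loop
      break;
    }
    // kColumnFamilyDropped is ignored: "flush everything" tolerates races
    // with DropColumnFamily().
  }
  return result;
}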
ROCKS_LOG_INFO(immutable_db_options_.info_log, "Manual atomic flush finished, status: %s\n" "=====Column families:=====", @@ -1922,7 +2015,8 @@ Status DBImpl::RunManualCompaction( ColumnFamilyData* cfd, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const Slice* begin, const Slice* end, bool exclusive, bool disallow_trivial_move, - uint64_t max_file_num_to_ignore, const std::string& trim_ts) { + uint64_t max_file_num_to_ignore, const std::string& trim_ts, + int* final_output_level) { assert(input_level == ColumnFamilyData::kCompactAllLevels || input_level >= 0); @@ -2073,6 +2167,15 @@ Status DBImpl::RunManualCompaction( } else if (!scheduled) { if (compaction == nullptr) { manual.done = true; + if (final_output_level) { + // No compaction needed or there is a conflicting compaction. + // Still set `final_output_level` to the level where we would + // have compacted to. + *final_output_level = output_level; + if (output_level == ColumnFamilyData::kCompactToBaseLevel) { + *final_output_level = cfd->current()->storage_info()->base_level(); + } + } bg_cv_.SignalAll(); continue; } @@ -2106,6 +2209,9 @@ Status DBImpl::RunManualCompaction( } scheduled = true; TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled"); + if (final_output_level) { + *final_output_level = compaction->output_level(); + } } } @@ -2134,7 +2240,8 @@ void DBImpl::GenerateFlushRequest(const autovector& cfds, // cfd may be null, see DBImpl::ScheduleFlushes continue; } - uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID(); + uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID( + immutable_db_options_.atomic_flush /* for_atomic_flush */); req->cfd_to_max_mem_id_to_persist.emplace(cfd, max_memtable_id); } } @@ -2182,15 +2289,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } WaitForPendingWrites(); - if (flush_reason != FlushReason::kErrorRecoveryRetryFlush && - (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) { - // Note that, when flush reason is kErrorRecoveryRetryFlush, during the - // auto retry resume, we want to avoid creating new small memtables. - // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl - // will iterate through all the CFs and call FlushMemtable during auto - // retry resume, it is possible that in some CFs, - // cfd->imm()->NumNotFlushed() = 0. In this case, so no flush request will - // be created and scheduled, status::OK() will be returned. 
+ if (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { s = SwitchMemtable(cfd, &context); } const uint64_t flush_memtable_id = std::numeric_limits::max(); @@ -2199,10 +2298,10 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, !cached_recoverable_state_empty_.load()) { FlushRequest req{flush_reason, {{cfd, flush_memtable_id}}}; flush_reqs.emplace_back(std::move(req)); - memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID()); + memtable_ids_to_wait.emplace_back( + cfd->imm()->GetLatestMemTableID(false /* for_atomic_flush */)); } - if (immutable_db_options_.persist_stats_to_disk && - flush_reason != FlushReason::kErrorRecoveryRetryFlush) { + if (immutable_db_options_.persist_stats_to_disk) { ColumnFamilyData* cfd_stats = versions_->GetColumnFamilySet()->GetColumnFamily( kPersistentStatsColumnFamilyName); @@ -2228,7 +2327,8 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, FlushRequest req{flush_reason, {{cfd_stats, flush_memtable_id}}}; flush_reqs.emplace_back(std::move(req)); memtable_ids_to_wait.emplace_back( - cfd_stats->imm()->GetLatestMemTableID()); + cfd_stats->imm()->GetLatestMemTableID( + false /* for_atomic_flush */)); } } } @@ -2279,8 +2379,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, } s = WaitForFlushMemTables( cfds, flush_memtable_ids, - (flush_reason == FlushReason::kErrorRecovery || - flush_reason == FlushReason::kErrorRecoveryRetryFlush)); + flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */); InstrumentedMutexLock lock_guard(&mutex_); for (auto* tmp_cfd : cfds) { tmp_cfd->UnrefAndTryDelete(); @@ -2290,11 +2389,9 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, return s; } -// Flush all elements in 'column_family_datas' -// and atomically record the result to the MANIFEST. 
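// A minimal sketch, not taken from this patch, of the clean-up obligation the
// new AtomicFlushMemTables() signature introduces: when no candidate column
// families are provided, the function takes its own references and must drop
// them on every early return. The diff does this with explicit unref loops;
// the scope guard below expresses the same invariant with RAII. Cfd, Ref() and
// Unref() are hypothetical stand-ins for ColumnFamilyData reference counting.
#include <utility>
#include <vector>

struct Cfd {
  int refs = 0;
  void Ref() { ++refs; }
  void Unref() { --refs; }
};

class SelfRefGuard {
 public:
  explicit SelfRefGuard(std::vector<Cfd*> owned) : owned_(std::move(owned)) {
    for (Cfd* c : owned_) c->Ref();
  }
  ~SelfRefGuard() {  // runs on every return path, early or not
    for (Cfd* c : owned_) c->Unref();
  }
  SelfRefGuard(const SelfRefGuard&) = delete;
  SelfRefGuard& operator=(const SelfRefGuard&) = delete;

 private:
  std::vector<Cfd*> owned_;
};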
Status DBImpl::AtomicFlushMemTables( - const autovector& column_family_datas, const FlushOptions& flush_options, FlushReason flush_reason, + const autovector& provided_candidate_cfds, bool entered_write_thread) { assert(immutable_db_options_.atomic_flush); #if 0 // RocksDB-Cloud disabled @@ -2306,18 +2403,48 @@ Status DBImpl::AtomicFlushMemTables( } #endif Status s; + autovector candidate_cfds; + if (provided_candidate_cfds.empty()) { + // Generate candidate cfds if not provided + { + InstrumentedMutexLock l(&mutex_); + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + candidate_cfds.push_back(cfd); + } + } + } + } else { + candidate_cfds = provided_candidate_cfds; + } + if (!flush_options.allow_write_stall) { int num_cfs_to_flush = 0; - for (auto cfd : column_family_datas) { + for (auto cfd : candidate_cfds) { bool flush_needed = true; s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed); if (!s.ok()) { + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); + } + } return s; } else if (flush_needed) { ++num_cfs_to_flush; } } if (0 == num_cfs_to_flush) { + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); + } + } return s; } } @@ -2338,21 +2465,7 @@ Status DBImpl::AtomicFlushMemTables( } WaitForPendingWrites(); - if (immutable_db_options_.replication_log_listener) { - // If replication_log_listener is installed the only thing we are - // allowed to do is flush all column families. - SelectColumnFamiliesForAtomicFlush(&cfds); - } else { - for (auto cfd : column_family_datas) { - if (cfd->IsDropped()) { - continue; - } - if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || - !cached_recoverable_state_empty_.load()) { - cfds.emplace_back(cfd); - } - } - } + SelectColumnFamiliesForAtomicFlush(&cfds, candidate_cfds); MemTableSwitchRecord mem_switch_record; std::string replication_sequence; @@ -2363,9 +2476,16 @@ Status DBImpl::AtomicFlushMemTables( mem_switch_record); } + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); + } + } + for (auto cfd : cfds) { - if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) || - flush_reason == FlushReason::kErrorRecoveryRetryFlush) { + if (cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) { continue; } cfd->Ref(); @@ -2417,8 +2537,7 @@ Status DBImpl::AtomicFlushMemTables( } s = WaitForFlushMemTables( cfds, flush_memtable_ids, - (flush_reason == FlushReason::kErrorRecovery || - flush_reason == FlushReason::kErrorRecoveryRetryFlush)); + flush_reason == FlushReason::kErrorRecovery /* resuming_from_bg_err */); InstrumentedMutexLock lock_guard(&mutex_); for (auto* cfd : cfds) { cfd->UnrefAndTryDelete(); @@ -2427,6 +2546,68 @@ Status DBImpl::AtomicFlushMemTables( return s; } +Status DBImpl::RetryFlushesForErrorRecovery(FlushReason flush_reason, + bool wait) { + mutex_.AssertHeld(); + assert(flush_reason == FlushReason::kErrorRecoveryRetryFlush || + flush_reason == FlushReason::kCatchUpAfterErrorRecovery); + + // Collect referenced CFDs. 
+ autovector cfds; + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized() && + cfd->imm()->NumNotFlushed() != 0) { + cfd->Ref(); + cfd->imm()->FlushRequested(); + cfds.push_back(cfd); + } + } + + // Submit flush requests for all immutable memtables needing flush. + // `flush_memtable_ids` will be populated such that all immutable + // memtables eligible for flush are waited on before this function + // returns. + autovector flush_memtable_ids; + if (immutable_db_options_.atomic_flush) { + FlushRequest flush_req; + GenerateFlushRequest(cfds, flush_reason, &flush_req); + SchedulePendingFlush(flush_req); + for (auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { + flush_memtable_ids.push_back(iter.second); + } + } else { + for (auto cfd : cfds) { + flush_memtable_ids.push_back( + cfd->imm()->GetLatestMemTableID(false /* for_atomic_flush */)); + // Impose no bound on the highest memtable ID flushed. There is no + // reason to do so outside of atomic flush. + FlushRequest flush_req{ + flush_reason, + {{cfd, + std::numeric_limits::max() /* max_mem_id_to_persist */}}}; + SchedulePendingFlush(flush_req); + } + } + MaybeScheduleFlushOrCompaction(); + + Status s; + if (wait) { + mutex_.Unlock(); + autovector flush_memtable_id_ptrs; + for (auto& flush_memtable_id : flush_memtable_ids) { + flush_memtable_id_ptrs.push_back(&flush_memtable_id); + } + s = WaitForFlushMemTables(cfds, flush_memtable_id_ptrs, + true /* resuming_from_bg_err */); + mutex_.Lock(); + } + + for (auto* cfd : cfds) { + cfd->UnrefAndTryDelete(); + } + return s; +} + // Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can // cause write stall, for example if one memtable is being flushed already. // This method tries to avoid write stall (similar to CompactRange() behavior) @@ -2490,8 +2671,11 @@ Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd, // check whether one extra immutable memtable or an extra L0 file would // cause write stalling mode to be entered. It could still enter stall // mode due to pending compaction bytes, but that's less common + // No extra immutable Memtable will be created if the current Memtable is + // empty. + int mem_to_flush = cfd->mem()->IsEmpty() ? 0 : 1; write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause( - cfd->imm()->NumNotFlushed() + 1, + cfd->imm()->NumNotFlushed() + mem_to_flush, vstorage->l0_delay_trigger_count() + 1, vstorage->estimated_compaction_needed_bytes(), mutable_cf_options, *cfd->ioptions()) @@ -2625,6 +2809,7 @@ void DBImpl::EnableManualCompaction() { void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); + TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Start"); if (!opened_successfully_) { // Compaction may introduce data race to DB open return; @@ -2637,6 +2822,11 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { // There has been a hard error and this call is not part of the recovery // sequence. Bail out here so we don't get into an endless loop of // scheduling BG work which will again call this function + // + // Note that a non-recovery flush can still be scheduled if + // error_handler_.IsRecoveryInProgress() returns true. We rely on + // BackgroundCallFlush() to check flush reason and drop non-recovery + // flushes. 
return; } else if (shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions @@ -2647,6 +2837,9 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { env_->GetBackgroundThreads(Env::Priority::HIGH) == 0; while (!is_flush_pool_empty && unscheduled_flushes_ > 0 && bg_flush_scheduled_ < bg_job_limits.max_flushes) { + TEST_SYNC_POINT_CALLBACK( + "DBImpl::MaybeScheduleFlushOrCompaction:BeforeSchedule", + &unscheduled_flushes_); bg_flush_scheduled_++; FlushThreadArg* fta = new FlushThreadArg; fta->db_ = this; @@ -2756,7 +2949,7 @@ ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() { assert(!flush_queue_.empty()); - FlushRequest flush_req = flush_queue_.front(); + FlushRequest flush_req = std::move(flush_queue_.front()); flush_queue_.pop_front(); if (!immutable_db_options_.atomic_flush) { assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); @@ -2800,6 +2993,9 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue( void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) { mutex_.AssertHeld(); + if (reject_new_background_jobs_) { + return; + } if (flush_req.cfd_to_max_mem_id_to_persist.empty()) { return; } @@ -2829,6 +3025,9 @@ void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req) { void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { mutex_.AssertHeld(); + if (reject_new_background_jobs_) { + return; + } if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) { AddToCompactionQueue(cfd); ++unscheduled_compactions_; @@ -2838,6 +3037,9 @@ void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync, FileType type, uint64_t number, int job_id) { mutex_.AssertHeld(); + if (reject_new_background_jobs_) { + return; + } PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id); purge_files_.insert({{number, std::move(file_info)}}); } @@ -2926,6 +3128,7 @@ void DBImpl::UnscheduleFlushCallback(void* arg) { Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, FlushReason* reason, + bool* flush_rescheduled_to_retain_udt, Env::Priority thread_pri) { mutex_.AssertHeld(); @@ -2951,14 +3154,61 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, autovector column_families_not_to_flush; while (!flush_queue_.empty()) { // This cfd is already referenced - const FlushRequest& flush_req = PopFirstFromFlushQueue(); + FlushRequest flush_req = PopFirstFromFlushQueue(); FlushReason flush_reason = flush_req.flush_reason; + if (!error_handler_.GetBGError().ok() && error_handler_.IsBGWorkStopped() && + flush_reason != FlushReason::kErrorRecovery && + flush_reason != FlushReason::kErrorRecoveryRetryFlush) { + // Stop non-recovery flush when bg work is stopped + // Note that we drop the flush request here. + // Recovery thread should schedule further flushes after bg error + // is cleared. 
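// A minimal sketch, not taken from this patch, of the check BackgroundFlush()
// applies to each queued request: once background work has been stopped by an
// error, only recovery flushes may proceed; everything else is dropped and the
// stored background error is reported. The Reason enum is a hypothetical
// reduction of FlushReason.
enum class Reason {
  kManualFlush,
  kWriteBufferFull,
  kErrorRecovery,
  kErrorRecoveryRetryFlush
};

// Returns true when the queued flush must be dropped instead of executed.
bool ShouldDropOnBgError(bool bg_work_stopped, bool bg_error_set, Reason r) {
  const bool is_recovery_flush =
      r == Reason::kErrorRecovery || r == Reason::kErrorRecoveryRetryFlush;
  return bg_work_stopped && bg_error_set && !is_recovery_flush;
}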
+ status = error_handler_.GetBGError(); + assert(!status.ok()); + ROCKS_LOG_BUFFER(log_buffer, + "[JOB %d] Abort flush due to background error %s", + job_context->job_id, status.ToString().c_str()); + *reason = flush_reason; + for (auto item : flush_req.cfd_to_max_mem_id_to_persist) { + item.first->UnrefAndTryDelete(); + } + return status; + } + if (!immutable_db_options_.atomic_flush && + ShouldRescheduleFlushRequestToRetainUDT(flush_req)) { + assert(flush_req.cfd_to_max_mem_id_to_persist.size() == 1); + ColumnFamilyData* cfd = + flush_req.cfd_to_max_mem_id_to_persist.begin()->first; + if (cfd->UnrefAndTryDelete()) { + return Status::OK(); + } + ROCKS_LOG_BUFFER(log_buffer, + "FlushRequest for column family %s is re-scheduled to " + "retain user-defined timestamps.", + cfd->GetName().c_str()); + // Reschedule the `FlushRequest` as is without checking dropped column + // family etc. The follow-up job will do the check anyways, so save the + // duplication. Column family is deduplicated by `SchdulePendingFlush` and + // `PopFirstFromFlushQueue` contains at flush request enqueueing and + // dequeueing time. + // This flush request is rescheduled right after it's popped from the + // queue while the db mutex is held, so there should be no other + // FlushRequest for the same column family with higher `max_memtable_id` + // in the queue to block the reschedule from succeeding. +#ifndef NDEBUG + flush_req.reschedule_count += 1; +#endif /* !NDEBUG */ + SchedulePendingFlush(flush_req); + *reason = flush_reason; + *flush_rescheduled_to_retain_udt = true; + return Status::TryAgain(); + } superversion_contexts.clear(); superversion_contexts.reserve( flush_req.cfd_to_max_mem_id_to_persist.size()); - for (const auto& iter : flush_req.cfd_to_max_mem_id_to_persist) { - ColumnFamilyData* cfd = iter.first; + for (const auto& [cfd, max_memtable_id] : + flush_req.cfd_to_max_mem_id_to_persist) { if (cfd->GetMempurgeUsed()) { // If imm() contains silent memtables (e.g.: because // MemPurge was activated), requesting a flush will @@ -2972,10 +3222,16 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, continue; } superversion_contexts.emplace_back(SuperVersionContext(true)); - bg_flush_args.emplace_back(cfd, iter.second, + bg_flush_args.emplace_back(cfd, max_memtable_id, &(superversion_contexts.back()), flush_reason); } - if (!bg_flush_args.empty()) { + // `MaybeScheduleFlushOrCompaction` schedules as many `BackgroundCallFlush` + // jobs as the number of `FlushRequest` in the `flush_queue_`, a.k.a + // `unscheduled_flushes_`. So it's sufficient to make each `BackgroundFlush` + // handle one `FlushRequest` and each have a Status returned. 
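// A minimal sketch, not taken from this patch, of the re-scheduling hand-shake
// added for retaining user-defined timestamps: the worker puts the request
// back on the queue and reports TryAgain, and the calling loop backs off
// briefly (mirroring the short sleep in BackgroundCallFlush) so the retry does
// not spin. The queue of ints, ShouldReschedule and DoFlush are hypothetical;
// a real implementation eventually stops rescheduling once the timestamps can
// be expired.
#include <chrono>
#include <deque>
#include <functional>
#include <thread>

enum class Outcome { kDone, kTryAgain };

Outcome RunOneFlush(std::deque<int>& queue,
                    const std::function<bool(int)>& ShouldReschedule,
                    const std::function<void(int)>& DoFlush) {
  if (queue.empty()) return Outcome::kDone;
  const int req = queue.front();
  queue.pop_front();
  if (ShouldReschedule(req)) {
    queue.push_back(req);       // keep the request for a later attempt
    return Outcome::kTryAgain;  // nothing was flushed this round
  }
  DoFlush(req);
  return Outcome::kDone;
}

void FlushLoop(std::deque<int>& queue,
               const std::function<bool(int)>& ShouldReschedule,
               const std::function<void(int)>& DoFlush) {
  while (!queue.empty()) {
    if (RunOneFlush(queue, ShouldReschedule, DoFlush) == Outcome::kTryAgain) {
      std::this_thread::sleep_for(std::chrono::milliseconds(100));  // back off
    }
  }
}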
+ if (!bg_flush_args.empty() || !column_families_not_to_flush.empty()) { + TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundFlush:CheckFlushRequest:cb", + const_cast(&flush_req.reschedule_count)); break; } } @@ -3000,7 +3256,7 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, // All the CFD/bg_flush_arg in the FlushReq must have the same flush reason, so // just grab the first one #ifndef NDEBUG - for (const auto bg_flush_arg : bg_flush_args) { + for (const auto& bg_flush_arg : bg_flush_args) { assert(bg_flush_arg.flush_reason_ == bg_flush_args[0].flush_reason_); } #endif /* !NDEBUG */ @@ -3037,11 +3293,20 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { pending_outputs_inserted_elem(new std::list::iterator( CaptureCurrentFileNumberInPendingOutputs())); FlushReason reason; - - Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer, - &reason, thread_pri); - if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() && - reason != FlushReason::kErrorRecovery) { + bool flush_rescheduled_to_retain_udt = false; + Status s = + BackgroundFlush(&made_progress, &job_context, &log_buffer, &reason, + &flush_rescheduled_to_retain_udt, thread_pri); + if (s.IsTryAgain() && flush_rescheduled_to_retain_udt) { + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + TEST_SYNC_POINT_CALLBACK("DBImpl::AfterRetainUDTReschedule:cb", nullptr); + immutable_db_options_.clock->SleepForMicroseconds( + 100000); // prevent hot loop + mutex_.Lock(); + } else if (!s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped() && + reason != FlushReason::kErrorRecovery) { // Wait a little bit before retrying background flush in // case this is an environmental problem and we do not want to // chew up resources for failed flushes for the duration of @@ -3051,9 +3316,9 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); ROCKS_LOG_ERROR(immutable_db_options_.info_log, - "Waiting after background flush error: %s" + "[JOB %d] Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, - s.ToString().c_str(), error_cnt); + job_context.job_id, s.ToString().c_str(), error_cnt); log_buffer.FlushBufferToLog(); LogFlush(immutable_db_options_.info_log); immutable_db_options_.clock->SleepForMicroseconds(1000000); @@ -3062,29 +3327,33 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0"); ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - - // If flush failed, we want to delete all temporary files that we might have - // created. Thus, we force full scan in FindObsoleteFiles() - FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && - !s.IsColumnFamilyDropped()); - // delete unnecessary files if any, this is done outside the mutex - if (job_context.HaveSomethingToClean() || - job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { - mutex_.Unlock(); - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); - // Have to flush the info logs before bg_flush_scheduled_-- - // because if bg_flush_scheduled_ becomes 0 and the lock is - // released, the deconstructor of DB can kick in and destroy all the - // states of DB so info_log might not be available after that point. - // It also applies to access other states that DB owns. 
- log_buffer.FlushBufferToLog(); - if (job_context.HaveSomethingToDelete()) { - PurgeObsoleteFiles(job_context); + // There is no need to do these clean up if the flush job is rescheduled + // to retain user-defined timestamps because the job doesn't get to the + // stage of actually flushing the MemTables. + if (!flush_rescheduled_to_retain_udt) { + // If flush failed, we want to delete all temporary files that we might + // have created. Thus, we force full scan in FindObsoleteFiles() + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsColumnFamilyDropped()); + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound"); + // Have to flush the info logs before bg_flush_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + mutex_.Lock(); } - job_context.Clean(); - mutex_.Lock(); + TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp"); } - TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp"); assert(num_running_flushes_ > 0); num_running_flushes_--; @@ -3236,6 +3505,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, mutex_.AssertHeld(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); + const ReadOptions read_options(Env::IOActivity::kCompaction); + bool is_manual = (manual_compaction != nullptr); std::unique_ptr c; if (prepicked_compaction != nullptr && @@ -3289,8 +3560,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, std::unique_ptr task_token; - // InternalKey manual_end_storage; - // InternalKey* manual_end = &manual_end_storage; bool sfm_reserved_compact_space = false; if (is_manual) { ManualCompactionState* m = manual_compaction; @@ -3426,6 +3695,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } IOStatus io_s; + bool compaction_released = false; if (!c) { // Nothing to do ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do"); @@ -3446,9 +3716,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir(), + /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, + [&c, &compaction_released](const Status& s) { + c->ReleaseCompactionFiles(s); + compaction_released = true; + }); io_s = versions_->io_status(); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -3456,6 +3731,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); + if (status.ok() && io_s.ok()) { + UpdateDeletionCompactionStats(c); + } *made_progress = true; 
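// A minimal sketch, not taken from this patch, of the "release exactly once"
// contract introduced around ReleaseCompactionFiles(): LogAndApply may invoke
// a callback that releases the compaction's input files while it still holds
// the relevant state; if the callback never ran, the caller releases them
// itself, and the files are never released twice. ApplyEdit stands in for
// VersionSet::LogAndApply and Compaction::Release for ReleaseCompactionFiles.
#include <functional>

struct Compaction {
  bool released = false;
  void Release() { released = true; }
};

void InstallCompactionResults(
    Compaction& c,
    const std::function<void(const std::function<void()>&)>& ApplyEdit) {
  bool compaction_released = false;
  // ApplyEdit decides whether (and when) to run the release callback.
  ApplyEdit([&] {
    c.Release();
    compaction_released = true;
  });
  if (!compaction_released) {
    c.Release();  // fallback: exactly one of the two paths releases the files
  }
}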
TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction", c->column_family_data()); @@ -3465,9 +3743,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data()); // Instrument for event update // TODO(yhchiang): add op details for showing trivial-move. - ThreadStatusUtil::SetColumnFamily( - c->column_family_data(), c->column_family_data()->ioptions()->env, - immutable_db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(c->column_family_data()); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); compaction_job_stats.num_input_files = c->num_input_files(0); @@ -3492,7 +3768,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size, + f->user_defined_timestamps_persisted); ROCKS_LOG_BUFFER( log_buffer, @@ -3513,9 +3790,14 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, vstorage->GetNextCompactCursor(start_level, c->num_input_files(0))); } } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir(), + /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, + [&c, &compaction_released](const Status& s) { + c->ReleaseCompactionFiles(s); + compaction_released = true; + }); io_s = versions_->io_status(); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork(c->column_family_data(), @@ -3565,6 +3847,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, // Transfer requested token, so it doesn't need to do it again. ca->prepicked_compaction->task_token = std::move(task_token); ++bg_bottom_compaction_scheduled_; + assert(c == nullptr); env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM, this, &DBImpl::UnscheduleCompactionCallback); } else { @@ -3605,12 +3888,12 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, mutex_.Unlock(); TEST_SYNC_POINT_CALLBACK( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); - // Should handle erorr? + // Should handle error? compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); - - status = compaction_job.Install(*c->mutable_cf_options()); + status = + compaction_job.Install(*c->mutable_cf_options(), &compaction_released); io_s = compaction_job.io_status(); if (status.ok()) { InstallSuperVersionAndScheduleWork(c->column_family_data(), @@ -3629,17 +3912,31 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, } if (c != nullptr) { - c->ReleaseCompactionFiles(status); + if (!compaction_released) { + c->ReleaseCompactionFiles(status); + } else { +#ifndef NDEBUG + // Sanity checking that compaction files are freed. 
+ for (size_t i = 0; i < c->num_input_levels(); i++) { + for (size_t j = 0; j < c->inputs(i)->size(); j++) { + assert(!c->input(i, j)->being_compacted); + } + } + std::unordered_set* cip = c->column_family_data() + ->compaction_picker() + ->compactions_in_progress(); + assert(cip->find(c.get()) == cip->end()); +#endif + } + *made_progress = true; -#ifndef ROCKSDB_LITE // Need to make sure SstFileManager does its bookkeeping auto sfm = static_cast( immutable_db_options_.sst_file_manager.get()); if (sfm && sfm_reserved_compact_space) { sfm->OnCompactionCompletion(c.get()); } -#endif // ROCKSDB_LITE NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status, compaction_job_stats, job_context->job_id); @@ -3816,11 +4113,31 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) { return false; } -#ifndef ROCKSDB_LITE +void DBImpl::UpdateDeletionCompactionStats( + const std::unique_ptr& c) { + if (c == nullptr) { + return; + } + + CompactionReason reason = c->compaction_reason(); + + switch (reason) { + case CompactionReason::kFIFOMaxSize: + RecordTick(stats_, FIFO_MAX_SIZE_COMPACTIONS); + break; + case CompactionReason::kFIFOTtl: + RecordTick(stats_, FIFO_TTL_COMPACTIONS); + break; + default: + assert(false); + break; + } +} + void DBImpl::BuildCompactionJobInfo( const ColumnFamilyData* cfd, Compaction* c, const Status& st, const CompactionJobStats& compaction_job_stats, const int job_id, - const Version* current, CompactionJobInfo* compaction_job_info) const { + CompactionJobInfo* compaction_job_info) const { assert(compaction_job_info != nullptr); compaction_job_info->cf_id = cfd->GetID(); compaction_job_info->cf_name = cfd->GetName(); @@ -3830,9 +4147,16 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->base_input_level = c->start_level(); compaction_job_info->output_level = c->output_level(); compaction_job_info->stats = compaction_job_stats; - compaction_job_info->table_properties = c->GetOutputTableProperties(); + const auto& input_table_properties = c->GetInputTableProperties(); + const auto& output_table_properties = c->GetOutputTableProperties(); + compaction_job_info->table_properties.insert(input_table_properties.begin(), + input_table_properties.end()); + compaction_job_info->table_properties.insert(output_table_properties.begin(), + output_table_properties.end()); compaction_job_info->compaction_reason = c->compaction_reason(); compaction_job_info->compression = c->output_compression(); + + const ReadOptions read_options(Env::IOActivity::kCompaction); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; @@ -3842,15 +4166,9 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->input_files.push_back(fn); compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ static_cast(i), file_number, fmd->oldest_blob_file_number}); - if (compaction_job_info->table_properties.count(fn) == 0) { - std::shared_ptr tp; - auto s = current->GetTableProperties(&tp, fmd, &fn); - if (s.ok()) { - compaction_job_info->table_properties[fn] = tp; - } - } } } + for (const auto& newf : c->edit()->GetNewFiles()) { const FileMetaData& meta = newf.second; const FileDescriptor& desc = meta.fd; @@ -3885,7 +4203,6 @@ void DBImpl::BuildCompactionJobInfo( std::move(blob_file_garbage_info)); } } -#endif // SuperVersionContext gets created and destructed outside of the lock -- // we use this conveniently to: @@ -3995,16 +4312,58 @@ void DBImpl::GetSnapshotContext( *snapshot_seqs = 
snapshots_.GetAll(earliest_write_conflict_snapshot); } -Status DBImpl::WaitForCompact(bool wait_unscheduled) { - // Wait until the compaction completes +Status DBImpl::WaitForCompact( + const WaitForCompactOptions& wait_for_compact_options) { InstrumentedMutexLock l(&mutex_); - while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || - bg_flush_scheduled_ || - (wait_unscheduled && unscheduled_compactions_)) && - (error_handler_.GetBGError().ok())) { - bg_cv_.Wait(); + if (wait_for_compact_options.flush) { + Status s = DBImpl::FlushAllColumnFamilies(FlushOptions(), + FlushReason::kManualFlush); + if (!s.ok()) { + return s; + } + } else if (wait_for_compact_options.close_db && + has_unpersisted_data_.load(std::memory_order_relaxed) && + !mutable_db_options_.avoid_flush_during_shutdown) { + Status s = + DBImpl::FlushAllColumnFamilies(FlushOptions(), FlushReason::kShutDown); + if (!s.ok()) { + return s; + } + } + TEST_SYNC_POINT("DBImpl::WaitForCompact:StartWaiting"); + const auto deadline = immutable_db_options_.clock->NowMicros() + + wait_for_compact_options.timeout.count(); + for (;;) { + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + if (bg_work_paused_ && wait_for_compact_options.abort_on_pause) { + return Status::Aborted(); + } + if ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ || + bg_flush_scheduled_ || unscheduled_compactions_ || + unscheduled_flushes_ || error_handler_.IsRecoveryInProgress()) && + (error_handler_.GetBGError().ok())) { + if (wait_for_compact_options.timeout.count()) { + if (bg_cv_.TimedWait(deadline)) { + return Status::TimedOut(); + } + } else { + bg_cv_.Wait(); + } + } else if (wait_for_compact_options.close_db) { + reject_new_background_jobs_ = true; + mutex_.Unlock(); + Status s = Close(); + mutex_.Lock(); + if (!s.ok()) { + reject_new_background_jobs_ = false; + } + return s; + } else { + return error_handler_.GetBGError(); + } } - return error_handler_.GetBGError(); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_debug.cc b/db/db_impl/db_impl_debug.cc index c971156b9a3c..2588fd8eb821 100644 --- a/db/db_impl/db_impl_debug.cc +++ b/db/db_impl/db_impl_debug.cc @@ -155,8 +155,10 @@ Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd, } Status DBImpl::TEST_AtomicFlushMemTables( - const autovector& cfds, const FlushOptions& flush_opts) { - return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest); + const autovector& provided_candidate_cfds, + const FlushOptions& flush_opts) { + return AtomicFlushMemTables(flush_opts, FlushReason::kTest, + provided_candidate_cfds); } Status DBImpl::TEST_WaitForBackgroundWork() { @@ -176,9 +178,12 @@ Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) { return WaitForFlushMemTable(cfd, nullptr, false); } -Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) { - // Wait until the compaction completes - return WaitForCompact(wait_unscheduled); +Status DBImpl::TEST_WaitForCompact() { + return WaitForCompact(WaitForCompactOptions()); +} +Status DBImpl::TEST_WaitForCompact( + const WaitForCompactOptions& wait_for_compact_options) { + return WaitForCompact(wait_for_compact_options); } Status DBImpl::TEST_WaitForScheduledCompaction() { @@ -206,6 +211,8 @@ void DBImpl::TEST_LockMutex() { mutex_.Lock(); } void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); } +void DBImpl::TEST_SignalAllBgCv() { bg_cv_.SignalAll(); } + void* DBImpl::TEST_BeginWrite() { auto w = new WriteThread::Writer(); 
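// A minimal sketch, not taken from this patch, of the shape of the new
// WaitForCompact() loop: wait on a condition variable until no background work
// is pending, honour an optional timeout, and abort when background work is
// paused and the caller asked for that. WaitOptions, BgState and WaitStatus
// are hypothetical reductions of WaitForCompactOptions and DBImpl state.
#include <chrono>
#include <condition_variable>
#include <mutex>

struct WaitOptions {
  std::chrono::microseconds timeout{0};  // zero means wait without a deadline
  bool abort_on_pause = false;
};

struct BgState {
  std::mutex mu;
  std::condition_variable cv;  // signalled the way bg_cv_ is in DBImpl
  int pending_jobs = 0;        // scheduled plus unscheduled flushes/compactions
  bool paused = false;
  bool shutting_down = false;
};

enum class WaitStatus { kOk, kTimedOut, kAborted, kShutdownInProgress };

WaitStatus WaitForBackgroundWork(BgState& st, const WaitOptions& opts) {
  std::unique_lock<std::mutex> lock(st.mu);
  const auto deadline = std::chrono::steady_clock::now() + opts.timeout;
  for (;;) {
    if (st.shutting_down) return WaitStatus::kShutdownInProgress;
    if (opts.abort_on_pause && st.paused) return WaitStatus::kAborted;
    if (st.pending_jobs == 0) return WaitStatus::kOk;
    if (opts.timeout.count() > 0) {
      if (st.cv.wait_until(lock, deadline) == std::cv_status::timeout) {
        return WaitStatus::kTimedOut;
      }
    } else {
      st.cv.wait(lock);
    }
  }
}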
write_thread_.EnterUnbatched(w, &mutex_); @@ -297,7 +304,6 @@ size_t DBImpl::TEST_GetWalPreallocateBlockSize( return GetWalPreallocateBlockSize(write_buffer_size); } -#ifndef ROCKSDB_LITE void DBImpl::TEST_WaitForPeriodicTaskRun(std::function callback) const { periodic_task_scheduler_.TEST_WaitForRun(callback); } @@ -308,12 +314,16 @@ const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const { SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const { InstrumentedMutexLock l(&mutex_); - return seqno_time_mapping_; + return seqno_to_time_mapping_; } -#endif // !ROCKSDB_LITE +const autovector& DBImpl::TEST_GetFilesToQuarantine() const { + InstrumentedMutexLock l(&mutex_); + return error_handler_.GetFilesToQuarantine(); +} size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const { + InstrumentedMutexLock l(&const_cast(this)->stats_history_mutex_); return EstimateInMemoryStatsHistorySize(); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 2f732c1e47d0..442cb47679db 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -20,7 +20,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { auto cfh = static_cast_with_check(column_family); @@ -62,7 +61,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { "PromoteL0 FAILED. Invalid target level %d\n", target_level); return Status::InvalidArgument("Invalid target level"); } - + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status status; VersionEdit edit; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -138,11 +138,13 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size, + f->user_defined_timestamps_persisted); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -154,6 +156,5 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { return status; } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index 9834ff4b27f5..3887e9a79407 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -35,6 +35,11 @@ uint64_t DBImpl::MinObsoleteSstNumberToKeep() { return std::numeric_limits::max(); } +uint64_t DBImpl::GetObsoleteSstFilesSize() { + mutex_.AssertHeld(); + return versions_->GetObsoleteSstFilesSize(); +} + Status DBImpl::DisableFileDeletions() { Status s; int my_disable_delete_obsolete_files; @@ -141,6 +146,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // mutex_ cannot be released. Otherwise, we might see no min_pending_output // here but later find newer generated unfinalized files while scanning. job_context->min_pending_output = MinObsoleteSstNumberToKeep(); + job_context->files_to_quarantine = error_handler_.GetFilesToQuarantine(); // Get obsolete files. 
This function will also update the list of // pending files in VersionSet(). @@ -286,6 +292,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, return; } + bool mutex_unlocked = false; if (!alive_log_files_.empty() && !logs_.empty()) { uint64_t min_log_number = job_context->log_number; size_t num_alive_log_files = alive_log_files_.size(); @@ -315,6 +322,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } log_write_mutex_.Unlock(); mutex_.Unlock(); + mutex_unlocked = true; TEST_SYNC_POINT_CALLBACK("FindObsoleteFiles::PostMutexUnlock", nullptr); log_write_mutex_.Lock(); while (!logs_.empty() && logs_.front().number < min_log_number) { @@ -337,7 +345,9 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, logs_to_free_.clear(); log_write_mutex_.Unlock(); - mutex_.Lock(); + if (mutex_unlocked) { + mutex_.Lock(); + } job_context->log_recycle_files.assign(log_recycle_files_.begin(), log_recycle_files_.end()); } @@ -412,6 +422,8 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { state.blob_live.end()); std::unordered_set log_recycle_files_set( state.log_recycle_files.begin(), state.log_recycle_files.end()); + std::unordered_set quarantine_files_set( + state.files_to_quarantine.begin(), state.files_to_quarantine.end()); auto candidate_files = state.full_scan_candidate_files; candidate_files.reserve( @@ -450,12 +462,12 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { std::sort(candidate_files.begin(), candidate_files.end(), [](const JobContext::CandidateFileInfo& lhs, const JobContext::CandidateFileInfo& rhs) { - if (lhs.file_name > rhs.file_name) { + if (lhs.file_name < rhs.file_name) { return true; - } else if (lhs.file_name < rhs.file_name) { + } else if (lhs.file_name > rhs.file_name) { return false; } else { - return (lhs.file_path > rhs.file_path); + return (lhs.file_path < rhs.file_path); } }); candidate_files.erase( @@ -515,6 +527,10 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { continue; } + if (quarantine_files_set.find(number) != quarantine_files_set.end()) { + continue; + } + bool keep = true; switch (type) { case kWalFile: @@ -599,13 +615,11 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { to_delete; } -#ifndef ROCKSDB_LITE if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 || immutable_db_options_.WAL_size_limit_MB > 0)) { wal_manager_.ArchiveWALFile(fname, number); continue; } -#endif // !ROCKSDB_LITE // If I do not own these files, e.g. 
secondary instance with max_open_files // = -1, then no need to delete or schedule delete these files since they @@ -669,9 +683,7 @@ void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) { } } } -#ifndef ROCKSDB_LITE wal_manager_.PurgeObsoleteWALFiles(); -#endif // ROCKSDB_LITE LogFlush(immutable_db_options_.info_log); InstrumentedMutexLock l(&mutex_); --pending_purge_obsolete_files_; @@ -992,7 +1004,7 @@ Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) { if (type == kTableFile && number >= next_file_number && recovery_ctx->files_to_delete_.find(normalized_fpath) == recovery_ctx->files_to_delete_.end()) { - recovery_ctx->files_to_delete_.emplace(normalized_fpath); + recovery_ctx->files_to_delete_.emplace(normalized_fpath, path); } } } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 958652c9fc42..fac365d2261d 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -19,11 +19,14 @@ #include "file/writable_file_writer.h" #include "logging/logging.h" #include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "rocksdb/table.h" #include "rocksdb/wal_filter.h" #include "test_util/sync_point.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" +#include "util/string_util.h" +#include "util/udt_util.h" namespace ROCKSDB_NAMESPACE { Options SanitizeOptions(const std::string& dbname, const Options& src, @@ -141,11 +144,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1); } - if (result.use_direct_reads && result.compaction_readahead_size == 0) { - TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr); - result.compaction_readahead_size = 1024 * 1024 * 2; - } - // Force flush on DB open if 2PC is enabled, since with 2PC we have no // guarantee that consecutive log files have consecutive sequence id, which // make recovery complicated. @@ -153,7 +151,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, result.avoid_flush_during_recovery = false; } -#ifndef ROCKSDB_LITE ImmutableDBOptions immutable_db_options(result); if (!immutable_db_options.IsWalDirSameAsDBPath()) { // Either the WAL dir and db_paths[0]/db_name are not the same, or we @@ -195,7 +192,6 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src, NewSstFileManager(result.env, result.info_log)); result.sst_file_manager = sst_file_manager; } -#endif // !ROCKSDB_LITE // Supported wal compression types if (!StreamingCompressionTypeSupported(result.wal_compression)) { @@ -301,6 +297,18 @@ Status DBImpl::ValidateOptions(const DBOptions& db_options) { "writes in direct IO require writable_file_max_buffer_size > 0"); } + if (db_options.daily_offpeak_time_utc != "") { + int start_time, end_time; + if (!TryParseTimeRangeString(db_options.daily_offpeak_time_utc, start_time, + end_time)) { + return Status::InvalidArgument( + "daily_offpeak_time_utc should be set in the format HH:mm-HH:mm " + "(e.g. 04:30-07:30)"); + } else if (start_time == end_time) { + return Status::InvalidArgument( + "start_time and end_time cannot be the same"); + } + } return Status::OK(); } @@ -415,7 +423,8 @@ Status DBImpl::Recover( uint64_t* recovered_seq, RecoveryContext* recovery_ctx) { mutex_.AssertHeld(); - bool is_new_db = false; + bool tmp_is_new_db = false; + bool& is_new_db = recovery_ctx ? 
recovery_ctx->is_new_db_ : tmp_is_new_db; assert(db_lock_ == nullptr); std::vector files_in_dbname; if (!read_only) { @@ -540,6 +549,106 @@ Status DBImpl::Recover( if (!s.ok()) { return s; } + // If we have replication_log_listener registered, we can not write to + // MANIFEST during open, only when we are upgraded to be leaders. + if (s.ok() && !read_only && !immutable_db_options_.replication_log_listener) { + for (auto cfd : *versions_->GetColumnFamilySet()) { + // Try to trivially move files down the LSM tree to start from bottommost + // level when level_compaction_dynamic_level_bytes is enabled. This should + // only be useful when user is migrating to turning on this option. + // If a user is migrating from Level Compaction with a smaller level + // multiplier or from Universal Compaction, there may be too many + // non-empty levels and the trivial moves here are not sufficed for + // migration. Additional compactions are needed to drain unnecessary + // levels. + // + // Note that this step moves files down LSM without consulting + // SSTPartitioner. Further compactions are still needed if + // the user wants to partition SST files. + // Note that files moved in this step may not respect the compression + // option in target level. + if (cfd->ioptions()->compaction_style == + CompactionStyle::kCompactionStyleLevel && + cfd->ioptions()->level_compaction_dynamic_level_bytes && + !cfd->GetLatestMutableCFOptions()->disable_auto_compactions) { + int to_level = cfd->ioptions()->num_levels - 1; + // last level is reserved + // allow_ingest_behind does not support Level Compaction, + // and per_key_placement can have infinite compaction loop for Level + // Compaction. Adjust to_level here just to be safe. + if (cfd->ioptions()->allow_ingest_behind || + cfd->ioptions()->preclude_last_level_data_seconds > 0) { + to_level -= 1; + } + // Whether this column family has a level trivially moved + bool moved = false; + // Fill the LSM starting from to_level and going up one level at a time. + // Some loop invariants (when last level is not reserved): + // - levels in (from_level, to_level] are empty, and + // - levels in (to_level, last_level] are non-empty. 
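To make the fill-from-the-bottom logic above easier to follow, here is a small self-contained sketch (toy values only, not RocksDB code) of the same loop: levels are walked from the bottommost level upward, every non-empty level above L0 is trivially moved down to the current target level, and the target then moves up by one, preserving the invariants listed in the comment.

// Illustrative sketch only: per-level file counts stand in for real FileMetaData.
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical LSM with 7 levels; files currently sit in L1, L2 and L5.
  std::vector<int> num_files = {0, 2, 3, 0, 0, 4, 0};
  int to_level = static_cast<int>(num_files.size()) - 1;  // start at the bottom
  for (int from_level = to_level; from_level >= 0; --from_level) {
    if (num_files[from_level] == 0 || from_level == 0) {
      continue;  // L0 and empty levels are skipped entirely
    }
    if (from_level < to_level) {
      std::printf("trivially move %d files: L%d -> L%d\n",
                  num_files[from_level], from_level, to_level);
      num_files[to_level] = num_files[from_level];  // target is empty by the invariant
      num_files[from_level] = 0;
    }
    --to_level;  // levels in (to_level, last_level] are now non-empty
  }
  return 0;
}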
+ for (int from_level = to_level; from_level >= 0; --from_level) { + const std::vector& level_files = + cfd->current()->storage_info()->LevelFiles(from_level); + if (level_files.empty() || from_level == 0) { + continue; + } + assert(from_level <= to_level); + // Trivial move files from `from_level` to `to_level` + if (from_level < to_level) { + if (!moved) { + // lsm_state will look like "[1,2,3,4,5,6,0]" for an LSM with + // 7 levels + std::string lsm_state = "["; + for (int i = 0; i < cfd->ioptions()->num_levels; ++i) { + lsm_state += std::to_string( + cfd->current()->storage_info()->NumLevelFiles(i)); + if (i < cfd->ioptions()->num_levels - 1) { + lsm_state += ","; + } + } + lsm_state += "]"; + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] Trivially move files down the LSM when open " + "with level_compaction_dynamic_level_bytes=true," + " lsm_state: %s (Files are moved only if DB " + "Recovery is successful).", + cfd->GetName().c_str(), lsm_state.c_str()); + moved = true; + } + ROCKS_LOG_WARN( + immutable_db_options_.info_log, + "[%s] Moving %zu files from from_level-%d to from_level-%d", + cfd->GetName().c_str(), level_files.size(), from_level, + to_level); + VersionEdit edit; + edit.SetColumnFamily(cfd->GetID()); + for (const FileMetaData* f : level_files) { + edit.DeleteFile(from_level, f->fd.GetNumber()); + edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), + f->fd.GetFileSize(), f->smallest, f->largest, + f->fd.smallest_seqno, f->fd.largest_seqno, + f->marked_for_compaction, + f->temperature, // this can be different from + // `last_level_temperature` + f->oldest_blob_file_number, f->oldest_ancester_time, + f->file_creation_time, f->epoch_number, + f->file_checksum, f->file_checksum_func_name, + f->unique_id, f->compensated_range_deletion_size, + f->tail_size, f->user_defined_timestamps_persisted); + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] Moving #%" PRIu64 + " from from_level-%d to from_level-%d %" PRIu64 + " bytes\n", + cfd->GetName().c_str(), f->fd.GetNumber(), + from_level, to_level, f->fd.GetFileSize()); + } + recovery_ctx->UpdateVersionEdits(cfd, edit); + } + --to_level; + } + } + } + } s = SetupDBId(read_only, recovery_ctx); ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str()); #if 0 @@ -780,7 +889,8 @@ Status DBImpl::PersistentStatsProcessFormatVersion() { if (s.ok()) { ColumnFamilyOptions cfo; OptimizeForPersistentStats(&cfo); - s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + s = CreateColumnFamilyImpl(cfo, kPersistentStatsColumnFamilyName, + &handle); } if (s.ok()) { persist_stats_cf_handle_ = static_cast(handle); @@ -833,7 +943,7 @@ Status DBImpl::InitPersistStatsColumnFamily() { ColumnFamilyHandle* handle = nullptr; ColumnFamilyOptions cfo; OptimizeForPersistentStats(&cfo); - s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle); + s = CreateColumnFamilyImpl(cfo, kPersistentStatsColumnFamilyName, &handle); persist_stats_cf_handle_ = static_cast(handle); mutex_.Lock(); } @@ -845,14 +955,18 @@ Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { assert(versions_->descriptor_log_ == nullptr); Status s; if (!recovery_ctx.edit_lists_.empty()) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); s = versions_->LogAndApply( - recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, + recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, read_options, recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); } if (s.ok() && 
!(recovery_ctx.files_to_delete_.empty())) { mutex_.Unlock(); - for (const auto& fname : recovery_ctx.files_to_delete_) { - s = env_->DeleteFile(fname); + for (const auto& stale_sst_file : recovery_ctx.files_to_delete_) { + s = DeleteDBFile(&immutable_db_options_, stale_sst_file.first, + stale_sst_file.second, + /*force_bg=*/false, + /*force_fg=*/false); if (!s.ok()) { break; } @@ -863,7 +977,6 @@ Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { } void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() { -#ifndef ROCKSDB_LITE if (immutable_db_options_.wal_filter == nullptr) { return; } @@ -881,7 +994,6 @@ void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() { } wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map); -#endif // !ROCKSDB_LITE } bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number, @@ -890,7 +1002,6 @@ bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number, Status& status, bool& stop_replay, WriteBatch& batch) { -#ifndef ROCKSDB_LITE if (immutable_db_options_.wal_filter == nullptr) { return true; } @@ -976,15 +1087,6 @@ bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number, batch = new_batch; } return true; -#else // !ROCKSDB_LITE - (void)wal_number; - (void)wal_fname; - (void)reporter; - (void)status; - (void)stop_replay; - (void)batch; - return true; -#endif // ROCKSDB_LITE } // REQUIRES: wal_numbers are sorted in ascending order @@ -992,6 +1094,11 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, SequenceNumber* next_sequence, bool read_only, bool* corrupted_wal_found, RecoveryContext* recovery_ctx) { + if (immutable_db_options_.replication_log_listener) { + // No WALs if replication_log_listener exists + return Status::OK(); + } + struct LogReporter : public log::Reader::Reporter { Env* env; Logger* info_log; @@ -1118,6 +1225,9 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, std::string scratch; Slice record; + const UnorderedMap& running_ts_sz = + versions_->GetRunningColumnFamiliesTimestampSize(); + TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal", /*arg=*/nullptr); uint64_t record_checksum; @@ -1131,27 +1241,48 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, Status::Corruption("log record too small")); continue; } - // We create a new batch and initialize with a valid prot_info_ to store // the data checksums WriteBatch batch; + std::unique_ptr new_batch; status = WriteBatchInternal::SetContents(&batch, record); if (!status.ok()) { return status; } + + const UnorderedMap& record_ts_sz = + reader.GetRecordedTimestampSize(); + status = HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch); + if (!status.ok()) { + return status; + } + + bool batch_updated = new_batch != nullptr; + WriteBatch* batch_to_use = batch_updated ? new_batch.get() : &batch; TEST_SYNC_POINT_CALLBACK( - "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch); + "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", + batch_to_use); TEST_SYNC_POINT_CALLBACK( "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum", &record_checksum); status = WriteBatchInternal::UpdateProtectionInfo( - &batch, 8 /* bytes_per_key */, &record_checksum); + batch_to_use, 8 /* bytes_per_key */, + batch_updated ? 
nullptr : &record_checksum); if (!status.ok()) { return status; } - SequenceNumber sequence = WriteBatchInternal::Sequence(&batch); + SequenceNumber sequence = WriteBatchInternal::Sequence(batch_to_use); + if (sequence > kMaxSequenceNumber) { + reporter.Corruption( + record.size(), + Status::Corruption("sequence " + std::to_string(sequence) + + " is too large")); + continue; + } if (immutable_db_options_.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { @@ -1172,7 +1303,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, // and returns true. if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter, status, stop_replay_by_wal_filter, - batch)) { + *batch_to_use)) { continue; } @@ -1183,7 +1314,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, // That's why we set ignore missing column families to true bool has_valid_writes = false; status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), &flush_scheduler_, + batch_to_use, column_family_memtables_.get(), &flush_scheduler_, &trim_history_scheduler_, true, wal_number, this, false /* concurrent_memtable_writes */, next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_); @@ -1217,7 +1348,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, flushed = true; cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), - *next_sequence); + *next_sequence - 1); } } } @@ -1388,7 +1519,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, } } - if (flushed) { + if (flushed || !data_seen) { VersionEdit wal_deletion; if (immutable_db_options_.track_and_verify_wals_in_manifest) { wal_deletion.DeleteWalsBefore(max_wal_number + 1); @@ -1514,6 +1645,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kDBOpen; Arena arena; Status s; TableProperties table_properties; @@ -1569,19 +1701,22 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, TableFileCreationReason::kRecovery, 0 /* oldest_key_time */, 0 /* file_creation_time */, db_id_, db_session_id_, 0 /* target_file_size */, meta.fd.GetNumber()); - SeqnoToTimeMapping empty_seqno_time_mapping; + SeqnoToTimeMapping empty_seqno_to_time_mapping; Version* version = cfd->current(); version->Ref(); + const ReadOptions read_option(Env::IOActivity::kDBOpen); + uint64_t num_input_entries = 0; s = BuildTable( dbname_, versions_.get(), immutable_db_options_, tboptions, - file_options_for_compaction_, cfd->table_cache(), iter.get(), - std::move(range_del_iters), &meta, &blob_file_additions, + file_options_for_compaction_, read_option, cfd->table_cache(), + iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, BlobFileCreationReason::kRecovery, - empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH, + empty_seqno_to_time_mapping, &event_logger_, job_id, Env::IO_HIGH, nullptr /* table_properties */, write_hint, - nullptr /*full_history_ts_low*/, &blob_callback_, version); + nullptr /*full_history_ts_low*/, &blob_callback_, version, + &num_input_entries); version->Unref(); LogFlush(immutable_db_options_.info_log); ROCKS_LOG_DEBUG(immutable_db_options_.info_log, @@ -1595,6 +1730,19 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, 
ColumnFamilyData* cfd, if (!io_s.ok() && s.ok()) { s = io_s; } + + uint64_t total_num_entries = mem->num_entries(); + if (s.ok() && total_num_entries != num_input_entries) { + std::string msg = "Expected " + std::to_string(total_num_entries) + + " entries in memtable, but read " + + std::to_string(num_input_entries); + ROCKS_LOG_WARN(immutable_db_options_.info_log, + "[%s] [JOB %d] Level-0 flush during recover: %s", + cfd->GetName().c_str(), job_id, msg.c_str()); + if (immutable_db_options_.flush_verify_memtable_count) { + s = Status::Corruption(msg); + } + } } } ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); @@ -1613,11 +1761,28 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.oldest_blob_file_number, meta.oldest_ancester_time, meta.file_creation_time, meta.epoch_number, meta.file_checksum, meta.file_checksum_func_name, - meta.unique_id, meta.compensated_range_deletion_size); + meta.unique_id, meta.compensated_range_deletion_size, + meta.tail_size, meta.user_defined_timestamps_persisted); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); } + + // For UDT in memtable only feature, move up the cutoff timestamp whenever + // a flush happens. + const Comparator* ucmp = cfd->user_comparator(); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz > 0 && !cfd->ioptions()->persist_user_defined_timestamps) { + Slice mem_newest_udt = mem->GetNewestUDT(); + std::string full_history_ts_low = cfd->GetFullHistoryTsLow(); + if (full_history_ts_low.empty() || + ucmp->CompareTimestamp(mem_newest_udt, full_history_ts_low) >= 0) { + std::string new_full_history_ts_low; + GetFullHistoryTsLowFromU64CutoffTs(&mem_newest_udt, + &new_full_history_ts_low); + edit->SetFullHistoryTsLow(new_full_history_ts_low); + } + } } InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); @@ -1676,8 +1841,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, std::vector* handles, DB** dbptr) { const bool kSeqPerBatch = true; const bool kBatchPerTxn = true; - return DBImpl::Open(db_options, dbname, column_families, handles, dbptr, - !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN); + Status s = DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::ResetThreadStatus(); + return s; } // TODO: Implement the trimming in flush code path. 
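The DB::Open change above brackets the call with thread-status bookkeeping so that tools sampling ThreadStatus see OP_DBOPEN for the duration of the open. A minimal sketch of that pattern follows; the wrapper itself is hypothetical, while the ThreadStatusUtil calls are the ones the diff uses.

#include <utility>

#include "monitoring/thread_status_util.h"  // also added by this patch

namespace ROCKSDB_NAMESPACE {
// Hypothetical helper, not part of the patch: tag the calling thread as
// performing a DB open for the duration of `open_fn`, then clear the state.
template <typename OpenFn>
auto RunWithDbOpenThreadStatus(bool enable_thread_tracking, OpenFn&& open_fn)
    -> decltype(open_fn()) {
  ThreadStatusUtil::SetEnableTracking(enable_thread_tracking);
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN);
  auto s = std::forward<OpenFn>(open_fn)();  // e.g. the DBImpl::Open call above
  ThreadStatusUtil::ResetThreadStatus();     // always reset before returning
  return s;
}
}  // namespace ROCKSDB_NAMESPACE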
@@ -1858,11 +2027,14 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath(); RecoveryContext recovery_ctx; + impl->options_mutex_.Lock(); impl->mutex_.Lock(); // Handles create_if_missing, error_if_exists uint64_t recovered_seq(kMaxSequenceNumber); - s = impl->Recover(column_families, false, false, false, &recovered_seq, + s = impl->Recover(column_families, false /* read_only */, + false /* error_if_wal_file_exists */, + false /* error_if_data_exists_in_wals */, &recovered_seq, &recovery_ctx); if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); @@ -1937,7 +2109,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, // missing column family, create it ColumnFamilyHandle* handle = nullptr; impl->mutex_.Unlock(); - s = impl->CreateColumnFamily(cf.options, cf.name, &handle); + // NOTE: the work normally done in WrapUpCreateColumnFamilies will + // be done separately below. + s = impl->CreateColumnFamilyImpl(cf.options, cf.name, &handle); impl->mutex_.Lock(); if (s.ok()) { handles->push_back(handle); @@ -1988,9 +2162,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { // Persist RocksDB Options before scheduling the compaction. // The WriteOptionsFile() will release and lock the mutex internally. - persist_options_status = impl->WriteOptionsFile( - false /*need_mutex_lock*/, false /*need_enter_write_thread*/); - + persist_options_status = + impl->WriteOptionsFile(true /*db_mutex_already_held*/); *dbptr = impl; impl->opened_successfully_ = true; impl->DeleteObsoleteFiles(); @@ -2001,7 +2174,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } impl->mutex_.Unlock(); -#ifndef ROCKSDB_LITE auto sfm = static_cast( impl->immutable_db_options_.sst_file_manager.get()); if (s.ok() && sfm) { @@ -2085,7 +2257,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, impl->immutable_db_options_.db_paths[0].path); } -#endif // !ROCKSDB_LITE if (s.ok()) { ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p", @@ -2113,10 +2284,10 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { s = impl->StartPeriodicTaskScheduler(); } - if (s.ok()) { - s = impl->RegisterRecordSeqnoTimeWorker(); + s = impl->RegisterRecordSeqnoTimeWorker(recovery_ctx.is_new_db_); } + impl->options_mutex_.Unlock(); if (!s.ok()) { for (auto* h : *handles) { delete h; diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 0f10baf24975..997a4e2edf13 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -16,7 +16,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) @@ -30,30 +29,23 @@ DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, DBImplReadOnly::~DBImplReadOnly() {} // Implementations of the DB interface -Status DBImplReadOnly::Get(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val) { - return Get(read_options, column_family, key, pinnable_val, - /*timestamp*/ nullptr); -} +Status DBImplReadOnly::GetImpl(const ReadOptions& read_options, + const Slice& key, + GetImplOptions& get_impl_options) { + assert(get_impl_options.value != nullptr || + get_impl_options.columns != nullptr); + assert(get_impl_options.column_family); 
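The consolidated GetImpl paths below report bytes read from whichever output target the caller supplied, either a plain value slice or a wide-column result. A small illustrative helper capturing that accounting; the helper is not part of the patch, but PinnableSlice::size() and PinnableWideColumns::serialized_size() are the calls the diff itself uses.

#include <cstddef>

#include "rocksdb/slice.h"
#include "rocksdb/wide_columns.h"

// Illustrative only: report the size of whichever result object was populated.
size_t ReadResultSize(const ROCKSDB_NAMESPACE::PinnableSlice* value,
                      const ROCKSDB_NAMESPACE::PinnableWideColumns* columns) {
  if (value != nullptr) {
    return value->size();
  }
  if (columns != nullptr) {
    return columns->serialized_size();
  }
  return 0;  // neither output target was supplied
}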
-Status DBImplReadOnly::Get(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* pinnable_val, - std::string* timestamp) { - assert(pinnable_val != nullptr); - // TODO: stopwatch DB_GET needed?, perf timer needed? - PERF_TIMER_GUARD(get_snapshot_time); + Status s; - assert(column_family); if (read_options.timestamp) { - const Status s = FailIfTsMismatchCf( - column_family, *(read_options.timestamp), /*ts_for_read=*/true); + s = FailIfTsMismatchCf(get_impl_options.column_family, + *(read_options.timestamp)); if (!s.ok()) { return s; } } else { - const Status s = FailIfCfHasTs(column_family); + s = FailIfCfHasTs(get_impl_options.column_family); if (!s.ok()) { return s; } @@ -61,62 +53,99 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, // Clear the timestamps for returning results so that we can distinguish // between tombstone or key that has never been written - if (timestamp) { - timestamp->clear(); + if (get_impl_options.timestamp) { + get_impl_options.timestamp->clear(); } - const Comparator* ucmp = column_family->GetComparator(); - assert(ucmp); - std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr; + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); - Status s; + const Comparator* ucmp = get_impl_options.column_family->GetComparator(); + assert(ucmp); + std::string* ts = + ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr; SequenceNumber snapshot = versions_->LastSequence(); GetWithTimestampReadCallback read_cb(snapshot); - auto cfh = static_cast_with_check(column_family); + auto cfh = static_cast_with_check( + get_impl_options.column_family); auto cfd = cfh->cfd(); if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(column_family, key); + tracer_->Get(get_impl_options.column_family, key); } } + + // In read-only mode Get(), no super version operation is needed (i.e. + // GetAndRefSuperVersion and ReturnAndCleanupSuperVersion) SuperVersion* super_version = cfd->GetSuperVersion(); + if (read_options.timestamp && read_options.timestamp->size() > 0) { + s = FailIfReadCollapsedHistory(cfd, super_version, + *(read_options.timestamp)); + if (!s.ok()) { + return s; + } + } MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); - if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), - /*columns=*/nullptr, ts, &s, &merge_context, - &max_covering_tombstone_seq, read_options, - false /* immutable_memtable */, &read_cb)) { - pinnable_val->PinSelf(); + + // Look up starts here + if (super_version->mem->Get( + lkey, + get_impl_options.value ? 
get_impl_options.value->GetSelf() : nullptr, + get_impl_options.columns, ts, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, &read_cb)) { + if (get_impl_options.value) { + get_impl_options.value->PinSelf(); + } RecordTick(stats_, MEMTABLE_HIT); } else { PERF_TIMER_GUARD(get_from_output_files_time); PinnedIteratorsManager pinned_iters_mgr; super_version->current->Get( - read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s, - &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, + read_options, lkey, get_impl_options.value, get_impl_options.columns, + ts, &s, &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, /*value_found*/ nullptr, /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr, /*do_merge*/ true); RecordTick(stats_, MEMTABLE_MISS); } - RecordTick(stats_, NUMBER_KEYS_READ); - size_t size = pinnable_val->size(); - RecordTick(stats_, BYTES_READ, size); - RecordInHistogram(stats_, BYTES_PER_READ, size); - PERF_COUNTER_ADD(get_read_bytes, size); + { + RecordTick(stats_, NUMBER_KEYS_READ); + size_t size = 0; + if (get_impl_options.value) { + size = get_impl_options.value->size(); + } else if (get_impl_options.columns) { + size = get_impl_options.columns->serialized_size(); + } + RecordTick(stats_, BYTES_READ, size); + RecordInHistogram(stats_, BYTES_PER_READ, size); + PERF_COUNTER_ADD(get_read_bytes, size); + } return s; } -Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } assert(column_family); if (read_options.timestamp) { - const Status s = FailIfTsMismatchCf( - column_family, *(read_options.timestamp), /*ts_for_read=*/true); + const Status s = + FailIfTsMismatchCf(column_family, *(read_options.timestamp)); if (!s.ok()) { return NewErrorIterator(s); } @@ -129,6 +158,14 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); + if (read_options.timestamp && read_options.timestamp->size() > 0) { + const Status s = FailIfReadCollapsedHistory(cfd, super_version, + *(read_options.timestamp)); + if (!s.ok()) { + cfd->GetSuperVersion()->Unref(); + return NewErrorIterator(s); + } + } SequenceNumber latest_snapshot = versions_->LastSequence(); SequenceNumber read_seq = read_options.snapshot != nullptr @@ -155,8 +192,7 @@ Status DBImplReadOnly::NewIterators( if (read_options.timestamp) { for (auto* cf : column_families) { assert(cf); - const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), - /*ts_for_read=*/true); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp)); if (!s.ok()) { return s; } @@ -184,9 +220,27 @@ Status DBImplReadOnly::NewIterators( ->number_ : latest_snapshot; + autovector> cfd_to_sv; + + const bool check_read_ts = + read_options.timestamp && read_options.timestamp->size() > 0; for (auto cfh : 
column_families) { auto* cfd = static_cast_with_check(cfh)->cfd(); auto* sv = cfd->GetSuperVersion()->Ref(); + cfd_to_sv.emplace_back(cfd, sv); + if (check_read_ts) { + const Status s = + FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); + if (!s.ok()) { + for (auto prev_entry : cfd_to_sv) { + std::get<1>(prev_entry)->Unref(); + } + return s; + } + } + } + assert(cfd_to_sv.size() == column_families.size()); + for (auto [cfd, sv] : cfd_to_sv) { auto* db_iter = NewArenaWrappedDbIterator( env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current, read_seq, @@ -321,21 +375,5 @@ Status DBImplReadOnly::OpenForReadOnlyWithoutCheck( return s; } -#else // !ROCKSDB_LITE - -Status DB::OpenForReadOnly(const Options& /*options*/, - const std::string& /*dbname*/, DB** /*dbptr*/, - bool /*error_if_wal_file_exists*/) { - return Status::NotSupported("Not supported in ROCKSDB_LITE."); -} - -Status DB::OpenForReadOnly( - const DBOptions& /*db_options*/, const std::string& /*dbname*/, - const std::vector& /*column_families*/, - std::vector* /*handles*/, DB** /*dbptr*/, - bool /*error_if_wal_file_exists*/) { - return Status::NotSupported("Not supported in ROCKSDB_LITE."); -} -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index b876a0fdaf0a..32bc85607061 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -25,18 +24,14 @@ class DBImplReadOnly : public DBImpl { virtual ~DBImplReadOnly(); // Implementations of the DB interface - using DB::Get; - virtual Status Get(const ReadOptions& options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) override; - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - std::string* timestamp) override; + using DBImpl::GetImpl; + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions& get_impl_options) override; // TODO: Implement ReadOnly MultiGet? 
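The NewIterators change above references every column family's super version first and, if the read-timestamp check fails for any of them, unrefs everything acquired so far before returning. A generic sketch of that acquire-all-or-none pattern, assuming placeholder Resource and callback types rather than the RocksDB classes:

#include <functional>
#include <vector>

// Generic sketch: take a reference on every input, validate each, and release
// the partially built set if any validation fails.
template <typename Resource>
bool AcquireAllOrNone(const std::vector<Resource*>& inputs,
                      const std::function<bool(Resource*)>& validate,
                      const std::function<void(Resource*)>& unref,
                      std::vector<Resource*>* acquired) {
  for (Resource* r : inputs) {
    acquired->push_back(r);  // reference assumed to be taken by the caller
    if (!validate(r)) {
      for (Resource* prev : *acquired) {
        unref(prev);  // undo every reference taken so far
      }
      acquired->clear();
      return false;
    }
  }
  return true;
}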
using DBImpl::NewIterator; - virtual Iterator* NewIterator(const ReadOptions&, + virtual Iterator* NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) override; virtual Status NewIterators( @@ -58,6 +53,10 @@ class DBImplReadOnly : public DBImpl { const WideColumns& /* columns */) override { return Status::NotSupported("Not supported operation in read only mode."); } + Status PutEntity(const WriteOptions& /* options */, const Slice& /* key */, + const AttributeGroups& /* attribute_groups */) override { + return Status::NotSupported("Not supported operation in read only mode."); + } using DBImpl::Merge; virtual Status Merge(const WriteOptions& /*options*/, @@ -143,15 +142,29 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& /*options*/, + const std::string& /*column_family_name*/, + const ImportColumnFamilyOptions& /*import_options*/, + const std::vector& /*metadatas*/, + ColumnFamilyHandle** /*handle*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + // FIXME: some missing overrides for more "write" functions protected: -#ifndef ROCKSDB_LITE Status FlushForGetLiveFiles() override { // No-op for read-only DB return Status::OK(); } -#endif // !ROCKSDB_LITE private: // A "helper" function for DB::OpenForReadOnly without column families @@ -166,5 +179,3 @@ class DBImplReadOnly : public DBImpl { friend class DB; }; } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index 5189d17d9843..235a528ba08a 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -14,10 +14,10 @@ #include "monitoring/perf_context_imp.h" #include "rocksdb/configurable.h" #include "util/cast_util.h" +#include "util/write_batch_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE DBImplSecondary::DBImplSecondary(const DBOptions& db_options, const std::string& dbname, std::string secondary_path) @@ -198,6 +198,9 @@ Status DBImplSecondary::RecoverLogFiles( } assert(reader != nullptr); } + + const UnorderedMap& running_ts_sz = + versions_->GetRunningColumnFamiliesTimestampSize(); for (auto log_number : log_numbers) { auto it = log_readers_.find(log_number); assert(it != log_readers_.end()); @@ -225,6 +228,14 @@ Status DBImplSecondary::RecoverLogFiles( if (!status.ok()) { break; } + const UnorderedMap& record_ts_sz = + reader->GetRecordedTimestampSize(); + status = HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency); + if (!status.ok()) { + break; + } SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch); std::vector column_family_ids; status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids); @@ -328,85 +339,93 @@ Status DBImplSecondary::RecoverLogFiles( return status; } -// Implementation of the DB interface -Status DBImplSecondary::Get(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value) { - return GetImpl(read_options, column_family, key, value, - /*timestamp*/ nullptr); -} - 
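Worth noting in the secondary's RecoverLogFiles above: WAL replay uses kVerifyConsistency, whereas the primary's recovery path earlier in this diff uses kReconcileInconsistency. A rough sketch of the difference between the two modes, using a simplified stand-in enum and helper rather than the RocksDB declarations:

#include <cstddef>

enum class TsConsistencyMode { kVerifyConsistency, kReconcileInconsistency };

// Rough sketch: compare the timestamp size recorded in the WAL against the
// size the running column family expects.
bool CheckRecordTimestampSize(size_t running_ts_sz, size_t record_ts_sz,
                              TsConsistencyMode mode, bool* needs_rewrite) {
  *needs_rewrite = false;
  if (running_ts_sz == record_ts_sz) {
    return true;  // sizes agree, replay the batch as-is
  }
  if (mode == TsConsistencyMode::kVerifyConsistency) {
    return false;  // secondary replay treats a mismatch as an error
  }
  // kReconcileInconsistency: primary recovery rewrites the batch so that its
  // timestamps match the currently configured size.
  *needs_rewrite = true;
  return true;
}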
-Status DBImplSecondary::Get(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, const Slice& key, - PinnableSlice* value, std::string* timestamp) { - return GetImpl(read_options, column_family, key, value, timestamp); -} - Status DBImplSecondary::GetImpl(const ReadOptions& read_options, - ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* pinnable_val, - std::string* timestamp) { - assert(pinnable_val != nullptr); - PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); - StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); - PERF_TIMER_GUARD(get_snapshot_time); + const Slice& key, + GetImplOptions& get_impl_options) { + assert(get_impl_options.value != nullptr || + get_impl_options.columns != nullptr); + assert(get_impl_options.column_family); + + Status s; - assert(column_family); if (read_options.timestamp) { - const Status s = FailIfTsMismatchCf( - column_family, *(read_options.timestamp), /*ts_for_read=*/true); + s = FailIfTsMismatchCf(get_impl_options.column_family, + *(read_options.timestamp)); if (!s.ok()) { return s; } } else { - const Status s = FailIfCfHasTs(column_family); + s = FailIfCfHasTs(get_impl_options.column_family); if (!s.ok()) { return s; } } - // Clear the timestamp for returning results so that we can distinguish - // between tombstone or key that has never been written later. - if (timestamp) { - timestamp->clear(); + // Clear the timestamps for returning results so that we can distinguish + // between tombstone or key that has never been written + if (get_impl_options.timestamp) { + get_impl_options.timestamp->clear(); } - auto cfh = static_cast(column_family); - ColumnFamilyData* cfd = cfh->cfd(); + PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); + StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); + PERF_TIMER_GUARD(get_snapshot_time); + + const Comparator* ucmp = get_impl_options.column_family->GetComparator(); + assert(ucmp); + std::string* ts = + ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr; + SequenceNumber snapshot = versions_->LastSequence(); + GetWithTimestampReadCallback read_cb(snapshot); + auto cfh = static_cast_with_check( + get_impl_options.column_family); + auto cfd = cfh->cfd(); if (tracer_) { InstrumentedMutexLock lock(&trace_mutex_); if (tracer_) { - tracer_->Get(column_family, key); + tracer_->Get(get_impl_options.column_family, key); } } + // Acquire SuperVersion SuperVersion* super_version = GetAndRefSuperVersion(cfd); - SequenceNumber snapshot = versions_->LastSequence(); - GetWithTimestampReadCallback read_cb(snapshot); + if (read_options.timestamp && read_options.timestamp->size() > 0) { + s = FailIfReadCollapsedHistory(cfd, super_version, + *(read_options.timestamp)); + if (!s.ok()) { + ReturnAndCleanupSuperVersion(cfd, super_version); + return s; + } + } MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; - Status s; LookupKey lkey(key, snapshot, read_options.timestamp); PERF_TIMER_STOP(get_snapshot_time); - bool done = false; - const Comparator* ucmp = column_family->GetComparator(); - assert(ucmp); - std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr; - if (super_version->mem->Get(lkey, pinnable_val->GetSelf(), - /*columns=*/nullptr, ts, &s, &merge_context, - &max_covering_tombstone_seq, read_options, - false /* immutable_memtable */, &read_cb)) { + + // Look up starts here + if (super_version->mem->Get( + lkey, + get_impl_options.value ? 
get_impl_options.value->GetSelf() : nullptr, + get_impl_options.columns, ts, &s, &merge_context, + &max_covering_tombstone_seq, read_options, + false /* immutable_memtable */, &read_cb)) { done = true; - pinnable_val->PinSelf(); + if (get_impl_options.value) { + get_impl_options.value->PinSelf(); + } RecordTick(stats_, MEMTABLE_HIT); } else if ((s.ok() || s.IsMergeInProgress()) && super_version->imm->Get( - lkey, pinnable_val->GetSelf(), /*columns=*/nullptr, ts, &s, - &merge_context, &max_covering_tombstone_seq, read_options, - &read_cb)) { + lkey, + get_impl_options.value ? get_impl_options.value->GetSelf() + : nullptr, + get_impl_options.columns, ts, &s, &merge_context, + &max_covering_tombstone_seq, read_options, &read_cb)) { done = true; - pinnable_val->PinSelf(); + if (get_impl_options.value) { + get_impl_options.value->PinSelf(); + } RecordTick(stats_, MEMTABLE_HIT); } if (!done && !s.ok() && !s.IsMergeInProgress()) { @@ -417,8 +436,8 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, PERF_TIMER_GUARD(get_from_output_files_time); PinnedIteratorsManager pinned_iters_mgr; super_version->current->Get( - read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s, - &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, + read_options, lkey, get_impl_options.value, get_impl_options.columns, + ts, &s, &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr, /*value_found*/ nullptr, /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr, /*do_merge*/ true); @@ -428,7 +447,12 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, PERF_TIMER_GUARD(get_post_process_time); ReturnAndCleanupSuperVersion(cfd, super_version); RecordTick(stats_, NUMBER_KEYS_READ); - size_t size = pinnable_val->size(); + size_t size = 0; + if (get_impl_options.value) { + size = get_impl_options.value->size(); + } else if (get_impl_options.columns) { + size = get_impl_options.columns->serialized_size(); + } RecordTick(stats_, BYTES_READ, size); RecordTimeToHistogram(stats_, BYTES_PER_READ, size); PERF_COUNTER_ADD(get_read_bytes, size); @@ -436,8 +460,18 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, return s; } -Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, +Iterator* DBImplSecondary::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } if (read_options.managed) { return NewErrorIterator( Status::NotSupported("Managed iterator is not supported anymore.")); @@ -449,8 +483,8 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, assert(column_family); if (read_options.timestamp) { - const Status s = FailIfTsMismatchCf( - column_family, *(read_options.timestamp), /*ts_for_read=*/true); + const Status s = + FailIfTsMismatchCf(column_family, *(read_options.timestamp)); if (!s.ok()) { return NewErrorIterator(s); } @@ -474,17 +508,25 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, Status::NotSupported("snapshot not supported in secondary mode")); } else { 
SequenceNumber snapshot(kMaxSequenceNumber); - result = NewIteratorImpl(read_options, cfd, snapshot, read_callback); + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + if (read_options.timestamp && read_options.timestamp->size() > 0) { + const Status s = + FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); + if (!s.ok()) { + CleanupSuperVersion(sv); + return NewErrorIterator(s); + } + } + result = NewIteratorImpl(read_options, cfd, sv, snapshot, read_callback); } return result; } ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( const ReadOptions& read_options, ColumnFamilyData* cfd, - SequenceNumber snapshot, ReadCallback* read_callback, - bool expose_blob_index, bool allow_refresh) { + SuperVersion* super_version, SequenceNumber snapshot, + ReadCallback* read_callback, bool expose_blob_index, bool allow_refresh) { assert(nullptr != cfd); - SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); assert(snapshot == kMaxSequenceNumber); snapshot = versions_->LastSequence(); assert(snapshot != kMaxSequenceNumber); @@ -493,7 +535,7 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( super_version->current, snapshot, super_version->mutable_cf_options.max_sequential_skip_in_iterations, super_version->version_number, read_callback, this, cfd, - expose_blob_index, read_options.snapshot ? false : allow_refresh); + expose_blob_index, allow_refresh); auto internal_iter = NewInternalIterator( db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(), snapshot, /* allow_unprepared_value */ true, db_iter); @@ -502,9 +544,19 @@ ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl( } Status DBImplSecondary::NewIterators( - const ReadOptions& read_options, + const ReadOptions& _read_options, const std::vector& column_families, std::vector* iterators) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return Status::InvalidArgument( + "Can only call NewIterators with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } if (read_options.managed) { return Status::NotSupported("Managed iterator is not supported anymore."); } @@ -520,8 +572,7 @@ Status DBImplSecondary::NewIterators( if (read_options.timestamp) { for (auto* cf : column_families) { assert(cf); - const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp), - /*ts_for_read=*/true); + const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp)); if (!s.ok()) { return s; } @@ -545,10 +596,28 @@ Status DBImplSecondary::NewIterators( return Status::NotSupported("snapshot not supported in secondary mode"); } else { SequenceNumber read_seq(kMaxSequenceNumber); + autovector> cfd_to_sv; + const bool check_read_ts = + read_options.timestamp && read_options.timestamp->size() > 0; for (auto cfh : column_families) { ColumnFamilyData* cfd = static_cast(cfh)->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(this); + cfd_to_sv.emplace_back(cfd, sv); + if (check_read_ts) { + const Status s = + FailIfReadCollapsedHistory(cfd, sv, *(read_options.timestamp)); + if (!s.ok()) { + for (auto prev_entry : cfd_to_sv) { + CleanupSuperVersion(std::get<1>(prev_entry)); + } + return s; + } + } + } + assert(cfd_to_sv.size() == column_families.size()); + for (auto [cfd, sv] : cfd_to_sv) { iterators->push_back( 
- NewIteratorImpl(read_options, cfd, read_seq, read_callback)); + NewIteratorImpl(read_options, cfd, sv, read_seq, read_callback)); } } return Status::OK(); @@ -816,7 +885,7 @@ Status DBImplSecondary::CompactWithoutInstallation( *mutable_cf_options, mutable_db_options_, 0)); assert(c != nullptr); - c->SetInputVersion(version); + c->FinalizeInputInfo(version); // Create output directory if it's not existed yet std::unique_ptr output_dir; @@ -934,6 +1003,8 @@ Status DB::OpenAndCompact( delete db; if (s.ok()) { return serialization_status; + } else { + serialization_status.PermitUncheckedError(); } return s; } @@ -946,22 +1017,5 @@ Status DB::OpenAndCompact( output, override_options); } -#else // !ROCKSDB_LITE - -Status DB::OpenAsSecondary(const Options& /*options*/, - const std::string& /*name*/, - const std::string& /*secondary_path*/, - DB** /*dbptr*/) { - return Status::NotSupported("Not supported in ROCKSDB_LITE."); -} - -Status DB::OpenAsSecondary( - const DBOptions& /*db_options*/, const std::string& /*dbname*/, - const std::string& /*secondary_path*/, - const std::vector& /*column_families*/, - std::vector* /*handles*/, DB** /*dbptr*/) { - return Status::NotSupported("Not supported in ROCKSDB_LITE."); -} -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl_secondary.h b/db/db_impl/db_impl_secondary.h index eb93618752cd..12a8bbdd7070 100644 --- a/db/db_impl/db_impl_secondary.h +++ b/db/db_impl/db_impl_secondary.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -86,8 +85,6 @@ class DBImplSecondary : public DBImpl { bool error_if_data_exists_in_wals, uint64_t* = nullptr, RecoveryContext* recovery_ctx = nullptr) override; - // Implementations of the DB interface. - using DB::Get; // Can return IOError due to files being deleted by the primary. To avoid // IOError in this case, application can coordinate between primary and // secondaries so that primary will not delete files that are currently being @@ -97,16 +94,9 @@ class DBImplSecondary : public DBImpl { // workaround, the secondaries can be opened with `max_open_files=-1` so that // it eagerly keeps all talbe files open and is able to access the contents of // deleted files via prior open fd. - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value) override; - - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - std::string* timestamp) override; - - Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - std::string* timestamp); + using DBImpl::GetImpl; + Status GetImpl(const ReadOptions& options, const Slice& key, + GetImplOptions& get_impl_options) override; using DBImpl::NewIterator; // Operations on the created iterators can return IOError due to files being @@ -118,17 +108,17 @@ class DBImplSecondary : public DBImpl { // deleted. As a partial hacky workaround, the secondaries can be opened with // `max_open_files=-1` so that it eagerly keeps all talbe files open and is // able to access the contents of deleted files via prior open fd. 
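Earlier in this hunk, DB::OpenAndCompact now explicitly calls PermitUncheckedError() on the serialization status it is about to drop, so ASSERT_STATUS_CHECKED builds do not abort over an unexamined Status. A condensed sketch of that choice; the helper function is illustrative, while PermitUncheckedError() is the real Status method the diff relies on.

#include "rocksdb/status.h"

// Illustrative only: prefer the compaction error, but mark the status being
// discarded as checked first.
ROCKSDB_NAMESPACE::Status PickPrimaryError(
    ROCKSDB_NAMESPACE::Status compaction_status,
    ROCKSDB_NAMESPACE::Status serialization_status) {
  if (compaction_status.ok()) {
    return serialization_status;  // only the serialization outcome matters now
  }
  serialization_status.PermitUncheckedError();  // deliberately ignored
  return compaction_status;
}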
- Iterator* NewIterator(const ReadOptions&, + Iterator* NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) override; ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options, - ColumnFamilyData* cfd, + ColumnFamilyData* cfd, SuperVersion* sv, SequenceNumber snapshot, ReadCallback* read_callback, bool expose_blob_index = false, bool allow_refresh = true); - Status NewIterators(const ReadOptions& options, + Status NewIterators(const ReadOptions& _read_options, const std::vector& column_families, std::vector* iterators) override; @@ -146,6 +136,10 @@ class DBImplSecondary : public DBImpl { const WideColumns& /* columns */) override { return Status::NotSupported("Not supported operation in secondary mode."); } + Status PutEntity(const WriteOptions& /* options */, const Slice& /* key */, + const AttributeGroups& /* attribute_groups */) override { + return Status::NotSupported("Not supported operation in secondary mode."); + } using DBImpl::Merge; Status Merge(const WriteOptions& /*options*/, @@ -269,91 +263,10 @@ class DBImplSecondary : public DBImpl { #endif // NDEBUG protected: -#ifndef ROCKSDB_LITE Status FlushForGetLiveFiles() override { // No-op for read-only DB return Status::OK(); } -#endif // !ROCKSDB_LITE - - // ColumnFamilyCollector is a write batch handler which does nothing - // except recording unique column family IDs - class ColumnFamilyCollector : public WriteBatch::Handler { - std::unordered_set column_family_ids_; - - Status AddColumnFamilyId(uint32_t column_family_id) { - if (column_family_ids_.find(column_family_id) == - column_family_ids_.end()) { - column_family_ids_.insert(column_family_id); - } - return Status::OK(); - } - - public: - explicit ColumnFamilyCollector() {} - - ~ColumnFamilyCollector() override {} - - Status PutCF(uint32_t column_family_id, const Slice&, - const Slice&) override { - return AddColumnFamilyId(column_family_id); - } - - Status DeleteCF(uint32_t column_family_id, const Slice&) override { - return AddColumnFamilyId(column_family_id); - } - - Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { - return AddColumnFamilyId(column_family_id); - } - - Status DeleteRangeCF(uint32_t column_family_id, const Slice&, - const Slice&) override { - return AddColumnFamilyId(column_family_id); - } - - Status MergeCF(uint32_t column_family_id, const Slice&, - const Slice&) override { - return AddColumnFamilyId(column_family_id); - } - - Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, - const Slice&) override { - return AddColumnFamilyId(column_family_id); - } - - Status MarkBeginPrepare(bool) override { return Status::OK(); } - - Status MarkEndPrepare(const Slice&) override { return Status::OK(); } - - Status MarkRollback(const Slice&) override { return Status::OK(); } - - Status MarkCommit(const Slice&) override { return Status::OK(); } - - Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { - return Status::OK(); - } - - Status MarkNoop(bool) override { return Status::OK(); } - - const std::unordered_set& column_families() const { - return column_family_ids_; - } - }; - - Status CollectColumnFamilyIdsFromWriteBatch( - const WriteBatch& batch, std::vector* column_family_ids) { - assert(column_family_ids != nullptr); - column_family_ids->clear(); - ColumnFamilyCollector handler; - Status s = batch.Iterate(&handler); - if (s.ok()) { - for (const auto& cf : handler.column_families()) { - column_family_ids->push_back(cf); - } - } - return s; - } bool OwnTablesAndLogs() 
const override { // Currently, the secondary instance does not own the database files. It @@ -406,5 +319,3 @@ class DBImplSecondary : public DBImpl { }; } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 3b9bd7b80ebb..2f1e93f4d6c0 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -31,7 +31,7 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts, const Slice& val) { - const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + const Status s = FailIfTsMismatchCf(column_family, ts); if (!s.ok()) { return s; } @@ -49,6 +49,17 @@ Status DBImpl::PutEntity(const WriteOptions& options, return DB::PutEntity(options, column_family, key, columns); } +Status DBImpl::PutEntity(const WriteOptions& options, const Slice& key, + const AttributeGroups& attribute_groups) { + for (const AttributeGroup& ag : attribute_groups) { + const Status s = FailIfCfHasTs(ag.column_family()); + if (!s.ok()) { + return s; + } + } + return DB::PutEntity(options, key, attribute_groups); +} + Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { const Status s = FailIfCfHasTs(column_family); @@ -65,7 +76,7 @@ Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts, const Slice& val) { - const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + const Status s = FailIfTsMismatchCf(column_family, ts); if (!s.ok()) { return s; } @@ -84,7 +95,7 @@ Status DBImpl::Delete(const WriteOptions& write_options, Status DBImpl::Delete(const WriteOptions& write_options, ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts) { - const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + const Status s = FailIfTsMismatchCf(column_family, ts); if (!s.ok()) { return s; } @@ -104,7 +115,7 @@ Status DBImpl::SingleDelete(const WriteOptions& write_options, Status DBImpl::SingleDelete(const WriteOptions& write_options, ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts) { - const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + const Status s = FailIfTsMismatchCf(column_family, ts); if (!s.ok()) { return s; } @@ -125,7 +136,7 @@ Status DBImpl::DeleteRange(const WriteOptions& write_options, ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key, const Slice& ts) { - const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false); + const Status s = FailIfTsMismatchCf(column_family, ts); if (!s.ok()) { return s; } @@ -150,7 +161,6 @@ Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { return s; } -#ifndef ROCKSDB_LITE Status DBImpl::WriteWithCallback(const WriteOptions& write_options, WriteBatch* my_batch, WriteCallback* callback) { @@ -164,7 +174,6 @@ Status DBImpl::WriteWithCallback(const WriteOptions& write_options, } return s; } -#endif // ROCKSDB_LITE // The main write queue. This is the only write queue that updates LastSequence. 
// When using one write queue, the same sequence also indicates the last @@ -429,17 +438,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, IOStatus io_s; Status pre_release_cb_status; if (status.ok()) { - // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock - // grabs but does not seem thread-safe. - if (tracer_) { - InstrumentedMutexLock lock(&trace_mutex_); - if (tracer_ && tracer_->IsWriteOrderPreserved()) { - for (auto* writer : write_group) { - // TODO: maybe handle the tracing status? - tracer_->Write(writer->batch).PermitUncheckedError(); - } - } - } // Rules for when we can update the memtable concurrently // 1. supported by memtable // 2. Puts are not okay if inplace_update_support @@ -472,6 +470,20 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } } } + // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock + // grabs but does not seem thread-safe. + if (tracer_) { + InstrumentedMutexLock lock(&trace_mutex_); + if (tracer_ && tracer_->IsWriteOrderPreserved()) { + for (auto* writer : write_group) { + if (writer->CallbackFailed()) { + continue; + } + // TODO: maybe handle the tracing status? + tracer_->Write(writer->batch).PermitUncheckedError(); + } + } + } // Note about seq_per_batch_: either disableWAL is set for the entire write // group or not. In either case we inc seq for each write batch with no // failed callback. This means that there could be a batch with @@ -654,7 +666,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, log_write_mutex_.Unlock(); if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); } // Requesting sync with two_write_queues_ is expected to be very rare. We @@ -815,7 +829,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (w.status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - w.status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + w.status = ApplyWALToManifest(read_options, &synced_wals); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } @@ -1259,6 +1275,9 @@ Status DBImpl::PreprocessWrite(const WriteOptions& write_options, // exceeded at this point so no new write (including current one) will go // through until memory usage is decreased. 
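Immediately after this comment, the diff adds a dedicated counter for stalls triggered by the write buffer manager, separate from the generic stall timing stats. A rough sketch of that flow, with callbacks standing in for the manager check, the stats sink, and the blocking wait; only the stall decision logic mirrors the diff.

#include "rocksdb/status.h"

// Rough sketch, not the actual DBImpl code: record a dedicated stall event,
// then either fail fast (no_slowdown) or block until writes may proceed.
template <typename ShouldStallFn, typename CountStallFn, typename WaitFn>
ROCKSDB_NAMESPACE::Status MaybeStallForWriteBufferManager(
    bool no_slowdown, ShouldStallFn&& should_stall, CountStallFn&& count_stall,
    WaitFn&& wait_until_ok) {
  if (!should_stall()) {
    return ROCKSDB_NAMESPACE::Status::OK();
  }
  count_stall();  // e.g. bump kIntStatsWriteBufferManagerLimitStopsCounts
  if (no_slowdown) {
    return ROCKSDB_NAMESPACE::Status::Incomplete("Write stall");
  }
  wait_until_ok();  // block until memory usage drops below the limit
  return ROCKSDB_NAMESPACE::Status::OK();
}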
if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) { + default_cf_internal_stats_->AddDBStats( + InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, 1, + true /* concurrent */); if (write_options.no_slowdown) { status = Status::Incomplete("Write stall"); } else { @@ -1369,7 +1388,13 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, if (UNLIKELY(needs_locking)) { log_write_mutex_.Lock(); } - IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority); + IOStatus io_s = log_writer->MaybeAddUserDefinedTimestampSizeRecord( + versions_->GetColumnFamiliesTimestampSizeForRecord(), + rate_limiter_priority); + if (!io_s.ok()) { + return io_s; + } + io_s = log_writer->AddRecord(log_entry, rate_limiter_priority); if (UNLIKELY(needs_locking)) { log_write_mutex_.Unlock(); @@ -1591,14 +1616,40 @@ Status DBImpl::WriteRecoverableState() { } void DBImpl::SelectColumnFamiliesForAtomicFlush( - autovector* cfds) { - for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + autovector* selected_cfds, + const autovector& provided_candidate_cfds) { + mutex_.AssertHeld(); + assert(selected_cfds); + + autovector candidate_cfds; + + // Generate candidate cfds if not provided + if (provided_candidate_cfds.empty()) { + for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + candidate_cfds.push_back(cfd); + } + } + } else { + candidate_cfds = provided_candidate_cfds; + } + + for (ColumnFamilyData* cfd : candidate_cfds) { if (cfd->IsDropped()) { continue; } if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load()) { - cfds->push_back(cfd); + selected_cfds->push_back(cfd); + } + } + + // Unref the newly generated candidate cfds (when not provided) in + // `candidate_cfds` + if (provided_candidate_cfds.empty()) { + for (auto candidate_cfd : candidate_cfds) { + candidate_cfd->UnrefAndTryDelete(); } } } @@ -1842,11 +1893,9 @@ uint64_t DBImpl::GetMaxTotalWalSize() const { Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, const WriteOptions& write_options) { mutex_.AssertHeld(); - uint64_t time_delayed = 0; + uint64_t start_time = 0; bool delayed = false; { - StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, - &time_delayed); // To avoid parallel timed delays (bad throttling), only support them // on the primary write queue. uint64_t delay; @@ -1862,6 +1911,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); } + start_time = immutable_db_options_.clock->NowMicros(); TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep"); // Notify write_thread about the stall so it can setup a barrier and @@ -1874,7 +1924,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, // (slightly longer because WriteController minimum delay is 1ms, in // case of sleep imprecision, rounding, etc.) const uint64_t kDelayInterval = 1001; - uint64_t stall_end = sw.start_time() + delay; + uint64_t stall_end = start_time + delay; while (write_controller_.NeedsDelay()) { if (immutable_db_options_.clock->NowMicros() >= stall_end) { // We already delayed this write `delay` microseconds @@ -1889,11 +1939,11 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, write_thread.EndWriteStall(); } - // Don't wait if there's a background error, even if its a soft error. 
We - // might wait here indefinitely as the background compaction may never - // finish successfully, resulting in the stall condition lasting - // indefinitely - while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() && + // Don't wait if there's a background error that is not pending recovery + // since recovery might never be attempted. + while ((error_handler_.GetBGError().ok() || + error_handler_.IsRecoveryInProgress()) && + write_controller_.IsStopped() && !shutting_down_.load(std::memory_order_relaxed)) { if (write_options.no_slowdown) { return Status::Incomplete("Write stall"); @@ -1903,16 +1953,23 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, // Notify write_thread about the stall so it can setup a barrier and // fail any pending writers with no_slowdown write_thread.BeginWriteStall(); - TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); + if (&write_thread == &write_thread_) { + TEST_SYNC_POINT("DBImpl::DelayWrite:Wait"); + } else { + TEST_SYNC_POINT("DBImpl::DelayWrite:NonmemWait"); + } bg_cv_.Wait(); + TEST_SYNC_POINT_CALLBACK("DBImpl::DelayWrite:AfterWait", &mutex_); write_thread.EndWriteStall(); } } assert(!delayed || !write_options.no_slowdown); if (delayed) { + auto time_delayed = immutable_db_options_.clock->NowMicros() - start_time; default_cf_internal_stats_->AddDBStats( InternalStats::kIntStatsWriteStallMicros, time_delayed); RecordTick(stats_, STALL_MICROS, time_delayed); + RecordInHistogram(stats_, WRITE_STALL, time_delayed); } // If DB is not in read-only mode and write_controller is not stopping @@ -1979,9 +2036,13 @@ Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options, // a chance to run. Now we guarantee we are still slowly making // progress. PERF_TIMER_GUARD(write_delay_time); - write_controller_.low_pri_rate_limiter()->Request( - my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */, - RateLimiter::OpType::kWrite); + auto data_size = my_batch->GetDataSize(); + while (data_size > 0) { + size_t allowed = write_controller_.low_pri_rate_limiter()->RequestToken( + data_size, 0 /* alignment */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + data_size -= allowed; + } } } return Status::OK(); @@ -2123,7 +2184,6 @@ Status DBImpl::ScheduleFlushes(WriteContext* context) { return status; } -#ifndef ROCKSDB_LITE void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, const MemTableInfo& mem_table_info) { if (immutable_db_options_.listeners.size() == 0U) { @@ -2139,7 +2199,6 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, } mutex_.Lock(); } -#endif // ROCKSDB_LITE Status DBImpl::SwitchMemtableWithoutCreatingWAL( ColumnFamilyData* cfd, WriteContext* context, uint64_t next_log_num, @@ -2153,14 +2212,12 @@ Status DBImpl::SwitchMemtableWithoutCreatingWAL( const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); // Set memtable_info for memtable sealed callback -#ifndef ROCKSDB_LITE MemTableInfo memtable_info; memtable_info.cf_name = cfd->GetName(); memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); memtable_info.num_entries = cfd->mem()->num_entries(); memtable_info.num_deletes = cfd->mem()->num_deletes(); -#endif // ROCKSDB_LITE int num_imm_unflushed = cfd->imm()->NumNotFlushed(); SequenceNumber seq = versions_->LastSequence(); new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq); @@ -2201,11 +2258,9 @@ Status 
DBImpl::SwitchMemtableWithoutCreatingWAL( cfd->SetMemtable(new_mem); InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, mutable_cf_options); -#ifndef ROCKSDB_LITE // Notify client that memtable is sealed, now that we have successfully // installed a new memtable NotifyOnMemTableSealed(cfd, memtable_info); -#endif // ROCKSDB_LITE return Status::OK(); } @@ -2218,6 +2273,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // replication_log_listener is set assert(!immutable_db_options_.replication_log_listener); mutex_.AssertHeld(); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; IOStatus io_s; @@ -2249,14 +2306,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); // Set memtable_info for memtable sealed callback -#ifndef ROCKSDB_LITE MemTableInfo memtable_info; memtable_info.cf_name = cfd->GetName(); memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber(); memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber(); memtable_info.num_entries = cfd->mem()->num_entries(); memtable_info.num_deletes = cfd->mem()->num_deletes(); -#endif // ROCKSDB_LITE // Log this later after lock release. It may be outdated, e.g., if background // flush happens before logging, but that should be ok. int num_imm_unflushed = cfd->imm()->NumNotFlushed(); @@ -2371,8 +2426,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { VersionEdit wal_deletion; wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); - s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApplyToDefaultColumnFamily( + read_options, &wal_deletion, &mutex_, directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -2415,11 +2470,9 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context, mutable_cf_options); -#ifndef ROCKSDB_LITE // Notify client that memtable is sealed, now that we have successfully // installed a new memtable NotifyOnMemTableSealed(cfd, memtable_info); -#endif // ROCKSDB_LITE // It is possible that we got here without checking the value of i_os, but // that is okay. If we did, it most likely means that s was already an error. // In any case, ignore any unchecked error for i_os here. 
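The DBImpl::PutEntity overload added earlier in this file accepts AttributeGroups, letting a single key carry wide columns in several column families as one atomic write. Below is a minimal usage sketch, not part of the change itself: it assumes AttributeGroup/AttributeGroups/WideColumns are reachable via rocksdb/wide_columns.h and that AttributeGroup is constructed from a ColumnFamilyHandle* and a WideColumns; the key and column names are illustrative only.

#include <string>
#include <vector>

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"  // assumed location of AttributeGroup / WideColumns

using ROCKSDB_NAMESPACE::AttributeGroup;
using ROCKSDB_NAMESPACE::AttributeGroups;
using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::Status;
using ROCKSDB_NAMESPACE::WideColumns;
using ROCKSDB_NAMESPACE::WriteOptions;

// Hypothetical helper: write one key whose columns span two column families
// in a single atomic PutEntity() call. Handles and column names are made up.
Status PutProfileEntity(DB* db, ColumnFamilyHandle* hot_cf,
                        ColumnFamilyHandle* cold_cf) {
  WideColumns hot_columns{{"name", "alice"}, {"last_login", "2023-10-01"}};
  WideColumns cold_columns{{"signup_ip", "10.0.0.1"}};

  AttributeGroups groups;
  groups.emplace_back(hot_cf, hot_columns);
  groups.emplace_back(cold_cf, cold_columns);

  // Per the FailIfCfHasTs() loop added in DBImpl::PutEntity above, none of
  // the referenced column families may have user-defined timestamps enabled.
  return db->PutEntity(WriteOptions(), "user:42", groups);
}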
@@ -2502,6 +2555,22 @@ Status DB::PutEntity(const WriteOptions& options, return Write(options, &batch); } +Status DB::PutEntity(const WriteOptions& options, const Slice& key, + const AttributeGroups& attribute_groups) { + ColumnFamilyHandle* default_cf = DefaultColumnFamily(); + assert(default_cf); + const Comparator* const default_cf_ucmp = default_cf->GetComparator(); + assert(default_cf_ucmp); + WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, + options.protection_bytes_per_key, + default_cf_ucmp->timestamp_size()); + const Status s = batch.PutEntity(key, attribute_groups); + if (!s.ok()) { + return s; + } + return Write(options, &batch); +} + Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family, const Slice& key) { WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */, diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index be8d5bee1cd4..7dd64795513b 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -34,6 +34,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, std::string file_info, wal_info; Header(options.info_log, "DB SUMMARY\n"); + { + std::string hostname; + if (env->GetHostNameString(&hostname).ok()) { + Header(options.info_log, "Host name (Env): %s\n", hostname.c_str()); + } + } Header(options.info_log, "DB Session ID: %s\n", session_id.c_str()); Status s; diff --git a/db/db_io_failure_test.cc b/db/db_io_failure_test.cc index 2a405fd38d41..e79272ea7ecc 100644 --- a/db/db_io_failure_test.cc +++ b/db/db_io_failure_test.cc @@ -19,7 +19,6 @@ class DBIOFailureTest : public DBTestBase { DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {} }; -#ifndef ROCKSDB_LITE // Check that number of files does not grow when writes are dropped TEST_F(DBIOFailureTest, DropWrites) { do { @@ -123,7 +122,6 @@ TEST_F(DBIOFailureTest, NoSpaceCompactRange) { env_->no_space_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } -#endif // ROCKSDB_LITE TEST_F(DBIOFailureTest, NonWritableFileSystem) { do { @@ -147,7 +145,6 @@ TEST_F(DBIOFailureTest, NonWritableFileSystem) { } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE TEST_F(DBIOFailureTest, ManifestWriteError) { // Test for the following problem: // (a) Compaction produces file F @@ -582,7 +579,6 @@ TEST_F(DBIOFailureTest, CompactionSstSyncError) { ASSERT_EQ("bar3", Get(1, "foo")); } #endif // !(defined NDEBUG) || !defined(OS_WIN) -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_iter.cc b/db/db_iter.cc index 1e4a735dca4a..418c538d4370 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -18,6 +18,7 @@ #include "db/merge_helper.h" #include "db/pinned_iterators_manager.h" #include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "file/filename.h" #include "logging/logging.h" #include "memory/arena.h" @@ -77,11 +78,13 @@ DBIter::DBIter(Env* _env, const ReadOptions& read_options, expose_blob_index_(expose_blob_index), is_blob_(false), arena_mode_(arena_mode), + io_activity_(read_options.io_activity), db_impl_(db_impl), cfd_(cfd), timestamp_ub_(read_options.timestamp), timestamp_lb_(read_options.iter_start_ts), - timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) { + timestamp_size_(timestamp_ub_ ? 
timestamp_ub_->size() : 0), + auto_readahead_size_(read_options.auto_readahead_size) { RecordTick(statistics_, NO_ITERATOR_CREATED); if (pin_thru_lifetime_) { pinned_iters_mgr_.StartPinning(); @@ -111,6 +114,9 @@ Status DBIter::GetProperty(std::string prop_name, std::string* prop) { } else if (prop_name == "rocksdb.iterator.internal-key") { *prop = saved_key_.GetUserKey().ToString(); return Status::OK(); + } else if (prop_name == "rocksdb.iterator.write-time") { + // TODO(yuzhangyu): implement return the actual write time. + return Status::NotSupported("write time property is under construction"); } return Status::InvalidArgument("Unidentified property."); } @@ -131,6 +137,7 @@ void DBIter::Next() { assert(valid_); assert(status_.ok()); + PERF_COUNTER_ADD(iter_next_count, 1); PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_); // Release temporarily pinned blocks from last operation ReleaseTempPinnedData(); @@ -199,7 +206,7 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, read_options.read_tier = read_tier_; read_options.fill_cache = fill_cache_; read_options.verify_checksums = verify_checksums_; - + read_options.io_activity = io_activity_; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; constexpr uint64_t* bytes_read = nullptr; @@ -228,11 +235,35 @@ bool DBIter::SetValueAndColumnsFromEntity(Slice slice) { return false; } - if (!wide_columns_.empty() && - wide_columns_[0].name() == kDefaultWideColumnName) { - value_ = wide_columns_[0].value(); + if (WideColumnsHelper::HasDefaultColumn(wide_columns_)) { + value_ = WideColumnsHelper::GetDefaultColumn(wide_columns_); + } + + return true; +} + +bool DBIter::SetValueAndColumnsFromMergeResult(const Status& merge_status, + ValueType result_type) { + if (!merge_status.ok()) { + valid_ = false; + status_ = merge_status; + return false; + } + + if (result_type == kTypeWideColumnEntity) { + if (!SetValueAndColumnsFromEntity(saved_value_)) { + assert(!valid_); + return false; + } + + valid_ = true; + return true; } + assert(result_type == kTypeValue); + SetValueAndColumnsFromPlain(pinned_value_.data() ? 
pinned_value_ + : saved_value_); + valid_ = true; return true; } @@ -341,11 +372,6 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, } else { assert(!skipping_saved_key || CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0); - if (!iter_.PrepareValue()) { - assert(!iter_.status().ok()); - valid_ = false; - return false; - } num_skipped = 0; reseek_done = false; switch (ikey_.type) { @@ -369,6 +395,11 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, case kTypeValue: case kTypeBlobIndex: case kTypeWideColumnEntity: + if (!iter_.PrepareValue()) { + assert(!iter_.status().ok()); + valid_ = false; + return false; + } if (timestamp_lb_) { saved_key_.SetInternalKey(ikey_); } else { @@ -397,6 +428,11 @@ bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key, return true; break; case kTypeMerge: + if (!iter_.PrepareValue()) { + assert(!iter_.status().ok()); + valid_ = false; + return false; + } saved_key_.SetUserKey( ikey_.user_key, !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); @@ -516,6 +552,8 @@ bool DBIter::MergeValuesNewToOld() { // Start the merge process by pushing the first operand merge_context_.PushOperand( iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand"); ParsedInternalKey ikey; @@ -545,8 +583,7 @@ bool DBIter::MergeValuesNewToOld() { if (kTypeValue == ikey.type) { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! - const Slice val = iter_.value(); - if (!Merge(&val, ikey.user_key)) { + if (!MergeWithPlainBaseValue(iter_.value(), ikey.user_key)) { return false; } // iter_ is positioned after put @@ -575,7 +612,7 @@ bool DBIter::MergeValuesNewToOld() { return false; } valid_ = true; - if (!Merge(&blob_value_, ikey.user_key)) { + if (!MergeWithPlainBaseValue(blob_value_, ikey.user_key)) { return false; } @@ -589,7 +626,7 @@ bool DBIter::MergeValuesNewToOld() { } return true; } else if (kTypeWideColumnEntity == ikey.type) { - if (!MergeEntity(iter_.value(), ikey.user_key)) { + if (!MergeWithWideColumnBaseValue(iter_.value(), ikey.user_key)) { return false; } @@ -619,7 +656,7 @@ bool DBIter::MergeValuesNewToOld() { // a deletion marker. // feed null as the existing value to the merge operator, such that // client can differentiate this scenario and do things accordingly. - if (!Merge(nullptr, saved_key_.GetUserKey())) { + if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) { return false; } assert(status_.ok()); @@ -630,6 +667,7 @@ void DBIter::Prev() { assert(valid_); assert(status_.ok()); + PERF_COUNTER_ADD(iter_prev_count, 1); PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_); ReleaseTempPinnedData(); ResetBlobValue(); @@ -709,15 +747,22 @@ bool DBIter::ReverseToBackward() { // When current_entry_is_merged_ is true, iter_ may be positioned on the next // key, which may not exist or may have prefix different from current. // If that's the case, seek to saved_key_. - if (current_entry_is_merged_ && - (!expect_total_order_inner_iter() || !iter_.Valid())) { + // + // In case of auto_readahead_size enabled, index_iter moves forward during + // forward scan for block cache lookup and points to different block. If Prev + // op is called, it needs to call SeekForPrev to point to right index_iter_ in + // BlockBasedTableIterator. This only happens when direction is changed from + // forward to backward. 
+ if ((current_entry_is_merged_ && + (!expect_total_order_inner_iter() || !iter_.Valid())) || + auto_readahead_size_) { IterKey last_key; // Using kMaxSequenceNumber and kValueTypeForSeek // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller // than saved_key_. last_key.SetInternalKey(ParsedInternalKey( saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek)); - if (!expect_total_order_inner_iter()) { + if (!expect_total_order_inner_iter() || auto_readahead_size_) { iter_.SeekForPrev(last_key.GetInternalKey()); } else { // Some iterators may not support SeekForPrev(), so we avoid using it @@ -872,9 +917,14 @@ bool DBIter::FindValueForCurrentKey() { if (timestamp_lb_ != nullptr) { // Only needed when timestamp_lb_ is not null [[maybe_unused]] const bool ret = ParseKey(&ikey_); - saved_ikey_.assign(iter_.key().data(), iter_.key().size()); // Since the preceding ParseKey(&ikey) succeeds, so must this. assert(ret); + saved_key_.SetInternalKey(ikey); + } else if (user_comparator_.Compare(ikey.user_key, + saved_key_.GetUserKey()) < 0) { + saved_key_.SetUserKey( + ikey.user_key, + !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */); } valid_entry_seen = true; @@ -949,9 +999,6 @@ bool DBIter::FindValueForCurrentKey() { assert(last_key_entry_type == ikey_.type); } - Status s; - s.PermitUncheckedError(); - switch (last_key_entry_type) { case kTypeDeletion: case kTypeDeletionWithTimestamp: @@ -959,7 +1006,6 @@ bool DBIter::FindValueForCurrentKey() { if (timestamp_lb_ == nullptr) { valid_ = false; } else { - saved_key_.SetInternalKey(saved_ikey_); valid_ = true; } return true; @@ -968,7 +1014,7 @@ bool DBIter::FindValueForCurrentKey() { if (last_not_merge_type == kTypeDeletion || last_not_merge_type == kTypeSingleDeletion || last_not_merge_type == kTypeDeletionWithTimestamp) { - if (!Merge(nullptr, saved_key_.GetUserKey())) { + if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) { return false; } return true; @@ -983,7 +1029,7 @@ bool DBIter::FindValueForCurrentKey() { return false; } valid_ = true; - if (!Merge(&blob_value_, saved_key_.GetUserKey())) { + if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) { return false; } @@ -991,24 +1037,21 @@ bool DBIter::FindValueForCurrentKey() { return true; } else if (last_not_merge_type == kTypeWideColumnEntity) { - if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) { + if (!MergeWithWideColumnBaseValue(pinned_value_, + saved_key_.GetUserKey())) { return false; } return true; } else { assert(last_not_merge_type == kTypeValue); - if (!Merge(&pinned_value_, saved_key_.GetUserKey())) { + if (!MergeWithPlainBaseValue(pinned_value_, saved_key_.GetUserKey())) { return false; } return true; } break; case kTypeValue: - if (timestamp_lb_ != nullptr) { - saved_key_.SetInternalKey(saved_ikey_); - } - SetValueAndColumnsFromPlain(pinned_value_); break; @@ -1033,11 +1076,6 @@ bool DBIter::FindValueForCurrentKey() { std::to_string(static_cast(last_key_entry_type))); return false; } - if (!s.ok()) { - valid_ = false; - status_ = s; - return false; - } valid_ = true; return true; } @@ -1154,6 +1192,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { merge_context_.Clear(); merge_context_.PushOperand( iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_count, 1); + while (true) { iter_.Next(); @@ -1181,8 +1221,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } if (ikey.type == kTypeValue) { - const Slice val = iter_.value(); - if (!Merge(&val, 
saved_key_.GetUserKey())) { + if (!MergeWithPlainBaseValue(iter_.value(), saved_key_.GetUserKey())) { return false; } return true; @@ -1201,7 +1240,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return false; } valid_ = true; - if (!Merge(&blob_value_, saved_key_.GetUserKey())) { + if (!MergeWithPlainBaseValue(blob_value_, saved_key_.GetUserKey())) { return false; } @@ -1209,7 +1248,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } else if (ikey.type == kTypeWideColumnEntity) { - if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) { + if (!MergeWithWideColumnBaseValue(iter_.value(), + saved_key_.GetUserKey())) { return false; } @@ -1223,7 +1263,7 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { } } - if (!Merge(nullptr, saved_key_.GetUserKey())) { + if (!MergeWithNoBaseValue(saved_key_.GetUserKey())) { return false; } @@ -1246,47 +1286,42 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } -bool DBIter::Merge(const Slice* val, const Slice& user_key) { +bool DBIter::MergeWithNoBaseValue(const Slice& user_key) { // `op_failure_scope` (an output parameter) is not provided (set to nullptr) // since a failure must be propagated regardless of its value. - Status s = MergeHelper::TimedFullMerge( - merge_operator_, user_key, val, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, clock_, &pinned_value_, - /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); - if (!s.ok()) { - valid_ = false; - status_ = s; - return false; - } - - SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_ - : saved_value_); - - valid_ = true; - return true; + ValueType result_type; + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, MergeHelper::kNoBaseValue, + merge_context_.GetOperands(), logger_, statistics_, clock_, + /* update_num_ops_stats */ true, &saved_value_, &pinned_value_, + &result_type, /* op_failure_scope */ nullptr); + return SetValueAndColumnsFromMergeResult(s, result_type); } -bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) { +bool DBIter::MergeWithPlainBaseValue(const Slice& value, + const Slice& user_key) { // `op_failure_scope` (an output parameter) is not provided (set to nullptr) // since a failure must be propagated regardless of its value. - Status s = MergeHelper::TimedFullMergeWithEntity( - merge_operator_, user_key, entity, merge_context_.GetOperands(), - &saved_value_, logger_, statistics_, clock_, - /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); - if (!s.ok()) { - valid_ = false; - status_ = s; - return false; - } - - if (!SetValueAndColumnsFromEntity(saved_value_)) { - return false; - } + ValueType result_type; + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, MergeHelper::kPlainBaseValue, value, + merge_context_.GetOperands(), logger_, statistics_, clock_, + /* update_num_ops_stats */ true, &saved_value_, &pinned_value_, + &result_type, /* op_failure_scope */ nullptr); + return SetValueAndColumnsFromMergeResult(s, result_type); +} - valid_ = true; - return true; +bool DBIter::MergeWithWideColumnBaseValue(const Slice& entity, + const Slice& user_key) { + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. 
+ ValueType result_type; + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key, MergeHelper::kWideBaseValue, entity, + merge_context_.GetOperands(), logger_, statistics_, clock_, + /* update_num_ops_stats */ true, &saved_value_, &pinned_value_, + &result_type, /* op_failure_scope */ nullptr); + return SetValueAndColumnsFromMergeResult(s, result_type); } // Move backwards until the key smaller than saved_key_. @@ -1428,18 +1463,17 @@ void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) { if (timestamp_size_ > 0) { const std::string kTsMax(timestamp_size_, '\xff'); Slice ts = kTsMax; - saved_key_.UpdateInternalKey( - kMaxSequenceNumber, kValueTypeForSeekForPrev, - timestamp_lb_ != nullptr ? timestamp_lb_ : &ts); + saved_key_.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeekForPrev, + &ts); } } } void DBIter::Seek(const Slice& target) { + PERF_COUNTER_ADD(iter_seek_count, 1); PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); StopWatch sw(clock_, statistics_, DB_SEEK); -#ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { // TODO: What do we do if this returns an error? Slice lower_bound, upper_bound; @@ -1456,7 +1490,6 @@ void DBIter::Seek(const Slice& target) { db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound) .PermitUncheckedError(); } -#endif // ROCKSDB_LITE status_ = Status::OK(); ReleaseTempPinnedData(); @@ -1511,10 +1544,10 @@ void DBIter::Seek(const Slice& target) { } void DBIter::SeekForPrev(const Slice& target) { + PERF_COUNTER_ADD(iter_seek_count, 1); PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); StopWatch sw(clock_, statistics_, DB_SEEK); -#ifndef ROCKSDB_LITE if (db_impl_ != nullptr && cfd_ != nullptr) { // TODO: What do we do if this returns an error? Slice lower_bound, upper_bound; @@ -1533,7 +1566,6 @@ void DBIter::SeekForPrev(const Slice& target) { upper_bound) .PermitUncheckedError(); } -#endif // ROCKSDB_LITE status_ = Status::OK(); ReleaseTempPinnedData(); @@ -1586,6 +1618,7 @@ void DBIter::SeekToFirst() { Seek(*iterate_lower_bound_); return; } + PERF_COUNTER_ADD(iter_seek_count, 1); PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. @@ -1636,27 +1669,19 @@ void DBIter::SeekToLast() { if (iterate_upper_bound_ != nullptr) { // Seek to last key strictly less than ReadOptions.iterate_upper_bound. SeekForPrev(*iterate_upper_bound_); - const bool is_ikey = (timestamp_size_ > 0 && timestamp_lb_ != nullptr); +#ifndef NDEBUG Slice k = Valid() ? key() : Slice(); - if (is_ikey && Valid()) { + if (Valid() && timestamp_size_ > 0 && timestamp_lb_) { k.remove_suffix(kNumInternalBytes + timestamp_size_); } - while (Valid() && 0 == user_comparator_.CompareWithoutTimestamp( - *iterate_upper_bound_, /*a_has_ts=*/false, k, - /*b_has_ts=*/false)) { - ReleaseTempPinnedData(); - ResetBlobValue(); - ResetValueAndColumns(); - PrevInternal(nullptr); - - k = key(); - if (is_ikey) { - k.remove_suffix(kNumInternalBytes + timestamp_size_); - } - } + assert(!Valid() || user_comparator_.CompareWithoutTimestamp( + k, /*a_has_ts=*/false, *iterate_upper_bound_, + /*b_has_ts=*/false) < 0); +#endif return; } + PERF_COUNTER_ADD(iter_seek_count, 1); PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_); // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek will be used. 
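The SetValueAndColumnsFromEntity hunk earlier in this file replaces an inline check of wide_columns_[0] with WideColumnsHelper calls. A minimal sketch of what those helpers are assumed to do, based only on the code they replace: the default (anonymous) column, when present, is stored first, and its value doubles as the iterator's plain value().

#include <cassert>

#include "rocksdb/wide_columns.h"

namespace ROCKSDB_NAMESPACE {

// Sketch only; the real helpers live in db/wide/wide_columns_helper.h.
inline bool HasDefaultColumnSketch(const WideColumns& columns) {
  return !columns.empty() && columns.front().name() == kDefaultWideColumnName;
}

inline const Slice& GetDefaultColumnSketch(const WideColumns& columns) {
  assert(HasDefaultColumnSketch(columns));
  return columns.front().value();
}

}  // namespace ROCKSDB_NAMESPACE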
diff --git a/db/db_iter.h b/db/db_iter.h index a5eab43c7ff4..d18bf019c4b4 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -209,6 +209,7 @@ class DBIter final : public Iterator { if (read_callback_) { read_callback_->Refresh(s); } + iter_.SetRangeDelReadSeqno(s); } SequenceNumber get_sequence() const { return sequence_; } @@ -314,14 +315,20 @@ class DBIter final : public Iterator { bool SetValueAndColumnsFromEntity(Slice slice); + bool SetValueAndColumnsFromMergeResult(const Status& merge_status, + ValueType result_type); + void ResetValueAndColumns() { value_.clear(); wide_columns_.clear(); } + // The following methods perform the actual merge operation for the + // no base value/plain base value/wide-column base value cases. // If user-defined timestamp is enabled, `user_key` includes timestamp. - bool Merge(const Slice* val, const Slice& user_key); - bool MergeEntity(const Slice& entity, const Slice& user_key); + bool MergeWithNoBaseValue(const Slice& user_key); + bool MergeWithPlainBaseValue(const Slice& value, const Slice& user_key); + bool MergeWithWideColumnBaseValue(const Slice& entity, const Slice& user_key); const SliceTransform* prefix_extractor_; Env* const env_; @@ -386,25 +393,18 @@ class DBIter final : public Iterator { bool expose_blob_index_; bool is_blob_; bool arena_mode_; + const Env::IOActivity io_activity_; // List of operands for merge operator. MergeContext merge_context_; LocalStatistics local_stats_; PinnedIteratorsManager pinned_iters_mgr_; -#ifdef ROCKSDB_LITE - ROCKSDB_FIELD_UNUSED -#endif DBImpl* db_impl_; -#ifdef ROCKSDB_LITE - ROCKSDB_FIELD_UNUSED -#endif ColumnFamilyData* cfd_; const Slice* const timestamp_ub_; const Slice* const timestamp_lb_; const size_t timestamp_size_; std::string saved_timestamp_; - - // Used only if timestamp_lb_ is not nullptr. 
- std::string saved_ikey_; + bool auto_readahead_size_; }; // Return a new iterator that converts internal keys (yielded by diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 65290bfad38b..6fd4469700b6 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -275,6 +275,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { db_iter->Next(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } // Test to check the SeekToLast() with iterate_upper_bound not set { @@ -1415,6 +1416,7 @@ TEST_F(DBIteratorTest, DBIterator1) { ASSERT_EQ(db_iter->key().ToString(), "b"); db_iter->Next(); ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TEST_F(DBIteratorTest, DBIterator2) { @@ -1528,6 +1530,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1552,6 +1555,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1576,6 +1580,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1600,6 +1605,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "put_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1624,6 +1630,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1648,6 +1655,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1672,6 +1680,7 @@ TEST_F(DBIteratorTest, DBIterator5) { ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1726,6 +1735,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1750,6 +1760,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1774,6 +1785,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1794,6 +1806,7 @@ TEST_F(DBIteratorTest, DBIterator6) { nullptr /* read_callback */)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1818,6 +1831,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ASSERT_EQ(db_iter->value().ToString(), "merge_4"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1842,6 +1856,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1866,6 +1881,7 @@ TEST_F(DBIteratorTest, DBIterator6) { ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ 
-1910,6 +1926,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1952,6 +1969,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -1994,6 +2012,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -2041,6 +2060,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -2089,6 +2109,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -2131,6 +2152,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -2179,6 +2201,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -2228,6 +2251,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } { @@ -2271,6 +2295,7 @@ TEST_F(DBIteratorTest, DBIterator7) { ASSERT_EQ(db_iter->value().ToString(), "merge_1"); db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); + ASSERT_OK(db_iter->status()); } } @@ -2440,6 +2465,7 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { ASSERT_EQ(db_iter->value().ToString(), "2"); db_iter->Next(); ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TEST_F(DBIteratorTest, DBIterator11) { @@ -2469,6 +2495,7 @@ TEST_F(DBIteratorTest, DBIterator11) { ASSERT_EQ(db_iter->key().ToString(), "b"); db_iter->Next(); ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TEST_F(DBIteratorTest, DBIterator12) { @@ -2497,6 +2524,7 @@ TEST_F(DBIteratorTest, DBIterator12) { ASSERT_EQ(db_iter->value().ToString(), "1"); db_iter->Prev(); ASSERT_FALSE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); } TEST_F(DBIteratorTest, DBIterator13) { @@ -2635,6 +2663,7 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) { ASSERT_EQ(db_iter_->value().ToString(), "3"); db_iter_->Next(); ASSERT_FALSE(db_iter_->Valid()); + ASSERT_OK(db_iter_->status()); } TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) { diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc index c72494d7c340..a29aab6d1418 100644 --- a/db/db_iterator_test.cc +++ b/db/db_iterator_test.cc @@ -16,7 +16,7 @@ #include "port/stack_trace.h" #include "rocksdb/iostats_context.h" #include "rocksdb/perf_context.h" -#include "table/block_based/flush_block_policy.h" +#include "table/block_based/flush_block_policy_impl.h" #include "util/random.h" #include "utilities/merge_operators/string_append/stringappend2.h" @@ -30,12 +30,63 @@ class DummyReadCallback : public ReadCallback { void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; } }; +class DBIteratorBaseTest : public DBTestBase { + public: + DBIteratorBaseTest() + : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBIteratorBaseTest, APICallsWithPerfContext) { + // Set up the DB 
+ Options options = CurrentOptions(); + DestroyAndReopen(options); + Random rnd(301); + for (int i = 1; i <= 3; i++) { + ASSERT_OK(Put(std::to_string(i), std::to_string(i))); + } + + // Setup iterator and PerfContext + Iterator* iter = db_->NewIterator(ReadOptions()); + std::string key_str = std::to_string(2); + Slice key(key_str); + SetPerfLevel(kEnableCount); + get_perf_context()->Reset(); + + // Initial PerfContext counters + ASSERT_EQ(0, get_perf_context()->iter_seek_count); + ASSERT_EQ(0, get_perf_context()->iter_next_count); + ASSERT_EQ(0, get_perf_context()->iter_prev_count); + + // Test Seek-related API calls PerfContext counter + iter->Seek(key); + iter->SeekToFirst(); + iter->SeekToLast(); + iter->SeekForPrev(key); + ASSERT_EQ(4, get_perf_context()->iter_seek_count); + ASSERT_EQ(0, get_perf_context()->iter_next_count); + ASSERT_EQ(0, get_perf_context()->iter_prev_count); + + // Test Next() calls PerfContext counter + iter->Next(); + ASSERT_EQ(4, get_perf_context()->iter_seek_count); + ASSERT_EQ(1, get_perf_context()->iter_next_count); + ASSERT_EQ(0, get_perf_context()->iter_prev_count); + + // Test Prev() calls PerfContext counter + iter->Prev(); + ASSERT_EQ(4, get_perf_context()->iter_seek_count); + ASSERT_EQ(1, get_perf_context()->iter_next_count); + ASSERT_EQ(1, get_perf_context()->iter_prev_count); + + delete iter; +} + // Test param: // bool: whether to pass read_callback to NewIterator(). -class DBIteratorTest : public DBTestBase, +class DBIteratorTest : public DBIteratorBaseTest, public testing::WithParamInterface { public: - DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {} + DBIteratorTest() {} Iterator* NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family = nullptr) { @@ -56,7 +107,10 @@ class DBIteratorTest : public DBTestBase, read_callbacks_.push_back( std::unique_ptr(read_callback)); } - return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback); + DBImpl* db_impl = dbfull(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl); + return db_impl->NewIteratorImpl(read_options, cfd, super_version, seq, + read_callback); } private: @@ -129,6 +183,7 @@ TEST_P(DBIteratorTest, NonBlockingIteration) { ASSERT_OK(iter->status()); count++; } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 1); delete iter; @@ -163,6 +218,7 @@ TEST_P(DBIteratorTest, NonBlockingIteration) { ASSERT_OK(iter->status()); count++; } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 1); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); @@ -834,7 +890,7 @@ TEST_P(DBIteratorTest, IterWithSnapshot) { ASSERT_EQ(IterStatus(iter2), "key0->val0"); db_->ReleaseSnapshot(snapshot); - db_->ReleaseSnapshot(snapshot2); + ASSERT_OK(iter->status()); delete iter; delete iter2; } while (ChangeOptions()); @@ -911,7 +967,6 @@ TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) { } // SetOptions not defined in ROCKSDB LITE -#ifndef ROCKSDB_LITE TEST_P(DBIteratorTest, DBIteratorBoundTest) { Options options = CurrentOptions(); options.env = env_; @@ -1139,7 +1194,6 @@ TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) { TestGetTickerCount(options, BLOCK_CACHE_MISS)); } } -#endif TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { for (auto format_version : {2, 3, 4}) { @@ -1182,6 +1236,7 @@ TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) { iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_EQ(upper_bound_hits, 1); } } @@ 
-1306,6 +1361,7 @@ TEST_P(DBIteratorTest, IndexWithFirstKey) { iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT)); EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS)); } @@ -1547,6 +1603,7 @@ class DBIteratorTestForPinnedData : public DBIteratorTest { ASSERT_EQ("1", prop_value); all_keys.push_back(iter->key()); } + ASSERT_OK(iter->status()); ASSERT_EQ(all_keys.size(), true_data.size()); // Verify that all keys slices are valid (backward) @@ -1584,7 +1641,6 @@ INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance, DBIteratorTestForPinnedData, testing::Values(true, false)); -#ifndef ROCKSDB_LITE TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; @@ -1651,10 +1707,9 @@ TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) { ASSERT_EQ(kv.first, data_iter->first); ASSERT_EQ(kv.second, data_iter->second); } - + ASSERT_OK(iter->status()); delete iter; } -#endif TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) { Options options = CurrentOptions(); @@ -1698,6 +1753,7 @@ TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) { ASSERT_EQ("1", prop_value); results.emplace_back(iter->key(), iter->value().ToString()); } + ASSERT_OK(iter->status()); ASSERT_EQ(results.size(), 1000); for (size_t i = 0; i < results.size(); i++) { @@ -1755,6 +1811,7 @@ TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) { ASSERT_EQ("1", prop_value); results.emplace_back(iter->key(), iter->value().ToString()); } + ASSERT_OK(iter->status()); auto data_iter = true_data.begin(); for (size_t i = 0; i < results.size(); i++, data_iter++) { @@ -2049,6 +2106,7 @@ TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { ASSERT_EQ(iter->value().ToString(), data_iter->second); data_iter++; } + ASSERT_OK(iter->status()); ASSERT_EQ(data_iter, true_data.rend()); delete iter; @@ -2106,6 +2164,7 @@ TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) { entries_right++; data_iter++; } + ASSERT_OK(iter->status()); ASSERT_EQ(data_iter, true_data.rend()); delete iter; @@ -2145,6 +2204,7 @@ TEST_P(DBIteratorTest, IteratorWithLocalStatistics) { total_next++; if (!iter->Valid()) { + EXPECT_OK(iter->status()); break; } total_next_found++; @@ -2172,6 +2232,7 @@ TEST_P(DBIteratorTest, IteratorWithLocalStatistics) { total_prev++; if (!iter->Valid()) { + EXPECT_OK(iter->status()); break; } total_prev_found++; @@ -2236,9 +2297,7 @@ TEST_P(DBIteratorTest, ReadAhead) { ASSERT_OK(Put(Key(i), value)); } ASSERT_OK(Flush()); -#ifndef ROCKSDB_LITE ASSERT_EQ("1,1,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE env_->random_read_bytes_counter_ = 0; options.statistics->setTickerCount(NO_FILE_OPENS, 0); @@ -2249,7 +2308,6 @@ TEST_P(DBIteratorTest, ReadAhead) { size_t bytes_read = env_->random_read_bytes_counter_; delete iter; - int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES); env_->random_read_bytes_counter_ = 0; options.statistics->setTickerCount(NO_FILE_OPENS, 0); read_options.readahead_size = 1024 * 10; @@ -2258,10 +2316,7 @@ TEST_P(DBIteratorTest, ReadAhead) { int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS); size_t bytes_read_readahead = env_->random_read_bytes_counter_; delete iter; - int64_t num_file_closes_readahead = - TestGetTickerCount(options, NO_FILE_CLOSES); ASSERT_EQ(num_file_opens, num_file_opens_readahead); - ASSERT_EQ(num_file_closes, num_file_closes_readahead); ASSERT_GT(bytes_read_readahead, 
bytes_read); ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3); @@ -2305,12 +2360,10 @@ TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) { ASSERT_OK(Put("b", std::to_string(i + 1).c_str())); } -#ifndef ROCKSDB_LITE // Check that memtable wasn't flushed. std::string val; ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val)); EXPECT_EQ("0", val); -#endif // Seek iterator to a smaller key. get_perf_context()->Reset(); @@ -2394,37 +2447,98 @@ TEST_P(DBIteratorTest, Refresh) { ASSERT_EQ(iter->key().compare(Slice("x")), 0); iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); iter.reset(); } TEST_P(DBIteratorTest, RefreshWithSnapshot) { - ASSERT_OK(Put("x", "y")); + // L1 file, uses LevelIterator internally + ASSERT_OK(Put(Key(0), "val0")); + ASSERT_OK(Put(Key(5), "val5")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + // L0 file, uses table iterator internally + ASSERT_OK(Put(Key(1), "val1")); + ASSERT_OK(Put(Key(4), "val4")); + ASSERT_OK(Flush()); + + // Memtable + ASSERT_OK(Put(Key(2), "val2")); + ASSERT_OK(Put(Key(3), "val3")); const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(Put(Key(2), "new val")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), + Key(7))); + const Snapshot* snapshot2 = db_->GetSnapshot(); + + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ReadOptions options; options.snapshot = snapshot; Iterator* iter = NewIterator(options); + ASSERT_OK(Put(Key(6), "val6")); ASSERT_OK(iter->status()); - iter->Seek(Slice("a")); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("x")), 0); - iter->Next(); - ASSERT_FALSE(iter->Valid()); + auto verify_iter = [&](int start, int end, bool new_key2 = false) { + for (int i = start; i < end; ++i) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(i)); + if (i == 2 && new_key2) { + ASSERT_EQ(iter->value(), "new val"); + } else { + ASSERT_EQ(iter->value(), "val" + std::to_string(i)); + } + iter->Next(); + } + }; - ASSERT_OK(Put("c", "d")); + for (int j = 0; j < 2; j++) { + iter->Seek(Key(1)); + verify_iter(1, 3); + // Refresh to same snapshot + ASSERT_OK(iter->Refresh(snapshot)); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + iter->Seek(Key(3)); + verify_iter(3, 6); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + + // Refresh to a newer snapshot + ASSERT_OK(iter->Refresh(snapshot2)); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + iter->SeekToFirst(); + verify_iter(0, 4, /*new_key2=*/true); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + + // Refresh to an older snapshot + ASSERT_OK(iter->Refresh(snapshot)); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + iter->Seek(Key(3)); + verify_iter(3, 6); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + + // Refresh to no snapshot + ASSERT_OK(iter->Refresh()); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + iter->Seek(Key(2)); + verify_iter(2, 4, /*new_key2=*/true); + verify_iter(6, 7); + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + + // Change LSM shape, new SuperVersion is created. 
+ ASSERT_OK(Flush()); - iter->Seek(Slice("a")); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(Slice("x")), 0); - iter->Next(); - ASSERT_FALSE(iter->Valid()); + // Refresh back to original snapshot + ASSERT_OK(iter->Refresh(snapshot)); + } - ASSERT_OK(iter->status()); - Status s = iter->Refresh(); - ASSERT_TRUE(s.IsNotSupported()); - db_->ReleaseSnapshot(snapshot); delete iter; + db_->ReleaseSnapshot(snapshot); + db_->ReleaseSnapshot(snapshot2); + ASSERT_OK(db_->Close()); } TEST_P(DBIteratorTest, CreationFailure) { @@ -2517,6 +2631,7 @@ TEST_P(DBIteratorTest, TableFilter) { ASSERT_EQ(IterStatus(iter), "f->6"); iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); ASSERT_TRUE(unseen.empty()); delete iter; } @@ -2539,6 +2654,7 @@ TEST_P(DBIteratorTest, TableFilter) { ASSERT_EQ(IterStatus(iter), "f->6"); iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } } @@ -2623,6 +2739,7 @@ TEST_P(DBIteratorTest, SkipStatistics) { ASSERT_OK(iter->status()); count++; } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 3); delete iter; skip_count += 8; // Same as above, but in reverse order @@ -2658,6 +2775,7 @@ TEST_P(DBIteratorTest, SkipStatistics) { ASSERT_OK(iter->status()); count++; } + ASSERT_OK(iter->status()); ASSERT_EQ(count, 2); delete iter; // 3 deletes + 3 original keys + lower sequence of "a" @@ -3111,8 +3229,10 @@ TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { static_cast_with_check(db_->DefaultColumnFamily()) ->cfd(); // The iterator are suppose to see data before seq1. - Iterator* iter = - dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1); + DBImpl* db_impl = dbfull(); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl); + Iterator* iter = db_impl->NewIteratorImpl(ReadOptions(), cfd, super_version, + seq2, &callback1); // Seek // The latest value of "foo" before seq1 is "v3" @@ -3190,7 +3310,9 @@ TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) { SequenceNumber seq4 = db_->GetLatestSequenceNumber(); // The iterator is suppose to see data before seq3. - iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2); + super_version = cfd->GetReferencedSuperVersion(db_impl); + iter = db_impl->NewIteratorImpl(ReadOptions(), cfd, super_version, seq4, + &callback2); // Seek to "z", which is visible. iter->Seek("z"); ASSERT_TRUE(iter->Valid()); @@ -3236,6 +3358,7 @@ TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) { for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { ++count; } + ASSERT_OK(iter->status()); ASSERT_EQ(kNumKeys, count); } @@ -3276,6 +3399,176 @@ TEST_F(DBIteratorTest, IteratorRefreshReturnSV) { Close(); } +TEST_F(DBIteratorTest, ErrorWhenReadFile) { + // This is to test a bug that is fixed in + // https://github.com/facebook/rocksdb/pull/11782. + // + // Ingest error when reading from a file, and + // see if Iterator handles it correctly. 
+ Options opts = CurrentOptions(); + opts.num_levels = 7; + opts.compression = kNoCompression; + BlockBasedTableOptions bbto; + // Always do I/O + bbto.no_block_cache = true; + opts.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(opts); + + // Set up LSM + // L5: F1 [key0, key99], F2 [key100, key199] + // L6: F3 [key50, key149] + Random rnd(301); + const int kValLen = 100; + for (int i = 50; i < 150; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(kValLen))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + std::vector values; + for (int i = 0; i < 100; ++i) { + values.emplace_back(rnd.RandomString(kValLen)); + ASSERT_OK(Put(Key(i), values.back())); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + + for (int i = 100; i < 200; ++i) { + values.emplace_back(rnd.RandomString(kValLen)); + ASSERT_OK(Put(Key(i), values.back())); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + + ASSERT_EQ(2, NumTableFilesAtLevel(5)); + ASSERT_EQ(1, NumTableFilesAtLevel(6)); + + std::vector files; + db_->GetLiveFilesMetaData(&files); + // Get file names for F1, F2 and F3. + // These are file names, not full paths. + std::string f1, f2, f3; + for (auto& file_meta : files) { + if (file_meta.level == 6) { + f3 = file_meta.name; + } else { + if (file_meta.smallestkey == Key(0)) { + f1 = file_meta.name; + } else { + f2 = file_meta.name; + } + } + } + ASSERT_TRUE(!f1.empty()); + ASSERT_TRUE(!f2.empty()); + ASSERT_TRUE(!f3.empty()); + + std::string error_file; + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::Read::BeforeReturn", + [&error_file](void* io_s_ptr) { + auto p = + reinterpret_cast*>(io_s_ptr); + if (p->first->find(error_file) != std::string::npos) { + *p->second = IOStatus::IOError(); + p->second->SetRetryable(true); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + // Error reading F1 + error_file = f1; + std::unique_ptr iter{db_->NewIterator(ReadOptions())}; + iter->SeekToFirst(); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + // This does not require reading the first block. + iter->Seek(Key(90)); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[90]); + // iter has ok status before this Seek. + iter->Seek(Key(1)); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + + // Error reading F2 + error_file = f2; + iter.reset(db_->NewIterator(ReadOptions())); + iter->Seek(Key(99)); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[99]); + // Need to read from F2. + iter->Next(); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + iter->Seek(Key(190)); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[190]); + // Seek for first key of F2. + iter->Seek(Key(100)); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[199]); + // SeekForPrev for first key of F2. + iter->SeekForPrev(Key(100)); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + // Does not read first block (offset 0). 
+ iter->SeekForPrev(Key(98)); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[98]); + + // Error reading F3 + error_file = f3; + iter.reset(db_->NewIterator(ReadOptions())); + iter->SeekToFirst(); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + iter->Seek(Key(50)); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + iter->SeekForPrev(Key(50)); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + // Does not read file 3 + iter->Seek(Key(150)); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[150]); + + // Test when file read error occurs during Prev(). + // This requires returning an error when reading near the end of a file + // instead of offset 0. + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::Read::AnyOffset", [&f1](void* pair_ptr) { + auto p = + reinterpret_cast*>(pair_ptr); + if (p->first->find(f1) != std::string::npos) { + *p->second = IOStatus::IOError(); + p->second->SetRetryable(true); + } + }); + iter->SeekForPrev(Key(101)); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value(), values[101]); + // DBIter will not stop at Key(100) since it needs + // to make sure the key it returns has the max sequence number for Key(100). + // So it will call MergingIterator::Prev() which will read F1. + iter->Prev(); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + SyncPoint::GetInstance()->DisableProcessing(); + iter->Reset(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index 4e982858c4c1..87313971a516 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -10,7 +10,6 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" #include "env/mock_env.h" @@ -146,6 +145,41 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) { } while (ChangeCompactOptions()); } } + +TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckWhenArchive) { + do { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); + Options options = OptionsForLogIterTest(); + DestroyAndReopen(options); + ColumnFamilyHandle* cf; + auto s = dbfull()->CreateColumnFamily(ColumnFamilyOptions(), "CF", &cf); + ASSERT_TRUE(s.ok()); + + ASSERT_OK(dbfull()->Put(WriteOptions(), cf, "key1", DummyString(1024))); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "key2", DummyString(1024))); + + ASSERT_OK(dbfull()->Flush(FlushOptions())); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "key3", DummyString(1024))); + + ASSERT_OK(dbfull()->Flush(FlushOptions())); + + ASSERT_OK(dbfull()->Put(WriteOptions(), "key4", DummyString(1024))); + ASSERT_OK(dbfull()->Flush(FlushOptions())); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "WalManager::PurgeObsoleteFiles:1", [&](void*) { + auto iter = OpenTransactionLogIter(0); + ExpectRecords(4, iter); + }); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(dbfull()->Flush(FlushOptions(), cf)); + + delete cf; + } while (ChangeCompactOptions()); +} #endif TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) { @@ -202,7 +236,7 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) { ASSERT_OK(test::TruncateFile(env_, logfile_path, wal_files.front()->SizeFileBytes() / 2)); - ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); // Insert a new entry to a new log file ASSERT_OK(Put("key1025", DummyString(10))); @@ -290,16 +324,9 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { } } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !defined(ROCKSDB_LITE) ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - (void)argc; - (void)argv; - return 0; -#endif } diff --git a/db/db_logical_block_size_cache_test.cc b/db/db_logical_block_size_cache_test.cc index 13c16618e6ba..ff56d56e370d 100644 --- a/db/db_logical_block_size_cache_test.cc +++ b/db/db_logical_block_size_cache_test.cc @@ -72,12 +72,8 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) { printf("Open\n"); ASSERT_OK(DB::Open(options, dbname_, &db)); } else { -#ifdef ROCKSDB_LITE - break; -#else printf("OpenForReadOnly\n"); ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db)); -#endif } ASSERT_EQ(2, cache_->Size()); ASSERT_TRUE(cache_->Contains(data_path_0_)); @@ -104,12 +100,8 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) { printf("Open\n"); ASSERT_OK(DB::Open(options, dbname_, &db)); } else { -#ifdef ROCKSDB_LITE - break; -#else printf("OpenForReadOnly\n"); ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db)); -#endif } ASSERT_EQ(1, cache_->Size()); ASSERT_TRUE(cache_->Contains(dbname_)); @@ -261,16 +253,12 @@ TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) { {"default", ColumnFamilyOptions()}}, &cfs, &db)); } else { -#ifdef ROCKSDB_LITE - break; -#else printf("OpenForReadOnly\n"); ASSERT_OK(DB::OpenForReadOnly(options, dbname_, {{"cf1", cf_options}, {"cf2", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); -#endif } // Logical block sizes of dbname_ and cf_path_0_ are cached during Open. 
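The TransactionLogIteratorCheckWhenArchive test above relies on the SyncPoint facility to run extra assertions at a named point inside WalManager::PurgeObsoleteFiles. A condensed sketch of that callback pattern follows, assuming only the sync-point API the test itself uses (SetCallBack, EnableProcessing, DisableProcessing, ClearAllCallBacks); the function name and lambda body are illustrative.

#include <functional>

#include "test_util/sync_point.h"

// Sketch of the injection pattern used above: register a callback on a named
// sync point, enable processing, drive the DB so the point fires, then clean
// up so later tests are unaffected.
void RunWithWalPurgeCallback(const std::function<void()>& check) {
  using ROCKSDB_NAMESPACE::SyncPoint;
  SyncPoint::GetInstance()->SetCallBack(
      "WalManager::PurgeObsoleteFiles:1",
      [&check](void* /*arg*/) { check(); });
  SyncPoint::GetInstance()->EnableProcessing();
  // ... trigger the purge here, e.g. by flushing a column family ...
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}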
@@ -360,14 +348,10 @@ TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) { options, dbname_, {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); } else { -#ifdef ROCKSDB_LITE - break; -#else printf("OpenForReadOnly\n"); ASSERT_OK(DB::OpenForReadOnly( options, dbname_, {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db)); -#endif } // cf_path_0_ and dbname_ are cached. ASSERT_EQ(2, cache_->Size()); diff --git a/db/db_merge_operand_test.cc b/db/db_merge_operand_test.cc index 629d3923f630..b6b9ff2afe1f 100644 --- a/db/db_merge_operand_test.cc +++ b/db/db_merge_operand_test.cc @@ -8,9 +8,7 @@ #include "rocksdb/perf_context.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_builder.h" -#if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" -#endif #include "rocksdb/merge_operator.h" #include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" @@ -51,10 +49,8 @@ TEST_F(DBMergeOperandTest, CacheEvictedMergeOperandReadAfterFreeBug) { // There was a bug of reading merge operands after they are mistakely freed // in DB::GetMergeOperands, which is surfaced by cache full. // See PR#9507 for more. - Options options; - options.create_if_missing = true; + Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreateStringAppendOperator(); - options.env = env_; BlockBasedTableOptions table_options; // Small cache to simulate cache full @@ -123,11 +119,9 @@ TEST_F(DBMergeOperandTest, FlushedMergeOperandReadAfterFreeBug) { } TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { - Options options; - options.create_if_missing = true; + Options options = CurrentOptions(); // Use only the latest two merge operands. options.merge_operator = std::make_shared(2, ','); - options.env = env_; Reopen(options); int num_records = 4; int number_of_operands = 0; @@ -311,13 +305,11 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) { } TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) { - Options options; - options.create_if_missing = true; + Options options = CurrentOptions(); options.enable_blob_files = true; options.min_blob_size = 0; // Use only the latest two merge operands. options.merge_operator = std::make_shared(2, ','); - options.env = env_; Reopen(options); int num_records = 4; int number_of_operands = 0; @@ -403,8 +395,7 @@ TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) { const int kNumOperands = 1024; const int kOperandLen = 1024; - Options options; - options.create_if_missing = true; + Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreateStringAppendOperator(); DestroyAndReopen(options); diff --git a/db/db_merge_operator_test.cc b/db/db_merge_operator_test.cc index f8c90c15871f..e82e0cbf0938 100644 --- a/db/db_merge_operator_test.cc +++ b/db/db_merge_operator_test.cc @@ -6,9 +6,12 @@ #include #include "db/db_test_util.h" +#include "db/dbformat.h" #include "db/forward_iterator.h" #include "port/stack_trace.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/snapshot.h" +#include "rocksdb/utilities/debug.h" #include "util/random.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend2.h" @@ -81,7 +84,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { size_t limit_ = 0; }; - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; // Use only the latest two merge operands. 
options.merge_operator = std::make_shared(2, ','); @@ -134,7 +137,7 @@ TEST_F(DBMergeOperatorTest, LimitMergeOperands) { } TEST_F(DBMergeOperatorTest, MergeErrorOnRead) { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.merge_operator.reset(new TestPutOperator()); options.env = env_; @@ -147,7 +150,7 @@ TEST_F(DBMergeOperatorTest, MergeErrorOnRead) { } TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.merge_operator.reset(new TestPutOperator()); options.max_successive_merges = 3; @@ -163,7 +166,7 @@ TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) { } TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) { - Options options; + Options options = CurrentOptions(); options.create_if_missing = true; options.merge_operator.reset(new TestPutOperator()); options.env = env_; @@ -202,8 +205,6 @@ TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) { VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}}); } -#ifndef ROCKSDB_LITE - TEST_F(DBMergeOperatorTest, MergeOperatorFailsWithMustMerge) { // This is like a mini-stress test dedicated to `OpFailureScope::kMustMerge`. // Some or most of it might be deleted upon adding that option to the actual @@ -222,7 +223,7 @@ TEST_F(DBMergeOperatorTest, MergeOperatorFailsWithMustMerge) { // expect "k0" and "k2" to always be readable. "k1" is expected to be readable // only by APIs that do not require merging, such as `GetMergeOperands()`. const int kNumOperands = 3; - Options options; + Options options = CurrentOptions(); options.merge_operator.reset(new TestPutOperator()); options.env = env_; Reopen(options); @@ -232,7 +233,9 @@ TEST_F(DBMergeOperatorTest, MergeOperatorFailsWithMustMerge) { { std::string value; ASSERT_OK(db_->Get(ReadOptions(), "k0", &value)); - ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption()); + Status s = db_->Get(ReadOptions(), "k1", &value); + ASSERT_TRUE(s.IsCorruption()); + ASSERT_EQ(Status::SubCode::kMergeOperatorFailed, s.subcode()); ASSERT_OK(db_->Get(ReadOptions(), "k2", &value)); } @@ -244,6 +247,8 @@ TEST_F(DBMergeOperatorTest, MergeOperatorFailsWithMustMerge) { ASSERT_EQ("k0", iter->key()); iter->Next(); ASSERT_TRUE(iter->status().IsCorruption()); + ASSERT_EQ(Status::SubCode::kMergeOperatorFailed, + iter->status().subcode()); iter->SeekToLast(); ASSERT_TRUE(iter->Valid()); @@ -355,7 +360,140 @@ TEST_F(DBMergeOperatorTest, MergeOperatorFailsWithMustMerge) { } } -#endif // ROCKSDB_LITE +TEST_F(DBMergeOperatorTest, MergeOperandThresholdExceeded) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreatePutOperator(); + options.env = env_; + Reopen(options); + + std::vector keys{"foo", "bar", "baz"}; + + // Write base values. + for (const auto& key : keys) { + ASSERT_OK(Put(key, key.ToString() + "0")); + } + + // Write merge operands. Note that the first key has 1 merge operand, the + // second one has 2 merge operands, and the third one has 3 merge operands. + // Also, we'll take some snapshots to make sure the merge operands are + // preserved during flush. 
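As a usage sketch of the ReadOptions::merge_operand_count_threshold field this test exercises (the key name and threshold value below are illustrative only): when the threshold is set and a read has to merge more operands than that, the returned OK status carries the IsOkMergeOperandThresholdExceeded() hint, which an application can act on, for example by compacting the affected range.

ReadOptions read_options;
read_options.merge_operand_count_threshold = 100;  // tolerate up to 100 operands

PinnableSlice value;
Status s = db->Get(read_options, db->DefaultColumnFamily(), "counter", &value);
if (s.ok() && s.IsOkMergeOperandThresholdExceeded()) {
  // The read succeeded but merged more than 100 operands; the application
  // could schedule a manual flush/compaction for this key range.
}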
+ std::vector snapshots; + snapshots.reserve(3); + + for (size_t i = 0; i < keys.size(); ++i) { + snapshots.emplace_back(db_); + + const std::string suffix = std::to_string(i + 1); + + for (size_t j = i; j < keys.size(); ++j) { + ASSERT_OK(Merge(keys[j], keys[j].ToString() + suffix)); + } + } + + // Verify the results and status codes of various types of point lookups. + auto verify = [&](const std::optional& threshold) { + ReadOptions read_options; + read_options.merge_operand_count_threshold = threshold; + + // Check Get() + { + for (size_t i = 0; i < keys.size(); ++i) { + PinnableSlice value; + const Status status = + db_->Get(read_options, db_->DefaultColumnFamily(), keys[i], &value); + ASSERT_OK(status); + ASSERT_EQ(status.IsOkMergeOperandThresholdExceeded(), + threshold.has_value() && i + 1 > threshold.value()); + ASSERT_EQ(value, keys[i].ToString() + std::to_string(i + 1)); + } + } + + // Check old-style MultiGet() + { + std::vector values; + std::vector statuses = db_->MultiGet(read_options, keys, &values); + + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(statuses[i].IsOkMergeOperandThresholdExceeded(), + threshold.has_value() && i + 1 > threshold.value()); + ASSERT_EQ(values[i], keys[i].ToString() + std::to_string(i + 1)); + } + } + + // Check batched MultiGet() + { + std::vector values(keys.size()); + std::vector statuses(keys.size()); + db_->MultiGet(read_options, db_->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data()); + + for (size_t i = 0; i < keys.size(); ++i) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(statuses[i].IsOkMergeOperandThresholdExceeded(), + threshold.has_value() && i + 1 > threshold.value()); + ASSERT_EQ(values[i], keys[i].ToString() + std::to_string(i + 1)); + } + } + }; + + // Test the case when the feature is disabled as well as various thresholds. + verify(std::nullopt); + for (size_t i = 0; i < 5; ++i) { + verify(i); + } + + // Flush and try again to test the case when results are served from SSTs. + ASSERT_OK(Flush()); + verify(std::nullopt); + for (size_t i = 0; i < 5; ++i) { + verify(i); + } +} + +TEST_F(DBMergeOperatorTest, DataBlockBinaryAndHash) { + // Basic test to check that merge operator works with data block index type + // DataBlockBinaryAndHash. 
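Background on the index type used in the next test: kDataBlockBinaryAndHash adds a small hash index to each data block so point lookups can avoid the binary search over restart points, while iteration still uses the regular binary-searchable layout. A configuration sketch (the util ratio below is an example value, not something this test sets):

Options options;
BlockBasedTableOptions table_options;
table_options.data_block_index_type =
    BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
// Ratio of entries to hash buckets; a smaller value means more buckets,
// fewer collisions, and a slightly larger per-block index.
table_options.data_block_hash_table_util_ratio = 0.75;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));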
+ Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator.reset(new TestPutOperator()); + options.env = env_; + BlockBasedTableOptions table_options; + table_options.block_restart_interval = 16; + table_options.data_block_index_type = + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + const int kNumKeys = 100; + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(db_->Merge(WriteOptions(), Key(i), std::to_string(i))); + } + ASSERT_OK(Flush()); + std::string value; + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + ASSERT_EQ(std::to_string(i), value); + } + + std::vector snapshots; + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(db_->Delete(WriteOptions(), Key(i))); + for (int j = 0; j < 3; ++j) { + ASSERT_OK(db_->Merge(WriteOptions(), Key(i), std::to_string(i * 3 + j))); + snapshots.push_back(db_->GetSnapshot()); + } + } + ASSERT_OK(Flush()); + for (int i = 0; i < kNumKeys; ++i) { + ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value)); + ASSERT_EQ(std::to_string(i * 3 + 2), value); + } + for (auto snapshot : snapshots) { + db_->ReleaseSnapshot(snapshot); + } +} class MergeOperatorPinningTest : public DBMergeOperatorTest, public testing::WithParamInterface { @@ -368,7 +506,6 @@ class MergeOperatorPinningTest : public DBMergeOperatorTest, INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest, ::testing::Bool()); -#ifndef ROCKSDB_LITE TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) { Options options = CurrentOptions(); BlockBasedTableOptions table_options; @@ -639,7 +776,6 @@ TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) { EXPECT_TRUE(pushed_first_operand); EXPECT_TRUE(stepped_to_next_operand); } -#endif // ROCKSDB_LITE TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) { Options options = CurrentOptions(); @@ -815,6 +951,98 @@ TEST_P(PerConfigMergeOperatorPinningTest, Randomized) { VerifyDBFromMap(true_data); } +TEST_F(DBMergeOperatorTest, MaxSuccessiveMergesBaseValues) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreatePutOperator(); + options.max_successive_merges = 1; + options.env = env_; + Reopen(options); + + constexpr char foo[] = "foo"; + constexpr char bar[] = "bar"; + constexpr char baz[] = "baz"; + constexpr char qux[] = "qux"; + constexpr char corge[] = "corge"; + + // No base value + { + constexpr char key[] = "key1"; + + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, foo)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar)); + + PinnableSlice result; + ASSERT_OK( + db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, bar); + + // We expect the second Merge to be converted to a Put because of + // max_successive_merges. 
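The checks that follow use GetAllKeyVersions() from rocksdb/utilities/debug.h (included above); it lists every internal version of the keys in a range, ordered by internal key, so for a single user key the newest entry comes first. A usage sketch with an illustrative key range:

std::vector<KeyVersion> versions;
ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), "a", "z",
                            /*max_num_ikeys=*/100, &versions));
for (const KeyVersion& kv : versions) {
  // kv.type distinguishes kTypeValue, kTypeMerge, kTypeWideColumnEntity, etc.
}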
+ constexpr size_t max_key_versions = 8; + std::vector key_versions; + ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key, + max_key_versions, &key_versions)); + ASSERT_EQ(key_versions.size(), 2); + ASSERT_EQ(key_versions[0].type, kTypeValue); + ASSERT_EQ(key_versions[1].type, kTypeMerge); + } + + // Plain base value + { + constexpr char key[] = "key2"; + + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), key, foo)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, bar)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, baz)); + + PinnableSlice result; + ASSERT_OK( + db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)); + ASSERT_EQ(result, baz); + + // We expect the second Merge to be converted to a Put because of + // max_successive_merges. + constexpr size_t max_key_versions = 8; + std::vector key_versions; + ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key, + max_key_versions, &key_versions)); + ASSERT_EQ(key_versions.size(), 3); + ASSERT_EQ(key_versions[0].type, kTypeValue); + ASSERT_EQ(key_versions[1].type, kTypeMerge); + ASSERT_EQ(key_versions[2].type, kTypeValue); + } + + // Wide-column base value + { + constexpr char key[] = "key3"; + const WideColumns columns{{kDefaultWideColumnName, foo}, {bar, baz}}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), key, + columns)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, qux)); + ASSERT_OK( + db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), key, corge)); + + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), key, + &result)); + const WideColumns expected{{kDefaultWideColumnName, corge}, {bar, baz}}; + ASSERT_EQ(result.columns(), expected); + + // We expect the second Merge to be converted to a PutEntity because of + // max_successive_merges. 
+ constexpr size_t max_key_versions = 8; + std::vector key_versions; + ASSERT_OK(GetAllKeyVersions(db_, db_->DefaultColumnFamily(), key, key, + max_key_versions, &key_versions)); + ASSERT_EQ(key_versions.size(), 3); + ASSERT_EQ(key_versions[0].type, kTypeWideColumnEntity); + ASSERT_EQ(key_versions[1].type, kTypeMerge); + ASSERT_EQ(key_versions[2].type, kTypeWideColumnEntity); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 691081db9dbf..8f60f0051b95 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -19,9 +19,12 @@ #include "rocksdb/convenience.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/stats_history.h" +#include "rocksdb/utilities/options_util.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/random.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -29,7 +32,6 @@ class DBOptionsTest : public DBTestBase { public: DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {} -#ifndef ROCKSDB_LITE std::unordered_map GetMutableDBOptionsMap( const DBOptions& options) { std::string options_str; @@ -76,7 +78,6 @@ class DBOptionsTest : public DBTestBase { auto sanitized_options = SanitizeOptions(dbname_, db_options); return GetMutableDBOptionsMap(sanitized_options); } -#endif // ROCKSDB_LITE }; TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) { @@ -112,7 +113,6 @@ TEST_F(DBOptionsTest, ImmutableVerifySstUniqueIdInManifest) { } // RocksDB lite don't support dynamic options. -#ifndef ROCKSDB_LITE TEST_F(DBOptionsTest, AvoidUpdatingOptions) { Options options; @@ -584,6 +584,7 @@ TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) { TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) { Options options; + options.level_compaction_dynamic_level_bytes = false; options.create_if_missing = true; options.level0_file_num_compaction_trigger = 1000; options.env = env_; @@ -741,6 +742,55 @@ TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) { Close(); } +TEST_F(DBOptionsTest, SetStatsDumpPeriodSecRace) { + // This is a mini-stress test looking for inconsistency between the reported + // state of the option and the behavior in effect for the DB, after the last + // modification to that option (indefinite inconsistency). + std::vector threads; + for (int i = 0; i < 12; i++) { + threads.emplace_back([this, i]() { + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_dump_period_sec", i % 2 ? "100" : "0"}})); + }); + } + + for (auto& t : threads) { + t.join(); + } + + bool stats_dump_set = dbfull()->GetDBOptions().stats_dump_period_sec > 0; + bool task_enabled = dbfull()->TEST_GetPeriodicTaskScheduler().TEST_HasTask( + PeriodicTaskType::kDumpStats); + + ASSERT_EQ(stats_dump_set, task_enabled); +} + +TEST_F(DBOptionsTest, SetOptionsAndFileRace) { + // This is a mini-stress test looking for inconsistency between the reported + // state of the option and what is persisted in the options file, after the + // last modification to that option (indefinite inconsistency). 
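The cross-check below relies on every successful SetOptions()/SetDBOptions() call also rewriting the OPTIONS file, and on LoadLatestOptions() from rocksdb/utilities/options_util.h (included above) reading that file back. A minimal read-back sketch, with db_path standing in for the DB directory:

ConfigOptions config_options;
config_options.env = Env::Default();

DBOptions loaded_db_opts;
std::vector<ColumnFamilyDescriptor> loaded_cf_descs;
Status s = LoadLatestOptions(config_options, db_path, &loaded_db_opts,
                             &loaded_cf_descs);
// On success, loaded_cf_descs[0].options holds the most recently persisted
// column family options, e.g. the ttl value set via SetOptions().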
+ std::vector threads; + for (int i = 0; i < 12; i++) { + threads.emplace_back([this, i]() { + ASSERT_OK(dbfull()->SetOptions({{"ttl", std::to_string(i * 100)}})); + }); + } + + for (auto& t : threads) { + t.join(); + } + + auto setting_in_mem = dbfull()->GetOptions().ttl; + + std::vector cf_descs; + DBOptions db_options; + ConfigOptions cfg; + cfg.env = env_; + ASSERT_OK(LoadLatestOptions(cfg, dbname_, &db_options, &cf_descs, nullptr)); + ASSERT_EQ(cf_descs.size(), 1); + ASSERT_EQ(setting_in_mem, cf_descs[0].options.ttl); +} + TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) { Options options; options.create_if_missing = true; @@ -881,9 +931,13 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Options options; options.compaction_style = kCompactionStyleFIFO; options.env = CurrentOptions().env; + // Default value allows RocksDB to set ttl to 30 days. + ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + + // Disable options.ttl = 0; Reopen(options); - ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl); + ASSERT_EQ(0, dbfull()->GetOptions().ttl); options.ttl = 100; Reopen(options); @@ -893,26 +947,25 @@ TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) { Reopen(options); ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl); - options.ttl = 200; - options.periodic_compaction_seconds = 300; - Reopen(options); - ASSERT_EQ(200, dbfull()->GetOptions().ttl); - + // periodic_compaction_seconds should have no effect + // on FIFO compaction. options.ttl = 500; options.periodic_compaction_seconds = 300; Reopen(options); - ASSERT_EQ(300, dbfull()->GetOptions().ttl); + ASSERT_EQ(500, dbfull()->GetOptions().ttl); } TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { Options options; options.env = CurrentOptions().env; options.compaction_style = kCompactionStyleFIFO; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.write_buffer_size = 10 << 10; // 10KB options.arena_block_size = 4096; options.compression = kNoCompression; options.create_if_missing = true; options.compaction_options_fifo.allow_compaction = false; + options.num_levels = 1; env_->SetMockSleep(); options.env = env_; @@ -940,6 +993,9 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); + ASSERT_EQ(options.statistics->getTickerCount(FIFO_TTL_COMPACTIONS), 0); + ASSERT_EQ(options.statistics->getTickerCount(FIFO_MAX_SIZE_COMPACTIONS), 0); + // Set ttl to 1 minute. So all files should get deleted. ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}})); ASSERT_EQ(dbfull()->GetOptions().ttl, 60); @@ -947,6 +1003,10 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_GT(options.statistics->getTickerCount(FIFO_TTL_COMPACTIONS), 0); + ASSERT_EQ(options.statistics->getTickerCount(FIFO_MAX_SIZE_COMPACTIONS), 0); + ASSERT_OK(options.statistics->Reset()); + // NOTE: Presumed unnecessary and removed: resetting mock time in env // Test dynamically changing compaction_options_fifo.max_table_files_size @@ -970,6 +1030,9 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(NumTableFilesAtLevel(0), 10); + ASSERT_EQ(options.statistics->getTickerCount(FIFO_MAX_SIZE_COMPACTIONS), 0); + ASSERT_EQ(options.statistics->getTickerCount(FIFO_TTL_COMPACTIONS), 0); + // Set max_table_files_size to 12 KB. 
So only 1 file should remain now. ASSERT_OK(dbfull()->SetOptions( {{"compaction_options_fifo", "{max_table_files_size=12288;}"}})); @@ -979,6 +1042,10 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_GT(options.statistics->getTickerCount(FIFO_MAX_SIZE_COMPACTIONS), 0); + ASSERT_EQ(options.statistics->getTickerCount(FIFO_TTL_COMPACTIONS), 0); + ASSERT_OK(options.statistics->Reset()); + // Test dynamically changing compaction_options_fifo.allow_compaction options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB options.ttl = 0; @@ -1012,40 +1079,245 @@ TEST_F(DBOptionsTest, SetFIFOCompactionOptions) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GE(NumTableFilesAtLevel(0), 1); ASSERT_LE(NumTableFilesAtLevel(0), 5); + + // Test dynamically setting `file_temperature_age_thresholds` + ASSERT_TRUE( + dbfull() + ->GetOptions() + .compaction_options_fifo.file_temperature_age_thresholds.empty()); + ASSERT_OK(dbfull()->SetOptions({{"compaction_options_fifo", + "{file_temperature_age_thresholds={{age=10;" + "temperature=kWarm}:{age=30000;" + "temperature=kCold}}}"}})); + auto opts = dbfull()->GetOptions(); + const auto& fifo_temp_opt = + opts.compaction_options_fifo.file_temperature_age_thresholds; + ASSERT_EQ(fifo_temp_opt.size(), 2); + ASSERT_EQ(fifo_temp_opt[0].temperature, Temperature::kWarm); + ASSERT_EQ(fifo_temp_opt[0].age, 10); + ASSERT_EQ(fifo_temp_opt[1].temperature, Temperature::kCold); + ASSERT_EQ(fifo_temp_opt[1].age, 30000); } -TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { - SpecialEnv env(env_); +TEST_F(DBOptionsTest, OffpeakTimes) { Options options; - options.env = &env; + options.create_if_missing = true; + Random rnd(test::RandomSeed()); - options.compaction_readahead_size = 0; - options.level0_file_num_compaction_trigger = 2; - const std::string kValue(1024, 'v'); - Reopen(options); + auto verify_invalid = [&]() { + Status s = DBImpl::TEST_ValidateOptions(options); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + }; - ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size); - ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); - ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); - for (int i = 0; i < 1024; i++) { - ASSERT_OK(Put(Key(i), kValue)); + auto verify_valid = [&]() { + Status s = DBImpl::TEST_ValidateOptions(options); + ASSERT_OK(s); + ASSERT_FALSE(s.IsInvalidArgument()); + }; + std::vector invalid_cases = { + "06:30-", + "-23:30", // Both need to be set + "00:00-00:00", + "06:30-06:30" // Start time cannot be the same as end time + "12:30 PM-23:30", + "12:01AM-11:00PM", // Invalid format + "01:99-22:00", // Invalid value for minutes + "00:00-24:00", // 24:00 is an invalid value + "6-7", + "6:-7", + "06:31.42-7:00", + "6.31:42-7:00", + "6:0-7:", + "15:0.2-3:.7", + ":00-00:02", + "02:00-:00", + "random-value", + "No:No-Hi:Hi", + }; + + std::vector valid_cases = { + "", // Not enabled. Valid case + "06:30-11:30", + "06:30-23:30", + "13:30-14:30", + "00:00-23:59", // Entire Day + "23:30-01:15", // From 11:30PM to 1:15AM next day. Valid case. + "1:0000000000000-2:000000000042", // Weird, but we can parse the int. 
+ }; + + for (std::string invalid_case : invalid_cases) { + options.daily_offpeak_time_utc = invalid_case; + verify_invalid(); } - ASSERT_OK(Flush()); - for (int i = 0; i < 1024 * 2; i++) { - ASSERT_OK(Put(Key(i), kValue)); + for (std::string valid_case : valid_cases) { + options.daily_offpeak_time_utc = valid_case; + verify_valid(); } - ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ASSERT_EQ(256, env_->compaction_readahead_size_); + + auto verify_offpeak_info = [&](bool expected_is_now_off_peak, + int expected_seconds_till_next_offpeak_start, + int now_utc_hour, int now_utc_minute, + int now_utc_second = 0) { + auto mock_clock = std::make_shared(env_->GetSystemClock()); + // Add some extra random days to current time + int days = rnd.Uniform(100); + mock_clock->SetCurrentTime( + days * OffpeakTimeOption::kSecondsPerDay + + now_utc_hour * OffpeakTimeOption::kSecondsPerHour + + now_utc_minute * OffpeakTimeOption::kSecondsPerMinute + now_utc_second); + Status s = DBImpl::TEST_ValidateOptions(options); + ASSERT_OK(s); + auto offpeak_option = OffpeakTimeOption(options.daily_offpeak_time_utc); + int64_t now; + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + auto offpeak_info = offpeak_option.GetOffpeakTimeInfo(now); + ASSERT_EQ(expected_is_now_off_peak, offpeak_info.is_now_offpeak); + ASSERT_EQ(expected_seconds_till_next_offpeak_start, + offpeak_info.seconds_till_next_offpeak_start); + }; + + options.daily_offpeak_time_utc = ""; + verify_offpeak_info(false, 0, 12, 30); + + options.daily_offpeak_time_utc = "06:30-11:30"; + verify_offpeak_info(false, 1 * OffpeakTimeOption::kSecondsPerHour, 5, 30); + verify_offpeak_info(true, 24 * OffpeakTimeOption::kSecondsPerHour, 6, 30); + verify_offpeak_info(true, 20 * OffpeakTimeOption::kSecondsPerHour, 10, 30); + verify_offpeak_info(true, 19 * OffpeakTimeOption::kSecondsPerHour, 11, 30); + verify_offpeak_info(false, 17 * OffpeakTimeOption::kSecondsPerHour, 13, 30); + + options.daily_offpeak_time_utc = "23:30-04:30"; + verify_offpeak_info(false, 17 * OffpeakTimeOption::kSecondsPerHour, 6, 30); + verify_offpeak_info(true, 24 * OffpeakTimeOption::kSecondsPerHour, 23, 30); + verify_offpeak_info(true, + 23 * OffpeakTimeOption::kSecondsPerHour + + 30 * OffpeakTimeOption::kSecondsPerMinute, + 0, 0); + verify_offpeak_info(true, + 22 * OffpeakTimeOption::kSecondsPerHour + + 30 * OffpeakTimeOption::kSecondsPerMinute, + 1, 0); + verify_offpeak_info(true, 19 * OffpeakTimeOption::kSecondsPerHour, 4, 30); + verify_offpeak_info(false, + 18 * OffpeakTimeOption::kSecondsPerHour + + 59 * OffpeakTimeOption::kSecondsPerMinute, + 4, 31); + + // Entire day offpeak + options.daily_offpeak_time_utc = "00:00-23:59"; + verify_offpeak_info(true, 24 * OffpeakTimeOption::kSecondsPerHour, 0, 0); + verify_offpeak_info(true, 12 * OffpeakTimeOption::kSecondsPerHour, 12, 00); + verify_offpeak_info(true, 1 * OffpeakTimeOption::kSecondsPerMinute, 23, 59); + verify_offpeak_info(true, 59, 23, 59, 1); + verify_offpeak_info(true, 1, 23, 59, 59); + + // Start with a valid option + options.daily_offpeak_time_utc = "01:30-04:15"; + DestroyAndReopen(options); + ASSERT_EQ("01:30-04:15", dbfull()->GetDBOptions().daily_offpeak_time_utc); + + int may_schedule_compaction_called = 0; + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::MaybeScheduleFlushOrCompaction:Start", + [&](void*) { may_schedule_compaction_called++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Make sure calling SetDBOptions with invalid option does not change the + // value nor call 
MaybeScheduleFlushOrCompaction() + for (std::string invalid_case : invalid_cases) { + ASSERT_NOK( + dbfull()->SetDBOptions({{"daily_offpeak_time_utc", invalid_case}})); + ASSERT_EQ("01:30-04:15", dbfull() + ->GetVersionSet() + ->offpeak_time_option() + .daily_offpeak_time_utc); + ASSERT_EQ(1 * kSecondInHour + 30 * kSecondInMinute, + dbfull() + ->GetVersionSet() + ->offpeak_time_option() + .daily_offpeak_start_time_utc); + ASSERT_EQ(4 * kSecondInHour + 15 * kSecondInMinute, + dbfull() + ->GetVersionSet() + ->offpeak_time_option() + .daily_offpeak_end_time_utc); + } + ASSERT_EQ(0, may_schedule_compaction_called); + + // Changing to new valid values should call MaybeScheduleFlushOrCompaction() + // and sets the offpeak_time_option in VersionSet + int expected_count = 0; + for (std::string valid_case : valid_cases) { + if (dbfull() + ->GetVersionSet() + ->offpeak_time_option() + .daily_offpeak_time_utc != valid_case) { + expected_count++; + } + ASSERT_OK(dbfull()->SetDBOptions({{"daily_offpeak_time_utc", valid_case}})); + ASSERT_EQ(valid_case, dbfull()->GetDBOptions().daily_offpeak_time_utc); + ASSERT_EQ(valid_case, dbfull() + ->GetVersionSet() + ->offpeak_time_option() + .daily_offpeak_time_utc); + } + ASSERT_EQ(expected_count, may_schedule_compaction_called); + + // Changing to the same value should not call MaybeScheduleFlushOrCompaction() + ASSERT_OK( + dbfull()->SetDBOptions({{"daily_offpeak_time_utc", "06:30-11:30"}})); + may_schedule_compaction_called = 0; + ASSERT_OK( + dbfull()->SetDBOptions({{"daily_offpeak_time_utc", "06:30-11:30"}})); + ASSERT_EQ(0, may_schedule_compaction_called); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); Close(); } +TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) { + for (bool use_direct_reads : {true, false}) { + SpecialEnv env(env_); + Options options; + options.env = &env; + + options.use_direct_reads = use_direct_reads; + options.level0_file_num_compaction_trigger = 2; + const std::string kValue(1024, 'v'); + Status s = TryReopen(options); + if (use_direct_reads && (s.IsNotSupported() || s.IsInvalidArgument())) { + continue; + } else { + ASSERT_OK(s); + } + + ASSERT_EQ(1024 * 1024 * 2, + dbfull()->GetDBOptions().compaction_readahead_size); + ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}})); + ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size); + for (int i = 0; i < 1024; i++) { + ASSERT_OK(Put(Key(i), kValue)); + } + ASSERT_OK(Flush()); + for (int i = 0; i < 1024 * 2; i++) { + ASSERT_OK(Put(Key(i), kValue)); + } + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(256, env_->compaction_readahead_size_); + Close(); + } +} + TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { Options options; options.compaction_style = kCompactionStyleFIFO; options.write_buffer_size = 10 << 10; // 10KB options.create_if_missing = true; options.env = CurrentOptions().env; + options.num_levels = 1; ASSERT_OK(TryReopen(options)); @@ -1066,12 +1338,19 @@ TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { // ttl under compaction_options_fifo. 
ASSERT_OK(dbfull()->SetOptions( {{"compaction_options_fifo", - "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"}, + "{allow_compaction=true;max_table_files_size=1024;ttl=731;file_" + "temperature_age_thresholds={temperature=kCold;age=12345}}"}, {"ttl", "60"}})); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction, true); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, 1024); + auto opts = dbfull()->GetOptions(); + const auto& file_temp_age = + opts.compaction_options_fifo.file_temperature_age_thresholds; + ASSERT_EQ(file_temp_age.size(), 1); + ASSERT_EQ(file_temp_age[0].temperature, Temperature::kCold); + ASSERT_EQ(file_temp_age[0].age, 12345); ASSERT_EQ(dbfull()->GetOptions().ttl, 60); // Put ttl as the first option inside compaction_options_fifo. That works as @@ -1084,6 +1363,9 @@ TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) { true); ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size, 1024); + ASSERT_EQ(file_temp_age.size(), 1); + ASSERT_EQ(file_temp_age[0].temperature, Temperature::kCold); + ASSERT_EQ(file_temp_age[0].age, 12345); ASSERT_EQ(dbfull()->GetOptions().ttl, 191); } @@ -1148,7 +1430,6 @@ TEST_F(DBOptionsTest, ChangeCompression) { SyncPoint::GetInstance()->DisableProcessing(); } -#endif // ROCKSDB_LITE TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) { // Verify the bottommost compression options still take effect even when the @@ -1210,6 +1491,90 @@ TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) { ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level); } +TEST_F(DBOptionsTest, FIFOTemperatureAgeThresholdValidation) { + Options options = CurrentOptions(); + Destroy(options); + + options.num_levels = 1; + options.compaction_style = kCompactionStyleFIFO; + options.max_open_files = -1; + // elements are not sorted + // During DB open + options.compaction_options_fifo.file_temperature_age_thresholds.push_back( + {Temperature::kCold, 1000}); + options.compaction_options_fifo.file_temperature_age_thresholds.push_back( + {Temperature::kWarm, 500}); + Status s = TryReopen(options); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_TRUE(std::strstr( + s.getState(), + "Option file_temperature_age_thresholds requires elements to be sorted " + "in increasing order with respect to `age` field.")); + // Dynamically set option + options.compaction_options_fifo.file_temperature_age_thresholds.pop_back(); + ASSERT_OK(TryReopen(options)); + s = db_->SetOptions({{"compaction_options_fifo", + "{file_temperature_age_thresholds={{temperature=kCold;" + "age=1000000}:{temperature=kWarm;age=1}}}"}}); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_TRUE(std::strstr( + s.getState(), + "Option file_temperature_age_thresholds requires elements to be sorted " + "in increasing order with respect to `age` field.")); + + // not single level + // During DB open + options.num_levels = 2; + s = TryReopen(options); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_TRUE(std::strstr(s.getState(), + "Option file_temperature_age_thresholds is only " + "supported when num_levels = 1.")); + // Dynamically set option + options.compaction_options_fifo.file_temperature_age_thresholds.clear(); + DestroyAndReopen(options); + s = db_->SetOptions( + {{"compaction_options_fifo", + "{file_temperature_age_thresholds={temperature=kCold;age=1000}}"}}); + ASSERT_TRUE(s.IsNotSupported()); + ASSERT_TRUE(std::strstr(s.getState(), + "Option file_temperature_age_thresholds is only " + "supported when num_levels = 
1.")); +} + +TEST_F(DBOptionsTest, TempOptionsFailTest) { + std::shared_ptr fs; + std::unique_ptr env; + + fs.reset(new FaultInjectionTestFS(env_->GetFileSystem())); + env = NewCompositeEnv(fs); + Options options = CurrentOptions(); + options.env = env.get(); + + SyncPoint::GetInstance()->SetCallBack( + "PersistRocksDBOptions:create", + [&](void* /*arg*/) { fs->SetFilesystemActive(false); }); + SyncPoint::GetInstance()->SetCallBack( + "PersistRocksDBOptions:written", + [&](void* /*arg*/) { fs->SetFilesystemActive(true); }); + + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_NOK(TryReopen(options)); + SyncPoint::GetInstance()->DisableProcessing(); + + std::vector filenames; + ASSERT_OK(env_->GetChildren(dbname_, &filenames)); + uint64_t number; + FileType type; + bool found_temp_file = false; + for (size_t i = 0; i < filenames.size(); i++) { + if (ParseFileName(filenames[i], &number, &type) && type == kTempFile) { + found_temp_file = true; + } + } + ASSERT_FALSE(found_temp_file); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 735b29ea5303..e761f96d9ce4 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -13,6 +13,7 @@ #include #include "db/db_test_util.h" +#include "db/write_stall_stats.h" #include "options/cf_options.h" #include "port/stack_trace.h" #include "rocksdb/listener.h" @@ -55,7 +56,6 @@ class DBPropertiesTest : public DBTestBase { } }; -#ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, Empty) { do { Options options; @@ -107,12 +107,12 @@ TEST_F(DBPropertiesTest, Empty) { dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); ASSERT_EQ("0", num); - ASSERT_OK(db_->EnableFileDeletions(false)); + ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); ASSERT_EQ("0", num); - ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_OK(db_->EnableFileDeletions(/*force=*/true)); ASSERT_TRUE( dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); ASSERT_EQ("1", num); @@ -188,40 +188,6 @@ TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) { } namespace { -void ResetTableProperties(TableProperties* tp) { - tp->data_size = 0; - tp->index_size = 0; - tp->filter_size = 0; - tp->raw_key_size = 0; - tp->raw_value_size = 0; - tp->num_data_blocks = 0; - tp->num_entries = 0; - tp->num_deletions = 0; - tp->num_merge_operands = 0; - tp->num_range_deletions = 0; -} - -void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { - double dummy_double; - std::replace(tp_string.begin(), tp_string.end(), ';', ' '); - std::replace(tp_string.begin(), tp_string.end(), '=', ' '); - ResetTableProperties(tp); - sscanf(tp_string.c_str(), - "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64 - " # merge operands %" SCNu64 " # range deletions %" SCNu64 - " raw key size %" SCNu64 - " raw average key size %lf " - " raw value size %" SCNu64 - " raw average value size %lf " - " data block size %" SCNu64 " index block size (user-key? %" SCNu64 - ", delta-value? 
%" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, - &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, - &tp->index_size, &tp->filter_size); -} - void VerifySimilar(uint64_t a, uint64_t b, double bias) { ASSERT_EQ(a == 0U, b == 0U); if (a == 0) { @@ -1112,19 +1078,20 @@ TEST_F(DBPropertiesTest, EstimateCompressionRatio) { ASSERT_GT(CompressionRatioAtLevel(1), 10.0); } -#endif // ROCKSDB_LITE class CountingUserTblPropCollector : public TablePropertiesCollector { public: const char* Name() const override { return "CountingUserTblPropCollector"; } Status Finish(UserCollectedProperties* properties) override { + assert(!finish_called_); std::string encoded; PutVarint32(&encoded, count_); *properties = UserCollectedProperties{ {"CountingUserTblPropCollector", message_}, {"Count", encoded}, }; + finish_called_ = true; return Status::OK(); } @@ -1136,12 +1103,14 @@ class CountingUserTblPropCollector : public TablePropertiesCollector { } UserCollectedProperties GetReadableProperties() const override { + assert(finish_called_); return UserCollectedProperties{}; } private: std::string message_ = "Rocksdb"; uint32_t count_ = 0; + bool finish_called_ = false; }; class CountingUserTblPropCollectorFactory @@ -1263,7 +1232,6 @@ class BlockCountingTablePropertiesCollectorFactory } }; -#ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = (1 << 30); @@ -1303,7 +1271,6 @@ TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_GT(collector_factory->num_created_, 0U); } -#endif // ROCKSDB_LITE TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) { Options options = CurrentOptions(); @@ -1365,7 +1332,6 @@ TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) { ASSERT_GT(collector_factory->num_created_, 0U); } -#ifndef ROCKSDB_LITE TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) { Random rnd(301); @@ -1743,7 +1709,6 @@ TEST_F(DBPropertiesTest, SstFilesSize) { options.env = CurrentOptions().env; options.disable_auto_compactions = true; options.listeners.push_back(listener); - options.level_compaction_dynamic_level_bytes = true; Reopen(options); for (int i = 0; i < 10; i++) { @@ -1754,22 +1719,35 @@ TEST_F(DBPropertiesTest, SstFilesSize) { ASSERT_OK(Delete("key" + std::to_string(i))); } ASSERT_OK(Flush()); + uint64_t sst_size; - bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size); - ASSERT_TRUE(ok); - ASSERT_GT(sst_size, 0); - ok = db_->GetIntProperty(DB::Properties::kLiveNonBottommostSstFilesSize, - &sst_size); - ASSERT_TRUE(ok); + ASSERT_TRUE( + db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size)); ASSERT_GT(sst_size, 0); listener->size_before_compaction = sst_size; + + uint64_t obsolete_sst_size; + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kObsoleteSstFilesSize, + &obsolete_sst_size)); + ASSERT_EQ(obsolete_sst_size, 0); + + // Hold files from being deleted so we can test property for size of obsolete + // SST files. + ASSERT_OK(db_->DisableFileDeletions()); + // Compact to clean all keys and trigger listener. 
ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_TRUE(listener->callback_triggered); - ok = db_->GetIntProperty(DB::Properties::kLiveNonBottommostSstFilesSize, - &sst_size); - ASSERT_TRUE(ok); - ASSERT_EQ(sst_size, 0); + + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kObsoleteSstFilesSize, + &obsolete_sst_size)); + ASSERT_EQ(obsolete_sst_size, sst_size); + + // Let the obsolete files be deleted. + ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); + ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kObsoleteSstFilesSize, + &obsolete_sst_size)); + ASSERT_EQ(obsolete_sst_size, 0); } TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) { @@ -2120,6 +2098,181 @@ TEST_F(DBPropertiesTest, GetMapPropertyBlockCacheEntryStats) { ASSERT_EQ(3 * kNumCacheEntryRoles + 4, values.size()); } +TEST_F(DBPropertiesTest, WriteStallStatsSanityCheck) { + for (uint32_t i = 0; i < static_cast(WriteStallCause::kNone); ++i) { + WriteStallCause cause = static_cast(i); + const std::string& str = WriteStallCauseToHyphenString(cause); + ASSERT_TRUE(!str.empty()) + << "Please ensure mapping from `WriteStallCause` to " + "`WriteStallCauseToHyphenString` is complete"; + if (cause == WriteStallCause::kCFScopeWriteStallCauseEnumMax || + cause == WriteStallCause::kDBScopeWriteStallCauseEnumMax) { + ASSERT_EQ(str, InvalidWriteStallHyphenString()) + << "Please ensure order in `WriteStallCauseToHyphenString` is " + "consistent with `WriteStallCause`"; + } + } + + for (uint32_t i = 0; i < static_cast(WriteStallCondition::kNormal); + ++i) { + WriteStallCondition condition = static_cast(i); + const std::string& str = WriteStallConditionToHyphenString(condition); + ASSERT_TRUE(!str.empty()) + << "Please ensure mapping from `WriteStallCondition` to " + "`WriteStallConditionToHyphenString` is complete"; + } + + for (uint32_t i = 0; i < static_cast(WriteStallCause::kNone); ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + + if (isCFScopeWriteStallCause(cause)) { + ASSERT_TRUE(InternalCFStat(cause, condition) != + InternalStats::INTERNAL_CF_STATS_ENUM_MAX) + << "Please ensure the combination of WriteStallCause(" + + std::to_string(static_cast(cause)) + + ") + WriteStallCondition(" + + std::to_string(static_cast(condition)) + + ") is correctly mapped to a valid `InternalStats` or bypass " + "its check in this test"; + } else if (isDBScopeWriteStallCause(cause)) { + InternalStats::InternalDBStatsType internal_db_stat = + InternalDBStat(cause, condition); + if (internal_db_stat == InternalStats::kIntStatsNumMax) { + ASSERT_TRUE(cause == WriteStallCause::kWriteBufferManagerLimit && + condition == WriteStallCondition::kDelayed) + << "Please ensure the combination of WriteStallCause(" + + std::to_string(static_cast(cause)) + + ") + WriteStallCondition(" + + std::to_string(static_cast(condition)) + + ") is correctly mapped to a valid `InternalStats` or " + "bypass its check in this test"; + } + } else if (cause != WriteStallCause::kCFScopeWriteStallCauseEnumMax && + cause != WriteStallCause::kDBScopeWriteStallCauseEnumMax) { + ASSERT_TRUE(false) << "Please ensure the WriteStallCause(" + + std::to_string(static_cast(cause)) + + ") is either CF-scope or DB-scope write " + "stall cause in enum `WriteStallCause`"; + } + } + } +} +TEST_F(DBPropertiesTest, GetMapPropertyWriteStallStats) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"heavy_write_cf"}, options); + + for 
(auto test_cause : {WriteStallCause::kWriteBufferManagerLimit, + WriteStallCause::kMemtableLimit}) { + if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { + options.write_buffer_manager.reset( + new WriteBufferManager(100000, nullptr, true)); + } else if (test_cause == WriteStallCause::kMemtableLimit) { + options.max_write_buffer_number = 2; + options.disable_auto_compactions = true; + } + ReopenWithColumnFamilies({"default", "heavy_write_cf"}, options); + + // Assert initial write stall stats are all 0 + std::map db_values; + ASSERT_TRUE(dbfull()->GetMapProperty(DB::Properties::kDBWriteStallStats, + &db_values)); + ASSERT_EQ(std::stoi(db_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)]), + 0); + + for (int cf = 0; cf <= 1; ++cf) { + std::map cf_values; + ASSERT_TRUE(dbfull()->GetMapProperty( + handles_[cf], DB::Properties::kCFWriteStallStats, &cf_values)); + ASSERT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalStops()]), 0); + ASSERT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalDelays()]), 0); + } + + // Pause flush thread to help coerce write stall + std::unique_ptr sleeping_task( + new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::HIGH); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task.get(), Env::Priority::HIGH); + sleeping_task->WaitUntilSleeping(); + + // Coerce write stall + if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { + ASSERT_OK(dbfull()->Put( + WriteOptions(), handles_[1], Key(1), + DummyString(options.write_buffer_manager->buffer_size()))); + + WriteOptions wo; + wo.no_slowdown = true; + Status s = dbfull()->Put( + wo, handles_[1], Key(2), + DummyString(options.write_buffer_manager->buffer_size())); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); + } else if (test_cause == WriteStallCause::kMemtableLimit) { + FlushOptions fo; + fo.allow_write_stall = true; + fo.wait = false; + + ASSERT_OK( + dbfull()->Put(WriteOptions(), handles_[1], Key(1), DummyString(1))); + ASSERT_OK(dbfull()->Flush(fo, handles_[1])); + + ASSERT_OK( + dbfull()->Put(WriteOptions(), handles_[1], Key(2), DummyString(1))); + ASSERT_OK(dbfull()->Flush(fo, handles_[1])); + } + + if (test_cause == WriteStallCause::kWriteBufferManagerLimit) { + db_values.clear(); + EXPECT_TRUE(dbfull()->GetMapProperty(DB::Properties::kDBWriteStallStats, + &db_values)); + EXPECT_EQ(std::stoi(db_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)]), + 1); + // `WriteStallCause::kWriteBufferManagerLimit` should not result in any + // CF-scope write stall stats changes + for (int cf = 0; cf <= 1; ++cf) { + std::map cf_values; + EXPECT_TRUE(dbfull()->GetMapProperty( + handles_[cf], DB::Properties::kCFWriteStallStats, &cf_values)); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalStops()]), + 0); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalDelays()]), + 0); + } + } else if (test_cause == WriteStallCause::kMemtableLimit) { + for (int cf = 0; cf <= 1; ++cf) { + std::map cf_values; + EXPECT_TRUE(dbfull()->GetMapProperty( + handles_[cf], DB::Properties::kCFWriteStallStats, &cf_values)); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalStops()]), + cf == 1 ? 
1 : 0); + EXPECT_EQ( + std::stoi(cf_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kMemtableLimit, + WriteStallCondition::kStopped)]), + cf == 1 ? 1 : 0); + EXPECT_EQ(std::stoi(cf_values[WriteStallStatsMapKeys::TotalDelays()]), + 0); + EXPECT_EQ( + std::stoi(cf_values[WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kMemtableLimit, + WriteStallCondition::kDelayed)]), + 0); + } + } + + sleeping_task->WakeUp(); + sleeping_task->WaitUntilDone(); + } +} + namespace { std::string PopMetaIndexKey(InternalIterator* meta_iter) { Status s = meta_iter->status(); @@ -2171,8 +2324,9 @@ TEST_F(DBPropertiesTest, TableMetaIndexKeys) { // Read metaindex BlockContents bc; - ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U, - ImmutableOptions(options), &bc)); + const ReadOptions read_options; + ASSERT_OK(ReadMetaIndexBlockInFile( + r.get(), file_size, 0U, ImmutableOptions(options), read_options, &bc)); Block metaindex_block(std::move(bc)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewMetaIterator()); @@ -2198,13 +2352,15 @@ TEST_F(DBPropertiesTest, TableMetaIndexKeys) { EXPECT_EQ("rocksdb.hashindex.prefixes", PopMetaIndexKey(meta_iter.get())); } + if (bbto->format_version >= 6) { + EXPECT_EQ("rocksdb.index", PopMetaIndexKey(meta_iter.get())); + } } EXPECT_EQ("rocksdb.properties", PopMetaIndexKey(meta_iter.get())); EXPECT_EQ("NOT_FOUND", PopMetaIndexKey(meta_iter.get())); } while (ChangeOptions()); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index bfabc42fb674..003117eec9a8 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -27,9 +27,6 @@ class DBRangeDelTest : public DBTestBase { } }; -// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not -// supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) { // TODO: figure out why MmapReads trips the iterator pinning assertion in // RangeDelAggregator. Ideally it would be supported; otherwise it should at @@ -298,7 +295,6 @@ TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) { ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound()); } } -#endif // ROCKSDB_LITE TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250; @@ -335,8 +331,6 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) { db_->ReleaseSnapshot(snapshot); } -// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) { const int kNumPerFile = 100, kNumFiles = 4; Options opts = CurrentOptions(); @@ -517,7 +511,6 @@ TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) { std::numeric_limits::max() /* max_file_num_to_ignore */, "" /*trim_ts*/)); } -#endif // ROCKSDB_LITE TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) { const int kNumPerFile = 3, kNumFiles = 3; @@ -589,8 +582,6 @@ TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) { ASSERT_EQ(expected, actual); } -// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) { // During compaction to bottommost level, verify range tombstones older than // the oldest snapshot are removed, while others are preserved. @@ -691,6 +682,7 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) { // soon as its refcount drops to zero. 
bbto.block_cache->EraseUnRefEntries(); } + ASSERT_OK(iter->status()); ASSERT_EQ(kNum, expected); delete iter; db_->ReleaseSnapshot(snapshot); @@ -849,6 +841,7 @@ TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) { ++expected; } } + ASSERT_OK(iter->status()); ASSERT_EQ(kNum, expected); delete iter; } @@ -917,6 +910,7 @@ TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) { std::string key; ASSERT_EQ(expected[i], iter->key()); } + ASSERT_OK(iter->status()); ASSERT_EQ(3, i); delete iter; db_->ReleaseSnapshot(snapshot); @@ -1391,6 +1385,7 @@ TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) { for (; iter->Valid(); iter->Next()) { ++keys_found; } + EXPECT_OK(iter->status()); delete iter; return keys_found; }; @@ -1494,6 +1489,7 @@ TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) { for (; iter->Valid(); iter->Prev()) { ++keys_found; } + ASSERT_OK(iter->status()); delete iter; ASSERT_EQ(kNumKeys, keys_found); @@ -1528,6 +1524,7 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) { iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); delete iter; db_->ReleaseSnapshot(snapshot); @@ -1573,6 +1570,7 @@ TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) { iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); } TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { @@ -1670,6 +1668,220 @@ TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) { ASSERT_EQ(1, num_range_deletions); } +TEST_F(DBRangeDelTest, LevelCompactOutputCutAtRangeTombstoneForTtlFiles) { + Options options = CurrentOptions(); + options.compression = kNoCompression; + options.compaction_pri = kMinOverlappingRatio; + options.disable_auto_compactions = true; + options.ttl = 24 * 60 * 60; // 24 hours + options.target_file_size_base = 8 << 10; + env_->SetMockSleep(); + options.env = env_; + DestroyAndReopen(options); + + Random rnd(301); + // Fill some data so that future compactions are not bottommost level + // compaction, and hence they would try cut around files for ttl + for (int i = 5; i < 10; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(3); + ASSERT_EQ("0,0,0,1", FilesPerLevel()); + + for (int i = 5; i < 10; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomString(1 << 10))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_EQ("0,1,0,1", FilesPerLevel()); + + env_->MockSleepForSeconds(20 * 60 * 60); + // Prevent range tombstone from being dropped during compaction. 
+ const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(11), Key(12))); + ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10))); + ASSERT_OK(Flush()); + ASSERT_EQ("1,1,0,1", FilesPerLevel()); + // L0 file is new, L1 and L3 file are old and qualified for TTL + env_->MockSleepForSeconds(10 * 60 * 60); + MoveFilesToLevel(1); + // L1 output should be cut into 3 files: + // File 0: Key(0) + // File 1: (qualified for TTL): Key(5) - Key(10) + // File 1: DeleteRange [11, 12) + ASSERT_EQ("0,3,0,1", FilesPerLevel()); + db_->ReleaseSnapshot(snapshot); +} + +// Test SST partitioner cut after every single key +class SingleKeySstPartitioner : public SstPartitioner { + public: + const char* Name() const override { return "SingleKeySstPartitioner"; } + + PartitionerResult ShouldPartition( + const PartitionerRequest& /*request*/) override { + return kRequired; + } + + bool CanDoTrivialMove(const Slice& /*smallest_user_key*/, + const Slice& /*largest_user_key*/) override { + return false; + } +}; + +class SingleKeySstPartitionerFactory : public SstPartitionerFactory { + public: + static const char* kClassName() { return "SingleKeySstPartitionerFactory"; } + const char* Name() const override { return kClassName(); } + + std::unique_ptr CreatePartitioner( + const SstPartitioner::Context& /* context */) const override { + return std::unique_ptr(new SingleKeySstPartitioner()); + } +}; + +TEST_F(DBRangeDelTest, CompactionEmitRangeTombstoneToSSTPartitioner) { + Options options = CurrentOptions(); + auto factory = std::make_shared(); + options.sst_partitioner_factory = factory; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + Random rnd(301); + // range deletion keys are not processed when compacting to bottommost level, + // so creating a file at older level to make the next compaction not + // bottommost level + ASSERT_OK(db_->Put(WriteOptions(), Key(4), rnd.RandomString(10))); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + + ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(10))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2), + Key(5))); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(1); + // SSTPartitioner decides to cut when range tombstone start key is passed to + // it. Note that the range tombstone [2, 5) itself span multiple keys, but we + // are not able to partition within its range yet. + ASSERT_EQ(2, NumTableFilesAtLevel(1)); +} + +TEST_F(DBRangeDelTest, OversizeCompactionGapBetweenPointKeyAndTombstone) { + // L2 has 2 files + // L2_0: 0, 1, 2, 3, 4 + // L2_1: 5, 6, 7 + // L0 has 1 file + // L0: 0, [5, 6), 8 + // max_compaction_bytes is less than the size of L2_0 and L2_1. + // When compacting L0 into L1, it should split into 3 files: + // compaction output should cut before key 5 and key 8 to + // limit future compaction size. 
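A note on the option driving the three Oversize* tests that start here: roughly speaking, max_compaction_bytes bounds how much data one compaction should read, and compaction output files are cut early when extending them would force some future compaction over that output to exceed the limit. These tests pin both knobs to small values so that each ~12KB L2 file is already over the limit; a configuration sketch with the values copied from the tests below:

Options options;
options.disable_auto_compactions = true;   // compactions are driven manually
options.target_file_size_base = 9 * 1024;  // aim for ~9KB output files
// Cut compaction outputs before they would drag more than ~9KB of
// next-level data into a single future compaction.
options.max_compaction_bytes = 9 * 1024;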
+ const int kNumPerFile = 4, kNumFiles = 2; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + std::vector values; + for (int j = 0; j < kNumPerFile; j++) { + values.push_back(rnd.RandomString(3 << 10)); + ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(5), + Key(6))); + ASSERT_OK(Put(Key(8), rnd.RandomString(1 << 10))); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); +} + +TEST_F(DBRangeDelTest, OversizeCompactionGapBetweenTombstone) { + // L2 has two files + // L2_0: 0, 1, 2, 3, 4. L2_1: 5, 6, 7 + // L0 has two range tombstones [0, 1), [7, 8). + // max_compaction_bytes is less than the size of L2_0. + // When compacting L0 into L1, the two range tombstones should be + // split into two files. + const int kNumPerFile = 4, kNumFiles = 2; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < kNumFiles; ++i) { + std::vector values; + // Write 12K (4 values, each 3K) + for (int j = 0; j < kNumPerFile; j++) { + values.push_back(rnd.RandomString(3 << 10)); + ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j])); + } + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(1))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7), + Key(8))); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + // This is L0 -> L1 compaction + // The two range tombstones are broken up into two output files + // to limit compaction size. + ASSERT_EQ(2, NumTableFilesAtLevel(1)); +} + +TEST_F(DBRangeDelTest, OversizeCompactionPointKeyWithinRangetombstone) { + // L2 has two files + // L2_0: 0, 1, 2, 3, 4. L2_1: 6, 7, 8 + // L0 has [0, 9) and point key 5 + // max_compaction_bytes is less than the size of L2_0. + // When compacting L0 into L1, the compaction should cut at point key 5. 
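These range deletion tests use DB::DeleteRange(), whose range is [begin, end) with an exclusive upper bound; that is why DeleteRange(Key(0), Key(9)) below covers keys 0 through 8 but not Key(9). A minimal usage sketch with illustrative keys:

// Deletes every key in ["user_0000", "user_1000"); the end key itself survives.
Status s = db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(),
                           "user_0000", "user_1000");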
+ Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.target_file_size_base = 9 * 1024; + options.max_compaction_bytes = 9 * 1024; + DestroyAndReopen(options); + Random rnd(301); + for (int i = 0; i < 9; ++i) { + if (i == 5) { + ++i; + } + ASSERT_OK(Put(Key(i), rnd.RandomString(3 << 10))); + } + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + MoveFilesToLevel(2); + ASSERT_EQ(2, NumTableFilesAtLevel(2)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(9))); + ASSERT_OK(Put(Key(5), rnd.RandomString(1 << 10))); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, + true /* disallow_trivial_move */)); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); +} + TEST_F(DBRangeDelTest, OverlappedTombstones) { const int kNumPerFile = 4, kNumFiles = 2; Options options = CurrentOptions(); @@ -1773,6 +1985,7 @@ TEST_F(DBRangeDelTest, IteratorRefresh) { ASSERT_EQ("key1", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -2102,6 +2315,7 @@ TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) { options.compression = kNoCompression; options.disable_auto_compactions = true; options.target_file_size_base = 2 * 1024; + options.level_compaction_dynamic_file_size = false; DestroyAndReopen(options); Random rnd(301); @@ -2517,7 +2731,7 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTest) { options.compression = kNoCompression; options.disable_auto_compactions = true; options.target_file_size_base = 3 * 1024; - options.max_compaction_bytes = 1024; + options.max_compaction_bytes = 2048; DestroyAndReopen(options); // L2 @@ -2563,7 +2777,7 @@ TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) { options.compression = kNoCompression; options.disable_auto_compactions = true; options.target_file_size_base = 3 * 1024; - options.max_compaction_bytes = 1024; + options.max_compaction_bytes = 3 * 1024; DestroyAndReopen(options); // L2 @@ -3024,8 +3238,537 @@ TEST_F(DBRangeDelTest, DoubleCountRangeTombstoneCompensatedSize) { db_->ReleaseSnapshot(snapshot); } -#endif // ROCKSDB_LITE +TEST_F(DBRangeDelTest, AddRangeDelsSameLowerAndUpperBound) { + // Test for an edge case where CompactionOutputs::AddRangeDels() + // is called with an empty range: `range_tombstone_lower_bound_` is not empty + // and have the same user_key and sequence number as `next_table_min_key. + // This used to cause file's smallest and largest key to be incorrectly set + // such that smallest > largest, and fail some assertions in iterator and/or + // assertion in VersionSet::ApproximateSize(). + Options opts = CurrentOptions(); + opts.disable_auto_compactions = true; + opts.target_file_size_base = 1 << 10; + opts.level_compaction_dynamic_file_size = false; + DestroyAndReopen(opts); + + Random rnd(301); + // Create file at bottommost level so the manual compaction below is + // non-bottommost level and goes through code path like compensate range + // tombstone size. + ASSERT_OK(Put(Key(1), "v1")); + ASSERT_OK(Put(Key(4), "v2")); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + + ASSERT_OK(Put(Key(1), rnd.RandomString(4 << 10))); + ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10))); + // So Key(3) does not get dropped. 
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(4)));
+  ASSERT_OK(Flush());
+
+  ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Put(Key(4), rnd.RandomString(4 << 10)));
+  ASSERT_OK(Flush());
+
+  MoveFilesToLevel(1);
+  // Each file will have two keys, with Key(3) straddling between the two
+  // files.
+  // File 1: Key(1)@1, Key(3)@6, DeleteRange ends at Key(3)@6
+  // File 2: Key(3)@4, Key(4)@7, DeleteRange starts from Key(3)@4
+  ASSERT_EQ(NumTableFilesAtLevel(1), 2);
+
+  // Manually update compaction output file cutting decisions
+  // to cut before range tombstone sentinel Key(3)@4
+  // and the point key Key(3)@4 itself
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionOutputs::ShouldStopBefore::manual_decision", [opts](void* p) {
+        auto* pair = (std::pair<bool*, const Slice>*)p;
+        if ((opts.comparator->Compare(ExtractUserKey(pair->second), Key(3)) ==
+             0) &&
+            (GetInternalKeySeqno(pair->second) <= 4)) {
+          *(pair->first) = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::string begin_key = Key(0);
+  std::string end_key = Key(5);
+  Slice begin_slice{begin_key};
+  Slice end_slice{end_key};
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(
+          db_->DefaultColumnFamily())
+          ->cfd(),
+      1, 2, CompactRangeOptions(), &begin_slice, &end_slice, true,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/));
+  // iterate through to check if any assertion breaks
+  std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
+  iter->SeekToFirst();
+  std::vector<int> expected{1, 3, 4};
+  for (auto i : expected) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(i));
+    iter->Next();
+  }
+  ASSERT_TRUE(iter->status().ok() && !iter->Valid());
+  db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, AddRangeDelsSingleUserKeyTombstoneOnlyFile) {
+  // Test for an edge case where CompactionOutputs::AddRangeDels()
+  // is called with an SST file that has no point keys, and where
+  // the lower bound and upper bound have the same user key.
+  // This could cause a file's smallest and largest key to be incorrectly set
+  // such that smallest > largest, and fail assertions in the iterator and/or
+  // an assertion in VersionSet::ApproximateSize().
+  Options opts = CurrentOptions();
+  opts.disable_auto_compactions = true;
+  opts.target_file_size_base = 1 << 10;
+  opts.level_compaction_dynamic_file_size = false;
+  DestroyAndReopen(opts);
+  Random rnd(301);
+  // Create a file at the bottommost level so the manual compaction below is
+  // non-bottommost and goes through code paths like compensating range
+  // tombstone size.
+  ASSERT_OK(Put(Key(1), "v1"));
+  ASSERT_OK(Put(Key(4), "v2"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(6);
+
+  ASSERT_OK(Put(Key(1), rnd.RandomString(10)));
+  // Key(3)@4
+  ASSERT_OK(Put(Key(3), rnd.RandomString(10)));
+  const Snapshot* snapshot1 = db_->GetSnapshot();
+  // Key(3)@5
+  ASSERT_OK(Put(Key(3), rnd.RandomString(10)));
+  const Snapshot* snapshot2 = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+                             Key(4)));
+  // Key(3)@7
+  ASSERT_OK(Put(Key(3), rnd.RandomString(10)));
+  ASSERT_OK(Flush());
+
+  // L0 -> L1 compaction: cut output into two files:
+  // File 1: Key(1), Key(3)@7, Range tombstone ends at Key(3)@7
+  // File 2: Key(3)@5, Key(3)@4, Range tombstone starts from Key(3)@5
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionOutputs::ShouldStopBefore::manual_decision", [opts](void* p) {
+        auto* pair = (std::pair<bool*, const Slice>*)p;
+        if ((opts.comparator->Compare(ExtractUserKey(pair->second), Key(3)) ==
+             0) &&
+            (GetInternalKeySeqno(pair->second) <= 6)) {
+          *(pair->first) = true;
+          SyncPoint::GetInstance()->DisableProcessing();
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  std::string begin_key = Key(0);
+  std::string end_key = Key(5);
+  Slice begin_slice{begin_key};
+  Slice end_slice{end_key};
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(
+          db_->DefaultColumnFamily())
+          ->cfd(),
+      0, 1, CompactRangeOptions(), &begin_slice, &end_slice, true,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/));
+  ASSERT_EQ(NumTableFilesAtLevel(1), 2);
+
+  // L1 -> L2 compaction: drop the snapshot protecting Key(3)@5.
+  // Let ShouldStopBefore() return true for Key(3)@5 (delete range sentinel)
+  // and Key(3)@4.
+  // Output should have two files:
+  // File 1: Key(1), Key(3)@7, range tombstone ends at Key(3)@7
+  // File dropped: range tombstone only file (from Key(3)@5 to Key(3)@4)
+  // File 2: Range tombstone starting from Key(3)@4, Key(3)@4
+  db_->ReleaseSnapshot(snapshot2);
+  SyncPoint::GetInstance()->SetCallBack(
+      "CompactionOutputs::ShouldStopBefore::manual_decision", [opts](void* p) {
+        auto* pair = (std::pair<bool*, const Slice>*)p;
+        if ((opts.comparator->Compare(ExtractUserKey(pair->second), Key(3)) ==
+             0) &&
+            (GetInternalKeySeqno(pair->second) <= 6)) {
+          *(pair->first) = true;
+        }
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+  ASSERT_OK(dbfull()->RunManualCompaction(
+      static_cast_with_check<ColumnFamilyHandleImpl>(
+          db_->DefaultColumnFamily())
+          ->cfd(),
+      1, 2, CompactRangeOptions(), &begin_slice, &end_slice, true,
+      true /* disallow_trivial_move */,
+      std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+      "" /*trim_ts*/));
+  ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+  // iterate through to check if any assertion breaks
+  std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
+  iter->SeekToFirst();
+  std::vector<int> expected{1, 3, 4};
+  for (auto i : expected) {
+    ASSERT_TRUE(iter->Valid());
+    ASSERT_EQ(iter->key(), Key(i));
+    iter->Next();
+  }
+  ASSERT_TRUE(iter->status().ok() && !iter->Valid());
+  db_->ReleaseSnapshot(snapshot1);
+}
+
+TEST_F(DBRangeDelTest, NonBottommostCompactionDropRangetombstone) {
+  // L0: file 1: [DeleteRange[4, 5)], file 2: [3, 6, DeleteRange[8, 9)]
+  // L6: file 1: [2, 3], file 2: [7, 8]
+  // When compacting the two L0 files to L1, the compaction is non-bottommost
+  // since the compaction key range overlaps with L6 file 1. The range
+  // tombstone [4, 5) should be dropped since it does not overlap with any
+  // file in lower levels.
The range tombstone [8, 9) should not be dropped. + Options opts = CurrentOptions(); + opts.level_compaction_dynamic_level_bytes = false; + opts.num_levels = 7; + opts.level0_file_num_compaction_trigger = 3; + DestroyAndReopen(opts); + + Random rnd(301); + // L6 file 1 + ASSERT_OK(Put(Key(2), rnd.RandomString(100))); + ASSERT_OK(Put(Key(3), rnd.RandomString(100))); + ASSERT_OK(Flush()); + // L6 file 2 + ASSERT_OK(Put(Key(7), rnd.RandomString(100))); + ASSERT_OK(Put(Key(8), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + ASSERT_EQ(NumTableFilesAtLevel(6), 2); + // L0 file 1 + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), + Key(5))); + ASSERT_OK(Flush()); + // L0 file 2 + ASSERT_OK(Put(Key(3), rnd.RandomString(100))); + ASSERT_OK(Put(Key(6), rnd.RandomString(100))); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(8), + Key(9))); + ASSERT_OK(Flush()); + // nothing is dropped during flush + std::string property; + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + TableProperties output_tp; + ParseTablePropertiesString(property, &output_tp); + ASSERT_EQ(output_tp.num_range_deletions, 2); + // Add one more L0 file to trigger L0->L1 compaction + ASSERT_OK(Put(Key(1), rnd.RandomString(100))); + ASSERT_OK(Put(Key(9), rnd.RandomString(100))); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + ParseTablePropertiesString(property, &output_tp); + ASSERT_EQ(output_tp.num_range_deletions, 1); + + // Now create a snapshot protected range tombstone [4, 5), it should not + // be dropped. + ASSERT_OK(Put(Key(4), rnd.RandomString(100))); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4), + Key(5))); + CompactRangeOptions cro; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // All compacted to L6 + ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel(0)); + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + ParseTablePropertiesString(property, &output_tp); + ASSERT_EQ(output_tp.num_range_deletions, 1); + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(DBRangeDelTest, MemtableMaxRangeDeletions) { + // Tests option `memtable_max_range_deletions`. + Options options = CurrentOptions(); + options.level_compaction_dynamic_file_size = false; + options.memtable_max_range_deletions = 50; + options.level0_file_num_compaction_trigger = 5; + DestroyAndReopen(options); + + for (int i = 0; i < 50; ++i) { + // Intentionally delete overlapping ranges to see if the option + // checks number of range tombstone fragments instead. + ASSERT_OK(Put(Key(i), "val1")); + ASSERT_OK(Put(Key(i + 1), "val2")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(i), Key(i + 2))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + } + // One more write to trigger flush. + ASSERT_OK(Put(Key(50), "val")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // This should take effect for the next new memtable. 
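+  // Note: this relies on memtable_max_range_deletions being changeable via
+  // SetOptions(); per the comment above, the memtable that is already active
+  // keeps the previous limit, and only the next memtable picks up the new one.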
+ ASSERT_OK(db_->SetOptions({{"memtable_max_range_deletions", "1"}})); + ASSERT_OK(Flush()); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), + Key(50), Key(100))); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + // One more write to trigger flush. + ASSERT_OK(Put(Key(50), "new val")); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); + ASSERT_EQ(3, NumTableFilesAtLevel(0)); +} + +TEST_F(DBRangeDelTest, RangeDelReseekAfterFileReadError) { + // This is to test a bug that is fixed in + // https://github.com/facebook/rocksdb/pull/11786. + Options opts = CurrentOptions(); + opts.num_levels = 7; + + // Set up LSM + // + // L4: F1: [key1] F2: [key2] + // L5: F3:[DeleteRange(key3, key6)] + // L6: F4:[key3, key6] + // Will inject error when reading from F2. + // SeekToFirst() should land on key1. + // Next() should encounter error when reading from F2, + // and range del reseek should not reset this status. + Random rnd(301); + // L6 + ASSERT_OK(Put(Key(3), rnd.RandomString(100))); + ASSERT_OK(Put(Key(6), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + // L5 + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3), + Key(6))); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // L4 + ASSERT_OK(Put(Key(2), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + std::string fname; + std::vector live_files; + db_->GetLiveFilesMetaData(&live_files); + for (auto& meta : live_files) { + if (meta.level == 4) { + fname = meta.name; + break; + } + } + ASSERT_TRUE(!fname.empty()); + ASSERT_OK(Put(Key(1), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::Read::BeforeReturn", [&fname](void* pair_ptr) { + auto p = + reinterpret_cast*>(pair_ptr); + if (p->first->find(fname) != std::string::npos) { + *p->second = IOStatus::IOError(); + p->second->SetRetryable(true); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + std::unique_ptr iter{db_->NewIterator(ReadOptions())}; + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), Key(1)); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + iter.reset(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + // Reverse scan + // LSM setup + // L4: F1: [key2] F2: [key7, key8] + // L5: F3:[[key3, key6)] + // L6: F4:[key1, key5] + // Ingest error when read from F1. + // SeekToLast() should land on key8. + // During Prev(), MergingIterator will encounter error when reading from F1 + // and do a range del reseek (it sees key5 covered by a range tombstone). 
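+  // As in the forward-scan case above, the injected IOError must surface
+  // through iter->status() rather than being cleared by the range del reseek.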
+ DestroyAndReopen(opts); + // L6 + ASSERT_OK(Put(Key(1), rnd.RandomString(100))); + ASSERT_OK(Put(Key(5), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(6); + // L5 + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3), + Key(6))); + ASSERT_OK(Flush()); + MoveFilesToLevel(5); + // L4 + ASSERT_OK(Put(Key(2), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + live_files.clear(); + db_->GetLiveFilesMetaData(&live_files); + for (auto& meta : live_files) { + if (meta.level == 4) { + fname = meta.name; + break; + } + } + ASSERT_TRUE(!fname.empty()); + ASSERT_OK(Put(Key(7), rnd.RandomString(100))); + ASSERT_OK(Put(Key(8), rnd.RandomString(100))); + ASSERT_OK(Flush()); + MoveFilesToLevel(4); + + SyncPoint::GetInstance()->SetCallBack( + "RandomAccessFileReader::Read::AnyOffset", [&fname](void* pair_ptr) { + auto p = + reinterpret_cast*>(pair_ptr); + if (p->first->find(fname) != std::string::npos) { + *p->second = IOStatus::IOError(); + p->second->SetRetryable(true); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + iter.reset(db_->NewIterator(ReadOptions())); + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), Key(8)); + // Note that for reverse scan, DBIter will need to ensure + // the key it returns is the one with the highest sequence number. + // To return key7, it internally calls MergingIterator::Prev() + // until it reaches a previous user key. + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_NOK(iter->status()); + ASSERT_TRUE(iter->status().IsIOError()); + + iter.reset(); +} + +TEST_F(DBRangeDelTest, ReleaseSnapshotAfterIteratorCreation) { + // Test that range tombstone code path in LevelIterator + // does access ReadOptions::snapshot after Iterator creation. + // + // Put some data in L2 so that range tombstone in L1 will not be dropped. 
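+  // (That is, after the snapshot passed in ReadOptions is released below,
+  // the iterator must remain usable without dereferencing it.)
+  // Layout built below:
+  //   L1 file 1: 1, 2, DeleteRange[3, 4)
+  //   L1 file 2: 5, 6, DeleteRange[5, 6)
+  //   L2:        0, 100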
+ ASSERT_OK(Put(Key(0), "v")); + ASSERT_OK(Put(Key(100), "v")); + ASSERT_OK(Flush()); + MoveFilesToLevel(2); + + // two L1 file with range del + ASSERT_OK(Put(Key(1), "v")); + ASSERT_OK(Put(Key(2), "v")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3), + Key(4))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + ASSERT_OK(Put(Key(5), "v")); + ASSERT_OK(Put(Key(6), "v")); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(5), + Key(6))); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + ASSERT_EQ(1, NumTableFilesAtLevel(2)); + + const Snapshot* snapshot = db_->GetSnapshot(); + ReadOptions ro; + ro.snapshot = snapshot; + + Iterator* iter = db_->NewIterator(ro); + db_->ReleaseSnapshot(snapshot); + + iter->Seek(Key(1)); + std::vector expected_keys{1, 2, 6, 100}; + for (int i : expected_keys) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(i)); + iter->Next(); + } + ASSERT_TRUE(!iter->Valid() && iter->status().ok()); + + delete iter; +} + +TEST_F(DBRangeDelTest, RefreshWithSnapshot) { + ASSERT_OK(Put(Key(4), "4")); + ASSERT_OK(Put(Key(6), "6")); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3), + Key(5))); + + std::unique_ptr iter{db_->NewIterator(ReadOptions())}; + // Live Memtable + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(6)); + ASSERT_OK(iter->Refresh(snapshot)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(4)); + // Immutable Memtable + ASSERT_OK(dbfull()->TEST_SwitchMemtable()); + ASSERT_OK(iter->Refresh(nullptr)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(6)); + ASSERT_OK(iter->Refresh(snapshot)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(4)); + // L0 + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_OK(iter->Refresh(nullptr)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(6)); + ASSERT_OK(iter->Refresh(snapshot)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(4)); + // L1 + MoveFilesToLevel(1); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + ASSERT_OK(iter->Refresh(nullptr)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(6)); + ASSERT_OK(iter->Refresh(snapshot)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(4)); + // L1 with two file. + // Test that when LevelIterator enters a new file, + // it remembers which snapshot sequence number to use. + ASSERT_OK(Put(Key(2), "2")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + ASSERT_EQ(2, NumTableFilesAtLevel(1)); + ASSERT_OK(iter->Refresh(nullptr)); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + // LevelIterator is at the first file + ASSERT_EQ(iter->key(), Key(2)); + ASSERT_OK(iter->Refresh(snapshot)); + // Will enter the second file, and create a new range tombstone iterator. + // It should use the snapshot sequence number. 
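+  // With the snapshot, both Key(2) and DeleteRange [3, 5) are invisible, so
+  // the first visible key is still Key(4).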
+ iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), Key(4)); + iter.reset(); + db_->ReleaseSnapshot(snapshot); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_rate_limiter_test.cc b/db/db_rate_limiter_test.cc index e44cc047dcd2..05419db44a17 100644 --- a/db/db_rate_limiter_test.cc +++ b/db/db_rate_limiter_test.cc @@ -96,19 +96,10 @@ std::string GetTestNameSuffix( return oss.str(); } -#ifndef ROCKSDB_LITE INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest, ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool()), GetTestNameSuffix); -#else // ROCKSDB_LITE -// Cannot use direct I/O in lite mode. -INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest, - ::testing::Combine(::testing::Values(false), - ::testing::Bool(), - ::testing::Bool()), - GetTestNameSuffix); -#endif // ROCKSDB_LITE TEST_P(DBRateLimiterOnReadTest, Get) { if (use_direct_io_ && !IsDirectIOSupported()) { @@ -229,12 +220,12 @@ TEST_P(DBRateLimiterOnReadTest, Iterator) { ++expected; } } + ASSERT_OK(iter->status()); // Reverse scan does not read evenly (one block per iteration) due to // descending seqno ordering, so wait until after the loop to check total. ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); } -#if !defined(ROCKSDB_LITE) TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) { if (use_direct_io_ && !IsDirectIOSupported()) { @@ -245,8 +236,18 @@ TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) { ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); ASSERT_OK(db_->VerifyChecksum(GetReadOptions())); - // The files are tiny so there should have just been one read per file. - int expected = kNumFiles; + // In BufferedIO, + // there are 7 reads per file, each of which will be rate-limited. + // During open: read footer, meta index block, properties block, index block. + // During actual checksum verification: read meta index block, verify checksum + // in meta blocks and verify checksum in file blocks. + // + // In DirectIO, where we support tail prefetching, during table open, we only + // do 1 read instead of 4 as described above. Actual checksum verification + // reads stay the same. + int num_read_per_file = (!use_direct_io_) ? 
7 : 4; + int expected = kNumFiles * num_read_per_file; + ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); } @@ -264,7 +265,6 @@ TEST_P(DBRateLimiterOnReadTest, VerifyFileChecksums) { ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); } -#endif // !defined(ROCKSDB_LITE) class DBRateLimiterOnWriteTest : public DBTestBase { public: @@ -319,10 +319,8 @@ TEST_F(DBRateLimiterOnWriteTest, Compact) { // Pre-comaction: // level-0 : `kNumFiles` SST files overlapping on [kStartKey, kEndKey] -#ifndef ROCKSDB_LITE std::string files_per_level_pre_compaction = std::to_string(kNumFiles); ASSERT_EQ(files_per_level_pre_compaction, FilesPerLevel(0 /* cf */)); -#endif // !ROCKSDB_LITE std::int64_t prev_total_request = options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL); @@ -337,10 +335,8 @@ TEST_F(DBRateLimiterOnWriteTest, Compact) { // Post-comaction: // level-0 : 0 SST file // level-1 : 1 SST file -#ifndef ROCKSDB_LITE std::string files_per_level_post_compaction = "0,1"; ASSERT_EQ(files_per_level_post_compaction, FilesPerLevel(0 /* cf */)); -#endif // !ROCKSDB_LITE std::int64_t exepcted_compaction_request = 1; EXPECT_EQ(actual_compaction_request, exepcted_compaction_request); diff --git a/db/db_readonly_with_timestamp_test.cc b/db/db_readonly_with_timestamp_test.cc index 3f53e780632c..7a37bfec81c5 100644 --- a/db/db_readonly_with_timestamp_test.cc +++ b/db/db_readonly_with_timestamp_test.cc @@ -17,7 +17,6 @@ class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase { : DBBasicTestWithTimestampBase("db_readonly_test_with_timestamp") {} protected: -#ifndef ROCKSDB_LITE void CheckDBOpenedAsCompactedDBWithOneLevel0File() { VersionSet* const versions = dbfull()->GetVersionSet(); ASSERT_NE(versions, nullptr); @@ -63,10 +62,8 @@ class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase { ASSERT_TRUE( storage_info->LevelFilesBrief(highest_non_empty_level).num_files > 0); } -#endif // !ROCKSDB_LITE }; -#ifndef ROCKSDB_LITE TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) { const int kNumKeysPerFile = 128; const uint64_t kMaxKey = 1024; @@ -243,6 +240,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); size_t expected_count = kMaxKey - start_keys[i] + 1; ASSERT_EQ(expected_count, count); @@ -255,6 +253,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); // SeekToFirst()/SeekToLast() with lower/upper bounds. 
@@ -276,6 +275,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); ASSERT_EQ(r - std::max(l, start_keys[i]), count); for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; @@ -285,6 +285,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); l += (kMaxKey / 100); r -= (kMaxKey / 100); } @@ -331,6 +332,7 @@ TEST_F(DBReadOnlyTestWithTimestamp, Iterators) { CheckIterUserEntry(iters[0], Key1(key), kTypeValue, "value" + std::to_string(key), write_timestamp); } + ASSERT_OK(iters[0]->status()); size_t expected_count = kMaxKey - 0 + 1; ASSERT_EQ(expected_count, count); @@ -339,6 +341,53 @@ TEST_F(DBReadOnlyTestWithTimestamp, Iterators) { Close(); } +TEST_F(DBReadOnlyTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { + Options options = CurrentOptions(); + options.env = env_; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + // Use UDT in memtable only feature for this test, so we can control that + // newly set `full_history_ts_low` collapse history when Flush happens. + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + DestroyAndReopen(options); + + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val1")); + + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 3); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + ASSERT_OK(Flush(0)); + + // Reopen the database in read only mode to test its timestamp support. + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + + // Reading below full_history_ts_low fails a sanity check. 
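+  // full_history_ts_low was set to 3 above, so a read at timestamp 2 is
+  // expected to be rejected with Status::InvalidArgument by Get(),
+  // NewIterator() and NewIterators().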
+ std::string read_ts; + PutFixed64(&read_ts, 2); + Slice read_ts_slice = read_ts; + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + + // Get() + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "foo", &value).IsInvalidArgument()); + // NewIterator() + std::unique_ptr iter( + db_->NewIterator(read_opts, db_->DefaultColumnFamily())); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + + // NewIterators() + std::vector cfhs = {db_->DefaultColumnFamily()}; + std::vector iterators; + ASSERT_TRUE( + db_->NewIterators(read_opts, cfhs, &iterators).IsInvalidArgument()); + Close(); +} + TEST_F(DBReadOnlyTestWithTimestamp, IteratorsReadTimestampSizeMismatch) { const int kNumKeysPerFile = 128; const uint64_t kMaxKey = 1024; @@ -949,7 +998,6 @@ TEST_F(DBReadOnlyTestWithTimestamp, Close(); } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_secondary_test.cc b/db/db_secondary_test.cc index 20d7534e057a..987756906e28 100644 --- a/db/db_secondary_test.cc +++ b/db/db_secondary_test.cc @@ -18,7 +18,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE class DBSecondaryTestBase : public DBBasicTestWithTimestampBase { public: explicit DBSecondaryTestBase(const std::string& dbname) @@ -165,12 +164,22 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) { Reopen(options); ASSERT_OK(Put("foo", "foo_value")); ASSERT_OK(Put("bar", "bar_value")); + WideColumns columns{{kDefaultWideColumnName, "attr_default_val"}, + {"attr_name1", "attr_value_1"}, + {"attr_name2", "attr_value_2"}}; + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), "baz", + columns)); ASSERT_OK(dbfull()->Flush(FlushOptions())); Close(); ASSERT_OK(ReopenAsSecondary(options)); ASSERT_EQ("foo_value", Get("foo")); ASSERT_EQ("bar_value", Get("bar")); + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), "baz", + &result)); + ASSERT_EQ(result.columns(), columns); + ReadOptions ropts; ropts.verify_checksums = true; auto db1 = static_cast(db_); @@ -183,13 +192,17 @@ TEST_F(DBSecondaryTest, ReopenAsSecondary) { ASSERT_EQ("bar", iter->key().ToString()); ASSERT_EQ("bar_value", iter->value().ToString()); } else if (1 == count) { + ASSERT_EQ("baz", iter->key().ToString()); + ASSERT_EQ(columns, iter->columns()); + } else if (2 == count) { ASSERT_EQ("foo", iter->key().ToString()); ASSERT_EQ("foo_value", iter->value().ToString()); } ++count; } + ASSERT_OK(iter->status()); delete iter; - ASSERT_EQ(2, count); + ASSERT_EQ(3, count); } TEST_F(DBSecondaryTest, SimpleInternalCompaction) { @@ -522,6 +535,8 @@ TEST_F(DBSecondaryTest, SecondaryCloseFiles) { } ASSERT_FALSE(iter1->Valid()); ASSERT_FALSE(iter2->Valid()); + ASSERT_OK(iter1->status()); + ASSERT_OK(iter2->status()); }; ASSERT_OK(Put("a", "value")); @@ -794,6 +809,7 @@ TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ++count; } + ASSERT_OK(iter->status()); ASSERT_EQ(2, count); delete iter; } @@ -851,6 +867,7 @@ TEST_F(DBSecondaryTest, MissingTableFile) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ++count; } + ASSERT_OK(iter->status()); ASSERT_EQ(2, count); delete iter; } @@ -923,6 +940,7 @@ TEST_F(DBSecondaryTest, SwitchManifest) { ASSERT_EQ("value_" + std::to_string(kNumFiles - 1), iter->value().ToString()); } + EXPECT_OK(iter->status()); }; range_scan_db(); @@ -1473,6 +1491,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), 
write_timestamps[i]); } + ASSERT_OK(it->status()); size_t expected_count = kMaxKey - start_keys[i] + 1; ASSERT_EQ(expected_count, count); @@ -1485,6 +1504,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); // SeekToFirst()/SeekToLast() with lower/upper bounds. @@ -1506,6 +1526,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); ASSERT_EQ(r - std::max(l, start_keys[i]), count); for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; @@ -1515,6 +1536,7 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) { get_value_and_check(db_, read_opts, it->key(), it->value(), write_timestamps[i]); } + ASSERT_OK(it->status()); l += (kMaxKey / 100); r -= (kMaxKey / 100); } @@ -1562,6 +1584,55 @@ TEST_F(DBSecondaryTestWithTimestamp, IteratorsReadTimestampSizeMismatch) { Close(); } +TEST_F(DBSecondaryTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { + Options options = CurrentOptions(); + options.env = env_; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + // Use UDT in memtable only feature for this test, so we can control that + // newly set `full_history_ts_low` collapse history when Flush happens. + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + DestroyAndReopen(options); + + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val1")); + + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 3); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + ASSERT_OK(Flush(0)); + + // Reopen the database as secondary instance to test its timestamp support. + Close(); + options.max_open_files = -1; + ASSERT_OK(ReopenAsSecondary(options)); + + // Reading below full_history_ts_low fails a sanity check. + std::string read_ts; + PutFixed64(&read_ts, 2); + Slice read_ts_slice = read_ts; + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + + // Get() + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "foo", &value).IsInvalidArgument()); + + // NewIterator() + std::unique_ptr iter( + db_->NewIterator(read_opts, db_->DefaultColumnFamily())); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + + // NewIterators() + std::vector cfhs = {db_->DefaultColumnFamily()}; + std::vector iterators; + ASSERT_TRUE( + db_->NewIterators(read_opts, cfhs, &iterators).IsInvalidArgument()); + Close(); +} + TEST_F(DBSecondaryTestWithTimestamp, IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) { const int kNumKeysPerFile = 128; @@ -1675,6 +1746,7 @@ TEST_F(DBSecondaryTestWithTimestamp, Iterators) { CheckIterUserEntry(iters[0], Key1(key), kTypeValue, "value" + std::to_string(key), write_timestamp); } + ASSERT_OK(iters[0]->status()); size_t expected_count = kMaxKey - 0 + 1; ASSERT_EQ(expected_count, count); @@ -1682,7 +1754,6 @@ TEST_F(DBSecondaryTestWithTimestamp, Iterators) { Close(); } -#endif //! 
ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_sst_test.cc b/db/db_sst_test.cc index 7f031444a4ca..7590aa2f1134 100644 --- a/db/db_sst_test.cc +++ b/db/db_sst_test.cc @@ -24,7 +24,6 @@ class DBSSTTest : public DBTestBase { DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {} }; -#ifndef ROCKSDB_LITE // A class which remembers the name of each flushed file. class FlushedFileCollector : public EventListener { public: @@ -53,7 +52,6 @@ class FlushedFileCollector : public EventListener { std::vector flushed_files_; std::mutex mutex_; }; -#endif // ROCKSDB_LITE TEST_F(DBSSTTest, DontDeletePendingOutputs) { Options options; @@ -151,7 +149,6 @@ TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) { ASSERT_EQ("choo", Get("pika")); } -#ifndef ROCKSDB_LITE TEST_F(DBSSTTest, DontDeleteMovedFile) { // This test triggers move compaction and verifies that the file is not // deleted when it's part of move compaction @@ -746,7 +743,7 @@ TEST_P(DBSSTTestRateLimit, RateLimitedDelete) { // Compaction will move the 4 files in L0 to trash and create 1 L1 file ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); uint64_t delete_start_time = env_->NowMicros(); @@ -813,11 +810,12 @@ TEST_F(DBSSTTest, RateLimitedWALDelete) { // We created 4 sst files in L0 ASSERT_EQ("4", FilesPerLevel(0)); - // Compaction will move the 4 files in L0 to trash and create 1 L1 file + // Compaction will move the 4 files in L0 to trash and create 1 L1 file. + // Use kForceOptimized to not rewrite the new L1 file. CompactRangeOptions cro; - cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; + cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,1", FilesPerLevel(0)); sfm->WaitForEmptyTrash(); @@ -939,12 +937,21 @@ INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam, ::testing::Values(std::make_tuple("", true), std::make_tuple("_wal_dir", false))); -TEST_F(DBSSTTest, OpenDBWithExistingTrash) { +TEST_F(DBSSTTest, OpenDBWithExistingTrashAndObsoleteSstFile) { Options options = CurrentOptions(); - options.sst_file_manager.reset( NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */)); auto sfm = static_cast(options.sst_file_manager.get()); + // Set an extra high trash ratio to prevent immediate/non-rate limited + // deletions + sfm->SetDeleteRateBytesPerSecond(1024 * 1024); + sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0); + + int bg_delete_file = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "DeleteScheduler::DeleteTrashFile:DeleteFile", + [&](void* /*arg*/) { bg_delete_file++; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Destroy(last_options_); @@ -953,13 +960,22 @@ TEST_F(DBSSTTest, OpenDBWithExistingTrash) { ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash")); ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash")); - - // Reopen the DB and verify that it deletes existing trash files + // Manually add an obsolete sst file. Obsolete SST files are discovered and + // deleted upon recovery. 
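+  // Together with the three *.sst.trash files created above, recovery is
+  // expected to schedule four rate-limited deletions (see the
+  // bg_delete_file == 4 assertion below).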
+ constexpr uint64_t kSstFileNumber = 100; + const std::string kObsoleteSstFile = + MakeTableFileName(dbname_, kSstFileNumber); + ASSERT_OK(WriteStringToFile(env_, "abc", kObsoleteSstFile)); + + // Reopen the DB and verify that it deletes existing trash files and obsolete + // SST files with rate limiting. Reopen(options); sfm->WaitForEmptyTrash(); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash")); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash")); ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash")); + ASSERT_NOK(env_->FileExists(kObsoleteSstFile)); + ASSERT_EQ(bg_delete_file, 4); } // Create a DB with 2 db_paths, and generate multiple files in the 2 @@ -1226,7 +1242,7 @@ TEST_F(DBSSTTest, CancellingCompactionsWorks) { ASSERT_OK(Put(Key(i), rnd.RandomString(50))); } ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Because we set a callback in CancelledCompaction, we actually // let the compaction run @@ -1283,7 +1299,7 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); // Make sure the stat is bumped @@ -1299,7 +1315,7 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { .IsCompactionTooLarge()); // Wait for manual compaction to get scheduled and finish - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount( COMPACTION_CANCELLED), @@ -1316,7 +1332,7 @@ TEST_F(DBSSTTest, CancellingManualCompactionsWorks) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0); ASSERT_GT(completed_compactions, 0); @@ -1522,6 +1538,11 @@ TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) { } TEST_F(DBSSTTest, GetTotalSstFilesSize) { + // FIXME: L0 file and L1+ file also differ in size of `oldest_key_time`. + // L0 file has non-zero `oldest_key_time` while L1+ files have 0. + // The test passes since L1+ file uses current time instead of 0 + // as oldest_ancestor_time. + // // We don't propagate oldest-key-time table property on compaction and // just write 0 as default value. This affect the exact table size, since // we encode table properties as varint64. 
Force time to be 0 to work around @@ -1856,7 +1877,6 @@ TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_statistics_test.cc b/db/db_statistics_test.cc index 4d46553611c5..054fbc56c727 100644 --- a/db/db_statistics_test.cc +++ b/db/db_statistics_test.cc @@ -20,76 +20,115 @@ class DBStatisticsTest : public DBTestBase { }; TEST_F(DBStatisticsTest, CompressionStatsTest) { - CompressionType type; - - if (Snappy_Supported()) { - type = kSnappyCompression; - fprintf(stderr, "using snappy\n"); - } else if (Zlib_Supported()) { - type = kZlibCompression; - fprintf(stderr, "using zlib\n"); - } else if (BZip2_Supported()) { - type = kBZip2Compression; - fprintf(stderr, "using bzip2\n"); - } else if (LZ4_Supported()) { - type = kLZ4Compression; - fprintf(stderr, "using lz4\n"); - } else if (XPRESS_Supported()) { - type = kXpressCompression; - fprintf(stderr, "using xpress\n"); - } else if (ZSTD_Supported()) { - type = kZSTD; - fprintf(stderr, "using ZSTD\n"); - } else { - fprintf(stderr, "skipping test, compression disabled\n"); - return; - } + for (CompressionType type : GetSupportedCompressions()) { + if (type == kNoCompression) { + continue; + } + if (type == kBZip2Compression) { + // Weird behavior in this test + continue; + } + SCOPED_TRACE("Compression type: " + std::to_string(type)); - Options options = CurrentOptions(); - options.compression = type; - options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); - DestroyAndReopen(options); + Options options = CurrentOptions(); + options.compression = type; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex); + BlockBasedTableOptions bbto; + bbto.enable_index_compression = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); - int kNumKeysWritten = 100000; + auto PopStat = [&](Tickers t) -> uint64_t { + return options.statistics->getAndResetTickerCount(t); + }; - // Check that compressions occur and are counted when compression is turned on - Random rnd(301); - for (int i = 0; i < kNumKeysWritten; ++i) { - // compressible string - ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); - } - ASSERT_OK(Flush()); - ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0); + int kNumKeysWritten = 100; + double compress_to = 0.5; + // About three KVs per block + int len = static_cast(BlockBasedTableOptions().block_size / 3); + int uncomp_est = kNumKeysWritten * (len + 20); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); - } - ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0); + Random rnd(301); + std::string buf; - options.compression = kNoCompression; - DestroyAndReopen(options); - uint64_t currentCompressions = - options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED); - uint64_t currentDecompressions = - options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED); - - // Check that compressions do not occur when turned off - for (int i = 0; i < kNumKeysWritten; ++i) { - // compressible string - ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a'))); - } - ASSERT_OK(Flush()); - ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) - - currentCompressions, - 0); + // Check that 
compressions occur and are counted when compression is turned + // on + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK( + Put(Key(i), test::CompressibleString(&rnd, compress_to, len, &buf))); + } + ASSERT_OK(Flush()); + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSED_FROM), uncomp_est / 10); + EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_COMPRESSED_TO), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); + + // And decompressions + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_DECOMPRESSED_TO), uncomp_est / 10); + EXPECT_NEAR2(uncomp_est * compress_to, PopStat(BYTES_DECOMPRESSED_FROM), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); + + // Check when compression is rejected. + DestroyAndReopen(options); - for (int i = 0; i < kNumKeysWritten; ++i) { - auto r = Get(Key(i)); + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomBinaryString(len))); + } + ASSERT_OK(Flush()); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_REJECTED), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_BYPASSED)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); + + // Check when compression is disabled. 
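+    // With kNoCompression, every block should be counted as "bypassed"
+    // rather than compressed or rejected, and no decompression stats should
+    // be recorded.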
+ options.compression = kNoCompression; + DestroyAndReopen(options); + + for (int i = 0; i < kNumKeysWritten; ++i) { + ASSERT_OK(Put(Key(i), rnd.RandomBinaryString(len))); + } + ASSERT_OK(Flush()); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + } + EXPECT_EQ(34, PopStat(NUMBER_BLOCK_COMPRESSION_BYPASSED)); + EXPECT_NEAR2(uncomp_est, PopStat(BYTES_COMPRESSION_BYPASSED), + uncomp_est / 10); + + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_COMPRESSION_REJECTED)); + EXPECT_EQ(0, PopStat(NUMBER_BLOCK_DECOMPRESSED)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSED_TO)); + EXPECT_EQ(0, PopStat(BYTES_COMPRESSION_REJECTED)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_FROM)); + EXPECT_EQ(0, PopStat(BYTES_DECOMPRESSED_TO)); } - ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) - - currentDecompressions, - 0); } TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) { @@ -157,7 +196,6 @@ TEST_F(DBStatisticsTest, ExcludeTickers) { ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0); } -#ifndef ROCKSDB_LITE TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { Options options = CurrentOptions(); @@ -181,7 +219,7 @@ TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { ASSERT_OK(Flush()); std::unordered_map table_files; uint64_t table_files_size = 0; - GetAllDataFiles(kTableFile, &table_files, &table_files_size); + ASSERT_OK(GetAllDataFiles(kTableFile, &table_files, &table_files_size)); { // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read @@ -204,7 +242,46 @@ TEST_F(DBStatisticsTest, VerifyChecksumReadStat) { } } -#endif // !ROCKSDB_LITE +TEST_F(DBStatisticsTest, BlockChecksumStats) { + Options options = CurrentOptions(); + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + Reopen(options); + + // Scenario 0: only WAL data. Not verified so require ticker to be zero. + ASSERT_OK(Put("foo", "value")); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_COMPUTE_COUNT)); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); + + // Scenario 1: Flushed table verified in `VerifyChecksum()`. This opens a + // `TableReader` to verify each of the four blocks (meta-index, table + // properties, index, and data block). + ASSERT_OK(Flush()); + ASSERT_OK(options.statistics->Reset()); + ASSERT_OK(db_->VerifyChecksum()); + ASSERT_EQ(4, + options.statistics->getTickerCount(BLOCK_CHECKSUM_COMPUTE_COUNT)); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); + + // Scenario 2: Corrupted table verified in `VerifyChecksum()`. The corruption + // is in the fourth and final verified block, i.e., the data block. + std::unordered_map table_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &table_files)); + ASSERT_EQ(1, table_files.size()); + std::string table_name = table_files.begin()->first; + // Assumes the data block starts at offset zero. 
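+  // All four blocks should still be checksummed (compute count 4), but only
+  // the corrupted data block should report a mismatch (mismatch count 1).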
+ ASSERT_OK(test::CorruptFile(options.env, table_name, 0 /* offset */, + 3 /* bytes_to_corrupt */)); + ASSERT_OK(options.statistics->Reset()); + ASSERT_NOK(db_->VerifyChecksum()); + ASSERT_EQ(4, + options.statistics->getTickerCount(BLOCK_CHECKSUM_COMPUTE_COUNT)); + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index 981a514ad8f5..61dcf3c1e1d7 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -24,7 +24,6 @@ #include "test_util/testutil.h" #include "util/random.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { @@ -283,6 +282,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { Random rnd(301); Options options; + options.level_compaction_dynamic_level_bytes = false; options.create_if_missing = true; options.write_buffer_size = 4096; options.max_write_buffer_number = 2; @@ -616,7 +616,6 @@ INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index af3194ac4b03..07ffadc2af2d 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -10,12 +10,16 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. // which is a pity, it is a good test -#if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" #include "db/forward_iterator.h" #include "port/stack_trace.h" +namespace { +static bool enable_io_uring = true; +extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; } +} // namespace + namespace ROCKSDB_NAMESPACE { class DBTestTailingIterator : public DBTestBase, @@ -48,97 +52,129 @@ TEST_P(DBTestTailingIterator, TailingIteratorSingle) { iter->Next(); ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); } TEST_P(DBTestTailingIterator, TailingIteratorKeepAdding) { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); + Options options = CurrentOptions(); + options.env = env.get(); + CreateAndReopenWithCF({"pikachu"}, options); ReadOptions read_options; read_options.tailing = true; if (GetParam()) { read_options.async_io = true; } - std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); - ASSERT_OK(iter->status()); - std::string value(1024, 'a'); - const int num_records = 10000; - for (int i = 0; i < num_records; ++i) { - char buf[32]; - snprintf(buf, sizeof(buf), "%016d", i); + { + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + std::string value(1024, 'a'); - Slice key(buf, 16); - ASSERT_OK(Put(1, key, value)); + const int num_records = 10000; + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "%016d", i); - iter->Seek(key); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(key), 0); + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + + iter->Seek(key); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + } } + Close(); } TEST_P(DBTestTailingIterator, TailingIteratorSeekToNext) { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + if (mem_env_ || 
encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); + Options options = CurrentOptions(); + options.env = env.get(); + CreateAndReopenWithCF({"pikachu"}, options); ReadOptions read_options; read_options.tailing = true; if (GetParam()) { read_options.async_io = true; } - std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); - ASSERT_OK(iter->status()); - std::unique_ptr itern(db_->NewIterator(read_options, handles_[1])); - ASSERT_OK(itern->status()); - std::string value(1024, 'a'); - - const int num_records = 1000; - for (int i = 1; i < num_records; ++i) { - char buf1[32]; - char buf2[32]; - snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); - - Slice key(buf1, 20); - ASSERT_OK(Put(1, key, value)); + { + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + std::unique_ptr itern( + db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(itern->status()); + std::string value(1024, 'a'); + + const int num_records = 1000; + for (int i = 1; i < num_records; ++i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } - if (i % 100 == 99) { - ASSERT_OK(Flush(1)); + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); + if (i == 1) { + itern->SeekToFirst(); + } else { + itern->Next(); + } + ASSERT_TRUE(itern->Valid()); + ASSERT_EQ(itern->key().compare(key), 0); } + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + for (int i = 2 * num_records; i > 0; --i) { + char buf1[32]; + char buf2[32]; + snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); + + Slice key(buf1, 20); + ASSERT_OK(Put(1, key, value)); + + if (i % 100 == 99) { + ASSERT_OK(Flush(1)); + } - snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); - Slice target(buf2, 20); - iter->Seek(target); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(key), 0); - if (i == 1) { - itern->SeekToFirst(); - } else { - itern->Next(); + snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); + Slice target(buf2, 20); + iter->Seek(target); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(key), 0); } - ASSERT_TRUE(itern->Valid()); - ASSERT_EQ(itern->key().compare(key), 0); - } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - for (int i = 2 * num_records; i > 0; --i) { - char buf1[32]; - char buf2[32]; - snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5); - - Slice key(buf1, 20); - ASSERT_OK(Put(1, key, value)); - - if (i % 100 == 99) { - ASSERT_OK(Flush(1)); - } - - snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2); - Slice target(buf2, 20); - iter->Seek(target); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().compare(key), 0); } + Close(); } TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } const uint64_t k150KB = 150 * 1024; + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); Options options; + options.env = env.get(); 
options.write_buffer_size = k150KB; options.max_write_buffer_number = 3; options.min_write_buffer_number_to_merge = 2; @@ -241,7 +277,6 @@ TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { iterh = nullptr; BlockBasedTableOptions table_options; table_options.no_block_cache = true; - table_options.block_cache_compressed = nullptr; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); ReopenWithColumnFamilies({"default", "pikachu"}, options); read_options.read_tier = kBlockCacheTier; @@ -280,56 +315,73 @@ TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) { } TEST_P(DBTestTailingIterator, TailingIteratorDeletes) { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); + Options options = CurrentOptions(); + options.env = env.get(); + CreateAndReopenWithCF({"pikachu"}, options); ReadOptions read_options; read_options.tailing = true; if (GetParam()) { read_options.async_io = true; } + { + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); - std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); - ASSERT_OK(iter->status()); - - // write a single record, read it using the iterator, then delete it - ASSERT_OK(Put(1, "0test", "test")); - iter->SeekToFirst(); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().ToString(), "0test"); - ASSERT_OK(Delete(1, "0test")); + // write a single record, read it using the iterator, then delete it + ASSERT_OK(Put(1, "0test", "test")); + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0test"); + ASSERT_OK(Delete(1, "0test")); - // write many more records - const int num_records = 10000; - std::string value(1024, 'A'); + // write many more records + const int num_records = 10000; + std::string value(1024, 'A'); - for (int i = 0; i < num_records; ++i) { - char buf[32]; - snprintf(buf, sizeof(buf), "1%015d", i); + for (int i = 0; i < num_records; ++i) { + char buf[32]; + snprintf(buf, sizeof(buf), "1%015d", i); - Slice key(buf, 16); - ASSERT_OK(Put(1, key, value)); - } - - // force a flush to make sure that no records are read from memtable - ASSERT_OK(Flush(1)); + Slice key(buf, 16); + ASSERT_OK(Put(1, key, value)); + } - // skip "0test" - iter->Next(); + // force a flush to make sure that no records are read from memtable + ASSERT_OK(Flush(1)); - // make sure we can read all new records using the existing iterator - int count = 0; - for (; iter->Valid(); iter->Next(), ++count) - ; + // skip "0test" + iter->Next(); - ASSERT_EQ(count, num_records); + // make sure we can read all new records using the existing iterator + int count = 0; + for (; iter->Valid(); iter->Next(), ++count) + ; + ASSERT_OK(iter->status()); + ASSERT_EQ(count, num_records); + } + Close(); } TEST_P(DBTestTailingIterator, TailingIteratorPrefixSeek) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } ReadOptions read_options; read_options.tailing = true; if (GetParam()) { read_options.async_io = true; } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); Options options = CurrentOptions(); + options.env = env.get(); options.create_if_missing = true; options.disable_auto_compactions = true; options.prefix_extractor.reset(NewFixedPrefixTransform(2)); @@ 
-338,28 +390,40 @@ TEST_P(DBTestTailingIterator, TailingIteratorPrefixSeek) { DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); - std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); - ASSERT_OK(iter->status()); - ASSERT_OK(Put(1, "0101", "test")); + { + std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(iter->status()); + ASSERT_OK(Put(1, "0101", "test")); - ASSERT_OK(Flush(1)); + ASSERT_OK(Flush(1)); - ASSERT_OK(Put(1, "0202", "test")); + ASSERT_OK(Put(1, "0202", "test")); - // Seek(0102) shouldn't find any records since 0202 has a different prefix - iter->Seek("0102"); - ASSERT_TRUE(!iter->Valid()); + // Seek(0102) shouldn't find any records since 0202 has a different prefix + iter->Seek("0102"); + ASSERT_TRUE(!iter->Valid()); - iter->Seek("0202"); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(iter->key().ToString(), "0202"); + iter->Seek("0202"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().ToString(), "0202"); - iter->Next(); - ASSERT_TRUE(!iter->Valid()); + iter->Next(); + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + } + Close(); } TEST_P(DBTestTailingIterator, TailingIteratorIncomplete) { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); + Options options = CurrentOptions(); + options.env = env.get(); + CreateAndReopenWithCF({"pikachu"}, options); ReadOptions read_options; read_options.tailing = true; if (GetParam()) { @@ -372,20 +436,30 @@ TEST_P(DBTestTailingIterator, TailingIteratorIncomplete) { ASSERT_OK(db_->Put(WriteOptions(), key, value)); - std::unique_ptr iter(db_->NewIterator(read_options)); - ASSERT_OK(iter->status()); - iter->SeekToFirst(); - // we either see the entry or it's not in cache - ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - iter->SeekToFirst(); - // should still be true after compaction - ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + // we either see the entry or it's not in cache + ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + iter->SeekToFirst(); + // should still be true after compaction + ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete()); + } + Close(); } TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); Options options = CurrentOptions(); + options.env = env.get(); options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 1000; CreateAndReopenWithCF({"pikachu"}, options); @@ -405,28 +479,39 @@ TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) { ASSERT_OK(db_->Put(WriteOptions(), key, value)); } - std::unique_ptr iter(db_->NewIterator(read_options)); - ASSERT_OK(iter->status()); - // Seek to 00001. We expect to find 00002. 
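Reviewer note: TailingIteratorIncomplete above reads with read_tier = kBlockCacheTier, where a lookup that would need file I/O returns an Incomplete status instead of blocking. A minimal standalone sketch of that behavior, not part of this patch; the path "/tmp/cache_tier_demo" is an assumption.

#include <cassert>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  DB* db = nullptr;
  Options options;
  options.create_if_missing = true;
  assert(DB::Open(options, "/tmp/cache_tier_demo", &db).ok());

  assert(db->Put(WriteOptions(), "key", "value").ok());
  assert(db->Flush(FlushOptions()).ok());  // push the key out of the memtable

  ReadOptions read_options;
  read_options.tailing = true;
  read_options.read_tier = kBlockCacheTier;  // never issue I/O for this read

  std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
  iter->SeekToFirst();
  // Either the block is already cached and the entry is visible, or the
  // iterator reports Incomplete because satisfying the read needs I/O.
  assert(iter->Valid() || iter->status().IsIncomplete());

  delete db;
  return 0;
}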
- std::string start_key = "00001"; - iter->Seek(start_key); - ASSERT_TRUE(iter->Valid()); + { + std::unique_ptr iter(db_->NewIterator(read_options)); + ASSERT_OK(iter->status()); + // Seek to 00001. We expect to find 00002. + std::string start_key = "00001"; + iter->Seek(start_key); + ASSERT_TRUE(iter->Valid()); - std::string found = iter->key().ToString(); - ASSERT_EQ("00002", found); + std::string found = iter->key().ToString(); + ASSERT_EQ("00002", found); - // Now seek to the same key. The iterator should remain in the same - // position. - iter->Seek(found); - ASSERT_TRUE(iter->Valid()); - ASSERT_EQ(found, iter->key().ToString()); + // Now seek to the same key. The iterator should remain in the same + // position. + iter->Seek(found); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(found, iter->key().ToString()); + } + Close(); } // Sets iterate_upper_bound and verifies that ForwardIterator doesn't call // Seek() on immutable iterators when target key is >= prev_key and all // iterators, including the memtable iterator, are over the upper bound. TEST_P(DBTestTailingIterator, TailingIteratorUpperBound) { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); + Options options = CurrentOptions(); + options.env = env.get(); + CreateAndReopenWithCF({"pikachu"}, options); const Slice upper_bound("20", 3); ReadOptions read_options; @@ -443,34 +528,51 @@ TEST_P(DBTestTailingIterator, TailingIteratorUpperBound) { // Add another key to the memtable. ASSERT_OK(Put(1, "21", "21")); - std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); - ASSERT_OK(it->status()); - it->Seek("12"); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("12", it->key().ToString()); - - it->Next(); - // Not valid since "21" is over the upper bound. - ASSERT_FALSE(it->Valid()); - ASSERT_OK(it->status()); - // This keeps track of the number of times NeedToSeekImmutable() was true. - int immutable_seeks = 0; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "ForwardIterator::SeekInternal:Immutable", - [&](void* /*arg*/) { ++immutable_seeks; }); - - // Seek to 13. This should not require any immutable seeks. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - it->Seek("13"); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - - ASSERT_FALSE(it->Valid()); - ASSERT_OK(it->status()); - if (GetParam()) { - ASSERT_EQ(1, immutable_seeks); - } else { - ASSERT_EQ(0, immutable_seeks); + { + bool read_async_called = false; + + SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", + [&](void* /*arg*/) { read_async_called = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + auto it = + std::unique_ptr(db_->NewIterator(read_options, handles_[1])); + ASSERT_OK(it->status()); + it->Seek("12"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("12", it->key().ToString()); + + it->Next(); + // Not valid since "21" is over the upper bound. + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + // This keeps track of the number of times NeedToSeekImmutable() was true. + int immutable_seeks = 0; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "ForwardIterator::SeekInternal:Immutable", + [&](void* /*arg*/) { ++immutable_seeks; }); + + // Seek to 13. This should not require any immutable seeks. 
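Reviewer note: TailingIteratorUpperBound now counts "ForwardIterator::SeekInternal:Immutable" hits and watches "UpdateResults::io_uring_result" to learn whether async reads actually ran. A minimal sketch of the SyncPoint hook it relies on; SyncPoint is an internal test-only utility, and the header path below is assumed to be the current "test_util/sync_point.h".

#include <atomic>
#include "test_util/sync_point.h"  // internal test utility, not public API

using ROCKSDB_NAMESPACE::SyncPoint;

void CountImmutableSeeks(std::atomic<int>* immutable_seeks) {
  // Register a callback that fires every time the named sync point is hit.
  SyncPoint::GetInstance()->SetCallBack(
      "ForwardIterator::SeekInternal:Immutable",
      [immutable_seeks](void* /*arg*/) { immutable_seeks->fetch_add(1); });
  SyncPoint::GetInstance()->EnableProcessing();
}

void StopCounting() {
  // Tear down in the same order the tests above use.
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
}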
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + it->Seek("13"); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + + SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", + [&](void* /*arg*/) { read_async_called = true; }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_FALSE(it->Valid()); + ASSERT_OK(it->status()); + if (GetParam() && read_async_called) { + ASSERT_EQ(1, immutable_seeks); + } else { + ASSERT_EQ(0, immutable_seeks); + } } + Close(); } TEST_P(DBTestTailingIterator, TailingIteratorGap) { @@ -482,7 +584,15 @@ TEST_P(DBTestTailingIterator, TailingIteratorGap) { // the largest key of index n file and the smallest key of index n+1 file // if both file fit in that gap. In this example, 25 < key < 35 // https://github.com/facebook/rocksdb/issues/1372 - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } + std::unique_ptr env( + new CompositeEnvWrapper(env_, FileSystem::Default())); + Options options = CurrentOptions(); + options.env = env.get(); + CreateAndReopenWithCF({"pikachu"}, options); ReadOptions read_options; read_options.tailing = true; @@ -514,20 +624,23 @@ TEST_P(DBTestTailingIterator, TailingIteratorGap) { ColumnFamilyMetaData meta; db_->GetColumnFamilyMetaData(handles_[1], &meta); - std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); - it->Seek("30"); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("30", it->key().ToString()); + { + std::unique_ptr it(db_->NewIterator(read_options, handles_[1])); + it->Seek("30"); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("30", it->key().ToString()); - it->Next(); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("35", it->key().ToString()); + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("35", it->key().ToString()); - it->Next(); - ASSERT_TRUE(it->Valid()); - ASSERT_EQ("40", it->key().ToString()); + it->Next(); + ASSERT_TRUE(it->Valid()); + ASSERT_EQ("40", it->key().ToString()); - ASSERT_OK(it->status()); + ASSERT_OK(it->status()); + } + Close(); } TEST_P(DBTestTailingIterator, SeekWithUpperBoundBug) { @@ -589,16 +702,9 @@ TEST_P(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) { } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !defined(ROCKSDB_LITE) ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - (void)argc; - (void)argv; - return 0; -#endif } diff --git a/db/db_test.cc b/db/db_test.cc index 215f4e3ed81f..1240b285dc5c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -66,7 +66,7 @@ #include "util/compression.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" #include "util/string_util.h" #include "utilities/merge_operators.h" @@ -127,10 +127,9 @@ TEST_F(DBTest, MockEnvTest) { iterator->Next(); } ASSERT_TRUE(!iterator->Valid()); + ASSERT_OK(iterator->status()); delete iterator; -// TEST_FlushMemTable() is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE DBImpl* dbi = static_cast_with_check(db); ASSERT_OK(dbi->TEST_FlushMemTable()); @@ -139,14 +138,10 @@ TEST_F(DBTest, MockEnvTest) { ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); ASSERT_TRUE(res == vals[i]); } -#endif // ROCKSDB_LITE delete db; } -// NewMemEnv returns nullptr in 
ROCKSDB_LITE since class InMemoryEnv isn't -// defined. -#ifndef ROCKSDB_LITE TEST_F(DBTest, MemEnvTest) { std::unique_ptr env{NewMemEnv(Env::Default())}; Options options; @@ -177,6 +172,7 @@ TEST_F(DBTest, MemEnvTest) { iterator->Next(); } ASSERT_TRUE(!iterator->Valid()); + ASSERT_OK(iterator->status()); delete iterator; DBImpl* dbi = static_cast_with_check(db); @@ -199,7 +195,6 @@ TEST_F(DBTest, MemEnvTest) { } delete db; } -#endif // ROCKSDB_LITE TEST_F(DBTest, WriteEmptyBatch) { Options options = CurrentOptions(); @@ -360,7 +355,7 @@ TEST_F(DBTest, MixedSlowdownOptionsInQueue) { for (int i = 0; i < 2; ++i) { threads.emplace_back(write_no_slowdown_func); } - // Sleep for 2s to allow the threads to insert themselves into the + // Sleep for 3s to allow the threads to insert themselves into the // write queue env_->SleepForMicroseconds(3000000ULL); } @@ -431,7 +426,7 @@ TEST_F(DBTest, MixedSlowdownOptionsStop) { for (int i = 0; i < 2; ++i) { threads.emplace_back(write_no_slowdown_func); } - // Sleep for 2s to allow the threads to insert themselves into the + // Sleep for 3s to allow the threads to insert themselves into the // write queue env_->SleepForMicroseconds(3000000ULL); } @@ -458,7 +453,6 @@ TEST_F(DBTest, MixedSlowdownOptionsStop) { wo.no_slowdown = true; ASSERT_OK(dbfull()->Put(wo, "foo3", "bar")); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); @@ -481,9 +475,7 @@ TEST_F(DBTest, LevelLimitReopen) { options.max_bytes_for_level_multiplier_additional.resize(10, 1); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(DBTest, LevelReopenWithFIFO) { const int kLevelCount = 4; const int kKeyCount = 5; @@ -593,7 +585,7 @@ TEST_F(DBTest, LevelReopenWithFIFO) { TryReopenWithColumnFamilies({"default", "pikachu"}, fifo_options)); // For FIFO to pick a compaction ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); - ASSERT_OK(dbfull()->TEST_WaitForCompact(false)); + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); for (int g = 0; g < kKeyCount; ++g) { std::string get_key = std::string(1, char('a' + g)); int status_index = i / kKeyCount; @@ -610,7 +602,6 @@ TEST_F(DBTest, LevelReopenWithFIFO) { ASSERT_EQ(expected_files_per_level_after_fifo[i], FilesPerLevel(kCF)); } } -#endif // !ROCKSDB_LITE TEST_F(DBTest, PutSingleDeleteGet) { do { @@ -656,6 +647,33 @@ TEST_F(DBTest, ReadFromPersistedTier) { ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); } + const auto check_multiget_func = + [&](const ReadOptions& read_opts, + std::vector cfhs, std::vector& keys, + std::vector& values, + bool batched) -> std::vector { + if (!batched) { + return db_->MultiGet(read_opts, cfhs, keys, &values); + } else { + size_t num_keys = keys.size(); + std::vector statuses; + std::vector pinnable_values; + statuses.resize(num_keys); + pinnable_values.resize(num_keys); + values.resize(num_keys); + db_->MultiGet(read_opts, cfhs[0], num_keys, keys.data(), + pinnable_values.data(), statuses.data(), false); + for (size_t i = 0; i < statuses.size(); ++i) { + if (statuses[i].ok()) { + values[i].assign(pinnable_values[i].data(), + pinnable_values[i].size()); + pinnable_values[i].Reset(); + } + } + return statuses; + } + }; + // Multiget std::vector multiget_cfs; multiget_cfs.push_back(handles_[1]); @@ -664,14 +682,17 @@ TEST_F(DBTest, ReadFromPersistedTier) { multiget_keys.push_back("foo"); multiget_keys.push_back("bar"); std::vector multiget_values; - auto statuses = - 
db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); - if (wopt.disableWAL) { - ASSERT_TRUE(statuses[0].IsNotFound()); - ASSERT_TRUE(statuses[1].IsNotFound()); - } else { - ASSERT_OK(statuses[0]); - ASSERT_OK(statuses[1]); + for (int i = 0; i < 2; i++) { + bool batched = i == 0; + auto statuses = check_multiget_func(ropt, multiget_cfs, multiget_keys, + multiget_values, batched); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[0].IsNotFound()); + ASSERT_TRUE(statuses[1].IsNotFound()); + } else { + ASSERT_OK(statuses[0]); + ASSERT_OK(statuses[1]); + } } // 2nd round: flush and put a new value in memtable. @@ -695,21 +716,26 @@ TEST_F(DBTest, ReadFromPersistedTier) { // Expect same result in multiget multiget_cfs.push_back(handles_[1]); multiget_keys.push_back("rocksdb"); - statuses = - db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); - ASSERT_TRUE(statuses[0].ok()); - ASSERT_EQ("first", multiget_values[0]); - ASSERT_TRUE(statuses[1].ok()); - ASSERT_EQ("one", multiget_values[1]); - if (wopt.disableWAL) { - ASSERT_TRUE(statuses[2].IsNotFound()); - } else { - ASSERT_OK(statuses[2]); + multiget_values.clear(); + + for (int i = 0; i < 2; i++) { + bool batched = i == 0; + auto statuses = check_multiget_func(ropt, multiget_cfs, multiget_keys, + multiget_values, batched); + ASSERT_TRUE(statuses[0].ok()); + ASSERT_EQ("first", multiget_values[0]); + ASSERT_TRUE(statuses[1].ok()); + ASSERT_EQ("one", multiget_values[1]); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[2].IsNotFound()); + } else { + ASSERT_OK(statuses[2]); + } } // 3rd round: delete and flush ASSERT_OK(db_->Delete(wopt, handles_[1], "foo")); - Flush(1); + ASSERT_OK(Flush(1)); ASSERT_OK(db_->Delete(wopt, handles_[1], "bar")); ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound()); @@ -724,17 +750,21 @@ TEST_F(DBTest, ReadFromPersistedTier) { ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok()); ASSERT_EQ(value, "hello"); - statuses = - db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); - ASSERT_TRUE(statuses[0].IsNotFound()); - if (wopt.disableWAL) { - ASSERT_TRUE(statuses[1].ok()); - ASSERT_EQ("one", multiget_values[1]); - } else { - ASSERT_TRUE(statuses[1].IsNotFound()); + multiget_values.clear(); + for (int i = 0; i < 2; i++) { + bool batched = i == 0; + auto statuses = check_multiget_func(ropt, multiget_cfs, multiget_keys, + multiget_values, batched); + ASSERT_TRUE(statuses[0].IsNotFound()); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[1].ok()); + ASSERT_EQ("one", multiget_values[1]); + } else { + ASSERT_TRUE(statuses[1].IsNotFound()); + } + ASSERT_TRUE(statuses[2].ok()); + ASSERT_EQ("hello", multiget_values[2]); } - ASSERT_TRUE(statuses[2].ok()); - ASSERT_EQ("hello", multiget_values[2]); if (wopt.disableWAL == 0) { DestroyAndReopen(options); } @@ -859,9 +889,7 @@ TEST_F(DBTest, DISABLED_VeryLargeValue) { ASSERT_OK(Put(key2, raw)); dbfull()->TEST_WaitForFlushMemTable(); -#ifndef ROCKSDB_LITE ASSERT_EQ(1, NumTableFilesAtLevel(0)); -#endif // !ROCKSDB_LITE std::string value; Status s = db_->Get(ReadOptions(), key1, &value); @@ -875,7 +903,7 @@ TEST_F(DBTest, DISABLED_VeryLargeValue) { ASSERT_EQ('w', value[0]); // Compact all files. - Flush(); + ASSERT_OK(Flush()); db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); // Check DB is not in read-only state. 
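Reviewer note: check_multiget_func above routes every lookup through both MultiGet flavors: the vector-based overload and the batched overload that fills caller-provided PinnableSlice/Status arrays. A minimal sketch of the two call shapes, not part of this patch; `db` and `cf` are assumed to be an open DB and one of its column family handles.

#include <string>
#include <vector>
#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

void MultiGetBothWays(DB* db, ColumnFamilyHandle* cf) {
  std::vector<Slice> keys = {"foo", "bar"};

  // 1) Vector-based overload: returns one Status per key and copies the
  //    values into std::strings.
  std::vector<std::string> values;
  std::vector<ColumnFamilyHandle*> cfs(keys.size(), cf);
  std::vector<Status> statuses =
      db->MultiGet(ReadOptions(), cfs, keys, &values);

  // 2) Batched overload: the caller owns the output arrays and the values
  //    stay pinned (no copy) until each PinnableSlice is Reset or destroyed.
  std::vector<PinnableSlice> pinned(keys.size());
  std::vector<Status> batched_statuses(keys.size());
  db->MultiGet(ReadOptions(), cf, keys.size(), keys.data(), pinned.data(),
               batched_statuses.data(), /*sorted_input=*/false);
}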
@@ -938,7 +966,6 @@ TEST_F(DBTest, WrongLevel0Config) { ASSERT_OK(DB::Open(options, dbname_, &db_)); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, GetOrderedByLevels) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); @@ -1009,7 +1036,6 @@ TEST_F(DBTest, GetEncountersEmptyLevel) { ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); } -#endif // ROCKSDB_LITE TEST_F(DBTest, FlushMultipleMemtable) { do { @@ -1029,7 +1055,6 @@ TEST_F(DBTest, FlushMultipleMemtable) { ASSERT_OK(Flush(1)); } while (ChangeCompactOptions()); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, FlushSchedule) { Options options = CurrentOptions(); options.disable_auto_compactions = true; @@ -1074,7 +1099,6 @@ TEST_F(DBTest, FlushSchedule) { ASSERT_LE(pikachu_tables, static_cast(10)); ASSERT_GT(pikachu_tables, static_cast(0)); } -#endif // ROCKSDB_LITE namespace { class KeepFilter : public CompactionFilter { @@ -1139,14 +1163,12 @@ class DelayFilterFactory : public CompactionFilterFactory { }; } // anonymous namespace -#ifndef ROCKSDB_LITE static std::string CompressibleString(Random* rnd, int len) { std::string r; test::CompressibleString(rnd, 0.8, len, &r); return r; } -#endif // ROCKSDB_LITE TEST_F(DBTest, FailMoreDbPaths) { Options options = CurrentOptions(); @@ -1268,7 +1290,6 @@ void CheckLiveFilesMeta( } } -#ifndef ROCKSDB_LITE void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number, uint64_t total_blob_count, uint64_t total_blob_bytes, const std::string& checksum_method, @@ -1322,7 +1343,7 @@ TEST_F(DBTest, MetaDataTest) { options.disable_auto_compactions = true; int64_t temp_time = 0; - options.env->GetCurrentTime(&temp_time); + ASSERT_OK(options.env->GetCurrentTime(&temp_time)); uint64_t start_time = static_cast(temp_time); DestroyAndReopen(options); @@ -1351,7 +1372,7 @@ TEST_F(DBTest, MetaDataTest) { std::vector> files_by_level; dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level); - options.env->GetCurrentTime(&temp_time); + ASSERT_OK(options.env->GetCurrentTime(&temp_time)); uint64_t end_time = static_cast(temp_time); ColumnFamilyMetaData cf_meta; @@ -1560,9 +1581,7 @@ TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) { } } while (ChangeCompactOptions()); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE static bool Between(uint64_t val, uint64_t low, uint64_t high) { bool result = (val >= low) && (val <= high); if (!result) { @@ -1959,9 +1978,7 @@ TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) { // ApproximateOffsetOf() is not yet implemented in plain table format. 
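Reviewer note: MetaDataTest above walks ColumnFamilyMetaData and per-file metadata, and now checks the status of GetCurrentTime(). A minimal sketch of querying that metadata through the public API, not part of this patch; `db` and `cf` are assumed handles.

#include <cstdio>
#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

using namespace ROCKSDB_NAMESPACE;

void DumpColumnFamilyMetaData(DB* db, ColumnFamilyHandle* cf) {
  ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(cf, &cf_meta);

  std::printf("cf=%s files=%zu bytes=%llu\n", cf_meta.name.c_str(),
              cf_meta.file_count,
              static_cast<unsigned long long>(cf_meta.size));
  for (const auto& level : cf_meta.levels) {
    for (const auto& file : level.files) {
      // smallestkey/largestkey are user keys; file_creation_time is what the
      // test compares against GetCurrentTime().
      std::printf("  L%d %s [%s..%s] created=%llu\n", level.level,
                  file.name.c_str(), file.smallestkey.c_str(),
                  file.largestkey.c_str(),
                  static_cast<unsigned long long>(file.file_creation_time));
    }
  }
}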
} while (ChangeOptions(kSkipPlainTable)); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(DBTest, Snapshot) { env_->SetMockSleep(); anon::OptionsOverride options_override; @@ -2083,7 +2100,6 @@ TEST_F(DBTest, HiddenValuesAreRemoved) { } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction | kSkipPlainTable)); } -#endif // ROCKSDB_LITE TEST_F(DBTest, UnremovableSingleDelete) { // If we compact: @@ -2134,7 +2150,6 @@ TEST_F(DBTest, UnremovableSingleDelete) { kSkipMergePut)); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, DeletionMarkers1) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); @@ -2254,7 +2269,6 @@ TEST_F(DBTest, OverlapInLevel0) { ASSERT_EQ("NOT_FOUND", Get(1, "600")); } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction)); } -#endif // ROCKSDB_LITE TEST_F(DBTest, ComparatorCheck) { class NewComparator : public Comparator { @@ -2442,7 +2456,6 @@ TEST_F(DBTest, DestroyDBMetaDatabase) { ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); @@ -2669,7 +2682,6 @@ TEST_F(DBTest, GetLiveBlobFiles) { ASSERT_EQ(cfmd.blob_file_count, 1U); ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size); } -#endif TEST_F(DBTest, PurgeInfoLogs) { Options options = CurrentOptions(); @@ -2717,7 +2729,6 @@ TEST_F(DBTest, PurgeInfoLogs) { } } -#ifndef ROCKSDB_LITE // Multi-threaded test: namespace { @@ -2904,7 +2915,6 @@ INSTANTIATE_TEST_CASE_P( ::testing::Combine( ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()), ::testing::Bool())); -#endif // ROCKSDB_LITE // Group commit test: #if !defined(OS_WIN) @@ -2977,6 +2987,7 @@ TEST_F(DBTest, GroupCommitTest) { itr->Next(); } ASSERT_TRUE(!itr->Valid()); + ASSERT_OK(itr->status()); delete itr; HistogramData hist_data; @@ -3111,7 +3122,6 @@ class ModelDB : public DB { return s; } -#ifndef ROCKSDB_LITE using DB::IngestExternalFile; Status IngestExternalFile( ColumnFamilyHandle* /*column_family*/, @@ -3131,7 +3141,7 @@ class ModelDB : public DB { const ColumnFamilyOptions& /*options*/, const std::string& /*column_family_name*/, const ImportColumnFamilyOptions& /*import_options*/, - const ExportImportFilesMetaData& /*metadata*/, + const std::vector& /*metadatas*/, ColumnFamilyHandle** /*handle*/) override { return Status::NotSupported("Not implemented."); } @@ -3141,6 +3151,13 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; Status GetPropertiesOfAllTables( ColumnFamilyHandle* /*column_family*/, @@ -3153,7 +3170,6 @@ class ModelDB : public DB { std::size_t /*n*/, TablePropertiesCollection* /*props*/) override { return Status(); } -#endif // ROCKSDB_LITE using DB::KeyMayExist; bool KeyMayExist(const ReadOptions& /*options*/, @@ -3304,6 +3320,11 @@ class ModelDB : public DB { void DisableManualCompaction() override { return; } + virtual Status WaitForCompact( + const WaitForCompactOptions& /* wait_for_compact_options */) override { + return Status::OK(); + } + using DB::NumberLevels; int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; } @@ -3346,7 +3367,6 @@ class ModelDB : public DB { Status DisableFileDeletions() override { return Status::OK(); } Status EnableFileDeletions(bool 
/*force*/) override { return Status::OK(); } -#ifndef ROCKSDB_LITE Status GetLiveFiles(std::vector&, uint64_t* /*size*/, bool /*flush_memtable*/ = true) override { @@ -3390,7 +3410,6 @@ class ModelDB : public DB { void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/, ColumnFamilyMetaData* /*metadata*/) override {} -#endif // ROCKSDB_LITE Status GetDbIdentity(std::string& /*identity*/) const override { return Status::OK(); @@ -3532,6 +3551,9 @@ static bool CompareIterators(int step, DB* model, DB* db, ok = false; } } + EXPECT_OK(miter->status()); + EXPECT_OK(dbiter->status()); + (void)count; delete miter; delete dbiter; return ok; @@ -3662,12 +3684,10 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); -#ifndef ROCKSDB_LITE // Back to original ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}})); ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); -#endif // !ROCKSDB_LITE // Same if there's a problem initally loading prefix transform options.prefix_extractor.reset(NewFixedPrefixTransform(1)); @@ -3679,12 +3699,10 @@ TEST_F(DBTest, BlockBasedTablePrefixIndexTest) { ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); -#ifndef ROCKSDB_LITE // Change again ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}})); ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); -#endif // !ROCKSDB_LITE SyncPoint::GetInstance()->DisableProcessing(); // Reopen with no prefix extractor, make sure everything still works. @@ -3711,7 +3729,7 @@ TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) { ASSERT_OK(Put("kk2", "v2")); ASSERT_OK(Put("kk", "v3")); ASSERT_OK(Put("k", "v4")); - Flush(); + ASSERT_OK(Flush()); ASSERT_EQ("v1", Get("kk1")); ASSERT_EQ("v2", Get("kk2")); @@ -3806,7 +3824,6 @@ TEST_F(DBTest, ChecksumTest) { ASSERT_EQ("h", Get("g")); } -#ifndef ROCKSDB_LITE TEST_P(DBTestWithParam, FIFOCompactionTest) { for (int iter = 0; iter < 2; ++iter) { // first iteration -- auto compaction @@ -4178,9 +4195,7 @@ TEST_F(DBTest, FIFOCompactionWithTTLTest) { options.compaction_options_fifo.max_table_files_size); } } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE /* * This test is not reliable enough as it heavily depends on disk behavior. * Disable as it is flaky. 
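Reviewer note: ModelDB above gains stubs for the newer DB::WaitForCompact(const WaitForCompactOptions&) and ClipColumnFamily entry points. A minimal sketch of calling WaitForCompact on a real DB, not part of this patch; it assumes default-constructed WaitForCompactOptions is the desired behavior.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

// Block until queued and running compactions finish, instead of polling the
// internal TEST_WaitForCompact()/TEST_WaitForBackgroundWork() helpers.
Status DrainCompactions(DB* db) {
  WaitForCompactOptions wait_opts;  // defaults: just wait, no extra flush
  return db->WaitForCompact(wait_opts);
}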
@@ -4348,8 +4363,8 @@ TEST_F(DBTest, ConcurrentMemtableNotSupported) { options.soft_pending_compaction_bytes_limit = 0; options.hard_pending_compaction_bytes_limit = 100; options.create_if_missing = true; - - DestroyDB(dbname_, options); + Close(); + ASSERT_OK(DestroyDB(dbname_, options)); options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4)); ASSERT_NOK(TryReopen(options)); @@ -4363,7 +4378,6 @@ TEST_F(DBTest, ConcurrentMemtableNotSupported) { ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle)); } -#endif // ROCKSDB_LITE TEST_F(DBTest, SanitizeNumThreads) { for (int attempt = 0; attempt < 2; attempt++) { @@ -4517,7 +4531,6 @@ TEST_F(DBTest, ManualFlushWalAndWriteRace) { ASSERT_EQ("value2", Get("foo2")); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, DynamicMemtableOptions) { const uint64_t k64KB = 1 << 16; const uint64_t k128KB = 1 << 17; @@ -4671,7 +4684,6 @@ TEST_F(DBTest, DynamicMemtableOptions) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // ROCKSDB_LITE #ifdef ROCKSDB_USING_THREAD_STATUS namespace { @@ -4693,7 +4705,7 @@ TEST_F(DBTest, GetThreadStatus) { Options options; options.env = env_; options.enable_thread_tracking = true; - TryReopen(options); + ASSERT_OK(TryReopen(options)); std::vector thread_list; Status s = env_->GetThreadList(&thread_list); @@ -4764,7 +4776,7 @@ TEST_F(DBTest, DisableThreadStatus) { Options options; options.env = env_; options.enable_thread_tracking = false; - TryReopen(options); + ASSERT_OK(TryReopen(options)); CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); // Verify non of the column family info exists env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_, @@ -4973,7 +4985,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { options.level0_slowdown_writes_trigger = 1 << 10; options.max_subcompactions = max_subcompactions_; - TryReopen(options); + ASSERT_OK(TryReopen(options)); Random rnd(301); std::vector thread_list; @@ -5020,7 +5032,7 @@ TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) { ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1); CancelAllBackgroundWork(db_); TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); // Record the number of compactions at a time. for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { operation_count[i] = 0; @@ -5062,7 +5074,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { options.level0_slowdown_writes_trigger = 1 << 10; options.max_subcompactions = max_subcompactions_; - TryReopen(options); + ASSERT_OK(TryReopen(options)); Random rnd(301); std::vector thread_list; @@ -5107,7 +5119,7 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { CancelAllBackgroundWork(db_); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown"); TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); // Record the number of compactions at a time. 
for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) { operation_count[i] = 0; @@ -5121,7 +5133,6 @@ TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) { #endif // ROCKSDB_USING_THREAD_STATUS -#ifndef ROCKSDB_LITE TEST_F(DBTest, FlushOnDestroy) { WriteOptions wo; wo.disableWAL = true; @@ -5344,6 +5355,7 @@ TEST_F(DBTest, DynamicCompactionOptions) { const uint64_t k1MB = 1 << 20; const uint64_t k4KB = 1 << 12; Options options; + options.level_compaction_dynamic_level_bytes = false; options.env = env_; options.create_if_missing = true; options.compression = kNoCompression; @@ -5674,7 +5686,6 @@ TEST_F(DBTest, DynamicUniversalCompactionOptions) { dbfull()->GetOptions().compaction_options_universal.allow_trivial_move, false); } -#endif // ROCKSDB_LITE TEST_F(DBTest, FileCreationRandomFailure) { Options options; @@ -5737,7 +5748,6 @@ TEST_F(DBTest, FileCreationRandomFailure) { } } -#ifndef ROCKSDB_LITE TEST_F(DBTest, DynamicMiscOptions) { // Test max_sequential_skip_in_iterations @@ -5832,7 +5842,6 @@ TEST_F(DBTest, DynamicMiscOptions) { &mutable_cf_options)); ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order); } -#endif // ROCKSDB_LITE TEST_F(DBTest, L0L1L2AndUpHitCounter) { const int kNumLevels = 3; @@ -6045,6 +6054,7 @@ TEST_F(DBTest, MergeTestTime) { ASSERT_OK(iter->status()); ++count; } + ASSERT_OK(iter->status()); ASSERT_EQ(1, count); ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME)); @@ -6054,7 +6064,6 @@ TEST_F(DBTest, MergeTestTime) { } #endif // OS_WIN -#ifndef ROCKSDB_LITE TEST_P(DBTestWithParam, MergeCompactionTimeTest) { SetPerfLevel(kEnableTime); Options options = CurrentOptions(); @@ -6117,7 +6126,64 @@ TEST_P(DBTestWithParam, FilterCompactionTimeTest) { TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME)); delete itr; } -#endif // ROCKSDB_LITE + +#ifndef OS_WIN +// CPUMicros() is not supported. See WinClock::CPUMicros(). +TEST_P(DBTestWithParam, CompactionTotalTimeTest) { + int record_count = 0; + class TestStatistics : public StatisticsImpl { + public: + explicit TestStatistics(int* record_count) + : StatisticsImpl(nullptr), record_count_(record_count) {} + void recordTick(uint32_t ticker_type, uint64_t count) override { + if (ticker_type == COMPACTION_CPU_TOTAL_TIME) { + ASSERT_GT(count, 0); + (*record_count_)++; + } + StatisticsImpl::recordTick(ticker_type, count); + } + + int* record_count_; + }; + + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.create_if_missing = true; + options.statistics = std::make_shared(&record_count); + options.statistics->set_stats_level(kExceptTimeForMutex); + options.max_subcompactions = max_subcompactions_; + DestroyAndReopen(options); + + int n = 0; + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(Put(std::to_string(table * 1000 + i), "val")); + ++n; + } + // Overlapping tables + ASSERT_OK(Put(std::to_string(0), "val")); + ++n; + ASSERT_OK(Flush()); + } + + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + + // Hard-coded number in CompactionJob::ProcessKeyValueCompaction(). + const int kRecordStatsEvery = 1000; + // The stat COMPACTION_CPU_TOTAL_TIME should be recorded + // during compaction and once more after compaction. + ASSERT_EQ(n / kRecordStatsEvery + 1, record_count); + + // Check that COMPACTION_CPU_TOTAL_TIME correctly + // records compaction time after a compaction. 
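Reviewer note: CompactionTotalTimeTest above validates the COMPACTION_CPU_TOTAL_TIME ticker by overriding recordTick on the internal StatisticsImpl. Through the public Statistics interface the same counters can simply be read back after a compaction; a minimal sketch, not part of this patch, assuming `db` was opened with `stats` set as Options::statistics and already holds data.

#include <cassert>
#include <cstdio>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

using namespace ROCKSDB_NAMESPACE;

void ReportCompactionCpu(DB* db, const std::shared_ptr<Statistics>& stats) {
  // Trigger a full manual compaction, then read the compaction CPU counters.
  assert(db->CompactRange(CompactRangeOptions(), nullptr, nullptr).ok());

  uint64_t cpu_total = stats->getTickerCount(COMPACTION_CPU_TOTAL_TIME);

  HistogramData h;
  stats->histogramData(COMPACTION_CPU_TIME, &h);
  // The test above asserts the histogram max matches the ticker total after
  // a single compaction.
  std::printf("compaction cpu total: %llu, per-compaction max: %.0f\n",
              static_cast<unsigned long long>(cpu_total), h.max);
}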
+ HistogramData h; + options.statistics->histogramData(COMPACTION_CPU_TIME, &h); + ASSERT_EQ(1, h.count); + ASSERT_EQ(h.max, TestGetTickerCount(options, COMPACTION_CPU_TOTAL_TIME)); +} +#endif TEST_F(DBTest, TestLogCleanup) { Options options = CurrentOptions(); @@ -6134,7 +6200,6 @@ TEST_F(DBTest, TestLogCleanup) { } } -#ifndef ROCKSDB_LITE TEST_F(DBTest, EmptyCompactedDB) { Options options = CurrentOptions(); options.max_open_files = -1; @@ -6144,9 +6209,7 @@ TEST_F(DBTest, EmptyCompactedDB) { ASSERT_TRUE(s.IsNotSupported()); Close(); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(DBTest, SuggestCompactRangeTest) { class CompactionFilterFactoryGetContext : public CompactionFilterFactory { public: @@ -6375,7 +6438,6 @@ TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) { ASSERT_EQ(NumTableFilesAtLevel(0), 0); ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files); } -#endif // ROCKSDB_LITE TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { const int kNumL0Files = 50; @@ -6433,7 +6495,6 @@ TEST_F(DBTest, AutomaticConflictsWithManualCompaction) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) { Options options = CurrentOptions(); options.max_background_compactions = 1; @@ -6493,7 +6554,6 @@ TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) { ASSERT_LE(cf_meta_data.levels[0].files.size(), options.level0_file_num_compaction_trigger); } -#endif // ROCKSDB_LITE // Github issue #595 // Large write batch with column families @@ -6704,7 +6764,7 @@ TEST_F(DBTest, HardLimit) { sleeping_task_low.WaitUntilDone(); } -#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +#if !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) class WriteStallListener : public EventListener { public: WriteStallListener() : condition_(WriteStallCondition::kNormal) {} @@ -6946,8 +7006,7 @@ TEST_F(DBTest, LastWriteBufferDelay) { sleeping_task.WakeUp(); sleeping_task.WaitUntilDone(); } -#endif // !defined(ROCKSDB_LITE) && - // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +#endif // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) TEST_F(DBTest, FailWhenCompressionNotSupportedTest) { CompressionType compressions[] = {kZlibCompression, kBZip2Compression, @@ -6983,11 +7042,23 @@ TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) { delete handle; } -#ifndef ROCKSDB_LITE TEST_F(DBTest, RowCache) { Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); - options.row_cache = NewLRUCache(8192); + LRUCacheOptions cache_options; + cache_options.capacity = 8192; + options.row_cache = cache_options.MakeSharedRowCache(); + // BEGIN check that Cache classes as aliases of each other. + // Currently, RowCache and BlockCache are aliases for Cache. + // This is expected to change (carefully, intentionally) + std::shared_ptr row_cache = options.row_cache; + std::shared_ptr cache = row_cache; + std::shared_ptr block_cache = row_cache; + row_cache = cache; + block_cache = cache; + row_cache = block_cache; + cache = block_cache; + // END check that Cache classes as aliases of each other. 
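Reviewer note: the RowCache test now builds the row cache through LRUCacheOptions::MakeSharedRowCache() instead of NewLRUCache(). A minimal sketch of wiring a row cache into Options and observing ROW_CACHE_HIT/ROW_CACHE_MISS, not part of this patch; the path "/tmp/row_cache_demo" is an assumption.

#include <cassert>
#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  options.statistics = CreateDBStatistics();

  LRUCacheOptions cache_opts;
  cache_opts.capacity = 8 << 10;  // 8 KiB, as in the test
  options.row_cache = cache_opts.MakeSharedRowCache();

  DB* db = nullptr;
  assert(DB::Open(options, "/tmp/row_cache_demo", &db).ok());
  assert(db->Put(WriteOptions(), "foo", "bar").ok());
  assert(db->Flush(FlushOptions()).ok());

  std::string v;
  assert(db->Get(ReadOptions(), "foo", &v).ok());  // miss: populates row cache
  assert(db->Get(ReadOptions(), "foo", &v).ok());  // hit: served from row cache
  assert(options.statistics->getTickerCount(ROW_CACHE_HIT) == 1);
  assert(options.statistics->getTickerCount(ROW_CACHE_MISS) == 1);

  delete db;
  return 0;
}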
DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar")); @@ -7001,6 +7072,28 @@ TEST_F(DBTest, RowCache) { ASSERT_EQ(Get("foo"), "bar"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + + // Also test non-OK cache insertion (would be ASAN failure on memory leak) + class FailInsertionCache : public CacheWrapper { + public: + using CacheWrapper::CacheWrapper; + const char* Name() const override { return "FailInsertionCache"; } + Status Insert(const Slice&, Cache::ObjectPtr, const CacheItemHelper*, + size_t, Handle** = nullptr, Priority = Priority::LOW, + const Slice& /*compressed*/ = Slice(), + CompressionType /*type*/ = kNoCompression) override { + return Status::MemoryLimit(); + } + }; + options.row_cache = std::make_shared(options.row_cache); + ASSERT_OK(options.statistics->Reset()); + Reopen(options); + + ASSERT_EQ(Get("foo"), "bar"); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + ASSERT_EQ(Get("foo"), "bar"); + // Test condition requires row cache insertion to fail + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); } TEST_F(DBTest, PinnableSliceAndRowCache) { @@ -7115,7 +7208,6 @@ TEST_F(DBTest, ReusePinnableSlice) { 1); } -#endif // ROCKSDB_LITE TEST_F(DBTest, DeletingOldWalAfterDrop) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( @@ -7241,7 +7333,6 @@ TEST_F(DBTest, LargeBlockSizeTest) { ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } -#ifndef ROCKSDB_LITE TEST_F(DBTest, CreationTimeOfOldestFile) { const int kNumKeysPerFile = 32; @@ -7261,14 +7352,14 @@ TEST_F(DBTest, CreationTimeOfOldestFile) { int idx = 0; int64_t time_1 = 0; - env_->GetCurrentTime(&time_1); + ASSERT_OK(env_->GetCurrentTime(&time_1)); const uint64_t uint_time_1 = static_cast(time_1); // Add 50 hours env_->MockSleepForSeconds(50 * 60 * 60); int64_t time_2 = 0; - env_->GetCurrentTime(&time_2); + ASSERT_OK(env_->GetCurrentTime(&time_2)); const uint64_t uint_time_2 = static_cast(time_2); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -7430,7 +7521,6 @@ TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) { thd.join(); } -#endif } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_test2.cc b/db/db_test2.cc index b4f1664f47b8..e471685b210d 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -48,7 +48,6 @@ class DBTest2 : public DBTestBase { } }; -#ifndef ROCKSDB_LITE TEST_F(DBTest2, OpenForReadOnly) { DB* db_ptr = nullptr; std::string dbname = test::PerThreadDBPath("db_readonly"); @@ -106,66 +105,6 @@ TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { ASSERT_NOK(env_->FileExists(dbname)); } -class TestReadOnlyWithCompressedCache - : public DBTestBase, - public testing::WithParamInterface> { - public: - TestReadOnlyWithCompressedCache() - : DBTestBase("test_readonly_with_compressed_cache", - /*env_do_fsync=*/true) { - max_open_files_ = std::get<0>(GetParam()); - use_mmap_ = std::get<1>(GetParam()); - } - int max_open_files_; - bool use_mmap_; -}; - -TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) { - if (use_mmap_ && !IsMemoryMappedAccessSupported()) { - ROCKSDB_GTEST_SKIP("Test requires MMAP support"); - return; - } - ASSERT_OK(Put("foo", "bar")); - ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar")); - ASSERT_OK(Flush()); - - DB* db_ptr = nullptr; - Options options = CurrentOptions(); - options.allow_mmap_reads = use_mmap_; - options.max_open_files = max_open_files_; - options.compression = kSnappyCompression; - BlockBasedTableOptions table_options; - 
table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); - table_options.no_block_cache = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.statistics = CreateDBStatistics(); - - ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr)); - - std::string v; - ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("bar", v); - ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); - ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v)); - ASSERT_EQ("bar", v); - if (Snappy_Supported()) { - if (use_mmap_) { - ASSERT_EQ(0, - options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); - } else { - ASSERT_EQ(1, - options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT)); - } - } - - delete db_ptr; -} - -INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache, - TestReadOnlyWithCompressedCache, - ::testing::Combine(::testing::Values(-1, 100), - ::testing::Bool())); - class PartitionedIndexTestListener : public EventListener { public: void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { @@ -206,7 +145,6 @@ TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { } } -#endif // ROCKSDB_LITE class PrefixFullBloomWithReverseComparator : public DBTestBase, @@ -328,7 +266,7 @@ TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) { ASSERT_OK(Put(1, "a", "begin")); ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); - TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); std::string value; value = Get(1, "a"); @@ -350,7 +288,6 @@ TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) { Reopen(options); } -#ifndef ROCKSDB_LITE class DBTestSharedWriteBufferAcrossCFs : public DBTestBase, public testing::WithParamInterface> { @@ -420,10 +357,10 @@ TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { // are newer CFs created. flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); - Flush(3); + ASSERT_OK(Flush(3)); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); - Flush(0); + ASSERT_OK(Flush(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), @@ -1604,9 +1541,7 @@ TEST_P(PresetCompressionDictTest, CompactNonBottommost) { } ASSERT_OK(Flush()); } -#ifndef ROCKSDB_LITE ASSERT_EQ("2,0,1", FilesPerLevel(0)); -#endif // ROCKSDB_LITE uint64_t prev_compression_dict_bytes_inserted = TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); @@ -1614,9 +1549,7 @@ TEST_P(PresetCompressionDictTest, CompactNonBottommost) { // file is not bottommost due to the existing L2 file covering the same key- // range. ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1,1", FilesPerLevel(0)); -#endif // ROCKSDB_LITE // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a // compression dictionary exists since dictionaries would be preloaded when // the compaction finishes. 
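Reviewer note: the PresetCompressionDict tests above use the BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT ticker to detect when a dictionary block lands in the block cache after a compaction. A minimal sketch of enabling dictionary compression, not part of this patch; it assumes ZSTD support is compiled in.

#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

Options DictionaryCompressionOptions() {
  Options options;
  options.create_if_missing = true;
  options.compression = kZSTD;  // assumes ZSTD is available in this build
  // Train a 4 KiB dictionary from up to 16 KiB of sampled input per
  // compaction output file.
  options.compression_opts.max_dict_bytes = 4 << 10;
  options.compression_opts.zstd_max_train_bytes = 16 << 10;
  return options;
}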
@@ -1680,17 +1613,13 @@ TEST_P(PresetCompressionDictTest, CompactBottommost) { } ASSERT_OK(Flush()); } -#ifndef ROCKSDB_LITE ASSERT_EQ("2", FilesPerLevel(0)); -#endif // ROCKSDB_LITE uint64_t prev_compression_dict_bytes_inserted = TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); CompactRangeOptions cro; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel(0)); -#endif // ROCKSDB_LITE ASSERT_GT( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), prev_compression_dict_bytes_inserted); @@ -1876,6 +1805,7 @@ TEST_P(CompressionFailuresTest, CompressionFailures) { ASSERT_EQ(key_value_written[key], value); key_value_written.erase(key); } + ASSERT_OK(db_iter->status()); ASSERT_EQ(0, key_value_written.size()); } else if (compression_failure_type_ == kTestDecompressionFail) { ASSERT_EQ(std::string(s.getState()), @@ -2057,7 +1987,6 @@ TEST_F(DBTest2, CompactionStall) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // ROCKSDB_LITE TEST_F(DBTest2, FirstSnapshotTest) { Options options; @@ -2074,7 +2003,6 @@ TEST_F(DBTest2, FirstSnapshotTest) { db_->ReleaseSnapshot(s1); } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, DuplicateSnapshot) { Options options; options = CurrentOptions(options); @@ -2106,7 +2034,6 @@ TEST_F(DBTest2, DuplicateSnapshot) { db_->ReleaseSnapshot(s); } } -#endif // ROCKSDB_LITE class PinL0IndexAndFilterBlocksTest : public DBTestBase, @@ -2137,11 +2064,12 @@ class PinL0IndexAndFilterBlocksTest ASSERT_OK(Flush(1)); // move this table to L1 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); + ASSERT_EQ(1, NumTableFilesAtLevel(1, 1)); // reset block cache table_options.block_cache = NewLRUCache(64 * 1024); options->table_factory.reset(NewBlockBasedTableFactory(table_options)); - TryReopenWithColumnFamilies({"default", "pikachu"}, *options); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, *options)); // create new table at L0 ASSERT_OK(Put(1, "a2", "begin2")); ASSERT_OK(Put(1, "z2", "end2")); @@ -2261,7 +2189,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { // Reopen database. If max_open_files is set as -1, table readers will be // preloaded. This will trigger a BlockBasedTable::Open() and prefetch // L0 index and filter. Level 1's prefetching is disabled in DB::Open() - TryReopenWithColumnFamilies({"default", "pikachu"}, options); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -2294,7 +2222,7 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { // this should be read from L1 value = Get(1, "a"); if (!disallow_preload_) { - // In inifinite max files case, there's a cache miss in executing Get() + // In infinite max files case, there's a cache miss in executing Get() // because index and filter are not prefetched before. 
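Reviewer note: PinL0IndexAndFilterBlocksTest above adjusts its expected BLOCK_CACHE_INDEX_HIT counts; the options it exercises are public. A minimal sketch of caching and pinning level-0 index/filter blocks, not part of this patch.

#include "rocksdb/cache.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

using namespace ROCKSDB_NAMESPACE;

Options PinL0MetaBlocksOptions() {
  BlockBasedTableOptions table_options;
  table_options.cache_index_and_filter_blocks = true;  // meta blocks go via block cache
  table_options.pin_l0_filter_and_index_blocks_in_cache = true;  // but L0 copies stay pinned
  table_options.filter_policy.reset(NewBloomFilterPolicy(10));
  table_options.block_cache = NewLRUCache(64 << 10);

  Options options;
  options.create_if_missing = true;
  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
  return options;
}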
ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); @@ -2322,12 +2250,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Bloom and index hit will happen when a Get() happens. @@ -2336,12 +2264,12 @@ TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } } @@ -2351,7 +2279,6 @@ INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest, std::make_tuple(false, false), std::make_tuple(false, true))); -#ifndef ROCKSDB_LITE TEST_F(DBTest2, MaxCompactionBytesTest) { Options options = CurrentOptions(); options.memtable_factory.reset(test::NewSpecialSkipListFactory( @@ -2625,7 +2552,6 @@ TEST_F(DBTest2, PersistentCache) { new MockPersistentCache(type, 10 * 1024)); table_options.no_block_cache = true; table_options.block_cache = bsize ? 
NewLRUCache(bsize) : nullptr; - table_options.block_cache_compressed = nullptr; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); @@ -2718,7 +2644,6 @@ TEST_F(DBTest2, SyncPointMarker) { ASSERT_EQ(sync_point_called.load(), 1); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif size_t GetEncodedEntrySize(size_t key_size, size_t value_size) { std::string buffer; @@ -2925,7 +2850,6 @@ TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { } #endif // !OS_SOLARIS -#ifndef ROCKSDB_LITE TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) { Options options = CurrentOptions(); options.num_levels = 3; @@ -3122,7 +3046,7 @@ TEST_F(DBTest2, PausingManualCompaction1) { .IsManualCompactionPaused()); // Wait for compactions to get scheduled and stopped - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Get file names after compaction is stopped files_meta.clear(); @@ -3142,7 +3066,7 @@ TEST_F(DBTest2, PausingManualCompaction1) { files_before_compact, 0) .IsManualCompactionPaused()); // Wait for manual compaction to get scheduled and finish - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); files_meta.clear(); files_after_compact.clear(); @@ -3175,7 +3099,7 @@ TEST_F(DBTest2, PausingManualCompaction2) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); std::vector files_meta; dbfull()->GetLiveFilesMetaData(&files_meta); @@ -3206,9 +3130,7 @@ TEST_F(DBTest2, PausingManualCompaction3) { DestroyAndReopen(options); generate_files(); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); -#endif // !ROCKSDB_LITE int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:1", @@ -3219,21 +3141,17 @@ TEST_F(DBTest2, PausingManualCompaction3) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // As manual compaction disabled, not even reach sync point ASSERT_EQ(run_manual_compactions, 0); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:1"); dbfull()->EnableManualCompaction(); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); -#ifndef ROCKSDB_LITE + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -3262,9 +3180,7 @@ TEST_F(DBTest2, PausingManualCompaction4) { DestroyAndReopen(options); generate_files(); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); -#endif // !ROCKSDB_LITE int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) { @@ -3289,19 +3205,15 @@ TEST_F(DBTest2, PausingManualCompaction4) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(run_manual_compactions, 1); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", 
FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:2"); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); -#ifndef ROCKSDB_LITE + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -3334,9 +3246,7 @@ TEST_F(DBTest2, CancelManualCompaction1) { DestroyAndReopen(options); generate_files(); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); -#endif // !ROCKSDB_LITE int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( @@ -3354,15 +3264,13 @@ TEST_F(DBTest2, CancelManualCompaction1) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Since compactions are disabled, we shouldn't start compacting. // E.g. we should call the compaction function exactly one time. ASSERT_EQ(compactions_run, 0); ASSERT_EQ(run_manual_compactions, 0); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); -#endif // !ROCKSDB_LITE compactions_run = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( @@ -3380,7 +3288,7 @@ TEST_F(DBTest2, CancelManualCompaction1) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(compactions_run, 3); @@ -3392,10 +3300,8 @@ TEST_F(DBTest2, CancelManualCompaction1) { // Compactions should work again if we re-enable them.. compact_options.canceled->store(false, std::memory_order_relaxed); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); -#ifndef ROCKSDB_LITE + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -3429,9 +3335,7 @@ TEST_F(DBTest2, CancelManualCompaction2) { DestroyAndReopen(options); generate_files(); -#ifndef ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -3460,7 +3364,7 @@ TEST_F(DBTest2, CancelManualCompaction2) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // NOTE: as we set compact_options.max_subcompacitons = 1, and store true to // the canceled variable from the single compacting thread (via callback), @@ -3478,10 +3382,8 @@ TEST_F(DBTest2, CancelManualCompaction2) { // Compactions should work again if we re-enable them.. 
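Reviewer note: the CancelManualCompaction tests above drive cancellation through CompactRangeOptions::canceled and DB::DisableManualCompaction(), then drain with TEST_WaitForCompact(). A minimal sketch of cancelling a manual CompactRange from another thread via the public API, not part of this patch; whether the compaction finishes or is paused depends on timing, so both outcomes are accepted.

#include <atomic>
#include <cassert>
#include <thread>
#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

void CompactWithCancel(DB* db) {
  std::atomic<bool> canceled{false};
  CompactRangeOptions cro;
  cro.canceled = &canceled;  // compaction polls this flag and aborts when set

  std::thread canceller(
      [&] { canceled.store(true, std::memory_order_release); });

  Status s = db->CompactRange(cro, nullptr, nullptr);
  // A cancelled manual compaction surfaces with the "manual compaction
  // paused" subcode, which the tests above assert on.
  assert(s.ok() || s.IsManualCompactionPaused());
  canceller.join();

  canceled.store(false, std::memory_order_release);  // re-arm before retrying
  assert(db->CompactRange(cro, nullptr, nullptr).ok());
}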
compact_options.canceled->store(false, std::memory_order_relaxed); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); -#ifndef ROCKSDB_LITE + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); -#endif // !ROCKSDB_LITE ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -3555,7 +3457,7 @@ TEST_F(DBTest2, CancelManualCompactionWithListener) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); @@ -3570,7 +3472,7 @@ TEST_F(DBTest2, CancelManualCompactionWithListener) { ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); @@ -3592,7 +3494,7 @@ TEST_F(DBTest2, CancelManualCompactionWithListener) { compact_options.canceled->store(false, std::memory_order_release); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); @@ -3709,7 +3611,6 @@ TEST_F(DBTest2, OptimizeForSmallDB) { value.Reset(); } -#endif // ROCKSDB_LITE TEST_F(DBTest2, IterRaceFlush1) { ASSERT_OK(Put("foo", "v1")); @@ -3901,10 +3802,11 @@ TEST_F(DBTest2, MemtableOnlyIterator) { count++; } ASSERT_TRUE(!it->Valid()); + ASSERT_OK(it->status()); ASSERT_EQ(2, count); delete it; - Flush(1); + ASSERT_OK(Flush(1)); // After flushing // point lookups @@ -3980,19 +3882,28 @@ TEST_F(DBTest2, LowPriWrite) { ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); + wo.low_pri = true; + std::string big_value = std::string(1 * 1024 * 1024, 'x'); + ASSERT_OK(Put("", big_value, wo)); + ASSERT_LT(1, rate_limit_count.load()); + // Reset + rate_limit_count = 0; + wo.low_pri = false; + ASSERT_OK(Put("", big_value, wo)); + ASSERT_EQ(0, rate_limit_count.load()); + TEST_SYNC_POINT("DBTest.LowPriWrite:0"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); wo.low_pri = true; ASSERT_OK(Put("", "", wo)); - ASSERT_EQ(1, rate_limit_count.load()); + ASSERT_EQ(0, rate_limit_count.load()); wo.low_pri = false; ASSERT_OK(Put("", "", wo)); - ASSERT_EQ(1, rate_limit_count.load()); + ASSERT_EQ(0, rate_limit_count.load()); } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, RateLimitedCompactionReads) { // compaction input has 512KB data const int kNumKeysPerFile = 128; @@ -4073,7 +3984,6 @@ TEST_F(DBTest2, RateLimitedCompactionReads) { } } } -#endif // ROCKSDB_LITE // Make sure DB can be reopen with reduced number of levels, given no file // is on levels higher than the new num_levels. 
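Reviewer note: LowPriWrite now also verifies that a large low-priority write is charged to the rate limiter repeatedly while a normal-priority write is not. A minimal sketch of the public knobs involved, not part of this patch; the 1 MiB/s limit is an illustrative assumption.

#include <memory>
#include "rocksdb/options.h"
#include "rocksdb/rate_limiter.h"

using namespace ROCKSDB_NAMESPACE;

// Writes issued with WriteOptions::low_pri may be throttled by the DB's rate
// limiter when compaction is falling behind; normal-priority writes are not.
Options LowPriWriteOptions() {
  Options options;
  options.create_if_missing = true;
  options.rate_limiter.reset(NewGenericRateLimiter(1 << 20 /* 1 MiB/s */));
  return options;
}

WriteOptions LowPriWrite() {
  WriteOptions wo;
  wo.low_pri = true;  // deprioritize (and possibly throttle) this write
  return wo;
}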
@@ -4086,21 +3996,15 @@ TEST_F(DBTest2, ReduceLevel) { ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); MoveFilesToLevel(6); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 1; ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE options.num_levels = 3; Reopen(options); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel()); -#endif // !ROCKSDB_LITE } // Test that ReadCallback is actually used in both memtbale and sst tables @@ -4135,18 +4039,14 @@ TEST_F(DBTest2, ReadCallbackTest) { } ASSERT_OK(Flush()); MoveFilesToLevel(6); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); -#endif // !ROCKSDB_LITE for (; i < 30; i++) { ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } ASSERT_OK(Flush()); -#ifndef ROCKSDB_LITE ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel()); -#endif // !ROCKSDB_LITE // And also add some values to the memtable for (; i < 40; i++) { ASSERT_OK(Put(key, value + std::to_string(i))); @@ -4188,7 +4088,6 @@ TEST_F(DBTest2, ReadCallbackTest) { } } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { // Regression test for race condition where an obsolete file is returned to @@ -4225,7 +4124,7 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { ASSERT_OK(Put("key", "val")); FlushOptions flush_opts; flush_opts.wait = false; - db_->Flush(flush_opts); + ASSERT_OK(db_->Flush(flush_opts)); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"); ASSERT_OK(db_->DisableFileDeletions()); @@ -4236,7 +4135,7 @@ TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber()))); } - ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_OK(db_->EnableFileDeletions(/*force=*/false)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -5257,7 +5156,6 @@ TEST_F(DBTest2, TraceWithFilter) { ASSERT_EQ(count, 6); } -#endif // ROCKSDB_LITE TEST_F(DBTest2, PinnableSliceAndMmapReads) { Options options = CurrentOptions(); @@ -5288,7 +5186,6 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { // compaction. It crashes if it does. 
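For context on the hunk being trimmed here, the PinnableSliceAndMmapReads test exercises roughly this public API (an illustrative sketch only, not part of the patch):

#include "rocksdb/db.h"

void PinnedGetOverMmap(rocksdb::DB* db) {
  // Requires the DB to have been opened with options.allow_mmap_reads = true.
  rocksdb::PinnableSlice pinned;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(),
                              db->DefaultColumnFamily(), "foo", &pinned);
  if (s.ok() && pinned.IsPinned()) {
    // The value points directly into the mmap'd SST file, so it must be
    // consumed (or copied) before anything, such as a compaction, can
    // evict that file from the table cache.
  }
}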
ASSERT_EQ(pinned_value.ToString(), "bar"); -#ifndef ROCKSDB_LITE pinned_value.Reset(); // Unsafe to pin mmap files when they could be kicked out of table cache Close(); @@ -5306,7 +5203,6 @@ TEST_F(DBTest2, PinnableSliceAndMmapReads) { ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); ASSERT_TRUE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); -#endif } TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { @@ -5415,88 +5311,6 @@ TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBTest2, TestBBTTailPrefetch) { - std::atomic called(false); - size_t expected_lower_bound = 512 * 1024; - size_t expected_higher_bound = 512 * 1024; - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - EXPECT_LE(expected_lower_bound, *prefetch_size); - EXPECT_GE(expected_higher_bound, *prefetch_size); - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - expected_lower_bound = 0; - expected_higher_bound = 8 * 1024; - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - // Full compaction to make sure there is no L0 file after the open. - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - called = false; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - - std::atomic first_call(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - if (first_call) { - EXPECT_EQ(4 * 1024, *prefetch_size); - first_call = false; - } else { - EXPECT_GE(4 * 1024, *prefetch_size); - } - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Options options = CurrentOptions(); - options.max_file_opening_threads = 1; // one thread - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.max_open_files = -1; - Reopen(options); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_TRUE(called.load()); - called = false; - - // Parallel loading SST files - options.max_file_opening_threads = 16; - Reopen(options); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -} - TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked @@ -5539,7 +5353,6 @@ TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, TestCompactFiles) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked @@ -5607,7 +5420,6 @@ TEST_F(DBTest2, TestCompactFiles) { 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // ROCKSDB_LITE TEST_F(DBTest2, MultiDBParallelOpenTest) { const int kNumDbs = 2; @@ -5793,7 +5605,6 @@ TEST_F(DBTest2, PrefixBloomFilteredOut) { delete iter; } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, RowCacheSnapshot) { Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); @@ -5837,7 +5648,6 @@ TEST_F(DBTest2, RowCacheSnapshot) { db_->ReleaseSnapshot(s2); db_->ReleaseSnapshot(s3); } -#endif // ROCKSDB_LITE // When DB is reopened with multiple column families, the manifest file // is written after the first CF is flushed, and it is written again @@ -6026,10 +5836,8 @@ TEST_F(DBTest2, SameSmallestInSameLevel) { ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "8")); ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); -#ifndef ROCKSDB_LITE + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,4,1", FilesPerLevel()); -#endif // ROCKSDB_LITE ASSERT_EQ("2,3,4,5,6,7,8", Get("key")); } @@ -6162,11 +5970,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) { // Sometimes filter is checked based on upper bound. Assert counters // for that case. Otherwise, only check data correctness. -#ifndef ROCKSDB_LITE bool expect_filter_check = !use_partitioned_filter; -#else - bool expect_filter_check = false; -#endif table_options.partition_filters = use_partitioned_filter; if (use_partitioned_filter) { table_options.index_type = @@ -6201,17 +6005,15 @@ TEST_F(DBTest2, ChangePrefixExtractor) { iterator->Seek("xa"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); - // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not - // correct in this case. So don't check counters in this case. 
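The replacements below drop the legacy BLOOM_FILTER_PREFIX_CHECKED ticker in favor of the newer seek-filter tickers and read them through the PopTicker shorthand added to DBTestBase later in this diff. Outside the fixture, the same read-and-reset pattern is just a thin wrapper over Statistics, roughly:

#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

// Read a ticker and reset it so the next check starts from zero.
uint64_t PopTickerValue(const rocksdb::Options& options,
                        rocksdb::Tickers ticker) {
  return options.statistics->getAndResetTickerCount(ticker);
}

// e.g., after an iterator Seek() that the prefix Bloom filter should answer:
//   uint64_t matches =
//       PopTickerValue(options, rocksdb::NON_LAST_LEVEL_SEEK_FILTER_MATCH);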
if (expect_filter_check) { - ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xz"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xz1", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } } @@ -6229,7 +6031,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } } @@ -6243,14 +6045,14 @@ TEST_F(DBTest2, ChangePrefixExtractor) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xx0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xx1", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } } @@ -6268,21 +6070,21 @@ TEST_F(DBTest2, ChangePrefixExtractor) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xg"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xx1", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xz"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xz1", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } ASSERT_OK(iterator->status()); @@ -6294,14 +6096,14 @@ TEST_F(DBTest2, ChangePrefixExtractor) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xx0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xx1", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } ASSERT_OK(iterator->status()); @@ -6315,7 +6117,7 @@ TEST_F(DBTest2, ChangePrefixExtractor) { ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { - ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED)); + EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } ASSERT_OK(iterator->status()); } @@ -6364,7 +6166,6 @@ TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) { ASSERT_EQ("ok", Get("b1")); } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, AutoPrefixMode1) { do { // create a DB with block prefix index @@ -6390,13 +6191,19 @@ TEST_F(DBTest2, AutoPrefixMode1) { ro.total_order_seek = false; ro.auto_prefix_mode = true; - const auto 
stat = BLOOM_FILTER_PREFIX_CHECKED; + const auto hit_stat = options.num_levels == 1 + ? LAST_LEVEL_SEEK_FILTER_MATCH + : NON_LAST_LEVEL_SEEK_FILTER_MATCH; + const auto miss_stat = options.num_levels == 1 + ? LAST_LEVEL_SEEK_FILTERED + : NON_LAST_LEVEL_SEEK_FILTERED; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } @@ -6408,7 +6215,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } @@ -6418,7 +6226,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } @@ -6427,7 +6236,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } @@ -6436,7 +6246,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } @@ -6447,25 +6258,29 @@ TEST_F(DBTest2, AutoPrefixMode1) { ub = "b9"; iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); ub = "z"; iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "c"; iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ub = "b9"; iterator->SeekForPrev("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "zz"; iterator->SeekToLast(); @@ -6497,26 +6312,30 @@ TEST_F(DBTest2, AutoPrefixMode1) { ub = "b1"; iterator->Seek("b9"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, 
stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); ub = "b1"; iterator->Seek("z"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b1"; iterator->Seek("c"); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b"; iterator->Seek("c9"); ASSERT_FALSE(iterator->Valid()); // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // is "correctly" implemented. - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "a"; iterator->Seek("b9"); @@ -6524,7 +6343,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { // is "correctly" implemented. ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b"; iterator->Seek("a"); @@ -6532,7 +6352,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper // comparing before seek key prevents a real bug from surfacing. - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b1"; iterator->SeekForPrev("b9"); @@ -6540,7 +6361,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // is "correctly" implemented. ASSERT_EQ("x1", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "a"; iterator->SeekToLast(); @@ -6582,7 +6404,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek(Slice(a_end_stuff, 2)); ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); // test, cannot be validly optimized with auto_prefix_mode @@ -6592,7 +6415,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { iterator->Seek(Slice(a_end_stuff, 2)); // !!! BUG !!! See "BUG" section of auto_prefix_mode. 
ASSERT_FALSE(iterator->Valid()); - EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); // To prove that is the wrong result, now use total order seek @@ -6603,7 +6427,8 @@ TEST_F(DBTest2, AutoPrefixMode1) { iterator->Seek(Slice(a_end_stuff, 2)); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("b", iterator->key().ToString()); - EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); + EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } } while (ChangeOptions(kSkipPlainTable)); @@ -7063,6 +6888,7 @@ TEST_F(DBTest2, LastLevelTemperatureUniversal) { TEST_F(DBTest2, LastLevelStatistics) { Options options = CurrentOptions(); options.bottommost_temperature = Temperature::kWarm; + options.default_temperature = Temperature::kHot; options.level0_file_num_compaction_trigger = 2; options.level_compaction_dynamic_level_bytes = true; options.statistics = CreateDBStatistics(); @@ -7076,6 +6902,10 @@ TEST_F(DBTest2, LastLevelStatistics) { ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0); ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0); @@ -7086,6 +6916,10 @@ TEST_F(DBTest2, LastLevelStatistics) { ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("bar", Get("bar")); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), @@ -7106,6 +6940,30 @@ TEST_F(DBTest2, LastLevelStatistics) { pre_bytes); ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), pre_count); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); + + // Not a realistic setting to make last level kWarm and default temp kCold. + // This is just for testing default temp can be reset on reopen while the + // last level temp is consistent across DB reopen because those file's temp + // are persisted in manifest. 
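As background for the reset that follows (a sketch, not part of the patch): the two temperature options this test combines are configured as below, and the per-temperature read tickers are what the assertions above compare against the LAST_LEVEL/NON_LAST_LEVEL counters.

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

rocksdb::Options MakeTemperatureStatsOptions() {
  rocksdb::Options opts;
  opts.create_if_missing = true;
  opts.statistics = rocksdb::CreateDBStatistics();
  // Files on the last level are treated as kWarm, so their reads show up
  // under WARM_FILE_READ_BYTES / WARM_FILE_READ_COUNT.
  opts.bottommost_temperature = rocksdb::Temperature::kWarm;
  // Reads of the remaining files are attributed to the default
  // temperature, kHot here, hence the HOT_FILE_READ_* checks above.
  opts.default_temperature = rocksdb::Temperature::kHot;
  return opts;
}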
+ options.default_temperature = Temperature::kCold; + ASSERT_OK(options.statistics->Reset()); + Reopen(options); + ASSERT_EQ("bar", Get("bar")); + + ASSERT_EQ(0, options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); + + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), + options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); + ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), + options.statistics->getTickerCount(COLD_FILE_READ_COUNT)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), @@ -7242,7 +7100,6 @@ TEST_F(DBTest2, FileTemperatureManifestFixup) { std::vector column_families; for (size_t i = 0; i < handles_.size(); ++i) { ColumnFamilyDescriptor cfdescriptor; - // GetDescriptor is not implemented for ROCKSDB_LITE handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError(); column_families.push_back(cfdescriptor); } @@ -7282,7 +7139,6 @@ TEST_F(DBTest2, FileTemperatureManifestFixup) { Close(); } -#endif // ROCKSDB_LITE // WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) { @@ -7338,7 +7194,6 @@ TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { ReopenWithColumnFamilies({"default", "test1", "test2"}, options); } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, SortL0FilesByEpochNumber) { Options options = CurrentOptions(); options.num_levels = 1; @@ -7548,7 +7403,6 @@ TEST_F(DBTest2, RecoverEpochNumber) { } } -#endif // ROCKSDB_LITE TEST_F(DBTest2, RenameDirectory) { Options options = CurrentOptions(); @@ -7626,9 +7480,7 @@ TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) { } ASSERT_OK(dbfull()->TEST_WaitForCompact()); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel(0)); -#endif // ROCKSDB_LITE // Reopen (with verification) ASSERT_TRUE(options.verify_sst_unique_id_in_manifest); @@ -7683,9 +7535,7 @@ TEST_F(DBTest2, SstUniqueIdVerify) { } ASSERT_OK(dbfull()->TEST_WaitForCompact()); -#ifndef ROCKSDB_LITE ASSERT_EQ("0,1", FilesPerLevel(0)); -#endif // ROCKSDB_LITE // Reopen with verification should fail options.verify_sst_unique_id_in_manifest = true; @@ -7750,6 +7600,7 @@ TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) { ASSERT_EQ(std::to_string(cnt), it->key()); ASSERT_EQ(expected_v, it->value()); } + EXPECT_OK(it->status()); ASSERT_EQ(expected_count, cnt); }; @@ -7808,7 +7659,6 @@ TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) { } } -#ifndef ROCKSDB_LITE TEST_F(DBTest2, GetLatestSeqAndTsForKey) { Destroy(last_options_); @@ -7865,7 +7715,41 @@ TEST_F(DBTest2, GetLatestSeqAndTsForKey) { // Verify that no read to SST files. ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0)); } -#endif // ROCKSDB_LITE + +#if defined(ZSTD_ADVANCED) +TEST_F(DBTest2, ZSTDChecksum) { + // Verify that corruption during decompression is caught. 
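Two CompressionOptions fields drive this new test; configured in isolation they look roughly like this (a sketch, assuming a build with ZSTD_ADVANCED defined):

#include "rocksdb/db.h"

rocksdb::Options MakeZstdChecksumOptions() {
  rocksdb::Options opts;
  opts.create_if_missing = true;
  opts.compression = rocksdb::kZSTD;
  // Accept compressed blocks up to 1024 bytes per 1 KB of input, i.e. keep
  // blocks in compressed form even when compression saves little, so the
  // checksum path is actually exercised on random data.
  opts.compression_opts.max_compressed_bytes_per_kb = 1024;
  // Ask ZSTD to append a frame checksum that is verified on decompression.
  opts.compression_opts.checksum = true;
  return opts;
}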
+ Options options = CurrentOptions(); + options.create_if_missing = true; + options.compression = kZSTD; + options.compression_opts.max_compressed_bytes_per_kb = 1024; + options.compression_opts.checksum = true; + DestroyAndReopen(options); + Random rnd(33); + ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10))); + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData", + [&](void* arg) { + std::string* output = static_cast(arg); + // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#zstandard-frames + // Checksum is the last 4 bytes, corrupting that part in unit test is + // more controllable. + output->data()[output->size() - 1]++; + }); + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_OK(Flush()); + PinnableSlice val; + Status s = Get(Key(0), &val); + ASSERT_TRUE(s.IsCorruption()); + + // Corruption caught during flush. + options.paranoid_file_checks = true; + DestroyAndReopen(options); + ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10))); + s = Flush(); + ASSERT_TRUE(s.IsCorruption()); +} +#endif } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 149768efb6c9..3fb45767630d 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -16,17 +16,10 @@ #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/env_encryption.h" -#include "util/stderr_logger.h" -#ifdef USE_AWS -#include -#include "rocksdb/cloud/cloud_file_system_impl.h" -#include "rocksdb/cloud/cloud_storage_provider.h" -#endif #include "rocksdb/unique_id.h" #include "rocksdb/utilities/object_registry.h" #include "table/format.h" #include "util/random.h" -#include "logging/logging.h" namespace ROCKSDB_NAMESPACE { @@ -68,17 +61,8 @@ SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep) non_writable_count_ = 0; table_write_callback_ = nullptr; } -#ifdef USE_AWS -namespace { -void shutdownAws() { Aws::ShutdownAPI(Aws::SDKOptions()); } -} // namespace -#endif DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) - : option_env_(kDefaultEnv), - mem_env_(nullptr), - encrypted_env_(nullptr), - option_config_(kDefault), - s3_env_(nullptr) { + : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) { Env* base_env = Env::Default(); ConfigOptions config_options; EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_)); @@ -86,7 +70,6 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) if (getenv("MEM_ENV")) { mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock()); } -#ifndef ROCKSDB_LITE if (getenv("ENCRYPTED_ENV")) { std::shared_ptr provider; std::string provider_id = getenv("ENCRYPTED_ENV"); @@ -98,26 +81,8 @@ DBTestBase::DBTestBase(const std::string path, bool env_do_fsync) &provider)); encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider); } -#endif // !ROCKSDB_LITE env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_ : (mem_env_ ? 
mem_env_ : base_env)); -#ifndef ROCKSDB_LITE -#ifdef USE_AWS - // Randomize the test path so that multiple tests can run in parallel - srand(static_cast(time(nullptr))); - std::string mypath = path + "_" + std::to_string(rand()); - - env_->NewLogger(test::TmpDir(env_) + "/rocksdb-cloud.log", &info_log_); - info_log_->SetInfoLogLevel(InfoLogLevel::DEBUG_LEVEL); - - static std::once_flag aws_init; - std::call_once(aws_init, []() { - Aws::InitAPI(Aws::SDKOptions()); - std::atexit(shutdownAws); - }); - s3_env_ = CreateNewAwsEnv(mypath, env_); -#endif -#endif // !ROCKSDB_LITE env_->SetBackgroundThreads(1, Env::LOW); env_->SetBackgroundThreads(1, Env::HIGH); env_->skip_fsync_ = !env_do_fsync; @@ -154,34 +119,9 @@ DBTestBase::~DBTestBase() { EXPECT_OK(DestroyDB(dbname_, options)); } delete env_; - -#ifndef ROCKSDB_LITE -#ifdef USE_AWS - auto* cfs = static_cast(s3_env_->GetFileSystem().get()); - cfs->GetStorageProvider()->EmptyBucket(cfs->GetSrcBucketName(), - cfs->GetSrcObjectPath()); -#endif // USE_AWS -#endif // !ROCKSDB_LITE - delete s3_env_; } bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { -#ifdef ROCKSDB_LITE - // These options are not supported in ROCKSDB_LITE - if (option_config == kHashSkipList || - option_config == kPlainTableFirstBytePrefix || - option_config == kPlainTableCappedPrefix || - option_config == kPlainTableCappedPrefixNonMmap || - option_config == kPlainTableAllBytesPrefix || - option_config == kVectorRep || option_config == kHashLinkList || - option_config == kUniversalCompaction || - option_config == kUniversalCompactionMultiLevel || - option_config == kUniversalSubcompactions || - option_config == kFIFOCompaction || - option_config == kConcurrentSkipList) { - return true; - } -#endif if ((skip_mask & kSkipUniversalCompaction) && (option_config == kUniversalCompaction || @@ -217,45 +157,24 @@ bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { return false; } -bool DBTestBase::ShouldSkipAwsOptions(int option_config) { - // AWS Env doesn't work with DirectIO - return option_config == kDirectIO; -} - // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. 
bool DBTestBase::ChangeOptions(int skip_mask) { - while (true) { - for (option_config_++; option_config_ < kEnd; option_config_++) { - if (ShouldSkipOptions(option_config_, skip_mask)) { - continue; - } - if (option_env_ == kAwsEnv && ShouldSkipAwsOptions(option_config_)) { - continue; - } - break; - } - if (option_config_ >= kEnd) { -#ifndef USE_AWS - // If not built for AWS, skip it - if (option_env_ + 1 == kAwsEnv) { - option_env_++; - } -#endif - if (option_env_ + 1 >= kEndEnv) { - Destroy(last_options_); - return false; - } else { - option_env_++; - option_config_ = kDefault; - continue; - } - } else { - auto options = CurrentOptions(); - options.create_if_missing = true; - DestroyAndReopen(options); - return true; + for (option_config_++; option_config_ < kEnd; option_config_++) { + if (ShouldSkipOptions(option_config_, skip_mask)) { + continue; } + break; + } + + if (option_config_ >= kEnd) { + Destroy(last_options_); + return false; + } else { + auto options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + return true; } } @@ -340,7 +259,7 @@ bool DBTestBase::ChangeFilterOptions() { auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + EXPECT_OK(TryReopen(options)); return true; } @@ -351,34 +270,34 @@ bool DBTestBase::ChangeOptionsForFileIngestionTest() { Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + EXPECT_OK(TryReopen(options)); return true; } else if (option_config_ == kUniversalCompaction) { option_config_ = kUniversalCompactionMultiLevel; Destroy(last_options_); auto options = CurrentOptions(); options.create_if_missing = true; - TryReopen(options); + EXPECT_OK(TryReopen(options)); return true; } else if (option_config_ == kUniversalCompactionMultiLevel) { option_config_ = kLevelSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + EXPECT_OK(TryReopen(options)); return true; } else if (option_config_ == kLevelSubcompactions) { option_config_ = kUniversalSubcompactions; Destroy(last_options_); auto options = CurrentOptions(); assert(options.max_subcompactions > 1); - TryReopen(options); + EXPECT_OK(TryReopen(options)); return true; } else if (option_config_ == kUniversalSubcompactions) { option_config_ = kDirectIO; Destroy(last_options_); auto options = CurrentOptions(); - TryReopen(options); + EXPECT_OK(TryReopen(options)); return true; } else { return false; @@ -405,6 +324,12 @@ Options DBTestBase::GetDefaultOptions() const { options.max_open_files = 5000; options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords; options.compaction_pri = CompactionPri::kByCompensatedSize; + // The original default value for this option is false, + // and many unit tests assume this value. It also makes + // it easier to create desired LSM shape in unit tests. + // Unit tests for this option sets level_compaction_dynamic_level_bytes=true + // explicitly. 
+ options.level_compaction_dynamic_level_bytes = false; options.env = env_; if (!env_->skip_fsync_) { options.track_and_verify_wals_in_manifest = true; @@ -440,7 +365,6 @@ Options DBTestBase::GetOptions( bool can_allow_mmap = IsMemoryMappedAccessSupported(); switch (option_config) { -#ifndef ROCKSDB_LITE case kHashSkipList: options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); @@ -494,7 +418,6 @@ Options DBTestBase::GetOptions( SetupSyncPointsToMockDirectIO(); break; } -#endif // ROCKSDB_LITE case kMergePut: options.merge_operator = MergeOperators::CreatePutOperator(); break; @@ -543,10 +466,6 @@ Options DBTestBase::GetOptions( options.compaction_style = kCompactionStyleUniversal; options.num_levels = 8; break; - case kCompressedBlockCache: - options.allow_mmap_writes = can_allow_mmap; - table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024); - break; case kInfiniteMaxOpenFiles: options.max_open_files = -1; break; @@ -648,25 +567,6 @@ Options DBTestBase::GetOptions( break; } - switch (option_env_) { - case kDefaultEnv: { - options.env = env_; - break; - } -#ifdef USE_AWS - case kAwsEnv: { - assert(s3_env_); - options.env = s3_env_; - options.recycle_log_file_num = 0; // do not reuse log files - options.allow_mmap_reads = false; // mmap is incompatible with S3 - break; - } -#endif /* USE_AWS */ - - default: - break; - } - if (options_override.filter_policy) { table_options.filter_policy = options_override.filter_policy; table_options.partition_filters = options_override.partition_filters; @@ -675,41 +575,14 @@ Options DBTestBase::GetOptions( if (set_block_based_table_factory) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); } + options.level_compaction_dynamic_level_bytes = + options_override.level_compaction_dynamic_level_bytes; + options.env = env_; options.create_if_missing = true; options.fail_if_options_file_error = true; return options; } -#ifndef ROCKSDB_LITE -#ifdef USE_AWS -Env* DBTestBase::CreateNewAwsEnv(const std::string& prefix, Env* parent) { - if (!prefix.empty()) { - fprintf(stderr, "Creating new cloud env with prefix %s\n", prefix.c_str()); - } - - // get credentials - CloudFileSystemOptions coptions; - CloudFileSystem* cfs = nullptr; - std::string region; - coptions.TEST_Initialize("dbtest.", prefix, region); - // Delete cloud files immediately - coptions.cloud_file_deletion_delay = std::nullopt; - Status st = CloudFileSystem::NewAwsFileSystem(parent->GetFileSystem(), - coptions, info_log_, &cfs); - auto* cimpl = dynamic_cast(cfs); - assert(cimpl); - cimpl->TEST_DisableCloudManifest(); - ROCKS_LOG_INFO(info_log_, "Created new aws env with path %s", prefix.c_str()); - if (!st.ok()) { - Log(InfoLogLevel::DEBUG_LEVEL, info_log_, "%s", st.ToString().c_str()); - } - assert(st.ok() && cfs); - std::shared_ptr cloud_fs(cfs); - return new CompositeEnvWrapper(parent, std::move(cloud_fs)); -} -#endif // USE_AWS -#endif // ROCKSDB_LITE - void DBTestBase::CreateColumnFamilies(const std::vector& cfs, const Options& options) { ColumnFamilyOptions cf_opts(options); @@ -817,34 +690,16 @@ void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) { if (delete_cf_paths) { for (size_t i = 0; i < handles_.size(); ++i) { ColumnFamilyDescriptor cfdescriptor; - // GetDescriptor is not implemented for ROCKSDB_LITE handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError(); column_families.push_back(cfdescriptor); } } Close(); ASSERT_OK(DestroyDB(dbname_, options, 
column_families)); -#ifdef USE_AWS - if (s3_env_) { - auto* cfs = static_cast(s3_env_->GetFileSystem().get()); - auto st = cfs->GetStorageProvider()->EmptyBucket(cfs->GetSrcBucketName(), - dbname_); - ASSERT_TRUE(st.ok() || st.IsNotFound()); - for (int r = 0; r < 10; ++r) { - // The existance is not propagated atomically, so wait until - // IDENTITY file no longer exists. - if (cfs->FileExists(dbname_ + "/IDENTITY", IOOptions(), nullptr /*dbg*/) - .ok()) { - std::this_thread::sleep_for(std::chrono::milliseconds(10 * (r + 1))); - continue; - } - break; - } - } -#endif } Status DBTestBase::ReadOnlyReopen(const Options& options) { + Close(); MaybeInstallTimeElapseOnlySleep(options); return DB::OpenForReadOnly(options, dbname_, &db_); } @@ -1088,6 +943,7 @@ std::string DBTestBase::Contents(int cf) { EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]); matched++; } + EXPECT_OK(iter->status()); EXPECT_EQ(matched, forward.size()); delete iter; @@ -1176,7 +1032,6 @@ std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) { return result; } -#ifndef ROCKSDB_LITE int DBTestBase::NumSortedRuns(int cf) { ColumnFamilyMetaData cf_meta; if (cf == 0) { @@ -1229,6 +1084,24 @@ size_t DBTestBase::TotalLiveFiles(int cf) { return num_files; } +size_t DBTestBase::TotalLiveFilesAtPath(int cf, const std::string& path) { + ColumnFamilyMetaData cf_meta; + if (cf == 0) { + db_->GetColumnFamilyMetaData(&cf_meta); + } else { + db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta); + } + size_t num_files = 0; + for (auto& level : cf_meta.levels) { + for (auto& f : level.files) { + if (f.directory == path) { + num_files++; + } + } + } + return num_files; +} + size_t DBTestBase::CountLiveFiles() { std::vector metadata; db_->GetLiveFilesMetaData(&metadata); @@ -1295,7 +1168,6 @@ std::string DBTestBase::FilesPerLevel(int cf) { return result; } -#endif // !ROCKSDB_LITE std::vector DBTestBase::GetBlobFileNumbers() { VersionSet* const versions = dbfull()->GetVersionSet(); @@ -1413,7 +1285,6 @@ void DBTestBase::MoveFilesToLevel(int level, int cf) { } } -#ifndef ROCKSDB_LITE void DBTestBase::DumpFileCounts(const char* label) { fprintf(stderr, "---\n%s:\n", label); fprintf(stderr, "maxoverlap: %" PRIu64 "\n", @@ -1425,7 +1296,6 @@ void DBTestBase::DumpFileCounts(const char* label) { } } } -#endif // !ROCKSDB_LITE std::string DBTestBase::DumpSSTableList() { std::string property; @@ -1496,6 +1366,7 @@ std::string DBTestBase::IterStatus(Iterator* iter) { if (iter->Valid()) { result = iter->key().ToString() + "->" + iter->value().ToString(); } else { + EXPECT_OK(iter->status()); result = "(invalid)"; } return result; @@ -1714,6 +1585,7 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, iter_cnt++; total_reads++; } + ASSERT_OK(iter->status()); ASSERT_EQ(data_iter, true_data.end()) << iter_cnt << " / " << true_data.size(); delete iter; @@ -1737,6 +1609,7 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, iter_cnt++; total_reads++; } + ASSERT_OK(iter->status()); ASSERT_EQ(data_rev, true_data.rend()) << iter_cnt << " / " << true_data.size(); @@ -1751,7 +1624,6 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, } if (tailing_iter) { -#ifndef ROCKSDB_LITE // Tailing iterator int iter_cnt = 0; ReadOptions ro; @@ -1780,7 +1652,6 @@ void DBTestBase::VerifyDBFromMap(std::map true_data, } delete iter; -#endif // ROCKSDB_LITE } if (total_reads_res) { @@ -1808,7 +1679,6 @@ void DBTestBase::VerifyDBInternal( iter->~InternalIterator(); } -#ifndef ROCKSDB_LITE uint64_t 
DBTestBase::GetNumberOfSstFilesForColumnFamily( DB* db, std::string column_family_name) { @@ -1829,7 +1699,6 @@ uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) { &prop)); return static_cast(std::atoi(prop.c_str())); } -#endif // ROCKSDB_LITE void VerifySstUniqueIds(const TablePropertiesCollection& props) { ASSERT_FALSE(props.empty()); // suspicious test if empty @@ -1852,12 +1721,12 @@ TargetCacheChargeTrackingCache::TargetCacheChargeTrackingCache( cache_charge_increments_sum_(0) {} template -Status TargetCacheChargeTrackingCache::Insert(const Slice& key, - ObjectPtr value, - const CacheItemHelper* helper, - size_t charge, Handle** handle, - Priority priority) { - Status s = target_->Insert(key, value, helper, charge, handle, priority); +Status TargetCacheChargeTrackingCache::Insert( + const Slice& key, ObjectPtr value, const CacheItemHelper* helper, + size_t charge, Handle** handle, Priority priority, const Slice& compressed, + CompressionType type) { + Status s = target_->Insert(key, value, helper, charge, handle, priority, + compressed, type); if (helper == kCrmHelper) { if (last_peak_tracked_) { cache_charge_peak_ = 0; diff --git a/db/db_test_util.h b/db/db_test_util.h index 06212868a8ac..023784f61526 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -114,6 +114,12 @@ struct OptionsOverride { // Used as a bit mask of individual enums in which to skip an XF test point int skip_policy = 0; + + // The default value for this option is changed from false to true. + // Keeping the default to false for unit tests as old unit tests assume + // this behavior. Tests for level_compaction_dynamic_level_bytes + // will set the option to true explicitly. + bool level_compaction_dynamic_level_bytes = false; }; } // namespace anon @@ -227,6 +233,7 @@ class SpecialEnv : public EnvWrapper { size_t GetUniqueId(char* id, size_t max_size) const override { return base_->GetUniqueId(id, max_size); } + uint64_t GetFileSize() final { return base_->GetFileSize(); } }; class ManifestFile : public WritableFile { public: @@ -339,6 +346,7 @@ class SpecialEnv : public EnvWrapper { Status Allocate(uint64_t offset, uint64_t len) override { return base_->Allocate(offset, len); } + uint64_t GetFileSize() final { return base_->GetFileSize(); } private: SpecialEnv* env_; @@ -695,7 +703,6 @@ class SpecialEnv : public EnvWrapper { bool no_slowdown_; }; -#ifndef ROCKSDB_LITE class FileTemperatureTestFS : public FileSystemWrapper { public: explicit FileTemperatureTestFS(const std::shared_ptr& fs) @@ -870,7 +877,6 @@ class FlushCounterListener : public EventListener { ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason); } }; -#endif // A test merge operator mimics put but also fails if one of merge operands is // "corrupted", "corrupted_try_merge", or "corrupted_must_merge". @@ -911,82 +917,6 @@ class TestPutOperator : public MergeOperator { virtual const char* Name() const override { return "TestPutOperator"; } }; -// A wrapper around Cache that can easily be extended with instrumentation, -// etc. 
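The test-local CacheWrapper below is deleted; wrappers now override the wider Insert() signature shown just above for TargetCacheChargeTrackingCache, which adds the compressed payload and its CompressionType. A minimal forwarding override might look like the following sketch; the rocksdb/advanced_cache.h include and the public CacheWrapper base are assumptions, since this diff does not show where the wrapper now lives.

#include <atomic>

#include "rocksdb/advanced_cache.h"  // assumed location of CacheWrapper

class CountingCache : public rocksdb::CacheWrapper {
 public:
  using rocksdb::CacheWrapper::CacheWrapper;
  const char* Name() const override { return "CountingCache"; }

  rocksdb::Status Insert(const rocksdb::Slice& key, ObjectPtr value,
                         const CacheItemHelper* helper, size_t charge,
                         Handle** handle = nullptr,
                         Priority priority = Priority::LOW,
                         const rocksdb::Slice& compressed = rocksdb::Slice(),
                         rocksdb::CompressionType type =
                             rocksdb::kNoCompression) override {
    // Count inserts, then forward everything (including the compressed
    // payload and its type) to the wrapped cache.
    num_inserts_.fetch_add(1, std::memory_order_relaxed);
    return target_->Insert(key, value, helper, charge, handle, priority,
                           compressed, type);
  }

 private:
  std::atomic<size_t> num_inserts_{0};
};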
-class CacheWrapper : public Cache { - public: - explicit CacheWrapper(std::shared_ptr target) - : target_(std::move(target)) {} - - const char* Name() const override { return target_->Name(); } - - Status Insert(const Slice& key, ObjectPtr value, - const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) override { - return target_->Insert(key, value, helper, charge, handle, priority); - } - - Handle* Lookup(const Slice& key, const CacheItemHelper* helper, - CreateContext* create_context, - Priority priority = Priority::LOW, bool wait = true, - Statistics* stats = nullptr) override { - return target_->Lookup(key, helper, create_context, priority, wait, stats); - } - - bool Ref(Handle* handle) override { return target_->Ref(handle); } - - using Cache::Release; - bool Release(Handle* handle, bool erase_if_last_ref = false) override { - return target_->Release(handle, erase_if_last_ref); - } - - ObjectPtr Value(Handle* handle) override { return target_->Value(handle); } - - void Erase(const Slice& key) override { target_->Erase(key); } - uint64_t NewId() override { return target_->NewId(); } - - void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } - - void SetStrictCapacityLimit(bool strict_capacity_limit) override { - target_->SetStrictCapacityLimit(strict_capacity_limit); - } - - bool HasStrictCapacityLimit() const override { - return target_->HasStrictCapacityLimit(); - } - - size_t GetCapacity() const override { return target_->GetCapacity(); } - - size_t GetUsage() const override { return target_->GetUsage(); } - - size_t GetUsage(Handle* handle) const override { - return target_->GetUsage(handle); - } - - size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } - - size_t GetCharge(Handle* handle) const override { - return target_->GetCharge(handle); - } - - const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { - return target_->GetCacheItemHelper(handle); - } - - void ApplyToAllEntries( - const std::function& callback, - const ApplyToAllEntriesOptions& opts) override { - target_->ApplyToAllEntries(callback, opts); - } - - void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } - - protected: - std::shared_ptr target_; -}; - /* * A cache wrapper that tracks certain CacheEntryRole's cache charge, its * peaks and increments @@ -1004,10 +934,13 @@ class TargetCacheChargeTrackingCache : public CacheWrapper { public: explicit TargetCacheChargeTrackingCache(std::shared_ptr target); + const char* Name() const override { return "TargetCacheChargeTrackingCache"; } + Status Insert(const Slice& key, ObjectPtr value, const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) override; + Handle** handle = nullptr, Priority priority = Priority::LOW, + const Slice& compressed = Slice(), + CompressionType type = kNoCompression) override; using Cache::Release; bool Release(Handle* handle, bool erase_if_last_ref = false) override; @@ -1056,16 +989,15 @@ class DBTestBase : public testing::Test { kHashSkipList = 18, kUniversalCompaction = 19, kUniversalCompactionMultiLevel = 20, - kCompressedBlockCache = 21, - kInfiniteMaxOpenFiles = 22, - kCRC32cChecksum = 23, - kFIFOCompaction = 24, - kOptimizeFiltersForHits = 25, - kRowCache = 26, - kRecycleLogFiles = 27, - kConcurrentSkipList = 28, - kPipelinedWrite = 29, - kConcurrentWALWrites = 30, + kInfiniteMaxOpenFiles = 21, + kCRC32cChecksum = 22, + kFIFOCompaction = 23, + 
kOptimizeFiltersForHits = 24, + kRowCache = 25, + kRecycleLogFiles = 26, + kConcurrentSkipList = 27, + kPipelinedWrite = 28, + kConcurrentWALWrites = 29, kDirectIO, kLevelSubcompactions, kBlockBasedTableWithIndexRestartInterval, @@ -1079,14 +1011,6 @@ class DBTestBase : public testing::Test { kEnd, }; - // The types of envs that we want to test with - enum OptionConfigEnv { - kDefaultEnv = 0, // posix env - kAwsEnv = 1, // aws env - kEndEnv = 2, - }; - int option_env_; - public: std::string dbname_; std::string alternative_wal_dir_; @@ -1101,8 +1025,6 @@ class DBTestBase : public testing::Test { int option_config_; Options last_options_; - Env* s3_env_; - // Skip some options, as they may not be applicable to a specific test. // To add more skip constants, use values 4, 8, 16, etc. enum OptionSkip { @@ -1117,11 +1039,6 @@ class DBTestBase : public testing::Test { kSkipMmapReads = 256, }; -#ifdef USE_AWS - Env* CreateNewAwsEnv(const std::string& pathPrefix, Env* env); -#endif - std::shared_ptr info_log_; - const int kRangeDelSkipConfigs = // Plain tables do not support range deletions. kSkipPlainTable | @@ -1143,7 +1060,6 @@ class DBTestBase : public testing::Test { } static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip); - static bool ShouldSkipAwsOptions(int option_config); // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. @@ -1277,7 +1193,6 @@ class DBTestBase : public testing::Test { const std::vector& cfs, const Options& options); -#ifndef ROCKSDB_LITE int NumSortedRuns(int cf = 0); uint64_t TotalSize(int cf = 0); @@ -1286,6 +1201,8 @@ class DBTestBase : public testing::Test { size_t TotalLiveFiles(int cf = 0); + size_t TotalLiveFilesAtPath(int cf, const std::string& path); + size_t CountLiveFiles(); int NumTableFilesAtLevel(int level, int cf = 0); @@ -1293,7 +1210,6 @@ class DBTestBase : public testing::Test { double CompressionRatioAtLevel(int level, int cf = 0); int TotalTableFiles(int cf = 0, int levels = -1); -#endif // ROCKSDB_LITE std::vector GetBlobFileNumbers(); @@ -1329,9 +1245,7 @@ class DBTestBase : public testing::Test { void MoveFilesToLevel(int level, int cf = 0); -#ifndef ROCKSDB_LITE void DumpFileCounts(const char* label); -#endif // ROCKSDB_LITE std::string DumpSSTableList(); @@ -1400,12 +1314,10 @@ class DBTestBase : public testing::Test { void VerifyDBInternal( std::vector> true_data); -#ifndef ROCKSDB_LITE uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, std::string column_family_name); uint64_t GetSstSizeHelper(Temperature temperature); -#endif // ROCKSDB_LITE uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { return options.statistics->getTickerCount(ticker_type); @@ -1415,11 +1327,49 @@ class DBTestBase : public testing::Test { Tickers ticker_type) { return options.statistics->getAndResetTickerCount(ticker_type); } + // Short name for TestGetAndResetTickerCount + uint64_t PopTicker(const Options& options, Tickers ticker_type) { + return options.statistics->getAndResetTickerCount(ticker_type); + } // Note: reverting this setting within the same test run is not yet // supported void SetTimeElapseOnlySleepOnReopen(DBOptions* options); + void ResetTableProperties(TableProperties* tp) { + tp->data_size = 0; + tp->index_size = 0; + tp->filter_size = 0; + tp->raw_key_size = 0; + tp->raw_value_size = 0; + tp->num_data_blocks = 0; + tp->num_entries = 0; + tp->num_deletions = 0; + tp->num_merge_operands = 0; + tp->num_range_deletions = 0; + } 
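ResetTableProperties() zeroes the counters that ParseTablePropertiesString(), defined next, re-populates by sscanf-ing the human-readable property string. A hypothetical call site inside a DBTestBase-derived test might look like this (the test name is made up; the string comes from the standard aggregated-table-properties DB property):

TEST_F(DBTest2, AggregatedTablePropertiesRoundTrip) {  // hypothetical
  ASSERT_OK(Put("k", "v"));
  ASSERT_OK(Flush());
  std::string raw;
  ASSERT_TRUE(db_->GetProperty("rocksdb.aggregated-table-properties", &raw));
  TableProperties tp;
  ParseTablePropertiesString(raw, &tp);  // calls ResetTableProperties(&tp)
  ASSERT_EQ(1u, tp.num_entries);
}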
+ + void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { + double dummy_double; + std::replace(tp_string.begin(), tp_string.end(), ';', ' '); + std::replace(tp_string.begin(), tp_string.end(), '=', ' '); + ResetTableProperties(tp); + sscanf(tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 + " # deletions %" SCNu64 " # merge operands %" SCNu64 + " # range deletions %" SCNu64 " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " index block size (user-key? %" SCNu64 + ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions, + &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded, + &tp->index_size, &tp->filter_size); + } + private: // Prone to error on direct use void MaybeInstallTimeElapseOnlySleep(const DBOptions& options); diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index f53c36f229a1..5c10cdaacf43 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -7,10 +7,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include + #include "db/db_test_util.h" #include "port/stack_trace.h" -#if !defined(ROCKSDB_LITE) #include "rocksdb/utilities/table_properties_collectors.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testutil.h" #include "util/random.h" @@ -1470,6 +1472,7 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { keys_in_db.append(iter->key().ToString()); keys_in_db.push_back(','); } + EXPECT_OK(iter->status()); delete iter; std::string expected_keys; @@ -1851,31 +1854,31 @@ TEST_P(DBTestUniversalManualCompactionOutputPathId, compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(1, TotalLiveFiles(1)); - ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, TotalLiveFilesAtPath(1, options.db_paths[0].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path)); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ(1, TotalLiveFiles(1)); - ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, TotalLiveFilesAtPath(1, options.db_paths[0].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path)); MakeTables(1, "p", "q", 1); ASSERT_EQ(2, TotalLiveFiles(1)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[0].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path)); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ(2, TotalLiveFiles(1)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[0].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[1].path)); // Full 
compaction to DB path 0 compact_options.target_path_id = 0; compact_options.exclusive_manual_compaction = exclusive_manual_compaction_; ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)); ASSERT_EQ(1, TotalLiveFiles(1)); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); - ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, TotalLiveFilesAtPath(1, options.db_paths[0].path)); + ASSERT_EQ(0, TotalLiveFilesAtPath(1, options.db_paths[1].path)); // Fail when compacting to an invalid path ID compact_options.target_path_id = 2; @@ -2146,7 +2149,19 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) { options.ttl = 60 * 24 * 60 * 60; options.compaction_filter = nullptr; Reopen(options); - ASSERT_EQ(60 * 24 * 60 * 60, + ASSERT_EQ(30 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + options.periodic_compaction_seconds = 45 * 24 * 60 * 60; + options.ttl = 50 * 24 * 60 * 60; + Reopen(options); + ASSERT_EQ(45 * 24 * 60 * 60, + dbfull()->GetOptions().periodic_compaction_seconds); + + options.periodic_compaction_seconds = 0; + options.ttl = 50 * 24 * 60 * 60; + Reopen(options); + ASSERT_EQ(50 * 24 * 60 * 60, dbfull()->GetOptions().periodic_compaction_seconds); } @@ -2218,18 +2233,146 @@ TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) { ASSERT_EQ(4, output_level); } +TEST_F(DBTestUniversalCompaction2, PeriodicCompactionOffpeak) { + constexpr int kSecondsPerDay = 86400; + constexpr int kSecondsPerHour = 3600; + constexpr int kSecondsPerMinute = 60; + + Options opts = CurrentOptions(); + opts.compaction_style = kCompactionStyleUniversal; + opts.level0_file_num_compaction_trigger = 10; + opts.max_open_files = -1; + opts.compaction_options_universal.size_ratio = 10; + opts.compaction_options_universal.min_merge_width = 2; + opts.compaction_options_universal.max_size_amplification_percent = 200; + opts.periodic_compaction_seconds = 5 * kSecondsPerDay; // 5 days + opts.num_levels = 5; + + // Just to add some extra random days to current time + Random rnd(test::RandomSeed()); + int days = rnd.Uniform(100); + + int periodic_compactions = 0; + int start_level = -1; + int output_level = -1; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "UniversalCompactionPicker::PickPeriodicCompaction:Return", + [&](void* arg) { + Compaction* compaction = reinterpret_cast(arg); + ASSERT_TRUE(arg != nullptr); + ASSERT_TRUE(compaction->compaction_reason() == + CompactionReason::kPeriodicCompaction); + start_level = compaction->start_level(); + output_level = compaction->output_level(); + periodic_compactions++; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (std::string preset_offpeak_time : {"", "00:30-04:30", "10:30-02:30"}) { + SCOPED_TRACE("preset_offpeak_time=" + preset_offpeak_time); + for (std::string new_offpeak_time : {"", "23:30-02:30"}) { + SCOPED_TRACE("new_offpeak_time=" + new_offpeak_time); + std::vector> times_to_test = { + {0, 0}, {2, 30}, {3, 15}, {5, 10}, {13, 30}, {23, 30}}; + for (std::pair now : times_to_test) { + int now_hour = now.first; + int now_minute = now.second; + SCOPED_TRACE("now=" + std::to_string(now_hour) + ":" + + std::to_string(now_minute)); + + auto mock_clock = + std::make_shared(env_->GetSystemClock()); + auto mock_env = std::make_unique(env_, mock_clock); + opts.env = mock_env.get(); + mock_clock->SetCurrentTime(days * kSecondsPerDay + + now_hour * kSecondsPerHour + + now_minute * kSecondsPerMinute); + opts.daily_offpeak_time_utc = 
preset_offpeak_time; + Reopen(opts); + + ASSERT_OK(Put("foo", "bar1")); + ASSERT_OK(Flush()); + ASSERT_EQ(0, periodic_compactions); + + // Move clock forward by 8 hours. There should be no periodic + // compaction, yet. + mock_clock->MockSleepForSeconds(8 * kSecondsPerHour); + ASSERT_OK(Put("foo", "bar2")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(0, periodic_compactions); + + // Move clock forward by 4 days + mock_clock->MockSleepForSeconds(4 * kSecondsPerDay); + ASSERT_OK(Put("foo", "bar3")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + int64_t mock_now; + ASSERT_OK(mock_clock->GetCurrentTime(&mock_now)); + + auto offpeak_time_info = + dbfull()->GetVersionSet()->offpeak_time_option().GetOffpeakTimeInfo( + mock_now); + // At this point, the first file is 4 days and 8 hours old. + // If it's offpeak now and the file is expected to expire before the + // next offpeak starts + if (offpeak_time_info.is_now_offpeak && + offpeak_time_info.seconds_till_next_offpeak_start / + kSecondsPerHour > + 16) { + ASSERT_EQ(1, periodic_compactions); + } else { + ASSERT_EQ(0, periodic_compactions); + // Change offpeak option by SetDBOption() + if (preset_offpeak_time != new_offpeak_time) { + ASSERT_OK(dbfull()->SetDBOptions( + {{"daily_offpeak_time_utc", new_offpeak_time}})); + ASSERT_OK(Put("foo", "bar4")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + offpeak_time_info = dbfull() + ->GetVersionSet() + ->offpeak_time_option() + .GetOffpeakTimeInfo(mock_now); + // if the first file is now eligible to be picked up + if (offpeak_time_info.is_now_offpeak && + offpeak_time_info.seconds_till_next_offpeak_start / + kSecondsPerHour > + 16) { + ASSERT_OK(Put("foo", "bar5")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + ASSERT_EQ(1, periodic_compactions); + } + } + + // If the file has not been picked up yet (no offpeak set, or offpeak + // set but then unset before the file becomes eligible) + if (periodic_compactions == 0) { + // move clock forward by one more day + mock_clock->MockSleepForSeconds(1 * kSecondsPerDay); + ASSERT_OK(Put("foo", "bar6")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + } + } + ASSERT_EQ(1, periodic_compactions); + ASSERT_EQ(0, start_level); + ASSERT_EQ(4, output_level); + Destroy(opts); + + periodic_compactions = 0; + } + } + } +} + } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if !defined(ROCKSDB_LITE) ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - (void)argc; - (void)argv; - return 0; -#endif } diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 99d0b3c4c8d9..fbc01131e50d 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -8,11 +8,13 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
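The first change to db_wal_test.cc below makes SkipDeletedWALs wait until obsolete-file purging has finished before it inspects the deleted-WAL counters, using a sync-point dependency. In isolation the pattern is roughly the following (a sketch; the MyTest:AfterFlush marker is made up, and sync points are only active in builds that enable them):

#include "test_util/sync_point.h"

void WaitForPurgeAfterFlush() {
  using ROCKSDB_NAMESPACE::SyncPoint;
  // A dependency {A, B} makes whichever thread reaches marker B block
  // until some thread has passed marker A.
  SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::PurgeObsoleteFiles:End", "MyTest:AfterFlush"}});
  SyncPoint::GetInstance()->EnableProcessing();

  // ... trigger a flush that schedules obsolete WALs for purging ...

  TEST_SYNC_POINT("MyTest:AfterFlush");  // returns once the purge has run
  SyncPoint::GetInstance()->DisableProcessing();
}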
#include "db/db_test_util.h" +#include "db/db_with_timestamp_test_util.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/file_system.h" #include "test_util/sync_point.h" +#include "util/udt_util.h" #include "utilities/fault_injection_env.h" #include "utilities/fault_injection_fs.h" @@ -145,6 +147,11 @@ TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) { options.write_buffer_size = 128; Reopen(options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::PurgeObsoleteFiles:End", + "DBWALTestWithEnrichedEnv.SkipDeletedWALs:AfterFlush"}}); + SyncPoint::GetInstance()->EnableProcessing(); + WriteOptions writeOpt = WriteOptions(); for (int i = 0; i < 128 * 5; i++) { ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1")); @@ -153,6 +160,8 @@ TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) { fo.wait = true; ASSERT_OK(db_->Flush(fo)); + TEST_SYNC_POINT("DBWALTestWithEnrichedEnv.SkipDeletedWALs:AfterFlush"); + // some wals are deleted ASSERT_NE(0, enriched_env_->deleted_wal_cnt); // but not the first one @@ -163,6 +172,8 @@ TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) { Reopen(options); ASSERT_FALSE(enriched_env_->deleted_wal_reopened); ASSERT_FALSE(enriched_env_->gap_in_wals); + + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBWALTest, WAL) { @@ -301,6 +312,239 @@ TEST_F(DBWALTest, Recover) { } while (ChangeWalOptions()); } +class DBWALTestWithTimestamp + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + DBWALTestWithTimestamp() + : DBBasicTestWithTimestampBase("db_wal_test_with_timestamp") {} + + Status CreateAndReopenWithTs(const std::vector& cfs, + const Options& ts_options, bool persist_udt, + bool avoid_flush_during_recovery = false) { + Options default_options = CurrentOptions(); + default_options.allow_concurrent_memtable_write = + persist_udt ? true : false; + DestroyAndReopen(default_options); + CreateColumnFamilies(cfs, ts_options); + return ReopenColumnFamiliesWithTs(cfs, ts_options, persist_udt, + avoid_flush_during_recovery); + } + + Status ReopenColumnFamiliesWithTs(const std::vector& cfs, + Options ts_options, bool persist_udt, + bool avoid_flush_during_recovery = false) { + Options default_options = CurrentOptions(); + default_options.create_if_missing = false; + default_options.allow_concurrent_memtable_write = + persist_udt ? true : false; + default_options.avoid_flush_during_recovery = avoid_flush_during_recovery; + ts_options.create_if_missing = false; + + std::vector cf_options(cfs.size(), ts_options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + cf_options.insert(cf_options.begin(), default_options); + Close(); + return TryReopenWithColumnFamilies(cfs_plus_default, cf_options); + } + + Status Put(uint32_t cf, const Slice& key, const Slice& ts, + const Slice& value) { + WriteOptions write_opts; + return db_->Put(write_opts, handles_[cf], key, ts, value); + } + + void CheckGet(const ReadOptions& read_opts, uint32_t cf, const Slice& key, + const std::string& expected_value, + const std::string& expected_ts) { + std::string actual_value; + std::string actual_ts; + ASSERT_OK( + db_->Get(read_opts, handles_[cf], key, &actual_value, &actual_ts)); + ASSERT_EQ(expected_value, actual_value); + ASSERT_EQ(expected_ts, actual_ts); + } +}; + +TEST_P(DBWALTestWithTimestamp, RecoverAndNoFlush) { + // Set up the option that enables user defined timestmp size. 
+ std::string ts1; + PutFixed64(&ts1, 1); + Options ts_options; + ts_options.create_if_missing = true; + ts_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + // Test that user-defined timestamps are recovered from WAL regardless of + // the value of this flag because UDTs are saved in WAL nonetheless. + // We however need to explicitly disable flush during recovery by setting + // `avoid_flush_during_recovery=true` so that we can avoid timestamps getting + // stripped when the `persist_user_defined_timestamps` flag is false, so that + // all written timestamps are available for testing user-defined time travel + // read. + bool persist_udt = test::ShouldPersistUDT(GetParam()); + ts_options.persist_user_defined_timestamps = persist_udt; + bool avoid_flush_during_recovery = true; + + std::string full_history_ts_low; + ReadOptions read_opts; + do { + Slice ts_slice = ts1; + read_opts.timestamp = &ts_slice; + ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt, + avoid_flush_during_recovery)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_OK(Put(1, "foo", ts1, "v1")); + ASSERT_OK(Put(1, "baz", ts1, "v5")); + + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt, + avoid_flush_during_recovery)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + // Do a timestamped read with ts1 after second reopen. + CheckGet(read_opts, 1, "foo", "v1", ts1); + CheckGet(read_opts, 1, "baz", "v5", ts1); + + // Write more value versions for key "foo" and "bar" before and after second + // reopen. + std::string ts2; + PutFixed64(&ts2, 2); + ASSERT_OK(Put(1, "bar", ts2, "v2")); + ASSERT_OK(Put(1, "foo", ts2, "v3")); + + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt, + avoid_flush_during_recovery)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + std::string ts3; + PutFixed64(&ts3, 3); + ASSERT_OK(Put(1, "foo", ts3, "v4")); + + // All the key value pairs available for read: + // "foo" -> [(ts1, "v1"), (ts2, "v3"), (ts3, "v4")] + // "bar" -> [(ts2, "v2")] + // "baz" -> [(ts1, "v5")] + // Do a timestamped read with ts1 after third reopen. + // read_opts.timestamp is set to ts1 for below reads + CheckGet(read_opts, 1, "foo", "v1", ts1); + std::string value; + ASSERT_TRUE(db_->Get(read_opts, handles_[1], "bar", &value).IsNotFound()); + CheckGet(read_opts, 1, "baz", "v5", ts1); + + // Do a timestamped read with ts2 after third reopen. + ts_slice = ts2; + // read_opts.timestamp is set to ts2 for below reads. + CheckGet(read_opts, 1, "foo", "v3", ts2); + CheckGet(read_opts, 1, "bar", "v2", ts2); + CheckGet(read_opts, 1, "baz", "v5", ts1); + + // Do a timestamped read with ts3 after third reopen. + ts_slice = ts3; + // read_opts.timestamp is set to ts3 for below reads. + CheckGet(read_opts, 1, "foo", "v4", ts3); + CheckGet(read_opts, 1, "bar", "v2", ts2); + CheckGet(read_opts, 1, "baz", "v5", ts1); + ASSERT_OK(db_->GetFullHistoryTsLow(handles_[1], &full_history_ts_low)); + ASSERT_TRUE(full_history_ts_low.empty()); + } while (ChangeWalOptions()); +} + +TEST_P(DBWALTestWithTimestamp, RecoverAndFlush) { + // Set up the option that enables user defined timestamp size. 
+ std::string min_ts; + std::string write_ts; + PutFixed64(&min_ts, 0); + PutFixed64(&write_ts, 1); + Options ts_options; + ts_options.create_if_missing = true; + ts_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + bool persist_udt = test::ShouldPersistUDT(GetParam()); + ts_options.persist_user_defined_timestamps = persist_udt; + + std::string smallest_ukey_without_ts = "baz"; + std::string largest_ukey_without_ts = "foo"; + + ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, ts_options, persist_udt)); + // No flush, no sst files, because of no data. + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 0U); + ASSERT_OK(Put(1, largest_ukey_without_ts, write_ts, "v1")); + ASSERT_OK(Put(1, smallest_ukey_without_ts, write_ts, "v5")); + + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, ts_options, persist_udt)); + // Memtable recovered from WAL flushed because `avoid_flush_during_recovery` + // defaults to false, created one L0 file. + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1U); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(handles_[1], &level_to_files); + std::string full_history_ts_low; + ASSERT_OK(db_->GetFullHistoryTsLow(handles_[1], &full_history_ts_low)); + ASSERT_GT(level_to_files.size(), 1); + // L0 only has one SST file. + ASSERT_EQ(level_to_files[0].size(), 1); + auto meta = level_to_files[0][0]; + if (persist_udt) { + ASSERT_EQ(smallest_ukey_without_ts + write_ts, meta.smallest.user_key()); + ASSERT_EQ(largest_ukey_without_ts + write_ts, meta.largest.user_key()); + ASSERT_TRUE(full_history_ts_low.empty()); + } else { + ASSERT_EQ(smallest_ukey_without_ts + min_ts, meta.smallest.user_key()); + ASSERT_EQ(largest_ukey_without_ts + min_ts, meta.largest.user_key()); + std::string effective_cutoff; + Slice write_ts_slice = write_ts; + GetFullHistoryTsLowFromU64CutoffTs(&write_ts_slice, &effective_cutoff); + ASSERT_EQ(effective_cutoff, full_history_ts_low); + } +} + +// Param 0: test mode for the user-defined timestamp feature +INSTANTIATE_TEST_CASE_P( + P, DBWALTestWithTimestamp, + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal)); + +TEST_F(DBWALTestWithTimestamp, EnableDisableUDT) { + Options options; + options.create_if_missing = true; + options.comparator = BytewiseComparator(); + bool avoid_flush_during_recovery = true; + ASSERT_OK(CreateAndReopenWithTs({"pikachu"}, options, true /* persist_udt */, + avoid_flush_during_recovery)); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "v1")); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "baz", "v5")); + + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = false; + // Test handle timestamp size inconsistency in WAL when enabling user-defined + // timestamps. + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, options, + false /* persist_udt */, + avoid_flush_during_recovery)); + + std::string ts; + PutFixed64(&ts, 0); + Slice ts_slice = ts; + ReadOptions read_opts; + read_opts.timestamp = &ts_slice; + // Pre-existing entries are treated as if they have the min timestamp. 
+ CheckGet(read_opts, 1, "foo", "v1", ts); + CheckGet(read_opts, 1, "baz", "v5", ts); + ts.clear(); + PutFixed64(&ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", ts, "v2")); + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "baz", ts, "v6")); + CheckGet(read_opts, 1, "foo", "v2", ts); + CheckGet(read_opts, 1, "baz", "v6", ts); + + options.comparator = BytewiseComparator(); + // Open the column family again with the UDT feature disabled. Test handle + // timestamp size inconsistency in WAL when disabling user-defined timestamps + ASSERT_OK(ReopenColumnFamiliesWithTs({"pikachu"}, options, + true /* persist_udt */, + avoid_flush_during_recovery)); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_EQ("v6", Get(1, "baz")); +} + TEST_F(DBWALTest, RecoverWithTableHandle) { do { Options options = CurrentOptions(); @@ -444,7 +688,6 @@ TEST_F(DBWALTest, RecoverWithBlob) { ASSERT_EQ(blob_file->GetTotalBlobCount(), 1); -#ifndef ROCKSDB_LITE const InternalStats* const internal_stats = cfd->internal_stats(); ASSERT_NE(internal_stats, nullptr); @@ -460,7 +703,6 @@ TEST_F(DBWALTest, RecoverWithBlob) { ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED], compaction_stats[0].bytes_written + compaction_stats[0].bytes_written_blob); -#endif // ROCKSDB_LITE } TEST_F(DBWALTest, RecoverWithBlobMultiSST) { @@ -610,25 +852,14 @@ TEST_F(DBWALTest, WALWithChecksumHandoff) { #endif // ROCKSDB_ASSERT_STATUS_CHECKED } -#ifndef ROCKSDB_LITE TEST_F(DBWALTest, LockWal) { do { Options options = CurrentOptions(); options.create_if_missing = true; DestroyAndReopen(options); - SyncPoint::GetInstance()->DisableProcessing(); - SyncPoint::GetInstance()->LoadDependency( - {{"DBWALTest::LockWal:AfterGetSortedWal", - "DBWALTest::LockWal:BeforeFlush:1"}}); - SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put("foo", "v")); ASSERT_OK(Put("bar", "v")); - port::Thread worker([&]() { - TEST_SYNC_POINT("DBWALTest::LockWal:BeforeFlush:1"); - Status tmp_s = db_->Flush(FlushOptions()); - ASSERT_OK(tmp_s); - }); ASSERT_OK(db_->LockWAL()); // Verify writes are stopped @@ -641,7 +872,10 @@ TEST_F(DBWALTest, LockWal) { ASSERT_OK(db_->GetSortedWalFiles(wals)); ASSERT_FALSE(wals.empty()); } - TEST_SYNC_POINT("DBWALTest::LockWal:AfterGetSortedWal"); + port::Thread worker([&]() { + Status tmp_s = db_->Flush(FlushOptions()); + ASSERT_OK(tmp_s); + }); FlushOptions flush_opts; flush_opts.wait = false; s = db_->Flush(flush_opts); @@ -650,11 +884,8 @@ TEST_F(DBWALTest, LockWal) { ASSERT_OK(db_->Put(WriteOptions(), "foo", "dontcare")); worker.join(); - - SyncPoint::GetInstance()->DisableProcessing(); } while (ChangeWalOptions()); } -#endif //! ROCKSDB_LITE class DBRecoveryTestBlobError : public DBWALTest, @@ -892,7 +1123,6 @@ TEST_F(DBWALTest, PreallocateBlock) { } #endif // !(defined NDEBUG) || !defined(OS_WIN) -#ifndef ROCKSDB_LITE TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) { // TODO(ajkr): Disabled until WAL recycling is fixed for // `kPointInTimeRecovery`. 
@@ -965,7 +1195,7 @@ TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) { ROCKSDB_NAMESPACE::port::Thread thread([&]() { TEST_SYNC_POINT( "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"); - ASSERT_OK(db_->EnableFileDeletions(true)); + ASSERT_OK(db_->EnableFileDeletions(/*force=*/true)); TEST_SYNC_POINT( "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge"); }); @@ -1311,7 +1541,9 @@ class RecoveryTestHelper { test->dbname_, &db_options, file_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")); + /*io_tracer=*/nullptr, /*db_id=*/"", /*db_session_id=*/"", + options.daily_offpeak_time_utc, + /*error_handler=*/nullptr)); wal_manager.reset( new WalManager(db_options, file_options, /*io_tracer=*/nullptr)); @@ -1647,6 +1879,8 @@ TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { Options options = CurrentOptions(); + // Small size to force manifest creation + options.max_manifest_file_size = 1; options.track_and_verify_wals_in_manifest = true; DestroyAndReopen(options); @@ -1663,53 +1897,33 @@ TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) { // (2) SyncWAL() proceeds with the lock. It // creates a new manifest and syncs all the inactive wals before the latest // (i.e, active log), which is 4.log. Note that SyncWAL() is not aware of the - // fact that 4.log has marked as to be obseleted. Prior to the fix, such wal + // fact that 4.log has marked as to be obseleted. Such wal // sync will then add a WAL addition record of 4.log to the new manifest - // without any special treatment. - // (3) BackgroundFlush() will eventually purge 4.log. + // without any special treatment. Prior to the fix, there is no WAL deletion + // record to offset it. (3) BackgroundFlush() will eventually purge 4.log. 
+ bool wal_synced = false; SyncPoint::GetInstance()->SetCallBack( "FindObsoleteFiles::PostMutexUnlock", [&](void*) { ASSERT_OK(env_->FileExists(wal_file_path)); - - SyncPoint::GetInstance()->SetCallBack( - "VersionSet::ProcessManifestWrites:" - "PostDecidingCreateNewManifestOrNot", - [&](void* arg) { - bool* new_descriptor_log = (bool*)arg; - *new_descriptor_log = true; - }); - + uint64_t pre_sync_wal_manifest_no = + dbfull()->TEST_Current_Manifest_FileNo(); ASSERT_OK(db_->SyncWAL()); + uint64_t post_sync_wal_manifest_no = + dbfull()->TEST_Current_Manifest_FileNo(); + bool new_manifest_created = + post_sync_wal_manifest_no == pre_sync_wal_manifest_no + 1; + ASSERT_TRUE(new_manifest_created); wal_synced = true; }); - SyncPoint::GetInstance()->SetCallBack( - "DBImpl::DeleteObsoleteFileImpl:AfterDeletion2", [&](void* arg) { - std::string* file_name = (std::string*)arg; - if (*file_name == wal_file_path) { - TEST_SYNC_POINT( - "DBWALTest::" - "FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" - "PostDeleteWAL"); - } - }); - - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCallFlush:FilesFound", - "PreConfrimObsoletedWALSynced"}, - {"DBWALTest::FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL::" - "PostDeleteWAL", - "PreConfrimWALDeleted"}}); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); - TEST_SYNC_POINT("PreConfrimObsoletedWALSynced"); ASSERT_TRUE(wal_synced); - - TEST_SYNC_POINT("PreConfrimWALDeleted"); // BackgroundFlush() purged 4.log // because the memtable associated with the WAL was flushed and new WAL was // created (i.e, 8.log) @@ -2016,6 +2230,7 @@ TEST_P(DBWALTestWithParamsVaryingRecoveryMode, data.push_back( std::make_pair(iter->key().ToString(), iter->value().ToString())); } + EXPECT_OK(iter->status()); delete iter; return data; }; @@ -2372,7 +2587,6 @@ TEST_F(DBWALTest, WalInManifestButNotInSortedWals) { Close(); } -#endif // ROCKSDB_LITE TEST_F(DBWALTest, WalTermTest) { Options options = CurrentOptions(); @@ -2398,7 +2612,6 @@ TEST_F(DBWALTest, WalTermTest) { ASSERT_EQ("NOT_FOUND", Get(1, "foo2")); } -#ifndef ROCKSDB_LITE TEST_F(DBWALTest, GetCompressedWalsAfterSync) { if (db_->GetOptions().wal_compression == kNoCompression) { ROCKSDB_GTEST_BYPASS("stream compression not present"); @@ -2433,7 +2646,31 @@ TEST_F(DBWALTest, GetCompressedWalsAfterSync) { Status s = dbfull()->GetSortedWalFiles(wals); ASSERT_OK(s); } -#endif // ROCKSDB_LITE + +TEST_F(DBWALTest, EmptyWalReopenTest) { + Options options = CurrentOptions(); + options.env = env_; + CreateAndReopenWithCF({"pikachu"}, options); + + // make sure we can re-open it. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + + { + std::vector files; + int num_wal_files = 0; + ASSERT_OK(env_->GetChildren(dbname_, &files)); + for (const auto& file : files) { + uint64_t number = 0; + FileType type = kWalFile; + if (ParseFileName(file, &number, &type) && type == kWalFile) { + num_wal_files++; + } + } + + ASSERT_EQ(num_wal_files, 1); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/db_with_timestamp_basic_test.cc b/db/db_with_timestamp_basic_test.cc index 4208169236d1..4bd8eaa0bfbc 100644 --- a/db/db_with_timestamp_basic_test.cc +++ b/db/db_with_timestamp_basic_test.cc @@ -13,9 +13,7 @@ #include "rocksdb/utilities/debug.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" -#endif #include "test_util/testutil.h" #include "utilities/fault_injection_env.h" #include "utilities/merge_operators/string_append/stringappend2.h" @@ -461,6 +459,13 @@ TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) { db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size)); ASSERT_GT(size, 0); + uint64_t total_mem_count; + uint64_t total_mem_size; + db_->GetApproximateMemTableStats(default_cf, r, &total_mem_count, + &total_mem_size); + ASSERT_GT(total_mem_count, 0); + ASSERT_GT(total_mem_size, 0); + // Should exclude end key start = Key(900); end = Key(1000); @@ -520,6 +525,7 @@ TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); } + ASSERT_OK(it->status()); ASSERT_EQ(static_cast(kMaxKey) - start_keys[i] + 1, count); // SeekToFirst()/SeekToLast() with lower/upper bounds. 
@@ -539,6 +545,7 @@ TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { CheckIterUserEntry(it.get(), Key1(key), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); } + ASSERT_OK(it->status()); ASSERT_EQ(r - std::max(l, start_keys[i]), count); for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0; @@ -546,6 +553,7 @@ TEST_F(DBBasicTestWithTimestamp, SimpleIterate) { CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue, "value" + std::to_string(i), write_timestamps[i]); } + ASSERT_OK(it->status()); l += (kMaxKey / 100); r -= (kMaxKey / 100); } @@ -645,7 +653,6 @@ TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) { .IsInvalidArgument()); } -#ifndef ROCKSDB_LITE TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) { Options options = CurrentOptions(); const size_t kTimestampSize = Timestamp(0, 0).size(); @@ -675,7 +682,6 @@ TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) { } Close(); } -#endif // !ROCKSDB_LITE class DBBasicTestWithTimestampTableOptions : public DBBasicTestWithTimestampBase, @@ -730,6 +736,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) { ASSERT_EQ(it->value(), value_from_get); ASSERT_EQ(Timestamp(1, 0), timestamp); } + ASSERT_OK(it->status()); // verify MultiGet() constexpr uint64_t step = 2; @@ -1062,6 +1069,7 @@ TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) { write_timestamps[i - 1]); } } + ASSERT_OK(it->status()); size_t expected_count = kMaxKey + 1; ASSERT_EQ(expected_count, count); } @@ -1140,6 +1148,7 @@ TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) { write_timestamps[1]); } } + ASSERT_OK(it->status()); size_t expected_count = kMaxKey + 1; ASSERT_EQ(expected_count, count); } @@ -1170,6 +1179,7 @@ TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) { CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(), write_timestamp); } + ASSERT_OK(it->status()); ASSERT_EQ(kMaxKey + 1, count); } Close(); @@ -1275,6 +1285,7 @@ TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound_Reseek) { CheckIterEntry(it.get(), "a", kTypeValue, "v" + std::to_string(4 + i), Timestamp(4 + i, 0)); } + ASSERT_OK(it->status()); } Close(); @@ -1614,6 +1625,105 @@ TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) { Close(); } +TEST_F(DBBasicTestWithTimestamp, GetWithRowCache) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + LRUCacheOptions cache_options; + cache_options.capacity = 8192; + options.row_cache = cache_options.MakeSharedRowCache(); + + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + WriteOptions write_opts; + std::string ts_early = Timestamp(1, 0); + std::string ts_later = Timestamp(10, 0); + Slice ts_later_slice = ts_later; + + const Snapshot* snap_with_nothing = db_->GetSnapshot(); + ASSERT_OK(db_->Put(write_opts, "foo", ts_early, "bar")); + const Snapshot* snap_with_foo = db_->GetSnapshot(); + + // Ensure file has sequence number greater than snapshot_with_foo + for (int i = 0; i < 10; i++) { + std::string numStr = std::to_string(i); + ASSERT_OK(db_->Put(write_opts, numStr, ts_later, numStr)); + } + ASSERT_OK(Flush()); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); + + ReadOptions read_opts; + read_opts.timestamp = &ts_later_slice; + 
+ std::string read_value; + std::string read_ts; + Status s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + ASSERT_EQ(read_ts, ts_early); + + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); + // Row cache is not storing the ts when record is inserted/updated. + // To be fixed after enabling ROW_CACHE with timestamp. + // ASSERT_EQ(read_ts, ts_early); + + { + std::string ts_nothing = Timestamp(0, 0); + Slice ts_nothing_slice = ts_nothing; + read_opts.timestamp = &ts_nothing_slice; + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); + + read_opts.timestamp = &ts_later_slice; + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); + } + + { + read_opts.snapshot = snap_with_foo; + + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 3); + + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_OK(s); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 3); + } + + { + read_opts.snapshot = snap_with_nothing; + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 4); + + s = db_->Get(read_opts, "foo", &read_value, &read_ts); + ASSERT_TRUE(s.IsNotFound()); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); + ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 5); + } + + db_->ReleaseSnapshot(snap_with_nothing); + db_->ReleaseSnapshot(snap_with_foo); + Close(); +} + TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) { Options options = CurrentOptions(); options.env = env_; @@ -2677,7 +2787,6 @@ TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) { } } -#ifndef ROCKSDB_LITE // A class which remembers the name of each flushed file. 
class FlushedFileCollector : public EventListener { public: @@ -2970,7 +3079,6 @@ TEST_F(DBBasicTestWithTimestamp, MultiGetNoReturnTs) { Close(); } -#endif // !ROCKSDB_LITE INSTANTIATE_TEST_CASE_P( Timestamp, DBBasicTestWithTimestampCompressionSettings, @@ -3045,6 +3153,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { "value" + std::to_string(i), write_ts_list[i]); iter->Next(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); // Seek to kMinKey iter->Seek(Key1(kMinKey)); @@ -3052,6 +3161,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { "value" + std::to_string(i), write_ts_list[i]); iter->Prev(); ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); } const std::vector targets = {kMinKey, kMinKey + 0x10, kMinKey + 0x100, kMaxKey}; @@ -3090,6 +3200,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { ++expected_key; it->Next(); } + ASSERT_OK(it->status()); ASSERT_EQ(expected_ub - targets[j] + 1, count); count = 0; @@ -3108,6 +3219,7 @@ TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) { --expected_key; it->Prev(); } + ASSERT_OK(it->status()); ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count); } } @@ -3213,6 +3325,7 @@ TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) { ASSERT_EQ(Key1(key), iter->key()); ASSERT_EQ("value1" + std::to_string(key), iter->value()); } + ASSERT_OK(iter->status()); ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count); } Close(); @@ -3279,6 +3392,361 @@ TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) { Close(); } +// Tests the effect of flag `persist_user_defined_timestamps` on the file +// boundaries contained in the Manifest, a.k.a FileMetaData.smallest, +// FileMetaData.largest. +class HandleFileBoundariesTest + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface { + public: + HandleFileBoundariesTest() + : DBBasicTestWithTimestampBase("/handle_file_boundaries") {} +}; + +TEST_P(HandleFileBoundariesTest, ConfigurePersistUdt) { + Options options = CurrentOptions(); + options.env = env_; + // Write a timestamp that is not the min timestamp to help test the behavior + // of flag `persist_user_defined_timestamps`. + std::string write_ts; + std::string min_ts; + PutFixed64(&write_ts, 1); + PutFixed64(&min_ts, 0); + std::string smallest_ukey_without_ts = "bar"; + std::string largest_ukey_without_ts = "foo"; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + bool persist_udt = test::ShouldPersistUDT(GetParam()); + options.persist_user_defined_timestamps = persist_udt; + if (!persist_udt) { + options.allow_concurrent_memtable_write = false; + } + DestroyAndReopen(options); + + ASSERT_OK( + db_->Put(WriteOptions(), smallest_ukey_without_ts, write_ts, "val1")); + ASSERT_OK( + db_->Put(WriteOptions(), largest_ukey_without_ts, write_ts, "val2")); + + // Create a L0 SST file and its record is added to the Manfiest. + ASSERT_OK(Flush()); + Close(); + + options.create_if_missing = false; + // Reopen the DB and process manifest file. + Reopen(options); + + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files.size(), 1); + // L0 only has one SST file. 
+ ASSERT_EQ(level_to_files[0].size(), 1); + auto file_meta = level_to_files[0][0]; + if (persist_udt) { + ASSERT_EQ(smallest_ukey_without_ts + write_ts, + file_meta.smallest.user_key()); + ASSERT_EQ(largest_ukey_without_ts + write_ts, file_meta.largest.user_key()); + } else { + // If `persist_user_defined_timestamps` is false, the file boundaries should + // have the min timestamp. Behind the scenes, when file boundaries in + // FileMetaData is persisted to Manifest, the original user-defined + // timestamps in user key are stripped. When manifest is read and processed + // during DB open, a min timestamp is padded to the file boundaries. This + // test's writes contain non min timestamp to verify this logic end-to-end. + ASSERT_EQ(smallest_ukey_without_ts + min_ts, file_meta.smallest.user_key()); + ASSERT_EQ(largest_ukey_without_ts + min_ts, file_meta.largest.user_key()); + } + Close(); +} + +INSTANTIATE_TEST_CASE_P( + ConfigurePersistUdt, HandleFileBoundariesTest, + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal)); + +TEST_F(DBBasicTestWithTimestamp, EnableDisableUDT) { + Options options = CurrentOptions(); + options.env = env_; + // Create a column family without user-defined timestamps. + options.comparator = BytewiseComparator(); + options.persist_user_defined_timestamps = true; + DestroyAndReopen(options); + + // Create one SST file, its user keys have no user-defined timestamps. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "val1")); + ASSERT_OK(Flush(0)); + Close(); + + // Reopen the existing column family and enable user-defined timestamps + // feature for it. + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + Reopen(options); + + std::string value; + ASSERT_TRUE(db_->Get(ReadOptions(), "foo", &value).IsInvalidArgument()); + std::string read_ts; + PutFixed64(&read_ts, 0); + ReadOptions ropts; + Slice read_ts_slice = read_ts; + ropts.timestamp = &read_ts_slice; + std::string key_ts; + // Entries in pre-existing SST files are treated as if they have minimum + // user-defined timestamps. + ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts)); + ASSERT_EQ("val1", value); + ASSERT_EQ(read_ts, key_ts); + + // Do timestamped read / write. + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val2")); + read_ts.clear(); + PutFixed64(&read_ts, 1); + ASSERT_OK(db_->Get(ropts, "foo", &value, &key_ts)); + ASSERT_EQ("val2", value); + ASSERT_EQ(write_ts, key_ts); + // The user keys in this SST file don't have user-defined timestamps either, + // because `persist_user_defined_timestamps` flag is set to false. + ASSERT_OK(Flush(0)); + Close(); + + // Reopen the existing column family while disabling user-defined timestamps. + options.comparator = BytewiseComparator(); + Reopen(options); + + ASSERT_TRUE(db_->Get(ropts, "foo", &value).IsInvalidArgument()); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("val2", value); + + // Continue to write / read the column family without user-defined timestamps. + ASSERT_OK(db_->Put(WriteOptions(), "foo", "val3")); + ASSERT_OK(db_->Get(ReadOptions(), "foo", &value)); + ASSERT_EQ("val3", value); + Close(); +} + +// Tests that as long as the +// `ReadOptions.timestamp >= SuperVersion.full_history_ts_low` sanity check +// passes. 
The read will be consistent even if the column family's +// full_history_ts_low is concurrently increased and collapsed some history +// above `ReadOptions.timestamp`. +TEST_F(DBBasicTestWithTimestamp, + FullHistoryTsLowSanityCheckPassReadIsConsistent) { + Options options = CurrentOptions(); + options.env = env_; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + // Use UDT in memtable only feature for this test, so we can control that + // newly set `full_history_ts_low` collapse history when Flush happens. + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + DestroyAndReopen(options); + std::string min_ts; + PutFixed64(&min_ts, 0); + + // Write two versions of the key (1, v1), (3, v3), and always read with + // timestamp 2. + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val1")); + + std::string read_ts; + PutFixed64(&read_ts, 2); + Slice read_ts_slice = read_ts; + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + + // First read, no full_history_ts_low set, sanity check pass. + std::string value; + std::string timestamp; + ASSERT_OK(db_->Get(read_opts, "foo", &value, ×tamp)); + ASSERT_EQ("val1", value); + ASSERT_EQ(write_ts, timestamp); + + std::string full_history_ts_low; + std::string marked_ts_low; + PutFixed64(&full_history_ts_low, 2); + marked_ts_low = full_history_ts_low; + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + ASSERT_OK(Flush(0)); + + // Write the (3, v3) entry after flush, otherwise with UDT in memtable only + // the previous Flush(0) with full_history_ts_low = 2 will be postponed + // waiting for (3, v3) to expire too. + write_ts.clear(); + PutFixed64(&write_ts, 3); + ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val3")); + + // Second read: + // ReadOptions.timestamp(2) >= SuperVersion.full_history_ts_low(2), + // and ReadOptions.timestamp(2) >= ColumnFamilyData.full_history_ts_low(2). + // history below 2 is collapsed. Reading at 2 or above 2 is ok. + // Sanity check pass. Read return consistent value, but timestamp is already + // collapsed. + ASSERT_OK(db_->Get(read_opts, "foo", &value, ×tamp)); + ASSERT_EQ("val1", value); + ASSERT_EQ(min_ts, timestamp); + + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::GetImpl:AfterAcquireSv", [&](void* /*arg*/) { + // Concurrently increasing full_history_ts_low and flush to create a + // new SuperVersion + std::string current_ts_low; + ASSERT_OK(db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), + ¤t_ts_low)); + if (current_ts_low.empty() || current_ts_low != marked_ts_low) { + return; + } + full_history_ts_low.clear(); + PutFixed64(&full_history_ts_low, 4); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + ASSERT_OK(Flush(0)); + }); + + SyncPoint::GetInstance()->EnableProcessing(); + + // Third read: + // ReadOptions.timestamp(2) >= SuperVersion.full_history_ts_low(2), + // but ReadOptions.timestamp(2) < ColumnFamilyData.full_history_ts_low(4). + // History below 4 is collapsed in the newly installed SuperVersion. But the + // SuperVersion attached to this read still has the history below 4 available. + // Sanity check pass. Read return consistent value, timestamp is collapsed. + ASSERT_OK(db_->Get(read_opts, "foo", &value, ×tamp)); + ASSERT_EQ("val1", value); + ASSERT_EQ(min_ts, timestamp); + + // Fourth read: + // ReadOptions.timestamp(2) < SuperVersion.full_history_ts_low(4). 
+ // Sanity check fails. Had it succeeded, the read would return "v3", + // which is inconsistent. + ASSERT_TRUE( + db_->Get(read_opts, "foo", &value, ×tamp).IsInvalidArgument()); + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); +} + +// Tests that in cases when +// `ReadOptions.timestamp >= SuperVersion.full_history_ts_low` sanity check +// fails. The referenced SuperVersion is dereferenced and cleaned up properly +// for all read APIs that involves this sanity check. +TEST_F(DBBasicTestWithTimestamp, FullHistoryTsLowSanityCheckFail) { + Options options = CurrentOptions(); + options.env = env_; + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + // Use UDT in memtable only feature for this test, so we can control that + // newly set `full_history_ts_low` collapse history when Flush happens. + options.persist_user_defined_timestamps = false; + options.allow_concurrent_memtable_write = false; + DestroyAndReopen(options); + + ColumnFamilyHandle* handle2 = nullptr; + Status s = db_->CreateColumnFamily(options, "data", &handle2); + ASSERT_OK(s); + + std::string write_ts; + PutFixed64(&write_ts, 1); + ASSERT_OK(db_->Put(WriteOptions(), "foo", write_ts, "val1")); + ASSERT_OK(db_->Put(WriteOptions(), handle2, "foo", write_ts, "val1")); + + std::string full_history_ts_low; + PutFixed64(&full_history_ts_low, 3); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), + full_history_ts_low)); + ASSERT_OK(db_->IncreaseFullHistoryTsLow(handle2, full_history_ts_low)); + ASSERT_OK(Flush(0)); + ASSERT_OK(db_->Flush(FlushOptions(), handle2)); + + std::string read_ts; + PutFixed64(&read_ts, 2); + Slice read_ts_slice = read_ts; + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + + // Get() + std::string value; + ASSERT_TRUE(db_->Get(read_opts, "foo", &value).IsInvalidArgument()); + + // MultiGet() + std::vector cfhs = {db_->DefaultColumnFamily(), handle2}; + { + std::vector key_vals = {"foo", "foo"}; + std::vector keys; + std::vector values; + for (size_t j = 0; j < 2; ++j) { + keys.push_back(key_vals[j]); + } + + std::vector statuses = + db_->MultiGet(read_opts, cfhs, keys, &values); + for (auto status : statuses) { + ASSERT_TRUE(status.IsInvalidArgument()); + } + } + + // MultiGet with only one column family + { + std::vector one_cfh = {db_->DefaultColumnFamily()}; + std::vector key_vals = {"foo"}; + std::vector keys; + std::vector values; + for (size_t j = 0; j < 1; ++j) { + keys.push_back(key_vals[j]); + } + + std::vector statuses = + db_->MultiGet(read_opts, one_cfh, keys, &values); + for (auto status : statuses) { + ASSERT_TRUE(status.IsInvalidArgument()); + } + } + + // Overloaded version of MultiGet + ColumnFamilyHandle* column_families[] = {db_->DefaultColumnFamily(), handle2}; + { + Slice keys[] = {"foo", "foo"}; + PinnableSlice values[] = {PinnableSlice(), PinnableSlice()}; + Status statuses[] = {Status::OK(), Status::OK()}; + db_->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0], + &values[0], &statuses[0], /*sorted_input=*/false); + for (auto status : statuses) { + ASSERT_TRUE(status.IsInvalidArgument()); + } + } + + // Overloaded versions of MultiGet with one column family + { + ColumnFamilyHandle* one_column_family[] = {db_->DefaultColumnFamily()}; + Slice keys[] = {"foo"}; + PinnableSlice values[] = {PinnableSlice()}; + Status statuses[] = {Status::OK()}; + db_->MultiGet(read_opts, /*num_keys=*/1, &one_column_family[0], &keys[0], + &values[0], &statuses[0], 
/*sorted_input=*/false); + for (auto status : statuses) { + ASSERT_TRUE(status.IsInvalidArgument()); + } + } + + // NewIterator() + std::unique_ptr iter( + db_->NewIterator(read_opts, db_->DefaultColumnFamily())); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + std::unique_ptr iter2(db_->NewIterator(read_opts, handle2)); + ASSERT_TRUE(iter2->status().IsInvalidArgument()); + + // NewIterators() + std::vector iterators; + ASSERT_TRUE( + db_->NewIterators(read_opts, cfhs, &iterators).IsInvalidArgument()); + delete handle2; + Close(); +} + TEST_F(DBBasicTestWithTimestamp, GCPreserveRangeTombstoneWhenNoOrSmallFullHistoryLow) { Options options = CurrentOptions(); @@ -3477,6 +3945,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) { ++expected; } } + ASSERT_OK(iter->status()); ASSERT_EQ(kNum, expected); expected = kNum / 2; @@ -3484,6 +3953,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) { ASSERT_EQ(Key1(expected), iter->key()); ++expected; } + ASSERT_OK(iter->status()); ASSERT_EQ(kNum, expected); expected = kRangeBegin - 1; @@ -3491,6 +3961,7 @@ TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBaiscReadAndIterate) { ASSERT_EQ(Key1(expected), iter->key()); --expected; } + ASSERT_OK(iter->status()); ASSERT_EQ(-1, expected); read_ts = Timestamp(0, 0); @@ -3772,6 +4243,7 @@ TEST_F(DBBasicTestWithTimestamp, MergeBasic) { ASSERT_EQ(value, it->value()); ASSERT_EQ(write_ts_strs[i], it->timestamp()); } + EXPECT_OK(it->status()); ASSERT_EQ(kNumOfUniqKeys, key_int_val); key_int_val = kNumOfUniqKeys - 1; @@ -3783,6 +4255,7 @@ TEST_F(DBBasicTestWithTimestamp, MergeBasic) { ASSERT_EQ(value, it->value()); ASSERT_EQ(write_ts_strs[i], it->timestamp()); } + ASSERT_OK(it->status()); ASSERT_EQ(std::numeric_limits::max(), key_int_val); value_suffix = value_suffix + "." + std::to_string(i + 1); @@ -3898,6 +4371,120 @@ TEST_F(DBBasicTestWithTimestamp, RangeTombstoneApproximateSize) { std::numeric_limits::max() /* max_file_num_to_ignore */, "" /*trim_ts*/)); } + +TEST_F(DBBasicTestWithTimestamp, IterSeekToLastWithIterateUpperbound) { + // Test for a bug fix where DBIter::SeekToLast() could fail when + // iterate_upper_bound and iter_start_ts are both set. 
+ Options options = CurrentOptions(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + DestroyAndReopen(options); + + ASSERT_OK(db_->Put(WriteOptions(), Key(1), Timestamp(2, 0), "val")); + ReadOptions ro; + std::string k = Key(1); + Slice k_slice = k; + ro.iterate_upper_bound = &k_slice; + std::string ts = Timestamp(3, 0); + Slice read_ts = ts; + ro.timestamp = &read_ts; + std::string start_ts = Timestamp(0, 0); + Slice start_ts_slice = start_ts; + ro.iter_start_ts = &start_ts_slice; + std::unique_ptr iter{db_->NewIterator(ro)}; + iter->SeekToLast(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); +} + +TEST_F(DBBasicTestWithTimestamp, TimestampFilterTableReadOnGet) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + const size_t kTimestampSize = Timestamp(0, 0).size(); + TestComparator test_cmp(kTimestampSize); + options.comparator = &test_cmp; + BlockBasedTableOptions bbto; + bbto.block_size = 100; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + // Put + // Create two SST files + // file1: key => [1, 3], timestamp => [10, 20] + // file2, key => [2, 4], timestamp => [30, 40] + { + WriteOptions write_opts; + std::string write_ts = Timestamp(10, 0); + ASSERT_OK(db_->Put(write_opts, Key1(1), write_ts, "value1")); + write_ts = Timestamp(20, 0); + ASSERT_OK(db_->Put(write_opts, Key1(3), write_ts, "value3")); + ASSERT_OK(Flush()); + + write_ts = Timestamp(30, 0); + ASSERT_OK(db_->Put(write_opts, Key1(2), write_ts, "value2")); + write_ts = Timestamp(40, 0); + ASSERT_OK(db_->Put(write_opts, Key1(4), write_ts, "value4")); + ASSERT_OK(Flush()); + } + + // Get with timestamp + { + auto prev_checked_events = options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_CHECKED); + auto prev_filtered_events = options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_FILTERED); + + // key=3 (ts=20) does not exist at timestamp=1 + std::string read_ts_str = Timestamp(1, 0); + Slice read_ts_slice = Slice(read_ts_str); + ReadOptions read_opts; + read_opts.timestamp = &read_ts_slice; + std::string value_from_get = ""; + std::string timestamp_from_get = ""; + auto status = + db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get); + ASSERT_TRUE(status.IsNotFound()); + ASSERT_EQ(value_from_get, std::string("")); + ASSERT_EQ(timestamp_from_get, std::string("")); + + // key=3 is in the key ranges for both files, so both files will be queried. + // The table read was skipped because the timestamp is out of the table + // range, i.e.., 1 < [10,20], [30,40]. + // The tickers increase by 2 due to 2 files. + ASSERT_EQ(prev_checked_events + 2, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_CHECKED)); + ASSERT_EQ(prev_filtered_events + 2, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_FILTERED)); + + // key=3 (ts=20) exists at timestamp = 25 + read_ts_str = Timestamp(25, 0); + read_ts_slice = Slice(read_ts_str); + read_opts.timestamp = &read_ts_slice; + ASSERT_OK( + db_->Get(read_opts, Key1(3), &value_from_get, ×tamp_from_get)); + ASSERT_EQ("value3", value_from_get); + ASSERT_EQ(Timestamp(20, 0), timestamp_from_get); + + // file1 was not skipped, because the timestamp is in range, [10,20] < 25. + // file2 was skipped, because the timestamp is not in range, 25 < [30,40]. 
+ // So the checked ticker increase by 2 due to 2 files; + // filtered ticker increase by 1 because file2 was skipped + ASSERT_EQ(prev_checked_events + 4, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_CHECKED)); + ASSERT_EQ(prev_filtered_events + 3, + options.statistics->getTickerCount( + Tickers::TIMESTAMP_FILTER_TABLE_FILTERED)); + } + + Close(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { @@ -3905,4 +4492,4 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); -} +} \ No newline at end of file diff --git a/db/db_with_timestamp_compaction_test.cc b/db/db_with_timestamp_compaction_test.cc index 403d9907c57a..7d80c85c42b6 100644 --- a/db/db_with_timestamp_compaction_test.cc +++ b/db/db_with_timestamp_compaction_test.cc @@ -198,7 +198,6 @@ class TestFilePartitionerFactory : public SstPartitionerFactory { const char* Name() const override { return "TestFilePartitionerFactory"; } }; -#ifndef ROCKSDB_LITE TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL0) { Options options = CurrentOptions(); options.env = env_; @@ -344,7 +343,6 @@ TEST_F(TimestampCompatibleCompactionTest, EmptyCompactionOutput) { cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_write_buffer_manager_test.cc b/db/db_write_buffer_manager_test.cc index 4c31a7824659..eb33ec41e12f 100644 --- a/db/db_write_buffer_manager_test.cc +++ b/db/db_write_buffer_manager_test.cc @@ -42,10 +42,10 @@ TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) { CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); - Flush(3); + ASSERT_OK(Flush(3)); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); - Flush(0); + ASSERT_OK(Flush(0)); // Write to "Default", "cf2" and "cf3". ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); @@ -84,10 +84,10 @@ TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) { CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); - Flush(3); + ASSERT_OK(Flush(3)); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); - Flush(0); + ASSERT_OK(Flush(0)); // Write to "Default", "cf2" and "cf3". No flush will be triggered. ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); @@ -471,10 +471,10 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) { CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); - Flush(3); + ASSERT_OK(Flush(3)); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); - Flush(0); + ASSERT_OK(Flush(0)); // Write to "Default", "cf2" and "cf3". No flush will be triggered. ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); @@ -780,7 +780,6 @@ TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#ifndef ROCKSDB_LITE // Tests a `WriteBufferManager` constructed with `allow_stall == false` does not // thrash memtable switching when full and a CF receives multiple writes. 
@@ -847,7 +846,73 @@ TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) { delete shared_wbm_db; } -#endif // ROCKSDB_LITE +TEST_F(DBWriteBufferManagerTest, RuntimeChangeableAllowStall) { + constexpr int kBigValue = 10000; + + Options options = CurrentOptions(); + options.write_buffer_manager.reset( + new WriteBufferManager(1, nullptr /* cache */, true /* allow_stall */)); + DestroyAndReopen(options); + + // Pause flush thread so that + // (a) the only way to exist write stall below is to change the `allow_stall` + // (b) the write stall is "stable" without being interfered by flushes so that + // we can check it without flakiness + std::unique_ptr sleeping_task( + new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::HIGH); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task.get(), Env::Priority::HIGH); + sleeping_task->WaitUntilSleeping(); + + // Test 1: test setting `allow_stall` from true to false + // + // Assert existence of a write stall + WriteOptions wo_no_slowdown; + wo_no_slowdown.no_slowdown = true; + Status s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"WBMStallInterface::BlockDB", + "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::" + "ChangeParameter"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Test `SetAllowStall()` + port::Thread thread1([&] { ASSERT_OK(Put(Key(0), DummyString(kBigValue))); }); + port::Thread thread2([&] { + TEST_SYNC_POINT( + "DBWriteBufferManagerTest::RuntimeChangeableThreadSafeParameters::" + "ChangeParameter"); + options.write_buffer_manager->SetAllowStall(false); + }); + + // Verify `allow_stall` is successfully set to false in thread2. + // Othwerwise, thread1's write will be stalled and this test will hang + // forever. + thread1.join(); + thread2.join(); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + + // Test 2: test setting `allow_stall` from false to true + // + // Assert no write stall + ASSERT_OK(Put(Key(0), DummyString(kBigValue), wo_no_slowdown)); + + // Test `SetAllowStall()` + options.write_buffer_manager->SetAllowStall(true); + + // Verify `allow_stall` is successfully set to true. + // Otherwise the following write will not be stalled and therefore succeed. + s = Put(Key(0), DummyString(kBigValue), wo_no_slowdown); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.ToString().find("Write stall") != std::string::npos); + sleeping_task->WakeUp(); +} INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest, testing::Bool()); diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 1ce2b14b20e5..59c26eaaaf56 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). #include +#include #include #include #include @@ -18,6 +19,7 @@ #include "util/random.h" #include "util/string_util.h" #include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -493,7 +495,7 @@ TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) { // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the // DB WAL. 
- fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); Reopen(options); @@ -534,7 +536,7 @@ TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) { // Simulate full loss of unsynced data. This should drop nothing since we did // `FlushWAL(true /* sync */)` before `Close()`. - fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); Reopen(options); @@ -605,23 +607,137 @@ TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) { Close(); } -// Test that db->LockWAL() flushes the WAL after locking. -TEST_P(DBWriteTest, LockWalInEffect) { +// Test that db->LockWAL() flushes the WAL after locking, which can fail +TEST_P(DBWriteTest, LockWALInEffect) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } Options options = GetOptions(); + std::shared_ptr fault_fs( + new FaultInjectionTestFS(FileSystem::Default())); + std::unique_ptr fault_fs_env(NewCompositeEnv(fault_fs)); + options.env = fault_fs_env.get(); + options.disable_auto_compactions = true; + options.paranoid_checks = false; + options.max_bgerror_resume_count = 0; // manual Resume() Reopen(options); // try the 1st WAL created during open - ASSERT_OK(Put("key" + std::to_string(0), "value")); - ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty()); - ASSERT_OK(dbfull()->LockWAL()); - ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false)); - ASSERT_OK(dbfull()->UnlockWAL()); + ASSERT_OK(Put("key0", "value")); + ASSERT_NE(options.manual_wal_flush, dbfull()->WALBufferIsEmpty()); + ASSERT_OK(db_->LockWAL()); + ASSERT_TRUE(dbfull()->WALBufferIsEmpty()); + ASSERT_OK(db_->UnlockWAL()); // try the 2nd wal created during SwitchWAL ASSERT_OK(dbfull()->TEST_SwitchWAL()); - ASSERT_OK(Put("key" + std::to_string(0), "value")); - ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty()); - ASSERT_OK(dbfull()->LockWAL()); - ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false)); - ASSERT_OK(dbfull()->UnlockWAL()); + ASSERT_OK(Put("key1", "value")); + ASSERT_NE(options.manual_wal_flush, dbfull()->WALBufferIsEmpty()); + ASSERT_OK(db_->LockWAL()); + ASSERT_TRUE(dbfull()->WALBufferIsEmpty()); + ASSERT_OK(db_->UnlockWAL()); + + // The above `TEST_SwitchWAL()` triggered a flush. That flush needs to finish + // before we make the filesystem inactive, otherwise the flush might hit an + // unrecoverable error (e.g., failed MANIFEST update). + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr)); + + // Fail the WAL flush if applicable + fault_fs->SetFilesystemActive(false); + Status s = Put("key2", "value"); + if (options.manual_wal_flush) { + ASSERT_OK(s); + // I/O failure + ASSERT_NOK(db_->LockWAL()); + // Should not need UnlockWAL after LockWAL fails + } else { + ASSERT_NOK(s); + ASSERT_OK(db_->LockWAL()); + ASSERT_OK(db_->UnlockWAL()); + } + fault_fs->SetFilesystemActive(true); + ASSERT_OK(db_->Resume()); + // Writes should work again + ASSERT_OK(Put("key3", "value")); + ASSERT_EQ(Get("key3"), "value"); + + // Should be extraneous, but allowed + ASSERT_NOK(db_->UnlockWAL()); + + // Close before mock_env destruct. 
+ Close(); +} + +TEST_P(DBWriteTest, LockWALConcurrentRecursive) { + Options options = GetOptions(); + Reopen(options); + ASSERT_OK(Put("k1", "val")); + ASSERT_OK(db_->LockWAL()); // 0 -> 1 + auto frozen_seqno = db_->GetLatestSequenceNumber(); + std::atomic t1_completed{false}; + port::Thread t1{[&]() { + // Won't finish until WAL unlocked + ASSERT_OK(Put("k1", "val2")); + t1_completed = true; + }}; + + ASSERT_OK(db_->LockWAL()); // 1 -> 2 + // Read-only ops are OK + ASSERT_EQ(Get("k1"), "val"); + { + std::vector files; + LiveFilesStorageInfoOptions lf_opts; + // A DB flush could deadlock + lf_opts.wal_size_for_flush = UINT64_MAX; + ASSERT_OK(db_->GetLiveFilesStorageInfo({lf_opts}, &files)); + } + + port::Thread t2{[&]() { + ASSERT_OK(db_->LockWAL()); // 2 -> 3 or 1 -> 2 + }}; + + ASSERT_OK(db_->UnlockWAL()); // 2 -> 1 or 3 -> 2 + // Give t1 an extra chance to jump in case of bug + std::this_thread::yield(); + t2.join(); + ASSERT_FALSE(t1_completed.load()); + + // Should now have 2 outstanding LockWAL + ASSERT_EQ(Get("k1"), "val"); + + ASSERT_OK(db_->UnlockWAL()); // 2 -> 1 + + ASSERT_FALSE(t1_completed.load()); + ASSERT_EQ(Get("k1"), "val"); + ASSERT_EQ(frozen_seqno, db_->GetLatestSequenceNumber()); + + // Ensure final Unlock is concurrency safe and extra Unlock is safe but + // non-OK + std::atomic unlock_ok{0}; + port::Thread t3{[&]() { + if (db_->UnlockWAL().ok()) { + unlock_ok++; + } + ASSERT_OK(db_->LockWAL()); + if (db_->UnlockWAL().ok()) { + unlock_ok++; + } + }}; + + if (db_->UnlockWAL().ok()) { + unlock_ok++; + } + t3.join(); + + // There was one extra unlock, so just one non-ok + ASSERT_EQ(unlock_ok.load(), 2); + + // Write can proceed + t1.join(); + ASSERT_TRUE(t1_completed.load()); + ASSERT_EQ(Get("k1"), "val2"); + // And new writes + ASSERT_OK(Put("k2", "val")); + ASSERT_EQ(Get("k2"), "val"); } TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { @@ -664,231 +780,6 @@ TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) { ASSERT_LE(bytes_num, 1024 * 100); } -TEST_P(DBWriteTest, DisableWriteStall) { - Options options = GetOptions(); - options.disable_write_stall = true; - options.max_write_buffer_number = 2; - options.use_options_file = false; - Reopen(options); - db_->PauseBackgroundWork(); - ASSERT_OK(Put("k1", "v1")); - FlushOptions opts; - opts.wait = false; - opts.allow_write_stall = true; - ASSERT_OK(db_->Flush(opts)); - ASSERT_OK(Put("k2", "v2")); - ASSERT_OK(db_->Flush(opts)); - - // no write stall since it's disabled - ASSERT_OK(Put("k3", "v3")); - - // now enable write stall - ASSERT_OK(db_->SetOptions({{"disable_write_stall", "false"}})); - - WriteOptions wopts; - wopts.no_slowdown = true; - auto st = db_->Put(wopts, "k4", "v4"); - EXPECT_TRUE(st.IsIncomplete()); - - // now disable again - ASSERT_OK(db_->SetOptions({{"disable_write_stall", "true"}})); - // no write stall since it's disabled - ASSERT_OK(Put("k4", "v4")); - - // verify that disable write stall will unblock writes - ASSERT_OK(db_->SetOptions({{"disable_write_stall", "false"}})); - - std::thread t([&]() { - // writes will be blocked due to write stall - // but once we disable write stall, the writes are unblocked - ASSERT_OK(Put("k5", "v5")); - }); - // sleep to make sure t is blocked on write. 
Not ideal but it works - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - ASSERT_OK(db_->SetOptions({{"disable_write_stall", "true"}})); - t.join(); - - Close(); -} - -class DummyListener : public ReplicationLogListener { - public: - std::string OnReplicationLogRecord( - ReplicationLogRecord /*record*/) override { - seq_ += 1; - return std::to_string(seq_); - } - - private: - std::atomic_int seq_{0}; -}; - -// verifies that when `disable_write_stall` is the only cf option we set, -// there won't be manifest updates -TEST_P(DBWriteTest, DisableWriteStallNotWriteManifest) { - // pipelined write is conflicted with atomic flush - if (GetParam() == kPipelinedWrite) { - return ; - } - Options options = GetOptions(); - options.disable_write_stall = false; - // make sure manifest update seq is bumped - options.replication_log_listener = std::make_shared(); - options.atomic_flush = true; - Reopen(options); - - uint64_t manifestUpdateSeq; - ASSERT_OK(db_->GetManifestUpdateSequence(&manifestUpdateSeq)); - - db_->SetOptions({{"disable_write_stall", "true"}}); - - uint64_t newManifestUpdateSeq; - ASSERT_OK(db_->GetManifestUpdateSequence(&newManifestUpdateSeq)); - - EXPECT_EQ(manifestUpdateSeq, newManifestUpdateSeq); - - Close(); -} - -void functionTrampoline(void* arg) { - (*reinterpret_cast*>(arg))(); -} - -// Test the case that non-trival compaction is triggered before we disable write stall -// and make sure compaction job with old mutable_cf_options won't cause write stall -TEST_P(DBWriteTest, AutoCompactionBeforeDisableWriteStall) { - const int kNumKeysPerFile = 100; - - Options options; - options.env = env_; - options.use_options_file = false; - - // auto flush/compaction enabled so that write stall will be triggered - options.disable_auto_compactions = false; - options.disable_auto_flush = false; - - // set write buffer number to trigger write stall - options.max_write_buffer_number = 2; - options.disable_write_stall = false; - - // set compaction trigger to trigger non trival auto compaction - options.num_levels = 3; - options.level0_file_num_compaction_trigger = 3; - - // large write buffer size so auto flush never triggered - options.write_buffer_size = 10 << 20; - - options.max_background_jobs = 2; - - options.info_log = info_log_; - CreateAndReopenWithCF({"pikachu"}, options); - - auto cfd = static_cast_with_check(handles_[1])->cfd(); - - Random rnd(301); - - for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; - num++) { - std::vector values; - // Write 100KB (100 values, each 1K) - for (int i = 0; i < kNumKeysPerFile; i++) { - values.push_back(rnd.RandomString(990)); - ASSERT_OK(Put(1, Key(i), values[i])); - } - ASSERT_OK(dbfull()->Flush({}, handles_[1])); - ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1); - } - - // We are trying to simulate following case: - // 1. non trival compaction job scheduled but not starting yet - // 2. continuous writes trigger flush, which generates too many memtables and - // stalls writes - // 3. disable write stall through setOption API - // 4. compaction job is done. Even though it installs super version with stale - // `mutable_cf_options`, which still has `disable_write_stall=false`, the - // writes are not stalled since latest `mutable_cf_options` has - // `disable_write_stall=true` - // 5. 
flush jobs are done - SyncPoint::GetInstance()->LoadDependency( - {{"DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", - "DBWriteTest::CompactionBeforeDisableWriteStall:BeforeDisableWriteStall"}, - { - "DBWriteTest::CompactionBeforeDisableWriteStall:AfterDisableWriteStall", - "CompactionJob::Run():Start" - }}); - SyncPoint::GetInstance()->EnableProcessing(); - - // Write one more file to trigger auto compaction - std::vector values; - for (int i = 0; i < kNumKeysPerFile; i++) { - values.push_back(rnd.RandomString(990)); - ASSERT_OK(Put(1, Key(i), values[i])); - } - ASSERT_OK(dbfull()->Flush({}, handles_[1])); - - TEST_SYNC_POINT("DBWriteTest::CompactionBeforeDisableWriteStall:BeforeDisableWriteStall"); - // writes not stalled yet - EXPECT_FALSE(cfd->GetSuperVersion()->mutable_cf_options.disable_write_stall); - EXPECT_FALSE(dbfull() - ->GetVersionSet() - ->GetColumnFamilySet() - ->write_controller() - ->IsStopped()); - - auto cork = std::make_shared>(); - cork->store(true); - std::function corkFunction = [cork]() { - while (cork->load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - }; - - // schedule high priority jobs to block the flush from finishing. - // We can't `pauseBackgroundWork` here since that would prevent the compaction - // from finishing as well - for (int i = 0; i < 2; i++) { - env_->Schedule(&functionTrampoline, &corkFunction, rocksdb::Env::Priority::HIGH); - } - - ASSERT_OK(Put(1, "k1", "v1")); - FlushOptions fopts; - fopts.wait = false; - fopts.allow_write_stall = true; - ASSERT_OK(dbfull()->Flush(fopts, handles_[1])); - ASSERT_OK(Put(1, "k2", "v2")); - // write stall condition triggered after this flush - ASSERT_OK(dbfull()->Flush(fopts, handles_[1])); - EXPECT_EQ(cfd->imm()->NumNotFlushed(), 2); - EXPECT_TRUE(dbfull() - ->GetVersionSet() - ->GetColumnFamilySet() - ->write_controller() - ->IsStopped()); - - ASSERT_OK(db_->SetOptions( - handles_[1], {{"disable_write_stall", "true"}})); - - TEST_SYNC_POINT("DBWriteTest::CompactionBeforeDisableWriteStall:AfterDisableWriteStall"); - - ASSERT_OK(dbfull()->TEST_WaitForScheduledCompaction()); - // compaction job installs super version with stale mutable_cf_options - EXPECT_FALSE(cfd->GetSuperVersion()->mutable_cf_options.disable_write_stall); - // but latest mutable_cf_options should be correctly set - EXPECT_TRUE(cfd->GetLatestMutableCFOptions()->disable_write_stall); - // and writes are not stalled! 
- EXPECT_FALSE(dbfull()->GetVersionSet()->GetColumnFamilySet()->write_controller()->IsStopped()); - - WriteOptions wopts; - wopts.no_slowdown = true; - EXPECT_OK(db_->Put(wopts, handles_[1], "k3", "v3")); - - cork->store(false); - // wait for flush to be done - ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); - - Close(); -} - INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest, testing::Values(DBTestBase::kDefault, DBTestBase::kConcurrentWALWrites, diff --git a/db/dbformat.cc b/db/dbformat.cc index 2c3581ca005e..63bb354de87a 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -88,6 +88,13 @@ void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, result->append(kTsMax.data(), ts_sz); } +void AppendUserKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + result->append(key.data(), key.size() - ts_sz); + result->append(ts_sz, static_cast(0)); +} + void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, size_t ts_sz) { assert(ts_sz > 0); @@ -101,6 +108,35 @@ void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, } } +void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + assert(ts_sz > 0); + size_t user_key_size = key.size() - kNumInternalBytes; + result->reserve(key.size() + ts_sz); + result->append(key.data(), user_key_size); + result->append(ts_sz, static_cast(0)); + result->append(key.data() + user_key_size, kNumInternalBytes); +} + +void StripTimestampFromInternalKey(std::string* result, const Slice& key, + size_t ts_sz) { + assert(key.size() >= ts_sz + kNumInternalBytes); + result->reserve(key.size() - ts_sz); + result->append(key.data(), key.size() - kNumInternalBytes - ts_sz); + result->append(key.data() + key.size() - kNumInternalBytes, + kNumInternalBytes); +} + +void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz) { + const size_t key_sz = key.size(); + assert(key_sz >= ts_sz + kNumInternalBytes); + result->reserve(key_sz); + result->append(key.data(), key_sz - kNumInternalBytes - ts_sz); + result->append(ts_sz, static_cast(0)); + result->append(key.data() + key_sz - kNumInternalBytes, kNumInternalBytes); +} + std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const { std::string result = "'"; if (log_err_key) { diff --git a/db/dbformat.h b/db/dbformat.h index d9fadea1ca9d..981866c09dc5 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -86,8 +86,10 @@ inline bool IsValueType(ValueType t) { // Checks whether a type is from user operation // kTypeRangeDeletion is in meta block so this API is separated from above +// kTypeMaxValid can be from keys generated by +// TruncatedRangeDelIterator::start_key() inline bool IsExtendedValueType(ValueType t) { - return IsValueType(t) || t == kTypeRangeDeletion; + return IsValueType(t) || t == kTypeRangeDeletion || t == kTypeMaxValid; } // We leave eight bits empty at the bottom so a type and sequence# @@ -166,46 +168,113 @@ inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq, EntryType GetEntryType(ValueType value_type); // Append the serialization of "key" to *result. -extern void AppendInternalKey(std::string* result, - const ParsedInternalKey& key); +// +// input [internal key]: +// output before: empty +// output: +void AppendInternalKey(std::string* result, const ParsedInternalKey& key); // Append the serialization of "key" to *result, replacing the original // timestamp with argument ts. 
-extern void AppendInternalKeyWithDifferentTimestamp( - std::string* result, const ParsedInternalKey& key, const Slice& ts); +// +// input [internal key]: +// output before: empty +// output after: +void AppendInternalKeyWithDifferentTimestamp(std::string* result, + const ParsedInternalKey& key, + const Slice& ts); // Serialized internal key consists of user key followed by footer. // This function appends the footer to *result, assuming that *result already // contains the user key at the end. -extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s, - ValueType t); +// +// output before: +// output after: +void AppendInternalKeyFooter(std::string* result, SequenceNumber s, + ValueType t); // Append the key and a minimal timestamp to *result -extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, - size_t ts_sz); +// +// input [user key without ts]: +// output before: empty +// output after: +void AppendKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // Append the key and a maximal timestamp to *result -extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, - size_t ts_sz); +// +// input [user key without ts]: +// output before: empty +// output after: +void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// `key` is a user key with timestamp. Append the user key without timestamp +// and the minimum timestamp to *result. +// +// input [user key]: +// output before: empty +// output after: +void AppendUserKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // `key` is a user key with timestamp. Append the user key without timestamp // and the maximal timestamp to *result. -extern void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, - size_t ts_sz); +// +// input [user key]: +// output before: empty +// output after: +void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// `key` is an internal key containing a user key without timestamp. Create a +// new key in *result by padding a min timestamp of size `ts_sz` to the user key +// and copying the remaining internal key bytes. +// +// input [internal key]: +// output before: empty +// output after: +void PadInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); + +// `key` is an internal key containing a user key with timestamp of size +// `ts_sz`. Create a new internal key in *result by stripping the timestamp from +// the user key and copying the remaining internal key bytes. +// +// input [internal key]: +// output before: empty +// output after: +void StripTimestampFromInternalKey(std::string* result, const Slice& key, + size_t ts_sz); + +// `key` is an internal key containing a user key with timestamp of size +// `ts_sz`. Create a new internal key in *result while replace the original +// timestamp with min timestamp. +// +// input [internal key]: +// output before: empty +// output after: +void ReplaceInternalKeyWithMinTimestamp(std::string* result, const Slice& key, + size_t ts_sz); // Attempt to parse an internal key from "internal_key". On success, // stores the parsed data in "*result", and returns true. // // On error, returns false, leaves "*result" in an undefined state. 
-extern Status ParseInternalKey(const Slice& internal_key, - ParsedInternalKey* result, bool log_err_key); +Status ParseInternalKey(const Slice& internal_key, ParsedInternalKey* result, + bool log_err_key); // Returns the user key portion of an internal key. +// +// input [internal key]: +// output: inline Slice ExtractUserKey(const Slice& internal_key) { assert(internal_key.size() >= kNumInternalBytes); return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes); } +// input [internal key]: +// output : inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, size_t ts_sz) { Slice ret = internal_key; @@ -213,17 +282,23 @@ inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key, return ret; } +// input [user key]: +// output: inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { Slice ret = user_key; ret.remove_suffix(ts_sz); return ret; } +// input [user key]: +// output: inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) { assert(user_key.size() >= ts_sz); return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz); } +// input [internal key]: +// output: inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) { const size_t key_size = internal_key.size(); assert(key_size >= kNumInternalBytes + ts_sz); @@ -231,12 +306,16 @@ inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) { ts_sz); } +// input [internal key]: +// output: inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) { assert(internal_key.size() >= kNumInternalBytes); const size_t n = internal_key.size(); return DecodeFixed64(internal_key.data() + n - kNumInternalBytes); } +// input [internal key]: +// output: inline ValueType ExtractValueType(const Slice& internal_key) { uint64_t num = ExtractInternalKeyFooter(internal_key); unsigned char c = num & 0xff; @@ -276,6 +355,7 @@ class InternalKeyComparator // Same as Compare except that it excludes the value type from comparison int CompareKeySeq(const Slice& a, const Slice& b) const; + int CompareKeySeq(const ParsedInternalKey& a, const Slice& b) const; const Comparator* user_comparator() const { return user_comparator_.user_comparator(); @@ -502,6 +582,62 @@ class IterKey { key_size_ = total_size; } + // A version of `TrimAppend` assuming the last bytes of length `ts_sz` in the + // user key part of `key_` are not counted towards shared bytes, and the + // decoded key needs a min timestamp of length `ts_sz` padded to the user key. + void TrimAppendWithTimestamp(const size_t shared_len, + const char* non_shared_data, + const size_t non_shared_len, + const size_t ts_sz) { + std::string kTsMin(ts_sz, static_cast(0)); + std::string key_with_ts; + std::vector key_parts_with_ts; + if (IsUserKey()) { + key_parts_with_ts = {Slice(key_, shared_len), + Slice(non_shared_data, non_shared_len), + Slice(kTsMin)}; + } else { + assert(shared_len + non_shared_len >= kNumInternalBytes); + // Invariant: shared_user_key_len + shared_internal_bytes_len = shared_len + // In naming below `*_len` variables, keyword `user_key` refers to the + // user key part of the existing key in `key_` as opposed to the new key. + // Similarly, `internal_bytes` refers to the footer part of the existing + // key. These bytes potentially will move between user key part and the + // footer part in the new key.
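+      // Worked example (added for illustration; it mirrors the second
+      // internal-key case in FormatTest.IterKeyWithTimestampOperation): with
+      // ts_sz = 8, suppose `key_` currently holds the internal key "ab" +
+      // 8-byte min timestamp + 8-byte footer, so key_size_ = 18, and the
+      // caller passes shared_len = 1 and non_shared_len = 8. Then
+      // user_key_len = 10, sharable_user_key_len = 2, shared_user_key_len = 1
+      // and shared_internal_bytes_len = 0: the new key keeps "a" from the old
+      // user key, inserts the min timestamp after it, and the 8 non-shared
+      // bytes become the new footer.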
+ const size_t user_key_len = key_size_ - kNumInternalBytes; + const size_t sharable_user_key_len = user_key_len - ts_sz; + const size_t shared_user_key_len = + std::min(shared_len, sharable_user_key_len); + const size_t shared_internal_bytes_len = shared_len - shared_user_key_len; + + // One Slice among the three Slices will get split into two Slices, plus + // a timestamp slice. + key_parts_with_ts.reserve(5); + bool ts_added = false; + // Add slice parts and find the right location to add the min timestamp. + MaybeAddKeyPartsWithTimestamp( + key_, shared_user_key_len, + shared_internal_bytes_len + non_shared_len < kNumInternalBytes, + shared_len + non_shared_len - kNumInternalBytes, kTsMin, + key_parts_with_ts, &ts_added); + MaybeAddKeyPartsWithTimestamp( + key_ + user_key_len, shared_internal_bytes_len, + non_shared_len < kNumInternalBytes, + shared_internal_bytes_len + non_shared_len - kNumInternalBytes, + kTsMin, key_parts_with_ts, &ts_added); + MaybeAddKeyPartsWithTimestamp(non_shared_data, non_shared_len, + non_shared_len >= kNumInternalBytes, + non_shared_len - kNumInternalBytes, kTsMin, + key_parts_with_ts, &ts_added); + assert(ts_added); + } + + Slice new_key(SliceParts(&key_parts_with_ts.front(), + static_cast(key_parts_with_ts.size())), + &key_with_ts); + SetKey(new_key); + } + Slice SetKey(const Slice& key, bool copy = true) { // is_user_key_ expected to be set already via SetIsUserKey return SetKeyImpl(key, copy); @@ -618,7 +754,7 @@ class IterKey { const char* key_; size_t key_size_; size_t buf_size_; - char space_[32]; // Avoid allocation for short keys + char space_[39]; // Avoid allocation for short keys bool is_user_key_; Slice SetKeyImpl(const Slice& key, bool copy) { @@ -659,6 +795,23 @@ class IterKey { } void EnlargeBuffer(size_t key_size); + + void MaybeAddKeyPartsWithTimestamp(const char* slice_data, + const size_t slice_sz, bool add_timestamp, + const size_t left_sz, + const std::string& min_timestamp, + std::vector& key_parts, + bool* ts_added) { + if (add_timestamp && !*ts_added) { + assert(slice_sz >= left_sz); + key_parts.emplace_back(slice_data, left_sz); + key_parts.emplace_back(min_timestamp); + key_parts.emplace_back(slice_data + left_sz, slice_sz - left_sz); + *ts_added = true; + } else { + key_parts.emplace_back(slice_data, slice_sz); + } + } }; // Convert from a SliceTransform of user keys, to a SliceTransform of @@ -696,8 +849,7 @@ class InternalKeySliceTransform : public SliceTransform { // Read the key of a record from a write batch. // if this record represent the default column family then cf_record // must be passed as false, otherwise it must be passed as true. -extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, - bool cf_record); +bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record); // Read record from a write batch piece from input. // tag, column_family, key, value and blob are return values. Callers own the @@ -706,9 +858,9 @@ extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, // input will be advanced to after the record. // If user-defined timestamp is enabled for a column family, then the `key` // resulting from this call will include timestamp. 
-extern Status ReadRecordFromWriteBatch(Slice* input, char* tag, - uint32_t* column_family, Slice* key, - Slice* value, Slice* blob, Slice* xid); +Status ReadRecordFromWriteBatch(Slice* input, char* tag, + uint32_t* column_family, Slice* key, + Slice* value, Slice* blob, Slice* xid); // When user call DeleteRange() to delete a range of keys, // we will store a serialized RangeTombstone in MemTable and SST. @@ -825,6 +977,26 @@ inline int InternalKeyComparator::CompareKeySeq(const Slice& akey, return r; } +inline int InternalKeyComparator::CompareKeySeq(const ParsedInternalKey& a, + const Slice& b) const { + // Order by: + // increasing user key (according to user-supplied comparator) + // decreasing sequence number + int r = user_comparator_.Compare(a.user_key, ExtractUserKey(b)); + if (r == 0) { + // Shift the number to exclude the last byte which contains the value type + const uint64_t anum = a.sequence; + const uint64_t bnum = + DecodeFixed64(b.data() + b.size() - kNumInternalBytes) >> 8; + if (anum > bnum) { + r = -1; + } else if (anum < bnum) { + r = +1; + } + } + return r; +} + inline int InternalKeyComparator::Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b, diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 8dc3387df033..3b6190d92b05 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -178,6 +178,79 @@ TEST_F(FormatTest, IterKeyOperation) { "abcdefghijklmnopqrstuvwxyz")); } +TEST_F(FormatTest, IterKeyWithTimestampOperation) { + IterKey k; + k.SetUserKey(""); + const char p[] = "abcdefghijklmnopqrstuvwxyz"; + const char q[] = "0123456789"; + + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + std::string("")); + + size_t ts_sz = 8; + std::string min_timestamp(ts_sz, static_cast(0)); + k.TrimAppendWithTimestamp(0, p, 3, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "abc" + min_timestamp); + + k.TrimAppendWithTimestamp(1, p, 3, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "aabc" + min_timestamp); + + k.TrimAppendWithTimestamp(0, p, 26, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "abcdefghijklmnopqrstuvwxyz" + min_timestamp); + + k.TrimAppendWithTimestamp(26, q, 10, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "abcdefghijklmnopqrstuvwxyz0123456789" + min_timestamp); + + k.TrimAppendWithTimestamp(36, q, 1, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "abcdefghijklmnopqrstuvwxyz01234567890" + min_timestamp); + + k.TrimAppendWithTimestamp(26, q, 1, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "abcdefghijklmnopqrstuvwxyz0" + min_timestamp); + + k.TrimAppendWithTimestamp(27, p, 26, ts_sz); + ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()), + "abcdefghijklmnopqrstuvwxyz0" + "abcdefghijklmnopqrstuvwxyz" + + min_timestamp); + // IterKey holds an internal key, the last 8 bytes hold the key footer, the + // timestamp is expected to be added before the key footer. 
+ std::string key_without_ts = "keywithoutts"; + k.SetInternalKey(key_without_ts + min_timestamp + "internal"); + + ASSERT_EQ(std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + key_without_ts + min_timestamp + "internal"); + k.TrimAppendWithTimestamp(0, p, 10, ts_sz); + ASSERT_EQ(std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + "ab" + min_timestamp + "cdefghij"); + + k.TrimAppendWithTimestamp(1, p, 8, ts_sz); + ASSERT_EQ(std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + "a" + min_timestamp + "abcdefgh"); + + k.TrimAppendWithTimestamp(9, p, 3, ts_sz); + ASSERT_EQ(std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + "aabc" + min_timestamp + "defghabc"); + + k.TrimAppendWithTimestamp(10, q, 10, ts_sz); + ASSERT_EQ(std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + "aabcdefgha01" + min_timestamp + "23456789"); + + k.TrimAppendWithTimestamp(20, q, 1, ts_sz); + ASSERT_EQ(std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + "aabcdefgha012" + min_timestamp + "34567890"); + + k.TrimAppendWithTimestamp(21, p, 26, ts_sz); + ASSERT_EQ( + std::string(k.GetInternalKey().data(), k.GetInternalKey().size()), + "aabcdefgha01234567890abcdefghijklmnopqr" + min_timestamp + "stuvwxyz"); +} + TEST_F(FormatTest, UpdateInternalKey) { std::string user_key("abcdefghijklmnopqrstuvwxyz"); uint64_t new_seq = 0x123456; @@ -204,6 +277,62 @@ TEST_F(FormatTest, RangeTombstoneSerializeEndKey) { ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0); } +TEST_F(FormatTest, PadInternalKeyWithMinTimestamp) { + std::string orig_user_key = "foo"; + std::string orig_internal_key = IKey(orig_user_key, 100, kTypeValue); + size_t ts_sz = 8; + + std::string key_buf; + PadInternalKeyWithMinTimestamp(&key_buf, orig_internal_key, ts_sz); + ParsedInternalKey key_with_timestamp; + Slice in(key_buf); + ASSERT_OK(ParseInternalKey(in, &key_with_timestamp, true /*log_err_key*/)); + + std::string min_timestamp(ts_sz, static_cast(0)); + ASSERT_EQ(orig_user_key + min_timestamp, key_with_timestamp.user_key); + ASSERT_EQ(100, key_with_timestamp.sequence); + ASSERT_EQ(kTypeValue, key_with_timestamp.type); +} + +TEST_F(FormatTest, StripTimestampFromInternalKey) { + std::string orig_user_key = "foo"; + size_t ts_sz = 8; + std::string timestamp(ts_sz, static_cast(0)); + orig_user_key.append(timestamp.data(), timestamp.size()); + std::string orig_internal_key = IKey(orig_user_key, 100, kTypeValue); + + std::string key_buf; + StripTimestampFromInternalKey(&key_buf, orig_internal_key, ts_sz); + ParsedInternalKey key_without_timestamp; + Slice in(key_buf); + ASSERT_OK(ParseInternalKey(in, &key_without_timestamp, true /*log_err_key*/)); + + ASSERT_EQ("foo", key_without_timestamp.user_key); + ASSERT_EQ(100, key_without_timestamp.sequence); + ASSERT_EQ(kTypeValue, key_without_timestamp.type); +} + +TEST_F(FormatTest, ReplaceInternalKeyWithMinTimestamp) { + std::string orig_user_key = "foo"; + size_t ts_sz = 8; + orig_user_key.append(ts_sz, static_cast(1)); + std::string orig_internal_key = IKey(orig_user_key, 100, kTypeValue); + + std::string key_buf; + ReplaceInternalKeyWithMinTimestamp(&key_buf, orig_internal_key, ts_sz); + ParsedInternalKey new_key; + Slice in(key_buf); + ASSERT_OK(ParseInternalKey(in, &new_key, true /*log_err_key*/)); + + std::string min_timestamp(ts_sz, static_cast(0)); + size_t ukey_diff_offset = new_key.user_key.difference_offset(orig_user_key); + ASSERT_EQ(min_timestamp, + Slice(new_key.user_key.data() + ukey_diff_offset, ts_sz)); + 
ASSERT_EQ(orig_user_key.size(), new_key.user_key.size()); + ASSERT_EQ(100, new_key.sequence); + ASSERT_EQ(kTypeValue, new_key.type); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 34925e828b7a..b6d4f559e2ec 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #include @@ -576,6 +575,7 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { ASSERT_OK(itr->status()); ++count; } + ASSERT_OK(itr->status()); ASSERT_EQ(count, 1000); } @@ -589,6 +589,7 @@ TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) { ASSERT_OK(itr->status()); ++count; } + ASSERT_OK(itr->status()); ASSERT_EQ(count, 1000); } } @@ -602,13 +603,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/error_handler.cc b/db/error_handler.cc index 7f68bb026c27..f4326100182f 100644 --- a/db/error_handler.cc +++ b/db/error_handler.cc @@ -228,7 +228,6 @@ std::map, Status::Severity> }; void ErrorHandler::CancelErrorRecovery() { -#ifndef ROCKSDB_LITE db_mutex_->AssertHeld(); // We'll release the lock before calling sfm, so make sure no new @@ -249,7 +248,6 @@ void ErrorHandler::CancelErrorRecovery() { // If auto recovery is also runing to resume from the retryable error, // we should wait and end the auto recovery. EndAutoRecovery(); -#endif } STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()}; @@ -281,6 +279,7 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, if (bg_error_stats_ != nullptr) { RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED); } ROCKS_LOG_INFO(db_options_.info_log, "ErrorHandler: Set regular background error\n"); @@ -321,7 +320,7 @@ const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err, // Check if recovery is currently in progress. 
If it is, we will save this // error so we can check it at the end to see if recovery succeeded or not if (recovery_in_prog_ && recovery_error_.ok()) { - recovery_error_ = new_bg_err; + recovery_error_ = status_to_io_status(Status(new_bg_err)); } bool auto_recovery = auto_recovery_; @@ -397,16 +396,6 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s", bg_io_err.ToString().c_str()); - if (recovery_in_prog_ && recovery_io_error_.ok()) { - recovery_io_error_ = bg_io_err; - } - if (BackgroundErrorReason::kManifestWrite == reason || - BackgroundErrorReason::kManifestWriteNoWAL == reason) { - // Always returns ok - ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions"); - db_->DisableFileDeletionsWithLock().PermitUncheckedError(); - } - Status new_bg_io_err = bg_io_err; DBRecoverContext context; if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile && @@ -418,7 +407,11 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, CheckAndSetRecoveryAndBGError(bg_err); if (bg_error_stats_ != nullptr) { RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED); RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED); } ROCKS_LOG_INFO( db_options_.info_log, @@ -445,9 +438,15 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, &auto_recovery); if (bg_error_stats_ != nullptr) { RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED); RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED); RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED); } ROCKS_LOG_INFO(db_options_.info_log, "ErrorHandler: Set background retryable IO error\n"); @@ -463,6 +462,8 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, ROCKS_LOG_INFO( db_options_.info_log, "ErrorHandler: Compaction will schedule by itself to resume\n"); + // Not used in this code path. + new_bg_io_err.PermitUncheckedError(); return bg_error_; } else if (BackgroundErrorReason::kFlushNoWAL == reason || BackgroundErrorReason::kManifestWriteNoWAL == reason) { @@ -488,18 +489,40 @@ const Status& ErrorHandler::SetBGError(const Status& bg_status, } else { if (bg_error_stats_ != nullptr) { RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT); + RecordTick(bg_error_stats_.get(), + ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED); } - // HandleKnownErrors() will use recovery_error_, so ignore - // recovery_io_error_. - // TODO: Do some refactoring and use only one recovery_error_ - recovery_io_error_.PermitUncheckedError(); return HandleKnownErrors(new_bg_io_err, reason); } } +void ErrorHandler::AddFilesToQuarantine( + autovector*> files_to_quarantine) { + db_mutex_->AssertHeld(); + std::ostringstream quarantine_files_oss; + bool is_first_one = true; + for (const auto* files : files_to_quarantine) { + assert(files); + for (uint64_t file_number : *files) { + files_to_quarantine_.push_back(file_number); + quarantine_files_oss << (is_first_one ? 
"" : ", ") << file_number; + is_first_one = false; + } + } + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: added file numbers %s to quarantine.\n", + quarantine_files_oss.str().c_str()); +} + +void ErrorHandler::ClearFilesToQuarantine() { + db_mutex_->AssertHeld(); + files_to_quarantine_.clear(); + ROCKS_LOG_INFO(db_options_.info_log, + "ErrorHandler: cleared files in quarantine.\n"); +} + Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery) { -#ifndef ROCKSDB_LITE if (bg_error.severity() >= Status::Severity::kFatalError) { return bg_error; } @@ -528,14 +551,9 @@ Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error, } return bg_error; -#else - (void)auto_recovery; - return Status(bg_error, Status::Severity::kFatalError); -#endif } void ErrorHandler::RecoverFromNoSpace() { -#ifndef ROCKSDB_LITE SstFileManagerImpl* sfm = reinterpret_cast(db_options_.sst_file_manager.get()); @@ -543,36 +561,32 @@ void ErrorHandler::RecoverFromNoSpace() { if (sfm) { sfm->StartErrorRecovery(this, bg_error_); } -#endif } Status ErrorHandler::ClearBGError() { -#ifndef ROCKSDB_LITE db_mutex_->AssertHeld(); // Signal that recovery succeeded if (recovery_error_.ok()) { + assert(files_to_quarantine_.empty()); Status old_bg_error = bg_error_; // old_bg_error is only for notifying listeners, so may not be checked old_bg_error.PermitUncheckedError(); // Clear and check the recovery IO and BG error + is_db_stopped_.store(false, std::memory_order_release); bg_error_ = Status::OK(); - recovery_io_error_ = IOStatus::OK(); + recovery_error_ = IOStatus::OK(); bg_error_.PermitUncheckedError(); - recovery_io_error_.PermitUncheckedError(); + recovery_error_.PermitUncheckedError(); recovery_in_prog_ = false; soft_error_no_bg_work_ = false; EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error, bg_error_, db_mutex_); } return recovery_error_; -#else - return bg_error_; -#endif } Status ErrorHandler::RecoverFromBGError(bool is_manual) { -#ifndef ROCKSDB_LITE InstrumentedMutexLock l(db_mutex_); bool no_bg_work_original_flag = soft_error_no_bg_work_; if (is_manual) { @@ -601,14 +615,14 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { if (bg_error_.severity() == Status::Severity::kSoftError && recover_context_.flush_reason == FlushReason::kErrorRecovery) { // Simply clear the background error and return - recovery_error_ = Status::OK(); + recovery_error_ = IOStatus::OK(); return ClearBGError(); } // Reset recovery_error_. We will use this to record any errors that happen // during the recovery process. While recovering, the only operations that // can generate background errors should be the flush operations - recovery_error_ = Status::OK(); + recovery_error_ = IOStatus::OK(); recovery_error_.PermitUncheckedError(); Status s = db_->ResumeImpl(recover_context_); if (s.ok()) { @@ -625,15 +639,10 @@ Status ErrorHandler::RecoverFromBGError(bool is_manual) { recovery_in_prog_ = false; } return s; -#else - (void)is_manual; - return bg_error_; -#endif } const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( const IOStatus& io_error) { -#ifndef ROCKSDB_LITE db_mutex_->AssertHeld(); if (bg_error_.ok()) { return bg_error_; @@ -642,6 +651,13 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) { // Auto resume BG error is not enabled, directly return bg_error_. 
return bg_error_; + } else if (end_recovery_) { + // Can temporarily release db mutex + EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, + Status::ShutdownInProgress(), + db_mutex_); + db_mutex_->AssertHeld(); + return bg_error_; } if (bg_error_stats_ != nullptr) { RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT); @@ -649,43 +665,53 @@ const Status& ErrorHandler::StartRecoverFromRetryableBGIOError( ROCKS_LOG_INFO( db_options_.info_log, "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n"); + // Needs to be set in the same lock hold as setting BG error, otherwise + // intervening writes could see a BG error without a recovery and bail out. + recovery_in_prog_ = true; + if (recovery_thread_) { + // Ensure only one thread can execute the join(). + std::unique_ptr old_recovery_thread( + std::move(recovery_thread_)); // In this case, if recovery_in_prog_ is false, current thread should // wait the previous recover thread to finish and create a new thread // to recover from the bg error. db_mutex_->Unlock(); - recovery_thread_->join(); + TEST_SYNC_POINT( + "StartRecoverFromRetryableBGIOError:BeforeWaitingForOtherThread"); + old_recovery_thread->join(); + TEST_SYNC_POINT( + "StartRecoverFromRetryableBGIOError:AfterWaitingForOtherThread"); db_mutex_->Lock(); } - recovery_in_prog_ = true; recovery_thread_.reset( new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this)); - if (recovery_io_error_.ok() && recovery_error_.ok()) { + if (recovery_error_.ok()) { return recovery_error_; } else { return bg_error_; } -#else - (void)io_error; - return bg_error_; -#endif } // Automatic recover from Retryable BG IO error. Must be called after db // mutex is released. void ErrorHandler::RecoverFromRetryableBGIOError() { -#ifndef ROCKSDB_LITE + assert(recovery_in_prog_); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart"); + TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart2"); InstrumentedMutexLock l(db_mutex_); if (end_recovery_) { EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, Status::ShutdownInProgress(), db_mutex_); + + recovery_in_prog_ = false; return; } DBRecoverContext context = recover_context_; + context.flush_after_recovery = true; int resume_count = db_options_.max_bgerror_resume_count; uint64_t wait_interval = db_options_.bgerror_resume_retry_interval; uint64_t retry_count = 0; @@ -695,12 +721,12 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_, Status::ShutdownInProgress(), db_mutex_); + recovery_in_prog_ = false; return; } TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0"); TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1"); - recovery_io_error_ = IOStatus::OK(); - recovery_error_ = Status::OK(); + recovery_error_ = IOStatus::OK(); retry_count++; Status s = db_->ResumeImpl(context); if (bg_error_stats_ != nullptr) { @@ -720,9 +746,9 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { bg_error_, db_mutex_); return; } - if (!recovery_io_error_.ok() && + if (!recovery_error_.ok() && recovery_error_.severity() <= Status::Severity::kHardError && - recovery_io_error_.GetRetryable()) { + recovery_error_.GetRetryable()) { // If new BG IO error happens during auto recovery and it is retryable // and its severity is Hard Error or lower, the auto resmue sleep for // a period of time and redo auto resume if it is allowed. 
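The retry loop above is driven by two existing DB options, `max_bgerror_resume_count` and `bgerror_resume_retry_interval`. As a rough sketch of how an application opts into this auto-resume behavior (the function name, path, and concrete values below are illustrative, not part of the patch):

    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    rocksdb::Status OpenWithAutoResume(const std::string& path, rocksdb::DB** db) {
      rocksdb::Options options;
      options.create_if_missing = true;
      // Attempt automatic recovery from retryable background IO errors up to
      // 5 times, waiting 1 second (the interval is in microseconds) between
      // attempts.
      options.max_bgerror_resume_count = 5;
      options.bgerror_resume_retry_interval = 1000000;
      return rocksdb::DB::Open(options, path, db);
    }

`DB::Resume()` remains available for a manual recovery attempt once the automatic retries are exhausted, which several of the tests below exercise.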
@@ -731,33 +757,23 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { int64_t wait_until = db_options_.clock->NowMicros() + wait_interval; cv_.TimedWait(wait_until); } else { - // There are three possibility: 1) recover_io_error is set during resume + // There are three possibility: 1) recovery_error_ is set during resume // and the error is not retryable, 2) recover is successful, 3) other // error happens during resume and cannot be resumed here. - if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) { + if (recovery_error_.ok() && s.ok()) { // recover from the retryable IO error and no other BG errors. Clean // the bg_error and notify user. TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess"); - Status old_bg_error = bg_error_; - is_db_stopped_.store(false, std::memory_order_release); - bg_error_ = Status::OK(); - bg_error_.PermitUncheckedError(); - EventHelpers::NotifyOnErrorRecoveryEnd( - db_options_.listeners, old_bg_error, bg_error_, db_mutex_); if (bg_error_stats_ != nullptr) { RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT); RecordInHistogram(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); } - recovery_in_prog_ = false; - if (soft_error_no_bg_work_) { - soft_error_no_bg_work_ = false; - } return; } else { - // In this case: 1) recovery_io_error is more serious or not retryable - // 2) other Non IO recovery_error happens. The auto recovery stops. + // In this case: 1) recovery_error_ is more serious or not retryable + // 2) other error happens. The auto recovery stops. recovery_in_prog_ = false; if (bg_error_stats_ != nullptr) { RecordInHistogram(bg_error_stats_.get(), @@ -765,10 +781,7 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { } EventHelpers::NotifyOnErrorRecoveryEnd( db_options_.listeners, bg_error_, - !recovery_io_error_.ok() - ? recovery_io_error_ - : (!recovery_error_.ok() ? recovery_error_ : s), - db_mutex_); + !recovery_error_.ok() ? recovery_error_ : s, db_mutex_); return; } } @@ -784,14 +797,11 @@ void ErrorHandler::RecoverFromRetryableBGIOError() { ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count); } return; -#else - return; -#endif } void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) { if (recovery_in_prog_ && recovery_error_.ok()) { - recovery_error_ = bg_err; + recovery_error_ = status_to_io_status(Status(bg_err)); } if (bg_err.severity() > bg_error_.severity()) { bg_error_ = bg_err; @@ -807,12 +817,16 @@ void ErrorHandler::EndAutoRecovery() { if (!end_recovery_) { end_recovery_ = true; } - cv_.SignalAll(); - db_mutex_->Unlock(); if (recovery_thread_) { - recovery_thread_->join(); + // Ensure only one thread can execute the join(). + std::unique_ptr old_recovery_thread( + std::move(recovery_thread_)); + db_mutex_->Unlock(); + cv_.SignalAll(); + old_recovery_thread->join(); + db_mutex_->Lock(); } - db_mutex_->Lock(); + TEST_SYNC_POINT("PostEndAutoRecovery"); return; } diff --git a/db/error_handler.h b/db/error_handler.h index 34e08a525d78..1168d91fa874 100644 --- a/db/error_handler.h +++ b/db/error_handler.h @@ -4,11 +4,14 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once +#include + #include "monitoring/instrumented_mutex.h" #include "options/db_options.h" #include "rocksdb/io_status.h" #include "rocksdb/listener.h" #include "rocksdb/status.h" +#include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -19,10 +22,13 @@ class DBImpl; // FlushReason, which tells the flush job why this flush is called. 
struct DBRecoverContext { FlushReason flush_reason; + bool flush_after_recovery; - DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {} - - DBRecoverContext(FlushReason reason) : flush_reason(reason) {} + DBRecoverContext() + : flush_reason(FlushReason::kErrorRecovery), + flush_after_recovery(false) {} + DBRecoverContext(FlushReason reason) + : flush_reason(reason), flush_after_recovery(false) {} }; class ErrorHandler { @@ -43,7 +49,6 @@ class ErrorHandler { // Clear the checked flag for uninitialized errors bg_error_.PermitUncheckedError(); recovery_error_.PermitUncheckedError(); - recovery_io_error_.PermitUncheckedError(); } void EnableAutoRecovery() { auto_recovery_ = true; } @@ -78,16 +83,23 @@ class ErrorHandler { void EndAutoRecovery(); + void AddFilesToQuarantine( + autovector*> files_to_quarantine); + + const autovector& GetFilesToQuarantine() const { + db_mutex_->AssertHeld(); + return files_to_quarantine_; + } + + void ClearFilesToQuarantine(); + private: DBImpl* db_; const ImmutableDBOptions& db_options_; Status bg_error_; // A separate Status variable used to record any errors during the // recovery process from hard errors - Status recovery_error_; - // A separate IO Status variable used to record any IO errors during - // the recovery process. At the same time, recovery_error_ is also set. - IOStatus recovery_io_error_; + IOStatus recovery_error_; // The condition variable used with db_mutex during auto resume for time // wait. InstrumentedCondVar cv_; @@ -109,6 +121,13 @@ class ErrorHandler { // The pointer of DB statistics. std::shared_ptr bg_error_stats_; + // During recovery from manifest IO errors, files whose VersionEdits entries + // could be in an ambiguous state are quarantined and file deletion refrain + // from deleting them. Successful recovery will clear this vector. Files are + // added to this vector while DB mutex was locked, this data structure is + // unsorted. + autovector files_to_quarantine_; + const Status& HandleKnownErrors(const Status& bg_err, BackgroundErrorReason reason); Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery); diff --git a/db/error_handler_fs_test.cc b/db/error_handler_fs_test.cc index 153f3b79ef20..2d33a7a694f6 100644 --- a/db/error_handler_fs_test.cc +++ b/db/error_handler_fs_test.cc @@ -6,16 +6,16 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
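The quarantine bookkeeping added to `ErrorHandler` above amounts to a simple pattern: while recovery from a manifest IO error is pending, file numbers whose VersionEdit state is ambiguous are parked in a list that the file-deletion path must consult, and the list is cleared once recovery succeeds. A standalone sketch of that pattern (names are illustrative; this is not the RocksDB implementation):

    #include <cstdint>
    #include <unordered_set>
    #include <vector>

    class QuarantineTracker {
     public:
      void Add(const std::vector<uint64_t>& file_numbers) {
        quarantined_.insert(file_numbers.begin(), file_numbers.end());
      }
      // Called once recovery has succeeded and the files' state is unambiguous.
      void Clear() { quarantined_.clear(); }
      // The deletion path skips any file number that is still quarantined.
      bool CanDelete(uint64_t file_number) const {
        return quarantined_.count(file_number) == 0;
      }

     private:
      std::unordered_set<uint64_t> quarantined_;
    };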
-#ifndef ROCKSDB_LITE + +#include #include "db/db_test_util.h" #include "file/sst_file_manager_impl.h" #include "port/stack_trace.h" #include "rocksdb/io_status.h" #include "rocksdb/sst_file_manager.h" -#if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" -#endif +#include "test_util/testharness.h" #include "util/random.h" #include "utilities/fault_injection_env.h" #include "utilities/fault_injection_fs.h" @@ -155,9 +155,9 @@ class ErrorHandlerFSListener : public EventListener { FaultInjectionTestFS* fault_fs_; }; -TEST_F(DBErrorHandlingFSTest, FLushWriteError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWriteError) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -200,9 +200,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteError) { // All the NoSpace IOError will be handled as the regular BG Error no matter the // retryable flag is set of not. So the auto resume for retryable IO Error will // not be triggered. Also, it is mapped as hard error. -TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWriteNoSpaceError) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -244,9 +244,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWriteRetryableError) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -318,9 +318,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWriteFileScopeError) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -402,9 +402,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWALWriteRetryableError) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -446,9 +446,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWALAtomicWriteRetryableError) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -492,9 +492,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) { } // The flush error is injected before we finish the table build -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, 
FlushWritNoWALRetryableError1) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -548,9 +548,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) { } // The retryable IO error is injected before we sync table -TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWriteNoWALRetryableError2) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -592,9 +592,9 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) { } // The retryable IO error is injected before we close the table file -TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_F(DBErrorHandlingFSTest, FlushWriteNoWALRetryableError3) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -636,8 +636,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -661,6 +661,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); fault_fs_->SetFilesystemActive(true); @@ -669,6 +670,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -677,8 +679,8 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -704,6 +706,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); fault_fs_->SetFilesystemActive(true); @@ -712,6 +715,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -720,8 +724,8 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = 
fault_env_.get(); options.create_if_missing = true; @@ -749,6 +753,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -758,6 +763,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -766,8 +772,8 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) { } TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -795,6 +801,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); fault_fs_->SetFilesystemActive(true); @@ -803,6 +810,7 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -811,8 +819,8 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) { } TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -836,11 +844,13 @@ TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); fault_fs_->SetFilesystemActive(true); // This Resume() will attempt to create a new manifest file and fail again s = dbfull()->Resume(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); fault_fs_->SetFilesystemActive(true); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -851,6 +861,7 @@ TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -863,8 +874,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); return; } - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -917,6 +928,7 @@ 
TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); TEST_SYNC_POINT("CompactionManifestWriteError:1"); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); TEST_SYNC_POINT("CompactionManifestWriteError:2"); s = dbfull()->TEST_WaitForCompact(); @@ -925,6 +937,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); ASSERT_EQ("val", Get(Key(1))); @@ -933,8 +946,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) { } TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -983,6 +996,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { ASSERT_OK(s); TEST_SYNC_POINT("CompactionManifestWriteError:0"); + ASSERT_FALSE(dbfull()->TEST_GetFilesToQuarantine().empty()); TEST_SYNC_POINT("CompactionManifestWriteError:1"); s = dbfull()->TEST_WaitForCompact(); @@ -996,6 +1010,7 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { new_manifest = GetManifestNameFromLiveFiles(); ASSERT_NE(new_manifest, old_manifest); + ASSERT_TRUE(dbfull()->TEST_GetFilesToQuarantine().empty()); Reopen(options); ASSERT_EQ("val", Get(Key(0))); @@ -1005,8 +1020,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1047,8 +1062,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteError) { } TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1094,8 +1109,8 @@ TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) { } TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1186,8 +1201,8 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); return; } - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1231,8 +1246,8 @@ TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) { } TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1260,8 +1275,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { 
ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); return; } - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1333,8 +1348,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteError) { } TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1428,8 +1443,8 @@ TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) { ROCKSDB_GTEST_SKIP("Test requires non-mock environment"); return; } - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1581,7 +1596,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { } for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(true); + Status s = static_cast(db[i])->TEST_WaitForCompact(); ASSERT_EQ(s.severity(), Status::Severity::kSoftError); fault_fs[i]->SetFilesystemActive(true); } @@ -1590,7 +1605,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) { for (auto i = 0; i < kNumDbInstances; ++i) { std::string prop; ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); - ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact()); EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + std::to_string(0), &prop)); EXPECT_EQ(atoi(prop.c_str()), 0); @@ -1704,7 +1719,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { } for (auto i = 0; i < kNumDbInstances; ++i) { - Status s = static_cast(db[i])->TEST_WaitForCompact(true); + Status s = static_cast(db[i])->TEST_WaitForCompact(); switch (i) { case 0: ASSERT_EQ(s.severity(), Status::Severity::kSoftError); @@ -1726,7 +1741,7 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true); } if (i == 1) { - ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact(true)); + ASSERT_OK(static_cast(db[i])->TEST_WaitForCompact()); } EXPECT_TRUE(db[i]->GetProperty( "rocksdb.num-files-at-level" + std::to_string(0), &prop)); @@ -1760,10 +1775,10 @@ TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) { // to soft error and trigger auto resume. During auto resume, SwitchMemtable // is disabled to avoid small SST tables. Write can still be applied before // the bg error is cleaned unless the memtable is full. 
-TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { +TEST_F(DBErrorHandlingFSTest, FlushWritNoWALRetryableErrorAutoRecover1) { // Activate the FS before the first resume - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1784,7 +1799,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { ASSERT_OK(Put(Key(1), "val1", wo)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"RecoverFromRetryableBGIOError:LoopOut", - "FLushWritNoWALRetryableeErrorAutoRecover1:1"}}); + "FlushWritNoWALRetryableeErrorAutoRecover1:1"}}); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); @@ -1793,7 +1808,7 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { s = Flush(); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); - TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1"); + TEST_SYNC_POINT("FlushWritNoWALRetryableeErrorAutoRecover1:1"); ASSERT_EQ("val1", Get(Key(1))); ASSERT_EQ("val1", Get(Key(1))); SyncPoint::GetInstance()->DisableProcessing(); @@ -1830,10 +1845,94 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) { Destroy(options); } -TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) { +TEST_F(DBErrorHandlingFSTest, MultipleRecoveryThreads) { + // This test creates a scenario where second write's recovery can get started + // while mutex is released for a short period during + // NotifyOnErrorRecoveryEnd() from the first write's recovery. This is to make + // sure RecoverFromRetryableBGIOError() from the second write's recovery + // thread does not start with recovery_in_prog_ = false; + + std::shared_ptr listener = + std::make_shared(); + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + options.listeners.emplace_back(listener); + options.max_bgerror_resume_count = 100; + options.bgerror_resume_retry_interval = 1000000; // 1 second + options.statistics = CreateDBStatistics(); + + listener->EnableAutoRecovery(false); + DestroyAndReopen(options); + + IOStatus error_msg = IOStatus::IOError("Retryable IO Error"); + error_msg.SetRetryable(true); + + WriteOptions wo = WriteOptions(); + wo.disableWAL = true; + fault_fs_->SetFilesystemActive(false, error_msg); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"NotifyOnErrorRecoveryEnd:MutexUnlocked:1", + "MultipleRecoveryThreads:1"}, + {"MultipleRecoveryThreads:2", + "NotifyOnErrorRecoveryEnd:MutexUnlocked:2"}, + {"StartRecoverFromRetryableBGIOError:BeforeWaitingForOtherThread", + "MultipleRecoveryThreads:3"}, + {"RecoverFromRetryableBGIOError:RecoverSuccess", + "MultipleRecoveryThreads:4"}, + {"MultipleRecoveryThreads:4", + "StartRecoverFromRetryableBGIOError:AfterWaitingForOtherThread"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + // First write with read fault injected and recovery will start + { + ASSERT_OK(Put(Key(1), "val1", wo)); + Status s = Flush(); + ASSERT_NOK(s); + } + // Remove read fault injection so that first recovery can go through + fault_fs_->SetFilesystemActive(true); + + // At this point, first recovery is now at NotifyOnErrorRecoveryEnd. Mutex is + // released. 
+ TEST_SYNC_POINT("MultipleRecoveryThreads:1"); + + ROCKSDB_NAMESPACE::port::Thread second_write([&] { + // Second write with read fault injected + fault_fs_->SetFilesystemActive(false, error_msg); + ASSERT_OK(Put(Key(2), "val2", wo)); + Status s = Flush(); + ASSERT_NOK(s); + }); + // Second bg thread before waiting for the first thread's recovery thread + TEST_SYNC_POINT("MultipleRecoveryThreads:3"); + // First thread's recovery thread continues + TEST_SYNC_POINT("MultipleRecoveryThreads:2"); + // Wait for the first thread's recovery to finish + // (this sets recovery_in_prog_ = false) + // And second thread continues and starts recovery thread + TEST_SYNC_POINT("MultipleRecoveryThreads:4"); + second_write.join(); + // Remove error injection so that second thread recovery can go through + fault_fs_->SetFilesystemActive(true); + + // Set up sync point so that we can wait for the recovery thread to finish + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RecoverFromRetryableBGIOError:RecoverSuccess", + "MultipleRecoveryThreads:6"}}); + + // Wait for the second thread's recovery to be done + TEST_SYNC_POINT("MultipleRecoveryThreads:6"); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + +TEST_F(DBErrorHandlingFSTest, FlushWritNoWALRetryableErrorAutoRecover2) { // Activate the FS before the first resume - std::shared_ptr<ErrorHandlerFSListener> listener( - new ErrorHandlerFSListener()); + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1891,10 +1990,10 @@ TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) { // Auto resume from the flush retryable IO error. Activate the FS before the // first resume. Resume is successful -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { +TEST_F(DBErrorHandlingFSTest, FlushWritRetryableErrorAutoRecover1) { // Activate the FS before the first resume - std::shared_ptr<ErrorHandlerFSListener> listener( - new ErrorHandlerFSListener()); + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1933,10 +2032,10 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) { // Auto resume from the flush retryable IO error and set the retry limit count.
// Never activate the FS and auto resume should fail at the end -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { +TEST_F(DBErrorHandlingFSTest, FlushWritRetryableErrorAutoRecover2) { // Fail all the resume and let user to resume - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -1953,18 +2052,18 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { ASSERT_OK(Put(Key(1), "val1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"FLushWritRetryableeErrorAutoRecover2:0", + {{"FlushWritRetryableeErrorAutoRecover2:0", "RecoverFromRetryableBGIOError:BeforeStart"}, {"RecoverFromRetryableBGIOError:LoopOut", - "FLushWritRetryableeErrorAutoRecover2:1"}}); + "FlushWritRetryableeErrorAutoRecover2:1"}}); SyncPoint::GetInstance()->SetCallBack( "BuildTable:BeforeFinishBuildTable", [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); }); SyncPoint::GetInstance()->EnableProcessing(); s = Flush(); ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0"); - TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1"); + TEST_SYNC_POINT("FlushWritRetryableeErrorAutoRecover2:0"); + TEST_SYNC_POINT("FlushWritRetryableeErrorAutoRecover2:1"); fault_fs_->SetFilesystemActive(true); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->DisableProcessing(); @@ -1986,8 +2085,8 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) { // Fail the first resume and let the second resume be successful. TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { // Fail the first resume and let the second resume be successful - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2039,8 +2138,8 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) { TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { // Fail the first resume and let the second resume be successful - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2094,8 +2193,8 @@ TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) { TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableErrorAutoRecover) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2184,8 +2283,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { // compaction, the FS is set to active and compaction is successful, so // the test will hit the CompactionJob::FinishCompactionOutputFile1 sync // point. 
- std::shared_ptr<ErrorHandlerFSListener> listener( - new ErrorHandlerFSListener()); + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2238,8 +2337,8 @@ TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) { } TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { - std::shared_ptr<ErrorHandlerFSListener> listener( - new ErrorHandlerFSListener()); + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2341,8 +2440,8 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) { TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { // Fail the first recover and try second time. - std::shared_ptr<ErrorHandlerFSListener> listener( - new ErrorHandlerFSListener()); + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2442,10 +2541,10 @@ TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) { // Fail auto resume from a flush retryable error and verify that // OnErrorRecoveryEnd listener callback is called -TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) { +TEST_F(DBErrorHandlingFSTest, FlushWritRetryableErrorAbortRecovery) { // Activate the FS before the first resume - std::shared_ptr<ErrorHandlerFSListener> listener( - new ErrorHandlerFSListener()); + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2476,6 +2575,59 @@ TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) { Destroy(options); } +TEST_F(DBErrorHandlingFSTest, FlushErrorRecoveryRaceWithDBDestruction) { + Options options = GetDefaultOptions(); + options.env = fault_env_.get(); + options.create_if_missing = true; + std::shared_ptr<ErrorHandlerFSListener> listener = + std::make_shared<ErrorHandlerFSListener>(); + options.listeners.emplace_back(listener); + DestroyAndReopen(options); + ASSERT_OK(Put("k1", "val")); + + // Inject retryable flush error + bool error_set = false; + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeOutputValidation", [&](void*) { + if (error_set) { + return; + } + IOStatus st = IOStatus::IOError("Injected"); + st.SetRetryable(true); + fault_fs_->SetFilesystemActive(false, st); + error_set = true; + }); + + port::Thread db_close_thread; + SyncPoint::GetInstance()->SetCallBack( + "BuildTable:BeforeDeleteFile", [&](void*) { + // Clear retryable flush error injection + fault_fs_->SetFilesystemActive(true); + + // Coerce race between ending auto recovery in db destruction and flush + // error recovery + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"PostEndAutoRecovery", "FlushJob::WriteLevel0Table"}}); + db_close_thread = port::Thread([&] { Close(); }); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = Flush(); + ASSERT_NOK(s); + + int placeholder = 1; + listener->WaitForRecovery(placeholder); + ASSERT_TRUE(listener->new_bg_error().IsShutdownInProgress()); + + // Prior to the fix, the db close would crash because the recovery thread for + // the flush error had not been joined by the time of destruction.
+ db_close_thread.join(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Destroy(options); +} + TEST_F(DBErrorHandlingFSTest, FlushReadError) { std::shared_ptr listener = std::make_shared(); @@ -2568,8 +2720,8 @@ TEST_F(DBErrorHandlingFSTest, AtomicFlushReadError) { s = dbfull()->TEST_GetBGError(); ASSERT_OK(s); - TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, - GetDefaultOptions()); + ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, + GetDefaultOptions())); ASSERT_EQ("val", Get(Key(0))); } @@ -2609,8 +2761,8 @@ TEST_F(DBErrorHandlingFSTest, AtomicFlushNoSpaceError) { s = dbfull()->TEST_GetBGError(); ASSERT_OK(s); - TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, - GetDefaultOptions()); + ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, + GetDefaultOptions())); ASSERT_EQ("val", Get(Key(0))); } @@ -2683,9 +2835,9 @@ TEST_F(DBErrorHandlingFSTest, CompactionReadRetryableErrorAutoRecover) { class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest, public testing::WithParamInterface {}; -TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); +TEST_P(DBErrorHandlingFencingTest, FlushWriteFenced) { + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2712,8 +2864,8 @@ TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) { } TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2747,8 +2899,8 @@ TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) { } TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2788,8 +2940,8 @@ TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) { } TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) { - std::shared_ptr listener( - new ErrorHandlerFSListener()); + std::shared_ptr listener = + std::make_shared(); Options options = GetDefaultOptions(); options.env = fault_env_.get(); options.create_if_missing = true; @@ -2863,13 +3015,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 7987b8ec6a5a..65f6a5a48612 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -10,13 +10,11 @@ #include "rocksdb/utilities/customizable_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE Status EventListener::CreateFromString(const ConfigOptions& config_options, const std::string& id, std::shared_ptr* result) { - return LoadSharedObject(config_options, id, nullptr, result); + return LoadSharedObject(config_options, id, result); } -#endif // ROCKSDB_LITE namespace { template @@ -32,7 +30,6 @@ void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) { .count(); } -#ifndef ROCKSDB_LITE void 
EventHelpers::NotifyTableFileCreationStarted( const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, @@ -50,13 +47,11 @@ void EventHelpers::NotifyTableFileCreationStarted( listener->OnTableFileCreationStarted(info); } } -#endif // !ROCKSDB_LITE void EventHelpers::NotifyOnBackgroundError( const std::vector>& listeners, BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex, bool* auto_recovery) { -#ifndef ROCKSDB_LITE if (listeners.empty()) { return; } @@ -71,13 +66,6 @@ void EventHelpers::NotifyOnBackgroundError( } } db_mutex->Lock(); -#else - (void)listeners; - (void)reason; - (void)bg_error; - (void)db_mutex; - (void)auto_recovery; -#endif // ROCKSDB_LITE } void EventHelpers::LogAndNotifyTableFileCreationFinished( @@ -134,6 +122,8 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( << "column_family_name" << table_properties.column_family_name << "column_family_id" << table_properties.column_family_id << "comparator" << table_properties.comparator_name + << "user_defined_timestamps_persisted" + << table_properties.user_defined_timestamps_persisted << "merge_operator" << table_properties.merge_operator_name << "prefix_extractor_name" << table_properties.prefix_extractor_name << "property_collectors" @@ -179,7 +169,6 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( event_logger->Log(jwriter); } -#ifndef ROCKSDB_LITE if (listeners.empty()) { return; } @@ -198,13 +187,6 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( listener->OnTableFileCreated(info); } info.status.PermitUncheckedError(); -#else - (void)listeners; - (void)db_name; - (void)cf_name; - (void)file_path; - (void)reason; -#endif // !ROCKSDB_LITE } void EventHelpers::LogAndNotifyTableFileDeletion( @@ -226,7 +208,6 @@ void EventHelpers::LogAndNotifyTableFileDeletion( event_logger->Log(jwriter); -#ifndef ROCKSDB_LITE if (listeners.empty()) { return; } @@ -239,22 +220,18 @@ void EventHelpers::LogAndNotifyTableFileDeletion( listener->OnTableFileDeleted(info); } info.status.PermitUncheckedError(); -#else - (void)file_path; - (void)dbname; - (void)listeners; -#endif // !ROCKSDB_LITE } void EventHelpers::NotifyOnErrorRecoveryEnd( const std::vector>& listeners, const Status& old_bg_error, const Status& new_bg_error, InstrumentedMutex* db_mutex) { -#ifndef ROCKSDB_LITE if (!listeners.empty()) { db_mutex->AssertHeld(); // release lock while notifying events db_mutex->Unlock(); + TEST_SYNC_POINT("NotifyOnErrorRecoveryEnd:MutexUnlocked:1"); + TEST_SYNC_POINT("NotifyOnErrorRecoveryEnd:MutexUnlocked:2"); for (auto& listener : listeners) { BackgroundErrorRecoveryInfo info; info.old_bg_error = old_bg_error; @@ -265,16 +242,11 @@ void EventHelpers::NotifyOnErrorRecoveryEnd( info.new_bg_error.PermitUncheckedError(); } db_mutex->Lock(); + } else { + old_bg_error.PermitUncheckedError(); } -#else - (void)listeners; - (void)old_bg_error; - (void)new_bg_error; - (void)db_mutex; -#endif // ROCKSDB_LITE } -#ifndef ROCKSDB_LITE void EventHelpers::NotifyBlobFileCreationStarted( const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, @@ -289,7 +261,6 @@ void EventHelpers::NotifyBlobFileCreationStarted( listener->OnBlobFileCreationStarted(info); } } -#endif // !ROCKSDB_LITE void EventHelpers::LogAndNotifyBlobFileCreationFinished( EventLogger* event_logger, @@ -314,7 +285,6 @@ void EventHelpers::LogAndNotifyBlobFileCreationFinished( event_logger->Log(jwriter); } -#ifndef ROCKSDB_LITE if (listeners.empty()) { return; } @@ -325,12 +295,6 @@ void 
EventHelpers::LogAndNotifyBlobFileCreationFinished( listener->OnBlobFileCreated(info); } info.status.PermitUncheckedError(); -#else - (void)listeners; - (void)db_name; - (void)file_path; - (void)creation_reason; -#endif } void EventHelpers::LogAndNotifyBlobFileDeletion( @@ -352,7 +316,6 @@ void EventHelpers::LogAndNotifyBlobFileDeletion( jwriter.EndObject(); event_logger->Log(jwriter); } -#ifndef ROCKSDB_LITE if (listeners.empty()) { return; } @@ -361,11 +324,6 @@ void EventHelpers::LogAndNotifyBlobFileDeletion( listener->OnBlobFileDeleted(info); } info.status.PermitUncheckedError(); -#else - (void)listeners; - (void)dbname; - (void)file_path; -#endif // !ROCKSDB_LITE } } // namespace ROCKSDB_NAMESPACE diff --git a/db/event_helpers.h b/db/event_helpers.h index 68d819fe6bd5..a1331d8a9a80 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -19,12 +19,10 @@ namespace ROCKSDB_NAMESPACE { class EventHelpers { public: static void AppendCurrentTime(JSONWriter* json_writer); -#ifndef ROCKSDB_LITE static void NotifyTableFileCreationStarted( const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, TableFileCreationReason reason); -#endif // !ROCKSDB_LITE static void NotifyOnBackgroundError( const std::vector>& listeners, BackgroundErrorReason reason, Status* bg_error, @@ -48,13 +46,11 @@ class EventHelpers { const Status& old_bg_error, const Status& new_bg_error, InstrumentedMutex* db_mutex); -#ifndef ROCKSDB_LITE static void NotifyBlobFileCreationStarted( const std::vector>& listeners, const std::string& db_name, const std::string& cf_name, const std::string& file_path, int job_id, BlobFileCreationReason creation_reason); -#endif // !ROCKSDB_LITE static void LogAndNotifyBlobFileCreationFinished( EventLogger* event_logger, diff --git a/db/experimental.cc b/db/experimental.cc index 20b5daa970b9..f6f920b2ccb7 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -12,7 +12,6 @@ namespace ROCKSDB_NAMESPACE { namespace experimental { -#ifndef ROCKSDB_LITE Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) { @@ -30,19 +29,6 @@ Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) { return db->PromoteL0(column_family, target_level); } -#else // ROCKSDB_LITE - -Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, - const Slice* /*begin*/, const Slice* /*end*/) { - return Status::NotSupported("Not supported in RocksDB LITE"); -} - -Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/, - int /*target_level*/) { - return Status::NotSupported("Not supported in RocksDB LITE"); -} - -#endif // ROCKSDB_LITE Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) { return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end); @@ -52,6 +38,8 @@ Status UpdateManifestForFilesState( const DBOptions& db_opts, const std::string& db_name, const std::vector& column_families, const UpdateManifestForFilesStateOptions& opts) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; OfflineManifestWriter w(db_opts, db_name); Status s = w.Recover(column_families); @@ -114,7 +102,8 @@ Status UpdateManifestForFilesState( lf->oldest_blob_file_number, lf->oldest_ancester_time, lf->file_creation_time, lf->epoch_number, lf->file_checksum, lf->file_checksum_func_name, lf->unique_id, - lf->compensated_range_deletion_size); + lf->compensated_range_deletion_size, lf->tail_size, + 
lf->user_defined_timestamps_persisted); } } } else { @@ -128,7 +117,7 @@ Status UpdateManifestForFilesState( std::unique_ptr db_dir; s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); } if (s.ok()) { ++cfs_updated; diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index c12313a4ffda..749a172ac609 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -17,7 +17,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE class ExternalSSTFileBasicTest : public DBTestBase, public ::testing::WithParamInterface> { @@ -103,7 +102,8 @@ class ExternalSSTFileBasicTest // all point operators, even though sst_file_writer.DeleteRange // must be called before other sst_file_writer methods. This is // because point writes take precedence over range deletions - // in the same ingested sst. + // in the same ingested sst. This precedence is part of + // `SstFileWriter::DeleteRange()`'s API contract. std::string start_key = Key(range_deletions[i].first); std::string end_key = Key(range_deletions[i].second); s = sst_file_writer.DeleteRange(start_key, end_key); @@ -1183,6 +1183,7 @@ TEST_F(ExternalSSTFileBasicTest, SyncFailure) { ASSERT_OK(sst_file_writer->Finish()); IngestExternalFileOptions ingest_opt; + ASSERT_FALSE(ingest_opt.write_global_seqno); // new default if (i == 0) { ingest_opt.move_files = true; } @@ -1345,7 +1346,7 @@ TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable for (int i = 0; i < 3; i++) { if (i != 0) { - db_->Flush(FlushOptions()); + ASSERT_OK(db_->Flush(FlushOptions())); if (i == 1) { MoveFilesToLevel(kNumLevels - 1); } @@ -1418,6 +1419,7 @@ TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) { ASSERT_EQ(4, NumTableFilesAtLevel(0)); ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1)); + VerifyDBFromMap(true_data); } TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { @@ -1462,7 +1464,95 @@ TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) { DestroyAndRecreateExternalSSTFilesDir(); } +TEST_F(ExternalSSTFileBasicTest, UnorderedRangeDeletions) { + int kNumLevels = 7; + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.num_levels = kNumLevels; + Reopen(options); + + std::map true_data; + int file_id = 1; + + // prevent range deletions from being dropped due to becoming obsolete. 
+ const Snapshot* snapshot = db_->GetSnapshot(); + + // Range del [0, 50) in memtable + ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0), + Key(50))); + + // Out of order range del overlaps memtable, so flush is required before file + // is ingested into L0 + ASSERT_OK(GenerateAndAddExternalFile( + options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue}, + {{65, 70}, {45, 50}}, file_id++, true /* write_global_seqno */, + true /* verify_checksums_before_ingest */, &true_data)); + ASSERT_EQ(2, true_data.size()); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 1)); + VerifyDBFromMap(true_data); + + // Compact to L6 + MoveFilesToLevel(kNumLevels - 1); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1)); + VerifyDBFromMap(true_data); + + // Ingest a file containing out of order range dels that cover nothing + ASSERT_OK(GenerateAndAddExternalFile( + options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue}, + {{160, 200}, {120, 180}}, file_id++, true /* write_global_seqno */, + true /* verify_checksums_before_ingest */, &true_data)); + ASSERT_EQ(4, true_data.size()); + ASSERT_EQ(0, NumTableFilesAtLevel(0)); + ASSERT_EQ(2, NumTableFilesAtLevel(kNumLevels - 1)); + VerifyDBFromMap(true_data); + + // Ingest a file containing out of order range dels that cover keys in L6 + ASSERT_OK(GenerateAndAddExternalFile( + options, {}, {}, {{190, 200}, {170, 180}, {55, 65}}, file_id++, + true /* write_global_seqno */, true /* verify_checksums_before_ingest */, + &true_data)); + ASSERT_EQ(2, true_data.size()); + ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2)); + ASSERT_EQ(2, NumTableFilesAtLevel(kNumLevels - 1)); + VerifyDBFromMap(true_data); + + db_->ReleaseSnapshot(snapshot); +} + +TEST_F(ExternalSSTFileBasicTest, RangeDeletionEndComesBeforeStart) { + Options options = CurrentOptions(); + SstFileWriter sst_file_writer(EnvOptions(), options); + + // "file.sst" + // Verify attempt to delete 300 => 200 fails. + // Then, verify attempt to delete 300 => 300 succeeds but writes nothing. + // Afterwards, verify attempt to delete 300 => 400 works normally. 
+ std::string file = sst_files_dir_ + "file.sst"; + ASSERT_OK(sst_file_writer.Open(file)); + ASSERT_TRUE( + sst_file_writer.DeleteRange(Key(300), Key(200)).IsInvalidArgument()); + ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(300))); + ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400))); + ExternalSstFileInfo file_info; + Status s = sst_file_writer.Finish(&file_info); + ASSERT_OK(s) << s.ToString(); + ASSERT_EQ(file_info.file_path, file); + ASSERT_EQ(file_info.num_entries, 0); + ASSERT_EQ(file_info.smallest_key, ""); + ASSERT_EQ(file_info.largest_key, ""); + ASSERT_EQ(file_info.num_range_del_entries, 1); + ASSERT_EQ(file_info.smallest_range_del_key, Key(300)); + ASSERT_EQ(file_info.largest_range_del_key, Key(400)); +} + TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + if (!verify_checksums_before_ingest) { + ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest"); + return; + } bool change_checksum_called = false; const auto& change_checksum = [&](void* arg) { if (!change_checksum_called) { @@ -1480,24 +1570,20 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { SyncPoint::GetInstance()->EnableProcessing(); int file_id = 0; bool write_global_seqno = std::get<0>(GetParam()); - bool verify_checksums_before_ingest = std::get<1>(GetParam()); do { Options options = CurrentOptions(); DestroyAndReopen(options); std::map true_data; Status s = GenerateAndAddExternalFile( options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++, - write_global_seqno, verify_checksums_before_ingest, &true_data); - if (verify_checksums_before_ingest) { - ASSERT_NOK(s); - } else { - ASSERT_OK(s); - } + write_global_seqno, /*verify_checksums_before_ingest=*/true, + &true_data); + ASSERT_NOK(s); change_checksum_called = false; } while (ChangeOptionsForFileIngestionTest()); } -TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { +TEST_P(ExternalSSTFileBasicTest, IngestFileWithCorruptedDataBlock) { if (!random_rwfile_supported_) { ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support"); return; @@ -1505,15 +1591,21 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { SyncPoint::GetInstance()->DisableProcessing(); int file_id = 0; EnvOptions env_options; + Random rnd(301); do { Options options = CurrentOptions(); + options.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 4 * 1024; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::string file_path = sst_files_dir_ + std::to_string(file_id++); SstFileWriter sst_file_writer(env_options, options); Status s = sst_file_writer.Open(file_path); ASSERT_OK(s); + // This should write more than 2 data blocks. for (int i = 0; i != 100; ++i) { std::string key = Key(i); - std::string value = Key(i) + std::to_string(0); + std::string value = rnd.RandomString(200); ASSERT_OK(sst_file_writer.Put(key, value)); } ASSERT_OK(sst_file_writer.Finish()); @@ -1524,11 +1616,11 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { ASSERT_GT(file_size, 8); std::unique_ptr rwfile; ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions())); - // Manually corrupt the file - // We deterministically corrupt the first byte because we currently - // cannot choose a random offset. The reason for this limitation is that - // we do not checksum property block at present. - const uint64_t offset = 0; + // Corrupt the second data block. 
+ // We need to corrupt a non-first and non-last data block + // since we access them to get smallest and largest internal + // key in the file in GetIngestedFileInfo(). + const uint64_t offset = 5000; char scratch[8] = {0}; Slice buf; ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch)); @@ -1662,11 +1754,11 @@ TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) { Options options = CurrentOptions(); ASSERT_OK(Put("k", "a")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("k", "a")); - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(Put("k", "a")); - Flush(); + ASSERT_OK(Flush()); SstFileWriter sst_file_writer(EnvOptions(), options); // Current file size should be 0 after sst_file_writer init and before open a @@ -1988,7 +2080,6 @@ INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest, std::make_tuple(false, true), std::make_tuple(false, false))); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 80fd82dab98b..3cc4d6752e0e 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/external_sst_file_ingestion_job.h" @@ -220,6 +219,8 @@ Status ExternalSstFileIngestionJob::Prepare( std::string requested_checksum_func_name; // TODO: rate limit file reads for checksum calculation during file // ingestion. + // TODO: plumb Env::IOActivity + ReadOptions ro; IOStatus io_s = GenerateOneFileChecksum( fs_.get(), files_to_ingest_[i].internal_file_path, db_options_.file_checksum_gen_factory.get(), @@ -227,8 +228,8 @@ Status ExternalSstFileIngestionJob::Prepare( &generated_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, db_options_.allow_mmap_reads, io_tracer_, - db_options_.rate_limiter.get(), - Env::IO_TOTAL /* rate_limiter_priority */); + db_options_.rate_limiter.get(), ro, db_options_.stats, + db_options_.clock); if (!io_s.ok()) { status = io_s; ROCKS_LOG_WARN(db_options_.info_log, @@ -351,7 +352,7 @@ Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed, std::string end_str; AppendUserKeyWithMaxTimestamp( &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz); - AppendKeyWithMinTimestamp( + AppendUserKeyWithMinTimestamp( &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz); keys.emplace_back(std::move(begin_str)); keys.emplace_back(std::move(end_str)); @@ -467,6 +468,16 @@ Status ExternalSstFileIngestionJob::Run() { current_time = oldest_ancester_time = static_cast(temp_current_time); } + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + FileMetaData f_metadata( f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, @@ -475,7 +486,9 @@ Status ExternalSstFileIngestionJob::Run() { ingestion_options_.ingest_behind ? 
kReservedEpochNumberForFileIngestedBehind : cfd_->NewEpochNumber(), - f.file_checksum, f.file_checksum_func_name, f.unique_id, 0); + f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size, + static_cast( + f.table_properties.user_defined_timestamps_persisted)); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } @@ -677,10 +690,14 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( sst_file_reader.reset(new RandomAccessFileReader( std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); + // TODO(yuzhangyu): User-defined timestamps doesn't support external sst file + // ingestion. Pass in the correct `user_defined_timestamps_persisted` flag + // for creating `TableReaderOptions` when the support is there. status = cfd_->ioptions()->table_factory->NewTableReader( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, @@ -695,13 +712,14 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // If customized readahead size is needed, we can pass a user option // all the way to here. Right now we just rely on the default readahead // to keep things simple. + // TODO: plumb Env::IOActivity ReadOptions ro; ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; status = table_reader->VerifyChecksum( ro, TableReaderCaller::kExternalSSTIngestion); - } - if (!status.ok()) { - return status; + if (!status.ok()) { + return status; + } } // Get the external file properties @@ -748,18 +766,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->num_range_deletions = props->num_range_deletions; ParsedInternalKey key; + // TODO: plumb Env::IOActivity ReadOptions ro; - // During reading the external file we can cache blocks that we read into - // the block cache, if we later change the global seqno of this file, we will - // have block in cache that will include keys with wrong seqno. - // We need to disable fill_cache so that we read from the file without - // updating the block cache. - ro.fill_cache = false; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); - std::unique_ptr range_del_iter( - table_reader->NewRangeTombstoneIterator(ro)); // Get first (smallest) and last (largest) key from file. file_to_ingest->smallest_internal_key = @@ -781,8 +792,33 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } file_to_ingest->smallest_internal_key.SetFrom(key); - iter->SeekToLast(); - pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors); + Slice largest; + if (strcmp(cfd_->ioptions()->table_factory->Name(), "PlainTable") == 0) { + // PlainTable iterator does not support SeekToLast(). + largest = iter->key(); + for (; iter->Valid(); iter->Next()) { + if (cfd_->internal_comparator().Compare(iter->key(), largest) > 0) { + largest = iter->key(); + } + } + if (!iter->status().ok()) { + return iter->status(); + } + } else { + iter->SeekToLast(); + if (!iter->Valid()) { + if (iter->status().ok()) { + // The file contains at least 1 key since iter is valid after + // SeekToFirst(). 
+ return Status::Corruption("Can not find largest key in sst file"); + } else { + return iter->status(); + } + } + largest = iter->key(); + } + + pik_status = ParseInternalKey(largest, &key, allow_data_in_errors); if (!pik_status.ok()) { return Status::Corruption("Corrupted key in external file. ", pik_status.getState()); @@ -793,8 +829,12 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->largest_internal_key.SetFrom(key); bounds_set = true; + } else if (!iter->status().ok()) { + return iter->status(); } + std::unique_ptr range_del_iter( + table_reader->NewRangeTombstoneIterator(ro)); // We may need to adjust these key bounds, depending on whether any range // deletion tombstones extend past them. const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); @@ -864,6 +904,7 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( bool overlap_with_db = false; Arena arena; + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; int target_level = 0; @@ -1048,13 +1089,15 @@ IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile( std::string file_checksum_func_name; std::string requested_checksum_func_name; // TODO: rate limit file reads for checksum calculation during file ingestion. + // TODO: plumb Env::IOActivity + ReadOptions ro; IOStatus io_s = GenerateOneFileChecksum( fs_.get(), file_to_ingest->internal_file_path, db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name, &file_checksum, &file_checksum_func_name, ingestion_options_.verify_checksums_readahead_size, db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(), - Env::IO_TOTAL /* rate_limiter_priority */); + ro, db_options_.stats, db_options_.clock); if (!io_s.ok()) { return io_s; } @@ -1097,5 +1140,3 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { } } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/db/external_sst_file_test.cc b/db/external_sst_file_test.cc index edbed9e9eba3..ef4ab7fa58ab 100644 --- a/db/external_sst_file_test.cc +++ b/db/external_sst_file_test.cc @@ -3,9 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#if !defined(ROCKSDB_LITE) && !defined(USE_AWS) - #include +#include #include "db/db_test_util.h" #include "db/dbformat.h" @@ -23,18 +22,19 @@ namespace ROCKSDB_NAMESPACE { // A test environment that can be configured to fail the Link operation. 
-class ExternalSSTTestEnv : public EnvWrapper { +class ExternalSSTTestFS : public FileSystemWrapper { public: - ExternalSSTTestEnv(Env* t, bool fail_link) - : EnvWrapper(t), fail_link_(fail_link) {} - static const char* kClassName() { return "ExternalSSTTestEnv"; } + ExternalSSTTestFS(const std::shared_ptr& t, bool fail_link) + : FileSystemWrapper(t), fail_link_(fail_link) {} + static const char* kClassName() { return "ExternalSSTTestFS"; } const char* Name() const override { return kClassName(); } - Status LinkFile(const std::string& s, const std::string& t) override { + IOStatus LinkFile(const std::string& s, const std::string& t, + const IOOptions& options, IODebugContext* dbg) override { if (fail_link_) { - return Status::NotSupported("Link failed"); + return IOStatus::NotSupported("Link failed"); } - return target()->LinkFile(s, t); + return target()->LinkFile(s, t, options, dbg); } void set_fail_link(bool fail_link) { fail_link_ = fail_link; } @@ -68,24 +68,24 @@ class ExternSSTFileLinkFailFallbackTest : public ExternalSSTFileTestBase, public ::testing::WithParamInterface> { public: - ExternSSTFileLinkFailFallbackTest() - : test_env_(new ExternalSSTTestEnv(env_, true)) { + ExternSSTFileLinkFailFallbackTest() { + fs_ = std::make_shared(env_->GetFileSystem(), true); + test_env_.reset(new CompositeEnvWrapper(env_, fs_)); options_ = CurrentOptions(); options_.disable_auto_compactions = true; - options_.env = test_env_; + options_.env = test_env_.get(); } void TearDown() override { delete db_; db_ = nullptr; ASSERT_OK(DestroyDB(dbname_, options_)); - delete test_env_; - test_env_ = nullptr; } protected: Options options_; - ExternalSSTTestEnv* test_env_; + std::shared_ptr fs_; + std::unique_ptr test_env_; }; class ExternalSSTFileTest @@ -538,6 +538,113 @@ TEST_F(ExternalSSTFileTest, Basic) { kRangeDelSkipConfigs)); } +TEST_F(ExternalSSTFileTest, BasicWideColumn) { + do { + Options options = CurrentOptions(); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open + // a file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file = sst_files_dir_ + "wide_column_file.sst"; + ASSERT_OK(sst_file_writer.Open(file)); + for (int k = 0; k < 10; k++) { + std::string val1 = Key(k) + "_attr_1_val"; + std::string val2 = Key(k) + "_attr_2_val"; + WideColumns columns{{"attr_1", val1}, {"attr_2", val2}}; + ASSERT_OK(sst_file_writer.PutEntity(Key(k), columns)); + } + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + + // Current file size should be non-zero after success write. 
+ ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file_info.file_path, file); + ASSERT_EQ(file_info.num_entries, 10); + ASSERT_EQ(file_info.smallest_key, Key(0)); + ASSERT_EQ(file_info.largest_key, Key(9)); + ASSERT_EQ(file_info.num_range_del_entries, 0); + ASSERT_EQ(file_info.smallest_range_del_key, ""); + ASSERT_EQ(file_info.largest_range_del_key, ""); + + DestroyAndReopen(options); + // Add file using file path + ASSERT_OK(DeprecatedAddFile({file})); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 10; k++) { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + Key(k), &result)); + std::string val1 = Key(k) + "_attr_1_val"; + std::string val2 = Key(k) + "_attr_2_val"; + WideColumns expected_columns{{"attr_1", val1}, {"attr_2", val2}}; + ASSERT_EQ(result.columns(), expected_columns); + } + + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction | + kRangeDelSkipConfigs)); +} + +TEST_F(ExternalSSTFileTest, BasicMixed) { + do { + Options options = CurrentOptions(); + + SstFileWriter sst_file_writer(EnvOptions(), options); + + // Current file size should be 0 after sst_file_writer init and before open + // a file. + ASSERT_EQ(sst_file_writer.FileSize(), 0); + + std::string file = sst_files_dir_ + "mixed_file.sst"; + ASSERT_OK(sst_file_writer.Open(file)); + for (int k = 0; k < 100; k++) { + if (k % 5 == 0) { + std::string val1 = Key(k) + "_attr_1_val"; + std::string val2 = Key(k) + "_attr_2_val"; + WideColumns columns{{"attr_1", val1}, {"attr_2", val2}}; + ASSERT_OK(sst_file_writer.PutEntity(Key(k), columns)); + } else { + ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val")); + } + } + ExternalSstFileInfo file_info; + ASSERT_OK(sst_file_writer.Finish(&file_info)); + + // Current file size should be non-zero after success write. + ASSERT_GT(sst_file_writer.FileSize(), 0); + + ASSERT_EQ(file_info.file_path, file); + ASSERT_EQ(file_info.num_entries, 100); + ASSERT_EQ(file_info.smallest_key, Key(0)); + ASSERT_EQ(file_info.largest_key, Key(99)); + ASSERT_EQ(file_info.num_range_del_entries, 0); + ASSERT_EQ(file_info.smallest_range_del_key, ""); + ASSERT_EQ(file_info.largest_range_del_key, ""); + + DestroyAndReopen(options); + // Add file using file path + ASSERT_OK(DeprecatedAddFile({file})); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + for (int k = 0; k < 10; k++) { + if (k % 5 == 0) { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + Key(k), &result)); + std::string val1 = Key(k) + "_attr_1_val"; + std::string val2 = Key(k) + "_attr_2_val"; + WideColumns expected_columns{{"attr_1", val1}, {"attr_2", val2}}; + ASSERT_EQ(result.columns(), expected_columns); + } else { + ASSERT_EQ(Get(Key(k)), Key(k) + "_val"); + } + } + } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction | + kRangeDelSkipConfigs)); +} + class SstFileWriterCollector : public TablePropertiesCollector { public: explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) { @@ -1289,7 +1396,7 @@ TEST_F(ExternalSSTFileTest, IngestNonExistingFile) { ASSERT_OK(Flush()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - ASSERT_OK(dbfull()->TEST_WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // After full compaction, there should be only 1 file. 
std::vector files; @@ -1996,7 +2103,7 @@ TEST_F(ExternalSSTFileTest, FileWithCFInfo) { TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) { const bool fail_link = std::get<0>(GetParam()); const bool failed_move_fall_back_to_copy = std::get<1>(GetParam()); - test_env_->set_fail_link(fail_link); + fs_->set_fail_link(fail_link); const EnvOptions env_options; DestroyAndReopen(options_); const int kNumKeys = 10000; @@ -2160,13 +2267,13 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { // Insert 100 -> 200 into the memtable for (int i = 100; i <= 200; i++) { ASSERT_OK(Put(Key(i), "memtable")); - true_data[Key(i)] = "memtable"; } // Insert 100 -> 200 using IngestExternalFile file_data.clear(); for (int i = 0; i <= 20; i++) { file_data.emplace_back(Key(i), "ingest_behind"); + true_data[Key(i)] = "ingest_behind"; } bool allow_global_seqno = true; @@ -2188,6 +2295,7 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { options.num_levels = 3; DestroyAndReopen(options); + true_data.clear(); // Insert 100 -> 200 into the memtable for (int i = 100; i <= 200; i++) { ASSERT_OK(Put(Key(i), "memtable")); @@ -2207,12 +2315,43 @@ TEST_P(ExternalSSTFileTest, IngestBehind) { verify_checksums_before_ingest, true /*ingest_behind*/, false /*sort_data*/, &true_data)); ASSERT_EQ("0,1,1", FilesPerLevel()); + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); + uint64_t ingested_file_number = level_to_files[2][0].fd.GetNumber(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - // bottom level should be empty - ASSERT_EQ("0,1", FilesPerLevel()); - + // Last level should not be compacted + ASSERT_EQ("0,1,1", FilesPerLevel()); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); + ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber()); size_t kcnt = 0; VerifyDBFromMap(true_data, &kcnt, false); + + // Auto-compaction should not include the last level. + // Trigger compaction if size amplification exceeds 110%. + options.compaction_options_universal.max_size_amplification_percent = 110; + options.level0_file_num_compaction_trigger = 4; + ASSERT_OK(TryReopen(options)); + Random rnd(301); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 10; j++) { + true_data[Key(j)] = rnd.RandomString(1000); + ASSERT_OK(Put(Key(j), true_data[Key(j)])); + } + ASSERT_OK(Flush()); + } + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); + ASSERT_EQ(1, level_to_files[2].size()); + ASSERT_EQ(ingested_file_number, level_to_files[2][0].fd.GetNumber()); + + // Turning off the option allows DB to compact ingested files. 
+ options.allow_ingest_behind = false; + ASSERT_OK(TryReopen(options)); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &level_to_files); + ASSERT_EQ(1, level_to_files[2].size()); + ASSERT_NE(ingested_file_number, level_to_files[2][0].fd.GetNumber()); + VerifyDBFromMap(true_data, &kcnt, false); } TEST_F(ExternalSSTFileTest, SkipBloomFilter) { @@ -2484,6 +2623,7 @@ TEST_P(ExternalSSTFileTest, "AfterRead"); ingest_thread.join(); for (auto* iter : iters) { + ASSERT_OK(iter->status()); delete iter; } iters.clear(); @@ -2857,15 +2997,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as External SST File Writer and Ingestion are not supported " - "in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index ddd4b47cc59c..d888dfde1041 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -443,7 +443,7 @@ TEST_P(FaultInjectionTest, UninstalledCompaction) { options_.level0_stop_writes_trigger = 1 << 10; options_.level0_slowdown_writes_trigger = 1 << 10; options_.max_background_compactions = 1; - OpenDB(); + ASSERT_OK(OpenDB()); if (!sequential_order_) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 608f1cb28dac..ee6cfdc03f08 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -56,17 +56,15 @@ void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, } else if (cmp_smallest == 0) { *left_bound = index.smallest_lb; *right_bound = index.smallest_rb; - } else if (cmp_smallest > 0 && cmp_largest < 0) { + } else if (cmp_largest < 0) { *left_bound = index.smallest_lb; *right_bound = index.largest_rb; } else if (cmp_largest == 0) { *left_bound = index.largest_lb; *right_bound = index.largest_rb; - } else if (cmp_largest > 0) { + } else { *left_bound = index.largest_lb; *right_bound = level_rb_[level + 1]; - } else { - assert(false); } assert(*left_bound >= 0); diff --git a/db/flush_job.cc b/db/flush_job.cc index 7acd83dacf55..0b60a4bbd0d2 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -75,8 +75,12 @@ const char* GetFlushReasonString(FlushReason flush_reason) { return "Manual Flush"; case FlushReason::kErrorRecovery: return "Error Recovery"; + case FlushReason::kErrorRecoveryRetryFlush: + return "Error Recovery Retry Flush"; case FlushReason::kWalFull: return "WAL Full"; + case FlushReason::kCatchUpAfterErrorRecovery: + return "Catch Up After Error Recovery"; default: return "Invalid"; } @@ -96,7 +100,7 @@ FlushJob::FlushJob( Statistics* stats, EventLogger* event_logger, bool measure_io_stats, const bool sync_output_directory, const bool write_manifest, Env::Priority thread_pri, const std::shared_ptr& io_tracer, - const SeqnoToTimeMapping& seqno_time_mapping, const std::string& db_id, + const SeqnoToTimeMapping& seqno_to_time_mapping, const std::string& db_id, const std::string& db_session_id, std::string full_history_ts_low, BlobFileCompletionCallback* blob_callback) : dbname_(dbname), @@ -132,7 +136,7 @@ FlushJob::FlushJob( clock_(db_options_.clock), full_history_ts_low_(std::move(full_history_ts_low)), blob_callback_(blob_callback), - db_impl_seqno_time_mapping_(seqno_time_mapping) { + db_impl_seqno_to_time_mapping_(seqno_to_time_mapping) { // Update the thread 
status to indicate flush. ReportStartedFlush(); TEST_SYNC_POINT("FlushJob::FlushJob()"); @@ -141,11 +145,12 @@ FlushJob::FlushJob( FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); } void FlushJob::ReportStartedFlush() { - ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, - db_options_.enable_thread_tracking); + ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(cfd_); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, job_context_->job_id); + IOSTATS_RESET(bytes_written); } @@ -186,6 +191,10 @@ void FlushJob::PickMemTable() { return; } + // Track effective cutoff user-defined timestamp during flush if + // user-defined timestamps can be stripped. + GetEffectiveCutoffUDTForPickedMemTables(); + ReportFlushInputSize(mems_); // entries mems are (implicitly) sorted in ascending order by their created @@ -212,7 +221,8 @@ void FlushJob::PickMemTable() { } Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, - bool* switched_to_mempurge) { + bool* switched_to_mempurge, bool* skipped_since_bg_error, + ErrorHandler* error_handler) { TEST_SYNC_POINT("FlushJob::Start"); db_mutex_->AssertHeld(); assert(pick_memtable_called); @@ -295,18 +305,37 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, s = Status::ShutdownInProgress("Database shutdown"); } + if (s.ok()) { + s = MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT(); + } + if (!s.ok()) { - cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber()); + cfd_->imm()->RollbackMemtableFlush( + mems_, /*rollback_succeeding_memtables=*/!db_options_.atomic_flush); } else if (write_manifest_) { - TEST_SYNC_POINT("FlushJob::InstallResults"); - // Replace immutable memtable with the generated Table - s = cfd_->imm()->TryInstallMemtableFlushResults( - cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, - meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, - log_buffer_, &committed_flush_jobs_info_, - !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted), + assert(!db_options_.atomic_flush); + if (!db_options_.atomic_flush && + flush_reason_ != FlushReason::kErrorRecovery && + flush_reason_ != FlushReason::kErrorRecoveryRetryFlush && + error_handler && !error_handler->GetBGError().ok() && + error_handler->IsBGWorkStopped()) { + cfd_->imm()->RollbackMemtableFlush( + mems_, /*rollback_succeeding_memtables=*/!db_options_.atomic_flush); + s = error_handler->GetBGError(); + if (skipped_since_bg_error) { + *skipped_since_bg_error = true; + } + } else { + TEST_SYNC_POINT("FlushJob::InstallResults"); + // Replace immutable memtable with the generated Table + s = cfd_->imm()->TryInstallMemtableFlushResults( + cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, + meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, + log_buffer_, &committed_flush_jobs_info_, + !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted), but 'false' if mempurge successful: no new min log number or new level 0 file path to write to manifest. 
*/); + } } if (s.ok() && file_meta != nullptr) { @@ -355,6 +384,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos); } + TEST_SYNC_POINT("FlushJob::End"); return s; } @@ -387,6 +417,7 @@ Status FlushJob::MemPurge() { // Create two iterators, one for the memtable data (contains // info from puts + deletes), and one for the memtable // Range Tombstones (from DeleteRanges). + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -485,6 +516,7 @@ Status FlushJob::MemPurge() { nullptr, ioptions->allow_data_in_errors, ioptions->enforce_single_del_contracts, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse, + false /* must_count_input_entries */, /*compaction=*/nullptr, compaction_filter.get(), /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low); @@ -604,12 +636,10 @@ Status FlushJob::MemPurge() { // we do not call SchedulePendingFlush(). cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free); new_mem->Ref(); -#ifndef ROCKSDB_LITE // Piggyback FlushJobInfo on the first flushed memtable. db_mutex_->AssertHeld(); meta_.fd.file_size = 0; mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); -#endif // !ROCKSDB_LITE db_mutex_->Unlock(); } else { s = Status::Aborted(Slice("Mempurge filled more than one memtable.")); @@ -679,6 +709,7 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Cochran formula for determining sample size. // 95% confidence interval, 7% precision. // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 + // TODO: plumb Env::IOActivity double n0 = 196.0; ReadOptions ro; ro.total_order_seek = true; @@ -828,10 +859,11 @@ Status FlushJob::WriteLevel0Table() { Status s; SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber(); - if (!db_impl_seqno_time_mapping_.Empty()) { - // make a local copy, as the seqno_time_mapping from db_impl is not thread - // safe, which will be used while not holding the db_mutex. - seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno); + if (!db_impl_seqno_to_time_mapping_.Empty()) { + // make a local copy, as the seqno_to_time_mapping from db_impl is not + // thread safe, which will be used while not holding the db_mutex. + seqno_to_time_mapping_ = + db_impl_seqno_to_time_mapping_.Copy(smallest_seqno); } std::vector blob_file_additions; @@ -851,10 +883,12 @@ Status FlushJob::WriteLevel0Table() { range_del_iters; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kFlush; Arena arena; uint64_t total_num_entries = 0, total_num_deletes = 0; uint64_t total_data_size = 0; size_t total_memory_usage = 0; + uint64_t total_num_range_deletes = 0; // Used for testing: uint64_t mems_size = mems_.size(); (void)mems_size; // avoids unused variable error when @@ -879,15 +913,20 @@ Status FlushJob::WriteLevel0Table() { total_num_deletes += m->num_deletes(); total_data_size += m->get_data_size(); total_memory_usage += m->ApproximateMemoryUsage(); + total_num_range_deletes += m->num_range_deletes(); } + // TODO(cbi): when memtable is flushed due to number of range deletions + // hitting limit memtable_max_range_deletions, flush_reason_ is still + // "Write Buffer Full", should make update flush_reason_ accordingly. 
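For reference, the Cochran sample-size constant quoted in MemPurgeDecider() above expands as follows; this is a standalone arithmetic sketch, not part of the patch:

    // Cochran's formula: n0 = z^2 * p * (1 - p) / e^2, with
    //   z = 1.96  (normal quantile for a 95% confidence interval)
    //   p = 0.5   (worst-case proportion, maximizing p * (1 - p) = 0.25)
    //   e = 0.07  (7% precision)
    constexpr double z = 1.96, p = 0.5, e = 0.07;
    constexpr double n0 = z * z * p * (1.0 - p) / (e * e);  // 3.8416 * 0.25 / 0.0049 = 196.0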
event_logger_->Log() << "job" << job_context_->job_id << "event" << "flush_started" << "num_memtables" << mems_.size() << "num_entries" << total_num_entries << "num_deletes" << total_num_deletes << "total_data_size" << total_data_size << "memory_usage" - << total_memory_usage << "flush_reason" + << total_memory_usage << "num_range_deletes" + << total_num_range_deletes << "flush_reason" << GetFlushReasonString(flush_reason_); { @@ -942,17 +981,20 @@ Status FlushJob::WriteLevel0Table() { meta_.fd.GetNumber()); const SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); - s = BuildTable( - dbname_, versions_, db_options_, tboptions, file_options_, - cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, - &blob_file_additions, existing_snapshots_, - earliest_write_conflict_snapshot_, job_snapshot_seq, - snapshot_checker_, mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_, - job_context_->job_id, io_priority, &table_properties_, write_hint, - full_history_ts_low, blob_callback_, base_, &num_input_entries, - &memtable_payload_bytes, &memtable_garbage_bytes); + const ReadOptions read_options(Env::IOActivity::kFlush); + s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_, + read_options, cfd_->table_cache(), iter.get(), + std::move(range_del_iters), &meta_, &blob_file_additions, + existing_snapshots_, earliest_write_conflict_snapshot_, + job_snapshot_seq, snapshot_checker_, + mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_, + event_logger_, job_context_->job_id, io_priority, + &table_properties_, write_hint, full_history_ts_low, + blob_callback_, base_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); + TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:s", &s); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); io_s.PermitUncheckedError(); @@ -1013,13 +1055,12 @@ Status FlushJob::WriteLevel0Table() { meta_.oldest_blob_file_number, meta_.oldest_ancester_time, meta_.file_creation_time, meta_.epoch_number, meta_.file_checksum, meta_.file_checksum_func_name, - meta_.unique_id, meta_.compensated_range_deletion_size); + meta_.unique_id, meta_.compensated_range_deletion_size, + meta_.tail_size, meta_.user_defined_timestamps_persisted); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } -#ifndef ROCKSDB_LITE // Piggyback FlushJobInfo on the first first flushed memtable. 
mems_[0]->SetFlushJobInfo(GetFlushJobInfo()); -#endif // !ROCKSDB_LITE // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); @@ -1069,7 +1110,6 @@ Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() { return Env::IO_HIGH; } -#ifndef ROCKSDB_LITE std::unique_ptr FlushJob::GetFlushJobInfo() const { db_mutex_->AssertHeld(); std::unique_ptr info(new FlushJobInfo{}); @@ -1101,6 +1141,55 @@ std::unique_ptr FlushJob::GetFlushJobInfo() const { } return info; } -#endif // !ROCKSDB_LITE + +void FlushJob::GetEffectiveCutoffUDTForPickedMemTables() { + db_mutex_->AssertHeld(); + assert(pick_memtable_called); + const auto* ucmp = cfd_->internal_comparator().user_comparator(); + assert(ucmp); + const size_t ts_sz = ucmp->timestamp_size(); + if (db_options_.atomic_flush || ts_sz == 0 || + cfd_->ioptions()->persist_user_defined_timestamps) { + return; + } + // Find the newest user-defined timestamps from all the flushed memtables. + for (MemTable* m : mems_) { + Slice table_newest_udt = m->GetNewestUDT(); + if (cutoff_udt_.empty() || + ucmp->CompareTimestamp(table_newest_udt, cutoff_udt_) > 0) { + if (!cutoff_udt_.empty()) { + assert(table_newest_udt.size() == cutoff_udt_.size()); + } + cutoff_udt_.assign(table_newest_udt.data(), table_newest_udt.size()); + } + } +} + +Status FlushJob::MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT() { + db_mutex_->AssertHeld(); + const auto* ucmp = cfd_->user_comparator(); + assert(ucmp); + const std::string& full_history_ts_low = cfd_->GetFullHistoryTsLow(); + // Update full_history_ts_low to right above cutoff udt only if that would + // increase it. + if (cutoff_udt_.empty() || + (!full_history_ts_low.empty() && + ucmp->CompareTimestamp(cutoff_udt_, full_history_ts_low) < 0)) { + return Status::OK(); + } + std::string new_full_history_ts_low; + Slice cutoff_udt_slice = cutoff_udt_; + // TODO(yuzhangyu): Add a member to AdvancedColumnFamilyOptions for an + // operation to get the next immediately larger user-defined timestamp to + // expand this feature to other user-defined timestamp formats. + GetFullHistoryTsLowFromU64CutoffTs(&cutoff_udt_slice, + &new_full_history_ts_low); + VersionEdit edit; + edit.SetColumnFamily(cfd_->GetID()); + edit.SetFullHistoryTsLow(new_full_history_ts_low); + return versions_->LogAndApply(cfd_, *cfd_->GetLatestMutableCFOptions(), + ReadOptions(), &edit, db_mutex_, + output_file_directory_); +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/flush_job.h b/db/flush_job.h index 062ef299760b..aef33ef423a7 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -83,17 +83,20 @@ class FlushJob { // Require db_mutex held. // Once PickMemTable() is called, either Run() or Cancel() has to be called. void PickMemTable(); + // @param skip_since_bg_error If not nullptr and if atomic_flush=false, + // then it is set to true if flush installation is skipped and memtable + // is rolled back due to existing background error. 
Status Run(LogsWithPrepTracker* prep_tracker = nullptr, FileMetaData* file_meta = nullptr, - bool* switched_to_mempurge = nullptr); + bool* switched_to_mempurge = nullptr, + bool* skipped_since_bg_error = nullptr, + ErrorHandler* error_handler = nullptr); void Cancel(); const autovector& GetMemTables() const { return mems_; } -#ifndef ROCKSDB_LITE std::list>* GetCommittedFlushJobsInfo() { return &committed_flush_jobs_info_; } -#endif // !ROCKSDB_LITE private: friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test; @@ -127,9 +130,21 @@ class FlushJob { bool MemPurgeDecider(double threshold); // The rate limiter priority (io_priority) is determined dynamically here. Env::IOPriority GetRateLimiterPriorityForWrite(); -#ifndef ROCKSDB_LITE std::unique_ptr GetFlushJobInfo() const; -#endif // !ROCKSDB_LITE + + // Require db_mutex held. + // Called only when UDT feature is enabled and + // `persist_user_defined_timestamps` flag is false. Because we will refrain + // from flushing as long as there are still UDTs in a memtable that hasn't + // expired w.r.t `full_history_ts_low`. However, flush is continued if there + // is risk of entering write stall mode. In that case, we need + // to track the effective cutoff timestamp below which all the udts are + // removed because of flush, and use it to increase `full_history_ts_low` if + // the effective cutoff timestamp is newer. See + // `MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT` for details. + void GetEffectiveCutoffUDTForPickedMemTables(); + + Status MaybeIncreaseFullHistoryTsLowToAboveCutoffUDT(); const std::string& dbname_; const std::string db_id_; @@ -195,10 +210,14 @@ class FlushJob { const std::string full_history_ts_low_; BlobFileCompletionCallback* blob_callback_; - // reference to the seqno_time_mapping_ in db_impl.h, not safe to read without - // db mutex - const SeqnoToTimeMapping& db_impl_seqno_time_mapping_; + // reference to the seqno_to_time_mapping_ in db_impl.h, not safe to read + // without db mutex + const SeqnoToTimeMapping& db_impl_seqno_to_time_mapping_; SeqnoToTimeMapping seqno_to_time_mapping_; + + // Keeps track of the newest user-defined timestamp for this flush job if + // `persist_user_defined_timestamps` flag is false. 
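To illustrate the new Run() out-parameter declared above, a hypothetical caller might look like the sketch below; `flush_job`, `prep_tracker`, `file_meta`, and `error_handler` are placeholder names, not identifiers from db_impl:

    bool switched_to_mempurge = false;
    bool skipped_since_bg_error = false;
    Status s = flush_job.Run(prep_tracker, &file_meta, &switched_to_mempurge,
                             &skipped_since_bg_error, error_handler);
    if (skipped_since_bg_error) {
      // Flush results were not installed and the memtables were rolled back;
      // `s` carries the pre-existing background error from the ErrorHandler.
    }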
+ std::string cutoff_udt_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 003a1a6570c1..21d1571a05e1 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -70,6 +70,7 @@ class FlushJobTestBase : public testing::Test { new_cf.AddColumnFamily(column_family_names_[i]); new_cf.SetColumnFamily(cf_id++); new_cf.SetComparatorName(ucmp_->Name()); + new_cf.SetPersistUserDefinedTimestamps(persist_udt_); new_cf.SetLogNumber(0); new_cf.SetNextFile(2); new_cf.SetLastSequence(last_seq++); @@ -117,6 +118,8 @@ class FlushJobTestBase : public testing::Test { db_options_.statistics = CreateDBStatistics(); cf_options_.comparator = ucmp_; + cf_options_.persist_user_defined_timestamps = persist_udt_; + cf_options_.paranoid_file_checks = paranoid_file_checks_; std::vector column_families; cf_options_.table_factory = mock_table_factory_; @@ -124,11 +127,12 @@ class FlushJobTestBase : public testing::Test { column_families.emplace_back(cf_name, cf_options_); } - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + versions_.reset(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); EXPECT_OK(versions_->Recover(column_families, false)); } @@ -149,6 +153,9 @@ class FlushJobTestBase : public testing::Test { std::atomic shutting_down_; std::shared_ptr mock_table_factory_; + bool persist_udt_ = true; + bool paranoid_file_checks_ = false; + SeqnoToTimeMapping empty_seqno_to_time_mapping_; }; @@ -427,11 +434,9 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { } autovector>*> committed_flush_jobs_info; -#ifndef ROCKSDB_LITE for (auto& job : flush_jobs) { committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo()); } -#endif //! 
ROCKSDB_LITE Status s = InstallMemtableAtomicFlushResults( nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list, @@ -453,7 +458,8 @@ TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) { // Verify that imm is empty ASSERT_EQ(std::numeric_limits::max(), all_cfds[k]->imm()->GetEarliestMemTableID()); - ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID()); + ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID( + false /* for_atomic_flush */)); ++k; } @@ -602,7 +608,13 @@ TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) { } } -class FlushJobTimestampTest : public FlushJobTestBase { +// Test parameters: +// param 0): paranoid file check +// param 1): user-defined timestamp test mode +class FlushJobTimestampTest + : public FlushJobTestBase, + public testing::WithParamInterface< + std::tuple> { public: FlushJobTimestampTest() : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"), @@ -618,13 +630,40 @@ class FlushJobTimestampTest : public FlushJobTestBase { } protected: + void SetUp() override { + paranoid_file_checks_ = std::get<0>(GetParam()); + auto udt_test_mode = std::get<1>(GetParam()); + persist_udt_ = test::ShouldPersistUDT(udt_test_mode); + FlushJobTestBase::SetUp(); + } static constexpr uint64_t kStartTs = 10; static constexpr SequenceNumber kStartSeq = 0; SequenceNumber curr_seq_{kStartSeq}; std::atomic curr_ts_{kStartTs}; + + void CheckFileMetaData(ColumnFamilyData* cfd, + const InternalKey& expected_smallest, + const InternalKey& expected_largest, + const FileMetaData* meta_from_flush) const { + ASSERT_EQ(expected_smallest.Encode(), meta_from_flush->smallest.Encode()); + ASSERT_EQ(expected_largest.Encode(), meta_from_flush->largest.Encode()); + + const VersionStorageInfo* storage_info = cfd->current()->storage_info(); + const std::vector& l0_files = storage_info->LevelFiles(0); + + ASSERT_EQ(l0_files.size(), 1); + auto installed_file_meta = l0_files[0]; + ASSERT_EQ(expected_smallest.Encode(), + installed_file_meta->smallest.Encode()); + ASSERT_EQ(expected_largest.Encode(), installed_file_meta->largest.Encode()); + } + void CheckFullHistoryTsLow(ColumnFamilyData* cfd, + const std::string& expected_full_history_ts_low) { + ASSERT_EQ(expected_full_history_ts_low, cfd->GetFullHistoryTsLow()); + } }; -TEST_F(FlushJobTimestampTest, AllKeysExpired) { +TEST_P(FlushJobTimestampTest, AllKeysExpired) { ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); autovector to_delete; @@ -652,6 +691,7 @@ TEST_F(FlushJobTimestampTest, AllKeysExpired) { EventLogger event_logger(db_options_.info_log.get()); std::string full_history_ts_low; PutFixed64(&full_history_ts_low, std::numeric_limits::max()); + cfd->SetFullHistoryTsLow(full_history_ts_low); FlushJob flush_job( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, @@ -671,17 +711,25 @@ TEST_F(FlushJobTimestampTest, AllKeysExpired) { { std::string key = test::EncodeInt(0); - key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + if (!persist_udt_) { + // When `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` flag + // is set to false. The user-defined timestamp is stripped from user key + // during flush, making the user key logically containing the minimum + // timestamp. 
+ key.append(test::EncodeInt(0)); + } else { + key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1)); + } InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp); - ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode()); - ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode()); + CheckFileMetaData(cfd, ikey, ikey, &fmeta); + CheckFullHistoryTsLow(cfd, full_history_ts_low); } job_context.Clean(); ASSERT_TRUE(to_delete.empty()); } -TEST_F(FlushJobTimestampTest, NoKeyExpired) { +TEST_P(FlushJobTimestampTest, NoKeyExpired) { ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault(); autovector to_delete; @@ -705,6 +753,7 @@ TEST_F(FlushJobTimestampTest, NoKeyExpired) { EventLogger event_logger(db_options_.info_log.get()); std::string full_history_ts_low; PutFixed64(&full_history_ts_low, 0); + cfd->SetFullHistoryTsLow(full_history_ts_low); FlushJob flush_job( dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(), std::numeric_limits::max() /* memtable_id */, env_options_, @@ -724,18 +773,46 @@ TEST_F(FlushJobTimestampTest, NoKeyExpired) { { std::string ukey = test::EncodeInt(0); - std::string smallest_key = - ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); - std::string largest_key = ukey + test::EncodeInt(kStartTs); + std::string smallest_key; + std::string largest_key; + std::string expected_full_history_ts_low; + if (!persist_udt_) { + // When `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` flag + // is set to false. The user-defined timestamp is stripped from user key + // during flush, making the user key logically containing the minimum + // timestamp, which is hardcoded to be all zeros for now. + smallest_key = ukey + test::EncodeInt(0); + largest_key = ukey + test::EncodeInt(0); + // When not all keys have expired and `persist_user_defined_timestamps` is + // false. UDTs will be removed during flush, `full_history_ts_low` should + // be automatically increased to above the effective cutoff UDT in the + // flush. + PutFixed64(&expected_full_history_ts_low, curr_ts_.fetch_add(1)); + } else { + smallest_key = + ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1); + largest_key = ukey + test::EncodeInt(kStartTs); + expected_full_history_ts_low = full_history_ts_low; + } InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue); InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue); - ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode()); - ASSERT_EQ(largest.Encode(), fmeta.largest.Encode()); + CheckFileMetaData(cfd, smallest, largest, &fmeta); + CheckFullHistoryTsLow(cfd, expected_full_history_ts_low); } job_context.Clean(); ASSERT_TRUE(to_delete.empty()); } +// Param 0: paranoid file check +// Param 1: test mode for the user-defined timestamp feature +INSTANTIATE_TEST_CASE_P( + FlushJobTimestampTest, FlushJobTimestampTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal))); + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 3fbc2cf47066..c7691560eb80 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #include "db/forward_iterator.h" #include @@ -37,7 +36,7 @@ class ForwardLevelIterator : public InternalIterator { const ColumnFamilyData* const cfd, const ReadOptions& read_options, const std::vector& files, const std::shared_ptr& prefix_extractor, - bool allow_unprepared_value) + bool allow_unprepared_value, uint8_t block_protection_bytes_per_key) : cfd_(cfd), read_options_(read_options), files_(files), @@ -46,7 +45,8 @@ class ForwardLevelIterator : public InternalIterator { file_iter_(nullptr), pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), - allow_unprepared_value_(allow_unprepared_value) { + allow_unprepared_value_(allow_unprepared_value), + block_protection_bytes_per_key_(block_protection_bytes_per_key) { status_.PermitUncheckedError(); // Allow uninitialized status through } @@ -88,7 +88,8 @@ class ForwardLevelIterator : public InternalIterator { /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + block_protection_bytes_per_key_); file_iter_->SetPinnedItersMgr(pinned_iters_mgr_); valid_ = false; if (!range_del_agg.IsEmpty()) { @@ -212,6 +213,7 @@ class ForwardLevelIterator : public InternalIterator { // Kept alive by ForwardIterator::sv_->mutable_cf_options const std::shared_ptr& prefix_extractor_; const bool allow_unprepared_value_; + const uint8_t block_protection_bytes_per_key_; }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, @@ -239,7 +241,10 @@ ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, if (sv_) { RebuildIterators(false); } - + if (!CheckFSFeatureSupport(cfd_->ioptions()->env->GetFileSystem().get(), + FSSupportedOps::kAsyncIO)) { + read_options_.async_io = false; + } // immutable_status_ is a local aggregation of the // status of the immutable Iterators. 
// We have to PermitUncheckedError in case it is never @@ -736,7 +741,8 @@ void ForwardIterator::RebuildIterators(bool refresh_sv) { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key)); } BuildLevelIterators(vstorage, sv_); current_ = nullptr; @@ -817,7 +823,8 @@ void ForwardIterator::RenewIterators() { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(svnew->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_)); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + svnew->mutable_cf_options.block_protection_bytes_per_key)); } for (auto* f : l0_iters_) { @@ -861,7 +868,8 @@ void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage, } else { level_iters_.push_back(new ForwardLevelIterator( cfd_, read_options_, level_files, - sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_)); + sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_, + sv->mutable_cf_options.block_protection_bytes_per_key)); } } } @@ -883,7 +891,8 @@ void ForwardIterator::ResetIncompleteIterators() { /*skip_filters=*/false, /*level=*/-1, MaxFileSizeForL0MetaPin(sv_->mutable_cf_options), /*smallest_compaction_key=*/nullptr, - /*largest_compaction_key=*/nullptr, allow_unprepared_value_); + /*largest_compaction_key=*/nullptr, allow_unprepared_value_, + sv_->mutable_cf_options.block_protection_bytes_per_key); l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_); } @@ -1058,5 +1067,3 @@ void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) { } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 5a5c6f0f376e..cb418aeeb0af 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -5,7 +5,6 @@ #pragma once #include "rocksdb/comparator.h" -#ifndef ROCKSDB_LITE #include #include @@ -123,7 +122,7 @@ class ForwardIterator : public InternalIterator { void DeleteIterator(InternalIterator* iter, bool is_arena = false); DBImpl* const db_; - const ReadOptions read_options_; + ReadOptions read_options_; ColumnFamilyData* const cfd_; const SliceTransform* const prefix_extractor_; const Comparator* user_comparator_; @@ -165,4 +164,3 @@ class ForwardIterator : public InternalIterator { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 325661cef341..b57b119e484a 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
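A note on the forward_iterator.h change above: read_options_ appears to lose its const qualifier because the constructor now assigns to it after initialization, roughly:

    // In ForwardIterator's constructor (see forward_iterator.cc above):
    if (!CheckFSFeatureSupport(cfd_->ioptions()->env->GetFileSystem().get(),
                               FSSupportedOps::kAsyncIO)) {
      read_options_.async_io = false;  // assignment requires a non-const member
    }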
-#if !defined(GFLAGS) || defined(ROCKSDB_LITE) +#if !defined(GFLAGS) #include int main() { fprintf(stderr, "Please install gflags to run rocksdb tools\n"); @@ -375,4 +375,4 @@ int main(int argc, char** argv) { writers.clear(); readers.clear(); } -#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE) +#endif // !defined(GFLAGS) diff --git a/db/history_trimming_iterator.h b/db/history_trimming_iterator.h index b445ced33424..4af5cde72053 100644 --- a/db/history_trimming_iterator.h +++ b/db/history_trimming_iterator.h @@ -82,6 +82,10 @@ class HistoryTrimmingIterator : public InternalIterator { bool IsValuePinned() const override { return input_->IsValuePinned(); } + bool IsDeleteRangeSentinelKey() const override { + return input_->IsDeleteRangeSentinelKey(); + } + private: InternalIterator* input_; const std::string filter_ts_; diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 17ad044a7e7a..f7b8a50aef0e 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -5,7 +5,6 @@ // (found in the LICENSE.Apache file in the root directory). #include "db/version_builder.h" -#ifndef ROCKSDB_LITE #include "db/import_column_family_job.h" @@ -30,76 +29,132 @@ namespace ROCKSDB_NAMESPACE { Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number, SuperVersion* sv) { Status status; + std::vector cf_ingest_infos; + for (const auto& metadata_per_cf : metadatas_) { + // Read the information of files we are importing + ColumnFamilyIngestFileInfo cf_file_info; + InternalKey smallest, largest; + int num_files = 0; + std::vector files_to_import_per_cf; + for (size_t i = 0; i < metadata_per_cf.size(); i++) { + auto file_metadata = *metadata_per_cf[i]; + const auto file_path = file_metadata.db_path + "/" + file_metadata.name; + IngestedFileInfo file_to_import; + status = GetIngestedFileInfo(file_path, next_file_number++, sv, + file_metadata, &file_to_import); + if (!status.ok()) { + return status; + } - // Read the information of files we are importing - for (const auto& file_metadata : metadata_) { - const auto file_path = file_metadata.db_path + "/" + file_metadata.name; - IngestedFileInfo file_to_import; - status = - GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv); - if (!status.ok()) { - return status; - } - files_to_import_.push_back(file_to_import); - } + if (file_to_import.num_entries == 0) { + status = Status::InvalidArgument("File contain no entries"); + return status; + } - auto num_files = files_to_import_.size(); - if (num_files == 0) { - return Status::InvalidArgument("The list of files is empty"); - } + if (!file_to_import.smallest_internal_key.Valid() || + !file_to_import.largest_internal_key.Valid()) { + status = Status::Corruption("File has corrupted keys"); + return status; + } - for (const auto& f : files_to_import_) { - if (f.num_entries == 0) { - return Status::InvalidArgument("File contain no entries"); + files_to_import_per_cf.push_back(file_to_import); + num_files++; + + // Calculate the smallest and largest keys of all files in this CF + if (i == 0) { + smallest = file_to_import.smallest_internal_key; + largest = file_to_import.largest_internal_key; + } else { + if (cfd_->internal_comparator().Compare( + smallest, file_to_import.smallest_internal_key) < 0) { + smallest = file_to_import.smallest_internal_key; + } + if (cfd_->internal_comparator().Compare( + largest, file_to_import.largest_internal_key) > 0) { + largest = file_to_import.largest_internal_key; + } + } + } + + if (num_files == 0) { + status = 
Status::InvalidArgument("The list of files is empty"); + return status; } + files_to_import_.push_back(files_to_import_per_cf); + cf_file_info.smallest_internal_key = smallest; + cf_file_info.largest_internal_key = largest; + cf_ingest_infos.push_back(cf_file_info); + } - if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) { - return Status::Corruption("File has corrupted keys"); + std::sort(cf_ingest_infos.begin(), cf_ingest_infos.end(), + [this](const ColumnFamilyIngestFileInfo& info1, + const ColumnFamilyIngestFileInfo& info2) { + return cfd_->user_comparator()->Compare( + info1.smallest_internal_key.user_key(), + info2.smallest_internal_key.user_key()) < 0; + }); + + for (size_t i = 0; i + 1 < cf_ingest_infos.size(); i++) { + if (cfd_->user_comparator()->Compare( + cf_ingest_infos[i].largest_internal_key.user_key(), + cf_ingest_infos[i + 1].smallest_internal_key.user_key()) >= 0) { + status = Status::InvalidArgument("CFs have overlapping ranges"); + return status; } } // Copy/Move external files into DB auto hardlink_files = import_options_.move_files; - for (auto& f : files_to_import_) { - const auto path_outside_db = f.external_file_path; - const auto path_inside_db = TableFileName( - cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); - - if (hardlink_files) { - status = - fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr); - if (status.IsNotSupported()) { - // Original file is on a different FS, use copy instead of hard linking - hardlink_files = false; - ROCKS_LOG_INFO(db_options_.info_log, - "Try to link file %s but it's not supported : %s", - f.internal_file_path.c_str(), status.ToString().c_str()); + + for (auto& files_to_import_per_cf : files_to_import_) { + for (auto& f : files_to_import_per_cf) { + const auto path_outside_db = f.external_file_path; + const auto path_inside_db = TableFileName( + cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId()); + + if (hardlink_files) { + status = fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), + nullptr); + if (status.IsNotSupported()) { + // Original file is on a different FS, use copy instead of hard + // linking + hardlink_files = false; + ROCKS_LOG_INFO(db_options_.info_log, + "Try to link file %s but it's not supported : %s", + f.internal_file_path.c_str(), + status.ToString().c_str()); + } } - } - if (!hardlink_files) { - status = - CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, - db_options_.use_fsync, io_tracer_, Temperature::kUnknown); + if (!hardlink_files) { + status = + CopyFile(fs_.get(), path_outside_db, path_inside_db, 0, + db_options_.use_fsync, io_tracer_, Temperature::kUnknown); + } + if (!status.ok()) { + break; + } + f.copy_file = !hardlink_files; + f.internal_file_path = path_inside_db; } if (!status.ok()) { break; } - f.copy_file = !hardlink_files; - f.internal_file_path = path_inside_db; } if (!status.ok()) { // We failed, remove all files that we copied into the db - for (const auto& f : files_to_import_) { - if (f.internal_file_path.empty()) { - break; - } - const auto s = - fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr); - if (!s.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "AddFile() clean up for file %s failed : %s", - f.internal_file_path.c_str(), s.ToString().c_str()); + for (auto& files_to_import_per_cf : files_to_import_) { + for (auto& f : files_to_import_per_cf) { + if (f.internal_file_path.empty()) { + break; + } + const auto s = + fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr); + if 
(!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } } } } @@ -130,22 +185,41 @@ Status ImportColumnFamilyJob::Run() { &cfd_->internal_comparator(), cfd_->user_comparator(), cfd_->NumberLevels(), cfd_->ioptions()->compaction_style, nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks, - EpochNumberRequirement::kMightMissing); + EpochNumberRequirement::kMightMissing, cfd_->ioptions()->clock, + cfd_->GetLatestMutableCFOptions()->bottommost_file_compaction_delay, + cfd_->current()->version_set()->offpeak_time_option()); Status s; + for (size_t i = 0; s.ok() && i < files_to_import_.size(); ++i) { - const auto& f = files_to_import_[i]; - const auto& file_metadata = metadata_[i]; - - VersionEdit dummy_version_edit; - dummy_version_edit.AddFile( - file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), - f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, - file_metadata.smallest_seqno, file_metadata.largest_seqno, false, - file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, - current_time, file_metadata.epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, f.unique_id, 0); - s = dummy_version_builder.Apply(&dummy_version_edit); + for (size_t j = 0; s.ok() && j < files_to_import_[i].size(); ++j) { + const auto& f = files_to_import_[i][j]; + const auto& file_metadata = *metadatas_[i][j]; + + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + + VersionEdit dummy_version_edit; + dummy_version_edit.AddFile( + file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), + f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, + file_metadata.smallest_seqno, file_metadata.largest_seqno, false, + file_metadata.temperature, kInvalidBlobFileNumber, + oldest_ancester_time, current_time, file_metadata.epoch_number, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, f.unique_id, 0, + tail_size, + static_cast( + f.table_properties.user_defined_timestamps_persisted)); + s = dummy_version_builder.Apply(&dummy_version_edit); + } } + if (s.ok()) { s = dummy_version_builder.SaveTo(&dummy_vstorage); } @@ -186,26 +260,30 @@ Status ImportColumnFamilyJob::Run() { void ImportColumnFamilyJob::Cleanup(const Status& status) { if (!status.ok()) { // We failed to add files to the database remove all the files we copied. 
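Restating the cross-CF check added to Prepare() above as a sketch (the helper name is invented here): after sorting the imported column families by their smallest user key, two neighbours conflict exactly when the earlier range reaches into the later one.

    // prev/next are adjacent entries of cf_ingest_infos after the std::sort above.
    bool Overlapping(const Comparator* ucmp,
                     const ColumnFamilyIngestFileInfo& prev,
                     const ColumnFamilyIngestFileInfo& next) {
      return ucmp->Compare(prev.largest_internal_key.user_key(),
                           next.smallest_internal_key.user_key()) >= 0;
    }
    // Prepare() returns Status::InvalidArgument("CFs have overlapping ranges")
    // when this holds for any adjacent pair.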
- for (const auto& f : files_to_import_) { - const auto s = - fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr); - if (!s.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "AddFile() clean up for file %s failed : %s", - f.internal_file_path.c_str(), s.ToString().c_str()); + for (auto& files_to_import_per_cf : files_to_import_) { + for (auto& f : files_to_import_per_cf) { + const auto s = + fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "AddFile() clean up for file %s failed : %s", + f.internal_file_path.c_str(), s.ToString().c_str()); + } } } } else if (status.ok() && import_options_.move_files) { // The files were moved and added successfully, remove original file links - for (IngestedFileInfo& f : files_to_import_) { - const auto s = - fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr); - if (!s.ok()) { - ROCKS_LOG_WARN( - db_options_.info_log, - "%s was added to DB successfully but failed to remove original " - "file link : %s", - f.external_file_path.c_str(), s.ToString().c_str()); + for (auto& files_to_import_per_cf : files_to_import_) { + for (auto& f : files_to_import_per_cf) { + const auto s = + fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr); + if (!s.ok()) { + ROCKS_LOG_WARN( + db_options_.info_log, + "%s was added to DB successfully but failed to remove original " + "file link : %s", + f.external_file_path.c_str(), s.ToString().c_str()); + } } } } @@ -213,16 +291,20 @@ void ImportColumnFamilyJob::Cleanup(const Status& status) { Status ImportColumnFamilyJob::GetIngestedFileInfo( const std::string& external_file, uint64_t new_file_number, - IngestedFileInfo* file_to_import, SuperVersion* sv) { + SuperVersion* sv, const LiveFileMetaData& file_meta, + IngestedFileInfo* file_to_import) { file_to_import->external_file_path = external_file; - - // Get external file size - Status status = fs_->GetFileSize(external_file, IOOptions(), - &file_to_import->file_size, nullptr); - if (!status.ok()) { - return status; + Status status; + if (file_meta.size > 0) { + file_to_import->file_size = file_meta.size; + } else { + // Get external file size + status = fs_->GetFileSize(external_file, IOOptions(), + &file_to_import->file_size, nullptr); + if (!status.ok()) { + return status; + } } - // Assign FD with number file_to_import->fd = FileDescriptor(new_file_number, 0, file_to_import->file_size); @@ -240,10 +322,14 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( sst_file_reader.reset(new RandomAccessFileReader( std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_)); + // TODO(yuzhangyu): User-defined timestamps doesn't support importing column + // family. Pass in the correct `user_defined_timestamps_persisted` flag for + // creating `TableReaderOptions` when the support is there. 
status = cfd_->ioptions()->table_factory->NewTableReader( TableReaderOptions( *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor, env_options_, cfd_->internal_comparator(), + sv->mutable_cf_options.block_protection_bytes_per_key, /*skip_filters*/ false, /*immortal*/ false, /*force_direct_prefetch*/ false, /*level*/ -1, /*block_cache_tracer*/ nullptr, @@ -263,37 +349,97 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // Get number of entries in table file_to_import->num_entries = props->num_entries; - ParsedInternalKey key; - ReadOptions ro; - // During reading the external file we can cache blocks that we read into - // the block cache, if we later change the global seqno of this file, we will - // have block in cache that will include keys with wrong seqno. - // We need to disable fill_cache so that we read from the file without - // updating the block cache. - ro.fill_cache = false; - std::unique_ptr iter(table_reader->NewIterator( - ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, - /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); - - // Get first (smallest) key from file - iter->SeekToFirst(); - Status pik_status = - ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); - if (!pik_status.ok()) { - return Status::Corruption("Corrupted Key in external file. ", - pik_status.getState()); - } - file_to_import->smallest_internal_key.SetFrom(key); - - // Get last (largest) key from file - iter->SeekToLast(); - pik_status = - ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors); - if (!pik_status.ok()) { - return Status::Corruption("Corrupted Key in external file. ", - pik_status.getState()); + // If the importing files were exported with Checkpoint::ExportColumnFamily(), + // we cannot simply recompute smallest and largest used to truncate range + // tombstones from file content, and we expect smallest and largest populated + // in file_meta. + if (file_meta.smallest.empty()) { + assert(file_meta.largest.empty()); + // TODO: plumb Env::IOActivity + ReadOptions ro; + std::unique_ptr iter(table_reader->NewIterator( + ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); + + // Get first (smallest) key from file + iter->SeekToFirst(); + bool bound_set = false; + if (iter->Valid()) { + file_to_import->smallest_internal_key.DecodeFrom(iter->key()); + Slice largest; + if (strcmp(cfd_->ioptions()->table_factory->Name(), "PlainTable") == 0) { + // PlainTable iterator does not support SeekToLast(). + largest = iter->key(); + for (; iter->Valid(); iter->Next()) { + if (cfd_->internal_comparator().Compare(iter->key(), largest) > 0) { + largest = iter->key(); + } + } + if (!iter->status().ok()) { + return iter->status(); + } + } else { + iter->SeekToLast(); + if (!iter->Valid()) { + if (iter->status().ok()) { + // The file contains at least 1 key since iter is valid after + // SeekToFirst(). 
+ return Status::Corruption("Can not find largest key in sst file"); + } else { + return iter->status(); + } + } + largest = iter->key(); + } + file_to_import->largest_internal_key.DecodeFrom(largest); + bound_set = true; + } else if (!iter->status().ok()) { + return iter->status(); + } + + std::unique_ptr range_del_iter{ + table_reader->NewRangeTombstoneIterator(ro)}; + if (range_del_iter != nullptr) { + range_del_iter->SeekToFirst(); + if (range_del_iter->Valid()) { + ParsedInternalKey key; + Status pik_status = ParseInternalKey(range_del_iter->key(), &key, + db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone first_tombstone(key, range_del_iter->value()); + InternalKey start_key = first_tombstone.SerializeKey(); + const InternalKeyComparator* icmp = &cfd_->internal_comparator(); + if (!bound_set || + icmp->Compare(start_key, file_to_import->smallest_internal_key) < + 0) { + file_to_import->smallest_internal_key = start_key; + } + + range_del_iter->SeekToLast(); + pik_status = ParseInternalKey(range_del_iter->key(), &key, + db_options_.allow_data_in_errors); + if (!pik_status.ok()) { + return Status::Corruption("Corrupted key in external file. ", + pik_status.getState()); + } + RangeTombstone last_tombstone(key, range_del_iter->value()); + InternalKey end_key = last_tombstone.SerializeEndKey(); + if (!bound_set || + icmp->Compare(end_key, file_to_import->largest_internal_key) > 0) { + file_to_import->largest_internal_key = end_key; + } + bound_set = true; + } + } + assert(bound_set); + } else { + assert(!file_meta.largest.empty()); + file_to_import->smallest_internal_key.DecodeFrom(file_meta.smallest); + file_to_import->largest_internal_key.DecodeFrom(file_meta.largest); } - file_to_import->largest_internal_key.SetFrom(key); file_to_import->cf_id = static_cast(props->column_family_id); @@ -311,5 +457,3 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( return status; } } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/db/import_column_family_job.h b/db/import_column_family_job.h index 57c49c67ffbd..fb41c4b48157 100644 --- a/db/import_column_family_job.h +++ b/db/import_column_family_job.h @@ -25,13 +25,22 @@ class SystemClock; // Imports a set of sst files as is into a new column family. Logic is similar // to ExternalSstFileIngestionJob. 
class ImportColumnFamilyJob { + // All file information of an imported CF, mainly used to + // calculate whether there is overlap between CFs + struct ColumnFamilyIngestFileInfo { + // Smallest internal key in cf + InternalKey smallest_internal_key; + // Largest internal key in cf + InternalKey largest_internal_key; + }; + public: - ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd, - const ImmutableDBOptions& db_options, - const EnvOptions& env_options, - const ImportColumnFamilyOptions& import_options, - const std::vector& metadata, - const std::shared_ptr& io_tracer) + ImportColumnFamilyJob( + VersionSet* versions, ColumnFamilyData* cfd, + const ImmutableDBOptions& db_options, const EnvOptions& env_options, + const ImportColumnFamilyOptions& import_options, + const std::vector>& metadatas, + const std::shared_ptr& io_tracer) : clock_(db_options.clock), versions_(versions), cfd_(cfd), @@ -39,7 +48,7 @@ class ImportColumnFamilyJob { fs_(db_options_.fs, io_tracer), env_options_(env_options), import_options_(import_options), - metadata_(metadata), + metadatas_(metadatas), io_tracer_(io_tracer) {} // Prepare the job by copying external files into the DB. @@ -54,7 +63,7 @@ class ImportColumnFamilyJob { VersionEdit* edit() { return &edit_; } - const autovector& files_to_import() const { + const std::vector>& files_to_import() const { return files_to_import_; } @@ -62,9 +71,9 @@ class ImportColumnFamilyJob { // Open the external file and populate `file_to_import` with all the // external information we need to import this file. Status GetIngestedFileInfo(const std::string& external_file, - uint64_t new_file_number, - IngestedFileInfo* file_to_import, - SuperVersion* sv); + uint64_t new_file_number, SuperVersion* sv, + const LiveFileMetaData& file_meta, + IngestedFileInfo* file_to_import); SystemClock* clock_; VersionSet* versions_; @@ -72,10 +81,10 @@ class ImportColumnFamilyJob { const ImmutableDBOptions& db_options_; const FileSystemPtr fs_; const EnvOptions& env_options_; - autovector files_to_import_; + std::vector> files_to_import_; VersionEdit edit_; const ImportColumnFamilyOptions& import_options_; - std::vector metadata_; + const std::vector> metadatas_; const std::shared_ptr io_tracer_; }; diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc index 0c07ee2a8bf2..f6c1a024839e 100644 --- a/db/import_column_family_test.cc +++ b/db/import_column_family_test.cc @@ -4,7 +4,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
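For orientation, the DB-level entry point that drives this job now accepts several exported snapshots at once. A minimal usage sketch based on the ImportMultiColumnFamilyTest below; `db`, `options`, `metadata_ptr1`, and `metadata_ptr2` are placeholders, and the element type is written as `const ExportImportFilesMetaData*` to match the calls in that test:

    std::vector<const ExportImportFilesMetaData*> metadatas = {metadata_ptr1,
                                                               metadata_ptr2};
    ImportColumnFamilyOptions import_options;
    import_options.move_files = false;
    ColumnFamilyHandle* imported_cfh = nullptr;
    Status s = db->CreateColumnFamilyWithImport(options, "imported_cf",
                                                import_options, metadatas,
                                                &imported_cfh);
    // Fails with InvalidArgument("CFs have overlapping ranges") if the exported
    // key ranges overlap (see ImportMultiColumnFamilyWithOverlap below).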
-#ifndef ROCKSDB_LITE #include @@ -23,10 +22,13 @@ class ImportColumnFamilyTest : public DBTestBase { : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) { sst_files_dir_ = dbname_ + "/sst_files/"; export_files_dir_ = test::PerThreadDBPath(env_, "export"); + export_files_dir2_ = test::PerThreadDBPath(env_, "export2"); + DestroyAndRecreateExternalSSTFilesDir(); import_cfh_ = nullptr; import_cfh2_ = nullptr; metadata_ptr_ = nullptr; + metadata_ptr2_ = nullptr; } ~ImportColumnFamilyTest() { @@ -44,14 +46,21 @@ class ImportColumnFamilyTest : public DBTestBase { delete metadata_ptr_; metadata_ptr_ = nullptr; } + + if (metadata_ptr2_) { + delete metadata_ptr2_; + metadata_ptr2_ = nullptr; + } EXPECT_OK(DestroyDir(env_, sst_files_dir_)); EXPECT_OK(DestroyDir(env_, export_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir2_)); } void DestroyAndRecreateExternalSSTFilesDir() { EXPECT_OK(DestroyDir(env_, sst_files_dir_)); EXPECT_OK(env_->CreateDir(sst_files_dir_)); EXPECT_OK(DestroyDir(env_, export_files_dir_)); + EXPECT_OK(DestroyDir(env_, export_files_dir2_)); } LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path, @@ -70,9 +79,11 @@ class ImportColumnFamilyTest : public DBTestBase { protected: std::string sst_files_dir_; std::string export_files_dir_; + std::string export_files_dir2_; ColumnFamilyHandle* import_cfh_; ColumnFamilyHandle* import_cfh2_; ExportImportFilesMetaData* metadata_ptr_; + ExportImportFilesMetaData* metadata_ptr2_; }; TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) { @@ -281,6 +292,59 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) { } } +TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithRangeTombstone) { + // Test for a bug where import file's smallest and largest key did not + // consider range tombstone. 
+ Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]); + // cf1.sst + const std::string cf1_sst_name = "cf1.sst"; + const std::string cf1_sst = sst_files_dir_ + cf1_sst_name; + ASSERT_OK(sfw_cf1.Open(cf1_sst)); + ASSERT_OK(sfw_cf1.Put("K1", "V1")); + ASSERT_OK(sfw_cf1.Put("K2", "V2")); + ASSERT_OK(sfw_cf1.DeleteRange("K3", "K4")); + ASSERT_OK(sfw_cf1.DeleteRange("K7", "K9")); + + ASSERT_OK(sfw_cf1.Finish()); + + // Import sst file corresponding to cf1 onto a new cf and verify + ExportImportFilesMetaData metadata; + metadata.files.push_back( + LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 0, 19)); + metadata.db_comparator_name = options.comparator->Name(); + + ASSERT_OK(db_->CreateColumnFamilyWithImport( + options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + ColumnFamilyMetaData import_cf_meta; + db_->GetColumnFamilyMetaData(import_cfh_, &import_cf_meta); + ASSERT_EQ(import_cf_meta.file_count, 1); + const SstFileMetaData* file_meta = nullptr; + for (const auto& level_meta : import_cf_meta.levels) { + if (!level_meta.files.empty()) { + file_meta = &(level_meta.files[0]); + break; + } + } + ASSERT_TRUE(file_meta != nullptr); + InternalKey largest; + largest.DecodeFrom(file_meta->largest); + ASSERT_EQ(largest.user_key(), "K9"); + + std::string value; + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value)); + ASSERT_EQ(value, "V1"); + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value)); + ASSERT_EQ(value, "V2"); + ASSERT_OK(db_->DropColumnFamily(import_cfh_)); + ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_)); + import_cfh_ = nullptr; +} + TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) { Options options = CurrentOptions(); CreateAndReopenWithCF({"koko"}, options); @@ -445,6 +509,70 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) { ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); } +TEST_F(ImportColumnFamilyTest, + ImportExportedSSTFromAnotherCFWithRangeTombstone) { + // Test for a bug where import file's smallest and largest key did not + // consider range tombstone. 
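Restating the bound computation these range-tombstone tests exercise, using the keys from ImportSSTFileWriterFilesWithRangeTombstone above (a comment-form sketch, not test code):

    // Point keys written:     K1, K2               -> point-key bounds [K1, K2]
    // Range tombstones added: [K3, K4), [K7, K9)   -> tombstone span  [K3, K9)
    // GetIngestedFileInfo() must take the union, so the imported file's
    // smallest user key stays K1 while its largest becomes K9 (via
    // RangeTombstone::SerializeEndKey()); truncating at K2 would drop the
    // tombstones' coverage after import.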
+ Options options = CurrentOptions(); + options.disable_auto_compactions = true; + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 10; i < 20; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_val")); + } + ASSERT_OK(Flush(1 /* cf */)); + MoveFilesToLevel(1 /* level */, 1 /* cf */); + const Snapshot* snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(25))); + ASSERT_OK(Put(1, Key(1), "t")); + ASSERT_OK(Flush(1)); + // Tests importing a range tombstone only file + ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(2))); + + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + ASSERT_NE(metadata_ptr_, nullptr); + delete checkpoint; + + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + *metadata_ptr_, &import_cfh_)); + ASSERT_NE(import_cfh_, nullptr); + + import_options.move_files = true; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options, + *metadata_ptr_, &import_cfh2_)); + ASSERT_NE(import_cfh2_, nullptr); + delete metadata_ptr_; + metadata_ptr_ = nullptr; + + std::string value1, value2; + ReadOptions ro_latest; + ReadOptions ro_snapshot; + ro_snapshot.snapshot = snapshot; + + for (int i = 10; i < 20; ++i) { + ASSERT_TRUE(db_->Get(ro_latest, import_cfh_, Key(i), &value1).IsNotFound()); + ASSERT_OK(db_->Get(ro_snapshot, import_cfh_, Key(i), &value1)); + ASSERT_EQ(Get(1, Key(i), snapshot), value1); + } + ASSERT_TRUE(db_->Get(ro_latest, import_cfh_, Key(1), &value1).IsNotFound()); + + for (int i = 10; i < 20; ++i) { + ASSERT_TRUE( + db_->Get(ro_latest, import_cfh2_, Key(i), &value1).IsNotFound()); + + ASSERT_OK(db_->Get(ro_snapshot, import_cfh2_, Key(i), &value2)); + ASSERT_EQ(Get(1, Key(i), snapshot), value2); + } + ASSERT_TRUE(db_->Get(ro_latest, import_cfh2_, Key(1), &value1).IsNotFound()); + + db_->ReleaseSnapshot(snapshot); +} + TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) { // Imports a column family containing a level where two files overlap at their // endpoints. "Overlap" means the largest user key in one file is the same as @@ -514,22 +642,22 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { { // Create column family with existing cf name. ExportImportFilesMetaData metadata; - - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("Column family already exists")); + metadata.db_comparator_name = options.comparator->Name(); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(std::strstr(s.getState(), "Column family already exists")); ASSERT_EQ(import_cfh_, nullptr); } { // Import with no files specified. 
ExportImportFilesMetaData metadata; - - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("The list of files is empty")); + metadata.db_comparator_name = options.comparator->Name(); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(std::strstr(s.getState(), "The list of files is empty")); ASSERT_EQ(import_cfh_, nullptr); } @@ -579,10 +707,10 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19)); metadata.db_comparator_name = mismatch_options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::InvalidArgument("Comparator name mismatch")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(std::strstr(s.getState(), "Comparator name mismatch")); ASSERT_EQ(import_cfh_, nullptr); } @@ -604,10 +732,10 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19)); metadata.db_comparator_name = options.comparator->Name(); - ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", - ImportColumnFamilyOptions(), - metadata, &import_cfh_), - Status::IOError("No such file or directory")); + Status s = db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo", + ImportColumnFamilyOptions(), + metadata, &import_cfh_); + ASSERT_TRUE(std::strstr(s.getState(), "No such file or directory")); ASSERT_EQ(import_cfh_, nullptr); // Test successful import after a failure with the same CF name. Ensures @@ -622,6 +750,137 @@ TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) { } } +TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyTest) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_val")); + } + ASSERT_OK(Flush(1)); + + ASSERT_OK( + db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); + + // Overwrite the value in the same set of keys. + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite")); + } + + // Flush again to create another L0 file. It should have higher sequencer. + ASSERT_OK(Flush(1)); + + Checkpoint* checkpoint1; + Checkpoint* checkpoint2; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint1)); + ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + + // Create a new db and import the files. 
+ DB* db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* copy_cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamily(options, "koko", ©_cfh)); + WriteOptions wo; + for (int i = 100; i < 200; ++i) { + ASSERT_OK(db_copy->Put(wo, copy_cfh, Key(i), Key(i) + "_val")); + } + ASSERT_OK(db_copy->Flush(FlushOptions())); + for (int i = 100; i < 200; ++i) { + ASSERT_OK(db_copy->Put(wo, copy_cfh, Key(i), Key(i) + "_overwrite")); + } + ASSERT_OK(db_copy->Flush(FlushOptions())); + for (int i = 100; i < 200; ++i) { + ASSERT_OK(db_copy->Put(wo, copy_cfh, Key(i), Key(i) + "_overwrite2")); + } + ASSERT_OK(db_copy->Flush(FlushOptions())); + + // Flush again to create another L0 file. It should have higher sequencer. + ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2)); + ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_, + &metadata_ptr2_)); + + ASSERT_NE(metadata_ptr_, nullptr); + ASSERT_NE(metadata_ptr2_, nullptr); + delete checkpoint1; + delete checkpoint2; + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + + std::vector metadatas = {metadata_ptr_, + metadata_ptr2_}; + ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + metadatas, &import_cfh_)); + + std::string value1, value2; + for (int i = 0; i < 100; ++i) { + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); + ASSERT_EQ(Get(1, Key(i)), value1); + } + + for (int i = 100; i < 200; ++i) { + ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1)); + ASSERT_OK(db_copy->Get(ReadOptions(), copy_cfh, Key(i), &value2)); + ASSERT_EQ(value1, value2); + } + + ASSERT_OK(db_copy->DropColumnFamily(copy_cfh)); + ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh)); + delete db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); +} + +TEST_F(ImportColumnFamilyTest, ImportMultiColumnFamilyWithOverlap) { + Options options = CurrentOptions(); + CreateAndReopenWithCF({"koko"}, options); + + for (int i = 0; i < 100; ++i) { + ASSERT_OK(Put(1, Key(i), Key(i) + "_val")); + } + + Checkpoint* checkpoint1; + Checkpoint* checkpoint2; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint1)); + ASSERT_OK(checkpoint1->ExportColumnFamily(handles_[1], export_files_dir_, + &metadata_ptr_)); + + // Create a new db and import the files. + DB* db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); + ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy)); + ColumnFamilyHandle* copy_cfh = nullptr; + ASSERT_OK(db_copy->CreateColumnFamily(options, "koko", ©_cfh)); + WriteOptions wo; + for (int i = 50; i < 150; ++i) { + ASSERT_OK(db_copy->Put(wo, copy_cfh, Key(i), Key(i) + "_val")); + } + ASSERT_OK(db_copy->Flush(FlushOptions())); + + // Flush again to create another L0 file. It should have higher sequencer. 
+ ASSERT_OK(Checkpoint::Create(db_copy, &checkpoint2)); + ASSERT_OK(checkpoint2->ExportColumnFamily(copy_cfh, export_files_dir2_, + &metadata_ptr2_)); + + ASSERT_NE(metadata_ptr_, nullptr); + ASSERT_NE(metadata_ptr2_, nullptr); + delete checkpoint1; + delete checkpoint2; + ImportColumnFamilyOptions import_options; + import_options.move_files = false; + + std::vector metadatas = {metadata_ptr_, + metadata_ptr2_}; + + ASSERT_EQ(db_->CreateColumnFamilyWithImport(options, "toto", import_options, + metadatas, &import_cfh_), + Status::InvalidArgument("CFs have overlapping ranges")); + + ASSERT_OK(db_copy->DropColumnFamily(copy_cfh)); + ASSERT_OK(db_copy->DestroyColumnFamilyHandle(copy_cfh)); + delete db_copy; + ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy")); +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { @@ -630,14 +889,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as External SST File Writer and Import are not supported " - "in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/internal_stats.cc b/db/internal_stats.cc index cc2bbfe47b11..d690d3529619 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -23,6 +23,7 @@ #include "cache/cache_entry_stats.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" +#include "db/write_stall_stats.h" #include "port/port.h" #include "rocksdb/system_clock.h" #include "rocksdb/table.h" @@ -32,7 +33,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE const std::map InternalStats::compaction_level_stats = { @@ -78,6 +78,10 @@ const std::map DBStatInfo{"db.user_writes_with_wal"}}, {InternalStats::kIntStatsWriteStallMicros, DBStatInfo{"db.user_write_stall_micros"}}, + {InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts, + DBStatInfo{WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause::kWriteBufferManagerLimit, + WriteStallCondition::kStopped)}}, }; namespace { @@ -244,7 +248,9 @@ static const std::string cfstats = "cfstats"; static const std::string cfstats_no_file_histogram = "cfstats-no-file-histogram"; static const std::string cf_file_histogram = "cf-file-histogram"; +static const std::string cf_write_stall_stats = "cf-write-stall-stats"; static const std::string dbstats = "dbstats"; +static const std::string db_write_stall_stats = "db-write-stall-stats"; static const std::string levelstats = "levelstats"; static const std::string block_cache_entry_stats = "block-cache-entry-stats"; static const std::string fast_block_cache_entry_stats = @@ -287,6 +293,7 @@ static const std::string total_sst_files_size = "total-sst-files-size"; static const std::string live_non_bottommost_sst_files_size = "live-non-bottommost-sst-files-size"; static const std::string live_sst_files_size = "live-sst-files-size"; +static const std::string obsolete_sst_files_size = "obsolete-sst-files-size"; static const std::string live_sst_files_size_at_temperature = "live-sst-files-size-at-temperature"; static const std::string estimate_pending_comp_bytes = @@ -326,6 +333,10 @@ const std::string DB::Properties::kCFStatsNoFileHistogram = rocksdb_prefix + cfstats_no_file_histogram; const std::string DB::Properties::kCFFileHistogram = rocksdb_prefix + cf_file_histogram; +const std::string DB::Properties::kCFWriteStallStats = + rocksdb_prefix + cf_write_stall_stats; +const std::string DB::Properties::kDBWriteStallStats = + rocksdb_prefix + db_write_stall_stats; const std::string 
DB::Properties::kDBStats = rocksdb_prefix + dbstats; const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats; const std::string DB::Properties::kBlockCacheEntryStats = @@ -388,6 +399,8 @@ const std::string DB::Properties::kLiveNonBottommostSstFilesSize = rocksdb_prefix + live_non_bottommost_sst_files_size; const std::string DB::Properties::kLiveSstFilesSize = rocksdb_prefix + live_sst_files_size; +const std::string DB::Properties::kObsoleteSstFilesSize = + rocksdb_prefix + obsolete_sst_files_size; const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str; const std::string DB::Properties::kEstimatePendingCompactionBytes = rocksdb_prefix + estimate_pending_comp_bytes; @@ -455,9 +468,15 @@ const UnorderedMap {DB::Properties::kCFFileHistogram, {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr, nullptr}}, + {DB::Properties::kCFWriteStallStats, + {false, &InternalStats::HandleCFWriteStallStats, nullptr, + &InternalStats::HandleCFWriteStallStatsMap, nullptr}}, {DB::Properties::kDBStats, {false, &InternalStats::HandleDBStats, nullptr, &InternalStats::HandleDBMapStats, nullptr}}, + {DB::Properties::kDBWriteStallStats, + {false, &InternalStats::HandleDBWriteStallStats, nullptr, + &InternalStats::HandleDBWriteStallStatsMap, nullptr}}, {DB::Properties::kBlockCacheEntryStats, {true, &InternalStats::HandleBlockCacheEntryStats, nullptr, &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}}, @@ -556,6 +575,9 @@ const UnorderedMap {DB::Properties::kLiveSstFilesSizeAtTemperature, {false, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr, nullptr, nullptr}}, + {DB::Properties::kObsoleteSstFilesSize, + {false, nullptr, &InternalStats::HandleObsoleteSstFilesSize, nullptr, + nullptr}}, {DB::Properties::kEstimatePendingCompactionBytes, {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes, nullptr, nullptr}}, @@ -666,11 +688,6 @@ void InternalStats::CollectCacheEntryStats(bool foreground) { min_interval_factor); } -std::function Blah() { - static int x = 42; - return [&]() { ++x; }; -} - std::function InternalStats::CacheEntryRoleStats::GetEntryCallback() { @@ -696,6 +713,7 @@ void InternalStats::CacheEntryRoleStats::BeginCollection( cache_usage = cache->GetUsage(); table_size = cache->GetTableAddressCount(); occupancy = cache->GetOccupancyCount(); + hash_seed = cache->GetHashSeed(); } void InternalStats::CacheEntryRoleStats::EndCollection( @@ -720,7 +738,7 @@ std::string InternalStats::CacheEntryRoleStats::ToString( std::ostringstream str; str << "Block cache " << cache_id << " capacity: " << BytesToHumanString(cache_capacity) - << " usage: " << BytesToHumanString(cache_usage) + << " seed: " << hash_seed << " usage: " << BytesToHumanString(cache_usage) << " table_size: " << table_size << " occupancy: " << occupancy << " collections: " << collection_count << " last_copies: " << copies_of_last_collection @@ -1100,6 +1118,18 @@ bool InternalStats::HandleCFFileHistogram(std::string* value, return true; } +bool InternalStats::HandleCFWriteStallStats(std::string* value, + Slice /*suffix*/) { + DumpCFStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleCFWriteStallStatsMap( + std::map* value, Slice /*suffix*/) { + DumpCFMapStatsWriteStall(value); + return true; +} + bool InternalStats::HandleDBMapStats( std::map* db_stats, Slice /*suffix*/) { DumpDBMapStats(db_stats); @@ -1111,6 +1141,18 @@ bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) { return true; } +bool 
InternalStats::HandleDBWriteStallStats(std::string* value, + Slice /*suffix*/) { + DumpDBStatsWriteStall(value); + return true; +} + +bool InternalStats::HandleDBWriteStallStatsMap( + std::map* value, Slice /*suffix*/) { + DumpDBMapStatsWriteStall(value); + return true; +} + bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { auto* current = cfd_->current(); *value = current->DebugString(true, true); @@ -1120,7 +1162,9 @@ bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { bool InternalStats::HandleAggregatedTableProperties(std::string* value, Slice /*suffix*/) { std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { return false; } @@ -1140,7 +1184,9 @@ static std::map MapUint64ValuesToString( bool InternalStats::HandleAggregatedTablePropertiesMap( std::map* values, Slice /*suffix*/) { std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { return false; } @@ -1156,8 +1202,10 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, return false; } std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); + read_options, &tp, static_cast(level)); if (!s.ok()) { return false; } @@ -1173,8 +1221,10 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( return false; } std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); + read_options, &tp, static_cast(level)); if (!s.ok()) { return false; } @@ -1365,6 +1415,12 @@ bool InternalStats::HandleLiveNonBottommostSstFilesSize(uint64_t* value, return true; } +bool InternalStats::HandleObsoleteSstFilesSize(uint64_t* value, DBImpl* db, + Version* /*version*/) { + *value = db->GetObsoleteSstFilesSize(); + return true; +} + bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* /*db*/, Version* /*version*/) { @@ -1376,7 +1432,11 @@ bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* /*db*/, Version* version) { - *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders(); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + *value = (version == nullptr) + ? 
0 + : version->GetMemoryUsageByTableReaders(read_options); return true; } @@ -1427,9 +1487,10 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, ->compaction_options_fifo.allow_compaction) { return false; } - + // TODO: plumb Env::IOActivity + const ReadOptions read_options; TablePropertiesCollection collection; - auto s = cfd_->current()->GetPropertiesOfAllTables(&collection); + auto s = cfd_->current()->GetPropertiesOfAllTables(read_options, &collection); if (!s.ok()) { return false; } @@ -1600,6 +1661,10 @@ void InternalStats::DumpDBStats(std::string* value) { 10000.0 / std::max(interval_seconds_up, 0.001)); value->append(buf); + std::string write_stall_stats; + DumpDBStatsWriteStall(&write_stall_stats); + value->append(write_stall_stats); + db_stats_snapshot_.seconds_up = seconds_up; db_stats_snapshot_.ingest_bytes = user_bytes_written; db_stats_snapshot_.write_other = write_other; @@ -1611,6 +1676,58 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.write_stall_micros = write_stall_micros; } +void InternalStats::DumpDBMapStatsWriteStall( + std::map* value) { + constexpr uint32_t max_db_scope_write_stall_cause = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax); + + for (uint32_t i = + max_db_scope_write_stall_cause - kNumDBScopeWriteStallCauses; + i < max_db_scope_write_stall_cause; ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + InternalStats::InternalDBStatsType internal_db_stat = + InternalDBStat(cause, condition); + + if (internal_db_stat == InternalStats::kIntStatsNumMax) { + continue; + } + + std::string name = + WriteStallStatsMapKeys::CauseConditionCount(cause, condition); + uint64_t stat = + db_stats_[static_cast(internal_db_stat)].load( + std::memory_order_relaxed); + (*value)[name] = std::to_string(stat); + } + } +} + +void InternalStats::DumpDBStatsWriteStall(std::string* value) { + assert(value); + + std::map write_stall_stats_map; + DumpDBMapStatsWriteStall(&write_stall_stats_map); + + std::ostringstream str; + str << "Write Stall (count): "; + + for (auto write_stall_stats_map_iter = write_stall_stats_map.begin(); + write_stall_stats_map_iter != write_stall_stats_map.end(); + write_stall_stats_map_iter++) { + const auto& name_and_stat = *write_stall_stats_map_iter; + str << name_and_stat.first << ": " << name_and_stat.second; + if (std::next(write_stall_stats_map_iter) == write_stall_stats_map.end()) { + str << "\n"; + } else { + str << ", "; + } + } + *value = str.str(); +} + /** * Dump Compaction Level stats to a map of stat name with "compaction." prefix * to value in double as string. The level in stat name is represented with @@ -1637,7 +1754,7 @@ void InternalStats::DumpCFMapStats( } } - DumpCFMapStatsIOStalls(cf_stats); + DumpCFMapStatsWriteStall(cf_stats); } void InternalStats::DumpCFMapStats( @@ -1647,7 +1764,7 @@ void InternalStats::DumpCFMapStats( assert(vstorage); int num_levels_to_check = - (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) + (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) ? 
vstorage->num_levels() - 1 : 1; @@ -1729,36 +1846,89 @@ void InternalStats::DumpCFMapStatsByPriority( } } -void InternalStats::DumpCFMapStatsIOStalls( - std::map* cf_stats) { - (*cf_stats)["io_stalls.level0_slowdown"] = - std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]); - (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] = - std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]); - (*cf_stats)["io_stalls.level0_numfiles"] = - std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]); - (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] = - std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]); - (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] = - std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]); - (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] = - std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]); - (*cf_stats)["io_stalls.memtable_compaction"] = - std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]); - (*cf_stats)["io_stalls.memtable_slowdown"] = - std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]); - - uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] + - cf_stats_count_[MEMTABLE_LIMIT_STOPS]; - - uint64_t total_slowdown = - cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] + - cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]; - - (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop); - (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown); +void InternalStats::DumpCFMapStatsWriteStall( + std::map* value) { + uint64_t total_delays = 0; + uint64_t total_stops = 0; + constexpr uint32_t max_cf_scope_write_stall_cause = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax); + + for (uint32_t i = + max_cf_scope_write_stall_cause - kNumCFScopeWriteStallCauses; + i < max_cf_scope_write_stall_cause; ++i) { + for (uint32_t j = 0; + j < static_cast(WriteStallCondition::kNormal); ++j) { + WriteStallCause cause = static_cast(i); + WriteStallCondition condition = static_cast(j); + InternalStats::InternalCFStatsType internal_cf_stat = + InternalCFStat(cause, condition); + + if (internal_cf_stat == InternalStats::INTERNAL_CF_STATS_ENUM_MAX) { + continue; + } + + std::string name = + WriteStallStatsMapKeys::CauseConditionCount(cause, condition); + uint64_t stat = + cf_stats_count_[static_cast(internal_cf_stat)]; + (*value)[name] = std::to_string(stat); + + if (condition == WriteStallCondition::kDelayed) { + total_delays += stat; + } else if (condition == WriteStallCondition::kStopped) { + total_stops += stat; + } + } + } + + (*value)[WriteStallStatsMapKeys:: + CFL0FileCountLimitDelaysWithOngoingCompaction()] = + std::to_string( + cf_stats_count_[L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION]); + (*value)[WriteStallStatsMapKeys:: + CFL0FileCountLimitStopsWithOngoingCompaction()] = + std::to_string( + cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION]); + + (*value)[WriteStallStatsMapKeys::TotalStops()] = std::to_string(total_stops); + (*value)[WriteStallStatsMapKeys::TotalDelays()] = + std::to_string(total_delays); +} + +void InternalStats::DumpCFStatsWriteStall(std::string* value, + uint64_t* total_stall_count) { + assert(value); + + std::map write_stall_stats_map; + DumpCFMapStatsWriteStall(&write_stall_stats_map); + + std::ostringstream str; + str << "Write Stall (count): "; + + for 
(auto write_stall_stats_map_iter = write_stall_stats_map.begin(); + write_stall_stats_map_iter != write_stall_stats_map.end(); + write_stall_stats_map_iter++) { + const auto& name_and_stat = *write_stall_stats_map_iter; + str << name_and_stat.first << ": " << name_and_stat.second; + if (std::next(write_stall_stats_map_iter) == write_stall_stats_map.end()) { + str << "\n"; + } else { + str << ", "; + } + } + + if (total_stall_count) { + *total_stall_count = + ParseUint64( + write_stall_stats_map[WriteStallStatsMapKeys::TotalStops()]) + + ParseUint64( + write_stall_stats_map[WriteStallStatsMapKeys::TotalDelays()]); + if (*total_stall_count > 0) { + str << "interval: " << *total_stall_count - cf_stats_snapshot_.stall_count + << " total count\n"; + } + } + *value = str.str(); } void InternalStats::DumpCFStats(std::string* value) { @@ -1796,14 +1966,6 @@ void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, uint64_t ingest_l0_files_addfile = cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL]; uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL]; - // Cumulative summary - uint64_t total_stall_count = - cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] + - cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] + - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] + - cf_stats_count_[MEMTABLE_LIMIT_STOPS] + - cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]; // Interval summary uint64_t interval_flush_ingest = flush_ingest - cf_stats_snapshot_.ingest_bytes_flush; @@ -1925,34 +2087,10 @@ void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic, cf_stats_snapshot_.compact_micros = compact_micros; } - snprintf(buf, sizeof(buf), - "Stalls(count): %" PRIu64 - " level0_slowdown, " - "%" PRIu64 - " level0_slowdown_with_compaction, " - "%" PRIu64 - " level0_numfiles, " - "%" PRIu64 - " level0_numfiles_with_compaction, " - "%" PRIu64 - " stop for pending_compaction_bytes, " - "%" PRIu64 - " slowdown for pending_compaction_bytes, " - "%" PRIu64 - " memtable_compaction, " - "%" PRIu64 - " memtable_slowdown, " - "interval %" PRIu64 " total count\n", - cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS], - cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS], - cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS], - cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS], - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS], - cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS], - cf_stats_count_[MEMTABLE_LIMIT_STOPS], - cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS], - total_stall_count - cf_stats_snapshot_.stall_count); - value->append(buf); + std::string write_stall_stats; + uint64_t total_stall_count; + DumpCFStatsWriteStall(&write_stall_stats, &total_stall_count); + value->append(write_stall_stats); if (is_periodic) { cf_stats_snapshot_.seconds_up = seconds_up; @@ -2005,12 +2143,5 @@ void InternalStats::DumpCFFileHistogram(std::string* value) { value->append(oss.str()); } -#else - -const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) { - return nullptr; -} - -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/internal_stats.h b/db/internal_stats.h index 7ab6437c4d86..58275b145db5 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -60,7 +60,6 @@ struct DBPropertyInfo { extern const DBPropertyInfo* GetPropertyInfo(const Slice& property); -#ifndef ROCKSDB_LITE #undef SCORE enum class LevelStatType { INVALID = 0, @@ -105,15 +104,20 @@ class InternalStats { static const std::map compaction_level_stats; 
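The new kCFWriteStallStats, kDBWriteStallStats, and kObsoleteSstFilesSize properties wired up above can be read through the existing property getters. A minimal sketch, assuming an open DB and a column family handle; the exact map keys come from WriteStallStatsMapKeys and are not hard-coded here.

#include <cinttypes>
#include <cstdio>
#include <map>
#include <string>

#include "rocksdb/db.h"

void DumpWriteStallProperties(ROCKSDB_NAMESPACE::DB* db,
                              ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf) {
  using ROCKSDB_NAMESPACE::DB;

  // Human-readable one-line summary, e.g. "Write Stall (count): ...".
  std::string cf_stalls;
  db->GetProperty(cf, DB::Properties::kCFWriteStallStats, &cf_stalls);

  // Structured form: one entry per (cause, condition) pair, plus totals.
  std::map<std::string, std::string> db_stalls;
  db->GetMapProperty(db->DefaultColumnFamily(),
                     DB::Properties::kDBWriteStallStats, &db_stalls);

  // Total size of SST files that are obsolete but not yet deleted.
  uint64_t obsolete_bytes = 0;
  db->GetIntProperty(DB::Properties::kObsoleteSstFilesSize, &obsolete_bytes);

  fprintf(stderr, "%s", cf_stalls.c_str());
  for (const auto& kv : db_stalls) {
    fprintf(stderr, "%s: %s\n", kv.first.c_str(), kv.second.c_str());
  }
  fprintf(stderr, "obsolete sst bytes: %" PRIu64 "\n", obsolete_bytes);
}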
enum InternalCFStatsType { - L0_FILE_COUNT_LIMIT_SLOWDOWNS, - LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, + MEMTABLE_LIMIT_DELAYS, MEMTABLE_LIMIT_STOPS, - MEMTABLE_LIMIT_SLOWDOWNS, + L0_FILE_COUNT_LIMIT_DELAYS, L0_FILE_COUNT_LIMIT_STOPS, - LOCKED_L0_FILE_COUNT_LIMIT_STOPS, - PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, + PENDING_COMPACTION_BYTES_LIMIT_DELAYS, PENDING_COMPACTION_BYTES_LIMIT_STOPS, + // Write slowdown caused by l0 file count limit while there is ongoing L0 + // compaction + L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION, + // Write stop caused by l0 file count limit while there is ongoing L0 + // compaction + L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION, WRITE_STALLS_ENUM_MAX, + // End of all write stall stats BYTES_FLUSHED, BYTES_INGESTED_ADD_FILE, INGESTED_NUM_FILES_TOTAL, @@ -130,7 +134,18 @@ class InternalStats { kIntStatsWriteDoneByOther, kIntStatsWriteDoneBySelf, kIntStatsWriteWithWal, + // TODO(hx235): Currently `kIntStatsWriteStallMicros` only measures + // "delayed" time of CF-scope write stalls, not including the "stopped" time + // nor any DB-scope write stalls (e.g, ones triggered by + // `WriteBufferManager`). + // + // However, the word "write stall" includes both "delayed" and "stopped" + // (see `WriteStallCondition`) and DB-scope writes stalls (see + // `WriteStallCause`). + // + // So we should improve, rename or clarify it kIntStatsWriteStallMicros, + kIntStatsWriteBufferManagerLimitStopsCounts, kIntStatsNumMax, }; @@ -463,6 +478,7 @@ class InternalStats { uint32_t copies_of_last_collection = 0; uint64_t last_start_time_micros_ = 0; uint64_t last_end_time_micros_ = 0; + uint32_t hash_seed = 0; void Clear() { // Wipe everything except collection_count @@ -600,6 +616,10 @@ class InternalStats { private: void DumpDBMapStats(std::map* db_stats); void DumpDBStats(std::string* value); + + void DumpDBMapStatsWriteStall(std::map* value); + void DumpDBStatsWriteStall(std::string* value); + void DumpCFMapStats(std::map* cf_stats); void DumpCFMapStats( const VersionStorageInfo* vstorage, @@ -607,7 +627,6 @@ class InternalStats { CompactionStats* compaction_stats_sum); void DumpCFMapStatsByPriority( std::map>* priorities_stats); - void DumpCFMapStatsIOStalls(std::map* cf_stats); void DumpCFStats(std::string* value); // if is_periodic = true, it is an internal call by RocksDB periodically to // dump the status. @@ -616,6 +635,10 @@ class InternalStats { // dump the status. void DumpCFFileHistogram(std::string* value); + void DumpCFMapStatsWriteStall(std::map* value); + void DumpCFStatsWriteStall(std::string* value, + uint64_t* total_stall_count = nullptr); + Cache* GetBlockCacheForStats(); Cache* GetBlobCacheForStats(); @@ -649,7 +672,7 @@ class InternalStats { // ColumnFamily-level stats CompactionStats comp_stats; uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush) - uint64_t stall_count; // Stall count + uint64_t stall_count; // Total counts of CF-scope write stalls // Stats from compaction jobs - bytes written, bytes read, duration. 
uint64_t compact_bytes_write; uint64_t compact_bytes_read; @@ -744,9 +767,15 @@ class InternalStats { bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix); bool HandleCFFileHistogram(std::string* value, Slice suffix); bool HandleCFStatsPeriodic(std::string* value, Slice suffix); + bool HandleCFWriteStallStats(std::string* value, Slice suffix); + bool HandleCFWriteStallStatsMap(std::map* values, + Slice suffix); bool HandleDBMapStats(std::map* compaction_stats, Slice suffix); bool HandleDBStats(std::string* value, Slice suffix); + bool HandleDBWriteStallStats(std::string* value, Slice suffix); + bool HandleDBWriteStallStatsMap(std::map* values, + Slice suffix); bool HandleSsTables(std::string* value, Slice suffix); bool HandleAggregatedTableProperties(std::string* value, Slice suffix); bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); @@ -792,6 +821,8 @@ class InternalStats { bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version); bool HandleLiveNonBottommostSstFilesSize(uint64_t* value, DBImpl* db, Version* version); + bool HandleObsoleteSstFilesSize(uint64_t* value, DBImpl* db, + Version* version); bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db, Version* version); bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, @@ -844,155 +875,4 @@ class InternalStats { uint64_t started_at_; }; -#else - -class InternalStats { - public: - enum InternalCFStatsType { - L0_FILE_COUNT_LIMIT_SLOWDOWNS, - LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, - MEMTABLE_LIMIT_STOPS, - MEMTABLE_LIMIT_SLOWDOWNS, - L0_FILE_COUNT_LIMIT_STOPS, - LOCKED_L0_FILE_COUNT_LIMIT_STOPS, - PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, - PENDING_COMPACTION_BYTES_LIMIT_STOPS, - WRITE_STALLS_ENUM_MAX, - BYTES_FLUSHED, - BYTES_INGESTED_ADD_FILE, - INGESTED_NUM_FILES_TOTAL, - INGESTED_LEVEL0_NUM_FILES_TOTAL, - INGESTED_NUM_KEYS_TOTAL, - INTERNAL_CF_STATS_ENUM_MAX, - }; - - enum InternalDBStatsType { - kIntStatsWalFileBytes, - kIntStatsWalFileSynced, - kIntStatsBytesWritten, - kIntStatsNumKeysWritten, - kIntStatsWriteDoneByOther, - kIntStatsWriteDoneBySelf, - kIntStatsWriteWithWal, - kIntStatsWriteStallMicros, - kIntStatsNumMax, - }; - - InternalStats(int /*num_levels*/, SystemClock* /*clock*/, - ColumnFamilyData* /*cfd*/) {} - - // Per level compaction stats - struct CompactionOutputsStats { - uint64_t num_output_records = 0; - uint64_t bytes_written = 0; - uint64_t bytes_written_blob = 0; - uint64_t num_output_files = 0; - uint64_t num_output_files_blob = 0; - - void Add(const CompactionOutputsStats& stats) { - this->num_output_records += stats.num_output_records; - this->bytes_written += stats.bytes_written; - this->bytes_written_blob += stats.bytes_written_blob; - this->num_output_files += stats.num_output_files; - this->num_output_files_blob += stats.num_output_files_blob; - } - }; - - struct CompactionStats { - uint64_t micros; - uint64_t cpu_micros; - uint64_t bytes_read_non_output_levels; - uint64_t bytes_read_output_level; - uint64_t bytes_read_blob; - uint64_t bytes_written; - uint64_t bytes_written_blob; - uint64_t bytes_moved; - int num_input_files_in_non_output_levels; - int num_input_files_in_output_level; - int num_output_files; - int num_output_files_blob; - uint64_t num_input_records; - uint64_t num_dropped_records; - uint64_t num_output_records; - int count; - - explicit CompactionStats() {} - - explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {} - - explicit CompactionStats(const CompactionStats& /*c*/) {} - - void 
Add(const CompactionStats& /*c*/) {} - - void Add(const CompactionOutputsStats& /*c*/) {} - - void Subtract(const CompactionStats& /*c*/) {} - }; - - struct CompactionStatsFull { - // the stats for the target primary output level (per level stats) - CompactionStats stats; - - // stats for output_to_penultimate_level level (per level stats) - bool has_penultimate_level_output = false; - CompactionStats penultimate_level_stats; - - explicit CompactionStatsFull(){}; - - explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){}; - - uint64_t TotalBytesWritten() const { return 0; } - - uint64_t DroppedRecords() { return 0; } - - void SetMicros(uint64_t /*val*/){}; - - void AddCpuMicros(uint64_t /*val*/){}; - }; - - void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/, - const CompactionStats& /*stats*/) {} - - void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/, - const CompactionStatsFull& /*unmerged_stats*/) {} - - void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {} - - void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {} - - void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/, - bool /*concurrent */ = false) {} - - HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; } - - HistogramImpl* GetBlobFileReadHist() { return nullptr; } - - uint64_t GetBackgroundErrorCount() const { return 0; } - - uint64_t BumpAndGetBackgroundErrorCount() { return 0; } - - bool GetStringProperty(const DBPropertyInfo& /*property_info*/, - const Slice& /*property*/, std::string* /*value*/) { - return false; - } - - bool GetMapProperty(const DBPropertyInfo& /*property_info*/, - const Slice& /*property*/, - std::map* /*value*/) { - return false; - } - - bool GetIntProperty(const DBPropertyInfo& /*property_info*/, - uint64_t* /*value*/, DBImpl* /*db*/) const { - return false; - } - - bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/, - Version* /*version*/, - uint64_t* /*value*/) const { - return false; - } -}; -#endif // !ROCKSDB_LITE - } // namespace ROCKSDB_NAMESPACE diff --git a/db/job_context.h b/db/job_context.h index 352c58e82391..48728f48d6bb 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -15,6 +15,7 @@ #include "db/column_family.h" #include "db/log_writer.h" #include "db/version_set.h" +#include "util/autovector.h" namespace ROCKSDB_NAMESPACE { @@ -64,7 +65,7 @@ struct SuperVersionContext { WriteStallCondition new_cond, const std::string& name, const ImmutableOptions* ioptions) { -#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +#if !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) WriteStallNotification notif; notif.write_stall_info.cf_name = name; notif.write_stall_info.condition.prev = old_cond; @@ -76,12 +77,11 @@ struct SuperVersionContext { (void)new_cond; (void)name; (void)ioptions; -#endif // !defined(ROCKSDB_LITE) && - // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +#endif // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) } void Clean() { -#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) +#if !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION) // notify listeners on changed write stall conditions for (auto& notif : write_stall_notifications) { for (auto& listener : notif.immutable_options->listeners) { @@ -89,7 +89,7 @@ struct SuperVersionContext { } } write_stall_notifications.clear(); -#endif // !ROCKSDB_LITE +#endif // free superversions for (auto s : superversions_to_free) { delete s; @@ -171,6 +171,16 @@ struct JobContext { // will be reused 
later std::vector log_recycle_files; + // Files quarantined from deletion. This list contains file numbers for files + // that are in an ambiguous states. This includes newly generated SST files + // and blob files from flush and compaction job whose VersionEdits' persist + // state in Manifest are unclear. An old manifest file whose immediately + // following new manifest file's CURRENT file creation is in an unclear state. + // WAL logs don't have this premature deletion risk since + // min_log_number_to_keep is only updated after successful manifest commits. + // So this data structure doesn't track log files. + autovector files_to_quarantine; + // a list of manifest files that we need to delete std::vector manifest_delete_files; diff --git a/db/kv_checksum.h b/db/kv_checksum.h index bce507fcf983..53c02485ffa7 100644 --- a/db/kv_checksum.h +++ b/db/kv_checksum.h @@ -46,6 +46,8 @@ template class ProtectionInfoKVOC; template class ProtectionInfoKVOS; +template +class ProtectionInfoKV; // Aliases for 64-bit protection infos. using ProtectionInfo64 = ProtectionInfo; @@ -64,13 +66,13 @@ class ProtectionInfo { ProtectionInfoKVO ProtectKVO(const SliceParts& key, const SliceParts& value, ValueType op_type) const; - - T GetVal() const { return val_; } + ProtectionInfoKV ProtectKV(const Slice& key, const Slice& value) const; private: friend class ProtectionInfoKVO; friend class ProtectionInfoKVOS; friend class ProtectionInfoKVOC; + friend class ProtectionInfoKV; // Each field is hashed with an independent value so we can catch fields being // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall, @@ -89,8 +91,47 @@ class ProtectionInfo { static_assert(sizeof(ProtectionInfo) == sizeof(T), ""); } + T GetVal() const { return val_; } void SetVal(T val) { val_ = val; } + void Encode(uint8_t len, char* dst) const { + assert(sizeof(val_) >= len); + switch (len) { + case 1: + dst[0] = static_cast(val_); + break; + case 2: + EncodeFixed16(dst, static_cast(val_)); + break; + case 4: + EncodeFixed32(dst, static_cast(val_)); + break; + case 8: + EncodeFixed64(dst, static_cast(val_)); + break; + default: + assert(false); + } + } + + bool Verify(uint8_t len, const char* checksum_ptr) const { + assert(sizeof(val_) >= len); + switch (len) { + case 1: + return static_cast(checksum_ptr[0]) == + static_cast(val_); + case 2: + return DecodeFixed16(checksum_ptr) == static_cast(val_); + case 4: + return DecodeFixed32(checksum_ptr) == static_cast(val_); + case 8: + return DecodeFixed64(checksum_ptr) == static_cast(val_); + default: + assert(false); + return false; + } + } + T val_ = 0; }; @@ -113,7 +154,14 @@ class ProtectionInfoKVO { void UpdateV(const SliceParts& old_value, const SliceParts& new_value); void UpdateO(ValueType old_op_type, ValueType new_op_type); - T GetVal() const { return info_.GetVal(); } + // Encode this protection info into `len` bytes and stores them in `dst`. + void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); } + // Verify this protection info against the protection info encoded by Encode() + // at the first `len` bytes of `checksum_ptr`. + // Returns true iff the verification is successful. 
+ bool Verify(uint8_t len, const char* checksum_ptr) const { + return info_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfo; @@ -124,6 +172,7 @@ class ProtectionInfoKVO { static_assert(sizeof(ProtectionInfoKVO) == sizeof(T), ""); } + T GetVal() const { return info_.GetVal(); } void SetVal(T val) { info_.SetVal(val); } ProtectionInfo info_; @@ -154,7 +203,10 @@ class ProtectionInfoKVOC { void UpdateC(ColumnFamilyId old_column_family_id, ColumnFamilyId new_column_family_id); - T GetVal() const { return kvo_.GetVal(); } + void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return kvo_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfoKVO; @@ -163,6 +215,7 @@ class ProtectionInfoKVOC { static_assert(sizeof(ProtectionInfoKVOC) == sizeof(T), ""); } + T GetVal() const { return kvo_.GetVal(); } void SetVal(T val) { kvo_.SetVal(val); } ProtectionInfoKVO kvo_; @@ -193,7 +246,10 @@ class ProtectionInfoKVOS { void UpdateS(SequenceNumber old_sequence_number, SequenceNumber new_sequence_number); - T GetVal() const { return kvo_.GetVal(); } + void Encode(uint8_t len, char* dst) const { kvo_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return kvo_.Verify(len, checksum_ptr); + } private: friend class ProtectionInfoKVO; @@ -202,11 +258,32 @@ class ProtectionInfoKVOS { static_assert(sizeof(ProtectionInfoKVOS) == sizeof(T), ""); } + T GetVal() const { return kvo_.GetVal(); } void SetVal(T val) { kvo_.SetVal(val); } ProtectionInfoKVO kvo_; }; +template +class ProtectionInfoKV { + public: + ProtectionInfoKV() = default; + + void Encode(uint8_t len, char* dst) const { info_.Encode(len, dst); } + bool Verify(uint8_t len, const char* checksum_ptr) const { + return info_.Verify(len, checksum_ptr); + } + + private: + friend class ProtectionInfo; + + explicit ProtectionInfoKV(T val) : info_(val) { + static_assert(sizeof(ProtectionInfoKV) == sizeof(T)); + } + + ProtectionInfo info_; +}; + template Status ProtectionInfo::GetStatus() const { if (val_ != 0) { @@ -244,6 +321,16 @@ ProtectionInfoKVO ProtectionInfo::ProtectKVO(const SliceParts& key, return ProtectionInfoKVO(val); } +template +ProtectionInfoKV ProtectionInfo::ProtectKV(const Slice& key, + const Slice& value) const { + T val = GetVal(); + val = val ^ static_cast(GetSliceNPHash64(key, ProtectionInfo::kSeedK)); + val = + val ^ static_cast(GetSliceNPHash64(value, ProtectionInfo::kSeedV)); + return ProtectionInfoKV(val); +} + template void ProtectionInfoKVO::UpdateK(const Slice& old_key, const Slice& new_key) { T val = GetVal(); @@ -394,5 +481,4 @@ void ProtectionInfoKVOS::UpdateS(SequenceNumber old_sequence_number, sizeof(new_sequence_number), ProtectionInfo::kSeedS)); SetVal(val); } - } // namespace ROCKSDB_NAMESPACE diff --git a/db/listener_test.cc b/db/listener_test.cc index 160866bb774f..41577b92c179 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
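To make the intent of the Encode()/Verify() additions to the protection info classes concrete, here is a minimal sketch of a per-key-value checksum round trip using the 64-bit aliases. These are internal headers (db/kv_checksum.h, db/dbformat.h), so this is illustrative usage rather than a public API example.

#include <cassert>
#include <cstdint>

#include "db/dbformat.h"
#include "db/kv_checksum.h"

namespace ROCKSDB_NAMESPACE {

void ProtectionInfoRoundTrip(const Slice& key, const Slice& value) {
  ProtectionInfo64 pi;  // 64-bit ProtectionInfo alias
  ProtectionInfoKVO64 kvo = pi.ProtectKVO(key, value, kTypeValue);

  // Persist the protection value as an 8-byte checksum next to the entry...
  char checksum[sizeof(uint64_t)];
  kvo.Encode(sizeof(checksum), checksum);

  // ...and re-check it on the read path. Any change to the key, value, or op
  // type would make Verify() against the stale checksum return false.
  assert(kvo.Verify(sizeof(checksum), checksum));
}

}  // namespace ROCKSDB_NAMESPACE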
+#include + #include "db/blob/blob_index.h" #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" @@ -10,7 +12,7 @@ #include "db/version_set.h" #include "db/write_batch_internal.h" #include "file/filename.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -27,11 +29,10 @@ #include "test_util/testutil.h" #include "util/hash.h" #include "util/mutexlock.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" #include "util/string_util.h" #include "utilities/merge_operators.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { @@ -550,6 +551,7 @@ class TestCompactionReasonListener : public EventListener { TEST_F(EventListenerTest, CompactionReasonLevel) { Options options; + options.level_compaction_dynamic_level_bytes = false; options.env = CurrentOptions().env; options.create_if_missing = true; options.memtable_factory.reset(test::NewSpecialSkipListFactory( @@ -580,7 +582,7 @@ TEST_F(EventListenerTest, CompactionReasonLevel) { for (int k = 1; k <= 30; k++) { ASSERT_OK(Put(Key(k), Key(k))); if (k % 10 == 0) { - Flush(); + ASSERT_OK(Flush()); } } @@ -707,27 +709,28 @@ TEST_F(EventListenerTest, CompactionReasonFIFO) { class TableFileCreationListener : public EventListener { public: - class TestEnv : public EnvWrapper { + class TestFS : public FileSystemWrapper { public: - explicit TestEnv(Env* t) : EnvWrapper(t) {} + explicit TestFS(const std::shared_ptr& t) + : FileSystemWrapper(t) {} static const char* kClassName() { return "TestEnv"; } const char* Name() const override { return kClassName(); } - void SetStatus(Status s) { status_ = s; } + void SetStatus(IOStatus s) { status_ = s; } - Status NewWritableFile(const std::string& fname, - std::unique_ptr* result, - const EnvOptions& options) override { + IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts, + std::unique_ptr* result, + IODebugContext* dbg) override { if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") { if (!status_.ok()) { return status_; } } - return target()->NewWritableFile(fname, result, options); + return target()->NewWritableFile(fname, opts, result, dbg); } private: - Status status_; + IOStatus status_; }; TableFileCreationListener() { @@ -813,8 +816,10 @@ class TableFileCreationListener : public EventListener { TEST_F(EventListenerTest, TableFileCreationListenersTest) { auto listener = std::make_shared(); Options options; - std::unique_ptr test_env( - new TableFileCreationListener::TestEnv(CurrentOptions().env)); + std::shared_ptr test_fs = + std::make_shared( + CurrentOptions().env->GetFileSystem()); + std::unique_ptr test_env = NewCompositeEnv(test_fs); options.create_if_missing = true; options.listeners.push_back(listener); options.env = test_env.get(); @@ -827,11 +832,11 @@ TEST_F(EventListenerTest, TableFileCreationListenersTest) { listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0); ASSERT_OK(Put("foo", "aaa1")); ASSERT_OK(Put("bar", "bbb1")); - test_env->SetStatus(Status::NotSupported("not supported")); + test_fs->SetStatus(IOStatus::NotSupported("not supported")); ASSERT_NOK(Flush()); listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0); ASSERT_TRUE(listener->last_failure_.IsNotSupported()); - test_env->SetStatus(Status::OK()); + test_fs->SetStatus(IOStatus::OK()); Reopen(options); ASSERT_OK(Put("foo", "aaa2")); @@ -850,7 +855,7 @@ TEST_F(EventListenerTest, TableFileCreationListenersTest) { ASSERT_OK(Put("foo", "aaa3")); 
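The listener test's fault injection now goes through the FileSystem layer instead of an EnvWrapper. As a standalone sketch of that pattern (class and variable names here are illustrative, not part of the test): wrap the base FileSystem, fail SST file creation on demand, and hand the wrapper to the DB via NewCompositeEnv.

#include <memory>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/file_system.h"

namespace ROCKSDB_NAMESPACE {

class FailSstCreationFS : public FileSystemWrapper {
 public:
  explicit FailSstCreationFS(const std::shared_ptr<FileSystem>& base)
      : FileSystemWrapper(base) {}
  const char* Name() const override { return "FailSstCreationFS"; }

  void SetStatus(IOStatus s) { status_ = s; }

  IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
                           std::unique_ptr<FSWritableFile>* result,
                           IODebugContext* dbg) override {
    // Only intercept table files; everything else passes through.
    if (!status_.ok() && fname.size() > 4 &&
        fname.substr(fname.size() - 4) == ".sst") {
      return status_;
    }
    return target()->NewWritableFile(fname, opts, result, dbg);
  }

 private:
  IOStatus status_ = IOStatus::OK();
};

}  // namespace ROCKSDB_NAMESPACE

// Usage sketch:
//   auto fs = std::make_shared<FailSstCreationFS>(Env::Default()->GetFileSystem());
//   std::unique_ptr<Env> env = NewCompositeEnv(fs);
//   options.env = env.get();
//   fs->SetStatus(IOStatus::NotSupported("injected"));  // next flush fails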
ASSERT_OK(Put("bar", "bbb3")); ASSERT_OK(Flush()); - test_env->SetStatus(Status::NotSupported("not supported")); + test_fs->SetStatus(IOStatus::NotSupported("not supported")); ASSERT_NOK( dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd)); ASSERT_NOK(dbfull()->TEST_WaitForCompact()); @@ -858,7 +863,7 @@ TEST_F(EventListenerTest, TableFileCreationListenersTest) { ASSERT_TRUE(listener->last_failure_.IsNotSupported()); // Reset - test_env->SetStatus(Status::OK()); + test_fs->SetStatus(IOStatus::OK()); DestroyAndReopen(options); // Verify that an empty table file that is immediately deleted gives Aborted @@ -1586,7 +1591,6 @@ TEST_F(EventListenerTest, BlobDBFileTest) { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/db/log_format.h b/db/log_format.h index d397372f4914..a976b3f9eab3 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -35,17 +35,21 @@ enum RecordType { // Compression Type kSetCompressionType = 9, + + // User-defined timestamp sizes + kUserDefinedTimestampSizeType = 10, + kRecyclableUserDefinedTimestampSizeType = 11, }; -static const int kMaxRecordType = kSetCompressionType; +constexpr int kMaxRecordType = kRecyclableUserDefinedTimestampSizeType; -static const unsigned int kBlockSize = 32768; +constexpr unsigned int kBlockSize = 32768; // Header is checksum (4 bytes), length (2 bytes), type (1 byte) -static const int kHeaderSize = 4 + 2 + 1; +constexpr int kHeaderSize = 4 + 2 + 1; // Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte), // log number (4 bytes). -static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4; +constexpr int kRecyclableHeaderSize = 4 + 2 + 1 + 4; } // namespace log } // namespace ROCKSDB_NAMESPACE diff --git a/db/log_reader.cc b/db/log_reader.cc index 575a7d758910..4e470616f054 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -164,6 +164,54 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } break; + case kSetCompressionType: { + if (compression_type_record_read_) { + ReportCorruption(fragment.size(), + "read multiple SetCompressionType records"); + } + if (first_record_read_) { + ReportCorruption(fragment.size(), + "SetCompressionType not the first record"); + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + last_record_offset_ = prospective_record_offset; + CompressionTypeRecord compression_record(kNoCompression); + Status s = compression_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption(fragment.size(), + "could not decode SetCompressionType record"); + } else { + InitCompression(compression_record); + } + break; + } + case kUserDefinedTimestampSizeType: + case kRecyclableUserDefinedTimestampSizeType: { + if (in_fragmented_record && !scratch->empty()) { + ReportCorruption( + scratch->size(), + "user-defined timestamp size record interspersed partial record"); + } + prospective_record_offset = physical_record_offset; + scratch->clear(); + last_record_offset_ = prospective_record_offset; + UserDefinedTimestampSizeRecord ts_record; + Status s = ts_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption( + fragment.size(), + "could not decode user-defined timestamp size record"); + } else { + s = UpdateRecordedTimestampSize( + ts_record.GetUserDefinedTimestampSize()); + if (!s.ok()) { + ReportCorruption(fragment.size(), s.getState()); + } + } + break; + } + case kBadHeader: if (wal_recovery_mode == 
WALRecoveryMode::kAbsoluteConsistency || wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) { @@ -257,29 +305,6 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } break; - case kSetCompressionType: { - if (compression_type_record_read_) { - ReportCorruption(fragment.size(), - "read multiple SetCompressionType records"); - } - if (first_record_read_) { - ReportCorruption(fragment.size(), - "SetCompressionType not the first record"); - } - prospective_record_offset = physical_record_offset; - scratch->clear(); - last_record_offset_ = prospective_record_offset; - CompressionTypeRecord compression_record(kNoCompression); - Status s = compression_record.DecodeFrom(&fragment); - if (!s.ok()) { - ReportCorruption(fragment.size(), - "could not decode SetCompressionType record"); - } else { - InitCompression(compression_record); - } - break; - } - default: { char buf[40]; snprintf(buf, sizeof(buf), "unknown record type %u", record_type); @@ -444,11 +469,14 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size, const unsigned int type = header[6]; const uint32_t length = a | (b << 8); int header_size = kHeaderSize; - if (type >= kRecyclableFullType && type <= kRecyclableLastType) { + const bool is_recyclable_type = + ((type >= kRecyclableFullType && type <= kRecyclableLastType) || + type == kRecyclableUserDefinedTimestampSizeType); + if (is_recyclable_type) { + header_size = kRecyclableHeaderSize; if (end_of_buffer_offset_ - buffer_.size() == 0) { recycled_ = true; } - header_size = kRecyclableHeaderSize; // We need enough for the larger header if (buffer_.size() < static_cast(kRecyclableHeaderSize)) { int r = kEof; @@ -457,11 +485,8 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size, } continue; } - const uint32_t log_num = DecodeFixed32(header + 7); - if (log_num != log_number_) { - return kOldRecord; - } } + if (header_size + length > buffer_.size()) { assert(buffer_.size() >= static_cast(header_size)); *drop_size = buffer_.size(); @@ -473,6 +498,14 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size, return kBadRecordLen; } + if (is_recyclable_type) { + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + buffer_.remove_prefix(header_size + length); + return kOldRecord; + } + } + if (type == kZeroType && length == 0) { // Skip zero length record without reporting any drops since // such records are produced by the mmap based writing code in @@ -500,7 +533,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size, buffer_.remove_prefix(header_size + length); - if (!uncompress_ || type == kSetCompressionType) { + if (!uncompress_ || type == kSetCompressionType || + type == kUserDefinedTimestampSizeType || + type == kRecyclableUserDefinedTimestampSizeType) { *result = Slice(header + header_size, length); return type; } else { @@ -567,6 +602,26 @@ void Reader::InitCompression(const CompressionTypeRecord& compression_record) { assert(uncompressed_buffer_); } +Status Reader::UpdateRecordedTimestampSize( + const std::vector>& cf_to_ts_sz) { + for (const auto& [cf, ts_sz] : cf_to_ts_sz) { + // Zero user-defined timestamp size are not recorded. + if (ts_sz == 0) { + return Status::Corruption( + "User-defined timestamp size record contains zero timestamp size."); + } + // The user-defined timestamp size record for a column family should not be + // updated in the same log file. 
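On the read side, the accumulated timestamp sizes are exposed through the new Reader::GetRecordedTimestampSize() accessor. A minimal sketch, assuming an already-constructed log::Reader (its constructor and the surrounding file plumbing are internal and omitted):

#include <string>

#include "db/log_reader.h"
#include "util/hash_containers.h"

namespace ROCKSDB_NAMESPACE {

void ScanLogAndTimestampSizes(log::Reader& reader) {
  Slice record;
  std::string scratch;
  while (reader.ReadRecord(&record, &scratch)) {
    // After each returned record, the map reflects every user-defined
    // timestamp size record seen so far in this log file, keyed by column
    // family ID.
    const UnorderedMap<uint32_t, size_t>& cf_to_ts_sz =
        reader.GetRecordedTimestampSize();
    (void)cf_to_ts_sz;
    // ... decode `record`, looking up the timestamp size for its column
    // family in cf_to_ts_sz ...
  }
}

}  // namespace ROCKSDB_NAMESPACE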
+ if (recorded_cf_to_ts_sz_.count(cf) != 0) { + return Status::Corruption( + "User-defined timestamp size record contains update to " + "recorded column family."); + } + recorded_cf_to_ts_sz_.insert(std::make_pair(cf, ts_sz)); + } + return Status::OK(); +} + bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch, WALRecoveryMode /*unused*/, uint64_t* /* checksum */) { @@ -635,30 +690,6 @@ bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch, } break; - case kBadHeader: - case kBadRecord: - case kEof: - case kOldRecord: - if (in_fragmented_record_) { - ReportCorruption(fragments_.size(), "error in middle of record"); - in_fragmented_record_ = false; - fragments_.clear(); - } - break; - - case kBadRecordChecksum: - if (recycled_) { - fragments_.clear(); - return false; - } - ReportCorruption(drop_size, "checksum mismatch"); - if (in_fragmented_record_) { - ReportCorruption(fragments_.size(), "error in middle of record"); - in_fragmented_record_ = false; - fragments_.clear(); - } - break; - case kSetCompressionType: { if (compression_type_record_read_) { ReportCorruption(fragment.size(), @@ -683,6 +714,57 @@ bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch, break; } + case kUserDefinedTimestampSizeType: + case kRecyclableUserDefinedTimestampSizeType: { + if (in_fragmented_record_ && !scratch->empty()) { + ReportCorruption( + scratch->size(), + "user-defined timestamp size record interspersed partial record"); + } + fragments_.clear(); + prospective_record_offset = physical_record_offset; + last_record_offset_ = prospective_record_offset; + in_fragmented_record_ = false; + UserDefinedTimestampSizeRecord ts_record; + Status s = ts_record.DecodeFrom(&fragment); + if (!s.ok()) { + ReportCorruption( + fragment.size(), + "could not decode user-defined timestamp size record"); + } else { + s = UpdateRecordedTimestampSize( + ts_record.GetUserDefinedTimestampSize()); + if (!s.ok()) { + ReportCorruption(fragment.size(), s.getState()); + } + } + break; + } + + case kBadHeader: + case kBadRecord: + case kEof: + case kOldRecord: + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + + case kBadRecordChecksum: + if (recycled_) { + fragments_.clear(); + return false; + } + ReportCorruption(drop_size, "checksum mismatch"); + if (in_fragmented_record_) { + ReportCorruption(fragments_.size(), "error in middle of record"); + in_fragmented_record_ = false; + fragments_.clear(); + } + break; + default: { char buf[40]; snprintf(buf, sizeof(buf), "unknown record type %u", @@ -770,7 +852,8 @@ bool FragmentBufferedReader::TryReadFragment( const unsigned int type = header[6]; const uint32_t length = a | (b << 8); int header_size = kHeaderSize; - if (type >= kRecyclableFullType && type <= kRecyclableLastType) { + if ((type >= kRecyclableFullType && type <= kRecyclableLastType) || + type == kRecyclableUserDefinedTimestampSizeType) { if (end_of_buffer_offset_ - buffer_.size() == 0) { recycled_ = true; } @@ -822,7 +905,9 @@ bool FragmentBufferedReader::TryReadFragment( buffer_.remove_prefix(header_size + length); - if (!uncompress_ || type == kSetCompressionType) { + if (!uncompress_ || type == kSetCompressionType || + type == kUserDefinedTimestampSizeType || + type == kRecyclableUserDefinedTimestampSizeType) { *fragment = Slice(header + header_size, length); *fragment_type_or_err = type; return true; diff --git a/db/log_reader.h 
b/db/log_reader.h index e3be1570e37b..697d1b5d58c0 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -11,6 +11,8 @@ #include #include +#include +#include #include "db/log_format.h" #include "file/sequence_file_reader.h" @@ -18,6 +20,8 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "util/compression.h" +#include "util/hash_containers.h" +#include "util/udt_util.h" #include "util/xxhash.h" namespace ROCKSDB_NAMESPACE { @@ -74,6 +78,12 @@ class Reader { WALRecoveryMode::kTolerateCorruptedTailRecords, uint64_t* record_checksum = nullptr); + // Return the recorded user-defined timestamp size that have been read so + // far. This only applies to WAL logs. + const UnorderedMap& GetRecordedTimestampSize() const { + return recorded_cf_to_ts_sz_; + } + // Returns the physical offset of the last record returned by ReadRecord. // // Undefined before the first call to ReadRecord. @@ -154,6 +164,10 @@ class Reader { // Used for stream hashing uncompressed buffer in ReadPhysicalRecord() XXH3_state_t* uncompress_hash_state_; + // The recorded user-defined timestamp sizes that have been read so far. This + // is only for WAL logs. + UnorderedMap recorded_cf_to_ts_sz_; + // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, @@ -190,6 +204,9 @@ class Reader { void ReportDrop(size_t bytes, const Status& reason); void InitCompression(const CompressionTypeRecord& compression_record); + + Status UpdateRecordedTimestampSize( + const std::vector>& cf_to_ts_sz); }; class FragmentBufferedReader : public Reader { diff --git a/db/log_test.cc b/db/log_test.cc index f4d388f41b05..fa5e2aa0fcd5 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -45,9 +45,10 @@ static std::string RandomSkewedString(int i, Random* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } -// Param type is tuple +// Param type is tuple // get<0>(tuple): non-zero if recycling log, zero if regular log // get<1>(tuple): true if allow retry after read EOF, false otherwise +// get<2>(tuple): type of compression used class LogTest : public ::testing::TestWithParam> { private: @@ -181,20 +182,28 @@ class LogTest Slice* get_reader_contents() { return &reader_contents_; } - void Write(const std::string& msg) { + void Write(const std::string& msg, + const UnorderedMap* cf_to_ts_sz = nullptr) { + if (cf_to_ts_sz != nullptr && !cf_to_ts_sz->empty()) { + ASSERT_OK(writer_->MaybeAddUserDefinedTimestampSizeRecord(*cf_to_ts_sz)); + } ASSERT_OK(writer_->AddRecord(Slice(msg))); } size_t WrittenBytes() const { return dest_contents().size(); } std::string Read(const WALRecoveryMode wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords) { + WALRecoveryMode::kTolerateCorruptedTailRecords, + UnorderedMap* cf_to_ts_sz = nullptr) { std::string scratch; Slice record; bool ret = false; uint64_t record_checksum; ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode, &record_checksum); + if (cf_to_ts_sz != nullptr) { + *cf_to_ts_sz = reader_->GetRecordedTimestampSize(); + } if (ret) { if (!allow_retry_read_) { // allow_retry_read_ means using FragmentBufferedReader which does not @@ -257,6 +266,16 @@ class LogTest return "OK"; } } + + void CheckRecordAndTimestampSize( + std::string record, UnorderedMap& expected_ts_sz) { + UnorderedMap recorded_ts_sz; + ASSERT_EQ(record, + Read(WALRecoveryMode:: + kTolerateCorruptedTailRecords /* wal_recovery_mode */, + &recorded_ts_sz)); + EXPECT_EQ(expected_ts_sz, recorded_ts_sz); + } }; TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } @@ 
-274,6 +293,42 @@ TEST_P(LogTest, ReadWrite) { ASSERT_EQ("EOF", Read()); // Make sure reads at eof work } +TEST_P(LogTest, ReadWriteWithTimestampSize) { + UnorderedMap ts_sz_one = { + {1, sizeof(uint64_t)}, + }; + Write("foo", &ts_sz_one); + Write("bar"); + UnorderedMap ts_sz_two = {{2, sizeof(char)}}; + Write("", &ts_sz_two); + Write("xxxx"); + + CheckRecordAndTimestampSize("foo", ts_sz_one); + CheckRecordAndTimestampSize("bar", ts_sz_one); + UnorderedMap expected_ts_sz_two; + // User-defined timestamp size records are accumulated and applied to + // subsequent records. + expected_ts_sz_two.insert(ts_sz_one.begin(), ts_sz_one.end()); + expected_ts_sz_two.insert(ts_sz_two.begin(), ts_sz_two.end()); + CheckRecordAndTimestampSize("", expected_ts_sz_two); + CheckRecordAndTimestampSize("xxxx", expected_ts_sz_two); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + +TEST_P(LogTest, ReadWriteWithTimestampSizeZeroTimestampIgnored) { + UnorderedMap ts_sz_one = {{1, sizeof(uint64_t)}}; + Write("foo", &ts_sz_one); + UnorderedMap ts_sz_two(ts_sz_one.begin(), ts_sz_one.end()); + ts_sz_two.insert(std::make_pair(2, 0)); + Write("bar", &ts_sz_two); + + CheckRecordAndTimestampSize("foo", ts_sz_one); + CheckRecordAndTimestampSize("bar", ts_sz_one); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + TEST_P(LogTest, ManyBlocks) { for (int i = 0; i < 100000; i++) { Write(NumberString(i)); @@ -685,6 +740,39 @@ TEST_P(LogTest, Recycle) { ASSERT_EQ("EOF", Read()); } +TEST_P(LogTest, RecycleWithTimestampSize) { + bool recyclable_log = (std::get<0>(GetParam()) != 0); + if (!recyclable_log) { + return; // test is only valid for recycled logs + } + UnorderedMap ts_sz_one = { + {1, sizeof(uint32_t)}, + }; + Write("foo", &ts_sz_one); + Write("bar"); + Write("baz"); + Write("bif"); + Write("blitz"); + while (get_reader_contents()->size() < log::kBlockSize * 2) { + Write("xxxxxxxxxxxxxxxx"); + } + std::unique_ptr sink( + new test::OverwritingStringSink(get_reader_contents())); + std::unique_ptr dest_holder(new WritableFileWriter( + std::move(sink), "" /* don't care */, FileOptions())); + Writer recycle_writer(std::move(dest_holder), 123, true); + UnorderedMap ts_sz_two = { + {2, sizeof(uint64_t)}, + }; + ASSERT_OK(recycle_writer.MaybeAddUserDefinedTimestampSizeRecord(ts_sz_two)); + ASSERT_OK(recycle_writer.AddRecord(Slice("foooo"))); + ASSERT_OK(recycle_writer.AddRecord(Slice("bar"))); + ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2); + CheckRecordAndTimestampSize("foooo", ts_sz_two); + CheckRecordAndTimestampSize("bar", ts_sz_two); + ASSERT_EQ("EOF", Read()); +} + // Do NOT enable compression for this instantiation. 
INSTANTIATE_TEST_CASE_P( Log, LogTest, @@ -940,6 +1028,35 @@ TEST_P(CompressionLogTest, ReadWrite) { ASSERT_EQ("EOF", Read()); // Make sure reads at eof work } +TEST_P(CompressionLogTest, ReadWriteWithTimestampSize) { + CompressionType compression_type = std::get<2>(GetParam()); + if (!StreamingCompressionTypeSupported(compression_type)) { + ROCKSDB_GTEST_SKIP("Test requires support for compression type"); + return; + } + ASSERT_OK(SetupTestEnv()); + UnorderedMap ts_sz_one = { + {1, sizeof(uint64_t)}, + }; + Write("foo", &ts_sz_one); + Write("bar"); + UnorderedMap ts_sz_two = {{2, sizeof(char)}}; + Write("", &ts_sz_two); + Write("xxxx"); + + CheckRecordAndTimestampSize("foo", ts_sz_one); + CheckRecordAndTimestampSize("bar", ts_sz_one); + UnorderedMap expected_ts_sz_two; + // User-defined timestamp size records are accumulated and applied to + // subsequent records. + expected_ts_sz_two.insert(ts_sz_one.begin(), ts_sz_one.end()); + expected_ts_sz_two.insert(ts_sz_two.begin(), ts_sz_two.end()); + CheckRecordAndTimestampSize("", expected_ts_sz_two); + CheckRecordAndTimestampSize("xxxx", expected_ts_sz_two); + ASSERT_EQ("EOF", Read()); + ASSERT_EQ("EOF", Read()); // Make sure reads at eof work +} + TEST_P(CompressionLogTest, ManyBlocks) { CompressionType compression_type = std::get<2>(GetParam()); if (!StreamingCompressionTypeSupported(compression_type)) { diff --git a/db/log_writer.cc b/db/log_writer.cc index 56f58543e9e4..86e0286ccd57 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -16,6 +16,7 @@ #include "rocksdb/io_status.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/udt_util.h" namespace ROCKSDB_NAMESPACE { namespace log { @@ -73,7 +74,6 @@ IOStatus Writer::AddRecord(const Slice& slice, // Fragment the record if necessary and emit it. Note that if slice // is empty, we still want to iterate once to emit a single // zero-length record - IOStatus s; bool begin = true; int compress_remaining = 0; bool compress_start = false; @@ -81,6 +81,8 @@ IOStatus Writer::AddRecord(const Slice& slice, compress_->Reset(); compress_start = true; } + + IOStatus s; do { const int64_t leftover = kBlockSize - block_offset_; assert(leftover >= 0); @@ -194,6 +196,33 @@ IOStatus Writer::AddCompressionTypeRecord() { return s; } +IOStatus Writer::MaybeAddUserDefinedTimestampSizeRecord( + const UnorderedMap& cf_to_ts_sz, + Env::IOPriority rate_limiter_priority) { + std::vector> ts_sz_to_record; + for (const auto& [cf_id, ts_sz] : cf_to_ts_sz) { + if (recorded_cf_to_ts_sz_.count(cf_id) != 0) { + // A column family's user-defined timestamp size should not be + // updated while DB is running. + assert(recorded_cf_to_ts_sz_[cf_id] == ts_sz); + } else if (ts_sz != 0) { + ts_sz_to_record.emplace_back(cf_id, ts_sz); + recorded_cf_to_ts_sz_.insert(std::make_pair(cf_id, ts_sz)); + } + } + if (ts_sz_to_record.empty()) { + return IOStatus::OK(); + } + + UserDefinedTimestampSizeRecord record(std::move(ts_sz_to_record)); + std::string encoded; + record.EncodeTo(&encoded); + RecordType type = recycle_log_files_ ? 
kRecyclableUserDefinedTimestampSizeType + : kUserDefinedTimestampSizeType; + return EmitPhysicalRecord(type, encoded.data(), encoded.size(), + rate_limiter_priority); +} + bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); } IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, @@ -209,7 +238,8 @@ IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n, buf[6] = static_cast(t); uint32_t crc = type_crc_[t]; - if (t < kRecyclableFullType || t == kSetCompressionType) { + if (t < kRecyclableFullType || t == kSetCompressionType || + t == kUserDefinedTimestampSizeType) { // Legacy record format assert(block_offset_ + kHeaderSize + n <= kBlockSize); header_size = kHeaderSize; diff --git a/db/log_writer.h b/db/log_writer.h index 5d266e434314..7a64a8560158 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -10,6 +10,8 @@ #include #include +#include +#include #include "db/log_format.h" #include "rocksdb/compression_type.h" @@ -18,6 +20,7 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "util/compression.h" +#include "util/hash_containers.h" namespace ROCKSDB_NAMESPACE { @@ -87,6 +90,15 @@ class Writer { Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); IOStatus AddCompressionTypeRecord(); + // If there are column families in `cf_to_ts_sz` not included in + // `recorded_cf_to_ts_sz_` and its user-defined timestamp size is non-zero, + // adds a record of type kUserDefinedTimestampSizeType or + // kRecyclableUserDefinedTimestampSizeType for these column families. + // This timestamp size record applies to all subsequent records. + IOStatus MaybeAddUserDefinedTimestampSizeRecord( + const UnorderedMap& cf_to_ts_sz, + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL); + WritableFileWriter* file() { return dest_.get(); } const WritableFileWriter* file() const { return dest_.get(); } @@ -122,6 +134,11 @@ class Writer { StreamingCompress* compress_; // Reusable compressed output buffer std::unique_ptr compressed_buffer_; + + // The recorded user-defined timestamp size that have been written so far. + // Since the user-defined timestamp size cannot be changed while the DB is + // running, existing entry in this map cannot be updated. 
+ UnorderedMap recorded_cf_to_ts_sz_; }; } // namespace log diff --git a/db/malloc_stats.cc b/db/malloc_stats.cc index 52f2e6e0f280..641e01f9a39d 100644 --- a/db/malloc_stats.cc +++ b/db/malloc_stats.cc @@ -9,7 +9,6 @@ #include "db/malloc_stats.h" -#ifndef ROCKSDB_LITE #include #include @@ -52,4 +51,3 @@ void DumpMallocStats(std::string* stats) { void DumpMallocStats(std::string*) {} #endif // ROCKSDB_JEMALLOC } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/db/malloc_stats.h b/db/malloc_stats.h index 18aff3ad0f84..1cca8a9522c3 100644 --- a/db/malloc_stats.h +++ b/db/malloc_stats.h @@ -9,7 +9,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -21,4 +20,3 @@ void DumpMallocStats(std::string*); } -#endif // !ROCKSDB_LITE diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index b92cb794b994..95b099a66dc0 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -124,6 +124,7 @@ TEST_F(ManualCompactionTest, CompactTouchesAllKeys) { ASSERT_EQ("key3", itr->key().ToString()); itr->Next(); ASSERT_TRUE(!itr->Valid()); + ASSERT_OK(itr->status()); delete itr; delete options.compaction_filter; @@ -179,6 +180,7 @@ TEST_F(ManualCompactionTest, Test) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { num_keys++; } + ASSERT_OK(iter->status()); delete iter; ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys"; @@ -190,6 +192,7 @@ TEST_F(ManualCompactionTest, Test) { TEST_F(ManualCompactionTest, SkipLevel) { DB* db; Options options; + options.level_compaction_dynamic_level_bytes = false; options.num_levels = 3; // Initially, flushed L0 files won't exceed 100. options.level0_file_num_compaction_trigger = 100; @@ -286,9 +289,9 @@ TEST_F(ManualCompactionTest, SkipLevel) { filter->Reset(); ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr)); ASSERT_EQ(4, filter->NumKeys()); - // 1 is first compacted to L1 and then further compacted into [2, 4, 8], - // so finally the logged level for 1 is L1. - ASSERT_EQ(1, filter->KeyLevel("1")); + // 1 is first compacted from L0 to L1, and then L1 intra level compaction + // compacts [2, 4, 8] only. 
+ ASSERT_EQ(0, filter->KeyLevel("1")); ASSERT_EQ(1, filter->KeyLevel("2")); ASSERT_EQ(1, filter->KeyLevel("4")); ASSERT_EQ(1, filter->KeyLevel("8")); diff --git a/db/memtable.cc b/db/memtable.cc index f8870e0dd908..4b2360d40504 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -26,7 +26,7 @@ #include "memory/arena.h" #include "memory/memory_usage.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/lang.h" #include "port/port.h" #include "rocksdb/comparator.h" @@ -95,6 +95,7 @@ MemTable::MemTable(const InternalKeyComparator& cmp, data_size_(0), num_entries_(0), num_deletes_(0), + num_range_deletes_(0), write_buffer_size_(mutable_cf_options.write_buffer_size), flush_in_progress_(false), flush_completed_(false), @@ -115,7 +116,9 @@ MemTable::MemTable(const InternalKeyComparator& cmp, oldest_key_time_(std::numeric_limits::max()), atomic_flush_seqno_(kMaxSequenceNumber), approximate_memory_usage_(0), - disable_auto_flush_(mutable_cf_options.disable_auto_flush) { + disable_auto_flush_(mutable_cf_options.disable_auto_flush), + memtable_max_range_deletions_( + mutable_cf_options.memtable_max_range_deletions) { UpdateFlushState(); // something went wrong if we need to flush before inserting anything assert(!ShouldScheduleFlush()); @@ -144,6 +147,10 @@ MemTable::MemTable(const InternalKeyComparator& cmp, new_cache.get()), std::memory_order_relaxed); } + const Comparator* ucmp = cmp.user_comparator(); + assert(ucmp); + ts_sz_ = ucmp->timestamp_size(); + persist_user_defined_timestamps_ = ioptions.persist_user_defined_timestamps; } MemTable::~MemTable() { @@ -171,6 +178,14 @@ size_t MemTable::ApproximateMemoryUsage() { } bool MemTable::ShouldFlushNow() { + // This is set if memtable_max_range_deletions is > 0, + // and that many range deletions are done + if (memtable_max_range_deletions_ > 0 && + num_range_deletes_.load(std::memory_order_relaxed) >= + static_cast(memtable_max_range_deletions_)) { + return true; + } + size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed); // In a lot of times, we cannot allocate arena blocks that exactly matches the // buffer size. 
Thus we have to decide if we should over-allocate or @@ -273,7 +288,7 @@ void MemTable::UpdateOldestKeyTime() { } Status MemTable::VerifyEntryChecksum(const char* entry, - size_t protection_bytes_per_key, + uint32_t protection_bytes_per_key, bool allow_data_in_errors) { if (protection_bytes_per_key == 0) { return Status::OK(); @@ -302,28 +317,11 @@ Status MemTable::VerifyEntryChecksum(const char* entry, Slice value = Slice(value_ptr, value_length); const char* checksum_ptr = value_ptr + value_length; - uint64_t expected = ProtectionInfo64() - .ProtectKVO(user_key, value, type) - .ProtectS(seq) - .GetVal(); - bool match = true; - switch (protection_bytes_per_key) { - case 1: - match = static_cast(checksum_ptr[0]) == - static_cast(expected); - break; - case 2: - match = DecodeFixed16(checksum_ptr) == static_cast(expected); - break; - case 4: - match = DecodeFixed32(checksum_ptr) == static_cast(expected); - break; - case 8: - match = DecodeFixed64(checksum_ptr) == expected; - break; - default: - assert(false); - } + bool match = + ProtectionInfo64() + .ProtectKVO(user_key, value, type) + .ProtectS(seq) + .Verify(static_cast(protection_bytes_per_key), checksum_ptr); if (!match) { std::string msg( "Corrupted memtable entry, per key-value checksum verification " @@ -355,11 +353,7 @@ int MemTable::KeyComparator::operator()( } void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) { -#ifndef ROCKSDB_LITE throw std::runtime_error("concurrent insert not supported"); -#else - abort(); -#endif } Slice MemTableRep::UserKey(const char* key) const { @@ -395,7 +389,8 @@ class MemTableIterator : public InternalIterator { !mem.GetImmutableMemTableOptions()->inplace_update_support), protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key), status_(Status::OK()), - logger_(mem.moptions_.info_log) { + logger_(mem.moptions_.info_log), + ts_sz_(mem.ts_sz_) { if (use_range_del_table) { iter_ = mem.range_del_table_->GetIterator(arena); } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek && @@ -438,8 +433,7 @@ class MemTableIterator : public InternalIterator { PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { // iterator should only use prefix bloom filter - auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); - Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { if (!bloom_->MayContain( prefix_extractor_->Transform(user_k_without_ts))) { @@ -459,8 +453,7 @@ class MemTableIterator : public InternalIterator { PERF_TIMER_GUARD(seek_on_memtable_time); PERF_COUNTER_ADD(seek_on_memtable_count, 1); if (bloom_) { - auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size(); - Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz)); + Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz_)); if (prefix_extractor_->InDomain(user_k_without_ts)) { if (!bloom_->MayContain( prefix_extractor_->Transform(user_k_without_ts))) { @@ -547,9 +540,10 @@ class MemTableIterator : public InternalIterator { bool valid_; bool arena_mode_; bool value_pinned_; - size_t protection_bytes_per_key_; + uint32_t protection_bytes_per_key_; Status status_; Logger* logger_; + size_t ts_sz_; void VerifyEntryChecksum() { if (protection_bytes_per_key_ > 0 && Valid()) { @@ -620,6 +614,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { assert(!IsFragmentedRangeTombstonesConstructed(false)); // There should be no 
concurrent Construction if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { + // TODO: plumb Env::IOActivity auto* unfragmented_iter = new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, true /* use_range_del_table */); @@ -662,8 +657,7 @@ Status MemTable::VerifyEncodedEntry(Slice encoded, if (!GetVarint32(&encoded, &ikey_len)) { return Status::Corruption("Unable to parse internal key length"); } - size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); - if (ikey_len < 8 + ts_sz) { + if (ikey_len < 8 + ts_sz_) { return Status::Corruption("Internal key length too short"); } if (ikey_len > encoded.size()) { @@ -704,28 +698,15 @@ void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, return; } - uint64_t checksum = 0; if (kv_prot_info == nullptr) { - checksum = - ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal(); + ProtectionInfo64() + .ProtectKVO(key, value, type) + .ProtectS(s) + .Encode(static_cast(moptions_.protection_bytes_per_key), + checksum_ptr); } else { - checksum = kv_prot_info->GetVal(); - } - switch (moptions_.protection_bytes_per_key) { - case 1: - checksum_ptr[0] = static_cast(checksum); - break; - case 2: - EncodeFixed16(checksum_ptr, static_cast(checksum)); - break; - case 4: - EncodeFixed32(checksum_ptr, static_cast(checksum)); - break; - case 8: - EncodeFixed64(checksum_ptr, checksum); - break; - default: - assert(false); + kv_prot_info->Encode( + static_cast(moptions_.protection_bytes_per_key), checksum_ptr); } } @@ -775,8 +756,7 @@ Status MemTable::Add(SequenceNumber s, ValueType type, } } - size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); - Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz); + Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz_); if (!allow_concurrent) { // Extract prefix for insert with hint. @@ -804,6 +784,9 @@ Status MemTable::Add(SequenceNumber s, ValueType type, type == kTypeDeletionWithTimestamp) { num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1, std::memory_order_relaxed); + } else if (type == kTypeRangeDeletion) { + uint64_t val = num_range_deletes_.load(std::memory_order_relaxed) + 1; + num_range_deletes_.store(val, std::memory_order_relaxed); } if (bloom_filter_ && prefix_extractor_ && @@ -826,6 +809,9 @@ Status MemTable::Add(SequenceNumber s, ValueType type, assert(first_seqno_.load() >= earliest_seqno_.load()); } assert(post_process_info == nullptr); + // TODO(yuzhangyu): support updating newest UDT for when `allow_concurrent` + // is true. 
+ MaybeUpdateNewestUDT(key_slice); UpdateFlushState(); } else { bool res = (hint == nullptr) @@ -860,13 +846,14 @@ Status MemTable::Add(SequenceNumber s, ValueType type, earliest_seqno_.load(std::memory_order_relaxed); while ( (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) && - !first_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { + !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) { } } if (type == kTypeRangeDeletion) { auto new_cache = std::make_shared(); size_t size = cached_range_tombstone_.Size(); if (allow_concurrent) { + post_process_info->num_range_deletes++; range_del_mutex_.lock(); } for (size_t i = 0; i < size; ++i) { @@ -885,6 +872,7 @@ Status MemTable::Add(SequenceNumber s, ValueType type, new_local_cache_ref, new_cache.get()), std::memory_order_relaxed); } + if (allow_concurrent) { range_del_mutex_.unlock(); } @@ -922,7 +910,7 @@ struct Saver { ReadCallback* callback_; bool* is_blob_index; bool allow_data_in_errors; - size_t protection_bytes_per_key; + uint32_t protection_bytes_per_key; bool CheckCallback(SequenceNumber _seq) { if (callback_) { return callback_->IsVisible(_seq); @@ -1092,25 +1080,15 @@ static bool SaveValue(void* arg, const char* entry) { assert(s->do_merge); if (s->value || s->columns) { - std::string result; // `op_failure_scope` (an output parameter) is not provided (set to // nullptr) since a failure must be propagated regardless of its // value. *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &v, - merge_context->GetOperands(), &result, s->logger, s->statistics, - s->clock, /* result_operand */ nullptr, - /* update_num_ops_stats */ true, + merge_operator, s->key->user_key(), + MergeHelper::kPlainBaseValue, v, merge_context->GetOperands(), + s->logger, s->statistics, s->clock, + /* update_num_ops_stats */ true, s->value, s->columns, /* op_failure_scope */ nullptr); - - if (s->status->ok()) { - if (s->value) { - *(s->value) = std::move(result); - } else { - assert(s->columns); - s->columns->SetPlainValue(result); - } - } } } else if (s->value) { s->value->assign(v.data(), v.size()); @@ -1155,35 +1133,15 @@ static bool SaveValue(void* arg, const char* entry) { } else if (*(s->merge_in_progress)) { assert(s->do_merge); - if (s->value) { - Slice value_of_default; - *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn( - v, value_of_default); - if (s->status->ok()) { - // `op_failure_scope` (an output parameter) is not provided (set - // to nullptr) since a failure must be propagated regardless of - // its value. - *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), &value_of_default, - merge_context->GetOperands(), s->value, s->logger, - s->statistics, s->clock, /* result_operand */ nullptr, - /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); - } - } else if (s->columns) { - std::string result; - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its - // value. - *(s->status) = MergeHelper::TimedFullMergeWithEntity( - merge_operator, s->key->user_key(), v, - merge_context->GetOperands(), &result, s->logger, s->statistics, - s->clock, /* update_num_ops_stats */ true, + if (s->value || s->columns) { + // `op_failure_scope` (an output parameter) is not provided (set + // to nullptr) since a failure must be propagated regardless of + // its value. 
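The new num_range_deletes_ counter feeds the flush trigger added to ShouldFlushNow() above. A hedged configuration sketch, assuming the knob is surfaced as the mutable column family option memtable_max_range_deletions that mutable_cf_options supplies to the memtable:

Options MakeOptionsWithRangeDeletionFlushTrigger() {
  Options options;
  // Flush a memtable once it holds this many DeleteRange() tombstones;
  // 0 (the default) disables the trigger and keeps the old behavior.
  options.memtable_max_range_deletions = 1000;
  return options;
}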
+ *(s->status) = MergeHelper::TimedFullMerge( + merge_operator, s->key->user_key(), MergeHelper::kWideBaseValue, + v, merge_context->GetOperands(), s->logger, s->statistics, + s->clock, /* update_num_ops_stats */ true, s->value, s->columns, /* op_failure_scope */ nullptr); - - if (s->status->ok()) { - *(s->status) = s->columns->SetWideColumnValue(result); - } } } else if (s->value) { Slice value_of_default; @@ -1214,25 +1172,14 @@ static bool SaveValue(void* arg, const char* entry) { case kTypeRangeDeletion: { if (*(s->merge_in_progress)) { if (s->value || s->columns) { - std::string result; // `op_failure_scope` (an output parameter) is not provided (set to // nullptr) since a failure must be propagated regardless of its // value. *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), nullptr, - merge_context->GetOperands(), &result, s->logger, s->statistics, - s->clock, /* result_operand */ nullptr, - /* update_num_ops_stats */ true, + merge_operator, s->key->user_key(), MergeHelper::kNoBaseValue, + merge_context->GetOperands(), s->logger, s->statistics, + s->clock, /* update_num_ops_stats */ true, s->value, s->columns, /* op_failure_scope */ nullptr); - - if (s->status->ok()) { - if (s->value) { - *(s->value) = std::move(result); - } else { - assert(s->columns); - s->columns->SetPlainValue(result); - } - } } else { // We have found a final value (a base deletion) and have newer // merge operands that we do not intend to merge. Nothing remains @@ -1260,28 +1207,19 @@ static bool SaveValue(void* arg, const char* entry) { *(s->merge_in_progress) = true; merge_context->PushOperand( v, s->inplace_update_support == false /* operand_pinned */); + PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1); + if (s->do_merge && merge_operator->ShouldMerge( merge_context->GetOperandsDirectionBackward())) { if (s->value || s->columns) { - std::string result; // `op_failure_scope` (an output parameter) is not provided (set to // nullptr) since a failure must be propagated regardless of its // value. *(s->status) = MergeHelper::TimedFullMerge( - merge_operator, s->key->user_key(), nullptr, - merge_context->GetOperands(), &result, s->logger, s->statistics, - s->clock, /* result_operand */ nullptr, - /* update_num_ops_stats */ true, + merge_operator, s->key->user_key(), MergeHelper::kNoBaseValue, + merge_context->GetOperands(), s->logger, s->statistics, + s->clock, /* update_num_ops_stats */ true, s->value, s->columns, /* op_failure_scope */ nullptr); - - if (s->status->ok()) { - if (s->value) { - *(s->value) = std::move(result); - } else { - assert(s->columns); - s->columns->SetPlainValue(result); - } - } } *(s->found_final_value) = true; @@ -1320,6 +1258,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, // Avoiding recording stats for speed. 
return false; } + PERF_TIMER_GUARD(get_from_memtable_time); std::unique_ptr range_del_iter( @@ -1343,8 +1282,7 @@ bool MemTable::Get(const LookupKey& key, std::string* value, bool found_final_value = false; bool merge_in_progress = s->IsMergeInProgress(); bool may_contain = true; - size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size(); - Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz); + Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz_); bool bloom_checked = false; if (bloom_filter_) { // when both memtable_whole_key_filtering and prefix_extractor_ are set, @@ -1482,18 +1420,24 @@ void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range, } SequenceNumber dummy_seq; GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true, - callback, &iter->is_blob_index, iter->value->GetSelf(), - /*columns=*/nullptr, iter->timestamp, iter->s, - &(iter->merge_context), &dummy_seq, &found_final_value, - &merge_in_progress); + callback, &iter->is_blob_index, + iter->value ? iter->value->GetSelf() : nullptr, iter->columns, + iter->timestamp, iter->s, &(iter->merge_context), &dummy_seq, + &found_final_value, &merge_in_progress); if (!found_final_value && merge_in_progress) { *(iter->s) = Status::MergeInProgress(); } if (found_final_value) { - iter->value->PinSelf(); - range->AddValueSize(iter->value->size()); + if (iter->value) { + iter->value->PinSelf(); + range->AddValueSize(iter->value->size()); + } else { + assert(iter->columns); + range->AddValueSize(iter->columns->serialized_size()); + } + range->MarkKeyDone(iter); RecordTick(moptions_.statistics, MEMTABLE_HIT); if (range->GetValueSize() > read_options.value_size_soft_limit) { @@ -1723,4 +1667,22 @@ uint64_t MemTable::GetMinLogContainingPrepSection() { return min_prep_log_referenced_.load(); } +void MemTable::MaybeUpdateNewestUDT(const Slice& user_key) { + if (ts_sz_ == 0 || persist_user_defined_timestamps_) { + return; + } + const Comparator* ucmp = GetInternalKeyComparator().user_comparator(); + Slice udt = ExtractTimestampFromUserKey(user_key, ts_sz_); + if (newest_udt_.empty() || ucmp->CompareTimestamp(udt, newest_udt_) > 0) { + newest_udt_ = udt; + } +} + +const Slice& MemTable::GetNewestUDT() const { + // This path should not be invoked for MemTables that does not enable the UDT + // in Memtable only feature. + assert(ts_sz_ > 0 && !persist_user_defined_timestamps_); + return newest_udt_; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/db/memtable.h b/db/memtable.h index 1fa8c9ca699e..d3d2322c7830 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -68,6 +68,7 @@ struct MemTablePostProcessInfo { uint64_t data_size = 0; uint64_t num_entries = 0; uint64_t num_deletes = 0; + uint64_t num_range_deletes = 0; }; using MultiGetRange = MultiGetContext::Range; @@ -332,6 +333,10 @@ class MemTable { num_deletes_.fetch_add(update_counters.num_deletes, std::memory_order_relaxed); } + if (update_counters.num_range_deletes > 0) { + num_range_deletes_.fetch_add(update_counters.num_range_deletes, + std::memory_order_relaxed); + } UpdateFlushState(); } @@ -349,10 +354,21 @@ class MemTable { return num_deletes_.load(std::memory_order_relaxed); } + // Get total number of range deletions in the mem table. + // REQUIRES: external synchronization to prevent simultaneous + // operations on the same MemTable (unless this Memtable is immutable). 
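The MultiGet()/GetFromTable() changes above let the memtable fill either a plain value or a wide-column entity for a lookup. A hedged sketch of the public wide-column API this serves (PutEntity/GetEntity are assumed entry points and are not part of this patch):

Status RoundTripEntity(DB* db) {
  // Columns listed in name order; the memtable read paths above can now
  // return them before the data has ever been flushed.
  WideColumns columns{{"attr1", "v1"}, {"attr2", "v2"}};
  Status s = db->PutEntity(WriteOptions(), db->DefaultColumnFamily(), "key1",
                           columns);
  if (!s.ok()) {
    return s;
  }
  PinnableWideColumns result;
  return db->GetEntity(ReadOptions(), db->DefaultColumnFamily(), "key1",
                       &result);
}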
+ uint64_t num_range_deletes() const { + return num_range_deletes_.load(std::memory_order_relaxed); + } + uint64_t get_data_size() const { return data_size_.load(std::memory_order_relaxed); } + size_t write_buffer_size() const { + return write_buffer_size_.load(std::memory_order_relaxed); + } + // Dynamically change the memtable's capacity. If set below the current usage, // the next key added will trigger a flush. Can only increase size when // memtable prefix bloom is disabled, since we can't easily allocate more @@ -506,7 +522,6 @@ class MemTable { flush_in_progress_ = in_progress; } -#ifndef ROCKSDB_LITE void SetFlushJobInfo(std::unique_ptr&& info) { flush_job_info_ = std::move(info); } @@ -514,7 +529,6 @@ class MemTable { std::unique_ptr ReleaseFlushJobInfo() { return std::move(flush_job_info_); } -#endif // !ROCKSDB_LITE // Returns a heuristic flush decision bool ShouldFlushNow(); @@ -539,9 +553,17 @@ class MemTable { } } + // Get the newest user-defined timestamp contained in this MemTable. Check + // `newest_udt_` for what newer means. This method should only be invoked for + // an MemTable that has enabled user-defined timestamp feature and set + // `persist_user_defined_timestamps` to false. The tracked newest UDT will be + // used by flush job in the background to help check the MemTable's + // eligibility for Flush. + const Slice& GetNewestUDT() const; + // Returns Corruption status if verification fails. static Status VerifyEntryChecksum(const char* entry, - size_t protection_bytes_per_key, + uint32_t protection_bytes_per_key, bool allow_data_in_errors = false); private: @@ -565,6 +587,7 @@ class MemTable { std::atomic data_size_; std::atomic num_entries_; std::atomic num_deletes_; + std::atomic num_range_deletes_; // Dynamically changeable memtable option std::atomic write_buffer_size_; @@ -610,7 +633,7 @@ class MemTable { const SliceTransform* insert_with_hint_prefix_extractor_; // Insert hints for each prefix. - UnorderedMapH insert_hints_; + UnorderedMapH insert_hints_; // Timestamp of oldest key std::atomic oldest_key_time_; @@ -630,10 +653,25 @@ class MemTable { std::atomic_bool disable_auto_flush_; -#ifndef ROCKSDB_LITE + // max range deletions in a memtable, before automatic flushing, 0 for + // unlimited. + uint32_t memtable_max_range_deletions_ = 0; + // Flush job info of the current memtable. std::unique_ptr flush_job_info_; -#endif // !ROCKSDB_LITE + + // Size in bytes for the user-defined timestamps. + size_t ts_sz_; + + // Whether to persist user-defined timestamps + bool persist_user_defined_timestamps_; + + // Newest user-defined timestamp contained in this MemTable. For ts1, and ts2 + // if Comparator::CompareTimestamp(ts1, ts2) > 0, ts1 is considered newer than + // ts2. We track this field for a MemTable if its column family has UDT + // feature enabled and the `persist_user_defined_timestamp` flag is false. + // Otherwise, this field just contains an empty Slice. 
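newest_udt_ is only maintained for column families that use user-defined timestamps without persisting them. A hedged configuration sketch mirroring the setup of the new MemTableListWithTimestampTest further down (BytewiseComparatorWithU64TsWrapper() is the test helper used there):

ColumnFamilyOptions MakeUdtInMemtableOnlyCfOptions() {
  ColumnFamilyOptions cf_options;
  // A comparator with a 64-bit timestamp suffix enables user-defined
  // timestamps for the column family...
  cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper();
  // ...and disabling persistence selects the "UDT in memtable only" mode,
  // in which flush consults the newest UDT tracked by each memtable.
  cf_options.persist_user_defined_timestamps = false;
  return cf_options;
}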
+ Slice newest_udt_; // Updates flush_state_ using ShouldFlushNow() void UpdateFlushState(); @@ -671,6 +709,8 @@ class MemTable { void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info, const Slice& key, const Slice& value, ValueType type, SequenceNumber s, char* checksum_ptr); + + void MaybeUpdateNewestUDT(const Slice& user_key); }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 7d6c2b88d2b6..ebf0813c2d82 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -184,7 +184,7 @@ bool MemTableListVersion::GetFromList( assert(*seq != kMaxSequenceNumber || s->IsNotFound()); return true; } - if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { + if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { return false; } } @@ -434,23 +434,57 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, } void MemTableList::RollbackMemtableFlush(const autovector& mems, - uint64_t /*file_number*/) { + bool rollback_succeeding_memtables) { + TEST_SYNC_POINT("RollbackMemtableFlush"); AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_MEMTABLE_ROLLBACK); - assert(!mems.empty()); - - // If the flush was not successful, then just reset state. - // Maybe a succeeding attempt to flush will be successful. +#ifndef NDEBUG for (MemTable* m : mems) { assert(m->flush_in_progress_); assert(m->file_number_ == 0); + } +#endif + + if (rollback_succeeding_memtables && !mems.empty()) { + std::list& memlist = current_->memlist_; + auto it = memlist.rbegin(); + for (; *it != mems[0] && it != memlist.rend(); ++it) { + } + // mems should be in memlist + assert(*it == mems[0]); + if (*it == mems[0]) { + ++it; + } + while (it != memlist.rend()) { + MemTable* m = *it; + // Only rollback complete, not in-progress, + // in_progress can be flushes that are still writing SSTs + if (m->flush_completed_) { + m->flush_in_progress_ = false; + m->flush_completed_ = false; + m->edit_.Clear(); + m->file_number_ = 0; + num_flush_not_started_++; + ++it; + } else { + break; + } + } + } - m->flush_in_progress_ = false; - m->flush_completed_ = false; - m->edit_.Clear(); - num_flush_not_started_++; + for (MemTable* m : mems) { + if (m->flush_in_progress_) { + assert(m->file_number_ == 0); + m->file_number_ = 0; + m->flush_in_progress_ = false; + m->flush_completed_ = false; + m->edit_.Clear(); + num_flush_not_started_++; + } + } + if (!mems.empty()) { + imm_flush_needed.store(true, std::memory_order_release); } - imm_flush_needed.store(true, std::memory_order_release); } // Try record a successful flush in the manifest file. It might just return @@ -467,6 +501,8 @@ Status MemTableList::TryInstallMemtableFlushResults( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + // Flush was successful // Record the status on the memtable object. Either this call or a call by a // concurrent flush thread will read the status and write it to manifest. 
@@ -529,14 +565,10 @@ Status MemTableList::TryInstallMemtableFlushResults( edit_list.push_back(&m->edit_); memtables_to_flush.push_back(m); -#ifndef ROCKSDB_LITE std::unique_ptr info = m->ReleaseFlushJobInfo(); if (info != nullptr) { committed_flush_jobs_info->push_back(std::move(info)); } -#else - (void)committed_flush_jobs_info; -#endif // !ROCKSDB_LITE } batch_count++; } @@ -582,8 +614,8 @@ Status MemTableList::TryInstallMemtableFlushResults( }; if (write_edits) { // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory, /*new_descriptor_log=*/false, + s = vset->LogAndApply(cfd, mutable_cf_options, read_options, edit_list, + mu, db_directory, /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, manifest_write_cb); } else { @@ -802,6 +834,8 @@ Status InstallMemtableAtomicFlushResults( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + size_t num = mems_list.size(); assert(cfds.size() == num); if (imm_lists != nullptr) { @@ -825,7 +859,6 @@ Status InstallMemtableAtomicFlushResults( (*mems_list[k])[i]->SetFlushCompleted(true); (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber()); } -#ifndef ROCKSDB_LITE if (committed_flush_jobs_info[k]) { assert(!mems_list[k]->empty()); assert((*mems_list[k])[0]); @@ -833,9 +866,6 @@ Status InstallMemtableAtomicFlushResults( (*mems_list[k])[0]->ReleaseFlushJobInfo(); committed_flush_jobs_info[k]->push_back(std::move(flush_job_info)); } -#else //! ROCKSDB_LITE - (void)committed_flush_jobs_info; -#endif // ROCKSDB_LITE } Status s; @@ -906,8 +936,8 @@ Status InstallMemtableAtomicFlushResults( } // this can release and reacquire the mutex. - s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory); + s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, db_directory); for (size_t k = 0; k != cfds.size(); ++k) { auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k); diff --git a/db/memtable_list.h b/db/memtable_list.h index 33e9b0046a65..f4f342ed5447 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -271,8 +271,20 @@ class MemTableList { // Reset status of the given memtable list back to pending state so that // they can get picked up again on the next round of flush. + // + // @param rollback_succeeding_memtables If true, will rollback adjacent + // younger memtables whose flush is completed. Specifically, suppose the + // current immutable memtables are M_0,M_1...M_N ordered from youngest to + // oldest. Suppose that the youngest memtable in `mems` is M_K. We will try to + // rollback M_K-1, M_K-2... until the first memtable whose flush is + // not completed. These are the memtables that would have been installed + // by this flush job if it were to succeed. This flag is currently used + // by non atomic_flush rollback. + // Note that we also do rollback in `write_manifest_cb` by calling + // `RemoveMemTablesOrRestoreFlags()`. There we rollback the entire batch so + // it is similar to what we do here with rollback_succeeding_memtables=true. void RollbackMemtableFlush(const autovector& mems, - uint64_t file_number); + bool rollback_succeeding_memtables); // Try commit a successful flush in the manifest file. It might just return // Status::OK letting a concurrent flush to do the actual the recording. 
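A hedged sketch of a call site for the new flag (hypothetical; the actual flush-job wiring is outside this hunk). Per the comment above, only the non-atomic-flush rollback currently passes true:

// `cfd`, `mems` and `atomic_flush` stand in for the real flush-job members.
void RollbackFailedFlush(ColumnFamilyData* cfd,
                         const autovector<MemTable*>& mems,
                         bool atomic_flush) {
  // Non-atomic flush also rolls back younger memtables whose flush already
  // completed; atomic flush rolls back its whole batch elsewhere.
  cfd->imm()->RollbackMemtableFlush(
      mems, /*rollback_succeeding_memtables=*/!atomic_flush);
}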
@@ -374,14 +386,43 @@ class MemTableList { return memlist.back()->GetID(); } - uint64_t GetLatestMemTableID() const { + uint64_t GetLatestMemTableID(bool for_atomic_flush) const { auto& memlist = current_->memlist_; if (memlist.empty()) { return 0; } + if (for_atomic_flush) { + // Scan the memtable list from new to old + for (auto it = memlist.begin(); it != memlist.end(); ++it) { + MemTable* m = *it; + if (m->atomic_flush_seqno_ != kMaxSequenceNumber) { + return m->GetID(); + } + } + return 0; + } return memlist.front()->GetID(); } + // DB mutex held. + // Gets the newest user-defined timestamp for the Memtables in ascending ID + // order, up to the `max_memtable_id`. Used by background flush job + // to check Memtables' eligibility for flush w.r.t retaining UDTs. + std::vector GetTablesNewestUDT(uint64_t max_memtable_id) { + std::vector newest_udts; + auto& memlist = current_->memlist_; + // Iterating through the memlist starting at the end, the vector + // ret is filled with memtables already sorted in increasing MemTable ID. + for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) { + MemTable* m = *it; + if (m->GetID() > max_memtable_id) { + break; + } + newest_udts.push_back(m->GetNewestUDT()); + } + return newest_udts; + } + void AssignAtomicFlushSeq(const SequenceNumber& seq) { const auto& memlist = current_->memlist_; // Scan the memtable list from new to old diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 8242061afb0d..9a5b7557f89f 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -43,6 +43,9 @@ class MemTableListTest : public testing::Test { // Open DB only with default column family ColumnFamilyOptions cf_options; std::vector cf_descs; + if (udt_enabled_) { + cf_options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options); Status s = DB::Open(options, dbname, cf_descs, &handles, &db); EXPECT_OK(s); @@ -67,11 +70,9 @@ class MemTableListTest : public testing::Test { ~MemTableListTest() override { if (db) { std::vector cf_descs(handles.size()); -#ifndef ROCKSDB_LITE for (int i = 0; i != static_cast(handles.size()); ++i) { EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i])); } -#endif // !ROCKSDB_LITE for (auto h : handles) { if (h) { EXPECT_OK(db->DestroyColumnFamilyHandle(h)); @@ -105,8 +106,9 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr, /*db_id*/ "", - /*db_session_id*/ ""); + /*io_tracer=*/nullptr, /*db_id=*/"", + /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -156,8 +158,9 @@ class MemTableListTest : public testing::Test { VersionSet versions(dbname, &immutable_db_options, env_options, table_cache.get(), &write_buffer_manager, &write_controller, /*block_cache_tracer=*/nullptr, - /*io_tracer=*/nullptr, /*db_id*/ "", - /*db_session_id*/ ""); + /*io_tracer=*/nullptr, /*db_id=*/"", + /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr); std::vector cf_descs; cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions()); cf_descs.emplace_back("one", ColumnFamilyOptions()); @@ -202,6 +205,9 @@ class MemTableListTest : public testing::Test { nullptr /* 
prep_tracker */, &mutex, file_meta_ptrs, committed_flush_jobs_info, to_delete, nullptr, &log_buffer); } + + protected: + bool udt_enabled_ = false; }; TEST_F(MemTableListTest, Empty) { @@ -678,7 +684,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Revert flush - list.RollbackMemtableFlush(to_flush, 0); + list.RollbackMemtableFlush(to_flush, false); ASSERT_FALSE(list.IsFlushPending()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); to_flush.clear(); @@ -728,7 +734,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); // Rollback first pick of tables - list.RollbackMemtableFlush(to_flush, 0); + list.RollbackMemtableFlush(to_flush, false); ASSERT_TRUE(list.IsFlushPending()); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); to_flush.clear(); @@ -829,7 +835,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { // Add another table list.Add(tables[5], &to_delete); ASSERT_EQ(1, list.NumNotFlushed()); - ASSERT_EQ(5, list.GetLatestMemTableID()); + ASSERT_EQ(5, list.GetLatestMemTableID(false /* for_atomic_flush */)); memtable_id = 4; // Pick tables to flush. The tables to pick must have ID smaller than or // equal to 4. Therefore, no table will be selected in this case. @@ -870,7 +876,7 @@ TEST_F(MemTableListTest, FlushPendingTest) { to_delete.clear(); } -TEST_F(MemTableListTest, EmptyAtomicFlusTest) { +TEST_F(MemTableListTest, EmptyAtomicFlushTest) { autovector lists; autovector cf_ids; autovector options_list; @@ -882,7 +888,7 @@ TEST_F(MemTableListTest, EmptyAtomicFlusTest) { ASSERT_TRUE(to_delete.empty()); } -TEST_F(MemTableListTest, AtomicFlusTest) { +TEST_F(MemTableListTest, AtomicFlushTest) { const int num_cfs = 3; const int num_tables_per_cf = 2; SequenceNumber seq = 1; @@ -1030,6 +1036,86 @@ TEST_F(MemTableListTest, AtomicFlusTest) { } } +class MemTableListWithTimestampTest : public MemTableListTest { + public: + MemTableListWithTimestampTest() : MemTableListTest() {} + + void SetUp() override { udt_enabled_ = true; } +}; + +TEST_F(MemTableListWithTimestampTest, GetTableNewestUDT) { + const int num_tables = 3; + const int num_entries = 5; + SequenceNumber seq = 1; + + auto factory = std::make_shared(); + options.memtable_factory = factory; + options.persist_user_defined_timestamps = false; + ImmutableOptions ioptions(options); + const Comparator* ucmp = test::BytewiseComparatorWithU64TsWrapper(); + InternalKeyComparator cmp(ucmp); + WriteBufferManager wb(options.db_write_buffer_size); + + // Create MemTableList + int min_write_buffer_number_to_merge = 1; + int max_write_buffer_number_to_maintain = 4; + int64_t max_write_buffer_size_to_maintain = + 4 * static_cast(options.write_buffer_size); + MemTableList list(min_write_buffer_number_to_merge, + max_write_buffer_number_to_maintain, + max_write_buffer_size_to_maintain); + + // Create some MemTables + uint64_t memtable_id = 0; + std::vector tables; + MutableCFOptions mutable_cf_options(options); + uint64_t current_ts = 0; + autovector to_delete; + std::vector newest_udts; + + std::string key; + std::string write_ts; + for (int i = 0; i < num_tables; i++) { + MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb, + kMaxSequenceNumber, 0 /* column_family_id */); + mem->SetID(memtable_id++); + mem->Ref(); + + std::string value; + MergeContext merge_context; + + for (int j = 0; j < num_entries; j++) { + key = "key1"; + write_ts.clear(); + PutFixed64(&write_ts, 
current_ts); + key.append(write_ts); + ASSERT_OK(mem->Add(++seq, kTypeValue, key, std::to_string(i), + nullptr /* kv_prot_info */)); + current_ts++; + } + + tables.push_back(mem); + list.Add(tables.back(), &to_delete); + newest_udts.push_back(write_ts); + } + + ASSERT_EQ(num_tables, list.NumNotFlushed()); + ASSERT_TRUE(list.IsFlushPending()); + std::vector tables_newest_udts = list.GetTablesNewestUDT(num_tables); + ASSERT_EQ(newest_udts.size(), tables_newest_udts.size()); + for (size_t i = 0; i < tables_newest_udts.size(); i++) { + const Slice& table_newest_udt = tables_newest_udts[i]; + const Slice expected_newest_udt = newest_udts[i]; + ASSERT_EQ(expected_newest_udt, table_newest_udt); + } + + list.current()->Unref(&to_delete); + for (MemTable* m : to_delete) { + delete m; + } + to_delete.clear(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/merge_helper.cc b/db/merge_helper.cc index e29d9c5badb8..d8b1d788bb56 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -13,9 +13,10 @@ #include "db/compaction/compaction_iteration_stats.h" #include "db/dbformat.h" #include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "logging/logging.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/likely.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" @@ -23,6 +24,7 @@ #include "rocksdb/system_clock.h" #include "table/format.h" #include "table/internal_iterator.h" +#include "util/overload.h" namespace ROCKSDB_NAMESPACE { @@ -56,120 +58,326 @@ MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator, } } -Status MergeHelper::TimedFullMerge( - const MergeOperator* merge_operator, const Slice& key, const Slice* value, - const std::vector& operands, std::string* result, Logger* logger, - Statistics* statistics, SystemClock* clock, Slice* result_operand, - bool update_num_ops_stats, - MergeOperator::OpFailureScope* op_failure_scope) { - assert(merge_operator != nullptr); - - if (operands.empty()) { - assert(value != nullptr && result != nullptr); - result->assign(value->data(), value->size()); - return Status::OK(); - } +template +Status MergeHelper::TimedFullMergeCommonImpl( + const MergeOperator* merge_operator, const Slice& key, + MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, + const std::vector& operands, Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope, Visitor&& visitor) { + assert(merge_operator); + assert(!operands.empty()); if (update_num_ops_stats) { RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS, static_cast(operands.size())); } + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + MergeOperator::MergeOperationOutputV3 merge_out; + bool success = false; - Slice tmp_result_operand(nullptr, 0); - const MergeOperator::MergeOperationInput merge_in(key, value, operands, - logger); - MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand); + { - // Setup to time the merge StopWatchNano timer(clock, statistics != nullptr); PERF_TIMER_GUARD(merge_operator_time_nanos); - // Do the merge - success = merge_operator->FullMergeV2(merge_in, &merge_out); - - if (tmp_result_operand.data()) { - // FullMergeV2 result is an existing operand - if (result_operand != nullptr) { - *result_operand = tmp_result_operand; - } else { - 
result->assign(tmp_result_operand.data(), tmp_result_operand.size()); - } - } else if (result_operand) { - *result_operand = Slice(nullptr, 0); - } + success = merge_operator->FullMergeV3(merge_in, &merge_out); RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, statistics ? timer.ElapsedNanos() : 0); } - if (op_failure_scope != nullptr) { - *op_failure_scope = merge_out.op_failure_scope; - // Apply default per merge_operator.h - if (*op_failure_scope == MergeOperator::OpFailureScope::kDefault) { - *op_failure_scope = MergeOperator::OpFailureScope::kTryMerge; - } - } - if (!success) { RecordTick(statistics, NUMBER_MERGE_FAILURES); - return Status::Corruption("Error: Could not perform merge."); + + if (op_failure_scope) { + *op_failure_scope = merge_out.op_failure_scope; + // Apply default per merge_operator.h + if (*op_failure_scope == MergeOperator::OpFailureScope::kDefault) { + *op_failure_scope = MergeOperator::OpFailureScope::kTryMerge; + } + } + + return Status::Corruption(Status::SubCode::kMergeOperatorFailed); } - return Status::OK(); + return std::visit(std::forward(visitor), + std::move(merge_out.new_value)); +} + +Status MergeHelper::TimedFullMergeImpl( + const MergeOperator* merge_operator, const Slice& key, + MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, + const std::vector& operands, Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, std::string* result, + Slice* result_operand, ValueType* result_type, + MergeOperator::OpFailureScope* op_failure_scope) { + assert(result); + assert(result_type); + + auto visitor = overload{ + [&](std::string&& new_value) -> Status { + *result_type = kTypeValue; + + if (result_operand) { + *result_operand = Slice(nullptr, 0); + } + + *result = std::move(new_value); + + return Status::OK(); + }, + [&](MergeOperator::MergeOperationOutputV3::NewColumns&& new_columns) + -> Status { + *result_type = kTypeWideColumnEntity; + + if (result_operand) { + *result_operand = Slice(nullptr, 0); + } + + result->clear(); + + WideColumns sorted_columns; + sorted_columns.reserve(new_columns.size()); + + for (const auto& column : new_columns) { + sorted_columns.emplace_back(column.first, column.second); + } + + WideColumnsHelper::SortColumns(sorted_columns); + + return WideColumnSerialization::Serialize(sorted_columns, *result); + }, + [&](Slice&& operand) -> Status { + *result_type = kTypeValue; + + if (result_operand) { + *result_operand = operand; + result->clear(); + } else { + result->assign(operand.data(), operand.size()); + } + + return Status::OK(); + }}; + + return TimedFullMergeCommonImpl(merge_operator, key, + std::move(existing_value), operands, logger, + statistics, clock, update_num_ops_stats, + op_failure_scope, std::move(visitor)); +} + +Status MergeHelper::TimedFullMergeImpl( + const MergeOperator* merge_operator, const Slice& key, + MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, + const std::vector& operands, Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, std::string* result_value, + PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope) { + assert(result_value || result_entity); + assert(!result_value || !result_entity); + + auto visitor = overload{ + [&](std::string&& new_value) -> Status { + if (result_value) { + *result_value = std::move(new_value); + + return Status::OK(); + } + + assert(result_entity); + result_entity->SetPlainValue(std::move(new_value)); + + return Status::OK(); + }, 
+ [&](MergeOperator::MergeOperationOutputV3::NewColumns&& new_columns) + -> Status { + if (result_value) { + if (!new_columns.empty() && + new_columns.front().first == kDefaultWideColumnName) { + *result_value = std::move(new_columns.front().second); + } else { + result_value->clear(); + } + + return Status::OK(); + } + + assert(result_entity); + + WideColumns sorted_columns; + sorted_columns.reserve(new_columns.size()); + + for (const auto& column : new_columns) { + sorted_columns.emplace_back(column.first, column.second); + } + + WideColumnsHelper::SortColumns(sorted_columns); + + std::string result; + const Status s = + WideColumnSerialization::Serialize(sorted_columns, result); + if (!s.ok()) { + result_entity->Reset(); + return s; + } + + return result_entity->SetWideColumnValue(std::move(result)); + }, + [&](Slice&& operand) -> Status { + if (result_value) { + result_value->assign(operand.data(), operand.size()); + + return Status::OK(); + } + + assert(result_entity); + result_entity->SetPlainValue(operand); + + return Status::OK(); + }}; + + return TimedFullMergeCommonImpl(merge_operator, key, + std::move(existing_value), operands, logger, + statistics, clock, update_num_ops_stats, + op_failure_scope, std::move(visitor)); +} + +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, NoBaseValueTag, + const std::vector& operands, Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, std::string* result, + Slice* result_operand, ValueType* result_type, + MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result, result_operand, + result_type, op_failure_scope); } -Status MergeHelper::TimedFullMergeWithEntity( - const MergeOperator* merge_operator, const Slice& key, Slice base_entity, - const std::vector& operands, std::string* result, Logger* logger, +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, + const Slice& value, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result, Slice* result_operand, ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope) { - WideColumns base_columns; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(value); - { - const Status s = - WideColumnSerialization::Deserialize(base_entity, base_columns); - if (!s.ok()) { - return s; - } - } + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result, result_operand, + result_type, op_failure_scope); +} + +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, + const Slice& entity, const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result, Slice* result_operand, ValueType* result_type, + MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; - const bool has_default_column = - !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName; + Slice entity_copy(entity); + WideColumns existing_columns; - Slice value_of_default; - if (has_default_column) { - 
value_of_default = base_columns[0].value(); + const Status s = + WideColumnSerialization::Deserialize(entity_copy, existing_columns); + if (!s.ok()) { + return s; } - std::string merge_result; + existing_value = std::move(existing_columns); - { - const Status s = TimedFullMerge(merge_operator, key, &value_of_default, - operands, &merge_result, logger, statistics, - clock, nullptr /* result_operand */, - update_num_ops_stats, op_failure_scope); - if (!s.ok()) { - return s; - } - } + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result, result_operand, + result_type, op_failure_scope); +} - if (has_default_column) { - base_columns[0].value() = merge_result; +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, + const WideColumns& columns, const std::vector& operands, + Logger* logger, Statistics* statistics, SystemClock* clock, + bool update_num_ops_stats, std::string* result, Slice* result_operand, + ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns); + + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result, result_operand, + result_type, op_failure_scope); +} - const Status s = WideColumnSerialization::Serialize(base_columns, *result); - if (!s.ok()) { - return s; - } - } else { - const Status s = - WideColumnSerialization::Serialize(merge_result, base_columns, *result); - if (!s.ok()) { - return s; - } +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, NoBaseValueTag, + const std::vector& operands, Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, std::string* result_value, + PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result_value, result_entity, + op_failure_scope); +} + +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, + const Slice& value, const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(value); + + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result_value, result_entity, + op_failure_scope); +} + +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, + const Slice& entity, const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + + Slice entity_copy(entity); + WideColumns existing_columns; + + const Status s = + WideColumnSerialization::Deserialize(entity_copy, existing_columns); + if (!s.ok()) { + return s; } - 
return Status::OK(); + existing_value = std::move(existing_columns); + + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result_value, result_entity, + op_failure_scope); +} + +Status MergeHelper::TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, + const WideColumns& columns, const std::vector& operands, + Logger* logger, Statistics* statistics, SystemClock* clock, + bool update_num_ops_stats, std::string* result_value, + PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope) { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(columns); + + return TimedFullMergeImpl(merge_operator, key, std::move(existing_value), + operands, logger, statistics, clock, + update_num_ops_stats, result_value, result_entity, + op_failure_scope); } // PRE: iter points to the first merge type entry @@ -231,6 +439,10 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, s = Status::ShutdownInProgress(); return s; } + // Skip range tombstones emitted by the compaction iterator. + if (iter->IsDeleteRangeSentinelKey()) { + continue; + } ParsedInternalKey ikey; assert(keys_.size() == merge_context_.GetNumOperands()); @@ -283,7 +495,7 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // hit a put/delete/single delete // => merge the put value or a nullptr with operands_ // => store result in operands_.back() (and update keys_.back()) - // => change the entry type to kTypeValue for keys_.back() + // => change the entry type for keys_.back() // We are done! Success! // If there are no operands, just return the Status::OK(). That will cause @@ -296,24 +508,23 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, // TODO: if we're in compaction and it's a put, it would be nice to run // compaction filter on it. 
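The visitor-based dispatch in TimedFullMergeCommonImpl/TimedFullMergeImpl above relies on the usual "overloaded lambdas" idiom (pulled in via util/overload.h). A standalone illustration of the idiom, independent of RocksDB types:

#include <string>
#include <variant>

// Minimal stand-in for the helper assumed to live in util/overload.h.
template <typename... Ts>
struct overload : Ts... {
  using Ts::operator()...;
};
template <typename... Ts>
overload(Ts...) -> overload<Ts...>;

int main() {
  // Stand-in for MergeOperationOutputV3::new_value, which can hold a plain
  // value, a set of new columns, or an existing operand.
  std::variant<std::string, int> new_value = std::string("merged");
  auto visitor = overload{[](std::string&&) { /* plain merge result */ },
                          [](int&&) { /* some other representation */ }};
  std::visit(visitor, std::move(new_value));
  return 0;
}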
std::string merge_result; + ValueType merge_result_type; MergeOperator::OpFailureScope op_failure_scope; if (range_del_agg && range_del_agg->ShouldDelete( ikey, RangeDelPositioningMode::kForwardTraversal)) { - s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr, - merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_, - /* result_operand */ nullptr, - /* update_num_ops_stats */ false, &op_failure_scope); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, kNoBaseValue, + merge_context_.GetOperands(), logger_, stats_, + clock_, /* update_num_ops_stats */ false, + &merge_result, /* result_operand */ nullptr, + &merge_result_type, &op_failure_scope); } else if (ikey.type == kTypeValue) { - const Slice val = iter->value(); - - s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val, - merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_, - /* result_operand */ nullptr, - /* update_num_ops_stats */ false, &op_failure_scope); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, kPlainBaseValue, + iter->value(), merge_context_.GetOperands(), logger_, + stats_, clock_, /* update_num_ops_stats */ false, + &merge_result, /* result_operand */ nullptr, + &merge_result_type, &op_failure_scope); } else if (ikey.type == kTypeBlobIndex) { BlobIndex blob_index; @@ -343,22 +554,23 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, c_iter_stats->total_blob_bytes_read += bytes_read; } - s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value, - merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_, - /* result_operand */ nullptr, - /* update_num_ops_stats */ false, &op_failure_scope); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, kPlainBaseValue, + blob_value, merge_context_.GetOperands(), logger_, + stats_, clock_, /* update_num_ops_stats */ false, + &merge_result, /* result_operand */ nullptr, + &merge_result_type, &op_failure_scope); } else if (ikey.type == kTypeWideColumnEntity) { - s = TimedFullMergeWithEntity( - user_merge_operator_, ikey.user_key, iter->value(), - merge_context_.GetOperands(), &merge_result, logger_, stats_, - clock_, /* update_num_ops_stats */ false, &op_failure_scope); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, kWideBaseValue, + iter->value(), merge_context_.GetOperands(), logger_, + stats_, clock_, /* update_num_ops_stats */ false, + &merge_result, /* result_operand */ nullptr, + &merge_result_type, &op_failure_scope); } else { - s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr, - merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_, - /* result_operand */ nullptr, - /* update_num_ops_stats */ false, &op_failure_scope); + s = TimedFullMerge(user_merge_operator_, ikey.user_key, kNoBaseValue, + merge_context_.GetOperands(), logger_, stats_, + clock_, /* update_num_ops_stats */ false, + &merge_result, /* result_operand */ nullptr, + &merge_result_type, &op_failure_scope); } // We store the result in keys_.back() and operands_.back() @@ -366,10 +578,12 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, if (s.ok()) { // The original key encountered original_key = std::move(keys_.back()); - orig_ikey.type = ikey.type == kTypeWideColumnEntity - ? 
kTypeWideColumnEntity - : kTypeValue; + + assert(merge_result_type == kTypeValue || + merge_result_type == kTypeWideColumnEntity); + orig_ikey.type = merge_result_type; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); + keys_.clear(); merge_context_.Clear(); keys_.emplace_front(std::move(original_key)); @@ -494,19 +708,24 @@ Status MergeHelper::MergeUntil(InternalIterator* iter, assert(merge_context_.GetNumOperands() >= 1); assert(merge_context_.GetNumOperands() == keys_.size()); std::string merge_result; + ValueType merge_result_type; MergeOperator::OpFailureScope op_failure_scope; - s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, nullptr, - merge_context_.GetOperands(), &merge_result, logger_, - stats_, clock_, - /* result_operand */ nullptr, - /* update_num_ops_stats */ false, &op_failure_scope); + s = TimedFullMerge(user_merge_operator_, orig_ikey.user_key, kNoBaseValue, + merge_context_.GetOperands(), logger_, stats_, clock_, + /* update_num_ops_stats */ false, &merge_result, + /* result_operand */ nullptr, &merge_result_type, + &op_failure_scope); if (s.ok()) { // The original key encountered // We are certain that keys_ is not empty here (see assertions couple of // lines before). original_key = std::move(keys_.back()); - orig_ikey.type = kTypeValue; + + assert(merge_result_type == kTypeValue || + merge_result_type == kTypeWideColumnEntity); + orig_ikey.type = merge_result_type; UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type); + keys_.clear(); merge_context_.Clear(); keys_.emplace_front(std::move(original_key)); @@ -578,14 +797,15 @@ CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key, } compaction_filter_value_.clear(); compaction_filter_skip_until_.Clear(); - auto ret = compaction_filter_->FilterV2( - level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice, - &compaction_filter_value_, compaction_filter_skip_until_.rep()); + auto ret = compaction_filter_->FilterV3( + level_, user_key, CompactionFilter::ValueType::kMergeOperand, + &value_slice, /* existing_columns */ nullptr, &compaction_filter_value_, + /* new_columns */ nullptr, compaction_filter_skip_until_.rep()); if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) { if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(), user_key) <= 0) { // Invalid skip_until returned from compaction filter. - // Keep the key as per FilterV2 documentation. + // Keep the key as per FilterV2/FilterV3 documentation. ret = CompactionFilter::Decision::kKeep; } else { compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber, diff --git a/db/merge_helper.h b/db/merge_helper.h index 7f624b74328d..84c5f35351f9 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -41,30 +41,94 @@ class MergeHelper { Statistics* stats = nullptr, const std::atomic* shutting_down = nullptr); - // Wrapper around MergeOperator::FullMergeV2() that records perf statistics. - // Result of merge will be written to result if status returned is OK. - // If operands is empty, the value will simply be copied to result. - // Set `update_num_ops_stats` to true if it is from a user read, so that - // the latency is sensitive. + // Wrappers around MergeOperator::FullMergeV3() that record perf statistics. + // Set `update_num_ops_stats` to true if it is from a user read so that + // the corresponding statistics are updated. // Returns one of the following statuses: // - OK: Entries were successfully merged. 
  // - Corruption: Merge operator reported unsuccessful merge. The scope of the
  //   damage will be stored in `*op_failure_scope` when `op_failure_scope` is
  //   not nullptr
+
+  // Empty tag types to disambiguate overloads
+  struct NoBaseValueTag {};
+  static constexpr NoBaseValueTag kNoBaseValue{};
+
+  struct PlainBaseValueTag {};
+  static constexpr PlainBaseValueTag kPlainBaseValue{};
+
+  struct WideBaseValueTag {};
+  static constexpr WideBaseValueTag kWideBaseValue{};
+
+  // Variants that expose the merge result directly (in serialized form for wide
+  // columns) as well as its value type. Used by iterator and compaction.
+  static Status TimedFullMerge(const MergeOperator* merge_operator,
+                               const Slice& key, NoBaseValueTag,
+                               const std::vector<Slice>& operands,
+                               Logger* logger, Statistics* statistics,
+                               SystemClock* clock, bool update_num_ops_stats,
+                               std::string* result, Slice* result_operand,
+                               ValueType* result_type,
+                               MergeOperator::OpFailureScope* op_failure_scope);
+
+  static Status TimedFullMerge(
+      const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag,
+      const Slice& value, const std::vector<Slice>& operands, Logger* logger,
+      Statistics* statistics, SystemClock* clock, bool update_num_ops_stats,
+      std::string* result, Slice* result_operand, ValueType* result_type,
+      MergeOperator::OpFailureScope* op_failure_scope);
+
+  static Status TimedFullMerge(
+      const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag,
+      const Slice& entity, const std::vector<Slice>& operands, Logger* logger,
+      Statistics* statistics, SystemClock* clock, bool update_num_ops_stats,
+      std::string* result, Slice* result_operand, ValueType* result_type,
+      MergeOperator::OpFailureScope* op_failure_scope);
+
+  static Status TimedFullMerge(
+      const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag,
+      const WideColumns& columns, const std::vector<Slice>& operands,
+      Logger* logger, Statistics* statistics, SystemClock* clock,
+      bool update_num_ops_stats, std::string* result, Slice* result_operand,
+      ValueType* result_type, MergeOperator::OpFailureScope* op_failure_scope);
+
+  // Variants that expose the merge result translated to the form requested by
+  // the client. (For example, if the result is a wide-column structure but the
+  // client requested the results in plain-value form, the value of the default
+  // column is returned.) Used by point lookups.
static Status TimedFullMerge(const MergeOperator* merge_operator, - const Slice& key, const Slice* value, + const Slice& key, NoBaseValueTag, const std::vector& operands, - std::string* result, Logger* logger, - Statistics* statistics, SystemClock* clock, - Slice* result_operand, bool update_num_ops_stats, + Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, + PinnableWideColumns* result_entity, MergeOperator::OpFailureScope* op_failure_scope); - static Status TimedFullMergeWithEntity( - const MergeOperator* merge_operator, const Slice& key, Slice base_entity, - const std::vector& operands, std::string* result, Logger* logger, + static Status TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, PlainBaseValueTag, + const Slice& value, const std::vector& operands, Logger* logger, Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, PinnableWideColumns* result_entity, MergeOperator::OpFailureScope* op_failure_scope); + static Status TimedFullMerge( + const MergeOperator* merge_operator, const Slice& key, WideBaseValueTag, + const Slice& entity, const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope); + + static Status TimedFullMerge(const MergeOperator* merge_operator, + const Slice& key, WideBaseValueTag, + const WideColumns& columns, + const std::vector& operands, + Logger* logger, Statistics* statistics, + SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, + PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope); + // During compaction, merge entries until we hit // - a corrupted key // - a Put/Delete, @@ -198,6 +262,30 @@ class MergeHelper { // This is a best-effort facility, so memory_order_relaxed is sufficient. return shutting_down_ && shutting_down_->load(std::memory_order_relaxed); } + + template + static Status TimedFullMergeCommonImpl( + const MergeOperator* merge_operator, const Slice& key, + MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, + const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + MergeOperator::OpFailureScope* op_failure_scope, Visitor&& visitor); + + static Status TimedFullMergeImpl( + const MergeOperator* merge_operator, const Slice& key, + MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, + const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result, Slice* result_operand, ValueType* result_type, + MergeOperator::OpFailureScope* op_failure_scope); + + static Status TimedFullMergeImpl( + const MergeOperator* merge_operator, const Slice& key, + MergeOperator::MergeOperationInputV3::ExistingValue&& existing_value, + const std::vector& operands, Logger* logger, + Statistics* statistics, SystemClock* clock, bool update_num_ops_stats, + std::string* result_value, PinnableWideColumns* result_entity, + MergeOperator::OpFailureScope* op_failure_scope); }; // MergeOutputIterator can be used to iterate over the result of a merge. 
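The header changes above retire the old value-pointer overload of TimedFullMerge() and TimedFullMergeWithEntity() in favor of a single name whose overloads are selected by an empty tag argument (kNoBaseValue, kPlainBaseValue, kWideBaseValue), with one result family for compaction/iteration (serialized result plus its ValueType) and one for point lookups (result_value/result_entity). The sketch below condenses the MergeUntil() call sites shown earlier in this diff to illustrate how the tag picks the overload; it is illustrative only, and user_merge_operator_, logger_, stats_, clock_, ikey, iter and merge_context_ are the MergeHelper members and locals used at those call sites.

// Inside MergeHelper::MergeUntil() (compaction/iteration flavor): the merge
// result comes back in serialized form together with its value type.
std::string merge_result;
ValueType merge_result_type;
MergeOperator::OpFailureScope op_failure_scope;

// No base value, e.g. the key is covered by a range tombstone.
Status s = TimedFullMerge(user_merge_operator_, ikey.user_key, kNoBaseValue,
                          merge_context_.GetOperands(), logger_, stats_,
                          clock_, /* update_num_ops_stats */ false,
                          &merge_result, /* result_operand */ nullptr,
                          &merge_result_type, &op_failure_scope);

// Plain base value (kTypeValue): the value Slice follows the tag.
s = TimedFullMerge(user_merge_operator_, ikey.user_key, kPlainBaseValue,
                   iter->value(), merge_context_.GetOperands(), logger_,
                   stats_, clock_, /* update_num_ops_stats */ false,
                   &merge_result, /* result_operand */ nullptr,
                   &merge_result_type, &op_failure_scope);

// Wide-column base value (kTypeWideColumnEntity): the serialized entity
// follows the tag; a sibling overload accepts deserialized WideColumns.
s = TimedFullMerge(user_merge_operator_, ikey.user_key, kWideBaseValue,
                   iter->value(), merge_context_.GetOperands(), logger_,
                   stats_, clock_, /* update_num_ops_stats */ false,
                   &merge_result, /* result_operand */ nullptr,
                   &merge_result_type, &op_failure_scope);

// On success, merge_result_type is either kTypeValue or kTypeWideColumnEntity
// and is written back into the rewritten internal key (see MergeUntil above).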
diff --git a/db/merge_operator.cc b/db/merge_operator.cc index d325856406f8..bb5dbbc36533 100644 --- a/db/merge_operator.cc +++ b/db/merge_operator.cc @@ -9,6 +9,11 @@ #include "rocksdb/merge_operator.h" +#include + +#include "db/wide/wide_columns_helper.h" +#include "util/overload.h" + namespace ROCKSDB_NAMESPACE { bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in, @@ -23,6 +28,83 @@ bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in, &merge_out->new_value, merge_in.logger); } +bool MergeOperator::FullMergeV3(const MergeOperationInputV3& merge_in, + MergeOperationOutputV3* merge_out) const { + assert(merge_out); + + MergeOperationInput in_v2(merge_in.key, nullptr, merge_in.operand_list, + merge_in.logger); + + std::string new_value; + Slice existing_operand(nullptr, 0); + MergeOperationOutput out_v2(new_value, existing_operand); + + return std::visit( + overload{ + [&](const auto& existing) -> bool { + using T = std::decay_t; + + if constexpr (std::is_same_v) { + in_v2.existing_value = &existing; + } + + const bool result = FullMergeV2(in_v2, &out_v2); + if (!result) { + merge_out->op_failure_scope = out_v2.op_failure_scope; + return false; + } + + if (existing_operand.data()) { + merge_out->new_value = existing_operand; + } else { + merge_out->new_value = std::move(new_value); + } + + return true; + }, + [&](const WideColumns& existing_columns) -> bool { + const bool has_default_column = + WideColumnsHelper::HasDefaultColumn(existing_columns); + + Slice value_of_default; + if (has_default_column) { + value_of_default = existing_columns.front().value(); + } + + in_v2.existing_value = &value_of_default; + + const bool result = FullMergeV2(in_v2, &out_v2); + if (!result) { + merge_out->op_failure_scope = out_v2.op_failure_scope; + return false; + } + + merge_out->new_value = MergeOperationOutputV3::NewColumns(); + auto& new_columns = std::get( + merge_out->new_value); + new_columns.reserve(has_default_column + ? existing_columns.size() + : (existing_columns.size() + 1)); + + if (existing_operand.data()) { + new_columns.emplace_back(kDefaultWideColumnName.ToString(), + existing_operand.ToString()); + } else { + new_columns.emplace_back(kDefaultWideColumnName.ToString(), + std::move(new_value)); + } + + for (size_t i = has_default_column ? 1 : 0; + i < existing_columns.size(); ++i) { + new_columns.emplace_back(existing_columns[i].name().ToString(), + existing_columns[i].value().ToString()); + } + + return true; + }}, + merge_in.existing_value); +} + // The default implementation of PartialMergeMulti, which invokes // PartialMerge multiple times internally and merges two operands at // a time. 
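The FullMergeV3() fallback above dispatches on merge_in.existing_value, a variant holding either nothing (no base value), a Slice (plain base value), or WideColumns, via std::visit together with the overload helper from util/overload.h (not shown in this diff). For readers unfamiliar with that idiom, the following self-contained sketch shows the standard C++17 "overloaded lambdas" pattern it presumably relies on; the variant alternatives here are simplified stand-ins (std::string instead of Slice, a vector of name/value pairs instead of WideColumns), not the real RocksDB types.

#include <iostream>
#include <string>
#include <utility>
#include <variant>
#include <vector>

// The usual C++17 helper that turns a set of lambdas into one callable with
// several operator() overloads; util/overload.h presumably provides the same.
template <typename... Ts>
struct overload : Ts... {
  using Ts::operator()...;
};
template <typename... Ts>
overload(Ts...) -> overload<Ts...>;

// Simplified stand-ins for the three ExistingValue alternatives.
using Columns = std::vector<std::pair<std::string, std::string>>;
using ExistingValue = std::variant<std::monostate, std::string, Columns>;

std::string Describe(const ExistingValue& existing) {
  return std::visit(
      overload{
          // Generic lambda: covers both "no base value" and a plain value,
          // mirroring the first visitor in FullMergeV3 above.
          [](const auto&) -> std::string { return "plain value or none"; },
          // The more specific overload wins for the wide-column alternative,
          // mirroring the second visitor.
          [](const Columns& cols) -> std::string {
            return "wide columns: " + std::to_string(cols.size());
          }},
      existing);
}

int main() {
  std::cout << Describe(ExistingValue{}) << '\n';                    // no base
  std::cout << Describe(ExistingValue{std::string("val")}) << '\n';  // plain
  std::cout << Describe(ExistingValue{Columns{{"one", "1"}}}) << '\n';
  return 0;
}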
diff --git a/db/merge_test.cc b/db/merge_test.cc index 0d373d41ec70..93a8535a7ee1 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/utilities/db_ttl.h" +#include "rocksdb/wide_columns.h" #include "test_util/testharness.h" #include "util/coding.h" #include "utilities/merge_operators.h" @@ -104,8 +105,6 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, options.env = EnvMergeTest::GetInstance(); EXPECT_OK(DestroyDB(dbname, Options())); Status s; -// DBWithTTL is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE if (ttl) { DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options, dbname, &db_with_ttl); @@ -113,10 +112,6 @@ std::shared_ptr OpenDb(const std::string& dbname, const bool ttl = false, } else { s = DB::Open(options, dbname, &db); } -#else - assert(!ttl); - s = DB::Open(options, dbname, &db); -#endif // !ROCKSDB_LITE EXPECT_OK(s); assert(s.ok()); // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for @@ -595,7 +590,6 @@ TEST_F(MergeTest, MergeDbTest) { runTest(test::PerThreadDBPath("merge_testdb")); } -#ifndef ROCKSDB_LITE TEST_F(MergeTest, MergeDbTtlTest) { runTest(test::PerThreadDBPath("merge_testdbttl"), true); // Run test on TTL database @@ -613,7 +607,272 @@ TEST_F(MergeTest, MergeWithCompactionAndFlush) { } ASSERT_OK(DestroyDB(dbname, Options())); } -#endif // !ROCKSDB_LITE + +TEST_F(MergeTest, FullMergeV3FallbackNewValue) { + // Test that the default FullMergeV3 implementation correctly handles the case + // when FullMergeV2 results in a new value. + + const Slice key("foo"); + const MergeOperator::MergeOperationInputV3::OperandList operands{ + "first", "second", "third"}; + constexpr Logger* logger = nullptr; + + auto append_operator = + MergeOperators::CreateStringAppendOperator(std::string()); + + // No existing value + { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(append_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = std::get(merge_out.new_value); + ASSERT_EQ(result, operands[0].ToString() + operands[1].ToString() + + operands[2].ToString()); + } + + // Plain existing value + { + const Slice plain("plain"); + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(plain); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(append_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = std::get(merge_out.new_value); + ASSERT_EQ(result, plain.ToString() + operands[0].ToString() + + operands[1].ToString() + operands[2].ToString()); + } + + // Wide-column existing value with default column + { + const WideColumns entity{ + {kDefaultWideColumnName, "default"}, {"one", "1"}, {"two", "2"}}; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(entity); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(append_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = + std::get( + merge_out.new_value); + ASSERT_EQ(result.size(), entity.size()); + ASSERT_EQ(result[0].first, entity[0].name()); + 
ASSERT_EQ(result[0].second, + entity[0].value().ToString() + operands[0].ToString() + + operands[1].ToString() + operands[2].ToString()); + ASSERT_EQ(result[1].first, entity[1].name()); + ASSERT_EQ(result[1].second, entity[1].value()); + ASSERT_EQ(result[2].first, entity[2].name()); + ASSERT_EQ(result[2].second, entity[2].value()); + } + + // Wide-column existing value without default column + { + const WideColumns entity{{"one", "1"}, {"two", "2"}}; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(entity); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(append_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = + std::get( + merge_out.new_value); + ASSERT_EQ(result.size(), entity.size() + 1); + ASSERT_EQ(result[0].first, kDefaultWideColumnName); + ASSERT_EQ(result[0].second, operands[0].ToString() + + operands[1].ToString() + + operands[2].ToString()); + ASSERT_EQ(result[1].first, entity[0].name()); + ASSERT_EQ(result[1].second, entity[0].value()); + ASSERT_EQ(result[2].first, entity[1].name()); + ASSERT_EQ(result[2].second, entity[1].value()); + } +} + +TEST_F(MergeTest, FullMergeV3FallbackExistingOperand) { + // Test that the default FullMergeV3 implementation correctly handles the case + // when FullMergeV2 results in an existing operand. + + const Slice key("foo"); + const MergeOperator::MergeOperationInputV3::OperandList operands{ + "first", "second", "third"}; + constexpr Logger* logger = nullptr; + + auto put_operator = MergeOperators::CreatePutOperator(); + + // No existing value + { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(put_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = std::get(merge_out.new_value); + ASSERT_EQ(result.data(), operands.back().data()); + ASSERT_EQ(result.size(), operands.back().size()); + } + + // Plain existing value + { + const Slice plain("plain"); + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(plain); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(put_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = std::get(merge_out.new_value); + ASSERT_EQ(result.data(), operands.back().data()); + ASSERT_EQ(result.size(), operands.back().size()); + } + + // Wide-column existing value with default column + { + const WideColumns entity{ + {kDefaultWideColumnName, "default"}, {"one", "1"}, {"two", "2"}}; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(entity); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(put_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = + std::get( + merge_out.new_value); + ASSERT_EQ(result.size(), entity.size()); + ASSERT_EQ(result[0].first, entity[0].name()); + ASSERT_EQ(result[0].second, operands.back()); + ASSERT_EQ(result[1].first, entity[1].name()); + ASSERT_EQ(result[1].second, entity[1].value()); + ASSERT_EQ(result[2].first, entity[2].name()); + ASSERT_EQ(result[2].second, entity[2].value()); + } + + // Wide-column 
existing value without default column + { + const WideColumns entity{{"one", "1"}, {"two", "2"}}; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(entity); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_TRUE(put_operator->FullMergeV3(merge_in, &merge_out)); + + const auto& result = + std::get( + merge_out.new_value); + ASSERT_EQ(result.size(), entity.size() + 1); + ASSERT_EQ(result[0].first, kDefaultWideColumnName); + ASSERT_EQ(result[0].second, operands.back()); + ASSERT_EQ(result[1].first, entity[0].name()); + ASSERT_EQ(result[1].second, entity[0].value()); + ASSERT_EQ(result[2].first, entity[1].name()); + ASSERT_EQ(result[2].second, entity[1].value()); + } +} + +TEST_F(MergeTest, FullMergeV3FallbackFailure) { + // Test that the default FullMergeV3 implementation correctly handles the case + // when FullMergeV2 fails. + + const Slice key("foo"); + const MergeOperator::MergeOperationInputV3::OperandList operands{ + "first", "second", "third"}; + constexpr Logger* logger = nullptr; + + class FailMergeOperator : public MergeOperator { + public: + bool FullMergeV2(const MergeOperationInput& /* merge_in */, + MergeOperationOutput* merge_out) const override { + assert(merge_out); + merge_out->op_failure_scope = OpFailureScope::kMustMerge; + + return false; + } + + const char* Name() const override { return "FailMergeOperator"; } + }; + + FailMergeOperator fail_operator; + + // No existing value + { + MergeOperator::MergeOperationInputV3::ExistingValue existing_value; + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_FALSE(fail_operator.FullMergeV3(merge_in, &merge_out)); + ASSERT_EQ(merge_out.op_failure_scope, + MergeOperator::OpFailureScope::kMustMerge); + } + + // Plain existing value + { + const Slice plain("plain"); + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(plain); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_FALSE(fail_operator.FullMergeV3(merge_in, &merge_out)); + ASSERT_EQ(merge_out.op_failure_scope, + MergeOperator::OpFailureScope::kMustMerge); + } + + // Wide-column existing value with default column + { + const WideColumns entity{ + {kDefaultWideColumnName, "default"}, {"one", "1"}, {"two", "2"}}; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(entity); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_FALSE(fail_operator.FullMergeV3(merge_in, &merge_out)); + ASSERT_EQ(merge_out.op_failure_scope, + MergeOperator::OpFailureScope::kMustMerge); + } + + // Wide-column existing value without default column + { + const WideColumns entity{{"one", "1"}, {"two", "2"}}; + MergeOperator::MergeOperationInputV3::ExistingValue existing_value(entity); + const MergeOperator::MergeOperationInputV3 merge_in( + key, std::move(existing_value), operands, logger); + + MergeOperator::MergeOperationOutputV3 merge_out; + + ASSERT_FALSE(fail_operator.FullMergeV3(merge_in, &merge_out)); + ASSERT_EQ(merge_out.op_failure_scope, + MergeOperator::OpFailureScope::kMustMerge); + } +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/obsolete_files_test.cc 
b/db/obsolete_files_test.cc index 8e9f28f65aa8..eec1486c1ba4 100644 --- a/db/obsolete_files_test.cc +++ b/db/obsolete_files_test.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #include @@ -166,7 +165,7 @@ TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) { {{"paranoid_file_checks", "true"}})); } } - ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */)); + ASSERT_OK(dbfull()->EnableFileDeletions(/*force=*/false)); Close(); @@ -316,13 +315,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/options_file_test.cc b/db/options_file_test.cc index eb02e6ca4f13..c3adbeb642f3 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include #include "db/db_impl/db_impl.h" @@ -109,12 +108,3 @@ int main(int argc, char** argv) { return 0; #endif // !(defined NDEBUG) || !defined(OS_WIN) } -#else - -#include - -int main(int /*argc*/, char** /*argv*/) { - printf("Skipped as Options file is not supported in RocksDBLite.\n"); - return 0; -} -#endif // !ROCKSDB_LITE diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 454d12dc584b..666ed32f0ec2 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -49,11 +49,9 @@ std::shared_ptr OpenDb(bool read_only = false) { FLAGS_min_write_buffer_number_to_merge; if (FLAGS_use_set_based_memetable) { -#ifndef ROCKSDB_LITE options.prefix_extractor.reset( ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0)); options.memtable_factory.reset(NewHashSkipListRepFactory()); -#endif // ROCKSDB_LITE } Status s; @@ -151,6 +149,7 @@ TEST_F(PerfContextTest, SeekIntoDeletion) { ASSERT_TRUE(iter->Valid()); StopWatchNano timer2(SystemClock::Default().get(), true); iter->Next(); + ASSERT_OK(iter->status()); auto elapsed_nanos2 = timer2.ElapsedNanos(); if (FLAGS_verbose) { std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count @@ -189,7 +188,8 @@ TEST_F(PerfContextTest, StopWatchOverhead) { uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed); + StopWatch timer(SystemClock::Default().get(), nullptr, 0, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed); for (auto& timing : timings) { timing = elapsed; } @@ -263,7 +263,7 @@ void ProfileQueries(bool enabled_time = false) { for (const int i : keys) { if (i == kFlushFlag) { FlushOptions fo; - db->Flush(fo); + ASSERT_OK(db->Flush(fo)); continue; } @@ -498,7 +498,6 @@ void ProfileQueries(bool enabled_time = false) { } } -#ifndef ROCKSDB_LITE TEST_F(PerfContextTest, KeyComparisonCount) { SetPerfLevel(kEnableCount); ProfileQueries(); @@ -509,7 +508,6 @@ TEST_F(PerfContextTest, KeyComparisonCount) { SetPerfLevel(kEnableTime); ProfileQueries(true); } -#endif // ROCKSDB_LITE // make perf_context_test // export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison @@ -968,6 +966,159 @@ TEST_F(PerfContextTest, CPUTimer) { ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos); } } + +TEST_F(PerfContextTest, MergeOperandCount) { + ASSERT_OK(DestroyDB(kDbName, Options())); + + DB* db = nullptr; + Options 
options; + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateStringAppendOperator(); + + ASSERT_OK(DB::Open(options, kDbName, &db)); + std::unique_ptr db_guard(db); + + constexpr size_t num_keys = 3; + const std::string key_prefix("key"); + const std::string value_prefix("value"); + + std::vector keys; + keys.reserve(num_keys); + + for (size_t i = 0; i < num_keys; ++i) { + keys.emplace_back(key_prefix + std::to_string(i)); + } + + // Write three keys with one Put each followed by 1, 2, and 3 + // Merge operations respectively. + constexpr size_t total_merges = num_keys * (num_keys + 1) / 2; + + std::vector snapshots; + snapshots.reserve(total_merges); + + for (size_t i = 0; i < num_keys; ++i) { + const std::string suffix = std::to_string(i); + const std::string value = value_prefix + suffix; + + ASSERT_OK(db->Put(WriteOptions(), keys[i], value)); + + for (size_t j = 0; j <= i; ++j) { + // Take a snapshot before each Merge so they are preserved and not + // collapsed during flush. + snapshots.emplace_back(db); + + ASSERT_OK(db->Merge(WriteOptions(), keys[i], value + std::to_string(j))); + } + } + + auto verify = [&]() { + get_perf_context()->Reset(); + + for (size_t i = 0; i < num_keys; ++i) { + // Get + { + PinnableSlice result; + ASSERT_OK(db->Get(ReadOptions(), db->DefaultColumnFamily(), keys[i], + &result)); + ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, i + 1); + + get_perf_context()->Reset(); + } + + // GetEntity + { + PinnableWideColumns result; + ASSERT_OK(db->GetEntity(ReadOptions(), db->DefaultColumnFamily(), + keys[i], &result)); + ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, i + 1); + + get_perf_context()->Reset(); + } + } + + { + std::vector key_slices; + key_slices.reserve(num_keys); + + for (size_t i = 0; i < num_keys; ++i) { + key_slices.emplace_back(keys[i]); + } + + // MultiGet + { + std::vector results(num_keys); + std::vector statuses(num_keys); + + db->MultiGet(ReadOptions(), db->DefaultColumnFamily(), num_keys, + &key_slices[0], &results[0], &statuses[0]); + + for (size_t i = 0; i < num_keys; ++i) { + ASSERT_OK(statuses[i]); + } + + ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, + total_merges); + + get_perf_context()->Reset(); + } + + // MultiGetEntity + { + std::vector results(num_keys); + std::vector statuses(num_keys); + + db->MultiGetEntity(ReadOptions(), db->DefaultColumnFamily(), num_keys, + &key_slices[0], &results[0], &statuses[0]); + + for (size_t i = 0; i < num_keys; ++i) { + ASSERT_OK(statuses[i]); + } + + ASSERT_EQ(get_perf_context()->internal_merge_point_lookup_count, + total_merges); + + get_perf_context()->Reset(); + } + } + + std::unique_ptr it(db->NewIterator(ReadOptions())); + + // Forward iteration + { + size_t i = 0; + + for (it->SeekToFirst(); it->Valid(); it->Next(), ++i) { + ASSERT_EQ(it->key(), keys[i]); + ASSERT_EQ(get_perf_context()->internal_merge_count, i + 1); + + get_perf_context()->Reset(); + } + ASSERT_OK(it->status()); + } + + // Backward iteration + { + size_t i = num_keys - 1; + + for (it->SeekToLast(); it->Valid(); it->Prev(), --i) { + ASSERT_EQ(it->key(), keys[i]); + ASSERT_EQ(get_perf_context()->internal_merge_count, i + 1); + + get_perf_context()->Reset(); + } + ASSERT_OK(it->status()); + } + }; + + // Verify counters when reading from memtable + verify(); + + // Verify counters when reading from table files + ASSERT_OK(db->Flush(FlushOptions())); + + verify(); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff 
--git a/db/periodic_task_scheduler.cc b/db/periodic_task_scheduler.cc index 2024510dd606..1c4fc16b1c8b 100644 --- a/db/periodic_task_scheduler.cc +++ b/db/periodic_task_scheduler.cc @@ -8,7 +8,6 @@ #include "rocksdb/system_clock.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { // `timer_mutex` is a global mutex serves 3 purposes currently: @@ -95,7 +94,7 @@ Status PeriodicTaskScheduler::Unregister(PeriodicTaskType task_type) { } Timer* PeriodicTaskScheduler::Default() { - static Timer timer(SystemClock::Default().get()); + STATIC_AVOID_DESTRUCTION(Timer, timer)(SystemClock::Default().get()); return &timer; } @@ -109,5 +108,3 @@ void PeriodicTaskScheduler::TEST_OverrideTimer(SystemClock* clock) { #endif // NDEBUG } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/db/periodic_task_scheduler.h b/db/periodic_task_scheduler.h index f45b80c4d811..a93f9a09588a 100644 --- a/db/periodic_task_scheduler.h +++ b/db/periodic_task_scheduler.h @@ -6,7 +6,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "util/timer.h" @@ -43,15 +42,16 @@ class PeriodicTaskScheduler { PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete; PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete; - // Register a task with its default repeat period + // Register a task with its default repeat period. Thread safe call. Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn); // Register a task with specified repeat period. 0 is an invalid argument - // (kInvalidPeriodSec). To stop the task, please use Unregister() specifically + // (kInvalidPeriodSec). To stop the task, please use Unregister(). + // Thread safe call. Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn, uint64_t repeat_period_seconds); - // Unregister the task + // Unregister the task. Thread safe call. Status Unregister(PeriodicTaskType task_type); #ifndef NDEBUG @@ -106,5 +106,3 @@ class PeriodicTaskScheduler { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/db/periodic_task_scheduler_test.cc b/db/periodic_task_scheduler_test.cc index 73c13fa1384e..c1205bcf6125 100644 --- a/db/periodic_task_scheduler_test.cc +++ b/db/periodic_task_scheduler_test.cc @@ -12,7 +12,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE class PeriodicTaskSchedulerTest : public DBTestBase { public: PeriodicTaskSchedulerTest() @@ -220,7 +219,6 @@ TEST_F(PeriodicTaskSchedulerTest, MultiEnv) { Close(); } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 755b639b07f1..a6acb7b188c6 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #include #include @@ -124,6 +123,7 @@ class PlainTableDBTest : public testing::Test, // Return the current option configuration. 
Options CurrentOptions() { Options options; + options.level_compaction_dynamic_level_bytes = false; PlainTableOptions plain_table_options; plain_table_options.user_key_len = 0; @@ -330,21 +330,23 @@ class TestPlainTableFactory : public PlainTableFactory { std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const override { std::unique_ptr props; + const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, &props); + table_reader_options.ioptions, read_options, + &props); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, + table_reader_options.ioptions, read_options, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, + table_reader_options.ioptions, read_options, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); EXPECT_TRUE(s.ok()); @@ -895,6 +897,7 @@ TEST_P(PlainTableDBTest, IteratorLargeKeys) { } ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -943,6 +946,7 @@ TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) { } ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); delete iter; } @@ -1345,13 +1349,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/prefix_test.cc b/db/prefix_test.cc index 8592b8f313c0..bb6e6f7a670e 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include @@ -783,6 +782,7 @@ TEST_F(PrefixTest, PrefixSeekModePrev) { } } } + ASSERT_OK(iter->status()); } } @@ -893,14 +893,3 @@ int main(int argc, char** argv) { #endif // GFLAGS -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as HashSkipList and HashLinkList are not supported in " - "ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index c03efa11ffe0..6e76f9c72589 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -30,12 +30,15 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( icmp_(icmp), smallest_ikey_(smallest), largest_ikey_(largest) { + // Set up bounds such that range tombstones from this iterator are + // truncated to range [smallest_, largest_). if (smallest != nullptr) { pinned_bounds_.emplace_back(); auto& parsed_smallest = pinned_bounds_.back(); Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest, false /* log_err_key */); // TODO pik_status.PermitUncheckedError(); + parsed_smallest.type = kTypeMaxValid; assert(pik_status.ok()); smallest_ = &parsed_smallest; } @@ -63,6 +66,8 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( // // Therefore, we will never truncate a range tombstone at largest, so we // can leave it unchanged. + // TODO: maybe use kMaxValid here to ensure range tombstone having + // distinct key from point keys. 
} else { // The same user key may straddle two sstable boundaries. To ensure that // the truncated end key can cover the largest key in this sstable, reduce @@ -70,7 +75,7 @@ TruncatedRangeDelIterator::TruncatedRangeDelIterator( parsed_largest.sequence -= 1; // This line is not needed for correctness, but it ensures that the // truncated end key is not covering keys from the next SST file. - parsed_largest.type = kValueTypeForSeek; + parsed_largest.type = kTypeMaxValid; } largest_ = &parsed_largest; } @@ -101,6 +106,24 @@ void TruncatedRangeDelIterator::Seek(const Slice& target) { iter_->Seek(target); } +void TruncatedRangeDelIterator::SeekInternalKey(const Slice& target) { + if (largest_ && icmp_->Compare(*largest_, target) <= 0) { + iter_->Invalidate(); + return; + } + if (smallest_ && icmp_->Compare(target, *smallest_) < 0) { + // Since target < smallest, target < largest_. + // This seek must land on a range tombstone where end_key() > target, + // so there is no need to check again. + iter_->Seek(smallest_->user_key); + } else { + iter_->Seek(ExtractUserKey(target)); + while (Valid() && icmp_->Compare(end_key(), target) <= 0) { + Next(); + } + } +} + // NOTE: target is a user key, with timestamp if enabled. void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) { if (smallest_ != nullptr && @@ -393,21 +416,20 @@ bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed, namespace { // Produce a sorted (by start internal key) stream of range tombstones from -// `children`. lower_bound and upper_bound on user key can be +// `children`. lower_bound and upper_bound on internal key can be // optionally specified. Range tombstones that ends before lower_bound or starts // after upper_bound are excluded. // If user-defined timestamp is enabled, lower_bound and upper_bound should -// contain timestamp, but comparison is done ignoring timestamps. +// contain timestamp. class TruncatedRangeDelMergingIter : public InternalIterator { public: TruncatedRangeDelMergingIter( const InternalKeyComparator* icmp, const Slice* lower_bound, - const Slice* upper_bound, bool upper_bound_inclusive, + const Slice* upper_bound, const std::vector>& children) : icmp_(icmp), lower_bound_(lower_bound), upper_bound_(upper_bound), - upper_bound_inclusive_(upper_bound_inclusive), heap_(StartKeyMinComparator(icmp)), ts_sz_(icmp_->user_comparator()->timestamp_size()) { for (auto& child : children) { @@ -420,7 +442,7 @@ class TruncatedRangeDelMergingIter : public InternalIterator { } bool Valid() const override { - return !heap_.empty() && BeforeEndKey(heap_.top()); + return !heap_.empty() && !AfterEndKey(heap_.top()); } Status status() const override { return Status::OK(); } @@ -428,7 +450,13 @@ class TruncatedRangeDelMergingIter : public InternalIterator { heap_.clear(); for (auto& child : children_) { if (lower_bound_ != nullptr) { - child->Seek(*lower_bound_); + child->Seek(ExtractUserKey(*lower_bound_)); + // Since the above `Seek()` operates on a user key while `lower_bound_` + // is an internal key, we may need to advance `child` farther for it to + // be in bounds. 
+ while (child->Valid() && BeforeStartKey(child)) { + child->InternalNext(); + } } else { child->SeekToFirst(); } @@ -481,19 +509,23 @@ class TruncatedRangeDelMergingIter : public InternalIterator { void SeekToLast() override { assert(false); } private: - bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const { + bool BeforeStartKey(const TruncatedRangeDelIterator* iter) const { + if (lower_bound_ == nullptr) { + return false; + } + return icmp_->Compare(iter->end_key(), *lower_bound_) <= 0; + } + + bool AfterEndKey(const TruncatedRangeDelIterator* iter) const { if (upper_bound_ == nullptr) { - return true; + return false; } - int cmp = icmp_->user_comparator()->CompareWithoutTimestamp( - iter->start_key().user_key, *upper_bound_); - return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0; + return icmp_->Compare(iter->start_key(), *upper_bound_) > 0; } const InternalKeyComparator* icmp_; const Slice* lower_bound_; const Slice* upper_bound_; - bool upper_bound_inclusive_; BinaryHeap heap_; std::vector children_; @@ -506,11 +538,10 @@ class TruncatedRangeDelMergingIter : public InternalIterator { std::unique_ptr CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound, - const Slice* upper_bound, - bool upper_bound_inclusive) { + const Slice* upper_bound) { InvalidateRangeDelMapPositions(); auto merging_iter = std::make_unique( - icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_); + icmp_, lower_bound, upper_bound, parent_iters_); auto fragmented_tombstone_list = std::make_shared( diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 9bd40967d015..f7fa87af40dd 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -36,6 +36,10 @@ class TruncatedRangeDelIterator { const InternalKeyComparator* icmp, const InternalKey* smallest, const InternalKey* largest); + void SetRangeDelReadSeqno(SequenceNumber read_seqno) { + iter_->SetRangeDelReadSeqno(read_seqno); + } + bool Valid() const; void Next() { iter_->TopNext(); } @@ -49,6 +53,9 @@ class TruncatedRangeDelIterator { // REQUIRES: target is a user key. void Seek(const Slice& target); + // Seeks to the first range tombstone with end_key() > target. + void SeekInternalKey(const Slice& target); + // Seeks to the tombstone with the highest visible sequence number that covers // target (a user key). If no such tombstone exists, the position will be at // the latest tombstone that starts before target. @@ -452,16 +459,15 @@ class CompactionRangeDelAggregator : public RangeDelAggregator { } // Creates an iterator over all the range tombstones in the aggregator, for - // use in compaction. Nullptr arguments indicate that the iterator range is - // unbounded. - // NOTE: the boundaries are used for optimization purposes to reduce the - // number of tombstones that are passed to the fragmenter; they do not - // guarantee that the resulting iterator only contains range tombstones that - // cover keys in the provided range. If required, these bounds must be + // use in compaction. + // + // NOTE: the internal key boundaries are used for optimization purposes to + // reduce the number of tombstones that are passed to the fragmenter; they do + // not guarantee that the resulting iterator only contains range tombstones + // that cover keys in the provided range. If required, these bounds must be // enforced during iteration. 
std::unique_ptr NewIterator( - const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr, - bool upper_bound_inclusive = false); + const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr); private: std::vector> parent_iters_; diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 7fe35276a67b..89391c924d93 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -224,26 +224,32 @@ TEST_F(RangeDelAggregatorTest, UntruncatedIter) { TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, nullptr); - VerifyIterator(&iter, bytewise_icmp, - {{UncutEndpoint("a"), UncutEndpoint("e"), 10}, - {UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + VerifyIterator( + &iter, bytewise_icmp, + {{InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"), 10}, + {InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}}); VerifySeek( &iter, bytewise_icmp, - {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}}); + {{"d", InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"), + 10}, + {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"ia", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}, + {"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}, + {"", InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"), + 10}}); VerifySeekForPrev( &iter, bytewise_icmp, - {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + {{"d", InternalValue("a", 10, kTypeRangeDeletion), UncutEndpoint("e"), + 10}, + {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"n", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}, + {"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}}); } TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) { @@ -258,25 +264,29 @@ TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) { TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr, nullptr); - VerifyIterator(&iter, bytewise_icmp, - {{UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {UncutEndpoint("j"), UncutEndpoint("n"), 4}}); + VerifyIterator( + &iter, bytewise_icmp, + {{InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}}); VerifySeek( &iter, bytewise_icmp, - {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}}); + {{"d", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"ia", InternalValue("j", 4, 
kTypeRangeDeletion), UncutEndpoint("n"), 4}, + {"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}, + {"", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}}); VerifySeekForPrev( &iter, bytewise_icmp, - {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4}, - {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + {{"d", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}, + {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"n", InternalValue("j", 4, kTypeRangeDeletion), UncutEndpoint("n"), 4}, + {"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}}); } TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) { @@ -295,27 +305,30 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) { VerifyIterator( &iter, bytewise_icmp, - {{InternalValue("d", 7), UncutEndpoint("e"), 10}, - {UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4}}); + {{InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}, + {InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {InternalValue("j", 4, kTypeRangeDeletion), + InternalValue("m", 8, kTypeMaxValid), 4}}); VerifySeek( &iter, bytewise_icmp, - {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4, - false /* invalid */}, - {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"", InternalValue("d", 7), UncutEndpoint("e"), 10}}); + {{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}, + {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"ia", InternalValue("j", 4, kTypeRangeDeletion), + InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */}, + {"n", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}, + {"", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}}); VerifySeekForPrev( &iter, bytewise_icmp, - {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10}, - {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8}, - {"n", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4, - false /* invalid */}, - {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + {{"d", InternalValue("d", 7, kTypeMaxValid), UncutEndpoint("e"), 10}, + {"e", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"ia", InternalValue("e", 8, kTypeRangeDeletion), UncutEndpoint("g"), 8}, + {"n", InternalValue("j", 4, kTypeRangeDeletion), + InternalValue("m", 8, kTypeMaxValid), 4, false /* invalid */}, + {"", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}}); } TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) { @@ -332,20 +345,23 @@ TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) { TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, &smallest, &largest); - VerifyIterator(&iter, bytewise_icmp, - {{InternalValue("f", 7), UncutEndpoint("g"), 8}}); + VerifyIterator( + &iter, 
bytewise_icmp, + {{InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}}); VerifySeek( &iter, bytewise_icmp, - {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8}, - {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, - {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}}); + {{"d", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}, + {"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}, + {"j", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}}); VerifySeekForPrev( &iter, bytewise_icmp, - {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}, - {"f", InternalValue("f", 7), UncutEndpoint("g"), 8}, - {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}}); + {{"d", InternalValue("", 0, kTypeRangeDeletion), UncutEndpoint(""), 0, + true /* invalid */}, + {"f", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}, + {"j", InternalValue("f", 7, kTypeMaxValid), UncutEndpoint("g"), 8}}); } TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) { @@ -627,15 +643,12 @@ TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) { range_del_agg.AddTombstones(std::move(input_iter)); } - Slice start("p"); - Slice end("q"); - auto range_del_compaction_iter1 = - range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {}); - - auto range_del_compaction_iter2 = - range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {}); + InternalKey start_buf("p", 0, kTypeRangeDeletion); + InternalKey end_buf("q", 0, kTypeRangeDeletion); + Slice start = start_buf.Encode(); + Slice end = end_buf.Encode(); + auto range_del_compaction_iter = range_del_agg.NewIterator(&start, &end); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {}); } TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) { @@ -652,18 +665,13 @@ TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) { range_del_agg.AddTombstones(std::move(input_iter)); } - Slice start("bb"); - Slice end("e"); - auto range_del_compaction_iter1 = - range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), + InternalKey start_buf("bb", 0, kTypeRangeDeletion); + InternalKey end_buf("e", 9, kTypeRangeDeletion); + Slice start = start_buf.Encode(); + Slice end = end_buf.Encode(); + auto range_del_compaction_iter = range_del_agg.NewIterator(&start, &end); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}}); - - auto range_del_compaction_iter2 = - range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); - VerifyFragmentedRangeDels( - range_del_compaction_iter2.get(), - {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}}); } TEST_F(RangeDelAggregatorTest, @@ -681,29 +689,19 @@ TEST_F(RangeDelAggregatorTest, range_del_agg.AddTombstones(std::move(input_iter)); } - Slice start("bb"); - Slice end("e"); - auto range_del_compaction_iter1 = - range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */); - VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10}, - {"b", "c", 20}, - {"b", "c", 10}, - {"c", "d", 10}, - {"c", "d", 8}, - {"d", "f", 30}, - {"d", "f", 8}, - {"f", "g", 8}}); - - auto range_del_compaction_iter2 = - range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */); - 
VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10}, - {"b", "c", 20}, - {"b", "c", 10}, - {"c", "d", 10}, - {"c", "d", 8}, - {"d", "f", 30}, - {"d", "f", 8}, - {"f", "g", 8}}); + InternalKey start_buf("bb", 0, kTypeRangeDeletion); + InternalKey end_buf("e", 0, kTypeRangeDeletion); + Slice start = start_buf.Encode(); + Slice end = end_buf.Encode(); + auto range_del_compaction_iter = range_del_agg.NewIterator(&start, &end); + VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 10}, + {"b", "c", 20}, + {"b", "c", 10}, + {"c", "d", 10}, + {"c", "d", 8}, + {"d", "f", 30}, + {"d", "f", 8}, + {"f", "g", 8}}); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/range_tombstone_fragmenter.h b/db/range_tombstone_fragmenter.h index df07fa8949b8..ce631d495e67 100644 --- a/db/range_tombstone_fragmenter.h +++ b/db/range_tombstone_fragmenter.h @@ -148,6 +148,10 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { const InternalKeyComparator& icmp, SequenceNumber upper_bound, const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0); + void SetRangeDelReadSeqno(SequenceNumber read_seqno) override { + upper_bound_ = read_seqno; + } + void SeekToFirst() override; void SeekToLast() override; @@ -218,8 +222,7 @@ class FragmentedRangeTombstoneIterator : public InternalIterator { } ParsedInternalKey parsed_start_key() const { - return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber, - kTypeRangeDeletion); + return ParsedInternalKey(pos_->start_key, seq(), kTypeRangeDeletion); } ParsedInternalKey parsed_end_key() const { return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber, diff --git a/db/repair.cc b/db/repair.cc index ddec43e9b60d..ef21f7ea611d 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -60,7 +60,6 @@ // in the table's meta section to speed up ScanTable. #include "db/version_builder.h" -#ifndef ROCKSDB_LITE #include @@ -123,7 +122,8 @@ class Repairer { vset_(dbname_, &immutable_db_options_, file_options_, raw_table_cache_.get(), &wb_, &wc_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id=*/"", db_session_id_), + /*db_id=*/"", db_session_id_, db_options.daily_offpeak_time_utc, + /*error_handler=*/nullptr), next_file_number_(1), db_lock_(nullptr), closed_(false) { @@ -146,6 +146,8 @@ class Repairer { // Adds a column family to the VersionSet with cf_options_ and updates // manifest. 
Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; const auto* cf_opts = GetColumnFamilyOptions(cf_name); if (cf_opts == nullptr) { return Status::Corruption("Encountered unknown column family with name=" + @@ -156,6 +158,7 @@ class Repairer { VersionEdit edit; edit.SetComparatorName(opts.comparator->Name()); + edit.SetPersistUserDefinedTimestamps(opts.persist_user_defined_timestamps); edit.SetLogNumber(0); edit.SetColumnFamily(cf_id); ColumnFamilyData* cfd; @@ -167,8 +170,9 @@ class Repairer { Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (status.ok()) { - status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */, cf_opts); + status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_, + db_dir.get(), false /* new_descriptor_log */, + cf_opts); } mutex_.Unlock(); return status; @@ -358,6 +362,9 @@ class Repairer { } }; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + // Open the log file std::string logname = LogFileName(wal_dir, log); const auto& fs = env_->GetFileSystem(); @@ -389,9 +396,12 @@ class Repairer { auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet()); // Read all the records and add to a memtable + const UnorderedMap& running_ts_sz = + vset_.GetRunningColumnFamiliesTimestampSize(); std::string scratch; Slice record; WriteBatch batch; + int counter = 0; while (reader.ReadRecord(&record, &scratch)) { if (record.size() < WriteBatchInternal::kHeader) { @@ -401,8 +411,15 @@ class Repairer { } Status record_status = WriteBatchInternal::SetContents(&batch, record); if (record_status.ok()) { - record_status = - WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); + const UnorderedMap& record_ts_sz = + reader.GetRecordedTimestampSize(); + record_status = HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency); + if (record_status.ok()) { + record_status = + WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr); + } } if (record_status.ok()) { counter += WriteBatchInternal::Count(&batch); @@ -423,6 +440,7 @@ class Repairer { FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -454,16 +472,17 @@ class Repairer { 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_, 0 /*target_file_size*/, meta.fd.GetNumber()); - SeqnoToTimeMapping empty_seqno_time_mapping; + SeqnoToTimeMapping empty_seqno_to_time_mapping; status = BuildTable( dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, - file_options_, table_cache_.get(), iter.get(), + file_options_, read_options, table_cache_.get(), iter.get(), std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery, - empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */, - Env::IO_HIGH, nullptr /* table_properties */, write_hint); + empty_seqno_to_time_mapping, nullptr /* event_logger */, + 0 /* job_id */, Env::IO_HIGH, nullptr /* table_properties */, + write_hint); ROCKS_LOG_INFO(db_options_.info_log, "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, 
counter, meta.fd.GetNumber(), @@ -510,8 +529,11 @@ class Repairer { file_size); std::shared_ptr props; if (status.ok()) { - status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta, - &props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = table_cache_->GetTableProperties( + file_options_, read_options, icmp_, t->meta, &props, + 0 /* block_protection_bytes_per_key */); } if (status.ok()) { auto s = @@ -541,6 +563,19 @@ class Repairer { AddColumnFamily(props->column_family_name, t->column_family_id); } t->meta.oldest_ancester_time = props->creation_time; + t->meta.user_defined_timestamps_persisted = + static_cast(props->user_defined_timestamps_persisted); + } + if (status.ok()) { + uint64_t tail_size = 0; + bool contain_no_data_blocks = + props->num_entries > 0 && + (props->num_entries == props->num_range_deletions); + if (props->tail_start_offset > 0 || contain_no_data_blocks) { + assert(props->tail_start_offset <= file_size); + tail_size = file_size - props->tail_start_offset; + } + t->meta.tail_size = tail_size; } ColumnFamilyData* cfd = nullptr; if (status.ok()) { @@ -557,6 +592,7 @@ class Repairer { } } if (status.ok()) { + // TODO: plumb Env::IOActivity ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( @@ -568,7 +604,8 @@ class Repairer { /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key); ParsedInternalKey parsed; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); @@ -604,10 +641,13 @@ class Repairer { // an SST file is a full sorted run. This probably needs the extra logic // from compaction_job.cc around call to UpdateBoundariesForRange (to // handle range tombstones extendingg beyond range of other entries). 
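The tail-size bookkeeping added to the repairer above boils down to: everything at or after the table property tail_start_offset counts as tail, and a file whose entries are all range deletions has no data blocks at all. A minimal standalone sketch of that arithmetic (hypothetical helper; it only mirrors the logic in the hunk above and is not part of the patch):

#include <cassert>
#include <cstdint>

// Hypothetical mirror of the repair.cc logic above: tail bytes are those at
// or after tail_start_offset; a file containing only range deletions has no
// data blocks, so the tail is counted even when tail_start_offset is 0.
uint64_t ComputeTailSize(uint64_t file_size, uint64_t tail_start_offset,
                         uint64_t num_entries, uint64_t num_range_deletions) {
  uint64_t tail_size = 0;
  const bool contain_no_data_blocks =
      num_entries > 0 && (num_entries == num_range_deletions);
  if (tail_start_offset > 0 || contain_no_data_blocks) {
    assert(tail_start_offset <= file_size);
    tail_size = file_size - tail_start_offset;
  }
  return tail_size;
}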
+ // TODO: plumb Env::IOActivity ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( - ropts, cfd->internal_comparator(), t->meta, &r_iter); + ropts, cfd->internal_comparator(), t->meta, + cfd->GetLatestMutableCFOptions()->block_protection_bytes_per_key, + &r_iter); if (r_iter) { r_iter->SeekToFirst(); @@ -626,6 +666,8 @@ class Repairer { } Status AddTables() { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; std::unordered_map> cf_id_to_tables; SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { @@ -652,7 +694,9 @@ class Repairer { &cfd->internal_comparator(), cfd->user_comparator(), cfd->NumberLevels(), cfd->ioptions()->compaction_style, nullptr /* src_vstorage */, cfd->ioptions()->force_consistency_checks, - EpochNumberRequirement::kMightMissing); + EpochNumberRequirement::kMightMissing, cfd->ioptions()->clock, + /*bottommost_file_compaction_delay=*/0, + cfd->current()->version_set()->offpeak_time_option()); Status s; VersionEdit dummy_edit; for (const auto* table : cf_id_and_tables.second) { @@ -666,7 +710,8 @@ class Repairer { table->meta.oldest_ancester_time, table->meta.file_creation_time, table->meta.epoch_number, table->meta.file_checksum, table->meta.file_checksum_func_name, table->meta.unique_id, - table->meta.compensated_range_deletion_size); + table->meta.compensated_range_deletion_size, table->meta.tail_size, + table->meta.user_defined_timestamps_persisted); } s = dummy_version_builder.Apply(&dummy_edit); if (s.ok()) { @@ -680,6 +725,8 @@ class Repairer { // recovered epoch numbers VersionEdit edit; edit.SetComparatorName(cfd->user_comparator()->Name()); + edit.SetPersistUserDefinedTimestamps( + cfd->ioptions()->persist_user_defined_timestamps); edit.SetLogNumber(0); edit.SetNextFile(next_file_number_); edit.SetColumnFamily(cfd->GetID()); @@ -707,8 +754,8 @@ class Repairer { s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, - &mutex_, db_dir.get(), + s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, db_dir.get(), false /* new_descriptor_log */); } mutex_.Unlock(); @@ -810,5 +857,3 @@ Status RepairDB(const std::string& dbname, const Options& options) { } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/db/repair_test.cc b/db/repair_test.cc index f80f2b722f73..e8cc40aab4a3 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -3,25 +3,23 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#include "rocksdb/options.h" -#ifndef ROCKSDB_LITE - #include #include #include #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" +#include "db/db_with_timestamp_test_util.h" #include "file/file_util.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" +#include "rocksdb/options.h" #include "rocksdb/transaction_log.h" #include "table/unique_id_impl.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE class RepairTest : public DBTestBase { public: RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {} @@ -317,6 +315,147 @@ TEST_F(RepairTest, UnflushedSst) { ASSERT_EQ(Get("key"), "val"); } +// Test parameters: +// param 0): paranoid file check +// param 1): user-defined timestamp test mode +class RepairTestWithTimestamp + : public DBBasicTestWithTimestampBase, + public testing::WithParamInterface< + std::tuple> { + public: + RepairTestWithTimestamp() + : DBBasicTestWithTimestampBase("repair_test_with_timestamp") {} + + Status Put(const Slice& key, const Slice& ts, const Slice& value) { + WriteOptions write_opts; + return db_->Put(write_opts, handles_[0], key, ts, value); + } + + void CheckGet(const ReadOptions& read_opts, const Slice& key, + const std::string& expected_value, + const std::string& expected_ts) { + std::string actual_value; + std::string actual_ts; + ASSERT_OK(db_->Get(read_opts, handles_[0], key, &actual_value, &actual_ts)); + ASSERT_EQ(expected_value, actual_value); + ASSERT_EQ(expected_ts, actual_ts); + } + + void CheckFileBoundaries(const Slice& smallest_user_key, + const Slice& largest_user_key) { + std::vector> level_to_files; + dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), + &level_to_files); + ASSERT_GT(level_to_files.size(), 1); + // L0 only has one SST file. 
+ ASSERT_EQ(level_to_files[0].size(), 1); + auto file_meta = level_to_files[0][0]; + ASSERT_EQ(smallest_user_key, file_meta.smallest.user_key()); + ASSERT_EQ(largest_user_key, file_meta.largest.user_key()); + } +}; + +TEST_P(RepairTestWithTimestamp, UnflushedSst) { + Destroy(last_options_); + + bool paranoid_file_checks = std::get<0>(GetParam()); + bool persist_udt = test::ShouldPersistUDT(std::get<1>(GetParam())); + std::string smallest_ukey_without_ts = "bar"; + std::string largest_ukey_without_ts = "foo"; + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + std::string min_ts; + std::string write_ts; + PutFixed64(&min_ts, 0); + PutFixed64(&write_ts, 1); + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + options.persist_user_defined_timestamps = persist_udt; + if (!persist_udt) { + options.allow_concurrent_memtable_write = false; + } + options.paranoid_file_checks = paranoid_file_checks; + + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + + ASSERT_OK(DB::Open(options, dbname_, column_families, &handles_, &db_)); + + ASSERT_OK(Put(smallest_ukey_without_ts, write_ts, + smallest_ukey_without_ts + ":val")); + ASSERT_OK( + Put(largest_ukey_without_ts, write_ts, largest_ukey_without_ts + ":val")); + VectorLogPtr wal_files; + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 1); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_EQ(total_ssts_size, 0); + } + // Need to get path before Close() deletes db_, but delete it after Close() to + // ensure Close() didn't change the manifest. + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + ASSERT_OK(RepairDB(dbname_, options)); + ASSERT_OK(DB::Open(options, dbname_, column_families, &handles_, &db_)); + + ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); + ASSERT_EQ(wal_files.size(), 0); + { + uint64_t total_ssts_size; + std::unordered_map sst_files; + ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size)); + ASSERT_GT(total_ssts_size, 0); + } + + // Check file boundaries are correct for different + // `persist_user_defined_timestamps` option values. + if (persist_udt) { + CheckFileBoundaries(smallest_ukey_without_ts + write_ts, + largest_ukey_without_ts + write_ts); + } else { + CheckFileBoundaries(smallest_ukey_without_ts + min_ts, + largest_ukey_without_ts + min_ts); + } + + ReadOptions read_opts; + Slice read_ts_slice = write_ts; + read_opts.timestamp = &read_ts_slice; + if (persist_udt) { + CheckGet(read_opts, smallest_ukey_without_ts, + smallest_ukey_without_ts + ":val", write_ts); + CheckGet(read_opts, largest_ukey_without_ts, + largest_ukey_without_ts + ":val", write_ts); + } else { + // TODO(yuzhangyu): currently when `persist_user_defined_timestamps` is + // false, ts is unconditionally stripped during flush. + // When `full_history_ts_low` is set and respected during flush. + // We should prohibit reading below `full_history_ts_low` all together. 
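For context on the boundary checks in this test: with the u64 timestamp comparator, each stored key is the user key with an 8-byte encoded timestamp appended, so the expected smallest/largest file boundaries are simply user_key plus encoded timestamp (the write timestamp when timestamps are persisted, the minimum timestamp when they are stripped). A rough standalone illustration, assuming the fixed-width little-endian encoding that PutFixed64 produces (this helper is hypothetical, not RocksDB code):

#include <cstdint>
#include <string>

// Assumption: timestamps are appended to the user key as 8 little-endian
// bytes, matching PutFixed64 and the BytewiseComparatorWithU64Ts layout used
// by the test above.
std::string EncodeU64Ts(uint64_t ts) {
  std::string out;
  for (int i = 0; i < 8; ++i) {
    out.push_back(static_cast<char>((ts >> (8 * i)) & 0xff));
  }
  return out;
}

std::string BoundaryKey(const std::string& user_key_without_ts, uint64_t ts) {
  return user_key_without_ts + EncodeU64Ts(ts);
}

// With persist_user_defined_timestamps=true the flushed file spans roughly
// BoundaryKey("bar", 1) .. BoundaryKey("foo", 1); with the option off, the
// timestamps are stripped at flush and the boundaries carry the minimum
// timestamp, BoundaryKey("bar", 0) .. BoundaryKey("foo", 0).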
+ CheckGet(read_opts, smallest_ukey_without_ts, + smallest_ukey_without_ts + ":val", min_ts); + CheckGet(read_opts, largest_ukey_without_ts, + largest_ukey_without_ts + ":val", min_ts); + } +} + +// Param 0: paranoid file check +// Param 1: test mode for the user-defined timestamp feature +INSTANTIATE_TEST_CASE_P( + UnflushedSst, RepairTestWithTimestamp, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values( + test::UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + test::UserDefinedTimestampTestMode::kNormal))); + TEST_F(RepairTest, SeparateWalDir) { do { Options options = CurrentOptions(); @@ -476,7 +615,6 @@ TEST_F(RepairTest, DbNameContainsTrailingSlash) { ReopenWithSstIdVerify(); ASSERT_EQ(Get("key"), "val"); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { @@ -485,12 +623,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/db/seqno_time_test.cc b/db/seqno_time_test.cc index c19dd1c91dce..199c59c9bbb1 100644 --- a/db/seqno_time_test.cc +++ b/db/seqno_time_test.cc @@ -12,14 +12,13 @@ #include "rocksdb/utilities/debug.h" #include "test_util/mock_time_env.h" -#ifndef ROCKSDB_LITE - namespace ROCKSDB_NAMESPACE { class SeqnoTimeTest : public DBTestBase { public: SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) { mock_clock_ = std::make_shared(env_->GetSystemClock()); + mock_clock_->SetCurrentTime(kMockStartTime); mock_env_ = std::make_unique(env_, mock_clock_); } @@ -27,6 +26,10 @@ class SeqnoTimeTest : public DBTestBase { std::unique_ptr mock_env_; std::shared_ptr mock_clock_; + // Sufficient starting time that preserve time doesn't under-flow into + // pre-history + static constexpr uint32_t kMockStartTime = 10000000; + void SetUp() override { mock_clock_->InstallTimedWaitFixCallback(); SyncPoint::GetInstance()->SetCallBack( @@ -35,6 +38,7 @@ class SeqnoTimeTest : public DBTestBase { reinterpret_cast(arg); periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get()); }); + mock_clock_->SetCurrentTime(kMockStartTime); } // make sure the file is not in cache, otherwise it won't have IO info @@ -78,11 +82,6 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { options.num_levels = kNumLevels; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(kKeyPerSec)); }); - int sst_num = 0; // Write files that are overlap and enough to trigger compaction for (; sst_num < kNumTrigger; sst_num++) { @@ -94,7 +93,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { } ASSERT_OK(Flush()); } - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); // All data is hot, only output to penultimate level ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel()); @@ -115,7 +114,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { }); } ASSERT_OK(Flush()); - ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); } @@ -129,7 +128,7 @@ TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) { }); } ASSERT_OK(Flush()); - 
ASSERT_OK(dbfull()->WaitForCompact(true)); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); } CompactRangeOptions cro; @@ -190,11 +189,6 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { options.disable_auto_compactions = true; DestroyAndReopen(options); - // pass some time first, otherwise the first a few keys write time are going - // to be zero, and internally zero has special meaning: kUnknownSeqnoTime - dbfull()->TEST_WaitForPeriodicTaskRun( - [&] { mock_clock_->MockSleepForSeconds(static_cast(10)); }); - int sst_num = 0; // Write files that are overlap for (; sst_num < 4; sst_num++) { @@ -227,7 +221,8 @@ TEST_F(SeqnoTimeTest, TemperatureBasicLevel) { } ASSERT_OK(Flush()); } - ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + // Second to last level + MoveFilesToLevel(5); ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0); ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0); @@ -320,7 +315,9 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { DestroyAndReopen(options); std::set checked_file_nums; - SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber(); + SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber() + 1; + uint64_t start_time = mock_clock_->NowSeconds(); + // Write a key every 10 seconds for (int i = 0; i < 200; i++) { ASSERT_OK(Put(Key(i), "value")); @@ -338,21 +335,20 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); // about ~20 seqs->time entries, because the sample rate is 10000/100, and it - // passes 2k time. - ASSERT_GE(seqs.size(), 19); - ASSERT_LE(seqs.size(), 21); - SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber(); - for (auto i = start_seq; i < start_seq + 10; i++) { - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 1) * 10); - } - start_seq += 10; + // passes 2k time. Add (roughly) one for starting entry. 
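The expected entry count asserted here follows from simple arithmetic: with the mapping sampled roughly once per 100 seconds (the 10000/100 sample rate mentioned in the comment) and 200 keys written 10 seconds apart, about 2000 seconds elapse, giving about 20 samples plus the starting entry. A back-of-envelope sketch, illustrative only and not RocksDB code:

#include <cstdint>
#include <iostream>

int main() {
  // Numbers taken from the test comment above: a 10000s tracked window with
  // at most 100 entries means roughly one seqno->time sample every 100s.
  const uint64_t kSamplePeriodSecs = 10000 / 100;

  const uint64_t kNumWrites = 200;
  const uint64_t kWriteIntervalSecs = 10;
  const uint64_t elapsed = kNumWrites * kWriteIntervalSecs;  // ~2000s

  // Roughly one mapping entry per sample period, plus one starting entry,
  // which is why the assertions allow a small range around 21 entries.
  std::cout << elapsed / kSamplePeriodSecs + 1 << " entries (approx)\n";
  return 0;
}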
+ ASSERT_GE(seqs.size(), 20); + ASSERT_LE(seqs.size(), 22); + SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber() + 1; for (auto i = start_seq; i < seq_end; i++) { // The result is within the range - ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - 10) * 10); - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 10) * 10); + ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) * 10 - 100); + ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) * 10); } checked_file_nums.insert(it->second->orig_file_number); start_seq = seq_end; + start_time = mock_clock_->NowSeconds(); // Write a key every 1 seconds for (int i = 0; i < 200; i++) { @@ -360,7 +356,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(1)); }); } - seq_end = dbfull()->GetLatestSequenceNumber(); + seq_end = dbfull()->GetLatestSequenceNumber() + 1; ASSERT_OK(Flush()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); @@ -382,13 +378,14 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { ASSERT_GE(seqs.size(), 1); ASSERT_LE(seqs.size(), 3); for (auto i = start_seq; i < seq_end; i++) { - // The result is not very accurate, as there is more data write within small - // range of time - ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 1000); - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000); + ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) - 100); + ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq)); } checked_file_nums.insert(it->second->orig_file_number); start_seq = seq_end; + start_time = mock_clock_->NowSeconds(); // Write a key every 200 seconds for (int i = 0; i < 200; i++) { @@ -396,7 +393,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(200)); }); } - seq_end = dbfull()->GetLatestSequenceNumber(); + seq_end = dbfull()->GetLatestSequenceNumber() + 1; ASSERT_OK(Flush()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); @@ -417,20 +414,18 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { // The sequence number -> time entries should be maxed ASSERT_GE(seqs.size(), 99); ASSERT_LE(seqs.size(), 101); - for (auto i = start_seq; i < seq_end - 99; i++) { - // likely the first 100 entries reports 0 - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000); - } - start_seq += 101; - for (auto i = start_seq; i < seq_end; i++) { - ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), - (i - start_seq) * 200 + 22200); - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), - (i - start_seq) * 200 + 22600); + // aged out entries allowed to report time=0 + if ((seq_end - i) * 200 <= 10000) { + ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) * 200 - 100); + } + ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) * 200); } checked_file_nums.insert(it->second->orig_file_number); start_seq = seq_end; + start_time = mock_clock_->NowSeconds(); // Write a key every 100 seconds for (int i = 0; i < 200; i++) { @@ -438,7 +433,7 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } - seq_end = 
dbfull()->GetLatestSequenceNumber(); + seq_end = dbfull()->GetLatestSequenceNumber() + 1; ASSERT_OK(Flush()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props)); @@ -484,18 +479,15 @@ TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) { seqs = tp_mapping.TEST_GetInternalMapping(); ASSERT_GE(seqs.size(), 99); ASSERT_LE(seqs.size(), 101); - for (auto i = start_seq; i < seq_end - 99; i++) { - // likely the first 100 entries reports 0 - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), - (i - start_seq) * 100 + 50000); - } - start_seq += 101; - for (auto i = start_seq; i < seq_end; i++) { - ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), - (i - start_seq) * 100 + 52200); - ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), - (i - start_seq) * 100 + 52400); + // aged out entries allowed to report time=0 + // FIXME: should be <= + if ((seq_end - i) * 100 < 10000) { + ASSERT_GE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) * 100 - 100); + } + ASSERT_LE(tp_mapping.GetProximalTimeBeforeSeqno(i), + start_time + (i - start_seq) * 100); } ASSERT_OK(db_->Close()); } @@ -620,14 +612,12 @@ TEST_P(SeqnoTimeTablePropTest, MultiCFs) { ASSERT_GE(seqs.size(), 99); ASSERT_LE(seqs.size(), 101); - for (int j = 0; j < 2; j++) { for (int i = 0; i < 200; i++) { ASSERT_OK(Put(0, Key(i), "value")); dbfull()->TEST_WaitForPeriodicTaskRun( [&] { mock_clock_->MockSleepForSeconds(static_cast(100)); }); } ASSERT_OK(Flush(0)); - } ASSERT_OK(dbfull()->TEST_WaitForCompact()); tables_props.clear(); ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props)); @@ -735,8 +725,9 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { ASSERT_OK(tp_mapping.Sort()); ASSERT_FALSE(tp_mapping.Empty()); auto seqs = tp_mapping.TEST_GetInternalMapping(); - ASSERT_GE(seqs.size(), 10 - 1); - ASSERT_LE(seqs.size(), 10 + 1); + // Add (roughly) one for starting entry. + ASSERT_GE(seqs.size(), 10); + ASSERT_LE(seqs.size(), 10 + 2); } // Trigger a compaction @@ -826,6 +817,203 @@ TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) { Close(); } +TEST_P(SeqnoTimeTablePropTest, PrePopulateInDB) { + Options base_options = CurrentOptions(); + base_options.env = mock_env_.get(); + base_options.disable_auto_compactions = true; + base_options.create_missing_column_families = true; + Options track_options = base_options; + constexpr uint32_t kPreserveSecs = 1234567; + SetTrackTimeDurationOptions(kPreserveSecs, track_options); + SeqnoToTimeMapping sttm; + SequenceNumber latest_seqno; + uint64_t start_time, end_time; + + // #### DB#1, #2: No pre-population without preserve/preclude #### + // #### But a single entry is added when preserve/preclude enabled #### + for (bool with_write : {false, true}) { + SCOPED_TRACE("with_write=" + std::to_string(with_write)); + DestroyAndReopen(base_options); + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + ASSERT_TRUE(sttm.Empty()); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U); + + if (with_write) { + // Ensure that writes before new CF with preserve/preclude option don't + // interfere with the seqno-to-time mapping getting a starting entry. + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + } else { + // FIXME: currently, starting entry after CreateColumnFamily requires + // non-zero seqno + ASSERT_OK(Delete("blah")); + } + + // Unfortunately, if we add a CF with preserve/preclude option after + // open, that does not reserve seqnos with pre-populated time mappings. 
+ CreateColumnFamilies({"one"}, track_options); + + // No pre-population (unfortunately), just a single starting entry + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + latest_seqno = db_->GetLatestSequenceNumber(); + start_time = mock_clock_->NowSeconds(); + ASSERT_EQ(sttm.Size(), 1); + ASSERT_EQ(latest_seqno, 1U); + // Current time maps to starting entry / seqno + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U); + // Any older times are unknown. + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1), + kUnknownSeqnoBeforeAll); + + // Now check that writes can proceed normally (passing about 20% of preserve + // time) + for (int i = 0; i < 20; i++) { + ASSERT_OK(Put(Key(i), "value")); + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kPreserveSecs / 99)); + }); + } + ASSERT_OK(Flush()); + + // Check that mappings are getting populated + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + latest_seqno = db_->GetLatestSequenceNumber(); + end_time = mock_clock_->NowSeconds(); + ASSERT_EQ(sttm.Size(), 21); + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno); + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), 1U); + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - 1), + kUnknownSeqnoBeforeAll); + } + + // ### DB#3, #4: Read-only DB with preserve/preclude after not #### + // Make sure we don't hit issues with read-only DBs, which don't need + // the mapping in the DB state (though it wouldn't hurt anything) + for (bool with_write : {false, true}) { + SCOPED_TRACE("with_write=" + std::to_string(with_write)); + DestroyAndReopen(base_options); + if (with_write) { + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Flush()); + } + + ASSERT_OK(ReadOnlyReopen(base_options)); + if (with_write) { + ASSERT_EQ(Get("foo"), "bar"); + } + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + ASSERT_EQ(sttm.Size(), 0); + if (!with_write) { + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0); + } + + ASSERT_OK(ReadOnlyReopen(track_options)); + if (with_write) { + ASSERT_EQ(Get("foo"), "bar"); + } + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + ASSERT_EQ(sttm.Size(), 0); + if (!with_write) { + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0); + + // And even if we re-open read-write, we do not get pre-population, + // because that's only for new DBs. + Reopen(track_options); + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + ASSERT_EQ(sttm.Size(), 0); + ASSERT_EQ(db_->GetLatestSequenceNumber(), 0); + } + } + + // #### DB#5: Destroy and open with preserve/preclude option #### + DestroyAndReopen(track_options); + + // Ensure pre-population + constexpr auto kPrePopPairs = SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST; + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + latest_seqno = db_->GetLatestSequenceNumber(); + start_time = mock_clock_->NowSeconds(); + ASSERT_EQ(sttm.Size(), kPrePopPairs); + // One nono-zero sequence number per pre-populated pair (this could be + // revised if we want to use interpolation for better approximate time + // mappings with no guarantee of erring in just one direction). + ASSERT_EQ(latest_seqno, kPrePopPairs); + // Current time maps to last pre-allocated seqno + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno); + // Oldest tracking time maps to first pre-allocated seqno + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1); + + // In more detail, check that estimated seqnos (pre-allocated) are uniformly + // spread over the tracked time. 
+ for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) { + // Round up query time + uint64_t t = start_time - kPreserveSecs + + static_cast(ratio * kPreserveSecs + 0.9999999); + // Round down estimated seqno + SequenceNumber s = + static_cast(ratio * (latest_seqno - 1)) + 1; + // Match + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(t), s); + } + + // Now check that writes can proceed normally (passing about 20% of preserve + // time) + for (int i = 0; i < 20; i++) { + ASSERT_OK(Put(Key(i), "value")); + dbfull()->TEST_WaitForPeriodicTaskRun([&] { + mock_clock_->MockSleepForSeconds(static_cast(kPreserveSecs / 99)); + }); + } + ASSERT_OK(Flush()); + + // Can still see some pre-populated mappings, though some displaced + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + latest_seqno = db_->GetLatestSequenceNumber(); + end_time = mock_clock_->NowSeconds(); + ASSERT_EQ(sttm.Size(), kPrePopPairs); + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(end_time), latest_seqno); + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs / 2), + kPrePopPairs / 2); + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), + kUnknownSeqnoBeforeAll); + + // Make sure we don't hit issues with read-only DBs, which don't need + // the mapping in the DB state (though it wouldn't hurt anything) + ASSERT_OK(ReadOnlyReopen(track_options)); + ASSERT_EQ(Get(Key(0)), "value"); + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + ASSERT_EQ(sttm.Size(), 0); + + // #### DB#6: Destroy and open+create an extra CF with preserve/preclude #### + // (default CF does not have the option) + Destroy(track_options); + ReopenWithColumnFamilies({"default", "one"}, + List({base_options, track_options})); + + // Ensure pre-population (not as exhaustive checking here) + sttm = dbfull()->TEST_GetSeqnoToTimeMapping(); + latest_seqno = db_->GetLatestSequenceNumber(); + start_time = mock_clock_->NowSeconds(); + ASSERT_EQ(sttm.Size(), kPrePopPairs); + // One nono-zero sequence number per pre-populated pair (this could be + // revised if we want to use interpolation for better approximate time + // mappings with no guarantee of erring in just one direction). + ASSERT_EQ(latest_seqno, kPrePopPairs); + // Current time maps to last pre-allocated seqno + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time), latest_seqno); + // Oldest tracking time maps to first pre-allocated seqno + ASSERT_EQ(sttm.GetProximalSeqnoBeforeTime(start_time - kPreserveSecs), 1); + + // Even after no writes and DB re-open without tracking options, sequence + // numbers should not go backward into those that were pre-allocated. 
+ // (Future work: persist the mapping) + ReopenWithColumnFamilies({"default", "one"}, + List({base_options, base_options})); + ASSERT_EQ(latest_seqno, db_->GetLatestSequenceNumber()); + + Close(); +} + TEST_F(SeqnoTimeTest, MappingAppend) { SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); @@ -843,8 +1031,9 @@ TEST_F(SeqnoTimeTest, MappingAppend) { ASSERT_FALSE(test.Append(8, 12)); ASSERT_EQ(size, test.Size()); - // Append with the same seqno, newer time will be accepted - ASSERT_TRUE(test.Append(10, 12)); + // Append with the same seqno, newer time is rejected because that makes + // GetProximalSeqnoBeforeTime queries worse (see later test) + ASSERT_FALSE(test.Append(10, 12)); ASSERT_EQ(size, test.Size()); // older time will be ignored ASSERT_FALSE(test.Append(10, 9)); @@ -853,25 +1042,220 @@ TEST_F(SeqnoTimeTest, MappingAppend) { // new seqno with old time will be ignored ASSERT_FALSE(test.Append(12, 8)); ASSERT_EQ(size, test.Size()); + + // new seqno with same time is accepted by replacing last entry + // (improves GetProximalSeqnoBeforeTime queries without blowing up size) + ASSERT_TRUE(test.Append(12, 11)); + ASSERT_EQ(size, test.Size()); } -TEST_F(SeqnoTimeTest, GetOldestApproximateTime) { +TEST_F(SeqnoTimeTest, ProximalFunctions) { SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); - ASSERT_EQ(test.GetOldestApproximateTime(10), kUnknownSeqnoTime); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1), kUnknownTimeBeforeAll); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), + kUnknownTimeBeforeAll); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1), kUnknownSeqnoBeforeAll); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), + kUnknownSeqnoBeforeAll); + + // (Taken from example in SeqnoToTimeMapping class comment) + // Time 500 is after seqno 10 and before seqno 11 + EXPECT_TRUE(test.Append(10, 500)); + + // Seqno too early + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(9), kUnknownTimeBeforeAll); + // We only know that 500 is after 10 + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll); + // Found + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 500U); + + // Time too early + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll); + // Found + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 10U); + + // More samples + EXPECT_TRUE(test.Append(20, 600)); + EXPECT_TRUE(test.Append(30, 700)); + + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(20), 500U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(21), 600U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(1000000000000U), 700U); + + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(499), kUnknownSeqnoBeforeAll); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(501), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(601), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U); + 
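The query semantics exercised in this test reduce to two binary searches over the sorted samples: find the last entry with seqno strictly less than the query (for a time bound), or the last entry with time at or before the query (for a seqno bound). A self-contained sketch under those assumptions, mirroring the seqno_to_time_mapping.cc changes later in this diff:

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

using SequenceNumber = uint64_t;
struct SeqnoTimePair {
  SequenceNumber seqno;
  uint64_t time;
};

constexpr uint64_t kUnknownTimeBeforeAll = 0;
constexpr SequenceNumber kUnknownSeqnoBeforeAll = 0;

// Best known time no later than the write of `seqno`, or "unknown".
uint64_t GetProximalTimeBeforeSeqno(const std::vector<SeqnoTimePair>& pairs,
                                    SequenceNumber seqno) {
  // First sample with seqno >= the query; the answer comes from the sample
  // before it.
  auto it = std::lower_bound(
      pairs.begin(), pairs.end(), seqno,
      [](const SeqnoTimePair& p, SequenceNumber s) { return p.seqno < s; });
  if (it == pairs.begin()) {
    return kUnknownTimeBeforeAll;
  }
  return std::prev(it)->time;
}

// Largest seqno known to be written no later than `time`, or "unknown".
SequenceNumber GetProximalSeqnoBeforeTime(
    const std::vector<SeqnoTimePair>& pairs, uint64_t time) {
  // First sample with time > the query; the answer comes from the sample
  // before it.
  auto it = std::upper_bound(
      pairs.begin(), pairs.end(), time,
      [](uint64_t t, const SeqnoTimePair& p) { return t < p.time; });
  if (it == pairs.begin()) {
    return kUnknownSeqnoBeforeAll;
  }
  return std::prev(it)->seqno;
}

With the example samples {10, 500}, {20, 600}, {30, 700}, these return the same answers the assertions above expect, for example GetProximalSeqnoBeforeTime(599) == 10 and GetProximalTimeBeforeSeqno(21) == 600.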
EXPECT_EQ(test.GetProximalSeqnoBeforeTime(701), 30U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(1000000000000U), 30U); + + // Redundant sample ignored + EXPECT_EQ(test.Size(), 3U); + EXPECT_FALSE(test.Append(30, 700)); + EXPECT_EQ(test.Size(), 3U); + + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U); + + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U); + + // Later sample with same seqno is ignored, to provide best results + // for GetProximalSeqnoBeforeTime function while saving entries + // in SeqnoToTimeMapping. + EXPECT_FALSE(test.Append(30, 800)); + + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U); + // Could return 800, but saving space in SeqnoToTimeMapping instead. + // Can reconsider if/when GetProximalTimeBeforeSeqno is used in + // production. + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(31), 700U); + + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U); + // If the existing {30, 700} entry were replaced with {30, 800}, this + // would return seqno 20 instead of 30, which would preclude more than + // necessary for "preclude_last_level_data_seconds" feature. + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(800), 30U); + + // Still OK + EXPECT_TRUE(test.Append(40, 900)); + + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(30), 600U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(41), 900U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 40U); + + // Burst of writes during a short time creates an opportunity + // for better results from GetProximalSeqnoBeforeTime(), at the + // expense of GetProximalTimeBeforeSeqno(). 
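The Append() behavior relied on above is a deliberate trade-off: a repeated seqno with a newer time is rejected, while a newer seqno with the same time replaces the last entry, because both choices keep GetProximalSeqnoBeforeTime() answers as recent as possible without growing the list. A simplified sketch of that policy (the real implementation appears in the seqno_to_time_mapping.cc hunk below and also handles capacity limits):

#include <cstdint>
#include <deque>

using SequenceNumber = uint64_t;
struct SeqnoTimePair {
  SequenceNumber seqno;
  uint64_t time;
};

// Simplified Append policy: prefer samples that keep
// GetProximalSeqnoBeforeTime() answers as recent as possible without
// growing the list.
bool Append(std::deque<SeqnoTimePair>& pairs, SequenceNumber seqno,
            uint64_t time) {
  if (!pairs.empty()) {
    const SeqnoTimePair& last = pairs.back();
    if (seqno < last.seqno || time < last.time) {
      return false;  // out-of-order sample, ignore
    }
    if (seqno == last.seqno) {
      // A newer time for an existing seqno would only make
      // GetProximalSeqnoBeforeTime() answers worse, so reject it.
      return false;
    }
    if (time == last.time) {
      // Same time, newer seqno: replace the last entry in place.
      pairs.back().seqno = seqno;
      return true;
    }
  }
  pairs.push_back({seqno, time});
  return true;
}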
+ EXPECT_TRUE(test.Append(50, 900)); + + // These are subject to later revision depending on priorities + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(49), 700U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(51), 900U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 30U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(900), 50U); +} - test.Append(3, 10); +TEST_F(SeqnoTimeTest, PrePopulate) { + SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10); - ASSERT_EQ(test.GetOldestApproximateTime(2), kUnknownSeqnoTime); - ASSERT_EQ(test.GetOldestApproximateTime(3), 10); - ASSERT_EQ(test.GetOldestApproximateTime(10), 10); + EXPECT_EQ(test.Size(), 0U); - test.Append(10, 100); + // Smallest case is like two Appends + test.PrePopulate(10, 11, 500, 600); + + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(10), kUnknownTimeBeforeAll); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(11), 500U); + EXPECT_EQ(test.GetProximalTimeBeforeSeqno(12), 600U); + + test.Clear(); + + // Populate a small range + uint64_t kTimeIncrement = 1234567; + test.PrePopulate(1, 12, kTimeIncrement, kTimeIncrement * 2); + + for (uint64_t i = 0; i <= 12; ++i) { + // NOTE: with 1 and 12 as the pre-populated end points, the duration is + // broken into 11 equal(-ish) spans + uint64_t t = kTimeIncrement + (i * kTimeIncrement) / 11 - 1; + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(t), i); + } + + test.Clear(); + + // Populate an excessively large range (in the future we might want to + // interpolate estimated times for seqnos between entries) + test.PrePopulate(1, 34567, kTimeIncrement, kTimeIncrement * 2); + + for (auto ratio : {0.0, 0.433, 0.678, 0.987, 1.0}) { + // Round up query time + uint64_t t = kTimeIncrement + + static_cast(ratio * kTimeIncrement + 0.9999999); + // Round down estimated seqno + SequenceNumber s = static_cast(ratio * (34567 - 1)) + 1; + // Match + // TODO: for now this is exact, but in the future might need approximation + // bounds to account for limited samples. + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(t), s); + } +} + +TEST_F(SeqnoTimeTest, TruncateOldEntries) { + constexpr uint64_t kMaxTimeDuration = 42; + SeqnoToTimeMapping test(kMaxTimeDuration, /*max_capacity=*/10); + + EXPECT_EQ(test.Size(), 0U); + + // Safe on empty mapping + test.TruncateOldEntries(500); + + EXPECT_EQ(test.Size(), 0U); + + // (Taken from example in SeqnoToTimeMapping class comment) + // Time 500 is after seqno 10 and before seqno 11 + EXPECT_TRUE(test.Append(10, 500)); + EXPECT_TRUE(test.Append(20, 600)); + EXPECT_TRUE(test.Append(30, 700)); + EXPECT_TRUE(test.Append(40, 800)); + EXPECT_TRUE(test.Append(50, 900)); + + EXPECT_EQ(test.Size(), 5U); + + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), 10U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U); + // etc. 
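The truncation behavior verified in this test keeps the newest entry at or before the cutoff, so that GetProximalSeqnoBeforeTime(now - max_time_duration) still has an answer, and it never drops the final entry. A standalone sketch of that policy, mirroring the TruncateOldEntries change in seqno_to_time_mapping.cc below:

#include <algorithm>
#include <cstdint>
#include <deque>

using SequenceNumber = uint64_t;
struct SeqnoTimePair {
  SequenceNumber seqno;
  uint64_t time;
};

// Drop entries that can no longer influence GetProximalSeqnoBeforeTime()
// for times newer than now - max_time_duration, keeping the entry that
// straddles the cutoff (and therefore always keeping the last entry).
void TruncateOldEntries(std::deque<SeqnoTimePair>& pairs, uint64_t now,
                        uint64_t max_time_duration) {
  if (max_time_duration == 0 || now < max_time_duration) {
    return;  // no cutoff configured, or the cutoff would under-flow
  }
  const uint64_t cut_off_time = now - max_time_duration;
  // First entry with time strictly greater than the cutoff.
  auto it = std::upper_bound(
      pairs.cbegin(), pairs.cend(), cut_off_time,
      [](uint64_t t, const SeqnoTimePair& p) { return t < p.time; });
  if (it == pairs.cbegin()) {
    return;
  }
  --it;  // keep the entry that answers queries at the cutoff itself
  pairs.erase(pairs.cbegin(), it);
}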
+ + // Must keep first entry + test.TruncateOldEntries(500 + kMaxTimeDuration); + EXPECT_EQ(test.Size(), 5U); + test.TruncateOldEntries(599 + kMaxTimeDuration); + EXPECT_EQ(test.Size(), 5U); + + // Purges first entry + test.TruncateOldEntries(600 + kMaxTimeDuration); + EXPECT_EQ(test.Size(), 4U); + + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(500), kUnknownSeqnoBeforeAll); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(599), kUnknownSeqnoBeforeAll); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(600), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(699), 20U); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(700), 30U); + + // No effect + test.TruncateOldEntries(600 + kMaxTimeDuration); + EXPECT_EQ(test.Size(), 4U); + test.TruncateOldEntries(699 + kMaxTimeDuration); + EXPECT_EQ(test.Size(), 4U); + + // Purges next two + test.TruncateOldEntries(899 + kMaxTimeDuration); + EXPECT_EQ(test.Size(), 2U); + + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(799), kUnknownSeqnoBeforeAll); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(899), 40U); + + // Always keep last entry, to have a non-trivial seqno bound + test.TruncateOldEntries(10000000); + EXPECT_EQ(test.Size(), 1U); - test.Append(100, 1000); - ASSERT_EQ(test.GetOldestApproximateTime(10), 100); - ASSERT_EQ(test.GetOldestApproximateTime(40), 100); - ASSERT_EQ(test.GetOldestApproximateTime(111), 1000); + EXPECT_EQ(test.GetProximalSeqnoBeforeTime(10000000), 50U); } TEST_F(SeqnoTimeTest, Sort) { @@ -930,10 +1314,10 @@ TEST_F(SeqnoTimeTest, EncodeDecodeBasic) { for (SequenceNumber seq = 0; seq <= 1000; seq++) { // test has the more accurate time mapping, encode only pick // kMaxSeqnoTimePairsPerSST number of entries, which is less accurate - uint64_t target_time = test.GetOldestApproximateTime(seq); - ASSERT_GE(decoded.GetOldestApproximateTime(seq), + uint64_t target_time = test.GetProximalTimeBeforeSeqno(seq); + ASSERT_GE(decoded.GetProximalTimeBeforeSeqno(seq), target_time < 200 ? 
0 : target_time - 200); - ASSERT_LE(decoded.GetOldestApproximateTime(seq), target_time); + ASSERT_LE(decoded.GetProximalTimeBeforeSeqno(seq), target_time); } } @@ -987,7 +1371,6 @@ TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/db/seqno_to_time_mapping.cc b/db/seqno_to_time_mapping.cc index c69209929487..97a3e9879862 100644 --- a/db/seqno_to_time_mapping.cc +++ b/db/seqno_to_time_mapping.cc @@ -11,14 +11,34 @@ namespace ROCKSDB_NAMESPACE { -uint64_t SeqnoToTimeMapping::GetOldestApproximateTime( - const SequenceNumber seqno) const { +SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterTime( + uint64_t time) const { + return std::upper_bound(pairs_.cbegin(), pairs_.cend(), + SeqnoTimePair{0, time}, SeqnoTimePair::TimeLess); +} + +SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterEqSeqno( + SequenceNumber seqno) const { + return std::lower_bound(pairs_.cbegin(), pairs_.cend(), + SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess); +} + +SeqnoToTimeMapping::pair_const_iterator SeqnoToTimeMapping::FindGreaterSeqno( + SequenceNumber seqno) const { + return std::upper_bound(pairs_.cbegin(), pairs_.cend(), + SeqnoTimePair{seqno, 0}, SeqnoTimePair::SeqnoLess); +} + +uint64_t SeqnoToTimeMapping::GetProximalTimeBeforeSeqno( + SequenceNumber seqno) const { assert(is_sorted_); - auto it = std::upper_bound(seqno_time_mapping_.begin(), - seqno_time_mapping_.end(), seqno); - if (it == seqno_time_mapping_.begin()) { - return 0; + // Find the last entry with a seqno strictly less than the given seqno. + // First, find the first entry >= the given seqno (or end) + auto it = FindGreaterEqSeqno(seqno); + if (it == pairs_.cbegin()) { + return kUnknownTimeBeforeAll; } + // Then return data from previous. it--; return it->time; } @@ -28,44 +48,47 @@ void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) { return; } is_sorted_ = false; - seqno_time_mapping_.emplace_back(seqno, time); + pairs_.emplace_back(seqno, time); } void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) { assert(is_sorted_); if (max_time_duration_ == 0) { + // No cutoff time + return; + } + + if (now < max_time_duration_) { + // Would under-flow return; } - const uint64_t cut_off_time = - now > max_time_duration_ ? 
now - max_time_duration_ : 0; - assert(cut_off_time <= now); // no overflow + const uint64_t cut_off_time = now - max_time_duration_; + assert(cut_off_time <= now); // no under/overflow - auto it = std::upper_bound( - seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time, - [](uint64_t target, const SeqnoTimePair& other) -> bool { - return target < other.time; - }); - if (it == seqno_time_mapping_.begin()) { + auto it = FindGreaterTime(cut_off_time); + if (it == pairs_.cbegin()) { return; } - it--; - seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it); + // Move back one, to the entry that would be used to return a good seqno from + // GetProximalSeqnoBeforeTime(cut_off_time) + --it; + // Remove everything strictly before that entry + pairs_.erase(pairs_.cbegin(), std::move(it)); } -SequenceNumber SeqnoToTimeMapping::GetOldestSequenceNum(uint64_t time) { +SequenceNumber SeqnoToTimeMapping::GetProximalSeqnoBeforeTime(uint64_t time) { assert(is_sorted_); - auto it = std::upper_bound( - seqno_time_mapping_.begin(), seqno_time_mapping_.end(), time, - [](uint64_t target, const SeqnoTimePair& other) -> bool { - return target < other.time; - }); - if (it == seqno_time_mapping_.begin()) { - return 0; + // Find the last entry with a time <= the given time. + // First, find the first entry > the given time (or end). + auto it = FindGreaterTime(time); + if (it == pairs_.cbegin()) { + return kUnknownSeqnoBeforeAll; } - it--; + // Then return data from previous. + --it; return it->seqno; } @@ -84,15 +107,13 @@ void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start, return; } - auto start_it = std::upper_bound(seqno_time_mapping_.begin(), - seqno_time_mapping_.end(), start); - if (start_it != seqno_time_mapping_.begin()) { + auto start_it = FindGreaterSeqno(start); + if (start_it != pairs_.begin()) { start_it--; } - auto end_it = std::upper_bound(seqno_time_mapping_.begin(), - seqno_time_mapping_.end(), end); - if (end_it == seqno_time_mapping_.begin()) { + auto end_it = FindGreaterSeqno(end); + if (end_it == pairs_.begin()) { return; } if (start_it >= end_it) { @@ -108,7 +129,7 @@ void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start, } } // to include the first element - if (start_it != seqno_time_mapping_.begin()) { + if (start_it != pairs_.begin()) { start_it--; } @@ -166,14 +187,14 @@ void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start, SeqnoTimePair base; for (auto it = start_it; it < end_it; it++) { assert(base < *it); - SeqnoTimePair val = *it - base; + SeqnoTimePair val = it->ComputeDelta(base); base = *it; val.Encode(dest); } } -Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) { - Slice input(seqno_time_mapping_str); +Status SeqnoToTimeMapping::Add(const std::string& pairs_str) { + Slice input(pairs_str); if (input.empty()) { return Status::OK(); } @@ -189,8 +210,8 @@ Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) { if (!s.ok()) { return s; } - val.Add(base); - seqno_time_mapping_.emplace_back(val); + val.ApplyDelta(base); + pairs_.emplace_back(val); base = val; } return Status::OK(); @@ -222,33 +243,58 @@ bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) { return false; } if (seqno == Last().seqno) { - Last().time = time; - return true; + // Updating Last() would hurt GetProximalSeqnoBeforeTime() queries, so + // NOT doing it (for now) + return false; } if (time == Last().time) { - // new sequence has the same time as old one, no 
need to add new mapping - return false; + // Updating Last() here helps GetProximalSeqnoBeforeTime() queries, so + // doing it (for now) + Last().seqno = seqno; + return true; } } - seqno_time_mapping_.emplace_back(seqno, time); + pairs_.emplace_back(seqno, time); - if (seqno_time_mapping_.size() > max_capacity_) { - seqno_time_mapping_.pop_front(); + if (pairs_.size() > max_capacity_) { + // FIXME: be smarter about how we erase to avoid data falling off the + // front prematurely. + pairs_.pop_front(); } return true; } +bool SeqnoToTimeMapping::PrePopulate(SequenceNumber from_seqno, + SequenceNumber to_seqno, + uint64_t from_time, uint64_t to_time) { + assert(Empty()); + assert(from_seqno > 0); + assert(to_seqno > from_seqno); + assert(from_time > kUnknownTimeBeforeAll); + assert(to_time >= from_time); + + // TODO: smartly limit this to max_capacity_ representative samples + for (auto i = from_seqno; i <= to_seqno; i++) { + uint64_t t = from_time + (to_time - from_time) * (i - from_seqno) / + (to_seqno - from_seqno); + pairs_.emplace_back(i, t); + } + + return /*success*/ true; +} + bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration, uint64_t max_time_duration) { uint64_t new_max_capacity = CalculateMaxCapacity(min_time_duration, max_time_duration); if (new_max_capacity == max_capacity_) { return false; - } else if (new_max_capacity < seqno_time_mapping_.size()) { - uint64_t delta = seqno_time_mapping_.size() - new_max_capacity; - seqno_time_mapping_.erase(seqno_time_mapping_.begin(), - seqno_time_mapping_.begin() + delta); + } else if (new_max_capacity < pairs_.size()) { + uint64_t delta = pairs_.size() - new_max_capacity; + // FIXME: be smarter about how we erase to avoid data falling off the + // front prematurely. + pairs_.erase(pairs_.begin(), pairs_.begin() + delta); } max_capacity_ = new_max_capacity; return true; @@ -258,16 +304,16 @@ Status SeqnoToTimeMapping::Sort() { if (is_sorted_) { return Status::OK(); } - if (seqno_time_mapping_.empty()) { + if (pairs_.empty()) { is_sorted_ = true; return Status::OK(); } - std::deque copy = std::move(seqno_time_mapping_); + std::deque copy = std::move(pairs_); std::sort(copy.begin(), copy.end()); - seqno_time_mapping_.clear(); + pairs_.clear(); // remove seqno = 0, which may have special meaning, like zeroed out data while (copy.front().seqno == 0) { @@ -285,12 +331,12 @@ Status SeqnoToTimeMapping::Sort() { assert(it.seqno > prev.seqno); // If a larger sequence number has an older time which is not useful, skip if (it.time > prev.time) { - seqno_time_mapping_.push_back(prev); + pairs_.push_back(prev); prev = it; } } } - seqno_time_mapping_.emplace_back(prev); + pairs_.emplace_back(prev); is_sorted_ = true; return Status::OK(); @@ -298,7 +344,7 @@ Status SeqnoToTimeMapping::Sort() { std::string SeqnoToTimeMapping::ToHumanString() const { std::string ret; - for (const auto& seq_time : seqno_time_mapping_) { + for (const auto& seq_time : pairs_) { AppendNumberTo(&ret, seq_time.seqno); ret.append("->"); AppendNumberTo(&ret, seq_time.time); @@ -310,13 +356,11 @@ std::string SeqnoToTimeMapping::ToHumanString() const { SeqnoToTimeMapping SeqnoToTimeMapping::Copy( SequenceNumber smallest_seqno) const { SeqnoToTimeMapping ret; - auto it = std::upper_bound(seqno_time_mapping_.begin(), - seqno_time_mapping_.end(), smallest_seqno); - if (it != seqno_time_mapping_.begin()) { + auto it = FindGreaterSeqno(smallest_seqno); + if (it != pairs_.begin()) { it--; } - std::copy(it, seqno_time_mapping_.end(), - 
std::back_inserter(ret.seqno_time_mapping_)); + std::copy(it, pairs_.end(), std::back_inserter(ret.pairs_)); return ret; } @@ -330,12 +374,4 @@ uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration, max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration); } -SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-( - const SeqnoTimePair& other) const { - SeqnoTimePair res; - res.seqno = seqno - other.seqno; - res.time = time - other.time; - return res; -} - } // namespace ROCKSDB_NAMESPACE diff --git a/db/seqno_to_time_mapping.h b/db/seqno_to_time_mapping.h index 4ffc9c199265..95a4455be186 100644 --- a/db/seqno_to_time_mapping.h +++ b/db/seqno_to_time_mapping.h @@ -18,20 +18,32 @@ namespace ROCKSDB_NAMESPACE { -constexpr uint64_t kUnknownSeqnoTime = 0; - -// SeqnoToTimeMapping stores the sequence number to time mapping, so given a -// sequence number it can estimate the oldest possible time for that sequence -// number. For example: -// 10 -> 100 -// 50 -> 300 -// then if a key has seqno 19, the OldestApproximateTime would be 100, for 51 it -// would be 300. -// As it's a sorted list, the new entry is inserted from the back. The old data -// will be popped from the front if they're no longer used. +constexpr uint64_t kUnknownTimeBeforeAll = 0; +constexpr SequenceNumber kUnknownSeqnoBeforeAll = 0; + +// SeqnoToTimeMapping stores a sampled mapping from sequence numbers to +// unix times (seconds since epoch). This information provides rough bounds +// between sequence numbers and their write times, but is primarily designed +// for getting a best lower bound on the sequence number of data written no +// later than a specified time. // -// Note: the data struct is not thread safe, both read and write need to be -// synchronized by caller. +// For ease of sampling, it is assumed that the recorded time in each pair +// comes at or after the sequence number and before the next sequence number, +// so this example: +// +// Seqno: 10, 11, ... 20, 21, ... 30, 31, ... +// Time: ... 500 ... 600 ... 700 ... +// +// would be represented as +// 10 -> 500 +// 20 -> 600 +// 30 -> 700 +// +// In typical operation, the list is sorted, both among seqnos and among times, +// with a bounded number of entries, but some public working states violate +// these constraints. +// +// NOT thread safe - requires external synchronization. class SeqnoToTimeMapping { public: // Maximum number of entries can be encoded into SST. 
The data is delta encode @@ -63,28 +75,33 @@ class SeqnoToTimeMapping { // Decode the value from input Slice and remove it from the input Status Decode(Slice& input); - // subtraction of 2 SeqnoTimePair - SeqnoTimePair operator-(const SeqnoTimePair& other) const; - - // Add 2 values together - void Add(const SeqnoTimePair& obj) { - seqno += obj.seqno; - time += obj.time; + // For delta encoding + SeqnoTimePair ComputeDelta(const SeqnoTimePair& base) const { + return {seqno - base.seqno, time - base.time}; } - // Compare SeqnoTimePair with a sequence number, used for binary search a - // sequence number in a list of SeqnoTimePair - bool operator<(const SequenceNumber& other) const { return seqno < other; } + // For delta decoding + void ApplyDelta(const SeqnoTimePair& delta_or_base) { + seqno += delta_or_base.seqno; + time += delta_or_base.time; + } - // Compare 2 SeqnoTimePair + // Ordering used for Sort() bool operator<(const SeqnoTimePair& other) const { return std::tie(seqno, time) < std::tie(other.seqno, other.time); } - // Check if 2 SeqnoTimePair is the same bool operator==(const SeqnoTimePair& other) const { return std::tie(seqno, time) == std::tie(other.seqno, other.time); } + + static bool SeqnoLess(const SeqnoTimePair& a, const SeqnoTimePair& b) { + return a.seqno < b.seqno; + } + + static bool TimeLess(const SeqnoTimePair& a, const SeqnoTimePair& b) { + return a.time < b.time; + } }; // constractor of SeqnoToTimeMapping @@ -99,20 +116,40 @@ class SeqnoToTimeMapping { uint64_t max_capacity = 0) : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {} + // Both seqno range and time range are inclusive. ... TODO + // + bool PrePopulate(SequenceNumber from_seqno, SequenceNumber to_seqno, + uint64_t from_time, uint64_t to_time); + // Append a new entry to the list. The new entry should be newer than the // existing ones. It maintains the internal sorted status. bool Append(SequenceNumber seqno, uint64_t time); - // Given a sequence number, estimate it's oldest time - uint64_t GetOldestApproximateTime(SequenceNumber seqno) const; - - // Truncate the old entries based on the current time and max_time_duration_ + // Given a sequence number, return the best (largest / newest) known time + // that is no later than the write time of that given sequence number. + // If no such specific time is known, returns kUnknownTimeBeforeAll. + // Using the example in the class comment above, + // GetProximalTimeBeforeSeqno(10) -> kUnknownTimeBeforeAll + // GetProximalTimeBeforeSeqno(11) -> 500 + // GetProximalTimeBeforeSeqno(20) -> 500 + // GetProximalTimeBeforeSeqno(21) -> 600 + uint64_t GetProximalTimeBeforeSeqno(SequenceNumber seqno) const; + + // Remove any entries not needed for GetProximalSeqnoBeforeTime queries of + // times older than `now - max_time_duration_` void TruncateOldEntries(uint64_t now); - // Given a time, return it's oldest possible sequence number - SequenceNumber GetOldestSequenceNum(uint64_t time); - - // Encode to a binary string + // Given a time, return the best (largest) sequence number whose write time + // is no later than that given time. If no such specific sequence number is + // known, returns kUnknownSeqnoBeforeAll. Using the example in the class + // comment above, + // GetProximalSeqnoBeforeTime(499) -> kUnknownSeqnoBeforeAll + // GetProximalSeqnoBeforeTime(500) -> 10 + // GetProximalSeqnoBeforeTime(599) -> 10 + // GetProximalSeqnoBeforeTime(600) -> 20 + SequenceNumber GetProximalSeqnoBeforeTime(uint64_t time); + + // Encode to a binary string. 
start and end seqno are both inclusive. void Encode(std::string& des, SequenceNumber start, SequenceNumber end, uint64_t now, uint64_t output_size = kMaxSeqnoTimePairsPerSST) const; @@ -122,10 +159,10 @@ class SeqnoToTimeMapping { void Add(SequenceNumber seqno, uint64_t time); // Decode and add the entries to the current obj. The list will be unsorted - Status Add(const std::string& seqno_time_mapping_str); + Status Add(const std::string& pairs_str); // Return the number of entries - size_t Size() const { return seqno_time_mapping_.size(); } + size_t Size() const { return pairs_.size(); } // Reduce the size of internal list bool Resize(uint64_t min_time_duration, uint64_t max_time_duration); @@ -145,10 +182,10 @@ class SeqnoToTimeMapping { SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const; // If the internal list is empty - bool Empty() const { return seqno_time_mapping_.empty(); } + bool Empty() const { return pairs_.empty(); } // clear all entries - void Clear() { seqno_time_mapping_.clear(); } + void Clear() { pairs_.clear(); } // return the string for user message // Note: Not efficient, okay for print @@ -156,7 +193,7 @@ class SeqnoToTimeMapping { #ifndef NDEBUG const std::deque& TEST_GetInternalMapping() const { - return seqno_time_mapping_; + return pairs_; } #endif @@ -167,7 +204,7 @@ class SeqnoToTimeMapping { uint64_t max_time_duration_; uint64_t max_capacity_; - std::deque seqno_time_mapping_; + std::deque pairs_; bool is_sorted_ = true; @@ -176,14 +213,14 @@ class SeqnoToTimeMapping { SeqnoTimePair& Last() { assert(!Empty()); - return seqno_time_mapping_.back(); + return pairs_.back(); } -}; -// for searching the sequence number from SeqnoToTimeMapping -inline bool operator<(const SequenceNumber& seqno, - const SeqnoToTimeMapping::SeqnoTimePair& other) { - return seqno < other.seqno; -} + using pair_const_iterator = + std::deque::const_iterator; + pair_const_iterator FindGreaterTime(uint64_t time) const; + pair_const_iterator FindGreaterSeqno(SequenceNumber seqno) const; + pair_const_iterator FindGreaterEqSeqno(SequenceNumber seqno) const; +}; } // namespace ROCKSDB_NAMESPACE diff --git a/db/snapshot_checker.h b/db/snapshot_checker.h index 0bfb1aa07a15..b7ff1df8c010 100644 --- a/db/snapshot_checker.h +++ b/db/snapshot_checker.h @@ -52,9 +52,7 @@ class WritePreparedSnapshotChecker : public SnapshotChecker { SequenceNumber sequence, SequenceNumber snapshot_sequence) const override; private: -#ifndef ROCKSDB_LITE const WritePreparedTxnDB* const txn_db_; -#endif // !ROCKSDB_LITE }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/table_cache.cc b/db/table_cache.cc index a5fa5fbe3c2e..b4f0d770563b 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -53,7 +53,6 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) { sizeof(*file_number)); } -#ifndef ROCKSDB_LITE void AppendVarint64(IterKey* key, uint64_t v) { char buf[10]; @@ -61,7 +60,6 @@ void AppendVarint64(IterKey* key, uint64_t v) { key->TrimAppend(key->Size(), buf, ptr - buf); } -#endif // ROCKSDB_LITE } // anonymous namespace @@ -77,7 +75,7 @@ TableCache::TableCache(const ImmutableOptions& ioptions, cache_(cache), immortal_tables_(false), block_cache_tracer_(block_cache_tracer), - loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr), + loader_mutex_(kLoadConcurency), io_tracer_(io_tracer), db_session_id_(db_session_id) { if (ioptions_.row_cache) { @@ -92,8 +90,9 @@ TableCache::~TableCache() {} Status TableCache::GetTableReader( const ReadOptions& ro, const FileOptions& file_options, 
const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats, - HistogramImpl* file_read_hist, std::unique_ptr* table_reader, + const FileMetaData& file_meta, bool sequential_mode, + uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, + std::unique_ptr* table_reader, const std::shared_ptr& prefix_extractor, bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { @@ -112,13 +111,17 @@ Status TableCache::GetTableReader( RecordTick(ioptions_.stats, NO_FILE_OPENS); } else if (s.IsPathNotFound()) { fname = Rocks2LevelTableFileName(fname); - s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); - if (s.ok()) { - s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, - nullptr); + // If this file is also not found, we want to use the error message + // that contains the table file name which is less confusing. + Status temp_s = + PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options); + if (temp_s.ok()) { + temp_s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file, + nullptr); } - if (s.ok()) { + if (temp_s.ok()) { RecordTick(ioptions_.stats, NO_FILE_OPENS); + s = temp_s; } } @@ -126,13 +129,17 @@ Status TableCache::GetTableReader( if (!sequential_mode && ioptions_.advise_random_on_open) { file->Hint(FSRandomAccessFile::kRandom); } + if (ioptions_.default_temperature != Temperature::kUnknown && + file_temperature == Temperature::kUnknown) { + file_temperature = ioptions_.default_temperature; + } StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS); std::unique_ptr file_reader( - new RandomAccessFileReader( - std::move(file), fname, ioptions_.clock, io_tracer_, - record_read_stats ? 
ioptions_.stats : nullptr, SST_READ_MICROS, - file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners, - file_temperature, level == ioptions_.num_levels - 1)); + new RandomAccessFileReader(std::move(file), fname, ioptions_.clock, + io_tracer_, ioptions_.stats, SST_READ_MICROS, + file_read_hist, ioptions_.rate_limiter.get(), + ioptions_.listeners, file_temperature, + level == ioptions_.num_levels - 1)); UniqueId64x2 expected_unique_id; if (ioptions_.verify_sst_unique_id_in_manifest) { expected_unique_id = file_meta.unique_id; @@ -141,12 +148,14 @@ Status TableCache::GetTableReader( } s = ioptions_.table_factory->NewTableReader( ro, - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, - block_cache_tracer_, max_file_size_for_l0_meta_pin, - db_session_id_, file_meta.fd.GetNumber(), - expected_unique_id, file_meta.fd.largest_seqno), + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, + block_protection_bytes_per_key, skip_filters, immortal_tables_, + false /* force_direct_prefetch */, level, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, + file_meta.fd.GetNumber(), expected_unique_id, + file_meta.fd.largest_seqno, file_meta.tail_size, + file_meta.user_defined_timestamps_persisted), std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); @@ -158,9 +167,10 @@ Status TableCache::FindTable( const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle** handle, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, - const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist, - bool skip_filters, int level, bool prefetch_index_and_filter_in_cache, + const bool no_io, HistogramImpl* file_read_hist, bool skip_filters, + int level, bool prefetch_index_and_filter_in_cache, size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) { PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock); uint64_t number = file_meta.fd.GetNumber(); @@ -173,7 +183,7 @@ Status TableCache::FindTable( if (no_io) { return Status::Incomplete("Table not found in table_cache, no_io is set"); } - MutexLock load_lock(loader_mutex_.get(key)); + MutexLock load_lock(&loader_mutex_.Get(key)); // We check the cache again under loading mutex *handle = cache_.Lookup(key); if (*handle != nullptr) { @@ -181,12 +191,12 @@ Status TableCache::FindTable( } std::unique_ptr table_reader; - Status s = - GetTableReader(ro, file_options, internal_comparator, file_meta, - false /* sequential mode */, record_read_stats, - file_read_hist, &table_reader, prefix_extractor, - skip_filters, level, prefetch_index_and_filter_in_cache, - max_file_size_for_l0_meta_pin, file_temperature); + Status s = GetTableReader(ro, file_options, internal_comparator, file_meta, + false /* sequential mode */, + block_protection_bytes_per_key, file_read_hist, + &table_reader, prefix_extractor, skip_filters, + level, prefetch_index_and_filter_in_cache, + max_file_size_for_l0_meta_pin, file_temperature); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.stats, NO_FILE_ERRORS); @@ -214,6 +224,7 @@ InternalIterator* TableCache::NewIterator( size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* 
largest_compaction_key, bool allow_unprepared_value, + uint8_t block_protection_bytes_per_key, const SequenceNumber* read_seqno, TruncatedRangeDelIterator** range_del_iter) { PERF_TIMER_GUARD(new_table_iterator_nanos); @@ -227,12 +238,12 @@ InternalIterator* TableCache::NewIterator( auto& fd = file_meta.fd; table_reader = fd.table_reader; if (table_reader == nullptr) { - s = FindTable( - options, file_options, icomparator, file_meta, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - !for_compaction /* record_read_stats */, file_read_hist, skip_filters, - level, true /* prefetch_index_and_filter_in_cache */, - max_file_size_for_l0_meta_pin, file_meta.temperature); + s = FindTable(options, file_options, icomparator, file_meta, &handle, + block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + file_read_hist, skip_filters, level, + true /* prefetch_index_and_filter_in_cache */, + max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { table_reader = cache_.Value(handle); } @@ -262,7 +273,9 @@ InternalIterator* TableCache::NewIterator( if (s.ok() && !options.ignore_range_deletions) { if (range_del_iter != nullptr) { auto new_range_del_iter = - table_reader->NewRangeTombstoneIterator(options); + read_seqno ? table_reader->NewRangeTombstoneIterator( + *read_seqno, options.timestamp) + : table_reader->NewRangeTombstoneIterator(options); if (new_range_del_iter == nullptr || new_range_del_iter->empty()) { delete new_range_del_iter; *range_del_iter = nullptr; @@ -310,7 +323,7 @@ InternalIterator* TableCache::NewIterator( Status TableCache::GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, std::unique_ptr* out_iter) { assert(out_iter); const FileDescriptor& fd = file_meta.fd; @@ -319,7 +332,7 @@ Status TableCache::GetRangeTombstoneIterator( TypedHandle* handle = nullptr; if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle); + &handle, block_protection_bytes_per_key); if (s.ok()) { t = cache_.Value(handle); } @@ -338,24 +351,25 @@ Status TableCache::GetRangeTombstoneIterator( return s; } -#ifndef ROCKSDB_LITE -void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, - const FileDescriptor& fd, - const Slice& internal_key, - GetContext* get_context, - IterKey& row_cache_key) { +uint64_t TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, + IterKey& row_cache_key) { uint64_t fd_number = fd.GetNumber(); // We use the user key as cache key instead of the internal key, // otherwise the whole cache would be invalidated every time the // sequence key increases. However, to support caching snapshot - // reads, we append the sequence number (incremented by 1 to - // distinguish from 0) only in this case. + // reads, we append a sequence number (incremented by 1 to + // distinguish from 0) other than internal_key seq no + // to determine row cache entry visibility. // If the snapshot is larger than the largest seqno in the file, // all data should be exposed to the snapshot, so we treat it // the same as there is no snapshot. The exception is that if // a seq-checking callback is registered, some internal keys // may still be filtered out. 
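To make the comment above concrete, here is a minimal sketch of the row cache key layout it describes. The helper names below (EncodeVarint64, MakeRowCacheKey) are hypothetical; the actual code that follows uses IterKey::TrimAppend, AppendVarint64, row_cache_id_ and the file number from the FileDescriptor, but the layout is the same: row_cache_id, then varint(file number), then varint(visibility seqno), with the user key appended per lookup, and 0 reserved to mean "no snapshot constraint" (which is why a real seqno is stored as 1 + GetInternalKeySeqno).

#include <cstdint>
#include <string>

// Hypothetical stand-in for RocksDB's varint encoding; any unambiguous
// length-prefix-free encoding would do for the purposes of this sketch.
std::string EncodeVarint64(uint64_t v) {
  std::string out;
  while (v >= 0x80) {
    out.push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  out.push_back(static_cast<char>(v));
  return out;
}

// seq_for_visibility == 0 means "no snapshot constraint"; otherwise it is
// 1 + internal key seqno so that 0 stays unambiguous.
std::string MakeRowCacheKey(const std::string& row_cache_id, uint64_t file_number,
                            uint64_t seq_for_visibility,
                            const std::string& user_key) {
  std::string key = row_cache_id;          // per-DB prefix
  key += EncodeVarint64(file_number);      // which SST file
  key += EncodeVarint64(seq_for_visibility);
  key += user_key;                         // appended per lookup
  return key;
}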
- uint64_t seq_no = 0; + uint64_t cache_entry_seq_no = 0; + // Maybe we can include the whole file if snapshot == fd.largest_seqno. if (options.snapshot != nullptr && (get_context->has_callback() || @@ -364,18 +378,24 @@ void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options, // We should consider to use options.snapshot->GetSequenceNumber() // instead of GetInternalKeySeqno(k), which will make the code // easier to understand. - seq_no = 1 + GetInternalKeySeqno(internal_key); + cache_entry_seq_no = 1 + GetInternalKeySeqno(internal_key); } // Compute row cache key. row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(), row_cache_id_.size()); AppendVarint64(&row_cache_key, fd_number); - AppendVarint64(&row_cache_key, seq_no); + AppendVarint64(&row_cache_key, cache_entry_seq_no); + + // Provide a sequence number for callback checking on cache hit. + // As cache_entry_seq_no starts at 1, decrease its value by 1 to get + // a sequence number aligned with get context's logic. + return cache_entry_seq_no == 0 ? 0 : cache_entry_seq_no - 1; } bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, - size_t prefix_size, GetContext* get_context) { + size_t prefix_size, GetContext* get_context, + SequenceNumber seq_no) { bool found = false; row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size()); @@ -392,8 +412,10 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, // get_context.pinnable_slice_. Cache entry is released when // get_context.pinnable_slice_ is reset. row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner); + // If row cache hit, knowing the cache key is the same as row_cache_key, + // we can use row_cache_key's seq no to construct the InternalKey. replayGetContextLog(*row_cache.Value(row_handle), user_key, get_context, - &value_pinner); + &value_pinner, seq_no); RecordTick(ioptions_.stats, ROW_CACHE_HIT); found = true; } else { @@ -401,34 +423,33 @@ bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, return found; } -#endif // ROCKSDB_LITE Status TableCache::Get( const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, int level, size_t max_file_size_for_l0_meta_pin) { auto& fd = file_meta.fd; std::string* row_cache_entry = nullptr; bool done = false; -#ifndef ROCKSDB_LITE IterKey row_cache_key; std::string row_cache_entry_buffer; - // Check row cache if enabled. Since row cache does not currently store - // sequence numbers, we cannot use it if we need to fetch the sequence. + // Check row cache if enabled. + // Reuse row_cache_key sequence number when row cache hits.
if (ioptions_.row_cache && !get_context->NeedToReadSequence()) { auto user_key = ExtractUserKey(k); - CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); + uint64_t cache_entry_seq_no = + CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key); done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(), - get_context); + get_context, cache_entry_seq_no); if (!done) { row_cache_entry = &row_cache_entry_buffer; } } -#endif // ROCKSDB_LITE Status s; TableReader* t = fd.table_reader; TypedHandle* handle = nullptr; @@ -436,10 +457,10 @@ Status TableCache::Get( assert(s.ok()); if (t == nullptr) { s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, prefix_extractor, + &handle, block_protection_bytes_per_key, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, - level, true /* prefetch_index_and_filter_in_cache */, + file_read_hist, skip_filters, level, + true /* prefetch_index_and_filter_in_cache */, max_file_size_for_l0_meta_pin, file_meta.temperature); if (s.ok()) { t = cache_.Value(handle); @@ -475,17 +496,18 @@ Status TableCache::Get( } } -#ifndef ROCKSDB_LITE // Put the replay log in row cache only if something was found. if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) { RowCacheInterface row_cache{ioptions_.row_cache.get()}; size_t charge = row_cache_entry->capacity() + sizeof(std::string); auto row_ptr = new std::string(std::move(*row_cache_entry)); - // If row cache is full, it's OK to continue. - row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge) - .PermitUncheckedError(); + Status rcs = row_cache.Insert(row_cache_key.GetUserKey(), row_ptr, charge); + if (!rcs.ok()) { + // If row cache is full, it's OK to continue, but we keep ownership of + // row_ptr. 
+ delete row_ptr; + } } -#endif // ROCKSDB_LITE if (handle != nullptr) { cache_.Release(handle); @@ -521,9 +543,9 @@ Status TableCache::MultiGetFilter( const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle) { + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key) { auto& fd = file_meta.fd; -#ifndef ROCKSDB_LITE IterKey row_cache_key; std::string row_cache_entry_buffer; @@ -534,19 +556,19 @@ Status TableCache::MultiGetFilter( if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) { return Status::NotSupported(); } -#endif // ROCKSDB_LITE Status s; TableReader* t = fd.table_reader; TypedHandle* handle = nullptr; MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(), mget_range->end()); if (t == nullptr) { - s = FindTable( - options, file_options_, internal_comparator, file_meta, &handle, - prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, /*skip_filters=*/false, - level, true /* prefetch_index_and_filter_in_cache */, - /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); + s = FindTable(options, file_options_, internal_comparator, file_meta, + &handle, block_protection_bytes_per_key, prefix_extractor, + options.read_tier == kBlockCacheTier /* no_io */, + file_read_hist, + /*skip_filters=*/false, level, + true /* prefetch_index_and_filter_in_cache */, + /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature); if (s.ok()) { t = cache_.Value(handle); } @@ -570,10 +592,11 @@ Status TableCache::MultiGetFilter( } Status TableCache::GetTableProperties( - const FileOptions& file_options, + const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, bool no_io) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? 
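The ownership rule behind the row-cache change above (delete row_ptr when Insert fails) generalizes: when a cache's Insert takes ownership of the heap-allocated value only on success, the caller must free the value on failure instead of leaking it. A minimal sketch with a toy cache follows; this is not RocksDB's RowCacheInterface, and eviction and duplicate keys are ignored for brevity.

#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>

// Toy cache: Insert() takes ownership of `value` only when it succeeds and
// returns false when the fixed capacity would be exceeded.
class ToyCache {
 public:
  explicit ToyCache(size_t capacity_bytes) : capacity_(capacity_bytes) {}
  bool Insert(const std::string& key, std::string* value, size_t charge) {
    if (used_ + charge > capacity_) {
      return false;  // caller keeps ownership of `value`
    }
    used_ += charge;
    map_[key].reset(value);  // cache now owns the value
    return true;
  }

 private:
  size_t capacity_;
  size_t used_ = 0;
  std::unordered_map<std::string, std::unique_ptr<std::string>> map_;
};

// Mirrors the pattern in the row-cache insert above: on failure the caller
// still owns row_ptr and must free it.
void PutReplayLog(ToyCache& cache, const std::string& key, std::string&& entry) {
  size_t charge = entry.capacity() + sizeof(std::string);
  auto* row_ptr = new std::string(std::move(entry));
  if (!cache.Insert(key, row_ptr, charge)) {
    delete row_ptr;
  }
}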
@@ -584,8 +607,9 @@ Status TableCache::GetTableProperties( } TypedHandle* table_handle = nullptr; - Status s = FindTable(ReadOptions(), file_options, internal_comparator, - file_meta, &table_handle, prefix_extractor, no_io); + Status s = FindTable(read_options, file_options, internal_comparator, + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, no_io); if (!s.ok()) { return s; } @@ -598,12 +622,14 @@ Status TableCache::GetTableProperties( Status TableCache::ApproximateKeyAnchors( const ReadOptions& ro, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, std::vector& anchors) { + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, + std::vector& anchors) { Status s; TableReader* t = file_meta.fd.table_reader; TypedHandle* handle = nullptr; if (t == nullptr) { - s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle); + s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle, + block_protection_bytes_per_key); if (s.ok()) { t = cache_.Value(handle); } @@ -618,9 +644,9 @@ Status TableCache::ApproximateKeyAnchors( } size_t TableCache::GetMemoryUsageByTableReader( - const FileOptions& file_options, + const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { auto table_reader = file_meta.fd.table_reader; // table already been pre-loaded? @@ -629,8 +655,9 @@ size_t TableCache::GetMemoryUsageByTableReader( } TypedHandle* table_handle = nullptr; - Status s = FindTable(ReadOptions(), file_options, internal_comparator, - file_meta, &table_handle, prefix_extractor, true); + Status s = FindTable(read_options, file_options, internal_comparator, + file_meta, &table_handle, block_protection_bytes_per_key, + prefix_extractor, true /* no_io */); if (!s.ok()) { return 0; } @@ -646,25 +673,26 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { } uint64_t TableCache::ApproximateOffsetOf( - const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { - const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = - FindTable(ReadOptions(), file_options_, internal_comparator, file_meta, - &table_handle, prefix_extractor, false /* no_io */, - !for_compaction /* record_read_stats */); + FindTable(read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, + prefix_extractor, false /* no_io */); if (s.ok()) { table_reader = cache_.Value(table_handle); } } if (table_reader != nullptr) { - result = table_reader->ApproximateOffsetOf(key, caller); + result = table_reader->ApproximateOffsetOf(read_options, key, caller); } if (table_handle != nullptr) { cache_.Release(table_handle); @@ -674,25 +702,26 @@ uint64_t TableCache::ApproximateOffsetOf( } uint64_t TableCache::ApproximateSize( - const Slice& start, const Slice& end, const FileMetaData& file_meta, - TableReaderCaller caller, const InternalKeyComparator& 
internal_comparator, + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; TypedHandle* table_handle = nullptr; if (table_reader == nullptr) { - const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = - FindTable(ReadOptions(), file_options_, internal_comparator, file_meta, - &table_handle, prefix_extractor, false /* no_io */, - !for_compaction /* record_read_stats */); + FindTable(read_options, file_options_, internal_comparator, file_meta, + &table_handle, block_protection_bytes_per_key, + prefix_extractor, false /* no_io */); if (s.ok()) { table_reader = cache_.Value(table_handle); } } if (table_reader != nullptr) { - result = table_reader->ApproximateSize(start, end, caller); + result = table_reader->ApproximateSize(read_options, start, end, caller); } if (table_handle != nullptr) { cache_.Release(table_handle); diff --git a/db/table_cache.h b/db/table_cache.h index 66282bf41f04..5b056f9a9f88 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -86,6 +86,8 @@ class TableCache { // not cached), depending on the CF options // @param skip_filters Disables loading/accessing the filter block // @param level The level this table is at, -1 for "not set / don't know" + // @param range_del_read_seqno If non-nullptr, will be used to create + // *range_del_iter. InternalIterator* NewIterator( const ReadOptions& options, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, @@ -96,6 +98,8 @@ class TableCache { size_t max_file_size_for_l0_meta_pin, const InternalKey* smallest_compaction_key, const InternalKey* largest_compaction_key, bool allow_unprepared_value, + uint8_t protection_bytes_per_key, + const SequenceNumber* range_del_read_seqno = nullptr, TruncatedRangeDelIterator** range_del_iter = nullptr); // If a seek to internal key "k" in specified file finds an entry, @@ -112,6 +116,7 @@ class TableCache { const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const Slice& k, GetContext* get_context, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, int level = -1, size_t max_file_size_for_l0_meta_pin = 0); @@ -121,7 +126,7 @@ class TableCache { Status GetRangeTombstoneIterator( const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, std::unique_ptr* out_iter); // Call table reader's MultiGetFilter to use the bloom filter to filter out @@ -135,7 +140,8 @@ class TableCache { const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, int level, - MultiGetContext::Range* mget_range, TypedHandle** table_handle); + MultiGetContext::Range* mget_range, TypedHandle** table_handle, + uint8_t block_protection_bytes_per_key); // If a seek to internal key "k" in specified file finds an entry, // call get_context->SaveValue() repeatedly until @@ -150,6 +156,7 @@ class TableCache { Status, MultiGet, const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* 
mget_range, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, bool skip_range_deletions = false, int level = -1, @@ -165,10 +172,11 @@ class TableCache { const ReadOptions& ro, const FileOptions& toptions, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, TypedHandle**, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, - const bool no_io = false, bool record_read_stats = true, - HistogramImpl* file_read_hist = nullptr, bool skip_filters = false, - int level = -1, bool prefetch_index_and_filter_in_cache = true, + const bool no_io = false, HistogramImpl* file_read_hist = nullptr, + bool skip_filters = false, int level = -1, + bool prefetch_index_and_filter_in_cache = true, size_t max_file_size_for_l0_meta_pin = 0, Temperature file_temperature = Temperature::kUnknown); @@ -179,38 +187,43 @@ class TableCache { // return Status::Incomplete() if table is not present in cache and // we set `no_io` to be true. Status GetTableProperties( - const FileOptions& toptions, + const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr, bool no_io = false); Status ApproximateKeyAnchors(const ReadOptions& ro, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, + uint8_t block_protection_bytes_per_key, std::vector& anchors); // Return total memory usage of the table reader of the file. // 0 if table reader of the file is not loaded. size_t GetMemoryUsageByTableReader( - const FileOptions& toptions, + const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileMetaData& file_meta, uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. uint64_t ApproximateOffsetOf( - const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file // represented by fd (the start key must not be greater than the end key). 
uint64_t ApproximateSize( - const Slice& start, const Slice& end, const FileMetaData& file_meta, - TableReaderCaller caller, + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor = nullptr); CacheInterface& get_cache() { return cache_; } @@ -233,7 +246,7 @@ class TableCache { const ReadOptions& ro, const FileOptions& file_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, bool sequential_mode, - bool record_read_stats, HistogramImpl* file_read_hist, + uint8_t block_protection_bytes_per_key, HistogramImpl* file_read_hist, std::unique_ptr* table_reader, const std::shared_ptr& prefix_extractor = nullptr, bool skip_filters = false, int level = -1, @@ -249,15 +262,18 @@ class TableCache { // Create a key prefix for looking up the row cache. The prefix is of the // format row_cache_id + fd_number + seq_no. Later, the user key can be // appended to form the full key - void CreateRowCacheKeyPrefix(const ReadOptions& options, - const FileDescriptor& fd, - const Slice& internal_key, - GetContext* get_context, IterKey& row_cache_key); + // Return the sequence number that determines the visibility of row_cache_key + uint64_t CreateRowCacheKeyPrefix(const ReadOptions& options, + const FileDescriptor& fd, + const Slice& internal_key, + GetContext* get_context, + IterKey& row_cache_key); // Helper function to lookup the row cache for a key. It appends the // user key to row_cache_key at offset prefix_size bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key, - size_t prefix_size, GetContext* get_context); + size_t prefix_size, GetContext* get_context, + SequenceNumber seq_no = kMaxSequenceNumber); const ImmutableOptions& ioptions_; const FileOptions& file_options_; @@ -265,7 +281,7 @@ class TableCache { std::string row_cache_id_; bool immortal_tables_; BlockCacheTracer* const block_cache_tracer_; - Striped loader_mutex_; + Striped> loader_mutex_; std::shared_ptr io_tracer_; std::string db_session_id_; }; diff --git a/db/table_cache_sync_and_async.h b/db/table_cache_sync_and_async.h index 9043ec8363c3..8ff03ec50159 100644 --- a/db/table_cache_sync_and_async.h +++ b/db/table_cache_sync_and_async.h @@ -17,6 +17,7 @@ namespace ROCKSDB_NAMESPACE { DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) (const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const MultiGetContext::Range* mget_range, + uint8_t block_protection_bytes_per_key, const std::shared_ptr& prefix_extractor, HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions, int level, TypedHandle* handle) { @@ -28,7 +29,6 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) if (handle != nullptr && t == nullptr) { t = cache_.Value(handle); } -#ifndef ROCKSDB_LITE autovector row_cache_entries; IterKey row_cache_key; size_t row_cache_key_prefix_size = 0; @@ -59,7 +59,6 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) } } } -#endif // ROCKSDB_LITE // Check that table_range is not empty. 
Its possible all keys may have been // found in the row cache and thus the range may now be empty @@ -67,10 +66,10 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) if (t == nullptr) { assert(handle == nullptr); s = FindTable(options, file_options_, internal_comparator, file_meta, - &handle, prefix_extractor, + &handle, block_protection_bytes_per_key, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters, - level, true /* prefetch_index_and_filter_in_cache */, + file_read_hist, skip_filters, level, + true /* prefetch_index_and_filter_in_cache */, 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature); TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { @@ -96,7 +95,6 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) } } -#ifndef ROCKSDB_LITE if (lookup_row_cache) { size_t row_idx = 0; RowCacheInterface row_cache{ioptions_.row_cache.get()}; @@ -122,7 +120,6 @@ DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet) } } } -#endif // ROCKSDB_LITE if (handle != nullptr) { cache_.Release(handle); diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 5f0f205da1c0..437b7e30903f 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -292,8 +292,9 @@ void TestCustomizedTablePropertiesCollector( new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; + const ReadOptions read_options; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), - magic_number, ioptions, &props); + magic_number, ioptions, read_options, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; @@ -355,7 +356,6 @@ TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) { kBlockBasedTableMagicNumber, encode_as_internal, options, ikc); -#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite // test plain table PlainTableOptions plain_table_options; plain_table_options.user_key_len = 8; @@ -367,7 +367,6 @@ TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) { TestCustomizedTablePropertiesCollector(backward_mode_, kPlainTableMagicNumber, encode_as_internal, options, ikc); -#endif // !ROCKSDB_LITE } } @@ -431,8 +430,10 @@ void TestInternalKeyPropertiesCollector( new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; - Status s = ReadTableProperties(reader.get(), fwf->contents().size(), - magic_number, ioptions, &props); + const ReadOptions read_options; + Status s = + ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, + ioptions, read_options, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; @@ -486,7 +487,6 @@ TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) { std::make_shared()); } -#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite PlainTableOptions plain_table_options; plain_table_options.user_key_len = 8; plain_table_options.bloom_bits_per_key = 8; @@ -495,7 +495,6 @@ TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) { TestInternalKeyPropertiesCollector( backward_mode_, kPlainTableMagicNumber, false /* not sanitize */, std::make_shared(plain_table_options)); -#endif // !ROCKSDB_LITE } INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest, diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 3878b428aa61..8841b8cf3b22 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ 
-3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/transaction_log_impl.h" @@ -295,4 +294,3 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) { return Status::OK(); } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index e8c6efc02e4b..6568de23f6c2 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include "db/log_reader.h" @@ -127,4 +126,3 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { Status OpenLogReader(const LogFile* file); }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/db/version_builder.cc b/db/version_builder.cc index 4f0e3a8413c0..210b0de86946 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1257,7 +1257,8 @@ class VersionBuilder::Rep { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { assert(table_cache_ != nullptr); size_t table_cache_capacity = @@ -1324,9 +1325,9 @@ class VersionBuilder::Rep { int level = files_meta[file_idx].second; TableCache::TypedHandle* handle = nullptr; statuses[file_idx] = table_cache_->FindTable( - ReadOptions(), file_options_, + read_options, file_options_, *(base_vstorage_->InternalComparator()), *file_meta, &handle, - prefix_extractor, false /*no_io */, true /* record_read_stats */, + block_protection_bytes_per_key, prefix_extractor, false /*no_io */, internal_stats->GetFileReadHist(level), false, level, prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin, file_meta->temperature); @@ -1384,10 +1385,12 @@ Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key) { return rep_->LoadTableHandlers( internal_stats, max_threads, prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); + is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin, + read_options, block_protection_bytes_per_key); } uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { diff --git a/db/version_builder.h b/db/version_builder.h index 682d60524293..fb2a304a8439 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -48,7 +48,8 @@ class VersionBuilder { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options, + uint8_t block_protection_bytes_per_key); uint64_t GetMinOldestBlobFileNumber() const; private: diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 611dee774b03..2ca10c449cee 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -37,7 +37,9 @@ class VersionBuilderTest : public 
testing::Test { ioptions_(options_), mutable_cf_options_(options_), vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, - nullptr, false), + nullptr, false, EpochNumberRequirement::kMustPresent, + ioptions_.clock, options_.bottommost_file_compaction_delay, + OffpeakTimeOption(options_.daily_offpeak_time_utc)), file_num_(1) { mutable_cf_options_.RefreshDerivedOptions(ioptions_); size_being_compacted_.resize(options_.num_levels); @@ -73,7 +75,8 @@ class VersionBuilderTest : public testing::Test { /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -130,13 +133,13 @@ class VersionBuilderTest : public testing::Test { constexpr SequenceNumber largest_seqno = 300; constexpr bool marked_for_compaction = false; - edit->AddFile(level, table_file_number, path_id, file_size, - GetInternalKey(smallest), GetInternalKey(largest), - smallest_seqno, largest_seqno, marked_for_compaction, - Temperature::kUnknown, blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit->AddFile( + level, table_file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, + 0, 0, /* user_defined_timestamps_persisted */ true); } void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { @@ -187,7 +190,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -197,8 +201,10 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { VersionBuilder version_builder(env_options, &ioptions_, table_cache, &vstorage_, version_set); - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, false); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder.Apply(&version_edit)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -234,7 +240,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); 
version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -246,8 +253,10 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { VersionBuilder version_builder(env_options, &ioptions_, table_cache, &vstorage_, version_set); - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, false); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder.Apply(&version_edit)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -285,7 +294,8 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -299,8 +309,10 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { VersionBuilder version_builder(env_options, &ioptions_, table_cache, &vstorage_, version_set); - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, false); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder.Apply(&version_edit)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -321,27 +333,32 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 
0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; @@ -350,8 +367,10 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { VersionBuilder version_builder(env_options, &ioptions_, table_cache, &vstorage_, version_set); - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, false); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder.Apply(&version_edit)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -372,35 +391,42 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { VersionBuilder version_builder(env_options, &ioptions_, table_cache, &vstorage_, version_set); - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, false); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, false, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); VersionEdit version_edit; version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit.AddFile( 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, 
kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; @@ -408,14 +434,16 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile( 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); ASSERT_OK(version_builder.Apply(&version_edit2)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -526,14 +554,16 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); ASSERT_OK(builder.Apply(&addition)); constexpr bool force_consistency_checks = false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -575,7 +605,8 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); const Status s = builder.Apply(&edit); ASSERT_TRUE(s.IsCorruption()); @@ -611,7 +642,8 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); ASSERT_OK(builder.Apply(&edit)); @@ -624,7 +656,8 @@ 
TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); const Status s = builder.Apply(&other_edit); ASSERT_TRUE(s.IsCorruption()); @@ -660,7 +693,8 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, + 0, /* user_defined_timestamps_persisted */ true); ASSERT_OK(builder.Apply(&addition)); @@ -671,9 +705,10 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { ASSERT_OK(builder.Apply(&deletion)); constexpr bool force_consistency_checks = false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -715,9 +750,10 @@ TEST_F(VersionBuilderTest, ApplyBlobFileAddition) { ASSERT_OK(builder.Apply(&edit)); constexpr bool force_consistency_checks = false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -854,9 +890,10 @@ TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) { ASSERT_OK(builder.Apply(&edit)); constexpr bool force_consistency_checks = false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -927,9 +964,10 @@ TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) { ASSERT_OK(builder.Apply(&garbage)); constexpr bool force_consistency_checks = false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -1107,9 +1145,10 @@ TEST_F(VersionBuilderTest, SaveBlobFilesTo) { ASSERT_OK(builder.Apply(&edit)); constexpr bool force_consistency_checks = 
false; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -1155,9 +1194,10 @@ TEST_F(VersionBuilderTest, SaveBlobFilesTo) { ASSERT_OK(second_builder.Apply(&second_edit)); - VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &new_vstorage, - force_consistency_checks); + VersionStorageInfo newer_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &new_vstorage, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(second_builder.SaveTo(&newer_vstorage)); @@ -1233,16 +1273,18 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/, - checksum_value, checksum_method, kNullUniqueId64x2, 0); + checksum_value, checksum_method, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); ASSERT_OK(builder.Apply(&edit)); constexpr bool force_consistency_checks = true; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -1321,7 +1363,8 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1331,7 +1374,8 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1341,9 +1385,10 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { // Save to a new version in order to trigger consistency checks. 
constexpr bool force_consistency_checks = true; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -1380,9 +1425,10 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) { // Save to a new version in order to trigger consistency checks. constexpr bool force_consistency_checks = true; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); const Status s = builder.SaveTo(&new_vstorage); ASSERT_TRUE(s.IsCorruption()); @@ -1421,9 +1467,10 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) { // Save to a new version in order to trigger consistency checks. constexpr bool force_consistency_checks = true; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); const Status s = builder.SaveTo(&new_vstorage); ASSERT_TRUE(s.IsCorruption()); @@ -1470,9 +1517,10 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) { // Save to a new version in order to trigger consistency checks. constexpr bool force_consistency_checks = true; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); const Status s = builder.SaveTo(&new_vstorage); ASSERT_TRUE(s.IsCorruption()); @@ -1552,7 +1600,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); // Add an SST that does not reference any blob files. edit.AddFile( @@ -1562,7 +1611,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2200, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); // Delete a file that references a blob file. 
edit.DeleteFile(/* level */ 1, /* file_number */ 6); @@ -1585,7 +1635,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1597,7 +1648,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. @@ -1611,7 +1663,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); VersionEdit edit2; @@ -1628,9 +1681,10 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { ASSERT_OK(builder.Apply(&edit2)); constexpr bool force_consistency_checks = true; - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, &vstorage_, - force_consistency_checks); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, &vstorage_, + force_consistency_checks, EpochNumberRequirement::kMightMissing, nullptr, + 0, OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(builder.SaveTo(&new_vstorage)); @@ -1679,9 +1733,11 @@ TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { VersionBuilder version_builder(env_options, &ioptions_, table_cache, &vstorage_, version_set); - VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, - true /* force_consistency_checks */); + VersionStorageInfo new_vstorage( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder.Apply(&version_edit)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -1689,9 +1745,11 @@ TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { VersionBuilder version_builder2(env_options, &ioptions_, table_cache, &new_vstorage, version_set); - VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels, - kCompactionStyleLevel, nullptr, - true /* force_consistency_checks */); + VersionStorageInfo new_vstorage2( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, nullptr, + true /* force_consistency_checks */, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_NOK(version_builder2.Apply(&version_edit)); UnrefFilesInVersion(&new_vstorage); @@ -1712,7 +1770,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 
/* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, /* user_defined_timestamps_persisted */ true); version_edit_1.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1722,14 +1780,16 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, /* user_defined_timestamps_persisted */ true); VersionBuilder version_builder_1(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, nullptr /* file_metadata_cache_res_mgr */); VersionStorageInfo new_vstorage_1( &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, - nullptr /* src_vstorage */, true /* force_consistency_checks */); + nullptr /* src_vstorage */, true /* force_consistency_checks */, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder_1.Apply(&version_edit_1)); s = version_builder_1.SaveTo(&new_vstorage_1); @@ -1749,7 +1809,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, /* user_defined_timestamps_persisted */ true); version_edit_2.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1759,14 +1819,16 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, /* user_defined_timestamps_persisted */ true); VersionBuilder version_builder_2(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, nullptr /* file_metadata_cache_res_mgr */); VersionStorageInfo new_vstorage_2( &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, - nullptr /* src_vstorage */, true /* force_consistency_checks */); + nullptr /* src_vstorage */, true /* force_consistency_checks */, + EpochNumberRequirement::kMightMissing, nullptr, 0, + OffpeakTimeOption(options_.daily_offpeak_time_utc)); ASSERT_OK(version_builder_2.Apply(&version_edit_2)); s = version_builder_2.SaveTo(&new_vstorage_2); diff --git a/db/version_edit.cc b/db/version_edit.cc index e751353315ac..f6de9779fb6f 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -97,12 +97,14 @@ void VersionEdit::Clear() { full_history_ts_low_.clear(); } -bool VersionEdit::EncodeTo(std::string* dst) const { +bool VersionEdit::EncodeTo(std::string* dst, + std::optional ts_sz) const { if (has_db_id_) { PutVarint32(dst, kDbId); PutLengthPrefixedSlice(dst, db_id_); } if (has_comparator_) { + assert(has_persist_user_defined_timestamps_); PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); } @@ -145,6 +147,8 @@ bool VersionEdit::EncodeTo(std::string* dst) const { } bool min_log_num_written = false; + + assert(new_files_.empty() || ts_sz.has_value()); for (size_t i = 0; i < new_files_.size(); i++) { const 
FileMetaData& f = new_files_[i].second; if (!f.smallest.Valid() || !f.largest.Valid() || @@ -154,8 +158,7 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, kNewFile4); PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber()); PutVarint64(dst, f.fd.GetFileSize()); - PutLengthPrefixedSlice(dst, f.smallest.Encode()); - PutLengthPrefixedSlice(dst, f.largest.Encode()); + EncodeFileBoundaries(dst, f, ts_sz.value()); PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno); // Customized fields' format: // +-----------------------------+ @@ -250,6 +253,20 @@ bool VersionEdit::EncodeTo(std::string* dst) const { f.compensated_range_deletion_size); PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); } + if (f.tail_size) { + PutVarint32(dst, NewFileCustomTag::kTailSize); + std::string varint_tail_size; + PutVarint64(&varint_tail_size, f.tail_size); + PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); + } + if (!f.user_defined_timestamps_persisted) { + // The default value for the flag is true, it's only explicitly persisted + // when it's false. We are putting 0 as the value here to signal false + // (i.e. UDTS not persisted). + PutVarint32(dst, NewFileCustomTag::kUserDefinedTimestampsPersisted); + char p = static_cast(0); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); @@ -304,6 +321,15 @@ bool VersionEdit::EncodeTo(std::string* dst) const { PutVarint32(dst, kFullHistoryTsLow); PutLengthPrefixedSlice(dst, full_history_ts_low_); } + + if (HasPersistUserDefinedTimestamps()) { + // persist_user_defined_timestamps flag should be logged in the same + // VersionEdit as the user comparator name. + assert(has_comparator_); + PutVarint32(dst, kPersistUserDefinedTimestamps); + char p = static_cast(persist_user_defined_timestamps_); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } return true; } @@ -428,6 +454,17 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "Invalid compensated range deletion size"; } break; + case kTailSize: + if (!GetVarint64(&field, &f.tail_size)) { + return "invalid tail start offset"; + } + break; + case kUserDefinedTimestampsPersisted: + if (field.size() != 1) { + return "user-defined timestamps persisted field wrong size"; + } + f.user_defined_timestamps_persisted = (field[0] == 1); + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -445,6 +482,23 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return nullptr; } +void VersionEdit::EncodeFileBoundaries(std::string* dst, + const FileMetaData& meta, + size_t ts_sz) const { + if (ts_sz == 0 || meta.user_defined_timestamps_persisted) { + PutLengthPrefixedSlice(dst, meta.smallest.Encode()); + PutLengthPrefixedSlice(dst, meta.largest.Encode()); + return; + } + std::string smallest_buf; + std::string largest_buf; + StripTimestampFromInternalKey(&smallest_buf, meta.smallest.Encode(), ts_sz); + StripTimestampFromInternalKey(&largest_buf, meta.largest.Encode(), ts_sz); + PutLengthPrefixedSlice(dst, smallest_buf); + PutLengthPrefixedSlice(dst, largest_buf); + return; +}; + Status VersionEdit::DecodeFrom(const Slice& src) { Clear(); #ifndef NDEBUG @@ -762,6 +816,17 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; + case kPersistUserDefinedTimestamps: + if (!GetLengthPrefixedSlice(&input, &str)) { + msg = "persist_user_defined_timestamps"; + } else if (str.size() != 1) { + msg 
= "persist_user_defined_timestamps field wrong size"; + } else { + persist_user_defined_timestamps_ = (str[0] == 1); + has_persist_user_defined_timestamps_ = true; + } + break; + default: if (tag & kTagSafeIgnoreMask) { // Tag from future which can be safely ignored. @@ -804,6 +869,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n Comparator: "); r.append(comparator_); } + if (has_persist_user_defined_timestamps_) { + r.append("\n PersistUserDefinedTimestamps: "); + r.append(persist_user_defined_timestamps_ ? "true" : "false"); + } if (has_log_number_) { r.append("\n LogNumber: "); AppendNumberTo(&r, log_number_); @@ -888,6 +957,10 @@ std::string VersionEdit::DebugString(bool hex_key) const { InternalUniqueIdToExternal(&id); r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id))); } + r.append(" tail size: "); + AppendNumberTo(&r, f.tail_size); + r.append(" User-defined timestamps persisted: "); + r.append(f.user_defined_timestamps_persisted ? "true" : "false"); } for (const auto& blob_file_addition : blob_file_additions_) { @@ -1009,6 +1082,9 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { // permanent jw << "Temperature" << static_cast(f.temperature); } + jw << "TailSize" << f.tail_size; + jw << "UserDefinedTimestampsPersisted" + << f.user_defined_timestamps_persisted; jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index 24938de0bbd2..8d648134f62e 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include #include #include @@ -20,8 +21,8 @@ #include "db/wal_edit.h" #include "memory/arena.h" #include "port/malloc.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/advanced_options.h" -#include "rocksdb/cache.h" #include "table/table_reader.h" #include "table/unique_id_impl.h" #include "util/autovector.h" @@ -74,6 +75,7 @@ enum Tag : uint32_t { kFullHistoryTsLow, kWalAddition2, kWalDeletion2, + kPersistUserDefinedTimestamps, }; enum NewFileCustomTag : uint32_t { @@ -94,6 +96,8 @@ enum NewFileCustomTag : uint32_t { kUniqueId = 12, kEpochNumber = 13, kCompensatedRangeDeletionSize = 14, + kTailSize = 15, + kUserDefinedTimestampsPersisted = 16, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -193,7 +197,8 @@ struct FileMetaData { uint64_t compensated_file_size = 0; // These values can mutate, but they can only be read or written from // single-threaded LogAndApply thread - uint64_t num_entries = 0; // the number of entries. + uint64_t num_entries = + 0; // The number of entries, including deletions and range deletions. // The number of deletion entries, including range deletions. uint64_t num_deletions = 0; uint64_t raw_key_size = 0; // total uncompressed key size. @@ -218,10 +223,16 @@ struct FileMetaData { // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. uint64_t oldest_blob_file_number = kInvalidBlobFileNumber; - // The file could be the compaction output from other SST files, which could - // in turn be outputs for compact older SST files. We track the memtable - // flush timestamp for the oldest SST file that eventually contribute data - // to this file. 0 means the information is not available. + // For flush output file, oldest ancestor time is the oldest key time in the + // file. If the oldest key time is not available, flush time is used. 
+ // + // For compaction output file, oldest ancestor time is the oldest + // among all the oldest key time of its input files, since the file could be + // the compaction output from other SST files, which could in turn be outputs + // for compact older SST files. If that's not available, creation time of this + // compaction output file is used. + // + // 0 means the information is not available. uint64_t oldest_ancester_time = kUnknownOldestAncesterTime; // Unix time when the SST file is created. @@ -242,6 +253,15 @@ struct FileMetaData { // SST unique id UniqueId64x2 unique_id{}; + // Size of the "tail" part of a SST file + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_size = 0; + + // Value of the `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` + // flag when the file is created. Default to true, only when this flag is + // false, it's explicitly written to Manifest. + bool user_defined_timestamps_persisted = true; + FileMetaData() = default; FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, @@ -253,7 +273,8 @@ struct FileMetaData { uint64_t _epoch_number, const std::string& _file_checksum, const std::string& _file_checksum_func_name, UniqueId64x2 _unique_id, - const uint64_t _compensated_range_deletion_size) + const uint64_t _compensated_range_deletion_size, + uint64_t _tail_size, bool _user_defined_timestamps_persisted) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -266,7 +287,9 @@ struct FileMetaData { epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), - unique_id(std::move(_unique_id)) { + unique_id(std::move(_unique_id)), + tail_size(_tail_size), + user_defined_timestamps_persisted(_user_defined_timestamps_persisted) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -386,6 +409,17 @@ class VersionEdit { bool HasComparatorName() const { return has_comparator_; } const std::string& GetComparatorName() const { return comparator_; } + void SetPersistUserDefinedTimestamps(bool persist_user_defined_timestamps) { + has_persist_user_defined_timestamps_ = true; + persist_user_defined_timestamps_ = persist_user_defined_timestamps; + } + bool HasPersistUserDefinedTimestamps() const { + return has_persist_user_defined_timestamps_; + } + bool GetPersistUserDefinedTimestamps() const { + return persist_user_defined_timestamps_; + } + void SetLogNumber(uint64_t num) { has_log_number_ = true; log_number_ = num; @@ -478,7 +512,8 @@ class VersionEdit { uint64_t epoch_number, const std::string& file_checksum, const std::string& file_checksum_func_name, const UniqueId64x2& unique_id, - const uint64_t compensated_range_deletion_size) { + const uint64_t compensated_range_deletion_size, + uint64_t tail_size, bool user_defined_timestamps_persisted) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, @@ -487,7 +522,9 @@ class VersionEdit { temperature, oldest_blob_file_number, oldest_ancester_time, file_creation_time, epoch_number, file_checksum, file_checksum_func_name, unique_id, - compensated_range_deletion_size)); + compensated_range_deletion_size, tail_size, + user_defined_timestamps_persisted)); + files_to_quarantine_.push_back(file); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } @@ -496,6 +533,7 @@ class VersionEdit { void AddFile(int level, const FileMetaData& f) { assert(f.fd.smallest_seqno <= 
f.fd.largest_seqno); new_files_.emplace_back(level, f); + files_to_quarantine_.push_back(f.fd.GetNumber()); if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) { SetLastSequence(f.fd.largest_seqno); } @@ -505,6 +543,8 @@ class VersionEdit { using NewFiles = std::vector>; const NewFiles& GetNewFiles() const { return new_files_; } + NewFiles& GetMutableNewFiles() { return new_files_; } + // Retrieve all the compact cursors using CompactCursors = std::vector>; const CompactCursors& GetCompactCursors() const { return compact_cursors_; } @@ -530,10 +570,13 @@ class VersionEdit { blob_file_additions_.emplace_back( blob_file_number, total_blob_count, total_blob_bytes, std::move(checksum_method), std::move(checksum_value)); + files_to_quarantine_.push_back(blob_file_number); } void AddBlobFile(BlobFileAddition blob_file_addition) { blob_file_additions_.emplace_back(std::move(blob_file_addition)); + files_to_quarantine_.push_back( + blob_file_additions_.back().GetBlobFileNumber()); } // Retrieve all the blob files added. @@ -545,6 +588,11 @@ class VersionEdit { void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) { assert(blob_file_additions_.empty()); blob_file_additions_ = std::move(blob_file_additions); + std::for_each( + blob_file_additions_.begin(), blob_file_additions_.end(), + [&](const BlobFileAddition& blob_file) { + files_to_quarantine_.push_back(blob_file.GetBlobFileNumber()); + }); } // Add garbage for an existing blob file. Note: intentionally broken English @@ -612,6 +660,8 @@ class VersionEdit { } uint32_t GetColumnFamily() const { return column_family_; } + const std::string& GetColumnFamilyName() const { return column_family_name_; } + // set column family ID by calling SetColumnFamily() void AddColumnFamily(const std::string& name) { assert(!is_column_family_drop_); @@ -643,6 +693,9 @@ class VersionEdit { remaining_entries_ = remaining_entries; } bool IsInAtomicGroup() const { return is_in_atomic_group_; } + void SetRemainingEntries(uint32_t remaining_entries) { + remaining_entries_ = remaining_entries; + } uint32_t GetRemainingEntries() const { return remaining_entries_; } bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); } @@ -656,7 +709,17 @@ class VersionEdit { } // return true on success. - bool EncodeTo(std::string* dst) const; + // `ts_sz` is the size in bytes for the user-defined timestamp contained in + // a user key. This argument is optional because it's only required for + // encoding a `VersionEdit` with new SST files to add. It's used to handle the + // file boundaries: `smallest`, `largest` when + // `FileMetaData.user_defined_timestamps_persisted` is false. When reading + // the Manifest file, a mirroring change needed to handle + // file boundaries are not added to the `VersionEdit.DecodeFrom` function + // because timestamp size is not available at `VersionEdit` decoding time, + // it's instead added to `VersionEditHandler::OnNonCfOperation`. 
+ bool EncodeTo(std::string* dst, + std::optional ts_sz = std::nullopt) const; Status DecodeFrom(const Slice& src); bool GetNextFileNumber(uint64_t* result) const { @@ -666,24 +729,24 @@ class VersionEdit { return has_next_file_number_; } + const autovector* GetFilesToQuarantineIfCommitFail() const { + return &files_to_quarantine_; + } + std::string DebugString(bool hex_key = false) const; std::string DebugJSON(int edit_num, bool hex_key = false) const; private: - friend class ReactiveVersionSet; - friend class VersionEditHandlerBase; - friend class ListColumnFamiliesHandler; - friend class VersionEditHandler; - friend class VersionEditHandlerPointInTime; - friend class DumpManifestHandler; - friend class VersionSet; - friend class Version; - friend class AtomicGroupReadBuffer; - bool GetLevel(Slice* input, int* level, const char** msg); const char* DecodeNewFile4From(Slice* input); + // Encode file boundaries `FileMetaData.smallest` and `FileMetaData.largest`. + // User-defined timestamps in the user key will be stripped if they shouldn't + // be persisted. + void EncodeFileBoundaries(std::string* dst, const FileMetaData& meta, + size_t ts_sz) const; + int max_level_ = 0; std::string db_id_; std::string comparator_; @@ -706,6 +769,7 @@ class VersionEdit { bool has_max_column_family_ = false; bool has_min_log_number_to_keep_ = false; bool has_last_sequence_ = false; + bool has_persist_user_defined_timestamps_ = false; // Compaction cursors for round-robin compaction policy CompactCursors compact_cursors_; @@ -733,6 +797,17 @@ class VersionEdit { uint32_t remaining_entries_ = 0; std::string full_history_ts_low_; + bool persist_user_defined_timestamps_ = true; + + // Newly created table files and blob files are eligible for deletion if they + // are not registered as live files after the background jobs creating them + // have finished. In case committing the VersionEdit containing such changes + // to manifest encountered an error, we want to quarantine these files from + // deletion to avoid prematurely deleting files that ended up getting recorded + // in Manifest as live files. + // Since table files and blob files share the same file number space, we just + // record the file number here. 
+ autovector files_to_quarantine_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 1e0a934eb857..af337c929d78 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -17,6 +17,7 @@ #include "db/version_edit.h" #include "logging/logging.h" #include "monitoring/persistent_stats_history.h" +#include "util/udt_util.h" namespace ROCKSDB_NAMESPACE { @@ -27,7 +28,7 @@ void VersionEditHandlerBase::Iterate(log::Reader& reader, assert(log_read_status); assert(log_read_status->ok()); - size_t recovered_edits = 0; + [[maybe_unused]] size_t recovered_edits = 0; Status s = Initialize(); while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() && reader.ReadRecord(&record, &scratch) && log_read_status->ok()) { @@ -42,7 +43,7 @@ void VersionEditHandlerBase::Iterate(log::Reader& reader, break; } ColumnFamilyData* cfd = nullptr; - if (edit.is_in_atomic_group_) { + if (edit.IsInAtomicGroup()) { if (read_buffer_.IsFull()) { for (auto& e : read_buffer_.replay_buffer()) { s = ApplyVersionEdit(e, &cfd); @@ -86,7 +87,8 @@ void VersionEditHandlerBase::Iterate(log::Reader& reader, message << ' '; } // append the filename to the corruption message - message << "in file " << reader.file()->file_name(); + message << " The file " << reader.file()->file_name() + << " may be corrupted."; // overwrite the status with the extended status s = Status(s.code(), s.subcode(), s.severity(), message.str()); } @@ -99,20 +101,18 @@ void VersionEditHandlerBase::Iterate(log::Reader& reader, Status ListColumnFamiliesHandler::ApplyVersionEdit( VersionEdit& edit, ColumnFamilyData** /*unused*/) { Status s; - if (edit.is_column_family_add_) { - if (column_family_names_.find(edit.column_family_) != - column_family_names_.end()) { + uint32_t cf_id = edit.GetColumnFamily(); + if (edit.IsColumnFamilyAdd()) { + if (column_family_names_.find(cf_id) != column_family_names_.end()) { s = Status::Corruption("Manifest adding the same column family twice"); } else { - column_family_names_.insert( - {edit.column_family_, edit.column_family_name_}); + column_family_names_.insert({cf_id, edit.GetColumnFamilyName()}); } - } else if (edit.is_column_family_drop_) { - if (column_family_names_.find(edit.column_family_) == - column_family_names_.end()) { + } else if (edit.IsColumnFamilyDrop()) { + if (column_family_names_.find(cf_id) == column_family_names_.end()) { s = Status::Corruption("Manifest - dropping non-existing column family"); } else { - column_family_names_.erase(edit.column_family_); + column_family_names_.erase(cf_id); } } return s; @@ -155,8 +155,9 @@ VersionEditHandler::VersionEditHandler( bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, - bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement) - : VersionEditHandlerBase(), + const ReadOptions& read_options, bool skip_load_table_files, + EpochNumberRequirement epoch_number_requirement) + : VersionEditHandlerBase(read_options), read_only_(read_only), column_families_(std::move(column_families)), version_set_(version_set), @@ -198,9 +199,9 @@ Status VersionEditHandler::Initialize() { Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) { Status s; - if (edit.is_column_family_add_) { + if (edit.IsColumnFamilyAdd()) { s = OnColumnFamilyAdd(edit, cfd); - } else if (edit.is_column_family_drop_) { + } else if (edit.IsColumnFamilyDrop()) { s = 
OnColumnFamilyDrop(edit, cfd); } else if (edit.IsWalAddition()) { s = OnWalAddition(edit); @@ -224,22 +225,22 @@ Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit, assert(cfd != nullptr); *cfd = nullptr; + const std::string& cf_name = edit.GetColumnFamilyName(); Status s; if (cf_in_builders || cf_in_not_found) { s = Status::Corruption("MANIFEST adding the same column family twice: " + - edit.column_family_name_); + cf_name); } if (s.ok()) { - auto cf_options = name_to_options_.find(edit.column_family_name_); + auto cf_options = name_to_options_.find(cf_name); // implicitly add persistent_stats column family without requiring user // to specify ColumnFamilyData* tmp_cfd = nullptr; bool is_persistent_stats_column_family = - edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0; + cf_name.compare(kPersistentStatsColumnFamilyName) == 0; if (cf_options == name_to_options_.end() && !is_persistent_stats_column_family) { - column_families_not_found_.emplace(edit.column_family_, - edit.column_family_name_); + column_families_not_found_.emplace(edit.GetColumnFamily(), cf_name); } else { if (is_persistent_stats_column_family) { ColumnFamilyOptions cfo; @@ -267,7 +268,7 @@ Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit, if (cf_in_builders) { tmp_cfd = DestroyCfAndCleanup(edit); } else if (cf_in_not_found) { - column_families_not_found_.erase(edit.column_family_); + column_families_not_found_.erase(edit.GetColumnFamily()); } else { s = Status::Corruption("MANIFEST - dropping non-existing column family"); } @@ -302,11 +303,22 @@ Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit, } ColumnFamilyData* tmp_cfd = nullptr; if (s.ok()) { - auto builder_iter = builders_.find(edit.column_family_); + auto builder_iter = builders_.find(edit.GetColumnFamily()); assert(builder_iter != builders_.end()); tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily( - edit.column_family_); + edit.GetColumnFamily()); assert(tmp_cfd != nullptr); + // It's important to handle file boundaries before `MaybeCreateVersion` + // because `VersionEditHandlerPointInTime::MaybeCreateVersion` does + // `FileMetaData` verification that involves the file boundaries. + // All `VersionEditHandlerBase` subclasses that need to deal with + // `FileMetaData` for new files are also subclasses of + // `VersionEditHandler`, so it's sufficient to do the file boundaries + // handling in this method. + s = MaybeHandleFileBoundariesForNewFiles(edit, tmp_cfd); + if (!s.ok()) { + return s; + } s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false); if (s.ok()) { s = builder_iter->second->version_builder()->Apply(&edit); @@ -348,11 +360,12 @@ void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit, // record. Once we encounter column family drop record, // we will delete the column family from // column_families_not_found. 
- bool in_not_found = column_families_not_found_.find(edit.column_family_) != + uint32_t cf_id = edit.GetColumnFamily(); + bool in_not_found = column_families_not_found_.find(cf_id) != column_families_not_found_.end(); // in builders means that user supplied that column family // option AND that we encountered column family add record - bool in_builders = builders_.find(edit.column_family_) != builders_.end(); + bool in_builders = builders_.find(cf_id) != builders_.end(); // They cannot both be true assert(!(in_not_found && in_builders)); *cf_in_not_found = in_not_found; @@ -364,17 +377,17 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, assert(s != nullptr); if (!s->ok()) { // Do nothing here. - } else if (!version_edit_params_.has_log_number_ || - !version_edit_params_.has_next_file_number_ || - !version_edit_params_.has_last_sequence_) { + } else if (!version_edit_params_.HasLogNumber() || + !version_edit_params_.HasNextFile() || + !version_edit_params_.HasLastSequence()) { std::string msg("no "); - if (!version_edit_params_.has_log_number_) { + if (!version_edit_params_.HasLogNumber()) { msg.append("log_file_number, "); } - if (!version_edit_params_.has_next_file_number_) { + if (!version_edit_params_.HasNextFile()) { msg.append("next_file_number, "); } - if (!version_edit_params_.has_last_sequence_) { + if (!version_edit_params_.HasLastSequence()) { msg.append("last_sequence, "); } msg = msg.substr(0, msg.size() - 2); @@ -395,11 +408,11 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, } if (s->ok()) { version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily( - version_edit_params_.max_column_family_); + version_edit_params_.GetMaxColumnFamily()); version_set_->MarkMinLogNumberToKeep( - version_edit_params_.min_log_number_to_keep_); - version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); - version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); + version_edit_params_.GetMinLogNumberToKeep()); + version_set_->MarkFileNumberUsed(version_edit_params_.GetPrevLogNumber()); + version_set_->MarkFileNumberUsed(version_edit_params_.GetLogNumber()); for (auto* cfd : *(version_set_->GetColumnFamilySet())) { if (cfd->IsDropped()) { continue; @@ -450,9 +463,9 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, if (s->ok()) { version_set_->manifest_file_size_ = reader.GetReadOffset(); assert(version_set_->manifest_file_size_ > 0); - version_set_->next_file_number_.store( - version_edit_params_.next_file_number_ + 1); - SequenceNumber last_seq = version_edit_params_.last_sequence_; + version_set_->next_file_number_.store(version_edit_params_.GetNextFile() + + 1); + SequenceNumber last_seq = version_edit_params_.GetLastSequence(); assert(last_seq != kMaxSequenceNumber); if (last_seq != kMaxSequenceNumber && last_seq > version_set_->last_allocated_sequence_.load()) { @@ -474,7 +487,6 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, // sequence number zeroed through compaction. 
version_set_->descriptor_last_sequence_ = last_seq; } - version_set_->prev_log_number_ = version_edit_params_.prev_log_number_; if (version_edit_params_.HasManifestUpdateSequence()) { version_set_->manifest_update_sequence_ = version_edit_params_.GetManifestUpdateSequence(); @@ -488,39 +500,40 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, ColumnFamilyData* VersionEditHandler::CreateCfAndInit( const ColumnFamilyOptions& cf_options, const VersionEdit& edit) { - ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit); + uint32_t cf_id = edit.GetColumnFamily(); + ColumnFamilyData* cfd = + version_set_->CreateColumnFamily(cf_options, read_options_, &edit); assert(cfd != nullptr); cfd->set_initialized(); - assert(builders_.find(edit.column_family_) == builders_.end()); - builders_.emplace(edit.column_family_, + assert(builders_.find(cf_id) == builders_.end()); + builders_.emplace(cf_id, VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd))); if (track_missing_files_) { - cf_to_missing_files_.emplace(edit.column_family_, - std::unordered_set()); - cf_to_missing_blob_files_high_.emplace(edit.column_family_, - kInvalidBlobFileNumber); + cf_to_missing_files_.emplace(cf_id, std::unordered_set()); + cf_to_missing_blob_files_high_.emplace(cf_id, kInvalidBlobFileNumber); } return cfd; } ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup( const VersionEdit& edit) { - auto builder_iter = builders_.find(edit.column_family_); + uint32_t cf_id = edit.GetColumnFamily(); + auto builder_iter = builders_.find(cf_id); assert(builder_iter != builders_.end()); builders_.erase(builder_iter); if (track_missing_files_) { - auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_); + auto missing_files_iter = cf_to_missing_files_.find(cf_id); assert(missing_files_iter != cf_to_missing_files_.end()); cf_to_missing_files_.erase(missing_files_iter); auto missing_blob_files_high_iter = - cf_to_missing_blob_files_high_.find(edit.column_family_); + cf_to_missing_blob_files_high_.find(cf_id); assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end()); cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter); } ColumnFamilyData* ret = - version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_); + version_set_->GetColumnFamilySet()->GetColumnFamily(cf_id); assert(ret != nullptr); ret->SetDropped(); ret->UnrefAndTryDelete(); @@ -545,7 +558,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, if (s.ok()) { // Install new version v->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), + *cfd->GetLatestMutableCFOptions(), read_options_, !(version_set_->db_options_->skip_stats_update_on_db_open)); version_set_->AppendVersion(cfd, v); } else { @@ -572,12 +585,13 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, assert(builder_iter->second != nullptr); VersionBuilder* builder = builder_iter->second->version_builder(); assert(builder); + const MutableCFOptions* moptions = cfd->GetLatestMutableCFOptions(); Status s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, prefetch_index_and_filter_in_cache, is_initial_load, - cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + moptions->prefix_extractor, MaxFileSizeForL0MetaPin(*moptions), + read_options_, moptions->block_protection_bytes_per_key); if ((s.IsPathNotFound() || s.IsCorruption()) && 
no_error_if_files_missing_) { s = Status::OK(); } @@ -590,30 +604,36 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, const VersionEdit& edit) { Status s; - if (edit.has_db_id_) { + if (edit.HasDbId()) { version_set_->db_id_ = edit.GetDbId(); - version_edit_params_.SetDBId(edit.db_id_); + version_edit_params_.SetDBId(edit.GetDbId()); } if (cfd != nullptr) { - if (edit.has_log_number_) { - if (cfd->GetLogNumber() > edit.log_number_) { + if (edit.HasLogNumber()) { + if (cfd->GetLogNumber() > edit.GetLogNumber()) { ROCKS_LOG_WARN( version_set_->db_options()->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { - cfd->SetLogNumber(edit.log_number_); - version_edit_params_.SetLogNumber(edit.log_number_); + cfd->SetLogNumber(edit.GetLogNumber()); + version_edit_params_.SetLogNumber(edit.GetLogNumber()); } } - if (edit.has_comparator_ && - edit.comparator_ != cfd->user_comparator()->Name()) { - if (!cf_to_cmp_names_) { - s = Status::InvalidArgument( - cfd->user_comparator()->Name(), - "does not match existing comparator " + edit.comparator_); - } else { - cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_); + if (edit.HasComparatorName()) { + bool mark_sst_files_has_no_udt = false; + // If `persist_user_defined_timestamps` flag is recorded in manifest, it + // is guaranteed to be in the same VersionEdit as comparator. Otherwise, + // it's not recorded and it should have default value true. + s = ValidateUserDefinedTimestampsOptions( + cfd->user_comparator(), edit.GetComparatorName(), + cfd->ioptions()->persist_user_defined_timestamps, + edit.GetPersistUserDefinedTimestamps(), &mark_sst_files_has_no_udt); + if (!s.ok() && cf_to_cmp_names_) { + cf_to_cmp_names_->emplace(cfd->GetID(), edit.GetComparatorName()); + } + if (mark_sst_files_has_no_udt) { + cfds_to_mark_no_udt_.insert(cfd->GetID()); } } if (edit.HasFullHistoryTsLow()) { @@ -623,60 +643,114 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, } if (s.ok()) { - if (edit.has_prev_log_number_) { - version_edit_params_.SetPrevLogNumber(edit.prev_log_number_); + if (edit.HasPrevLogNumber()) { + version_edit_params_.SetPrevLogNumber(edit.GetPrevLogNumber()); } - if (edit.has_next_file_number_) { - version_edit_params_.SetNextFile(edit.next_file_number_); + if (edit.HasNextFile()) { + version_edit_params_.SetNextFile(edit.GetNextFile()); } - if (edit.has_max_column_family_) { - version_edit_params_.SetMaxColumnFamily(edit.max_column_family_); + if (edit.HasMaxColumnFamily()) { + version_edit_params_.SetMaxColumnFamily(edit.GetMaxColumnFamily()); } - if (edit.has_min_log_number_to_keep_) { - version_edit_params_.min_log_number_to_keep_ = - std::max(version_edit_params_.min_log_number_to_keep_, - edit.min_log_number_to_keep_); + if (edit.HasMinLogNumberToKeep()) { + version_edit_params_.SetMinLogNumberToKeep( + std::max(version_edit_params_.GetMinLogNumberToKeep(), + edit.GetMinLogNumberToKeep())); } - if (edit.has_last_sequence_) { + if (edit.HasLastSequence()) { // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This // is legacy behavior that cannot change without breaking downgrade // compatibility. 
- assert(!version_edit_params_.has_last_sequence_ || - version_edit_params_.last_sequence_ <= edit.last_sequence_); - version_edit_params_.SetLastSequence(edit.last_sequence_); + assert(!version_edit_params_.HasLastSequence() || + version_edit_params_.GetLastSequence() <= edit.GetLastSequence()); + version_edit_params_.SetLastSequence(edit.GetLastSequence()); } - if (!version_edit_params_.has_prev_log_number_) { + if (!version_edit_params_.HasPrevLogNumber()) { version_edit_params_.SetPrevLogNumber(0); } - if (edit.has_replication_sequence_) { - version_edit_params_.SetReplicationSequence(edit.replication_sequence_); + if (edit.HasReplicationSequence()) { + version_edit_params_.SetReplicationSequence(edit.GetReplicationSequence()); } - if (edit.has_manifest_update_sequence_) { + if (edit.HasManifestUpdateSequence()) { // Manifest update should be stricly and monotonically increasing. - if (version_edit_params_.has_manifest_update_sequence_ && - edit.manifest_update_sequence_ != - version_edit_params_.manifest_update_sequence_ + 1) { + if (version_edit_params_.HasManifestUpdateSequence() && + edit.GetManifestUpdateSequence() != + version_edit_params_.GetManifestUpdateSequence() + 1) { std::ostringstream oss; oss << "Gap in ManifestUpdateSequence, expected=" - << version_edit_params_.manifest_update_sequence_ + 1 - << " got=" << edit.manifest_update_sequence_; + << version_edit_params_.GetManifestUpdateSequence() + 1 + << " got=" << edit.GetManifestUpdateSequence(); return Status::Corruption(oss.str()); } version_edit_params_.SetManifestUpdateSequence( - edit.manifest_update_sequence_); + edit.GetManifestUpdateSequence()); } } return s; } +Status VersionEditHandler::MaybeHandleFileBoundariesForNewFiles( + VersionEdit& edit, const ColumnFamilyData* cfd) { + if (edit.GetNewFiles().empty()) { + return Status::OK(); + } + auto ucmp = cfd->user_comparator(); + assert(ucmp); + size_t ts_sz = ucmp->timestamp_size(); + if (ts_sz == 0) { + return Status::OK(); + } + + VersionEdit::NewFiles& new_files = edit.GetMutableNewFiles(); + assert(!new_files.empty()); + // If true, enabling user-defined timestamp is detected for this column + // family. All its existing SST files need to have the file boundaries handled + // and their `persist_user_defined_timestamps` flag set to false regardless of + // its existing value. + bool mark_existing_ssts_with_no_udt = + cfds_to_mark_no_udt_.find(cfd->GetID()) != cfds_to_mark_no_udt_.end(); + bool file_boundaries_need_handling = false; + for (auto& new_file : new_files) { + FileMetaData& meta = new_file.second; + if (meta.user_defined_timestamps_persisted && + !mark_existing_ssts_with_no_udt) { + // `FileMetaData.user_defined_timestamps_persisted` field is the value of + // the flag `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` + // at the time when the SST file was created. As a result, all added SST + // files in one `VersionEdit` should have the same value for it. 
+ if (file_boundaries_need_handling) { + return Status::Corruption( + "New files in one VersionEdit has different " + "user_defined_timestamps_persisted value."); + } + break; + } + file_boundaries_need_handling = true; + assert(!meta.user_defined_timestamps_persisted || + mark_existing_ssts_with_no_udt); + if (mark_existing_ssts_with_no_udt) { + meta.user_defined_timestamps_persisted = false; + } + std::string smallest_buf; + std::string largest_buf; + PadInternalKeyWithMinTimestamp(&smallest_buf, meta.smallest.Encode(), + ts_sz); + PadInternalKeyWithMinTimestamp(&largest_buf, meta.largest.Encode(), ts_sz); + meta.smallest.DecodeFrom(smallest_buf); + meta.largest.DecodeFrom(largest_buf); + } + return Status::OK(); +} + VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, /*no_error_if_files_missing=*/true, io_tracer, - epoch_number_requirement) {} + read_options, epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -714,7 +788,7 @@ void VersionEditHandlerPointInTime::CheckIterationResult( ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup( const VersionEdit& edit) { ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit); - auto v_iter = versions_.find(edit.column_family_); + auto v_iter = versions_.find(edit.GetColumnFamily()); if (v_iter != versions_.end()) { delete v_iter->second; versions_.erase(v_iter); @@ -726,7 +800,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) { assert(cfd != nullptr); if (!force_create_version) { - assert(edit.column_family_ == cfd->GetID()); + assert(edit.GetColumnFamily() == cfd->GetID()); } auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID()); assert(missing_files_iter != cf_to_missing_files_.end()); @@ -809,9 +883,9 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( const bool has_missing_files = !missing_files.empty() || has_missing_blob_files; - bool missing_info = !version_edit_params_.has_log_number_ || - !version_edit_params_.has_next_file_number_ || - !version_edit_params_.has_last_sequence_; + bool missing_info = !version_edit_params_.HasLogNumber() || + !version_edit_params_.HasNextFile() || + !version_edit_params_.HasLastSequence(); // Create version before apply edit. The version will represent the state // before applying the version edit. 
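To make the boundary handling in MaybeHandleFileBoundariesForNewFiles above concrete: the write path (VersionEdit::EncodeFileBoundaries earlier in this patch) strips the user-defined timestamp from `smallest`/`largest` before they reach the MANIFEST, and this read path pads a minimum timestamp back in. Below is a minimal standalone sketch of that round trip, not the actual StripTimestampFromInternalKey / PadInternalKeyWithMinTimestamp helpers referenced by the patch; it assumes the usual internal-key layout of application key, then ts_sz timestamp bytes, then an 8-byte (sequence, type) footer.

#include <cassert>
#include <cstddef>
#include <string>

// Illustrative only; assumes internal key = app key | ts (ts_sz bytes) |
// 8-byte (seqno, type) footer.
constexpr size_t kFooterBytes = 8;

// Write path: drop the timestamp so the MANIFEST never stores it.
std::string StripTimestamp(const std::string& internal_key, size_t ts_sz) {
  assert(internal_key.size() >= ts_sz + kFooterBytes);
  return internal_key.substr(0, internal_key.size() - kFooterBytes - ts_sz) +
         internal_key.substr(internal_key.size() - kFooterBytes);
}

// Read path: re-insert a minimum (all-zero) timestamp so the boundary keys
// are well formed under the running timestamp-aware comparator.
std::string PadMinTimestamp(const std::string& stripped_key, size_t ts_sz) {
  assert(stripped_key.size() >= kFooterBytes);
  return stripped_key.substr(0, stripped_key.size() - kFooterBytes) +
         std::string(ts_sz, '\0') +
         stripped_key.substr(stripped_key.size() - kFooterBytes);
}

Note the round trip is intentionally lossy: the original timestamp never comes back, only a minimum one, which is why the encode/decode identity checked by TestEncodeDecode at the end of this patch is disabled for this scenario.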
@@ -833,15 +907,16 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( assert(builder); } + const MutableCFOptions* cf_opts_ptr = cfd->GetLatestMutableCFOptions(); auto* version = new Version(cfd, version_set_, version_set_->file_options_, - *cfd->GetLatestMutableCFOptions(), io_tracer_, + *cf_opts_ptr, io_tracer_, version_set_->current_version_number_++, epoch_number_requirement_); s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, - cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + cf_opts_ptr->prefix_extractor, MaxFileSizeForL0MetaPin(*cf_opts_ptr), + read_options_, cf_opts_ptr->block_protection_bytes_per_key); if (!s.ok()) { delete version; if (s.IsCorruption()) { @@ -852,7 +927,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( s = builder->SaveTo(version->storage_info()); if (s.ok()) { version->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), + *cfd->GetLatestMutableCFOptions(), read_options_, !version_set_->db_options_->skip_stats_update_on_db_open); auto v_iter = versions_.find(cfd->GetID()); if (v_iter != versions_.end()) { @@ -872,7 +947,8 @@ Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& fmeta) { - return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta); + return version_set_->VerifyFileMetadata(read_options_, cfd, fpath, level, + fmeta); } Status VersionEditHandlerPointInTime::VerifyBlobFile( @@ -881,7 +957,9 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile( BlobSource* blob_source = cfd->blob_source(); assert(blob_source); CacheHandleGuard blob_file_reader; - Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader); + + Status s = blob_source->GetBlobFileReader(read_options_, blob_file_num, + &blob_file_reader); if (!s.ok()) { return s; } diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index fc3fe7c6b88d..af0817e4a17d 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -19,8 +19,9 @@ struct FileMetaData; class VersionEditHandlerBase { public: - explicit VersionEditHandlerBase() - : max_manifest_read_size_(std::numeric_limits::max()) {} + explicit VersionEditHandlerBase(const ReadOptions& read_options) + : read_options_(read_options), + max_manifest_read_size_(std::numeric_limits::max()) {} virtual ~VersionEditHandlerBase() {} @@ -31,8 +32,9 @@ class VersionEditHandlerBase { AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } protected: - explicit VersionEditHandlerBase(uint64_t max_read_size) - : max_manifest_read_size_(max_read_size) {} + explicit VersionEditHandlerBase(const ReadOptions& read_options, + uint64_t max_read_size) + : read_options_(read_options), max_manifest_read_size_(max_read_size) {} virtual Status Initialize() { return Status::OK(); } virtual Status ApplyVersionEdit(VersionEdit& edit, @@ -45,6 +47,8 @@ class VersionEditHandlerBase { Status status_; + const ReadOptions& read_options_; + private: AtomicGroupReadBuffer read_buffer_; const uint64_t max_manifest_read_size_; @@ -52,7 +56,8 @@ class VersionEditHandlerBase { class ListColumnFamiliesHandler : public VersionEditHandlerBase { public: - ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + explicit ListColumnFamiliesHandler(const ReadOptions& read_options) + : VersionEditHandlerBase(read_options) {} ~ListColumnFamiliesHandler() override {} @@ -72,9 +77,9 @@ class 
ListColumnFamiliesHandler : public VersionEditHandlerBase { class FileChecksumRetriever : public VersionEditHandlerBase { public: - FileChecksumRetriever(uint64_t max_read_size, + FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size, FileChecksumList& file_checksum_list) - : VersionEditHandlerBase(max_read_size), + : VersionEditHandlerBase(read_options, max_read_size), file_checksum_list_(file_checksum_list) {} ~FileChecksumRetriever() override {} @@ -111,12 +116,13 @@ class VersionEditHandler : public VersionEditHandlerBase { VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) - : VersionEditHandler(read_only, column_families, version_set, - track_missing_files, no_error_if_files_missing, - io_tracer, /*skip_load_table_files=*/false, - epoch_number_requirement) {} + : VersionEditHandler( + read_only, column_families, version_set, track_missing_files, + no_error_if_files_missing, io_tracer, read_options, + /*skip_load_table_files=*/false, epoch_number_requirement) {} ~VersionEditHandler() override {} @@ -127,8 +133,8 @@ class VersionEditHandler : public VersionEditHandlerBase { bool HasMissingFiles() const; void GetDbId(std::string* db_id) const { - if (db_id && version_edit_params_.has_db_id_) { - *db_id = version_edit_params_.db_id_; + if (db_id && version_edit_params_.HasDbId()) { + *db_id = version_edit_params_.GetDbId(); } } @@ -137,7 +143,8 @@ class VersionEditHandler : public VersionEditHandlerBase { bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr& io_tracer, bool skip_load_table_files, + const std::shared_ptr& io_tracer, + const ReadOptions& read_options, bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent); @@ -195,10 +202,22 @@ class VersionEditHandler : public VersionEditHandlerBase { bool initialized_; std::unique_ptr> cf_to_cmp_names_; EpochNumberRequirement epoch_number_requirement_; + std::unordered_set cfds_to_mark_no_udt_; private: Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, const VersionEdit& edit); + + // When `FileMetaData.user_defined_timestamps_persisted` is false and + // user-defined timestamp size is non-zero. User-defined timestamps are + // stripped from file boundaries: `smallest`, `largest` in + // `VersionEdit.DecodeFrom` before they were written to Manifest. + // This is the mirroring change to handle file boundaries on the Manifest read + // path for this scenario: to pad a minimum timestamp to the user key in + // `smallest` and `largest` so their format are consistent with the running + // user comparator. + Status MaybeHandleFileBoundariesForNewFiles(VersionEdit& edit, + const ColumnFamilyData* cfd); }; // A class similar to its base class, i.e. VersionEditHandler. 
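The other recurring change in this header is mechanical: every manifest-reading handler now takes the ReadOptions used while reading table and blob files during recovery. The following is a hedged sketch of call sites for the updated constructors (the wrapper function and its parameters such as column_families, version_set, io_tracer and checksum_list are placeholders standing in for objects the real callers already own, not code from this patch).

#include <limits>
#include "db/version_edit_handler.h"

using namespace ROCKSDB_NAMESPACE;

// Sketch: how a caller threads the new ReadOptions argument through the
// handlers updated in this patch.
void ConstructHandlers(
    const std::vector<ColumnFamilyDescriptor>& column_families,
    VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer,
    FileChecksumList& checksum_list) {
  const ReadOptions read_options;

  // Listing column families only needs the read options.
  ListColumnFamiliesHandler list_handler(read_options);

  // Checksum retrieval additionally bounds how much of the MANIFEST is read.
  FileChecksumRetriever checksum_retriever(
      read_options, std::numeric_limits<uint64_t>::max() /* max_read_size */,
      checksum_list);

  // The dump handler keeps its verbosity flags after the new argument.
  DumpManifestHandler dump_handler(column_families, version_set, io_tracer,
                                   read_options, /*verbose=*/false,
                                   /*hex=*/false, /*json=*/false);
}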
@@ -212,6 +231,7 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent); ~VersionEditHandlerPointInTime() override; @@ -238,10 +258,11 @@ class ManifestTailer : public VersionEditHandlerPointInTime { explicit ManifestTailer(std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, - version_set, io_tracer, + version_set, io_tracer, read_options, epoch_number_requirement), mode_(Mode::kRecovery) {} @@ -281,12 +302,13 @@ class DumpManifestHandler : public VersionEditHandler { public: DumpManifestHandler(std::vector column_families, VersionSet* version_set, - const std::shared_ptr& io_tracer, bool verbose, - bool hex, bool json) + const std::shared_ptr& io_tracer, + const ReadOptions& read_options, bool verbose, bool hex, + bool json) : VersionEditHandler( /*read_only=*/true, column_families, version_set, /*track_missing_files=*/false, - /*no_error_if_files_missing=*/false, io_tracer, + /*no_error_if_files_missing=*/false, io_tracer, read_options, /*skip_load_table_files=*/true), verbose_(verbose), hex_(hex), diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 1fa6c0054973..c47389901739 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -21,12 +21,20 @@ namespace ROCKSDB_NAMESPACE { static void TestEncodeDecode(const VersionEdit& edit) { + // Encoding one `VersionEdit` and decoding it again should result in the + // exact same `VersionEdit`. However, a special handling is applied to file + // boundaries: `FileMetaData.smallest`, `FileMetaData.largest` when + // user-defined timestamps should not be persisted. In that scenario, this + // invariant does not hold. We disable this scenario in this util method to + // enable all other test cases continue to verify this invariant, while the + // special case is separately covered in test + // `EncodeDecodeNewFile4HandleFileBoundary`. 
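One consequence of the constructor plumbing above is that VersionEditHandlerBase keeps the injected ReadOptions as a const reference member (read_options_), so every handler constructed with it relies on the caller keeping that ReadOptions object alive for the handler's lifetime. The sketch below illustrates that ownership contract with simplified stand-in types; the class and member names are not the RocksDB declarations.

// Sketch only: a base class that stores ReadOptions by const reference and a
// derived handler that forwards it, as in the header changes above.
#include <cstdint>
#include <limits>

struct ReadOptions {
  // Stand-in for ROCKSDB_NAMESPACE::ReadOptions.
  bool verify_checksums = true;
};

class HandlerBase {
 public:
  explicit HandlerBase(const ReadOptions& read_options)
      : read_options_(read_options),
        max_read_size_(std::numeric_limits<uint64_t>::max()) {}

 protected:
  const ReadOptions& read_options_;  // not owned; caller keeps it alive
  const uint64_t max_read_size_;
};

class ListHandler : public HandlerBase {
 public:
  explicit ListHandler(const ReadOptions& read_options)
      : HandlerBase(read_options) {}
};

int main() {
  ReadOptions read_options;           // must outlive the handler below
  ListHandler handler(read_options);  // forwards the reference to the base
  (void)handler;
  return 0;
}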
std::string encoded, encoded2; - edit.EncodeTo(&encoded); + edit.EncodeTo(&encoded, 0 /* ts_sz */); VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); - parsed.EncodeTo(&encoded2); + parsed.EncodeTo(&encoded2, 0 /* ts_sz */); ASSERT_EQ(encoded, encoded2); } @@ -45,11 +53,12 @@ TEST_F(VersionEditTest, EncodeDecode) { kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, kInvalidBlobFileNumber, 888, 678, kBig + 300 + i /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, true); edit.DeleteFile(4, kBig + 700 + i); } edit.SetComparatorName("foo"); + edit.SetPersistUserDefinedTimestamps(true); edit.SetLogNumber(kBig + 100); edit.SetNextFile(kBig + 200); edit.SetLastSequence(kBig + 1000); @@ -65,41 +74,42 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, true); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 301 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, false); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, 666, 888, 302 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, true); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 303 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, true); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); + edit.SetPersistUserDefinedTimestamps(false); edit.SetLogNumber(kBig + 100); edit.SetNextFile(kBig + 200); edit.SetLastSequence(kBig + 1000); TestEncodeDecode(edit); std::string encoded, encoded2; - edit.EncodeTo(&encoded); + edit.EncodeTo(&encoded, 0 /* ts_sz */); VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); auto& new_files = parsed.GetNewFiles(); ASSERT_TRUE(new_files[0].second.marked_for_compaction); - ASSERT_TRUE(!new_files[1].second.marked_for_compaction); + ASSERT_FALSE(new_files[1].second.marked_for_compaction); ASSERT_TRUE(new_files[2].second.marked_for_compaction); ASSERT_TRUE(new_files[3].second.marked_for_compaction); ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); @@ -113,6 +123,62 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { ASSERT_EQ(kInvalidBlobFileNumber, new_files[2].second.oldest_blob_file_number); ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number); + ASSERT_TRUE(new_files[0].second.user_defined_timestamps_persisted); + ASSERT_FALSE(new_files[1].second.user_defined_timestamps_persisted); + ASSERT_TRUE(new_files[2].second.user_defined_timestamps_persisted); + 
ASSERT_TRUE(new_files[3].second.user_defined_timestamps_persisted); + ASSERT_FALSE(parsed.GetPersistUserDefinedTimestamps()); +} + +TEST_F(VersionEditTest, EncodeDecodeNewFile4HandleFileBoundary) { + static const uint64_t kBig = 1ull << 50; + size_t ts_sz = 16; + static std::string min_ts(ts_sz, static_cast(0)); + VersionEdit edit; + std::string smallest = "foo"; + std::string largest = "zoo"; + // In real manifest writing scenarios, one `VersionEdit` should not contain + // files with different `user_defined_timestamps_persisted` flag value. + // This is just for testing file boundaries handling w.r.t persisting user + // defined timestamps during `VersionEdit` encoding. + edit.AddFile( + 3, 300, 3, 100, InternalKey(smallest + min_ts, kBig + 500, kTypeValue), + InternalKey(largest + min_ts, kBig + 600, kTypeDeletion), kBig + 500, + kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, + 0 /* compensated_range_deletion_size */, 0 /* tail_size */, + false /* user_defined_timestamps_persisted */); + edit.AddFile(3, 300, 3, 100, + InternalKey(smallest + min_ts, kBig + 500, kTypeValue), + InternalKey(largest + min_ts, kBig + 600, kTypeDeletion), + kBig + 500, kBig + 600, true, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, 300 /* epoch_number */, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0 /* compensated_range_deletion_size */, + 0 /* tail_size */, true /* user_defined_timestamps_persisted */); + + std::string encoded; + edit.EncodeTo(&encoded, ts_sz); + VersionEdit parsed; + Status s = parsed.DecodeFrom(encoded); + ASSERT_TRUE(s.ok()) << s.ToString(); + auto& new_files = parsed.GetNewFiles(); + ASSERT_TRUE(new_files.size() == 2); + ASSERT_FALSE(new_files[0].second.user_defined_timestamps_persisted); + // First file's boundaries do not contain user-defined timestamps. + ASSERT_EQ(InternalKey(smallest, kBig + 500, kTypeValue).Encode(), + new_files[0].second.smallest.Encode()); + ASSERT_EQ(InternalKey(largest, kBig + 600, kTypeDeletion).Encode(), + new_files[0].second.largest.Encode()); + ASSERT_TRUE(new_files[1].second.user_defined_timestamps_persisted); + // Second file's boundaries contain user-defined timestamps. 
+ ASSERT_EQ(InternalKey(smallest + min_ts, kBig + 500, kTypeValue).Encode(), + new_files[1].second.smallest.Encode()); + ASSERT_EQ(InternalKey(largest + min_ts, kBig + 600, kTypeDeletion).Encode(), + new_files[1].second.largest.Encode()); } TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { @@ -123,15 +189,16 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, true); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, 686, 868, 301 /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0, true); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); + edit.SetPersistUserDefinedTimestamps(true); edit.SetLogNumber(kBig + 100); edit.SetNextFile(kBig + 200); edit.SetLastSequence(kBig + 1000); @@ -154,7 +221,7 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - edit.EncodeTo(&encoded); + edit.EncodeTo(&encoded, 0 /* ts_sz */); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); VersionEdit parsed; @@ -167,6 +234,7 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { ASSERT_EQ(3u, new_files[0].second.fd.GetPathId()); ASSERT_EQ(3u, new_files[1].second.fd.GetPathId()); ASSERT_EQ(1u, parsed.GetDeletedFiles().size()); + ASSERT_TRUE(parsed.GetPersistUserDefinedTimestamps()); } TEST_F(VersionEditTest, NewFile4NotSupportedField) { @@ -177,9 +245,10 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, false); edit.SetComparatorName("foo"); + edit.SetPersistUserDefinedTimestamps(false); edit.SetLogNumber(kBig + 100); edit.SetNextFile(kBig + 200); edit.SetLastSequence(kBig + 1000); @@ -194,7 +263,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { PutLengthPrefixedSlice(str, str1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - edit.EncodeTo(&encoded); + edit.EncodeTo(&encoded, 0 /* ts_sz */); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); VersionEdit parsed; @@ -208,9 +277,9 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /*epoch_number*/, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, true); std::string buffer; - ASSERT_TRUE(!edit.EncodeTo(&buffer)); + ASSERT_TRUE(!edit.EncodeTo(&buffer, 0 /* ts_sz */)); } TEST_F(VersionEditTest, ColumnFamilyTest) { @@ -579,7 +648,7 @@ TEST_F(VersionEditTest, IgnorableTags) { edit.SetColumnFamily(kColumnFamilyId); std::string encoded; - ASSERT_TRUE(edit.EncodeTo(&encoded)); + ASSERT_TRUE(edit.EncodeTo(&encoded, 0 /* ts_sz */)); VersionEdit decoded; ASSERT_OK(decoded.DecodeFrom(encoded)); diff --git a/db/version_set.cc b/db/version_set.cc index bc114b9571df..11399d628fc9 100644 --- a/db/version_set.cc +++ 
b/db/version_set.cc @@ -24,7 +24,6 @@ #include "db/blob/blob_fetcher.h" #include "db/blob/blob_file_cache.h" #include "db/blob/blob_file_reader.h" -#include "db/blob/blob_index.h" #include "db/blob/blob_log_format.h" #include "db/blob/blob_source.h" #include "db/compaction/compaction.h" @@ -39,7 +38,12 @@ #include "db/pinned_iterators_manager.h" #include "db/table_cache.h" #include "db/version_builder.h" +#include "db/version_edit.h" #include "db/version_edit_handler.h" +#include "db/wide/wide_columns_helper.h" +#include "file/file_util.h" +#include "table/compaction_merging_iterator.h" + #if USE_COROUTINES #include "folly/experimental/coro/BlockingWait.h" #include "folly/experimental/coro/Collect.h" @@ -92,11 +96,13 @@ namespace ROCKSDB_NAMESPACE { namespace { Status SerializeReplicationLogManifestWrite( - std::string* dst, const autovector& src) { + std::string* dst, const autovector& src, + const autovector>& batch_edits_ts_sz) { PutVarint64(dst, src.size()); + size_t idx = 0; for (auto& e : src) { std::string tmp; - if (!e->EncodeTo(&tmp)) { + if (!e->EncodeTo(&tmp, batch_edits_ts_sz[idx++])) { return Status::Corruption("Unable to encode VersionEdit:" + e->DebugString(true)); } @@ -956,7 +962,7 @@ class LevelIterator final : public InternalIterator { const std::shared_ptr& prefix_extractor, bool should_sample, HistogramImpl* file_read_hist, TableReaderCaller caller, bool skip_filters, int level, - RangeDelAggregator* range_del_agg, + uint8_t block_protection_bytes_per_key, RangeDelAggregator* range_del_agg, const std::vector* compaction_boundaries = nullptr, bool allow_unprepared_value = false, @@ -969,17 +975,21 @@ class LevelIterator final : public InternalIterator { flevel_(flevel), prefix_extractor_(prefix_extractor), file_read_hist_(file_read_hist), - should_sample_(should_sample), caller_(caller), - skip_filters_(skip_filters), - allow_unprepared_value_(allow_unprepared_value), file_index_(flevel_->num_files), - level_(level), range_del_agg_(range_del_agg), pinned_iters_mgr_(nullptr), compaction_boundaries_(compaction_boundaries), - is_next_read_sequential_(false), range_tombstone_iter_(nullptr), + read_seq_(read_options.snapshot + ? read_options.snapshot->GetSequenceNumber() + : kMaxSequenceNumber), + level_(level), + block_protection_bytes_per_key_(block_protection_bytes_per_key), + should_sample_(should_sample), + skip_filters_(skip_filters), + allow_unprepared_value_(allow_unprepared_value), + is_next_read_sequential_(false), to_return_sentinel_(false) { // Empty level is not supported. assert(flevel_ != nullptr && flevel_->num_files > 0); @@ -1067,6 +1077,10 @@ class LevelIterator final : public InternalIterator { bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; } + void SetRangeDelReadSeqno(SequenceNumber read_seq) override { + read_seq_ = read_seq; + } + private: // Return true if at least one invalid file is seen and skipped. bool SkipEmptyFileForward(); @@ -1122,7 +1136,8 @@ class LevelIterator final : public InternalIterator { nullptr /* don't need reference to table */, file_read_hist_, caller_, /*arena=*/nullptr, skip_filters_, level_, /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key, - largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_); + largest_compaction_key, allow_unprepared_value_, + block_protection_bytes_per_key_, &read_seq_, range_tombstone_iter_); } // Check if current file being fully within iterate_lower_bound. 
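The change to SerializeReplicationLogManifestWrite() above encodes each VersionEdit with the timestamp size stored at the same index of batch_edits_ts_sz, so the two containers must stay in lock-step. The snippet below is a standalone sketch of that pairing pattern; FakeEdit and SerializeBatch are illustrative stand-ins, not the RocksDB types.

// Sketch only: encode each edit with the ts_sz at the matching index.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

struct FakeEdit {
  std::string payload;
  // Stand-in for VersionEdit::EncodeTo(std::string*, size_t ts_sz).
  bool EncodeTo(std::string* dst, size_t ts_sz) const {
    dst->append(payload);
    dst->append(",ts_sz=" + std::to_string(ts_sz) + ";");
    return true;
  }
};

bool SerializeBatch(std::string* dst, const std::vector<FakeEdit>& edits,
                    const std::vector<size_t>& ts_sizes) {
  if (edits.size() != ts_sizes.size()) {
    return false;  // the two vectors must stay in lock-step
  }
  size_t idx = 0;
  for (const auto& e : edits) {
    std::string tmp;
    if (!e.EncodeTo(&tmp, ts_sizes[idx++])) {
      return false;
    }
    dst->append(tmp);
  }
  return true;
}

int main() {
  std::string out;
  // Two edits from column families with different timestamp sizes at encode
  // time (the exact values here are only for illustration).
  SerializeBatch(&out, {{"edit_a"}, {"edit_b"}}, {0, 16});
  std::cout << out << "\n";  // edit_a,ts_sz=0;edit_b,ts_sz=16;
  return 0;
}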
@@ -1152,13 +1167,8 @@ class LevelIterator final : public InternalIterator { const std::shared_ptr& prefix_extractor_; HistogramImpl* file_read_hist_; - bool should_sample_; TableReaderCaller caller_; - bool skip_filters_; - bool allow_unprepared_value_; - bool may_be_out_of_lower_bound_ = true; size_t file_index_; - int level_; RangeDelAggregator* range_del_agg_; IteratorWrapper file_iter_; // May be nullptr PinnedIteratorsManager* pinned_iters_mgr_; @@ -1167,8 +1177,6 @@ class LevelIterator final : public InternalIterator { // tombstones. const std::vector* compaction_boundaries_; - bool is_next_read_sequential_; - // This is set when this level iterator is used under a merging iterator // that processes range tombstones. range_tombstone_iter_ points to where the // merging iterator stores the range tombstones iterator for this level. When @@ -1185,20 +1193,29 @@ class LevelIterator final : public InternalIterator { // *range_tombstone_iter_ points to range tombstones of the current SST file TruncatedRangeDelIterator** range_tombstone_iter_; - // Whether next/prev key is a sentinel key. - bool to_return_sentinel_ = false; // The sentinel key to be returned Slice sentinel_; - // Sets flags for if we should return the sentinel key next. - // The condition for returning sentinel is reaching the end of current - // file_iter_: !Valid() && status.().ok(). - void TrySetDeleteRangeSentinel(const Slice& boundary_key); - void ClearSentinel() { to_return_sentinel_ = false; } + SequenceNumber read_seq_; + int level_; + uint8_t block_protection_bytes_per_key_; + bool should_sample_; + bool skip_filters_; + bool allow_unprepared_value_; + bool may_be_out_of_lower_bound_ = true; + bool is_next_read_sequential_; // Set in Seek() when a prefix seek reaches end of the current file, // and the next file has a different prefix. SkipEmptyFileForward() // will not move to next file when this flag is set. bool prefix_exhausted_ = false; + // Whether next/prev key is a sentinel key. + bool to_return_sentinel_ = false; + + // Sets flags for if we should return the sentinel key next. + // The condition for returning sentinel is reaching the end of current + // file_iter_: !Valid() && status.().ok(). + void TrySetDeleteRangeSentinel(const Slice& boundary_key); + void ClearSentinel() { to_return_sentinel_ = false; } }; void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) { @@ -1542,13 +1559,15 @@ void LevelIterator::InitFileIterator(size_t new_file_index) { } } // anonymous namespace -Status Version::GetTableProperties(std::shared_ptr* tp, +Status Version::GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) const { auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - file_options_, cfd_->internal_comparator(), *file_meta, tp, + file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; @@ -1580,14 +1599,16 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // the magic number check in the footer. 
std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), file_name, nullptr /* env */, io_tracer_, - nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, ioptions->listeners)); + std::move(file), file_name, ioptions->clock /* clock */, io_tracer_, + ioptions->stats /* stats */, + Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, nullptr /* rate_limiter */, + ioptions->listeners)); std::unique_ptr props; s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kNullTableMagicNumber /* table's magic number */, *ioptions, - &props); + read_options, &props); if (!s.ok()) { return s; } @@ -1596,10 +1617,11 @@ Status Version::GetTableProperties(std::shared_ptr* tp, return s; } -Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { +Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props) { Status s; for (int level = 0; level < storage_info_.num_levels_; level++) { - s = GetPropertiesOfAllTables(props, level); + s = GetPropertiesOfAllTables(read_options, props, level); if (!s.ok()) { return s; } @@ -1617,6 +1639,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::stringstream ss; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; for (int level = 0; level < storage_info_.num_levels_; level++) { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = @@ -1629,7 +1653,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::unique_ptr tombstone_iter; Status s = table_cache->GetRangeTombstoneIterator( - ReadOptions(), cfd_->internal_comparator(), *file_meta, + read_options, cfd_->internal_comparator(), *file_meta, + cfd_->GetLatestMutableCFOptions()->block_protection_bytes_per_key, &tombstone_iter); if (!s.ok()) { return s; @@ -1663,7 +1688,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, return Status::OK(); } -Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, +Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = @@ -1672,7 +1698,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, // 1. If the table is already present in table cache, load table // properties from there. std::shared_ptr table_properties; - Status s = GetTableProperties(&table_properties, file_meta, &fname); + Status s = + GetTableProperties(read_options, &table_properties, file_meta, &fname); if (s.ok()) { props->insert({fname, table_properties}); } else { @@ -1684,7 +1711,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, } Status Version::GetPropertiesOfTablesInRange( - const Range* range, std::size_t n, TablePropertiesCollection* props) const { + const ReadOptions& read_options, const Range* range, std::size_t n, + TablePropertiesCollection* props) const { for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { for (decltype(n) i = 0; i < n; i++) { // Convert user_key into a corresponding internal key. @@ -1701,7 +1729,8 @@ Status Version::GetPropertiesOfTablesInRange( // 1. If the table is already present in table cache, load table // properties from there. 
std::shared_ptr table_properties; - Status s = GetTableProperties(&table_properties, file_meta, &fname); + Status s = GetTableProperties(read_options, &table_properties, + file_meta, &fname); if (s.ok()) { props->insert({fname, table_properties}); } else { @@ -1716,13 +1745,14 @@ Status Version::GetPropertiesOfTablesInRange( } Status Version::GetAggregatedTableProperties( - std::shared_ptr* tp, int level) { + const ReadOptions& read_options, std::shared_ptr* tp, + int level) { TablePropertiesCollection props; Status s; if (level < 0) { - s = GetPropertiesOfAllTables(&props); + s = GetPropertiesOfAllTables(read_options, &props); } else { - s = GetPropertiesOfAllTables(&props, level); + s = GetPropertiesOfAllTables(read_options, &props, level); } if (!s.ok()) { return s; @@ -1736,13 +1766,14 @@ Status Version::GetAggregatedTableProperties( return Status::OK(); } -size_t Version::GetMemoryUsageByTableReaders() { +size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { size_t total_usage = 0; for (auto& file_level : storage_info_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - file_options_, cfd_->internal_comparator(), + file_options_, read_options, cfd_->internal_comparator(), *file_level.files[i].file_metadata, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor); } } @@ -1791,6 +1822,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file->file_checksum, file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; + files.back().smallest = file->smallest.Encode().ToString(); + files.back().largest = file->largest.Encode().ToString(); level_size += file->fd.GetFileSize(); } cf_meta->levels.emplace_back(level, level_size, std::move(files)); @@ -1822,6 +1855,49 @@ uint64_t Version::GetSstFilesSize(bool include_bottommost) { return sst_files_size; } +void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key) { + smallest_user_key->clear(); + largest_user_key->clear(); + bool initialized = false; + const Comparator* ucmp = storage_info_.user_comparator_; + for (int level = 0; level < cfd_->NumberLevels(); level++) { + if (storage_info_.LevelFiles(level).size() == 0) { + continue; + } + if (level == 0) { + // we need to consider all files on level 0 + for (const auto& file : storage_info_.LevelFiles(level)) { + const Slice& start_user_key = file->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = file->largest.user_key(); + if (!initialized || + ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + const Slice& start_user_key = + storage_info_.LevelFiles(level)[0]->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = + storage_info_.LevelFiles(level).back()->largest.user_key(); + if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } +} + void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { uint64_t oldest_time = std::numeric_limits::max(); for 
(int level = 0; level < storage_info_.num_non_empty_levels_; level++) { @@ -1851,8 +1927,10 @@ InternalIterator* Version::TEST_GetLevelIterator( cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - TableReaderCaller::kUserIterator, IsFilterSkipped(level, read_options), - level, nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, + TableReaderCaller::kUserIterator, + IsFilterSkipped(level, read_options, false), level, + mutable_cf_options_.block_protection_bytes_per_key, + nullptr /* range_del_agg */, nullptr /* compaction_boundaries */, allow_unprepared_value, &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(level_iter); @@ -1899,8 +1977,14 @@ double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel( uint64_t sum_file_size_bytes = 0; uint64_t sum_data_size_bytes = 0; for (auto* file_meta : files_[level]) { - sum_file_size_bytes += file_meta->fd.GetFileSize(); - sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size; + auto raw_size = file_meta->raw_key_size + file_meta->raw_value_size; + // Check if the table property is properly initialized. It might not be + // because in `UpdateAccumulatedStats` we limit the maximum number of + // properties to read once. + if (raw_size > 0) { + sum_file_size_bytes += file_meta->fd.GetFileSize(); + sum_data_size_bytes += raw_size; + } } if (sum_file_size_bytes == 0) { return -1.0; @@ -1950,7 +2034,8 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, allow_unprepared_value, - &tombstone_iter); + mutable_cf_options_.block_protection_bytes_per_key, + /*range_del_read_seqno=*/nullptr, &tombstone_iter); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(table_iter); } else { @@ -1978,9 +2063,12 @@ void Version::AddIteratorsForLevel(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor, should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - TableReaderCaller::kUserIterator, IsFilterSkipped(level, read_options), - level, /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr, - allow_unprepared_value, &tombstone_iter_ptr); + TableReaderCaller::kUserIterator, + IsFilterSkipped(level, read_options, false), level, + mutable_cf_options_.block_protection_bytes_per_key, + /*range_del_agg=*/nullptr, + /*compaction_boundaries=*/nullptr, allow_unprepared_value, + &tombstone_iter_ptr); if (read_options.ignore_range_deletions) { merge_iter_builder->AddIterator(level_iter); } else { @@ -2023,7 +2111,8 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_, /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false)); + /*allow_unprepared_value=*/false, + mutable_cf_options_.block_protection_bytes_per_key)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); if (!status.ok() || *overlap) { @@ -2037,8 +2126,10 @@ Status Version::OverlapWithLevelIterator(const ReadOptions& read_options, cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level), mutable_cf_options_.prefix_extractor, 
should_sample_file_read(), cfd_->internal_stats()->GetFileReadHist(level), - TableReaderCaller::kUserIterator, IsFilterSkipped(level, read_options), level, - &range_del_agg)); + TableReaderCaller::kUserIterator, + IsFilterSkipped(level, read_options, false), level, + mutable_cf_options_.block_protection_bytes_per_key, &range_del_agg, + nullptr, false)); status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key, iter.get(), overlap); } @@ -2055,7 +2146,9 @@ VersionStorageInfo::VersionStorageInfo( const Comparator* user_comparator, int levels, CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage, bool _force_consistency_checks, - EpochNumberRequirement epoch_number_requirement) + EpochNumberRequirement epoch_number_requirement, SystemClock* clock, + uint32_t bottommost_file_compaction_delay, + OffpeakTimeOption offpeak_time_option) : internal_comparator_(internal_comparator), user_comparator_(user_comparator), // cfd is nullptr if Version is dummy @@ -2065,6 +2158,7 @@ VersionStorageInfo::VersionStorageInfo( compaction_style_(compaction_style), files_(new std::vector[num_levels_]), base_level_(num_levels_ == 1 ? -1 : 1), + lowest_unnecessary_level_(-1), level_multiplier_(0.0), files_by_compaction_pri_(num_levels_), level0_non_overlapping_(false), @@ -2082,9 +2176,12 @@ VersionStorageInfo::VersionStorageInfo( current_num_deletions_(0), current_num_samples_(0), estimated_compaction_needed_bytes_(0), + clock_(clock), + bottommost_file_compaction_delay_(bottommost_file_compaction_delay), finalized_(false), force_consistency_checks_(_force_consistency_checks), - epoch_number_requirement_(epoch_number_requirement) { + epoch_number_requirement_(epoch_number_requirement), + offpeak_time_option_(std::move(offpeak_time_option)) { if (ref_vstorage != nullptr) { accumulated_file_size_ = ref_vstorage->accumulated_file_size_; accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; @@ -2126,7 +2223,11 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, ? nullptr : cfd_->current()->storage_info(), cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks, - epoch_number_requirement), + epoch_number_requirement, + cfd_ == nullptr ? nullptr : cfd_->ioptions()->clock, + cfd_ == nullptr ? 
0 + : mutable_cf_options.bottommost_file_compaction_delay, + vset->offpeak_time_option()), vset_(vset), next_(this), prev_(this), @@ -2136,7 +2237,13 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, max_file_size_for_l0_meta_pin_( MaxFileSizeForL0MetaPin(mutable_cf_options_)), version_number_(version_number), - io_tracer_(io_tracer) {} + io_tracer_(io_tracer), + use_async_io_(false) { + if (CheckFSFeatureSupport(env_->GetFileSystem().get(), + FSSupportedOps::kAsyncIO)) { + use_async_io_ = true; + } +} Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key, const Slice& blob_index_slice, @@ -2195,25 +2302,35 @@ void Version::MultiGetBlob( autovector blob_reqs_in_file; BlobReadContexts& blobs_in_file = ctx.second; - for (const auto& blob : blobs_in_file) { - const BlobIndex& blob_index = blob.first; - const KeyContext& key_context = blob.second; + for (auto& blob : blobs_in_file) { + const BlobIndex& blob_index = blob.blob_index; + const KeyContext* const key_context = blob.key_context; + assert(key_context); + assert(key_context->get_context); + assert(key_context->s); + + if (key_context->value) { + key_context->value->Reset(); + } else { + assert(key_context->columns); + key_context->columns->Reset(); + } if (!blob_file_meta) { - *key_context.s = Status::Corruption("Invalid blob file number"); + *key_context->s = Status::Corruption("Invalid blob file number"); continue; } if (blob_index.HasTTL() || blob_index.IsInlined()) { - *key_context.s = + *key_context->s = Status::Corruption("Unexpected TTL/inlined blob index"); continue; } - key_context.value->Reset(); blob_reqs_in_file.emplace_back( - key_context.ukey_with_ts, blob_index.offset(), blob_index.size(), - blob_index.compression(), key_context.value, key_context.s); + key_context->get_context->ukey_to_get_blob_value(), + blob_index.offset(), blob_index.size(), blob_index.compression(), + &blob.result, key_context->s); } if (blob_reqs_in_file.size() > 0) { const auto file_size = blob_file_meta->GetBlobFileSize(); @@ -2222,23 +2339,35 @@ void Version::MultiGetBlob( } if (blob_reqs.size() > 0) { - blob_source_->MultiGetBlob(read_options, blob_reqs, /*bytes_read=*/nullptr); + blob_source_->MultiGetBlob(read_options, blob_reqs, + /*bytes_read=*/nullptr); } for (auto& ctx : blob_ctxs) { BlobReadContexts& blobs_in_file = ctx.second; - for (const auto& blob : blobs_in_file) { - const KeyContext& key_context = blob.second; - if (key_context.s->ok()) { - range.AddValueSize(key_context.value->size()); + for (auto& blob : blobs_in_file) { + const KeyContext* const key_context = blob.key_context; + assert(key_context); + assert(key_context->get_context); + assert(key_context->s); + + if (key_context->s->ok()) { + if (key_context->value) { + *key_context->value = std::move(blob.result); + range.AddValueSize(key_context->value->size()); + } else { + assert(key_context->columns); + key_context->columns->SetPlainValue(std::move(blob.result)); + range.AddValueSize(key_context->columns->serialized_size()); + } + if (range.GetValueSize() > read_options.value_size_soft_limit) { - *key_context.s = Status::Aborted(); + *key_context->s = Status::Aborted(); } - } else if (key_context.s->IsIncomplete()) { + } else if (key_context->s->IsIncomplete()) { // read_options.read_tier == kBlockCacheTier // Cannot read blob(s): no disk I/O allowed - assert(key_context.get_context); - auto& get_context = *(key_context.get_context); + auto& get_context = *(key_context->get_context); get_context.MarkKeyMayExist(); } } @@ 
-2313,7 +2442,8 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, StopWatchNano timer(clock_, timer_enabled /* auto_start */); *status = table_cache_->Get( read_options, *internal_comparator(), *f->file_metadata, ikey, - &get_context, mutable_cf_options_.prefix_extractor, + &get_context, mutable_cf_options_.block_protection_bytes_per_key, + mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), IsFilterSkipped(static_cast(fp.GetHitFileLevel()), read_options, fp.IsHitFileLastInLevel()), @@ -2355,22 +2485,34 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, fp.GetHitFileLevel()); - if (is_blob_index) { - if (do_merge && value) { - TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex", - value); + if (is_blob_index && do_merge && (value || columns)) { + Slice blob_index = + value ? *value + : WideColumnsHelper::GetDefaultColumn(columns->columns()); - constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; - constexpr uint64_t* bytes_read = nullptr; + TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex", + &blob_index); - *status = GetBlob(read_options, user_key, *value, prefetch_buffer, - value, bytes_read); - if (!status->ok()) { - if (status->IsIncomplete()) { - get_context.MarkKeyMayExist(); - } - return; + constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; + + PinnableSlice result; + + constexpr uint64_t* bytes_read = nullptr; + + *status = GetBlob(read_options, get_context.ukey_to_get_blob_value(), + blob_index, prefetch_buffer, &result, bytes_read); + if (!status->ok()) { + if (status->IsIncomplete()) { + get_context.MarkKeyMayExist(); } + return; + } + + if (value) { + *value = std::move(result); + } else { + assert(columns); + columns->SetPlainValue(std::move(result)); } } @@ -2388,6 +2530,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, "Encounter unexpected blob index. Please open DB with " "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); return; + case GetContext::kMergeOperatorFailed: + *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed); + return; } f = fp.GetNextFile(); } @@ -2407,21 +2552,16 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; if (value || columns) { - std::string result; // `op_failure_scope` (an output parameter) is not provided (set to // nullptr) since a failure must be propagated regardless of its value. *status = MergeHelper::TimedFullMerge( - merge_operator_, user_key, nullptr, merge_context->GetOperands(), - &result, info_log_, db_statistics_, clock_, - /* result_operand */ nullptr, /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); + merge_operator_, user_key, MergeHelper::kNoBaseValue, + merge_context->GetOperands(), info_log_, db_statistics_, clock_, + /* update_num_ops_stats */ true, value ? value->GetSelf() : nullptr, + columns, /* op_failure_scope */ nullptr); if (status->ok()) { if (LIKELY(value != nullptr)) { - *(value->GetSelf()) = std::move(result); value->PinSelf(); - } else { - assert(columns != nullptr); - columns->SetPlainValue(result); } } } @@ -2457,7 +2597,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, get_ctx.emplace_back( user_comparator(), merge_operator_, info_log_, db_statistics_, iter->s->ok() ? 
GetContext::kNotFound : GetContext::kMerge, - iter->ukey_with_ts, iter->value, /*columns=*/nullptr, iter->timestamp, + iter->ukey_with_ts, iter->value, iter->columns, iter->timestamp, nullptr, &(iter->merge_context), true, &iter->max_covering_tombstone_seq, clock_, nullptr, merge_operator_ ? &pinned_iters_mgr : nullptr, callback, @@ -2480,7 +2620,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end()); #if USE_COROUTINES if (read_options.async_io && read_options.optimize_multiget_for_io && - using_coroutines()) { + using_coroutines() && use_async_io_) { s = MultiGetAsync(read_options, range, &blob_ctxs); } else #endif // USE_COROUTINES @@ -2506,7 +2646,7 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, // Avoid using the coroutine version if we're looking in a L0 file, since // L0 files won't be parallelized anyway. The regular synchronous version // is faster. - if (!read_options.async_io || !using_coroutines() || + if (!read_options.async_io || !using_coroutines() || !use_async_io_ || fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) { if (f) { bool skip_filters = @@ -2550,7 +2690,8 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, read_options, *internal_comparator(), *f->file_metadata, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle); + fp.GetHitFileLevel(), &file_range, &table_handle, + mutable_cf_options_.block_protection_bytes_per_key); skip_range_deletions = true; if (status.ok()) { skip_filters = true; @@ -2677,23 +2818,26 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, } // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; - std::string* str_value = - iter->value != nullptr ? iter->value->GetSelf() : nullptr; // `op_failure_scope` (an output parameter) is not provided (set to // nullptr) since a failure must be propagated regardless of its value. *status = MergeHelper::TimedFullMerge( - merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(), - str_value, info_log_, db_statistics_, clock_, - /* result_operand */ nullptr, /* update_num_ops_stats */ true, + merge_operator_, user_key, MergeHelper::kNoBaseValue, + iter->merge_context.GetOperands(), info_log_, db_statistics_, clock_, + /* update_num_ops_stats */ true, + iter->value ? 
iter->value->GetSelf() : nullptr, iter->columns, /* op_failure_scope */ nullptr); if (LIKELY(iter->value != nullptr)) { iter->value->PinSelf(); range->AddValueSize(iter->value->size()); - range->MarkKeyDone(iter); - if (range->GetValueSize() > read_options.value_size_soft_limit) { - s = Status::Aborted(); - break; - } + } else { + assert(iter->columns); + range->AddValueSize(iter->columns->serialized_size()); + } + + range->MarkKeyDone(iter); + if (range->GetValueSize() > read_options.value_size_soft_limit) { + s = Status::Aborted(); + break; } } else { range->MarkKeyDone(iter); @@ -2736,16 +2880,17 @@ Status Version::ProcessBatch( while (f) { MultiGetRange file_range = fp.CurrentFileRange(); TableCache::TypedHandle* table_handle = nullptr; - bool skip_filters = IsFilterSkipped(static_cast(fp.GetHitFileLevel()), - read_options, - fp.IsHitFileLastInLevel()); + bool skip_filters = + IsFilterSkipped(static_cast(fp.GetHitFileLevel()), read_options, + fp.IsHitFileLastInLevel()); bool skip_range_deletions = false; if (!skip_filters) { Status status = table_cache_->MultiGetFilter( read_options, *internal_comparator(), *f->file_metadata, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - fp.GetHitFileLevel(), &file_range, &table_handle); + fp.GetHitFileLevel(), &file_range, &table_handle, + mutable_cf_options_.block_protection_bytes_per_key); if (status.ok()) { skip_filters = true; skip_range_deletions = true; @@ -2987,24 +3132,26 @@ void VersionStorageInfo::PrepareForVersionAppend( } void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, bool update_stats) { TEST_SYNC_POINT_CALLBACK( "Version::PrepareAppend:forced_check", reinterpret_cast(&storage_info_.force_consistency_checks_)); if (update_stats) { - UpdateAccumulatedStats(); + UpdateAccumulatedStats(read_options); } storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options); } -bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { +bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta) { if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) { return false; } std::shared_ptr tp; - Status s = GetTableProperties(&tp, file_meta); + Status s = GetTableProperties(read_options, &tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { ROCKS_LOG_ERROR(vset_->db_options_->info_log, @@ -3049,7 +3196,7 @@ void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) { } } -void Version::UpdateAccumulatedStats() { +void Version::UpdateAccumulatedStats(const ReadOptions& read_options) { // maximum number of table properties loaded from files. const int kMaxInitCount = 20; int init_count = 0; @@ -3067,7 +3214,7 @@ void Version::UpdateAccumulatedStats() { level < storage_info_.num_levels_ && init_count < kMaxInitCount; ++level) { for (auto* file_meta : storage_info_.files_[level]) { - if (MaybeInitializeFileMetaData(file_meta)) { + if (MaybeInitializeFileMetaData(read_options, file_meta)) { // each FileMeta will be initialized only once. 
storage_info_.UpdateAccumulatedStats(file_meta); // when option "max_open_files" is -1, all the file metadata has @@ -3092,7 +3239,8 @@ void Version::UpdateAccumulatedStats() { storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { for (int i = static_cast(storage_info_.files_[level].size()) - 1; storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { - if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) { + if (MaybeInitializeFileMetaData(read_options, + storage_info_.files_[level][i])) { storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]); } } @@ -3256,6 +3404,55 @@ uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions, } return ttl_expired_files_count; } + +bool ShouldChangeFileTemperature(const ImmutableOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + const std::vector& files) { + const std::vector& ages = + mutable_cf_options.compaction_options_fifo + .file_temperature_age_thresholds; + if (ages.empty()) { + return false; + } + if (files.empty()) { + return false; + } + int64_t _current_time; + auto status = ioptions.clock->GetCurrentTime(&_current_time); + const uint64_t current_time = static_cast(_current_time); + // We use oldest_ancestor_time of a file to be the estimate age of + // the file just older than it. This is the same logic used in + // FIFOCompactionPicker::PickTemperatureChangeCompaction(). + if (status.ok() && current_time >= ages[0].age) { + uint64_t create_time_threshold = current_time - ages[0].age; + Temperature target_temp; + assert(files.size() >= 1); + for (size_t index = files.size() - 1; index >= 1; --index) { + FileMetaData* cur_file = files[index]; + FileMetaData* prev_file = files[index - 1]; + if (!cur_file->being_compacted) { + uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime(); + if (oldest_ancestor_time == kUnknownOldestAncesterTime) { + return false; + } + if (oldest_ancestor_time > create_time_threshold) { + return false; + } + target_temp = ages[0].temperature; + for (size_t i = 1; i < ages.size(); ++i) { + if (current_time >= ages[i].age && + oldest_ancestor_time <= current_time - ages[i].age) { + target_temp = ages[i].temperature; + } + } + if (cur_file->temperature != target_temp) { + return true; + } + } + } + } + return false; +} } // anonymous namespace void VersionStorageInfo::ComputeCompactionScore( @@ -3266,11 +3463,12 @@ void VersionStorageInfo::ComputeCompactionScore( // the level's target size, and 1.0 is the threshold for triggering // compaction. Higher score means higher prioritization. // Now we keep the compaction triggering condition, but consider more - // factors for priorization, while still keeping the 1.0 threshold. + // factors for prioritization, while still keeping the 1.0 threshold. // In order to provide flexibility for reducing score while still // maintaining it to be over 1.0, we scale the original score by 10x // if it is larger than 1.0. const double kScoreScale = 10.0; + int max_output_level = MaxOutputLevel(immutable_options.allow_ingest_behind); for (int level = 0; level <= MaxInputLevel(); level++) { double score; if (level == 0) { @@ -3298,8 +3496,8 @@ void VersionStorageInfo::ComputeCompactionScore( // For universal compaction, we use level0 score to indicate // compaction score for the whole DB. Adding other levels as if // they are L0 files. 
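Going back to the FIFO logic in ShouldChangeFileTemperature() above: the file_temperature_age_thresholds table maps an estimated file age to a target temperature, and the last threshold the file has aged past wins. The snippet below is a standalone sketch of that selection loop; it assumes the caller has already verified the file clears the first threshold (as the real function does before entering the loop), and the enum and field names are simplified stand-ins for the RocksDB options.

// Sketch only: pick the target temperature from an age-threshold table.
#include <cstdint>
#include <iostream>
#include <vector>

enum class Temp { kUnknown, kWarm, kCold };

struct AgeThreshold {
  uint64_t age;  // seconds
  Temp temperature;
};

Temp TargetTemperature(const std::vector<AgeThreshold>& ages,
                       uint64_t current_time, uint64_t oldest_ancestor_time) {
  // Caller has already checked the file is older than ages[0].age.
  Temp target = ages[0].temperature;
  for (size_t i = 1; i < ages.size(); ++i) {
    if (current_time >= ages[i].age &&
        oldest_ancestor_time <= current_time - ages[i].age) {
      target = ages[i].temperature;
    }
  }
  return target;
}

int main() {
  const std::vector<AgeThreshold> ages = {{3600, Temp::kWarm},
                                          {7 * 86400, Temp::kCold}};
  const uint64_t now = 1700000000;
  // A file estimated to be ~2 hours old stays on the first (warm) tier...
  std::cout << (TargetTemperature(ages, now, now - 2 * 3600) == Temp::kWarm)
            << "\n";
  // ...while one estimated to be ~10 days old falls through to cold.
  std::cout << (TargetTemperature(ages, now, now - 10 * 86400) == Temp::kCold)
            << "\n";
  return 0;
}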
- for (int i = 1; i < num_levels(); i++) { - // Its possible that a subset of the files in a level may be in a + for (int i = 1; i <= max_output_level; i++) { + // It's possible that a subset of the files in a level may be in a // compaction, due to delete triggered compaction or trivial move. // In that case, the below check may not catch a level being // compacted as it only checks the first file. The worst that can @@ -3313,22 +3511,25 @@ void VersionStorageInfo::ComputeCompactionScore( if (compaction_style_ == kCompactionStyleFIFO) { score = static_cast(total_size) / mutable_cf_options.compaction_options_fifo.max_table_files_size; - if (mutable_cf_options.compaction_options_fifo.allow_compaction || - mutable_cf_options.compaction_options_fifo.age_for_warm > 0) { - // Warm tier move can happen at any time. It's too expensive to - // check very file's timestamp now. For now, just trigger it - // slightly more frequently than FIFO compaction so that this - // happens first. + if (score < 1 && + mutable_cf_options.compaction_options_fifo.allow_compaction) { score = std::max( static_cast(num_sorted_runs) / mutable_cf_options.level0_file_num_compaction_trigger, score); } - if (mutable_cf_options.ttl > 0) { - score = std::max( - static_cast(GetExpiredTtlFilesCount( - immutable_options, mutable_cf_options, files_[level])), - score); + if (score < 1 && mutable_cf_options.ttl > 0) { + score = + std::max(static_cast(GetExpiredTtlFilesCount( + immutable_options, mutable_cf_options, files_[0])), + score); + } + if (score < 1 && + ShouldChangeFileTemperature(immutable_options, mutable_cf_options, + files_[0])) { + // For FIFO, just need a large enough score to trigger compaction. + const double kScoreForNeedCompaction = 1.1; + score = kScoreForNeedCompaction; } } else { score = static_cast(num_sorted_runs) / @@ -3337,27 +3538,29 @@ void VersionStorageInfo::ComputeCompactionScore( // Level-based involves L0->L0 compactions that can lead to oversized // L0 files. Take into account size as well to avoid later giant // compactions to the base level. - // If score in L0 is always too high, L0->L1 will always be - // prioritized over L1->L2 compaction and L1 will accumulate to - // too large. But if L0 score isn't high enough, L0 will accumulate - // and data is not moved to L1 fast enough. With potential L0->L0 - // compaction, number of L0 files aren't always an indication of - // L0 oversizing, and we also need to consider total size of L0. + // If score in L0 is always too high, L0->LBase will always be + // prioritized over LBase->LBase+1 compaction and LBase will + // accumulate to too large. But if L0 score isn't high enough, L0 will + // accumulate and data is not moved to LBase fast enough. The score + // calculation below takes into account L0 size vs LBase size. if (immutable_options.level_compaction_dynamic_level_bytes) { if (total_size >= mutable_cf_options.max_bytes_for_level_base) { // When calculating estimated_compaction_needed_bytes, we assume // L0 is qualified as pending compactions. We will need to make // sure that it qualifies for compaction. - // It might be guafanteed by logic below anyway, but we are + // It might be guaranteed by logic below anyway, but we are // explicit here to make sure we don't stop writes with no // compaction scheduled. score = std::max(score, 1.01); } if (total_size > level_max_bytes_[base_level_]) { - // In this case, we compare L0 size with actual L1 size and make - // sure score is more than 1.0 (10.0 after scaled) if L0 is larger - // than L1. 
Since in this case L1 score is lower than 10.0, L0->L1 - // is prioritized over L1->L2. + // In this case, we compare L0 size with actual LBase size and + // make sure score is more than 1.0 (10.0 after scaled) if L0 is + // larger than LBase. Since LBase score = LBase size / + // (target size + total_downcompact_bytes) where + // total_downcompact_bytes = total_size > LBase size, + // LBase score is lower than 10.0. So L0->LBase is prioritized + // over LBase -> LBase+1. uint64_t base_level_size = 0; for (auto f : files_[base_level_]) { base_level_size += f->compensated_file_size; @@ -3377,7 +3580,7 @@ void VersionStorageInfo::ComputeCompactionScore( } } } - } else { + } else { // level > 0 // Compute the ratio of current size to size limit. uint64_t level_bytes_no_compacting = 0; uint64_t level_total_bytes = 0; @@ -3387,21 +3590,36 @@ void VersionStorageInfo::ComputeCompactionScore( level_bytes_no_compacting += f->compensated_file_size; } } - if (!immutable_options.level_compaction_dynamic_level_bytes || - level_bytes_no_compacting < MaxBytesForLevel(level)) { + if (!immutable_options.level_compaction_dynamic_level_bytes) { score = static_cast(level_bytes_no_compacting) / MaxBytesForLevel(level); } else { - // If there are a large mount of data being compacted down to the - // current level soon, we would de-prioritize compaction from - // a level where the incoming data would be a large ratio. We do - // it by dividing level size not by target level size, but - // the target size and the incoming compaction bytes. - score = static_cast(level_bytes_no_compacting) / - (MaxBytesForLevel(level) + total_downcompact_bytes) * - kScoreScale; + if (level_bytes_no_compacting < MaxBytesForLevel(level)) { + score = static_cast(level_bytes_no_compacting) / + MaxBytesForLevel(level); + } else { + // If there are a large mount of data being compacted down to the + // current level soon, we would de-prioritize compaction from + // a level where the incoming data would be a large ratio. We do + // it by dividing level size not by target level size, but + // the target size and the incoming compaction bytes. + score = static_cast(level_bytes_no_compacting) / + (MaxBytesForLevel(level) + total_downcompact_bytes) * + kScoreScale; + } + // Drain unnecessary levels, but with lower priority compared to + // when L0 is eligible. Only non-empty levels can be unnecessary. + // If there is no unnecessary levels, lowest_unnecessary_level_ = -1. 
+ if (level_bytes_no_compacting > 0 && + level <= lowest_unnecessary_level_) { + score = std::max( + score, kScoreScale * + (1.001 + 0.001 * (lowest_unnecessary_level_ - level))); + } } - if (level_total_bytes > MaxBytesForLevel(level)) { + if (level <= lowest_unnecessary_level_) { + total_downcompact_bytes += level_total_bytes; + } else if (level_total_bytes > MaxBytesForLevel(level)) { total_downcompact_bytes += static_cast(level_total_bytes - MaxBytesForLevel(level)); } @@ -3424,37 +3642,29 @@ void VersionStorageInfo::ComputeCompactionScore( } } } - ComputeFilesMarkedForCompaction(); - if (!immutable_options.allow_ingest_behind) { - ComputeBottommostFilesMarkedForCompaction(); - } - if (mutable_cf_options.ttl > 0) { - ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); - } - if (mutable_cf_options.periodic_compaction_seconds > 0) { - ComputeFilesMarkedForPeriodicCompaction( - immutable_options, mutable_cf_options.periodic_compaction_seconds); - } - - if (mutable_cf_options.enable_blob_garbage_collection && - mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 && - mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) { - ComputeFilesMarkedForForcedBlobGC( - mutable_cf_options.blob_garbage_collection_age_cutoff, - mutable_cf_options.blob_garbage_collection_force_threshold); - } + ComputeFilesMarkedForCompaction(max_output_level); + ComputeBottommostFilesMarkedForCompaction( + immutable_options.allow_ingest_behind); + ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); + ComputeFilesMarkedForPeriodicCompaction( + immutable_options, mutable_cf_options.periodic_compaction_seconds, + max_output_level); + ComputeFilesMarkedForForcedBlobGC( + mutable_cf_options.blob_garbage_collection_age_cutoff, + mutable_cf_options.blob_garbage_collection_force_threshold, + mutable_cf_options.enable_blob_garbage_collection); EstimateCompactionBytesNeeded(mutable_cf_options); } -void VersionStorageInfo::ComputeFilesMarkedForCompaction() { +void VersionStorageInfo::ComputeFilesMarkedForCompaction(int last_level) { files_marked_for_compaction_.clear(); int last_qualify_level = 0; // Do not include files from the last level with data // If table properties collector suggests a file on the last level, // we should not move it to a new level. 
- for (int level = num_levels() - 1; level >= 1; level--) { + for (int level = last_level; level >= 1; level--) { if (!files_[level].empty()) { last_qualify_level = level - 1; break; @@ -3472,9 +3682,10 @@ void VersionStorageInfo::ComputeFilesMarkedForCompaction() { void VersionStorageInfo::ComputeExpiredTtlFiles( const ImmutableOptions& ioptions, const uint64_t ttl) { - assert(ttl > 0); - expired_ttl_files_.clear(); + if (ttl == 0 || compaction_style_ != CompactionStyle::kCompactionStyleLevel) { + return; + } int64_t _current_time; auto status = ioptions.clock->GetCurrentTime(&_current_time); @@ -3498,10 +3709,11 @@ void VersionStorageInfo::ComputeExpiredTtlFiles( void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( const ImmutableOptions& ioptions, - const uint64_t periodic_compaction_seconds) { - assert(periodic_compaction_seconds > 0); - + const uint64_t periodic_compaction_seconds, int last_level) { files_marked_for_periodic_compaction_.clear(); + if (periodic_compaction_seconds == 0) { + return; + } int64_t temp_current_time; auto status = ioptions.clock->GetCurrentTime(&temp_current_time); @@ -3519,7 +3731,17 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( const uint64_t allowed_time_limit = current_time - periodic_compaction_seconds; - for (int level = 0; level < num_levels(); level++) { + // Find the adjust_allowed_time_limit such that it includes files that are + // going to expire by the time next daily offpeak starts. + const OffpeakTimeInfo offpeak_time_info = + offpeak_time_option_.GetOffpeakTimeInfo(current_time); + const uint64_t adjusted_allowed_time_limit = + allowed_time_limit + + (offpeak_time_info.is_now_offpeak + ? offpeak_time_info.seconds_till_next_offpeak_start + : 0); + + for (int level = 0; level <= last_level; level++) { for (auto f : files_[level]) { if (!f->being_compacted) { // Compute a file's modification time in the following order: @@ -3545,7 +3767,7 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( } } if (file_modification_time > 0 && - file_modification_time < allowed_time_limit) { + file_modification_time < adjusted_allowed_time_limit) { files_marked_for_periodic_compaction_.emplace_back(level, f); } } @@ -3555,8 +3777,14 @@ void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction( void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC( double blob_garbage_collection_age_cutoff, - double blob_garbage_collection_force_threshold) { + double blob_garbage_collection_force_threshold, + bool enable_blob_garbage_collection) { files_marked_for_forced_blob_gc_.clear(); + if (!(enable_blob_garbage_collection && + blob_garbage_collection_age_cutoff > 0.0 && + blob_garbage_collection_force_threshold < 1.0)) { + return; + } if (blob_files_.empty()) { return; @@ -4013,26 +4241,65 @@ void VersionStorageInfo::GenerateFileLocationIndex() { } } -void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) { +void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum, + bool allow_ingest_behind) { assert(seqnum >= oldest_snapshot_seqnum_); oldest_snapshot_seqnum_ = seqnum; if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) { - ComputeBottommostFilesMarkedForCompaction(); + ComputeBottommostFilesMarkedForCompaction(allow_ingest_behind); } } -void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() { +void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction( + bool allow_ingest_behind) { bottommost_files_marked_for_compaction_.clear(); 
bottommost_files_mark_threshold_ = kMaxSequenceNumber; + if (allow_ingest_behind) { + return; + } + // If a file's creation time is larger than creation_time_ub, + // it is too new to be marked for compaction. + int64_t creation_time_ub = 0; + bool needs_delay = bottommost_file_compaction_delay_ > 0; + if (needs_delay) { + int64_t current_time = 0; + clock_->GetCurrentTime(¤t_time).PermitUncheckedError(); + // Note that if GetCurrentTime() fails, current_time will be 0. + // We will treat it as is and treat all files as too new. + // The subtraction will not underflow since + // bottommost_file_compaction_delay_ is of type uint32_t. + creation_time_ub = + current_time - static_cast(bottommost_file_compaction_delay_); + } + for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && level_and_file.second->fd.largest_seqno != 0 && level_and_file.second->num_deletions > 1) { // largest_seqno might be nonzero due to containing the final key in an - // earlier compaction, whose seqnum we didn't zero out. Multiple deletions - // ensures the file really contains deleted or overwritten keys. + // earlier compaction, whose seqnum we didn't zero out. if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) { - bottommost_files_marked_for_compaction_.push_back(level_and_file); + if (!needs_delay) { + bottommost_files_marked_for_compaction_.push_back(level_and_file); + } else if (creation_time_ub > 0) { + int64_t creation_time = static_cast( + level_and_file.second->TryGetFileCreationTime()); + if (creation_time == kUnknownFileCreationTime || + creation_time <= creation_time_ub) { + bottommost_files_marked_for_compaction_.push_back(level_and_file); + } else { + // Just ignore this file for both + // bottommost_files_marked_for_compaction_ and + // bottommost_files_mark_threshold_. The next time + // this method is called, it will try this file again. The method + // is called after a new Version creation (compaction, flush, etc.), + // after a compaction is picked, and after a snapshot newer than + // bottommost_files_mark_threshold_ is released. + } + } else { + // creation_time_ub <= 0, all files are too new to be marked for + // compaction. + } } else { bottommost_files_mark_threshold_ = std::min(bottommost_files_mark_threshold_, @@ -4475,6 +4742,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, } } } else { + assert(ioptions.compaction_style == kCompactionStyleLevel); uint64_t max_level_size = 0; int first_non_empty_level = -1; @@ -4499,11 +4767,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, level_max_bytes_[i] = std::numeric_limits::max(); } + lowest_unnecessary_level_ = -1; if (max_level_size == 0) { // No data for L1 and up. L0 compacts to last level directly. // No compaction from L1+ needs to be scheduled. 
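// Sketch of the delay gating introduced above: a bottommost file whose
// deletions are no longer protected by a snapshot is only marked for
// compaction once it is older than bottommost_file_compaction_delay_. Names
// follow the hunk; the free function and the sentinel value are illustrative
// assumptions, not the RocksDB definitions.
#include <cstdint>

namespace sketch {

constexpr int64_t kUnknownFileCreationTime = 0;  // assumed sentinel

bool ShouldMarkBottommostFile(int64_t current_time,  // 0 if the clock failed
                              uint32_t bottommost_file_compaction_delay,
                              int64_t file_creation_time) {
  if (bottommost_file_compaction_delay == 0) {
    return true;  // no delay configured
  }
  // If the clock failed, current_time is 0, creation_time_ub <= 0, and every
  // file is treated as too new, so nothing gets marked this round.
  const int64_t creation_time_ub =
      current_time - static_cast<int64_t>(bottommost_file_compaction_delay);
  if (creation_time_ub <= 0) {
    return false;
  }
  // Files with an unknown creation time are marked rather than starved.
  return file_creation_time == kUnknownFileCreationTime ||
         file_creation_time <= creation_time_ub;
}

}  // namespace sketch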
base_level_ = num_levels_ - 1; } else { + assert(first_non_empty_level >= 1); uint64_t base_bytes_max = options.max_bytes_for_level_base; uint64_t base_bytes_min = static_cast( base_bytes_max / options.max_bytes_for_level_multiplier); @@ -4514,20 +4784,41 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, // Round up after dividing cur_level_size = static_cast( cur_level_size / options.max_bytes_for_level_multiplier); + if (lowest_unnecessary_level_ == -1 && + cur_level_size <= base_bytes_min && + (ioptions.preclude_last_level_data_seconds == 0 || + i < num_levels_ - 2)) { + // When per_key_placement is enabled, the penultimate level is + // necessary. + lowest_unnecessary_level_ = i; + } } // Calculate base level and its size. uint64_t base_level_size; if (cur_level_size <= base_bytes_min) { + // If per_key_placement is not enabled, + // either there is only one non-empty level after level 0, + // which can less than base_bytes_min AND necessary, + // or there is some unnecessary level. + assert(first_non_empty_level == num_levels_ - 1 || + ioptions.preclude_last_level_data_seconds > 0 || + lowest_unnecessary_level_ != -1); // Case 1. If we make target size of last level to be max_level_size, // target size of the first non-empty level would be smaller than // base_bytes_min. We set it be base_bytes_min. base_level_size = base_bytes_min + 1U; base_level_ = first_non_empty_level; - ROCKS_LOG_INFO(ioptions.logger, - "More existing levels in DB than needed. " - "max_bytes_for_level_multiplier may not be guaranteed."); + if (base_level_ < num_levels_ - 1) { + ROCKS_LOG_INFO( + ioptions.logger, + "More existing levels in DB than needed: all non-zero " + "levels <= level %d are unnecessary. " + "max_bytes_for_level_multiplier may not be guaranteed.", + lowest_unnecessary_level_); + } } else { + assert(lowest_unnecessary_level_ == -1); // Find base level (where L0 data is compacted to). 
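// Sketch of how the loop above walks target sizes down from the last level
// and records the lowest level that is no longer needed. Option names follow
// the hunk; the function is a simplified illustration and ignores per-key
// placement (the real code keeps the penultimate level when
// preclude_last_level_data_seconds > 0).
#include <cstdint>

namespace sketch {

int FindLowestUnnecessaryLevel(uint64_t max_level_size, int num_levels,
                               int first_non_empty_level,
                               uint64_t max_bytes_for_level_base,
                               double max_bytes_for_level_multiplier) {
  const uint64_t base_bytes_min = static_cast<uint64_t>(
      max_bytes_for_level_base / max_bytes_for_level_multiplier);
  int lowest_unnecessary_level = -1;
  uint64_t cur_level_size = max_level_size;
  // Walk from the second-to-last level up toward the first non-empty level,
  // shrinking the target size by the multiplier at every step.
  for (int i = num_levels - 2; i >= first_non_empty_level; i--) {
    cur_level_size = static_cast<uint64_t>(cur_level_size /
                                           max_bytes_for_level_multiplier);
    if (lowest_unnecessary_level == -1 && cur_level_size <= base_bytes_min) {
      // Everything at or below this level would fit under the base level's
      // minimum target, so the level is "unnecessary" and will be drained.
      lowest_unnecessary_level = i;
    }
  }
  return lowest_unnecessary_level;
}

}  // namespace sketch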
base_level_ = first_non_empty_level; while (base_level_ > 1 && cur_level_size > base_bytes_max) { @@ -4540,7 +4831,7 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions, assert(base_level_ == 1); base_level_size = base_bytes_max; } else { - base_level_size = cur_level_size; + base_level_size = std::max(static_cast(1), cur_level_size); } } @@ -4781,15 +5072,15 @@ struct VersionSet::ManifestWriter { Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) { assert(edit); - if (edit->is_in_atomic_group_) { + if (edit->IsInAtomicGroup()) { TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup"); if (replay_buffer_.empty()) { - replay_buffer_.resize(edit->remaining_entries_ + 1); + replay_buffer_.resize(edit->GetRemainingEntries() + 1); TEST_SYNC_POINT_CALLBACK( "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit); } read_edits_in_atomic_group_++; - if (read_edits_in_atomic_group_ + edit->remaining_entries_ != + if (read_edits_in_atomic_group_ + edit->GetRemainingEntries() != static_cast(replay_buffer_.size())) { TEST_SYNC_POINT_CALLBACK( "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit); @@ -4824,15 +5115,14 @@ void AtomicGroupReadBuffer::Clear() { replay_buffer_.clear(); } -VersionSet::VersionSet(const std::string& dbname, - const ImmutableDBOptions* _db_options, - const FileOptions& storage_options, Cache* table_cache, - WriteBufferManager* write_buffer_manager, - WriteController* write_controller, - BlockCacheTracer* const block_cache_tracer, - const std::shared_ptr& io_tracer, - const std::string& db_id, - const std::string& db_session_id) +VersionSet::VersionSet( + const std::string& dbname, const ImmutableDBOptions* _db_options, + const FileOptions& storage_options, Cache* table_cache, + WriteBufferManager* write_buffer_manager, WriteController* write_controller, + BlockCacheTracer* const block_cache_tracer, + const std::shared_ptr& io_tracer, const std::string& db_id, + const std::string& db_session_id, const std::string& daily_offpeak_time_utc, + ErrorHandler* const error_handler) : column_family_set_(new ColumnFamilySet( dbname, _db_options, storage_options, table_cache, write_buffer_manager, write_controller, block_cache_tracer, io_tracer, @@ -4857,7 +5147,9 @@ VersionSet::VersionSet(const std::string& dbname, file_options_(storage_options), block_cache_tracer_(block_cache_tracer), io_tracer_(io_tracer), - db_session_id_(db_session_id) {} + db_session_id_(db_session_id), + offpeak_time_option_(OffpeakTimeOption(daily_offpeak_time_utc)), + error_handler_(error_handler) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on @@ -4937,7 +5229,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options) { mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); @@ -4947,9 +5240,17 @@ Status VersionSet::ProcessManifestWrites( assert(manifest_writers_.front() == &first_writer); autovector batch_edits; + // This vector keeps track of the corresponding user-defined timestamp size + // for `batch_edits` side by side, which is only needed for encoding a + // `VersionEdit` that adds new SST files. 
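// Sketch of the atomic-group replay bookkeeping shown above, using the new
// VersionEdit accessors (IsInAtomicGroup / GetRemainingEntries). The tiny
// Edit struct is a stand-in for VersionEdit; only the size arithmetic is the
// point here.
#include <cstdint>
#include <vector>

namespace sketch {

struct Edit {
  bool in_atomic_group = false;
  uint32_t remaining_entries = 0;  // edits still expected after this one
};

class AtomicGroupBuffer {
 public:
  // Returns false if the group is internally inconsistent.
  bool AddEdit(const Edit& e) {
    if (!e.in_atomic_group) return true;  // not buffered here
    if (replay_buffer_.empty()) {
      // The first edit of the group announces the total group size.
      replay_buffer_.resize(e.remaining_entries + 1);
    }
    ++read_edits_in_atomic_group_;
    // Every edit must agree on the group size: edits read so far plus the
    // edits it still expects must equal the announced size.
    if (read_edits_in_atomic_group_ + e.remaining_entries !=
        replay_buffer_.size()) {
      return false;  // corrupted atomic group
    }
    replay_buffer_[read_edits_in_atomic_group_ - 1] = e;
    return true;
  }

 private:
  std::vector<Edit> replay_buffer_;
  uint64_t read_edits_in_atomic_group_ = 0;
};

}  // namespace sketch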
+ // Note that anytime `batch_edits` has new element added or get existing + // element removed, `batch_edits_ts_sz` should be updated too. + autovector> batch_edits_ts_sz; autovector versions; autovector mutable_cf_options_ptrs; std::vector> builder_guards; + autovector*> files_to_quarantine_if_commit_fail; + autovector limbo_descriptor_log_file_number; // Tracking `max_last_sequence` is needed to ensure we write // `VersionEdit::last_sequence_`s in non-decreasing order according to the @@ -4962,6 +5263,7 @@ Status VersionSet::ProcessManifestWrites( // No group commits for column family add or drop LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence); batch_edits.push_back(first_writer.edit_list.front()); + batch_edits_ts_sz.push_back(std::nullopt); } else { auto it = manifest_writers_.cbegin(); size_t group_start = std::numeric_limits::max(); @@ -4982,15 +5284,15 @@ Status VersionSet::ProcessManifestWrites( // don't update, then Recover can report corrupted atomic group because // the `remaining_entries_` do not match. if (!batch_edits.empty()) { - if (batch_edits.back()->is_in_atomic_group_ && - batch_edits.back()->remaining_entries_ > 0) { + if (batch_edits.back()->IsInAtomicGroup() && + batch_edits.back()->GetRemainingEntries() > 0) { assert(group_start < batch_edits.size()); const auto& edit_list = last_writer->edit_list; size_t k = 0; while (k < edit_list.size()) { - if (!edit_list[k]->is_in_atomic_group_) { + if (!edit_list[k]->IsInAtomicGroup()) { break; - } else if (edit_list[k]->remaining_entries_ == 0) { + } else if (edit_list[k]->GetRemainingEntries() == 0) { ++k; break; } @@ -4998,8 +5300,10 @@ Status VersionSet::ProcessManifestWrites( } for (auto i = group_start; i < batch_edits.size(); ++i) { assert(static_cast(k) <= - batch_edits.back()->remaining_entries_); - batch_edits[i]->remaining_entries_ -= static_cast(k); + batch_edits.back()->GetRemainingEntries()); + batch_edits[i]->SetRemainingEntries( + batch_edits[i]->GetRemainingEntries() - + static_cast(k)); } } } @@ -5038,11 +5342,14 @@ Status VersionSet::ProcessManifestWrites( TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion", version); } + const Comparator* ucmp = last_writer->cfd->user_comparator(); + assert(ucmp); + std::optional edit_ts_sz = ucmp->timestamp_size(); for (const auto& e : last_writer->edit_list) { - if (e->is_in_atomic_group_) { - if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ || - (batch_edits.back()->is_in_atomic_group_ && - batch_edits.back()->remaining_entries_ == 0)) { + if (e->IsInAtomicGroup()) { + if (batch_edits.empty() || !batch_edits.back()->IsInAtomicGroup() || + (batch_edits.back()->IsInAtomicGroup() && + batch_edits.back()->GetRemainingEntries() == 0)) { group_start = batch_edits.size(); } } else if (group_start != std::numeric_limits::max()) { @@ -5058,6 +5365,7 @@ Status VersionSet::ProcessManifestWrites( return s; } batch_edits.push_back(e); + batch_edits_ts_sz.push_back(edit_ts_sz); } } for (int i = 0; i < static_cast(versions.size()); ++i) { @@ -5080,7 +5388,7 @@ Status VersionSet::ProcessManifestWrites( // remaining_entries_. 
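// Sketch of keeping the timestamp-size vector in lockstep with batch_edits,
// as the comment above requires, and of passing the per-edit timestamp size
// to the encoder. VersionEditLike and EncodeEdit are illustrative stand-ins,
// not RocksDB types or APIs.
#include <cstddef>
#include <optional>
#include <string>
#include <vector>

namespace sketch {

struct VersionEditLike {
  std::string payload;
};

// Stand-in for VersionEdit::EncodeTo(record, ts_sz).
std::string EncodeEdit(const VersionEditLike& e, std::optional<size_t> ts_sz) {
  std::string record = e.payload;
  if (ts_sz.has_value() && *ts_sz > 0) {
    record += "|ts_sz=" + std::to_string(*ts_sz);
  }
  return record;
}

void BatchAndEncode(const std::vector<VersionEditLike>& cf_edits,
                    size_t cf_timestamp_size,
                    std::vector<VersionEditLike>* batch_edits,
                    std::vector<std::optional<size_t>>* batch_edits_ts_sz,
                    std::vector<std::string>* records) {
  for (const auto& e : cf_edits) {
    // Always push to both vectors together so their indexes stay aligned.
    batch_edits->push_back(e);
    batch_edits_ts_sz->push_back(cf_timestamp_size);
  }
  for (size_t i = 0; i < batch_edits->size(); ++i) {
    records->push_back(EncodeEdit((*batch_edits)[i], (*batch_edits_ts_sz)[i]));
  }
}

}  // namespace sketch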
size_t k = 0; while (k < batch_edits.size()) { - while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) { + while (k < batch_edits.size() && !batch_edits[k]->IsInAtomicGroup()) { ++k; } if (k == batch_edits.size()) { @@ -5088,19 +5396,19 @@ Status VersionSet::ProcessManifestWrites( } size_t i = k; while (i < batch_edits.size()) { - if (!batch_edits[i]->is_in_atomic_group_) { + if (!batch_edits[i]->IsInAtomicGroup()) { break; } - assert(i - k + batch_edits[i]->remaining_entries_ == - batch_edits[k]->remaining_entries_); - if (batch_edits[i]->remaining_entries_ == 0) { + assert(i - k + batch_edits[i]->GetRemainingEntries() == + batch_edits[k]->GetRemainingEntries()); + if (batch_edits[i]->GetRemainingEntries() == 0) { ++i; break; } ++i; } - assert(batch_edits[i - 1]->is_in_atomic_group_); - assert(0 == batch_edits[i - 1]->remaining_entries_); + assert(batch_edits[i - 1]->IsInAtomicGroup()); + assert(0 == batch_edits[i - 1]->GetRemainingEntries()); std::vector tmp; for (size_t j = k; j != i; ++j) { tmp.emplace_back(batch_edits[j]); @@ -5222,7 +5530,8 @@ Status VersionSet::ProcessManifestWrites( true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, mutable_cf_options_ptrs[i]->prefix_extractor, - MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i])); + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options, + mutable_cf_options_ptrs[i]->block_protection_bytes_per_key); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -5267,7 +5576,8 @@ Status VersionSet::ProcessManifestWrites( constexpr bool update_stats = true; for (int i = 0; i < static_cast(versions.size()); ++i) { - versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats); + versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], read_options, + update_stats); } } } @@ -5277,9 +5587,13 @@ Status VersionSet::ProcessManifestWrites( #ifndef NDEBUG size_t idx = 0; #endif - for (auto& e : batch_edits) { + assert(batch_edits.size() == batch_edits_ts_sz.size()); + for (size_t bidx = 0; bidx < batch_edits.size(); bidx++) { + auto& e = batch_edits[bidx]; + files_to_quarantine_if_commit_fail.push_back( + e->GetFilesToQuarantineIfCommitFail()); std::string record; - if (!e->EncodeTo(&record)) { + if (!e->EncodeTo(&record, batch_edits_ts_sz[bidx])) { s = Status::Corruption("Unable to encode VersionEdit:" + e->DebugString(true)); break; @@ -5307,7 +5621,8 @@ Status VersionSet::ProcessManifestWrites( if (s.ok() && db_options_->replication_log_listener) { ReplicationLogRecord rlr; rlr.type = ReplicationLogRecord::kManifestWrite; - s = SerializeReplicationLogManifestWrite(&rlr.contents, batch_edits); + s = SerializeReplicationLogManifestWrite(&rlr.contents, batch_edits, + batch_edits_ts_sz); if (s.ok()) { db_options_->replication_log_listener->OnReplicationLogRecord(rlr); } @@ -5338,6 +5653,11 @@ Status VersionSet::ProcessManifestWrites( dir_contains_current_file); if (!io_s.ok()) { s = io_s; + // Quarantine old manifest file in case new manifest file's CURRENT file + // wasn't created successfully and the old manifest is needed. 
+ limbo_descriptor_log_file_number.push_back(manifest_file_number_); + files_to_quarantine_if_commit_fail.push_back( + &limbo_descriptor_log_file_number); } } @@ -5346,7 +5666,7 @@ Status VersionSet::ProcessManifestWrites( new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - if (first_writer.edit_list.front()->is_column_family_drop_) { + if (first_writer.edit_list.front()->IsColumnFamilyDrop()) { TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); @@ -5374,9 +5694,16 @@ Status VersionSet::ProcessManifestWrites( if (!io_s.ok()) { if (io_status_.ok()) { io_status_ = io_s; + if (error_handler_) { + error_handler_->AddFilesToQuarantine( + files_to_quarantine_if_commit_fail); + } } } else if (!io_status_.ok()) { io_status_ = io_s; + if (error_handler_) { + error_handler_->ClearFilesToQuarantine(); + } } // Append the old manifest file to the obsolete_manifest_ list to be deleted @@ -5388,12 +5715,13 @@ Status VersionSet::ProcessManifestWrites( // Install the new versions if (s.ok()) { - if (first_writer.edit_list.front()->is_column_family_add_) { + if (first_writer.edit_list.front()->IsColumnFamilyAdd()) { assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); assert(max_last_sequence == descriptor_last_sequence_); - CreateColumnFamily(*new_cf_options, first_writer.edit_list.front()); - } else if (first_writer.edit_list.front()->is_column_family_drop_) { + CreateColumnFamily(*new_cf_options, read_options, + first_writer.edit_list.front()); + } else if (first_writer.edit_list.front()->IsColumnFamilyDrop()) { assert(batch_edits.size() == 1); assert(max_last_sequence == descriptor_last_sequence_); first_writer.cfd->SetDropped(); @@ -5406,22 +5734,22 @@ Status VersionSet::ProcessManifestWrites( for (const auto& e : batch_edits) { ColumnFamilyData* cfd = nullptr; if (!e->IsColumnFamilyManipulation()) { - cfd = column_family_set_->GetColumnFamily(e->column_family_); + cfd = column_family_set_->GetColumnFamily(e->GetColumnFamily()); // e would not have been added to batch_edits if its corresponding // column family is dropped. 
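// Sketch of the quarantine flow added above: if committing the manifest (or
// its CURRENT file) fails, the file numbers referenced by the un-committed
// edits plus the old manifest are handed to the error handler so obsolete-file
// deletion cannot remove them before recovery; a later successful commit
// clears the quarantine. The ErrorHandler method names come from the hunk;
// the surrounding types are simplified stand-ins, and the real code only
// quarantines on the first failure transition tracked via io_status_.
#include <cstdint>
#include <vector>

namespace sketch {

class ErrorHandlerLike {
 public:
  void AddFilesToQuarantine(
      const std::vector<const std::vector<uint64_t>*>& file_lists) {
    for (const auto* list : file_lists) {
      quarantined_.insert(quarantined_.end(), list->begin(), list->end());
    }
  }
  void ClearFilesToQuarantine() { quarantined_.clear(); }

 private:
  std::vector<uint64_t> quarantined_;
};

void FinishManifestWrite(bool io_ok, ErrorHandlerLike* error_handler,
                         const std::vector<const std::vector<uint64_t>*>&
                             files_to_quarantine_if_commit_fail) {
  if (error_handler == nullptr) {
    return;
  }
  if (!io_ok) {
    error_handler->AddFilesToQuarantine(files_to_quarantine_if_commit_fail);
  } else {
    // A successful commit supersedes any earlier failure's quarantine.
    error_handler->ClearFilesToQuarantine();
  }
}

}  // namespace sketch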
assert(cfd); } if (cfd) { - if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) { - cfd->SetLogNumber(e->log_number_); + if (e->HasLogNumber() && e->GetLogNumber() > cfd->GetLogNumber()) { + cfd->SetLogNumber(e->GetLogNumber()); } if (e->HasFullHistoryTsLow()) { cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow()); } } - if (e->has_min_log_number_to_keep_) { + if (e->HasMinLogNumberToKeep()) { last_min_log_number_to_keep = - std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_); + std::max(last_min_log_number_to_keep, e->GetMinLogNumberToKeep()); } } @@ -5438,7 +5766,7 @@ Status VersionSet::ProcessManifestWrites( descriptor_last_sequence_ = max_last_sequence; manifest_file_number_ = pending_manifest_file_number_; manifest_file_size_ = new_manifest_file_size; - prev_log_number_ = first_writer.edit_list.front()->prev_log_number_; + prev_log_number_ = first_writer.edit_list.front()->GetPrevLogNumber(); } else { std::string version_edits; for (auto& e : batch_edits) { @@ -5561,6 +5889,7 @@ void VersionSet::WakeUpWaitingManifestWriters() { Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, + const ReadOptions& read_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, @@ -5585,7 +5914,7 @@ Status VersionSet::LogAndApply( int num_cfds = static_cast(column_family_datas.size()); if (num_cfds == 1 && column_family_datas[0] == nullptr) { assert(edit_lists.size() == 1 && edit_lists[0].size() == 1); - assert(edit_lists[0][0]->is_column_family_add_); + assert(edit_lists[0][0]->IsColumnFamilyAdd()); assert(new_cf_options != nullptr); } std::deque writers; @@ -5638,7 +5967,8 @@ Status VersionSet::LogAndApply( return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, dir_contains_current_file, - new_descriptor_log, new_cf_options); + new_descriptor_log, new_cf_options, + read_options); } void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, @@ -5649,7 +5979,7 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, assert(!edit->HasLastSequence() || edit->GetLastSequence() == *max_last_sequence); edit->SetLastSequence(*max_last_sequence); - if (edit->is_column_family_drop_) { + if (edit->IsColumnFamilyDrop()) { // if we drop column family, we have to make sure to save max column family, // so that we don't reuse existing ID edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily()); @@ -5667,12 +5997,12 @@ Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, assert(!edit->IsColumnFamilyManipulation()); assert(max_last_sequence != nullptr); - if (edit->has_log_number_) { - assert(edit->log_number_ >= cfd->GetLogNumber()); - assert(edit->log_number_ < next_file_number_.load()); + if (edit->HasLogNumber()) { + assert(edit->GetLogNumber() >= cfd->GetLogNumber()); + assert(edit->GetLogNumber() < next_file_number_.load()); } - if (!edit->has_prev_log_number_) { + if (!edit->HasPrevLogNumber()) { edit->SetPrevLogNumber(prev_log_number_); } edit->SetNextFile(next_file_number_.load()); @@ -5727,6 +6057,7 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id, bool no_error_if_files_missing) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); // Read "CURRENT" file, which contains a pointer to the current manifest // file std::string manifest_path; @@ 
-5763,11 +6094,11 @@ Status VersionSet::Recover( VersionEditHandler handler( read_only, column_families, const_cast(this), /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_, - EpochNumberRequirement::kMightMissing); + read_options, EpochNumberRequirement::kMightMissing); handler.Iterate(reader, &log_read_status); s = handler.status(); if (s.ok()) { - log_number = handler.GetVersionEditParams().log_number_; + log_number = handler.GetVersionEditParams().GetLogNumber(); current_manifest_file_size = reader.GetReadOffset(); assert(current_manifest_file_size != 0); handler.GetDbId(db_id); @@ -5913,6 +6244,7 @@ Status VersionSet::TryRecoverFromOneManifest( const std::string& manifest_path, const std::vector& column_families, bool read_only, std::string* db_id, bool* has_missing_table_file) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n", manifest_path.c_str()); std::unique_ptr manifest_file_reader; @@ -5937,7 +6269,7 @@ Status VersionSet::TryRecoverFromOneManifest( /*checksum=*/true, /*log_num=*/0); VersionEditHandlerPointInTime handler_pit( read_only, column_families, const_cast(this), io_tracer_, - EpochNumberRequirement::kMightMissing); + read_options, EpochNumberRequirement::kMightMissing); handler_pit.Iterate(reader, &s); @@ -5980,6 +6312,8 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, Status VersionSet::ListColumnFamiliesFromManifest( const std::string& manifest_path, FileSystem* fs, std::vector* column_families) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr file_reader; Status s; { @@ -5999,7 +6333,7 @@ Status VersionSet::ListColumnFamiliesFromManifest( log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - ListColumnFamiliesHandler handler; + ListColumnFamiliesHandler handler(read_options); handler.Iterate(reader, &s); assert(column_families); @@ -6013,7 +6347,6 @@ Status VersionSet::ListColumnFamiliesFromManifest( return handler.status(); } -#ifndef ROCKSDB_LITE Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, const Options* options, const FileOptions& file_options, @@ -6023,6 +6356,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, "Number of levels needs to be bigger than 1"); } + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + ImmutableDBOptions db_options(*options); ColumnFamilyOptions cf_options(*options); std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, @@ -6032,7 +6368,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc, nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/, /*db_id*/ "", - /*db_session_id*/ ""); + /*db_session_id*/ "", options->daily_offpeak_time_utc, + /*error_handler_*/ nullptr); Status status; std::vector dummy; @@ -6110,8 +6447,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, InstrumentedMutex dummy_mutex; InstrumentedMutexLock l(&dummy_mutex); return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &ve, &dummy_mutex, nullptr, - true); + mutable_cf_options, read_options, &ve, + &dummy_mutex, nullptr, true); } // Get the checksum information including the checksum and checksum function @@ -6181,9 +6518,13 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { return s; } -Status 
VersionSet::DumpManifest(Options& options, std::string& dscname, - bool verbose, bool hex, bool json) { +Status VersionSet::DumpManifest( + Options& options, std::string& dscname, bool verbose, bool hex, bool json, + const std::vector& cf_descs) { assert(options.env); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::vector column_families; Status s = ListColumnFamiliesFromManifest( dscname, options.env->GetFileSystem().get(), &column_families); @@ -6205,12 +6546,22 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, std::move(file), dscname, db_options_->log_readahead_size, io_tracer_); } - std::vector cf_descs; + std::map cf_name_to_desc; + for (const auto& cf_desc : cf_descs) { + cf_name_to_desc[cf_desc.name] = &cf_desc; + } + std::vector final_cf_descs; for (const auto& cf : column_families) { - cf_descs.emplace_back(cf, options); + const auto iter = cf_name_to_desc.find(cf); + if (iter != cf_name_to_desc.cend()) { + final_cf_descs.push_back(*iter->second); + } else { + final_cf_descs.emplace_back(cf, options); + } } - DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json); + DumpManifestHandler handler(final_cf_descs, this, io_tracer_, read_options, + verbose, hex, json); { VersionSet::LogReporter reporter; reporter.status = &s; @@ -6221,7 +6572,6 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, return handler.status(); } -#endif // ROCKSDB_LITE void VersionSet::MarkFileNumberUsed(uint64_t number) { // only called during recovery and repair which are single threaded, so this @@ -6314,6 +6664,8 @@ Status VersionSet::WriteCurrentStateToManifest( } edit.SetComparatorName( cfd->internal_comparator().user_comparator()->Name()); + edit.SetPersistUserDefinedTimestamps( + cfd->ioptions()->persist_user_defined_timestamps); std::string record; if (!edit.EncodeTo(&record)) { return Status::Corruption("Unable to Encode VersionEdit:" + @@ -6349,7 +6701,8 @@ Status VersionSet::WriteCurrentStateToManifest( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size, + f->user_defined_timestamps_persisted); } } @@ -6393,8 +6746,10 @@ Status VersionSet::WriteCurrentStateToManifest( edit.SetLastSequence(descriptor_last_sequence_); + const Comparator* ucmp = cfd->user_comparator(); + assert(ucmp); std::string record; - if (!edit.EncodeTo(&record)) { + if (!edit.EncodeTo(&record, ucmp->timestamp_size())) { return Status::Corruption("Unable to Encode VersionEdit:" + edit.DebugString(true)); } @@ -6414,6 +6769,7 @@ Status VersionSet::WriteCurrentStateToManifest( // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller) { @@ -6493,8 +6849,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, for (int i = idx_start + 1; i < idx_end; ++i) { uint64_t file_size = files_brief.files[i].fd.GetFileSize(); // The entire file falls into the range, so we can just take its size. 
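// Sketch of the descriptor-merging step added to DumpManifest above: column
// families discovered in the MANIFEST use a caller-supplied descriptor when
// one with a matching name was passed in, and otherwise fall back to the
// global Options. The Desc struct is a simplified stand-in for
// ColumnFamilyDescriptor.
#include <map>
#include <string>
#include <vector>

namespace sketch {

struct Desc {
  std::string name;
  std::string options;  // stand-in for ColumnFamilyOptions
};

std::vector<Desc> MergeDescriptors(const std::vector<std::string>& cf_names,
                                   const std::vector<Desc>& user_descs,
                                   const std::string& default_options) {
  std::map<std::string, const Desc*> by_name;
  for (const auto& d : user_descs) {
    by_name[d.name] = &d;
  }
  std::vector<Desc> final_descs;
  for (const auto& name : cf_names) {
    auto it = by_name.find(name);
    if (it != by_name.end()) {
      final_descs.push_back(*it->second);  // caller-provided options win
    } else {
      final_descs.push_back(Desc{name, default_options});
    }
  }
  return final_descs;
}

}  // namespace sketch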
- assert(file_size == - ApproximateSize(v, files_brief.files[i], start, end, caller)); + assert(file_size == ApproximateSize(read_options, v, files_brief.files[i], + start, end, caller)); total_full_size += file_size; } @@ -6529,21 +6885,24 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, // Estimate for all the first files (might also be last files), at each // level for (const auto file_ptr : first_files) { - total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + total_full_size += + ApproximateSize(read_options, v, *file_ptr, start, end, caller); } // Estimate for all the last files, at each level for (const auto file_ptr : last_files) { // We could use ApproximateSize here, but calling ApproximateOffsetOf // directly is just more efficient. - total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + total_full_size += + ApproximateOffsetOf(read_options, v, *file_ptr, end, caller); } } return total_full_size; } -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, +uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller) { // pre-condition @@ -6561,16 +6920,18 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, // "key" falls in the range for this table. Add the // approximate offset of "key" within the table. TableCache* table_cache = v->cfd_->table_cache(); + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( - key, *f.file_metadata, caller, icmp, - v->GetMutableCFOptions().prefix_extractor); + read_options, key, *f.file_metadata, caller, icmp, + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); } } return result; } -uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, +uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, const Slice& start, const Slice& end, TableReaderCaller caller) { // pre-condition @@ -6586,13 +6947,14 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, if (icmp.Compare(f.smallest_key, start) >= 0) { // Start of the range is before the file start - approximate by end offset - return ApproximateOffsetOf(v, f, end, caller); + return ApproximateOffsetOf(read_options, v, f, end, caller); } if (icmp.Compare(f.largest_key, end) < 0) { // End of the range is after the file end - approximate by subtracting // start offset from the file size - uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + uint64_t start_offset = + ApproximateOffsetOf(read_options, v, f, start, caller); assert(f.fd.GetFileSize() >= start_offset); return f.fd.GetFileSize() - start_offset; } @@ -6602,9 +6964,10 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, if (table_cache == nullptr) { return 0; } + const MutableCFOptions& cf_opts = v->GetMutableCFOptions(); return table_cache->ApproximateSize( - start, end, *f.file_metadata, caller, icmp, - v->GetMutableCFOptions().prefix_extractor); + read_options, start, end, *f.file_metadata, caller, icmp, + cf_opts.block_protection_bytes_per_key, cf_opts.prefix_extractor); } void VersionSet::RemoveLiveFiles( @@ -6721,6 +7084,14 @@ InternalIterator* VersionSet::MakeInputIterator( c->num_input_levels() - 1 : c->num_input_levels()); InternalIterator** list = new InternalIterator*[space]; + // First item in the 
pair is a pointer to range tombstones. + // Second item is a pointer to a member of a LevelIterator, + // that will be initialized to where CompactionMergingIterator stores + // pointer to its range tombstones. This is used by LevelIterator + // to update pointer to range tombstones as it traverse different SST files. + std::vector< + std::pair> + range_tombstones; size_t num = 0; for (size_t which = 0; which < c->num_input_levels(); which++) { if (c->input_levels(which)->num_files != 0) { @@ -6730,7 +7101,7 @@ InternalIterator* VersionSet::MakeInputIterator( const FileMetaData& fmd = *flevel->files[i].file_metadata; if (start.has_value() && cfd->user_comparator()->CompareWithoutTimestamp( - start.value(), fmd.largest.user_key()) > 0) { + *start, fmd.largest.user_key()) > 0) { continue; } // We should be able to filter out the case where the end key @@ -6738,10 +7109,10 @@ InternalIterator* VersionSet::MakeInputIterator( // We try to be extra safe here. if (end.has_value() && cfd->user_comparator()->CompareWithoutTimestamp( - end.value(), fmd.smallest.user_key()) < 0) { + *end, fmd.smallest.user_key()) < 0) { continue; } - + TruncatedRangeDelIterator* range_tombstone_iter = nullptr; list[num++] = cfd->table_cache()->NewIterator( read_options, file_options_compactions, cfd->internal_comparator(), fmd, range_del_agg, @@ -6754,10 +7125,15 @@ InternalIterator* VersionSet::MakeInputIterator( MaxFileSizeForL0MetaPin(*c->mutable_cf_options()), /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, - /*allow_unprepared_value=*/false); + /*allow_unprepared_value=*/false, + c->mutable_cf_options()->block_protection_bytes_per_key, + /*range_del_read_seqno=*/nullptr, + /*range_del_iter=*/&range_tombstone_iter); + range_tombstones.emplace_back(range_tombstone_iter, nullptr); } } else { // Create concatenating iterator for the files from this level + TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr; list[num++] = new LevelIterator( cfd->table_cache(), read_options, file_options_compactions, cfd->internal_comparator(), c->input_levels(which), @@ -6765,15 +7141,17 @@ InternalIterator* VersionSet::MakeInputIterator( /*should_sample=*/false, /*no per level latency histogram=*/nullptr, TableReaderCaller::kCompaction, /*skip_filters=*/false, - /*level=*/static_cast(c->level(which)), range_del_agg, - c->boundaries(which)); + /*level=*/static_cast(c->level(which)), + c->mutable_cf_options()->block_protection_bytes_per_key, + range_del_agg, c->boundaries(which), false, &tombstone_iter_ptr); + range_tombstones.emplace_back(nullptr, tombstone_iter_ptr); } } } assert(num <= space); - InternalIterator* result = - NewMergingIterator(&c->column_family_data()->internal_comparator(), list, - static_cast(num)); + InternalIterator* result = NewCompactionMergingIterator( + &c->column_family_data()->internal_comparator(), list, + static_cast(num), range_tombstones); delete[] list; return result; } @@ -6881,9 +7259,20 @@ void VersionSet::GetObsoleteFiles(std::vector* files, obsolete_manifests_.swap(*manifest_filenames); } +uint64_t VersionSet::GetObsoleteSstFilesSize() const { + uint64_t ret = 0; + for (auto& f : obsolete_files_) { + if (f.metadata != nullptr) { + ret += f.metadata->fd.GetFileSize(); + } + } + return ret; +} + ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& cf_options, const VersionEdit* edit) { - assert(edit->is_column_family_add_); + const ColumnFamilyOptions& cf_options, const ReadOptions& read_options, + const VersionEdit* edit) { + 
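// Sketch of the range-tombstone plumbing described in the comment above: for
// L0, each file iterator already owns its truncated range-tombstone iterator,
// so the first slot of the pair is filled; for L1+, the LevelIterator switches
// SST files as it advances, so it instead receives the address of a slot that
// the compaction merging iterator owns and repoints it whenever it opens a new
// file. The types below are simplified stand-ins, not the RocksDB classes.
#include <cstddef>
#include <utility>
#include <vector>

namespace sketch {

struct RangeDelIter {};  // stand-in for TruncatedRangeDelIterator

// One entry per input iterator handed to the compaction merging iterator.
using TombstoneEntry =
    std::pair<RangeDelIter*,     // fixed tombstone iterator (L0 file case)
              RangeDelIter***>;  // slot pointer owned by a LevelIterator

void WireUpTombstones(std::vector<TombstoneEntry>& entries,
                      std::vector<RangeDelIter*>& merging_iter_slots) {
  merging_iter_slots.resize(entries.size(), nullptr);
  for (size_t i = 0; i < entries.size(); ++i) {
    if (entries[i].first != nullptr) {
      // L0 case: the tombstone iterator is known up front.
      merging_iter_slots[i] = entries[i].first;
    } else if (entries[i].second != nullptr) {
      // LevelIterator case: hand it a pointer to the merging iterator's slot
      // so it can update the slot as it traverses different SST files.
      *entries[i].second = &merging_iter_slots[i];
    }
  }
}

}  // namespace sketch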
assert(edit->IsColumnFamilyAdd()); MutableCFOptions dummy_cf_options; Version* dummy_versions = @@ -6892,7 +7281,7 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( // by avoiding calling "delete" explicitly (~Version is private) dummy_versions->Ref(); auto new_cfd = column_family_set_->CreateColumnFamily( - edit->column_family_name_, edit->column_family_, dummy_versions, + edit->GetColumnFamilyName(), edit->GetColumnFamily(), dummy_versions, cf_options); Version* v = new Version(new_cfd, this, file_options_, @@ -6901,14 +7290,15 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( constexpr bool update_stats = false; - v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats); + v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), read_options, + update_stats); AppendVersion(new_cfd, v); // GetLatestMutableCFOptions() is safe here without mutex since the // cfd is not available to client new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(), LastSequence()); - new_cfd->SetLogNumber(edit->log_number_); + new_cfd->SetLogNumber(edit->GetLogNumber()); return new_cfd; } @@ -6966,7 +7356,8 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { return all_versions_blob_file_size; } -Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, +Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, + ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& meta) { uint64_t fsize = 0; @@ -6999,9 +7390,9 @@ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - ReadOptions(), file_opts, *icmp, meta_copy, &handle, pe, - /*no_io=*/false, /*record_read_stats=*/true, - internal_stats->GetFileReadHist(level), false, level, + read_options, file_opts, *icmp, meta_copy, &handle, + cf_opts->block_protection_bytes_per_key, pe, + /*no_io=*/false, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, meta_copy.temperature); if (handle) { @@ -7019,7 +7410,8 @@ ReactiveVersionSet::ReactiveVersionSet( : VersionSet(dbname, _db_options, _file_options, table_cache, write_buffer_manager, write_controller, /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "", - /*db_session_id*/ "") {} + /*db_session_id*/ "", /*daily_offpeak_time_utc*/ "", + /*error_handler=*/nullptr) {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -7043,9 +7435,9 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); assert(reader); - manifest_tailer_.reset( - new ManifestTailer(column_families, const_cast(this), - io_tracer_, EpochNumberRequirement::kMightMissing)); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_, + read_options_, EpochNumberRequirement::kMightMissing)); manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); diff --git a/db/version_set.h b/db/version_set.h index 24aa61abad99..cceb5f975c51 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -33,10 +33,12 @@ #include "cache/cache_helpers.h" #include "db/blob/blob_file_meta.h" +#include "db/blob/blob_index.h" #include "db/column_family.h" #include "db/compaction/compaction.h" #include "db/compaction/compaction_picker.h" #include "db/dbformat.h" +#include "db/error_handler.h" #include "db/file_indexer.h" #include "db/log_reader.h" #include "db/range_del_aggregator.h" @@ -52,6 +54,7 @@ #endif #include 
"monitoring/instrumented_mutex.h" #include "options/db_options.h" +#include "options/offpeak_time_info.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/file_checksum.h" @@ -131,8 +134,10 @@ class VersionStorageInfo { CompactionStyle compaction_style, VersionStorageInfo* src_vstorage, bool _force_consistency_checks, - EpochNumberRequirement epoch_number_requirement = - EpochNumberRequirement::kMustPresent); + EpochNumberRequirement epoch_number_requirement, + SystemClock* clock, + uint32_t bottommost_file_compaction_delay, + OffpeakTimeOption offpeak_time_option); // No copying allowed VersionStorageInfo(const VersionStorageInfo&) = delete; void operator=(const VersionStorageInfo&) = delete; @@ -203,7 +208,7 @@ class VersionStorageInfo { // This computes files_marked_for_compaction_ and is called by // ComputeCompactionScore() - void ComputeFilesMarkedForCompaction(); + void ComputeFilesMarkedForCompaction(int last_level); // This computes ttl_expired_files_ and is called by // ComputeCompactionScore() @@ -214,7 +219,7 @@ class VersionStorageInfo { // ComputeCompactionScore() void ComputeFilesMarkedForPeriodicCompaction( const ImmutableOptions& ioptions, - const uint64_t periodic_compaction_seconds); + const uint64_t periodic_compaction_seconds, int last_level); // This computes bottommost_files_marked_for_compaction_ and is called by // ComputeCompactionScore() or UpdateOldestSnapshot(). @@ -226,7 +231,7 @@ class VersionStorageInfo { // eligible for compaction. // // REQUIRES: DB mutex held - void ComputeBottommostFilesMarkedForCompaction(); + void ComputeBottommostFilesMarkedForCompaction(bool allow_ingest_behind); // This computes files_marked_for_forced_blob_gc_ and is called by // ComputeCompactionScore() @@ -234,14 +239,16 @@ class VersionStorageInfo { // REQUIRES: DB mutex held void ComputeFilesMarkedForForcedBlobGC( double blob_garbage_collection_age_cutoff, - double blob_garbage_collection_force_threshold); + double blob_garbage_collection_force_threshold, + bool enable_blob_garbage_collection); bool level0_non_overlapping() const { return level0_non_overlapping_; } // Updates the oldest snapshot and related internal state, like the bottommost // files marked for compaction. // REQUIRES: DB mutex held - void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum); + void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum, + bool allow_ingest_behind); int MaxInputLevel() const; int MaxOutputLevel(bool allow_ingest_behind) const; @@ -464,6 +471,7 @@ class VersionStorageInfo { // REQUIRES: ComputeCompactionScore has been called // REQUIRES: DB mutex held during access + // Used by Leveled Compaction only. const autovector>& ExpiredTtlFiles() const { assert(finalized_); return expired_ttl_files_; @@ -471,6 +479,7 @@ class VersionStorageInfo { // REQUIRES: ComputeCompactionScore has been called // REQUIRES: DB mutex held during access + // Used by Leveled and Universal Compaction. const autovector>& FilesMarkedForPeriodicCompaction() const { assert(finalized_); @@ -649,6 +658,12 @@ class VersionStorageInfo { // be empty. -1 if it is not level-compaction so it's not applicable. int base_level_; + // Applies to level compaction when + // `level_compaction_dynamic_level_bytes=true`. All non-empty levels <= + // lowest_unnecessary_level_ are not needed and will be drained automatically. 
+ // -1 if there is no unnecessary level, + int lowest_unnecessary_level_; + double level_multiplier_; // A list for the same set of files that are stored in files_, @@ -673,7 +688,7 @@ class VersionStorageInfo { // This vector contains list of files marked for compaction and also not // currently being compacted. It is protected by DB mutex. It is calculated in - // ComputeCompactionScore() + // ComputeCompactionScore(). Used by Leveled and Universal Compaction. autovector> files_marked_for_compaction_; autovector> expired_ttl_files_; @@ -739,6 +754,11 @@ class VersionStorageInfo { // target sizes. uint64_t estimated_compaction_needed_bytes_; + // Used for computing bottommost files marked for compaction and checking for + // offpeak time. + SystemClock* clock_; + uint32_t bottommost_file_compaction_delay_; + bool finalized_; // If set to true, we will run consistency checks even if RocksDB @@ -747,6 +767,8 @@ class VersionStorageInfo { EpochNumberRequirement epoch_number_requirement_; + OffpeakTimeOption offpeak_time_option_; + friend class Version; friend class VersionSet; }; @@ -889,8 +911,15 @@ class Version { FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value, uint64_t* bytes_read) const; - using BlobReadContext = - std::pair>; + struct BlobReadContext { + BlobReadContext(const BlobIndex& blob_idx, const KeyContext* key_ctx) + : blob_index(blob_idx), key_context(key_ctx) {} + + BlobIndex blob_index; + const KeyContext* key_context; + PinnableSlice result; + }; + using BlobReadContexts = std::vector; void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range, std::unordered_map& blob_ctxs); @@ -899,7 +928,7 @@ class Version { // populates derived data structures. Call without mutex held. It needs to be // called before appending the version to the version set. void PrepareAppend(const MutableCFOptions& mutable_cf_options, - bool update_stats); + const ReadOptions& read_options, bool update_stats); // Reference count management (so Versions do not disappear out from // under live iterators) @@ -929,7 +958,8 @@ class Version { // specified in "file_meta". If the file name of "file_meta" is // known ahead, passing it by a non-null "fname" can save a // file-name conversion. - Status GetTableProperties(std::shared_ptr* tp, + Status GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname = nullptr) const; @@ -937,9 +967,12 @@ class Version { // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the // tables' properties, represented as std::shared_ptr. - Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); - Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, + Status GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props); + Status GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props, int level); + Status GetPropertiesOfTablesInRange(const ReadOptions& read_options, + const Range* range, std::size_t n, TablePropertiesCollection* props) const; // Print summary of range delete tombstones in SST files into out_str, @@ -951,13 +984,14 @@ class Version { // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. 
Status GetAggregatedTableProperties( + const ReadOptions& read_options, std::shared_ptr* tp, int level = -1); uint64_t GetEstimatedActiveKeys() { return storage_info_.GetEstimatedActiveKeys(); } - size_t GetMemoryUsageByTableReaders(); + size_t GetMemoryUsageByTableReaders(const ReadOptions& read_options); ColumnFamilyData* cfd() const { return cfd_; } @@ -973,7 +1007,10 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); - uint64_t GetSstFilesSize(bool include_bottommost = true); + void GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key); + + uint64_t GetSstFilesSize(bool include_bottommost); // Retrieves the file_creation_time of the oldest file in the DB. // Prerequisite for this API is max_open_files = -1 @@ -1011,11 +1048,12 @@ class Version { // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_meta from its associated TableProperties. // Returns true if it does initialize FileMetaData. - bool MaybeInitializeFileMetaData(FileMetaData* file_meta); + bool MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta); // Update the accumulated stats associated with the current version. // This accumulated stats will be used in compaction. - void UpdateAccumulatedStats(); + void UpdateAccumulatedStats(const ReadOptions& read_options); DECLARE_SYNC_AND_ASYNC( /* ret_type */ Status, /* func_name */ MultiGetFromSST, @@ -1068,6 +1106,7 @@ class Version { // used for debugging and logging purposes only. uint64_t version_number_; std::shared_ptr io_tracer_; + bool use_async_io_; Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, MutableCFOptions mutable_cf_options, @@ -1114,7 +1153,9 @@ class VersionSet { WriteController* write_controller, BlockCacheTracer* const block_cache_tracer, const std::shared_ptr& io_tracer, - const std::string& db_id, const std::string& db_session_id); + const std::string& db_id, const std::string& db_session_id, + const std::string& daily_offpeak_time_utc, + ErrorHandler* const error_handler); // No copying allowed VersionSet(const VersionSet&) = delete; void operator=(const VersionSet&) = delete; @@ -1122,13 +1163,13 @@ class VersionSet { virtual ~VersionSet(); Status LogAndApplyToDefaultColumnFamily( - VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); const MutableCFOptions* cf_options = default_cf->GetLatestMutableCFOptions(); - return LogAndApply(default_cf, *cf_options, edit, mu, + return LogAndApply(default_cf, *cf_options, read_options, edit, mu, dir_contains_current_file, new_descriptor_log, column_family_options); } @@ -1141,10 +1182,11 @@ class VersionSet { // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply( ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, VersionEdit* edit, - InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, - bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = nullptr) { + const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = 
nullptr, + const std::function& manifest_wcb = {}) { autovector cfds; cfds.emplace_back(column_family_data); autovector mutable_cf_options_list; @@ -1153,15 +1195,16 @@ class VersionSet { autovector edit_list; edit_list.emplace_back(edit); edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - dir_contains_current_file, new_descriptor_log, - column_family_options); + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, dir_contains_current_file, new_descriptor_log, + column_family_options, {manifest_wcb}); } // The batch version. If edit_list.size() > 1, caller must ensure that // no edit in the list column family add or drop Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, @@ -1172,8 +1215,8 @@ class VersionSet { mutable_cf_options_list.emplace_back(&mutable_cf_options); autovector> edit_lists; edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, dir_contains_current_file, new_descriptor_log, column_family_options, {manifest_wcb}); } @@ -1183,6 +1226,7 @@ class VersionSet { virtual Status LogAndApply( const autovector& cfds, const autovector& mutable_cf_options_list, + const ReadOptions& read_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, @@ -1196,7 +1240,7 @@ class VersionSet { uint64_t* manifest_file_number); void WakeUpWaitingManifestWriters(); - // Recover the last saved descriptor from persistent storage. + // Recover the last saved descriptor (MANIFEST) from persistent storage. // If read_only == true, Recover() will not complain if some column families // are not opened Status Recover(const std::vector& column_families, @@ -1227,7 +1271,6 @@ class VersionSet { const std::string& manifest_path, FileSystem* fs, std::vector* column_families); -#ifndef ROCKSDB_LITE // Try to reduce the number of levels. This call is valid when // only one level from the new max level to the old // max level containing files. @@ -1247,9 +1290,8 @@ class VersionSet { // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, - bool verbose, bool hex = false, bool json = false); - -#endif // ROCKSDB_LITE + bool verbose, bool hex = false, bool json = false, + const std::vector& cf_descs = {}); const std::string& DbSessionId() const { return db_session_id_; } @@ -1424,7 +1466,8 @@ class VersionSet { // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). 
If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + uint64_t ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller); @@ -1451,7 +1494,21 @@ class VersionSet { std::vector* manifest_filenames, uint64_t min_pending_output); + // REQUIRES: DB mutex held + uint64_t GetObsoleteSstFilesSize() const; + ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + + const UnorderedMap& GetRunningColumnFamiliesTimestampSize() + const { + return column_family_set_->GetRunningColumnFamiliesTimestampSize(); + } + + const UnorderedMap& + GetColumnFamiliesTimestampSizeForRecord() const { + return column_family_set_->GetColumnFamiliesTimestampSizeForRecord(); + } + RefedColumnFamilySet GetRefedColumnFamilySet() { return RefedColumnFamilySet(GetColumnFamilySet()); } @@ -1462,6 +1519,14 @@ class VersionSet { new_options.writable_file_max_buffer_size; } + // TODO - Consider updating together when file options change in SetDBOptions + const OffpeakTimeOption& offpeak_time_option() { + return offpeak_time_option_; + } + void ChangeOffpeakTimeOption(const std::string& daily_offpeak_time_utc) { + offpeak_time_option_.SetFromOffpeakTimeString(daily_offpeak_time_utc); + } + const ImmutableDBOptions* db_options() const { return db_options_; } static uint64_t GetNumLiveVersions(Version* dummy_versions); @@ -1488,14 +1553,12 @@ class VersionSet { new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); constexpr bool update_stats = false; - version->PrepareAppend(mutable_cf_options, update_stats); + const ReadOptions read_options; + version->PrepareAppend(mutable_cf_options, read_options, update_stats); AppendVersion(cfd, version); } protected: - using VersionBuilderMap = - UnorderedMap>; - struct ManifestWriter; friend class Version; @@ -1519,14 +1582,15 @@ class VersionSet { void Reset(); // Returns approximated offset of a key in a file for a given version. - uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, - const Slice& key, TableReaderCaller caller); + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, Version* v, + const FdWithKeyRange& f, const Slice& key, + TableReaderCaller caller); // Returns approximated data size between start and end keys in a file // for a given version. - uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& start, const Slice& end, - TableReaderCaller caller); + uint64_t ApproximateSize(const ReadOptions& read_options, Version* v, + const FdWithKeyRange& f, const Slice& start, + const Slice& end, TableReaderCaller caller); struct MutableCFState { uint64_t log_number; @@ -1545,9 +1609,11 @@ class VersionSet { void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const ReadOptions& read_options, const VersionEdit* edit); - Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath, + Status VerifyFileMetadata(const ReadOptions& read_options, + ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& meta); // Protected by DB mutex. @@ -1625,13 +1691,20 @@ class VersionSet { std::string db_session_id_; + // Off-peak time option used for compaction scoring + OffpeakTimeOption offpeak_time_option_; + + // Pointer to the DB's ErrorHandler. 
+ ErrorHandler* const error_handler_; + private: // REQUIRES db mutex at beginning. may release and re-acquire db mutex Status ProcessManifestWrites(std::deque& writers, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options); + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options); void LogAndApplyCFHelper(VersionEdit* edit, SequenceNumber* max_last_sequence); @@ -1683,13 +1756,15 @@ class ReactiveVersionSet : public VersionSet { private: std::unique_ptr manifest_tailer_; - + // TODO: plumb Env::IOActivity + const ReadOptions read_options_; using VersionSet::LogAndApply; using VersionSet::Recover; Status LogAndApply( const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, + const ReadOptions& /* read_options */, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, diff --git a/db/version_set_sync_and_async.h b/db/version_set_sync_and_async.h index 51f58cdad0c9..75776b620c84 100644 --- a/db/version_set_sync_and_async.h +++ b/db/version_set_sync_and_async.h @@ -25,6 +25,7 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) StopWatchNano timer(clock_, timer_enabled /* auto_start */); s = CO_AWAIT(table_cache_->MultiGet)( read_options, *internal_comparator(), *f->file_metadata, &file_range, + mutable_cf_options_.block_protection_bytes_per_key, mutable_cf_options_.prefix_extractor, cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters, skip_range_deletions, hit_file_level, table_handle); @@ -101,23 +102,36 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) file_range.MarkKeyDone(iter); if (iter->is_blob_index) { + BlobIndex blob_index; + Status tmp_s; + if (iter->value) { TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex", &(*iter)); - const Slice& blob_index_slice = *(iter->value); - BlobIndex blob_index; - Status tmp_s = blob_index.DecodeFrom(blob_index_slice); - if (tmp_s.ok()) { - const uint64_t blob_file_num = blob_index.file_number(); - blob_ctxs[blob_file_num].emplace_back( - std::make_pair(blob_index, std::cref(*iter))); - } else { - *(iter->s) = tmp_s; - } + tmp_s = blob_index.DecodeFrom(*(iter->value)); + + } else { + assert(iter->columns); + + tmp_s = blob_index.DecodeFrom( + WideColumnsHelper::GetDefaultColumn(iter->columns->columns())); + } + + if (tmp_s.ok()) { + const uint64_t blob_file_num = blob_index.file_number(); + blob_ctxs[blob_file_num].emplace_back(blob_index, &*iter); + } else { + *(iter->s) = tmp_s; } } else { - file_range.AddValueSize(iter->value->size()); + if (iter->value) { + file_range.AddValueSize(iter->value->size()); + } else { + assert(iter->columns); + file_range.AddValueSize(iter->columns->serialized_size()); + } + if (file_range.GetValueSize() > read_options.value_size_soft_limit) { s = Status::Aborted(); break; @@ -141,6 +155,10 @@ DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST) "ROCKSDB_NAMESPACE::blob_db::BlobDB instead."); file_range.MarkKeyDone(iter); continue; + case GetContext::kMergeOperatorFailed: + *status = Status::Corruption(Status::SubCode::kMergeOperatorFailed); + file_range.MarkKeyDone(iter); + continue; } } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 9234a4d880ce..5eb910c9f32a 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -21,6 +21,7 @@ #include "table/block_based/block_based_table_factory.h" 
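// Sketch of the MultiGet blob handling in the version_set_sync_and_async.h
// hunk above: the blob index is decoded either from the plain value slice or,
// for wide-column entities, from the default column, and the resulting read
// contexts are grouped by blob file number so each blob file is visited once.
// BlobIndexLike, KeyCtx, and the decimal "encoding" are illustrative
// stand-ins, not the RocksDB BlobIndex/KeyContext types.
#include <cstdint>
#include <cstdlib>
#include <string>
#include <unordered_map>
#include <vector>

namespace sketch {

struct BlobIndexLike {
  uint64_t file_number = 0;
  bool Decode(const std::string& encoded) {
    // Stand-in for BlobIndex::DecodeFrom(): file number as decimal text.
    if (encoded.empty()) return false;
    file_number = std::strtoull(encoded.c_str(), nullptr, 10);
    return true;
  }
};

struct KeyCtx {
  const std::string* value = nullptr;           // plain value, if any
  const std::string* default_column = nullptr;  // wide-column default column
};

struct BlobReadCtx {
  BlobIndexLike blob_index;
  const KeyCtx* key_context;
};

void CollectBlobContexts(
    const std::vector<KeyCtx>& keys,
    std::unordered_map<uint64_t, std::vector<BlobReadCtx>>* blob_ctxs) {
  for (const auto& k : keys) {
    // Prefer the plain value; otherwise fall back to the default column.
    const std::string* encoded = k.value ? k.value : k.default_column;
    BlobIndexLike idx;
    if (encoded != nullptr && idx.Decode(*encoded)) {
      (*blob_ctxs)[idx.file_number].push_back(BlobReadCtx{idx, &k});
    }
  }
}

}  // namespace sketch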
#include "table/mock_table.h" #include "table/unique_id_impl.h" +#include "test_util/mock_time_env.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/string_util.h" @@ -51,7 +52,8 @@ class GenerateLevelFilesBriefTest : public testing::Test { largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); files_.push_back(f); } @@ -129,7 +131,10 @@ class VersionStorageInfoTestBase : public testing::Test { mutable_cf_options_(options_), vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel, /*src_vstorage=*/nullptr, - /*_force_consistency_checks=*/false) {} + /*_force_consistency_checks=*/false, + EpochNumberRequirement::kMustPresent, ioptions_.clock, + mutable_cf_options_.bottommost_file_compaction_delay, + OffpeakTimeOption()) {} ~VersionStorageInfoTestBase() override { for (int i = 0; i < vstorage_.num_levels(); ++i) { @@ -163,7 +168,8 @@ class VersionStorageInfoTestBase : public testing::Test { Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, compensated_range_deletion_size); + kNullUniqueId64x2, compensated_range_deletion_size, 0, + /* user_defined_timestamps_persisted */ true); vstorage_.AddFile(level, f); } @@ -454,6 +460,37 @@ TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) { ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2)); } +TEST_F(VersionStorageInfoTest, DrainUnnecessaryLevel) { + ioptions_.level_compaction_dynamic_level_bytes = true; + mutable_cf_options_.max_bytes_for_level_base = 1000; + mutable_cf_options_.max_bytes_for_level_multiplier = 10; + + // Create a few unnecessary levels. + // See if score is calculated correctly. + Add(5, 1U, "1", "2", 2000U); // target size 1010000 + Add(4, 2U, "1", "2", 200U); // target size 101000 + // Unnecessary levels + Add(3, 3U, "1", "2", 100U); // target size 10100 + // Level 2: target size 1010 + Add(1, 4U, "1", "2", + 10U); // target size 1000 = max(base_bytes_min + 1, base_bytes_max) + + UpdateVersionStorageInfo(); + + ASSERT_EQ(1, vstorage_.base_level()); + ASSERT_EQ(1000, vstorage_.MaxBytesForLevel(1)); + ASSERT_EQ(10100, vstorage_.MaxBytesForLevel(3)); + vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_); + + // Tests that levels 1 and 3 are eligible for compaction. + // Levels 1 and 3 are much smaller than target size, + // so size does not contribute to a high compaction score. 
+ ASSERT_EQ(1, vstorage_.CompactionScoreLevel(0)); + ASSERT_GT(vstorage_.CompactionScore(0), 10); + ASSERT_EQ(3, vstorage_.CompactionScoreLevel(1)); + ASSERT_GT(vstorage_.CompactionScore(1), 10); +} + TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) { // Test whether the overlaps are detected as expected Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level @@ -549,7 +586,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) { constexpr double age_cutoff = 0.5; constexpr double force_threshold = 0.75; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -633,7 +671,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { { constexpr double age_cutoff = 0.1; constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -644,7 +683,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { { constexpr double age_cutoff = 0.5; constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -655,7 +695,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { { constexpr double age_cutoff = 1.0; constexpr double force_threshold = 0.6; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -666,7 +707,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) { { constexpr double age_cutoff = 1.0; constexpr double force_threshold = 0.5; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); ASSERT_EQ(ssts_to_be_compacted.size(), 1); @@ -780,7 +822,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { { constexpr double age_cutoff = 0.1; constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -791,7 +834,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { { constexpr double age_cutoff = 0.25; constexpr double force_threshold = 0.0; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -802,7 +846,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { { constexpr double age_cutoff = 0.5; constexpr double force_threshold = 0.6; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); 
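For readers skimming the ForcedBlobGC cases: stated as an assumption rather than something this hunk shows, age_cutoff selects the oldest fraction of blob files and force_threshold is the garbage ratio at which the SSTs referencing them are marked for forced compaction, while the new enable_blob_garbage_collection argument makes the computation a no-op when blob GC is disabled. A representative call:

// Oldest 50% of blob files eligible; mark once at least 50% of their data is garbage.
constexpr double age_cutoff = 0.5;
constexpr double force_threshold = 0.5;
vstorage_.ComputeFilesMarkedForForcedBlobGC(
    age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true);
const auto& ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();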
+ vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -813,7 +858,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { { constexpr double age_cutoff = 0.5; constexpr double force_threshold = 0.5; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); ASSERT_EQ(ssts_to_be_compacted.size(), 2); @@ -842,7 +888,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { { constexpr double age_cutoff = 0.75; constexpr double force_threshold = 0.6; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty()); } @@ -853,7 +900,8 @@ TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) { { constexpr double age_cutoff = 0.75; constexpr double force_threshold = 0.5; - vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold); + vstorage_.ComputeFilesMarkedForForcedBlobGC( + age_cutoff, force_threshold, /*enable_blob_garbage_collection=*/true); auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC(); ASSERT_EQ(ssts_to_be_compacted.size(), 2); @@ -1153,11 +1201,12 @@ class VersionSetTestBase { immutable_options_.fs = fs_; immutable_options_.clock = env_->GetSystemClock().get(); - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + versions_.reset(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); reactive_versions_ = std::make_shared( dbname_, &db_options_, env_options_, table_cache_.get(), &write_buffer_manager_, &write_controller_, nullptr); @@ -1188,7 +1237,7 @@ class VersionSetTestBase { tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); + ASSERT_OK(impl->GetDbIdentityFromIdentityFile(&db_id)); new_db.SetDBId(db_id); } new_db.SetLogNumber(0); @@ -1244,7 +1293,7 @@ class VersionSetTestBase { void NewDB() { SequenceNumber last_seqno; std::unique_ptr log_writer; - SetIdentityFile(env_, dbname_); + ASSERT_OK(SetIdentityFile(env_, dbname_)); PrepareManifest(&column_families_, &last_seqno, &log_writer); log_writer.reset(); // Make "CURRENT" file point to the new manifest file. 
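Most of the churn in this test file comes from the widened VersionSet constructor; a condensed sketch of the new call, with plain local names standing in for the fixture members:

// Sketch only; the last two arguments are the new ones.
std::unique_ptr<VersionSet> versions(new VersionSet(
    dbname, &db_options, env_options, table_cache.get(),
    &write_buffer_manager, &write_controller,
    /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
    /*db_id=*/"", /*db_session_id=*/"",
    /*daily_offpeak_time_utc=*/"",  // empty in these tests
    /*error_handler=*/nullptr));    // the tests do not wire up an ErrorHandler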
@@ -1257,11 +1306,12 @@ class VersionSetTestBase { } void ReopenDB() { - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + versions_.reset(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); EXPECT_OK(versions_->Recover(column_families_, false)); } @@ -1276,9 +1326,9 @@ class VersionSetTestBase { Status LogAndApplyToDefaultCF(VersionEdit& edit) { mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1290,9 +1340,9 @@ class VersionSetTestBase { vedits.push_back(e.get()); } mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, vedits, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, vedits, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1304,7 +1354,7 @@ class VersionSetTestBase { VersionEdit dummy; ASSERT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - &dummy, &mutex_, db_directory, new_descriptor_log)); + read_options_, &dummy, &mutex_, db_directory, new_descriptor_log)); mutex_.Unlock(); } @@ -1316,10 +1366,13 @@ class VersionSetTestBase { new_cf.SetColumnFamily(new_id); new_cf.SetLogNumber(0); new_cf.SetComparatorName(cf_options.comparator->Name()); + new_cf.SetPersistUserDefinedTimestamps( + cf_options.persist_user_defined_timestamps); Status s; mutex_.Lock(); s = versions_->LogAndApply(/*column_family_data=*/nullptr, - MutableCFOptions(cf_options), &new_cf, &mutex_, + MutableCFOptions(cf_options), read_options_, + &new_cf, &mutex_, /*db_directory=*/nullptr, /*new_descriptor_log=*/false, &cf_options); mutex_.Unlock(); @@ -1341,6 +1394,7 @@ class VersionSetTestBase { ColumnFamilyOptions cf_options_; ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; + const ReadOptions read_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1364,6 +1418,8 @@ class VersionSetTest : public VersionSetTestBase, public testing::Test { TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { NewDB(); const int kGroupSize = 5; + const ReadOptions read_options; + autovector edits; for (int i = 0; i != kGroupSize; ++i) { edits.emplace_back(VersionEdit()); @@ -1390,8 +1446,8 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, - &mutex_, nullptr); + Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, + edit_lists, &mutex_, nullptr); mutex_.Unlock(); EXPECT_OK(s); EXPECT_EQ(kGroupSize - 1, count); @@ -1591,9 +1647,9 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes); mutex_.Lock(); - Status s = 
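The LogAndApply overloads likewise gain a ReadOptions parameter between the mutable CF options and the edits. A sketch of the new call shape, assuming versions, cfd, edit, and mutex come from the caller and the DB mutex discipline is unchanged:

const ReadOptions read_options;  // plumbing Env::IOActivity is left as a TODO in this patch
mutex.Lock();
Status s = versions->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
                                 read_options, &edit, &mutex,
                                 /*dir_contains_current_file=*/nullptr);
mutex.Unlock();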
- versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -1763,11 +1819,12 @@ TEST_F(VersionSetTest, WalAddition) { // Recover a new VersionSet. { - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -1830,11 +1887,12 @@ TEST_F(VersionSetTest, WalCloseWithoutSync) { // Recover a new VersionSet. { - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 2); @@ -1883,11 +1941,12 @@ TEST_F(VersionSetTest, WalDeletion) { // Recover a new VersionSet, only the non-closed WAL should show up. { - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -1921,11 +1980,12 @@ TEST_F(VersionSetTest, WalDeletion) { // Recover from the new MANIFEST, only the non-closed WAL should show up. 
{ - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2041,11 +2101,12 @@ TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) { // Recover a new VersionSet, WAL0 is deleted, WAL1 is not. { - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 1); @@ -2077,11 +2138,12 @@ TEST_F(VersionSetTest, DeleteAllWals) { // Recover a new VersionSet, all WALs are deleted. { - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(new_versions->Recover(column_families_, false)); const auto& wals = new_versions->GetWalSet().GetWals(); ASSERT_EQ(wals.size(), 0); @@ -2119,11 +2181,12 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { // Recover a new VersionSet, the min log number and the last WAL should be // kept. { - std::unique_ptr new_versions( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr new_versions(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); std::string db_id; ASSERT_OK( new_versions->Recover(column_families_, /*read_only=*/false, &db_id)); @@ -2138,6 +2201,99 @@ TEST_F(VersionSetTest, AtomicGroupWithWalEdits) { } } +TEST_F(VersionSetTest, OffpeakTimeInfoTest) { + Random rnd(test::RandomSeed()); + + // Sets off-peak time from 11:30PM to 4:30AM next day. 
+ // Starting at 1:30PM, use mock sleep to make time pass + // and see if IsNowOffpeak() returns correctly per time changes + int now_hour = 13; + int now_minute = 30; + versions_->ChangeOffpeakTimeOption("23:30-04:30"); + + auto mock_clock = std::make_shared(env_->GetSystemClock()); + // Add some extra random days to current time + int days = rnd.Uniform(100); + mock_clock->SetCurrentTime(days * 86400 + now_hour * 3600 + now_minute * 60); + int64_t now; + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + + // Starting at 1:30PM. It's not off-peak + ASSERT_FALSE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Now it's at 4:30PM. Still not off-peak + mock_clock->MockSleepForSeconds(3 * 3600); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_FALSE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Now it's at 11:30PM. It's off-peak + mock_clock->MockSleepForSeconds(7 * 3600); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Now it's at 2:30AM next day. It's still off-peak + mock_clock->MockSleepForSeconds(3 * 3600); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Now it's at 4:30AM. It's still off-peak + mock_clock->MockSleepForSeconds(2 * 3600); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Sleep for one more minute. It's at 4:31AM It's no longer off-peak + mock_clock->MockSleepForSeconds(60); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_FALSE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Entire day offpeak + versions_->ChangeOffpeakTimeOption("00:00-23:59"); + // It doesn't matter what time it is. It should be just offpeak. + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Mock Sleep for 3 hours. It's still off-peak + mock_clock->MockSleepForSeconds(3 * 3600); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Mock Sleep for 20 hours. It's still off-peak + mock_clock->MockSleepForSeconds(20 * 3600); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Mock Sleep for 59 minutes. It's still off-peak + mock_clock->MockSleepForSeconds(59 * 60); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Mock Sleep for 59 seconds. It's still off-peak + mock_clock->MockSleepForSeconds(59); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + + // Mock Sleep for 1 second (exactly 24h passed). 
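To make the expectations in this test easier to follow, here is an assumed-behavior sketch of the window check: both endpoints are inclusive and the window may wrap past midnight, which is consistent with 23:30-04:30 covering 2:30AM but not 4:31AM, and with 00:00-23:59 covering the whole day. The helper below is hypothetical; the real logic lives inside OffpeakTimeOption, which this hunk does not show.

// Hypothetical minute-of-day check, not the actual implementation.
bool IsOffpeak(int start_min, int end_min, int64_t now_secs) {
  const int now_min = static_cast<int>((now_secs / 60) % (24 * 60));
  if (start_min <= end_min) {
    return start_min <= now_min && now_min <= end_min;  // same-day window
  }
  return now_min >= start_min || now_min <= end_min;  // wraps past midnight
}
// IsOffpeak(23 * 60 + 30, 4 * 60 + 30, now) is true at 02:30 and false at 04:31.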
It's still off-peak + mock_clock->MockSleepForSeconds(1); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); + // Another second for sanity check + mock_clock->MockSleepForSeconds(1); + ASSERT_OK(mock_clock.get()->GetCurrentTime(&now)); + ASSERT_TRUE( + versions_->offpeak_time_option().GetOffpeakTimeInfo(now).is_now_offpeak); +} + TEST_F(VersionStorageInfoTest, AddRangeDeletionCompensatedFileSize) { // Tests that compensated range deletion size is added to compensated file // size. @@ -2184,11 +2340,12 @@ class VersionSetWithTimestampTest : public VersionSetTest { } void VerifyFullHistoryTsLow(uint64_t expected_ts_low) { - std::unique_ptr vset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + std::unique_ptr vset(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false, /*db_id=*/nullptr)); for (auto* cfd : *(vset->GetColumnFamilySet())) { @@ -2211,7 +2368,7 @@ class VersionSetWithTimestampTest : public VersionSetTest { Status s; mutex_.Lock(); s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), - edits_, &mutex_, nullptr); + read_options_, edits_, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); @@ -2221,6 +2378,9 @@ class VersionSetWithTimestampTest : public VersionSetTest { ColumnFamilyData* cfd_{nullptr}; // edits_ must contain and own pointers to heap-alloc VersionEdit objects. autovector edits_; + + private: + const ReadOptions read_options_; }; const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); @@ -2649,6 +2809,8 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, // Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and // last column family in an atomic group. 
TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { + const ReadOptions read_options; + std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; @@ -2678,7 +2840,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { mutex_.Lock(); s = versions_->LogAndApply(cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), - &drop_cf_edit, &mutex_, nullptr); + read_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2727,8 +2889,8 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, &mutex_, - nullptr); + s = versions_->LogAndApply(cfds, mutable_cf_options_list, read_options, + edit_lists, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); @@ -2818,11 +2980,12 @@ class VersionSetTestEmptyDb assert(nullptr != log_writer); VersionEdit new_db; if (db_options_.write_dbid_to_manifest) { + ASSERT_OK(SetIdentityFile(env_, dbname_)); DBOptions tmp_db_options; tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); + ASSERT_OK(impl->GetDbIdentityFromIdentityFile(&db_id)); new_db.SetDBId(db_id); } const std::string manifest_path = DescriptorFileName(dbname_, 1); @@ -3152,7 +3315,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, tmp_db_options.env = env_; std::unique_ptr impl(new DBImpl(tmp_db_options, dbname_)); std::string db_id; - impl->GetDbIdentityFromIdentityFile(&db_id); + ASSERT_OK(impl->GetDbIdentityFromIdentityFile(&db_id)); new_db.SetDBId(db_id); } { @@ -3252,11 +3415,11 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr); ASSERT_OK(s); ASSERT_NE(0, file_size); - file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, - ikey, 0, 0, false, Temperature::kUnknown, 0, 0, - 0, info.epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + file_metas->emplace_back( + file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, + Temperature::kUnknown, 0, 0, 0, info.epoch_number, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, + 0, 0, /* user_defined_timestamps_persisted */ true); } } @@ -3278,7 +3441,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ++last_seqno_; assert(log_writer_.get() != nullptr); std::string record; - ASSERT_TRUE(edit.EncodeTo(&record)); + ASSERT_TRUE(edit.EncodeTo(&record, 0 /* ts_sz */)); Status s = log_writer_->AddRecord(record); ASSERT_OK(s); } @@ -3313,7 +3476,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3374,7 +3538,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, 
kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0, + /* user_defined_timestamps_persisted */ true); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3500,7 +3665,6 @@ class ChargeFileMetadataTestWithParam ChargeFileMetadataTestWithParam() {} }; -#ifndef ROCKSDB_LITE INSTANTIATE_TEST_CASE_P( ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam, ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled, @@ -3508,6 +3672,7 @@ INSTANTIATE_TEST_CASE_P( TEST_P(ChargeFileMetadataTestWithParam, Basic) { Options options; + options.level_compaction_dynamic_level_bytes = false; BlockBasedTableOptions table_options; CacheEntryRoleOptions::Decision charge_file_metadata = GetParam(); table_options.cache_usage_options.options_overrides.insert( @@ -3611,7 +3776,6 @@ TEST_P(ChargeFileMetadataTestWithParam, Basic) { EXPECT_TRUE(s.ok()); } } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/db/version_util.h b/db/version_util.h index 5ec6fda119d3..acb27749b1b9 100644 --- a/db/version_util.h +++ b/db/version_util.h @@ -25,7 +25,9 @@ class OfflineManifestWriter { options.table_cache_numshardbits)), versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "") {} + /*db_id=*/"", /*db_session_id=*/"", + options.daily_offpeak_time_utc, + /*error_handler=*/nullptr) {} Status Recover(const std::vector& column_families) { return versions_.Recover(column_families, /*read_only*/ false, @@ -33,14 +35,15 @@ class OfflineManifestWriter { /*no_error_if_files_missing*/ true); } - Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit, + Status LogAndApply(const ReadOptions& read_options, ColumnFamilyData* cfd, + VersionEdit* edit, FSDirectory* dir_contains_current_file) { // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. InstrumentedMutex mutex; mutex.Lock(); - Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - edit, &mutex, dir_contains_current_file, - false /* new_descriptor_log */); + Status s = versions_.LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), read_options, edit, &mutex, + dir_contains_current_file, false /* new_descriptor_log */); mutex.Unlock(); return s; } diff --git a/db/wal_manager.cc b/db/wal_manager.cc index a6060235f216..2b384e7d2081 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -34,7 +34,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE Status WalManager::DeleteFile(const std::string& fname, uint64_t number) { auto s = env_->DeleteFile(wal_dir_ + "/" + fname); @@ -154,10 +153,11 @@ void WalManager::PurgeObsoleteWALFiles() { return; } uint64_t const now_seconds = static_cast(current_time); - uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) - ? db_options_.WAL_ttl_seconds / 2 - : kDefaultIntervalToDeleteObsoleteWAL; - + uint64_t const time_to_check = + ttl_enabled + ? 
std::min(kDefaultIntervalToDeleteObsoleteWAL, + std::max(uint64_t{1}, db_options_.WAL_ttl_seconds / 2)) + : kDefaultIntervalToDeleteObsoleteWAL; if (purge_wal_files_last_run_ + time_to_check > now_seconds) { return; } @@ -525,5 +525,4 @@ Status WalManager::ReadFirstLine(const std::string& fname, return status; } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/wal_manager.h b/db/wal_manager.h index 8cc0679357ae..ab79bf002392 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -28,7 +28,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE // WAL manager provides the abstraction for reading the WAL files as a single // unit. Internally, it opens and reads the files using Reader or Writer @@ -134,5 +133,4 @@ class WalManager { std::shared_ptr io_tracer_; }; -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 4ad4e9749a10..0ead57ae8115 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/wal_manager.h" @@ -51,11 +50,12 @@ class WalManagerTest : public testing::Test { db_options_.fs = env_->GetFileSystem(); db_options_.clock = env_->GetSystemClock().get(); - versions_.reset( - new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(), - &write_buffer_manager_, &write_controller_, - /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ "")); + versions_.reset(new VersionSet( + dbname_, &db_options_, env_options_, table_cache_.get(), + &write_buffer_manager_, &write_controller_, + /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, + /*db_id=*/"", /*db_session_id=*/"", /*daily_offpeak_time_utc=*/"", + /*error_handler=*/nullptr)); wal_manager_.reset( new WalManager(db_options_, env_options_, nullptr /*IOTracer*/)); @@ -334,13 +334,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/wide/db_wide_basic_test.cc b/db/wide/db_wide_basic_test.cc index 1ffe314fef70..2280a3ed2e99 100644 --- a/db/wide/db_wide_basic_test.cc +++ b/db/wide/db_wide_basic_test.cc @@ -4,11 +4,13 @@ // (found in the LICENSE.Apache file in the root directory). 
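Backing up to the PurgeObsoleteWALFiles change above: with TTL enabled, the purge pass now runs every min(default interval, max(1, WAL_ttl_seconds / 2)) seconds, so a one-second TTL no longer collapses to a zero-second interval and a very large TTL no longer stretches the cadence beyond the default. A standalone restatement of that expression; the 600-second default used in the examples is an assumption, not a value taken from this hunk:

#include <algorithm>
#include <cstdint>

uint64_t PurgeCheckIntervalSeconds(bool ttl_enabled, uint64_t wal_ttl_seconds,
                                   uint64_t default_interval) {
  if (!ttl_enabled) {
    return default_interval;
  }
  return std::min(default_interval,
                  std::max(uint64_t{1}, wal_ttl_seconds / 2));
}
// PurgeCheckIntervalSeconds(true, 1, 600)    == 1   (previously WAL_ttl_seconds / 2, i.e. 0)
// PurgeCheckIntervalSeconds(true, 7200, 600) == 600 (capped at the default interval)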
#include +#include #include #include "db/db_test_util.h" #include "port/stack_trace.h" #include "test_util/testutil.h" +#include "util/overload.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { @@ -104,6 +106,26 @@ TEST_F(DBWideBasicTest, PutEntity) { ASSERT_EQ(values[2], third_value); } + { + constexpr size_t num_keys = 3; + + std::array keys{{first_key, second_key, third_key}}; + std::array results; + std::array statuses; + + db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + &keys[0], &results[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(results[0].columns(), first_columns); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(results[1].columns(), second_columns); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(results[2].columns(), expected_third_columns); + } + { std::unique_ptr iter(db_->NewIterator(ReadOptions())); @@ -187,6 +209,11 @@ TEST_F(DBWideBasicTest, PutEntity) { ASSERT_OK(Flush()); verify(); + + // Reopen as Readonly DB and verify + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + verify(); } TEST_F(DBWideBasicTest, PutEntityColumnFamily) { @@ -209,6 +236,334 @@ TEST_F(DBWideBasicTest, PutEntityColumnFamily) { ASSERT_OK(db_->Write(WriteOptions(), &batch)); } +TEST_F(DBWideBasicTest, GetEntityAsPinnableAttributeGroups) { + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"hot_cf", "cold_cf"}, options); + + constexpr int kDefaultCfHandleIndex = 0; + constexpr int kHotCfHandleIndex = 1; + constexpr int kColdCfHandleIndex = 2; + + constexpr char first_key[] = "first"; + WideColumns first_default_columns{ + {"default_cf_col_1_name", "first_key_default_cf_col_1_value"}, + {"default_cf_col_2_name", "first_key_default_cf_col_2_value"}}; + WideColumns first_hot_columns{ + {"hot_cf_col_1_name", "first_key_hot_cf_col_1_value"}, + {"hot_cf_col_2_name", "first_key_hot_cf_col_2_value"}}; + WideColumns first_cold_columns{ + {"cold_cf_col_1_name", "first_key_cold_cf_col_1_value"}}; + + constexpr char second_key[] = "second"; + WideColumns second_hot_columns{ + {"hot_cf_col_1_name", "second_key_hot_cf_col_1_value"}}; + WideColumns second_cold_columns{ + {"cold_cf_col_1_name", "second_key_cold_cf_col_1_value"}}; + + AttributeGroups first_key_attribute_groups{ + AttributeGroup(handles_[kDefaultCfHandleIndex], first_default_columns), + AttributeGroup(handles_[kHotCfHandleIndex], first_hot_columns), + AttributeGroup(handles_[kColdCfHandleIndex], first_cold_columns)}; + AttributeGroups second_key_attribute_groups{ + AttributeGroup(handles_[kHotCfHandleIndex], second_hot_columns), + AttributeGroup(handles_[kColdCfHandleIndex], second_cold_columns)}; + ASSERT_OK( + db_->PutEntity(WriteOptions(), first_key, first_key_attribute_groups)); + ASSERT_OK( + db_->PutEntity(WriteOptions(), second_key, second_key_attribute_groups)); + + std::vector all_cfs = handles_; + std::vector default_and_hot_cfs{ + {handles_[kDefaultCfHandleIndex], handles_[kHotCfHandleIndex]}}; + std::vector hot_and_cold_cfs{ + {handles_[kHotCfHandleIndex], handles_[kColdCfHandleIndex]}}; + std::vector default_null_and_hot_cfs{ + handles_[kDefaultCfHandleIndex], nullptr, handles_[kHotCfHandleIndex], + nullptr}; + auto create_result = + [](const std::vector& column_families) + -> PinnableAttributeGroups { + PinnableAttributeGroups result; + for (size_t i = 0; i < column_families.size(); ++i) { + result.emplace_back(column_families[i]); + } + return result; + }; + { + // Case 1. 
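Stepping back to the MultiGetEntity coverage added near the top of this file's changes: the batched lookup mirrors MultiGet, filling one PinnableWideColumns and one Status per key. A compact usage sketch against the default column family, with db and the key names as placeholders:

constexpr size_t num_keys = 2;
std::array<Slice, num_keys> keys{{"k1", "k2"}};
std::array<PinnableWideColumns, num_keys> results;
std::array<Status, num_keys> statuses;

db->MultiGetEntity(ReadOptions(), db->DefaultColumnFamily(), num_keys,
                   keys.data(), results.data(), statuses.data());

for (size_t i = 0; i < num_keys; ++i) {
  if (statuses[i].ok()) {
    // results[i].columns() is the wide-column view; a plain value surfaces
    // as a single default column.
    for (const auto& col : results[i].columns()) {
      (void)col.name();
      (void)col.value();
    }
  }
}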
Invalid Argument (passing in null CF) + AttributeGroups ag{ + AttributeGroup(nullptr, first_default_columns), + AttributeGroup(handles_[kHotCfHandleIndex], first_hot_columns)}; + ASSERT_NOK(db_->PutEntity(WriteOptions(), first_key, ag)); + + PinnableAttributeGroups result = create_result(default_null_and_hot_cfs); + Status s = db_->GetEntity(ReadOptions(), first_key, &result); + ASSERT_NOK(s); + ASSERT_TRUE(s.IsInvalidArgument()); + // Valid CF, but failed with Incomplete status due to other attribute groups + ASSERT_TRUE(result[0].status().IsIncomplete()); + // Null CF + ASSERT_TRUE(result[1].status().IsInvalidArgument()); + // Valid CF, but failed with Incomplete status due to other attribute groups + ASSERT_TRUE(result[2].status().IsIncomplete()); + // Null CF, but failed with Incomplete status because the nullcheck break + // out early in the loop + ASSERT_TRUE(result[3].status().IsIncomplete()); + } + { + // Case 2. Get first key from default cf and hot_cf and second key from + // hot_cf and cold_cf + constexpr size_t num_column_families = 2; + PinnableAttributeGroups first_key_result = + create_result(default_and_hot_cfs); + PinnableAttributeGroups second_key_result = create_result(hot_and_cold_cfs); + + // GetEntity for first_key + ASSERT_OK(db_->GetEntity(ReadOptions(), first_key, &first_key_result)); + ASSERT_EQ(num_column_families, first_key_result.size()); + // We expect to get values for all keys and CFs + for (size_t i = 0; i < num_column_families; ++i) { + ASSERT_OK(first_key_result[i].status()); + } + // verify values for first key (default cf and hot cf) + ASSERT_EQ(first_default_columns, first_key_result[0].columns()); + ASSERT_EQ(first_hot_columns, first_key_result[1].columns()); + + // GetEntity for second_key + ASSERT_OK(db_->GetEntity(ReadOptions(), second_key, &second_key_result)); + ASSERT_EQ(num_column_families, second_key_result.size()); + // We expect to get values for all keys and CFs + for (size_t i = 0; i < num_column_families; ++i) { + ASSERT_OK(second_key_result[i].status()); + } + // verify values for second key (hot cf and cold cf) + ASSERT_EQ(second_hot_columns, second_key_result[0].columns()); + ASSERT_EQ(second_cold_columns, second_key_result[1].columns()); + } + { + // Case 3. Get first key and second key from all cfs. For the second key, we + // don't expect to get columns from default cf. 
+ constexpr size_t num_column_families = 3; + PinnableAttributeGroups first_key_result = create_result(all_cfs); + PinnableAttributeGroups second_key_result = create_result(all_cfs); + + // GetEntity for first_key + ASSERT_OK(db_->GetEntity(ReadOptions(), first_key, &first_key_result)); + ASSERT_EQ(num_column_families, first_key_result.size()); + // We expect to get values for all keys and CFs + for (size_t i = 0; i < num_column_families; ++i) { + ASSERT_OK(first_key_result[i].status()); + } + // verify values for first key + ASSERT_EQ(first_default_columns, first_key_result[0].columns()); + ASSERT_EQ(first_hot_columns, first_key_result[1].columns()); + ASSERT_EQ(first_cold_columns, first_key_result[2].columns()); + + // GetEntity for second_key + ASSERT_OK(db_->GetEntity(ReadOptions(), second_key, &second_key_result)); + ASSERT_EQ(num_column_families, second_key_result.size()); + // key does not exist in default cf + ASSERT_NOK(second_key_result[0].status()); + ASSERT_TRUE(second_key_result[0].status().IsNotFound()); + + // verify values for second key (hot cf and cold cf) + ASSERT_OK(second_key_result[1].status()); + ASSERT_OK(second_key_result[2].status()); + ASSERT_EQ(second_hot_columns, second_key_result[1].columns()); + ASSERT_EQ(second_cold_columns, second_key_result[2].columns()); + } +} + +TEST_F(DBWideBasicTest, MultiCFMultiGetEntity) { + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"corinthian"}, options); + + constexpr char first_key[] = "first"; + WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}}; + + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + first_key, first_columns)); + + constexpr char second_key[] = "second"; + WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}}; + + ASSERT_OK( + db_->PutEntity(WriteOptions(), handles_[1], second_key, second_columns)); + + constexpr size_t num_keys = 2; + + std::array column_families{ + {db_->DefaultColumnFamily(), handles_[1]}}; + std::array keys{{first_key, second_key}}; + std::array results; + std::array statuses; + + db_->MultiGetEntity(ReadOptions(), num_keys, &column_families[0], &keys[0], + &results[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(results[0].columns(), first_columns); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(results[1].columns(), second_columns); +} + +TEST_F(DBWideBasicTest, MultiCFMultiGetEntityAsPinnableAttributeGroups) { + Options options = GetDefaultOptions(); + CreateAndReopenWithCF({"hot_cf", "cold_cf"}, options); + + constexpr int kDefaultCfHandleIndex = 0; + constexpr int kHotCfHandleIndex = 1; + constexpr int kColdCfHandleIndex = 2; + + constexpr char first_key[] = "first"; + WideColumns first_default_columns{ + {"default_cf_col_1_name", "first_key_default_cf_col_1_value"}, + {"default_cf_col_2_name", "first_key_default_cf_col_2_value"}}; + WideColumns first_hot_columns{ + {"hot_cf_col_1_name", "first_key_hot_cf_col_1_value"}, + {"hot_cf_col_2_name", "first_key_hot_cf_col_2_value"}}; + WideColumns first_cold_columns{ + {"cold_cf_col_1_name", "first_key_cold_cf_col_1_value"}}; + constexpr char second_key[] = "second"; + WideColumns second_hot_columns{ + {"hot_cf_col_1_name", "second_key_hot_cf_col_1_value"}}; + WideColumns second_cold_columns{ + {"cold_cf_col_1_name", "second_key_cold_cf_col_1_value"}}; + + AttributeGroups first_key_attribute_groups{ + AttributeGroup(handles_[kDefaultCfHandleIndex], first_default_columns), + AttributeGroup(handles_[kHotCfHandleIndex], first_hot_columns), + 
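For the attribute-group variants set up around here: each PinnableAttributeGroups entry pins one result per column family for a single key, and the overloads that take them report per-column-family status on the groups themselves rather than through a separate Status array. A sketch under those assumptions, with db, hot_cf_handle, and the key as placeholders mirroring these tests:

PinnableAttributeGroups first_result;
first_result.emplace_back(db->DefaultColumnFamily());
first_result.emplace_back(hot_cf_handle);

Status s = db->GetEntity(ReadOptions(), "first", &first_result);
if (s.ok()) {
  for (size_t i = 0; i < first_result.size(); ++i) {
    if (first_result[i].status().ok()) {
      (void)first_result[i].columns();  // columns stored under that CF for this key
    }
  }
}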
AttributeGroup(handles_[kColdCfHandleIndex], first_cold_columns)}; + AttributeGroups second_key_attribute_groups{ + AttributeGroup(handles_[kHotCfHandleIndex], second_hot_columns), + AttributeGroup(handles_[kColdCfHandleIndex], second_cold_columns)}; + + ASSERT_OK( + db_->PutEntity(WriteOptions(), first_key, first_key_attribute_groups)); + ASSERT_OK( + db_->PutEntity(WriteOptions(), second_key, second_key_attribute_groups)); + + constexpr size_t num_keys = 2; + std::array keys = {first_key, second_key}; + std::vector all_cfs = handles_; + std::vector default_and_hot_cfs{ + {handles_[kDefaultCfHandleIndex], handles_[kHotCfHandleIndex]}}; + std::vector hot_and_cold_cfs{ + {handles_[kHotCfHandleIndex], handles_[kColdCfHandleIndex]}}; + std::vector null_and_hot_cfs{ + nullptr, handles_[kHotCfHandleIndex], nullptr}; + auto create_result = + [](const std::vector& column_families) + -> PinnableAttributeGroups { + PinnableAttributeGroups result; + for (size_t i = 0; i < column_families.size(); ++i) { + result.emplace_back(column_families[i]); + } + return result; + }; + { + // Check for invalid read option argument + ReadOptions read_options; + read_options.io_activity = Env::IOActivity::kGetEntity; + std::vector results; + for (size_t i = 0; i < num_keys; ++i) { + results.emplace_back(create_result(all_cfs)); + } + db_->MultiGetEntity(read_options, num_keys, keys.data(), results.data()); + for (size_t i = 0; i < num_keys; ++i) { + for (size_t j = 0; j < all_cfs.size(); ++j) { + ASSERT_NOK(results[i][j].status()); + ASSERT_TRUE(results[i][j].status().IsInvalidArgument()); + } + } + // Check for invalid column family in Attribute Group result + results.clear(); + results.emplace_back(create_result(null_and_hot_cfs)); + results.emplace_back(create_result(all_cfs)); + db_->MultiGetEntity(ReadOptions(), num_keys, keys.data(), results.data()); + + // First one failed due to null CFs in the AttributeGroup + // Null CF + ASSERT_NOK(results[0][0].status()); + ASSERT_TRUE(results[0][0].status().IsInvalidArgument()); + // Valid CF, but failed with incomplete status because of other attribute + // groups + ASSERT_NOK(results[0][1].status()); + ASSERT_TRUE(results[0][1].status().IsIncomplete()); + // Null CF + ASSERT_NOK(results[0][2].status()); + ASSERT_TRUE(results[0][2].status().IsInvalidArgument()); + + // Second one failed with Incomplete because first one failed + ASSERT_NOK(results[1][0].status()); + ASSERT_TRUE(results[1][0].status().IsIncomplete()); + ASSERT_NOK(results[1][1].status()); + ASSERT_TRUE(results[1][1].status().IsIncomplete()); + ASSERT_NOK(results[1][2].status()); + ASSERT_TRUE(results[1][2].status().IsIncomplete()); + } + { + // Case 1. 
Get first key from default cf and hot_cf and second key from + // hot_cf and cold_cf + std::vector results; + PinnableAttributeGroups first_key_result = + create_result(default_and_hot_cfs); + PinnableAttributeGroups second_key_result = create_result(hot_and_cold_cfs); + results.emplace_back(std::move(first_key_result)); + results.emplace_back(std::move(second_key_result)); + + db_->MultiGetEntity(ReadOptions(), num_keys, keys.data(), results.data()); + ASSERT_EQ(2, results.size()); + // We expect to get values for all keys and CFs + for (size_t i = 0; i < num_keys; ++i) { + for (size_t j = 0; j < 2; ++j) { + ASSERT_OK(results[i][j].status()); + } + } + // verify values for first key (default cf and hot cf) + ASSERT_EQ(2, results[0].size()); + ASSERT_EQ(first_default_columns, results[0][0].columns()); + ASSERT_EQ(first_hot_columns, results[0][1].columns()); + + // verify values for second key (hot cf and cold cf) + ASSERT_EQ(2, results[1].size()); + ASSERT_EQ(second_hot_columns, results[1][0].columns()); + ASSERT_EQ(second_cold_columns, results[1][1].columns()); + } + { + // Case 2. Get first key and second key from all cfs. For the second key, we + // don't expect to get columns from default cf. + std::vector results; + PinnableAttributeGroups first_key_result = create_result(all_cfs); + PinnableAttributeGroups second_key_result = create_result(all_cfs); + results.emplace_back(std::move(first_key_result)); + results.emplace_back(std::move(second_key_result)); + + db_->MultiGetEntity(ReadOptions(), num_keys, keys.data(), results.data()); + // verify first key + for (size_t i = 0; i < all_cfs.size(); ++i) { + ASSERT_OK(results[0][i].status()); + } + ASSERT_EQ(3, results[0].size()); + ASSERT_EQ(first_default_columns, results[0][0].columns()); + ASSERT_EQ(first_hot_columns, results[0][1].columns()); + ASSERT_EQ(first_cold_columns, results[0][2].columns()); + + // verify second key + // key does not exist in default cf + ASSERT_NOK(results[1][0].status()); + ASSERT_TRUE(results[1][0].status().IsNotFound()); + ASSERT_TRUE(results[1][0].columns().empty()); + + // key exists in hot_cf and cold_cf + ASSERT_OK(results[1][1].status()); + ASSERT_EQ(second_hot_columns, results[1][1].columns()); + ASSERT_OK(results[1][2].status()); + ASSERT_EQ(second_cold_columns, results[1][2].columns()); + } +} + TEST_F(DBWideBasicTest, MergePlainKeyValue) { Options options = GetDefaultOptions(); options.create_if_missing = true; @@ -279,6 +634,26 @@ TEST_F(DBWideBasicTest, MergePlainKeyValue) { ASSERT_EQ(result.columns(), expected_third_columns); } + { + constexpr size_t num_keys = 3; + + std::array keys{{first_key, second_key, third_key}}; + std::array results; + std::array statuses; + + db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + &keys[0], &results[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(results[0].columns(), expected_first_columns); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(results[1].columns(), expected_second_columns); + + ASSERT_OK(statuses[2]); + ASSERT_EQ(results[2].columns(), expected_third_columns); + } + { std::unique_ptr iter(db_->NewIterator(ReadOptions())); @@ -456,6 +831,23 @@ TEST_F(DBWideBasicTest, MergeEntity) { ASSERT_OK(statuses[1]); } + { + constexpr size_t num_keys = 2; + + std::array keys{{first_key, second_key}}; + std::array results; + std::array statuses; + + db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + &keys[0], &results[0], &statuses[0]); + + ASSERT_OK(statuses[0]); + ASSERT_EQ(results[0].columns(), 
first_expected_columns); + + ASSERT_OK(statuses[1]); + ASSERT_EQ(results[1].columns(), second_expected_columns); + } + { std::unique_ptr iter(db_->NewIterator(ReadOptions())); @@ -593,6 +985,693 @@ TEST_F(DBWideBasicTest, MergeEntity) { verify_merge_ops_post_compaction(); } +class DBWideMergeV3Test : public DBWideBasicTest { + protected: + void RunTest(const WideColumns& first_expected, + const WideColumns& second_expected, + const WideColumns& third_expected) { + // Note: we'll take some snapshots to prevent merging during flush + snapshots_.reserve(6); + + // Test reading from memtables + WriteKeyValues(); + VerifyKeyValues(first_expected, second_expected, third_expected); + VerifyMergeOperandCount(first_key, 2); + VerifyMergeOperandCount(second_key, 3); + VerifyMergeOperandCount(third_key, 3); + + // Test reading from SST files + ASSERT_OK(Flush()); + VerifyKeyValues(first_expected, second_expected, third_expected); + VerifyMergeOperandCount(first_key, 2); + VerifyMergeOperandCount(second_key, 3); + VerifyMergeOperandCount(third_key, 3); + + // Test reading from SSTs after compaction. Note that we write the same KVs + // and flush again so we have two overlapping files. We also release the + // snapshots so that the compaction can merge all keys. + WriteKeyValues(); + ASSERT_OK(Flush()); + + snapshots_.clear(); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + VerifyKeyValues(first_expected, second_expected, third_expected); + VerifyMergeOperandCount(first_key, 1); + VerifyMergeOperandCount(second_key, 1); + VerifyMergeOperandCount(third_key, 1); + } + + void WriteKeyValues() { + // Base values + ASSERT_OK(db_->Delete(WriteOptions(), db_->DefaultColumnFamily(), + first_key)); // no base value + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), second_key, + second_base_value)); // plain base value + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + third_key, + third_columns)); // wide-column base value + + snapshots_.emplace_back(db_); + + // First round of merge operands + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key, + first_merge_op1)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key, + second_merge_op1)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key, + third_merge_op1)); + + snapshots_.emplace_back(db_); + + // Second round of merge operands + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key, + first_merge_op2)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key, + second_merge_op2)); + ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key, + third_merge_op2)); + + snapshots_.emplace_back(db_); + } + + void VerifyKeyValues(const WideColumns& first_expected, + const WideColumns& second_expected, + const WideColumns& third_expected) { + assert(!first_expected.empty() && + first_expected[0].name() == kDefaultWideColumnName); + assert(!second_expected.empty() && + second_expected[0].name() == kDefaultWideColumnName); + assert(!third_expected.empty() && + third_expected[0].name() == kDefaultWideColumnName); + + // Get + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key, + &result)); + ASSERT_EQ(result, first_expected[0].value()); + } + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key, + &result)); + ASSERT_EQ(result, 
second_expected[0].value()); + } + + { + PinnableSlice result; + ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), third_key, + &result)); + ASSERT_EQ(result, third_expected[0].value()); + } + + // MultiGet + { + std::array keys{{first_key, second_key, third_key}}; + std::array values; + std::array statuses; + + db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), values.data(), statuses.data()); + ASSERT_OK(statuses[0]); + ASSERT_EQ(values[0], first_expected[0].value()); + ASSERT_OK(statuses[1]); + ASSERT_EQ(values[1], second_expected[0].value()); + ASSERT_OK(statuses[2]); + ASSERT_EQ(values[2], third_expected[0].value()); + } + + // GetEntity + { + PinnableWideColumns result; + + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + ASSERT_EQ(result.columns(), first_expected); + } + + { + PinnableWideColumns result; + + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + ASSERT_EQ(result.columns(), second_expected); + } + + { + PinnableWideColumns result; + + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + third_key, &result)); + ASSERT_EQ(result.columns(), third_expected); + } + + // MultiGetEntity + { + std::array keys{{first_key, second_key, third_key}}; + std::array results; + std::array statuses; + + db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys, + keys.data(), results.data(), statuses.data()); + ASSERT_OK(statuses[0]); + ASSERT_EQ(results[0].columns(), first_expected); + ASSERT_OK(statuses[1]); + ASSERT_EQ(results[1].columns(), second_expected); + ASSERT_OK(statuses[2]); + ASSERT_EQ(results[2].columns(), third_expected); + } + + // Iterator + { + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + + iter->SeekToFirst(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_expected[0].value()); + ASSERT_EQ(iter->columns(), first_expected); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_expected[0].value()); + ASSERT_EQ(iter->columns(), second_expected); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_expected[0].value()); + ASSERT_EQ(iter->columns(), third_expected); + + iter->Next(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), third_key); + ASSERT_EQ(iter->value(), third_expected[0].value()); + ASSERT_EQ(iter->columns(), third_expected); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), second_key); + ASSERT_EQ(iter->value(), second_expected[0].value()); + ASSERT_EQ(iter->columns(), second_expected); + + iter->Prev(); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + ASSERT_EQ(iter->key(), first_key); + ASSERT_EQ(iter->value(), first_expected[0].value()); + ASSERT_EQ(iter->columns(), first_expected); + + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); + } + } + + void VerifyMergeOperandCount(const Slice& key, int expected_merge_ops) { + GetMergeOperandsOptions get_merge_opts; + get_merge_opts.expected_max_number_of_operands = expected_merge_ops; + + std::vector merge_operands(expected_merge_ops); + int number_of_operands = 0; + 
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), + key, merge_operands.data(), &get_merge_opts, + &number_of_operands)); + ASSERT_EQ(number_of_operands, expected_merge_ops); + } + + std::vector snapshots_; + + static constexpr size_t num_keys = 3; + + static constexpr char first_key[] = "first"; + static constexpr char first_merge_op1[] = "hello"; + static constexpr char first_merge_op1_upper[] = "HELLO"; + static constexpr char first_merge_op2[] = "world"; + static constexpr char first_merge_op2_upper[] = "WORLD"; + + static constexpr char second_key[] = "second"; + static constexpr char second_base_value[] = "foo"; + static constexpr char second_base_value_upper[] = "FOO"; + static constexpr char second_merge_op1[] = "bar"; + static constexpr char second_merge_op1_upper[] = "BAR"; + static constexpr char second_merge_op2[] = "baz"; + static constexpr char second_merge_op2_upper[] = "BAZ"; + + static constexpr char third_key[] = "third"; + static const WideColumns third_columns; + static constexpr char third_merge_op1[] = "three"; + static constexpr char third_merge_op1_upper[] = "THREE"; + static constexpr char third_merge_op2[] = "four"; + static constexpr char third_merge_op2_upper[] = "FOUR"; +}; + +const WideColumns DBWideMergeV3Test::third_columns{{"one", "ONE"}, + {"two", "TWO"}}; + +TEST_F(DBWideMergeV3Test, MergeV3WideColumnOutput) { + // A test merge operator that always returns a wide-column result. It adds any + // base values and merge operands to a single wide-column entity, and converts + // all column values to uppercase. In addition, it puts "none", "plain", or + // "wide" into the value of the default column depending on the type of the + // base value (if any). + static constexpr char kNone[] = "none"; + static constexpr char kPlain[] = "plain"; + static constexpr char kWide[] = "wide"; + + class WideColumnOutputMergeOperator : public MergeOperator { + public: + bool FullMergeV3(const MergeOperationInputV3& merge_in, + MergeOperationOutputV3* merge_out) const override { + assert(merge_out); + + merge_out->new_value = MergeOperationOutputV3::NewColumns(); + auto& new_columns = + std::get(merge_out->new_value); + + auto upper = [](std::string str) { + for (char& c : str) { + c = static_cast(std::toupper(static_cast(c))); + } + + return str; + }; + + std::visit(overload{[&](const std::monostate&) { + new_columns.emplace_back( + kDefaultWideColumnName.ToString(), kNone); + }, + [&](const Slice& value) { + new_columns.emplace_back( + kDefaultWideColumnName.ToString(), kPlain); + + const std::string val = value.ToString(); + new_columns.emplace_back(val, upper(val)); + }, + [&](const WideColumns& columns) { + new_columns.emplace_back( + kDefaultWideColumnName.ToString(), kWide); + + for (const auto& column : columns) { + new_columns.emplace_back( + column.name().ToString(), + upper(column.value().ToString())); + } + }}, + merge_in.existing_value); + + for (const auto& operand : merge_in.operand_list) { + const std::string op = operand.ToString(); + new_columns.emplace_back(op, upper(op)); + } + + return true; + } + + const char* Name() const override { + return "WideColumnOutputMergeOperator"; + } + }; + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.merge_operator = std::make_shared(); + Reopen(options); + + // Expected results + // Lexicographical order: [default] < hello < world + const WideColumns first_expected{{kDefaultWideColumnName, kNone}, + {first_merge_op1, first_merge_op1_upper}, + 
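On the VerifyMergeOperandCount helper above: GetMergeOperands returns the raw, unmerged operands for a key, which is how these tests confirm that flushing keeps operands separate (the snapshots pin them) while compaction collapses each key down to a single operand. A minimal standalone call, with db, the key, and the operand bound as placeholders:

GetMergeOperandsOptions get_merge_opts;
get_merge_opts.expected_max_number_of_operands = 4;  // upper bound, placeholder value

std::vector<PinnableSlice> operands(get_merge_opts.expected_max_number_of_operands);
int number_of_operands = 0;

Status s = db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(),
                                "third", operands.data(), &get_merge_opts,
                                &number_of_operands);
// On success, operands[0 .. number_of_operands) hold the unmerged entries.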
{first_merge_op2, first_merge_op2_upper}}; + // Lexicographical order: [default] < bar < baz < foo + const WideColumns second_expected{ + {kDefaultWideColumnName, kPlain}, + {second_merge_op1, second_merge_op1_upper}, + {second_merge_op2, second_merge_op2_upper}, + {second_base_value, second_base_value_upper}}; + // Lexicographical order: [default] < four < one < three < two + const WideColumns third_expected{ + {kDefaultWideColumnName, kWide}, + {third_merge_op2, third_merge_op2_upper}, + {third_columns[0].name(), third_columns[0].value()}, + {third_merge_op1, third_merge_op1_upper}, + {third_columns[1].name(), third_columns[1].value()}}; + + RunTest(first_expected, second_expected, third_expected); +} + +TEST_F(DBWideMergeV3Test, MergeV3PlainOutput) { + // A test merge operator that always returns a plain value as result, namely + // the total number of operands serialized as a string. Base values are also + // counted as operands; specifically, a plain base value is counted as one + // operand, while a wide-column base value is counted as as many operands as + // the number of columns. + class PlainOutputMergeOperator : public MergeOperator { + public: + bool FullMergeV3(const MergeOperationInputV3& merge_in, + MergeOperationOutputV3* merge_out) const override { + assert(merge_out); + + size_t count = 0; + std::visit( + overload{[&](const std::monostate&) {}, + [&](const Slice&) { count = 1; }, + [&](const WideColumns& columns) { count = columns.size(); }}, + merge_in.existing_value); + + count += merge_in.operand_list.size(); + + merge_out->new_value = std::string(); + std::get(merge_out->new_value) = std::to_string(count); + + return true; + } + + const char* Name() const override { return "PlainOutputMergeOperator"; } + }; + + Options options = GetDefaultOptions(); + options.create_if_missing = true; + options.merge_operator = std::make_shared(); + Reopen(options); + + const WideColumns first_expected{{kDefaultWideColumnName, "2"}}; + const WideColumns second_expected{{kDefaultWideColumnName, "3"}}; + const WideColumns third_expected{{kDefaultWideColumnName, "4"}}; + + RunTest(first_expected, second_expected, third_expected); +} + +TEST_F(DBWideBasicTest, CompactionFilter) { + Options options = GetDefaultOptions(); + options.create_if_missing = true; + + // Wide-column entity with default column + constexpr char first_key[] = "first"; + WideColumns first_columns{{kDefaultWideColumnName, "a"}, + {"attr_name1", "foo"}, + {"attr_name2", "bar"}}; + WideColumns first_columns_uppercase{{kDefaultWideColumnName, "A"}, + {"attr_name1", "FOO"}, + {"attr_name2", "BAR"}}; + + // Wide-column entity without default column + constexpr char second_key[] = "second"; + WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}}; + WideColumns second_columns_uppercase{{"attr_one", "TWO"}, + {"attr_three", "FOUR"}}; + + // Plain old key-value + constexpr char last_key[] = "last"; + constexpr char last_value[] = "baz"; + constexpr char last_value_uppercase[] = "BAZ"; + + auto write = [&] { + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + first_key, first_columns)); + ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(), + second_key, second_columns)); + + ASSERT_OK(Flush()); + + ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), last_key, + last_value)); + + ASSERT_OK(Flush()); + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr, + /* end */ nullptr)); + }; + + // Test a compaction filter that keeps all entries + { + 
class KeepFilter : public CompactionFilter { + public: + Decision FilterV3( + int /* level */, const Slice& /* key */, ValueType /* value_type */, + const Slice* /* existing_value */, + const WideColumns* /* existing_columns */, + std::string* /* new_value */, + std::vector>* /* new_columns */, + std::string* /* skip_until */) const override { + return Decision::kKeep; + } + + const char* Name() const override { return "KeepFilter"; } + }; + + KeepFilter filter; + options.compaction_filter = &filter; + + DestroyAndReopen(options); + + write(); + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + ASSERT_EQ(result.columns(), first_columns); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + ASSERT_EQ(result.columns(), second_columns); + } + + // Note: GetEntity should return an entity with a single default column, + // since last_key is a plain key-value + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + last_key, &result)); + + WideColumns expected_columns{{kDefaultWideColumnName, last_value}}; + ASSERT_EQ(result.columns(), expected_columns); + } + } + + // Test a compaction filter that removes all entries + { + class RemoveFilter : public CompactionFilter { + public: + Decision FilterV3( + int /* level */, const Slice& /* key */, ValueType /* value_type */, + const Slice* /* existing_value */, + const WideColumns* /* existing_columns */, + std::string* /* new_value */, + std::vector>* /* new_columns */, + std::string* /* skip_until */) const override { + return Decision::kRemove; + } + + const char* Name() const override { return "RemoveFilter"; } + }; + + RemoveFilter filter; + options.compaction_filter = &filter; + + DestroyAndReopen(options); + + write(); + + { + PinnableWideColumns result; + ASSERT_TRUE(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result) + .IsNotFound()); + } + + { + PinnableWideColumns result; + ASSERT_TRUE(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result) + .IsNotFound()); + } + + { + PinnableWideColumns result; + ASSERT_TRUE(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + last_key, &result) + .IsNotFound()); + } + } + + // Test a compaction filter that changes the values of entries to uppercase. + // The new entry is always a plain key-value; if the existing entry is a + // wide-column entity, only the value of its first column is kept. 
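  // Illustration (not part of the patch): based on the assertions further
  // below, the three entries written by write() are expected to come out of
  // compaction as plain key-values under this filter:
  //   "first"  {default: "a", attr_name1: "foo", attr_name2: "bar"} -> "A"
  //   "second" {attr_one: "two", attr_three: "four"}                -> "TWO"
  //   "last"   "baz"                                                -> "BAZ"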
+ { + class ChangeValueFilter : public CompactionFilter { + public: + Decision FilterV3( + int /* level */, const Slice& /* key */, ValueType value_type, + const Slice* existing_value, const WideColumns* existing_columns, + std::string* new_value, + std::vector>* /* new_columns */, + std::string* /* skip_until */) const override { + assert(new_value); + + auto upper = [](const std::string& str) { + std::string result(str); + + for (char& c : result) { + c = static_cast(std::toupper(static_cast(c))); + } + + return result; + }; + + if (value_type == ValueType::kWideColumnEntity) { + assert(existing_columns); + + if (!existing_columns->empty()) { + *new_value = upper(existing_columns->front().value().ToString()); + } + } else { + assert(existing_value); + + *new_value = upper(existing_value->ToString()); + } + + return Decision::kChangeValue; + } + + const char* Name() const override { return "ChangeValueFilter"; } + }; + + ChangeValueFilter filter; + options.compaction_filter = &filter; + + DestroyAndReopen(options); + + write(); + + // Note: GetEntity should return entities with a single default column, + // since all entries are now plain key-values + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + + WideColumns expected_columns{ + {kDefaultWideColumnName, first_columns_uppercase[0].value()}}; + ASSERT_EQ(result.columns(), expected_columns); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + + WideColumns expected_columns{ + {kDefaultWideColumnName, second_columns_uppercase[0].value()}}; + ASSERT_EQ(result.columns(), expected_columns); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + last_key, &result)); + + WideColumns expected_columns{ + {kDefaultWideColumnName, last_value_uppercase}}; + ASSERT_EQ(result.columns(), expected_columns); + } + } + + // Test a compaction filter that changes the column values of entries to + // uppercase. The new entry is always a wide-column entity; if the existing + // entry is a plain key-value, it is converted to a wide-column entity with a + // single default column. 
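  // Illustration (not part of the patch): based on the assertions further
  // below, the expected post-compaction entities under this filter are:
  //   "first"  -> {default: "A", attr_name1: "FOO", attr_name2: "BAR"}
  //   "second" -> {attr_one: "TWO", attr_three: "FOUR"}
  //   "last"   -> {default: "BAZ"}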
+ { + class ChangeEntityFilter : public CompactionFilter { + public: + Decision FilterV3( + int /* level */, const Slice& /* key */, ValueType value_type, + const Slice* existing_value, const WideColumns* existing_columns, + std::string* /* new_value */, + std::vector>* new_columns, + std::string* /* skip_until */) const override { + assert(new_columns); + + auto upper = [](const std::string& str) { + std::string result(str); + + for (char& c : result) { + c = static_cast(std::toupper(static_cast(c))); + } + + return result; + }; + + if (value_type == ValueType::kWideColumnEntity) { + assert(existing_columns); + + for (const auto& column : *existing_columns) { + new_columns->emplace_back(column.name().ToString(), + upper(column.value().ToString())); + } + } else { + assert(existing_value); + + new_columns->emplace_back(kDefaultWideColumnName.ToString(), + upper(existing_value->ToString())); + } + + return Decision::kChangeWideColumnEntity; + } + + const char* Name() const override { return "ChangeEntityFilter"; } + }; + + ChangeEntityFilter filter; + options.compaction_filter = &filter; + + DestroyAndReopen(options); + + write(); + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + first_key, &result)); + ASSERT_EQ(result.columns(), first_columns_uppercase); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + second_key, &result)); + ASSERT_EQ(result.columns(), second_columns_uppercase); + } + + { + PinnableWideColumns result; + ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), + last_key, &result)); + + WideColumns expected_columns{ + {kDefaultWideColumnName, last_value_uppercase}}; + ASSERT_EQ(result.columns(), expected_columns); + } + } +} + TEST_F(DBWideBasicTest, PutEntityTimestampError) { // Note: timestamps are currently not supported diff --git a/db/wide/wide_column_serialization.cc b/db/wide/wide_column_serialization.cc index f62143c4052e..bb3f29584f9a 100644 --- a/db/wide/wide_column_serialization.cc +++ b/db/wide/wide_column_serialization.cc @@ -9,17 +9,16 @@ #include #include +#include "db/wide/wide_columns_helper.h" #include "rocksdb/slice.h" #include "util/autovector.h" #include "util/coding.h" namespace ROCKSDB_NAMESPACE { -Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default, - const WideColumns& columns, - std::string& output) { - const size_t num_columns = - value_of_default ? 
columns.size() + 1 : columns.size(); +Status WideColumnSerialization::Serialize(const WideColumns& columns, + std::string& output) { + const size_t num_columns = columns.size(); if (num_columns > static_cast(std::numeric_limits::max())) { return Status::InvalidArgument("Too many wide columns"); @@ -30,17 +29,6 @@ Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default, PutVarint32(&output, static_cast(num_columns)); const Slice* prev_name = nullptr; - if (value_of_default) { - if (value_of_default->size() > - static_cast(std::numeric_limits::max())) { - return Status::InvalidArgument("Wide column value too long"); - } - - PutLengthPrefixedSlice(&output, kDefaultWideColumnName); - PutVarint32(&output, static_cast(value_of_default->size())); - - prev_name = &kDefaultWideColumnName; - } for (size_t i = 0; i < columns.size(); ++i) { const WideColumn& column = columns[i]; @@ -67,10 +55,6 @@ Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default, prev_name = &name; } - if (value_of_default) { - output.append(value_of_default->data(), value_of_default->size()); - } - for (const auto& column : columns) { const Slice& value = column.value(); @@ -169,12 +153,12 @@ Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input, return s; } - if (columns.empty() || columns[0].name() != kDefaultWideColumnName) { + if (!WideColumnsHelper::HasDefaultColumn(columns)) { value.clear(); return Status::OK(); } - value = columns[0].value(); + value = WideColumnsHelper::GetDefaultColumn(columns); return Status::OK(); } diff --git a/db/wide/wide_column_serialization.h b/db/wide/wide_column_serialization.h index f0ffbd392481..bb92db04f126 100644 --- a/db/wide/wide_column_serialization.h +++ b/db/wide/wide_column_serialization.h @@ -44,9 +44,6 @@ class Slice; class WideColumnSerialization { public: static Status Serialize(const WideColumns& columns, std::string& output); - static Status Serialize(const Slice& value_of_default, - const WideColumns& other_columns, - std::string& output); static Status Deserialize(Slice& input, WideColumns& columns); @@ -55,23 +52,6 @@ class WideColumnSerialization { static Status GetValueOfDefaultColumn(Slice& input, Slice& value); static constexpr uint32_t kCurrentVersion = 1; - - private: - static Status SerializeImpl(const Slice* value_of_default, - const WideColumns& columns, std::string& output); }; -inline Status WideColumnSerialization::Serialize(const WideColumns& columns, - std::string& output) { - constexpr Slice* value_of_default = nullptr; - - return SerializeImpl(value_of_default, columns, output); -} - -inline Status WideColumnSerialization::Serialize( - const Slice& value_of_default, const WideColumns& other_columns, - std::string& output) { - return SerializeImpl(&value_of_default, other_columns, output); -} - } // namespace ROCKSDB_NAMESPACE diff --git a/db/wide/wide_column_serialization_test.cc b/db/wide/wide_column_serialization_test.cc index 8060d2f24e48..a52d8eb3bf1d 100644 --- a/db/wide/wide_column_serialization_test.cc +++ b/db/wide/wide_column_serialization_test.cc @@ -124,25 +124,6 @@ TEST(WideColumnSerializationTest, SerializeDeserialize) { } } -TEST(WideColumnSerializationTest, SerializeWithPrepend) { - Slice value_of_default("baz"); - WideColumns other_columns{{"foo", "bar"}, {"hello", "world"}}; - - std::string output; - ASSERT_OK(WideColumnSerialization::Serialize(value_of_default, other_columns, - output)); - - Slice input(output); - - WideColumns deserialized_columns; - 
ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns)); - - WideColumns expected_columns{{kDefaultWideColumnName, value_of_default}, - other_columns[0], - other_columns[1]}; - ASSERT_EQ(deserialized_columns, expected_columns); -} - TEST(WideColumnSerializationTest, SerializeDuplicateError) { WideColumns columns{{"foo", "bar"}, {"foo", "baz"}}; std::string output; @@ -151,16 +132,6 @@ TEST(WideColumnSerializationTest, SerializeDuplicateError) { WideColumnSerialization::Serialize(columns, output).IsCorruption()); } -TEST(WideColumnSerializationTest, SerializeWithPrependDuplicateError) { - Slice value_of_default("baz"); - WideColumns other_columns{{kDefaultWideColumnName, "dup"}, {"foo", "bar"}}; - - std::string output; - ASSERT_TRUE(WideColumnSerialization::Serialize(value_of_default, - other_columns, output) - .IsCorruption()); -} - TEST(WideColumnSerializationTest, SerializeOutOfOrderError) { WideColumns columns{{"hello", "world"}, {"foo", "bar"}}; std::string output; diff --git a/db/wide/wide_columns_helper.cc b/db/wide/wide_columns_helper.cc new file mode 100644 index 000000000000..cf829ce7969a --- /dev/null +++ b/db/wide/wide_columns_helper.cc @@ -0,0 +1,46 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wide/wide_columns_helper.h" + +#include + +#include "db/wide/wide_column_serialization.h" + +namespace ROCKSDB_NAMESPACE { +void WideColumnsHelper::DumpWideColumns(const WideColumns& columns, + std::ostream& os, bool hex) { + if (columns.empty()) { + return; + } + if (hex) { + os << std::hex; + } + auto it = columns.begin(); + os << *it; + for (++it; it != columns.end(); ++it) { + os << ' ' << *it; + } +} + +Status WideColumnsHelper::DumpSliceAsWideColumns(const Slice& value, + std::ostream& os, bool hex) { + WideColumns columns; + Slice value_copy = value; + const Status s = WideColumnSerialization::Deserialize(value_copy, columns); + if (s.ok()) { + DumpWideColumns(columns, os, hex); + } + return s; +} + +void WideColumnsHelper::SortColumns(WideColumns& columns) { + std::sort(columns.begin(), columns.end(), + [](const WideColumn& lhs, const WideColumn& rhs) { + return lhs.name().compare(rhs.name()) < 0; + }); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/wide/wide_columns_helper.h b/db/wide/wide_columns_helper.h new file mode 100644 index 000000000000..a870fae30d44 --- /dev/null +++ b/db/wide/wide_columns_helper.h @@ -0,0 +1,40 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
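A minimal usage sketch for the new helper (not part of the patch). It relies only on the WideColumnsHelper methods declared below and on the public WideColumns/WideColumn types; the function name is illustrative.

#include <sstream>

#include "db/wide/wide_columns_helper.h"
#include "rocksdb/wide_columns.h"

void WideColumnsHelperSketch() {
  using namespace ROCKSDB_NAMESPACE;

  WideColumns columns{{"b", "2"}, {"a", "1"}};

  // Sort by column name, as required before serialization (this is what
  // WriteBatchInternal::PutEntity does later in this change).
  WideColumnsHelper::SortColumns(columns);

  // Read the value of the default (anonymous) column, if present.
  if (WideColumnsHelper::HasDefaultColumn(columns)) {
    const Slice& default_value = WideColumnsHelper::GetDefaultColumn(columns);
    (void)default_value;
  }

  // Render the columns for logging/debugging; produces "a:1 b:2".
  std::ostringstream oss;
  WideColumnsHelper::DumpWideColumns(columns, oss, false /* hex */);
}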
+ +#pragma once +#include +#include + +#include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/wide_columns.h" + +namespace ROCKSDB_NAMESPACE { + +class WideColumnsHelper { + public: + static void DumpWideColumns(const WideColumns& columns, std::ostream& os, + bool hex); + + static Status DumpSliceAsWideColumns(const Slice& value, std::ostream& os, + bool hex); + + static bool HasDefaultColumn(const WideColumns& columns) { + return !columns.empty() && columns.front().name() == kDefaultWideColumnName; + } + + static bool HasDefaultColumnOnly(const WideColumns& columns) { + return columns.size() == 1 && + columns.front().name() == kDefaultWideColumnName; + } + + static const Slice& GetDefaultColumn(const WideColumns& columns) { + assert(HasDefaultColumn(columns)); + return columns.front().value(); + } + + static void SortColumns(WideColumns& columns); +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db/wide/wide_columns_helper_test.cc b/db/wide/wide_columns_helper_test.cc new file mode 100644 index 000000000000..482bba531000 --- /dev/null +++ b/db/wide/wide_columns_helper_test.cc @@ -0,0 +1,39 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/wide/wide_columns_helper.h" + +#include "db/wide/wide_column_serialization.h" +#include "test_util/testharness.h" +#include "util/coding.h" + +namespace ROCKSDB_NAMESPACE { + +TEST(WideColumnsHelperTest, DumpWideColumns) { + WideColumns columns{{"foo", "bar"}, {"hello", "world"}}; + std::ostringstream oss; + WideColumnsHelper::DumpWideColumns(columns, oss, false /* hex */); + EXPECT_EQ("foo:bar hello:world", oss.str()); +} + +TEST(WideColumnsHelperTest, DumpSliceAsWideColumns) { + WideColumns columns{{"foo", "bar"}, {"hello", "world"}}; + std::string output; + ASSERT_OK(WideColumnSerialization::Serialize(columns, output)); + Slice input(output); + + std::ostringstream oss; + ASSERT_OK( + WideColumnsHelper::DumpSliceAsWideColumns(input, oss, false /* hex */)); + + EXPECT_EQ("foo:bar hello:world", oss.str()); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/write_batch.cc b/db/write_batch.cc index 4d310d9ea556..f8583f478edc 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -39,6 +39,7 @@ #include "rocksdb/write_batch.h" #include +#include #include #include #include @@ -57,9 +58,10 @@ #include "db/snapshot_impl.h" #include "db/trim_history_scheduler.h" #include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "db/write_batch_internal.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/lang.h" #include "rocksdb/merge_operator.h" #include "rocksdb/system_clock.h" @@ -293,6 +295,12 @@ size_t WriteBatch::GetProtectionBytesPerKey() const { return 0; } +std::string WriteBatch::Release() { + std::string ret = std::move(rep_); + Clear(); + return ret; +} + bool WriteBatch::HasPut() const { return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0; } @@ -740,6 +748,11 @@ size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) { return WriteBatchInternal::kHeader; } +void WriteBatchInternal::SetDefaultColumnFamilyTimestampSize( + WriteBatch* wb, 
size_t default_cf_ts_sz) { + wb->default_cf_ts_sz_ = default_cf_ts_sz; +} + std::tuple WriteBatchInternal::GetColumnFamilyIdAndTimestampSize( WriteBatch* b, ColumnFamilyHandle* column_family) { @@ -937,10 +950,7 @@ Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id, } WideColumns sorted_columns(columns); - std::sort(sorted_columns.begin(), sorted_columns.end(), - [](const WideColumn& lhs, const WideColumn& rhs) { - return lhs.name().compare(rhs.name()) < 0; - }); + WideColumnsHelper::SortColumns(sorted_columns); std::string entity; const Status s = WideColumnSerialization::Serialize(sorted_columns, entity); @@ -1007,6 +1017,22 @@ Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family, return WriteBatchInternal::PutEntity(this, cf_id, key, columns); } +Status WriteBatch::PutEntity(const Slice& key, + const AttributeGroups& attribute_groups) { + if (attribute_groups.empty()) { + return Status::InvalidArgument( + "Cannot call this method with empty attribute groups"); + } + Status s; + for (const AttributeGroup& ag : attribute_groups) { + s = PutEntity(ag.column_family(), key, ag.columns()); + if (!s.ok()) { + return s; + } + } + return s; +} + Status WriteBatchInternal::InsertNoop(WriteBatch* b) { b->rep_.push_back(static_cast(kTypeNoop)); return Status::OK(); @@ -2036,6 +2062,7 @@ class MemTableInserter : public WriteBatch::Handler { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity ReadOptions ropts; // it's going to be overwritten for sure, so no point caching data block // containing the old version @@ -2473,13 +2500,16 @@ class MemTableInserter : public WriteBatch::Handler { } if (perform_merge) { - // 1) Get the existing value - std::string get_value; + // 1) Get the existing value. Use the wide column APIs to make sure we + // don't lose any columns in the process. + PinnableWideColumns existing; // Pass in the sequence number so that we also include previous merge // operations in the same batch. SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; + + // TODO: plumb Env::IOActivity ReadOptions read_options; read_options.snapshot = &read_from_snapshot; @@ -2487,26 +2517,47 @@ class MemTableInserter : public WriteBatch::Handler { if (cf_handle == nullptr) { cf_handle = db_->DefaultColumnFamily(); } - Status get_status = db_->Get(read_options, cf_handle, key, &get_value); + + Status get_status = + db_->GetEntity(read_options, cf_handle, key, &existing); if (!get_status.ok()) { // Failed to read a key we know exists. Store the delta in memtable. perform_merge = false; } else { - Slice get_value_slice = Slice(get_value); - // 2) Apply this merge auto merge_operator = moptions->merge_operator; assert(merge_operator); + const auto& columns = existing.columns(); + + Status merge_status; std::string new_value; - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. 
- Status merge_status = MergeHelper::TimedFullMerge( - merge_operator, key, &get_value_slice, {value}, &new_value, - moptions->info_log, moptions->statistics, - SystemClock::Default().get(), /* result_operand */ nullptr, - /* update_num_ops_stats */ false, - /* op_failure_scope */ nullptr); + ValueType new_value_type; + + if (WideColumnsHelper::HasDefaultColumnOnly(columns)) { + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. + merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, MergeHelper::kPlainBaseValue, + WideColumnsHelper::GetDefaultColumn(columns), {value}, + moptions->info_log, moptions->statistics, + SystemClock::Default().get(), + /* update_num_ops_stats */ false, &new_value, + /* result_operand */ nullptr, &new_value_type, + /* op_failure_scope */ nullptr); + } else { + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its + // value. + merge_status = MergeHelper::TimedFullMerge( + merge_operator, key, MergeHelper::kWideBaseValue, columns, + {value}, moptions->info_log, moptions->statistics, + SystemClock::Default().get(), + /* update_num_ops_stats */ false, &new_value, + /* result_operand */ nullptr, &new_value_type, + /* op_failure_scope */ nullptr); + } if (!merge_status.ok()) { // Failed to merge! @@ -2515,15 +2566,18 @@ class MemTableInserter : public WriteBatch::Handler { } else { // 3) Add value to memtable assert(!concurrent_memtable_writes_); + assert(new_value_type == kTypeValue || + new_value_type == kTypeWideColumnEntity); + if (kv_prot_info != nullptr) { auto merged_kv_prot_info = kv_prot_info->StripC(column_family_id).ProtectS(sequence_); merged_kv_prot_info.UpdateV(value, new_value); - merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue); - ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + merged_kv_prot_info.UpdateO(kTypeMerge, new_value_type); + ret_status = mem->Add(sequence_, new_value_type, key, new_value, &merged_kv_prot_info); } else { - ret_status = mem->Add(sequence_, kTypeValue, key, new_value, + ret_status = mem->Add(sequence_, new_value_type, key, new_value, nullptr /* kv_prot_info */); } } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 524f4f283f68..ba0b6f24040f 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -229,6 +229,9 @@ class WriteBatchInternal { static void SetAsLatestPersistentState(WriteBatch* b); static bool IsLatestPersistentState(const WriteBatch* b); + static void SetDefaultColumnFamilyTimestampSize(WriteBatch* wb, + size_t default_cf_ts_sz); + static std::tuple GetColumnFamilyIdAndTimestampSize( WriteBatch* b, ColumnFamilyHandle* column_family); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index d233853e21a4..00faea4ce464 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -12,7 +12,9 @@ #include "db/column_family.h" #include "db/db_test_util.h" #include "db/memtable.h" +#include "db/wide/wide_columns_helper.h" #include "db/write_batch_internal.h" +#include "dbformat.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -247,6 +249,22 @@ TEST_F(WriteBatchTest, SingleDeletion) { ASSERT_EQ(2u, batch.Count()); } +TEST_F(WriteBatchTest, OwnershipTransfer) { + Random rnd(301); + WriteBatch put_batch; + ASSERT_OK(put_batch.Put(rnd.RandomString(16) /* key */, + rnd.RandomString(1024) /* value */)); + + // (1) Verify 
`Release()` transfers string data ownership + const char* expected_data = put_batch.Data().data(); + std::string batch_str = put_batch.Release(); + ASSERT_EQ(expected_data, batch_str.data()); + + // (2) Verify constructor transfers string data ownership + WriteBatch move_batch(std::move(batch_str)); + ASSERT_EQ(expected_data, move_batch.Data().data()); +} + namespace { struct TestHandler : public WriteBatch::Handler { std::string seen; @@ -260,6 +278,21 @@ struct TestHandler : public WriteBatch::Handler { } return Status::OK(); } + Status PutEntityCF(uint32_t column_family_id, const Slice& key, + const Slice& entity) override { + std::ostringstream oss; + Status s = WideColumnsHelper::DumpSliceAsWideColumns(entity, oss, false); + if (!s.ok()) { + return s; + } + if (column_family_id == 0) { + seen += "PutEntity(" + key.ToString() + ", " + oss.str() + ")"; + } else { + seen += "PutEntityCF(" + std::to_string(column_family_id) + ", " + + key.ToString() + ", " + oss.str() + ")"; + } + return Status::OK(); + } Status DeleteCF(uint32_t column_family_id, const Slice& key) override { if (column_family_id == 0) { seen += "Delete(" + key.ToString() + ")"; @@ -649,6 +682,82 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { }; } // anonymous namespace +TEST_F(WriteBatchTest, AttributeGroupTest) { + WriteBatch batch; + ColumnFamilyHandleImplDummy zero(0), two(2); + AttributeGroups foo_ags; + WideColumn zero_col_1{"0_c_1_n", "0_c_1_v"}; + WideColumn zero_col_2{"0_c_2_n", "0_c_2_v"}; + WideColumns zero_col_1_col_2{zero_col_1, zero_col_2}; + + WideColumn two_col_1{"2_c_1_n", "2_c_1_v"}; + WideColumn two_col_2{"2_c_2_n", "2_c_2_v"}; + WideColumns two_col_1_col_2{two_col_1, two_col_2}; + + foo_ags.emplace_back(&zero, zero_col_1_col_2); + foo_ags.emplace_back(&two, two_col_1_col_2); + + ASSERT_OK(batch.PutEntity("foo", foo_ags)); + + TestHandler handler; + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "PutEntity(foo, 0_c_1_n:0_c_1_v " + "0_c_2_n:0_c_2_v)" + "PutEntityCF(2, foo, 2_c_1_n:2_c_1_v " + "2_c_2_n:2_c_2_v)", + handler.seen); +} + +TEST_F(WriteBatchTest, AttributeGroupSavePointTest) { + WriteBatch batch; + batch.SetSavePoint(); + + ColumnFamilyHandleImplDummy zero(0), two(2), three(3); + AttributeGroups foo_ags; + WideColumn zero_col_1{"0_c_1_n", "0_c_1_v"}; + WideColumn zero_col_2{"0_c_2_n", "0_c_2_v"}; + WideColumns zero_col_1_col_2{zero_col_1, zero_col_2}; + + WideColumn two_col_1{"2_c_1_n", "2_c_1_v"}; + WideColumn two_col_2{"2_c_2_n", "2_c_2_v"}; + WideColumns two_col_1_col_2{two_col_1, two_col_2}; + + foo_ags.emplace_back(&zero, zero_col_1_col_2); + foo_ags.emplace_back(&two, two_col_1_col_2); + + AttributeGroups bar_ags; + WideColumn three_col_1{"3_c_1_n", "3_c_1_v"}; + WideColumn three_col_2{"3_c_2_n", "3_c_2_v"}; + WideColumns three_col_1_col_2{three_col_1, three_col_2}; + + bar_ags.emplace_back(&zero, zero_col_1_col_2); + bar_ags.emplace_back(&three, three_col_1_col_2); + + ASSERT_OK(batch.PutEntity("foo", foo_ags)); + batch.SetSavePoint(); + + ASSERT_OK(batch.PutEntity("bar", bar_ags)); + + TestHandler handler; + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "PutEntity(foo, 0_c_1_n:0_c_1_v 0_c_2_n:0_c_2_v)" + "PutEntityCF(2, foo, 2_c_1_n:2_c_1_v 2_c_2_n:2_c_2_v)" + "PutEntity(bar, 0_c_1_n:0_c_1_v 0_c_2_n:0_c_2_v)" + "PutEntityCF(3, bar, 3_c_1_n:3_c_1_v 3_c_2_n:3_c_2_v)", + handler.seen); + + ASSERT_OK(batch.RollbackToSavePoint()); + + handler.seen.clear(); + ASSERT_OK(batch.Iterate(&handler)); + ASSERT_EQ( + "PutEntity(foo, 0_c_1_n:0_c_1_v 0_c_2_n:0_c_2_v)" 
+ "PutEntityCF(2, foo, 2_c_1_n:2_c_1_v 2_c_2_n:2_c_2_v)", + handler.seen); +} + TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { WriteBatch batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); @@ -661,6 +770,9 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); + // TODO(yuzhangyu): implement this. + ASSERT_TRUE( + batch.TimedPut(&zero, Slice("foo"), Slice("bar"), 0u).IsNotSupported()); TestHandler handler; ASSERT_OK(batch.Iterate(&handler)); @@ -677,7 +789,6 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) { handler.seen); } -#ifndef ROCKSDB_LITE TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); @@ -689,6 +800,8 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three"))); ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar"))); ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom"))); + ASSERT_TRUE( + batch.TimedPut(&zero, Slice("foo"), Slice("bar"), 0u).IsNotSupported()); std::unique_ptr iter; @@ -779,7 +892,6 @@ TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { "Merge(omom, nom)", handler.seen); } -#endif // !ROCKSDB_LITE TEST_F(WriteBatchTest, SavePointTest) { Status s; diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index e6ebaae08c7a..ef8e6c98d3c0 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -3,7 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE + +#include "db/write_callback.h" #include "db/write_callback.h" @@ -453,13 +454,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/db/write_stall_stats.cc b/db/write_stall_stats.cc new file mode 100644 index 000000000000..3973df76858f --- /dev/null +++ b/db/write_stall_stats.cc @@ -0,0 +1,179 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "db/write_stall_stats.h" + +namespace ROCKSDB_NAMESPACE { +const std::string& InvalidWriteStallHyphenString() { + static const std::string kInvalidWriteStallHyphenString = "invalid"; + return kInvalidWriteStallHyphenString; +} + +const std::string& WriteStallCauseToHyphenString(WriteStallCause cause) { + static const std::string kMemtableLimit = "memtable-limit"; + static const std::string kL0FileCountLimit = "l0-file-count-limit"; + static const std::string kPendingCompactionBytes = "pending-compaction-bytes"; + static const std::string kWriteBufferManagerLimit = + "write-buffer-manager-limit"; + switch (cause) { + case WriteStallCause::kMemtableLimit: + return kMemtableLimit; + case WriteStallCause::kL0FileCountLimit: + return kL0FileCountLimit; + case WriteStallCause::kPendingCompactionBytes: + return kPendingCompactionBytes; + case WriteStallCause::kWriteBufferManagerLimit: + return kWriteBufferManagerLimit; + default: + break; + } + return InvalidWriteStallHyphenString(); +} + +const std::string& WriteStallConditionToHyphenString( + WriteStallCondition condition) { + static const std::string kDelayed = "delays"; + static const std::string kStopped = "stops"; + switch (condition) { + case WriteStallCondition::kDelayed: + return kDelayed; + case WriteStallCondition::kStopped: + return kStopped; + default: + break; + } + return InvalidWriteStallHyphenString(); +} + +InternalStats::InternalCFStatsType InternalCFStat( + WriteStallCause cause, WriteStallCondition condition) { + switch (cause) { + case WriteStallCause::kMemtableLimit: { + switch (condition) { + case WriteStallCondition::kDelayed: + return InternalStats::MEMTABLE_LIMIT_DELAYS; + case WriteStallCondition::kStopped: + return InternalStats::MEMTABLE_LIMIT_STOPS; + case WriteStallCondition::kNormal: + break; + } + break; + } + case WriteStallCause::kL0FileCountLimit: { + switch (condition) { + case WriteStallCondition::kDelayed: + return InternalStats::L0_FILE_COUNT_LIMIT_DELAYS; + case WriteStallCondition::kStopped: + return InternalStats::L0_FILE_COUNT_LIMIT_STOPS; + case WriteStallCondition::kNormal: + break; + } + break; + } + case WriteStallCause::kPendingCompactionBytes: { + switch (condition) { + case WriteStallCondition::kDelayed: + return InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS; + case WriteStallCondition::kStopped: + return InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS; + case WriteStallCondition::kNormal: + break; + } + break; + } + default: + break; + } + return InternalStats::INTERNAL_CF_STATS_ENUM_MAX; +} + +InternalStats::InternalDBStatsType InternalDBStat( + WriteStallCause cause, WriteStallCondition condition) { + switch (cause) { + case WriteStallCause::kWriteBufferManagerLimit: { + switch (condition) { + case WriteStallCondition::kStopped: + return InternalStats::kIntStatsWriteBufferManagerLimitStopsCounts; + default: + break; + } + break; + } + default: + break; + } + return InternalStats::kIntStatsNumMax; +} + +bool isCFScopeWriteStallCause(WriteStallCause cause) { + uint32_t int_cause = static_cast(cause); + uint32_t lower_bound = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - + kNumCFScopeWriteStallCauses; + uint32_t upper_bound = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - + 1; + return lower_bound <= int_cause && int_cause <= upper_bound; +} + +bool isDBScopeWriteStallCause(WriteStallCause cause) { + uint32_t int_cause = static_cast(cause); + uint32_t lower_bound = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax) - + 
kNumDBScopeWriteStallCauses; + uint32_t upper_bound = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax) - + 1; + return lower_bound <= int_cause && int_cause <= upper_bound; +} + +const std::string& WriteStallStatsMapKeys::TotalStops() { + static const std::string kTotalStops = "total-stops"; + return kTotalStops; +} + +const std::string& WriteStallStatsMapKeys::TotalDelays() { + static const std::string kTotalDelays = "total-delays"; + return kTotalDelays; +} + +const std::string& +WriteStallStatsMapKeys::CFL0FileCountLimitDelaysWithOngoingCompaction() { + static const std::string ret = + "cf-l0-file-count-limit-delays-with-ongoing-compaction"; + return ret; +} + +const std::string& +WriteStallStatsMapKeys::CFL0FileCountLimitStopsWithOngoingCompaction() { + static const std::string ret = + "cf-l0-file-count-limit-stops-with-ongoing-compaction"; + return ret; +} + +std::string WriteStallStatsMapKeys::CauseConditionCount( + WriteStallCause cause, WriteStallCondition condition) { + std::string cause_condition_count_name; + + std::string cause_name; + if (isCFScopeWriteStallCause(cause) || isDBScopeWriteStallCause(cause)) { + cause_name = WriteStallCauseToHyphenString(cause); + } else { + assert(false); + return ""; + } + + const std::string& condition_name = + WriteStallConditionToHyphenString(condition); + + cause_condition_count_name.reserve(cause_name.size() + 1 + + condition_name.size()); + cause_condition_count_name.append(cause_name); + cause_condition_count_name.append("-"); + cause_condition_count_name.append(condition_name); + + return cause_condition_count_name; +} +} // namespace ROCKSDB_NAMESPACE diff --git a/db/write_stall_stats.h b/db/write_stall_stats.h new file mode 100644 index 000000000000..6394abb0a82d --- /dev/null +++ b/db/write_stall_stats.h @@ -0,0 +1,47 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
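For reference, a brief sketch (not part of the patch) of the map keys that CauseConditionCount above composes from the hyphenated cause and condition names; it assumes WriteStallStatsMapKeys and the WriteStallCause/WriteStallCondition enums are visible via rocksdb/types.h.

#include <cassert>

#include "rocksdb/types.h"

void WriteStallMapKeySketch() {
  using namespace ROCKSDB_NAMESPACE;

  // "memtable-limit" + "-" + "delays"
  assert(WriteStallStatsMapKeys::CauseConditionCount(
             WriteStallCause::kMemtableLimit, WriteStallCondition::kDelayed) ==
         "memtable-limit-delays");

  // "l0-file-count-limit" + "-" + "stops"
  assert(WriteStallStatsMapKeys::CauseConditionCount(
             WriteStallCause::kL0FileCountLimit,
             WriteStallCondition::kStopped) == "l0-file-count-limit-stops");
}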
+ +#pragma once + +#include + +#include "db/internal_stats.h" +#include "rocksdb/types.h" + +namespace ROCKSDB_NAMESPACE { +extern const std::string& InvalidWriteStallHyphenString(); + +extern const std::string& WriteStallCauseToHyphenString(WriteStallCause cause); + +extern const std::string& WriteStallConditionToHyphenString( + WriteStallCondition condition); + +// REQUIRES: +// cause` is CF-scope `WriteStallCause`, see `WriteStallCause` for more +// +// REQUIRES: +// `condition` != `WriteStallCondition::kNormal` +extern InternalStats::InternalCFStatsType InternalCFStat( + WriteStallCause cause, WriteStallCondition condition); + +// REQUIRES: +// cause` is DB-scope `WriteStallCause`, see `WriteStallCause` for more +// +// REQUIRES: +// `condition` != `WriteStallCondition::kNormal` +extern InternalStats::InternalDBStatsType InternalDBStat( + WriteStallCause cause, WriteStallCondition condition); + +extern bool isCFScopeWriteStallCause(WriteStallCause cause); +extern bool isDBScopeWriteStallCause(WriteStallCause cause); + +constexpr uint32_t kNumCFScopeWriteStallCauses = + static_cast(WriteStallCause::kCFScopeWriteStallCauseEnumMax) - + static_cast(WriteStallCause::kMemtableLimit); + +constexpr uint32_t kNumDBScopeWriteStallCauses = + static_cast(WriteStallCause::kDBScopeWriteStallCauseEnumMax) - + static_cast(WriteStallCause::kWriteBufferManagerLimit); +} // namespace ROCKSDB_NAMESPACE diff --git a/db/write_thread.cc b/db/write_thread.cc index de1744cf0489..79870077523b 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -228,6 +228,7 @@ bool WriteThread::LinkOne(Writer* w, std::atomic* newest_writer) { assert(w->state == STATE_INIT); Writer* writers = newest_writer->load(std::memory_order_relaxed); while (true) { + assert(writers != w); // If write stall in effect, and w->no_slowdown is not true, // block here until stall is cleared. If its true, then return // immediately @@ -325,6 +326,7 @@ void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) { } void WriteThread::BeginWriteStall() { + ++stall_begun_count_; LinkOne(&write_stall_dummy_, &newest_writer_); // Walk writer list until w->write_group != nullptr. 
The current write group @@ -367,10 +369,34 @@ void WriteThread::EndWriteStall() { } newest_writer_.exchange(write_stall_dummy_.link_older); + ++stall_ended_count_; + // Wake up writers stall_cv_.SignalAll(); } +uint64_t WriteThread::GetBegunCountOfOutstandingStall() { + if (stall_begun_count_ > stall_ended_count_) { + // Oustanding stall in queue + assert(newest_writer_.load(std::memory_order_relaxed) == + &write_stall_dummy_); + return stall_begun_count_; + } else { + // No stall in queue + assert(newest_writer_.load(std::memory_order_relaxed) != + &write_stall_dummy_); + return 0; + } +} + +void WriteThread::WaitForStallEndedCount(uint64_t stall_count) { + MutexLock lock(&stall_mu_); + + while (stall_ended_count_ < stall_count) { + stall_cv_.Wait(); + } +} + static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup"); void WriteThread::JoinBatchGroup(Writer* w) { TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w); diff --git a/db/write_thread.h b/db/write_thread.h index 558843d27ec7..6638bbfd914b 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -357,11 +357,23 @@ class WriteThread { // Insert a dummy writer at the tail of the write queue to indicate a write // stall, and fail any writers in the queue with no_slowdown set to true + // REQUIRES: db mutex held, no other stall on this queue outstanding void BeginWriteStall(); // Remove the dummy writer and wake up waiting writers + // REQUIRES: db mutex held void EndWriteStall(); + // Number of BeginWriteStall(), or 0 if there is no active stall in the + // write queue. + // REQUIRES: db mutex held + uint64_t GetBegunCountOfOutstandingStall(); + + // Wait for number of completed EndWriteStall() to reach >= `stall_count`, + // which will generally have come from GetBegunCountOfOutstandingStall(). + // (Does not require db mutex held) + void WaitForStallEndedCount(uint64_t stall_count); + private: // See AwaitState. const uint64_t max_yield_usec_; @@ -401,6 +413,18 @@ class WriteThread { port::Mutex stall_mu_; port::CondVar stall_cv_; + // Count the number of stalls begun, so that we can check whether + // a particular stall has cleared (even if caught in another stall). + // Controlled by DB mutex. + // Because of the contract on BeginWriteStall() / EndWriteStall(), + // stall_ended_count_ <= stall_begun_count_ <= stall_ended_count_ + 1. + uint64_t stall_begun_count_ = 0; + // Count the number of stalls ended, so that we can check whether + // a particular stall has cleared (even if caught in another stall). + // Writes controlled by DB mutex + stall_mu_, signalled by stall_cv_. + // Read with stall_mu or DB mutex. + uint64_t stall_ended_count_ = 0; + // Waits for w->state & goal_mask using w->StateMutex(). Returns // the state that satisfies goal_mask. 
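A sketch (not part of the patch) of the calling pattern described by the comments on the two new stall-count methods above; `write_thread`, the function name, and the surrounding locking are illustrative.

#include "db/write_thread.h"

void WaitForOutstandingStallSketch(
    ROCKSDB_NAMESPACE::WriteThread& write_thread) {
  uint64_t stall_count = 0;
  {
    // DB mutex held here, as required by GetBegunCountOfOutstandingStall();
    // it returns 0 if no stall is currently queued.
    stall_count = write_thread.GetBegunCountOfOutstandingStall();
  }
  if (stall_count > 0) {
    // The DB mutex may be released; WaitForStallEndedCount() does not need it.
    // Blocks until EndWriteStall() has run for the stall observed above.
    write_thread.WaitForStallEndedCount(stall_count);
  }
}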
uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask); diff --git a/db_stress_tool/CMakeLists.txt b/db_stress_tool/CMakeLists.txt index 96d70dd0e1ef..60c02e173f47 100644 --- a/db_stress_tool/CMakeLists.txt +++ b/db_stress_tool/CMakeLists.txt @@ -9,8 +9,10 @@ add_executable(db_stress${ARTIFACT_SUFFIX} db_stress_shared_state.cc db_stress_stat.cc db_stress_test_base.cc + db_stress_wide_merge_operator.cc db_stress_tool.cc expected_state.cc + expected_value.cc multi_ops_txns_stress.cc no_batched_ops_stress.cc) target_link_libraries(db_stress${ARTIFACT_SUFFIX} ${ROCKSDB_LIB} ${THIRDPARTY_LIBS}) diff --git a/db_stress_tool/batched_ops_stress.cc b/db_stress_tool/batched_ops_stress.cc index 3f34460762ae..7fb89b60bbd0 100644 --- a/db_stress_tool/batched_ops_stress.cc +++ b/db_stress_tool/batched_ops_stress.cc @@ -31,8 +31,7 @@ class BatchedOpsStressTest : public StressTest { const std::string key_body = Key(rand_keys[0]); - const uint32_t value_base = - thread->rand.Next() % thread->shared->UNKNOWN_SENTINEL; + const uint32_t value_base = thread->rand.Next(); const size_t sz = GenerateValue(value_base, value, sizeof(value)); const std::string value_body = Slice(value, sz).ToString(); @@ -53,11 +52,11 @@ class BatchedOpsStressTest : public StressTest { const std::string k = num + key_body; const std::string v = value_body + num; - if (FLAGS_use_merge) { - batch.Merge(cfh, k, v); - } else if (FLAGS_use_put_entity_one_in > 0 && - (value_base % FLAGS_use_put_entity_one_in) == 0) { + if (FLAGS_use_put_entity_one_in > 0 && + (value_base % FLAGS_use_put_entity_one_in) == 0) { batch.PutEntity(cfh, k, GenerateWideColumns(value_base, v)); + } else if (FLAGS_use_merge) { + batch.Merge(cfh, k, v); } else { batch.Put(cfh, k, v); } @@ -268,6 +267,179 @@ class BatchedOpsStressTest : public StressTest { return ret_status; } + void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override { + assert(thread); + + ManagedSnapshot snapshot_guard(db_); + + ReadOptions read_opts_copy(read_opts); + read_opts_copy.snapshot = snapshot_guard.snapshot(); + + assert(!rand_keys.empty()); + + const std::string key_suffix = Key(rand_keys[0]); + + assert(!rand_column_families.empty()); + assert(rand_column_families[0] >= 0); + assert(rand_column_families[0] < static_cast(column_families_.size())); + + ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]]; + assert(cfh); + + constexpr size_t num_keys = 10; + + std::array results; + + for (size_t i = 0; i < num_keys; ++i) { + const std::string key = std::to_string(i) + key_suffix; + + const Status s = db_->GetEntity(read_opts_copy, cfh, key, &results[i]); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "GetEntity error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + thread->stats.AddGets(1, 1); + } + } + + for (size_t i = 0; i < num_keys; ++i) { + const WideColumns& columns = results[i].columns(); + + if (!CompareColumns(results[0].columns(), columns)) { + fprintf(stderr, + "GetEntity error: inconsistent entities for key %s: %s, %s\n", + StringToHex(key_suffix).c_str(), + WideColumnsToHex(results[0].columns()).c_str(), + WideColumnsToHex(columns).c_str()); + } + + if (!columns.empty()) { + // The last character of each column value should be 'i' as a decimal + // digit + const char expected = static_cast('0' + i); + + for (const auto& column : columns) { + const Slice& value = 
column.value(); + + if (value.empty() || value[value.size() - 1] != expected) { + fprintf(stderr, + "GetEntity error: incorrect column value for key " + "%s, entity %s, column value %s, expected %c\n", + StringToHex(key_suffix).c_str(), + WideColumnsToHex(columns).c_str(), + value.ToString(/* hex */ true).c_str(), expected); + } + } + + if (!VerifyWideColumns(columns)) { + fprintf( + stderr, + "GetEntity error: inconsistent columns for key %s, entity %s\n", + StringToHex(key_suffix).c_str(), + WideColumnsToHex(columns).c_str()); + } + } + } + } + + void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override { + assert(thread); + + assert(!rand_column_families.empty()); + assert(rand_column_families[0] >= 0); + assert(rand_column_families[0] < static_cast(column_families_.size())); + + ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]]; + assert(cfh); + + assert(!rand_keys.empty()); + + ManagedSnapshot snapshot_guard(db_); + + ReadOptions read_opts_copy(read_opts); + read_opts_copy.snapshot = snapshot_guard.snapshot(); + + const size_t num_keys = rand_keys.size(); + + for (size_t i = 0; i < num_keys; ++i) { + const std::string key_suffix = Key(rand_keys[i]); + + constexpr size_t num_prefixes = 10; + + std::array keys; + std::array key_slices; + std::array results; + std::array statuses; + + for (size_t j = 0; j < num_prefixes; ++j) { + keys[j] = std::to_string(j) + key_suffix; + key_slices[j] = keys[j]; + } + + db_->MultiGetEntity(read_opts_copy, cfh, num_prefixes, key_slices.data(), + results.data(), statuses.data()); + + for (size_t j = 0; j < num_prefixes; ++j) { + const Status& s = statuses[j]; + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "MultiGetEntity error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + thread->stats.AddGets(1, 1); + } + + const WideColumns& cmp_columns = results[0].columns(); + const WideColumns& columns = results[j].columns(); + + if (!CompareColumns(cmp_columns, columns)) { + fprintf(stderr, + "MultiGetEntity error: inconsistent entities for key %s: %s, " + "%s\n", + StringToHex(key_suffix).c_str(), + WideColumnsToHex(cmp_columns).c_str(), + WideColumnsToHex(columns).c_str()); + } + + if (!columns.empty()) { + // The last character of each column value should be 'j' as a decimal + // digit + const char expected = static_cast('0' + j); + + for (const auto& column : columns) { + const Slice& value = column.value(); + + if (value.empty() || value[value.size() - 1] != expected) { + fprintf(stderr, + "MultiGetEntity error: incorrect column value for key " + "%s, entity %s, column value %s, expected %c\n", + StringToHex(key_suffix).c_str(), + WideColumnsToHex(columns).c_str(), + value.ToString(/* hex */ true).c_str(), expected); + } + } + + if (!VerifyWideColumns(columns)) { + fprintf(stderr, + "MultiGetEntity error: inconsistent columns for key %s, " + "entity %s\n", + StringToHex(key_suffix).c_str(), + WideColumnsToHex(columns).c_str()); + } + } + } + } + } + // Given a key, this does prefix scans for "0"+P, "1"+P, ..., "9"+P // in the same snapshot where P is the first FLAGS_prefix_size - 1 bytes // of the key. 
Each of these 10 scans returns a series of values; @@ -357,16 +529,14 @@ class BatchedOpsStressTest : public StressTest { } // make sure value() and columns() are consistent - const WideColumns expected_columns = GenerateExpectedWideColumns( - GetValueBase(iters[i]->value()), iters[i]->value()); - if (iters[i]->columns() != expected_columns) { + if (!VerifyWideColumns(iters[i]->value(), iters[i]->columns())) { fprintf(stderr, "prefix scan error : %" ROCKSDB_PRIszt - ", value and columns inconsistent for prefix %s: %s\n", + ", value and columns inconsistent for prefix %s: value: %s, " + "columns: %s\n", i, prefix_slices[i].ToString(/* hex */ true).c_str(), - DebugString(iters[i]->value(), iters[i]->columns(), - expected_columns) - .c_str()); + iters[i]->value().ToString(/* hex */ true).c_str(), + WideColumnsToHex(iters[i]->columns()).c_str()); } iters[i]->Next(); @@ -391,6 +561,30 @@ class BatchedOpsStressTest : public StressTest { void VerifyDb(ThreadState* /* thread */) const override {} void ContinuouslyVerifyDb(ThreadState* /* thread */) const override {} + + // Compare columns ignoring the last character of column values + bool CompareColumns(const WideColumns& lhs, const WideColumns& rhs) { + if (lhs.size() != rhs.size()) { + return false; + } + + for (size_t i = 0; i < lhs.size(); ++i) { + if (lhs[i].name() != rhs[i].name()) { + return false; + } + + if (lhs[i].value().size() != rhs[i].value().size()) { + return false; + } + + if (lhs[i].value().difference_offset(rhs[i].value()) < + lhs[i].value().size() - 1) { + return false; + } + } + + return true; + } }; StressTest* CreateBatchedOpsStressTest() { return new BatchedOpsStressTest(); } diff --git a/db_stress_tool/cf_consistency_stress.cc b/db_stress_tool/cf_consistency_stress.cc index 33f7b1f2e3a6..a7b0895f37f7 100644 --- a/db_stress_tool/cf_consistency_stress.cc +++ b/db_stress_tool/cf_consistency_stress.cc @@ -36,18 +36,15 @@ class CfConsistencyStressTest : public StressTest { WriteBatch batch; - const bool use_put_entity = !FLAGS_use_merge && - FLAGS_use_put_entity_one_in > 0 && - (value_base % FLAGS_use_put_entity_one_in) == 0; - for (auto cf : rand_column_families) { ColumnFamilyHandle* const cfh = column_families_[cf]; assert(cfh); - if (FLAGS_use_merge) { - batch.Merge(cfh, k, v); - } else if (use_put_entity) { + if (FLAGS_use_put_entity_one_in > 0 && + (value_base % FLAGS_use_put_entity_one_in) == 0) { batch.PutEntity(cfh, k, GenerateWideColumns(value_base, v)); + } else if (FLAGS_use_merge) { + batch.Merge(cfh, k, v); } else { batch.Put(cfh, k, v); } @@ -251,6 +248,269 @@ class CfConsistencyStressTest : public StressTest { return statuses; } + void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override { + assert(thread); + assert(!rand_column_families.empty()); + assert(!rand_keys.empty()); + + const std::string key = Key(rand_keys[0]); + + Status s; + bool is_consistent = true; + + if (thread->rand.OneIn(2)) { + // With a 1/2 chance, do a random read from a random CF + const size_t cf_id = thread->rand.Next() % rand_column_families.size(); + + assert(rand_column_families[cf_id] >= 0); + assert(rand_column_families[cf_id] < + static_cast(column_families_.size())); + + ColumnFamilyHandle* const cfh = + column_families_[rand_column_families[cf_id]]; + assert(cfh); + + PinnableWideColumns result; + s = db_->GetEntity(read_opts, cfh, key, &result); + + if (s.ok()) { + if (!VerifyWideColumns(result.columns())) { + fprintf( + stderr, + 
"GetEntity error: inconsistent columns for key %s, entity %s\n", + StringToHex(key).c_str(), + WideColumnsToHex(result.columns()).c_str()); + is_consistent = false; + } + } + } else { + // With a 1/2 chance, compare one key across all CFs + ManagedSnapshot snapshot_guard(db_); + + ReadOptions read_opts_copy = read_opts; + read_opts_copy.snapshot = snapshot_guard.snapshot(); + + assert(rand_column_families[0] >= 0); + assert(rand_column_families[0] < + static_cast(column_families_.size())); + + PinnableWideColumns cmp_result; + s = db_->GetEntity(read_opts_copy, + column_families_[rand_column_families[0]], key, + &cmp_result); + + if (s.ok() || s.IsNotFound()) { + const bool cmp_found = s.ok(); + + if (cmp_found) { + if (!VerifyWideColumns(cmp_result.columns())) { + fprintf(stderr, + "GetEntity error: inconsistent columns for key %s, " + "entity %s\n", + StringToHex(key).c_str(), + WideColumnsToHex(cmp_result.columns()).c_str()); + is_consistent = false; + } + } + + if (is_consistent) { + for (size_t i = 1; i < rand_column_families.size(); ++i) { + assert(rand_column_families[i] >= 0); + assert(rand_column_families[i] < + static_cast(column_families_.size())); + + PinnableWideColumns result; + s = db_->GetEntity(read_opts_copy, + column_families_[rand_column_families[i]], key, + &result); + + if (!s.ok() && !s.IsNotFound()) { + break; + } + + const bool found = s.ok(); + + assert(!column_family_names_.empty()); + assert(i < column_family_names_.size()); + + if (!cmp_found && found) { + fprintf(stderr, + "GetEntity returns different results for key %s: CF %s " + "returns not found, CF %s returns entity %s\n", + StringToHex(key).c_str(), column_family_names_[0].c_str(), + column_family_names_[i].c_str(), + WideColumnsToHex(result.columns()).c_str()); + is_consistent = false; + break; + } + + if (cmp_found && !found) { + fprintf(stderr, + "GetEntity returns different results for key %s: CF %s " + "returns entity %s, CF %s returns not found\n", + StringToHex(key).c_str(), column_family_names_[0].c_str(), + WideColumnsToHex(cmp_result.columns()).c_str(), + column_family_names_[i].c_str()); + is_consistent = false; + break; + } + + if (found && result != cmp_result) { + fprintf(stderr, + "GetEntity returns different results for key %s: CF %s " + "returns entity %s, CF %s returns entity %s\n", + StringToHex(key).c_str(), column_family_names_[0].c_str(), + WideColumnsToHex(cmp_result.columns()).c_str(), + column_family_names_[i].c_str(), + WideColumnsToHex(result.columns()).c_str()); + is_consistent = false; + break; + } + } + } + } + } + + if (!is_consistent) { + fprintf(stderr, "TestGetEntity error: results are not consistent\n"); + thread->stats.AddErrors(1); + // Fail fast to preserve the DB state. 
+ thread->shared->SetVerificationFailure(); + } else if (s.ok()) { + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + fprintf(stderr, "TestGetEntity error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } + } + + void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override { + assert(thread); + assert(thread->shared); + assert(!rand_column_families.empty()); + assert(!rand_keys.empty()); + + ManagedSnapshot snapshot_guard(db_); + + ReadOptions read_opts_copy = read_opts; + read_opts_copy.snapshot = snapshot_guard.snapshot(); + + const size_t num_cfs = rand_column_families.size(); + + std::vector cfhs; + cfhs.reserve(num_cfs); + + for (size_t j = 0; j < num_cfs; ++j) { + assert(rand_column_families[j] >= 0); + assert(rand_column_families[j] < + static_cast(column_families_.size())); + + ColumnFamilyHandle* const cfh = column_families_[rand_column_families[j]]; + assert(cfh); + + cfhs.emplace_back(cfh); + } + + const size_t num_keys = rand_keys.size(); + + for (size_t i = 0; i < num_keys; ++i) { + const std::string key = Key(rand_keys[i]); + + std::vector key_slices(num_cfs, key); + std::vector results(num_cfs); + std::vector statuses(num_cfs); + + db_->MultiGetEntity(read_opts_copy, num_cfs, cfhs.data(), + key_slices.data(), results.data(), statuses.data()); + + bool is_consistent = true; + + for (size_t j = 0; j < num_cfs; ++j) { + const Status& s = statuses[j]; + const Status& cmp_s = statuses[0]; + const WideColumns& columns = results[j].columns(); + const WideColumns& cmp_columns = results[0].columns(); + + if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "TestMultiGetEntity error: %s\n", + s.ToString().c_str()); + thread->stats.AddErrors(1); + break; + } + + assert(cmp_s.ok() || cmp_s.IsNotFound()); + + if (s.IsNotFound()) { + if (cmp_s.ok()) { + fprintf( + stderr, + "MultiGetEntity returns different results for key %s: CF %s " + "returns entity %s, CF %s returns not found\n", + StringToHex(key).c_str(), column_family_names_[0].c_str(), + WideColumnsToHex(cmp_columns).c_str(), + column_family_names_[j].c_str()); + is_consistent = false; + break; + } + + continue; + } + + assert(s.ok()); + if (cmp_s.IsNotFound()) { + fprintf(stderr, + "MultiGetEntity returns different results for key %s: CF %s " + "returns not found, CF %s returns entity %s\n", + StringToHex(key).c_str(), column_family_names_[0].c_str(), + column_family_names_[j].c_str(), + WideColumnsToHex(columns).c_str()); + is_consistent = false; + break; + } + + if (columns != cmp_columns) { + fprintf(stderr, + "MultiGetEntity returns different results for key %s: CF %s " + "returns entity %s, CF %s returns entity %s\n", + StringToHex(key).c_str(), column_family_names_[0].c_str(), + WideColumnsToHex(cmp_columns).c_str(), + column_family_names_[j].c_str(), + WideColumnsToHex(columns).c_str()); + is_consistent = false; + break; + } + + if (!VerifyWideColumns(columns)) { + fprintf(stderr, + "MultiGetEntity error: inconsistent columns for key %s, " + "entity %s\n", + StringToHex(key).c_str(), WideColumnsToHex(columns).c_str()); + is_consistent = false; + break; + } + } + + if (!is_consistent) { + fprintf(stderr, + "TestMultiGetEntity error: results are not consistent\n"); + thread->stats.AddErrors(1); + // Fail fast to preserve the DB state. 
+ thread->shared->SetVerificationFailure(); + break; + } else if (statuses[0].ok()) { + thread->stats.AddGets(1, 1); + } else if (statuses[0].IsNotFound()) { + thread->stats.AddGets(1, 0); + } + } + } + Status TestPrefixScan(ThreadState* thread, const ReadOptions& readoptions, const std::vector& rand_column_families, const std::vector& rand_keys) override { @@ -290,12 +550,9 @@ class CfConsistencyStressTest : public StressTest { iter->Next()) { ++count; - const WideColumns expected_columns = GenerateExpectedWideColumns( - GetValueBase(iter->value()), iter->value()); - if (iter->columns() != expected_columns) { - s = Status::Corruption( - "Value and columns inconsistent", - DebugString(iter->value(), iter->columns(), expected_columns)); + if (!VerifyWideColumns(iter->value(), iter->columns())) { + s = Status::Corruption("Value and columns inconsistent", + DebugString(iter->value(), iter->columns())); break; } } @@ -372,12 +629,10 @@ class CfConsistencyStressTest : public StressTest { assert(iter); if (iter->Valid()) { - const WideColumns expected_columns = GenerateExpectedWideColumns( - GetValueBase(iter->value()), iter->value()); - if (iter->columns() != expected_columns) { - statuses[i] = Status::Corruption( - "Value and columns inconsistent", - DebugString(iter->value(), iter->columns(), expected_columns)); + if (!VerifyWideColumns(iter->value(), iter->columns())) { + statuses[i] = + Status::Corruption("Value and columns inconsistent", + DebugString(iter->value(), iter->columns())); } else { ++valid_cnt; } @@ -466,7 +721,6 @@ class CfConsistencyStressTest : public StressTest { iters[i]->key().ToString(true /* hex */).c_str(), iters[i]->value().ToString(true /* hex */).c_str()); -#ifndef ROCKSDB_LITE Slice begin_key; Slice end_key; if (cmp < 0) { @@ -509,7 +763,6 @@ class CfConsistencyStressTest : public StressTest { } print_key_versions(column_families_[i]); -#endif // ROCKSDB_LITE shared->SetVerificationFailure(); } @@ -524,7 +777,6 @@ class CfConsistencyStressTest : public StressTest { } while (true); } -#ifndef ROCKSDB_LITE void ContinuouslyVerifyDb(ThreadState* thread) const override { assert(thread); Status status; @@ -614,9 +866,6 @@ class CfConsistencyStressTest : public StressTest { } } } -#else // ROCKSDB_LITE - void ContinuouslyVerifyDb(ThreadState* /*thread*/) const override {} -#endif // !ROCKSDB_LITE std::vector GenerateColumnFamilies( const int /* num_column_families */, diff --git a/db_stress_tool/db_stress_common.cc b/db_stress_tool/db_stress_common.cc index af8db9e2f21c..c0087dc5c70a 100644 --- a/db_stress_tool/db_stress_common.cc +++ b/db_stress_tool/db_stress_common.cc @@ -13,6 +13,7 @@ #include +#include "rocksdb/secondary_cache.h" #include "util/file_checksum_helper.h" #include "util/xxhash.h" @@ -21,6 +22,8 @@ ROCKSDB_NAMESPACE::Env* db_stress_env = nullptr; // If non-null, injects read error at a rate specified by the // read_fault_one_in or write_fault_one_in flag std::shared_ptr fault_fs_guard; +std::shared_ptr compressed_secondary_cache; +std::shared_ptr block_cache; enum ROCKSDB_NAMESPACE::CompressionType compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e = @@ -148,6 +151,88 @@ void DbVerificationThread(void* v) { } } +void CompressedCacheSetCapacityThread(void* v) { + assert(FLAGS_compressed_secondary_cache_size > 0 || + FLAGS_compressed_secondary_cache_ratio > 0.0); + auto* thread = reinterpret_cast(v); + SharedState* shared = thread->shared; + while (true) { + { + MutexLock 
l(shared->GetMutex()); + if (shared->ShouldStopBgThread()) { + shared->IncBgThreadsFinished(); + if (shared->BgThreadsFinished()) { + shared->GetCondVar()->SignalAll(); + } + return; + } + } + db_stress_env->SleepForMicroseconds(FLAGS_secondary_cache_update_interval); + if (FLAGS_compressed_secondary_cache_size > 0) { + Status s = compressed_secondary_cache->SetCapacity(0); + size_t capacity; + if (s.ok()) { + s = compressed_secondary_cache->GetCapacity(capacity); + assert(capacity == 0); + } + db_stress_env->SleepForMicroseconds(10 * 1000 * 1000); + if (s.ok()) { + s = compressed_secondary_cache->SetCapacity( + FLAGS_compressed_secondary_cache_size); + } + if (s.ok()) { + s = compressed_secondary_cache->GetCapacity(capacity); + assert(capacity == FLAGS_compressed_secondary_cache_size); + } + if (!s.ok()) { + fprintf(stderr, "Compressed cache Set/GetCapacity returned error: %s\n", + s.ToString().c_str()); + } + } else if (FLAGS_compressed_secondary_cache_ratio > 0.0) { + if (thread->rand.OneIn(2)) { // if (thread->rand.OneIn(2)) { + size_t capacity = block_cache->GetCapacity(); + size_t adjustment; + if (FLAGS_use_write_buffer_manager && FLAGS_db_write_buffer_size > 0) { + adjustment = (capacity - FLAGS_db_write_buffer_size); + } else { + adjustment = capacity; + } + // Lower by upto 50% of usable block cache capacity + adjustment = (adjustment * thread->rand.Uniform(50)) / 100; + block_cache->SetCapacity(capacity - adjustment); + fprintf(stderr, "New cache capacity = %lu\n", + block_cache->GetCapacity()); + db_stress_env->SleepForMicroseconds(10 * 1000 * 1000); + block_cache->SetCapacity(capacity); + } else { + Status s; + double new_comp_cache_ratio = + (double)thread->rand.Uniform( + FLAGS_compressed_secondary_cache_ratio * 100) / + 100; + if (new_comp_cache_ratio == 0.0) { + new_comp_cache_ratio = 0.05; + } + fprintf(stderr, "New comp cache ratio = %f\n", new_comp_cache_ratio); + + s = UpdateTieredCache(block_cache, /*capacity*/ -1, + new_comp_cache_ratio); + if (s.ok()) { + db_stress_env->SleepForMicroseconds(10 * 1000 * 1000); + } + if (s.ok()) { + s = UpdateTieredCache(block_cache, /*capacity*/ -1, + FLAGS_compressed_secondary_cache_ratio); + } + if (!s.ok()) { + fprintf(stderr, "UpdateTieredCache returned error: %s\n", + s.ToString().c_str()); + } + } + } + } +} + void PrintKeyValue(int cf, uint64_t key, const char* value, size_t sz) { if (!FLAGS_verbose) { return; @@ -270,14 +355,38 @@ WideColumns GenerateExpectedWideColumns(uint32_t value_base, WideColumns columns = GenerateWideColumns(value_base, slice); - std::sort(columns.begin(), columns.end(), - [](const WideColumn& lhs, const WideColumn& rhs) { - return lhs.name().compare(rhs.name()) < 0; - }); + WideColumnsHelper::SortColumns(columns); return columns; } +bool VerifyWideColumns(const Slice& value, const WideColumns& columns) { + if (value.size() < sizeof(uint32_t)) { + return false; + } + + const uint32_t value_base = GetValueBase(value); + + const WideColumns expected_columns = + GenerateExpectedWideColumns(value_base, value); + + if (columns != expected_columns) { + return false; + } + + return true; +} + +bool VerifyWideColumns(const WideColumns& columns) { + if (!WideColumnsHelper::HasDefaultColumn(columns)) { + return false; + } + + const Slice& value_of_default = WideColumnsHelper::GetDefaultColumn(columns); + + return VerifyWideColumns(value_of_default, columns); +} + std::string GetNowNanos() { uint64_t t = db_stress_env->NowNanos(); std::string ret; diff --git a/db_stress_tool/db_stress_common.h 
b/db_stress_tool/db_stress_common.h index 397d22299e0d..485400e05b62 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -37,6 +37,7 @@ #include "db/db_impl/db_impl.h" #include "db/version_set.h" +#include "db/wide/wide_columns_helper.h" #include "db_stress_tool/db_stress_env_wrapper.h" #include "db_stress_tool/db_stress_listener.h" #include "db_stress_tool/db_stress_shared_state.h" @@ -54,6 +55,7 @@ #include "rocksdb/utilities/checkpoint.h" #include "rocksdb/utilities/db_ttl.h" #include "rocksdb/utilities/debug.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/options_util.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" @@ -87,6 +89,7 @@ DECLARE_int64(active_width); DECLARE_bool(test_batches_snapshots); DECLARE_bool(atomic_flush); DECLARE_int32(manual_wal_flush_one_in); +DECLARE_int32(lock_wal_one_in); DECLARE_bool(test_cf_consistency); DECLARE_bool(test_multi_ops_txns); DECLARE_int32(threads); @@ -105,11 +108,14 @@ DECLARE_int32(max_write_buffer_number); DECLARE_int32(min_write_buffer_number_to_merge); DECLARE_int32(max_write_buffer_number_to_maintain); DECLARE_int64(max_write_buffer_size_to_maintain); +DECLARE_bool(use_write_buffer_manager); DECLARE_double(memtable_prefix_bloom_size_ratio); DECLARE_bool(memtable_whole_key_filtering); DECLARE_int32(open_files); -DECLARE_int64(compressed_cache_size); -DECLARE_int32(compressed_cache_numshardbits); +DECLARE_uint64(compressed_secondary_cache_size); +DECLARE_int32(compressed_secondary_cache_numshardbits); +DECLARE_int32(secondary_cache_update_interval); +DECLARE_double(compressed_secondary_cache_ratio); DECLARE_int32(compaction_style); DECLARE_int32(compaction_pri); DECLARE_int32(num_levels); @@ -156,7 +162,7 @@ DECLARE_double(experimental_mempurge_threshold); DECLARE_bool(enable_write_thread_adaptive_yield); DECLARE_int32(reopen); DECLARE_double(bloom_bits); -DECLARE_int32(ribbon_starting_level); +DECLARE_int32(bloom_before_level); DECLARE_bool(partition_filters); DECLARE_bool(optimize_filters_for_memory); DECLARE_bool(detect_filter_construct_corruption); @@ -193,9 +199,6 @@ DECLARE_bool(rate_limit_user_ops); DECLARE_bool(rate_limit_auto_wal_flush); DECLARE_uint64(sst_file_manager_bytes_per_sec); DECLARE_uint64(sst_file_manager_bytes_per_truncate); -DECLARE_bool(use_txn); -DECLARE_uint64(txn_write_policy); -DECLARE_bool(unordered_write); DECLARE_int32(backup_one_in); DECLARE_uint64(backup_max_size); DECLARE_int32(checkpoint_one_in); @@ -212,6 +215,8 @@ DECLARE_bool(compare_full_db_state_snapshot); DECLARE_uint64(snapshot_hold_ops); DECLARE_bool(long_running_snapshots); DECLARE_bool(use_multiget); +DECLARE_bool(use_get_entity); +DECLARE_bool(use_multi_get_entity); DECLARE_int32(readpercent); DECLARE_int32(prefixpercent); DECLARE_int32(writepercent); @@ -228,6 +233,7 @@ DECLARE_int32(compression_zstd_max_train_bytes); DECLARE_int32(compression_parallel_threads); DECLARE_uint64(compression_max_dict_buffer_bytes); DECLARE_bool(compression_use_zstd_dict_trainer); +DECLARE_bool(compression_checksum); DECLARE_string(checksum_type); DECLARE_string(env_uri); DECLARE_string(fs_uri); @@ -247,12 +253,28 @@ DECLARE_bool(avoid_flush_during_recovery); DECLARE_uint64(max_write_batch_group_size_bytes); DECLARE_bool(level_compaction_dynamic_level_bytes); DECLARE_int32(verify_checksum_one_in); +DECLARE_int32(verify_file_checksums_one_in); DECLARE_int32(verify_db_one_in); DECLARE_int32(continuous_verification_interval); 
DECLARE_int32(get_property_one_in); DECLARE_string(file_checksum_impl); +DECLARE_bool(verification_only); + +// Options for transaction dbs. +// Use TransactionDB (a.k.a. Pessimistic Transaction DB) +// OR OptimisticTransactionDB +DECLARE_bool(use_txn); + +// Options for TransactionDB (a.k.a. Pessimistic Transaction DB) +DECLARE_uint64(txn_write_policy); +DECLARE_bool(unordered_write); + +// Options for OptimisticTransactionDB +DECLARE_bool(use_optimistic_txn); +DECLARE_uint64(occ_validation_policy); +DECLARE_bool(share_occ_lock_buckets); +DECLARE_uint32(occ_lock_bucket_count); -#ifndef ROCKSDB_LITE // Options for StackableDB-based BlobDB DECLARE_bool(use_blob_db); DECLARE_uint64(blob_db_min_blob_size); @@ -260,7 +282,6 @@ DECLARE_uint64(blob_db_bytes_per_sync); DECLARE_uint64(blob_db_file_size); DECLARE_bool(blob_db_enable_gc); DECLARE_double(blob_db_gc_cutoff); -#endif // !ROCKSDB_LITE // Options for integrated BlobDB DECLARE_bool(allow_setting_blob_options_dynamically); @@ -289,6 +310,7 @@ DECLARE_bool(paranoid_file_checks); DECLARE_bool(fail_if_options_file_error); DECLARE_uint64(batch_protection_bytes_per_key); DECLARE_uint32(memtable_protection_bytes_per_key); +DECLARE_uint32(block_protection_bytes_per_key); DECLARE_uint64(user_timestamp_size); DECLARE_string(secondary_cache_uri); @@ -297,11 +319,9 @@ DECLARE_int32(secondary_cache_fault_one_in); DECLARE_int32(prepopulate_block_cache); DECLARE_bool(two_write_queues); -#ifndef ROCKSDB_LITE DECLARE_bool(use_only_the_last_commit_time_batch_for_recovery); DECLARE_uint64(wp_snapshot_cache_bits); DECLARE_uint64(wp_commit_cache_bits); -#endif // !ROCKSDB_LITE DECLARE_bool(adaptive_readahead); DECLARE_bool(async_io); @@ -312,6 +332,12 @@ DECLARE_int32(create_timestamped_snapshot_one_in); DECLARE_bool(allow_data_in_errors); +DECLARE_bool(enable_thread_tracking); + +DECLARE_uint32(memtable_max_range_deletions); + +DECLARE_uint32(bottommost_file_compaction_delay); + // Tiered storage DECLARE_bool(enable_tiered_storage); // set last_level_temperature DECLARE_int64(preclude_last_level_data_seconds); @@ -324,6 +350,8 @@ DECLARE_uint64(readahead_size); DECLARE_uint64(initial_auto_readahead_size); DECLARE_uint64(max_auto_readahead_size); DECLARE_uint64(num_file_reads_for_auto_readahead); +DECLARE_bool(use_io_uring); +DECLARE_bool(auto_readahead_size); constexpr long KB = 1024; constexpr int kRandomValueMaxFactor = 3; @@ -333,6 +361,9 @@ constexpr int kValueMaxLen = 100; extern ROCKSDB_NAMESPACE::Env* db_stress_env; extern ROCKSDB_NAMESPACE::Env* db_stress_listener_env; extern std::shared_ptr fault_fs_guard; +extern std::shared_ptr + compressed_secondary_cache; +extern std::shared_ptr block_cache; extern enum ROCKSDB_NAMESPACE::CompressionType compression_type_e; extern enum ROCKSDB_NAMESPACE::CompressionType bottommost_compression_type_e; @@ -598,6 +629,18 @@ extern inline std::string StringToHex(const std::string& str) { return result; } +inline std::string WideColumnsToHex(const WideColumns& columns) { + if (columns.empty()) { + return std::string(); + } + + std::ostringstream oss; + + WideColumnsHelper::DumpWideColumns(columns, oss, true); + + return oss.str(); +} + // Unified output format for double parameters extern inline std::string FormatDoubleParam(double param) { return std::to_string(param); @@ -613,6 +656,8 @@ extern void PoolSizeChangeThread(void* v); extern void DbVerificationThread(void* v); +extern void CompressedCacheSetCapacityThread(void* v); + extern void TimestampedSnapshotsThread(void* v); extern void PrintKeyValue(int cf, 
uint64_t key, const char* value, size_t sz); @@ -628,6 +673,8 @@ extern uint32_t GetValueBase(Slice s); extern WideColumns GenerateWideColumns(uint32_t value_base, const Slice& slice); extern WideColumns GenerateExpectedWideColumns(uint32_t value_base, const Slice& slice); +extern bool VerifyWideColumns(const Slice& value, const WideColumns& columns); +extern bool VerifyWideColumns(const WideColumns& columns); extern StressTest* CreateCfConsistencyStressTest(); extern StressTest* CreateBatchedOpsStressTest(); diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 2c8dcf610867..92730beca2b6 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -14,6 +14,7 @@ namespace ROCKSDB_NAMESPACE { void ThreadBody(void* v) { + ThreadStatusUtil::RegisterThread(db_stress_env, ThreadStatus::USER); ThreadState* thread = reinterpret_cast(v); SharedState* shared = thread->shared; @@ -26,37 +27,42 @@ void ThreadBody(void* v) { if (shared->AllInitialized()) { shared->GetCondVar()->SignalAll(); } - while (!shared->Started()) { - shared->GetCondVar()->Wait(); - } } - thread->shared->GetStressTest()->OperateDb(thread); - - { - MutexLock l(shared->GetMutex()); - shared->IncOperated(); - if (shared->AllOperated()) { - shared->GetCondVar()->SignalAll(); + if (!FLAGS_verification_only) { + { + MutexLock l(shared->GetMutex()); + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } } - while (!shared->VerifyStarted()) { - shared->GetCondVar()->Wait(); + thread->shared->GetStressTest()->OperateDb(thread); + { + MutexLock l(shared->GetMutex()); + shared->IncOperated(); + if (shared->AllOperated()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->VerifyStarted()) { + shared->GetCondVar()->Wait(); + } } - } - if (!FLAGS_skip_verifydb) { - thread->shared->GetStressTest()->VerifyDb(thread); - } + if (!FLAGS_skip_verifydb) { + thread->shared->GetStressTest()->VerifyDb(thread); + } - { - MutexLock l(shared->GetMutex()); - shared->IncDone(); - if (shared->AllDone()) { - shared->GetCondVar()->SignalAll(); + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } } } -} -bool RunStressTest(SharedState* shared) { + ThreadStatusUtil::UnregisterThread(); +} +bool RunStressTestImpl(SharedState* shared) { SystemClock* clock = db_stress_env->GetSystemClock().get(); StressTest* stress = shared->GetStressTest(); @@ -75,12 +81,30 @@ bool RunStressTest(SharedState* shared) { stress->InitDb(shared); stress->FinishInitDb(shared); - if (FLAGS_sync_fault_injection) { - fault_fs_guard->SetFilesystemDirectWritable(false); - } if (FLAGS_write_fault_one_in) { + if (!FLAGS_sync_fault_injection) { + // unsynced WAL loss is not supported without sync_fault_injection + fault_fs_guard->SetDirectWritableTypes({kWalFile}); + } + IOStatus error_msg; + if (FLAGS_inject_error_severity <= 1 || FLAGS_inject_error_severity > 2) { + error_msg = IOStatus::IOError("Retryable injected write error"); + error_msg.SetRetryable(true); + } else if (FLAGS_inject_error_severity == 2) { + error_msg = IOStatus::IOError("Fatal injected write error"); + error_msg.SetDataLoss(true); + } + // TODO: inject write error for other file types including + // MANIFEST, CURRENT, and WAL files. 
+ fault_fs_guard->SetRandomWriteError( + shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, + /*inject_for_all_file_types=*/false, {FileType::kTableFile}); + fault_fs_guard->SetFilesystemDirectWritable(false); fault_fs_guard->EnableWriteErrorInjection(); } + if (FLAGS_sync_fault_injection) { + fault_fs_guard->SetFilesystemDirectWritable(false); + } uint32_t n = FLAGS_threads; uint64_t now = clock->NowMicros(); @@ -97,6 +121,11 @@ bool RunStressTest(SharedState* shared) { shared->IncBgThreads(); } + if (FLAGS_compressed_secondary_cache_size > 0 || + FLAGS_compressed_secondary_cache_ratio > 0.0) { + shared->IncBgThreads(); + } + std::vector threads(n); for (uint32_t i = 0; i < n; i++) { threads[i] = new ThreadState(i, shared); @@ -114,6 +143,13 @@ bool RunStressTest(SharedState* shared) { &continuous_verification_thread); } + ThreadState compressed_cache_set_capacity_thread(0, shared); + if (FLAGS_compressed_secondary_cache_size > 0 || + FLAGS_compressed_secondary_cache_ratio > 0.0) { + db_stress_env->StartThread(CompressedCacheSetCapacityThread, + &compressed_cache_set_capacity_thread); + } + // Each thread goes through the following states: // initializing -> wait for others to init -> read/populate/depopulate // wait for others to operate -> verify -> done @@ -140,45 +176,55 @@ bool RunStressTest(SharedState* shared) { } } - // This is after the verification step to avoid making all those `Get()`s - // and `MultiGet()`s contend on the DB-wide trace mutex. - if (!FLAGS_expected_values_dir.empty()) { - stress->TrackExpectedState(shared); - } - - now = clock->NowMicros(); - fprintf(stdout, "%s Starting database operations\n", - clock->TimeToString(now / 1000000).c_str()); + if (!FLAGS_verification_only) { + // This is after the verification step to avoid making all those `Get()`s + // and `MultiGet()`s contend on the DB-wide trace mutex. 
+ if (!FLAGS_expected_values_dir.empty()) { + stress->TrackExpectedState(shared); + } + now = clock->NowMicros(); + fprintf(stdout, "%s Starting database operations\n", + clock->TimeToString(now / 1000000).c_str()); - shared->SetStart(); - shared->GetCondVar()->SignalAll(); - while (!shared->AllOperated()) { - shared->GetCondVar()->Wait(); - } + shared->SetStart(); + shared->GetCondVar()->SignalAll(); + while (!shared->AllOperated()) { + shared->GetCondVar()->Wait(); + } - now = clock->NowMicros(); - if (FLAGS_test_batches_snapshots) { - fprintf(stdout, "%s Limited verification already done during gets\n", - clock->TimeToString((uint64_t)now / 1000000).c_str()); - } else if (FLAGS_skip_verifydb) { - fprintf(stdout, "%s Verification skipped\n", - clock->TimeToString((uint64_t)now / 1000000).c_str()); - } else { - fprintf(stdout, "%s Starting verification\n", - clock->TimeToString((uint64_t)now / 1000000).c_str()); - } + now = clock->NowMicros(); + if (FLAGS_test_batches_snapshots) { + fprintf(stdout, "%s Limited verification already done during gets\n", + clock->TimeToString((uint64_t)now / 1000000).c_str()); + } else if (FLAGS_skip_verifydb) { + fprintf(stdout, "%s Verification skipped\n", + clock->TimeToString((uint64_t)now / 1000000).c_str()); + } else { + fprintf(stdout, "%s Starting verification\n", + clock->TimeToString((uint64_t)now / 1000000).c_str()); + } - shared->SetStartVerify(); - shared->GetCondVar()->SignalAll(); - while (!shared->AllDone()) { - shared->GetCondVar()->Wait(); + shared->SetStartVerify(); + shared->GetCondVar()->SignalAll(); + while (!shared->AllDone()) { + shared->GetCondVar()->Wait(); + } } } - for (unsigned int i = 1; i < n; i++) { - threads[0]->stats.Merge(threads[i]->stats); + // If we are running verification_only + // stats will be empty and trying to report them will + // emit no ops or writes error. To avoid this, merging and reporting stats + // are not executed when running with verification_only + // TODO: We need to create verification stats (e.g. how many keys + // are verified by which method) and report them here instead of operation + // stats. 
+ if (!FLAGS_verification_only) { + for (unsigned int i = 1; i < n; i++) { + threads[0]->stats.Merge(threads[i]->stats); + } + threads[0]->stats.Report("Stress Test"); } - threads[0]->stats.Report("Stress Test"); for (unsigned int i = 0; i < n; i++) { delete threads[i]; @@ -190,10 +236,15 @@ bool RunStressTest(SharedState* shared) { fprintf(stdout, "%s Verification successful\n", clock->TimeToString(now / 1000000).c_str()); } - stress->PrintStatistics(); + + if (!FLAGS_verification_only) { + stress->PrintStatistics(); + } if (FLAGS_compaction_thread_pool_adjust_interval > 0 || - FLAGS_continuous_verification_interval > 0) { + FLAGS_continuous_verification_interval > 0 || + FLAGS_compressed_secondary_cache_size > 0 || + FLAGS_compressed_secondary_cache_ratio > 0.0) { MutexLock l(shared->GetMutex()); shared->SetShouldStopBgThread(); while (!shared->BgThreadsFinished()) { @@ -207,5 +258,11 @@ bool RunStressTest(SharedState* shared) { } return true; } +bool RunStressTest(SharedState* shared) { + ThreadStatusUtil::RegisterThread(db_stress_env, ThreadStatus::USER); + bool result = RunStressTestImpl(shared); + ThreadStatusUtil::UnregisterThread(); + return result; +} } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index af60df9bc20b..83e6838c7037 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -10,8 +10,72 @@ #ifdef GFLAGS #pragma once #include "db_stress_tool/db_stress_common.h" +#include "monitoring/thread_status_util.h" namespace ROCKSDB_NAMESPACE { +class DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper { + public: + explicit DbStressRandomAccessFileWrapper( + std::unique_ptr&& target) + : FSRandomAccessFileOwnerWrapper(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Read(offset, n, options, result, scratch, dbg); + } + + IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, + const IOOptions& options, IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->MultiRead(reqs, num_reqs, options, dbg); + } + + IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Prefetch(offset, n, options, dbg); + } + + IOStatus ReadAsync(FSReadRequest& req, const IOOptions& options, + std::function cb, + void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, + IODebugContext* dbg) override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + 
ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->ReadAsync(req, options, cb, cb_arg, io_handle, del_fn, + dbg); + } +}; + class DbStressFSWrapper : public FileSystemWrapper { public: explicit DbStressFSWrapper(const std::shared_ptr& t) @@ -19,6 +83,18 @@ class DbStressFSWrapper : public FileSystemWrapper { static const char* kClassName() { return "DbStressFS"; } const char* Name() const override { return kClassName(); } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg); + if (s.ok()) { + r->reset(new DbStressRandomAccessFileWrapper(std::move(file))); + } + return s; + } + IOStatus DeleteFile(const std::string& f, const IOOptions& opts, IODebugContext* dbg) override { // We determine whether it is a manifest file by searching a strong, diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index ef542db109e4..cd1c978b810e 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -92,6 +92,10 @@ DEFINE_int32( "on average. Setting `manual_wal_flush_one_in` to be greater than 0 " "implies `Options::manual_wal_flush = true` is set."); +DEFINE_int32(lock_wal_one_in, 1000000, + "If non-zero, then `LockWAL()` + `UnlockWAL()` will be called in " + "db_stress once for every N ops on average."); + DEFINE_bool(test_cf_consistency, false, "If set, runs the stress test dedicated to verifying writes to " "multiple column families are consistent. Setting this implies " @@ -132,6 +136,9 @@ DEFINE_uint64(db_write_buffer_size, ROCKSDB_NAMESPACE::Options().db_write_buffer_size, "Number of bytes to buffer in all memtables before compacting"); +DEFINE_bool(use_write_buffer_manager, false, + "Charge WriteBufferManager memory to the block cache"); + DEFINE_int32( write_buffer_size, static_cast(ROCKSDB_NAMESPACE::Options().write_buffer_size), @@ -194,15 +201,23 @@ DEFINE_int32(open_files, ROCKSDB_NAMESPACE::Options().max_open_files, "Maximum number of files to keep open at the same time " "(use default if == 0)"); -DEFINE_int64(compressed_cache_size, 0, - "Number of bytes to use as a cache of compressed data." - " 0 means use default settings."); +DEFINE_uint64(compressed_secondary_cache_size, 0, + "Number of bytes to use as a cache of compressed data." + " 0 means use default settings."); -DEFINE_int32( - compressed_cache_numshardbits, -1, - "Number of shards for the compressed block cache is 2 ** " - "compressed_cache_numshardbits. Negative value means default settings. " - "This is applied only if compressed_cache_size is greater than 0."); +DEFINE_int32(compressed_secondary_cache_numshardbits, -1, + "Number of shards for the compressed secondary cache is 2 ** " + "compressed_secondary_cache_numshardbits. " + "Negative value means default settings. 
This is applied only " + "if compressed_secondary_cache_size is greater than 0."); + +DEFINE_double(compressed_secondary_cache_ratio, 0.0, + "Fraction of block cache memory budget to use for compressed " + "secondary cache"); + +DEFINE_int32(secondary_cache_update_interval, 30 * 1000 * 1000, + "Interval between modification of secondary cache parameters, in " + "microseconds"); DEFINE_int32(compaction_style, ROCKSDB_NAMESPACE::Options().compaction_style, ""); @@ -390,7 +405,6 @@ DEFINE_double(experimental_mempurge_threshold, 0.0, DEFINE_bool(enable_write_thread_adaptive_yield, true, "Use a yielding spin loop for brief writer thread waits."); -#ifndef ROCKSDB_LITE // Options for StackableDB-based BlobDB DEFINE_bool(use_blob_db, false, "[Stacked BlobDB] Use BlobDB."); @@ -418,7 +432,6 @@ DEFINE_double( blob_db_gc_cutoff, ROCKSDB_NAMESPACE::blob_db::BlobDBOptions().garbage_collection_cutoff, "[Stacked BlobDB] Cutoff ratio for BlobDB garbage collection."); -#endif // !ROCKSDB_LITE // Options for integrated BlobDB DEFINE_bool(allow_setting_blob_options_dynamically, false, @@ -524,10 +537,11 @@ DEFINE_double(bloom_bits, 10, "Negative means use default settings."); DEFINE_int32( - ribbon_starting_level, 999, + bloom_before_level, 999, "Use Bloom filter on levels below specified and Ribbon beginning on level " - "specified. Flush is considered level -1. 999 or more -> always Bloom. 0 " - "-> Ribbon except Bloom for flush. -1 -> always Ribbon."); + "specified. Flush is considered level -1. Setting -1 -> always Ribbon. " + "0 -> Ribbon except Bloom for flush. INT_MAX (typically 2147483647) -> " + "always Bloom."); DEFINE_bool(partition_filters, false, "use partitioned filters " @@ -667,13 +681,24 @@ DEFINE_uint64(sst_file_manager_bytes_per_truncate, 0, "many bytes. By default whole files will be deleted."); DEFINE_bool(use_txn, false, - "Use TransactionDB. Currently the default write policy is " - "TxnDBWritePolicy::WRITE_PREPARED"); + "Use TransactionDB or OptimisticTransactionDB. When " + "use_optimistic_txn == false (by default), " + "it's (Pessimistic) TransactionDB"); DEFINE_uint64(txn_write_policy, 0, "The transaction write policy. Default is " "TxnDBWritePolicy::WRITE_COMMITTED. Note that this should not be " - "changed accross crashes."); + "changed across crashes."); + +DEFINE_bool(use_optimistic_txn, false, "Use OptimisticTransactionDB."); +DEFINE_uint64(occ_validation_policy, 1, + "Optimistic Concurrency Control Validation Policy for " + "OptimisticTransactionDB"); +DEFINE_bool(share_occ_lock_buckets, false, + "Share a pool of locks across DB instances for buckets"); +DEFINE_uint32( + occ_lock_bucket_count, 500, + "Bucket Count for shared Optimistic Concurrency Control (OCC) locks"); DEFINE_bool(unordered_write, false, "Turn on the unordered_write feature. This options is currently " @@ -745,6 +770,11 @@ DEFINE_bool(long_running_snapshots, false, DEFINE_bool(use_multiget, false, "If set, use the batched MultiGet API for reads"); +DEFINE_bool(use_get_entity, false, "If set, use the GetEntity API for reads"); + +DEFINE_bool(use_multi_get_entity, false, + "If set, use the MultiGetEntity API for reads"); + static bool ValidateInt32Percent(const char* flagname, int32_t value) { if (value < 0 || value > 100) { fprintf(stderr, "Invalid value for --%s: %d, 0<= pct <=100 \n", flagname, @@ -827,6 +857,9 @@ DEFINE_bool( "ZSTD 1.4.5+ is required. 
If ZSTD 1.4.5+ is not linked with the binary, " "this flag will have the default value true."); +DEFINE_bool(compression_checksum, false, + "Turn on zstd's checksum feature for detecting corruption."); + DEFINE_string(bottommost_compression_type, "disable", "Algorithm to use to compress bottommost level of the database. " "\"disable\" means disabling the feature"); @@ -911,6 +944,13 @@ DEFINE_int32(verify_checksum_one_in, 0, " checksum verification of all the files in the database once for" " every N ops on average. 0 indicates that calls to" " VerifyChecksum() are disabled."); + +DEFINE_int32(verify_file_checksums_one_in, 0, + "If non-zero, then DB::VerifyFileChecksums() will be called to do" + " checksum verification of all the files in the database once for" + " every N ops on average. 0 indicates that calls to" + " VerifyFileChecksums() are disabled."); + DEFINE_int32(verify_db_one_in, 0, "If non-zero, call VerifyDb() once for every N ops. 0 indicates " "that VerifyDb() will not be called in OperateDb(). Note that " @@ -968,12 +1008,18 @@ DEFINE_uint32( "specified number of bytes per key. Currently the supported " "nonzero values are 1, 2, 4 and 8."); +DEFINE_uint32(block_protection_bytes_per_key, 0, + "If nonzero, enables integrity protection in blocks at the " + "specified number of bytes per key. Currently the supported " + "nonzero values are 1, 2, 4 and 8."); + DEFINE_string(file_checksum_impl, "none", "Name of an implementation for file_checksum_gen_factory, or " "\"none\" for null."); DEFINE_int32(write_fault_one_in, 0, - "On non-zero, enables fault injection on write"); + "On non-zero, enables fault injection on write. Currently only" + "injects write error when writing to SST files."); DEFINE_uint64(user_timestamp_size, 0, "Number of bytes for a user-defined timestamp. Currently, only " @@ -983,21 +1029,22 @@ DEFINE_int32(open_metadata_write_fault_one_in, 0, "On non-zero, enables fault injection on file metadata write " "during DB reopen."); -#ifndef ROCKSDB_LITE DEFINE_string(secondary_cache_uri, "", "Full URI for creating a customized secondary cache object"); DEFINE_int32(secondary_cache_fault_one_in, 0, "On non-zero, enables fault injection in secondary cache inserts" " and lookups"); -#endif // ROCKSDB_LITE +DEFINE_double(tiered_cache_percent_compressed, 0.0, + "Percentage of total block cache budget to allocate to the " + "compressed cache"); DEFINE_int32(open_write_fault_one_in, 0, "On non-zero, enables fault injection on file writes " "during DB reopen."); DEFINE_int32(open_read_fault_one_in, 0, "On non-zero, enables fault injection on file reads " "during DB reopen."); -DEFINE_int32(injest_error_severity, 1, - "The severity of the injested IO Error. 1 is soft error (e.g. " +DEFINE_int32(inject_error_severity, 1, + "The severity of the injected IO Error. 1 is soft error (e.g. " "retryable error), 2 is fatal error, and the default is " "retryable error."); DEFINE_int32(prepopulate_block_cache, @@ -1008,7 +1055,6 @@ DEFINE_int32(prepopulate_block_cache, DEFINE_bool(two_write_queues, false, "Set to true to enable two write queues. Default: false"); -#ifndef ROCKSDB_LITE DEFINE_bool(use_only_the_last_commit_time_batch_for_recovery, false, "If true, the commit-time write batch will not be immediately " @@ -1022,7 +1068,6 @@ DEFINE_uint64( DEFINE_uint64(wp_commit_cache_bits, 23ull, "Number of bits to represent write-prepared transaction db's " "commit cache. 
Default: 23 (8M entries)"); -#endif // !ROCKSDB_LITE DEFINE_bool(adaptive_readahead, false, "Carry forward internal auto readahead size from one file to next " @@ -1048,6 +1093,11 @@ DEFINE_bool(allow_data_in_errors, ROCKSDB_NAMESPACE::Options().allow_data_in_errors, "If true, allow logging data, e.g. key, value in LOG files."); +DEFINE_bool(enable_thread_tracking, + ROCKSDB_NAMESPACE::Options().enable_thread_tracking, + "If true, the status of the threads involved in this DB will be " + "tracked and available via GetThreadList() API."); + DEFINE_int32(verify_iterator_with_expected_state_one_in, 0, "If non-zero, when TestIterate() is to be called, there is a " "1/verify_iterator_with_expected_state_one_in " @@ -1075,4 +1125,21 @@ DEFINE_uint64(stats_dump_period_sec, ROCKSDB_NAMESPACE::Options().stats_dump_period_sec, "Gap between printing stats to log in seconds"); +DEFINE_bool(use_io_uring, false, "Enable the use of IO uring on Posix"); + +DEFINE_bool(verification_only, false, + "If true, tests will only execute verification step"); +extern "C" bool RocksDbIOUringEnable() { return FLAGS_use_io_uring; } + +DEFINE_uint32(memtable_max_range_deletions, 0, + "If nonzero, RocksDB will try to flush the current memtable" + "after the number of range deletions is >= this limit"); + +DEFINE_uint32(bottommost_file_compaction_delay, 0, + "Delay kBottommostFiles compaction by this amount of seconds." + "See more in option comment."); + +DEFINE_bool(auto_readahead_size, false, + "Does auto tuning of readahead_size when enabled during scans."); + #endif // GFLAGS diff --git a/db_stress_tool/db_stress_listener.cc b/db_stress_tool/db_stress_listener.cc index 578f21c415c4..e2838c582a17 100644 --- a/db_stress_tool/db_stress_listener.cc +++ b/db_stress_tool/db_stress_listener.cc @@ -14,7 +14,6 @@ namespace ROCKSDB_NAMESPACE { #ifdef GFLAGS -#ifndef ROCKSDB_LITE // TODO: consider using expected_values_dir instead, but this is more // convenient for now. @@ -185,7 +184,6 @@ void DbStressListener::VerifyTableFileUniqueId( unique_ids_.Verify(id); } -#endif // !ROCKSDB_LITE #endif // GFLAGS } // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/db_stress_listener.h b/db_stress_tool/db_stress_listener.h index faced3172592..505b0a604dff 100644 --- a/db_stress_tool/db_stress_listener.h +++ b/db_stress_tool/db_stress_listener.h @@ -9,6 +9,7 @@ #include #include +#include "db_stress_tool/db_stress_shared_state.h" #include "file/filename.h" #include "file/writable_file_writer.h" #include "rocksdb/db.h" @@ -19,12 +20,14 @@ #include "rocksdb/unique_id.h" #include "util/gflags_compat.h" #include "util/random.h" +#include "utilities/fault_injection_fs.h" DECLARE_int32(compact_files_one_in); +extern std::shared_ptr fault_fs_guard; + namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE // Verify across process executions that all seen IDs are unique class UniqueIdVerifier { public: @@ -68,11 +71,23 @@ class DbStressListener : public EventListener { VerifyFilePath(info.file_path); // pretending doing some work here RandomSleep(); + if (FLAGS_read_fault_one_in) { + (void)fault_fs_guard->GetAndResetErrorCount(); + fault_fs_guard->DisableErrorInjection(); + } } void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*flush_job_info*/) override { RandomSleep(); + if (FLAGS_read_fault_one_in) { + // Hardcoded to inject retryable error as a non-retryable error would put + // the DB in read-only mode and then it would crash on the next write. 
+ fault_fs_guard->SetThreadLocalReadErrorContext( + static_cast(FLAGS_seed), FLAGS_read_fault_one_in, + true /* retryable */); + fault_fs_guard->EnableErrorInjection(); + } } void OnTableFileDeleted(const TableFileDeletionInfo& /*info*/) override { @@ -96,6 +111,24 @@ class DbStressListener : public EventListener { RandomSleep(); } + void OnSubcompactionBegin(const SubcompactionJobInfo& /* si */) override { + if (FLAGS_read_fault_one_in) { + // Hardcoded to inject retryable error as a non-retryable error would put + // the DB in read-only mode and then it would crash on the next write. + fault_fs_guard->SetThreadLocalReadErrorContext( + static_cast(FLAGS_seed), FLAGS_read_fault_one_in, + true /* retryable */); + fault_fs_guard->EnableErrorInjection(); + } + } + + void OnSubcompactionCompleted(const SubcompactionJobInfo& /* si */) override { + if (FLAGS_read_fault_one_in) { + (void)fault_fs_guard->GetAndResetErrorCount(); + fault_fs_guard->DisableErrorInjection(); + } + } + void OnTableFileCreationStarted( const TableFileCreationBriefInfo& /*info*/) override { ++num_pending_file_creations_; @@ -266,6 +299,5 @@ class DbStressListener : public EventListener { std::atomic num_pending_file_creations_; UniqueIdVerifier unique_ids_; }; -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_shared_state.h b/db_stress_tool/db_stress_shared_state.h index 5565c62211dc..bad6a77e1fa5 100644 --- a/db_stress_tool/db_stress_shared_state.h +++ b/db_stress_tool/db_stress_shared_state.h @@ -35,7 +35,7 @@ DECLARE_int32(open_metadata_write_fault_one_in); DECLARE_int32(open_write_fault_one_in); DECLARE_int32(open_read_fault_one_in); -DECLARE_int32(injest_error_severity); +DECLARE_int32(inject_error_severity); namespace ROCKSDB_NAMESPACE { class StressTest; @@ -43,12 +43,6 @@ class StressTest; // State shared by all concurrent executions of the same benchmark. class SharedState { public: - // indicates a key may have any value (or not be present) as an operation on - // it is incomplete. - static constexpr uint32_t UNKNOWN_SENTINEL = 0xfffffffe; - // indicates a key should definitely be deleted - static constexpr uint32_t DELETION_SENTINEL = 0xffffffff; - // Errors when reading filter blocks are ignored, so we use a thread // local variable updated via sync points to keep track of errors injected // while reading filter blocks in order to ignore the Get/MultiGet result @@ -254,54 +248,70 @@ class SharedState { return expected_state_manager_->ClearColumnFamily(cf); } - // @param pending True if the update may have started but is not yet - // guaranteed finished. This is useful for crash-recovery testing when the - // process may crash before updating the expected values array. + // Prepare a Put that will be started but not finish yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value // - // Requires external locking covering `key` in `cf`. - void Put(int cf, int64_t key, uint32_t value_base, bool pending) { - return expected_state_manager_->Put(cf, key, value_base, pending); + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PreparePut(int cf, int64_t key) { + return expected_state_manager_->PreparePut(cf, key); } - // Requires external locking covering `key` in `cf`. - uint32_t Get(int cf, int64_t key) const { + // Does not requires external locking. 
+ ExpectedValue Get(int cf, int64_t key) { return expected_state_manager_->Get(cf, key); } - // @param pending See comment above Put() - // Returns true if the key was not yet deleted. + // Prepare a Delete that will be started but not finish yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value // - // Requires external locking covering `key` in `cf`. - bool Delete(int cf, int64_t key, bool pending) { - return expected_state_manager_->Delete(cf, key, pending); + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareDelete(int cf, int64_t key) { + return expected_state_manager_->PrepareDelete(cf, key); } - // @param pending See comment above Put() - // Returns true if the key was not yet deleted. - // - // Requires external locking covering `key` in `cf`. - bool SingleDelete(int cf, int64_t key, bool pending) { - return expected_state_manager_->Delete(cf, key, pending); + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) { + return expected_state_manager_->PrepareSingleDelete(cf, key); } - // @param pending See comment above Put() - // Returns number of keys deleted by the call. - // - // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. - int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) { - return expected_state_manager_->DeleteRange(cf, begin_key, end_key, - pending); + // Requires external locking covering keys in `[begin_key, end_key)` in `cf` + // to prevent concurrent write or delete to the same `key`. + std::vector PrepareDeleteRange(int cf, + int64_t begin_key, + int64_t end_key) { + return expected_state_manager_->PrepareDeleteRange(cf, begin_key, end_key); } bool AllowsOverwrite(int64_t key) const { return no_overwrite_ids_.find(key) == no_overwrite_ids_.end(); } - // Requires external locking covering `key` in `cf`. + // Requires external locking covering `key` in `cf` to prevent concurrent + // delete to the same `key`. 
bool Exists(int cf, int64_t key) { return expected_state_manager_->Exists(cf, key); } + // Sync the `value_base` to the corresponding expected value + void SyncPut(int cf, int64_t key, uint32_t value_base) { + return expected_state_manager_->SyncPut(cf, key, value_base); + } + + // Sync the corresponding expected value to be pending Put + void SyncPendingPut(int cf, int64_t key) { + return expected_state_manager_->SyncPendingPut(cf, key); + } + + // Sync the corresponding expected value to be deleted + void SyncDelete(int cf, int64_t key) { + return expected_state_manager_->SyncDelete(cf, key); + } + uint32_t GetSeed() const { return seed_; } void SetShouldStopBgThread() { should_stop_bg_thread_ = true; } @@ -332,6 +342,13 @@ class SharedState { uint64_t GetStartTimestamp() const { return start_timestamp_; } + void SafeTerminate() { + // Grab mutex so that we don't call terminate while another thread is + // attempting to print a stack trace due to the first one + MutexLock l(&mu_); + std::terminate(); + } + private: static void IgnoreReadErrorCallback(void*) { ignore_read_error = true; } diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index e939954671f7..20077558f6df 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -9,13 +9,17 @@ // #include +#include +#include "rocksdb/options.h" #include "util/compression.h" #ifdef GFLAGS #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_compaction_filter.h" #include "db_stress_tool/db_stress_driver.h" #include "db_stress_tool/db_stress_table_properties_collector.h" +#include "db_stress_tool/db_stress_wide_merge_operator.h" +#include "options/options_parser.h" #include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" #include "rocksdb/secondary_cache.h" @@ -38,12 +42,12 @@ std::shared_ptr CreateFilterPolicy() { return BlockBasedTableOptions().filter_policy; } const FilterPolicy* new_policy; - if (FLAGS_ribbon_starting_level >= 999) { + if (FLAGS_bloom_before_level == INT_MAX) { // Use Bloom API new_policy = NewBloomFilterPolicy(FLAGS_bloom_bits, false); } else { - new_policy = NewRibbonFilterPolicy( - FLAGS_bloom_bits, /* bloom_before_level */ FLAGS_ribbon_starting_level); + new_policy = + NewRibbonFilterPolicy(FLAGS_bloom_bits, FLAGS_bloom_before_level); } return std::shared_ptr(new_policy); } @@ -52,13 +56,10 @@ std::shared_ptr CreateFilterPolicy() { StressTest::StressTest() : cache_(NewCache(FLAGS_cache_size, FLAGS_cache_numshardbits)), - compressed_cache_(NewLRUCache(FLAGS_compressed_cache_size, - FLAGS_compressed_cache_numshardbits)), filter_policy_(CreateFilterPolicy()), db_(nullptr), -#ifndef ROCKSDB_LITE txn_db_(nullptr), -#endif + optimistic_txn_db_(nullptr), db_aptr_(nullptr), clock_(db_stress_env->GetSystemClock().get()), new_column_family_name_(1), @@ -78,14 +79,10 @@ StressTest::StressTest() Options options; options.env = db_stress_env; // Remove files without preserving manfiest files -#ifndef ROCKSDB_LITE const Status s = !FLAGS_use_blob_db ? 
DestroyDB(FLAGS_db, options) : blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions()); -#else - const Status s = DestroyDB(FLAGS_db, options); -#endif // !ROCKSDB_LITE if (!s.ok()) { fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str()); @@ -115,42 +112,101 @@ std::shared_ptr StressTest::NewCache(size_t capacity, return nullptr; } - if (FLAGS_cache_type == "clock_cache") { + std::shared_ptr secondary_cache; + if (!FLAGS_secondary_cache_uri.empty()) { + assert(!strstr(FLAGS_secondary_cache_uri.c_str(), + "compressed_secondary_cache") || + (FLAGS_compressed_secondary_cache_size == 0 && + FLAGS_compressed_secondary_cache_ratio == 0.0 && + !StartsWith(FLAGS_cache_type, "tiered_"))); + Status s = SecondaryCache::CreateFromString( + config_options, FLAGS_secondary_cache_uri, &secondary_cache); + if (secondary_cache == nullptr) { + fprintf(stderr, + "No secondary cache registered matching string: %s status=%s\n", + FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); + exit(1); + } + if (FLAGS_secondary_cache_fault_one_in > 0) { + secondary_cache = std::make_shared( + secondary_cache, static_cast(FLAGS_seed), + FLAGS_secondary_cache_fault_one_in); + } + } else if (FLAGS_compressed_secondary_cache_size > 0) { + if (StartsWith(FLAGS_cache_type, "tiered_")) { + fprintf(stderr, + "Cannot specify both compressed_secondary_cache_size and %s\n", + FLAGS_cache_type.c_str()); + exit(1); + } + CompressedSecondaryCacheOptions opts; + opts.capacity = FLAGS_compressed_secondary_cache_size; + secondary_cache = NewCompressedSecondaryCache(opts); + if (secondary_cache == nullptr) { + fprintf(stderr, "Failed to allocate compressed secondary cache\n"); + exit(1); + } + compressed_secondary_cache = secondary_cache; + } + + std::string cache_type = FLAGS_cache_type; + size_t cache_size = FLAGS_cache_size; + bool tiered = false; + if (StartsWith(cache_type, "tiered_")) { + tiered = true; + cache_type.erase(0, strlen("tiered_")); + } + if (FLAGS_use_write_buffer_manager) { + cache_size += FLAGS_db_write_buffer_size; + } + if (cache_type == "clock_cache") { fprintf(stderr, "Old clock cache implementation has been removed.\n"); exit(1); - } else if (FLAGS_cache_type == "hyper_clock_cache") { - return HyperClockCacheOptions(static_cast(capacity), - FLAGS_block_size /*estimated_entry_charge*/, - num_shard_bits) - .MakeSharedCache(); - } else if (FLAGS_cache_type == "lru_cache") { + } else if (EndsWith(cache_type, "hyper_clock_cache")) { + size_t estimated_entry_charge; + if (cache_type == "fixed_hyper_clock_cache" || + cache_type == "hyper_clock_cache") { + estimated_entry_charge = FLAGS_block_size; + } else if (cache_type == "auto_hyper_clock_cache") { + estimated_entry_charge = 0; + } else { + fprintf(stderr, "Cache type not supported."); + exit(1); + } + HyperClockCacheOptions opts(cache_size, estimated_entry_charge, + num_shard_bits); + opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX); + if (tiered) { + TieredCacheOptions tiered_opts; + tiered_opts.cache_opts = &opts; + tiered_opts.cache_type = PrimaryCacheType::kCacheTypeHCC; + tiered_opts.total_capacity = cache_size; + tiered_opts.compressed_secondary_ratio = 0.5; + block_cache = NewTieredCache(tiered_opts); + } else { + opts.secondary_cache = std::move(secondary_cache); + block_cache = opts.MakeSharedCache(); + } + } else if (EndsWith(cache_type, "lru_cache")) { LRUCacheOptions opts; opts.capacity = capacity; opts.num_shard_bits = num_shard_bits; -#ifndef ROCKSDB_LITE - std::shared_ptr secondary_cache; - if 
(!FLAGS_secondary_cache_uri.empty()) { - Status s = SecondaryCache::CreateFromString( - config_options, FLAGS_secondary_cache_uri, &secondary_cache); - if (secondary_cache == nullptr) { - fprintf(stderr, - "No secondary cache registered matching string: %s status=%s\n", - FLAGS_secondary_cache_uri.c_str(), s.ToString().c_str()); - exit(1); - } - if (FLAGS_secondary_cache_fault_one_in > 0) { - secondary_cache = std::make_shared( - secondary_cache, static_cast(FLAGS_seed), - FLAGS_secondary_cache_fault_one_in); - } - opts.secondary_cache = secondary_cache; + if (tiered) { + TieredCacheOptions tiered_opts; + tiered_opts.cache_opts = &opts; + tiered_opts.cache_type = PrimaryCacheType::kCacheTypeLRU; + tiered_opts.total_capacity = cache_size; + tiered_opts.compressed_secondary_ratio = 0.5; + block_cache = NewTieredCache(tiered_opts); + } else { + opts.secondary_cache = std::move(secondary_cache); + block_cache = NewLRUCache(opts); } -#endif - return NewLRUCache(opts); } else { fprintf(stderr, "Cache type not supported."); exit(1); } + return block_cache; } std::vector StressTest::GetBlobCompressionTags() { @@ -270,6 +326,13 @@ bool StressTest::BuildOptionsTable() { std::vector{"kDisable", "kFlushOnly"}); } + if (FLAGS_bloom_before_level != INT_MAX) { + // Can modify RibbonFilterPolicy field + options_tbl.emplace("table_factory.filter_policy.bloom_before_level", + std::vector{"-1", "0", "1", "2", + "2147483646", "2147483647"}); + } + options_table_ = std::move(options_tbl); for (const auto& iter : options_table_) { @@ -307,15 +370,14 @@ void StressTest::FinishInitDb(SharedState* shared) { exit(1); } } -#ifndef ROCKSDB_LITE - if (FLAGS_use_txn) { + if (FLAGS_use_txn && !FLAGS_use_optimistic_txn) { // It's OK here without sync because unsynced data cannot be lost at this // point // - even with sync_fault_injection=1 as the // file is still directly writable until after FinishInitDb() ProcessRecoveredPreparedTxns(shared); } -#endif + if (FLAGS_enable_compaction_filter) { auto* compaction_filter_factory = reinterpret_cast( @@ -405,10 +467,22 @@ Status StressTest::AssertSame(DB* db, ColumnFamilyHandle* cf, return Status::OK(); } -void StressTest::VerificationAbort(SharedState* shared, std::string msg, - Status s) const { - fprintf(stderr, "Verification failed: %s. 
Status is %s\n", msg.c_str(), - s.ToString().c_str()); +void StressTest::ProcessStatus(SharedState* shared, std::string opname, + Status s) const { + if (s.ok()) { + return; + } + if (!s.IsIOError() || !std::strstr(s.getState(), "injected")) { + std::ostringstream oss; + oss << opname << " failed: " << s.ToString(); + VerificationAbort(shared, oss.str()); + assert(false); + } + fprintf(stdout, "%s failed: %s\n", opname.c_str(), s.ToString().c_str()); +} + +void StressTest::VerificationAbort(SharedState* shared, std::string msg) const { + fprintf(stderr, "Verification failed: %s\n", msg.c_str()); shared->SetVerificationFailure(); } @@ -437,47 +511,27 @@ void StressTest::VerificationAbort(SharedState* shared, std::string msg, int cf, void StressTest::VerificationAbort(SharedState* shared, int cf, int64_t key, const Slice& value, - const WideColumns& columns, - const WideColumns& expected_columns) const { + const WideColumns& columns) const { assert(shared); auto key_str = Key(key); fprintf(stderr, "Verification failed for column family %d key %s (%" PRIi64 - "): Value and columns inconsistent: %s\n", + "): Value and columns inconsistent: value: %s, columns: %s\n", cf, Slice(key_str).ToString(/* hex */ true).c_str(), key, - DebugString(value, columns, expected_columns).c_str()); + value.ToString(/* hex */ true).c_str(), + WideColumnsToHex(columns).c_str()); shared->SetVerificationFailure(); } std::string StressTest::DebugString(const Slice& value, - const WideColumns& columns, - const WideColumns& expected_columns) { + const WideColumns& columns) { std::ostringstream oss; - oss << "value: " << value.ToString(/* hex */ true); - - auto dump = [](const WideColumns& cols, std::ostream& os) { - if (cols.empty()) { - return; - } - - os << std::hex; - - auto it = cols.begin(); - os << *it; - for (++it; it != cols.end(); ++it) { - os << ' ' << *it; - } - }; - - oss << ", columns: "; - dump(columns, oss); - - oss << ", expected_columns: "; - dump(expected_columns, oss); + oss << "value: " << value.ToString(/* hex */ true) + << ", columns: " << WideColumnsToHex(columns); return oss.str(); } @@ -510,19 +564,23 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, for (int64_t k = 0; k != number_of_keys; ++k) { const std::string key = Key(k); - constexpr uint32_t value_base = 0; + PendingExpectedValue pending_expected_value = + shared->PreparePut(cf_idx, k); + const uint32_t value_base = pending_expected_value.GetFinalValueBase(); const size_t sz = GenerateValue(value_base, value, sizeof(value)); const Slice v(value, sz); - shared->Put(cf_idx, k, value_base, true /* pending */); - std::string ts; if (FLAGS_user_timestamp_size > 0) { ts = GetNowNanos(); } - if (FLAGS_use_merge) { + if (FLAGS_use_put_entity_one_in > 0 && + (value_base % FLAGS_use_put_entity_one_in) == 0) { + s = db_->PutEntity(write_opts, cfh, key, + GenerateWideColumns(value_base, v)); + } else if (FLAGS_use_merge) { if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size > 0) { s = db_->Merge(write_opts, cfh, key, ts, v); @@ -530,20 +588,10 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, s = db_->Merge(write_opts, cfh, key, v); } } else { -#ifndef ROCKSDB_LITE - Transaction* txn; - s = NewTxn(write_opts, &txn); - if (s.ok()) { - s = txn->Merge(cfh, key, v); - if (s.ok()) { - s = CommitTxn(txn); - } - } -#endif + s = ExecuteTransaction( + write_opts, /*thread=*/nullptr, + [&](Transaction& txn) { return txn.Merge(cfh, key, v); }); } - } else if (FLAGS_use_put_entity_one_in > 0) { - s = 
db_->PutEntity(write_opts, cfh, key, - GenerateWideColumns(value_base, v)); } else { if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size > 0) { @@ -552,20 +600,13 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, s = db_->Put(write_opts, cfh, key, v); } } else { -#ifndef ROCKSDB_LITE - Transaction* txn; - s = NewTxn(write_opts, &txn); - if (s.ok()) { - s = txn->Put(cfh, key, v); - if (s.ok()) { - s = CommitTxn(txn); - } - } -#endif + s = ExecuteTransaction( + write_opts, /*thread=*/nullptr, + [&](Transaction& txn) { return txn.Put(cfh, key, v); }); } } - shared->Put(cf_idx, k, value_base, false /* pending */); + pending_expected_value.Commit(); if (!s.ok()) { break; } @@ -585,9 +626,8 @@ void StressTest::PreloadDbAndReopenAsReadOnly(int64_t number_of_keys, column_families_.clear(); delete db_; db_ = nullptr; -#ifndef ROCKSDB_LITE txn_db_ = nullptr; -#endif + optimistic_txn_db_ = nullptr; db_preload_finished_.store(true); auto now = clock_->NowMicros(); @@ -625,7 +665,6 @@ Status StressTest::SetOptions(ThreadState* thread) { return db_->SetOptions(cfh, opts); } -#ifndef ROCKSDB_LITE void StressTest::ProcessRecoveredPreparedTxns(SharedState* shared) { assert(txn_db_); std::vector recovered_prepared_trans; @@ -648,8 +687,7 @@ void StressTest::ProcessRecoveredPreparedTxnsHelper(Transaction* txn, for (wbwi_iter->SeekToFirst(); wbwi_iter->Valid(); wbwi_iter->Next()) { uint64_t key_val; if (GetIntVal(wbwi_iter->Entry().key.ToString(), &key_val)) { - shared->Put(static_cast(i) /* cf_idx */, key_val, - 0 /* value_base */, true /* pending */); + shared->SyncPendingPut(static_cast(i) /* cf_idx */, key_val); } } } @@ -662,77 +700,118 @@ void StressTest::ProcessRecoveredPreparedTxnsHelper(Transaction* txn, } } -Status StressTest::NewTxn(WriteOptions& write_opts, Transaction** txn) { +Status StressTest::NewTxn(WriteOptions& write_opts, + std::unique_ptr* out_txn) { if (!FLAGS_use_txn) { return Status::InvalidArgument("NewTxn when FLAGS_use_txn is not set"); } write_opts.disableWAL = FLAGS_disable_wal; static std::atomic txn_id = {0}; - TransactionOptions txn_options; - txn_options.use_only_the_last_commit_time_batch_for_recovery = - FLAGS_use_only_the_last_commit_time_batch_for_recovery; - txn_options.lock_timeout = 600000; // 10 min - txn_options.deadlock_detect = true; - *txn = txn_db_->BeginTransaction(write_opts, txn_options); - auto istr = std::to_string(txn_id.fetch_add(1)); - Status s = (*txn)->SetName("xid" + istr); - return s; + if (FLAGS_use_optimistic_txn) { + out_txn->reset(optimistic_txn_db_->BeginTransaction(write_opts)); + return Status::OK(); + } else { + TransactionOptions txn_options; + txn_options.use_only_the_last_commit_time_batch_for_recovery = + FLAGS_use_only_the_last_commit_time_batch_for_recovery; + txn_options.lock_timeout = 600000; // 10 min + txn_options.deadlock_detect = true; + out_txn->reset(txn_db_->BeginTransaction(write_opts, txn_options)); + auto istr = std::to_string(txn_id.fetch_add(1)); + Status s = (*out_txn)->SetName("xid" + istr); + return s; + } } -Status StressTest::CommitTxn(Transaction* txn, ThreadState* thread) { +Status StressTest::CommitTxn(Transaction& txn, ThreadState* thread) { if (!FLAGS_use_txn) { return Status::InvalidArgument("CommitTxn when FLAGS_use_txn is not set"); } - assert(txn_db_); - Status s = txn->Prepare(); - std::shared_ptr timestamped_snapshot; - if (s.ok()) { - if (thread && FLAGS_create_timestamped_snapshot_one_in && - thread->rand.OneIn(FLAGS_create_timestamped_snapshot_one_in)) { - uint64_t ts = 
db_stress_env->NowNanos(); - s = txn->CommitAndTryCreateSnapshot(/*notifier=*/nullptr, ts, - ×tamped_snapshot); - - std::pair> res; - if (thread->tid == 0) { - uint64_t now = db_stress_env->NowNanos(); - res = txn_db_->CreateTimestampedSnapshot(now); - if (res.first.ok()) { - assert(res.second); - assert(res.second->GetTimestamp() == now); - if (timestamped_snapshot) { - assert(res.second->GetTimestamp() > - timestamped_snapshot->GetTimestamp()); + Status s = Status::OK(); + if (FLAGS_use_optimistic_txn) { + assert(optimistic_txn_db_); + s = txn.Commit(); + } else { + assert(txn_db_); + s = txn.Prepare(); + std::shared_ptr timestamped_snapshot; + if (s.ok()) { + if (thread && FLAGS_create_timestamped_snapshot_one_in && + thread->rand.OneIn(FLAGS_create_timestamped_snapshot_one_in)) { + uint64_t ts = db_stress_env->NowNanos(); + s = txn.CommitAndTryCreateSnapshot(/*notifier=*/nullptr, ts, + ×tamped_snapshot); + + std::pair> res; + if (thread->tid == 0) { + uint64_t now = db_stress_env->NowNanos(); + res = txn_db_->CreateTimestampedSnapshot(now); + if (res.first.ok()) { + assert(res.second); + assert(res.second->GetTimestamp() == now); + if (timestamped_snapshot) { + assert(res.second->GetTimestamp() > + timestamped_snapshot->GetTimestamp()); + } + } else { + assert(!res.second); } - } else { - assert(!res.second); } + } else { + s = txn.Commit(); } - } else { - s = txn->Commit(); + } + if (thread && FLAGS_create_timestamped_snapshot_one_in > 0 && + thread->rand.OneInOpt(50000)) { + uint64_t now = db_stress_env->NowNanos(); + constexpr uint64_t time_diff = static_cast(1000) * 1000 * 1000; + txn_db_->ReleaseTimestampedSnapshotsOlderThan(now - time_diff); } } - if (thread && FLAGS_create_timestamped_snapshot_one_in > 0 && - thread->rand.OneInOpt(50000)) { - uint64_t now = db_stress_env->NowNanos(); - constexpr uint64_t time_diff = static_cast(1000) * 1000 * 1000; - txn_db_->ReleaseTimestampedSnapshotsOlderThan(now - time_diff); - } - delete txn; return s; } -Status StressTest::RollbackTxn(Transaction* txn) { - if (!FLAGS_use_txn) { - return Status::InvalidArgument( - "RollbackTxn when FLAGS_use_txn is not" - " set"); +Status StressTest::ExecuteTransaction( + WriteOptions& write_opts, ThreadState* thread, + std::function&& ops) { + std::unique_ptr txn; + Status s = NewTxn(write_opts, &txn); + std::string try_again_messages; + if (s.ok()) { + for (int tries = 1;; ++tries) { + s = ops(*txn); + if (s.ok()) { + s = CommitTxn(*txn, thread); + if (s.ok()) { + break; + } + } + // Optimistic txn might return TryAgain, in which case rollback + // and try again. + if (!s.IsTryAgain() || !FLAGS_use_optimistic_txn) { + break; + } + // Record and report historical TryAgain messages for debugging + try_again_messages += + std::to_string(SystemClock::Default()->NowMicros() / 1000); + try_again_messages += "ms "; + try_again_messages += s.getState(); + try_again_messages += "\n"; + // In theory, each Rollback after TryAgain should have an independent + // chance of success, so too many retries could indicate something is + // not working properly. 
+ if (tries >= 10) { + s = Status::TryAgain(try_again_messages); + break; + } + s = txn->Rollback(); + if (!s.ok()) { + break; + } + } } - Status s = txn->Rollback(); - delete txn; return s; } -#endif void StressTest::OperateDb(ThreadState* thread) { ReadOptions read_opts(FLAGS_verify_checksum, true); @@ -741,6 +820,7 @@ void StressTest::OperateDb(ThreadState* thread) { read_opts.async_io = FLAGS_async_io; read_opts.adaptive_readahead = FLAGS_adaptive_readahead; read_opts.readahead_size = FLAGS_readahead_size; + read_opts.auto_readahead_size = FLAGS_auto_readahead_size; WriteOptions write_opts; if (FLAGS_rate_limit_auto_wal_flush) { write_opts.rate_limiter_priority = Env::IO_USER; @@ -766,27 +846,11 @@ void StressTest::OperateDb(ThreadState* thread) { #ifndef NDEBUG if (FLAGS_read_fault_one_in) { - fault_fs_guard->SetThreadLocalReadErrorContext(thread->shared->GetSeed(), - FLAGS_read_fault_one_in); + fault_fs_guard->SetThreadLocalReadErrorContext( + thread->shared->GetSeed(), FLAGS_read_fault_one_in, + FLAGS_inject_error_severity == 1 /* retryable */); } #endif // NDEBUG - if (FLAGS_write_fault_one_in) { - IOStatus error_msg; - if (FLAGS_injest_error_severity <= 1 || FLAGS_injest_error_severity > 2) { - error_msg = IOStatus::IOError("Retryable IO Error"); - error_msg.SetRetryable(true); - } else if (FLAGS_injest_error_severity == 2) { - // Ingest the fatal error - error_msg = IOStatus::IOError("Fatal IO Error"); - error_msg.SetDataLoss(true); - } - std::vector types = {FileType::kTableFile, - FileType::kDescriptorFile, - FileType::kCurrentFile}; - fault_fs_guard->SetRandomWriteError( - thread->shared->GetSeed(), FLAGS_write_fault_one_in, error_msg, - /*inject_for_all_file_types=*/false, types); - } thread->stats.Start(); for (int open_cnt = 0; open_cnt <= FLAGS_reopen; ++open_cnt) { if (thread->shared->HasVerificationFailedYet() || @@ -845,6 +909,31 @@ void StressTest::OperateDb(ThreadState* thread) { } } + if (thread->rand.OneInOpt(FLAGS_lock_wal_one_in)) { + Status s = db_->LockWAL(); + if (!s.ok()) { + fprintf(stderr, "LockWAL() failed: %s\n", s.ToString().c_str()); + } else { + auto old_seqno = db_->GetLatestSequenceNumber(); + // Yield for a while + do { + std::this_thread::yield(); + } while (thread->rand.OneIn(2)); + // Latest seqno should not have changed + auto new_seqno = db_->GetLatestSequenceNumber(); + if (old_seqno != new_seqno) { + fprintf( + stderr, + "Failure: latest seqno changed from %u to %u with WAL locked\n", + (unsigned)old_seqno, (unsigned)new_seqno); + } + s = db_->UnlockWAL(); + if (!s.ok()) { + fprintf(stderr, "UnlockWAL() failed: %s\n", s.ToString().c_str()); + } + } + } + if (thread->rand.OneInOpt(FLAGS_sync_wal_one_in)) { Status s = db_->SyncWAL(); if (!s.ok() && !s.IsNotSupported()) { @@ -881,55 +970,51 @@ void StressTest::OperateDb(ThreadState* thread) { } } -#ifndef ROCKSDB_LITE // Verify GetLiveFiles with a 1 in N chance. if (thread->rand.OneInOpt(FLAGS_get_live_files_one_in) && !FLAGS_write_fault_one_in) { Status status = VerifyGetLiveFiles(); - if (!status.ok()) { - VerificationAbort(shared, "VerifyGetLiveFiles status not OK", status); - } + ProcessStatus(shared, "VerifyGetLiveFiles", status); } // Verify GetSortedWalFiles with a 1 in N chance. 
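ExecuteTransaction() above wraps the write in a transaction and, for optimistic transactions, retries the whole operation when commit validation reports TryAgain. A self-contained sketch of the same pattern against an OptimisticTransactionDB; unlike the stress test it recreates the transaction on every attempt, and the assumption that conflicts surface as TryAgain or Busy is an illustration, not taken from the patch.

#include <functional>
#include <memory>

#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/utilities/transaction.h"

using namespace ROCKSDB_NAMESPACE;

Status RunWithRetry(OptimisticTransactionDB* txn_db, const WriteOptions& wo,
                    const std::function<Status(Transaction&)>& ops,
                    int max_tries = 10) {
  Status s;
  for (int tries = 1; tries <= max_tries; ++tries) {
    std::unique_ptr<Transaction> txn(txn_db->BeginTransaction(wo));
    s = ops(*txn);
    if (s.ok()) {
      s = txn->Commit();  // conflict validation happens at commit time
    }
    // Assumed: conflicts come back as TryAgain or Busy; anything else is
    // final, success included.
    if (s.ok() || (!s.IsTryAgain() && !s.IsBusy())) {
      break;
    }
    txn->Rollback().PermitUncheckedError();
  }
  return s;
}

// Example call:
//   Status s = RunWithRetry(db, WriteOptions(), [](Transaction& txn) {
//     return txn.Put("key", "value");
//   });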
if (thread->rand.OneInOpt(FLAGS_get_sorted_wal_files_one_in)) { Status status = VerifyGetSortedWalFiles(); - if (!status.ok()) { - VerificationAbort(shared, "VerifyGetSortedWalFiles status not OK", - status); - } + ProcessStatus(shared, "VerifyGetSortedWalFiles", status); } // Verify GetCurrentWalFile with a 1 in N chance. if (thread->rand.OneInOpt(FLAGS_get_current_wal_file_one_in)) { Status status = VerifyGetCurrentWalFile(); - if (!status.ok()) { - VerificationAbort(shared, "VerifyGetCurrentWalFile status not OK", - status); - } + ProcessStatus(shared, "VerifyGetCurrentWalFile", status); } -#endif // !ROCKSDB_LITE if (thread->rand.OneInOpt(FLAGS_pause_background_one_in)) { Status status = TestPauseBackground(thread); - if (!status.ok()) { - VerificationAbort( - shared, "Pause/ContinueBackgroundWork status not OK", status); - } + ProcessStatus(shared, "Pause/ContinueBackgroundWork", status); } -#ifndef ROCKSDB_LITE if (thread->rand.OneInOpt(FLAGS_verify_checksum_one_in)) { + ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_VERIFY_DB_CHECKSUM); Status status = db_->VerifyChecksum(); - if (!status.ok()) { - VerificationAbort(shared, "VerifyChecksum status not OK", status); - } + ThreadStatusUtil::ResetThreadStatus(); + ProcessStatus(shared, "VerifyChecksum", status); + } + + if (thread->rand.OneInOpt(FLAGS_verify_file_checksums_one_in)) { + ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_VERIFY_FILE_CHECKSUMS); + Status status = db_->VerifyFileChecksums(read_opts); + ThreadStatusUtil::ResetThreadStatus(); + ProcessStatus(shared, "VerifyFileChecksums", status); } if (thread->rand.OneInOpt(FLAGS_get_property_one_in)) { TestGetProperty(thread); } -#endif std::vector rand_keys = GenerateKeys(rand_key); @@ -951,38 +1036,27 @@ void StressTest::OperateDb(ThreadState* thread) { if (total_size <= FLAGS_backup_max_size) { Status s = TestBackupRestore(thread, rand_column_families, rand_keys); - if (!s.ok()) { - VerificationAbort(shared, "Backup/restore gave inconsistent state", - s); - } + ProcessStatus(shared, "Backup/restore", s); } } if (thread->rand.OneInOpt(FLAGS_checkpoint_one_in)) { Status s = TestCheckpoint(thread, rand_column_families, rand_keys); - if (!s.ok()) { - VerificationAbort(shared, "Checkpoint gave inconsistent state", s); - } + ProcessStatus(shared, "Checkpoint", s); } -#ifndef ROCKSDB_LITE if (thread->rand.OneInOpt(FLAGS_approximate_size_one_in)) { Status s = TestApproximateSize(thread, i, rand_column_families, rand_keys); - if (!s.ok()) { - VerificationAbort(shared, "ApproximateSize Failed", s); - } + ProcessStatus(shared, "ApproximateSize", s); } -#endif // !ROCKSDB_LITE if (thread->rand.OneInOpt(FLAGS_acquire_snapshot_one_in)) { TestAcquireSnapshot(thread, rand_column_family, keystr, i); } /*always*/ { Status s = MaybeReleaseSnapshots(thread, i); - if (!s.ok()) { - VerificationAbort(shared, "Snapshot gave inconsistent state", s); - } + ProcessStatus(shared, "Snapshot", s); } // Assign timestamps if necessary. 
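The loop above now also exercises VerifyFileChecksums() alongside VerifyChecksum(): the former checks whole-file checksums recorded in the MANIFEST (so it needs options.file_checksum_gen_factory to have been set when the files were written), while the latter re-reads table blocks and validates their block checksums. A minimal sketch of calling both outside the stress harness:

#include <cstdio>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

// Illustrative only: run both verifications and report any failure.
void VerifyDbIntegrity(DB* db) {
  Status s = db->VerifyChecksum();
  if (!s.ok()) {
    std::fprintf(stderr, "VerifyChecksum failed: %s\n", s.ToString().c_str());
  }
  // May fail if no file checksum generator was configured when the SST files
  // were written.
  s = db->VerifyFileChecksums(ReadOptions());
  if (!s.ok()) {
    std::fprintf(stderr, "VerifyFileChecksums failed: %s\n",
                 s.ToString().c_str());
  }
}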
@@ -1001,7 +1075,27 @@ void StressTest::OperateDb(ThreadState* thread) { if (prob_op >= 0 && prob_op < static_cast(FLAGS_readpercent)) { assert(0 <= prob_op); // OPERATION read - if (FLAGS_use_multiget) { + ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); + if (FLAGS_use_multi_get_entity) { + constexpr uint64_t max_batch_size = 64; + const uint64_t batch_size = std::min( + static_cast(thread->rand.Uniform(max_batch_size)) + 1, + ops_per_open - i); + assert(batch_size >= 1); + assert(batch_size <= max_batch_size); + assert(i + batch_size <= ops_per_open); + + rand_keys = GenerateNKeys(thread, static_cast(batch_size), i); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_MULTIGETENTITY); + TestMultiGetEntity(thread, read_opts, rand_column_families, + rand_keys); + i += batch_size - 1; + } else if (FLAGS_use_get_entity) { + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_GETENTITY); + TestGetEntity(thread, read_opts, rand_column_families, rand_keys); + } else if (FLAGS_use_multiget) { // Leave room for one more iteration of the loop with a single key // batch. This is to ensure that each thread does exactly the same // number of ops @@ -1011,11 +1105,16 @@ void StressTest::OperateDb(ThreadState* thread) { // If its the last iteration, ensure that multiget_batch_size is 1 multiget_batch_size = std::max(multiget_batch_size, 1); rand_keys = GenerateNKeys(thread, multiget_batch_size, i); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_MULTIGET); TestMultiGet(thread, read_opts, rand_column_families, rand_keys); i += multiget_batch_size - 1; } else { + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_GET); TestGet(thread, read_opts, rand_column_families, rand_keys); } + ThreadStatusUtil::ResetThreadStatus(); } else if (prob_op < prefix_bound) { assert(static_cast(FLAGS_readpercent) <= prob_op); // OPERATION prefix scan @@ -1043,8 +1142,12 @@ void StressTest::OperateDb(ThreadState* thread) { if (!FLAGS_skip_verifydb && thread->rand.OneInOpt( FLAGS_verify_iterator_with_expected_state_one_in)) { + ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_DBITERATOR); TestIterateAgainstExpected(thread, read_opts, rand_column_families, rand_keys); + ThreadStatusUtil::ResetThreadStatus(); } else { int num_seeks = static_cast(std::min( std::max(static_cast(thread->rand.Uniform(4)), @@ -1053,7 +1156,11 @@ void StressTest::OperateDb(ThreadState* thread) { static_cast(1)))); rand_keys = GenerateNKeys(thread, num_seeks, i); i += num_seeks - 1; + ThreadStatusUtil::SetEnableTracking(FLAGS_enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_DBITERATOR); TestIterate(thread, read_opts, rand_column_families, rand_keys); + ThreadStatusUtil::ResetThreadStatus(); } } else { assert(iterate_bound <= prob_op); @@ -1071,7 +1178,6 @@ void StressTest::OperateDb(ThreadState* thread) { thread->stats.Stop(); } -#ifndef ROCKSDB_LITE // Generated a list of keys that close to boundaries of SST keys. // If there isn't any SST file in the DB, return empty list. std::vector StressTest::GetWhiteBoxKeys(ThreadState* thread, @@ -1130,7 +1236,6 @@ std::vector StressTest::GetWhiteBoxKeys(ThreadState* thread, } return ret; } -#endif // !ROCKSDB_LITE // Given a key K, this creates an iterator which scans to K and then // does a random sequence of Next/Prev operations. 
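The read path above can now go through GetEntity()/MultiGetEntity() for wide-column entities instead of Get()/MultiGet(). A hedged single-key sketch; keys written as plain key-value pairs come back as one anonymous default column.

#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

using namespace ROCKSDB_NAMESPACE;

void PrintEntity(DB* db, const Slice& key) {
  PinnableWideColumns result;
  Status s =
      db->GetEntity(ReadOptions(), db->DefaultColumnFamily(), key, &result);
  if (!s.ok()) {
    std::fprintf(stderr, "GetEntity failed: %s\n", s.ToString().c_str());
    return;
  }
  for (const WideColumn& column : result.columns()) {
    std::fprintf(stdout, "%s -> %s\n", column.name().ToString(true).c_str(),
                 column.value().ToString(true).c_str());
  }
}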
@@ -1162,7 +1267,6 @@ Status StressTest::TestIterate(ThreadState* thread, } else if (options_.prefix_extractor.get() == nullptr) { expect_total_order = true; } - std::string upper_bound_str; Slice upper_bound; if (thread->rand.OneIn(16)) { @@ -1173,6 +1277,7 @@ Status StressTest::TestIterate(ThreadState* thread, upper_bound = Slice(upper_bound_str); ro.iterate_upper_bound = &upper_bound; } + std::string lower_bound_str; Slice lower_bound; if (thread->rand.OneIn(16)) { @@ -1267,6 +1372,14 @@ Status StressTest::TestIterate(ThreadState* thread, const bool support_seek_first_or_last = expect_total_order; + // Write-prepared and Write-unprepared do not support Refresh() yet. + if (!(FLAGS_use_txn && FLAGS_txn_write_policy != 0 /* write committed */) && + thread->rand.OneIn(4)) { + Status s = iter->Refresh(snapshot_guard.snapshot()); + assert(s.ok()); + op_logs += "Refresh "; + } + LastIterateOp last_op; if (support_seek_first_or_last && thread->rand.OneIn(100)) { iter->SeekToFirst(); @@ -1326,7 +1439,6 @@ Status StressTest::TestIterate(ThreadState* thread, return Status::OK(); } -#ifndef ROCKSDB_LITE // Test the return status of GetLiveFiles. Status StressTest::VerifyGetLiveFiles() const { std::vector live_file; @@ -1345,7 +1457,6 @@ Status StressTest::VerifyGetCurrentWalFile() const { std::unique_ptr cur_wal_file; return db_->GetCurrentWalFile(&cur_wal_file); } -#endif // !ROCKSDB_LITE // Compare the two iterator, iter and cmp_iter are in the same position, // unless iter might be made invalidate or undefined because of @@ -1374,11 +1485,11 @@ void StressTest::VerifyIterator(ThreadState* thread, } if (op == kLastOpSeekToFirst && ro.iterate_lower_bound != nullptr) { - // SeekToFirst() with lower bound is not well defined. + // SeekToFirst() with lower bound is not well-defined. *diverged = true; return; } else if (op == kLastOpSeekToLast && ro.iterate_upper_bound != nullptr) { - // SeekToLast() with higher bound is not well defined. + // SeekToLast() with higher bound is not well-defined. *diverged = true; return; } else if (op == kLastOpSeek && ro.iterate_lower_bound != nullptr && @@ -1389,7 +1500,7 @@ void StressTest::VerifyIterator(ThreadState* thread, options_.comparator->CompareWithoutTimestamp( *ro.iterate_lower_bound, /*a_has_ts=*/false, *ro.iterate_upper_bound, /*b_has_ts*/ false) >= 0))) { - // Lower bound behavior is not well defined if it is larger than + // Lower bound behavior is not well-defined if it is larger than // seek key or upper bound. Disable the check for now. *diverged = true; return; @@ -1401,7 +1512,7 @@ void StressTest::VerifyIterator(ThreadState* thread, options_.comparator->CompareWithoutTimestamp( *ro.iterate_lower_bound, /*a_has_ts=*/false, *ro.iterate_upper_bound, /*b_has_ts=*/false) >= 0))) { - // Uppder bound behavior is not well defined if it is smaller than + // Upper bound behavior is not well-defined if it is smaller than // seek key or lower bound. Disable the check for now. *diverged = true; return; @@ -1429,13 +1540,13 @@ void StressTest::VerifyIterator(ThreadState* thread, } } fprintf(stderr, - "Control interator is invalid but iterator has key %s " + "Control iterator is invalid but iterator has key %s " "%s\n", iter->key().ToString(true).c_str(), op_logs.c_str()); *diverged = true; } else if (cmp_iter->Valid()) { - // Iterator is not valid. It can be legimate if it has already been + // Iterator is not valid. It can be legitimate if it has already been // out of upper or lower bound, or filtered out by prefix iterator. 
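TestIterate() above randomly attaches iterate_lower_bound/iterate_upper_bound and now occasionally calls Iterator::Refresh() against a snapshot instead of recreating the iterator. A compact sketch of that plumbing with made-up bound values; note the bound Slices must outlive the iterator because ReadOptions stores raw pointers to them.

#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

void ScanWithBoundsAndRefresh(DB* db) {
  std::string lower = "key000";  // illustrative bounds
  std::string upper = "key999";
  Slice lower_bound(lower);
  Slice upper_bound(upper);

  ReadOptions ro;
  ro.iterate_lower_bound = &lower_bound;  // ReadOptions keeps raw pointers
  ro.iterate_upper_bound = &upper_bound;

  std::unique_ptr<Iterator> iter(db->NewIterator(ro));
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // consume iter->key() / iter->value() here
  }
  assert(iter->status().ok());

  // Re-point the existing iterator instead of recreating it; passing nullptr
  // refreshes to the current state of the DB.
  Status s = iter->Refresh(nullptr);
  if (!s.ok()) {
    // Some iterator types do not support Refresh(); fall back to recreating.
    iter.reset(db->NewIterator(ro));
  }
}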
const Slice& total_order_key = cmp_iter->key(); @@ -1458,7 +1569,7 @@ void StressTest::VerifyIterator(ThreadState* thread, return; } fprintf(stderr, - "Iterator stays in prefix but contol doesn't" + "Iterator stays in prefix but control doesn't" " iterator key %s control iterator key %s %s\n", iter->key().ToString(true).c_str(), cmp_iter->key().ToString(true).c_str(), op_logs.c_str()); @@ -1484,7 +1595,8 @@ void StressTest::VerifyIterator(ThreadState* thread, fprintf(stderr, "iterator has value %s\n", iter->key().ToString(true).c_str()); } else { - fprintf(stderr, "iterator is not valid\n"); + fprintf(stderr, "iterator is not valid with status: %s\n", + iter->status().ToString().c_str()); } *diverged = true; } @@ -1492,57 +1604,26 @@ void StressTest::VerifyIterator(ThreadState* thread, } if (!*diverged && iter->Valid()) { - const WideColumns expected_columns = - GenerateExpectedWideColumns(GetValueBase(iter->value()), iter->value()); - if (iter->columns() != expected_columns) { - fprintf(stderr, "Value and columns inconsistent for iterator: %s\n", - DebugString(iter->value(), iter->columns(), expected_columns) - .c_str()); + if (!VerifyWideColumns(iter->value(), iter->columns())) { + fprintf(stderr, + "Value and columns inconsistent for iterator: value: %s, " + "columns: %s\n", + iter->value().ToString(/* hex */ true).c_str(), + WideColumnsToHex(iter->columns()).c_str()); *diverged = true; } } if (*diverged) { - fprintf(stderr, "Control CF %s\n", cmp_cfh->GetName().c_str()); + fprintf(stderr, "VerifyIterator failed. Control CF %s\n", + cmp_cfh->GetName().c_str()); thread->stats.AddErrors(1); // Fail fast to preserve the DB state. thread->shared->SetVerificationFailure(); } } -#ifdef ROCKSDB_LITE -Status StressTest::TestBackupRestore( - ThreadState* /* thread */, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) { - assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestBackupRestore\n"); - std::terminate(); -} - -Status StressTest::TestCheckpoint( - ThreadState* /* thread */, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) { - assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestCheckpoint\n"); - std::terminate(); -} - -void StressTest::TestCompactFiles(ThreadState* /* thread */, - ColumnFamilyHandle* /* column_family */) { - assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "CompactFiles\n"); - std::terminate(); -} -#else // ROCKSDB_LITE Status StressTest::TestBackupRestore( ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys) { @@ -1687,13 +1768,16 @@ Status StressTest::TestBackupRestore( } DB* restored_db = nullptr; std::vector restored_cf_handles; + // Not yet implemented: opening restored BlobDB or TransactionDB + Options restore_options; + if (s.ok() && !FLAGS_use_txn && !FLAGS_use_blob_db) { + s = PrepareOptionsForRestoredDB(&restore_options); + if (!s.ok()) { + from = "PrepareRestoredDBOptions in backup/restore"; + } + } if (s.ok() && !FLAGS_use_txn && !FLAGS_use_blob_db) { - Options restore_options(options_); - restore_options.best_efforts_recovery = false; - restore_options.listeners.clear(); - // Avoid dangling/shared file descriptors, for reliable destroy - restore_options.sst_file_manager = nullptr; std::vector cf_descriptors; // TODO(ajkr): `column_family_names_` is not safe to access here when // `clear_column_family_one_in != 0`. 
But we can't easily switch to @@ -1804,13 +1888,75 @@ Status StressTest::TestBackupRestore( from = "Destroy restore dir"; } } - if (!s.ok()) { + if (!s.ok() && (!s.IsIOError() || !std::strstr(s.getState(), "injected"))) { fprintf(stderr, "Failure in %s with: %s\n", from.c_str(), s.ToString().c_str()); } return s; } +void InitializeMergeOperator(Options& options) { + if (FLAGS_use_full_merge_v1) { + options.merge_operator = MergeOperators::CreateDeprecatedPutOperator(); + } else { + if (FLAGS_use_put_entity_one_in > 0) { + options.merge_operator = std::make_shared(); + } else { + options.merge_operator = MergeOperators::CreatePutOperator(); + } + } +} + +Status StressTest::PrepareOptionsForRestoredDB(Options* options) { + assert(options); + // To avoid race with other threads' operations (e.g, SetOptions()) + // on the same pointer sub-option (e.g, `std::shared_ptr + // filter_policy`) while having the same settings as `options_`, we create a + // new Options object from `options_`'s string to deep copy these pointer + // sub-options + Status s; + ConfigOptions config_opts; + + std::string db_options_str; + s = GetStringFromDBOptions(config_opts, options_, &db_options_str); + if (!s.ok()) { + return s; + } + DBOptions db_options; + s = GetDBOptionsFromString(config_opts, Options(), db_options_str, + &db_options); + if (!s.ok()) { + return s; + } + + std::string cf_options_str; + s = GetStringFromColumnFamilyOptions(config_opts, options_, &cf_options_str); + if (!s.ok()) { + return s; + } + ColumnFamilyOptions cf_options; + s = GetColumnFamilyOptionsFromString(config_opts, Options(), cf_options_str, + &cf_options); + if (!s.ok()) { + return s; + } + + *options = Options(db_options, cf_options); + options->best_efforts_recovery = false; + options->listeners.clear(); + // Avoid dangling/shared file descriptors, for reliable destroy + options->sst_file_manager = nullptr; + // GetColumnFamilyOptionsFromString does not create customized merge operator. + InitializeMergeOperator(*options); + if (FLAGS_user_timestamp_size > 0) { + // Check OPTIONS string loading can bootstrap the correct user comparator + // from object registry. + assert(options->comparator); + assert(options->comparator == test::BytewiseComparatorWithU64TsWrapper()); + } + + return Status::OK(); +} Status StressTest::TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, @@ -1870,35 +2016,29 @@ Status StressTest::TestCheckpoint(ThreadState* thread, Options tmp_opts(options_); tmp_opts.listeners.clear(); tmp_opts.env = db_stress_env; + // Avoid delayed deletion so whole directory can be deleted + tmp_opts.sst_file_manager.reset(); DestroyDB(checkpoint_dir, tmp_opts); - if (db_stress_env->FileExists(checkpoint_dir).ok()) { - // If the directory might still exist, try to delete the files one by one. - // Likely a trash file is still there. 
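PrepareOptionsForRestoredDB() above deep-copies options_ by round-tripping them through their string form, so the restored DB does not share mutable pointer sub-options (filter policy, SST file manager, listeners) with the live instance. A trimmed sketch of the same round trip, with error handling collapsed:

#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

using namespace ROCKSDB_NAMESPACE;

// Produces an Options object equivalent to `src` but without sharing its
// pointer-typed sub-options.
Status DeepCopyOptions(const Options& src, Options* out) {
  ConfigOptions config_opts;

  std::string db_str;
  Status s = GetStringFromDBOptions(config_opts, src, &db_str);
  if (!s.ok()) return s;
  DBOptions db_opts;
  s = GetDBOptionsFromString(config_opts, Options(), db_str, &db_opts);
  if (!s.ok()) return s;

  std::string cf_str;
  s = GetStringFromColumnFamilyOptions(config_opts, src, &cf_str);
  if (!s.ok()) return s;
  ColumnFamilyOptions cf_opts;
  s = GetColumnFamilyOptionsFromString(config_opts, Options(), cf_str,
                                       &cf_opts);
  if (!s.ok()) return s;

  *out = Options(db_opts, cf_opts);
  return Status::OK();
}

As the patch itself notes, customized objects such as the merge operator are not reconstructed from the string form and have to be reinstalled on the copy afterwards.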
- Status my_s = DestroyDir(db_stress_env, checkpoint_dir); - if (!my_s.ok()) { - fprintf(stderr, "Fail to destory directory before checkpoint: %s", - my_s.ToString().c_str()); - } - } - Checkpoint* checkpoint = nullptr; Status s = Checkpoint::Create(db_, &checkpoint); if (s.ok()) { s = checkpoint->CreateCheckpoint(checkpoint_dir); if (!s.ok()) { - fprintf(stderr, "Fail to create checkpoint to %s\n", - checkpoint_dir.c_str()); - std::vector files; - Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files); - if (my_s.ok()) { - for (const auto& f : files) { - fprintf(stderr, " %s\n", f.c_str()); + if (!s.IsIOError() || !std::strstr(s.getState(), "injected")) { + fprintf(stderr, "Fail to create checkpoint to %s\n", + checkpoint_dir.c_str()); + std::vector files; + Status my_s = db_stress_env->GetChildren(checkpoint_dir, &files); + if (my_s.ok()) { + for (const auto& f : files) { + fprintf(stderr, " %s\n", f.c_str()); + } + } else { + fprintf(stderr, "Fail to get files under the directory to %s\n", + my_s.ToString().c_str()); } - } else { - fprintf(stderr, "Fail to get files under the directory to %s\n", - my_s.ToString().c_str()); } } } @@ -1974,8 +2114,10 @@ Status StressTest::TestCheckpoint(ThreadState* thread, } if (!s.ok()) { - fprintf(stderr, "A checkpoint operation failed with: %s\n", - s.ToString().c_str()); + if (!s.IsIOError() || !std::strstr(s.getState(), "injected")) { + fprintf(stderr, "A checkpoint operation failed with: %s\n", + s.ToString().c_str()); + } } else { DestroyDB(checkpoint_dir, tmp_opts); } @@ -2110,7 +2252,6 @@ void StressTest::TestCompactFiles(ThreadState* thread, } } } -#endif // ROCKSDB_LITE Status StressTest::TestFlush(const std::vector& rand_column_families) { FlushOptions flush_opts; @@ -2147,15 +2288,11 @@ void StressTest::TestAcquireSnapshot(ThreadState* thread, // This `ReadOptions` is for validation purposes. Ignore // `FLAGS_rate_limit_user_ops` to avoid slowing any validation. ReadOptions ropt; -#ifndef ROCKSDB_LITE auto db_impl = static_cast_with_check(db_->GetRootDB()); const bool ww_snapshot = thread->rand.OneIn(10); const Snapshot* snapshot = ww_snapshot ? db_impl->GetSnapshotForWriteConflictBoundary() : db_->GetSnapshot(); -#else - const Snapshot* snapshot = db_->GetSnapshot(); -#endif // !ROCKSDB_LITE ropt.snapshot = snapshot; // Ideally, we want snapshot taking and timestamp generation to be atomic @@ -2359,32 +2496,43 @@ void StressTest::PrintEnv() const { fprintf(stdout, "Format version : %d\n", FLAGS_format_version); fprintf(stdout, "TransactionDB : %s\n", FLAGS_use_txn ? "true" : "false"); - if (FLAGS_use_txn) { -#ifndef ROCKSDB_LITE - fprintf(stdout, "Two write queues: : %s\n", - FLAGS_two_write_queues ? "true" : "false"); - fprintf(stdout, "Write policy : %d\n", - static_cast(FLAGS_txn_write_policy)); - if (static_cast(TxnDBWritePolicy::WRITE_PREPARED) == - FLAGS_txn_write_policy || - static_cast(TxnDBWritePolicy::WRITE_UNPREPARED) == - FLAGS_txn_write_policy) { - fprintf(stdout, "Snapshot cache bits : %d\n", - static_cast(FLAGS_wp_snapshot_cache_bits)); - fprintf(stdout, "Commit cache bits : %d\n", - static_cast(FLAGS_wp_commit_cache_bits)); - } - fprintf(stdout, "last cwb for recovery : %s\n", - FLAGS_use_only_the_last_commit_time_batch_for_recovery ? "true" - : "false"); -#endif // !ROCKSDB_LITE - } - -#ifndef ROCKSDB_LITE + fprintf(stdout, "TransactionDB Type : %s\n", + FLAGS_use_optimistic_txn ? 
"Optimistic" : "Pessimistic"); + if (FLAGS_use_optimistic_txn) { + fprintf(stdout, "OCC Validation Type : %d\n", + static_cast(FLAGS_occ_validation_policy)); + if (static_cast(OccValidationPolicy::kValidateParallel) == + FLAGS_occ_validation_policy) { + fprintf(stdout, "Share Lock Buckets : %s\n", + FLAGS_share_occ_lock_buckets ? "true" : "false"); + if (FLAGS_share_occ_lock_buckets) { + fprintf(stdout, "Lock Bucket Count : %d\n", + static_cast(FLAGS_occ_lock_bucket_count)); + } + } + } else { + fprintf(stdout, "Two write queues: : %s\n", + FLAGS_two_write_queues ? "true" : "false"); + fprintf(stdout, "Write policy : %d\n", + static_cast(FLAGS_txn_write_policy)); + if (static_cast(TxnDBWritePolicy::WRITE_PREPARED) == + FLAGS_txn_write_policy || + static_cast(TxnDBWritePolicy::WRITE_UNPREPARED) == + FLAGS_txn_write_policy) { + fprintf(stdout, "Snapshot cache bits : %d\n", + static_cast(FLAGS_wp_snapshot_cache_bits)); + fprintf(stdout, "Commit cache bits : %d\n", + static_cast(FLAGS_wp_commit_cache_bits)); + } + fprintf(stdout, "last cwb for recovery : %s\n", + FLAGS_use_only_the_last_commit_time_batch_for_recovery ? "true" + : "false"); + } + } + fprintf(stdout, "Stacked BlobDB : %s\n", FLAGS_use_blob_db ? "true" : "false"); -#endif // !ROCKSDB_LITE fprintf(stdout, "Read only mode : %s\n", FLAGS_read_only ? "true" : "false"); fprintf(stdout, "Atomic flush : %s\n", @@ -2444,6 +2592,12 @@ void StressTest::PrintEnv() const { FLAGS_subcompactions); fprintf(stdout, "Use MultiGet : %s\n", FLAGS_use_multiget ? "true" : "false"); + fprintf(stdout, "Use GetEntity : %s\n", + FLAGS_use_get_entity ? "true" : "false"); + fprintf(stdout, "Use MultiGetEntity : %s\n", + FLAGS_use_multi_get_entity ? "true" : "false"); + fprintf(stdout, "Verification only : %s\n", + FLAGS_verification_only ? "true" : "false"); const char* memtablerep = ""; switch (FLAGS_rep_factory) { @@ -2522,18 +2676,14 @@ void StressTest::PrintEnv() const { fprintf(stdout, "------------------------------------------------\n"); } -void StressTest::Open(SharedState* shared) { +void StressTest::Open(SharedState* shared, bool reopen) { assert(db_ == nullptr); -#ifndef ROCKSDB_LITE assert(txn_db_ == nullptr); -#else - (void)shared; -#endif + assert(optimistic_txn_db_ == nullptr); if (!InitializeOptionsFromFile(options_)) { - InitializeOptionsFromFlags(cache_, compressed_cache_, filter_policy_, - options_); + InitializeOptionsFromFlags(cache_, filter_policy_, options_); } - InitializeOptionsGeneral(cache_, compressed_cache_, filter_policy_, options_); + InitializeOptionsGeneral(cache_, filter_policy_, options_); if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) { fprintf(stderr, @@ -2640,21 +2790,24 @@ void StressTest::Open(SharedState* shared) { } options_.listeners.clear(); -#ifndef ROCKSDB_LITE options_.listeners.emplace_back(new DbStressListener( FLAGS_db, options_.db_paths, cf_descriptors, db_stress_listener_env)); -#endif // !ROCKSDB_LITE RegisterAdditionalListeners(); + // If this is for DB reopen, write error injection may have been enabled. + // Disable it here in case there is no open fault injection. + if (fault_fs_guard) { + fault_fs_guard->DisableWriteErrorInjection(); + } if (!FLAGS_use_txn) { - // Determine whether we need to ingest file metadata write failures + // Determine whether we need to inject file metadata write failures // during DB reopen. If it does, enable it. 
- // Only ingest metadata error if it is reopening, as initial open + // Only inject metadata error if it is reopening, as initial open // failure doesn't need to be handled. // TODO cover transaction DB is not covered in this fault test too. - bool ingest_meta_error = false; - bool ingest_write_error = false; - bool ingest_read_error = false; + bool inject_meta_error = false; + bool inject_write_error = false; + bool inject_read_error = false; if ((FLAGS_open_metadata_write_fault_one_in || FLAGS_open_write_fault_one_in || FLAGS_open_read_fault_one_in) && fault_fs_guard @@ -2665,30 +2818,29 @@ void StressTest::Open(SharedState* shared) { // WAL is durable. Buffering unsynced writes will cause false // positive in crash tests. Before we figure out a way to // solve it, skip WAL from failure injection. - fault_fs_guard->SetSkipDirectWritableTypes({kWalFile}); + fault_fs_guard->SetDirectWritableTypes({kWalFile}); } - ingest_meta_error = FLAGS_open_metadata_write_fault_one_in; - ingest_write_error = FLAGS_open_write_fault_one_in; - ingest_read_error = FLAGS_open_read_fault_one_in; - if (ingest_meta_error) { + inject_meta_error = FLAGS_open_metadata_write_fault_one_in; + inject_write_error = FLAGS_open_write_fault_one_in; + inject_read_error = FLAGS_open_read_fault_one_in; + if (inject_meta_error) { fault_fs_guard->EnableMetadataWriteErrorInjection(); fault_fs_guard->SetRandomMetadataWriteError( FLAGS_open_metadata_write_fault_one_in); } - if (ingest_write_error) { + if (inject_write_error) { fault_fs_guard->SetFilesystemDirectWritable(false); fault_fs_guard->EnableWriteErrorInjection(); fault_fs_guard->SetRandomWriteError( static_cast(FLAGS_seed), FLAGS_open_write_fault_one_in, - IOStatus::IOError("Injected Open Error"), + IOStatus::IOError("Injected Open Write Error"), /*inject_for_all_file_types=*/true, /*types=*/{}); } - if (ingest_read_error) { + if (inject_read_error) { fault_fs_guard->SetRandomReadError(FLAGS_open_read_fault_one_in); } } while (true) { -#ifndef ROCKSDB_LITE // StackableDB-based BlobDB if (FLAGS_use_blob_db) { blob_db::BlobDBOptions blob_db_options; @@ -2705,9 +2857,7 @@ void StressTest::Open(SharedState* shared) { if (s.ok()) { db_ = blob_db; } - } else -#endif // !ROCKSDB_LITE - { + } else { if (db_preload_finished_.load() && FLAGS_read_only) { s = DB::OpenForReadOnly(DBOptions(options_), FLAGS_db, cf_descriptors, &column_families_, &db_); @@ -2717,18 +2867,19 @@ void StressTest::Open(SharedState* shared) { } } - if (ingest_meta_error || ingest_write_error || ingest_read_error) { + if (inject_meta_error || inject_write_error || inject_read_error) { + // TODO: re-enable write error injection after reopen. Same for + // sync fault injection. fault_fs_guard->SetFilesystemDirectWritable(true); fault_fs_guard->DisableMetadataWriteErrorInjection(); fault_fs_guard->DisableWriteErrorInjection(); - fault_fs_guard->SetSkipDirectWritableTypes({}); + fault_fs_guard->SetDirectWritableTypes({}); fault_fs_guard->SetRandomReadError(0); if (s.ok()) { - // Ingested errors might happen in background compactions. We + // Injected errors might happen in background compactions. We // wait for all compactions to finish to make sure DB is in // clean state before executing queries. 
- s = static_cast_with_check(db_->GetRootDB()) - ->WaitForCompact(true /* wait_unscheduled */); + s = db_->GetRootDB()->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { for (auto cf : column_families_) { delete cf; @@ -2742,19 +2893,24 @@ void StressTest::Open(SharedState* shared) { // After failure to opening a DB due to IO error, retry should // successfully open the DB with correct data if no IO error shows // up. - ingest_meta_error = false; - ingest_write_error = false; - ingest_read_error = false; - - Random rand(static_cast(FLAGS_seed)); - if (rand.OneIn(2)) { - fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(), - nullptr); - } - if (rand.OneIn(3)) { - fault_fs_guard->DropUnsyncedFileData(); - } else if (rand.OneIn(2)) { - fault_fs_guard->DropRandomUnsyncedFileData(&rand); + inject_meta_error = false; + inject_write_error = false; + inject_read_error = false; + + // TODO: Unsynced data loss during DB reopen is not supported yet in + // stress test. Will need to recreate expected state if we decide + // to support unsynced data loss during DB reopen. + if (!reopen) { + Random rand(static_cast(FLAGS_seed)); + if (rand.OneIn(2)) { + fault_fs_guard->DeleteFilesCreatedAfterLastDirSync(IOOptions(), + nullptr); + } + if (rand.OneIn(3)) { + fault_fs_guard->DropUnsyncedFileData(); + } else if (rand.OneIn(2)) { + fault_fs_guard->DropRandomUnsyncedFileData(&rand); + } } continue; } @@ -2762,39 +2918,66 @@ void StressTest::Open(SharedState* shared) { break; } } else { -#ifndef ROCKSDB_LITE - TransactionDBOptions txn_db_options; - assert(FLAGS_txn_write_policy <= TxnDBWritePolicy::WRITE_UNPREPARED); - txn_db_options.write_policy = - static_cast(FLAGS_txn_write_policy); - if (FLAGS_unordered_write) { - assert(txn_db_options.write_policy == TxnDBWritePolicy::WRITE_PREPARED); - options_.unordered_write = true; - options_.two_write_queues = true; - txn_db_options.skip_concurrency_control = true; + if (FLAGS_use_optimistic_txn) { + OptimisticTransactionDBOptions optimistic_txn_db_options; + optimistic_txn_db_options.validate_policy = + static_cast(FLAGS_occ_validation_policy); + + if (FLAGS_share_occ_lock_buckets) { + optimistic_txn_db_options.shared_lock_buckets = + MakeSharedOccLockBuckets(FLAGS_occ_lock_bucket_count); + } else { + optimistic_txn_db_options.occ_lock_buckets = + FLAGS_occ_lock_bucket_count; + optimistic_txn_db_options.shared_lock_buckets = nullptr; + } + s = OptimisticTransactionDB::Open( + options_, optimistic_txn_db_options, FLAGS_db, cf_descriptors, + &column_families_, &optimistic_txn_db_); + if (!s.ok()) { + fprintf(stderr, "Error in opening the OptimisticTransactionDB [%s]\n", + s.ToString().c_str()); + fflush(stderr); + } + assert(s.ok()); + { + db_ = optimistic_txn_db_; + db_aptr_.store(optimistic_txn_db_, std::memory_order_release); + } } else { - options_.two_write_queues = FLAGS_two_write_queues; - } - txn_db_options.wp_snapshot_cache_bits = - static_cast(FLAGS_wp_snapshot_cache_bits); - txn_db_options.wp_commit_cache_bits = - static_cast(FLAGS_wp_commit_cache_bits); - PrepareTxnDbOptions(shared, txn_db_options); - s = TransactionDB::Open(options_, txn_db_options, FLAGS_db, - cf_descriptors, &column_families_, &txn_db_); - if (!s.ok()) { - fprintf(stderr, "Error in opening the TransactionDB [%s]\n", - s.ToString().c_str()); - fflush(stderr); - } - assert(s.ok()); + TransactionDBOptions txn_db_options; + assert(FLAGS_txn_write_policy <= TxnDBWritePolicy::WRITE_UNPREPARED); + txn_db_options.write_policy = + static_cast(FLAGS_txn_write_policy); + if 
(FLAGS_unordered_write) { + assert(txn_db_options.write_policy == + TxnDBWritePolicy::WRITE_PREPARED); + options_.unordered_write = true; + options_.two_write_queues = true; + txn_db_options.skip_concurrency_control = true; + } else { + options_.two_write_queues = FLAGS_two_write_queues; + } + txn_db_options.wp_snapshot_cache_bits = + static_cast(FLAGS_wp_snapshot_cache_bits); + txn_db_options.wp_commit_cache_bits = + static_cast(FLAGS_wp_commit_cache_bits); + PrepareTxnDbOptions(shared, txn_db_options); + s = TransactionDB::Open(options_, txn_db_options, FLAGS_db, + cf_descriptors, &column_families_, &txn_db_); + if (!s.ok()) { + fprintf(stderr, "Error in opening the TransactionDB [%s]\n", + s.ToString().c_str()); + fflush(stderr); + } + assert(s.ok()); - // Do not swap the order of the following. - { - db_ = txn_db_; - db_aptr_.store(txn_db_, std::memory_order_release); + // Do not swap the order of the following. + { + db_ = txn_db_; + db_aptr_.store(txn_db_, std::memory_order_release); + } } -#endif } if (!s.ok()) { fprintf(stderr, "Error in opening the DB [%s]\n", s.ToString().c_str()); @@ -2808,7 +2991,6 @@ void StressTest::Open(SharedState* shared) { // transactions, thus just disable secondary instance if we use // transaction. if (s.ok() && FLAGS_test_secondary && !FLAGS_use_txn) { -#ifndef ROCKSDB_LITE Options tmp_opts; // TODO(yanqin) support max_open_files != -1 for secondary instance. tmp_opts.max_open_files = -1; @@ -2818,20 +3000,11 @@ void StressTest::Open(SharedState* shared) { cf_descriptors, &cmp_cfhs_, &cmp_db_); assert(s.ok()); assert(cmp_cfhs_.size() == static_cast(FLAGS_column_families)); -#else - fprintf(stderr, "Secondary is not supported in RocksDBLite\n"); - exit(1); -#endif // !ROCKSDB_LITE } } else { -#ifndef ROCKSDB_LITE DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl); db_ = db_with_ttl; -#else - fprintf(stderr, "TTL is not supported in RocksDBLite\n"); - exit(1); -#endif } if (FLAGS_preserve_unverified_changes) { @@ -2855,7 +3028,6 @@ void StressTest::Open(SharedState* shared) { } void StressTest::Reopen(ThreadState* thread) { -#ifndef ROCKSDB_LITE // BG jobs in WritePrepared must be canceled first because i) they can access // the db via a callbac ii) they hold on to a snapshot and the upcoming // ::Close would complain about it. 
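When secondary-instance testing is enabled, the comparison DB above is opened with DB::OpenAsSecondary(), a read-only follower that tails the primary's MANIFEST and WALs. A bare-bones sketch with placeholder paths; as in the patch, max_open_files is set to -1, which secondary mode currently expects.

#include <string>

#include "rocksdb/db.h"

using namespace ROCKSDB_NAMESPACE;

// `primary_path` is the live DB directory; `secondary_path` is a scratch
// directory private to the follower. Both are placeholders here.
Status OpenFollower(const std::string& primary_path,
                    const std::string& secondary_path, DB** out) {
  Options options;
  options.max_open_files = -1;  // secondary mode currently expects this
  Status s = DB::OpenAsSecondary(options, primary_path, secondary_path, out);
  if (s.ok()) {
    // Call periodically to pick up writes made by the primary since open.
    s = (*out)->TryCatchUpWithPrimary();
  }
  return s;
}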
@@ -2868,16 +3040,12 @@ void StressTest::Reopen(ThreadState* thread) { bg_canceled = wait; } assert(!write_prepared || bg_canceled); -#else - (void)thread; -#endif for (auto cf : column_families_) { delete cf; } column_families_.clear(); -#ifndef ROCKSDB_LITE if (thread->rand.OneIn(2)) { Status s = db_->Close(); if (!s.ok()) { @@ -2886,18 +3054,18 @@ void StressTest::Reopen(ThreadState* thread) { } assert(s.ok()); } -#endif + assert((txn_db_ == nullptr && optimistic_txn_db_ == nullptr) || + (db_ == txn_db_ || db_ == optimistic_txn_db_)); delete db_; db_ = nullptr; -#ifndef ROCKSDB_LITE txn_db_ = nullptr; -#endif + optimistic_txn_db_ = nullptr; num_times_reopened_++; auto now = clock_->NowMicros(); fprintf(stdout, "%s Reopening database for the %dth time\n", clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_); - Open(thread->shared); + Open(thread->shared, /*reopen=*/true); if ((FLAGS_sync_fault_injection || FLAGS_disable_wal || FLAGS_manual_wal_flush_one_in > 0) && @@ -2996,12 +3164,6 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) { fprintf(stderr, "TransactionDB does not support timestamp yet.\n"); exit(1); } -#ifndef ROCKSDB_LITE - if (FLAGS_enable_blob_files || FLAGS_use_blob_db) { - fprintf(stderr, "BlobDB not supported with timestamp.\n"); - exit(1); - } -#endif // !ROCKSDB_LITE if (FLAGS_test_cf_consistency || FLAGS_test_batches_snapshots) { fprintf(stderr, "Due to per-key ts-seq ordering constraint, only the (default) " @@ -3016,11 +3178,14 @@ void CheckAndSetOptionsForUserTimestamp(Options& options) { } bool InitializeOptionsFromFile(Options& options) { -#ifndef ROCKSDB_LITE DBOptions db_options; + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.input_strings_escaped = true; + config_options.env = db_stress_env; std::vector cf_descriptors; if (!FLAGS_options_file.empty()) { - Status s = LoadOptionsFromFile(FLAGS_options_file, db_stress_env, + Status s = LoadOptionsFromFile(config_options, FLAGS_options_file, &db_options, &cf_descriptors); if (!s.ok()) { fprintf(stderr, "Unable to load options file %s --- %s\n", @@ -3031,17 +3196,11 @@ bool InitializeOptionsFromFile(Options& options) { options = Options(db_options, cf_descriptors[0].options); return true; } -#else - (void)options; - fprintf(stderr, "--options_file not supported in lite mode\n"); - exit(1); -#endif //! 
ROCKSDB_LITE return false; } void InitializeOptionsFromFlags( const std::shared_ptr& cache, - const std::shared_ptr& block_cache_compressed, const std::shared_ptr& filter_policy, Options& options) { BlockBasedTableOptions block_based_options; @@ -3054,7 +3213,6 @@ void InitializeOptionsFromFlags( static_cast(FLAGS_partition_pinning); block_based_options.metadata_cache_options.unpartitioned_pinning = static_cast(FLAGS_unpartitioned_pinning); - block_based_options.block_cache_compressed = block_cache_compressed; block_based_options.checksum = checksum_type_e; block_based_options.block_size = FLAGS_block_size; block_based_options.cache_usage_options.options_overrides.insert( @@ -3117,6 +3275,10 @@ void InitializeOptionsFromFlags( FLAGS_max_write_buffer_size_to_maintain; options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_prefix_bloom_size_ratio; + if (FLAGS_use_write_buffer_manager) { + options.write_buffer_manager.reset( + new WriteBufferManager(FLAGS_db_write_buffer_size, block_cache)); + } options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering; options.disable_auto_compactions = FLAGS_disable_auto_compactions; options.max_background_compactions = FLAGS_max_background_compactions; @@ -3173,6 +3335,9 @@ void InitializeOptionsFromFlags( "cannot be used because ZSTD 1.4.5+ is not linked with the binary." " zstd dictionary trainer will be used.\n"); } + if (FLAGS_compression_checksum) { + options.compression_opts.checksum = true; + } options.max_manifest_file_size = FLAGS_max_manifest_file_size; options.inplace_update_support = FLAGS_in_place_update; options.max_subcompactions = static_cast(FLAGS_subcompactions); @@ -3208,6 +3373,7 @@ void InitializeOptionsFromFlags( FLAGS_verify_sst_unique_id_in_manifest; options.memtable_protection_bytes_per_key = FLAGS_memtable_protection_bytes_per_key; + options.block_protection_bytes_per_key = FLAGS_block_protection_bytes_per_key; // Integrated BlobDB options.enable_blob_files = FLAGS_enable_blob_files; @@ -3266,26 +3432,15 @@ void InitializeOptionsFromFlags( case kSkipList: // no need to do anything break; -#ifndef ROCKSDB_LITE case kHashSkipList: options.memtable_factory.reset(NewHashSkipListRepFactory(10000)); break; case kVectorRep: options.memtable_factory.reset(new VectorRepFactory()); break; -#else - default: - fprintf(stderr, - "RocksdbLite only supports skip list mem table. 
Skip " - "--rep_factory\n"); -#endif // ROCKSDB_LITE } - if (FLAGS_use_full_merge_v1) { - options.merge_operator = MergeOperators::CreateDeprecatedPutOperator(); - } else { - options.merge_operator = MergeOperators::CreatePutOperator(); - } + InitializeMergeOperator(options); if (FLAGS_enable_compaction_filter) { options.compaction_filter_factory = @@ -3301,11 +3456,17 @@ void InitializeOptionsFromFlags( } options.allow_data_in_errors = FLAGS_allow_data_in_errors; + + options.enable_thread_tracking = FLAGS_enable_thread_tracking; + + options.memtable_max_range_deletions = FLAGS_memtable_max_range_deletions; + + options.bottommost_file_compaction_delay = + FLAGS_bottommost_file_compaction_delay; } void InitializeOptionsGeneral( const std::shared_ptr& cache, - const std::shared_ptr& block_cache_compressed, const std::shared_ptr& filter_policy, Options& options) { options.create_missing_column_families = true; @@ -3326,10 +3487,6 @@ void InitializeOptionsGeneral( if (FLAGS_cache_size > 0) { table_options->block_cache = cache; } - if (!table_options->block_cache_compressed && - FLAGS_compressed_cache_size > 0) { - table_options->block_cache_compressed = block_cache_compressed; - } if (!table_options->filter_policy) { table_options->filter_policy = filter_policy; } diff --git a/db_stress_tool/db_stress_test_base.h b/db_stress_tool/db_stress_test_base.h index 81fbbe24b15a..424570b33c9b 100644 --- a/db_stress_tool/db_stress_test_base.h +++ b/db_stress_tool/db_stress_test_base.h @@ -17,6 +17,7 @@ namespace ROCKSDB_NAMESPACE { class SystemClock; class Transaction; class TransactionDB; +class OptimisticTransactionDB; struct TransactionDBOptions; class StressTest { @@ -51,7 +52,6 @@ class StressTest { Status SetOptions(ThreadState* thread); -#ifndef ROCKSDB_LITE // For transactionsDB, there can be txns prepared but not yet committeed // right before previous stress run crash. // They will be recovered and processed through @@ -64,12 +64,14 @@ class StressTest { virtual void ProcessRecoveredPreparedTxnsHelper(Transaction* txn, SharedState* shared); - Status NewTxn(WriteOptions& write_opts, Transaction** txn); + // ExecuteTransaction is recommended instead + Status NewTxn(WriteOptions& write_opts, + std::unique_ptr* out_txn); + Status CommitTxn(Transaction& txn, ThreadState* thread = nullptr); - Status CommitTxn(Transaction* txn, ThreadState* thread = nullptr); - - Status RollbackTxn(Transaction* txn); -#endif + // Creates a transaction, executes `ops`, and tries to commit + Status ExecuteTransaction(WriteOptions& write_opts, ThreadState* thread, + std::function&& ops); virtual void MaybeClearOneColumnFamily(ThreadState* /* thread */) {} @@ -96,6 +98,15 @@ class StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) = 0; + virtual void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) = 0; + + virtual void TestMultiGetEntity(ThreadState* thread, + const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) = 0; + virtual Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, @@ -139,19 +150,11 @@ class StressTest { return column_families_[column_family_id]; } -#ifndef ROCKSDB_LITE // Generated a list of keys that close to boundaries of SST keys. // If there isn't any SST file in the DB, return empty list. 
std::vector GetWhiteBoxKeys(ThreadState* thread, DB* db, ColumnFamilyHandle* cfh, size_t num_keys); -#else // !ROCKSDB_LITE - std::vector GetWhiteBoxKeys(ThreadState*, DB*, - ColumnFamilyHandle*, size_t) { - // Not supported in LITE mode. - return {}; - } -#endif // !ROCKSDB_LITE // Given a key K, this creates an iterator which scans to K and then // does a random sequence of Next/Prev operations. @@ -191,6 +194,8 @@ class StressTest { const std::vector& rand_column_families, const std::vector& rand_keys); + virtual Status PrepareOptionsForRestoredDB(Options* options); + virtual Status TestCheckpoint(ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys); @@ -205,7 +210,6 @@ class StressTest { const std::string& keystr, uint64_t i); Status MaybeReleaseSnapshots(ThreadState* thread, uint64_t i); -#ifndef ROCKSDB_LITE Status VerifyGetLiveFiles() const; Status VerifyGetSortedWalFiles() const; Status VerifyGetCurrentWalFile() const; @@ -215,7 +219,6 @@ class StressTest { ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, const std::vector& rand_keys); -#endif // !ROCKSDB_LITE virtual Status TestCustomOperations( ThreadState* /*thread*/, @@ -223,7 +226,9 @@ class StressTest { return Status::NotSupported("TestCustomOperations() must be overridden"); } - void VerificationAbort(SharedState* shared, std::string msg, Status s) const; + void ProcessStatus(SharedState* shared, std::string msg, Status s) const; + + void VerificationAbort(SharedState* shared, std::string msg) const; void VerificationAbort(SharedState* shared, std::string msg, int cf, int64_t key) const; @@ -233,24 +238,21 @@ class StressTest { Slice value_from_expected) const; void VerificationAbort(SharedState* shared, int cf, int64_t key, - const Slice& value, const WideColumns& columns, - const WideColumns& expected_columns) const; + const Slice& value, const WideColumns& columns) const; - static std::string DebugString(const Slice& value, const WideColumns& columns, - const WideColumns& expected_columns); + static std::string DebugString(const Slice& value, + const WideColumns& columns); void PrintEnv() const; - void Open(SharedState* shared); + void Open(SharedState* shared, bool reopen = false); void Reopen(ThreadState* thread); virtual void RegisterAdditionalListeners() {} -#ifndef ROCKSDB_LITE virtual void PrepareTxnDbOptions(SharedState* /*shared*/, TransactionDBOptions& /*txn_db_opts*/) {} -#endif // Returns whether the timestamp of read_opts is updated. bool MaybeUseOlderTimestampForPointLookup(ThreadState* thread, @@ -266,9 +268,8 @@ class StressTest { std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; DB* db_; -#ifndef ROCKSDB_LITE TransactionDB* txn_db_; -#endif + OptimisticTransactionDB* optimistic_txn_db_; // Currently only used in MultiOpsTxnsStressTest std::atomic db_aptr_; @@ -298,7 +299,6 @@ extern bool InitializeOptionsFromFile(Options& options); // input arguments. extern void InitializeOptionsFromFlags( const std::shared_ptr& cache, - const std::shared_ptr& block_cache_compressed, const std::shared_ptr& filter_policy, Options& options); // Initialize `options` on which `InitializeOptionsFromFile()` and @@ -306,7 +306,7 @@ extern void InitializeOptionsFromFlags( // There are two cases. // Case 1: OPTIONS file is not specified. Command line arguments have been used // to initialize `options`. 
InitializeOptionsGeneral() will use -// `cache`, `block_cache_compressed` and `filter_policy` to initialize +// `cache` and `filter_policy` to initialize // corresponding fields of `options`. InitializeOptionsGeneral() will // also set up other fields of `options` so that stress test can run. // Examples include `create_if_missing` and @@ -317,14 +317,13 @@ extern void InitializeOptionsFromFlags( // case, if command line arguments indicate that the user wants to set // up such shared objects, e.g. block cache, compressed block cache, // row cache, filter policy, then InitializeOptionsGeneral() will honor -// the user's choice, thus passing `cache`, `block_cache_compressed`, +// the user's choice, thus passing `cache`, // `filter_policy` as input arguments. // // InitializeOptionsGeneral() must not overwrite fields of `options` loaded // from OPTIONS file. extern void InitializeOptionsGeneral( const std::shared_ptr& cache, - const std::shared_ptr& block_cache_compressed, const std::shared_ptr& filter_policy, Options& options); // If no OPTIONS file is specified, set up `options` so that we can test diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index fd28856b7311..9c24e2c42517 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -31,6 +31,7 @@ namespace ROCKSDB_NAMESPACE { namespace { static std::shared_ptr env_guard; static std::shared_ptr env_wrapper_guard; +static std::shared_ptr legacy_env_wrapper_guard; static std::shared_ptr dbsl_env_wrapper_guard; static std::shared_ptr fault_env_guard; @@ -87,11 +88,11 @@ int db_stress_tool(int argc, char** argv) { FaultInjectionTestFS* fs = new FaultInjectionTestFS(raw_env->GetFileSystem()); fault_fs_guard.reset(fs); - if (FLAGS_write_fault_one_in) { - fault_fs_guard->SetFilesystemDirectWritable(false); - } else { - fault_fs_guard->SetFilesystemDirectWritable(true); - } + // Set it to direct writable here to not lose files created during DB open + // when no open fault injection is not enabled. + // This will be overwritten in StressTest::Open() for open fault injection + // and in RunStressTestImpl() for proper write fault injection setup. + fault_fs_guard->SetFilesystemDirectWritable(true); fault_env_guard = std::make_shared(raw_env, fault_fs_guard); raw_env = fault_env_guard.get(); @@ -99,12 +100,16 @@ int db_stress_tool(int argc, char** argv) { env_wrapper_guard = std::make_shared( raw_env, std::make_shared(raw_env->GetFileSystem())); - if (!env_opts) { + if (!env_opts && !FLAGS_use_io_uring) { // If using the default Env (Posix), wrap DbStressEnvWrapper with the - // legacy EnvWrapper. This is a temporary fix for the ReadAsync interface - // not being properly supported with Posix and db_stress. The EnvWrapper + // legacy EnvWrapper. This is a workaround to prevent MultiGet and scans + // from failing when IO uring is disabled. The EnvWrapper // has a default implementation of ReadAsync that redirects to Read. 
- env_wrapper_guard = std::make_shared(env_wrapper_guard); + legacy_env_wrapper_guard = std::make_shared(raw_env); + env_wrapper_guard = std::make_shared( + legacy_env_wrapper_guard, + std::make_shared( + legacy_env_wrapper_guard->GetFileSystem())); } db_stress_env = env_wrapper_guard.get(); @@ -235,10 +240,10 @@ int db_stress_tool(int argc, char** argv) { FLAGS_secondaries_base = default_secondaries_path; } - if (FLAGS_best_efforts_recovery && !FLAGS_skip_verifydb && - !FLAGS_disable_wal) { + if (FLAGS_best_efforts_recovery && + !(FLAGS_skip_verifydb && FLAGS_disable_wal)) { fprintf(stderr, - "With best-efforts recovery, either skip_verifydb or disable_wal " + "With best-efforts recovery, skip_verifydb and disable_wal " "should be set to true.\n"); exit(1); } @@ -269,6 +274,14 @@ int db_stress_tool(int argc, char** argv) { CheckAndSetOptionsForMultiOpsTxnStressTest(); } + if (!FLAGS_use_txn && FLAGS_use_optimistic_txn) { + fprintf( + stderr, + "You cannot set use_optimistic_txn true while use_txn is false. Please " + "set use_txn true if you want to use OptimisticTransactionDB\n"); + exit(1); + } + if (FLAGS_create_timestamped_snapshot_one_in > 0) { if (!FLAGS_use_txn) { fprintf(stderr, "timestamped snapshot supported only in TransactionDB\n"); @@ -286,8 +299,8 @@ int db_stress_tool(int argc, char** argv) { exit(1); } - if (FLAGS_use_txn && FLAGS_sync_fault_injection && - FLAGS_txn_write_policy != 0) { + if (FLAGS_use_txn && !FLAGS_use_optimistic_txn && + FLAGS_sync_fault_injection && FLAGS_txn_write_policy != 0) { fprintf(stderr, "For TransactionDB, correctness testing with unsync data loss is " "currently compatible with only write committed policy\n"); @@ -295,12 +308,11 @@ int db_stress_tool(int argc, char** argv) { } if (FLAGS_use_put_entity_one_in > 0 && - (FLAGS_ingest_external_file_one_in > 0 || FLAGS_use_merge || - FLAGS_use_full_merge_v1 || FLAGS_use_txn || FLAGS_test_multi_ops_txns || + (FLAGS_use_full_merge_v1 || FLAGS_use_txn || FLAGS_test_multi_ops_txns || FLAGS_user_timestamp_size > 0)) { fprintf(stderr, - "PutEntity is currently incompatible with SstFileWriter, Merge," - " transactions, and user-defined timestamps\n"); + "Wide columns are incompatible with V1 Merge, transactions, and " + "user-defined timestamps\n"); exit(1); } diff --git a/db_stress_tool/db_stress_wide_merge_operator.cc b/db_stress_tool/db_stress_wide_merge_operator.cc new file mode 100644 index 000000000000..1fcfc304249b --- /dev/null +++ b/db_stress_tool/db_stress_wide_merge_operator.cc @@ -0,0 +1,51 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#ifdef GFLAGS + +#include "db_stress_tool/db_stress_wide_merge_operator.h" + +#include "db_stress_tool/db_stress_common.h" + +namespace ROCKSDB_NAMESPACE { + +bool DBStressWideMergeOperator::FullMergeV3( + const MergeOperationInputV3& merge_in, + MergeOperationOutputV3* merge_out) const { + assert(!merge_in.operand_list.empty()); + assert(merge_out); + + const Slice& latest = merge_in.operand_list.back(); + + if (latest.size() < sizeof(uint32_t)) { + return false; + } + + const uint32_t value_base = GetValueBase(latest); + + if (FLAGS_use_put_entity_one_in == 0 || + (value_base % FLAGS_use_put_entity_one_in) != 0) { + merge_out->new_value = latest; + return true; + } + + const auto columns = GenerateWideColumns(value_base, latest); + + merge_out->new_value = MergeOperationOutputV3::NewColumns(); + auto& new_columns = + std::get(merge_out->new_value); + new_columns.reserve(columns.size()); + + for (const auto& column : columns) { + new_columns.emplace_back(column.name().ToString(), + column.value().ToString()); + } + + return true; +} + +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/db_stress_tool/db_stress_wide_merge_operator.h b/db_stress_tool/db_stress_wide_merge_operator.h new file mode 100644 index 000000000000..cba4f6b6b8a7 --- /dev/null +++ b/db_stress_tool/db_stress_wide_merge_operator.h @@ -0,0 +1,27 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { + +// A test merge operator that implements the wide-column aware FullMergeV3 +// interface. Similarly to the simple "put" type merge operators, the merge +// result is based on the last merge operand; however, the merge result can +// potentially be a wide-column entity, depending on the value base encoded into +// the merge operand and the value of the "use_put_entity_one_in" stress test +// option. Following the same rule as for writes ensures that the queries +// issued by the validation logic receive the expected results. +class DBStressWideMergeOperator : public MergeOperator { + public: + bool FullMergeV3(const MergeOperationInputV3& merge_in, + MergeOperationOutputV3* merge_out) const override; + + const char* Name() const override { return "DBStressWideMergeOperator"; } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/expected_state.cc b/db_stress_tool/expected_state.cc index d08403b7617e..b483e154c451 100644 --- a/db_stress_tool/expected_state.cc +++ b/db_stress_tool/expected_state.cc @@ -3,88 +3,125 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
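The DBStressWideMergeOperator introduced above resolves a merge by looking only at the last operand, optionally expanding it into a wide-column entity. Registering it works like any other merge operator; a minimal usage sketch (the surrounding function is illustrative, not part of the tool):

    #include <memory>
    #include "db_stress_tool/db_stress_wide_merge_operator.h"
    #include "rocksdb/options.h"

    using ROCKSDB_NAMESPACE::DBStressWideMergeOperator;
    using ROCKSDB_NAMESPACE::Options;

    void UseWideMergeOperator(Options* options) {
      // Merge() calls now resolve to either the last operand or a generated
      // wide-column entity, depending on the operand's value base and the
      // use_put_entity_one_in stress option.
      options->merge_operator = std::make_shared<DBStressWideMergeOperator>();
    }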
+#include #ifdef GFLAGS -#include "db_stress_tool/expected_state.h" - #include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "db_stress_tool/db_stress_common.h" #include "db_stress_tool/db_stress_shared_state.h" +#include "db_stress_tool/expected_state.h" #include "rocksdb/trace_reader_writer.h" #include "rocksdb/trace_record_result.h" namespace ROCKSDB_NAMESPACE { - ExpectedState::ExpectedState(size_t max_key, size_t num_column_families) : max_key_(max_key), num_column_families_(num_column_families), values_(nullptr) {} void ExpectedState::ClearColumnFamily(int cf) { - std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */), - SharedState::DELETION_SENTINEL); + const uint32_t del_mask = ExpectedValue::GetDelMask(); + std::fill(&Value(cf, 0 /* key */), &Value(cf + 1, 0 /* key */), del_mask); } -void ExpectedState::Put(int cf, int64_t key, uint32_t value_base, - bool pending) { - if (!pending) { - // prevent expected-value update from reordering before Write - std::atomic_thread_fence(std::memory_order_release); - } - Value(cf, key).store(pending ? SharedState::UNKNOWN_SENTINEL : value_base, - std::memory_order_relaxed); - if (pending) { - // prevent Write from reordering before expected-value update - std::atomic_thread_fence(std::memory_order_release); - } +void ExpectedState::Precommit(int cf, int64_t key, const ExpectedValue& value) { + Value(cf, key).store(value.Read()); + // To prevent low-level instruction reordering that results + // in db write happens before setting pending state in expected value + std::atomic_thread_fence(std::memory_order_release); } -uint32_t ExpectedState::Get(int cf, int64_t key) const { - return Value(cf, key); +PendingExpectedValue ExpectedState::PreparePut(int cf, int64_t key) { + ExpectedValue expected_value = Load(cf, key); + const ExpectedValue orig_expected_value = expected_value; + expected_value.Put(true /* pending */); + const ExpectedValue pending_expected_value = expected_value; + expected_value.Put(false /* pending */); + const ExpectedValue final_expected_value = expected_value; + Precommit(cf, key, pending_expected_value); + return PendingExpectedValue(&Value(cf, key), orig_expected_value, + final_expected_value); } -bool ExpectedState::Delete(int cf, int64_t key, bool pending) { - if (Value(cf, key) == SharedState::DELETION_SENTINEL) { - return false; - } - Put(cf, key, SharedState::DELETION_SENTINEL, pending); - return true; +ExpectedValue ExpectedState::Get(int cf, int64_t key) { return Load(cf, key); } + +PendingExpectedValue ExpectedState::PrepareDelete(int cf, int64_t key, + bool* prepared) { + ExpectedValue expected_value = Load(cf, key); + const ExpectedValue orig_expected_value = expected_value; + bool res = expected_value.Delete(true /* pending */); + if (prepared) { + *prepared = res; + } + if (!res) { + return PendingExpectedValue(&Value(cf, key), orig_expected_value, + orig_expected_value); + } + const ExpectedValue pending_expected_value = expected_value; + expected_value.Delete(false /* pending */); + const ExpectedValue final_expected_value = expected_value; + Precommit(cf, key, pending_expected_value); + return PendingExpectedValue(&Value(cf, key), orig_expected_value, + final_expected_value); } -bool ExpectedState::SingleDelete(int cf, int64_t key, bool pending) { - return Delete(cf, key, pending); +PendingExpectedValue ExpectedState::PrepareSingleDelete(int cf, int64_t key) { + return PrepareDelete(cf, key); } -int ExpectedState::DeleteRange(int cf, int64_t begin_key, int64_t 
end_key, - bool pending) { - int covered = 0; +std::vector ExpectedState::PrepareDeleteRange( + int cf, int64_t begin_key, int64_t end_key) { + std::vector pending_expected_values; for (int64_t key = begin_key; key < end_key; ++key) { - if (Delete(cf, key, pending)) { - ++covered; + bool prepared = false; + PendingExpectedValue pending_expected_value = + PrepareDelete(cf, key, &prepared); + if (prepared) { + pending_expected_values.push_back(pending_expected_value); } } - return covered; + return pending_expected_values; } bool ExpectedState::Exists(int cf, int64_t key) { - // UNKNOWN_SENTINEL counts as exists. That assures a key for which overwrite - // is disallowed can't be accidentally added a second time, in which case - // SingleDelete wouldn't be able to properly delete the key. It does allow - // the case where a SingleDelete might be added which covers nothing, but - // that's not a correctness issue. - uint32_t expected_value = Value(cf, key).load(); - return expected_value != SharedState::DELETION_SENTINEL; + return Load(cf, key).Exists(); } void ExpectedState::Reset() { + const uint32_t del_mask = ExpectedValue::GetDelMask(); for (size_t i = 0; i < num_column_families_; ++i) { for (size_t j = 0; j < max_key_; ++j) { - Value(static_cast(i), j) - .store(SharedState::DELETION_SENTINEL, std::memory_order_relaxed); + Value(static_cast(i), j).store(del_mask, std::memory_order_relaxed); } } } +void ExpectedState::SyncPut(int cf, int64_t key, uint32_t value_base) { + ExpectedValue expected_value = Load(cf, key); + expected_value.SyncPut(value_base); + Value(cf, key).store(expected_value.Read()); +} + +void ExpectedState::SyncPendingPut(int cf, int64_t key) { + ExpectedValue expected_value = Load(cf, key); + expected_value.SyncPendingPut(); + Value(cf, key).store(expected_value.Read()); +} + +void ExpectedState::SyncDelete(int cf, int64_t key) { + ExpectedValue expected_value = Load(cf, key); + expected_value.SyncDelete(); + Value(cf, key).store(expected_value.Read()); +} + +void ExpectedState::SyncDeleteRange(int cf, int64_t begin_key, + int64_t end_key) { + for (int64_t key = begin_key; key < end_key; ++key) { + SyncDelete(cf, key); + } +} + FileExpectedState::FileExpectedState(std::string expected_state_file_path, size_t max_key, size_t num_column_families) : ExpectedState(max_key, num_column_families), @@ -254,7 +291,6 @@ Status FileExpectedStateManager::Open() { return s; } -#ifndef ROCKSDB_LITE Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { SequenceNumber seqno = db->GetLatestSequenceNumber(); @@ -322,18 +358,11 @@ Status FileExpectedStateManager::SaveAtAndAfter(DB* db) { } return s; } -#else // ROCKSDB_LITE -Status FileExpectedStateManager::SaveAtAndAfter(DB* /* db */) { - return Status::NotSupported(); -} -#endif // ROCKSDB_LITE bool FileExpectedStateManager::HasHistory() { return saved_seqno_ != kMaxSequenceNumber; } -#ifndef ROCKSDB_LITE - namespace { // An `ExpectedStateTraceRecordHandler` applies a configurable number of @@ -392,7 +421,7 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, if (!GetIntVal(key.ToString(), &key_id)) { return Status::Corruption("unable to parse key", key.ToString()); } - uint32_t value_id = GetValueBase(value); + uint32_t value_base = GetValueBase(value); bool should_buffer_write = !(buffered_writes_ == nullptr); if (should_buffer_write) { @@ -400,8 +429,7 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, key, value); } - state_->Put(column_family_id, static_cast(key_id), value_id, - false /* 
pending */); + state_->SyncPut(column_family_id, static_cast(key_id), value_base); ++num_write_ops_; return Status::OK(); } @@ -423,16 +451,7 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, entity.ToString(/* hex */ true)); } - if (columns.empty() || columns[0].name() != kDefaultWideColumnName) { - return Status::Corruption("Cannot find default column in entity", - entity.ToString(/* hex */ true)); - } - - const Slice& value_of_default = columns[0].value(); - - const uint32_t value_base = GetValueBase(value_of_default); - - if (columns != GenerateExpectedWideColumns(value_base, value_of_default)) { + if (!VerifyWideColumns(columns)) { return Status::Corruption("Wide columns in entity inconsistent", entity.ToString(/* hex */ true)); } @@ -442,8 +461,10 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, column_family_id, key, columns); } - state_->Put(column_family_id, static_cast(key_id), value_base, - false /* pending */); + const uint32_t value_base = + GetValueBase(WideColumnsHelper::GetDefaultColumn(columns)); + + state_->SyncPut(column_family_id, static_cast(key_id), value_base); ++num_write_ops_; @@ -465,8 +486,7 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, column_family_id, key); } - state_->Delete(column_family_id, static_cast(key_id), - false /* pending */); + state_->SyncDelete(column_family_id, static_cast(key_id)); ++num_write_ops_; return Status::OK(); } @@ -510,8 +530,9 @@ class ExpectedStateTraceRecordHandler : public TraceRecord::Handler, buffered_writes_.get(), column_family_id, begin_key, end_key); } - state_->DeleteRange(column_family_id, static_cast(begin_key_id), - static_cast(end_key_id), false /* pending */); + state_->SyncDeleteRange(column_family_id, + static_cast(begin_key_id), + static_cast(end_key_id)); ++num_write_ops_; return Status::OK(); } @@ -683,11 +704,6 @@ Status FileExpectedStateManager::Restore(DB* db) { } return s; } -#else // ROCKSDB_LITE -Status FileExpectedStateManager::Restore(DB* /* db */) { - return Status::NotSupported(); -} -#endif // ROCKSDB_LITE Status FileExpectedStateManager::Clean() { std::vector expected_state_dir_children; diff --git a/db_stress_tool/expected_state.h b/db_stress_tool/expected_state.h index 41d747e7696c..f3f924cb8808 100644 --- a/db_stress_tool/expected_state.h +++ b/db_stress_tool/expected_state.h @@ -13,6 +13,7 @@ #include #include "db/dbformat.h" +#include "db_stress_tool/expected_value.h" #include "file/file_util.h" #include "rocksdb/db.h" #include "rocksdb/env.h" @@ -22,9 +23,8 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { - -// An `ExpectedState` provides read/write access to expected values for every -// key. +// `ExpectedState` provides read/write access to expected values stored in +// `ExpectedState` for every key. class ExpectedState { public: explicit ExpectedState(size_t max_key, size_t num_column_families); @@ -38,43 +38,79 @@ class ExpectedState { // Requires external locking covering all keys in `cf`. void ClearColumnFamily(int cf); - // @param pending True if the update may have started but is not yet - // guaranteed finished. This is useful for crash-recovery testing when the - // process may crash before updating the expected values array. + // Prepare a Put that will be started but not finished yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value // - // Requires external locking covering `key` in `cf`. 
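As the handler changes above show, trace replay now funnels writes through the Sync* entry points, which overwrite the expected value unconditionally and clear any pending bits left over from the crashed run; replay is single threaded, so no per-key locking is involved. A rough sketch of the replay-side put path (local names are hypothetical; error handling omitted):

    #include <cstdint>
    #include "db_stress_tool/expected_state.h"

    // Sketch: applying one traced Put to the expected state during replay.
    // `state` is the ExpectedState being rebuilt; `key_id` and `value_base`
    // were parsed from the traced key/value.
    void ReplayOnePut(ROCKSDB_NAMESPACE::ExpectedState* state, int cf,
                      int64_t key_id, uint32_t value_base) {
      // SyncPut stores the value base and clears the deleted flag as well as
      // any stale pending-write/pending-delete bits.
      state->SyncPut(cf, key_id, value_base);
    }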
- void Put(int cf, int64_t key, uint32_t value_base, bool pending); + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PreparePut(int cf, int64_t key); - // Requires external locking covering `key` in `cf`. - uint32_t Get(int cf, int64_t key) const; + // Does not requires external locking. + ExpectedValue Get(int cf, int64_t key); - // @param pending See comment above Put() - // Returns true if the key was not yet deleted. + // Prepare a Delete that will be started but not finished yet + // This is useful for crash-recovery testing when the process may crash + // before updating the corresponding expected value // - // Requires external locking covering `key` in `cf`. - bool Delete(int cf, int64_t key, bool pending); + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareDelete(int cf, int64_t key, + bool* prepared = nullptr); + + // Requires external locking covering `key` in `cf` to prevent concurrent + // write or delete to the same `key`. + PendingExpectedValue PrepareSingleDelete(int cf, int64_t key); + + // Requires external locking covering keys in `[begin_key, end_key)` in `cf` + // to prevent concurrent write or delete to the same `key`. + std::vector PrepareDeleteRange(int cf, + int64_t begin_key, + int64_t end_key); + + // Update the expected value for start of an incomplete write or delete + // operation on the key assoicated with this expected value + void Precommit(int cf, int64_t key, const ExpectedValue& value); + + // Requires external locking covering `key` in `cf` to prevent concurrent + // delete to the same `key`. + bool Exists(int cf, int64_t key); - // @param pending See comment above Put() - // Returns true if the key was not yet deleted. + // Sync the `value_base` to the corresponding expected value // - // Requires external locking covering `key` in `cf`. - bool SingleDelete(int cf, int64_t key, bool pending); + // Requires external locking covering `key` in `cf` or be in single thread + // to prevent concurrent write or delete to the same `key` + void SyncPut(int cf, int64_t key, uint32_t value_base); - // @param pending See comment above Put() - // Returns number of keys deleted by the call. + // Sync the corresponding expected value to be pending Put // - // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. - int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending); + // Requires external locking covering `key` in `cf` or be in single thread + // to prevent concurrent write or delete to the same `key` + void SyncPendingPut(int cf, int64_t key); - // Requires external locking covering `key` in `cf`. - bool Exists(int cf, int64_t key); + // Sync the corresponding expected value to be deleted + // + // Requires external locking covering `key` in `cf` or be in single thread + // to prevent concurrent write or delete to the same `key` + void SyncDelete(int cf, int64_t key); + + // Sync the corresponding expected values to be deleted + // + // Requires external locking covering keys in `[begin_key, end_key)` in `cf` + // to prevent concurrent write or delete to the same `key` + void SyncDeleteRange(int cf, int64_t begin_key, int64_t end_key); private: - // Requires external locking covering `key` in `cf`. + // Does not requires external locking. 
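Putting the declarations above together, a writer thread now follows a three-step protocol: reserve the update with PreparePut() (which publishes the pending-write bit before anything touches the DB), perform the DB write, then Commit() the returned PendingExpectedValue. A condensed sketch of that flow, assuming the per-key lock is already held and that SharedState forwards PreparePut() to the expected-state manager as the wrappers above do (error handling is simplified; in the real tool a failed write is fatal rather than rolled back):

    // Sketch of the new write path for a single key.
    Status DoOnePut(DB* db, ColumnFamilyHandle* cfh, SharedState* shared,
                    int cf, int64_t key_id, const Slice& key, const Slice& value) {
      // 1. Mark the expected value as pending; remember the value it will take
      //    once the write is known to have completed.
      PendingExpectedValue pending = shared->PreparePut(cf, key_id);

      // 2. Do the DB write. If the process crashes here, recovery sees the
      //    pending bit and accepts either the old or the new value.
      Status s = db->Put(WriteOptions(), cfh, key, value);

      // 3. Publish the final expected value only after the write succeeded.
      if (s.ok()) {
        pending.Commit();
      }
      return s;
    }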
std::atomic& Value(int cf, int64_t key) const { return values_[cf * max_key_ + key]; } + // Does not requires external locking + ExpectedValue Load(int cf, int64_t key) const { + return ExpectedValue(Value(cf, key).load()); + } + const size_t max_key_; const size_t num_column_families_; @@ -160,45 +196,52 @@ class ExpectedStateManager { // Requires external locking covering all keys in `cf`. void ClearColumnFamily(int cf) { return latest_->ClearColumnFamily(cf); } - // @param pending True if the update may have started but is not yet - // guaranteed finished. This is useful for crash-recovery testing when the - // process may crash before updating the expected values array. - // - // Requires external locking covering `key` in `cf`. - void Put(int cf, int64_t key, uint32_t value_base, bool pending) { - return latest_->Put(cf, key, value_base, pending); + // See ExpectedState::PreparePut() + PendingExpectedValue PreparePut(int cf, int64_t key) { + return latest_->PreparePut(cf, key); } - // Requires external locking covering `key` in `cf`. - uint32_t Get(int cf, int64_t key) const { return latest_->Get(cf, key); } + // See ExpectedState::Get() + ExpectedValue Get(int cf, int64_t key) { return latest_->Get(cf, key); } - // @param pending See comment above Put() - // Returns true if the key was not yet deleted. - // - // Requires external locking covering `key` in `cf`. - bool Delete(int cf, int64_t key, bool pending) { - return latest_->Delete(cf, key, pending); + // See ExpectedState::PrepareDelete() + PendingExpectedValue PrepareDelete(int cf, int64_t key) { + return latest_->PrepareDelete(cf, key); } - // @param pending See comment above Put() - // Returns true if the key was not yet deleted. - // - // Requires external locking covering `key` in `cf`. - bool SingleDelete(int cf, int64_t key, bool pending) { - return latest_->SingleDelete(cf, key, pending); + // See ExpectedState::PrepareSingleDelete() + PendingExpectedValue PrepareSingleDelete(int cf, int64_t key) { + return latest_->PrepareSingleDelete(cf, key); } - // @param pending See comment above Put() - // Returns number of keys deleted by the call. - // - // Requires external locking covering keys in `[begin_key, end_key)` in `cf`. - int DeleteRange(int cf, int64_t begin_key, int64_t end_key, bool pending) { - return latest_->DeleteRange(cf, begin_key, end_key, pending); + // See ExpectedState::PrepareDeleteRange() + std::vector PrepareDeleteRange(int cf, + int64_t begin_key, + int64_t end_key) { + return latest_->PrepareDeleteRange(cf, begin_key, end_key); } - // Requires external locking covering `key` in `cf`. + // See ExpectedState::Exists() bool Exists(int cf, int64_t key) { return latest_->Exists(cf, key); } + // See ExpectedState::SyncPut() + void SyncPut(int cf, int64_t key, uint32_t value_base) { + return latest_->SyncPut(cf, key, value_base); + } + + // See ExpectedState::SyncPendingPut() + void SyncPendingPut(int cf, int64_t key) { + return latest_->SyncPendingPut(cf, key); + } + + // See ExpectedState::SyncDelete() + void SyncDelete(int cf, int64_t key) { return latest_->SyncDelete(cf, key); } + + // See ExpectedState::SyncDeleteRange() + void SyncDeleteRange(int cf, int64_t begin_key, int64_t end_key) { + return latest_->SyncDeleteRange(cf, begin_key, end_key); + } + protected: const size_t max_key_; const size_t num_column_families_; @@ -281,7 +324,6 @@ class AnonExpectedStateManager : public ExpectedStateManager { // member function. 
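Note the updated comments above: Get() no longer needs external locking because it reduces to Load(), a single atomic load whose 32-bit result is copied into a standalone ExpectedValue, so readers always operate on a private snapshot. A tiny self-contained illustration of that pattern (not the tool's code):

    #include <atomic>
    #include <cstdint>

    // Stand-in for the Load() pattern: one atomic load, then all decoding
    // happens on the copied word rather than on the shared slot.
    struct SnapshotValue {
      explicit SnapshotValue(uint32_t v) : word(v) {}
      bool IsDeleted() const { return (word & (uint32_t{1} << 31)) != 0; }
      uint32_t word;
    };

    SnapshotValue LoadSnapshot(const std::atomic<uint32_t>& slot) {
      return SnapshotValue(slot.load());  // no lock needed just to read
    }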
Status Open() override; }; - } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/expected_value.cc b/db_stress_tool/expected_value.cc new file mode 100644 index 000000000000..d280055a2560 --- /dev/null +++ b/db_stress_tool/expected_value.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#include "db_stress_tool/expected_value.h" + +#include + +namespace ROCKSDB_NAMESPACE { +void ExpectedValue::Put(bool pending) { + if (pending) { + SetPendingWrite(); + } else { + SetValueBase(NextValueBase()); + ClearDeleted(); + ClearPendingWrite(); + } +} + +bool ExpectedValue::Delete(bool pending) { + if (!Exists()) { + return false; + } + if (pending) { + SetPendingDel(); + } else { + SetDelCounter(NextDelCounter()); + SetDeleted(); + ClearPendingDel(); + } + return true; +} + +void ExpectedValue::SyncPut(uint32_t value_base) { + assert(ExpectedValue::IsValueBaseValid(value_base)); + + SetValueBase(value_base); + ClearDeleted(); + ClearPendingWrite(); + + // This is needed in case crash happens during a pending delete of the key + // assocated with this expected value + ClearPendingDel(); +} + +void ExpectedValue::SyncPendingPut() { Put(true /* pending */); } + +void ExpectedValue::SyncDelete() { + Delete(false /* pending */); + // This is needed in case crash happens during a pending write of the key + // assocated with this expected value + ClearPendingWrite(); +} + +uint32_t ExpectedValue::GetFinalValueBase() const { + return PendingWrite() ? NextValueBase() : GetValueBase(); +} + +uint32_t ExpectedValue::GetFinalDelCounter() const { + return PendingDelete() ? 
NextDelCounter() : GetDelCounter(); +} + +bool ExpectedValueHelper::MustHaveNotExisted( + ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value) { + const bool pre_read_expected_deleted = pre_read_expected_value.IsDeleted(); + + const uint32_t pre_read_expected_value_base = + pre_read_expected_value.GetValueBase(); + + const uint32_t post_read_expected_final_value_base = + post_read_expected_value.GetFinalValueBase(); + + const bool during_read_no_write_happened = + (pre_read_expected_value_base == post_read_expected_final_value_base); + return pre_read_expected_deleted && during_read_no_write_happened; +} + +bool ExpectedValueHelper::MustHaveExisted( + ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value) { + const bool pre_read_expected_not_deleted = + !pre_read_expected_value.IsDeleted(); + + const uint32_t pre_read_expected_del_counter = + pre_read_expected_value.GetDelCounter(); + const uint32_t post_read_expected_final_del_counter = + post_read_expected_value.GetFinalDelCounter(); + + const bool during_read_no_delete_happened = + (pre_read_expected_del_counter == post_read_expected_final_del_counter); + + return pre_read_expected_not_deleted && during_read_no_delete_happened; +} + +bool ExpectedValueHelper::InExpectedValueBaseRange( + uint32_t value_base, ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value) { + assert(ExpectedValue::IsValueBaseValid(value_base)); + + const uint32_t pre_read_expected_value_base = + pre_read_expected_value.GetValueBase(); + const uint32_t post_read_expected_final_value_base = + post_read_expected_value.GetFinalValueBase(); + + if (pre_read_expected_value_base <= post_read_expected_final_value_base) { + const uint32_t lower_bound = pre_read_expected_value_base; + const uint32_t upper_bound = post_read_expected_final_value_base; + return lower_bound <= value_base && value_base <= upper_bound; + } else { + const uint32_t upper_bound_1 = post_read_expected_final_value_base; + const uint32_t lower_bound_2 = pre_read_expected_value_base; + const uint32_t upper_bound_2 = ExpectedValue::GetValueBaseMask(); + return (value_base <= upper_bound_1) || + (lower_bound_2 <= value_base && value_base <= upper_bound_2); + } +} +} // namespace ROCKSDB_NAMESPACE +#endif // GFLAGS diff --git a/db_stress_tool/expected_value.h b/db_stress_tool/expected_value.h new file mode 100644 index 000000000000..338afc049146 --- /dev/null +++ b/db_stress_tool/expected_value.h @@ -0,0 +1,208 @@ +// Copyright (c) 2021-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#ifdef GFLAGS + +#pragma once + +#include + +#include +#include +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +// `ExpectedValue` represents the expected value of a key used in db stress, +// which provides APIs to obtain various information e.g, value base, existence, +// pending operation status and APIs to edit expected value. +// +// This class is not thread-safe. 
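The range check implemented above has to tolerate the 15-bit value base wrapping around between the pre-read and post-read snapshots. A small self-contained example exercising both branches (values are arbitrary; assumes the new expected_value.h header is on the include path):

    #include <cassert>
    #include "db_stress_tool/expected_value.h"

    using ROCKSDB_NAMESPACE::ExpectedValue;
    using ROCKSDB_NAMESPACE::ExpectedValueHelper;

    void ValueBaseRangeExamples() {
      // No wraparound: any value base between 100 (pre) and 103 (post) passes.
      ExpectedValue pre(100), post(103);
      assert(ExpectedValueHelper::InExpectedValueBaseRange(101, pre, post));
      assert(!ExpectedValueHelper::InExpectedValueBaseRange(104, pre, post));

      // Wraparound: the value base wrapped from 0x7ffe past the 0x7fff mask to 1,
      // so both ends of the circular interval are accepted.
      ExpectedValue pre_wrap(0x7ffe), post_wrap(1);
      assert(ExpectedValueHelper::InExpectedValueBaseRange(0x7fff, pre_wrap, post_wrap));
      assert(ExpectedValueHelper::InExpectedValueBaseRange(0, pre_wrap, post_wrap));
      assert(!ExpectedValueHelper::InExpectedValueBaseRange(100, pre_wrap, post_wrap));
    }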
+class ExpectedValue { + public: + static uint32_t GetValueBaseMask() { return VALUE_BASE_MASK; } + static uint32_t GetValueBaseDelta() { return VALUE_BASE_DELTA; } + static uint32_t GetDelCounterDelta() { return DEL_COUNTER_DELTA; } + static uint32_t GetDelMask() { return DEL_MASK; } + static bool IsValueBaseValid(uint32_t value_base) { + return IsValuePartValid(value_base, VALUE_BASE_MASK); + } + + ExpectedValue() : expected_value_(DEL_MASK) {} + + explicit ExpectedValue(uint32_t expected_value) + : expected_value_(expected_value) {} + + bool Exists() const { return PendingWrite() || !IsDeleted(); } + + uint32_t Read() const { return expected_value_; } + + void Put(bool pending); + + bool Delete(bool pending); + + void SyncPut(uint32_t value_base); + + void SyncPendingPut(); + + void SyncDelete(); + + uint32_t GetValueBase() const { return GetValuePart(VALUE_BASE_MASK); } + + uint32_t NextValueBase() const { + return GetIncrementedValuePart(VALUE_BASE_MASK, VALUE_BASE_DELTA); + } + + void SetValueBase(uint32_t new_value_base) { + SetValuePart(VALUE_BASE_MASK, new_value_base); + } + + bool PendingWrite() const { + const uint32_t pending_write = GetValuePart(PENDING_WRITE_MASK); + return pending_write != 0; + } + + void SetPendingWrite() { + SetValuePart(PENDING_WRITE_MASK, PENDING_WRITE_MASK); + } + + void ClearPendingWrite() { ClearValuePart(PENDING_WRITE_MASK); } + + uint32_t GetDelCounter() const { return GetValuePart(DEL_COUNTER_MASK); } + + uint32_t NextDelCounter() const { + return GetIncrementedValuePart(DEL_COUNTER_MASK, DEL_COUNTER_DELTA); + } + + void SetDelCounter(uint32_t new_del_counter) { + SetValuePart(DEL_COUNTER_MASK, new_del_counter); + } + + bool PendingDelete() const { + const uint32_t pending_del = GetValuePart(PENDING_DEL_MASK); + return pending_del != 0; + } + + void SetPendingDel() { SetValuePart(PENDING_DEL_MASK, PENDING_DEL_MASK); } + + void ClearPendingDel() { ClearValuePart(PENDING_DEL_MASK); } + + bool IsDeleted() const { + const uint32_t deleted = GetValuePart(DEL_MASK); + return deleted != 0; + } + + void SetDeleted() { SetValuePart(DEL_MASK, DEL_MASK); } + + void ClearDeleted() { ClearValuePart(DEL_MASK); } + + uint32_t GetFinalValueBase() const; + + uint32_t GetFinalDelCounter() const; + + private: + static bool IsValuePartValid(uint32_t value_part, uint32_t value_part_mask) { + return (value_part & (~value_part_mask)) == 0; + } + + // The 32-bit expected_value_ is divided into following parts: + // Bit 0 - 14: value base + static constexpr uint32_t VALUE_BASE_MASK = 0x7fff; + static constexpr uint32_t VALUE_BASE_DELTA = 1; + // Bit 15: whether write to this value base is pending (0 equals `false`) + static constexpr uint32_t PENDING_WRITE_MASK = (uint32_t)1 << 15; + // Bit 16 - 29: deletion counter (i.e, number of times this value base has + // been deleted) + static constexpr uint32_t DEL_COUNTER_MASK = 0x3fff0000; + static constexpr uint32_t DEL_COUNTER_DELTA = (uint32_t)1 << 16; + // Bit 30: whether deletion of this value base is pending (0 equals `false`) + static constexpr uint32_t PENDING_DEL_MASK = (uint32_t)1 << 30; + // Bit 31: whether this value base is deleted (0 equals `false`) + static constexpr uint32_t DEL_MASK = (uint32_t)1 << 31; + + uint32_t GetValuePart(uint32_t value_part_mask) const { + return expected_value_ & value_part_mask; + } + + uint32_t GetIncrementedValuePart(uint32_t value_part_mask, + uint32_t value_part_delta) const { + uint32_t current_value_part = GetValuePart(value_part_mask); + ExpectedValue 
temp_expected_value(current_value_part + value_part_delta); + return temp_expected_value.GetValuePart(value_part_mask); + } + + void SetValuePart(uint32_t value_part_mask, uint32_t new_value_part) { + assert(IsValuePartValid(new_value_part, value_part_mask)); + ClearValuePart(value_part_mask); + expected_value_ |= new_value_part; + } + + void ClearValuePart(uint32_t value_part_mask) { + expected_value_ &= (~value_part_mask); + } + + uint32_t expected_value_; +}; + +// `PendingExpectedValue` represents the expected value of a key undergoing a +// pending operation in db stress. +// +// This class is not thread-safe. +class PendingExpectedValue { + public: + explicit PendingExpectedValue(std::atomic* value_ptr, + ExpectedValue orig_value, + ExpectedValue final_value) + : value_ptr_(value_ptr), + orig_value_(orig_value), + final_value_(final_value) {} + + void Commit() { + // To prevent low-level instruction reordering that results + // in setting expected value happens before db write + std::atomic_thread_fence(std::memory_order_release); + value_ptr_->store(final_value_.Read()); + } + + uint32_t GetFinalValueBase() { return final_value_.GetValueBase(); } + + private: + std::atomic* const value_ptr_; + const ExpectedValue orig_value_; + const ExpectedValue final_value_; +}; + +// `ExpectedValueHelper` provides utils to parse `ExpectedValue` to obtain +// useful info about it in db stress +class ExpectedValueHelper { + public: + // Return whether the key associated with `pre_read_expected_value` and + // `post_read_expected_value` is expected not to exist from begining till the + // end of the read + // + // The negation of `MustHaveNotExisted()` is "may have not existed". + // To assert some key must have existsed, please use `MustHaveExisted()` + static bool MustHaveNotExisted(ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value); + + // Return whether the key associated with `pre_read_expected_value` and + // `post_read_expected_value` is expected to exist from begining till the end + // of the read. + // + // The negation of `MustHaveExisted()` is "may have existed". 
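To make the encoding concrete: the 32-bit word packs the value base in bits 0-14, a pending-write flag in bit 15, a 14-bit delete counter in bits 16-29, a pending-delete flag in bit 30, and the deleted flag in bit 31; PendingExpectedValue::Commit() above then publishes such a word behind a release fence so the expected-value update cannot be reordered before the DB write it confirms. The snippet below only decodes one example word with the same masks; it is illustrative arithmetic, not part of the tool:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint32_t kValueBaseMask    = 0x7fff;              // bits 0-14
      const uint32_t kPendingWriteMask = uint32_t{1} << 15;   // bit 15
      const uint32_t kDelCounterMask   = 0x3fff0000;          // bits 16-29
      const uint32_t kDeletedMask      = uint32_t{1} << 31;   // bit 31

      // Example: value base 42, one prior deletion, write currently pending.
      const uint32_t word = 42u | kPendingWriteMask | (uint32_t{1} << 16);

      const unsigned value_base    = word & kValueBaseMask;                 // 42
      const unsigned pending_write = (word & kPendingWriteMask) ? 1u : 0u;  // 1
      const unsigned del_counter   = (word & kDelCounterMask) >> 16;        // 1
      const unsigned deleted       = (word & kDeletedMask) ? 1u : 0u;       // 0

      std::printf("base=%u pending_write=%u del_counter=%u deleted=%u\n",
                  value_base, pending_write, del_counter, deleted);
      return 0;
    }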
+ // To assert some key must have not existsed, please use + // `MustHaveNotExisted()` + static bool MustHaveExisted(ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value); + + // Return whether the `value_base` falls within the expected value base + static bool InExpectedValueBaseRange(uint32_t value_base, + ExpectedValue pre_read_expected_value, + ExpectedValue post_read_expected_value); +}; +} // namespace ROCKSDB_NAMESPACE + +#endif // GFLAGS diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index 7db5e894200f..c7d38339bb2f 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -329,9 +329,7 @@ void MultiOpsTxnsStressTest::FinishInitDb(SharedState* shared) { if (FLAGS_enable_compaction_filter) { // TODO (yanqin) enable compaction filter } -#ifndef ROCKSDB_LITE ProcessRecoveredPreparedTxns(shared); -#endif ReopenAndPreloadDbIfNeeded(shared); // TODO (yanqin) parallelize if key space is large @@ -348,7 +346,6 @@ void MultiOpsTxnsStressTest::FinishInitDb(SharedState* shared) { void MultiOpsTxnsStressTest::ReopenAndPreloadDbIfNeeded(SharedState* shared) { (void)shared; -#ifndef ROCKSDB_LITE bool db_empty = false; { std::unique_ptr iter(db_->NewIterator(ReadOptions())); @@ -369,7 +366,6 @@ void MultiOpsTxnsStressTest::ReopenAndPreloadDbIfNeeded(SharedState* shared) { fflush(stdout); ScanExistingDb(shared, FLAGS_threads); } -#endif // !ROCKSDB_LITE } // Used for point-lookup transaction @@ -391,6 +387,18 @@ std::vector MultiOpsTxnsStressTest::TestMultiGet( return std::vector{Status::NotSupported()}; } +// Wide columns are currently not supported by transactions. +void MultiOpsTxnsStressTest::TestGetEntity( + ThreadState* /* thread */, const ReadOptions& /* read_opts */, + const std::vector& /* rand_column_families */, + const std::vector& /* rand_keys */) {} + +// Wide columns are currently not supported by transactions. 
+void MultiOpsTxnsStressTest::TestMultiGetEntity( + ThreadState* /* thread */, const ReadOptions& /* read_opts */, + const std::vector& /* rand_column_families */, + const std::vector& /* rand_keys */) {} + Status MultiOpsTxnsStressTest::TestPrefixScan( ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, @@ -477,7 +485,6 @@ Status MultiOpsTxnsStressTest::TestCheckpoint( return Status::OK(); } -#ifndef ROCKSDB_LITE Status MultiOpsTxnsStressTest::TestApproximateSize( ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, @@ -488,7 +495,6 @@ Status MultiOpsTxnsStressTest::TestApproximateSize( (void)rand_column_families; return Status::OK(); } -#endif // !ROCKSDB_LITE Status MultiOpsTxnsStressTest::TestCustomOperations( ThreadState* thread, const std::vector& rand_column_families) { @@ -529,7 +535,6 @@ void MultiOpsTxnsStressTest::RegisterAdditionalListeners() { options_.listeners.emplace_back(new MultiOpsTxnsStressListener(this)); } -#ifndef ROCKSDB_LITE void MultiOpsTxnsStressTest::PrepareTxnDbOptions( SharedState* /*shared*/, TransactionDBOptions& txn_db_opts) { // MultiOpsTxnStressTest uses SingleDelete to delete secondary keys, thus we @@ -548,22 +553,14 @@ void MultiOpsTxnsStressTest::PrepareTxnDbOptions( return index_id == Record::kSecondaryIndexId; }; } -#endif // !ROCKSDB_LITE Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a, uint32_t old_a_pos, uint32_t new_a) { -#ifdef ROCKSDB_LITE - (void)thread; - (void)old_a; - (void)old_a_pos; - (void)new_a; - return Status::NotSupported(); -#else std::string old_pk = Record::EncodePrimaryKey(old_a); std::string new_pk = Record::EncodePrimaryKey(new_a); - Transaction* txn = nullptr; + std::unique_ptr txn; WriteOptions wopts; Status s = NewTxn(wopts, &txn); if (!s.ok()) { @@ -575,7 +572,7 @@ Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread, assert(txn); txn->SetSnapshotOnNextOperation(/*notifier=*/nullptr); - const Defer cleanup([new_a, &s, thread, txn, this]() { + const Defer cleanup([new_a, &s, thread, this, &txn]() { if (s.ok()) { // Two gets, one for existing pk, one for locking potential new pk. 
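A pattern repeated throughout the PrimaryKeyUpdateTxn changes that begin above: transactions are now owned by std::unique_ptr<Transaction>, so the explicit delete on the success path disappears and the Defer cleanup rolls back through txn->Rollback() directly. A stripped-down sketch of the ownership pattern inside a StressTest member function (NewTxn and Defer are the tool's existing helpers; the body is illustrative only):

    // Sketch: unique_ptr ownership plus rollback-on-failure via Defer.
    Status RunOneTxnSketch(const WriteOptions& wopts) {
      std::unique_ptr<Transaction> txn;
      Status s = NewTxn(wopts, &txn);  // now fills a unique_ptr, per the diff
      if (!s.ok()) {
        return s;
      }
      const Defer cleanup([&s, &txn]() {
        if (!s.ok() && txn) {
          // Roll back on any failure path; the unique_ptr frees txn either way.
          txn->Rollback().PermitUncheckedError();
        }
      });
      // ... transactional reads and writes on txn ...
      s = txn->Commit();
      return s;
    }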
thread->stats.AddGets(/*ngets=*/2, /*nfounds=*/1); @@ -597,7 +594,7 @@ Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread, } auto& key_gen = key_gen_for_a_[thread->tid]; key_gen->UndoAllocation(new_a); - RollbackTxn(txn).PermitUncheckedError(); + txn->Rollback().PermitUncheckedError(); }); ReadOptions ropts; @@ -674,25 +671,16 @@ Status MultiOpsTxnsStressTest::PrimaryKeyUpdateTxn(ThreadState* thread, auto& key_gen = key_gen_for_a_.at(thread->tid); if (s.ok()) { - delete txn; key_gen->Replace(old_a, old_a_pos, new_a); } return s; -#endif // !ROCKSDB_LITE } Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread, uint32_t old_c, uint32_t old_c_pos, uint32_t new_c) { -#ifdef ROCKSDB_LITE - (void)thread; - (void)old_c; - (void)old_c_pos; - (void)new_c; - return Status::NotSupported(); -#else - Transaction* txn = nullptr; + std::unique_ptr txn; WriteOptions wopts; Status s = NewTxn(wopts, &txn); if (!s.ok()) { @@ -705,7 +693,7 @@ Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread, Iterator* it = nullptr; long iterations = 0; - const Defer cleanup([new_c, &s, thread, &it, txn, this, &iterations]() { + const Defer cleanup([new_c, &s, thread, &txn, &it, this, &iterations]() { delete it; if (s.ok()) { thread->stats.AddIterations(iterations); @@ -730,7 +718,7 @@ Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread, } auto& key_gen = key_gen_for_c_[thread->tid]; key_gen->UndoAllocation(new_c); - RollbackTxn(txn).PermitUncheckedError(); + txn->Rollback().PermitUncheckedError(); }); // TODO (yanqin) try SetSnapshotOnNextOperation(). We currently need to take @@ -879,26 +867,18 @@ Status MultiOpsTxnsStressTest::SecondaryKeyUpdateTxn(ThreadState* thread, s = CommitAndCreateTimestampedSnapshotIfNeeded(thread, *txn); if (s.ok()) { - delete txn; auto& key_gen = key_gen_for_c_.at(thread->tid); key_gen->Replace(old_c, old_c_pos, new_c); } return s; -#endif // !ROCKSDB_LITE } Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread, uint32_t a, uint32_t b_delta) { -#ifdef ROCKSDB_LITE - (void)thread; - (void)a; - (void)b_delta; - return Status::NotSupported(); -#else std::string pk_str = Record::EncodePrimaryKey(a); - Transaction* txn = nullptr; + std::unique_ptr txn; WriteOptions wopts; Status s = NewTxn(wopts, &txn); if (!s.ok()) { @@ -909,7 +889,7 @@ Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread, assert(txn); - const Defer cleanup([&s, thread, txn, this]() { + const Defer cleanup([&s, thread, &txn]() { if (s.ok()) { thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); thread->stats.AddBytesForWrites( @@ -926,7 +906,7 @@ Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread, } else { thread->stats.AddErrors(1); } - RollbackTxn(txn).PermitUncheckedError(); + txn->Rollback().PermitUncheckedError(); }); ReadOptions ropts; ropts.rate_limiter_priority = @@ -970,26 +950,16 @@ Status MultiOpsTxnsStressTest::UpdatePrimaryIndexValueTxn(ThreadState* thread, s = CommitAndCreateTimestampedSnapshotIfNeeded(thread, *txn); - if (s.ok()) { - delete txn; - } return s; -#endif // !ROCKSDB_LITE } Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread, ReadOptions ropts, uint32_t a) { -#ifdef ROCKSDB_LITE - (void)thread; - (void)ropts; - (void)a; - return Status::NotSupported(); -#else std::string pk_str = Record::EncodePrimaryKey(a); // pk may or may not exist PinnableSlice value; - Transaction* txn = nullptr; + std::unique_ptr txn; WriteOptions wopts; Status s = 
NewTxn(wopts, &txn); if (!s.ok()) { @@ -1000,7 +970,7 @@ Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread, assert(txn); - const Defer cleanup([&s, thread, txn, this]() { + const Defer cleanup([&s, thread, &txn]() { if (s.ok()) { thread->stats.AddGets(/*ngets=*/1, /*nfounds=*/1); return; @@ -1009,7 +979,7 @@ Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread, } else { thread->stats.AddErrors(1); } - RollbackTxn(txn).PermitUncheckedError(); + txn->Rollback().PermitUncheckedError(); }); std::shared_ptr snapshot; @@ -1026,24 +996,14 @@ Status MultiOpsTxnsStressTest::PointLookupTxn(ThreadState* thread, if (s.ok()) { s = txn->Commit(); } - if (s.ok()) { - delete txn; - } return s; -#endif // !ROCKSDB_LITE } Status MultiOpsTxnsStressTest::RangeScanTxn(ThreadState* thread, ReadOptions ropts, uint32_t c) { -#ifdef ROCKSDB_LITE - (void)thread; - (void)ropts; - (void)c; - return Status::NotSupported(); -#else std::string sk = Record::EncodeSecondaryKey(c); - Transaction* txn = nullptr; + std::unique_ptr txn; WriteOptions wopts; Status s = NewTxn(wopts, &txn); if (!s.ok()) { @@ -1054,13 +1014,13 @@ Status MultiOpsTxnsStressTest::RangeScanTxn(ThreadState* thread, assert(txn); - const Defer cleanup([&s, thread, txn, this]() { + const Defer cleanup([&s, thread, &txn]() { if (s.ok()) { thread->stats.AddIterations(1); return; } thread->stats.AddErrors(1); - RollbackTxn(txn).PermitUncheckedError(); + txn->Rollback().PermitUncheckedError(); }); std::shared_ptr snapshot; @@ -1088,12 +1048,7 @@ Status MultiOpsTxnsStressTest::RangeScanTxn(ThreadState* thread, s = iter->status(); } - if (s.ok()) { - delete txn; - } - return s; -#endif // !ROCKSDB_LITE } void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { @@ -1149,8 +1104,9 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { Status s = record.DecodePrimaryIndexEntry(it->key(), it->value()); if (!s.ok()) { oss << "Cannot decode primary index entry " << it->key().ToString(true) - << "=>" << it->value().ToString(true); - VerificationAbort(thread->shared, oss.str(), s); + << "=>" << it->value().ToString(true) << ". Status is " + << s.ToString(); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1170,8 +1126,9 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { std::string value; s = db_->Get(ropts, sk, &value); if (!s.ok()) { - oss << "Cannot find secondary index entry " << sk.ToString(true); - VerificationAbort(thread->shared, oss.str(), s); + oss << "Cannot find secondary index entry " << sk.ToString(true) + << ". Status is " << s.ToString(); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1198,8 +1155,9 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { Status s = record.DecodeSecondaryIndexEntry(it->key(), it->value()); if (!s.ok()) { oss << "Cannot decode secondary index entry " - << it->key().ToString(true) << "=>" << it->value().ToString(true); - VerificationAbort(thread->shared, oss.str(), s); + << it->key().ToString(true) << "=>" << it->value().ToString(true) + << ". Status is " << s.ToString(); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1213,7 +1171,7 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { if (!s.ok()) { oss << "Error searching pk " << Slice(pk).ToString(true) << ". " << s.ToString() << ". 
sk " << it->key().ToString(true); - VerificationAbort(thread->shared, oss.str(), s); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1221,8 +1179,8 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { s = std::get<0>(result); if (!s.ok()) { oss << "Error decoding primary index value " - << Slice(value).ToString(true) << ". " << s.ToString(); - VerificationAbort(thread->shared, oss.str(), s); + << Slice(value).ToString(true) << ". Status is " << s.ToString(); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1232,7 +1190,7 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { << Slice(value).ToString(true) << " (a=" << record.a_value() << ", c=" << c_in_primary << "), sk: " << it->key().ToString(true) << " (c=" << record.c_value() << ")"; - VerificationAbort(thread->shared, oss.str(), s); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1243,7 +1201,7 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { oss << "Pk/sk mismatch: primary index has " << primary_index_entries_count << " entries. Secondary index has " << secondary_index_entries_count << " entries."; - VerificationAbort(thread->shared, oss.str(), Status::OK()); + VerificationAbort(thread->shared, oss.str()); assert(false); return; } @@ -1253,7 +1211,8 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { // which can be called before TransactionDB::Open() returns to caller. // Therefore, at that time, db_ and txn_db_ may still be nullptr. // Caller has to make sure that the race condition does not happen. -void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { +void MultiOpsTxnsStressTest::VerifyPkSkFast(const ReadOptions& read_options, + int job_id) { DB* const db = db_aptr_.load(std::memory_order_acquire); if (db == nullptr) { return; @@ -1282,6 +1241,7 @@ void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { ReadOptions ropts; ropts.snapshot = snapshot; ropts.total_order_seek = true; + ropts.io_activity = read_options.io_activity; std::unique_ptr it(db_->NewIterator(ropts)); for (it->Seek(start_key); it->Valid(); it->Next()) { @@ -1356,7 +1316,6 @@ uint32_t MultiOpsTxnsStressTest::GenerateNextC(ThreadState* thread) { return key_gen->Allocate(); } -#ifndef ROCKSDB_LITE void MultiOpsTxnsStressTest::ProcessRecoveredPreparedTxnsHelper( Transaction* txn, SharedState*) { thread_local Random rand(static_cast(FLAGS_seed)); @@ -1418,7 +1377,6 @@ void MultiOpsTxnsStressTest::SetupSnapshot( read_opts.snapshot = txn.GetSnapshot(); } } -#endif // !ROCKSDB_LITE std::string MultiOpsTxnsStressTest::KeySpaces::EncodeTo() const { std::string result; @@ -1489,14 +1447,6 @@ MultiOpsTxnsStressTest::KeySpaces MultiOpsTxnsStressTest::ReadKeySpacesDesc( void MultiOpsTxnsStressTest::PreloadDb(SharedState* shared, int threads, uint32_t lb_a, uint32_t ub_a, uint32_t lb_c, uint32_t ub_c) { -#ifdef ROCKSDB_LITE - (void)shared; - (void)threads; - (void)lb_a; - (void)ub_a; - (void)lb_c; - (void)ub_c; -#else key_gen_for_a_.resize(threads); key_gen_for_c_.resize(threads); @@ -1599,7 +1549,6 @@ void MultiOpsTxnsStressTest::PreloadDb(SharedState* shared, int threads, my_seed, low, high, std::move(existing_c_uniqs[i]), std::move(non_existing_c_uniqs[i])); } -#endif // !ROCKSDB_LITE } // Scan an existing, non-empty database. 
@@ -1739,7 +1688,6 @@ StressTest* CreateMultiOpsTxnsStressTest() { } void CheckAndSetOptionsForMultiOpsTxnStressTest() { -#ifndef ROCKSDB_LITE if (FLAGS_test_batches_snapshots || FLAGS_test_cf_consistency) { fprintf(stderr, "-test_multi_ops_txns is not compatible with " @@ -1798,10 +1746,6 @@ void CheckAndSetOptionsForMultiOpsTxnStressTest() { "-test_multi_ops_txns\n"); exit(1); } -#else - fprintf(stderr, "-test_multi_ops_txns not supported in ROCKSDB_LITE mode\n"); - exit(1); -#endif // !ROCKSDB_LITE } } // namespace ROCKSDB_NAMESPACE diff --git a/db_stress_tool/multi_ops_txns_stress.h b/db_stress_tool/multi_ops_txns_stress.h index 7463d05d744d..12c45aaa3279 100644 --- a/db_stress_tool/multi_ops_txns_stress.h +++ b/db_stress_tool/multi_ops_txns_stress.h @@ -210,6 +210,14 @@ class MultiOpsTxnsStressTest : public StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) override; + void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + + void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override; + Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, const std::vector& rand_keys) override; @@ -248,11 +256,9 @@ class MultiOpsTxnsStressTest : public StressTest { const std::vector& rand_column_families, const std::vector& rand_keys) override; -#ifndef ROCKSDB_LITE Status TestApproximateSize(ThreadState* thread, uint64_t iteration, const std::vector& rand_column_families, const std::vector& rand_keys) override; -#endif // !ROCKSDB_LITE Status TestCustomOperations( ThreadState* thread, @@ -260,10 +266,8 @@ class MultiOpsTxnsStressTest : public StressTest { void RegisterAdditionalListeners() override; -#ifndef ROCKSDB_LITE void PrepareTxnDbOptions(SharedState* /*shared*/, TransactionDBOptions& txn_db_opts) override; -#endif // !ROCKSDB_LITE Status PrimaryKeyUpdateTxn(ThreadState* thread, uint32_t old_a, uint32_t old_a_pos, uint32_t new_a); @@ -284,7 +288,7 @@ class MultiOpsTxnsStressTest : public StressTest { VerifyDb(thread); } - void VerifyPkSkFast(int job_id); + void VerifyPkSkFast(const ReadOptions& read_options, int job_id); protected: class Counter { @@ -344,7 +348,6 @@ class MultiOpsTxnsStressTest : public StressTest { uint32_t GenerateNextC(ThreadState* thread); -#ifndef ROCKSDB_LITE // Randomly commit or rollback `txn` void ProcessRecoveredPreparedTxnsHelper(Transaction* txn, SharedState*) override; @@ -363,7 +366,6 @@ class MultiOpsTxnsStressTest : public StressTest { void SetupSnapshot(ThreadState* thread, ReadOptions& read_opts, Transaction& txn, std::shared_ptr& snapshot); -#endif //! 
ROCKSDB_LITE std::vector> key_gen_for_a_; std::vector> key_gen_for_c_; @@ -414,7 +416,6 @@ class MultiOpsTxnsStressListener : public EventListener { assert(stress_test_); } -#ifndef ROCKSDB_LITE ~MultiOpsTxnsStressListener() override {} void OnFlushCompleted(DB* db, const FlushJobInfo& info) override { @@ -423,7 +424,8 @@ class MultiOpsTxnsStressListener : public EventListener { (void)db; #endif assert(info.cf_id == 0); - stress_test_->VerifyPkSkFast(info.job_id); + const ReadOptions read_options(Env::IOActivity::kFlush); + stress_test_->VerifyPkSkFast(read_options, info.job_id); } void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { @@ -432,9 +434,9 @@ class MultiOpsTxnsStressListener : public EventListener { (void)db; #endif assert(info.cf_id == 0); - stress_test_->VerifyPkSkFast(info.job_id); + const ReadOptions read_options(Env::IOActivity::kCompaction); + stress_test_->VerifyPkSkFast(read_options, info.job_id); } -#endif //! ROCKSDB_LITE private: MultiOpsTxnsStressTest* const stress_test_ = nullptr; diff --git a/db_stress_tool/no_batched_ops_stress.cc b/db_stress_tool/no_batched_ops_stress.cc index 01f5d67636ea..27a20fd5a5d4 100644 --- a/db_stress_tool/no_batched_ops_stress.cc +++ b/db_stress_tool/no_batched_ops_stress.cc @@ -7,7 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db_stress_tool/expected_state.h" #ifdef GFLAGS +#include "db/wide/wide_columns_helper.h" #include "db_stress_tool/db_stress_common.h" #include "rocksdb/utilities/transaction_db.h" #include "utilities/fault_injection_fs.h" @@ -51,7 +53,9 @@ class NonBatchedOpsStressTest : public StressTest { enum class VerificationMethod { kIterator, kGet, + kGetEntity, kMultiGet, + kMultiGetEntity, kGetMergeOperands, // Add any new items above kNumberOfMethods kNumberOfMethods @@ -99,13 +103,9 @@ class NonBatchedOpsStressTest : public StressTest { if (diff > 0) { s = Status::NotFound(); } else if (diff == 0) { - const WideColumns expected_columns = GenerateExpectedWideColumns( - GetValueBase(iter->value()), iter->value()); - if (iter->columns() != expected_columns) { + if (!VerifyWideColumns(iter->value(), iter->columns())) { VerificationAbort(shared, static_cast(cf), i, - iter->value(), iter->columns(), - expected_columns); - break; + iter->value(), iter->columns()); } from_db = iter->value().ToString(); @@ -123,7 +123,7 @@ class NonBatchedOpsStressTest : public StressTest { } VerifyOrSyncValue(static_cast(cf), i, options, shared, from_db, - s, /* strict */ true); + /* msg_prefix */ "Iterator verification", s); if (!from_db.empty()) { PrintKeyValue(static_cast(cf), static_cast(i), @@ -142,7 +142,42 @@ class NonBatchedOpsStressTest : public StressTest { Status s = db_->Get(options, column_families_[cf], key, &from_db); VerifyOrSyncValue(static_cast(cf), i, options, shared, from_db, - s, /* strict */ true); + /* msg_prefix */ "Get verification", s); + + if (!from_db.empty()) { + PrintKeyValue(static_cast(cf), static_cast(i), + from_db.data(), from_db.size()); + } + } + } else if (method == VerificationMethod::kGetEntity) { + for (int64_t i = start; i < end; ++i) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + + const std::string key = Key(i); + PinnableWideColumns result; + + Status s = + db_->GetEntity(options, column_families_[cf], key, &result); + + std::string from_db; + + if (s.ok()) { + const WideColumns& columns = result.columns(); + + if 
(WideColumnsHelper::HasDefaultColumn(columns)) { + from_db = WideColumnsHelper::GetDefaultColumn(columns).ToString(); + } + + if (!VerifyWideColumns(columns)) { + VerificationAbort(shared, static_cast(cf), i, from_db, + columns); + } + } + + VerifyOrSyncValue(static_cast(cf), i, options, shared, from_db, + /* msg_prefix */ "GetEntity verification", s); if (!from_db.empty()) { PrintKeyValue(static_cast(cf), static_cast(i), @@ -159,14 +194,14 @@ class NonBatchedOpsStressTest : public StressTest { size_t batch_size = thread->rand.Uniform(128) + 1; batch_size = std::min(batch_size, end - i); - std::vector keystrs(batch_size); + std::vector key_strs(batch_size); std::vector keys(batch_size); std::vector values(batch_size); std::vector statuses(batch_size); for (size_t j = 0; j < batch_size; ++j) { - keystrs[j] = Key(i + j); - keys[j] = Slice(keystrs[j].data(), keystrs[j].size()); + key_strs[j] = Key(i + j); + keys[j] = Slice(key_strs[j]); } db_->MultiGet(options, column_families_[cf], batch_size, keys.data(), @@ -176,7 +211,60 @@ class NonBatchedOpsStressTest : public StressTest { const std::string from_db = values[j].ToString(); VerifyOrSyncValue(static_cast(cf), i + j, options, shared, - from_db, statuses[j], /* strict */ true); + from_db, /* msg_prefix */ "MultiGet verification", + statuses[j]); + + if (!from_db.empty()) { + PrintKeyValue(static_cast(cf), static_cast(i + j), + from_db.data(), from_db.size()); + } + } + + i += batch_size; + } + } else if (method == VerificationMethod::kMultiGetEntity) { + for (int64_t i = start; i < end;) { + if (thread->shared->HasVerificationFailedYet()) { + break; + } + + // Keep the batch size to some reasonable value + size_t batch_size = thread->rand.Uniform(128) + 1; + batch_size = std::min(batch_size, end - i); + + std::vector key_strs(batch_size); + std::vector keys(batch_size); + std::vector results(batch_size); + std::vector statuses(batch_size); + + for (size_t j = 0; j < batch_size; ++j) { + key_strs[j] = Key(i + j); + keys[j] = Slice(key_strs[j]); + } + + db_->MultiGetEntity(options, column_families_[cf], batch_size, + keys.data(), results.data(), statuses.data()); + + for (size_t j = 0; j < batch_size; ++j) { + std::string from_db; + + if (statuses[j].ok()) { + const WideColumns& columns = results[j].columns(); + + if (WideColumnsHelper::HasDefaultColumn(columns)) { + from_db = + WideColumnsHelper::GetDefaultColumn(columns).ToString(); + } + + if (!VerifyWideColumns(columns)) { + VerificationAbort(shared, static_cast(cf), i, from_db, + columns); + } + } + + VerifyOrSyncValue( + static_cast(cf), i + j, options, shared, from_db, + /* msg_prefix */ "MultiGetEntity verification", statuses[j]); if (!from_db.empty()) { PrintKeyValue(static_cast(cf), static_cast(i + j), @@ -228,7 +316,8 @@ class NonBatchedOpsStressTest : public StressTest { } VerifyOrSyncValue(static_cast(cf), i, options, shared, from_db, - s, /* strict */ true); + /* msg_prefix */ "GetMergeOperands verification", + s); if (!from_db.empty()) { PrintKeyValue(static_cast(cf), static_cast(i), @@ -239,7 +328,6 @@ class NonBatchedOpsStressTest : public StressTest { } } -#ifndef ROCKSDB_LITE void ContinuouslyVerifyDb(ThreadState* thread) const override { if (!cmp_db_) { return; @@ -333,9 +421,6 @@ class NonBatchedOpsStressTest : public StressTest { } } } -#else - void ContinuouslyVerifyDb(ThreadState* /*thread*/) const override {} -#endif // ROCKSDB_LITE void MaybeClearOneColumnFamily(ThreadState* thread) override { if (FLAGS_column_families > 1) { @@ -357,7 +442,7 @@ class 
NonBatchedOpsStressTest : public StressTest { if (!s.ok()) { fprintf(stderr, "dropping column family error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, &column_families_[cf]); @@ -366,7 +451,7 @@ class NonBatchedOpsStressTest : public StressTest { if (!s.ok()) { fprintf(stderr, "creating column family error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } thread->shared->UnlockColumnFamily(cf); } @@ -391,16 +476,22 @@ class NonBatchedOpsStressTest : public StressTest { SharedState::ignore_read_error = false; } - std::unique_ptr lock(new MutexLock( - thread->shared->GetMutexForKey(rand_column_families[0], rand_keys[0]))); - ReadOptions read_opts_copy = read_opts; std::string read_ts_str; Slice read_ts_slice; + if (FLAGS_user_timestamp_size > 0) { + read_ts_str = GetNowNanos(); + read_ts_slice = read_ts_str; + read_opts_copy.timestamp = &read_ts_slice; + } bool read_older_ts = MaybeUseOlderTimestampForPointLookup( thread, read_ts_str, read_ts_slice, read_opts_copy); + const ExpectedValue pre_read_expected_value = + thread->shared->Get(rand_column_families[0], rand_keys[0]); Status s = db_->Get(read_opts_copy, cfh, key, &from_db); + const ExpectedValue post_read_expected_value = + thread->shared->Get(rand_column_families[0], rand_keys[0]); if (fault_fs_guard) { error_count = fault_fs_guard->GetAndResetErrorCount(); } @@ -419,23 +510,35 @@ class NonBatchedOpsStressTest : public StressTest { // found case thread->stats.AddGets(1, 1); // we only have the latest expected state - if (!FLAGS_skip_verifydb && !read_opts_copy.timestamp && - thread->shared->Get(rand_column_families[0], rand_keys[0]) == - SharedState::DELETION_SENTINEL) { - thread->shared->SetVerificationFailure(); - fprintf(stderr, - "error : inconsistent values for key %s: Get returns %s, " - "expected state does not have the key.\n", - key.ToString(true).c_str(), StringToHex(from_db).c_str()); + if (!FLAGS_skip_verifydb && !read_older_ts) { + if (ExpectedValueHelper::MustHaveNotExisted(pre_read_expected_value, + post_read_expected_value)) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "error : inconsistent values for key %s: Get returns %s, " + "but expected state is \"deleted\".\n", + key.ToString(true).c_str(), StringToHex(from_db).c_str()); + } + Slice from_db_slice(from_db); + uint32_t value_base_from_db = GetValueBase(from_db_slice); + if (!ExpectedValueHelper::InExpectedValueBaseRange( + value_base_from_db, pre_read_expected_value, + post_read_expected_value)) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "error : inconsistent values for key %s: Get returns %s with " + "value base %d that falls out of expected state's value base " + "range.\n", + key.ToString(true).c_str(), StringToHex(from_db).c_str(), + value_base_from_db); + } } } else if (s.IsNotFound()) { // not found case thread->stats.AddGets(1, 0); if (!FLAGS_skip_verifydb && !read_older_ts) { - auto expected = - thread->shared->Get(rand_column_families[0], rand_keys[0]); - if (expected != SharedState::DELETION_SENTINEL && - expected != SharedState::UNKNOWN_SENTINEL) { + if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value, + post_read_expected_value)) { thread->shared->SetVerificationFailure(); fprintf(stderr, "error : inconsistent values for key %s: expected state has " @@ -468,13 +571,21 @@ class NonBatchedOpsStressTest : public StressTest { keys.reserve(num_keys); 
std::vector values(num_keys); std::vector statuses(num_keys); - ColumnFamilyHandle* cfh = column_families_[rand_column_families[0]]; + // When Flags_use_txn is enabled, we also do a read your write check. + std::vector> ryw_expected_values; + ryw_expected_values.reserve(num_keys); + + SharedState* shared = thread->shared; + + int column_family = rand_column_families[0]; + ColumnFamilyHandle* cfh = column_families_[column_family]; int error_count = 0; // Do a consistency check between Get and MultiGet. Don't do it too // often as it will slow db_stress down bool do_consistency_check = thread->rand.OneIn(4); ReadOptions readoptionscopy = read_opts; + if (do_consistency_check) { readoptionscopy.snapshot = db_->GetSnapshot(); } @@ -493,8 +604,7 @@ class NonBatchedOpsStressTest : public StressTest { // Create a transaction in order to write some data. The purpose is to // exercise WriteBatchWithIndex::MultiGetFromBatchAndDB. The transaction // will be rolled back once MultiGet returns. -#ifndef ROCKSDB_LITE - Transaction* txn = nullptr; + std::unique_ptr txn; if (use_txn) { WriteOptions wo; if (FLAGS_rate_limit_auto_wal_flush) { @@ -502,29 +612,37 @@ class NonBatchedOpsStressTest : public StressTest { } Status s = NewTxn(wo, &txn); if (!s.ok()) { - fprintf(stderr, "NewTxn: %s\n", s.ToString().c_str()); - std::terminate(); + fprintf(stderr, "NewTxn error: %s\n", s.ToString().c_str()); + thread->shared->SafeTerminate(); } } -#endif for (size_t i = 0; i < num_keys; ++i) { - key_str.emplace_back(Key(rand_keys[i])); + uint64_t rand_key = rand_keys[i]; + key_str.emplace_back(Key(rand_key)); keys.emplace_back(key_str.back()); -#ifndef ROCKSDB_LITE if (use_txn) { + if (!shared->AllowsOverwrite(rand_key) && + shared->Exists(column_family, rand_key)) { + // Just do read your write checks for keys that allow overwrites. + ryw_expected_values.push_back(std::nullopt); + continue; + } // With a 1 in 10 probability, insert the just added key in the batch // into the transaction. 
This will create an overlap with the MultiGet // keys and exercise some corner cases in the code if (thread->rand.OneIn(10)) { int op = thread->rand.Uniform(2); Status s; + assert(txn); switch (op) { case 0: case 1: { - uint32_t value_base = - thread->rand.Next() % thread->shared->UNKNOWN_SENTINEL; + ExpectedValue put_value; + put_value.Put(false /* pending */); + ryw_expected_values.emplace_back(put_value); char value[100]; - size_t sz = GenerateValue(value_base, value, sizeof(value)); + size_t sz = + GenerateValue(put_value.GetValueBase(), value, sizeof(value)); Slice v(value, sz); if (op == 0) { s = txn->Put(cfh, keys.back(), v); @@ -533,19 +651,25 @@ class NonBatchedOpsStressTest : public StressTest { } break; } - case 2: + case 2: { + ExpectedValue delete_value; + delete_value.Delete(false /* pending */); + ryw_expected_values.emplace_back(delete_value); s = txn->Delete(cfh, keys.back()); break; + } default: assert(false); } if (!s.ok()) { - fprintf(stderr, "Transaction put: %s\n", s.ToString().c_str()); - std::terminate(); + fprintf(stderr, "Transaction put error: %s\n", + s.ToString().c_str()); + thread->shared->SafeTerminate(); } + } else { + ryw_expected_values.push_back(std::nullopt); } } -#endif } if (!use_txn) { @@ -559,10 +683,9 @@ class NonBatchedOpsStressTest : public StressTest { error_count = fault_fs_guard->GetAndResetErrorCount(); } } else { -#ifndef ROCKSDB_LITE + assert(txn); txn->MultiGet(readoptionscopy, cfh, num_keys, keys.data(), values.data(), statuses.data()); -#endif } if (fault_fs_guard && error_count && !SharedState::ignore_read_error) { @@ -589,52 +712,127 @@ class NonBatchedOpsStressTest : public StressTest { fault_fs_guard->DisableErrorInjection(); } - for (size_t i = 0; i < statuses.size(); ++i) { - Status s = statuses[i]; + auto ryw_check = + [](const Slice& key, const PinnableSlice& value, const Status& s, + const std::optional& ryw_expected_value) -> bool { + if (!ryw_expected_value.has_value()) { + return true; + } + const ExpectedValue& expected = ryw_expected_value.value(); + char expected_value[100]; + if (s.ok() && + ExpectedValueHelper::MustHaveNotExisted(expected, expected)) { + fprintf(stderr, + "MultiGet returned value different from what was " + "written for key %s\n", + key.ToString(true).c_str()); + fprintf(stderr, + "MultiGet returned ok, transaction has non-committed " + "delete.\n"); + return false; + } else if (s.IsNotFound() && + ExpectedValueHelper::MustHaveExisted(expected, expected)) { + fprintf(stderr, + "MultiGet returned value different from what was " + "written for key %s\n", + key.ToString(true).c_str()); + fprintf(stderr, + "MultiGet returned not found, transaction has " + "non-committed value.\n"); + return false; + } else if (s.ok() && + ExpectedValueHelper::MustHaveExisted(expected, expected)) { + Slice from_txn_slice(value); + size_t sz = GenerateValue(expected.GetValueBase(), expected_value, + sizeof(expected_value)); + Slice expected_value_slice(expected_value, sz); + if (expected_value_slice.compare(from_txn_slice) == 0) { + return true; + } + fprintf(stderr, + "MultiGet returned value different from what was " + "written for key %s\n", + key.ToString(true /* hex */).c_str()); + fprintf(stderr, "MultiGet returned value %s\n", + from_txn_slice.ToString(true /* hex */).c_str()); + fprintf(stderr, "Transaction has non-committed value %s\n", + expected_value_slice.ToString(true /* hex */).c_str()); + return false; + } + return true; + }; + + auto check_multiget = + [&](const Slice& key, const PinnableSlice& expected_value, 
+ const Status& s, + const std::optional& ryw_expected_value) -> bool { bool is_consistent = true; - // Only do the consistency check if no error was injected and MultiGet - // didn't return an unexpected error + bool is_ryw_correct = true; + // Only do the consistency check if no error was injected and + // MultiGet didn't return an unexpected error. If test does not use + // transaction, the consistency check for each key included check results + // from db `Get` and db `MultiGet` are consistent. + // If test use transaction, after consistency check, also do a read your + // own write check. if (do_consistency_check && !error_count && (s.ok() || s.IsNotFound())) { Status tmp_s; std::string value; if (use_txn) { -#ifndef ROCKSDB_LITE - tmp_s = txn->Get(readoptionscopy, cfh, keys[i], &value); -#endif // ROCKSDB_LITE + assert(txn); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_GET); + tmp_s = txn->Get(readoptionscopy, cfh, key, &value); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_MULTIGET); } else { - tmp_s = db_->Get(readoptionscopy, cfh, keys[i], &value); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_GET); + tmp_s = db_->Get(readoptionscopy, cfh, key, &value); + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_MULTIGET); } if (!tmp_s.ok() && !tmp_s.IsNotFound()) { fprintf(stderr, "Get error: %s\n", s.ToString().c_str()); is_consistent = false; } else if (!s.ok() && tmp_s.ok()) { fprintf(stderr, "MultiGet returned different results with key %s\n", - keys[i].ToString(true).c_str()); + key.ToString(true).c_str()); fprintf(stderr, "Get returned ok, MultiGet returned not found\n"); is_consistent = false; } else if (s.ok() && tmp_s.IsNotFound()) { fprintf(stderr, "MultiGet returned different results with key %s\n", - keys[i].ToString(true).c_str()); + key.ToString(true).c_str()); fprintf(stderr, "MultiGet returned ok, Get returned not found\n"); is_consistent = false; - } else if (s.ok() && value != values[i].ToString()) { + } else if (s.ok() && value != expected_value.ToString()) { fprintf(stderr, "MultiGet returned different results with key %s\n", - keys[i].ToString(true).c_str()); + key.ToString(true).c_str()); fprintf(stderr, "MultiGet returned value %s\n", - values[i].ToString(true).c_str()); + expected_value.ToString(true).c_str()); fprintf(stderr, "Get returned value %s\n", Slice(value).ToString(true /* hex */).c_str()); is_consistent = false; } } + // If test uses transaction, continue to do a read your own write check. 
+ if (is_consistent && use_txn) { + is_ryw_correct = ryw_check(key, expected_value, s, ryw_expected_value); + } + if (!is_consistent) { fprintf(stderr, "TestMultiGet error: is_consistent is false\n"); thread->stats.AddErrors(1); // Fail fast to preserve the DB state thread->shared->SetVerificationFailure(); - break; + return false; + } else if (!is_ryw_correct) { + fprintf(stderr, "TestMultiGet error: is_ryw_correct is false\n"); + thread->stats.AddErrors(1); + // Fail fast to preserve the DB state + thread->shared->SetVerificationFailure(); + return false; } else if (s.ok()) { // found case thread->stats.AddGets(1, 1); @@ -653,19 +851,287 @@ class NonBatchedOpsStressTest : public StressTest { thread->stats.AddVerifiedErrors(1); } } + return true; + }; + + size_t num_of_keys = keys.size(); + assert(values.size() == num_of_keys); + assert(statuses.size() == num_of_keys); + for (size_t i = 0; i < num_of_keys; ++i) { + bool check_result = true; + if (use_txn) { + assert(ryw_expected_values.size() == num_of_keys); + check_result = check_multiget(keys[i], values[i], statuses[i], + ryw_expected_values[i]); + } else { + check_result = check_multiget(keys[i], values[i], statuses[i], + std::nullopt /* ryw_expected_value */); + } + if (!check_result) { + break; + } } if (readoptionscopy.snapshot) { db_->ReleaseSnapshot(readoptionscopy.snapshot); } if (use_txn) { -#ifndef ROCKSDB_LITE - RollbackTxn(txn); -#endif + txn->Rollback().PermitUncheckedError(); } return statuses; } + void TestGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override { + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } + + assert(thread); + + SharedState* const shared = thread->shared; + assert(shared); + + assert(!rand_column_families.empty()); + assert(!rand_keys.empty()); + + std::unique_ptr lock(new MutexLock( + shared->GetMutexForKey(rand_column_families[0], rand_keys[0]))); + + assert(rand_column_families[0] >= 0); + assert(rand_column_families[0] < static_cast(column_families_.size())); + + ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]]; + assert(cfh); + + const std::string key = Key(rand_keys[0]); + + PinnableWideColumns from_db; + + const Status s = db_->GetEntity(read_opts, cfh, key, &from_db); + + int error_count = 0; + + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + } + + if (s.ok()) { + if (fault_fs_guard) { + if (error_count && !SharedState::ignore_read_error) { + // Grab mutex so multiple threads don't try to print the + // stack trace at the same time + MutexLock l(shared->GetMutex()); + fprintf(stderr, "Didn't get expected error from GetEntity\n"); + fprintf(stderr, "Call stack that injected the fault\n"); + fault_fs_guard->PrintFaultBacktrace(); + std::terminate(); + } + } + + thread->stats.AddGets(1, 1); + + if (!FLAGS_skip_verifydb) { + const WideColumns& columns = from_db.columns(); + ExpectedValue expected = + shared->Get(rand_column_families[0], rand_keys[0]); + if (!VerifyWideColumns(columns)) { + shared->SetVerificationFailure(); + fprintf(stderr, + "error : inconsistent columns returned by GetEntity for key " + "%s: %s\n", + StringToHex(key).c_str(), WideColumnsToHex(columns).c_str()); + } else if (ExpectedValueHelper::MustHaveNotExisted(expected, + expected)) { + shared->SetVerificationFailure(); + fprintf( + stderr, + "error : inconsistent values for key %s: GetEntity returns %s, " + 
"expected state does not have the key.\n", + StringToHex(key).c_str(), WideColumnsToHex(columns).c_str()); + } + } + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + + if (!FLAGS_skip_verifydb) { + ExpectedValue expected = + shared->Get(rand_column_families[0], rand_keys[0]); + if (ExpectedValueHelper::MustHaveExisted(expected, expected)) { + shared->SetVerificationFailure(); + fprintf(stderr, + "error : inconsistent values for key %s: expected state has " + "the key, GetEntity returns NotFound.\n", + StringToHex(key).c_str()); + } + } + } else { + if (error_count == 0) { + thread->stats.AddErrors(1); + } else { + thread->stats.AddVerifiedErrors(1); + } + } + + if (fault_fs_guard) { + fault_fs_guard->DisableErrorInjection(); + } + } + + void TestMultiGetEntity(ThreadState* thread, const ReadOptions& read_opts, + const std::vector& rand_column_families, + const std::vector& rand_keys) override { + assert(thread); + + ManagedSnapshot snapshot_guard(db_); + + ReadOptions read_opts_copy(read_opts); + read_opts_copy.snapshot = snapshot_guard.snapshot(); + + assert(!rand_column_families.empty()); + assert(rand_column_families[0] >= 0); + assert(rand_column_families[0] < static_cast(column_families_.size())); + + ColumnFamilyHandle* const cfh = column_families_[rand_column_families[0]]; + assert(cfh); + + assert(!rand_keys.empty()); + + const size_t num_keys = rand_keys.size(); + + std::vector keys(num_keys); + std::vector key_slices(num_keys); + + for (size_t i = 0; i < num_keys; ++i) { + keys[i] = Key(rand_keys[i]); + key_slices[i] = keys[i]; + } + + std::vector results(num_keys); + std::vector statuses(num_keys); + + if (fault_fs_guard) { + fault_fs_guard->EnableErrorInjection(); + SharedState::ignore_read_error = false; + } + + db_->MultiGetEntity(read_opts_copy, cfh, num_keys, key_slices.data(), + results.data(), statuses.data()); + + int error_count = 0; + + if (fault_fs_guard) { + error_count = fault_fs_guard->GetAndResetErrorCount(); + + if (error_count && !SharedState::ignore_read_error) { + int stat_nok = 0; + for (const auto& s : statuses) { + if (!s.ok() && !s.IsNotFound()) { + stat_nok++; + } + } + + if (stat_nok < error_count) { + // Grab mutex so multiple threads don't try to print the + // stack trace at the same time + assert(thread->shared); + MutexLock l(thread->shared->GetMutex()); + + fprintf(stderr, "Didn't get expected error from MultiGetEntity\n"); + fprintf(stderr, "num_keys %zu Expected %d errors, seen %d\n", + num_keys, error_count, stat_nok); + fprintf(stderr, "Call stack that injected the fault\n"); + fault_fs_guard->PrintFaultBacktrace(); + std::terminate(); + } + } + + fault_fs_guard->DisableErrorInjection(); + } + + const bool check_get_entity = !error_count && thread->rand.OneIn(4); + + for (size_t i = 0; i < num_keys; ++i) { + const Status& s = statuses[i]; + + bool is_consistent = true; + + if (s.ok() && !VerifyWideColumns(results[i].columns())) { + fprintf( + stderr, + "error : inconsistent columns returned by MultiGetEntity for key " + "%s: %s\n", + StringToHex(keys[i]).c_str(), + WideColumnsToHex(results[i].columns()).c_str()); + is_consistent = false; + } else if (check_get_entity && (s.ok() || s.IsNotFound())) { + PinnableWideColumns cmp_result; + ThreadStatusUtil::SetThreadOperation( + ThreadStatus::OperationType::OP_GETENTITY); + const Status cmp_s = + db_->GetEntity(read_opts_copy, cfh, key_slices[i], &cmp_result); + + if (!cmp_s.ok() && !cmp_s.IsNotFound()) { + fprintf(stderr, "GetEntity error: %s\n", cmp_s.ToString().c_str()); + 
is_consistent = false; + } else if (cmp_s.IsNotFound()) { + if (s.ok()) { + fprintf(stderr, + "Inconsistent results for key %s: MultiGetEntity returned " + "ok, GetEntity returned not found\n", + StringToHex(keys[i]).c_str()); + is_consistent = false; + } + } else { + assert(cmp_s.ok()); + + if (s.IsNotFound()) { + fprintf(stderr, + "Inconsistent results for key %s: MultiGetEntity returned " + "not found, GetEntity returned ok\n", + StringToHex(keys[i]).c_str()); + is_consistent = false; + } else { + assert(s.ok()); + + if (results[i] != cmp_result) { + fprintf( + stderr, + "Inconsistent results for key %s: MultiGetEntity returned " + "%s, GetEntity returned %s\n", + StringToHex(keys[i]).c_str(), + WideColumnsToHex(results[i].columns()).c_str(), + WideColumnsToHex(cmp_result.columns()).c_str()); + is_consistent = false; + } + } + } + } + + if (!is_consistent) { + fprintf(stderr, + "TestMultiGetEntity error: results are not consistent\n"); + thread->stats.AddErrors(1); + // Fail fast to preserve the DB state + thread->shared->SetVerificationFailure(); + break; + } else if (s.ok()) { + thread->stats.AddGets(1, 1); + } else if (s.IsNotFound()) { + thread->stats.AddGets(1, 0); + } else { + if (error_count == 0) { + fprintf(stderr, "MultiGetEntity error: %s\n", s.ToString().c_str()); + thread->stats.AddErrors(1); + } else { + thread->stats.AddVerifiedErrors(1); + } + } + } + } + Status TestPrefixScan(ThreadState* thread, const ReadOptions& read_opts, const std::vector& rand_column_families, const std::vector& rand_keys) override { @@ -720,12 +1186,9 @@ class NonBatchedOpsStressTest : public StressTest { } } - const WideColumns expected_columns = GenerateExpectedWideColumns( - GetValueBase(iter->value()), iter->value()); - if (iter->columns() != expected_columns) { - s = Status::Corruption( - "Value and columns inconsistent", - DebugString(iter->value(), iter->columns(), expected_columns)); + if (!VerifyWideColumns(iter->value(), iter->columns())) { + s = Status::Corruption("Value and columns inconsistent", + DebugString(iter->value(), iter->columns())); break; } } @@ -803,20 +1266,25 @@ class NonBatchedOpsStressTest : public StressTest { std::string from_db; Status s = db_->Get(read_opts, cfh, k, &from_db); if (!VerifyOrSyncValue(rand_column_family, rand_key, read_opts, shared, - from_db, s, /* strict */ true)) { + /* msg_prefix */ "Pre-Put Get verification", + from_db, s)) { return s; } } - const uint32_t value_base = thread->rand.Next() % shared->UNKNOWN_SENTINEL; + PendingExpectedValue pending_expected_value = + shared->PreparePut(rand_column_family, rand_key); + const uint32_t value_base = pending_expected_value.GetFinalValueBase(); const size_t sz = GenerateValue(value_base, value, sizeof(value)); const Slice v(value, sz); - shared->Put(rand_column_family, rand_key, value_base, true /* pending */); - Status s; - if (FLAGS_use_merge) { + if (FLAGS_use_put_entity_one_in > 0 && + (value_base % FLAGS_use_put_entity_one_in) == 0) { + s = db_->PutEntity(write_opts, cfh, k, + GenerateWideColumns(value_base, v)); + } else if (FLAGS_use_merge) { if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { s = db_->Merge(write_opts, cfh, k, v); @@ -824,21 +1292,10 @@ class NonBatchedOpsStressTest : public StressTest { s = db_->Merge(write_opts, cfh, k, write_ts, v); } } else { -#ifndef ROCKSDB_LITE - Transaction* txn; - s = NewTxn(write_opts, &txn); - if (s.ok()) { - s = txn->Merge(cfh, k, v); - if (s.ok()) { - s = CommitTxn(txn, thread); - } - } -#endif + s = ExecuteTransaction(write_opts, thread, 
[&](Transaction& txn) { + return txn.Merge(cfh, k, v); + }); } - } else if (FLAGS_use_put_entity_one_in > 0 && - (value_base % FLAGS_use_put_entity_one_in) == 0) { - s = db_->PutEntity(write_opts, cfh, k, - GenerateWideColumns(value_base, v)); } else { if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { @@ -847,36 +1304,27 @@ class NonBatchedOpsStressTest : public StressTest { s = db_->Put(write_opts, cfh, k, write_ts, v); } } else { -#ifndef ROCKSDB_LITE - Transaction* txn; - s = NewTxn(write_opts, &txn); - if (s.ok()) { - s = txn->Put(cfh, k, v); - if (s.ok()) { - s = CommitTxn(txn, thread); - } - } -#endif + s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) { + return txn.Put(cfh, k, v); + }); } } - shared->Put(rand_column_family, rand_key, value_base, false /* pending */); - if (!s.ok()) { - if (FLAGS_injest_error_severity >= 2) { + if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { is_db_stopped_ = true; } else if (!is_db_stopped_ || s.severity() < Status::Severity::kFatalError) { fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } else { fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } - + pending_expected_value.Commit(); thread->stats.AddBytesForWrites(1, sz); PrintKeyValue(rand_column_family, static_cast(rand_key), value, sz); @@ -905,7 +1353,8 @@ class NonBatchedOpsStressTest : public StressTest { // otherwise. Status s; if (shared->AllowsOverwrite(rand_key)) { - shared->Delete(rand_column_family, rand_key, true /* pending */); + PendingExpectedValue pending_expected_value = + shared->PrepareDelete(rand_column_family, rand_key); if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { s = db_->Delete(write_opts, cfh, key); @@ -913,36 +1362,31 @@ class NonBatchedOpsStressTest : public StressTest { s = db_->Delete(write_opts, cfh, key, write_ts); } } else { -#ifndef ROCKSDB_LITE - Transaction* txn; - s = NewTxn(write_opts, &txn); - if (s.ok()) { - s = txn->Delete(cfh, key); - if (s.ok()) { - s = CommitTxn(txn, thread); - } - } -#endif + s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) { + return txn.Delete(cfh, key); + }); } - shared->Delete(rand_column_family, rand_key, false /* pending */); - thread->stats.AddDeletes(1); + if (!s.ok()) { - if (FLAGS_injest_error_severity >= 2) { + if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { is_db_stopped_ = true; } else if (!is_db_stopped_ || s.severity() < Status::Severity::kFatalError) { fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } else { fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } + pending_expected_value.Commit(); + thread->stats.AddDeletes(1); } else { - shared->SingleDelete(rand_column_family, rand_key, true /* pending */); + PendingExpectedValue pending_expected_value = + shared->PrepareSingleDelete(rand_column_family, rand_key); if (!FLAGS_use_txn) { if (FLAGS_user_timestamp_size == 0) { s = db_->SingleDelete(write_opts, cfh, key); @@ -950,34 +1394,28 @@ class NonBatchedOpsStressTest : public StressTest { s = db_->SingleDelete(write_opts, cfh, key, write_ts); } } else { -#ifndef ROCKSDB_LITE - Transaction* txn; - s = NewTxn(write_opts, &txn); - if (s.ok()) { - s = 
txn->SingleDelete(cfh, key); - if (s.ok()) { - s = CommitTxn(txn, thread); - } - } -#endif + s = ExecuteTransaction(write_opts, thread, [&](Transaction& txn) { + return txn.SingleDelete(cfh, key); + }); } - shared->SingleDelete(rand_column_family, rand_key, false /* pending */); - thread->stats.AddSingleDeletes(1); + if (!s.ok()) { - if (FLAGS_injest_error_severity >= 2) { + if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { is_db_stopped_ = true; } else if (!is_db_stopped_ || s.severity() < Status::Severity::kFatalError) { fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } else { fprintf(stderr, "single delete error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } + pending_expected_value.Commit(); + thread->stats.AddSingleDeletes(1); } return s; } @@ -1006,10 +1444,10 @@ class NonBatchedOpsStressTest : public StressTest { shared->GetMutexForKey(rand_column_family, rand_key + j))); } } - shared->DeleteRange(rand_column_family, rand_key, - rand_key + FLAGS_range_deletion_width, - true /* pending */); - + std::vector pending_expected_values = + shared->PrepareDeleteRange(rand_column_family, rand_key, + rand_key + FLAGS_range_deletion_width); + const int covered = static_cast(pending_expected_values.size()); std::string keystr = Key(rand_key); Slice key = keystr; auto cfh = column_families_[rand_column_family]; @@ -1026,39 +1464,28 @@ class NonBatchedOpsStressTest : public StressTest { s = db_->DeleteRange(write_opts, cfh, key, end_key); } if (!s.ok()) { - if (FLAGS_injest_error_severity >= 2) { + if (FLAGS_inject_error_severity >= 2) { if (!is_db_stopped_ && s.severity() >= Status::Severity::kFatalError) { is_db_stopped_ = true; } else if (!is_db_stopped_ || s.severity() < Status::Severity::kFatalError) { fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } else { fprintf(stderr, "delete range error: %s\n", s.ToString().c_str()); - std::terminate(); + thread->shared->SafeTerminate(); } } - int covered = shared->DeleteRange(rand_column_family, rand_key, - rand_key + FLAGS_range_deletion_width, - false /* pending */); + for (PendingExpectedValue& pending_expected_value : + pending_expected_values) { + pending_expected_value.Commit(); + } thread->stats.AddRangeDeletions(1); thread->stats.AddCoveredByRangeDeletions(covered); return s; } -#ifdef ROCKSDB_LITE - void TestIngestExternalFile( - ThreadState* /* thread */, - const std::vector& /* rand_column_families */, - const std::vector& /* rand_keys */) override { - assert(false); - fprintf(stderr, - "RocksDB lite does not support " - "TestIngestExternalFile\n"); - std::terminate(); - } -#else void TestIngestExternalFile(ThreadState* thread, const std::vector& rand_column_families, const std::vector& rand_keys) override { @@ -1083,6 +1510,8 @@ class NonBatchedOpsStressTest : public StressTest { keys.reserve(FLAGS_ingest_external_file_width); std::vector values; values.reserve(FLAGS_ingest_external_file_width); + std::vector pending_expected_values; + pending_expected_values.reserve(FLAGS_ingest_external_file_width); SharedState* shared = thread->shared; assert(FLAGS_nooverwritepercent < 100); @@ -1097,20 +1526,30 @@ class NonBatchedOpsStressTest : public StressTest { new MutexLock(shared->GetMutexForKey(column_family, key))); } if (!shared->AllowsOverwrite(key)) { - // We could alternatively include 
`key` on the condition its current - // value is `DELETION_SENTINEL`. + // We could alternatively include `key` that is deleted. continue; } keys.push_back(key); - uint32_t value_base = thread->rand.Next() % shared->UNKNOWN_SENTINEL; + PendingExpectedValue pending_expected_value = + shared->PreparePut(column_family, key); + const uint32_t value_base = pending_expected_value.GetFinalValueBase(); values.push_back(value_base); - shared->Put(column_family, key, value_base, true /* pending */); + pending_expected_values.push_back(pending_expected_value); char value[100]; - size_t value_len = GenerateValue(value_base, value, sizeof(value)); auto key_str = Key(key); - s = sst_file_writer.Put(Slice(key_str), Slice(value, value_len)); + const size_t value_len = GenerateValue(value_base, value, sizeof(value)); + const Slice k(key_str); + const Slice v(value, value_len); + + if (FLAGS_use_put_entity_one_in > 0 && + (value_base % FLAGS_use_put_entity_one_in) == 0) { + WideColumns columns = GenerateWideColumns(value_base, v); + s = sst_file_writer.PutEntity(k, columns); + } else { + s = sst_file_writer.Put(k, v); + } } if (s.ok() && keys.empty()) { @@ -1125,14 +1564,18 @@ class NonBatchedOpsStressTest : public StressTest { {sst_filename}, IngestExternalFileOptions()); } if (!s.ok()) { - fprintf(stderr, "file ingestion error: %s\n", s.ToString().c_str()); - std::terminate(); - } - for (size_t i = 0; i < keys.size(); ++i) { - shared->Put(column_family, keys[i], values[i], false /* pending */); + if (!s.IsIOError() || !std::strstr(s.getState(), "injected")) { + fprintf(stderr, "file ingestion error: %s\n", s.ToString().c_str()); + thread->shared->SafeTerminate(); + } else { + fprintf(stdout, "file ingestion error: %s\n", s.ToString().c_str()); + } + } else { + for (size_t i = 0; i < pending_expected_values.size(); ++i) { + pending_expected_values[i].Commit(); + } } } -#endif // ROCKSDB_LITE // Given a key K, this creates an iterator which scans the range // [K, K + FLAGS_num_iterations) forward and backward. @@ -1159,14 +1602,19 @@ class NonBatchedOpsStressTest : public StressTest { const int64_t ub = lb + num_iter; - // Lock the whole range over which we might iterate to ensure it doesn't - // change under us. 
const int rand_column_family = rand_column_families[0]; - std::vector> range_locks = - shared->GetLocksForKeyRange(rand_column_family, lb, ub); + + // Testing parallel read and write to the same key with user timestamp + // is not currently supported + std::vector> range_locks; + if (FLAGS_user_timestamp_size > 0) { + range_locks = shared->GetLocksForKeyRange(rand_column_family, lb, ub); + } ReadOptions ro(read_opts); - ro.total_order_seek = true; + if (FLAGS_prefix_size > 0) { + ro.total_order_seek = true; + } std::string read_ts_str; Slice read_ts; @@ -1190,7 +1638,22 @@ class NonBatchedOpsStressTest : public StressTest { ColumnFamilyHandle* const cfh = column_families_[rand_column_family]; assert(cfh); + const std::size_t expected_values_size = static_cast(ub - lb); + std::vector pre_read_expected_values; + std::vector post_read_expected_values; + + for (int64_t i = 0; i < static_cast(expected_values_size); ++i) { + pre_read_expected_values.push_back( + shared->Get(rand_column_family, i + lb)); + } std::unique_ptr iter(db_->NewIterator(ro, cfh)); + for (int64_t i = 0; i < static_cast(expected_values_size); ++i) { + post_read_expected_values.push_back( + shared->Get(rand_column_family, i + lb)); + } + + assert(pre_read_expected_values.size() == expected_values_size && + pre_read_expected_values.size() == post_read_expected_values.size()); std::string op_logs; @@ -1198,17 +1661,15 @@ class NonBatchedOpsStressTest : public StressTest { assert(iter); assert(iter->Valid()); - const WideColumns expected_columns = GenerateExpectedWideColumns( - GetValueBase(iter->value()), iter->value()); - if (iter->columns() != expected_columns) { + if (!VerifyWideColumns(iter->value(), iter->columns())) { shared->SetVerificationFailure(); fprintf(stderr, "Verification failed for key %s: " - "Value and columns inconsistent: %s\n", + "Value and columns inconsistent: value: %s, columns: %s\n", Slice(iter->key()).ToString(/* hex */ true).c_str(), - DebugString(iter->value(), iter->columns(), expected_columns) - .c_str()); + iter->value().ToString(/* hex */ true).c_str(), + WideColumnsToHex(iter->columns()).c_str()); fprintf(stderr, "Column family: %s, op_logs: %s\n", cfh->GetName().c_str(), op_logs.c_str()); @@ -1221,20 +1682,29 @@ class NonBatchedOpsStressTest : public StressTest { }; auto check_no_key_in_range = [&](int64_t start, int64_t end) { + assert(start <= end); for (auto j = std::max(start, lb); j < std::min(end, ub); ++j) { - auto expected_value = - shared->Get(rand_column_family, static_cast(j)); - if (expected_value != shared->DELETION_SENTINEL && - expected_value != shared->UNKNOWN_SENTINEL) { + std::size_t index = static_cast(j - lb); + assert(index < pre_read_expected_values.size() && + index < post_read_expected_values.size()); + const ExpectedValue pre_read_expected_value = + pre_read_expected_values[index]; + const ExpectedValue post_read_expected_value = + post_read_expected_values[index]; + if (ExpectedValueHelper::MustHaveExisted(pre_read_expected_value, + post_read_expected_value)) { // Fail fast to preserve the DB state. thread->shared->SetVerificationFailure(); if (iter->Valid()) { fprintf(stderr, - "Expected state has key %s, iterator is at key %s\n", + "Verification failed. Expected state has key %s, iterator " + "is at key %s\n", Slice(Key(j)).ToString(true).c_str(), iter->key().ToString(true).c_str()); } else { - fprintf(stderr, "Expected state has key %s, iterator is invalid\n", + fprintf(stderr, + "Verification failed. 
Expected state has key %s, iterator " + "is invalid\n", Slice(Key(j)).ToString(true).c_str()); } fprintf(stderr, "Column family: %s, op_logs: %s\n", @@ -1258,6 +1728,7 @@ class NonBatchedOpsStressTest : public StressTest { uint64_t curr = 0; while (true) { + assert(last_key < ub); if (!iter->Valid()) { if (!iter->status().ok()) { thread->shared->SetVerificationFailure(); @@ -1280,6 +1751,19 @@ class NonBatchedOpsStressTest : public StressTest { // iter is valid, the range (last_key, current key) was skipped GetIntVal(iter->key().ToString(), &curr); + if (static_cast(curr) <= last_key) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "TestIterateAgainstExpected failed: found unexpectedly small " + "key\n"); + fprintf(stderr, "Column family: %s, op_logs: %s\n", + cfh->GetName().c_str(), op_logs.c_str()); + fprintf(stderr, "Last op found key: %s, expected at least: %s\n", + Slice(Key(curr)).ToString(true).c_str(), + Slice(Key(last_key + 1)).ToString(true).c_str()); + thread->stats.AddErrors(1); + return Status::OK(); + } if (!check_no_key_in_range(last_key + 1, static_cast(curr))) { return Status::OK(); } @@ -1302,6 +1786,7 @@ class NonBatchedOpsStressTest : public StressTest { last_key = ub; while (true) { + assert(lb < last_key); if (!iter->Valid()) { if (!iter->status().ok()) { thread->shared->SetVerificationFailure(); @@ -1324,6 +1809,19 @@ class NonBatchedOpsStressTest : public StressTest { // the range (current key, last key) was skipped GetIntVal(iter->key().ToString(), &curr); + if (last_key <= static_cast(curr)) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "TestIterateAgainstExpected failed: found unexpectedly large " + "key\n"); + fprintf(stderr, "Column family: %s, op_logs: %s\n", + cfh->GetName().c_str(), op_logs.c_str()); + fprintf(stderr, "Last op found key: %s, expected at most: %s\n", + Slice(Key(curr)).ToString(true).c_str(), + Slice(Key(last_key - 1)).ToString(true).c_str()); + thread->stats.AddErrors(1); + return Status::OK(); + } if (!check_no_key_in_range(static_cast(curr + 1), last_key)) { return Status::OK(); } @@ -1338,10 +1836,28 @@ class NonBatchedOpsStressTest : public StressTest { op_logs += "P"; } - if (thread->rand.OneIn(2)) { + // Write-prepared and Write-unprepared do not support Refresh() yet. + if (!(FLAGS_use_txn && FLAGS_txn_write_policy != 0) && + thread->rand.OneIn(2)) { + pre_read_expected_values.clear(); + post_read_expected_values.clear(); // Refresh after forward/backward scan to allow higher chance of SV - // change. It is safe to refresh since the testing key range is locked. - iter->Refresh(); + // change. 
+ for (int64_t i = 0; i < static_cast(expected_values_size); ++i) { + pre_read_expected_values.push_back( + shared->Get(rand_column_family, i + lb)); + } + Status rs = iter->Refresh(); + assert(rs.ok()); + op_logs += "Refresh "; + for (int64_t i = 0; i < static_cast(expected_values_size); ++i) { + post_read_expected_values.push_back( + shared->Get(rand_column_family, i + lb)); + } + + assert(pre_read_expected_values.size() == expected_values_size && + pre_read_expected_values.size() == + post_read_expected_values.size()); } // start from middle of [lb, ub) otherwise it is easy to iterate out of @@ -1358,6 +1874,21 @@ class NonBatchedOpsStressTest : public StressTest { if (!check_no_key_in_range(mid, ub)) { return Status::OK(); } + } else if (iter->Valid()) { + GetIntVal(iter->key().ToString(), &curr); + if (static_cast(curr) < mid) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "TestIterateAgainstExpected failed: found unexpectedly small " + "key\n"); + fprintf(stderr, "Column family: %s, op_logs: %s\n", + cfh->GetName().c_str(), op_logs.c_str()); + fprintf(stderr, "Last op found key: %s, expected at least: %s\n", + Slice(Key(curr)).ToString(true).c_str(), + Slice(Key(mid)).ToString(true).c_str()); + thread->stats.AddErrors(1); + return Status::OK(); + } } } else { iter->SeekForPrev(key); @@ -1367,6 +1898,21 @@ class NonBatchedOpsStressTest : public StressTest { if (!check_no_key_in_range(lb, mid + 1)) { return Status::OK(); } + } else if (iter->Valid()) { + GetIntVal(iter->key().ToString(), &curr); + if (mid < static_cast(curr)) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "TestIterateAgainstExpected failed: found unexpectedly large " + "key\n"); + fprintf(stderr, "Column family: %s, op_logs: %s\n", + cfh->GetName().c_str(), op_logs.c_str()); + fprintf(stderr, "Last op found key: %s, expected at most: %s\n", + Slice(Key(curr)).ToString(true).c_str(), + Slice(Key(mid)).ToString(true).c_str()); + thread->stats.AddErrors(1); + return Status::OK(); + } } } @@ -1383,12 +1929,24 @@ class NonBatchedOpsStressTest : public StressTest { iter->Prev(); op_logs += "P"; } else { - const uint32_t expected_value = - shared->Get(rand_column_family, static_cast(curr)); - if (expected_value == shared->DELETION_SENTINEL) { + const uint32_t value_base_from_db = GetValueBase(iter->value()); + std::size_t index = static_cast(curr - lb); + assert(index < pre_read_expected_values.size() && + index < post_read_expected_values.size()); + const ExpectedValue pre_read_expected_value = + pre_read_expected_values[index]; + const ExpectedValue post_read_expected_value = + post_read_expected_values[index]; + if (ExpectedValueHelper::MustHaveNotExisted(pre_read_expected_value, + post_read_expected_value) || + !ExpectedValueHelper::InExpectedValueBaseRange( + value_base_from_db, pre_read_expected_value, + post_read_expected_value)) { // Fail fast to preserve the DB state. 
thread->shared->SetVerificationFailure(); - fprintf(stderr, "Iterator has key %s, but expected state does not.\n", + fprintf(stderr, + "Verification failed: iterator has key %s, but expected " + "state does not.\n", iter->key().ToString(true).c_str()); fprintf(stderr, "Column family: %s, op_logs: %s\n", cfh->GetName().c_str(), op_logs.c_str()); @@ -1404,6 +1962,19 @@ class NonBatchedOpsStressTest : public StressTest { } uint64_t next = 0; GetIntVal(iter->key().ToString(), &next); + if (next <= curr) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "TestIterateAgainstExpected failed: found unexpectedly " + "small key\n"); + fprintf(stderr, "Column family: %s, op_logs: %s\n", + cfh->GetName().c_str(), op_logs.c_str()); + fprintf(stderr, "Last op found key: %s, expected at least: %s\n", + Slice(Key(next)).ToString(true).c_str(), + Slice(Key(curr + 1)).ToString(true).c_str()); + thread->stats.AddErrors(1); + return Status::OK(); + } if (!check_no_key_in_range(static_cast(curr + 1), static_cast(next))) { return Status::OK(); @@ -1416,6 +1987,19 @@ class NonBatchedOpsStressTest : public StressTest { } uint64_t prev = 0; GetIntVal(iter->key().ToString(), &prev); + if (curr <= prev) { + thread->shared->SetVerificationFailure(); + fprintf(stderr, + "TestIterateAgainstExpected failed: found unexpectedly " + "large key\n"); + fprintf(stderr, "Column family: %s, op_logs: %s\n", + cfh->GetName().c_str(), op_logs.c_str()); + fprintf(stderr, "Last op found key: %s, expected at most: %s\n", + Slice(Key(prev)).ToString(true).c_str(), + Slice(Key(curr - 1)).ToString(true).c_str()); + thread->stats.AddErrors(1); + return Status::OK(); + } if (!check_no_key_in_range(static_cast(prev + 1), static_cast(curr))) { return Status::OK(); @@ -1441,59 +2025,80 @@ class NonBatchedOpsStressTest : public StressTest { bool VerifyOrSyncValue(int cf, int64_t key, const ReadOptions& /*opts*/, SharedState* shared, const std::string& value_from_db, - const Status& s, bool strict = false) const { + std::string msg_prefix, const Status& s) const { if (shared->HasVerificationFailedYet()) { return false; } - // compare value_from_db with the value in the shared state - uint32_t value_base = shared->Get(cf, key); - if (value_base == SharedState::UNKNOWN_SENTINEL) { + const ExpectedValue expected_value = shared->Get(cf, key); + + if (expected_value.PendingWrite() || expected_value.PendingDelete()) { if (s.ok()) { // Value exists in db, update state to reflect that Slice slice(value_from_db); - value_base = GetValueBase(slice); - shared->Put(cf, key, value_base, false); + uint32_t value_base = GetValueBase(slice); + shared->SyncPut(cf, key, value_base); } else if (s.IsNotFound()) { // Value doesn't exist in db, update state to reflect that - shared->SingleDelete(cf, key, false); + shared->SyncDelete(cf, key); } return true; } - if (value_base == SharedState::DELETION_SENTINEL && !strict) { - return true; - } + // compare value_from_db with the value in the shared state if (s.ok()) { - char value[kValueMaxLen]; - if (value_base == SharedState::DELETION_SENTINEL) { - VerificationAbort(shared, "Unexpected value found", cf, key, - value_from_db, ""); + const Slice slice(value_from_db); + const uint32_t value_base_from_db = GetValueBase(slice); + if (ExpectedValueHelper::MustHaveNotExisted(expected_value, + expected_value)) { + VerificationAbort(shared, msg_prefix + ": Unexpected value found", cf, + key, value_from_db, ""); return false; } - size_t sz = GenerateValue(value_base, value, sizeof(value)); - if 
(value_from_db.length() != sz) { - VerificationAbort(shared, "Length of value read is not equal", cf, key, - value_from_db, Slice(value, sz)); + char expected_value_data[kValueMaxLen]; + size_t expected_value_data_size = + GenerateValue(expected_value.GetValueBase(), expected_value_data, + sizeof(expected_value_data)); + if (!ExpectedValueHelper::InExpectedValueBaseRange( + value_base_from_db, expected_value, expected_value)) { + VerificationAbort(shared, msg_prefix + ": Unexpected value found", cf, + key, value_from_db, + Slice(expected_value_data, expected_value_data_size)); return false; } - if (memcmp(value_from_db.data(), value, sz) != 0) { - VerificationAbort(shared, "Contents of value read don't match", cf, key, - value_from_db, Slice(value, sz)); + // TODO: are the length/memcmp() checks repetitive? + if (value_from_db.length() != expected_value_data_size) { + VerificationAbort(shared, + msg_prefix + ": Length of value read is not equal", + cf, key, value_from_db, + Slice(expected_value_data, expected_value_data_size)); return false; } - } else { - if (value_base != SharedState::DELETION_SENTINEL) { - char value[kValueMaxLen]; - size_t sz = GenerateValue(value_base, value, sizeof(value)); - VerificationAbort(shared, "Value not found: " + s.ToString(), cf, key, - "", Slice(value, sz)); + if (memcmp(value_from_db.data(), expected_value_data, + expected_value_data_size) != 0) { + VerificationAbort(shared, + msg_prefix + ": Contents of value read don't match", + cf, key, value_from_db, + Slice(expected_value_data, expected_value_data_size)); return false; } + } else if (s.IsNotFound()) { + if (ExpectedValueHelper::MustHaveExisted(expected_value, + expected_value)) { + char expected_value_data[kValueMaxLen]; + size_t expected_value_data_size = + GenerateValue(expected_value.GetValueBase(), expected_value_data, + sizeof(expected_value_data)); + VerificationAbort( + shared, msg_prefix + ": Value not found: " + s.ToString(), cf, key, + "", Slice(expected_value_data, expected_value_data_size)); + return false; + } + } else { + assert(false); } return true; } -#ifndef ROCKSDB_LITE void PrepareTxnDbOptions(SharedState* shared, TransactionDBOptions& txn_db_opts) override { txn_db_opts.rollback_deletion_type_callback = @@ -1506,7 +2111,6 @@ class NonBatchedOpsStressTest : public StressTest { return !shared->AllowsOverwrite(key_num); }; } -#endif // ROCKSDB_LITE }; StressTest* CreateNonBatchedOpsStressTest() { diff --git a/docs/Gemfile b/docs/Gemfile index d0602ba2b222..dfb1cfdd4784 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -1,4 +1,4 @@ source 'https://rubygems.org' -gem 'github-pages', '~> 225' +gem 'github-pages', '~> 227' gem "webrick", "~> 1.7" diff --git a/docs/Gemfile.lock b/docs/Gemfile.lock deleted file mode 100644 index a30377aeb406..000000000000 --- a/docs/Gemfile.lock +++ /dev/null @@ -1,285 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - activesupport (6.0.4.6) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) - coffee-script (2.4.1) - coffee-script-source - execjs - coffee-script-source (1.11.1) - colorator (1.1.0) - commonmarker (0.23.6) - concurrent-ruby (1.1.9) - dnsruby (1.61.9) - simpleidn (~> 0.1) - em-websocket (0.5.3) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0) - ethon (0.15.0) - ffi (>= 1.15.0) - eventmachine (1.2.7) - execjs (2.8.1) - faraday (1.10.0) - faraday-em_http (~> 1.0) - faraday-em_synchrony (~> 1.0) - 
faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) - faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron (~> 1.0) - faraday-rack (~> 1.0) - faraday-retry (~> 1.0) - ruby2_keywords (>= 0.0.4) - faraday-em_http (1.0.0) - faraday-em_synchrony (1.0.0) - faraday-excon (1.1.0) - faraday-httpclient (1.0.1) - faraday-multipart (1.0.3) - multipart-post (>= 1.2, < 3) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) - faraday-rack (1.0.0) - faraday-retry (1.0.3) - ffi (1.15.5) - forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (225) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.0) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.2.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) - jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) - jekyll-include-cache (= 0.2.1) - jekyll-mentions (= 1.6.0) - jekyll-optional-front-matter (= 0.3.2) - jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.3.0) - jekyll-redirect-from (= 0.16.0) - jekyll-relative-links (= 0.6.1) - jekyll-remote-theme (= 0.4.3) - jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.8.0) - jekyll-sitemap (= 1.4.0) - jekyll-swiss (= 1.0.0) - jekyll-theme-architect (= 0.2.0) - jekyll-theme-cayman (= 0.2.0) - jekyll-theme-dinky (= 0.2.0) - jekyll-theme-hacker (= 0.2.0) - jekyll-theme-leap-day (= 0.2.0) - jekyll-theme-merlot (= 0.2.0) - jekyll-theme-midnight (= 0.2.0) - jekyll-theme-minimal (= 0.2.0) - jekyll-theme-modernist (= 0.2.0) - jekyll-theme-primer (= 0.6.0) - jekyll-theme-slate (= 0.2.0) - jekyll-theme-tactile (= 0.2.0) - jekyll-theme-time-machine (= 0.2.0) - jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.1) - kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) - mercenary (~> 0.3) - minima (= 2.5.1) - nokogiri (>= 1.12.5, < 2.0) - rouge (= 3.26.0) - terminal-table (~> 1.4) - github-pages-health-check (1.17.9) - addressable (~> 2.3) - dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) - typhoeus (~> 1.3) - html-pipeline (2.14.0) - activesupport (>= 2) - nokogiri (>= 1.4) - http_parser.rb (0.8.0) - i18n (0.9.5) - concurrent-ruby (~> 1.0) - jekyll (3.9.0) - addressable (~> 2.4) - colorator (~> 1.0) - em-websocket (~> 0.5) - i18n (~> 0.7) - jekyll-sass-converter (~> 1.0) - jekyll-watch (~> 2.0) - kramdown (>= 1.17, < 3) - liquid (~> 4.0) - mercenary (~> 0.3.3) - pathutil (~> 0.9) - rouge (>= 1.7, < 4) - safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) - jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) - coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.4.0) - commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.2.0) - commonmarker (~> 0.23.4) - jekyll (~> 3.9.0) - jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) - jekyll (>= 3.7, < 5.0) - jekyll-gist (1.5.0) - octokit (~> 4.2) - jekyll-github-metadata (2.13.0) - jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) - jekyll-include-cache (0.2.1) - jekyll (>= 3.7, < 5.0) - jekyll-mentions (1.6.0) - html-pipeline (~> 2.3) - jekyll (>= 3.7, < 5.0) - jekyll-optional-front-matter (0.3.2) - jekyll (>= 3.0, < 5.0) - jekyll-paginate (1.1.0) - jekyll-readme-index (0.3.0) - jekyll (>= 3.0, < 5.0) - jekyll-redirect-from (0.16.0) - jekyll (>= 3.3, < 5.0) - jekyll-relative-links (0.6.1) - jekyll (>= 3.3, < 5.0) - jekyll-remote-theme (0.4.3) - addressable (~> 2.0) - jekyll 
(>= 3.5, < 5.0) - jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) - rubyzip (>= 1.3.0, < 3.0) - jekyll-sass-converter (1.5.2) - sass (~> 3.4) - jekyll-seo-tag (2.8.0) - jekyll (>= 3.8, < 5.0) - jekyll-sitemap (1.4.0) - jekyll (>= 3.7, < 5.0) - jekyll-swiss (1.0.0) - jekyll-theme-architect (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-cayman (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-dinky (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-leap-day (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-merlot (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-midnight (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-minimal (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-modernist (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.6.0) - jekyll (> 3.5, < 5.0) - jekyll-github-metadata (~> 2.9) - jekyll-seo-tag (~> 2.0) - jekyll-theme-slate (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-tactile (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-theme-time-machine (0.2.0) - jekyll (> 3.5, < 5.0) - jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.3) - jekyll (>= 3.3, < 5.0) - jekyll-watch (2.2.1) - listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) - html-pipeline (~> 2.2) - jekyll (>= 3.0, < 5.0) - kramdown (2.3.1) - rexml - kramdown-parser-gfm (1.1.0) - kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.1) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - mercenary (0.3.6) - mini_portile2 (2.8.0) - minima (2.5.1) - jekyll (>= 3.5, < 5.0) - jekyll-feed (~> 0.9) - jekyll-seo-tag (~> 2.1) - minitest (5.15.0) - multipart-post (2.1.1) - nokogiri (1.13.10) - mini_portile2 (~> 2.8.0) - racc (~> 1.4) - octokit (4.22.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.2) - forwardable-extended (~> 2.6) - public_suffix (4.0.6) - racc (1.6.1) - rb-fsevent (0.11.1) - rb-inotify (0.10.1) - ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby2_keywords (0.0.5) - rubyzip (2.3.2) - safe_yaml (1.0.5) - sass (3.7.4) - sass-listen (~> 4.0.0) - sass-listen (4.0.0) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) - addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) - terminal-table (1.8.0) - unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (1.2.10) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.8) - unicode-display_width (1.8.0) - webrick (1.7.0) - zeitwerk (2.5.4) - -PLATFORMS - ruby - -DEPENDENCIES - github-pages (~> 225) - webrick (~> 1.7) - -BUNDLED WITH - 2.2.3 diff --git a/docs/_includes/footer.html b/docs/_includes/footer.html index f560172d19fa..f5b78babd360 100644 --- a/docs/_includes/footer.html +++ b/docs/_includes/footer.html @@ -13,7 +13,7 @@

      Meta Open Source
diff --git a/docs/_posts/2023-11-06-java-jni-benchmarks.markdown b/docs/_posts/2023-11-06-java-jni-benchmarks.markdown
new file mode 100644
index 000000000000..2cf5c83620a3
--- /dev/null
+++ b/docs/_posts/2023-11-06-java-jni-benchmarks.markdown
@@ -0,0 +1,287 @@
+---
+title: Java API Performance Improvements
+layout: post
+author: alanpaxton
+category: blog
+---
+# RocksDB Java API Performance Improvements
+
+Evolved Binary has been working on several aspects of how the Java API to RocksDB can be improved. Two aspects of this which are of particular importance are performance and the developer experience.
+
+* We have built some synthetic benchmark code to determine which are the most efficient methods of transferring data between Java and C++.
+* We have used the results of the synthetic benchmarking to guide plans for rationalising the API interfaces.
+* We have made some opportunistic performance optimizations/fixes within the Java API which have already yielded noticeable improvements.
+
+## Synthetic JNI API Performance Benchmarks
+
+The synthetic benchmark repository contains tests designed to isolate the Java to/from C++ interaction of a canonical data-intensive Key/Value Store implemented in C++ with a Java (JNI) API layered on top.
+
+JNI provides several mechanisms for transferring data between Java buffers and C++ buffers. These mechanisms are not trivial, because they require the JNI system to ensure that Java memory under the control of the JVM is not moved or garbage collected whilst it is being accessed outside the direct control of the JVM.
+
+We set out to determine which of the multiple options for transferring data from `C++` to `Java` and vice versa were the most efficient. We used the [Java Microbenchmark Harness](https://github.com/openjdk/jmh) to set up repeatable benchmarks to measure all the options.
+
+We explore these and some other potential mechanisms in the detailed results (in our [Synthetic JNI performance repository](https://github.com/evolvedbinary/jni-benchmarks/blob/main/DataBenchmarks.md)). We summarise this work here.
+
+### The Model
+
+* In `C++` we represent the on-disk data as an in-memory map of `(key, value)` pairs.
+* For a fetch query, we expect the result to be a Java object with access to the contents of the _value_. This may be a standard Java object which does the job of data access (a `byte[]` or a `ByteBuffer`) or an object of our own devising which holds references to the value in some form (a `FastBuffer` pointing to `com.sun.unsafe.Unsafe` unsafe memory, for instance).
+
+### Data Types
+
+There are several potential data types for holding data for transfer, and they are unsurprisingly quite connected underneath.
+
+#### Byte Array
+
+The simplest data container is a _raw_ array of bytes (`byte[]`). There are 3 different mechanisms for transferring data between a `byte[]` and C++:
+
+* At the C++ side, the method [`JNIEnv.GetPrimitiveArrayCritical()`](https://docs.oracle.com/en/java/javase/13/docs/specs/jni/functions.html#getprimitivearraycritical) allows access to a C++ pointer to the underlying array.
+* The `JNIEnv` methods `GetByteArrayElements()` and `ReleaseByteArrayElements()` fetch references/copies to and from the contents of a byte array, with less concern for critical sections than the _critical_ methods, though they are consequently more likely/certain to result in (extra) copies.
+* The `JNIEnv` methods `GetByteArrayRegion()` and `SetByteArrayRegion()` transfer raw C++ buffer data to and from the contents of a byte array. These must ultimately do some data pinning for the duration of the copies; the mechanisms may be similar or different to the _critical_ operations, and therefore performance may differ.
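As a rough illustration of how these mechanisms differ on the C++ side, a native `get()` that copies a value into a caller-supplied `byte[]` might be sketched as follows. This is not the benchmark code itself; the class and method names are illustrative, and the value is a stand-in for data held by the C++ store.

```cpp
#include <jni.h>

#include <cstring>
#include <string>

// Stand-in for the value held by the C++ store; in the benchmarks it would
// come from the in-memory (key, value) map.
static const std::string kValue(4096, 'v');

// SetRegion style: JNI copies from the native buffer into the Java array,
// pinning (or copying) the array only for the duration of the call.
// Assumes the supplied array is at least kValue.size() bytes long.
extern "C" JNIEXPORT jint JNICALL
Java_org_example_GetBenchmark_getViaSetRegion(JNIEnv* env, jclass,
                                              jbyteArray target) {
  const jsize len = static_cast<jsize>(kValue.size());
  env->SetByteArrayRegion(target, 0, len,
                          reinterpret_cast<const jbyte*>(kValue.data()));
  return len;
}

// GetCritical style: obtain a raw pointer to the array contents, memcpy into
// it, and release the critical section as soon as possible.
extern "C" JNIEXPORT jint JNICALL
Java_org_example_GetBenchmark_getViaCritical(JNIEnv* env, jclass,
                                             jbyteArray target) {
  const jsize len = static_cast<jsize>(kValue.size());
  void* raw = env->GetPrimitiveArrayCritical(target, nullptr);
  if (raw == nullptr) {
    return -1;  // pinning failed
  }
  std::memcpy(raw, kValue.data(), static_cast<size_t>(len));
  env->ReleasePrimitiveArrayCritical(target, raw, 0);
  return len;
}
```

The Java side simply declares these as `native` methods and passes in a pre-allocated `byte[]` large enough for the value.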
+* The `JNIEnv` methods `GetByteArrayRegion()` and `SetByteArrayRegion()` + transfer raw C++ buffer data to and from the contents of a byte array. These + must ultimately do some data pinning for the duration of copies; the + mechanisms may be similar or different to the _critical_ operations, and + therefore performance may differ. + +#### Byte Buffer + +A `ByteBuffer` abstracts the contents of a collection of bytes, and was in fact +introduced to support a range of higher-performance I/O operations in some +circumstances. + +There are 2 types of byte buffers in Java: _indirect_ and _direct_. Indirect +byte buffers are the standard, and the memory they use is on-heap as with all +usual Java objects. In contrast, direct byte buffers are used to wrap off-heap +memory which is accessible to direct network I/O. Either type of `ByteBuffer` +can be allocated at the Java side, using the `allocate()` and `allocateDirect()` +methods respectively. + +Direct byte buffers can be created in C++ using the JNI method +[`JNIEnv.NewDirectByteBuffer()`](https://docs.oracle.com/en/java/javase/13/docs/specs/jni/functions.html#newdirectbytebuffer) +to wrap some native (C++) memory. + +Direct byte buffers can be accessed in C++ using the +[`JNIEnv.GetDirectBufferAddress()`](https://docs.oracle.com/en/java/javase/13/docs/specs/jni/functions.html#GetDirectBufferAddress) +and measured using +[`JNIEnv.GetDirectBufferCapacity()`](https://docs.oracle.com/en/java/javase/13/docs/specs/jni/functions.html#GetDirectBufferCapacity). + +#### Unsafe Memory + +The call `sun.misc.Unsafe.allocateMemory()` returns a handle which is (of course) just a pointer to raw memory, and +can be used as such on the C++ side. We could turn it into a byte buffer on the +C++ side by calling `JNIEnv.NewDirectByteBuffer()`, or simply use it as a native +C++ buffer at the expected address, assuming we record or remember how much +space was allocated. + +A custom `FastBuffer` class provides access to unsafe memory from the Java side. + + +#### Allocation + +For these benchmarks, allocation has been excluded from the benchmark costs by +pre-allocating a quantity of buffers of the appropriate kind as part of the test +setup. Each run of the benchmark acquires an existing buffer from a pre-allocated +FIFO list, and returns it afterwards. A small test has +confirmed that the request and return cycle is of insignificant cost compared to +the benchmark API call. + +### GetJNIBenchmark Performance + +Benchmarks ran for a duration of the order of 6 hours on an otherwise unloaded VM; + the error bars are small, so we can have strong confidence in the values + derived and plotted. + +![Raw JNI Get large](/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolbig.png) + +Comparing all the benchmarks as the data size tends large, the conclusions we +can draw are: + +- Indirect byte buffers add cost; they are effectively an overhead on plain + `byte[]` and the JNI side only allows them to be accessed via their + encapsulated `byte[]`. +- `SetRegion` and `GetCritical` mechanisms for copying data into a `byte[]` are + of very comparable performance; presumably the behaviour behind the scenes of + `SetRegion` is very similar to that of declaring a critical region, doing a + `memcpy()` and releasing the critical region. +- `GetElements` methods for transferring data from C++ to Java are consistently + less efficient than `SetRegion` and `GetCritical`.
+- Getting into a raw memory buffer, passed as an address (the `handle` of an + `Unsafe` or of a netty `ByteBuf`) is of similar cost to the more efficient + `byte[]` operations. +- Getting into a direct `nio.ByteBuffer` is of similar cost again; while the + ByteBuffer is passed over JNI as an ordinary Java object, JNI has a specific + method for getting hold of the address of the direct buffer, and using this, the + `get()` cost with a ByteBuffer is just that of the underlying C++ `memcpy()`. + +At small(er) data sizes, we can see whether other factors are important. + +![Raw JNI Get small](/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolsmall.png) + +- Indirect byte buffers are the most significant overhead here. Again, we can + conclude that this is due to pure overhead compared to `byte[]` operations. +- At the lowest data sizes, netty `ByteBuf`s and unsafe memory are marginally + more efficient than `byte[]`s or (slightly less efficient) direct + `nio.ByteBuffer`s. This may be explained by even the small cost of + calling the JNI method on the C++ side simply to acquire a + direct buffer address. The margins (nanoseconds) here are extremely small. + +#### Post-processing the results + +Our benchmark model for post-processing is to transfer the results into a +`byte[]`. Where the result is already a `byte[]` this may seem like an unfair +extra cost, but the aim is to model the least-cost processing step for any kind +of result. + +- Copying into a `byte[]` using the bulk methods supported by `byte[]` and + `nio.ByteBuffer` has comparable performance. +- Accessing the contents of an `Unsafe` buffer using the supplied unsafe methods + is inefficient. The access is word by + word, in Java. +- Accessing the contents of a netty `ByteBuf` is similarly inefficient; again + the access is presumably word by word, using normal + Java mechanisms. + +![Copy out JNI Get](/static/images/jni-get-benchmarks/fig_1024_1_copyout_nopoolbig.png) + +### PutJNIBenchmark + +We benchmarked `Put` methods in a similar synthetic fashion in less depth, but enough to confirm that the performance profile is similar/symmetrical. As with `get()`, using `GetElements` is the least performant way of implementing transfers to/from Java objects in C++/JNI, and other JNI mechanisms do not differ greatly one from another. + +## Lessons from Synthetic API + +Performance analysis shows that for `get()`, fetching into an allocated `byte[]` is +as efficient as any other mechanism, as long as JNI region methods are used +for the internal data transfer. Copying out or otherwise using the +result on the Java side is straightforward and efficient. Using `byte[]` avoids the manual memory +management required with direct `nio.ByteBuffer`s, and that extra work does not +appear to provide any gain. A C++ implementation using the `GetRegion` JNI +method is probably to be preferred to using `GetCritical` because while their +performance is equal, `GetRegion` is a higher-level/simpler abstraction. + +Vitally, whatever JNI transfer mechanism is chosen, the buffer allocation +mechanism and pattern are crucial to achieving good performance. We experimented +with making netty's pooled allocator part of the benchmark, and the +difference between `getIntoPooledNettyByteBuf`, which uses the allocator, and +`getIntoNettyByteBuf`, which uses the same pre-allocation on setup as every other +benchmark, is significant.
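To make the buffer-recycling point concrete, here is a minimal sketch of client-side buffer reuse, assuming the existing RocksJava `get(byte[] key, byte[] value)` variant which fills a caller-supplied buffer. The wrapper class, pool size and names below are illustrative assumptions for this post, not RocksDB code and not part of the benchmark suite.

```java
import java.util.ArrayDeque;
import java.util.function.ObjIntConsumer;

import org.rocksdb.RocksDB;
import org.rocksdb.RocksDBException;

// Hypothetical helper: value buffers are allocated once up front and recycled
// across get() calls, so the steady state allocates nothing per read.
public class RecyclingReader {
  // Plain FIFO pool; not thread-safe, which is fine for this single-threaded sketch.
  private final ArrayDeque<byte[]> pool = new ArrayDeque<>();
  private final int valueCapacity;

  public RecyclingReader(final int valueCapacity, final int poolSize) {
    this.valueCapacity = valueCapacity;
    for (int i = 0; i < poolSize; i++) {
      pool.addLast(new byte[valueCapacity]);
    }
  }

  /** Reads the value for {@code key} into a recycled buffer and hands it to {@code consumer}. */
  public boolean read(final RocksDB db, final byte[] key, final ObjIntConsumer<byte[]> consumer)
      throws RocksDBException {
    byte[] value = pool.pollFirst();
    if (value == null) {
      value = new byte[valueCapacity]; // pool exhausted; fall back to a fresh allocation
    }
    try {
      // Fills 'value' in place and returns the full value size, or
      // RocksDB.NOT_FOUND if the key is absent. A size larger than
      // value.length means the result was truncated and the caller
      // should retry with a bigger buffer.
      final int size = db.get(key, value);
      if (size == RocksDB.NOT_FOUND) {
        return false;
      }
      consumer.accept(value, Math.min(size, value.length));
      return true;
    } finally {
      pool.addLast(value); // recycle the buffer for the next call
    }
  }
}
```

The same idea applies to `put()` with caller-managed key and value buffers; the point is simply that the buffer lifecycle is owned by the client rather than by each individual API call.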
+ +Equally importantly, transfer of data to or from buffers should where possible +be done in bulk, using array copy or buffer copy mechanisms. Thought should +perhaps be given to supporting common transformations in the underlying C++ +layer. + +## API Recommendations + +Of course there is some noise within the results, but we can agree: + + * Don't make copies you don't need to make + * Don't allocate/deallocate when you can avoid it + +Translating this into designing an efficient API, we want to: + + * Support API methods that return results in buffers supplied by the client. + * Support `byte[]`-based APIs as the simplest way of getting data into a usable configuration for a broad range of Java use. + * Support direct `ByteBuffer`s as these can reduce copies when used as part of a chain of `ByteBuffer`-based operations. This sort of sophisticated streaming model is most likely to be used by clients where performance is important, and so we decide to support it. + * Support indirect `ByteBuffer`s for a combination of reasons: + * API consistency between direct and indirect buffers + * Simplicity of implementation, as we can wrap `byte[]`-oriented methods + * Continue to support methods which allocate return buffers per-call, as these are the easiest to use on initial encounter with the RocksDB API. + +High-performance Java interaction with RocksDB ultimately requires architectural decisions by the client: + * Use more complex (client-supplied buffer) API methods where performance matters + * Don't allocate/deallocate where you don't need to + * recycle your own buffers where this makes sense + * or make sure that you are supplying the ultimate destination buffer (your cache, or a target network buffer) as input to RocksDB `get()` and `put()` calls + +We are currently implementing a number of extra methods consistently across the Java fetch and store APIs to RocksDB in the PR [Java API consistency between RocksDB.put() , .merge() and Transaction.put() , .merge()](https://github.com/facebook/rocksdb/pull/11019) according to these principles. + +## Optimizations + +### Reduce Copies within API Implementation + +Having analysed JNI performance as described, we reviewed the core of RocksJNI for opportunities to improve the performance. We noticed one thing in particular: some of the `get()` methods of the Java API had not been updated to take advantage of the new [`PinnableSlice`](http://rocksdb.org/blog/2017/08/24/pinnableslice.html) methods. + +Fixing this turned out to be a straightforward change, which has now been incorporated in the codebase: [Improve Java API `get()` performance by reducing copies](https://github.com/facebook/rocksdb/pull/10970). + +#### Performance Results + +Using the JMH performance tests we updated as part of the above PR, we can see a small but consistent improvement in performance for all of the different get method variants which we have enhanced in the PR. + +```sh +java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar -p keyCount=1000,50000 -p keySize=128 -p valueSize=1024,16384 -p columnFamilyTestType="1_column_family" GetBenchmarks.get GetBenchmarks.preallocatedByteBufferGet GetBenchmarks.preallocatedGet +``` +The y-axis shows `ops/sec` in throughput, so higher is better.
+ +![](/static/images/jni-get-benchmarks/optimization-graph.png) + +### Analysis + +Before the invention of the `PinnableSlice`, the simplest RocksDB (native) API `Get()` looked like this: + +```cpp +Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) +``` + +After `PinnableSlice`, the correct way for new code to implement a `get()` is like this: + +```cpp +Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) +``` + +But of course RocksDB has to support legacy code, so there is an `inline` method in `db.h` which re-implements the former using the latter. +And the RocksJava API implementation seamlessly continues to use the `std::string`-based `get()`. + +Let's examine what happens when `get()` is called from Java: + +```cpp +jint Java_org_rocksdb_RocksDB_get__JJ_3BII_3BIIJ( + JNIEnv* env, jobject, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, + jint jkey_off, jint jkey_len, jbyteArray jval, jint jval_off, jint jval_len, + jlong jcf_handle) +``` + + 1. Create an empty `std::string value` + 2. Call `DB::Get()` using the `std::string` variant + 3. Copy the resultant `std::string` into Java, using the JNI `SetByteArrayRegion()` method + +So stage (3) costs us a copy into Java. It's mostly unavoidable that there will be at least the one copy from a C++ buffer into a Java buffer. + +But what does stage (2) do? + + * Create a `PinnableSlice(std::string&)` which uses the value as the slice's backing buffer. + * Call `DB::Get()` using the `PinnableSlice` variant + * Work out if the slice has pinned data, in which case copy the pinned data into value and release it. + * ..or, if the slice has not pinned data, it is already in value (because we tried, but couldn't pin anything). + +So stage (2) costs us a copy into a `std::string`. But! It's just a naive `std::string` that we have copied a large buffer into. And in RocksDB, the buffer is or can be large, so an extra copy is something we need to worry about. + +Luckily this is easy to fix. In the Java API (JNI) implementation: + + 1. Create a `PinnableSlice()` which uses its own default backing buffer. + 2. Call `DB::Get()` using the `PinnableSlice` variant of the RocksDB API + 3. Work out if the slice has successfully pinned data, in which case copy the pinned data straight into the Java output buffer using the JNI `SetByteArrayRegion()` method, then release the pin. + 4. ..or, if the slice has not pinned data, it is in the pinnable slice's default backing buffer. All that is left is to copy it straight into the Java output buffer using the JNI `SetByteArrayRegion()` method. + +In the case where the `PinnableSlice` has successfully pinned the data, this saves us the intermediate copy to the `std::string`. In the case where it hasn't, we still have the extra copy, so the observed performance improvement depends on when the data can be pinned. Luckily, our benchmarking suggests that the pin is happening in a significant number of cases. + +On discussion with the RocksDB core team we understand that the core `PinnableSlice` optimization is most likely to succeed when pages are loaded from the block cache, rather than when they are in the `memtable`. And it might be possible to successfully pin in the `memtable` as well, with some extra coding effort.
This would likely improve the results for these benchmarks. diff --git a/docs/static/images/jni-get-benchmarks/fig_1024_1_copyout_nopoolbig.png b/docs/static/images/jni-get-benchmarks/fig_1024_1_copyout_nopoolbig.png new file mode 100644 index 000000000000..2d662dde8320 Binary files /dev/null and b/docs/static/images/jni-get-benchmarks/fig_1024_1_copyout_nopoolbig.png differ diff --git a/docs/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolbig.png b/docs/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolbig.png new file mode 100644 index 000000000000..ef071b388340 Binary files /dev/null and b/docs/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolbig.png differ diff --git a/docs/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolsmall.png b/docs/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolsmall.png new file mode 100644 index 000000000000..732c42ff768e Binary files /dev/null and b/docs/static/images/jni-get-benchmarks/fig_1024_1_none_nopoolsmall.png differ diff --git a/docs/static/images/jni-get-benchmarks/optimization-graph.png b/docs/static/images/jni-get-benchmarks/optimization-graph.png new file mode 100644 index 000000000000..8eb1e6bcf985 Binary files /dev/null and b/docs/static/images/jni-get-benchmarks/optimization-graph.png differ diff --git a/env/composite_env.cc b/env/composite_env.cc index b93aa9fcbf52..8ddc9a1a6cd6 100644 --- a/env/composite_env.cc +++ b/env/composite_env.cc @@ -391,7 +391,6 @@ Status CompositeEnv::NewDirectory(const std::string& name, namespace { static std::unordered_map env_wrapper_type_info = { -#ifndef ROCKSDB_LITE {"target", OptionTypeInfo(0, OptionType::kUnknown, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize) @@ -433,24 +432,19 @@ static std::unordered_map env_wrapper_type_info = { return target->env->ValidateOptions(db_opts, cf_opts); } })}, -#endif // ROCKSDB_LITE }; static std::unordered_map composite_fs_wrapper_type_info = { -#ifndef ROCKSDB_LITE {"file_system", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, -#endif // ROCKSDB_LITE }; static std::unordered_map composite_clock_wrapper_type_info = { -#ifndef ROCKSDB_LITE {"clock", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, -#endif // ROCKSDB_LITE }; } // namespace @@ -488,7 +482,6 @@ Status CompositeEnvWrapper::PrepareOptions(const ConfigOptions& options) { return Env::PrepareOptions(options); } -#ifndef ROCKSDB_LITE std::string CompositeEnvWrapper::SerializeOptions( const ConfigOptions& config_options, const std::string& header) const { auto options = CompositeEnv::SerializeOptions(config_options, header); @@ -498,7 +491,6 @@ std::string CompositeEnvWrapper::SerializeOptions( } return options; } -#endif // ROCKSDB_LITE EnvWrapper::EnvWrapper(Env* t) : target_(t) { RegisterOptions("", &target_, &env_wrapper_type_info); @@ -519,7 +511,6 @@ Status EnvWrapper::PrepareOptions(const ConfigOptions& options) { return Env::PrepareOptions(options); } -#ifndef ROCKSDB_LITE std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options, const std::string& header) const { auto parent = Env::SerializeOptions(config_options, ""); @@ -539,6 +530,5 @@ std::string EnvWrapper::SerializeOptions(const ConfigOptions& config_options, return result; } } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h index d6f8366e2138..8fc90fdaf38c 100644 --- a/env/composite_env_wrapper.h +++ 
b/env/composite_env_wrapper.h @@ -289,10 +289,8 @@ class CompositeEnvWrapper : public CompositeEnv { const Customizable* Inner() const override { return target_.env; } Status PrepareOptions(const ConfigOptions& options) override; -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const override; -#endif // ROCKSDB_LITE // Return the target to which this Env forwards all calls Env* env_target() const { return target_.env; } diff --git a/env/env.cc b/env/env.cc index fb2cb950d767..948765bb916e 100644 --- a/env/env.cc +++ b/env/env.cc @@ -29,7 +29,6 @@ namespace ROCKSDB_NAMESPACE { namespace { -#ifndef ROCKSDB_LITE static int RegisterBuiltinEnvs(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory(MockEnv::kClassName(), [](const std::string& /*uri*/, @@ -48,15 +47,12 @@ static int RegisterBuiltinEnvs(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE static void RegisterSystemEnvs() { -#ifndef ROCKSDB_LITE static std::once_flag loaded; std::call_once(loaded, [&]() { RegisterBuiltinEnvs(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE } class LegacySystemClock : public SystemClock { @@ -97,7 +93,6 @@ class LegacySystemClock : public SystemClock { return env_->TimeToString(time); } -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& /*config_options*/, const std::string& /*prefix*/) const override { // We do not want the LegacySystemClock to appear in the serialized output. @@ -105,7 +100,6 @@ class LegacySystemClock : public SystemClock { // would be part of the Env. As such, do not serialize it here. return ""; } -#endif // ROCKSDB_LITE }; class LegacySequentialFileWrapper : public FSSequentialFile { @@ -165,8 +159,7 @@ class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { req.len = fs_reqs[i].len; req.scratch = fs_reqs[i].scratch; req.status = Status::OK(); - - reqs.emplace_back(req); + reqs.emplace_back(std::move(req)); } status = target_->MultiRead(reqs.data(), num_reqs); for (size_t i = 0; i < num_reqs; ++i) { @@ -605,7 +598,6 @@ class LegacyFileSystemWrapper : public FileSystem { return status_to_io_status(target_->IsDirectory(path, is_dir)); } -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& /*config_options*/, const std::string& /*prefix*/) const override { // We do not want the LegacyFileSystem to appear in the serialized output. @@ -613,7 +605,6 @@ class LegacyFileSystemWrapper : public FileSystem { // would be part of the Env. As such, do not serialize it here. 
return ""; } -#endif // ROCKSDB_LITE private: Env* target_; }; @@ -640,10 +631,6 @@ Status Env::NewLogger(const std::string& fname, return NewEnvLogger(fname, this, result); } -Status Env::LoadEnv(const std::string& value, Env** result) { - return CreateFromString(ConfigOptions(), value, result); -} - Status Env::CreateFromString(const ConfigOptions& config_options, const std::string& value, Env** result) { Env* base = Env::Default(); @@ -653,7 +640,7 @@ Status Env::CreateFromString(const ConfigOptions& config_options, } else { RegisterSystemEnvs(); Env* env = *result; - Status s = LoadStaticObject(config_options, value, nullptr, &env); + Status s = LoadStaticObject(config_options, value, &env); if (s.ok()) { *result = env; } @@ -661,11 +648,6 @@ Status Env::CreateFromString(const ConfigOptions& config_options, } } -Status Env::LoadEnv(const std::string& value, Env** result, - std::shared_ptr* guard) { - return CreateFromString(ConfigOptions(), value, result, guard); -} - Status Env::CreateFromString(const ConfigOptions& config_options, const std::string& value, Env** result, std::shared_ptr* guard) { @@ -688,13 +670,8 @@ Status Env::CreateFromString(const ConfigOptions& config_options, status = Status::OK(); } else { RegisterSystemEnvs(); -#ifndef ROCKSDB_LITE // First, try to load the Env as a unique object. status = config_options.registry->NewObject(id, &env, &uniq); -#else - status = - Status::NotSupported("Cannot load environment in LITE mode", value); -#endif } if (config_options.ignore_unsupported_options && status.IsNotSupported()) { status = Status::OK(); @@ -1187,11 +1164,9 @@ const std::shared_ptr& Env::GetSystemClock() const { namespace { static std::unordered_map sc_wrapper_type_info = { -#ifndef ROCKSDB_LITE {"target", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, -#endif // ROCKSDB_LITE }; } // namespace @@ -1207,7 +1182,6 @@ Status SystemClockWrapper::PrepareOptions(const ConfigOptions& options) { return SystemClock::PrepareOptions(options); } -#ifndef ROCKSDB_LITE std::string SystemClockWrapper::SerializeOptions( const ConfigOptions& config_options, const std::string& header) const { auto parent = SystemClock::SerializeOptions(config_options, ""); @@ -1227,9 +1201,7 @@ std::string SystemClockWrapper::SerializeOptions( return result; } } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE static int RegisterBuiltinSystemClocks(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -1242,7 +1214,6 @@ static int RegisterBuiltinSystemClocks(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE Status SystemClock::CreateFromString(const ConfigOptions& config_options, const std::string& value, @@ -1252,14 +1223,16 @@ Status SystemClock::CreateFromString(const ConfigOptions& config_options, *result = clock; return Status::OK(); } else { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinSystemClocks(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE - return LoadSharedObject(config_options, value, nullptr, - result); + return LoadSharedObject(config_options, value, result); } } + +bool SystemClock::TimedWait(port::CondVar* cv, + std::chrono::microseconds deadline) { + return cv->TimedWait(deadline.count()); +} } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_basic_test.cc b/env/env_basic_test.cc index 0f18b321867e..93bb2dba0eb0 100644 --- a/env/env_basic_test.cc +++ 
b/env/env_basic_test.cc @@ -32,7 +32,6 @@ static Env* GetMockEnv() { static std::unique_ptr mock_env(MockEnv::Create(Env::Default())); return mock_env.get(); } -#ifndef ROCKSDB_LITE static Env* NewTestEncryptedEnv(Env* base, const std::string& provider_id) { ConfigOptions config_opts; config_opts.invoke_prepare_options = false; @@ -58,10 +57,10 @@ static Env* GetTestEnv() { static std::shared_ptr env_guard; static Env* custom_env = nullptr; if (custom_env == nullptr) { - const char* uri = getenv("TEST_ENV_URI"); - if (uri != nullptr) { - EXPECT_OK(Env::CreateFromUri(ConfigOptions(), uri, "", &custom_env, - &env_guard)); + const char* env_uri = getenv("TEST_ENV_URI"); + if (env_uri != nullptr) { + EXPECT_OK(Env::CreateFromUri(ConfigOptions(), env_uri, /*fs_uri=*/"", + &custom_env, &env_guard)); } } EXPECT_NE(custom_env, nullptr); @@ -72,16 +71,15 @@ static Env* GetTestFS() { static std::shared_ptr fs_env_guard; static Env* fs_env = nullptr; if (fs_env == nullptr) { - const char* uri = getenv("TEST_FS_URI"); - if (uri != nullptr) { - EXPECT_OK( - Env::CreateFromUri(ConfigOptions(), uri, "", &fs_env, &fs_env_guard)); + const char* fs_uri = getenv("TEST_FS_URI"); + if (fs_uri != nullptr) { + EXPECT_OK(Env::CreateFromUri(ConfigOptions(), /*env_uri=*/"", fs_uri, + &fs_env, &fs_env_guard)); } } EXPECT_NE(fs_env, nullptr); return fs_env; } -#endif // ROCKSDB_LITE } // namespace class EnvBasicTestWithParam @@ -111,7 +109,6 @@ INSTANTIATE_TEST_CASE_P(EnvDefault, EnvMoreTestWithParam, INSTANTIATE_TEST_CASE_P(MockEnv, EnvBasicTestWithParam, ::testing::Values(&GetMockEnv)); -#ifndef ROCKSDB_LITE // next statements run env test against default encryption code. INSTANTIATE_TEST_CASE_P(EncryptedEnv, EnvBasicTestWithParam, ::testing::Values(&GetCtrEncryptedEnv)); @@ -148,7 +145,6 @@ INSTANTIATE_TEST_CASE_P(CustomEnv, EnvBasicTestWithParam, INSTANTIATE_TEST_CASE_P(CustomEnv, EnvMoreTestWithParam, ::testing::ValuesIn(GetCustomEnvs())); -#endif // ROCKSDB_LITE TEST_P(EnvBasicTestWithParam, Basics) { uint64_t file_size; diff --git a/env/env_chroot.cc b/env/env_chroot.cc index a64373517f8d..5ff32a7e444d 100644 --- a/env/env_chroot.cc +++ b/env/env_chroot.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#if !defined(OS_WIN) #include "env/env_chroot.h" @@ -145,4 +145,4 @@ Env* NewChrootEnv(Env* base_env, const std::string& chroot_dir) { } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#endif // !defined(OS_WIN) diff --git a/env/env_chroot.h b/env/env_chroot.h index 9e5b9a1e95ae..9cead1561b94 100644 --- a/env/env_chroot.h +++ b/env/env_chroot.h @@ -5,7 +5,7 @@ #pragma once -#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#if !defined(OS_WIN) #include @@ -52,4 +52,4 @@ std::shared_ptr NewChrootFileSystem( } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#endif // !defined(OS_WIN) diff --git a/env/env_encryption.cc b/env/env_encryption.cc index c6b0a257dbf1..7b2a531c424e 100644 --- a/env/env_encryption.cc +++ b/env/env_encryption.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #include "rocksdb/env_encryption.h" @@ -25,22 +24,12 @@ #include "util/random.h" #include "util/string_util.h" -#endif namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE std::shared_ptr EncryptionProvider::NewCTRProvider( const std::shared_ptr& cipher) { return std::make_shared(cipher); } -// Read up to "n" bytes from the file. "scratch[0..n-1]" may be -// written by this routine. Sets "*result" to the data that was -// read (including if fewer than "n" bytes were successfully read). -// May set "*result" to point at data in "scratch[0..n-1]", so -// "scratch[0..n-1]" must be live when "*result" is used. -// If an error was encountered, returns a non-OK status. -// -// REQUIRES: External synchronization IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) { @@ -55,19 +44,12 @@ IOStatus EncryptedSequentialFile::Read(size_t n, const IOOptions& options, stream_->Decrypt(offset_, (char*)result->data(), result->size())); } if (io_s.ok()) { - offset_ += result->size(); // We've already ready data from disk, so update + offset_ += result->size(); // We've already read data from disk, so update // offset_ even if decryption fails. } return io_s; } -// Skip "n" bytes from the file. This is guaranteed to be no -// slower that reading the same data, but may be faster. -// -// If end of file is reached, skipping will stop at the end of the -// file, and Skip will return OK. -// -// REQUIRES: External synchronization IOStatus EncryptedSequentialFile::Skip(uint64_t n) { auto status = file_->Skip(n); if (!status.ok()) { @@ -77,28 +59,19 @@ IOStatus EncryptedSequentialFile::Skip(uint64_t n) { return status; } -// Indicates the upper layers if the current SequentialFile implementation -// uses direct IO. bool EncryptedSequentialFile::use_direct_io() const { return file_->use_direct_io(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O size_t EncryptedSequentialFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -// Remove any kind of caching of data from the offset to offset+length -// of this file. If the length is 0, then it refers to the end of file. -// If the system is not caching the file contents, then this is a noop. IOStatus EncryptedSequentialFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } -// Positioned Read for direct I/O -// If Direct I/O enabled, offset, n, and scratch should be properly aligned IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -118,16 +91,6 @@ IOStatus EncryptedSequentialFile::PositionedRead(uint64_t offset, size_t n, return io_s; } -// Read up to "n" bytes from the file starting at "offset". -// "scratch[0..n-1]" may be written by this routine. Sets "*result" -// to the data that was read (including if fewer than "n" bytes were -// successfully read). May set "*result" to point at data in -// "scratch[0..n-1]", so "scratch[0..n-1]" must be live when -// "*result" is used. If an error was encountered, returns a non-OK -// status. -// -// Safe for concurrent use by multiple threads. -// If Direct I/O enabled, offset, n, and scratch should be aligned properly. 
IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, @@ -146,29 +109,12 @@ IOStatus EncryptedRandomAccessFile::Read(uint64_t offset, size_t n, return io_s; } -// Readahead the file starting from offset by n bytes for caching. IOStatus EncryptedRandomAccessFile::Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) { - // return Status::OK(); return file_->Prefetch(offset + prefixLength_, n, options, dbg); } -// Tries to get an unique ID for this file that will be the same each time -// the file is opened (and will stay the same while the file is open). -// Furthermore, it tries to make this ID at most "max_size" bytes. If such an -// ID can be created this function returns the length of the ID and places it -// in "id"; otherwise, this function returns 0, in which case "id" -// may not have been modified. -// -// This function guarantees, for IDs from a given environment, two unique ids -// cannot be made equal to each other by adding arbitrary bytes to one of -// them. That is, no unique ID is the prefix of another. -// -// This function guarantees that the returned ID will not be interpretable as -// a single varint. -// -// Note: these IDs are only valid for the duration of the process. size_t EncryptedRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { return file_->GetUniqueId(id, max_size); }; @@ -177,29 +123,19 @@ void EncryptedRandomAccessFile::Hint(AccessPattern pattern) { file_->Hint(pattern); } -// Indicates the upper layers if the current RandomAccessFile implementation -// uses direct IO. bool EncryptedRandomAccessFile::use_direct_io() const { return file_->use_direct_io(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O size_t EncryptedRandomAccessFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -// Remove any kind of caching of data from the offset to offset+length -// of this file. If the length is 0, then it refers to the end of file. -// If the system is not caching the file contents, then this is a noop. IOStatus EncryptedRandomAccessFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } -// A file abstraction for sequential writing. The implementation -// must provide buffering since callers may append small fragments -// at a time to the file. IOStatus EncryptedWritableFile::Append(const Slice& data, const IOOptions& options, IODebugContext* dbg) { @@ -255,67 +191,39 @@ IOStatus EncryptedWritableFile::PositionedAppend(const Slice& data, return file_->PositionedAppend(dataToAppend, offset, options, dbg); } -// Indicates the upper layers if the current WritableFile implementation -// uses direct IO. bool EncryptedWritableFile::use_direct_io() const { return file_->use_direct_io(); } -// true if Sync() and Fsync() are safe to call concurrently with Append() -// and Flush(). bool EncryptedWritableFile::IsSyncThreadSafe() const { return file_->IsSyncThreadSafe(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O size_t EncryptedWritableFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -/* - * Get the size of valid data in the file. 
- */ uint64_t EncryptedWritableFile::GetFileSize(const IOOptions& options, IODebugContext* dbg) { return file_->GetFileSize(options, dbg) - prefixLength_; } -// Truncate is necessary to trim the file to the correct size -// before closing. It is not always possible to keep track of the file -// size due to whole pages writes. The behavior is undefined if called -// with other writes to follow. IOStatus EncryptedWritableFile::Truncate(uint64_t size, const IOOptions& options, IODebugContext* dbg) { return file_->Truncate(size + prefixLength_, options, dbg); } -// Remove any kind of caching of data from the offset to offset+length -// of this file. If the length is 0, then it refers to the end of file. -// If the system is not caching the file contents, then this is a noop. -// This call has no effect on dirty pages in the cache. IOStatus EncryptedWritableFile::InvalidateCache(size_t offset, size_t length) { return file_->InvalidateCache(offset + prefixLength_, length); } -// Sync a file range with disk. -// offset is the starting byte of the file range to be synchronized. -// nbytes specifies the length of the range to be synchronized. -// This asks the OS to initiate flushing the cached data to disk, -// without waiting for completion. -// Default implementation does nothing. IOStatus EncryptedWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, IODebugContext* dbg) { return file_->RangeSync(offset + prefixLength_, nbytes, options, dbg); } -// PrepareWrite performs any necessary preparation for a write -// before the write actually occurs. This allows for pre-allocation -// of space on devices where it can result in less file -// fragmentation and/or less waste from over-zealous filesystem -// pre-allocation. void EncryptedWritableFile::PrepareWrite(size_t offset, size_t len, const IOOptions& options, IODebugContext* dbg) { @@ -333,7 +241,6 @@ void EncryptedWritableFile::GetPreallocationStatus( file_->GetPreallocationStatus(block_size, last_allocated_block); } -// Pre-allocates space for a file. IOStatus EncryptedWritableFile::Allocate(uint64_t offset, uint64_t len, const IOOptions& options, IODebugContext* dbg) { @@ -355,22 +262,14 @@ IOStatus EncryptedWritableFile::Close(const IOOptions& options, return file_->Close(options, dbg); } -// A file abstraction for random reading and writing. - -// Indicates if the class makes use of direct I/O -// If false you must pass aligned buffer to Write() bool EncryptedRandomRWFile::use_direct_io() const { return file_->use_direct_io(); } -// Use the returned alignment value to allocate -// aligned buffer for Direct I/O size_t EncryptedRandomRWFile::GetRequiredBufferAlignment() const { return file_->GetRequiredBufferAlignment(); } -// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. -// Pass aligned buffer when use_direct_io() returns true. IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, const IOOptions& options, IODebugContext* dbg) { @@ -397,9 +296,6 @@ IOStatus EncryptedRandomRWFile::Write(uint64_t offset, const Slice& data, return file_->Write(offset, dataToWrite, options, dbg); } -// Read up to `n` bytes starting from offset `offset` and store them in -// result, provided `scratch` size should be at least `n`. -// Returns Status::OK() on success. 
IOStatus EncryptedRandomRWFile::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { @@ -681,7 +577,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return provider_->AddCipher(descriptor, cipher, len, for_write); } - // NewSequentialFile opens a file for sequential reading. IOStatus NewSequentialFile(const std::string& fname, const FileOptions& options, std::unique_ptr* result, @@ -719,7 +614,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return status; } - // NewRandomAccessFile opens a file for random read access. IOStatus NewRandomAccessFile(const std::string& fname, const FileOptions& options, std::unique_ptr* result, @@ -750,7 +644,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return status; } - // NewWritableFile opens a file for sequential writing. IOStatus NewWritableFile(const std::string& fname, const FileOptions& options, std::unique_ptr* result, IODebugContext* dbg) override { @@ -768,13 +661,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } - // Create an object that writes to a new file with the specified - // name. Deletes any existing file with the same name and creates a - // new file. On success, stores a pointer to the new file in - // *result and returns OK. On failure stores nullptr in *result and - // returns non-OK. - // - // The returned file will only be accessed by one thread at a time. IOStatus ReopenWritableFile(const std::string& fname, const FileOptions& options, std::unique_ptr* result, @@ -793,7 +679,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } - // Reuse an existing file by renaming it and opening it as writable. IOStatus ReuseWritableFile(const std::string& fname, const std::string& old_fname, const FileOptions& options, @@ -813,11 +698,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return CreateWritableEncryptedFile(fname, underlying, options, result, dbg); } - // Open `fname` for random read and write, if file doesn't exist the file - // will be created. On success, stores a pointer to the new file in - // *result and returns OK. On failure returns non-OK. - // - // The returned file will only be accessed by one thread at a time. IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options, std::unique_ptr* result, IODebugContext* dbg) override { @@ -857,20 +737,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return status; } - // Store in *result the attributes of the children of the specified - // directory. - // In case the implementation lists the directory prior to iterating the - // files - // and files are concurrently deleted, the deleted files will be omitted - // from - // result. - // The name attributes are relative to "dir". - // Original contents of *results are dropped. - // Returns OK if "dir" exists and "*result" contains its children. - // NotFound if "dir" does not exist, the calling process does not - // have - // permission to access "dir", or if "dir" is invalid. - // IOError if an IO Error was encountered IOStatus GetChildrenFileAttributes(const std::string& dir, const IOOptions& options, std::vector* result, @@ -897,7 +763,6 @@ class EncryptedFileSystemImpl : public EncryptedFileSystem { return IOStatus::OK(); } - // Store the size of fname in *file_size. 
IOStatus GetFileSize(const std::string& fname, const IOOptions& options, uint64_t* file_size, IODebugContext* dbg) override { auto status = @@ -943,16 +808,13 @@ std::shared_ptr NewEncryptedFS( return nullptr; } } -// Returns an Env that encrypts data when stored on disk and decrypts data when -// read from disk. + Env* NewEncryptedEnv(Env* base_env, const std::shared_ptr& provider) { return new CompositeEnvWrapper( base_env, NewEncryptedFS(base_env->GetFileSystem(), provider)); } -// Encrypt one or more (partial) blocks of data at the file offset. -// Length of data is given in dataSize. Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, size_t dataSize) { // Calculate block index @@ -971,7 +833,7 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, if (n != blockSize) { // We're not encrypting a full block. // Copy data to blockBuffer - if (!blockBuffer.get()) { + if (!blockBuffer) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } @@ -997,8 +859,6 @@ Status BlockAccessCipherStream::Encrypt(uint64_t fileOffset, char* data, } } -// Decrypt one or more (partial) blocks of data at the file offset. -// Length of data is given in dataSize. Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, size_t dataSize) { // Calculate block index @@ -1017,7 +877,7 @@ Status BlockAccessCipherStream::Decrypt(uint64_t fileOffset, char* data, if (n != blockSize) { // We're not decrypting a full block. // Copy data to blockBuffer - if (!blockBuffer.get()) { + if (!blockBuffer) { // Allocate buffer blockBuffer = std::unique_ptr(new char[blockSize]); } @@ -1058,6 +918,7 @@ static std::unordered_map {0 /* No offset, whole struct*/, OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, }; + // Implements a BlockCipher using ROT13. // // Note: This is a sample implementation of BlockCipher, @@ -1074,22 +935,17 @@ class ROT13BlockCipher : public BlockCipher { static const char* kClassName() { return "ROT13"; } const char* Name() const override { return kClassName(); } - // BlockSize returns the size of each block supported by this cipher stream. - size_t BlockSize() override { return blockSize_; } - // Encrypt a block of data. - // Length of data is equal to BlockSize(). + size_t BlockSize() override { return blockSize_; } Status Encrypt(char* data) override { for (size_t i = 0; i < blockSize_; ++i) { data[i] += 13; } return Status::OK(); } - - // Decrypt a block of data. - // Length of data is equal to BlockSize(). Status Decrypt(char* data) override { return Encrypt(data); } }; + static const std::unordered_map ctr_encryption_provider_type_info = { {"cipher", @@ -1099,14 +955,11 @@ static const std::unordered_map }; } // anonymous namespace -// Allocate scratch space which is passed to EncryptBlock/DecryptBlock. void CTRCipherStream::AllocateScratch(std::string& scratch) { auto blockSize = cipher_->BlockSize(); scratch.reserve(blockSize); } -// Encrypt a block of data at the given block index. 
-// Length of data is equal to BlockSize(); Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, char* scratch) { // Create nonce + counter @@ -1114,7 +967,7 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, memmove(scratch, iv_.data(), blockSize); EncodeFixed64(scratch, blockIndex + initialCounter_); - // Encrypt nonce+counter + // Encrypt nonce + counter auto status = cipher_->Encrypt(scratch); if (!status.ok()) { return status; @@ -1127,8 +980,6 @@ Status CTRCipherStream::EncryptBlock(uint64_t blockIndex, char* data, return Status::OK(); } -// Decrypt a block of data at the given block index. -// Length of data is equal to BlockSize(); Status CTRCipherStream::DecryptBlock(uint64_t blockIndex, char* data, char* scratch) { // For CTR decryption & encryption are the same @@ -1150,10 +1001,6 @@ bool CTREncryptionProvider::IsInstanceOf(const std::string& name) const { } } -// GetPrefixLength returns the length of the prefix that is added to every file -// and used for storing encryption options. -// For optimal performance, the prefix length should be a multiple of -// the page size. size_t CTREncryptionProvider::GetPrefixLength() const { return defaultPrefixLength; } @@ -1182,8 +1029,6 @@ static void decodeCTRParameters(const char* prefix, size_t blockSize, iv = Slice(prefix + blockSize, blockSize); } -// CreateNewPrefix initialized an allocated block of prefix memory -// for a new file. Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, char* prefix, size_t prefixLength) const { @@ -1215,10 +1060,8 @@ Status CTREncryptionProvider::CreateNewPrefix(const std::string& /*fname*/, status = cipherStream.Encrypt(0, prefix + (2 * blockSize), prefixLength - (2 * blockSize)); } - if (!status.ok()) { - return status; - } - return Status::OK(); + + return status; } // PopulateSecretPrefixPart initializes the data into a new prefix block @@ -1270,7 +1113,7 @@ Status CTREncryptionProvider::CreateCipherStream( } // CreateCipherStreamFromPrefix creates a block access cipher stream for a file -// given given name and options. The given prefix is already decrypted. +// given name and options. The given prefix is already decrypted. 
Status CTREncryptionProvider::CreateCipherStreamFromPrefix( const std::string& /*fname*/, const EnvOptions& /*options*/, uint64_t initialCounter, const Slice& iv, const Slice& /*prefix*/, @@ -1335,17 +1178,15 @@ Status BlockCipher::CreateFromString(const ConfigOptions& config_options, const std::string& value, std::shared_ptr* result) { RegisterEncryptionBuiltins(); - return LoadSharedObject(config_options, value, nullptr, result); + return LoadSharedObject(config_options, value, result); } Status EncryptionProvider::CreateFromString( const ConfigOptions& config_options, const std::string& value, std::shared_ptr* result) { RegisterEncryptionBuiltins(); - return LoadSharedObject(config_options, value, nullptr, - result); + return LoadSharedObject(config_options, value, result); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/env/env_encryption_ctr.h b/env/env_encryption_ctr.h index cfb440c72ace..b4342f7012d3 100644 --- a/env/env_encryption_ctr.h +++ b/env/env_encryption_ctr.h @@ -5,7 +5,6 @@ #pragma once -#if !defined(ROCKSDB_LITE) #include "rocksdb/env_encryption.h" @@ -28,19 +27,13 @@ class CTRCipherStream final : public BlockAccessCipherStream { : cipher_(c), iv_(iv, c->BlockSize()), initialCounter_(initialCounter){}; virtual ~CTRCipherStream(){}; - // BlockSize returns the size of each block supported by this cipher stream. size_t BlockSize() override { return cipher_->BlockSize(); } protected: - // Allocate scratch space which is passed to EncryptBlock/DecryptBlock. void AllocateScratch(std::string&) override; - // Encrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); Status EncryptBlock(uint64_t blockIndex, char* data, char* scratch) override; - // Decrypt a block of data at the given block index. - // Length of data is equal to BlockSize(); Status DecryptBlock(uint64_t blockIndex, char* data, char* scratch) override; }; @@ -67,20 +60,9 @@ class CTREncryptionProvider : public EncryptionProvider { static const char* kClassName() { return "CTR"; } const char* Name() const override { return kClassName(); } bool IsInstanceOf(const std::string& name) const override; - // GetPrefixLength returns the length of the prefix that is added to every - // file - // and used for storing encryption options. - // For optimal performance when using direct IO, the prefix length should be a - // multiple of the page size. size_t GetPrefixLength() const override; - - // CreateNewPrefix initialized an allocated block of prefix memory - // for a new file. Status CreateNewPrefix(const std::string& fname, char* prefix, size_t prefixLength) const override; - - // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. 
Status CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) override; @@ -113,4 +95,3 @@ Status NewEncryptedFileSystemImpl( } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) diff --git a/env/env_posix.cc b/env/env_posix.cc index 77f28e1f50ee..ae2f9036028c 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -213,13 +213,14 @@ class PosixEnv : public CompositeEnv { const char* Name() const override { return kClassName(); } const char* NickName() const override { return kDefaultName(); } - ~PosixEnv() override { - if (this == Env::Default()) { - for (const auto tid : threads_to_join_) { + struct JoinThreadsOnExit { + explicit JoinThreadsOnExit(PosixEnv& _deflt) : deflt(_deflt) {} + ~JoinThreadsOnExit() { + for (const auto tid : deflt.threads_to_join_) { pthread_join(tid, nullptr); } for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { - thread_pools_[pool_id].JoinAllThreads(); + deflt.thread_pools_[pool_id].JoinAllThreads(); } // Do not delete the thread_status_updater_ in order to avoid the // free after use when Env::Default() is destructed while some other @@ -227,7 +228,8 @@ class PosixEnv : public CompositeEnv { // PosixEnv instances use the same thread_status_updater_, so never // explicitly delete it. } - } + PosixEnv& deflt; + }; void SetFD_CLOEXEC(int fd, const EnvOptions* options) { if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { @@ -328,12 +330,16 @@ class PosixEnv : public CompositeEnv { } Status GetHostName(char* name, uint64_t len) override { - int ret = gethostname(name, static_cast(len)); + const size_t max_len = static_cast(len); + int ret = gethostname(name, max_len); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) { return Status::InvalidArgument(errnoStr(errno).c_str()); + } else if (errno == ENAMETOOLONG) { + return IOError("GetHostName", std::string(name, strnlen(name, max_len)), + errno); } else { - return IOError("GetHostName", name, errno); + return IOError("GetHostName", "", errno); } } return Status::OK(); @@ -501,9 +507,11 @@ Env* Env::Default() { ThreadLocalPtr::InitSingletons(); CompressionContextCache::InitSingleton(); INIT_SYNC_POINT_SINGLETONS(); - // ~PosixEnv must be called on exit - //**TODO: Can we make this a STATIC_AVOID_DESTRUCTION? - static PosixEnv default_env; + // Avoid problems with accessing most members of Env::Default() during + // static destruction. + STATIC_AVOID_DESTRUCTION(PosixEnv, default_env); + // This destructor must be called on exit + static PosixEnv::JoinThreadsOnExit thread_joiner(default_env); return &default_env; } diff --git a/env/env_test.cc b/env/env_test.cc index f4e9d50b239b..1bd176fb0b09 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -1107,7 +1107,6 @@ class IoctlFriendlyTmpdir { bool is_supported_ = true; }; -#ifndef ROCKSDB_LITE TEST_F(EnvPosixTest, PositionedAppend) { std::unique_ptr writable_file; EnvOptions options; @@ -1141,7 +1140,6 @@ TEST_F(EnvPosixTest, PositionedAppend) { ASSERT_EQ('a', result[kBlockSize - 1]); ASSERT_EQ('b', result[kBlockSize]); } -#endif // !ROCKSDB_LITE // `GetUniqueId()` temporarily returns zero on Windows. `BlockBasedTable` can // handle a return value of zero but this test case cannot. 
@@ -1551,7 +1549,6 @@ TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { } } -#ifndef ROCKSDB_LITE TEST_F(EnvPosixTest, NonAlignedDirectIOMultiReadBeyondFileSize) { EnvOptions soptions; soptions.use_direct_reads = true; @@ -1622,7 +1619,6 @@ TEST_F(EnvPosixTest, NonAlignedDirectIOMultiReadBeyondFileSize) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // ROCKSDB_LITE #if defined(ROCKSDB_IOURING_PRESENT) void GenerateFilesAndRequest(Env* env, const std::string& fname, @@ -2470,13 +2466,11 @@ TEST_F(EnvTest, LogvWithInfoLogLevel) { INSTANTIATE_TEST_CASE_P(DefaultEnvWithoutDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), false))); -#if !defined(ROCKSDB_LITE) INSTANTIATE_TEST_CASE_P(DefaultEnvWithDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(Env::Default(), true))); -#endif // !defined(ROCKSDB_LITE) -#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#if !defined(OS_WIN) static Env* GetChrootEnv() { static std::unique_ptr chroot_env( NewChrootEnv(Env::Default(), test::TmpDir(Env::Default()))); @@ -2488,7 +2482,7 @@ INSTANTIATE_TEST_CASE_P(ChrootEnvWithoutDirectIO, EnvPosixTestWithParam, INSTANTIATE_TEST_CASE_P(ChrootEnvWithDirectIO, EnvPosixTestWithParam, ::testing::Values(std::pair(GetChrootEnv(), true))); -#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#endif // !defined(OS_WIN) class EnvFSTestWithParam : public ::testing::Test, @@ -2653,7 +2647,6 @@ class CreateEnvTest : public testing::Test { ConfigOptions config_options_; }; -#ifndef ROCKSDB_LITE TEST_F(CreateEnvTest, LoadCTRProvider) { config_options_.invoke_prepare_options = false; std::string CTR = CTREncryptionProvider::kClassName(); @@ -2712,7 +2705,6 @@ TEST_F(CreateEnvTest, LoadROT13Cipher) { ASSERT_NE(cipher, nullptr); ASSERT_STREQ(cipher->Name(), "ROT13"); } -#endif // ROCKSDB_LITE TEST_F(CreateEnvTest, CreateDefaultSystemClock) { std::shared_ptr clock, copy; @@ -2720,15 +2712,12 @@ TEST_F(CreateEnvTest, CreateDefaultSystemClock) { SystemClock::kDefaultName(), &clock)); ASSERT_NE(clock, nullptr); ASSERT_EQ(clock, SystemClock::Default()); -#ifndef ROCKSDB_LITE std::string opts_str = clock->ToString(config_options_); std::string mismatch; ASSERT_OK(SystemClock::CreateFromString(config_options_, opts_str, ©)); ASSERT_TRUE(clock->AreEquivalent(config_options_, copy.get(), &mismatch)); -#endif // ROCKSDB_LITE } -#ifndef ROCKSDB_LITE TEST_F(CreateEnvTest, CreateMockSystemClock) { std::shared_ptr mock, copy; @@ -2917,6 +2906,13 @@ TEST_F(CreateEnvTest, CreateEncryptedFileSystem) { std::string base_opts = std::string("provider=1://test; id=") + EncryptedFileSystem::kClassName(); + // Rewrite the default FileSystem URI if the "TEST_FS_URI" environment + // variable is set. This is useful to test customer encryption plugins. + const char* uri = getenv("TEST_FS_URI"); + if (uri != nullptr) { + base_opts = uri; + } + // The EncryptedFileSystem requires a "provider" option. 
ASSERT_NOK(FileSystem::CreateFromString( config_options_, EncryptedFileSystem::kClassName(), &fs)); @@ -2943,7 +2939,6 @@ TEST_F(CreateEnvTest, CreateEncryptedFileSystem) { ASSERT_TRUE(fs->AreEquivalent(config_options_, copy.get(), &mismatch)); } -#endif // ROCKSDB_LITE namespace { @@ -3066,7 +3061,7 @@ TEST_F(EnvTest, PortGenerateRfcUuid) { VerifyRfcUuids(t.ids); } -// Test the atomic, linear generation of GenerateRawUuid +// Test the atomic, linear generation of GenerateRawUniqueId TEST_F(EnvTest, GenerateRawUniqueId) { struct MyStressTest : public NoDuplicateMiniStressTest { @@ -3146,6 +3141,104 @@ TEST_F(EnvTest, SemiStructuredUniqueIdGenTest) { t.Run(); } +TEST_F(EnvTest, SemiStructuredUniqueIdGenTestSmaller) { + // For small generated types, will cycle through all the possible values. + SemiStructuredUniqueIdGen gen; + std::vector hit(256); + for (int i = 0; i < 256; ++i) { + auto val = gen.GenerateNext(); + ASSERT_FALSE(hit[val]); + hit[val] = true; + } + for (int i = 0; i < 256; ++i) { + ASSERT_TRUE(hit[i]); + } +} + +TEST_F(EnvTest, UnpredictableUniqueIdGenTest1) { + // Must be thread safe and usable as a static. + static UnpredictableUniqueIdGen gen; + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, UnpredictableUniqueIdGenTest2) { + // Even if we completely strip the seeding and entropy of the structure + // down to a bare minimum, we still get quality pseudorandom results. + static UnpredictableUniqueIdGen gen{ + UnpredictableUniqueIdGen::TEST_ZeroInitialized{}}; + + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + // No extra entropy is required to get quality pseudorandom results + gen.GenerateNextWithEntropy(&p.first, &p.second, /*no extra entropy*/ 0); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, UnpredictableUniqueIdGenTest3) { + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + thread_local UnpredictableUniqueIdGen gen{ + UnpredictableUniqueIdGen::TEST_ZeroInitialized{}}; + // Even without the counter (reset it to thread id), we get quality + // single-threaded results (because part of each result is fed back + // into pool). + gen.TEST_counter().store(Env::Default()->GetThreadID()); + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; + t.Run(); +} + +TEST_F(EnvTest, UnpredictableUniqueIdGenTest4) { + struct MyStressTest + : public NoDuplicateMiniStressTest { + uint64_pair_t Generate() override { + uint64_pair_t p; + // Even if we reset the state to thread ID each time, RDTSC instruction + // suffices for quality single-threaded results. + UnpredictableUniqueIdGen gen{ + UnpredictableUniqueIdGen::TEST_ZeroInitialized{}}; + gen.TEST_counter().store(Env::Default()->GetThreadID()); + gen.GenerateNext(&p.first, &p.second); + return p; + } + }; + + MyStressTest t; +#ifdef __SSE4_2__ // Our rough check for RDTSC + t.Run(); +#else + ROCKSDB_GTEST_BYPASS("Requires IA32 with RDTSC"); + // because nanosecond time might not be high enough fidelity to have + // incremented after a few hundred instructions, especially in cases where + // we really only have microsecond fidelity. Also, wall clock might not be + // monotonic. 
+#endif +} + TEST_F(EnvTest, FailureToCreateLockFile) { auto env = Env::Default(); auto fs = env->GetFileSystem(); @@ -3191,17 +3284,14 @@ TEST_F(CreateEnvTest, CreateDefaultEnv) { ASSERT_EQ(env, Env::Default()); ASSERT_EQ(guard, nullptr); -#ifndef ROCKSDB_LITE std::string opt_str = env->ToString(options); ASSERT_OK(Env::CreateFromString(options, opt_str, &env)); ASSERT_EQ(env, Env::Default()); ASSERT_OK(Env::CreateFromString(options, opt_str, &env, &guard)); ASSERT_EQ(env, Env::Default()); ASSERT_EQ(guard, nullptr); -#endif // ROCKSDB_LITE } -#ifndef ROCKSDB_LITE namespace { class WrappedEnv : public EnvWrapper { public: @@ -3353,7 +3443,6 @@ TEST_F(CreateEnvTest, CreateCompositeEnv) { ASSERT_NE(comp->Inner(), nullptr); ASSERT_OK(ValidateOptions(db_opts, cf_opts)); } -#endif // ROCKSDB_LITE // Forward declaration class ReadAsyncFS; @@ -3456,7 +3545,7 @@ IOStatus ReadAsyncRandomAccessFile::ReadAsync( } }; - fs_.workers.emplace_back(submit_request, req); + fs_.workers.emplace_back(submit_request, std::move(req)); return IOStatus::OK(); } @@ -3553,6 +3642,23 @@ TEST_F(TestAsyncRead, ReadAsync) { } } } + +struct StaticDestructionTester { + bool activated = false; + ~StaticDestructionTester() { + if (activated && !kMustFreeHeapAllocations) { + // Make sure we can still call some things on default Env. + std::string hostname; + Env::Default()->GetHostNameString(&hostname); + } + } +} static_destruction_tester; + +TEST(EnvTestMisc, StaticDestruction) { + // Check for any crashes during static destruction. + static_destruction_tester.activated = true; +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/env/file_system.cc b/env/file_system.cc index f9dda429a33e..71fb4d5bc74c 100644 --- a/env/file_system.cc +++ b/env/file_system.cc @@ -26,12 +26,6 @@ FileSystem::FileSystem() {} FileSystem::~FileSystem() {} -Status FileSystem::Load(const std::string& value, - std::shared_ptr* result) { - return CreateFromString(ConfigOptions(), value, result); -} - -#ifndef ROCKSDB_LITE static int RegisterBuiltinFileSystems(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -84,7 +78,6 @@ static int RegisterBuiltinFileSystems(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE Status FileSystem::CreateFromString(const ConfigOptions& config_options, const std::string& value, @@ -94,13 +87,11 @@ Status FileSystem::CreateFromString(const ConfigOptions& config_options, *result = default_fs; return Status::OK(); } else { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinFileSystems(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE - return LoadSharedObject(config_options, value, nullptr, result); + return LoadSharedObject(config_options, value, result); } } @@ -235,11 +226,9 @@ IOStatus ReadFileToString(FileSystem* fs, const std::string& fname, namespace { static std::unordered_map fs_wrapper_type_info = { -#ifndef ROCKSDB_LITE {"target", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByName, OptionTypeFlags::kDontSerialize)}, -#endif // ROCKSDB_LITE }; } // namespace FileSystemWrapper::FileSystemWrapper(const std::shared_ptr& t) @@ -254,7 +243,6 @@ Status FileSystemWrapper::PrepareOptions(const ConfigOptions& options) { return FileSystem::PrepareOptions(options); } -#ifndef ROCKSDB_LITE std::string FileSystemWrapper::SerializeOptions( const ConfigOptions& config_options, const std::string& header) const { auto 
parent = FileSystem::SerializeOptions(config_options, ""); @@ -274,7 +262,6 @@ std::string FileSystemWrapper::SerializeOptions( return result; } } -#endif // ROCKSDB_LITE DirFsyncOptions::DirFsyncOptions() { reason = kDefault; } diff --git a/env/fs_posix.cc b/env/fs_posix.cc index e179a421dcd8..dd2f749350da 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -168,10 +168,6 @@ class PosixFileSystem : public FileSystem { FILE* file = nullptr; if (options.use_direct_reads && !options.use_mmap_reads) { -#ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); -#endif // !ROCKSDB_LITE #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; TEST_SYNC_POINT_CALLBACK("NewSequentialFile:O_DIRECT", &flags); @@ -223,10 +219,6 @@ class PosixFileSystem : public FileSystem { int flags = cloexec_flags(O_RDONLY, &options); if (options.use_direct_reads && !options.use_mmap_reads) { -#ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); -#endif // !ROCKSDB_LITE #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags); @@ -300,10 +292,6 @@ class PosixFileSystem : public FileSystem { // appends data to the end of the file, regardless of the value of // offset. // More info here: https://linux.die.net/man/2/pwrite -#ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); -#endif // ROCKSDB_LITE flags |= O_WRONLY; #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; @@ -392,10 +380,6 @@ class PosixFileSystem : public FileSystem { int flags = 0; // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) if (options.use_direct_writes && !options.use_mmap_writes) { -#ifdef ROCKSDB_LITE - return IOStatus::IOError(fname, - "Direct I/O not supported in RocksDB lite"); -#endif // !ROCKSDB_LITE flags |= O_WRONLY; #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS) flags |= O_DIRECT; @@ -1199,6 +1183,16 @@ class PosixFileSystem : public FileSystem { #endif } + void SupportedOps(int64_t& supported_ops) override { + supported_ops = 0; +#if defined(ROCKSDB_IOURING_PRESENT) + if (IsIOUringEnabled()) { + // Underlying FS supports async_io + supported_ops |= (1 << FSSupportedOps::kAsyncIO); + } +#endif + } + #if defined(ROCKSDB_IOURING_PRESENT) // io_uring instance std::unique_ptr thread_local_io_urings_; @@ -1278,7 +1272,6 @@ std::shared_ptr FileSystem::Default() { return instance; } -#ifndef ROCKSDB_LITE static FactoryFunc posix_filesystem_reg = ObjectLibrary::Default()->AddFactory( ObjectLibrary::PatternEntry("posix").AddSeparator("://", false), @@ -1287,7 +1280,6 @@ static FactoryFunc posix_filesystem_reg = f->reset(new PosixFileSystem()); return f->get(); }); -#endif } // namespace ROCKSDB_NAMESPACE diff --git a/env/fs_readonly.h b/env/fs_readonly.h index 1bbe607849cb..7a04aea0801a 100644 --- a/env/fs_readonly.h +++ b/env/fs_readonly.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/file_system.h" @@ -104,4 +103,3 @@ class ReadOnlyFileSystem : public FileSystemWrapper { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/env/fs_remap.cc b/env/fs_remap.cc index fd92411814bd..b9832e6cba3e 100644 --- a/env/fs_remap.cc +++ b/env/fs_remap.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the 
root directory). -#ifndef ROCKSDB_LITE #include "env/fs_remap.h" @@ -340,4 +339,3 @@ IOStatus RemapFileSystem::GetAbsolutePath(const std::string& db_path, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/env/fs_remap.h b/env/fs_remap.h index 1f6e061fd6f9..a3c998262c0b 100644 --- a/env/fs_remap.h +++ b/env/fs_remap.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -136,4 +135,3 @@ class RemapFileSystem : public FileSystemWrapper { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/env/io_posix.h b/env/io_posix.h index f129668ea546..8c51ba6450c3 100644 --- a/env/io_posix.h +++ b/env/io_posix.h @@ -29,7 +29,8 @@ // For non linux platform, the following macros are used only as place // holder. -#if !(defined OS_LINUX) && !(defined CYGWIN) && !(defined OS_AIX) +#if !(defined OS_LINUX) && !(defined OS_FREEBSD) && !(defined CYGWIN) && \ + !(defined OS_AIX) #define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ #define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ #define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ diff --git a/env/mock_env.cc b/env/mock_env.cc index bfa7dc2f47e0..c232af61eb5f 100644 --- a/env/mock_env.cc +++ b/env/mock_env.cc @@ -21,7 +21,7 @@ #include "util/cast_util.h" #include "util/hash.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { @@ -33,7 +33,6 @@ int64_t MaybeCurrentTime(const std::shared_ptr& clock) { } static std::unordered_map time_elapse_type_info = { -#ifndef ROCKSDB_LITE {"time_elapse_only_sleep", {0, OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever, @@ -50,10 +49,8 @@ static std::unordered_map time_elapse_type_info = { return Status::OK(); }, nullptr}}, -#endif // ROCKSDB_LITE }; static std::unordered_map mock_sleep_type_info = { -#ifndef ROCKSDB_LITE {"mock_sleep", {0, OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever, @@ -70,7 +67,6 @@ static std::unordered_map mock_sleep_type_info = { return Status::OK(); }, nullptr}}, -#endif // ROCKSDB_LITE }; } // namespace @@ -572,11 +568,9 @@ class TestMemLogger : public Logger { }; static std::unordered_map mock_fs_type_info = { -#ifndef ROCKSDB_LITE {"supports_direct_io", {0, OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; } // namespace @@ -1057,14 +1051,8 @@ Status MockEnv::CorruptBuffer(const std::string& fname) { return mock->CorruptBuffer(fname); } -#ifndef ROCKSDB_LITE // This is to maintain the behavior before swithcing from InMemoryEnv to MockEnv Env* NewMemEnv(Env* base_env) { return MockEnv::Create(base_env); } -#else // ROCKSDB_LITE - -Env* NewMemEnv(Env* /*base_env*/) { return nullptr; } - -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/env/unique_id_gen.cc b/env/unique_id_gen.cc index a1986fa15d3e..8d9db86956d9 100644 --- a/env/unique_id_gen.cc +++ b/env/unique_id_gen.cc @@ -7,14 +7,28 @@ #include #include +#include +#include #include #include +#include "port/lang.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/version.h" #include "util/hash.h" +#ifdef __SSE4_2__ +#ifdef _WIN32 +#include +#define _rdtsc() __rdtsc() +#else +#include +#endif +#else +#include "rocksdb/system_clock.h" +#endif + namespace ROCKSDB_NAMESPACE { namespace { @@ -161,4 +175,69 @@ void SemiStructuredUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) 
{ } } +void UnpredictableUniqueIdGen::Reset() { + for (size_t i = 0; i < pool_.size(); i += 2) { + assert(i + 1 < pool_.size()); + uint64_t a, b; + GenerateRawUniqueId(&a, &b); + pool_[i] = a; + pool_[i + 1] = b; + } +} + +void UnpredictableUniqueIdGen::GenerateNext(uint64_t* upper, uint64_t* lower) { + uint64_t extra_entropy; + // Use timing information (if available) to add to entropy. (Not a disaster + // if unavailable on some platforms. High performance is important.) +#ifdef __SSE4_2__ // More than enough to guarantee rdtsc instruction + extra_entropy = static_cast(_rdtsc()); +#else + extra_entropy = SystemClock::Default()->NowNanos(); +#endif + + GenerateNextWithEntropy(upper, lower, extra_entropy); +} + +void UnpredictableUniqueIdGen::GenerateNextWithEntropy(uint64_t* upper, + uint64_t* lower, + uint64_t extra_entropy) { + // To efficiently ensure unique inputs to the hash function in the presence + // of multithreading, we do not require atomicity on the whole entropy pool, + // but instead only a piece of it (a 64-bit counter) that is sufficient to + // guarantee uniqueness. + uint64_t count = counter_.fetch_add(1, std::memory_order_relaxed); + uint64_t a = count; + uint64_t b = extra_entropy; + // Invoking the hash function several times avoids copying all the inputs + // to a contiguous, non-atomic buffer. + BijectiveHash2x64(a, b, &a, &b); // Based on XXH128 + + // In hashing the rest of the pool with that, we don't need to worry about + // races, but use atomic operations for sanitizer-friendliness. + for (size_t i = 0; i < pool_.size(); i += 2) { + assert(i + 1 < pool_.size()); + a ^= pool_[i].load(std::memory_order_relaxed); + b ^= pool_[i + 1].load(std::memory_order_relaxed); + BijectiveHash2x64(a, b, &a, &b); // Based on XXH128 + } + + // Return result + *lower = a; + *upper = b; + + // Add some back into pool. We don't really care that there's a race in + // storing the result back and another thread computing the next value. + // It's just an entropy pool. + pool_[count & (pool_.size() - 1)].fetch_add(a, std::memory_order_relaxed); +} + +#ifndef NDEBUG +UnpredictableUniqueIdGen::UnpredictableUniqueIdGen(TEST_ZeroInitialized) { + for (auto& p : pool_) { + p.store(0); + } + counter_.store(0); +} +#endif + } // namespace ROCKSDB_NAMESPACE diff --git a/env/unique_id_gen.h b/env/unique_id_gen.h index 17e71e6220b5..f654c7b11e4e 100644 --- a/env/unique_id_gen.h +++ b/env/unique_id_gen.h @@ -12,9 +12,12 @@ #pragma once +#include #include #include +#include +#include "port/port.h" #include "rocksdb/rocksdb_namespace.h" namespace ROCKSDB_NAMESPACE { @@ -61,6 +64,19 @@ class SemiStructuredUniqueIdGen { // to the next (thread safe). void GenerateNext(uint64_t* upper, uint64_t* lower); + // For generating smaller values. Will cycle through all the possibilities + // before repeating. + template + T GenerateNext() { + static_assert(sizeof(T) <= sizeof(uint64_t)); + static_assert(std::is_integral_v); + uint64_t ignore, val; + GenerateNext(&ignore, &val); + return static_cast(val); + } + + uint64_t GetBaseUpper() const { return base_upper_; } + private: uint64_t base_upper_; uint64_t base_lower_; @@ -68,4 +84,36 @@ class SemiStructuredUniqueIdGen { int64_t saved_process_id_; }; +// A unique id generator that should provide reasonable security against +// predicting the output from previous outputs, but is NOT known to be +// cryptographically secure. Unlike std::random_device, this is guaranteed +// not to block once initialized. 
+class ALIGN_AS(CACHE_LINE_SIZE) UnpredictableUniqueIdGen { + public: + // Initializes with random starting state (from several GenerateRawUniqueId) + UnpredictableUniqueIdGen() { Reset(); } + // Re-initializes, but not thread safe + void Reset(); + + // Generate next probabilistically unique value. Thread safe. Uses timing + // information to add to the entropy pool. + void GenerateNext(uint64_t* upper, uint64_t* lower); + + // Explicitly include given value for entropy pool instead of timing + // information. + void GenerateNextWithEntropy(uint64_t* upper, uint64_t* lower, + uint64_t extra_entropy); + +#ifndef NDEBUG + struct TEST_ZeroInitialized {}; + explicit UnpredictableUniqueIdGen(TEST_ZeroInitialized); + std::atomic& TEST_counter() { return counter_; } +#endif + private: + // 256 bit entropy pool + std::array, 4> pool_; + // Counter to ensure unique hash inputs + std::atomic counter_; +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index 1ecf8c794744..544adf8ae6ac 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -144,6 +144,8 @@ int main() { options.create_if_missing = true; // Disable RocksDB background compaction. options.compaction_style = ROCKSDB_NAMESPACE::kCompactionStyleNone; + // Small write buffer size for generating more sst files in level 0. + options.write_buffer_size = 4 << 20; // Small slowdown and stop trigger for experimental purpose. options.level0_slowdown_writes_trigger = 3; options.level0_stop_writes_trigger = 5; diff --git a/examples/optimistic_transaction_example.cc b/examples/optimistic_transaction_example.cc index fb0514a694a4..079572737245 100644 --- a/examples/optimistic_transaction_example.cc +++ b/examples/optimistic_transaction_example.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -189,4 +188,3 @@ int main() { return 0; } -#endif // ROCKSDB_LITE diff --git a/examples/transaction_example.cc b/examples/transaction_example.cc index 08bcca1b690d..541b13f796ec 100644 --- a/examples/transaction_example.cc +++ b/examples/transaction_example.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -195,4 +194,3 @@ int main() { return 0; } -#endif // ROCKSDB_LITE diff --git a/file/delete_scheduler.cc b/file/delete_scheduler.cc index b97a0f224d57..78ea6f7feeb9 100644 --- a/file/delete_scheduler.cc +++ b/file/delete_scheduler.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
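For a concrete picture of how the two generators declared in env/unique_id_gen.h above are meant to be used, here is a minimal sketch; the wrapper function names are hypothetical, and the calls simply mirror the declarations and the new unit tests in this change.

#include <cstdint>

#include "env/unique_id_gen.h"

namespace ROCKSDB_NAMESPACE {

// Cycles through all 256 possible uint8_t values before any value repeats.
uint8_t ExampleSmallCounterValue() {
  static SemiStructuredUniqueIdGen gen;
  return gen.GenerateNext<uint8_t>();
}

// Produces a 128-bit value that is probabilistically unique and hard to
// predict from earlier outputs; GenerateNext() is thread safe, so a single
// static instance can be shared.
void ExampleUnpredictableId(uint64_t* upper, uint64_t* lower) {
  static UnpredictableUniqueIdGen gen;
  gen.GenerateNext(upper, lower);
}

}  // namespace ROCKSDB_NAMESPACE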
-#ifndef ROCKSDB_LITE #include "file/delete_scheduler.h" @@ -257,6 +256,7 @@ void DeleteScheduler::BackgroundEmptyTrash() { total_deleted_bytes += deleted_bytes; mu_.Lock(); if (is_complete) { + RecordTick(stats_.get(), FILES_DELETED_FROM_TRASH_QUEUE); queue_.pop(); } @@ -408,4 +408,3 @@ void DeleteScheduler::MaybeCreateBackgroundThread() { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/file/delete_scheduler.h b/file/delete_scheduler.h index 2904ec621863..da3735aed817 100644 --- a/file/delete_scheduler.h +++ b/file/delete_scheduler.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -146,4 +145,3 @@ class DeleteScheduler { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/file/delete_scheduler_test.cc b/file/delete_scheduler_test.cc index d825da32a286..46e834879a30 100644 --- a/file/delete_scheduler_test.cc +++ b/file/delete_scheduler_test.cc @@ -18,7 +18,6 @@ #include "test_util/testharness.h" #include "util/string_util.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { @@ -41,7 +40,7 @@ class DeleteSchedulerTest : public testing::Test { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); for (const auto& dummy_files_dir : dummy_files_dirs_) { - DestroyDir(env_, dummy_files_dir); + EXPECT_OK(DestroyDir(env_, dummy_files_dir)); } } @@ -83,11 +82,11 @@ class DeleteSchedulerTest : public testing::Test { std::string file_path = dummy_files_dirs_[dummy_files_dirs_idx] + "/" + file_name; std::unique_ptr f; - env_->NewWritableFile(file_path, &f, EnvOptions()); + EXPECT_OK(env_->NewWritableFile(file_path, &f, EnvOptions())); std::string data(size, 'A'); EXPECT_OK(f->Append(data)); EXPECT_OK(f->Close()); - sst_file_mgr_->OnAddFile(file_path); + EXPECT_OK(sst_file_mgr_->OnAddFile(file_path)); return file_path; } @@ -186,6 +185,8 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { ASSERT_EQ(CountTrashFiles(), 0); ASSERT_EQ(num_files, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(num_files, + stats_->getAndResetTickerCount(FILES_DELETED_FROM_TRASH_QUEUE)); ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } @@ -225,6 +226,8 @@ TEST_F(DeleteSchedulerTest, MultiDirectoryDeletionsScheduled) { } ASSERT_EQ(kNumFiles, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(kNumFiles, + stats_->getAndResetTickerCount(FILES_DELETED_FROM_TRASH_QUEUE)); ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -309,8 +312,11 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { ASSERT_EQ(CountNormalFiles(), 0); ASSERT_EQ(CountTrashFiles(), 0); - ASSERT_EQ(num_files * thread_cnt, + int total_num_files = num_files * thread_cnt; + ASSERT_EQ(total_num_files, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(total_num_files, + stats_->getAndResetTickerCount(FILES_DELETED_FROM_TRASH_QUEUE)); ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -343,6 +349,7 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) { ASSERT_EQ(bg_delete_file, 0); ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_FROM_TRASH_QUEUE)); ASSERT_EQ(num_files, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); @@ 
-382,6 +389,7 @@ TEST_F(DeleteSchedulerTest, ConflictNames) { auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 0); ASSERT_EQ(10, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(10, stats_->getAndResetTickerCount(FILES_DELETED_FROM_TRASH_QUEUE)); ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); @@ -466,6 +474,8 @@ TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) { auto bg_errors = delete_scheduler_->GetBackgroundErrors(); ASSERT_EQ(bg_errors.size(), 0); ASSERT_EQ(kTestFileNum, stats_->getAndResetTickerCount(FILES_MARKED_TRASH)); + ASSERT_EQ(kTestFileNum, + stats_->getAndResetTickerCount(FILES_DELETED_FROM_TRASH_QUEUE)); ASSERT_EQ(0, stats_->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY)); } @@ -716,9 +726,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -int main(int /*argc*/, char** /*argv*/) { - printf("DeleteScheduler is not supported in ROCKSDB_LITE\n"); - return 0; -} -#endif // ROCKSDB_LITE diff --git a/file/file_prefetch_buffer.cc b/file/file_prefetch_buffer.cc index f7d4c9591359..da4a1d0b9dd7 100644 --- a/file/file_prefetch_buffer.cc +++ b/file/file_prefetch_buffer.cc @@ -18,7 +18,7 @@ #include "port/port.h" #include "test_util/sync_point.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { @@ -81,13 +81,12 @@ void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment, Status FilePrefetchBuffer::Read(const IOOptions& opts, RandomAccessFileReader* reader, - Env::IOPriority rate_limiter_priority, uint64_t read_len, uint64_t chunk_len, uint64_t rounddown_start, uint32_t index) { Slice result; Status s = reader->Read(opts, rounddown_start + chunk_len, read_len, &result, bufs_[index].buffer_.BufferStart() + chunk_len, - /*aligned_buf=*/nullptr, rate_limiter_priority); + /*aligned_buf=*/nullptr); #ifndef NDEBUG if (result.size() < read_len) { // Fake an IO error to force db_stress fault injection to ignore @@ -99,6 +98,9 @@ Status FilePrefetchBuffer::Read(const IOOptions& opts, return s; } + if (usage_ == FilePrefetchBufferUsage::kUserScanPrefetch) { + RecordTick(stats_, PREFETCH_BYTES, read_len); + } // Update the buffer offset and size. 
bufs_[index].offset_ = rounddown_start; bufs_[index].buffer_.Size(static_cast(chunk_len) + result.size()); @@ -134,8 +136,7 @@ Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts, Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, RandomAccessFileReader* reader, - uint64_t offset, size_t n, - Env::IOPriority rate_limiter_priority) { + uint64_t offset, size_t n) { if (!enable_ || reader == nullptr) { return Status::OK(); } @@ -160,8 +161,11 @@ Status FilePrefetchBuffer::Prefetch(const IOOptions& opts, true /*refit_tail*/, chunk_len); size_t read_len = static_cast(roundup_len - chunk_len); - Status s = Read(opts, reader, rate_limiter_priority, read_len, chunk_len, - rounddown_offset, curr_); + Status s = Read(opts, reader, read_len, chunk_len, rounddown_offset, curr_); + + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && s.ok()) { + RecordInHistogram(stats_, TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, read_len); + } return s; } @@ -325,8 +329,7 @@ void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) { Status FilePrefetchBuffer::HandleOverlappingData( const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t length, size_t readahead_size, - Env::IOPriority /*rate_limiter_priority*/, bool& copy_to_third_buffer, + size_t length, size_t readahead_size, bool& copy_to_third_buffer, uint64_t& tmp_offset, size_t& tmp_length) { Status s; size_t alignment = reader->file()->GetRequiredBufferAlignment(); @@ -364,8 +367,11 @@ Status FilePrefetchBuffer::HandleOverlappingData( size_t second_size = bufs_[second].async_read_in_progress_ ? bufs_[second].async_req_len_ : bufs_[second].buffer_.CurrentSize(); - if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) { - uint64_t rounddown_start = bufs_[second].offset_ + second_size; + uint64_t rounddown_start = bufs_[second].offset_ + second_size; + // Second buffer might be out of bound if first buffer already prefetched + // that data. + if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size && + !IsOffsetOutOfBound(rounddown_start)) { uint64_t roundup_end = Roundup(rounddown_start + readahead_size, alignment); uint64_t roundup_len = roundup_end - rounddown_start; @@ -409,10 +415,11 @@ Status FilePrefetchBuffer::HandleOverlappingData( // curr_, send async request on curr_, wait for poll to fill second // buffer (if any), and copy remaining data from second buffer to third // buffer. -Status FilePrefetchBuffer::PrefetchAsyncInternal( - const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority, - bool& copy_to_third_buffer) { +Status FilePrefetchBuffer::PrefetchAsyncInternal(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t length, + size_t readahead_size, + bool& copy_to_third_buffer) { if (!enable_) { return Status::OK(); } @@ -439,8 +446,7 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal( // - switch buffers and curr_ now points to second buffer to copy remaining // data. s = HandleOverlappingData(opts, reader, offset, length, readahead_size, - rate_limiter_priority, copy_to_third_buffer, - tmp_offset, tmp_length); + copy_to_third_buffer, tmp_offset, tmp_length); if (!s.ok()) { return s; } @@ -545,7 +551,9 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal( assert(roundup_len1 >= chunk_len1); read_len1 = static_cast(roundup_len1 - chunk_len1); } - { + + // Prefetch in second buffer only if readahead_size_ > 0. 
+ if (readahead_size_ > 0) { // offset and size alignment for second buffer for asynchronous // prefetching uint64_t rounddown_start2 = roundup_end1; @@ -560,25 +568,29 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal( roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); } - uint64_t roundup_len2 = roundup_end2 - rounddown_start2; - uint64_t chunk_len2 = 0; - CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, - false /*refit_tail*/, chunk_len2); - assert(chunk_len2 == 0); - // Update the buffer offset. - bufs_[second].offset_ = rounddown_start2; - assert(roundup_len2 >= chunk_len2); - uint64_t read_len2 = static_cast(roundup_len2 - chunk_len2); - Status tmp_s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); - if (!tmp_s.ok()) { - DestroyAndClearIOHandle(second); - bufs_[second].buffer_.Clear(); + // Second buffer might be out of bound if first buffer already prefetched + // that data. + if (!IsOffsetOutOfBound(rounddown_start2)) { + uint64_t roundup_len2 = roundup_end2 - rounddown_start2; + uint64_t chunk_len2 = 0; + CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, + false /*refit_tail*/, chunk_len2); + assert(chunk_len2 == 0); + // Update the buffer offset. + bufs_[second].offset_ = rounddown_start2; + assert(roundup_len2 >= chunk_len2); + uint64_t read_len2 = static_cast(roundup_len2 - chunk_len2); + s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); + if (!s.ok()) { + DestroyAndClearIOHandle(second); + bufs_[second].buffer_.Clear(); + return s; + } } } if (read_len1 > 0) { - s = Read(opts, reader, rate_limiter_priority, read_len1, chunk_len1, - rounddown_start1, curr_); + s = Read(opts, reader, read_len1, chunk_len1, rounddown_start1, curr_); if (!s.ok()) { if (bufs_[second].io_handle_ != nullptr) { std::vector handles; @@ -606,8 +618,23 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t n, Slice* result, Status* status, - Env::IOPriority rate_limiter_priority, bool for_compaction /* = false */) { + bool ret = TryReadFromCacheUntracked(opts, reader, offset, n, result, status, + for_compaction); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheUntracked( + const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, + size_t n, Slice* result, Status* status, + bool for_compaction /* = false */) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -627,9 +654,13 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, assert(reader != nullptr); assert(max_readahead_size_ >= readahead_size_); if (for_compaction) { - s = Prefetch(opts, reader, offset, std::max(n, readahead_size_), - rate_limiter_priority); + s = Prefetch(opts, reader, offset, std::max(n, readahead_size_)); } else { + if (IsOffsetInBuffer(offset, curr_)) { + RecordTick(stats_, PREFETCH_BYTES_USEFUL, + bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() - + offset); + } if (implicit_auto_readahead_) { if (!IsEligibleForPrefetch(offset, n)) { // Ignore status as Prefetch is not called. 
@@ -637,8 +668,8 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, return false; } } - s = Prefetch(opts, reader, offset, n + readahead_size_, - rate_limiter_priority); + size_t current_readahead_size = ReadAheadSizeTuning(offset, n); + s = Prefetch(opts, reader, offset, n + current_readahead_size); } if (!s.ok()) { if (status) { @@ -653,6 +684,9 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, } else { return false; } + } else if (!for_compaction) { + RecordTick(stats_, PREFETCH_HITS); + RecordTick(stats_, PREFETCH_BYTES_USEFUL, n); } UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/); @@ -661,10 +695,25 @@ bool FilePrefetchBuffer::TryReadFromCache(const IOOptions& opts, return true; } -bool FilePrefetchBuffer::TryReadFromCacheAsync( +bool FilePrefetchBuffer::TryReadFromCacheAsync(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, + Slice* result, Status* status) { + bool ret = + TryReadFromCacheAsyncUntracked(opts, reader, offset, n, result, status); + if (usage_ == FilePrefetchBufferUsage::kTableOpenPrefetchTail && enable_) { + if (ret) { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_HIT); + } else { + RecordTick(stats_, TABLE_OPEN_PREFETCH_TAIL_MISS); + } + } + return ret; +} + +bool FilePrefetchBuffer::TryReadFromCacheAsyncUntracked( const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t n, Slice* result, Status* status, - Env::IOPriority rate_limiter_priority) { + size_t n, Slice* result, Status* status) { if (track_min_offset_ && offset < min_offset_read_) { min_offset_read_ = static_cast(offset); } @@ -704,7 +753,9 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync( (bufs_[curr_].async_read_in_progress_ || offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) { - if (readahead_size_ > 0) { + // In case readahead_size is trimmed (=0), we still want to poll the data + // submitted with explicit_prefetch_submitted_=true. + if (readahead_size_ > 0 || explicit_prefetch_submitted_) { Status s; assert(reader != nullptr); assert(max_readahead_size_ >= readahead_size_); @@ -716,10 +767,13 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync( return false; } } + + UpdateReadAheadSizeForUpperBound(offset, n); + // Prefetch n + readahead_size_/2 synchronously as remaining // readahead_size_/2 will be prefetched asynchronously. s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2, - rate_limiter_priority, copy_to_third_buffer); + copy_to_third_buffer); explicit_prefetch_submitted_ = false; if (!s.ok()) { if (status) { @@ -793,10 +847,12 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, num_file_reads_ = 0; explicit_prefetch_submitted_ = false; bool is_eligible_for_prefetching = false; + + UpdateReadAheadSizeForUpperBound(offset, n); if (readahead_size_ > 0 && (!implicit_auto_readahead_ || - num_file_reads_ + 1 >= num_file_reads_for_auto_readahead_)) { - is_eligible_for_prefetching = true; + num_file_reads_ >= num_file_reads_for_auto_readahead_)) { + is_eligible_for_prefetching = true; } // 1. Cancel any pending async read to make code simpler as buffers can be out @@ -858,18 +914,24 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, // - prefetch_size on second. // Calculate length and offsets for reading. if (!DoesBufferContainData(curr_)) { + uint64_t roundup_len1; // Prefetch full data + prefetch_size in curr_. 
- rounddown_start1 = Rounddown(offset_to_read, alignment); - roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment); - uint64_t roundup_len1 = roundup_end1 - rounddown_start1; - assert(roundup_len1 >= alignment); - assert(roundup_len1 % alignment == 0); - + if (is_eligible_for_prefetching || reader->use_direct_io()) { + rounddown_start1 = Rounddown(offset_to_read, alignment); + roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment); + roundup_len1 = roundup_end1 - rounddown_start1; + assert(roundup_len1 >= alignment); + assert(roundup_len1 % alignment == 0); + } else { + rounddown_start1 = offset_to_read; + roundup_end1 = offset_to_read + n; + roundup_len1 = roundup_end1 - rounddown_start1; + } CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_, false, chunk_len1); assert(chunk_len1 == 0); assert(roundup_len1 >= chunk_len1); - read_len1 = static_cast(roundup_len1 - chunk_len1); + read_len1 = static_cast(roundup_len1); bufs_[curr_].offset_ = rounddown_start1; } @@ -881,17 +943,20 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, rounddown_start2 = roundup_end1; } - roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); - uint64_t roundup_len2 = roundup_end2 - rounddown_start2; - - assert(roundup_len2 >= alignment); - CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, - false, chunk_len2); - assert(chunk_len2 == 0); - assert(roundup_len2 >= chunk_len2); - read_len2 = static_cast(roundup_len2 - chunk_len2); - // Update the buffer offset. - bufs_[second].offset_ = rounddown_start2; + // Second buffer might be out of bound if first buffer already prefetched + // that data. + if (!IsOffsetOutOfBound(rounddown_start2)) { + roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment); + uint64_t roundup_len2 = roundup_end2 - rounddown_start2; + + CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second, + false, chunk_len2); + assert(chunk_len2 == 0); + assert(roundup_len2 >= chunk_len2); + read_len2 = static_cast(roundup_len2 - chunk_len2); + // Update the buffer offset. + bufs_[second].offset_ = rounddown_start2; + } } if (read_len1) { @@ -905,6 +970,7 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts, prev_len_ = 0; } if (read_len2) { + TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching"); s = ReadAsync(opts, reader, read_len2, rounddown_start2, second); if (!s.ok()) { DestroyAndClearIOHandle(second); diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index a4a75fe2b2ab..d71b28ab816a 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -15,7 +15,7 @@ #include #include "file/readahead_file_info.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" @@ -54,6 +54,12 @@ struct BufferInfo { uint32_t pos_ = 0; }; +enum class FilePrefetchBufferUsage { + kTableOpenPrefetchTail, + kUserScanPrefetch, + kUnknown, +}; + // FilePrefetchBuffer is a smart buffer to store and read data from a file. class FilePrefetchBuffer { public: @@ -78,13 +84,15 @@ class FilePrefetchBuffer { // and max_readahead_size are passed in. // A user can construct a FilePrefetchBuffer without any arguments, but use // `Prefetch` to load data into the buffer. 
- FilePrefetchBuffer(size_t readahead_size = 0, size_t max_readahead_size = 0, - bool enable = true, bool track_min_offset = false, - bool implicit_auto_readahead = false, - uint64_t num_file_reads = 0, - uint64_t num_file_reads_for_auto_readahead = 0, - FileSystem* fs = nullptr, SystemClock* clock = nullptr, - Statistics* stats = nullptr) + FilePrefetchBuffer( + size_t readahead_size = 0, size_t max_readahead_size = 0, + bool enable = true, bool track_min_offset = false, + bool implicit_auto_readahead = false, uint64_t num_file_reads = 0, + uint64_t num_file_reads_for_auto_readahead = 0, + uint64_t upper_bound_offset = 0, FileSystem* fs = nullptr, + SystemClock* clock = nullptr, Statistics* stats = nullptr, + const std::function& cb = nullptr, + FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) : curr_(0), readahead_size_(readahead_size), initial_auto_readahead_size_(readahead_size), @@ -100,7 +108,10 @@ class FilePrefetchBuffer { explicit_prefetch_submitted_(false), fs_(fs), clock_(clock), - stats_(stats) { + stats_(stats), + usage_(usage), + upper_bound_offset_(upper_bound_offset), + readaheadsize_cb_(cb) { assert((num_file_reads_ >= num_file_reads_for_auto_readahead_ + 1) || (num_file_reads_ == 0)); // If ReadOptions.async_io is enabled, data is asynchronously filled in @@ -174,15 +185,15 @@ class FilePrefetchBuffer { RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); } + bool Enabled() const { return enable_; } + // Load data into the buffer from a file. + // opts : the IO options to use. // reader : the file reader. // offset : the file offset to start reading from. // n : the number of bytes to read. - // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to - // bypass. Status Prefetch(const IOOptions& opts, RandomAccessFileReader* reader, - uint64_t offset, size_t n, - Env::IOPriority rate_limiter_priority); + uint64_t offset, size_t n); // Request for reading the data from a file asynchronously. // If data already exists in the buffer, result will be updated. @@ -209,23 +220,21 @@ class FilePrefetchBuffer { // n : the number of bytes. // result : output buffer to put the data into. // s : output status. - // rate_limiter_priority : rate limiting priority, or `Env::IO_TOTAL` to - // bypass. // for_compaction : true if cache read is done for compaction read. bool TryReadFromCache(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t n, Slice* result, Status* s, - Env::IOPriority rate_limiter_priority, bool for_compaction = false); bool TryReadFromCacheAsync(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, - size_t n, Slice* result, Status* status, - Env::IOPriority rate_limiter_priority); + size_t n, Slice* result, Status* status); // The minimum `offset` ever passed to TryReadFromCache(). This will nly be // tracked if track_min_offset = true. size_t min_offset_read() const { return min_offset_read_; } + size_t GetPrefetchOffset() const { return bufs_[curr_].offset_; } + // Called in case of implicit auto prefetching. void UpdateReadPattern(const uint64_t& offset, const size_t& len, bool decrease_readaheadsize) { @@ -273,6 +282,11 @@ class FilePrefetchBuffer { // Callback function passed to underlying FS in case of asynchronous reads. 
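To illustrate the trimmed-down interface above (rate limiting now flows through IOOptions rather than a per-call rate_limiter_priority argument), here is a minimal, hypothetical caller; the helper name, the literal readahead sizes and the fallback status are illustrative only, and a real caller would keep the buffer alive across many reads so that later requests hit data prefetched by earlier ones.

#include "file/file_prefetch_buffer.h"
#include "file/random_access_file_reader.h"

namespace ROCKSDB_NAMESPACE {

// Hypothetical helper: serve a read of `n` bytes at `offset` through a
// prefetch buffer with 8 KiB initial and 64 KiB maximum readahead.
Status ExampleReadThroughPrefetchBuffer(RandomAccessFileReader* reader,
                                        uint64_t offset, size_t n,
                                        Slice* result) {
  FilePrefetchBuffer prefetch_buffer(/*readahead_size=*/8 * 1024,
                                     /*max_readahead_size=*/64 * 1024);
  IOOptions opts;
  Status s;
  if (prefetch_buffer.TryReadFromCache(opts, reader, offset, n, result, &s)) {
    return s;  // `result` now references the requested bytes
  }
  // Miss (or the buffer is disabled): a real caller would fall back to a
  // plain reader->Read(); here we only surface the reported status.
  return s.ok() ? Status::Incomplete("not served by prefetch buffer") : s;
}

}  // namespace ROCKSDB_NAMESPACE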
void PrefetchAsyncCallback(const FSReadRequest& req, void* cb_arg); + void ResetUpperBoundOffset(uint64_t upper_bound_offset) { + upper_bound_offset_ = upper_bound_offset; + readahead_size_ = initial_auto_readahead_size_; + } + private: // Calculates roundoff offset and length to be prefetched based on alignment // and data present in buffer_. It also allocates new buffer or refit tail if @@ -295,12 +309,11 @@ class FilePrefetchBuffer { Status PrefetchAsyncInternal(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t length, size_t readahead_size, - Env::IOPriority rate_limiter_priority, bool& copy_to_third_buffer); Status Read(const IOOptions& opts, RandomAccessFileReader* reader, - Env::IOPriority rate_limiter_priority, uint64_t read_len, - uint64_t chunk_len, uint64_t rounddown_start, uint32_t index); + uint64_t read_len, uint64_t chunk_len, uint64_t rounddown_start, + uint32_t index); Status ReadAsync(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t read_len, uint64_t rounddown_start, uint32_t index); @@ -383,6 +396,12 @@ class FilePrefetchBuffer { bufs_[second].offset_)) { return false; } + + // Readahead size can be 0 because of trimming. + if (readahead_size_ == 0) { + return false; + } + bufs_[second].buffer_.Clear(); return true; } @@ -399,10 +418,54 @@ class FilePrefetchBuffer { Status HandleOverlappingData(const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset, size_t length, size_t readahead_size, - Env::IOPriority rate_limiter_priority, bool& copy_to_third_buffer, uint64_t& tmp_offset, size_t& tmp_length); + bool TryReadFromCacheUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* s, + bool for_compaction = false); + + bool TryReadFromCacheAsyncUntracked(const IOOptions& opts, + RandomAccessFileReader* reader, + uint64_t offset, size_t n, Slice* result, + Status* status); + + void UpdateReadAheadSizeForUpperBound(uint64_t offset, size_t n) { + // Adjust readhahead_size till upper_bound if upper_bound_offset_ is + // set. + if (readahead_size_ > 0 && upper_bound_offset_ > 0 && + upper_bound_offset_ > offset) { + if (upper_bound_offset_ < offset + n + readahead_size_) { + readahead_size_ = (upper_bound_offset_ - offset) - n; + RecordTick(stats_, READAHEAD_TRIMMED); + } + } + } + + inline bool IsOffsetOutOfBound(uint64_t offset) { + if (upper_bound_offset_ > 0) { + return (offset >= upper_bound_offset_); + } + return false; + } + + // Performs tuning to calculate readahead_size. + size_t ReadAheadSizeTuning(uint64_t offset, size_t n) { + UpdateReadAheadSizeForUpperBound(offset, n); + + if (readaheadsize_cb_ != nullptr && readahead_size_ > 0) { + size_t updated_readahead_size = 0; + readaheadsize_cb_(offset, readahead_size_, updated_readahead_size); + if (readahead_size_ != updated_readahead_size) { + RecordTick(stats_, READAHEAD_TRIMMED); + } + return updated_readahead_size; + } + return readahead_size_; + } + std::vector bufs_; // curr_ represents the index for bufs_ indicating which buffer is being // consumed currently. @@ -442,5 +505,13 @@ class FilePrefetchBuffer { FileSystem* fs_; SystemClock* clock_; Statistics* stats_; + + FilePrefetchBufferUsage usage_; + + // upper_bound_offset_ is set when ReadOptions.iterate_upper_bound and + // ReadOptions.auto_readahead_size are set to trim readahead_size upto + // upper_bound_offset_ during prefetching. 
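To make the trimming rule above concrete with hypothetical numbers: for offset = 4096, n = 8192, readahead_size_ = 65536 and upper_bound_offset_ = 32768, UpdateReadAheadSizeForUpperBound() sees that 4096 + 8192 + 65536 would overshoot the bound, shrinks readahead_size_ to (32768 - 4096) - 8192 = 20480 and records READAHEAD_TRIMMED; any offset at or beyond 32768 is then reported as out of bound by IsOffsetOutOfBound().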
+ uint64_t upper_bound_offset_ = 0; + std::function readaheadsize_cb_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/file/file_util.cc b/file/file_util.cc index 7997d6e11eb0..9eee106378b9 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -13,6 +13,7 @@ #include "file/sst_file_manager_impl.h" #include "file/writable_file_writer.h" #include "rocksdb/env.h" +#include "rocksdb/statistics.h" namespace ROCKSDB_NAMESPACE { @@ -116,7 +117,6 @@ IOStatus CreateFile(FileSystem* fs, const std::string& destination, Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& fname, const std::string& dir_to_sync, const bool force_bg, const bool force_fg) { -#ifndef ROCKSDB_LITE SstFileManagerImpl* sfm = static_cast(db_options->sst_file_manager.get()); if (sfm && !force_fg) { @@ -124,14 +124,6 @@ Status DeleteDBFile(const ImmutableDBOptions* db_options, } else { return db_options->env->DeleteFile(fname); } -#else - (void)dir_to_sync; - (void)force_bg; - (void)force_fg; - // SstFileManager is not supported in ROCKSDB_LITE - // Delete file immediately - return db_options->env->DeleteFile(fname); -#endif } // requested_checksum_func_name brings the function name of the checksum @@ -144,9 +136,9 @@ IOStatus GenerateOneFileChecksum( FileChecksumGenFactory* checksum_factory, const std::string& requested_checksum_func_name, std::string* file_checksum, std::string* file_checksum_func_name, - size_t verify_checksums_readahead_size, bool allow_mmap_reads, + size_t verify_checksums_readahead_size, bool /*allow_mmap_reads*/, std::shared_ptr& io_tracer, RateLimiter* rate_limiter, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options, Statistics* stats, SystemClock* clock) { if (checksum_factory == nullptr) { return IOStatus::InvalidArgument("Checksum factory is invalid"); } @@ -194,9 +186,9 @@ IOStatus GenerateOneFileChecksum( if (!io_s.ok()) { return io_s; } - reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, - nullptr /*Env*/, io_tracer, nullptr, - 0, nullptr, rate_limiter)); + reader.reset(new RandomAccessFileReader( + std::move(r_file), file_path, clock, io_tracer, stats, + Histograms::SST_READ_MICROS, nullptr, rate_limiter)); } // Found that 256 KB readahead size provides the best performance, based on @@ -205,22 +197,28 @@ IOStatus GenerateOneFileChecksum( size_t readahead_size = (verify_checksums_readahead_size != 0) ? 
verify_checksums_readahead_size : default_max_read_ahead_size; - - FilePrefetchBuffer prefetch_buffer(readahead_size /* readahead_size */, - readahead_size /* max_readahead_size */, - !allow_mmap_reads /* enable */); + std::unique_ptr<char[]> buf; + if (reader->use_direct_io()) { + size_t alignment = reader->file()->GetRequiredBufferAlignment(); + readahead_size = (readahead_size + alignment - 1) & ~(alignment - 1); + } + buf.reset(new char[readahead_size]); Slice slice; uint64_t offset = 0; IOOptions opts; + io_s = reader->PrepareIOOptions(read_options, opts); + if (!io_s.ok()) { + return io_s; + } while (size > 0) { size_t bytes_to_read = static_cast<size_t>(std::min(uint64_t{readahead_size}, size)); - if (!prefetch_buffer.TryReadFromCache( - opts, reader.get(), offset, bytes_to_read, &slice, - nullptr /* status */, rate_limiter_priority, - false /* for_compaction */)) { - return IOStatus::Corruption("file read failed"); + io_s = + reader->Read(opts, offset, bytes_to_read, &slice, buf.get(), nullptr); + if (!io_s.ok()) { + return IOStatus::Corruption("file read failed with error: " + + io_s.ToString()); } if (slice.size() == 0) { return IOStatus::Corruption("file too small"); @@ -228,6 +226,8 @@ IOStatus GenerateOneFileChecksum( checksum_generator->Update(slice.data(), slice.size()); size -= slice.size(); offset += slice.size(); + + TEST_SYNC_POINT("GenerateOneFileChecksum::Chunk:0"); } checksum_generator->Finalize(); *file_checksum = checksum_generator->GetChecksum(); diff --git a/file/file_util.h b/file/file_util.h index f9d81dbdede5..2c91718eeb6a 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -11,6 +11,7 @@ #include "rocksdb/env.h" #include "rocksdb/file_system.h" #include "rocksdb/sst_file_writer.h" +#include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/system_clock.h" #include "rocksdb/types.h" @@ -52,6 +53,7 @@ extern Status DeleteDBFile(const ImmutableDBOptions* db_options, const std::string& path_to_sync, const bool force_bg, const bool force_fg); +// TODO(hx235): pass the whole DBOptions instead of its individual fields extern IOStatus GenerateOneFileChecksum( FileSystem* fs, const std::string& file_path, FileChecksumGenFactory* checksum_factory, @@ -59,7 +61,7 @@ extern IOStatus GenerateOneFileChecksum( const std::string& requested_checksum_func_name, std::string* file_checksum, std::string* file_checksum_func_name, size_t verify_checksums_readahead_size, bool allow_mmap_reads, std::shared_ptr<IOTracer>& io_tracer, RateLimiter* rate_limiter, - Env::IOPriority rate_limiter_priority); + const ReadOptions& read_options, Statistics* stats, SystemClock* clock); inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, SystemClock* clock, IOOptions& opts) { @@ -86,10 +88,22 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, } opts.rate_limiter_priority = ro.rate_limiter_priority; + opts.io_activity = ro.io_activity; + return IOStatus::OK(); } // Test method to delete the input directory and all of its contents. // This method is destructive and is meant for use only in tests!!! 
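Returning to the direct-I/O branch of GenerateOneFileChecksum() above: the expression (readahead_size + alignment - 1) & ~(alignment - 1) rounds the chunk size up to the next multiple of the buffer alignment (the bit trick assumes a power-of-two alignment). With a hypothetical alignment of 4096, a requested 250000-byte chunk becomes 253952 (62 * 4096), while the already-aligned 256 KiB default stays unchanged.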
Status DestroyDir(Env* env, const std::string& dir); + +inline bool CheckFSFeatureSupport(FileSystem* fs, FSSupportedOps feat) { + int64_t supported_ops = 0; + fs->SupportedOps(supported_ops); + if (supported_ops & (1ULL << feat)) { + return true; + } + return false; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 23e7454ed115..84932440a037 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -13,6 +13,11 @@ #endif #include "util/random.h" +namespace { +static bool enable_io_uring = true; +extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; } +} // namespace + namespace ROCKSDB_NAMESPACE { class MockFS; @@ -20,10 +25,12 @@ class MockFS; class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper { public: MockRandomAccessFile(std::unique_ptr& file, - bool support_prefetch, std::atomic_int& prefetch_count) + bool support_prefetch, std::atomic_int& prefetch_count, + bool small_buffer_alignment = false) : FSRandomAccessFileOwnerWrapper(std::move(file)), support_prefetch_(support_prefetch), - prefetch_count_(prefetch_count) {} + prefetch_count_(prefetch_count), + small_buffer_alignment_(small_buffer_alignment) {} IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) override { @@ -35,16 +42,25 @@ class MockRandomAccessFile : public FSRandomAccessFileOwnerWrapper { } } + size_t GetRequiredBufferAlignment() const override { + return small_buffer_alignment_ + ? 1 + : FSRandomAccessFileOwnerWrapper::GetRequiredBufferAlignment(); + } + private: const bool support_prefetch_; std::atomic_int& prefetch_count_; + const bool small_buffer_alignment_; }; class MockFS : public FileSystemWrapper { public: explicit MockFS(const std::shared_ptr& wrapped, - bool support_prefetch) - : FileSystemWrapper(wrapped), support_prefetch_(support_prefetch) {} + bool support_prefetch, bool small_buffer_alignment = false) + : FileSystemWrapper(wrapped), + support_prefetch_(support_prefetch), + small_buffer_alignment_(small_buffer_alignment) {} static const char* kClassName() { return "MockFS"; } const char* Name() const override { return kClassName(); } @@ -56,8 +72,8 @@ class MockFS : public FileSystemWrapper { std::unique_ptr file; IOStatus s; s = target()->NewRandomAccessFile(fname, opts, &file, dbg); - result->reset( - new MockRandomAccessFile(file, support_prefetch_, prefetch_count_)); + result->reset(new MockRandomAccessFile( + file, support_prefetch_, prefetch_count_, small_buffer_alignment_)); return s; } @@ -71,6 +87,7 @@ class MockFS : public FileSystemWrapper { private: const bool support_prefetch_; + const bool small_buffer_alignment_; std::atomic_int prefetch_count_{0}; }; @@ -80,7 +97,8 @@ class PrefetchTest public: PrefetchTest() : DBTestBase("prefetch_test", true) {} - void SetGenericOptions(Env* env, bool use_direct_io, Options& options) { + virtual void SetGenericOptions(Env* env, bool use_direct_io, + Options& options) { options = CurrentOptions(); options.write_buffer_size = 1024; options.create_if_missing = true; @@ -110,7 +128,14 @@ std::string BuildKey(int num, std::string postfix = "") { return "my_key_" + std::to_string(num) + postfix; } -// This test verifies the basic functionality of prefetching. +// This test verifies the following basic functionalities of prefetching: +// (1) If underline file system supports prefetch, and directIO is not enabled +// make sure prefetch() is called and FilePrefetchBuffer is not used. 
+// (2) If underline file system doesn't support prefetch, or directIO is +// enabled, make sure prefetch() is not called and FilePrefetchBuffer is +// used. +// (3) Measure read bytes, hit and miss of SST's tail prefetching during table +// open. TEST_P(PrefetchTest, Basic) { // First param is if the mockFS support_prefetch or not bool support_prefetch = @@ -125,6 +150,7 @@ TEST_P(PrefetchTest, Basic) { std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options; SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); const int kNumKeys = 1100; int buff_prefetch_count = 0; @@ -143,16 +169,18 @@ TEST_P(PrefetchTest, Basic) { // create first key range WriteBatch batch; for (int i = 0; i < kNumKeys; i++) { - ASSERT_OK(batch.Put(BuildKey(i), "value for range 1 key")); + ASSERT_OK(batch.Put(BuildKey(i), "v1")); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(db_->Flush(FlushOptions())); // create second key range batch.Clear(); for (int i = 0; i < kNumKeys; i++) { - ASSERT_OK(batch.Put(BuildKey(i, "key2"), "value for range 2 key")); + ASSERT_OK(batch.Put(BuildKey(i, "key2"), "v2")); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(db_->Flush(FlushOptions())); // delete second key range batch.Clear(); @@ -160,6 +188,23 @@ TEST_P(PrefetchTest, Basic) { ASSERT_OK(batch.Delete(BuildKey(i, "key2"))); } ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(db_->Flush(FlushOptions())); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + const size_t num_file = metadata.size(); + // To verify SST file tail prefetch (once per file) during flush output + // verification + if (support_prefetch && !use_direct_io) { + ASSERT_TRUE(fs->IsPrefetchCalled()); + ASSERT_EQ(num_file, fs->GetPrefetchCount()); + ASSERT_EQ(0, buff_prefetch_count); + fs->ClearPrefetchCount(); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + ASSERT_EQ(buff_prefetch_count, num_file); + buff_prefetch_count = 0; + } // compact database std::string start_key = BuildKey(0); @@ -167,22 +212,46 @@ TEST_P(PrefetchTest, Basic) { Slice least(start_key.data(), start_key.size()); Slice greatest(end_key.data(), end_key.size()); + HistogramData prev_table_open_prefetch_tail_read; + options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + &prev_table_open_prefetch_tail_read); + const uint64_t prev_table_open_prefetch_tail_miss = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS); + const uint64_t prev_table_open_prefetch_tail_hit = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + // commenting out the line below causes the example to work correctly ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + HistogramData cur_table_open_prefetch_tail_read; + options.statistics->histogramData(TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + &cur_table_open_prefetch_tail_read); + const uint64_t cur_table_open_prefetch_tail_miss = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_MISS); + const uint64_t cur_table_open_prefetch_tail_hit = + options.statistics->getTickerCount(TABLE_OPEN_PREFETCH_TAIL_HIT); + + // To verify prefetch during compaction input read if (support_prefetch && !use_direct_io) { - // If underline file system supports prefetch, and directIO is not enabled - // make sure prefetch() is called and FilePrefetchBuffer is not used. 
ASSERT_TRUE(fs->IsPrefetchCalled()); - fs->ClearPrefetchCount(); + // To rule out false positive by the SST file tail prefetch during + // compaction output verification + ASSERT_GT(fs->GetPrefetchCount(), 1); ASSERT_EQ(0, buff_prefetch_count); + fs->ClearPrefetchCount(); } else { - // If underline file system doesn't support prefetch, or directIO is - // enabled, make sure prefetch() is not called and FilePrefetchBuffer is - // used. ASSERT_FALSE(fs->IsPrefetchCalled()); - ASSERT_GT(buff_prefetch_count, 0); + // To rule out false positive by the SST file tail prefetch during + // compaction output verification + ASSERT_GT(buff_prefetch_count, 1); buff_prefetch_count = 0; + + ASSERT_GT(cur_table_open_prefetch_tail_read.count, + prev_table_open_prefetch_tail_read.count); + ASSERT_GT(cur_table_open_prefetch_tail_hit, + prev_table_open_prefetch_tail_hit); + ASSERT_GE(cur_table_open_prefetch_tail_miss, + prev_table_open_prefetch_tail_miss); } // count the keys @@ -192,9 +261,11 @@ TEST_P(PrefetchTest, Basic) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { num_keys++; } + ASSERT_OK(iter->status()); + (void)num_keys; } - // Make sure prefetch is called only if file system support prefetch. + // To verify prefetch during user scan if (support_prefetch && !use_direct_io) { ASSERT_TRUE(fs->IsPrefetchCalled()); fs->ClearPrefetchCount(); @@ -207,7 +278,221 @@ TEST_P(PrefetchTest, Basic) { Close(); } -#ifndef ROCKSDB_LITE +class PrefetchTailTest : public PrefetchTest { + public: + bool SupportPrefetch() const { + return std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + } + + bool UseDirectIO() const { return std::get<1>(GetParam()); } + + bool UseFilePrefetchBuffer() const { + return !SupportPrefetch() || UseDirectIO(); + } + + Env* GetEnv(bool small_buffer_alignment = false) const { + std::shared_ptr fs = std::make_shared( + env_->GetFileSystem(), SupportPrefetch(), small_buffer_alignment); + + return new CompositeEnvWrapper(env_, fs); + } + + void SetGenericOptions(Env* env, bool use_direct_io, + Options& options) override { + PrefetchTest::SetGenericOptions(env, use_direct_io, options); + options.statistics = CreateDBStatistics(); + } + + void SetBlockBasedTableOptions( + BlockBasedTableOptions& table_options, bool partition_filters = true, + uint64_t metadata_block_size = + BlockBasedTableOptions().metadata_block_size, + bool use_small_cache = false) { + table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + table_options.partition_filters = partition_filters; + if (table_options.partition_filters) { + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + } + table_options.metadata_block_size = metadata_block_size; + + if (use_small_cache) { + LRUCacheOptions co; + co.capacity = 1; + std::shared_ptr cache = NewLRUCache(co); + table_options.block_cache = cache; + } + } + + int64_t GetNumIndexPartition() const { + int64_t index_partition_counts = 0; + TablePropertiesCollection all_table_props; + assert(db_->GetPropertiesOfAllTables(&all_table_props).ok()); + for (const auto& name_and_table_props : all_table_props) { + const auto& table_props = name_and_table_props.second; + index_partition_counts += table_props->index_partitions; + } + return index_partition_counts; + } +}; + +INSTANTIATE_TEST_CASE_P(PrefetchTailTest, PrefetchTailTest, + ::testing::Combine(::testing::Bool(), + ::testing::Bool())); + +TEST_P(PrefetchTailTest, Basic) { + std::unique_ptr env(GetEnv()); + Options options; + 
SetGenericOptions(env.get(), UseDirectIO(), options); + + BlockBasedTableOptions bbto; + SetBlockBasedTableOptions(bbto); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + Status s = TryReopen(options); + if (UseDirectIO() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + ROCKSDB_GTEST_BYPASS("Direct IO is not supported"); + return; + } else { + ASSERT_OK(s); + } + + ASSERT_OK(Put("k1", "v1")); + + HistogramData pre_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &pre_flush_file_read); + ASSERT_OK(Flush()); + HistogramData post_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &post_flush_file_read); + if (UseFilePrefetchBuffer()) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. Therefore `BlockBasedTable::PrefetchTail()` + // should be the only SST read in table verification during flush. + ASSERT_EQ(post_flush_file_read.count - pre_flush_file_read.count, 1); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // will initiate extra SST reads + ASSERT_GT(post_flush_file_read.count - pre_flush_file_read.count, 1); + } + ASSERT_OK(Put("k1", "v2")); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + HistogramData pre_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &pre_compaction_file_read); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + HistogramData post_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &post_compaction_file_read); + if (UseFilePrefetchBuffer()) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. + // + // Therefore the 3 reads are + // (1) `ProcessKeyValueCompaction()` of input file 1 + // (2) `ProcessKeyValueCompaction()` of input file 2 + // (3) `BlockBasedTable::PrefetchTail()` of output file during table + // verification in compaction + ASSERT_EQ(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // as well as reading other parts of the tail (e.g, footer, table + // properties..) 
will initiate extra SST reads + ASSERT_GT(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } + Close(); +} + +TEST_P(PrefetchTailTest, UpgradeToTailSizeInManifest) { + if (!UseFilePrefetchBuffer()) { + ROCKSDB_GTEST_BYPASS( + "Upgrade to tail size in manifest is only relevant when RocksDB file " + "prefetch buffer is used."); + } + if (UseDirectIO()) { + ROCKSDB_GTEST_BYPASS( + "To simplify testing logics with setting file's buffer alignment to " + "be " + "1, direct IO is required to be disabled."); + } + + std::unique_ptr env(GetEnv(true /* small_buffer_alignment */)); + Options options; + SetGenericOptions(env.get(), false /* use_direct_io*/, options); + options.max_open_files = -1; + options.write_buffer_size = 1024 * 1024; + + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options, false /* partition_filters */, + 1 /* metadata_block_size*/, + true /* use_small_cache */); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + SyncPoint::GetInstance()->EnableProcessing(); + // To simulate a pre-upgrade DB where file tail size is not recorded in + // manifest + SyncPoint::GetInstance()->SetCallBack( + "FileMetaData::FileMetaData", [&](void* arg) { + FileMetaData* meta = static_cast(arg); + meta->tail_size = 0; + }); + + ASSERT_OK(TryReopen(options)); + for (int i = 0; i < 10000; ++i) { + ASSERT_OK(Put("k" + std::to_string(i), "v")); + } + ASSERT_OK(Flush()); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + + // To simulate a DB undergoing the upgrade where tail size to prefetch is + // inferred to be a small number for files with no tail size recorded in + // manifest. + // "1" is chosen to be such number so that with `small_buffer_alignment == + // true` and `use_small_cache == true`, it would have caused one file read + // per index partition during db open if the upgrade is done wrong. + SyncPoint::GetInstance()->SetCallBack( + "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { + std::pair* prefetch_off_len_pair = + static_cast*>(arg); + size_t* prefetch_off = prefetch_off_len_pair->first; + size_t* tail_size = prefetch_off_len_pair->second; + const size_t file_size = *prefetch_off + *tail_size; + + *tail_size = 1; + *prefetch_off = file_size - (*tail_size); + }); + + ASSERT_OK(TryReopen(options)); + + SyncPoint::GetInstance()->ClearAllCallBacks(); + SyncPoint::GetInstance()->DisableProcessing(); + + HistogramData db_open_file_read; + options.statistics->histogramData(FILE_READ_DB_OPEN_MICROS, + &db_open_file_read); + + int64_t num_index_partition = GetNumIndexPartition(); + // If the upgrade is done right, db open will prefetch all the index + // partitions at once, instead of doing one read per partition. + // That is, together with `metadata_block_size == 1`, there will be more + // index partitions than number of non index partitions reads. + ASSERT_LT(db_open_file_read.count, num_index_partition); + + Close(); +} + // This test verifies BlockBasedTableOptions.max_auto_readahead_size is // configured dynamically. 
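// For reference, a minimal sketch of how this option is changed at runtime
// (illustrative only; the value 16384 is hypothetical, the SetOptions string
// format is the one exercised by the tests in this file):
//
//   ASSERT_OK(db_->SetOptions(
//       {{"block_based_table_factory", "{max_auto_readahead_size=16384;}"}}));
//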
TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { @@ -270,7 +555,7 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { } Close(); std::vector buff_prefectch_level_count = {0, 0, 0}; - TryReopen(options); + ASSERT_OK(TryReopen(options)); { auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); fs->ClearPrefetchCount(); @@ -370,9 +655,6 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", [&](void*) { buff_prefetch_count++; }); - - SyncPoint::GetInstance()->EnableProcessing(); - SyncPoint::GetInstance()->EnableProcessing(); Status s = TryReopen(options); @@ -398,7 +680,7 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { } Close(); - TryReopen(options); + ASSERT_OK(TryReopen(options)); { auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); fs->ClearPrefetchCount(); @@ -415,8 +697,8 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { "{initial_auto_readahead_size=0;}"}})); break; case 1: - // intial_auto_readahead_size and max_auto_readahead_size are set same - // so readahead_size remains same. + // intial_auto_readahead_size and max_auto_readahead_size are set + // same so readahead_size remains same. ASSERT_OK(db_->SetOptions({{"block_based_table_factory", "{initial_auto_readahead_size=4096;max_" "auto_readahead_size=4096;}"}})); @@ -434,6 +716,7 @@ TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { iter->Seek(Key(key_count++)); iter->Next(); } + ASSERT_OK(iter->status()); buff_prefetch_level_count[level] = buff_prefetch_count; if (support_prefetch && !use_direct_io) { @@ -513,7 +796,7 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) { ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); Close(); - TryReopen(options); + ASSERT_OK(TryReopen(options)); fs->ClearPrefetchCount(); buff_prefetch_count = 0; @@ -523,8 +806,9 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) { /* * Reseek keys from sequential Data Blocks within same partitioned * index. It will prefetch the data block at the first seek since - * num_file_reads_for_auto_readahead = 0. Data Block size is nearly 4076 so - * readahead will fetch 8 * 1024 data more initially (2 more data blocks). + * num_file_reads_for_auto_readahead = 0. Data Block size is nearly 4076 + * so readahead will fetch 8 * 1024 data more initially (2 more data + * blocks). */ iter->Seek(BuildKey(0)); // Prefetch data + index block since // num_file_reads_for_auto_readahead = 0. @@ -559,7 +843,6 @@ TEST_P(PrefetchTest, ConfigureNumFilesReadsForReadaheadSize) { SyncPoint::GetInstance()->ClearAllCallBacks(); Close(); } -#endif // !ROCKSDB_LITE // This test verifies the basic functionality of implicit autoreadahead: // - Enable implicit autoreadahead and prefetch only if sequential blocks are @@ -623,8 +906,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { /* * Reseek keys from sequential Data Blocks within same partitioned * index. After 2 sequential reads it will prefetch the data block. - * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more - * initially (2 more data blocks). + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data + * more initially (2 more data blocks). 
*/ iter->Seek(BuildKey(0)); ASSERT_TRUE(iter->Valid()); @@ -701,9 +984,9 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { { /* * Reseek keys from sequential data blocks to set implicit auto readahead - * and prefetch data but after that iterate over different (non sequential) - * data blocks which won't prefetch any data further. So buff_prefetch_count - * will be 1 for the first one. + * and prefetch data but after that iterate over different (non + * sequential) data blocks which won't prefetch any data further. So + * buff_prefetch_count will be 1 for the first one. */ auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); iter->Seek(BuildKey(0)); @@ -730,8 +1013,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { buff_prefetch_count = 0; } - // Read sequentially to confirm readahead_size is reset to initial value (2 - // more data blocks) + // Read sequentially to confirm readahead_size is reset to initial value + // (2 more data blocks) iter->Seek(BuildKey(1011)); ASSERT_TRUE(iter->Valid()); iter->Seek(BuildKey(1015)); @@ -781,8 +1064,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { } { /* - * Reseek over different keys from different blocks. buff_prefetch_count is - * set 0. + * Reseek over different keys from different blocks. buff_prefetch_count + * is set 0. */ auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); int i = 0; @@ -790,6 +1073,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { do { iter->Seek(BuildKey(i)); if (!iter->Valid()) { + ASSERT_OK(iter->status()); break; } i = i + 100; @@ -809,6 +1093,7 @@ TEST_P(PrefetchTest, PrefetchWhenReseek) { auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { } + ASSERT_OK(iter->status()); if (support_prefetch && !use_direct_io) { ASSERT_EQ(fs->GetPrefetchCount(), 13); fs->ClearPrefetchCount(); @@ -886,8 +1171,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { /* * Reseek keys from sequential Data Blocks within same partitioned * index. After 2 sequential reads it will prefetch the data block. - * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data more - * initially (2 more data blocks). + * Data Block size is nearly 4076 so readahead will fetch 8 * 1024 data + * more initially (2 more data blocks). */ auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); // Warm up the cache @@ -914,8 +1199,8 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { ASSERT_TRUE(iter->Valid()); iter->Seek(BuildKey(1004)); // Prefetch data (not in cache). ASSERT_TRUE(iter->Valid()); - // Missed one sequential block but next is in already in buffer so readahead - // will not be reset. + // Missed one sequential block but next is in already in buffer so + // readahead will not be reset. iter->Seek(BuildKey(1011)); ASSERT_TRUE(iter->Valid()); // Prefetch data but blocks are in cache so no prefetch and reset. 
@@ -949,7 +1234,271 @@ TEST_P(PrefetchTest, PrefetchWhenReseekwithCache) { Close(); } -#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest, PrefetchWithBlockLookupAutoTuneTest) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + + std::shared_ptr fs = + std::make_shared(FileSystem::Default(), false); + + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options; + SetGenericOptions(env.get(), /*use_direct_io=*/false, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + ASSERT_OK(s); + + Random rnd(309); + WriteBatch batch; + + for (int i = 0; i < 26; i++) { + std::string key = "my_key_"; + + for (int j = 0; j < 10; j++) { + key += char('a' + i); + ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); + } + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = "my_key_a"; + + std::string end_key = "my_key_"; + for (int j = 0; j < 10; j++) { + end_key += char('a' + 25); + } + + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + // Try with different num_file_reads_for_auto_readahead from 0 to 3. + for (size_t i = 0; i < 3; i++) { + std::shared_ptr cache = NewLRUCache(1024 * 1024, 2); + table_options.block_cache = cache; + table_options.no_block_cache = false; + table_options.num_file_reads_for_auto_readahead = i; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + s = TryReopen(options); + ASSERT_OK(s); + + // Warm up the cache. + { + auto iter = std::unique_ptr(db_->NewIterator(ReadOptions())); + + iter->Seek("my_key_bbb"); + ASSERT_TRUE(iter->Valid()); + + iter->Seek("my_key_ccccccccc"); + ASSERT_TRUE(iter->Valid()); + + iter->Seek("my_key_ddd"); + ASSERT_TRUE(iter->Valid()); + + iter->Seek("my_key_ddddddd"); + ASSERT_TRUE(iter->Valid()); + + iter->Seek("my_key_e"); + ASSERT_TRUE(iter->Valid()); + + iter->Seek("my_key_eeeee"); + ASSERT_TRUE(iter->Valid()); + + iter->Seek("my_key_eeeeeeeee"); + ASSERT_TRUE(iter->Valid()); + } + + ReadOptions ropts; + ropts.auto_readahead_size = true; + ReadOptions cmp_ro; + cmp_ro.auto_readahead_size = false; + + if (std::get<0>(GetParam())) { + ropts.readahead_size = cmp_ro.readahead_size = 32768; + } + + // With and without tuning readahead_size. + { + ASSERT_OK(options.statistics->Reset()); + // Seek. + { + Slice ub = Slice("my_key_uuu"); + Slice* ub_ptr = &ub; + cmp_ro.iterate_upper_bound = ub_ptr; + ropts.iterate_upper_bound = ub_ptr; + + auto iter = std::unique_ptr(db_->NewIterator(ropts)); + auto cmp_iter = std::unique_ptr(db_->NewIterator(cmp_ro)); + + Slice seek_key = Slice("my_key_aaa"); + iter->Seek(seek_key); + cmp_iter->Seek(seek_key); + + while (iter->Valid() && cmp_iter->Valid()) { + if (iter->key() != cmp_iter->key()) { + // Error + ASSERT_TRUE(false); + } + iter->Next(); + cmp_iter->Next(); + } + + uint64_t readahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + ASSERT_GT(readahead_trimmed, 0); + + ASSERT_OK(cmp_iter->status()); + ASSERT_OK(iter->status()); + } + + // Reseek with new upper_bound_iterator. 
+ { + Slice ub = Slice("my_key_y"); + ropts.iterate_upper_bound = &ub; + cmp_ro.iterate_upper_bound = &ub; + + auto iter = std::unique_ptr(db_->NewIterator(ropts)); + auto cmp_iter = std::unique_ptr(db_->NewIterator(cmp_ro)); + + Slice reseek_key = Slice("my_key_v"); + iter->Seek(reseek_key); + cmp_iter->Seek(reseek_key); + + while (iter->Valid() && cmp_iter->Valid()) { + if (iter->key() != cmp_iter->key()) { + // Error + ASSERT_TRUE(false); + } + iter->Next(); + cmp_iter->Next(); + } + + uint64_t readahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + ASSERT_GT(readahead_trimmed, 0); + + ASSERT_OK(cmp_iter->status()); + ASSERT_OK(iter->status()); + } + } + Close(); + } +} + +TEST_F(PrefetchTest, PrefetchWithBlockLookupAutoTuneWithPrev) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + + // First param is if the mockFS support_prefetch or not + std::shared_ptr fs = + std::make_shared(FileSystem::Default(), false); + + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options; + SetGenericOptions(env.get(), /*use_direct_io=*/false, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + std::shared_ptr cache = NewLRUCache(1024 * 1024, 2); + table_options.block_cache = cache; + table_options.no_block_cache = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + ASSERT_OK(s); + + Random rnd(309); + WriteBatch batch; + + for (int i = 0; i < 26; i++) { + std::string key = "my_key_"; + + for (int j = 0; j < 10; j++) { + key += char('a' + i); + ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); + } + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = "my_key_a"; + + std::string end_key = "my_key_"; + for (int j = 0; j < 10; j++) { + end_key += char('a' + 25); + } + + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + ReadOptions ropts; + ropts.auto_readahead_size = true; + + { + // Seek. + Slice ub = Slice("my_key_uuu"); + Slice* ub_ptr = &ub; + ropts.iterate_upper_bound = ub_ptr; + ropts.auto_readahead_size = true; + + ReadOptions cmp_readopts = ropts; + cmp_readopts.auto_readahead_size = false; + + auto iter = std::unique_ptr(db_->NewIterator(ropts)); + auto cmp_iter = std::unique_ptr(db_->NewIterator(cmp_readopts)); + + Slice seek_key = Slice("my_key_bbb"); + { + cmp_iter->Seek(seek_key); + ASSERT_TRUE(cmp_iter->Valid()); + ASSERT_OK(cmp_iter->status()); + + iter->Seek(seek_key); + ASSERT_TRUE(iter->Valid()); + ASSERT_OK(iter->status()); + + ASSERT_EQ(iter->key(), cmp_iter->key()); + } + + // Prev op should pass with auto tuning of readahead_size. + { + cmp_iter->Prev(); + ASSERT_TRUE(cmp_iter->Valid()); + ASSERT_OK(cmp_iter->status()); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + + ASSERT_EQ(iter->key(), cmp_iter->key()); + } + + // Reseek would follow as usual. + { + cmp_iter->Seek(seek_key); + ASSERT_TRUE(cmp_iter->Valid()); + ASSERT_OK(cmp_iter->status()); + + iter->Seek(seek_key); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), cmp_iter->key()); + } + } + Close(); +} + // This test verifies the functionality of ReadOptions.adaptive_readahead. 
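// For reference, a minimal usage sketch (illustrative; the scan body is
// elided, the option is the one exercised by the tests below):
//
//   ReadOptions ro;
//   ro.adaptive_readahead = true;  // let RocksDB adapt readahead_size to the scan
//   auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
//   for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { /* read keys */ }
//   ASSERT_OK(iter->status());
//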
TEST_P(PrefetchTest, DBIterLevelReadAhead) { const int kNumKeys = 1000; @@ -1032,6 +1581,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { ASSERT_OK(iter->status()); num_keys++; } + ASSERT_OK(iter->status()); ASSERT_EQ(num_keys, total_keys); // For index and data blocks. @@ -1051,10 +1601,14 @@ TEST_P(PrefetchTest, DBIterLevelReadAhead) { // This test verifies the functionality of ReadOptions.adaptive_readahead when // async_io is enabled. TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } const int kNumKeys = 1000; // Set options std::shared_ptr fs = - std::make_shared(env_->GetFileSystem(), false); + std::make_shared(FileSystem::Default(), false); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); bool use_direct_io = std::get<0>(GetParam()); @@ -1088,16 +1642,26 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { } MoveFilesToLevel(2); int buff_async_prefetch_count = 0; + int buff_prefetch_count = 0; int readahead_carry_over_count = 0; int num_sst_files = NumTableFilesAtLevel(2); size_t current_readahead_size = 0; + bool read_async_called = false; // Test - Iterate over the keys sequentially. { + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::PrefetchAsyncInternal:Start", [&](void*) { buff_async_prefetch_count++; }); + SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", + [&](void* /*arg*/) { read_async_called = true; }); + // The callback checks, since reads are sequential, readahead_size doesn't // start from 8KB when iterator moves to next file and its called // num_sst_files-1 times (excluding for first file). @@ -1132,6 +1696,7 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { ASSERT_OK(iter->status()); num_keys++; } + ASSERT_OK(iter->status()); ASSERT_EQ(num_keys, total_keys); // For index and data blocks. @@ -1140,15 +1705,18 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { } else { ASSERT_EQ(readahead_carry_over_count, 0); } - ASSERT_GT(buff_async_prefetch_count, 0); // Check stats to make sure async prefetch is done. { HistogramData async_read_bytes; options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); - if (ro.async_io) { + // Not all platforms support iouring. In that case, ReadAsync in posix + // won't submit async requests. + if (read_async_called) { + ASSERT_GT(buff_async_prefetch_count, 0); ASSERT_GT(async_read_bytes.count, 0); } else { + ASSERT_GT(buff_prefetch_count, 0); ASSERT_EQ(async_read_bytes.count, 0); } } @@ -1158,16 +1726,180 @@ TEST_P(PrefetchTest, DBIterLevelReadAheadWithAsyncIO) { } Close(); } -#endif //! 
ROCKSDB_LITE - -class PrefetchTest1 : public DBTestBase, - public ::testing::WithParamInterface { - public: - PrefetchTest1() : DBTestBase("prefetch_test1", true) {} - void SetGenericOptions(Env* env, bool use_direct_io, Options& options) { - options = CurrentOptions(); - options.write_buffer_size = 1024; +TEST_P(PrefetchTest, AvoidBlockCacheLookupTwice) { + const int kNumKeys = 1000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + bool use_direct_io = std::get<0>(GetParam()); + bool async_io = std::get<1>(GetParam()); + + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); // 8MB + table_options.block_cache = cache; + table_options.no_block_cache = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + // Write to DB. + { + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + } + + ReadOptions ro; + ro.async_io = async_io; + // Iterate over the keys. + { + // Each block contains around 4 keys. + auto iter = std::unique_ptr(db_->NewIterator(ro)); + ASSERT_OK(options.statistics->Reset()); + + iter->Seek(BuildKey(99)); // Prefetch data because of seek parallelization. + ASSERT_TRUE(iter->Valid()); + + ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS), + 1); + } + + Close(); +} + +TEST_P(PrefetchTest, DBIterAsyncIONoIOUring) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + + const int kNumKeys = 1000; + // Set options + bool use_direct_io = std::get<0>(GetParam()); + bool is_adaptive_readahead = std::get<1>(GetParam()); + + Options options; + SetGenericOptions(Env::Default(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + enable_io_uring = false; + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + enable_io_uring = true; + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + int total_keys = 0; + for (int j = 0; j < 5; j++) { + for (int i = j * kNumKeys; i < (j + 1) * kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + total_keys++; + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + ASSERT_OK(Flush()); + } + MoveFilesToLevel(2); + + // Test - Iterate over the keys sequentially. 
+ { + ReadOptions ro; + if (is_adaptive_readahead) { + ro.adaptive_readahead = true; + } + ro.async_io = true; + + ASSERT_OK(options.statistics->Reset()); + + auto iter = std::unique_ptr(db_->NewIterator(ro)); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + num_keys++; + } + ASSERT_OK(iter->status()); + ASSERT_EQ(num_keys, total_keys); + + // Check stats to make sure async prefetch is done. + { + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + ASSERT_EQ(async_read_bytes.count, 0); + ASSERT_EQ(options.statistics->getTickerCount(READ_ASYNC_MICROS), 0); + } + } + + { + ReadOptions ro; + if (is_adaptive_readahead) { + ro.adaptive_readahead = true; + } + ro.async_io = true; + ro.tailing = true; + + ASSERT_OK(options.statistics->Reset()); + + auto iter = std::unique_ptr(db_->NewIterator(ro)); + int num_keys = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(iter->status()); + num_keys++; + } + ASSERT_OK(iter->status()); + ASSERT_EQ(num_keys, total_keys); + + // Check stats to make sure async prefetch is done. + { + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + ASSERT_EQ(async_read_bytes.count, 0); + ASSERT_EQ(options.statistics->getTickerCount(READ_ASYNC_MICROS), 0); + } + } + Close(); + + enable_io_uring = true; +} + +class PrefetchTest1 : public DBTestBase, + public ::testing::WithParamInterface { + public: + PrefetchTest1() : DBTestBase("prefetch_test1", true) {} + + virtual void SetGenericOptions(Env* env, bool use_direct_io, + Options& options) { + options = CurrentOptions(); + options.write_buffer_size = 1024; options.create_if_missing = true; options.compression = kNoCompression; options.env = env; @@ -1189,7 +1921,106 @@ class PrefetchTest1 : public DBTestBase, INSTANTIATE_TEST_CASE_P(PrefetchTest1, PrefetchTest1, ::testing::Bool()); -#ifndef ROCKSDB_LITE +TEST_P(PrefetchTest1, SeekWithExtraPrefetchAsyncIO) { + const int kNumKeys = 2000; + // Set options + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options; + SetGenericOptions(env.get(), GetParam(), options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + Close(); + + for (size_t i = 0; i < 3; i++) { + table_options.num_file_reads_for_auto_readahead = i; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + s = TryReopen(options); + ASSERT_OK(s); + + int buff_prefetch_count = 0; + int extra_prefetch_buff_cnt = 0; + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::PrefetchAsync:ExtraPrefetching", + 
[&](void*) { extra_prefetch_buff_cnt++; }); + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::PrefetchAsyncInternal:Start", + [&](void*) { buff_prefetch_count++; }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ro; + ro.async_io = true; + { + auto iter = std::unique_ptr(db_->NewIterator(ro)); + // First Seek + iter->Seek(BuildKey( + 0)); // Prefetch data on seek because of seek parallelization. + ASSERT_TRUE(iter->Valid()); + + // Do extra prefetching in Seek only if + // num_file_reads_for_auto_readahead = 0. + ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0)); + // buff_prefetch_count is 2 because of index block when + // num_file_reads_for_auto_readahead = 0. + // If num_file_reads_for_auto_readahead > 0, index block isn't + // prefetched. + ASSERT_EQ(buff_prefetch_count, i == 0 ? 2 : 1); + + extra_prefetch_buff_cnt = 0; + buff_prefetch_count = 0; + // Reset all values of FilePrefetchBuffer on new seek. + iter->Seek( + BuildKey(22)); // Prefetch data because of seek parallelization. + ASSERT_TRUE(iter->Valid()); + // Do extra prefetching in Seek only if + // num_file_reads_for_auto_readahead = 0. + ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0)); + ASSERT_EQ(buff_prefetch_count, 1); + + extra_prefetch_buff_cnt = 0; + buff_prefetch_count = 0; + // Reset all values of FilePrefetchBuffer on new seek. + iter->Seek( + BuildKey(33)); // Prefetch data because of seek parallelization. + ASSERT_TRUE(iter->Valid()); + // Do extra prefetching in Seek only if + // num_file_reads_for_auto_readahead = 0. + ASSERT_EQ(extra_prefetch_buff_cnt, (i == 0 ? 1 : 0)); + ASSERT_EQ(buff_prefetch_count, 1); + } + Close(); + } +} + // This test verifies the functionality of ReadOptions.adaptive_readahead when // reads are not sequential. TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) { @@ -1272,10 +2103,9 @@ TEST_P(PrefetchTest1, NonSequentialReadsWithAdaptiveReadahead) { } Close(); } -#endif //! ROCKSDB_LITE -// This test verifies the functionality of adaptive_readaheadsize with cache and -// if block is found in cache, decrease the readahead_size if +// This test verifies the functionality of adaptive_readaheadsize with cache +// and if block is found in cache, decrease the readahead_size if // - its enabled internally by RocksDB (implicit_auto_readahead_) and, // - readahead_size is greater than 0 and, // - the block would have called prefetch API if not found in cache for @@ -1397,8 +2227,8 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) { ASSERT_TRUE(iter->Valid()); // Prefetch data (not in buffer) but found in cache. So decrease - // readahead_size. Since it will 0 after decrementing so readahead_size will - // be set to initial value. + // readahead_size. Since it will 0 after decrementing so readahead_size + // will be set to initial value. iter->Seek(BuildKey(1019)); ASSERT_TRUE(iter->Valid()); expected_current_readahead_size = std::max( @@ -1421,99 +2251,417 @@ TEST_P(PrefetchTest1, DecreaseReadAheadIfInCache) { // This test verifies the basic functionality of seek parallelization for // async_io. 
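// For reference, the async seek path can be observed through stats
// (illustrative sketch; both counters are the ones asserted on in the test
// below):
//
//   ReadOptions ro;
//   ro.adaptive_readahead = true;
//   ro.async_io = true;  // seek may prefetch upcoming blocks asynchronously
//   auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ro));
//   iter->Seek(BuildKey(0));
//   HistogramData async_read_bytes;
//   options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes);
//   // On platforms with io_uring support, async_read_bytes.count > 0 and
//   // get_perf_context()->number_async_seek > 0.
//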
TEST_P(PrefetchTest1, SeekParallelizationTest) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_BYPASS("Test requires non-mem or non-encrypted environment"); + return; + } const int kNumKeys = 2000; // Set options + std::shared_ptr fs = std::make_shared( + FileSystem::Default(), /*support_prefetch=*/false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options; + SetGenericOptions(env.get(), GetParam(), options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + WriteBatch batch; + Random rnd(309); + for (int i = 0; i < kNumKeys; i++) { + ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = BuildKey(0); + std::string end_key = BuildKey(kNumKeys - 1); + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + int buff_prefetch_count = 0; + int buff_prefetch_async_count = 0; + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::PrefetchAsyncInternal:Start", + [&](void*) { buff_prefetch_async_count++; }); + + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + bool read_async_called = false; + SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", + [&](void* /*arg*/) { read_async_called = true; }); + + SyncPoint::GetInstance()->EnableProcessing(); + ReadOptions ro; + ro.adaptive_readahead = true; + ro.async_io = true; + + { + ASSERT_OK(options.statistics->Reset()); + // Each block contains around 4 keys. + auto iter = std::unique_ptr(db_->NewIterator(ro)); + iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization. + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + + // New data block. Since num_file_reads in FilePrefetch after this read is + // 2, it won't go for prefetching. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + + // Prefetch data. + iter->Next(); + ASSERT_TRUE(iter->Valid()); + + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + // not all platforms support io_uring. In that case it'll fallback to + // normal prefetching without async_io. + if (read_async_called) { + ASSERT_EQ(buff_prefetch_async_count, 2); + ASSERT_GT(async_read_bytes.count, 0); + ASSERT_GT(get_perf_context()->number_async_seek, 0); + } else { + ASSERT_EQ(buff_prefetch_count, 1); + } + } + Close(); +} + +// This test checks if readahead_size is trimmed when upper_bound is reached. +// It tests with different combinations of async_io disabled/enabled, +// readahead_size (implicit and explicit), and num_file_reads_for_auto_readahead +// from 0 to 2. 
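// For reference, a minimal sketch of the trimming setup (illustrative; the
// key values mirror the ones used below, READAHEAD_TRIMMED is the ticker the
// test asserts on):
//
//   Slice ub("my_key_uuu");
//   ReadOptions ropts;
//   ropts.auto_readahead_size = true;  // trim readahead at the upper bound
//   ropts.iterate_upper_bound = &ub;
//   auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ropts));
//   for (iter->Seek("my_key_aaa"); iter->Valid(); iter->Next()) {
//   }
//   uint64_t trimmed =
//       options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED);
//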
+TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBound) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + + // First param is if the mockFS support_prefetch or not std::shared_ptr fs = - std::make_shared(env_->GetFileSystem(), false); + std::make_shared(FileSystem::Default(), false); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + Options options; + SetGenericOptions(env.get(), /*use_direct_io=*/false, options); + options.statistics = CreateDBStatistics(); + BlockBasedTableOptions table_options; + SetBlockBasedTableOptions(table_options); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + Status s = TryReopen(options); + ASSERT_OK(s); + + Random rnd(309); + WriteBatch batch; + + for (int i = 0; i < 26; i++) { + std::string key = "my_key_"; + + for (int j = 0; j < 10; j++) { + key += char('a' + i); + ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); + } + } + ASSERT_OK(db_->Write(WriteOptions(), &batch)); + + std::string start_key = "my_key_a"; + + std::string end_key = "my_key_"; + for (int j = 0; j < 10; j++) { + end_key += char('a' + 25); + } + + Slice least(start_key.data(), start_key.size()); + Slice greatest(end_key.data(), end_key.size()); + + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); + + int buff_prefetch_count = 0; + + // Try with different num_file_reads_for_auto_readahead from 0 to 3. + for (size_t i = 0; i < 3; i++) { + table_options.num_file_reads_for_auto_readahead = i; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + s = TryReopen(options); + ASSERT_OK(s); + + int buff_count_with_tuning = 0, buff_count_without_tuning = 0; + int keys_with_tuning = 0, keys_without_tuning = 0; + int reseek_keys_with_tuning = 0, reseek_keys_without_tuning = 0; + buff_prefetch_count = 0; + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::PrefetchAsyncInternal:Start", + [&](void*) { buff_prefetch_count++; }); + + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ropts; + if (std::get<0>(GetParam())) { + ropts.readahead_size = 32768; + } + if (std::get<1>(GetParam())) { + ropts.async_io = true; + } + + // With tuning readahead_size. + { + ASSERT_OK(options.statistics->Reset()); + Slice ub = Slice("my_key_uuu"); + Slice* ub_ptr = &ub; + ropts.iterate_upper_bound = ub_ptr; + ropts.auto_readahead_size = true; + + auto iter = std::unique_ptr(db_->NewIterator(ropts)); + + // Seek. + { + Slice seek_key = Slice("my_key_aaa"); + iter->Seek(seek_key); + + while (iter->Valid()) { + keys_with_tuning++; + iter->Next(); + } + + uint64_t readahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + ASSERT_GT(readahead_trimmed, 0); + buff_count_with_tuning = buff_prefetch_count; + } + + // Reseek with new upper_bound_iterator. 
+ { + ub = Slice("my_key_y"); + Slice reseek_key = Slice("my_key_v"); + iter->Seek(reseek_key); + + while (iter->Valid()) { + iter->Next(); + reseek_keys_with_tuning++; + } + ASSERT_OK(iter->status()); + + uint64_t readahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + ASSERT_GT(readahead_trimmed, 0); + ASSERT_GT(reseek_keys_with_tuning, 0); + } + } + + // Without tuning readahead_size + { + Slice ub = Slice("my_key_uuu"); + Slice* ub_ptr = &ub; + ropts.iterate_upper_bound = ub_ptr; + buff_prefetch_count = 0; + ASSERT_OK(options.statistics->Reset()); + ropts.auto_readahead_size = false; + + auto iter = std::unique_ptr(db_->NewIterator(ropts)); + + // Seek. + { + Slice seek_key = Slice("my_key_aaa"); + iter->Seek(seek_key); + + while (iter->Valid()) { + keys_without_tuning++; + iter->Next(); + } + buff_count_without_tuning = buff_prefetch_count; + uint64_t readahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + ASSERT_EQ(readahead_trimmed, 0); + } + + // Reseek with new upper_bound_iterator. + { + ub = Slice("my_key_y"); + Slice reseek_key = Slice("my_key_v"); + iter->Seek(reseek_key); + while (iter->Valid()) { + iter->Next(); + reseek_keys_without_tuning++; + } + ASSERT_OK(iter->status()); + + uint64_t readahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + ASSERT_EQ(readahead_trimmed, 0); + ASSERT_GT(reseek_keys_without_tuning, 0); + } + } + + { + // Verify results with and without tuning. + if (std::get<1>(GetParam())) { + // In case of async_io. + ASSERT_GE(buff_count_with_tuning, buff_count_without_tuning); + } else { + ASSERT_EQ(buff_count_without_tuning, buff_count_with_tuning); + } + // Prefetching should happen. + ASSERT_GT(buff_count_without_tuning, 0); + ASSERT_GT(buff_count_with_tuning, 0); + // No of keys should be equal. + ASSERT_EQ(keys_without_tuning, keys_with_tuning); + // No of keys after reseek with new upper bound should be equal. + ASSERT_EQ(reseek_keys_without_tuning, reseek_keys_with_tuning); + } + Close(); + } +} + +// This test checks if readahead_size is trimmed when upper_bound is reached +// during Seek in async_io and it goes for polling without any extra +// prefetching. 
+TEST_P(PrefetchTest, IterReadAheadSizeWithUpperBoundSeekOnly) { + if (mem_env_ || encrypted_env_) { + ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment"); + return; + } + + // First param is if the mockFS support_prefetch or not + std::shared_ptr fs = + std::make_shared(FileSystem::Default(), false); + + bool use_direct_io = false; + if (std::get<0>(GetParam())) { + use_direct_io = true; + } + + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options; - SetGenericOptions(env.get(), GetParam(), options); + SetGenericOptions(env.get(), use_direct_io, options); options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; SetBlockBasedTableOptions(table_options); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Status s = TryReopen(options); - if (GetParam() && (s.IsNotSupported() || s.IsInvalidArgument())) { + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { // If direct IO is not supported, skip the test return; } else { ASSERT_OK(s); } - WriteBatch batch; Random rnd(309); - for (int i = 0; i < kNumKeys; i++) { - ASSERT_OK(batch.Put(BuildKey(i), rnd.RandomString(1000))); + WriteBatch batch; + + for (int i = 0; i < 26; i++) { + std::string key = "my_key_"; + + for (int j = 0; j < 10; j++) { + key += char('a' + i); + ASSERT_OK(batch.Put(key, rnd.RandomString(1000))); + } } ASSERT_OK(db_->Write(WriteOptions(), &batch)); - std::string start_key = BuildKey(0); - std::string end_key = BuildKey(kNumKeys - 1); + std::string start_key = "my_key_a"; + + std::string end_key = "my_key_"; + for (int j = 0; j < 10; j++) { + end_key += char('a' + 25); + } + Slice least(start_key.data(), start_key.size()); Slice greatest(end_key.data(), end_key.size()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &least, &greatest)); - int buff_prefetch_count = 0; + s = TryReopen(options); + ASSERT_OK(s); + + int buff_count_with_tuning = 0; SyncPoint::GetInstance()->SetCallBack( "FilePrefetchBuffer::PrefetchAsyncInternal:Start", - [&](void*) { buff_prefetch_count++; }); + [&](void*) { buff_count_with_tuning++; }); + + bool read_async_called = false; + SyncPoint::GetInstance()->SetCallBack( + "UpdateResults::io_uring_result", + [&](void* /*arg*/) { read_async_called = true; }); SyncPoint::GetInstance()->EnableProcessing(); - ReadOptions ro; - ro.adaptive_readahead = true; - ro.async_io = true; + SyncPoint::GetInstance()->EnableProcessing(); + + ReadOptions ropts; + if (std::get<1>(GetParam())) { + ropts.readahead_size = 32768; + } + ropts.async_io = true; + + Slice ub = Slice("my_key_aaa"); + ropts.iterate_upper_bound = &ub; + Slice seek_key = Slice("my_key_aaa"); + + // With tuning readahead_size. { ASSERT_OK(options.statistics->Reset()); - // Each block contains around 4 keys. - auto iter = std::unique_ptr(db_->NewIterator(ro)); - iter->Seek(BuildKey(0)); // Prefetch data because of seek parallelization. - ASSERT_TRUE(iter->Valid()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); + ropts.auto_readahead_size = true; - // New data block. Since num_file_reads in FilePrefetch after this read is - // 2, it won't go for prefetching. - iter->Next(); - ASSERT_TRUE(iter->Valid()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); - iter->Next(); - ASSERT_TRUE(iter->Valid()); + auto iter = std::unique_ptr(db_->NewIterator(ropts)); - // Prefetch data. 
- iter->Next(); - ASSERT_TRUE(iter->Valid()); + iter->Seek(seek_key); - ASSERT_EQ(buff_prefetch_count, 2); + ASSERT_OK(iter->status()); - // Check stats to make sure async prefetch is done. - { - HistogramData async_read_bytes; - options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); - ASSERT_GT(async_read_bytes.count, 0); - ASSERT_GT(get_perf_context()->number_async_seek, 0); + // Verify results. + uint64_t readhahead_trimmed = + options.statistics->getAndResetTickerCount(READAHEAD_TRIMMED); + // Readahead got trimmed. + if (read_async_called) { + ASSERT_GT(readhahead_trimmed, 0); + // Seek called PrefetchAsync to poll the data. + ASSERT_EQ(1, buff_count_with_tuning); + } else { + // async_io disabled. + ASSERT_GE(readhahead_trimmed, 0); + ASSERT_EQ(0, buff_count_with_tuning); } - - buff_prefetch_count = 0; } Close(); } -extern "C" bool RocksDbIOUringEnable() { return true; } - namespace { -#ifndef ROCKSDB_LITE #ifdef GFLAGS const int kMaxArgCount = 100; const size_t kArgBufferSize = 100000; @@ -1538,7 +2686,6 @@ void RunIOTracerParserTool(std::string trace_file) { ASSERT_EQ(0, ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv)); } #endif // GFLAGS -#endif // ROCKSDB_LITE } // namespace // Tests the default implementation of ReadAsync API with PosixFileSystem during @@ -1616,28 +2763,26 @@ TEST_P(PrefetchTest, ReadAsyncWithPosixFS) { ASSERT_OK(iter->status()); num_keys++; } + ASSERT_OK(iter->status()); - ASSERT_EQ(num_keys, total_keys); - ASSERT_GT(buff_prefetch_count, 0); - - // Check stats to make sure async prefetch is done. - { + if (read_async_called) { + ASSERT_EQ(num_keys, total_keys); + ASSERT_GT(buff_prefetch_count, 0); + // Check stats to make sure async prefetch is done. HistogramData async_read_bytes; options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); HistogramData prefetched_bytes_discarded; options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED, &prefetched_bytes_discarded); - + ASSERT_GT(async_read_bytes.count, 0); + ASSERT_GT(prefetched_bytes_discarded.count, 0); + ASSERT_EQ(get_perf_context()->number_async_seek, 0); + } else { // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. - if (read_async_called) { - ASSERT_GT(async_read_bytes.count, 0); - } else { - ASSERT_EQ(async_read_bytes.count, 0); - } - ASSERT_GT(prefetched_bytes_discarded.count, 0); + ASSERT_EQ(num_keys, total_keys); + ASSERT_EQ(buff_prefetch_count, 0); } - ASSERT_EQ(get_perf_context()->number_async_seek, 0); } SyncPoint::GetInstance()->DisableProcessing(); @@ -1690,6 +2835,7 @@ TEST_P(PrefetchTest, MultipleSeekWithPosixFS) { } MoveFilesToLevel(2); } + (void)total_keys; int num_keys_first_batch = 0; int num_keys_second_batch = 0; @@ -1748,22 +2894,20 @@ TEST_P(PrefetchTest, MultipleSeekWithPosixFS) { num_keys++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(num_keys, num_keys_first_batch); // Check stats to make sure async prefetch is done. - { - HistogramData async_read_bytes; - options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); - + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + if (read_async_called) { + ASSERT_GT(async_read_bytes.count, 0); + ASSERT_GT(get_perf_context()->number_async_seek, 0); + } else { // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. 
- if (read_async_called) { - ASSERT_GT(async_read_bytes.count, 0); - ASSERT_GT(get_perf_context()->number_async_seek, 0); - } else { - ASSERT_EQ(async_read_bytes.count, 0); - ASSERT_EQ(get_perf_context()->number_async_seek, 0); - } + ASSERT_EQ(async_read_bytes.count, 0); + ASSERT_EQ(get_perf_context()->number_async_seek, 0); } } @@ -1779,29 +2923,27 @@ TEST_P(PrefetchTest, MultipleSeekWithPosixFS) { num_keys++; iter->Next(); } + ASSERT_OK(iter->status()); ASSERT_EQ(num_keys, num_keys_second_batch); + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + HistogramData prefetched_bytes_discarded; + options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED, + &prefetched_bytes_discarded); + ASSERT_GT(prefetched_bytes_discarded.count, 0); - ASSERT_GT(buff_prefetch_count, 0); - - // Check stats to make sure async prefetch is done. - { - HistogramData async_read_bytes; - options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); - HistogramData prefetched_bytes_discarded; - options.statistics->histogramData(PREFETCHED_BYTES_DISCARDED, - &prefetched_bytes_discarded); + if (read_async_called) { + ASSERT_GT(buff_prefetch_count, 0); + // Check stats to make sure async prefetch is done. + ASSERT_GT(async_read_bytes.count, 0); + ASSERT_GT(get_perf_context()->number_async_seek, 0); + } else { // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. - if (read_async_called) { - ASSERT_GT(async_read_bytes.count, 0); - ASSERT_GT(get_perf_context()->number_async_seek, 0); - } else { - ASSERT_EQ(async_read_bytes.count, 0); - ASSERT_EQ(get_perf_context()->number_async_seek, 0); - } - ASSERT_GT(prefetched_bytes_discarded.count, 0); + ASSERT_EQ(async_read_bytes.count, 0); + ASSERT_EQ(get_perf_context()->number_async_seek, 0); } } } @@ -1901,35 +3043,28 @@ TEST_P(PrefetchTest, SeekParallelizationTestWithPosix) { // Prefetch data. iter->Next(); - ASSERT_TRUE(iter->Valid()); - // Check stats to make sure async prefetch is done. - { - HistogramData async_read_bytes; - options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); - // Not all platforms support iouring. In that case, ReadAsync in posix - // won't submit async requests. - if (read_async_called) { - ASSERT_GT(async_read_bytes.count, 0); - ASSERT_GT(get_perf_context()->number_async_seek, 0); - if (std::get<1>(GetParam())) { - ASSERT_EQ(buff_prefetch_count, 1); - } else { - ASSERT_EQ(buff_prefetch_count, 2); - } - } else { - ASSERT_EQ(async_read_bytes.count, 0); - ASSERT_EQ(get_perf_context()->number_async_seek, 0); + ASSERT_TRUE(iter->Valid()); + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + if (read_async_called) { + ASSERT_GT(async_read_bytes.count, 0); + ASSERT_GT(get_perf_context()->number_async_seek, 0); + if (std::get<1>(GetParam())) { ASSERT_EQ(buff_prefetch_count, 1); + } else { + ASSERT_EQ(buff_prefetch_count, 2); } + } else { + // Not all platforms support iouring. In that case, ReadAsync in posix + // won't submit async requests. + ASSERT_EQ(async_read_bytes.count, 0); + ASSERT_EQ(get_perf_context()->number_async_seek, 0); } - - buff_prefetch_count = 0; } Close(); } -#ifndef ROCKSDB_LITE #ifdef GFLAGS // This test verifies io_tracing with PosixFileSystem during prefetching. 
TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) { @@ -2013,25 +3148,23 @@ TEST_P(PrefetchTest, TraceReadAsyncWithCallbackWrapper) { ASSERT_OK(iter->status()); num_keys++; } + ASSERT_OK(iter->status()); // End the tracing. ASSERT_OK(db_->EndIOTrace()); ASSERT_OK(env_->FileExists(trace_file_path)); ASSERT_EQ(num_keys, total_keys); - ASSERT_GT(buff_prefetch_count, 0); - - // Check stats to make sure async prefetch is done. - { - HistogramData async_read_bytes; - options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + HistogramData async_read_bytes; + options.statistics->histogramData(ASYNC_READ_BYTES, &async_read_bytes); + if (read_async_called) { + ASSERT_GT(buff_prefetch_count, 0); + // Check stats to make sure async prefetch is done. + ASSERT_GT(async_read_bytes.count, 0); + } else { // Not all platforms support iouring. In that case, ReadAsync in posix // won't submit async requests. - if (read_async_called) { - ASSERT_GT(async_read_bytes.count, 0); - } else { - ASSERT_EQ(async_read_bytes.count, 0); - } + ASSERT_EQ(async_read_bytes.count, 0); } // Check the file to see if ReadAsync is logged. @@ -2106,19 +3239,109 @@ TEST_F(FilePrefetchBufferTest, SeekWithBlockCacheHit) { std::unique_ptr r; Read(fname, opts, &r); - FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, fs()); + FilePrefetchBuffer fpb(16384, 16384, true, false, false, 0, 0, 0, fs()); Slice result; // Simulate a seek of 4096 bytes at offset 0. Due to the readahead settings, // it will do two reads of 4096+8192 and 8192 Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 0, 4096, &result); - // Platforms that don't have IO uring may not support async IO - ASSERT_TRUE(s.IsTryAgain() || s.IsNotSupported()); + + // Platforms that don't have IO uring may not support async IO. + if (s.IsNotSupported()) { + return; + } + + ASSERT_TRUE(s.IsTryAgain()); // Simulate a block cache hit fpb.UpdateReadPattern(0, 4096, false); // Now read some data that straddles the two prefetch buffers - offset 8192 to // 16384 - ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), 8192, 8192, - &result, &s, Env::IOPriority::IO_LOW)); + IOOptions io_opts; + io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; + ASSERT_TRUE( + fpb.TryReadFromCacheAsync(io_opts, r.get(), 8192, 8192, &result, &s)); +} + +// Test to ensure when PrefetchAsync is called during seek, it doesn't do any +// alignment or prefetch extra if readahead is not enabled during seek. +TEST_F(FilePrefetchBufferTest, SeekWithoutAlignment) { + std::string fname = "seek-wwithout-alignment"; + Random rand(0); + std::string content = rand.RandomString(32768); + Write(fname, content); + + FileOptions opts; + std::unique_ptr r; + Read(fname, opts, &r); + + size_t alignment = r->file()->GetRequiredBufferAlignment(); + size_t n = alignment / 2; + + int read_async_called = 0; + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::ReadAsync", + [&](void* /*arg*/) { read_async_called++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Without readahead enabled, there will be no alignment and offset of buffer + // will be n. + { + FilePrefetchBuffer fpb( + /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, + /*track_min_offset=*/false, /*implicit_auto_readahead=*/true, + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, + /*upper_bound_offset=*/0, fs()); + + Slice result; + // Simulate a seek of half of alignment bytes at offset n. 
Due to the + // readahead settings, it won't prefetch extra or do any alignment and + // offset of buffer will be n. + Status s = fpb.PrefetchAsync(IOOptions(), r.get(), n, n, &result); + + // Platforms that don't have IO uring may not support async IO. + if (s.IsNotSupported()) { + return; + } + + ASSERT_TRUE(s.IsTryAgain()); + + IOOptions io_opts; + io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; + ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), n, n, &result, &s)); + + if (read_async_called) { + ASSERT_EQ(fpb.GetPrefetchOffset(), n); + } + } + + // With readahead enabled, it will do the alignment and prefetch and offset of + // buffer will be 0. + { + read_async_called = false; + FilePrefetchBuffer fpb( + /*readahead_size=*/16384, /*max_readahead_size=*/16384, /*enable=*/true, + /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/2, + /*upper_bound_offset=*/0, fs()); + + Slice result; + // Simulate a seek of half of alignment bytes at offset n. + Status s = fpb.PrefetchAsync(IOOptions(), r.get(), n, n, &result); + + // Platforms that don't have IO uring may not support async IO. + if (s.IsNotSupported()) { + return; + } + + ASSERT_TRUE(s.IsTryAgain()); + + IOOptions io_opts; + io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; + ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), n, n, &result, &s)); + + if (read_async_called) { + ASSERT_EQ(fpb.GetPrefetchOffset(), 0); + } + } } TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { @@ -2134,7 +3357,8 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { FilePrefetchBuffer fpb( /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, fs()); + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, + /*upper_bound_offset=*/0, fs()); int read_async_called = 0; SyncPoint::GetInstance()->SetCallBack( @@ -2146,12 +3370,17 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { // Simulate a seek of 4000 bytes at offset 3000. Due to the readahead // settings, it will do two reads of 4000+4096 and 4096 Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 3000, 4000, &async_result); + // Platforms that don't have IO uring may not support async IO - ASSERT_TRUE(s.IsTryAgain() || s.IsNotSupported()); + if (s.IsNotSupported()) { + return; + } - ASSERT_TRUE(fpb.TryReadFromCacheAsync(IOOptions(), r.get(), /*offset=*/3000, - /*length=*/4000, &async_result, &s, - Env::IOPriority::IO_LOW)); + ASSERT_TRUE(s.IsTryAgain()); + IOOptions io_opts; + io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; + ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), /*offset=*/3000, + /*length=*/4000, &async_result, &s)); // No sync call should be made. HistogramData sst_read_micros; stats()->histogramData(SST_READ_MICROS, &sst_read_micros); @@ -2162,12 +3391,107 @@ TEST_F(FilePrefetchBufferTest, NoSyncWithAsyncIO) { // Length should be 4000. ASSERT_EQ(async_result.size(), 4000); // Data correctness. - Slice result(content.c_str() + 3000, 4000); + Slice result(&content[3000], 4000); + ASSERT_EQ(result.size(), 4000); + ASSERT_EQ(result, async_result); +} + +// This test checks if during seek in async_io, if first buffer already +// prefetched the data till upper_bound offset, second buffer shouldn't go for +// prefetching. 
+TEST_F(FilePrefetchBufferTest, IterateUpperBoundTest1) { + std::string fname = "iterate-upperbound-test1"; + Random rand(0); + std::string content = rand.RandomString(32768); + Write(fname, content); + + FileOptions opts; + std::unique_ptr r; + Read(fname, opts, &r); + + FilePrefetchBuffer fpb( + /*readahead_size=*/8192, /*max_readahead_size=*/16384, /*enable=*/true, + /*track_min_offset=*/false, /*implicit_auto_readahead=*/false, + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, + /*upper_bound_offset=*/8000, fs()); + + int read_async_called = 0; + SyncPoint::GetInstance()->SetCallBack( + "FilePrefetchBuffer::ReadAsync", + [&](void* /*arg*/) { read_async_called++; }); + SyncPoint::GetInstance()->EnableProcessing(); + + Slice async_result; + // Simulate a seek of 4000 bytes at offset 3000. Due to the readahead + // settings, it will do 1 read of 4000+1000 (till 8000 - upper bound). + Status s = fpb.PrefetchAsync(IOOptions(), r.get(), 3000, 4000, &async_result); + + // Platforms that don't have IO uring may not support async IO + if (s.IsNotSupported()) { + return; + } + + ASSERT_TRUE(s.IsTryAgain()); + IOOptions io_opts; + io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW; + ASSERT_TRUE(fpb.TryReadFromCacheAsync(io_opts, r.get(), /*offset=*/3000, + /*length=*/4000, &async_result, &s)); + // No sync call should be made. + HistogramData sst_read_micros; + stats()->histogramData(SST_READ_MICROS, &sst_read_micros); + ASSERT_EQ(sst_read_micros.count, 0); + + // Number of async calls should be 1. + // No Prefetching should happen in second buffer as first buffer has already + // prefetched till offset. + ASSERT_EQ(read_async_called, 1); + // Length should be 4000. + ASSERT_EQ(async_result.size(), 4000); + // Data correctness. + Slice result(&content[3000], 4000); ASSERT_EQ(result.size(), 4000); ASSERT_EQ(result, async_result); } -#endif // ROCKSDB_LITE +TEST_F(FilePrefetchBufferTest, SyncReadaheadStats) { + std::string fname = "seek-with-block-cache-hit"; + Random rand(0); + std::string content = rand.RandomString(32768); + Write(fname, content); + + FileOptions opts; + std::unique_ptr r; + Read(fname, opts, &r); + + std::shared_ptr stats = CreateDBStatistics(); + FilePrefetchBuffer fpb(8192, 8192, true, false, false, 0, 0, 0, fs(), nullptr, + stats.get()); + Slice result; + // Simulate a seek of 4096 bytes at offset 0. 
Due to the readahead settings, + // it will do two reads of 4096+8192 and 8192 + Status s; + ASSERT_TRUE(fpb.TryReadFromCache(IOOptions(), r.get(), 0, 4096, &result, &s)); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(stats->getTickerCount(PREFETCH_HITS), 0); + ASSERT_EQ(stats->getTickerCount(PREFETCH_BYTES_USEFUL), 0); + + // Simulate a block cache hit + fpb.UpdateReadPattern(4096, 4096, false); + // Now read some data that straddles the two prefetch buffers - offset 8192 to + // 16384 + ASSERT_TRUE( + fpb.TryReadFromCache(IOOptions(), r.get(), 8192, 8192, &result, &s)); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(stats->getTickerCount(PREFETCH_HITS), 0); + ASSERT_EQ(stats->getTickerCount(PREFETCH_BYTES_USEFUL), 4096); + + ASSERT_TRUE( + fpb.TryReadFromCache(IOOptions(), r.get(), 12288, 4096, &result, &s)); + ASSERT_EQ(s, Status::OK()); + ASSERT_EQ(stats->getTickerCount(PREFETCH_HITS), 1); + ASSERT_EQ(stats->getTickerCount(PREFETCH_BYTES_USEFUL), 8192); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 030cd8d07a21..2025ce65b57e 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -19,10 +19,40 @@ #include "table/format.h" #include "test_util/sync_point.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { +inline Histograms GetFileReadHistograms(Statistics* stats, + Env::IOActivity io_activity) { + switch (io_activity) { + case Env::IOActivity::kFlush: + return Histograms::FILE_READ_FLUSH_MICROS; + case Env::IOActivity::kCompaction: + return Histograms::FILE_READ_COMPACTION_MICROS; + case Env::IOActivity::kDBOpen: + return Histograms::FILE_READ_DB_OPEN_MICROS; + default: + break; + } + if (stats && stats->get_stats_level() > StatsLevel::kExceptDetailedTimers) { + switch (io_activity) { + case Env::IOActivity::kGet: + return Histograms::FILE_READ_GET_MICROS; + case Env::IOActivity::kMultiGet: + return Histograms::FILE_READ_MULTIGET_MICROS; + case Env::IOActivity::kDBIterator: + return Histograms::FILE_READ_DB_ITERATOR_MICROS; + case Env::IOActivity::kVerifyDBChecksum: + return Histograms::FILE_READ_VERIFY_DB_CHECKSUM_MICROS; + case Env::IOActivity::kVerifyFileChecksums: + return Histograms::FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS; + default: + break; + } + } + return Histograms::HISTOGRAM_ENUM_MAX; +} inline void RecordIOStats(Statistics* stats, Temperature file_temperature, bool is_last_level, size_t size) { IOSTATS_ADD(bytes_read, size); @@ -74,11 +104,11 @@ IOStatus RandomAccessFileReader::Create( return io_s; } -IOStatus RandomAccessFileReader::Read( - const IOOptions& opts, uint64_t offset, size_t n, Slice* result, - char* scratch, AlignedBuf* aligned_buf, - Env::IOPriority rate_limiter_priority) const { +IOStatus RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset, + size_t n, Slice* result, char* scratch, + AlignedBuf* aligned_buf) const { (void)aligned_buf; + const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority; TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read", nullptr); @@ -92,15 +122,23 @@ IOStatus RandomAccessFileReader::Read( IOStatus io_s; uint64_t elapsed = 0; + size_t alignment = file_->GetRequiredBufferAlignment(); + bool is_aligned = false; + if (scratch != nullptr) { + // Check if offset, length and buffer are aligned. 
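+      // For example, with alignment == 4096, an offset of 8192, a length of
+      // 4096 and a 4096-byte-aligned scratch pointer all satisfy
+      // (x & (alignment - 1)) == 0, so a direct I/O read can be issued
+      // straight into the caller's buffer instead of going through the
+      // internally allocated aligned buffer.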
+ is_aligned = (offset & (alignment - 1)) == 0 && + (n & (alignment - 1)) == 0 && + (uintptr_t(scratch) & (alignment - 1)) == 0; + } + { StopWatch sw(clock_, stats_, hist_type_, + GetFileReadHistograms(stats_, opts.io_activity), (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); IOSTATS_TIMER_GUARD(read_nanos); - if (use_direct_io()) { -#ifndef ROCKSDB_LITE - size_t alignment = file_->GetRequiredBufferAlignment(); + if (use_direct_io() && is_aligned == false) { size_t aligned_offset = TruncateToPageBoundary(alignment, static_cast(offset)); size_t offset_advance = static_cast(offset) - aligned_offset; @@ -165,7 +203,6 @@ IOStatus RandomAccessFileReader::Read( } } *result = Slice(scratch, res_len); -#endif // !ROCKSDB_LITE } else { size_t pos = 0; const char* res_scratch = nullptr; @@ -176,9 +213,9 @@ IOStatus RandomAccessFileReader::Read( if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { sw.DelayStart(); } - allowed = rate_limiter_->RequestToken(n - pos, 0 /* alignment */, - rate_limiter_priority, stats_, - RateLimiter::OpType::kRead); + allowed = rate_limiter_->RequestToken( + n - pos, (use_direct_io() ? alignment : 0), rate_limiter_priority, + stats_, RateLimiter::OpType::kRead); if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) { sw.DelayStop(); } @@ -187,12 +224,10 @@ IOStatus RandomAccessFileReader::Read( } Slice tmp_result; -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif { IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); @@ -204,7 +239,6 @@ IOStatus RandomAccessFileReader::Read( io_s = file_->Read(offset + pos, allowed, opts, &tmp_result, scratch + pos, nullptr); } -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(offset + pos, tmp_result.size(), start_ts, @@ -215,7 +249,6 @@ IOStatus RandomAccessFileReader::Read( tmp_result.size(), offset + pos); } } -#endif if (res_scratch == nullptr) { // we can't simply use `scratch` because reads of mmap'd files return // data in a different buffer. 
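With this change, the rate limiter priority (and the I/O activity that selects the new per-activity read histograms) travels inside IOOptions instead of being passed as a trailing argument to Read()/MultiRead(). A minimal caller-side sketch of the new convention, assuming an already-constructed reader; the helper name ReadForCompaction is hypothetical and not part of this patch:

  #include "file/random_access_file_reader.h"

  namespace ROCKSDB_NAMESPACE {

  // Read `n` bytes at `offset` as low-priority compaction I/O.
  IOStatus ReadForCompaction(RandomAccessFileReader* reader, uint64_t offset,
                             size_t n, char* scratch, AlignedBuf* aligned_buf,
                             Slice* result) {
    IOOptions io_opts;
    // Previously a separate parameter of Read(); Env::IO_TOTAL bypasses the
    // rate limiter.
    io_opts.rate_limiter_priority = Env::IOPriority::IO_LOW;
    // Routes the timing of this read to FILE_READ_COMPACTION_MICROS.
    io_opts.io_activity = Env::IOActivity::kCompaction;
    return reader->Read(io_opts, offset, n, result, scratch, aligned_buf);
  }

  }  // namespace ROCKSDB_NAMESPACE

MultiRead() and FilePrefetchBuffer::TryReadFromCacheAsync() (see the test changes earlier in this patch) follow the same pattern: populate IOOptions once and pass it through.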
@@ -238,6 +271,14 @@ IOStatus RandomAccessFileReader::Read( file_read_hist_->Add(elapsed); } +#ifndef NDEBUG + auto pair = std::make_pair(&file_name_, &io_s); + if (offset == 0) { + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read::BeforeReturn", + &pair); + } + TEST_SYNC_POINT_CALLBACK("RandomAccessFileReader::Read::AnyOffset", &pair); +#endif return io_s; } @@ -267,9 +308,10 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) { return true; } -IOStatus RandomAccessFileReader::MultiRead( - const IOOptions& opts, FSReadRequest* read_reqs, size_t num_reqs, - AlignedBuf* aligned_buf, Env::IOPriority rate_limiter_priority) const { +IOStatus RandomAccessFileReader::MultiRead(const IOOptions& opts, + FSReadRequest* read_reqs, + size_t num_reqs, + AlignedBuf* aligned_buf) const { (void)aligned_buf; // suppress warning of unused variable in LITE mode assert(num_reqs > 0); @@ -278,6 +320,7 @@ IOStatus RandomAccessFileReader::MultiRead( assert(read_reqs[i].offset <= read_reqs[i + 1].offset); } #endif // !NDEBUG + const Env::IOPriority rate_limiter_priority = opts.rate_limiter_priority; // To be paranoid modify scratch a little bit, so in case underlying // FileSystem doesn't fill the buffer but return success and `scratch` returns @@ -294,6 +337,7 @@ IOStatus RandomAccessFileReader::MultiRead( uint64_t elapsed = 0; { StopWatch sw(clock_, stats_, hist_type_, + GetFileReadHistograms(stats_, opts.io_activity), (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -301,7 +345,6 @@ IOStatus RandomAccessFileReader::MultiRead( FSReadRequest* fs_reqs = read_reqs; size_t num_fs_reqs = num_reqs; -#ifndef ROCKSDB_LITE std::vector aligned_reqs; if (use_direct_io()) { // num_reqs is the max possible size, @@ -310,14 +353,14 @@ IOStatus RandomAccessFileReader::MultiRead( // Align and merge the read requests. size_t alignment = file_->GetRequiredBufferAlignment(); for (size_t i = 0; i < num_reqs; i++) { - const auto& r = Align(read_reqs[i], alignment); + FSReadRequest r = Align(read_reqs[i], alignment); if (i == 0) { // head - aligned_reqs.push_back(r); + aligned_reqs.push_back(std::move(r)); } else if (!TryMerge(&aligned_reqs.back(), r)) { // head + n - aligned_reqs.push_back(r); + aligned_reqs.push_back(std::move(r)); } else { // unused @@ -345,14 +388,11 @@ IOStatus RandomAccessFileReader::MultiRead( fs_reqs = aligned_reqs.data(); num_fs_reqs = aligned_reqs.size(); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif // ROCKSDB_LITE { IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, clock_); @@ -384,7 +424,6 @@ IOStatus RandomAccessFileReader::MultiRead( RecordInHistogram(stats_, MULTIGET_IO_BATCH_SIZE, num_fs_reqs); } -#ifndef ROCKSDB_LITE if (use_direct_io()) { // Populate results in the unaligned read requests. 
size_t aligned_i = 0; @@ -410,10 +449,8 @@ IOStatus RandomAccessFileReader::MultiRead( } } } -#endif // ROCKSDB_LITE for (size_t i = 0; i < num_reqs; ++i) { -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(read_reqs[i].offset, read_reqs[i].result.size(), @@ -425,7 +462,6 @@ IOStatus RandomAccessFileReader::MultiRead( read_reqs[i].offset); } -#endif // ROCKSDB_LITE RecordIOStats(stats_, file_temperature_, is_last_level_, read_reqs[i].result.size()); } @@ -439,7 +475,7 @@ IOStatus RandomAccessFileReader::MultiRead( } IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, - IOOptions& opts) { + IOOptions& opts) const { if (clock_ != nullptr) { return PrepareIOFromReadOptions(ro, clock_, opts); } else { @@ -459,11 +495,9 @@ IOStatus RandomAccessFileReader::ReadAsync( ReadAsyncInfo* read_async_info = new ReadAsyncInfo(cb, cb_arg, clock_->NowMicros()); -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { read_async_info->fs_start_ts_ = FileOperationInfo::StartNow(); } -#endif size_t alignment = file_->GetRequiredBufferAlignment(); bool is_aligned = (req.offset & (alignment - 1)) == 0 && @@ -492,13 +526,17 @@ IOStatus RandomAccessFileReader::ReadAsync( assert(read_async_info->buf_.CurrentSize() == 0); - StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed, - true /*overwrite*/, true /*delay_enabled*/); + StopWatch sw(clock_, stats_, hist_type_, + GetFileReadHistograms(stats_, opts.io_activity), + (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); s = file_->ReadAsync(aligned_req, opts, read_async_callback, read_async_info, io_handle, del_fn, nullptr /*dbg*/); } else { - StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed, - true /*overwrite*/, true /*delay_enabled*/); + StopWatch sw(clock_, stats_, hist_type_, + GetFileReadHistograms(stats_, opts.io_activity), + (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, + true /*delay_enabled*/); s = file_->ReadAsync(req, opts, read_async_callback, read_async_info, io_handle, del_fn, nullptr /*dbg*/); } @@ -584,7 +622,6 @@ void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req, } else if (!req.status.IsAborted()) { RecordTick(stats_, ASYNC_READ_ERROR_COUNT, 1); } -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileReadFinish(req.offset, req.result.size(), @@ -595,7 +632,6 @@ void RandomAccessFileReader::ReadAsyncCallback(const FSReadRequest& req, NotifyOnIOError(req.status, FileOperationType::kRead, file_name(), req.result.size(), req.offset); } -#endif RecordIOStats(stats_, file_temperature_, is_last_level_, req.result.size()); delete read_async_info; } diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index ea7cfd234f9a..93cbe0e1ac70 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -46,7 +46,6 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src); // - Updating IO stats. 
class RandomAccessFileReader { private: -#ifndef ROCKSDB_LITE void NotifyOnFileReadFinish( uint64_t offset, size_t length, const FileOperationInfo::StartTimePoint& start_ts, @@ -77,7 +76,6 @@ class RandomAccessFileReader { io_status.PermitUncheckedError(); } -#endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } @@ -107,9 +105,7 @@ class RandomAccessFileReader { std::function cb_; void* cb_arg_; uint64_t start_time_; -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint fs_start_ts_; -#endif // Below fields stores the parameters passed by caller in case of direct_io. char* user_scratch_; AlignedBuf* user_aligned_buf_; @@ -126,7 +122,8 @@ class RandomAccessFileReader { std::unique_ptr&& raf, const std::string& _file_name, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, - Statistics* stats = nullptr, uint32_t hist_type = 0, + Statistics* stats = nullptr, + uint32_t hist_type = Histograms::HISTOGRAM_ENUM_MAX, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}, @@ -142,16 +139,12 @@ class RandomAccessFileReader { listeners_(), file_temperature_(file_temperature), is_last_level_(is_last_level) { -#ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { if (e->ShouldBeNotifiedOnFileIO()) { listeners_.emplace_back(e); } }); -#else // !ROCKSDB_LITE - (void)listeners; -#endif } static IOStatus Create(const std::shared_ptr& fs, @@ -171,31 +164,18 @@ class RandomAccessFileReader { // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns // the internally allocated buffer on return, and the result refers to a // region in aligned_buf. - // - // `rate_limiter_priority` is used to charge the internal rate limiter when - // enabled. The special value `Env::IO_TOTAL` makes this operation bypass the - // rate limiter. IOStatus Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result, - char* scratch, AlignedBuf* aligned_buf, - Env::IOPriority rate_limiter_priority) const; + char* scratch, AlignedBuf* aligned_buf) const; // REQUIRES: // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. // In non-direct IO mode, aligned_buf should be null; // In direct IO mode, aligned_buf stores the aligned buffer allocated inside // MultiRead, the result Slices in reqs refer to aligned_buf. - // - // `rate_limiter_priority` will be used to charge the internal rate limiter. - // It is not yet supported so the client must provide the special value - // `Env::IO_TOTAL` to bypass the rate limiter. 
IOStatus MultiRead(const IOOptions& opts, FSReadRequest* reqs, - size_t num_reqs, AlignedBuf* aligned_buf, - Env::IOPriority rate_limiter_priority) const; + size_t num_reqs, AlignedBuf* aligned_buf) const; - IOStatus Prefetch(uint64_t offset, size_t n, - const Env::IOPriority rate_limiter_priority) const { - IOOptions opts; - opts.rate_limiter_priority = rate_limiter_priority; + IOStatus Prefetch(const IOOptions& opts, uint64_t offset, size_t n) const { return file_->Prefetch(offset, n, opts, nullptr); } @@ -205,7 +185,7 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } - IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, std::function cb, diff --git a/file/random_access_file_reader_test.cc b/file/random_access_file_reader_test.cc index ac0e9e57a1fb..6b7b7eb68ce0 100644 --- a/file/random_access_file_reader_test.cc +++ b/file/random_access_file_reader_test.cc @@ -64,7 +64,6 @@ class RandomAccessFileReaderTest : public testing::Test { }; // Skip the following tests in lite mode since direct I/O is unsupported. -#ifndef ROCKSDB_LITE TEST_F(RandomAccessFileReaderTest, ReadDirectIO) { std::string fname = "read-direct-io"; @@ -84,8 +83,9 @@ TEST_F(RandomAccessFileReaderTest, ReadDirectIO) { Slice result; AlignedBuf buf; for (Env::IOPriority rate_limiter_priority : {Env::IO_LOW, Env::IO_TOTAL}) { - ASSERT_OK(r->Read(IOOptions(), offset, len, &result, nullptr, &buf, - rate_limiter_priority)); + IOOptions io_opts; + io_opts.rate_limiter_priority = rate_limiter_priority; + ASSERT_OK(r->Read(io_opts, offset, len, &result, nullptr, &buf)); ASSERT_EQ(result.ToString(), content.substr(offset, len)); } } @@ -96,7 +96,18 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* reqs) { // Copy reqs, since it's allocated on stack inside MultiRead, which will // be deallocated after MultiRead returns. 
- aligned_reqs = *reinterpret_cast*>(reqs); + size_t i = 0; + aligned_reqs.resize( + (*reinterpret_cast*>(reqs)).size()); + for (auto& req : + (*reinterpret_cast*>(reqs))) { + aligned_reqs[i].offset = req.offset; + aligned_reqs[i].len = req.len; + aligned_reqs[i].result = req.result; + aligned_reqs[i].status = req.status; + aligned_reqs[i].scratch = req.scratch; + i++; + } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -136,8 +147,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r0)); reqs.push_back(std::move(r1)); AlignedBuf aligned_buf; - ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); AssertResult(content, reqs); @@ -181,8 +192,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r1)); reqs.push_back(std::move(r2)); AlignedBuf aligned_buf; - ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); AssertResult(content, reqs); @@ -226,8 +237,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r1)); reqs.push_back(std::move(r2)); AlignedBuf aligned_buf; - ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); AssertResult(content, reqs); @@ -263,8 +274,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { reqs.push_back(std::move(r0)); reqs.push_back(std::move(r1)); AlignedBuf aligned_buf; - ASSERT_OK(r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */)); + ASSERT_OK( + r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf)); AssertResult(content, reqs); @@ -284,8 +295,6 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } -#endif // ROCKSDB_LITE - TEST(FSReadRequest, Align) { FSReadRequest r; r.offset = 2000; diff --git a/file/readahead_raf.cc b/file/readahead_raf.cc index 6d346432e226..dd09822e3e23 100644 --- a/file/readahead_raf.cc +++ b/file/readahead_raf.cc @@ -15,7 +15,7 @@ #include "file/read_write_util.h" #include "rocksdb/file_system.h" #include "util/aligned_buffer.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { namespace { diff --git a/file/sequence_file_reader.cc b/file/sequence_file_reader.cc index d51d5be46e2a..a753c1d098c5 100644 --- a/file/sequence_file_reader.cc +++ b/file/sequence_file_reader.cc @@ -19,7 +19,7 @@ #include "test_util/sync_point.h" #include "util/aligned_buffer.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { IOStatus SequentialFileReader::Create( @@ -39,7 +39,6 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, Env::IOPriority rate_limiter_priority) { IOStatus io_s; if (use_direct_io()) { -#ifndef ROCKSDB_LITE // // |-offset_advance-|---bytes returned--| // |----------------------buf size-------------------------| @@ -95,7 +94,6 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, std::min(buf.CurrentSize() - offset_advance, n)); } *result = 
Slice(scratch, r); -#endif // !ROCKSDB_LITE } else { // To be paranoid, modify scratch a little bit, so in case underlying // FileSystem doesn't fill the buffer but return success and `scratch` @@ -116,22 +114,18 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, } else { allowed = n; } -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif Slice tmp; io_s = file_->Read(allowed, IOOptions(), &tmp, scratch + read, nullptr /* dbg */); -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); size_t offset = offset_.fetch_add(tmp.size()); NotifyOnFileReadFinish(offset, tmp.size(), start_ts, finish_ts, io_s); } -#endif read += tmp.size(); if (!io_s.ok() || tmp.size() < allowed) { break; @@ -144,12 +138,10 @@ IOStatus SequentialFileReader::Read(size_t n, Slice* result, char* scratch, } IOStatus SequentialFileReader::Skip(uint64_t n) { -#ifndef ROCKSDB_LITE if (use_direct_io()) { offset_ += static_cast(n); return IOStatus::OK(); } -#endif // !ROCKSDB_LITE return file_->Skip(n); } diff --git a/file/sequence_file_reader.h b/file/sequence_file_reader.h index baea10eb767f..dc0e61bd2a13 100644 --- a/file/sequence_file_reader.h +++ b/file/sequence_file_reader.h @@ -23,7 +23,6 @@ namespace ROCKSDB_NAMESPACE { // cache disabled) reads appropriately, and also updates the IO stats. class SequentialFileReader { private: -#ifndef ROCKSDB_LITE void NotifyOnFileReadFinish( uint64_t offset, size_t length, const FileOperationInfo::StartTimePoint& start_ts, @@ -49,7 +48,6 @@ class SequentialFileReader { } }); } -#endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } @@ -70,11 +68,7 @@ class SequentialFileReader { file_(std::move(_file), io_tracer, _file_name), listeners_(), rate_limiter_(rate_limiter) { -#ifndef ROCKSDB_LITE AddFileIOListeners(listeners); -#else - (void)listeners; -#endif } explicit SequentialFileReader( @@ -89,11 +83,7 @@ class SequentialFileReader { io_tracer, _file_name), listeners_(), rate_limiter_(rate_limiter) { -#ifndef ROCKSDB_LITE AddFileIOListeners(listeners); -#else - (void)listeners; -#endif } static IOStatus Create(const std::shared_ptr& fs, const std::string& fname, const FileOptions& file_opts, @@ -109,6 +99,9 @@ class SequentialFileReader { // when less than n bytes are actually read (e.g. at end of file). To avoid // overcharging the rate limiter, the caller can use file size to cap n to // read until end of file. 
+ // + // TODO(hx235): accept parameter `IOOptions` containing + // `rate_limiter_priority` like RandomAccessFileReader::Read() IOStatus Read(size_t n, Slice* result, char* scratch, Env::IOPriority rate_limiter_priority); diff --git a/file/sst_file_manager_impl.cc b/file/sst_file_manager_impl.cc index 7053e6a0738f..459ea36cdb56 100644 --- a/file/sst_file_manager_impl.cc +++ b/file/sst_file_manager_impl.cc @@ -18,7 +18,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE SstFileManagerImpl::SstFileManagerImpl( const std::shared_ptr& clock, const std::shared_ptr& fs, @@ -504,22 +503,5 @@ SstFileManager* NewSstFileManager(Env* env, std::shared_ptr fs, return res; } -#else - -SstFileManager* NewSstFileManager(Env* /*env*/, - std::shared_ptr /*info_log*/, - std::string /*trash_dir*/, - int64_t /*rate_bytes_per_sec*/, - bool /*delete_existing_trash*/, - Status* status, double /*max_trash_db_ratio*/, - uint64_t /*bytes_max_delete_chunk*/) { - if (status) { - *status = - Status::NotSupported("SstFileManager is not supported in ROCKSDB_LITE"); - } - return nullptr; -} - -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/file/sst_file_manager_impl.h b/file/sst_file_manager_impl.h index b21b47b86846..24f056dcc4e8 100644 --- a/file/sst_file_manager_impl.h +++ b/file/sst_file_manager_impl.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -192,4 +191,3 @@ class SstFileManagerImpl : public SstFileManager { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index 3afc51c567b0..908878a5faee 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -21,7 +21,7 @@ #include "test_util/sync_point.h" #include "util/crc32c.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { IOStatus WritableFileWriter::Create(const std::shared_ptr& fs, @@ -249,15 +249,12 @@ IOStatus WritableFileWriter::Close() { // we need to let the file know where data ends. 
if (use_direct_io()) { { -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif uint64_t filesz = filesize_.load(std::memory_order_acquire); interim = writable_file_->Truncate(filesz, io_options, nullptr); -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileTruncateFinish(start_ts, finish_ts, s); @@ -266,18 +263,14 @@ IOStatus WritableFileWriter::Close() { filesz); } } -#endif } if (interim.ok()) { { -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif interim = writable_file_->Fsync(io_options, nullptr); -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileSyncFinish(start_ts, finish_ts, s, @@ -286,7 +279,6 @@ IOStatus WritableFileWriter::Close() { NotifyOnIOError(interim, FileOperationType::kFsync, file_name()); } } -#endif } } if (!interim.ok() && s.ok()) { @@ -296,14 +288,11 @@ IOStatus WritableFileWriter::Close() { TEST_KILL_RANDOM("WritableFileWriter::Close:0"); { -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif interim = writable_file_->Close(io_options, nullptr); -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = FileOperationInfo::FinishNow(); NotifyOnFileCloseFinish(start_ts, finish_ts, s); @@ -311,7 +300,6 @@ IOStatus WritableFileWriter::Close() { NotifyOnIOError(interim, FileOperationType::kClose, file_name()); } } -#endif } if (!interim.ok() && s.ok()) { s = interim; @@ -344,7 +332,6 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { if (buf_.CurrentSize() > 0) { if (use_direct_io()) { -#ifndef ROCKSDB_LITE if (pending_sync_) { if (perform_data_verification_ && buffered_data_with_checksum_) { s = WriteDirectWithChecksum(op_rate_limiter_priority); @@ -352,7 +339,6 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { s = WriteDirect(op_rate_limiter_priority); } } -#endif // !ROCKSDB_LITE } else { if (perform_data_verification_ && buffered_data_with_checksum_) { s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(), @@ -369,18 +355,15 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { } { -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif IOOptions io_options; io_options.rate_limiter_priority = WritableFileWriter::DecideRateLimiterPriority( writable_file_->GetIOPriority(), op_rate_limiter_priority); s = writable_file_->Flush(io_options, nullptr); -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileFlushFinish(start_ts, finish_ts, s); @@ -388,7 +371,6 @@ IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) { NotifyOnIOError(s, FileOperationType::kFlush, file_name()); } } -#endif } if (!s.ok()) { @@ -500,12 +482,10 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { IOSTATS_CPU_TIMER_GUARD(cpu_write_nanos, clock_); -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif IOOptions io_options; io_options.rate_limiter_priority = writable_file_->GetIOPriority(); @@ -514,7 
+494,6 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { } else { s = writable_file_->Sync(io_options, nullptr); } -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileSyncFinish( @@ -526,7 +505,6 @@ IOStatus WritableFileWriter::SyncInternal(bool use_fsync) { file_name()); } } -#endif SetPerfLevel(prev_perf_level); // The caller will be responsible to call set_seen_error() if s is not OK. @@ -540,19 +518,16 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { IOSTATS_TIMER_GUARD(range_sync_nanos); TEST_SYNC_POINT("WritableFileWriter::RangeSync:0"); -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); } -#endif IOOptions io_options; io_options.rate_limiter_priority = writable_file_->GetIOPriority(); IOStatus s = writable_file_->RangeSync(offset, nbytes, io_options, nullptr); if (!s.ok()) { set_seen_error(); } -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileRangeSyncFinish(offset, nbytes, start_ts, finish_ts, s); @@ -561,7 +536,6 @@ IOStatus WritableFileWriter::RangeSync(uint64_t offset, uint64_t nbytes) { offset); } } -#endif return s; } @@ -598,14 +572,12 @@ IOStatus WritableFileWriter::WriteBuffered( IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; } -#endif { auto prev_perf_level = GetPerfLevel(); @@ -633,7 +605,6 @@ IOStatus WritableFileWriter::WriteBuffered( } SetPerfLevel(prev_perf_level); } -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(old_size, allowed, start_ts, finish_ts, s); @@ -642,7 +613,6 @@ IOStatus WritableFileWriter::WriteBuffered( old_size); } } -#endif if (!s.ok()) { set_seen_error(); return s; @@ -702,14 +672,12 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( IOSTATS_TIMER_GUARD(write_nanos); TEST_SYNC_POINT("WritableFileWriter::Flush:BeforeAppend"); -#ifndef ROCKSDB_LITE FileOperationInfo::StartTimePoint start_ts; uint64_t old_size = writable_file_->GetFileSize(io_options, nullptr); if (ShouldNotifyListeners()) { start_ts = FileOperationInfo::StartNow(); old_size = next_write_offset_; } -#endif { auto prev_perf_level = GetPerfLevel(); @@ -720,7 +688,6 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( s = writable_file_->Append(Slice(src, left), io_options, v_info, nullptr); SetPerfLevel(prev_perf_level); } -#ifndef ROCKSDB_LITE if (ShouldNotifyListeners()) { auto finish_ts = std::chrono::steady_clock::now(); NotifyOnFileWriteFinish(old_size, left, start_ts, finish_ts, s); @@ -729,7 +696,6 @@ IOStatus WritableFileWriter::WriteBufferedWithChecksum( old_size); } } -#endif if (!s.ok()) { // If writable_file_->Append() failed, then the data may or may not // exist in the underlying memory buffer, OS page cache, remote file @@ -789,7 +755,6 @@ void WritableFileWriter::Crc32cHandoffChecksumCalculation(const char* data, // whole number of pages to be written again on the next flush because we can // only write on aligned // offsets. 
-#ifndef ROCKSDB_LITE IOStatus WritableFileWriter::WriteDirect( Env::IOPriority op_rate_limiter_priority) { if (seen_error()) { @@ -1006,7 +971,6 @@ IOStatus WritableFileWriter::WriteDirectWithChecksum( } return s; } -#endif // !ROCKSDB_LITE Env::IOPriority WritableFileWriter::DecideRateLimiterPriority( Env::IOPriority writable_file_io_priority, Env::IOPriority op_rate_limiter_priority) { diff --git a/file/writable_file_writer.h b/file/writable_file_writer.h index b3985eb209a3..aac0f59491ed 100644 --- a/file/writable_file_writer.h +++ b/file/writable_file_writer.h @@ -35,7 +35,6 @@ class SystemClock; // - Update IO stats. class WritableFileWriter { private: -#ifndef ROCKSDB_LITE void NotifyOnFileWriteFinish( uint64_t offset, size_t length, const FileOperationInfo::StartTimePoint& start_ts, @@ -128,7 +127,6 @@ class WritableFileWriter { } io_error_info.io_status.PermitUncheckedError(); } -#endif // ROCKSDB_LITE bool ShouldNotifyListeners() const { return !listeners_.empty(); } void UpdateFileChecksum(const Slice& data); @@ -144,12 +142,10 @@ class WritableFileWriter { // not counting padding data std::atomic filesize_; std::atomic flushed_size_; -#ifndef ROCKSDB_LITE // This is necessary when we use unbuffered access // and writes must happen on aligned offsets // so we need to go back and write that page again uint64_t next_write_offset_; -#endif // ROCKSDB_LITE bool pending_sync_; std::atomic seen_error_; #ifndef NDEBUG @@ -169,9 +165,7 @@ class WritableFileWriter { bool perform_data_verification_; uint32_t buffered_data_crc32c_checksum_; bool buffered_data_with_checksum_; -#ifndef ROCKSDB_LITE Temperature temperature_; -#endif // ROCKSDB_LITE public: WritableFileWriter( @@ -190,9 +184,7 @@ class WritableFileWriter { max_buffer_size_(options.writable_file_max_buffer_size), filesize_(0), flushed_size_(0), -#ifndef ROCKSDB_LITE next_write_offset_(0), -#endif // ROCKSDB_LITE pending_sync_(false), seen_error_(false), last_sync_size_(0), @@ -205,24 +197,18 @@ class WritableFileWriter { perform_data_verification_(perform_data_verification), buffered_data_crc32c_checksum_(0), buffered_data_with_checksum_(buffered_data_with_checksum) { -#ifndef ROCKSDB_LITE temperature_ = options.temperature; -#endif // ROCKSDB_LITE assert(!use_direct_io() || max_buffer_size_ > 0); TEST_SYNC_POINT_CALLBACK("WritableFileWriter::WritableFileWriter:0", reinterpret_cast(max_buffer_size_)); buf_.Alignment(writable_file_->GetRequiredBufferAlignment()); buf_.AllocateNewBuffer(std::min((size_t)65536, max_buffer_size_)); -#ifndef ROCKSDB_LITE std::for_each(listeners.begin(), listeners.end(), [this](const std::shared_ptr& e) { if (e->ShouldBeNotifiedOnFileIO()) { listeners_.emplace_back(e); } }); -#else // !ROCKSDB_LITE - (void)listeners; -#endif if (file_checksum_gen_factory != nullptr) { FileChecksumGenContext checksum_gen_context; checksum_gen_context.file_name = _file_name; @@ -321,10 +307,8 @@ class WritableFileWriter { // Used when os buffering is OFF and we are writing // DMA such as in Direct I/O mode -#ifndef ROCKSDB_LITE IOStatus WriteDirect(Env::IOPriority op_rate_limiter_priority); IOStatus WriteDirectWithChecksum(Env::IOPriority op_rate_limiter_priority); -#endif // !ROCKSDB_LITE // Normal write. 
IOStatus WriteBuffered(const char* data, size_t size, Env::IOPriority op_rate_limiter_priority); diff --git a/fuzz/sst_file_writer_fuzzer.cc b/fuzz/sst_file_writer_fuzzer.cc index e93b9a3f5f8b..676daf574fa4 100644 --- a/fuzz/sst_file_writer_fuzzer.cc +++ b/fuzz/sst_file_writer_fuzzer.cc @@ -92,7 +92,8 @@ TableReader* NewTableReader(const std::string& sst_file_path, if (s.ok()) { ImmutableOptions iopts(options, cf_ioptions); TableReaderOptions t_opt(iopts, /*prefix_extractor=*/nullptr, env_options, - cf_ioptions.internal_comparator); + cf_ioptions.internal_comparator, + 0 /* block_protection_bytes_per_key */); t_opt.largest_seqno = kMaxSequenceNumber; s = options.table_factory->NewTableReader(t_opt, std::move(file_reader), file_size, &table_reader, diff --git a/include/rocksdb/advanced_cache.h b/include/rocksdb/advanced_cache.h new file mode 100644 index 000000000000..e2aefdd0112e --- /dev/null +++ b/include/rocksdb/advanced_cache.h @@ -0,0 +1,665 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// APIs for customizing read caches in RocksDB. + +#pragma once + +#include +#include +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/compression_type.h" +#include "rocksdb/memory_allocator.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" + +namespace ROCKSDB_NAMESPACE { + +class Logger; +class SecondaryCacheResultHandle; +class Statistics; + +// A Cache maps keys to objects resident in memory, tracks reference counts +// on those key-object entries, and is able to remove unreferenced entries +// whenever it wants. All operations are fully thread safe except as noted. +// Inserted entries have a specified "charge" which is some quantity in +// unspecified units, typically bytes of memory used. A Cache will typically +// have a finite capacity in units of charge, and evict entries as needed +// to stay at or below that capacity. +// +// NOTE: This API is for expert use only and is intended more for customizing +// cache behavior than for calling into outside of RocksDB. It is subject to +// change as RocksDB evolves, especially the RocksDB block cache. Overriding +// CacheWrapper is the preferred way of customizing some operations on an +// existing implementation. +// +// INTERNAL: See typed_cache.h for convenient wrappers on top of this API. +// New virtual functions must also be added to CacheWrapper below. +class Cache { + public: // types hidden from API client + // Opaque handle to an entry stored in the cache. + struct Handle {}; + + public: // types hidden from Cache implementation + // Pointer to cached object of unspecified type. (This type alias is + // provided for clarity, not really for type checking.) + using ObjectPtr = void*; + + // Opaque object providing context (settings, etc.) to create objects + // for primary cache from saved (serialized) secondary cache entries. + struct CreateContext {}; + + public: // type defs + // Depending on implementation, cache entries with higher priority levels + // could be less likely to get evicted than entries with lower priority + // levels. The "high" priority level applies to certain SST metablocks (e.g. + // index and filter blocks) if the option + // cache_index_and_filter_blocks_with_high_priority is set. 
The "low" priority + // level is used for other kinds of SST blocks (most importantly, data + // blocks), as well as the above metablocks in case + // cache_index_and_filter_blocks_with_high_priority is + // not set. The "bottom" priority level is for BlobDB's blob values. + enum class Priority { HIGH, LOW, BOTTOM }; + + // A set of callbacks to allow objects in the primary block cache to be + // persisted in a secondary cache. The purpose of the secondary cache + // is to support other ways of caching the object, such as persistent or + // compressed data, that may require the object to be parsed and transformed + // in some way. Since the primary cache holds C++ objects and the secondary + // cache may only hold flat data that doesn't need relocation, these + // callbacks need to be provided by the user of the block + // cache to do the conversion. + // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers + // to callback functions for size, saving and deletion of the + // object. The callbacks are defined in C-style in order to make them + // stateless and not add to the cache metadata size. + // Saving multiple std::function objects will take up 32 bytes per + // function, even if its not bound to an object and does no capture. + // + // All the callbacks are C-style function pointers in order to simplify + // lifecycle management. Objects in the cache can outlive the parent DB, + // so anything required for these operations should be contained in the + // object itself. + // + // The SizeCallback takes a pointer to the object and returns the size + // of the persistable data. It can be used by the secondary cache to allocate + // memory if needed. + // + // RocksDB callbacks are NOT exception-safe. A callback completing with an + // exception can lead to undefined behavior in RocksDB, including data loss, + // unreported corruption, deadlocks, and more. + using SizeCallback = size_t (*)(ObjectPtr obj); + + // The SaveToCallback takes an object pointer and saves the persistable + // data into a buffer. The secondary cache may decide to not store it in a + // contiguous buffer, in which case this callback will be called multiple + // times with increasing offset + using SaveToCallback = Status (*)(ObjectPtr from_obj, size_t from_offset, + size_t length, char* out_buf); + + // A function pointer type for destruction of a cache object. This will + // typically call the destructor for the appropriate type of the object. + // The Cache is responsible for copying and reclaiming space for the key, + // but objects are managed in part using this callback. Generally a DeleterFn + // can be nullptr if the ObjectPtr does not need destruction (e.g. nullptr or + // pointer into static data). + using DeleterFn = void (*)(ObjectPtr obj, MemoryAllocator* allocator); + + // The CreateCallback is takes in a buffer from the secondary cache and + // constructs an object using it. The buffer could be compressed or + // uncompressed, as indicated by the type argument. If compressed, + // the callback is responsible for uncompressing it using information + // from the context, such as compression dictionary. + // The callback doesn't have ownership of the buffer and + // should copy the contents into its own buffer. The CreateContext* is + // provided by Lookup and may be used to follow DB- or CF-specific settings. + // In case of some error, non-OK is returned and the caller should ignore + // any result in out_obj. (The implementation must clean up after itself.) 
+ using CreateCallback = Status (*)(const Slice& data, CompressionType type, + CacheTier source, CreateContext* context, + MemoryAllocator* allocator, + ObjectPtr* out_obj, size_t* out_charge); + + // A struct with pointers to helper functions for spilling items from the + // cache into the secondary cache. May be extended in the future. An + // instance of this struct is expected to outlive the cache. + struct CacheItemHelper { + // Function for deleting an object on its removal from the Cache. + // nullptr is only for entries that require no destruction, such as + // "placeholder" cache entries with nullptr object. + DeleterFn del_cb; // (<- Most performance critical) + // Next three are used for persisting values as described above. + // If any is nullptr, then all three should be nullptr and persisting the + // entry to/from secondary cache is not supported. + SizeCallback size_cb; + SaveToCallback saveto_cb; + CreateCallback create_cb; + // Classification of the entry for monitoring purposes in block cache. + CacheEntryRole role; + // Another CacheItemHelper (or this one) without secondary cache support. + // This is provided so that items promoted from secondary cache into + // primary cache without removal from the secondary cache can be prevented + // from attempting re-insertion into secondary cache (for efficiency). + const CacheItemHelper* without_secondary_compat; + + CacheItemHelper() : CacheItemHelper(CacheEntryRole::kMisc) {} + + // For helpers without SecondaryCache support + explicit CacheItemHelper(CacheEntryRole _role, DeleterFn _del_cb = nullptr) + : CacheItemHelper(_role, _del_cb, nullptr, nullptr, nullptr, this) {} + + // For helpers with SecondaryCache support + explicit CacheItemHelper(CacheEntryRole _role, DeleterFn _del_cb, + SizeCallback _size_cb, SaveToCallback _saveto_cb, + CreateCallback _create_cb, + const CacheItemHelper* _without_secondary_compat) + : del_cb(_del_cb), + size_cb(_size_cb), + saveto_cb(_saveto_cb), + create_cb(_create_cb), + role(_role), + without_secondary_compat(_without_secondary_compat) { + // Either all three secondary cache callbacks are non-nullptr or + // all three are nullptr + assert((size_cb != nullptr) == (saveto_cb != nullptr)); + assert((size_cb != nullptr) == (create_cb != nullptr)); + // without_secondary_compat points to equivalent but without + // secondary support + assert(role == without_secondary_compat->role); + assert(del_cb == without_secondary_compat->del_cb); + assert(!without_secondary_compat->IsSecondaryCacheCompatible()); + } + inline bool IsSecondaryCacheCompatible() const { + return size_cb != nullptr; + } + }; + + public: // ctor/dtor/create + Cache(std::shared_ptr allocator = nullptr) + : memory_allocator_(std::move(allocator)) {} + // No copying allowed + Cache(const Cache&) = delete; + Cache& operator=(const Cache&) = delete; + + // Destroys all remaining entries by calling the associated "deleter" + virtual ~Cache() {} + + // Creates a new Cache based on the input value string and returns the result. + // Currently, this method can be used to create LRUCaches only + // @param config_options + // @param value The value might be: + // - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*102( + // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; + // For the LRUCache, the values are defined in LRUCacheOptions. 
+ // @param result The new Cache object + // @return OK if the cache was successfully created + // @return NotFound if an invalid name was specified in the value + // @return InvalidArgument if either the options were not valid + static Status CreateFromString(const ConfigOptions& config_options, + const std::string& value, + std::shared_ptr* result); + + public: // functions + // The type of the Cache + virtual const char* Name() const = 0; + + // The Insert and Lookup APIs below are intended to allow cached objects + // to be demoted/promoted between the primary block cache and a secondary + // cache. The secondary cache could be a non-volatile cache, and will + // likely store the object in a different representation. They rely on a + // per object CacheItemHelper to do the conversions. + // The secondary cache may persist across process and system restarts, + // and may even be moved between hosts. Therefore, the cache key must + // be repeatable across restarts/reboots, and globally unique if + // multiple DBs share the same cache and the set of DBs can change + // over time. + + // Insert a mapping from key->object into the cache and assign it + // the specified charge against the total cache capacity. If + // strict_capacity_limit is true and cache reaches its full capacity, + // return Status::MemoryLimit. `obj` must be non-nullptr if compatible + // with secondary cache (helper->size_cb != nullptr), because Value() == + // nullptr is reserved for indicating some secondary cache failure cases. + // On success, returns OK and takes ownership of `obj`, eventually deleting + // it with helper->del_cb. On non-OK return, the caller maintains ownership + // of `obj` so will often need to delete it in such cases. + // + // The helper argument is saved by the cache and will be used when the + // inserted object is evicted or considered for promotion to the secondary + // cache. Promotion to secondary cache is only enabled if helper->size_cb + // != nullptr. The helper must outlive the cache. Callers may use + // &kNoopCacheItemHelper as a trivial helper (no deleter for the object, + // no secondary cache). `helper` must not be nullptr (efficiency). + // + // If `handle` is not nullptr and return status is OK, `handle` is set + // to a Handle* for the entry. The caller must call this->Release(handle) + // when the returned entry is no longer needed. If `handle` is nullptr, it is + // as if Release is called immediately after Insert. + // + // Regardless of whether the item was inserted into the cache, + // it will attempt to insert it into the secondary cache if one is + // configured, and the helper supports it. + // The cache implementation must support a secondary cache, otherwise + // the item is only inserted into the primary cache. It may + // defer the insertion to the secondary cache as it sees fit. + // + // Along with the object pointer, the caller may pass a Slice pointing to + // the compressed serialized data of the object. If compressed is + // non-empty, then the caller must pass the type indicating the compression + // algorithm used. The cache may, optionally, also insert the compressed + // block into one or more cache tiers. + // + // When the inserted entry is no longer needed, it will be destroyed using + // helper->del_cb (if non-nullptr). 
+ virtual Status Insert( + const Slice& key, ObjectPtr obj, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW, const Slice& compressed = Slice(), + CompressionType type = CompressionType::kNoCompression) = 0; + + // Similar to Insert, but used for creating cache entries that cannot + // be found with Lookup, such as for memory charging purposes. The + // key is needed for cache sharding purposes. + // * If allow_uncharged==true or strict_capacity_limit=false, the operation + // always succeeds and returns a valid Handle. + // * If strict_capacity_limit=true and the requested charge cannot be freed + // up in the cache, then + // * If allow_uncharged==true, it's created anyway (GetCharge() == 0). + // * If allow_uncharged==false, returns nullptr to indicate failure. + virtual Handle* CreateStandalone(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + bool allow_uncharged) = 0; + + // Lookup the key, returning nullptr if not found. If found, returns + // a handle to the mapping that must eventually be passed to Release(). + // + // If a non-nullptr helper argument is provided with a non-nullptr + // create_cb, and a secondary cache is configured, then the secondary + // cache is also queried if lookup in the primary cache fails. If found + // in secondary cache, the provided create_db and create_context are + // used to promote the entry to an object in the primary cache. + // In that case, the helper may be saved and used later when the object + // is evicted, so as usual, the pointed-to helper must outlive the cache. + virtual Handle* Lookup(const Slice& key, + const CacheItemHelper* helper = nullptr, + CreateContext* create_context = nullptr, + Priority priority = Priority::LOW, + Statistics* stats = nullptr) = 0; + + // Convenience wrapper when secondary cache not supported + inline Handle* BasicLookup(const Slice& key, Statistics* stats) { + return Lookup(key, nullptr, nullptr, Priority::LOW, stats); + } + + // Increments the reference count for the handle if it refers to an entry in + // the cache. Returns true if refcount was incremented; otherwise, returns + // false. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Ref(Handle* handle) = 0; + + /** + * Release a mapping returned by a previous Lookup(). A released entry might + * still remain in cache in case it is later looked up by others. If + * erase_if_last_ref is set then it also erases it from the cache if there is + * no other reference to it. Erasing it should call the deleter function that + * was provided when the entry was inserted. + * + * Returns true if the entry was also erased. + */ + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0; + + // Return the object assiciated with a handle returned by a successful + // Lookup(). For historical reasons, this is also known at the "value" + // associated with the key. + // REQUIRES: handle must not have been released yet. + // REQUIRES: handle must have been returned by a method on *this. + virtual ObjectPtr Value(Handle* handle) = 0; + + // If the cache contains the entry for the key, erase it. Note that the + // underlying entry will be kept around until all existing handles + // to it have been released. + virtual void Erase(const Slice& key) = 0; + // Return a new numeric id. 
May be used by multiple clients who are + // sharding the same cache to partition the key space. Typically the + // client will allocate a new id at startup and prepend the id to + // its cache keys. + virtual uint64_t NewId() = 0; + + // sets the maximum configured capacity of the cache. When the new + // capacity is less than the old capacity and the existing usage is + // greater than new capacity, the implementation will do its best job to + // purge the released entries from the cache in order to lower the usage + virtual void SetCapacity(size_t capacity) = 0; + + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + + // Get the flag whether to return error on insertion when cache reaches its + // full capacity. + virtual bool HasStrictCapacityLimit() const = 0; + + // Returns the maximum configured capacity of the cache + virtual size_t GetCapacity() const = 0; + + // Returns the memory size for the entries residing in the cache. + virtual size_t GetUsage() const = 0; + + // Returns the number of entries currently tracked in the table. SIZE_MAX + // means "not supported." This is used for inspecting the load factor, along + // with GetTableAddressCount(). + virtual size_t GetOccupancyCount() const { return SIZE_MAX; } + + // Returns the number of ways the hash function is divided for addressing + // entries. Zero means "not supported." This is used for inspecting the load + // factor, along with GetOccupancyCount(). + virtual size_t GetTableAddressCount() const { return 0; } + + // Returns the memory size for a specific entry in the cache. + virtual size_t GetUsage(Handle* handle) const = 0; + + // Returns the memory size for the entries in use by the system + virtual size_t GetPinnedUsage() const = 0; + + // Returns the charge for the specific entry in the cache. + virtual size_t GetCharge(Handle* handle) const = 0; + + // Returns the helper for the specified entry. + virtual const CacheItemHelper* GetCacheItemHelper(Handle* handle) const = 0; + + virtual Status GetSecondaryCacheCapacity(size_t& /*size*/) const { + return Status::NotSupported(); + } + + virtual Status GetSecondaryCachePinnedUsage(size_t& /*size*/) const { + return Status::NotSupported(); + } + + // Call this on shutdown if you want to speed it up. Cache will disown + // any underlying data and will not free it on delete. This call will leak + // memory - call this only if you're shutting down the process. + // Any attempts of using cache after this call will fail terribly. + // Always delete the DB object before calling this method! + virtual void DisownData() { + // default implementation is noop + } + + struct ApplyToAllEntriesOptions { + // If the Cache uses locks, setting `average_entries_per_lock` to + // a higher value suggests iterating over more entries each time a lock + // is acquired, likely reducing the time for ApplyToAllEntries but + // increasing latency for concurrent users of the Cache. Setting + // `average_entries_per_lock` to a smaller value could be helpful if + // callback is relatively expensive, such as using large data structures. + size_t average_entries_per_lock = 256; + }; + + // Apply a callback to all entries in the cache. The Cache must ensure + // thread safety but does not guarantee that a consistent snapshot of all + // entries is iterated over if other threads are operating on the Cache + // also. 
+ virtual void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) = 0; + + // Remove all entries. + // Prerequisite: no entry is referenced. + virtual void EraseUnRefEntries() = 0; + + virtual std::string GetPrintableOptions() const { return ""; } + + // Check for any warnings or errors in the operation of the cache and + // report them to the logger. This is intended only to be called + // periodically so does not need to be very efficient. (Obscure calling + // conventions for Logger inherited from env.h) + virtual void ReportProblems( + const std::shared_ptr& /*info_log*/) const {} + + MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } + + // See ShardedCacheOptions::hash_seed + virtual uint32_t GetHashSeed() const { return 0; } + + // EXPERIMENTAL + // The following APIs are experimental and might change in the future. + + // Release a mapping returned by a previous Lookup(). The "useful" + // parameter specifies whether the data was actually used or not, + // which may be used by the cache implementation to decide whether + // to consider it as a hit for retention purposes. As noted elsewhere, + // "pending" handles require Wait()/WaitAll() before Release(). + virtual bool Release(Handle* handle, bool /*useful*/, + bool erase_if_last_ref) { + return Release(handle, erase_if_last_ref); + } + + // A temporary handle structure for managing async lookups, which callers + // of AsyncLookup() can allocate on the call stack for efficiency. + // An AsyncLookupHandle should not be used concurrently across threads. + struct AsyncLookupHandle { + // Inputs, populated by caller: + // NOTE: at least in case of stacked secondary caches, the underlying + // key buffer must last until handle is completely waited on. + Slice key; + const CacheItemHelper* helper = nullptr; + CreateContext* create_context = nullptr; + Priority priority = Priority::LOW; + Statistics* stats = nullptr; + + AsyncLookupHandle() {} + AsyncLookupHandle(const Slice& _key, const CacheItemHelper* _helper, + CreateContext* _create_context, + Priority _priority = Priority::LOW, + Statistics* _stats = nullptr) + : key(_key), + helper(_helper), + create_context(_create_context), + priority(_priority), + stats(_stats) {} + + // AsyncLookupHandle should only be destroyed when no longer pending + ~AsyncLookupHandle() { assert(!IsPending()); } + + // No copies or moves (StartAsyncLookup may save a pointer to this) + AsyncLookupHandle(const AsyncLookupHandle&) = delete; + AsyncLookupHandle operator=(const AsyncLookupHandle&) = delete; + AsyncLookupHandle(AsyncLookupHandle&&) = delete; + AsyncLookupHandle operator=(AsyncLookupHandle&&) = delete; + + // Determines if the handle returned by Lookup() can give a value without + // blocking, though Wait()/WaitAll() might be required to publish it to + // Value(). See secondary cache compatible Lookup() above for details. + // This call is not thread safe on "pending" handles. + // WART/TODO with stacked secondaries: might indicate ready when one + // result is ready (a miss) but the next lookup will block. + bool IsReady(); + + // Returns true if Wait/WaitAll is required before calling Result(). + bool IsPending(); + + // Returns a Lookup()-like result if this AsyncHandle is not pending. + // (Undefined behavior on a pending AsyncHandle.) Like Lookup(), the + // caller is responsible for eventually Release()ing a non-nullptr + // Handle* result. 
+ Handle* Result(); + + // Implementation details, for RocksDB internal use only + Handle* result_handle = nullptr; + SecondaryCacheResultHandle* pending_handle = nullptr; + SecondaryCache* pending_cache = nullptr; + bool found_dummy_entry = false; + bool kept_in_sec_cache = false; + }; + + // Starts a potentially asynchronous Lookup(), based on the populated + // "input" fields of the async_handle. The caller is responsible for + // keeping the AsyncLookupHandle and the key it references alive through + // WaitAll(), and the AsyncLookupHandle alive through + // AsyncLookupHandle::Result(). WaitAll() can only be skipped if + // AsyncLookupHandle::IsPending() is already false after StartAsyncLookup. + // Calling AsyncLookupHandle::Result() is essentially required so that + // Release() can be called on non-nullptr Handle result. Wait() is a + // concise version of WaitAll()+Result() on a single handle. After an + // AsyncLookupHandle has completed this cycle, its input fields can be + // updated and re-used for another StartAsyncLookup. + // + // Handle is thread-safe while AsyncLookupHandle is not thread-safe. + // + // Default implementation is appropriate for Caches without + // true asynchronous support: defers to synchronous Lookup(). + // (AsyncLookupHandles will only get into the "pending" state with + // SecondaryCache configured.) + virtual void StartAsyncLookup(AsyncLookupHandle& async_handle); + + // A convenient wrapper around WaitAll() and AsyncLookupHandle::Result() + // for a single async handle. See StartAsyncLookup(). + Handle* Wait(AsyncLookupHandle& async_handle); + + // Wait for an array of async handles to get results, so that none are left + // in the "pending" state. Not thread safe. See StartAsyncLookup(). + // Default implementation is appropriate for Caches without true + // asynchronous support: asserts that all handles are not pending (or not + // expected to be handled by this cache, in case of wrapped/stacked + // WaitAlls()). + virtual void WaitAll(AsyncLookupHandle* /*async_handles*/, size_t /*count*/); + + // For a function called on cache entries about to be evicted. The function + // returns `true` if it has taken ownership of the Value (object), or + // `false` if the cache should destroy it as usual. Regardless, Ref() and + // Release() cannot be called on this Handle that is poised for eviction. + using EvictionCallback = + std::function; + // Sets an eviction callback for this Cache. Not thread safe and only + // supports being set once, so should only be used during initialization + // or destruction, guaranteed before or after any thread-shared operations. + void SetEvictionCallback(EvictionCallback&& fn); + + protected: + std::shared_ptr memory_allocator_; + EvictionCallback eviction_callback_; +}; + +// A wrapper around Cache that can easily be extended with instrumentation, +// etc. +class CacheWrapper : public Cache { + public: + explicit CacheWrapper(std::shared_ptr target) + : target_(std::move(target)) {} + + // Only function that derived class must provide + // const char* Name() const override { ... 
} + + Status Insert( + const Slice& key, ObjectPtr value, const CacheItemHelper* helper, + size_t charge, Handle** handle = nullptr, + Priority priority = Priority::LOW, + const Slice& compressed_value = Slice(), + CompressionType type = CompressionType::kNoCompression) override { + return target_->Insert(key, value, helper, charge, handle, priority, + compressed_value, type); + } + + Handle* CreateStandalone(const Slice& key, ObjectPtr obj, + const CacheItemHelper* helper, size_t charge, + bool allow_uncharged) override { + return target_->CreateStandalone(key, obj, helper, charge, allow_uncharged); + } + + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, + Statistics* stats = nullptr) override { + return target_->Lookup(key, helper, create_context, priority, stats); + } + + bool Ref(Handle* handle) override { return target_->Ref(handle); } + + using Cache::Release; + bool Release(Handle* handle, bool erase_if_last_ref = false) override { + return target_->Release(handle, erase_if_last_ref); + } + + ObjectPtr Value(Handle* handle) override { return target_->Value(handle); } + + void Erase(const Slice& key) override { target_->Erase(key); } + uint64_t NewId() override { return target_->NewId(); } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } + + void SetStrictCapacityLimit(bool strict_capacity_limit) override { + target_->SetStrictCapacityLimit(strict_capacity_limit); + } + + bool HasStrictCapacityLimit() const override { + return target_->HasStrictCapacityLimit(); + } + + size_t GetOccupancyCount() const override { + return target_->GetOccupancyCount(); + } + + size_t GetTableAddressCount() const override { + return target_->GetTableAddressCount(); + } + + size_t GetCapacity() const override { return target_->GetCapacity(); } + + size_t GetUsage() const override { return target_->GetUsage(); } + + size_t GetUsage(Handle* handle) const override { + return target_->GetUsage(handle); + } + + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } + + size_t GetCharge(Handle* handle) const override { + return target_->GetCharge(handle); + } + + const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { + return target_->GetCacheItemHelper(handle); + } + + void ApplyToAllEntries( + const std::function& callback, + const ApplyToAllEntriesOptions& opts) override { + target_->ApplyToAllEntries(callback, opts); + } + + void EraseUnRefEntries() override { target_->EraseUnRefEntries(); } + + void StartAsyncLookup(AsyncLookupHandle& async_handle) override { + target_->StartAsyncLookup(async_handle); + } + + void WaitAll(AsyncLookupHandle* async_handles, size_t count) override { + target_->WaitAll(async_handles, count); + } + + uint32_t GetHashSeed() const override { return target_->GetHashSeed(); } + + void ReportProblems(const std::shared_ptr& info_log) const override { + target_->ReportProblems(info_log); + } + + const std::shared_ptr& GetTarget() { return target_; } + + protected: + std::shared_ptr target_; +}; + +// Useful for cache entries requiring no clean-up, such as for cache +// reservations +extern const Cache::CacheItemHelper kNoopCacheItemHelper; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 258cf82a1059..e5ffe8944d66 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -27,14 +27,11 @@ enum CompactionStyle : char 
{ // level based compaction style kCompactionStyleLevel = 0x0, // Universal compaction style - // Not supported in ROCKSDB_LITE. kCompactionStyleUniversal = 0x1, // FIFO compaction style - // Not supported in ROCKSDB_LITE kCompactionStyleFIFO = 0x2, // Disable background compaction. Compaction jobs are submitted // via CompactFiles(). - // Not supported in ROCKSDB_LITE kCompactionStyleNone = 0x3, }; @@ -62,43 +59,37 @@ enum CompactionPri : char { kRoundRobin = 0x4, }; -struct CompactionOptionsFIFO { - // once the total sum of table files reaches this, we will delete the oldest - // table file - // Default: 1GB - uint64_t max_table_files_size; - - // If true, try to do compaction to compact smaller files into larger ones. - // Minimum files to compact follows options.level0_file_num_compaction_trigger - // and compaction won't trigger if average compact bytes per del file is - // larger than options.write_buffer_size. This is to protect large files - // from being compacted again. - // Default: false; - bool allow_compaction = false; - - // When not 0, if the data in the file is older than this threshold, RocksDB - // will soon move the file to warm temperature. - uint64_t age_for_warm = 0; - - CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} - CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) - : max_table_files_size(_max_table_files_size), - allow_compaction(_allow_compaction) {} -}; - // Compression options for different compression algorithms like Zlib struct CompressionOptions { + // ==> BEGIN options that can be set by deprecated configuration syntax, <== + // ==> e.g. compression_opts=5:6:7:8:9:10:true:11:false <== + // ==> Please use compression_opts={level=6;strategy=7;} form instead. <== + // RocksDB's generic default compression level. Internally it'll be translated // to the default compression level specific to the library being used (see // comment above `ColumnFamilyOptions::compression`). // // The default value is the max 16-bit int as it'll be written out in OPTIONS // file, which should be portable. - const static int kDefaultCompressionLevel = 32767; + static constexpr int kDefaultCompressionLevel = 32767; + + // zlib only: windowBits parameter. See https://www.zlib.net/manual.html + int window_bits = -14; + + // Compression "level" applicable to zstd, zlib, LZ4, and LZ4HC. Except for + // kDefaultCompressionLevel (see above), the meaning of each value depends + // on the compression algorithm. Decreasing across non- + // `kDefaultCompressionLevel` values will either favor speed over + // compression ratio or have no effect. + // + // In LZ4 specifically, the absolute value of a negative `level` internally + // configures the `acceleration` parameter. For example, set `level=-10` for + // `acceleration=10`. This negation is necessary to ensure decreasing `level` + // values favor speed over compression ratio. + int level = kDefaultCompressionLevel; - int window_bits; - int level; - int strategy; + // zlib only: strategy parameter. See https://www.zlib.net/manual.html + int strategy = 0; // Maximum size of dictionaries used to prime the compression library. // Enabling dictionary can improve compression ratios when there are @@ -120,18 +111,14 @@ struct CompressionOptions { // If block cache insertion fails with `Status::MemoryLimit` (i.e., it is // full), we finalize the dictionary with whatever data we have and then stop // buffering. - // - // Default: 0. 
- uint32_t max_dict_bytes; + uint32_t max_dict_bytes = 0; // Maximum size of training data passed to zstd's dictionary trainer. Using // zstd's dictionary trainer can achieve even better compression ratio // improvements than using `max_dict_bytes` alone. // // The training data will be used to generate a dictionary of max_dict_bytes. - // - // Default: 0. - uint32_t zstd_max_train_bytes; + uint32_t zstd_max_train_bytes = 0; // Number of threads for parallel compression. // Parallel compression is enabled only if threads > 1. @@ -144,9 +131,7 @@ struct CompressionOptions { // compressed size is in flight when compression is parallelized. To be // reasonably accurate, this inflation is also estimated by using historical // compression ratio and current bytes inflight. - // - // Default: 1. - uint32_t parallel_threads; + uint32_t parallel_threads = 1; // When the compression options are set by the user, it will be set to "true". // For bottommost_compression_opts, to enable it, user must set enabled=true. @@ -155,9 +140,7 @@ struct CompressionOptions { // // For compression_opts, if compression_opts.enabled=false, it is still // used as compression options for compression process. - // - // Default: false. - bool enabled; + bool enabled = false; // Limit on data buffering when gathering samples to build a dictionary. Zero // means no limit. When dictionary is disabled (`max_dict_bytes == 0`), @@ -176,9 +159,7 @@ struct CompressionOptions { // `zstd_max_train_bytes` (when enabled) can restrict how many samples we can // pass to the dictionary trainer. Configuring it below `max_dict_bytes` can // restrict the size of the final dictionary. - // - // Default: 0 (unlimited) - uint64_t max_dict_buffer_bytes; + uint64_t max_dict_buffer_bytes = 0; // Use zstd trainer to generate dictionaries. When this option is set to true, // zstd_max_train_bytes of training data sampled from max_dict_buffer_bytes @@ -190,34 +171,37 @@ struct CompressionOptions { // data will be passed to this API. Using this API should save CPU time on // dictionary training, but the compression ratio may not be as good as using // a dictionary trainer. - // - // Default: true - bool use_zstd_dict_trainer; - - CompressionOptions() - : window_bits(-14), - level(kDefaultCompressionLevel), - strategy(0), - max_dict_bytes(0), - zstd_max_train_bytes(0), - parallel_threads(1), - enabled(false), - max_dict_buffer_bytes(0), - use_zstd_dict_trainer(true) {} - CompressionOptions(int wbits, int _lev, int _strategy, - uint32_t _max_dict_bytes, uint32_t _zstd_max_train_bytes, - uint32_t _parallel_threads, bool _enabled, - uint64_t _max_dict_buffer_bytes, - bool _use_zstd_dict_trainer) - : window_bits(wbits), - level(_lev), - strategy(_strategy), - max_dict_bytes(_max_dict_bytes), - zstd_max_train_bytes(_zstd_max_train_bytes), - parallel_threads(_parallel_threads), - enabled(_enabled), - max_dict_buffer_bytes(_max_dict_buffer_bytes), - use_zstd_dict_trainer(_use_zstd_dict_trainer) {} + bool use_zstd_dict_trainer = true; + + // ===> END options that can be set by deprecated configuration syntax <=== + // ===> Use compression_opts={level=6;strategy=7;} form for below opts <=== + + // Essentially specifies a minimum acceptable compression ratio. A block is + // stored uncompressed if the compressed block does not achieve this ratio, + // because the downstream cost of decompression is not considered worth such + // a small savings (if any). + // However, the ratio is specified in a way that is efficient for checking. 
+ // An integer from 1 to 1024 indicates the maximum allowable compressed bytes + // per 1KB of input, so the minimum acceptable ratio is 1024.0 / this value. + // For example, for a minimum ratio of 1.5:1, set to 683. See SetMinRatio(). + // Default: abandon use of compression for a specific block or entry if + // compressed by less than 12.5% (minimum ratio of 1.143:1). + int max_compressed_bytes_per_kb = 1024 * 7 / 8; + + // ZSTD only. + // Enable compression algorithm's checksum feature. + // (https://github.com/facebook/zstd/blob/d857369028d997c92ff1f1861a4d7f679a125464/lib/zstd.h#L428) + // Each compressed frame will have a 32-bit checksum attached. The checksum + // computed from the uncompressed data and can be verified during + // decompression. + bool checksum = false; + + // A convenience function for setting max_compressed_bytes_per_kb based on a + // minimum acceptable compression ratio (uncompressed size over compressed + // size). + void SetMinRatio(double min_ratio) { + max_compressed_bytes_per_kb = static_cast(1024.0 / min_ratio + 0.5); + } }; // Temperature of a file. Used to pass to FileSystem for a different @@ -232,12 +216,67 @@ enum class Temperature : uint8_t { kLastTemperature, }; +struct FileTemperatureAge { + Temperature temperature = Temperature::kUnknown; + uint64_t age = 0; +}; + +struct CompactionOptionsFIFO { + // once the total sum of table files reaches this, we will delete the oldest + // table file + // Default: 1GB + uint64_t max_table_files_size; + + // If true, try to do compaction to compact smaller files into larger ones. + // Minimum files to compact follows options.level0_file_num_compaction_trigger + // and compaction won't trigger if average compact bytes per del file is + // larger than options.write_buffer_size. This is to protect large files + // from being compacted again. + // Default: false; + bool allow_compaction = false; + + // DEPRECATED + // When not 0, if the data in the file is older than this threshold, RocksDB + // will soon move the file to warm temperature. + uint64_t age_for_warm = 0; + + // EXPERIMENTAL + // Age (in seconds) threshold for different file temperatures. + // When not empty, each element specifies an age threshold `age` and a + // temperature such that if all the data in a file is older than `age`, + // RocksDB will compact the file to the specified `temperature`. + // + // Note: + // - Flushed files will always have temperature kUnknown. + // - Compaction output files will have temperature kUnknown by default, so + // only temperatures other than kUnknown needs to be specified. + // - The elements should be in increasing order with respect to `age` field. + // + // Dynamically changeable through SetOptions() API, e.g., + // SetOptions("compaction_options_fifo", + // "{file_temperature_age_thresholds={ + // {age=10;temperature=kWarm}:{age=20;temperature=kCold}}}") + // In this example, all files that are at least 20 seconds old will be + // compacted and output files will have temperature kCold. All files that are + // at least 10 seconds old but younger than 20 seconds will be compacted to + // files with temperature kWarm. + // + // Default: empty + std::vector file_temperature_age_thresholds{}; + + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} + CompactionOptionsFIFO(uint64_t _max_table_files_size, bool _allow_compaction) + : max_table_files_size(_max_table_files_size), + allow_compaction(_allow_compaction) {} +}; + // The control option of how the cache tiers will be used. 
Currently rocksdb // support block cache (volatile tier), secondary cache (non-volatile tier). // In the future, we may add more caching layers. enum class CacheTier : uint8_t { kVolatileTier = 0, - kNonVolatileBlockTier = 0x01, + kVolatileCompressedTier = 0x01, + kNonVolatileBlockTier = 0x02, }; enum UpdateStatus { // Return status For inplace update callback @@ -577,11 +616,11 @@ struct AdvancedColumnFamilyOptions { // 1. target size is in the range of // (max_bytes_for_level_base / max_bytes_for_level_multiplier, // max_bytes_for_level_base] - // 2. target size of the last level (level num_levels-1) equals to extra size - // of the level. - // At the same time max_bytes_for_level_multiplier and - // max_bytes_for_level_multiplier_additional are still satisfied. - // (When L0 is too large, we make some adjustment. See below.) + // 2. target size of the last level (level num_levels-1) equals to the max + // size of a level in the LSM (typically the last level). + // At the same time max_bytes_for_level_multiplier is still satisfied. + // Note that max_bytes_for_level_multiplier_additional is ignored with this + // flag on. // // With this option on, from an empty DB, we make last level the base level, // which means merging L0 data into the last level, until it exceeds @@ -595,7 +634,7 @@ struct AdvancedColumnFamilyOptions { // and max_bytes_for_level_base=10MB. // Target sizes of level 1 to 5 starts with: // [- - - - 10MB] - // with base level is level. Target sizes of level 1 to 4 are not applicable + // with base level is level 5. Target sizes of level 1 to 4 are not applicable // because they will not be used. // Until the size of Level 5 grows to more than 10MB, say 11MB, we make // base target to level 4 and now the targets looks like: @@ -619,37 +658,37 @@ struct AdvancedColumnFamilyOptions { // By doing it, we give max_bytes_for_level_multiplier a priority against // max_bytes_for_level_base, for a more predictable LSM tree shape. It is // useful to limit worse case space amplification. + // If `allow_ingest_behind=true` or `preclude_last_level_data_seconds > 0`, + // then the last level is reserved, and we will start filling LSM from the + // second last level. + // + // With this option on, compaction is more adaptive to write traffic: + // Compaction priority will take into account estimated bytes to be compacted + // down to a level and favors compacting lower levels when there is a write + // traffic spike (and hence more compaction debt). Refer to + // https://github.com/facebook/rocksdb/wiki/Leveled-Compactio#option-level_compaction_dynamic_level_bytes-and-levels-target-size + // for more detailed description. See more implementation detail in: + // VersionStorageInfo::ComputeCompactionScore(). + // + // With this option on, unneeded levels will be drained automatically: + // Note that there may be excessive levels (where target level size is 0 when + // computed based on this feature) in the LSM. This can happen after a user + // migrates to turn this feature on or deletes a lot of data. This is + // especially likely when a user migrates from leveled compaction with a + // smaller multiplier or from universal compaction. RocksDB will gradually + // drain these unnecessary levels by compacting files down the LSM. Smaller + // number of levels should help to reduce read amplification. + // + // Migration to turn on this option: + // - Before RocksDB v8.2, users are expected to do a full manual compaction + // and then restart DB to turn on this option. 
+ // - Since RocksDB v8.2, users can just restart DB with this option on, as + // long as num_levels is no smaller than number of non-empty levels in the + // LSM. Migration will be done automatically by RocksDB. See more in + // https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#migrating-from-level_compaction_dynamic_level_bytesfalse-to-level_compaction_dynamic_level_bytestrue // - // - // If the compaction from L0 is lagged behind, a special mode will be turned - // on to prioritize write amplification against max_bytes_for_level_multiplier - // or max_bytes_for_level_base. The L0 compaction is lagged behind by looking - // at number of L0 files and total L0 size. If number of L0 files is at least - // the double of level0_file_num_compaction_trigger, or the total size is - // at least max_bytes_for_level_base, this mode is on. The target of L1 grows - // to the actual data size in L0, and then determine the target for each level - // so that each level will have the same level multiplier. - // - // For example, when L0 size is 100MB, the size of last level is 1600MB, - // max_bytes_for_level_base = 80MB, and max_bytes_for_level_multiplier = 10. - // Since L0 size is larger than max_bytes_for_level_base, this is a L0 - // compaction backlogged mode. So that the L1 size is determined to be 100MB. - // Based on max_bytes_for_level_multiplier = 10, at least 3 non-0 levels will - // be needed. The level multiplier will be calculated to be 4 and the three - // levels' target to be [100MB, 400MB, 1600MB]. - // - // In this mode, The number of levels will be no more than the normal mode, - // and the level multiplier will be lower. The write amplification will - // likely to be reduced. - // - // - // max_bytes_for_level_multiplier_additional is ignored with this flag on. - // - // Turning this feature on or off for an existing DB can cause unexpected - // LSM tree structure so it's not recommended. - // - // Default: false - bool level_compaction_dynamic_level_bytes = false; + // Default: true + bool level_compaction_dynamic_level_bytes = true; // Allows RocksDB to generate files that are not exactly the target_file_size // only for the non-bottommost files. Which can reduce the write-amplification @@ -668,12 +707,14 @@ struct AdvancedColumnFamilyOptions { // Different max-size multipliers for different levels. // These are multiplied by max_bytes_for_level_multiplier to arrive // at the max-size of each level. + // This option only applies to leveled compaction with + // `level_compaction_dynamic_level_bytes = false`. // // Default: 1 // // Dynamically changeable through SetOptions() API std::vector max_bytes_for_level_multiplier_additional = - std::vector(num_levels, 1); + std::vector(static_cast(num_levels), 1); // We try to limit number of bytes in one compaction to be lower than this // threshold. But it's not guaranteed. @@ -753,7 +794,7 @@ struct AdvancedColumnFamilyOptions { // Related options that were originally here but now moved include: // no_block_cache // block_cache - // block_cache_compressed + // block_cache_compressed (removed) // block_size // block_size_deviation // block_restart_interval @@ -833,30 +874,60 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API bool report_bg_io_stats = false; - // Files containing updates older than TTL will go through the compaction - // process. This usually happens in a cascading way so that those entries - // will be compacted to bottommost level/file. 
- // The feature is used to remove stale entries that have been deleted or - // updated from the file system. - // Pre-req: This needs max_open_files to be set to -1. - // In Level: Non-bottom-level files older than TTL will go through the - // compaction process. - // In FIFO: Files older than TTL will be deleted. + // This option has different meanings for different compaction styles: + // + // Leveled: Non-bottom-level files with all keys older than TTL will go + // through the compaction process. This usually happens in a cascading + // way so that those entries will be compacted to bottommost level/file. + // The feature is used to remove stale entries that have been deleted or + // updated from the file system. + // + // FIFO: Files with all keys older than TTL will be deleted. TTL is only + // supported if option max_open_files is set to -1. + // + // Universal: users should only set the option `periodic_compaction_seconds` + // below instead. For backward compatibility, this option has the same + // meaning as `periodic_compaction_seconds`. See more in comments for + // `periodic_compaction_seconds` on the interaction between these two + // options. + // + // This option only supports block based table format for any compaction + // style. + // // unit: seconds. Ex: 1 day = 1 * 24 * 60 * 60 - // In FIFO, this option will have the same meaning as - // periodic_compaction_seconds. Whichever stricter will be used. // 0 means disabling. // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to // pick default. // - // Default: 30 days for leveled compaction + block based table. disable - // otherwise. + // Default: 30 days if using block based table. 0 (disable) otherwise. // // Dynamically changeable through SetOptions() API + // Note that dynamically changing this option only works for leveled and FIFO + // compaction. For universal compaction, dynamically changing this option has + // no effect, users should dynamically change `periodic_compaction_seconds` + // instead. uint64_t ttl = 0xfffffffffffffffe; - // Files older than this value will be picked up for compaction, and - // re-written to the same level as they were before. + // This option has different meanings for different compaction styles: + // + // Leveled: files older than `periodic_compaction_seconds` will be picked up + // for compaction and will be re-written to the same level as they were + // before. + // + // FIFO: not supported. Setting this option has no effect for FIFO compaction. + // + // Universal: when there are files older than `periodic_compaction_seconds`, + // rocksdb will try to do as large a compaction as possible including the + // last level. Such compaction is only skipped if only last level is to + // be compacted and no file in last level is older than + // `periodic_compaction_seconds`. See more in + // UniversalCompactionBuilder::PickPeriodicCompaction(). + // For backward compatibility, the effective value of this option takes + // into account the value of option `ttl`. The logic is as follows: + // - both options are set to 30 days if they have the default value. + // - if both options are zero, zero is picked. Otherwise, we take the min + // value among non-zero options values (i.e. takes the stricter limit). + // // One main use of the feature is to make sure a file goes through compaction // filters periodically. Users can also use the feature to clear up SST // files using old format. 
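(Illustration, not part of the diff.) The option comments above describe how `ttl` and `periodic_compaction_seconds` interact across compaction styles. A minimal sketch of how these options might be set, both at open time and dynamically, is shown below; the database path, function name, and time values are illustrative only, and per the comments above, only `periodic_compaction_seconds` should be changed dynamically under universal compaction.

```cpp
// Sketch only: configures ttl and periodic_compaction_seconds.
#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

void ConfigurePeriodicCompaction() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Leveled compaction: compact non-bottom-level files whose keys are all
  // older than 7 days.
  options.ttl = 7 * 24 * 60 * 60;
  // Re-write files older than 30 days so they pass through compaction
  // filters periodically.
  options.periodic_compaction_seconds = 30 * 24 * 60 * 60;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/ttl_example_db", &db);
  assert(s.ok());

  // Both options are dynamically changeable through SetOptions(); as noted
  // above, prefer changing periodic_compaction_seconds (not ttl) when
  // running universal compaction.
  s = db->SetOptions({{"periodic_compaction_seconds", "86400"}});
  assert(s.ok());

  delete db;
}
```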
@@ -866,23 +937,19 @@ struct AdvancedColumnFamilyOptions { // age is based on the file's last modified time (given by the underlying // Env). // - // Supported in Level and FIFO compaction. - // In FIFO compaction, this option has the same meaning as TTL and whichever - // stricter will be used. - // Pre-req: max_open_file == -1. + // This option only supports block based table format for any compaction + // style. + // // unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 // // Values: // 0: Turn off Periodic compactions. - // UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature - // as needed. For now, RocksDB will change this value to 30 days - // (i.e 30 * 24 * 60 * 60) so that every file goes through the compaction - // process at least once every 30 days if not compacted sooner. - // In FIFO compaction, since the option has the same meaning as ttl, - // when this value is left default, and ttl is left to 0, 30 days will be - // used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. + // UINT64_MAX - 1 (0xfffffffffffffffe) is special flag to allow RocksDB to + // pick default. // - // Default: UINT64_MAX - 1 (allow RocksDB to auto-tune) + // Default: 30 days if using block based table format + compaction filter + + // leveled compaction or block based table format + universal compaction. + // 0 (disabled) otherwise. // // Dynamically changeable through SetOptions() API uint64_t periodic_compaction_seconds = 0xfffffffffffffffe; @@ -909,6 +976,14 @@ struct AdvancedColumnFamilyOptions { Temperature bottommost_temperature = Temperature::kUnknown; Temperature last_level_temperature = Temperature::kUnknown; + // EXPERIMENTAL + // When this field is set, all SST files without an explicitly set temperature + // will be treated as if they have this temperature for file reading + // accounting purpose, such as io statistics, io perf context. + // + // Not dynamically changeable, change it requires db restart. + Temperature default_temperature = Temperature::kUnknown; + // EXPERIMENTAL // The feature is still in development and is incomplete. // If this option is set, when data insert time is within this time range, it @@ -1085,8 +1160,84 @@ struct AdvancedColumnFamilyOptions { // // Default: 0 (no protection) // Supported values: 0, 1, 2, 4, 8. + // Dynamically changeable through the SetOptions() API. uint32_t memtable_protection_bytes_per_key = 0; + // UNDER CONSTRUCTION -- DO NOT USE + // When the user-defined timestamp feature is enabled, this flag controls + // whether the user-defined timestamps will be persisted. + // + // When it's false, the user-defined timestamps will be removed from the user + // keys when data is flushed from memtables to SST files. Other places that + // user keys can be persisted like file boundaries in file metadata and blob + // files go through a similar process. There are two major motivations + // for this flag: + // 1) backward compatibility: if the user later decides to + // disable the user-defined timestamp feature for the column family, these SST + // files can be handled by a user comparator that is not aware of user-defined + // timestamps. + // 2) enable user-defined timestamp feature for an existing column family + // while set this flag to be `false`: user keys in the newly generated SST + // files are of the same format as the existing SST files. 
+ // + // Currently only user comparator that formats user-defined timesamps as + // uint64_t via using one of the RocksDB provided comparator + // `ComparatorWithU64TsImpl` are supported. + // + // When setting this flag to `false`, users should also call + // `DB::IncreaseFullHistoryTsLow` to set a cutoff timestamp for flush. RocksDB + // refrains from flushing a memtable with data still above + // the cutoff timestamp with best effort. If this cutoff timestamp is not set, + // flushing continues normally. + // + // Users can do user-defined + // multi-versioned read above the cutoff timestamp. When users try to read + // below the cutoff timestamp, an error will be returned. + // + // Note that if WAL is enabled, unlike SST files, user-defined timestamps are + // persisted to WAL even if this flag is set to `false`. The benefit of this + // is that user-defined timestamps can be recovered with the caveat that users + // should flush all memtables so there is no active WAL files before doing a + // downgrade. In order to use WAL to recover user-defined timestamps, users of + // this feature would want to set both `avoid_flush_during_shutdown` and + // `avoid_flush_during_recovery` to be true. + // + // Note that setting this flag to false is not supported in combination with + // atomic flush, or concurrent memtable write enabled by + // `allow_concurrent_memtable_write`. + // + // Default: true (user-defined timestamps are persisted) + // Not dynamically changeable, change it requires db restart and + // only compatible changes are allowed. + bool persist_user_defined_timestamps = true; + + // Enable/disable per key-value checksum protection for in memory blocks. + // + // Checksum is constructed when a block is loaded into memory and verification + // is done for each key read from the block. This is useful for detecting + // in-memory data corruption. Note that this feature has a non-trivial + // negative impact on read performance. Different values of the + // option have similar performance impact, but different memory cost and + // corruption detection probability (e.g. 1 byte gives 255/256 chance for + // detecting a corruption). + // + // Default: 0 (no protection) + // Supported values: 0, 1, 2, 4, 8. + // Dynamically changeable through the SetOptions() API. + uint8_t block_protection_bytes_per_key = 0; + + // For leveled compaction, RocksDB may compact a file at the bottommost level + // if it can compact away data that were protected by some snapshot. + // The compaction reason in LOG for this kind of compactions is + // "BottommostFiles". Usually such compaction can happen as soon as a + // relevant snapshot is released. This option allows user to delay + // such compactions. A file is qualified for "BottommostFiles" compaction + // if it is at least "bottommost_file_compaction_delay" seconds old. + // + // Default: 0 (no delay) + // Dynamically changeable through the SetOptions() API. 
+ uint32_t bottommost_file_compaction_delay = 0; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 11e4d2686889..8a26585fe738 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -62,7 +62,6 @@ extern "C" { #endif -#include #include #include #include @@ -76,7 +75,10 @@ typedef struct rocksdb_backup_engine_options_t rocksdb_backup_engine_options_t; typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; typedef struct rocksdb_memory_allocator_t rocksdb_memory_allocator_t; typedef struct rocksdb_lru_cache_options_t rocksdb_lru_cache_options_t; +typedef struct rocksdb_hyper_clock_cache_options_t + rocksdb_hyper_clock_cache_options_t; typedef struct rocksdb_cache_t rocksdb_cache_t; +typedef struct rocksdb_write_buffer_manager_t rocksdb_write_buffer_manager_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t rocksdb_compactionfiltercontext_t; @@ -135,6 +137,10 @@ typedef struct rocksdb_wal_iterator_t rocksdb_wal_iterator_t; typedef struct rocksdb_wal_readoptions_t rocksdb_wal_readoptions_t; typedef struct rocksdb_memory_consumers_t rocksdb_memory_consumers_t; typedef struct rocksdb_memory_usage_t rocksdb_memory_usage_t; +typedef struct rocksdb_statistics_histogram_data_t + rocksdb_statistics_histogram_data_t; +typedef struct rocksdb_wait_for_compact_options_t + rocksdb_wait_for_compact_options_t; /* DB operations */ @@ -410,6 +416,16 @@ rocksdb_create_column_family(rocksdb_t* db, const rocksdb_options_t* column_family_options, const char* column_family_name, char** errptr); +extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t** +rocksdb_create_column_families(rocksdb_t* db, + const rocksdb_options_t* column_family_options, + int num_column_families, + const char* const* column_family_names, + size_t* lencfs, char** errptr); + +extern ROCKSDB_LIBRARY_API void rocksdb_create_column_families_destroy( + rocksdb_column_family_handle_t** list); + extern ROCKSDB_LIBRARY_API rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl( rocksdb_t* db, const rocksdb_options_t* column_family_options, @@ -597,13 +613,14 @@ extern ROCKSDB_LIBRARY_API void rocksdb_release_snapshot( extern ROCKSDB_LIBRARY_API char* rocksdb_property_value(rocksdb_t* db, const char* propname); /* returns 0 on success, -1 otherwise */ -int rocksdb_property_int(rocksdb_t* db, const char* propname, - uint64_t* out_val); +extern ROCKSDB_LIBRARY_API int rocksdb_property_int(rocksdb_t* db, + const char* propname, + uint64_t* out_val); /* returns 0 on success, -1 otherwise */ -int rocksdb_property_int_cf(rocksdb_t* db, - rocksdb_column_family_handle_t* column_family, - const char* propname, uint64_t* out_val); +extern ROCKSDB_LIBRARY_API int rocksdb_property_int_cf( + rocksdb_t* db, rocksdb_column_family_handle_t* column_family, + const char* propname, uint64_t* out_val); extern ROCKSDB_LIBRARY_API char* rocksdb_property_value_cf( rocksdb_t* db, rocksdb_column_family_handle_t* column_family, @@ -662,6 +679,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_flush_cf( rocksdb_t* db, const rocksdb_flushoptions_t* options, rocksdb_column_family_handle_t* column_family, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_flush_cfs( + rocksdb_t* db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_family, int num_column_families, + char** errptr); + extern 
ROCKSDB_LIBRARY_API void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr); @@ -1001,10 +1023,6 @@ extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_no_block_cache( extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_block_cache( rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache); extern ROCKSDB_LIBRARY_API void -rocksdb_block_based_options_set_block_cache_compressed( - rocksdb_block_based_table_options_t* options, - rocksdb_cache_t* block_cache_compressed); -extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_whole_key_filtering( rocksdb_block_based_table_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_format_version( @@ -1043,6 +1061,8 @@ rocksdb_block_based_options_set_pin_top_level_index_and_filter( rocksdb_block_based_table_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_write_buffer_manager( + rocksdb_options_t* opt, rocksdb_write_buffer_manager_t* wbm); /* Cuckoo table options */ @@ -1126,6 +1146,8 @@ extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_paranoid_checks( rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_db_paths( rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_cf_paths( + rocksdb_options_t*, const rocksdb_dbpath_t** path_values, size_t num_paths); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_env(rocksdb_options_t*, rocksdb_env_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_info_log(rocksdb_options_t*, @@ -1239,6 +1261,26 @@ rocksdb_options_set_max_bytes_for_level_multiplier_additional( rocksdb_options_t*, int* level_values, size_t num_levels); extern ROCKSDB_LIBRARY_API void rocksdb_options_enable_statistics( rocksdb_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_periodic_compaction_seconds( + rocksdb_options_t*, uint64_t); +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_options_get_periodic_compaction_seconds(rocksdb_options_t*); + +enum { + rocksdb_statistics_level_disable_all = 0, + rocksdb_statistics_level_except_tickers = + rocksdb_statistics_level_disable_all, + rocksdb_statistics_level_except_histogram_or_timers = 1, + rocksdb_statistics_level_except_timers = 2, + rocksdb_statistics_level_except_detailed_timers = 3, + rocksdb_statistics_level_except_time_for_mutex = 4, + rocksdb_statistics_level_all = 5, +}; + +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_statistics_level( + rocksdb_options_t*, int level); +extern ROCKSDB_LIBRARY_API int rocksdb_options_get_statistics_level( + rocksdb_options_t*); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt, unsigned char val); @@ -1315,6 +1357,11 @@ extern ROCKSDB_LIBRARY_API int rocksdb_options_get_prepopulate_blob_cache( /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_options_statistics_get_ticker_count( + rocksdb_options_t* opt, uint32_t ticker_type); +extern ROCKSDB_LIBRARY_API void rocksdb_options_statistics_get_histogram_data( + rocksdb_options_t* opt, uint32_t histogram_type, + rocksdb_statistics_histogram_data_t* const data); extern 
ROCKSDB_LIBRARY_API void rocksdb_options_set_max_write_buffer_number( rocksdb_options_t*, int); @@ -1515,7 +1562,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_skip_list_rep( extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hash_link_list_rep( rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_plain_table_factory( - rocksdb_options_t*, uint32_t, int, double, size_t); + rocksdb_options_t*, uint32_t, int, double, size_t, size_t, char, + unsigned char, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_min_level_to_compress( rocksdb_options_t* opt, int level); @@ -1613,6 +1661,10 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_row_cache( extern ROCKSDB_LIBRARY_API void rocksdb_options_add_compact_on_deletion_collector_factory( rocksdb_options_t*, size_t window_size, size_t num_dels_trigger); +extern ROCKSDB_LIBRARY_API void +rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio( + rocksdb_options_t*, size_t window_size, size_t num_dels_trigger, + double deletion_ratio); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_manual_wal_flush( rocksdb_options_t* opt, unsigned char); extern ROCKSDB_LIBRARY_API unsigned char rocksdb_options_get_manual_wal_flush( @@ -1625,6 +1677,10 @@ extern ROCKSDB_LIBRARY_API int rocksdb_options_get_wal_compression( /* RateLimiter */ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create( int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* +rocksdb_ratelimiter_create_auto_tuned(int64_t rate_bytes_per_sec, + int64_t refill_period_us, + int32_t fairness); extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy( rocksdb_ratelimiter_t*); @@ -1715,7 +1771,8 @@ enum { rocksdb_blob_checksum_time, rocksdb_blob_decompress_time, rocksdb_internal_range_del_reseek_count, - rocksdb_total_metric_count = 78 + rocksdb_block_read_cpu_time, + rocksdb_total_metric_count = 79 }; extern ROCKSDB_LIBRARY_API void rocksdb_set_perf_level(int); @@ -1902,6 +1959,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_timestamp( rocksdb_readoptions_t*, const char* ts, size_t tslen); extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_iter_start_ts( rocksdb_readoptions_t*, const char* ts, size_t tslen); +extern ROCKSDB_LIBRARY_API void rocksdb_readoptions_set_auto_readahead_size( + rocksdb_readoptions_t*, unsigned char); /* Write options */ @@ -2003,18 +2062,78 @@ extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru( extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(size_t capacity); extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_lru_opts( - rocksdb_lru_cache_options_t*); + const rocksdb_lru_cache_options_t*); + extern ROCKSDB_LIBRARY_API void rocksdb_cache_destroy(rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API void rocksdb_cache_disown_data( rocksdb_cache_t* cache); extern ROCKSDB_LIBRARY_API void rocksdb_cache_set_capacity( rocksdb_cache_t* cache, size_t capacity); extern ROCKSDB_LIBRARY_API size_t -rocksdb_cache_get_capacity(rocksdb_cache_t* cache); +rocksdb_cache_get_capacity(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_usage(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache); extern 
ROCKSDB_LIBRARY_API size_t -rocksdb_cache_get_usage(rocksdb_cache_t* cache); +rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache); + +/* WriteBufferManager */ + +extern ROCKSDB_LIBRARY_API rocksdb_write_buffer_manager_t* +rocksdb_write_buffer_manager_create(size_t buffer_size, bool allow_stall); +extern ROCKSDB_LIBRARY_API rocksdb_write_buffer_manager_t* +rocksdb_write_buffer_manager_create_with_cache(size_t buffer_size, + const rocksdb_cache_t* cache, + bool allow_stall); + +extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_destroy( + rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API bool rocksdb_write_buffer_manager_enabled( + rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API bool rocksdb_write_buffer_manager_cost_to_cache( + rocksdb_write_buffer_manager_t* wbm); extern ROCKSDB_LIBRARY_API size_t -rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache); +rocksdb_write_buffer_manager_memory_usage(rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_write_buffer_manager_mutable_memtable_memory_usage( + rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_write_buffer_manager_dummy_entries_in_cache_usage( + rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API size_t +rocksdb_write_buffer_manager_buffer_size(rocksdb_write_buffer_manager_t* wbm); +extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_buffer_size( + rocksdb_write_buffer_manager_t* wbm, size_t new_size); +extern ROCKSDB_LIBRARY_API void rocksdb_write_buffer_manager_set_allow_stall( + rocksdb_write_buffer_manager_t* wbm, bool new_allow_stall); + +/* HyperClockCache */ + +extern ROCKSDB_LIBRARY_API rocksdb_hyper_clock_cache_options_t* +rocksdb_hyper_clock_cache_options_create(size_t capacity, + size_t estimated_entry_charge); +extern ROCKSDB_LIBRARY_API void rocksdb_hyper_clock_cache_options_destroy( + rocksdb_hyper_clock_cache_options_t*); +extern ROCKSDB_LIBRARY_API void rocksdb_hyper_clock_cache_options_set_capacity( + rocksdb_hyper_clock_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_hyper_clock_cache_options_set_estimated_entry_charge( + rocksdb_hyper_clock_cache_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void +rocksdb_hyper_clock_cache_options_set_num_shard_bits( + rocksdb_hyper_clock_cache_options_t*, int); +extern ROCKSDB_LIBRARY_API void +rocksdb_hyper_clock_cache_options_set_memory_allocator( + rocksdb_hyper_clock_cache_options_t*, rocksdb_memory_allocator_t*); + +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* rocksdb_cache_create_hyper_clock( + size_t capacity, size_t estimated_entry_charge); +extern ROCKSDB_LIBRARY_API rocksdb_cache_t* +rocksdb_cache_create_hyper_clock_opts( + const rocksdb_hyper_clock_cache_options_t*); /* DBPath */ @@ -2120,6 +2239,11 @@ rocksdb_ingestexternalfileoptions_set_allow_blocking_flush( extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_set_ingest_behind( rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind); +extern ROCKSDB_LIBRARY_API void +rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level( + rocksdb_ingestexternalfileoptions_t* opt, + unsigned char fail_if_not_bottommost_level); + extern ROCKSDB_LIBRARY_API void rocksdb_ingestexternalfileoptions_destroy( rocksdb_ingestexternalfileoptions_t* opt); @@ -2202,6 +2326,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_universal_compaction_options_destroy( extern ROCKSDB_LIBRARY_API rocksdb_fifo_compaction_options_t* 
rocksdb_fifo_compaction_options_create(void); extern ROCKSDB_LIBRARY_API void +rocksdb_fifo_compaction_options_set_allow_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts, + unsigned char allow_compaction); +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_fifo_compaction_options_get_allow_compaction( + rocksdb_fifo_compaction_options_t* fifo_opts); +extern ROCKSDB_LIBRARY_API void rocksdb_fifo_compaction_options_set_max_table_files_size( rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size); extern ROCKSDB_LIBRARY_API uint64_t @@ -2339,6 +2470,9 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_relative_filename( rocksdb_sst_file_metadata_t* file_meta); +extern ROCKSDB_LIBRARY_API char* rocksdb_sst_file_metadata_get_directory( + rocksdb_sst_file_metadata_t* file_meta); + extern ROCKSDB_LIBRARY_API uint64_t rocksdb_sst_file_metadata_get_size(rocksdb_sst_file_metadata_t* file_meta); @@ -2397,6 +2531,12 @@ extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_property_value( extern ROCKSDB_LIBRARY_API int rocksdb_transactiondb_property_int( rocksdb_transactiondb_t* db, const char* propname, uint64_t* out_val); +extern ROCKSDB_LIBRARY_API rocksdb_t* rocksdb_transactiondb_get_base_db( + rocksdb_transactiondb_t* txn_db); + +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_close_base_db( + rocksdb_t* base_db); + extern ROCKSDB_LIBRARY_API rocksdb_transaction_t* rocksdb_transaction_begin( rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* write_options, @@ -2503,6 +2643,12 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get( const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs); +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_for_update( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf( rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, const rocksdb_column_family_handle_t* const* column_families, @@ -2510,6 +2656,13 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_cf( const size_t* keys_list_sizes, char** values_list, size_t* values_list_sizes, char** errs); +extern ROCKSDB_LIBRARY_API void rocksdb_transaction_multi_get_for_update_cf( + rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options, + const rocksdb_column_family_handle_t* const* column_families, + size_t num_keys, const char* const* keys_list, + const size_t* keys_list_sizes, char** values_list, + size_t* values_list_sizes, char** errs); + extern ROCKSDB_LIBRARY_API char* rocksdb_transactiondb_get( rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options, const char* key, size_t klen, size_t* vlen, char** errptr); @@ -2626,6 +2779,11 @@ extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cf( rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, rocksdb_column_family_handle_t* column_family, char** errptr); +extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_cfs( + rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options, + rocksdb_column_family_handle_t** column_families, int num_column_families, + char** errptr); + extern ROCKSDB_LIBRARY_API void rocksdb_transactiondb_flush_wal( rocksdb_transactiondb_t* txn_db, unsigned char sync, char** errptr); @@ -2796,6 +2954,67 @@ extern 
ROCKSDB_LIBRARY_API void rocksdb_disable_manual_compaction( extern ROCKSDB_LIBRARY_API void rocksdb_enable_manual_compaction(rocksdb_t* db); +extern ROCKSDB_LIBRARY_API rocksdb_statistics_histogram_data_t* +rocksdb_statistics_histogram_data_create(void); +extern ROCKSDB_LIBRARY_API void rocksdb_statistics_histogram_data_destroy( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_median( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_p95( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_p99( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_average( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_std_dev( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_max( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_statistics_histogram_data_get_count( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API uint64_t rocksdb_statistics_histogram_data_get_sum( + rocksdb_statistics_histogram_data_t* data); +extern ROCKSDB_LIBRARY_API double rocksdb_statistics_histogram_data_get_min( + rocksdb_statistics_histogram_data_t* data); + +extern ROCKSDB_LIBRARY_API void rocksdb_wait_for_compact( + rocksdb_t* db, rocksdb_wait_for_compact_options_t* options, char** errptr); + +extern ROCKSDB_LIBRARY_API rocksdb_wait_for_compact_options_t* +rocksdb_wait_for_compact_options_create(void); + +extern ROCKSDB_LIBRARY_API void rocksdb_wait_for_compact_options_destroy( + rocksdb_wait_for_compact_options_t* opt); + +extern ROCKSDB_LIBRARY_API void +rocksdb_wait_for_compact_options_set_abort_on_pause( + rocksdb_wait_for_compact_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_wait_for_compact_options_get_abort_on_pause( + rocksdb_wait_for_compact_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_wait_for_compact_options_set_flush( + rocksdb_wait_for_compact_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_wait_for_compact_options_get_flush( + rocksdb_wait_for_compact_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_wait_for_compact_options_set_close_db( + rocksdb_wait_for_compact_options_t* opt, unsigned char v); + +extern ROCKSDB_LIBRARY_API unsigned char +rocksdb_wait_for_compact_options_get_close_db( + rocksdb_wait_for_compact_options_t* opt); + +extern ROCKSDB_LIBRARY_API void rocksdb_wait_for_compact_options_set_timeout( + rocksdb_wait_for_compact_options_t* opt, uint64_t microseconds); + +extern ROCKSDB_LIBRARY_API uint64_t +rocksdb_wait_for_compact_options_get_timeout( + rocksdb_wait_for_compact_options_t* opt); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 584e119bc847..d3762b4a2e1d 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -7,28 +7,45 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // -// Various APIs for creating and customizing read caches in RocksDB. +// Various APIs for configuring, creating, and monitoring read caches. 
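(Illustration, not part of the diff.) The "monitoring" side of the updated description can be exercised through the accessors shown earlier in this change (`GetUsage()`, `GetPinnedUsage()`, and the load-factor pair `GetOccupancyCount()`/`GetTableAddressCount()`). A hedged sketch follows; the helper name is illustrative, and it assumes the default `rocksdb` namespace.

```cpp
// Sketch only: reports basic utilization stats for a block cache.
#include <cstdint>
#include <cstdio>
#include <memory>
#include "rocksdb/advanced_cache.h"  // full Cache interface
#include "rocksdb/cache.h"           // NewLRUCache

void ReportCacheStats(const std::shared_ptr<rocksdb::Cache>& cache) {
  std::printf("capacity     : %zu\n", cache->GetCapacity());
  std::printf("usage        : %zu\n", cache->GetUsage());
  std::printf("pinned usage : %zu\n", cache->GetPinnedUsage());
  // Occupancy / table address count support load-factor inspection;
  // SIZE_MAX and 0 respectively mean "not supported" by the implementation.
  size_t occupancy = cache->GetOccupancyCount();
  size_t slots = cache->GetTableAddressCount();
  if (occupancy != SIZE_MAX && slots != 0) {
    std::printf("load factor  : %.2f\n",
                static_cast<double>(occupancy) / static_cast<double>(slots));
  }
}

int main() {
  std::shared_ptr<rocksdb::Cache> cache =
      rocksdb::NewLRUCache(/*capacity=*/64 << 20);
  ReportCacheStats(cache);
  return 0;
}
```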
#pragma once #include -#include +#include #include #include #include "rocksdb/compression_type.h" +#include "rocksdb/data_structure.h" #include "rocksdb/memory_allocator.h" -#include "rocksdb/slice.h" -#include "rocksdb/statistics.h" -#include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -class Cache; +class Cache; // defined in advanced_cache.h struct ConfigOptions; -class Logger; class SecondaryCache; +// These definitions begin source compatibility for a future change in which +// a specific class for block cache is split away from general caches, so that +// the block cache API can continue to become more specialized and +// customizeable, including in ways incompatible with a general cache. For +// example, HyperClockCache is not usable as a general cache because it expects +// only fixed-size block cache keys, but this limitation is not yet reflected +// in the API function signatures. +// * Phase 1 (done) - Make both BlockCache and RowCache aliases for Cache, +// and make a factory function for row caches. Encourage users of row_cache +// (not common) to switch to the factory function for row caches. +// * Phase 2 - Split off RowCache as its own class, removing secondary +// cache support features and more from the API to simplify it. Between Phase 1 +// and Phase 2 users of row_cache will need to update their code. Any time +// after Phase 2, the block cache and row cache APIs can become more specialized +// in ways incompatible with general caches. +// * Phase 3 - Move existing RocksDB uses of Cache to BlockCache, and deprecate +// (but not yet remove) Cache as an alias for BlockCache. +using BlockCache = Cache; +using RowCache = Cache; + // Classifications of block cache entries. // // Developer notes: Adding a new enum to this class requires corresponding @@ -75,6 +92,9 @@ constexpr uint32_t kNumCacheEntryRoles = // Obtain a hyphen-separated, lowercase name of a `CacheEntryRole`. const std::string& GetCacheEntryRoleName(CacheEntryRole); +// A fast bit set for CacheEntryRoles +using CacheEntryRoleSet = SmallEnumSet; + // For use with `GetMapProperty()` for property // `DB::Properties::kBlockCacheEntryStats`. On success, the map will // be populated with all keys that can be obtained from these functions. @@ -136,6 +156,39 @@ struct ShardedCacheOptions { CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy; + // A SecondaryCache instance to use the non-volatile tier. For a RowCache + // this option must be kept as default empty. + std::shared_ptr secondary_cache; + + // See hash_seed comments below + static constexpr int32_t kQuasiRandomHashSeed = -1; + static constexpr int32_t kHostHashSeed = -2; + + // EXPERT OPTION: Specifies how a hash seed should be determined for the + // cache, or specifies a specific seed (only recommended for diagnostics or + // testing). + // + // Background: it could be dangerous to have different cache instances + // access the same SST files with the same hash seed, as correlated unlucky + // hashing across hosts or restarts could cause a widespread issue, rather + // than an isolated one. For example, with smaller block caches, it is + // possible for large full Bloom filters in a set of SST files to be randomly + // clustered into one cache shard, causing mutex contention or a thrashing + // condition as there's little or no space left for other entries assigned to + // the shard. If a set of SST files is broadcast and used on many hosts, we + // should ensure all have an independent chance of balanced shards. 
+ // + // Values >= 0 will be treated as fixed hash seeds. Values < 0 are reserved + // for methods of dynamically choosing a seed, currently: + // * kQuasiRandomHashSeed - Each cache created chooses a seed mostly randomly, + // except that within a process, no seed is repeated until all have been + // issued. + // * kHostHashSeed - The seed is determined based on hashing the host name. + // Although this is arguably slightly worse for production reliability, it + // solves the essential problem of cross-host correlation while ensuring + // repeatable behavior on a host, for diagnostic purposes. + int32_t hash_seed = kHostHashSeed; + ShardedCacheOptions() {} ShardedCacheOptions( size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, @@ -147,8 +200,17 @@ struct ShardedCacheOptions { strict_capacity_limit(_strict_capacity_limit), memory_allocator(std::move(_memory_allocator)), metadata_charge_policy(_metadata_charge_policy) {} + // Make ShardedCacheOptions polymorphic + virtual ~ShardedCacheOptions() = default; }; +// LRUCache - A cache using LRU eviction to stay at or below a set capacity. +// The cache is sharded to 2^num_shard_bits shards, by hash of the key. +// The total capacity is divided and evenly assigned to each shard, and each +// shard has its own LRU list for evictions. Each shard also has a mutex for +// exclusive access during operations; even read operations need exclusive +// access in order to update the LRU list. Mutex contention is usually low +// with enough shards. struct LRUCacheOptions : public ShardedCacheOptions { // Ratio of cache reserved for high-priority and low-priority entries, // respectively. (See Cache::Priority below more information on the levels.) @@ -156,7 +218,8 @@ struct LRUCacheOptions : public ShardedCacheOptions { // values cannot exceed 1. // // If high_pri_pool_ratio is greater than zero, a dedicated high-priority LRU - // list is maintained by the cache. Similarly, if low_pri_pool_ratio is + // list is maintained by the cache. A ratio of 0.5 means non-high-priority + // entries will use midpoint insertion. Similarly, if low_pri_pool_ratio is // greater than zero, a dedicated low-priority LRU list is maintained. // There is also a bottom-priority LRU list, which is always enabled and not // explicitly configurable. Entries are spilled over to the next available @@ -171,9 +234,6 @@ struct LRUCacheOptions : public ShardedCacheOptions { // otherwise, they are placed in the bottom-priority pool.) This results // in lower-priority entries without hits getting evicted from the cache // sooner. - // - // Default values: high_pri_pool_ratio = 0.5 (which is referred to as - // "midpoint insertion"), low_pri_pool_ratio = 0 double high_pri_pool_ratio = 0.5; double low_pri_pool_ratio = 0.0; @@ -183,9 +243,6 @@ struct LRUCacheOptions : public ShardedCacheOptions { // -DROCKSDB_DEFAULT_TO_ADAPTIVE_MUTEX, false otherwise. bool use_adaptive_mutex = kDefaultToAdaptiveMutex; - // A SecondaryCache instance to use a the non-volatile tier. 
- std::shared_ptr secondary_cache; - LRUCacheOptions() {} LRUCacheOptions(size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, double _high_pri_pool_ratio, @@ -200,31 +257,40 @@ struct LRUCacheOptions : public ShardedCacheOptions { high_pri_pool_ratio(_high_pri_pool_ratio), low_pri_pool_ratio(_low_pri_pool_ratio), use_adaptive_mutex(_use_adaptive_mutex) {} + + // Construct an instance of LRUCache using these options + std::shared_ptr MakeSharedCache() const; + + // Construct an instance of LRUCache for use as a row cache, typically for + // `DBOptions::row_cache`. Some options are not relevant to row caches. + std::shared_ptr MakeSharedRowCache() const; }; -// Create a new cache with a fixed size capacity. The cache is sharded -// to 2^num_shard_bits shards, by hash of the key. The total capacity -// is divided and evenly assigned to each shard. If strict_capacity_limit -// is set, insert to the cache will fail when cache is full. User can also -// set percentage of the cache reserves for high priority entries via -// high_pri_pool_pct. -// num_shard_bits = -1 means it is automatically determined: every shard -// will be at least 512KB and number of shard bits will not exceed 6. -extern std::shared_ptr NewLRUCache( +// DEPRECATED wrapper function +inline std::shared_ptr NewLRUCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, std::shared_ptr memory_allocator = nullptr, bool use_adaptive_mutex = kDefaultToAdaptiveMutex, CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy, - double low_pri_pool_ratio = 0.0); - -extern std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts); + double low_pri_pool_ratio = 0.0) { + return LRUCacheOptions(capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, + low_pri_pool_ratio) + .MakeSharedCache(); +} + +// DEPRECATED wrapper function +inline std::shared_ptr NewLRUCache(const LRUCacheOptions& cache_opts) { + return cache_opts.MakeSharedCache(); +} // EXPERIMENTAL -// Options structure for configuring a SecondaryCache instance based on -// LRUCache. The LRUCacheOptions.secondary_cache is not used and -// should not be set. +// Options structure for configuring a SecondaryCache instance with in-memory +// compression. The implementation uses LRUCache so inherits its options, +// except LRUCacheOptions.secondary_cache is not used and should not be set. struct CompressedSecondaryCacheOptions : LRUCacheOptions { // The compression method (if any) that is used to compress data. CompressionType compression_type = CompressionType::kLZ4Compression; @@ -240,6 +306,10 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { // into chunks so that they may better fit jemalloc bins. bool enable_custom_split_merge = false; + // Kinds of entries that should not be compressed, but can be stored. + // (Filter blocks are essentially non-compressible but others usually are.) 
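// Example (not part of the patch): a minimal sketch of building a block cache
// and a row cache through the new MakeSharedCache()/MakeSharedRowCache()
// factories instead of the deprecated NewLRUCache() wrappers. The capacities
// and shard count are illustrative only.
#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

ROCKSDB_NAMESPACE::Options MakeOptionsWithCaches() {
  using namespace ROCKSDB_NAMESPACE;

  LRUCacheOptions block_cache_opts;
  block_cache_opts.capacity = 1024 * 1024 * 1024;  // 1 GiB
  block_cache_opts.num_shard_bits = 6;
  block_cache_opts.high_pri_pool_ratio = 0.5;  // midpoint insertion
  // Pick a per-instance seed rather than the default host-derived seed.
  block_cache_opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
  std::shared_ptr<Cache> block_cache = block_cache_opts.MakeSharedCache();

  LRUCacheOptions row_cache_opts;
  row_cache_opts.capacity = 64 * 1024 * 1024;  // 64 MiB
  std::shared_ptr<RowCache> row_cache = row_cache_opts.MakeSharedRowCache();

  Options options;
  BlockBasedTableOptions table_opts;
  table_opts.block_cache = block_cache;
  options.table_factory.reset(NewBlockBasedTableFactory(table_opts));
  options.row_cache = row_cache;
  return options;
}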
+ CacheEntryRoleSet do_not_compress_roles = {CacheEntryRole::kFilterBlock}; + CompressedSecondaryCacheOptions() {} CompressedSecondaryCacheOptions( size_t _capacity, int _num_shard_bits, bool _strict_capacity_limit, @@ -250,19 +320,27 @@ struct CompressedSecondaryCacheOptions : LRUCacheOptions { kDefaultCacheMetadataChargePolicy, CompressionType _compression_type = CompressionType::kLZ4Compression, uint32_t _compress_format_version = 2, - bool _enable_custom_split_merge = false) + bool _enable_custom_split_merge = false, + const CacheEntryRoleSet& _do_not_compress_roles = + {CacheEntryRole::kFilterBlock}) : LRUCacheOptions(_capacity, _num_shard_bits, _strict_capacity_limit, _high_pri_pool_ratio, std::move(_memory_allocator), _use_adaptive_mutex, _metadata_charge_policy, _low_pri_pool_ratio), compression_type(_compression_type), compress_format_version(_compress_format_version), - enable_custom_split_merge(_enable_custom_split_merge) {} + enable_custom_split_merge(_enable_custom_split_merge), + do_not_compress_roles(_do_not_compress_roles) {} + + // Construct an instance of CompressedSecondaryCache using these options + std::shared_ptr MakeSharedSecondaryCache() const; + + // Avoid confusion with LRUCache + std::shared_ptr MakeSharedCache() const = delete; }; -// EXPERIMENTAL -// Create a new Secondary Cache that is implemented on top of LRUCache. -extern std::shared_ptr NewCompressedSecondaryCache( +// DEPRECATED wrapper function +inline std::shared_ptr NewCompressedSecondaryCache( size_t capacity, int num_shard_bits = -1, bool strict_capacity_limit = false, double high_pri_pool_ratio = 0.5, double low_pri_pool_ratio = 0.0, @@ -272,10 +350,23 @@ extern std::shared_ptr NewCompressedSecondaryCache( kDefaultCacheMetadataChargePolicy, CompressionType compression_type = CompressionType::kLZ4Compression, uint32_t compress_format_version = 2, - bool enable_custom_split_merge = false); - -extern std::shared_ptr NewCompressedSecondaryCache( - const CompressedSecondaryCacheOptions& opts); + bool enable_custom_split_merge = false, + const CacheEntryRoleSet& _do_not_compress_roles = { + CacheEntryRole::kFilterBlock}) { + return CompressedSecondaryCacheOptions( + capacity, num_shard_bits, strict_capacity_limit, + high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator, + use_adaptive_mutex, metadata_charge_policy, compression_type, + compress_format_version, enable_custom_split_merge, + _do_not_compress_roles) + .MakeSharedSecondaryCache(); +} + +// DEPRECATED wrapper function +inline std::shared_ptr NewCompressedSecondaryCache( + const CompressedSecondaryCacheOptions& opts) { + return opts.MakeSharedSecondaryCache(); +} // HyperClockCache - A lock-free Cache alternative for RocksDB block cache // that offers much improved CPU efficiency vs. LRUCache under high parallel @@ -285,8 +376,8 @@ extern std::shared_ptr NewCompressedSecondaryCache( // compatible with HyperClockCache. // * Requires an extra tuning parameter: see estimated_entry_charge below. // Similarly, substantially changing the capacity with SetCapacity could -// harm efficiency. -// * SecondaryCache is not yet supported. +// harm efficiency. -> EXPERIMENTAL: the tuning parameter can be set to 0 +// to find the appropriate balance automatically. // * Cache priorities are less aggressively enforced, which could cause // cache dilution from long range scans (unless they use fill_cache=false). 
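// Example (not part of the patch): a minimal sketch of attaching a compressed
// secondary cache to an LRU block cache via the new MakeShared* factories.
// The per-tier capacities are illustrative only.
#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeBlockCacheWithCompressedTier() {
  using namespace ROCKSDB_NAMESPACE;

  CompressedSecondaryCacheOptions sec_opts;
  sec_opts.capacity = 256 * 1024 * 1024;  // 256 MiB of compressed entries
  sec_opts.compression_type = CompressionType::kLZ4Compression;
  // do_not_compress_roles defaults to {CacheEntryRole::kFilterBlock}: filter
  // blocks are stored uncompressed, other roles are compressed.

  LRUCacheOptions primary_opts;
  primary_opts.capacity = 1024 * 1024 * 1024;  // 1 GiB uncompressed
  primary_opts.secondary_cache = sec_opts.MakeSharedSecondaryCache();
  return primary_opts.MakeSharedCache();
}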
// * Can be worse for small caches, because if almost all of a cache shard is @@ -295,10 +386,16 @@ extern std::shared_ptr NewCompressedSecondaryCache( // // See internal cache/clock_cache.h for full description. struct HyperClockCacheOptions : public ShardedCacheOptions { - // The estimated average `charge` associated with cache entries. This is a - // critical configuration parameter for good performance from the hyper - // cache, because having a table size that is fixed at creation time greatly - // reduces the required synchronization between threads. + // The estimated average `charge` associated with cache entries. + // + // EXPERIMENTAL: the field can be set to 0 to size the table dynamically + // and automatically. See also min_avg_entry_charge. This feature requires + // platform support for lazy anonymous memory mappings (incl Linux, Windows). + // Performance is very similar to choosing the best configuration parameter. + // + // PRODUCTION-TESTED: This is a critical configuration parameter for good + // performance, because having a table size that is fixed at creation time + // greatly reduces the required synchronization between threads. // * If the estimate is substantially too low (e.g. less than half the true // average) then metadata space overhead with be substantially higher (e.g. // 200 bytes per entry rather than 100). With kFullChargeCacheMetadata, this @@ -327,6 +424,23 @@ struct HyperClockCacheOptions : public ShardedCacheOptions { // to estimate toward the lower side than the higher side. size_t estimated_entry_charge; + // EXPERIMENTAL: When estimated_entry_charge == 0, this parameter establishes + // a promised lower bound on the average charge of all entries in the table, + // which is roughly the average uncompressed SST block size of block cache + // entries, typically > 4KB. The default should generally suffice with almost + // no cost. (This option is ignored for estimated_entry_charge > 0.) + // + // More detail: The table for indexing cache entries will grow automatically + // as needed, but a hard upper bound on that size is needed at creation time. + // The reason is that a contiguous memory mapping for the maximum size is + // created, but memory pages are only mapped to physical (RSS) memory as + // needed. If the average charge of all entries in the table falls below + // this value, the table will operate below its full logical capacity (total + // memory usage) because it has reached its physical capacity for efficiently + // indexing entries. The hash table is never allowed to exceed a certain safe + // load factor for efficient Lookup, Insert, etc. + size_t min_avg_entry_charge = 450; + HyperClockCacheOptions( size_t _capacity, size_t _estimated_entry_charge, int _num_shard_bits = -1, bool _strict_capacity_limit = false, @@ -352,426 +466,69 @@ extern std::shared_ptr NewClockCache( CacheMetadataChargePolicy metadata_charge_policy = kDefaultCacheMetadataChargePolicy); -// A Cache maps keys to objects resident in memory, tracks reference counts -// on those key-object entries, and is able to remove unreferenced entries -// whenever it wants. All operations are fully thread safe except as noted. -// Inserted entries have a specified "charge" which is some quantity in -// unspecified units, typically bytes of memory used. A Cache will typically -// have a finite capacity in units of charge, and evict entries as needed -// to stay at or below that capacity. 
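// Example (not part of the patch): a minimal sketch of the experimental
// automatic table sizing for HyperClockCache, selected by passing
// estimated_entry_charge == 0. It assumes HyperClockCacheOptions exposes a
// MakeSharedCache() factory analogous to LRUCacheOptions; the capacity is
// illustrative only.
#include "rocksdb/cache.h"
#include "rocksdb/table.h"

ROCKSDB_NAMESPACE::BlockBasedTableOptions MakeTableOptionsWithHCC() {
  using namespace ROCKSDB_NAMESPACE;

  // capacity = 2 GiB, estimated_entry_charge = 0: size the hash table
  // dynamically instead of committing to a fixed per-entry estimate.
  // min_avg_entry_charge (default 450) bounds how small the average entry may
  // get before the table runs out of slots; the default suits ~4KB+ blocks.
  HyperClockCacheOptions hcc_opts(2ULL * 1024 * 1024 * 1024,
                                  /*estimated_entry_charge=*/0);

  BlockBasedTableOptions table_opts;
  table_opts.block_cache = hcc_opts.MakeSharedCache();
  return table_opts;
}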
-// -// NOTE: This API is for expert use only and is more intended for providing -// custom implementations than for calling into. It is subject to change -// as RocksDB evolves, especially the RocksDB block cache. +enum PrimaryCacheType { + kCacheTypeLRU, // LRU cache type + kCacheTypeHCC, // Hyper Clock Cache type + kCacheTypeMax, +}; + +enum TieredAdmissionPolicy { + // Automatically select the admission policy + kAdmPolicyAuto, + // During promotion/demotion, first time insert a placeholder entry, second + // time insert the full entry if the placeholder is found, i.e insert on + // second hit + kAdmPolicyPlaceholder, + // Same as kAdmPolicyPlaceholder, but also if an entry in the primary cache + // was a hit, then force insert it into the compressed secondary cache + kAdmPolicyAllowCacheHits, + // An admission policy for three cache tiers - primary uncompressed, + // compressed secondary, and a compressed local flash (non-volatile) cache. + // Each tier is managed as an independent queue. + kAdmPolicyThreeQueue, + kAdmPolicyMax, +}; + +// EXPERIMENTAL +// The following feature is experimental, and the API is subject to change // -// INTERNAL: See typed_cache.h for convenient wrappers on top of this API. -class Cache { - public: // types hidden from API client - // Opaque handle to an entry stored in the cache. - struct Handle {}; - - public: // types hidden from Cache implementation - // Pointer to cached object of unspecified type. (This type alias is - // provided for clarity, not really for type checking.) - using ObjectPtr = void*; - - // Opaque object providing context (settings, etc.) to create objects - // for primary cache from saved (serialized) secondary cache entries. - struct CreateContext {}; - - public: // type defs - // Depending on implementation, cache entries with higher priority levels - // could be less likely to get evicted than entries with lower priority - // levels. The "high" priority level applies to certain SST metablocks (e.g. - // index and filter blocks) if the option - // cache_index_and_filter_blocks_with_high_priority is set. The "low" priority - // level is used for other kinds of SST blocks (most importantly, data - // blocks), as well as the above metablocks in case - // cache_index_and_filter_blocks_with_high_priority is - // not set. The "bottom" priority level is for BlobDB's blob values. - enum class Priority { HIGH, LOW, BOTTOM }; - - // A set of callbacks to allow objects in the primary block cache to be - // be persisted in a secondary cache. The purpose of the secondary cache - // is to support other ways of caching the object, such as persistent or - // compressed data, that may require the object to be parsed and transformed - // in some way. Since the primary cache holds C++ objects and the secondary - // cache may only hold flat data that doesn't need relocation, these - // callbacks need to be provided by the user of the block - // cache to do the conversion. - // The CacheItemHelper is passed to Insert() and Lookup(). It has pointers - // to callback functions for size, saving and deletion of the - // object. The callbacks are defined in C-style in order to make them - // stateless and not add to the cache metadata size. - // Saving multiple std::function objects will take up 32 bytes per - // function, even if its not bound to an object and does no capture. - // - // All the callbacks are C-style function pointers in order to simplify - // lifecycle management. 
Objects in the cache can outlive the parent DB, - // so anything required for these operations should be contained in the - // object itself. - // - // The SizeCallback takes a pointer to the object and returns the size - // of the persistable data. It can be used by the secondary cache to allocate - // memory if needed. - // - // RocksDB callbacks are NOT exception-safe. A callback completing with an - // exception can lead to undefined behavior in RocksDB, including data loss, - // unreported corruption, deadlocks, and more. - using SizeCallback = size_t (*)(ObjectPtr obj); - - // The SaveToCallback takes an object pointer and saves the persistable - // data into a buffer. The secondary cache may decide to not store it in a - // contiguous buffer, in which case this callback will be called multiple - // times with increasing offset - using SaveToCallback = Status (*)(ObjectPtr from_obj, size_t from_offset, - size_t length, char* out_buf); - - // A function pointer type for destruction of a cache object. This will - // typically call the destructor for the appropriate type of the object. - // The Cache is responsible for copying and reclaiming space for the key, - // but objects are managed in part using this callback. Generally a DeleterFn - // can be nullptr if the ObjectPtr does not need destruction (e.g. nullptr or - // pointer into static data). - using DeleterFn = void (*)(ObjectPtr obj, MemoryAllocator* allocator); - - // The CreateCallback is takes in a buffer from the NVM cache and constructs - // an object using it. The callback doesn't have ownership of the buffer and - // should copy the contents into its own buffer. The CreateContext* is - // provided by Lookup and may be used to follow DB- or CF-specific settings. - // In case of some error, non-OK is returned and the caller should ignore - // any result in out_obj. (The implementation must clean up after itself.) - using CreateCallback = Status (*)(const Slice& data, CreateContext* context, - MemoryAllocator* allocator, - ObjectPtr* out_obj, size_t* out_charge); - - // A struct with pointers to helper functions for spilling items from the - // cache into the secondary cache. May be extended in the future. An - // instance of this struct is expected to outlive the cache. - struct CacheItemHelper { - // Function for deleting an object on its removal from the Cache. - // nullptr is only for entries that require no destruction, such as - // "placeholder" cache entries with nullptr object. - DeleterFn del_cb; // (<- Most performance critical) - // Next three are used for persisting values as described above. - // If any is nullptr, then all three should be nullptr and persisting the - // entry to/from secondary cache is not supported. - SizeCallback size_cb; - SaveToCallback saveto_cb; - CreateCallback create_cb; - // Classification of the entry for monitoring purposes in block cache. 
- CacheEntryRole role; - - constexpr CacheItemHelper() - : del_cb(nullptr), - size_cb(nullptr), - saveto_cb(nullptr), - create_cb(nullptr), - role(CacheEntryRole::kMisc) {} - - explicit constexpr CacheItemHelper(CacheEntryRole _role, - DeleterFn _del_cb = nullptr, - SizeCallback _size_cb = nullptr, - SaveToCallback _saveto_cb = nullptr, - CreateCallback _create_cb = nullptr) - : del_cb(_del_cb), - size_cb(_size_cb), - saveto_cb(_saveto_cb), - create_cb(_create_cb), - role(_role) { - // Either all three secondary cache callbacks are non-nullptr or - // all three are nullptr - assert((size_cb != nullptr) == (saveto_cb != nullptr)); - assert((size_cb != nullptr) == (create_cb != nullptr)); - } - inline bool IsSecondaryCacheCompatible() const { - return size_cb != nullptr; - } - }; - - public: // ctor/dtor/create - Cache(std::shared_ptr allocator = nullptr) - : memory_allocator_(std::move(allocator)) {} - // No copying allowed - Cache(const Cache&) = delete; - Cache& operator=(const Cache&) = delete; - - // Destroys all remaining entries by calling the associated "deleter" - virtual ~Cache() {} - - // Creates a new Cache based on the input value string and returns the result. - // Currently, this method can be used to create LRUCaches only - // @param config_options - // @param value The value might be: - // - an old-style cache ("1M") -- equivalent to NewLRUCache(1024*102( - // - Name-value option pairs -- "capacity=1M; num_shard_bits=4; - // For the LRUCache, the values are defined in LRUCacheOptions. - // @param result The new Cache object - // @return OK if the cache was successfully created - // @return NotFound if an invalid name was specified in the value - // @return InvalidArgument if either the options were not valid - static Status CreateFromString(const ConfigOptions& config_options, - const std::string& value, - std::shared_ptr* result); - - public: // functions - // The type of the Cache - virtual const char* Name() const = 0; - - // The Insert and Lookup APIs below are intended to allow cached objects - // to be demoted/promoted between the primary block cache and a secondary - // cache. The secondary cache could be a non-volatile cache, and will - // likely store the object in a different representation. They rely on a - // per object CacheItemHelper to do the conversions. - // The secondary cache may persist across process and system restarts, - // and may even be moved between hosts. Therefore, the cache key must - // be repeatable across restarts/reboots, and globally unique if - // multiple DBs share the same cache and the set of DBs can change - // over time. - - // Insert a mapping from key->object into the cache and assign it - // the specified charge against the total cache capacity. If - // strict_capacity_limit is true and cache reaches its full capacity, - // return Status::MemoryLimit. `obj` must be non-nullptr if compatible - // with secondary cache (helper->size_cb != nullptr), because Value() == - // nullptr is reserved for indicating some secondary cache failure cases. - // On success, returns OK and takes ownership of `obj`, eventually deleting - // it with helper->del_cb. On non-OK return, the caller maintains ownership - // of `obj` so will often need to delete it in such cases. - // - // The helper argument is saved by the cache and will be used when the - // inserted object is evicted or considered for promotion to the secondary - // cache. Promotion to secondary cache is only enabled if helper->size_cb - // != nullptr. The helper must outlive the cache. 
Callers may use - // &kNoopCacheItemHelper as a trivial helper (no deleter for the object, - // no secondary cache). `helper` must not be nullptr (efficiency). - // - // If `handle` is not nullptr and return status is OK, `handle` is set - // to a Handle* for the entry. The caller must call this->Release(handle) - // when the returned entry is no longer needed. If `handle` is nullptr, it is - // as if Release is called immediately after Insert. - // - // Regardless of whether the item was inserted into the cache, - // it will attempt to insert it into the secondary cache if one is - // configured, and the helper supports it. - // The cache implementation must support a secondary cache, otherwise - // the item is only inserted into the primary cache. It may - // defer the insertion to the secondary cache as it sees fit. - // - // When the inserted entry is no longer needed, it will be destroyed using - // helper->del_cb (if non-nullptr). - virtual Status Insert(const Slice& key, ObjectPtr obj, - const CacheItemHelper* helper, size_t charge, - Handle** handle = nullptr, - Priority priority = Priority::LOW) = 0; - - // Lookup the key, returning nullptr if not found. If found, returns - // a handle to the mapping that must eventually be passed to Release(). - // - // If a non-nullptr helper argument is provided with a non-nullptr - // create_cb, and a secondary cache is configured, then the secondary - // cache is also queried if lookup in the primary cache fails. If found - // in secondary cache, the provided create_db and create_context are - // used to promote the entry to an object in the primary cache. - // In that case, the helper may be saved and used later when the object - // is evicted, so as usual, the pointed-to helper must outlive the cache. - // - // ======================== Async Lookup (wait=false) ====================== - // When wait=false, the handle returned might be in any of three states: - // * Present - If Value() != nullptr, then the result is present and - // the handle can be used just as if wait=true. - // * Pending, not ready (IsReady() == false) - secondary cache is still - // working to retrieve the value. Might become ready any time. - // * Pending, ready (IsReady() == true) - secondary cache has the value - // but it has not been loaded as an object into primary cache. Call to - // Wait()/WaitAll() will not block. - // - // IMPORTANT: Pending handles are not thread-safe, and only these functions - // are allowed on them: Value(), IsReady(), Wait(), WaitAll(). Even Release() - // can only come after Wait() or WaitAll() even though a reference is held. - // - // Only Wait()/WaitAll() gets a Handle out of a Pending state. (Waiting is - // safe and has no effect on other handle states.) After waiting on a Handle, - // it is in one of two states: - // * Present - if Value() != nullptr - // * Failed - if Value() == nullptr, such as if the secondary cache - // initially thought it had the value but actually did not. - // - // Note that given an arbitrary Handle, the only way to distinguish the - // Pending+ready state from the Failed state is to Wait() on it. A cache - // entry not compatible with secondary cache can also have Value()==nullptr - // like the Failed state, but this is not generally a concern. 
- virtual Handle* Lookup(const Slice& key, - const CacheItemHelper* helper = nullptr, - CreateContext* create_context = nullptr, - Priority priority = Priority::LOW, bool wait = true, - Statistics* stats = nullptr) = 0; - - // Convenience wrapper when secondary cache not supported - inline Handle* BasicLookup(const Slice& key, Statistics* stats) { - return Lookup(key, nullptr, nullptr, Priority::LOW, true, stats); - } - - // Increments the reference count for the handle if it refers to an entry in - // the cache. Returns true if refcount was incremented; otherwise, returns - // false. - // REQUIRES: handle must have been returned by a method on *this. - virtual bool Ref(Handle* handle) = 0; - - /** - * Release a mapping returned by a previous Lookup(). A released entry might - * still remain in cache in case it is later looked up by others. If - * erase_if_last_ref is set then it also erases it from the cache if there is - * no other reference to it. Erasing it should call the deleter function that - * was provided when the entry was inserted. - * - * Returns true if the entry was also erased. - */ - // REQUIRES: handle must not have been released yet. - // REQUIRES: handle must have been returned by a method on *this. - virtual bool Release(Handle* handle, bool erase_if_last_ref = false) = 0; - - // Return the object assiciated with a handle returned by a successful - // Lookup(). For historical reasons, this is also known at the "value" - // associated with the key. - // REQUIRES: handle must not have been released yet. - // REQUIRES: handle must have been returned by a method on *this. - virtual ObjectPtr Value(Handle* handle) = 0; - - // If the cache contains the entry for the key, erase it. Note that the - // underlying entry will be kept around until all existing handles - // to it have been released. - virtual void Erase(const Slice& key) = 0; - // Return a new numeric id. May be used by multiple clients who are - // sharding the same cache to partition the key space. Typically the - // client will allocate a new id at startup and prepend the id to - // its cache keys. - virtual uint64_t NewId() = 0; - - // sets the maximum configured capacity of the cache. When the new - // capacity is less than the old capacity and the existing usage is - // greater than new capacity, the implementation will do its best job to - // purge the released entries from the cache in order to lower the usage - virtual void SetCapacity(size_t capacity) = 0; - - // Set whether to return error on insertion when cache reaches its full - // capacity. - virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; - - // Get the flag whether to return error on insertion when cache reaches its - // full capacity. - virtual bool HasStrictCapacityLimit() const = 0; - - // Returns the maximum configured capacity of the cache - virtual size_t GetCapacity() const = 0; - - // Returns the memory size for the entries residing in the cache. - virtual size_t GetUsage() const = 0; - - // Returns the number of entries currently tracked in the table. SIZE_MAX - // means "not supported." This is used for inspecting the load factor, along - // with GetTableAddressCount(). - virtual size_t GetOccupancyCount() const { return SIZE_MAX; } - - // Returns the number of ways the hash function is divided for addressing - // entries. Zero means "not supported." This is used for inspecting the load - // factor, along with GetOccupancyCount(). 
- virtual size_t GetTableAddressCount() const { return 0; } - - // Returns the memory size for a specific entry in the cache. - virtual size_t GetUsage(Handle* handle) const = 0; - - // Returns the memory size for the entries in use by the system - virtual size_t GetPinnedUsage() const = 0; - - // Returns the charge for the specific entry in the cache. - virtual size_t GetCharge(Handle* handle) const = 0; - - // Returns the helper for the specified entry. - virtual const CacheItemHelper* GetCacheItemHelper(Handle* handle) const = 0; - - // Call this on shutdown if you want to speed it up. Cache will disown - // any underlying data and will not free it on delete. This call will leak - // memory - call this only if you're shutting down the process. - // Any attempts of using cache after this call will fail terribly. - // Always delete the DB object before calling this method! - virtual void DisownData() { - // default implementation is noop - } - - struct ApplyToAllEntriesOptions { - // If the Cache uses locks, setting `average_entries_per_lock` to - // a higher value suggests iterating over more entries each time a lock - // is acquired, likely reducing the time for ApplyToAllEntries but - // increasing latency for concurrent users of the Cache. Setting - // `average_entries_per_lock` to a smaller value could be helpful if - // callback is relatively expensive, such as using large data structures. - size_t average_entries_per_lock = 256; - }; - - // Apply a callback to all entries in the cache. The Cache must ensure - // thread safety but does not guarantee that a consistent snapshot of all - // entries is iterated over if other threads are operating on the Cache - // also. - virtual void ApplyToAllEntries( - const std::function& callback, - const ApplyToAllEntriesOptions& opts) = 0; - - // Remove all entries. - // Prerequisite: no entry is referenced. - virtual void EraseUnRefEntries() = 0; - - virtual std::string GetPrintableOptions() const { return ""; } - - // Check for any warnings or errors in the operation of the cache and - // report them to the logger. This is intended only to be called - // periodically so does not need to be very efficient. (Obscure calling - // conventions for Logger inherited from env.h) - virtual void ReportProblems( - const std::shared_ptr& /*info_log*/) const {} - - MemoryAllocator* memory_allocator() const { return memory_allocator_.get(); } - - // EXPERIMENTAL - // The following APIs are experimental and might change in the future. - - // Release a mapping returned by a previous Lookup(). The "useful" - // parameter specifies whether the data was actually used or not, - // which may be used by the cache implementation to decide whether - // to consider it as a hit for retention purposes. As noted elsewhere, - // "pending" handles require Wait()/WaitAll() before Release(). - virtual bool Release(Handle* handle, bool /*useful*/, - bool erase_if_last_ref) { - return Release(handle, erase_if_last_ref); - } - - // Determines if the handle returned by Lookup() can give a value without - // blocking, though Wait()/WaitAll() might be required to publish it to - // Value(). See secondary cache compatible Lookup() above for details. - // This call is not thread safe on "pending" handles. - virtual bool IsReady(Handle* /*handle*/) { return true; } - - // Convert a "pending" handle into a full thread-shareable handle by - // * If necessary, wait until secondary cache finishes loading the value. - // * Construct the object for primary cache and set it in the handle. 
- // Even after Wait() on a pending handle, the caller must check for - // Value() == nullptr in case of failure. This call is not thread-safe - // on pending handles. This call has no effect on non-pending handles. - // See secondary cache compatible Lookup() above for details. - virtual void Wait(Handle* /*handle*/) {} - - // Wait for a vector of handles to become ready. As with Wait(), the user - // should check the Value() of each handle for nullptr. This call is not - // thread-safe on pending handles. - virtual void WaitAll(std::vector& /*handles*/) {} - - private: - std::shared_ptr memory_allocator_; +// A 2-tier cache with a primary block cache, and a compressed secondary +// cache. The returned cache instance will internally allocate a primary +// uncompressed cache of the specified type, and a compressed secondary +// cache. Any cache memory reservations, such as WriteBufferManager +// allocations costed to the block cache, will be distributed +// proportionally across both the primary and secondary. +struct TieredCacheOptions { + ShardedCacheOptions* cache_opts = nullptr; + PrimaryCacheType cache_type = PrimaryCacheType::kCacheTypeLRU; + TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyAuto; + CompressedSecondaryCacheOptions comp_cache_opts; + // Any capacity specified in LRUCacheOptions, HyperClockCacheOptions and + // CompressedSecondaryCacheOptions is ignored + // The total_capacity specified here is taken as the memory budget and + // divided between the primary block cache and compressed secondary cache + size_t total_capacity = 0; + double compressed_secondary_ratio = 0.0; + // An optional secondary cache that will serve as the persistent cache + // tier. If present, compressed blocks will be written to this + // secondary cache. + std::shared_ptr nvm_sec_cache; }; -// Useful for cache entries requiring no clean-up, such as for cache -// reservations -inline constexpr Cache::CacheItemHelper kNoopCacheItemHelper{}; +extern std::shared_ptr NewTieredCache( + const TieredCacheOptions& cache_opts); +// EXPERIMENTAL +// Dynamically update some of the parameters of a TieredCache. The input +// cache shared_ptr should have been allocated using NewTieredVolatileCache. +// At the moment, there are a couple of limitations - +// 1. The total_capacity should be > the WriteBufferManager max size, if +// using the block cache charging feature +// 2. Once the compressed secondary cache is disabled by setting the +// compressed_secondary_ratio to 0.0, it cannot be dynamically re-enabled +// again +extern Status UpdateTieredCache( + const std::shared_ptr& cache, int64_t total_capacity = -1, + double compressed_secondary_ratio = std::numeric_limits::max(), + TieredAdmissionPolicy adm_policy = TieredAdmissionPolicy::kAdmPolicyMax); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/cloud/cloud_file_system.h b/include/rocksdb/cloud/cloud_file_system.h index 6651e51dcd2f..b8a61b3f3d8e 100644 --- a/include/rocksdb/cloud/cloud_file_system.h +++ b/include/rocksdb/cloud/cloud_file_system.h @@ -204,20 +204,6 @@ class CloudFileSystemOptions { // Default: null std::shared_ptr storage_provider; - // Specifies the amount of sst files to be cached in local storage. - // If non-null, then the local storage would be used as a file cache. - // The Get or a Scan request on the database generates a random read - // request on the sst file and such a request causes the sst file to - // be inserted into the local file cache. 
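// Example (not part of the patch): a minimal sketch of the experimental
// NewTieredCache() API with an LRU primary tier and a compressed secondary
// tier. The 2 GiB budget and the 70/30 split are illustrative only, and the
// sketch assumes cache_opts only needs to stay valid while the tiered cache
// is being constructed.
#include <memory>
#include "rocksdb/cache.h"

std::shared_ptr<ROCKSDB_NAMESPACE::Cache> MakeTieredBlockCache() {
  using namespace ROCKSDB_NAMESPACE;

  // Per-tier capacities inside cache_opts/comp_cache_opts are ignored; the
  // total_capacity budget below is divided according to the ratio.
  LRUCacheOptions primary_opts;
  primary_opts.high_pri_pool_ratio = 0.5;

  TieredCacheOptions tiered_opts;
  tiered_opts.cache_opts = &primary_opts;
  tiered_opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
  tiered_opts.adm_policy = TieredAdmissionPolicy::kAdmPolicyAuto;
  tiered_opts.comp_cache_opts.compression_type =
      CompressionType::kLZ4Compression;
  tiered_opts.total_capacity = 2ULL * 1024 * 1024 * 1024;  // 2 GiB budget
  tiered_opts.compressed_secondary_ratio = 0.3;  // 30% for the compressed tier

  return NewTieredCache(tiered_opts);
}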
- // A compaction request generates a sequential read request on the sst - // file and it does not cause the sst file to be inserted into the - // local file cache. - // A memtable flush generates a write requst to a new sst file and this - // sst file is not inserted into the local file cache. - // Cannot be set if keep_local_log_files is true. - // Default: null (disabled) - std::shared_ptr sst_file_cache; - // Access credentials AwsCloudAccessCredentials credentials; @@ -233,7 +219,6 @@ class CloudFileSystemOptions { // If false, then local sst files are created, uploaded to cloud immediately, // and local file is deleted. All reads are satisfied by fetching // data from the cloud. - // Cannot be set if sst_file_cache is enabled. // Default: false bool keep_local_sst_files; @@ -413,13 +398,11 @@ class CloudFileSystemOptions { int _constant_sst_file_size_in_sst_file_manager = -1, bool _skip_cloud_files_in_getchildren = false, bool _use_direct_io_for_cloud_download = false, - std::shared_ptr _sst_file_cache = nullptr, bool _roll_cloud_manifest_on_open = true, std::string _cookie_on_open = "", std::string _new_cookie_on_open = "", bool _delete_cloud_invisible_files_on_open = true, std::chrono::seconds _cloud_file_deletion_delay = std::chrono::hours(1)) : log_type(_log_type), - sst_file_cache(_sst_file_cache), keep_local_sst_files(_keep_local_sst_files), keep_local_log_files(_keep_local_log_files), purger_periodicity_millis(_purger_periodicity_millis), @@ -463,11 +446,6 @@ class CloudFileSystemOptions { const std::string& opts_str); Status Serialize(const ConfigOptions& config_options, std::string* result) const; - - // Is the sst file cache configured? - bool hasSstFileCache() const { - return sst_file_cache != nullptr && sst_file_cache->GetCapacity() > 0; - } }; struct CheckpointToCloudOptions { diff --git a/include/rocksdb/cloud/cloud_storage_provider_impl.h b/include/rocksdb/cloud/cloud_storage_provider_impl.h index 2efc8f07de78..22d8aa47dc9d 100644 --- a/include/rocksdb/cloud/cloud_storage_provider_impl.h +++ b/include/rocksdb/cloud/cloud_storage_provider_impl.h @@ -2,7 +2,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/cloud/cloud_storage_provider.h" #include @@ -157,5 +156,3 @@ class CloudStorageProviderImpl : public CloudStorageProvider { Status status_; }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/cloud/db_cloud.h b/include/rocksdb/cloud/db_cloud.h index 91cae0e3fa2d..3d4bfb4c1a36 100644 --- a/include/rocksdb/cloud/db_cloud.h +++ b/include/rocksdb/cloud/db_cloud.h @@ -1,7 +1,6 @@ // Copyright (c) 2017-present, Rockset #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -71,4 +70,3 @@ class DBCloud : public StackableDB { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 9c6a9c30d686..1784f2329ac6 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -11,11 +11,14 @@ #include #include #include +#include #include #include "rocksdb/customizable.h" #include "rocksdb/rocksdb_namespace.h" +#include "rocksdb/table_properties.h" #include "rocksdb/types.h" +#include "rocksdb/wide_columns.h" namespace ROCKSDB_NAMESPACE { @@ -25,28 +28,129 @@ class SliceTransform; // CompactionFilter allows an application to modify/delete a key-value during // table file creation. 
// -// Exceptions MUST NOT propagate out of overridden functions into RocksDB, +// Some general notes: +// +// * RocksDB snapshots do not guarantee to preserve the state of the DB in the +// presence of CompactionFilter. Data seen from a snapshot might disappear after +// a table file created with a `CompactionFilter` is installed. If you use +// snapshots, think twice about whether you want to use `CompactionFilter` and +// whether you are using it in a safe way. +// +// * If multithreaded compaction is being used *and* a single CompactionFilter +// instance was supplied via Options::compaction_filter, CompactionFilter +// methods may be called from different threads concurrently. The application +// must ensure that such calls are thread-safe. If the CompactionFilter was +// created by a factory, then it will only ever be used by a single thread that +// is doing the table file creation, and this call does not need to be +// thread-safe. However, multiple filters may be in existence and operating +// concurrently. +// +// * The key passed to the filtering methods includes the timestamp if +// user-defined timestamps are enabled. +// +// * Exceptions MUST NOT propagate out of overridden functions into RocksDB, // because RocksDB is not exception-safe. This could cause undefined behavior // including data loss, unreported corruption, deadlocks, and more. class CompactionFilter : public Customizable { public: + // Value type of the key-value passed to the compaction filter's FilterV2/V3 + // methods. enum ValueType { + // Plain key-value kValue, + // Merge operand kMergeOperand, - kBlobIndex, // used internally by BlobDB. + // Used internally by the old stacked BlobDB implementation; this value type + // is never passed to application code. Note that when using the new + // integrated BlobDB, values stored separately as blobs are retrieved and + // presented to FilterV2/V3 with the type kValue above. + kBlobIndex, + // Wide-column entity + kWideColumnEntity, }; + // Potential decisions that can be returned by the compaction filter's + // FilterV2/V3 and FilterBlobByKey methods. See decision-specific caveats and + // constraints below. enum class Decision { + // Keep the current key-value as-is. kKeep, + + // Remove the current key-value. Note that the semantics of removal are + // dependent on the value type. If the current key-value is a plain + // key-value or a wide-column entity, it is converted to a tombstone + // (Delete), resulting in the deletion of any earlier versions of the key. + // If it is a merge operand, it is simply dropped. Note: if you are using + // a TransactionDB, it is not recommended to filter out merge operands. + // If a Merge operation is filtered out, TransactionDB may not realize there + // is a write conflict and may allow a Transaction that should have failed + // to Commit. Instead, it is better to implement any Merge filtering inside + // the MergeOperator. kRemove, + + // Change the value of the current key-value. If the current key-value is a + // plain key-value or a merge operand, its value is updated but its value + // type remains the same. If the current key-value is a wide-column entity, + // it is converted to a plain key-value with the new value specified. kChangeValue, + + // Remove all key-values with key in [key, *skip_until). This range of keys + // will be skipped in a way that potentially avoids some IO operations + // compared to removing the keys one by one. 
Note that removal in this case + // means dropping the key-value regardless of value type; in other words, in + // contrast with kRemove, plain values and entities are not converted to + // tombstones. + // + // *skip_until <= key is treated the same as Decision::kKeep (since the + // range [key, *skip_until) is empty). + // + // Caveats: + // * The keys are skipped even if there are snapshots containing them, + // i.e. values removed by kRemoveAndSkipUntil can disappear from a + // snapshot - beware if you're using TransactionDB or DB::GetSnapshot(). + // * If value for a key was overwritten or merged into (multiple Put()s + // or Merge()s), and `CompactionFilter` skips this key with + // kRemoveAndSkipUntil, it's possible that it will remove only + // the new value, exposing the old value that was supposed to be + // overwritten. + // * Doesn't work with PlainTableFactory in prefix mode. + // * If you use kRemoveAndSkipUntil for table files created by compaction, + // consider also reducing compaction_readahead_size option. kRemoveAndSkipUntil, - kChangeBlobIndex, // used internally by BlobDB. - kIOError, // used internally by BlobDB. - kPurge, // used for keys that can only be SingleDelete'ed + + // Used internally by the old stacked BlobDB implementation. Returning this + // decision from application code is not supported. + kChangeBlobIndex, + + // Used internally by the old stacked BlobDB implementation. Returning this + // decision from application code is not supported. + kIOError, + + // Remove the current key-value by converting it to a SingleDelete-type + // tombstone. Only supported for plain-key values and wide-column entities; + // not supported for merge operands. All the caveats related to + // SingleDeletes apply. + kPurge, + + // Change the current key-value to the wide-column entity specified. If the + // current key-value is already a wide-column entity, only its columns are + // updated; if it is a plain key-value, it is converted to a wide-column + // entity with the specified columns. Not supported for merge operands. + // Only applicable to FilterV3. + kChangeWideColumnEntity, + + // When using the integrated BlobDB implementation, it may be possible for + // applications to make a filtering decision for a given blob based on + // the key only without actually reading the blob value, which saves some + // I/O; see the FilterBlobByKey method below. Returning kUndetermined from + // FilterBlobByKey signals that making a decision solely based on the + // key is not possible; in this case, RocksDB reads the blob value and + // passes the key-value to the regular filtering method. Only applicable to + // FilterBlobByKey; returning this value from FilterV2/V3 is not supported. kUndetermined, }; + // Used internally by the old stacked BlobDB implementation. enum class BlobDecision { kKeep, kChangeValue, kCorruption, kIOError }; // Context information for a table file creation. @@ -57,10 +161,20 @@ class CompactionFilter : public Customizable { // Whether this table file is created as part of a compaction requested by // the client. bool is_manual_compaction; + // The lowest level among all the input files (if any) used in table + // creation + int input_start_level = kUnknownStartLevel; // The column family that will contain the created table file. uint32_t column_family_id; // Reason this table file is being created. TableFileCreationReason reason; + // Map from all the input files (if any) used in table creation to their + // table properties. 
When there are such input files but RocksDB fail to + // load their table properties, `input_table_properties` will be an empty + // map. + TablePropertiesCollection input_table_properties; + + static const int kUnknownStartLevel = -1; }; virtual ~CompactionFilter() {} @@ -72,8 +186,8 @@ class CompactionFilter : public Customizable { // The table file creation process invokes this method before adding a kv to // the table file. A return value of false indicates that the kv should be // preserved in the new table file and a return value of true indicates - // that this key-value should be removed from the new table file. The - // application can inspect the existing value of the key and make decision + // that this key-value should be removed (that is, converted to a tombstone). + // The application can inspect the existing value of the key and make decision // based on it. // // Key-Values that are results of merge operation during table file creation @@ -84,23 +198,6 @@ class CompactionFilter : public Customizable { // When the value is to be preserved, the application has the option // to modify the existing_value and pass it back through new_value. // value_changed needs to be set to true in this case. - // - // Note that RocksDB snapshots (i.e. call GetSnapshot() API on a - // DB* object) will not guarantee to preserve the state of the DB with - // CompactionFilter. Data seen from a snapshot might disappear after a - // table file created with a `CompactionFilter` is installed. If you use - // snapshots, think twice about whether you want to use `CompactionFilter` and - // whether you are using it in a safe way. - // - // If multithreaded compaction is being used *and* a single CompactionFilter - // instance was supplied via Options::compaction_filter, this method may be - // called from different threads concurrently. The application must ensure - // that the call is thread-safe. - // - // If the CompactionFilter was created by a factory, then it will only ever - // be used by a single thread that is doing the table file creation, and this - // call does not need to be thread-safe. However, multiple filters may be - // in existence and operating concurrently. virtual bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*existing_value*/, std::string* /*new_value*/, @@ -122,48 +219,18 @@ class CompactionFilter : public Customizable { return false; } - // An extended API. Called for both values and merge operands. - // Allows changing value and skipping ranges of keys. + // A unified API for plain values and merge operands that may + // return a variety of decisions (see Decision above). The `value_type` + // parameter indicates the type of the key-value and the `existing_value` + // contains the current value or merge operand. The `new_value` output + // parameter can be used to set the updated value or merge operand when the + // kChangeValue decision is made by the filter. See the description of + // kRemoveAndSkipUntil above for the semantics of the `skip_until` output + // parameter, and see Decision above for more information on the semantics of + // the potential return values. + // // The default implementation uses Filter() and FilterMergeOperand(). // If you're overriding this method, no need to override the other two. - // `value_type` indicates whether this key-value corresponds to a normal - // value (e.g. written with Put()) or a merge operand (written with Merge()). - // - // Possible return values: - // * kKeep - keep the key-value pair. 
- // * kRemove - remove the key-value pair or merge operand. - // * kChangeValue - keep the key and change the value/operand to *new_value. - // * kRemoveAndSkipUntil - remove this key-value pair, and also remove - // all key-value pairs with key in [key, *skip_until). This range - // of keys will be skipped without reading, potentially saving some - // IO operations compared to removing the keys one by one. - // - // *skip_until <= key is treated the same as Decision::kKeep - // (since the range [key, *skip_until) is empty). - // - // Caveats: - // - The keys are skipped even if there are snapshots containing them, - // i.e. values removed by kRemoveAndSkipUntil can disappear from a - // snapshot - beware if you're using TransactionDB or - // DB::GetSnapshot(). - // - If value for a key was overwritten or merged into (multiple Put()s - // or Merge()s), and `CompactionFilter` skips this key with - // kRemoveAndSkipUntil, it's possible that it will remove only - // the new value, exposing the old value that was supposed to be - // overwritten. - // - Doesn't work with PlainTableFactory in prefix mode. - // - If you use kRemoveAndSkipUntil for table files created by - // compaction, consider also reducing compaction_readahead_size - // option. - // - // Should never return kUndetermined. - // Note: If you are using a TransactionDB, it is not recommended to filter - // out or modify merge operands (ValueType::kMergeOperand). - // If a merge operation is filtered out, TransactionDB may not realize there - // is a write conflict and may allow a Transaction to Commit that should have - // failed. Instead, it is better to implement any Merge filtering inside the - // MergeOperator. - // key includes timestamp if user-defined timestamp is enabled. virtual Decision FilterV2(int level, const Slice& key, ValueType value_type, const Slice& existing_value, std::string* new_value, std::string* /*skip_until*/) const { @@ -176,15 +243,61 @@ class CompactionFilter : public Customizable { } return value_changed ? Decision::kChangeValue : Decision::kKeep; } + case ValueType::kMergeOperand: { bool rv = FilterMergeOperand(level, key, existing_value); return rv ? Decision::kRemove : Decision::kKeep; } + case ValueType::kBlobIndex: return Decision::kKeep; + + default: + assert(false); + return Decision::kKeep; + } + } + + // Wide column aware unified API. Called for plain values, merge operands, and + // wide-column entities; the `value_type` parameter indicates the type of the + // key-value. When the key-value is a plain value or a merge operand, the + // `existing_value` parameter contains the existing value and the + // `existing_columns` parameter is invalid (nullptr). When the key-value is a + // wide-column entity, the `existing_columns` parameter contains the wide + // columns of the existing entity and the `existing_value` parameter is + // invalid (nullptr). The `new_value` output parameter can be used to set the + // updated value or merge operand when the kChangeValue decision is made by + // the filter. The `new_columns` output parameter can be used to specify + // the pairs of column names and column values when the + // kChangeWideColumnEntity decision is returned. See the description of + // kRemoveAndSkipUntil above for the semantics of the `skip_until` output + // parameter, and see Decision above for more information on the semantics of + // the potential return values. 
+ // + // For compatibility, the default implementation keeps all wide-column + // entities, and falls back to FilterV2 for plain values and merge operands. + // If you override this method, there is no need to override FilterV2 (or + // Filter/FilterMergeOperand). + virtual Decision FilterV3( + int level, const Slice& key, ValueType value_type, + const Slice* existing_value, const WideColumns* existing_columns, + std::string* new_value, + std::vector>* /* new_columns */, + std::string* skip_until) const { +#ifdef NDEBUG + (void)existing_columns; +#endif + + assert(!existing_value || !existing_columns); + assert(value_type == ValueType::kWideColumnEntity || existing_value); + assert(value_type != ValueType::kWideColumnEntity || existing_columns); + + if (value_type == ValueType::kWideColumnEntity) { + return Decision::kKeep; } - assert(false); - return Decision::kKeep; + + return FilterV2(level, key, value_type, *existing_value, new_value, + skip_until); } // Internal (BlobDB) use only. Do not override in application code. @@ -209,10 +322,15 @@ class CompactionFilter : public Customizable { virtual bool IsStackedBlobDbInternalCompactionFilter() const { return false; } // In the case of BlobDB, it may be possible to reach a decision with only - // the key without reading the actual value. Keys whose value_type is - // kBlobIndex will be checked by this method. - // Returning kUndetermined will cause FilterV2() to be called to make a - // decision as usual. + // the key without reading the actual value, saving some I/O operations. + // Keys where the value is stored separately in a blob file will be + // passed to this method. If the method returns a supported decision other + // than kUndetermined, it will be considered final and performed without + // reading the existing value. Returning kUndetermined will cause FilterV3() + // to be called to make a decision as usual. The output parameters + // `new_value` and `skip_until` are applicable to the decisions kChangeValue + // and kRemoveAndSkipUntil respectively, and have the same semantics as + // the corresponding parameters of FilterV2/V3. virtual Decision FilterBlobByKey(int /*level*/, const Slice& /*key*/, std::string* /*new_value*/, std::string* /*skip_until*/) const { diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 5ff8eccc8bf1..7e8153044364 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -24,6 +24,9 @@ struct CompactionJobStats { // the elapsed CPU time of this compaction in microseconds. uint64_t cpu_micros; + // Used internally indicating whether a subcompaction's + // `num_input_records` is accurate. + bool has_num_input_records; // the number of compaction input records. uint64_t num_input_records; // the number of blobs read from blob files diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index ad1e71a11929..4b39a25851ac 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -155,10 +155,50 @@ class Comparator : public Customizable, public CompareInterface { // Return a builtin comparator that uses lexicographic byte-wise // ordering. The result remains the property of this module and // must not be deleted. -extern const Comparator* BytewiseComparator(); +const Comparator* BytewiseComparator(); // Return a builtin comparator that uses reverse lexicographic byte-wise // ordering. 
-extern const Comparator* ReverseBytewiseComparator(); +const Comparator* ReverseBytewiseComparator(); + +// Returns a builtin comparator that enables user-defined timestamps (formatted +// as uint64_t) while ordering the user key part without UDT with a +// BytewiseComparator. +// For the same user key with different timestamps, larger (newer) timestamp +// comes first. +const Comparator* BytewiseComparatorWithU64Ts(); + +// Returns a builtin comparator that enables user-defined timestamps (formatted +// as uint64_t) while ordering the user key part without UDT with a +// ReverseBytewiseComparator. +// For the same user key with different timestamps, larger (newer) timestamp +// comes first. +const Comparator* ReverseBytewiseComparatorWithU64Ts(); + +// Decode a `U64Ts` timestamp returned by RocksDB to uint64_t. +// When a column family enables user-defined timestamp feature +// with `BytewiseComparatorWithU64Ts` or `ReverseBytewiseComparatorWithU64Ts` +// comparator, the `Iterator::timestamp()` API returns timestamp in `Slice` +// format. This util function helps to translate that `Slice` into an uint64_t +// type. +Status DecodeU64Ts(const Slice& ts, uint64_t* int_ts); + +// Encode an uint64_t timestamp into a U64Ts `Slice`, to be used as +// `ReadOptions.timestamp` for a column family that enables user-defined +// timestamp feature with `BytewiseComparatorWithU64Ts` or +// `ReverseBytewiseComparatorWithU64Ts` comparator. +// Be mindful that the returned `Slice` is backed by `ts_buf`. When `ts_buf` +// is deconstructed, the returned `Slice` can no longer be used. +Slice EncodeU64Ts(uint64_t ts, std::string* ts_buf); + +// Returns a `Slice` representing the maximum U64Ts timestamp. +// The returned `Slice` is backed by some static storage, so it's valid until +// program destruction. +Slice MaxU64Ts(); + +// Returns a `Slice` representing the minimum U64Ts timestamp. +// The returned `Slice` is backed by some static storage, so it's valid until +// program destruction. +Slice MinU64Ts(); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/configurable.h b/include/rocksdb/configurable.h index 7d20eb9bd621..7c538e83ae5c 100644 --- a/include/rocksdb/configurable.h +++ b/include/rocksdb/configurable.h @@ -48,10 +48,8 @@ class Configurable { std::string name; // Pointer to the object being registered void* opt_ptr; -#ifndef ROCKSDB_LITE // The map of options being registered const std::unordered_map* type_map; -#endif }; public: @@ -120,7 +118,6 @@ class Configurable { const std::unordered_map& opt_map, std::unordered_map* unused); -#ifndef ROCKSDB_LITE // Updates the named option to the input value, returning OK if successful. // Note that ConfigureOption does not cause PrepareOptions to be invoked. // @param config_options Controls how the name/value is processed. @@ -134,7 +131,6 @@ class Configurable { // @return InvalidArgument If the value cannot be successfully parsed. Status ConfigureOption(const ConfigOptions& config_options, const std::string& name, const std::string& value); -#endif // ROCKSDB_LITE // Configures the options for this class based on the input parameters. // On successful completion, the object is updated with the settings from @@ -170,7 +166,6 @@ class Configurable { // serialized. Status GetOptionString(const ConfigOptions& config_options, std::string* result) const; -#ifndef ROCKSDB_LITE // Returns the serialized options for this object. // This method is similar to GetOptionString with no errors. 
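To show how the U64Ts comparators and the encode/decode helpers declared above fit together, a small sketch; it assumes an open DB* `db` and a ColumnFamilyHandle* `cf` whose ColumnFamilyOptions::comparator was set to BytewiseComparatorWithU64Ts(), with error handling elided:

#include <string>
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"

void U64TsSketch(ROCKSDB_NAMESPACE::DB* db,
                 ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf) {
  using namespace ROCKSDB_NAMESPACE;
  // Write "key" at timestamp 42.
  std::string write_ts_buf;
  Slice write_ts = EncodeU64Ts(42, &write_ts_buf);
  Status s = db->Put(WriteOptions(), cf, "key", write_ts, "value");

  // Read the value as of timestamp 100 (sees the write made at 42).
  std::string read_ts_buf;
  Slice read_ts = EncodeU64Ts(100, &read_ts_buf);
  ReadOptions ro;
  ro.timestamp = &read_ts;
  std::string value;
  s = db->Get(ro, cf, "key", &value);

  // A timestamp Slice returned by RocksDB (e.g. Iterator::timestamp()) can be
  // translated back with: uint64_t int_ts; DecodeU64Ts(ts_slice, &int_ts);
}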
// @param config_options Controls how serialization happens. @@ -201,7 +196,6 @@ class Configurable { // its value cannot be serialized. virtual Status GetOption(const ConfigOptions& config_options, const std::string& name, std::string* value) const; -#endif // ROCKSDB_LITE // Checks to see if this Configurable is equivalent to other. // This method assumes that the two objects are of the same class. @@ -316,7 +310,6 @@ class Configurable { const std::unordered_map& opts_map, std::unordered_map* unused); -#ifndef ROCKSDB_LITE // Method that configures a the specific opt_name from opt_value. // By default, this method calls opt_info.ParseOption with the // input parameters. @@ -347,13 +340,10 @@ class Configurable { const void* const this_ptr, const void* const that_ptr, std::string* bad_name) const; -#endif -#ifndef ROCKSDB_LITE // Internal method to serialize options (ToString) // Classes may override this value to change its behavior. virtual std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const; -#endif // ROCKSDB_LITE // Given a name (e.g. rocksdb.my.type.opt), returns the short name (opt) virtual std::string GetOptionName(const std::string& long_name) const; diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index 921ec221beba..cff03f2bc194 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -90,10 +90,8 @@ struct ConfigOptions { // The environment to use for this option Env* env = Env::Default(); -#ifndef ROCKSDB_LITE // The object registry to use for this options std::shared_ptr registry; -#endif bool IsShallow() const { return depth == Depth::kDepthShallow; } bool IsDetailed() const { return depth == Depth::kDepthDetailed; } @@ -107,7 +105,6 @@ struct ConfigOptions { } }; -#ifndef ROCKSDB_LITE // The following set of functions provide a way to construct RocksDB Options // from a string or a string-to-string map. Here is the general rule of @@ -163,9 +160,10 @@ struct ConfigOptions { // "kCompactionStyleNone". // -// Take a default ColumnFamilyOptions "base_options" in addition to a -// map "opts_map" of option name to option value to construct the new -// ColumnFamilyOptions "new_options". +// Take a ConfigOptions `config_options` and a ColumnFamilyOptions +// "base_options" as the default option in addition to a map "opts_map" of +// option name to option value to construct the new ColumnFamilyOptions +// "new_options". // // Below are the instructions of how to config some non-primitive-typed // options in ColumnFamilyOptions: @@ -241,11 +239,6 @@ struct ConfigOptions { // cf_opt.compression_opts.strategy = 6; // cf_opt.compression_opts.max_dict_bytes = 7; // -// The GetColumnFamilyOptionsFromMap(ConfigOptions, ...) should be used; the -// alternative signature may be deprecated in a future release. The equivalent -// functionality can be achieved by setting the corresponding options in -// the ConfigOptions parameter. -// // @param config_options controls how the map is processed. // @param base_options the default options of the output "new_options". 
// @param opts_map an option name to value map for specifying how "new_options" @@ -270,15 +263,10 @@ Status GetColumnFamilyOptionsFromMap( const ColumnFamilyOptions& base_options, const std::unordered_map& opts_map, ColumnFamilyOptions* new_options); -Status GetColumnFamilyOptionsFromMap( - const ColumnFamilyOptions& base_options, - const std::unordered_map& opts_map, - ColumnFamilyOptions* new_options, bool input_strings_escaped = false, - bool ignore_unknown_options = false); -// Take a default DBOptions "base_options" in addition to a -// map "opts_map" of option name to option value to construct the new -// DBOptions "new_options". +// Take a ConfigOptions `config_options` and a DBOptions "base_options" as the +// default option in addition to a map "opts_map" of option name to option value +// to construct the new DBOptions "new_options". // // Below are the instructions of how to config some non-primitive-typed // options in DBOptions: @@ -289,11 +277,6 @@ Status GetColumnFamilyOptionsFromMap( // - Passing {"rate_limiter_bytes_per_sec", "1024"} is equivalent to // passing NewGenericRateLimiter(1024) to rate_limiter_bytes_per_sec. // -// The GetDBOptionsFromMap(ConfigOptions, ...) should be used; the -// alternative signature may be deprecated in a future release. The equivalent -// functionality can be achieved by setting the corresponding options in -// the ConfigOptions parameter. -// // @param config_options controls how the map is processed. // @param base_options the default options of the output "new_options". // @param opts_map an option name to value map for specifying how "new_options" @@ -317,15 +300,11 @@ Status GetDBOptionsFromMap( const ConfigOptions& cfg_options, const DBOptions& base_options, const std::unordered_map& opts_map, DBOptions* new_options); -Status GetDBOptionsFromMap( - const DBOptions& base_options, - const std::unordered_map& opts_map, - DBOptions* new_options, bool input_strings_escaped = false, - bool ignore_unknown_options = false); -// Take a default BlockBasedTableOptions "table_options" in addition to a -// map "opts_map" of option name to option value to construct the new -// BlockBasedTableOptions "new_table_options". +// Take a ConfigOptions `config_options` and a BlockBasedTableOptions +// "table_options" as the default option in addition to a map "opts_map" of +// option name to option value to construct the new BlockBasedTableOptions +// "new_table_options". // // Below are the instructions of how to config some non-primitive-typed // options in BlockBasedTableOptions: @@ -348,11 +327,6 @@ Status GetDBOptionsFromMap( // - Passing {"block_cache", "1M"} in GetBlockBasedTableOptionsFromMap is // equivalent to setting block_cache using NewLRUCache(1024 * 1024). // -// The GetBlockBasedTableOptionsFromMap(ConfigOptions, ...) should be used; -// the alternative signature may be deprecated in a future release. The -// equivalent functionality can be achieved by setting the corresponding -// options in the ConfigOptions parameter. -// // @param config_options controls how the map is processed. // @param table_options the default options of the output "new_table_options". 
// @param opts_map an option name to value map for specifying how @@ -372,20 +346,11 @@ Status GetBlockBasedTableOptionsFromMap( const BlockBasedTableOptions& table_options, const std::unordered_map& opts_map, BlockBasedTableOptions* new_table_options); -Status GetBlockBasedTableOptionsFromMap( - const BlockBasedTableOptions& table_options, - const std::unordered_map& opts_map, - BlockBasedTableOptions* new_table_options, - bool input_strings_escaped = false, bool ignore_unknown_options = false); -// Take a default PlainTableOptions "table_options" in addition to a -// map "opts_map" of option name to option value to construct the new -// PlainTableOptions "new_table_options". -// -// The GetPlainTableOptionsFromMap(ConfigOptions, ...) should be used; the -// alternative signature may be deprecated in a future release. The equivalent -// functionality can be achieved by setting the corresponding options in -// the ConfigOptions parameter. +// Take a ConfigOptions `config_options` and a default PlainTableOptions +// "table_options" as the default option in addition to a map "opts_map" of +// option name to option value to construct the new PlainTableOptions +// "new_table_options". // // @param config_options controls how the map is processed. // @param table_options the default options of the output "new_table_options". @@ -405,43 +370,26 @@ Status GetPlainTableOptionsFromMap( const ConfigOptions& config_options, const PlainTableOptions& table_options, const std::unordered_map& opts_map, PlainTableOptions* new_table_options); -Status GetPlainTableOptionsFromMap( - const PlainTableOptions& table_options, - const std::unordered_map& opts_map, - PlainTableOptions* new_table_options, bool input_strings_escaped = false, - bool ignore_unknown_options = false); -// Take a string representation of option names and values, apply them into the -// base_options, and return the new options as a result. The string has the -// following format: +// Take a ConfigOptions `config_options`, a string representation of option +// names and values, apply them into the base_options, and return the new +// options as a result. The string has the following format: // "write_buffer_size=1024;max_write_buffer_number=2" // Nested options config is also possible. For example, you can define // BlockBasedTableOptions as part of the string for block-based table factory: // "write_buffer_size=1024;block_based_table_factory={block_size=4k};" // "max_write_buffer_num=2" // -// -// The GetColumnFamilyOptionsFromString(ConfigOptions, ...) should be used; the -// alternative signature may be deprecated in a future release. The equivalent -// functionality can be achieved by setting the corresponding options in -// the ConfigOptions parameter. 
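A brief usage sketch of the ConfigOptions-based helpers documented above, using an option string similar to the example in the comment (the values are arbitrary):

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

void OptionsFromStringSketch() {
  using namespace ROCKSDB_NAMESPACE;
  ConfigOptions config_options;
  config_options.ignore_unknown_options = false;

  ColumnFamilyOptions base_opts;
  ColumnFamilyOptions new_opts;
  Status s = GetColumnFamilyOptionsFromString(
      config_options, base_opts,
      "write_buffer_size=1024;max_write_buffer_number=2;"
      "block_based_table_factory={block_size=4k}",
      &new_opts);
  // GetColumnFamilyOptionsFromMap() works the same way, taking an
  // std::unordered_map<std::string, std::string> of name/value pairs instead.
}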
Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, const ColumnFamilyOptions& base_options, const std::string& opts_str, ColumnFamilyOptions* new_options); -Status GetColumnFamilyOptionsFromString(const ColumnFamilyOptions& base_options, - const std::string& opts_str, - ColumnFamilyOptions* new_options); Status GetDBOptionsFromString(const ConfigOptions& config_options, const DBOptions& base_options, const std::string& opts_str, DBOptions* new_options); -Status GetDBOptionsFromString(const DBOptions& base_options, - const std::string& opts_str, - DBOptions* new_options); - Status GetStringFromDBOptions(const ConfigOptions& config_options, const DBOptions& db_options, std::string* opts_str); @@ -461,17 +409,11 @@ Status GetStringFromCompressionType(std::string* compression_str, std::vector GetSupportedCompressions(); -Status GetBlockBasedTableOptionsFromString( - const BlockBasedTableOptions& table_options, const std::string& opts_str, - BlockBasedTableOptions* new_table_options); Status GetBlockBasedTableOptionsFromString( const ConfigOptions& config_options, const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options); -Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, - const std::string& opts_str, - PlainTableOptions* new_table_options); Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, const PlainTableOptions& table_options, const std::string& opts_str, @@ -517,9 +459,8 @@ Status VerifySstFileChecksum(const Options& options, // Verify the checksum of file Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, - const ReadOptions& read_options, + const ReadOptions& _read_options, const std::string& file_path, const SequenceNumber& largest_seqno = 0); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/customizable.h b/include/rocksdb/customizable.h index 92f7504ae1bb..076aca659092 100644 --- a/include/rocksdb/customizable.h +++ b/include/rocksdb/customizable.h @@ -157,12 +157,10 @@ class Customizable : public Configurable { bool AreEquivalent(const ConfigOptions& config_options, const Configurable* other, std::string* mismatch) const override; -#ifndef ROCKSDB_LITE // Gets the value of the option associated with the input name // @see Configurable::GetOption for more details Status GetOption(const ConfigOptions& config_options, const std::string& name, std::string* value) const override; -#endif // ROCKSDB_LITE // Helper method for getting for parsing the opt_value into the corresponding // options for use in potentially creating a new Customizable object (this // method is primarily a support method for LoadSharedObject et al for new @@ -224,10 +222,8 @@ class Customizable : public Configurable { virtual const char* NickName() const { return ""; } // Given a name (e.g. 
rocksdb.my.type.opt), returns the short name (opt) std::string GetOptionName(const std::string& long_name) const override; -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& options, const std::string& prefix) const override; -#endif // ROCKSDB_LITE }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/data_structure.h b/include/rocksdb/data_structure.h index f868a6be59f9..ffab82c514a5 100644 --- a/include/rocksdb/data_structure.h +++ b/include/rocksdb/data_structure.h @@ -15,37 +15,172 @@ namespace ROCKSDB_NAMESPACE { -// This is a data structure specifically designed as a "Set" for a -// pretty small scale of Enum structure. For now, it can support up -// to 64 element, and it is expandable in the future. -template +namespace detail { +int CountTrailingZeroBitsForSmallEnumSet(uint64_t); +} // namespace detail + +// Represents a set of values of some enum type with a small number of +// possible enumerators. For now, it supports enums where no enumerator +// exceeds 63 when converted to int. +template class SmallEnumSet { + private: + using StateT = uint64_t; + static constexpr int kStateBits = sizeof(StateT) * 8; + static constexpr int kMaxMax = kStateBits - 1; + static constexpr int kMaxValue = static_cast(MAX_ENUMERATOR); + static_assert(kMaxValue >= 0); + static_assert(kMaxValue <= kMaxMax); + public: + // construct / create SmallEnumSet() : state_(0) {} - ~SmallEnumSet() {} + template + /*implicit*/ constexpr SmallEnumSet(const ENUM_TYPE e, TRest... rest) { + *this = SmallEnumSet(rest...).With(e); + } - // Return true if the input enum is included in the "Set" (i.e., changes the - // internal scalar state successfully), otherwise, it will return false. - bool Add(const ENUM_TYPE value) { - static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); - assert(value >= 0 && value <= MAX_VALUE); - uint64_t old_state = state_; - uint64_t tmp = 1; - state_ |= (tmp << value); - return old_state != state_; + // Return the set that includes all valid values, assuming the enum + // is "dense" (includes all values converting to 0 through kMaxValue) + static constexpr SmallEnumSet All() { + StateT tmp = StateT{1} << kMaxValue; + return SmallEnumSet(RawStateMarker(), tmp | (tmp - 1)); } + // equality + bool operator==(const SmallEnumSet& that) const { + return this->state_ == that.state_; + } + bool operator!=(const SmallEnumSet& that) const { return !(*this == that); } + + // query + // Return true if the input enum is contained in the "Set". 
- bool Contains(const ENUM_TYPE value) { - static_assert(MAX_VALUE <= 63, "Size currently limited to 64"); - assert(value >= 0 && value <= MAX_VALUE); - uint64_t tmp = 1; + bool Contains(const ENUM_TYPE e) const { + int value = static_cast(e); + assert(value >= 0 && value <= kMaxValue); + StateT tmp = 1; return state_ & (tmp << value); } + bool empty() const { return state_ == 0; } + + // iterator + class const_iterator { + public: + // copy + const_iterator(const const_iterator& that) = default; + const_iterator& operator=(const const_iterator& that) = default; + + // move + const_iterator(const_iterator&& that) noexcept = default; + const_iterator& operator=(const_iterator&& that) noexcept = default; + + // equality + bool operator==(const const_iterator& that) const { + assert(set_ == that.set_); + return this->pos_ == that.pos_; + } + + bool operator!=(const const_iterator& that) const { + return !(*this == that); + } + + // ++iterator + const_iterator& operator++() { + if (pos_ < kMaxValue) { + pos_ = set_->SkipUnset(pos_ + 1); + } else { + pos_ = kStateBits; + } + return *this; + } + + // iterator++ + const_iterator operator++(int) { + auto old = *this; + ++*this; + return old; + } + + ENUM_TYPE operator*() const { + assert(pos_ <= kMaxValue); + return static_cast(pos_); + } + + private: + friend class SmallEnumSet; + const_iterator(const SmallEnumSet* set, int pos) : set_(set), pos_(pos) {} + const SmallEnumSet* set_; + int pos_; + }; + + const_iterator begin() const { return const_iterator(this, SkipUnset(0)); } + + const_iterator end() const { return const_iterator(this, kStateBits); } + + // mutable ops + + // Modifies the set (if needed) to include the given value. Returns true + // iff the set was modified. + bool Add(const ENUM_TYPE e) { + int value = static_cast(e); + assert(value >= 0 && value <= kMaxValue); + StateT old_state = state_; + state_ |= (StateT{1} << value); + return old_state != state_; + } + + // Modifies the set (if needed) not to include the given value. Returns true + // iff the set was modified. + bool Remove(const ENUM_TYPE e) { + int value = static_cast(e); + assert(value >= 0 && value <= kMaxValue); + StateT old_state = state_; + state_ &= ~(StateT{1} << value); + return old_state != state_; + } + + // applicative ops + + // Return a new set based on this one with the additional value(s) inserted + constexpr SmallEnumSet With(const ENUM_TYPE e) const { + int value = static_cast(e); + assert(value >= 0 && value <= kMaxValue); + return SmallEnumSet(RawStateMarker(), state_ | (StateT{1} << value)); + } + template + constexpr SmallEnumSet With(const ENUM_TYPE e1, const ENUM_TYPE e2, + TRest... rest) const { + return With(e1).With(e2, rest...); + } + + // Return a new set based on this one excluding the given value(s) + constexpr SmallEnumSet Without(const ENUM_TYPE e) const { + int value = static_cast(e); + assert(value >= 0 && value <= kMaxValue); + return SmallEnumSet(RawStateMarker(), state_ & ~(StateT{1} << value)); + } + template + constexpr SmallEnumSet Without(const ENUM_TYPE e1, const ENUM_TYPE e2, + TRest... 
rest) const { + return Without(e1).Without(e2, rest...); + } + private: - uint64_t state_; + int SkipUnset(int pos) const { + StateT tmp = state_ >> pos; + if (tmp == 0) { + return kStateBits; + } else { + return pos + detail::CountTrailingZeroBitsForSmallEnumSet(tmp); + } + } + struct RawStateMarker {}; + explicit SmallEnumSet(RawStateMarker, StateT state) : state_(state) {} + + StateT state_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 53dcea7ddb8e..0d37780577b9 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -53,20 +53,14 @@ struct Options; struct ReadOptions; struct TableProperties; struct WriteOptions; -#ifdef ROCKSDB_LITE -class CompactionJobInfo; -#endif +struct WaitForCompactOptions; class Env; class EventListener; class FileSystem; -#ifndef ROCKSDB_LITE class Replayer; -#endif class StatsHistoryIterator; -#ifndef ROCKSDB_LITE class TraceReader; class TraceWriter; -#endif class WriteBatch; extern const std::string kDefaultColumnFamilyName; @@ -106,6 +100,8 @@ static const int kMinorVersion = __ROCKSDB_MINOR__; // A range of keys struct Range { + // In case of user_defined timestamp, if enabled, `start` and `limit` should + // point to key without timestamp part. Slice start; Slice limit; @@ -114,6 +110,8 @@ struct Range { }; struct RangePtr { + // In case of user_defined timestamp, if enabled, `start` and `limit` should + // point to key without timestamp part. const Slice* start; const Slice* limit; @@ -137,6 +135,11 @@ struct IngestExternalFileArg { }; struct GetMergeOperandsOptions { + // A limit on the number of merge operands returned by the GetMergeOperands() + // API. In contrast with ReadOptions::merge_operator_max_count, this is a hard + // limit: when it is exceeded, no merge operands will be returned and the + // query will fail with an Incomplete status. See also the + // DB::GetMergeOperands() API below. int expected_max_number_of_operands = 0; }; @@ -197,8 +200,6 @@ class DB { // Open the database for read only. // - // Not supported in ROCKSDB_LITE, in which case the function will - // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, bool error_if_wal_file_exists = false); @@ -210,8 +211,6 @@ class DB { // to specify default column family. The default column family name is // 'default' and it's stored in ROCKSDB_NAMESPACE::kDefaultColumnFamilyName // - // Not supported in ROCKSDB_LITE, in which case the function will - // return Status::NotSupported. static Status OpenForReadOnly( const DBOptions& db_options, const std::string& name, const std::vector& column_families, @@ -312,6 +311,18 @@ class DB { std::vector* handles, DB** dbptr, std::string trim_ts); + // Manually, synchronously attempt to resume DB writes after a write failure + // to the underlying filesystem. See + // https://github.com/facebook/rocksdb/wiki/Background-Error-Handling + // + // Returns OK if writes are successfully resumed, or there was no + // outstanding error to recover from. Returns underlying write error if + // it is not recoverable. + // + // WART: Does not mix well with auto-resume. Will return Busy if an + // auto-resume is in progress, without waiting for it to complete. + // See DBOptions::max_bgerror_resume_count and + // EventListener::OnErrorRecoveryBegin virtual Status Resume() { return Status::NotSupported(); } // Close the DB by releasing resources, closing files etc. 
This should be @@ -320,12 +331,17 @@ class DB { // If syncing is required, the caller must first call SyncWAL(), or Write() // using an empty write batch with WriteOptions.sync=true. // Regardless of the return status, the DB must be freed. + // // If the return status is Aborted(), closing fails because there is // unreleased snapshot in the system. In this case, users can release // the unreleased snapshots and try again and expect it to succeed. For // other status, re-calling Close() will be no-op and return the original // close status. If the return status is NotSupported(), then the DB // implementation does cleanup in the destructor + // + // WaitForCompact() with WaitForCompactOptions.close_db=true will be a good + // choice for users who want to wait for background work before closing + // (rather than aborting and potentially redoing some work on re-open) virtual Status Close() { return Status::NotSupported(); } // ListColumnFamilies will open the DB specified by argument name @@ -346,6 +362,10 @@ class DB { // Create a column_family and return the handle of column family // through the argument handle. + // NOTE: creating many column families one-by-one is not recommended because + // of quadratic overheads, such as writing a full OPTIONS file for all CFs + // after each new CF creation. Use CreateColumnFamilies(), or DB::Open() with + // create_missing_column_families=true. virtual Status CreateColumnFamily(const ColumnFamilyOptions& options, const std::string& column_family_name, ColumnFamilyHandle** handle); @@ -415,6 +435,10 @@ class DB { virtual Status PutEntity(const WriteOptions& options, ColumnFamilyHandle* column_family, const Slice& key, const WideColumns& columns); + // Split and store wide column entities in multiple column families (a.k.a. + // AttributeGroups) + virtual Status PutEntity(const WriteOptions& options, const Slice& key, + const AttributeGroups& attribute_groups); // Remove the database entry (if any) for "key". Returns OK on // success, and a non-OK status on error. It is not an error if "key" @@ -485,6 +509,15 @@ class DB { ColumnFamilyHandle* column_family, const Slice& begin_key, const Slice& end_key, const Slice& ts); + virtual Status DeleteRange(const WriteOptions& options, + const Slice& begin_key, const Slice& end_key) { + return DeleteRange(options, DefaultColumnFamily(), begin_key, end_key); + } + virtual Status DeleteRange(const WriteOptions& options, + const Slice& begin_key, const Slice& end_key, + const Slice& ts) { + return DeleteRange(options, DefaultColumnFamily(), begin_key, end_key, ts); + } // Merge the database entry for "key" with "value". Returns OK on success, // and a non-OK status on error. The semantics of this operation is @@ -582,6 +615,16 @@ class DB { return Status::NotSupported("GetEntity not supported"); } + // Returns logically grouped wide-column entities per column family (a.k.a. + // attribute groups) for a single key. PinnableAttributeGroups is a vector of + // PinnableAttributeGroup. Each PinnableAttributeGroup will have + // ColumnFamilyHandle* as input, and Status and PinnableWideColumns as output. + virtual Status GetEntity(const ReadOptions& /* options */, + const Slice& /* key */, + PinnableAttributeGroups* /* result */) { + return Status::NotSupported("GetEntity not supported"); + } + // Populates the `merge_operands` array with all the merge operands in the DB // for `key`. The `merge_operands` array will be populated in the order of // insertion. 
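A minimal sketch of the single-key wide-column entry points introduced above (PutEntity() and the column-family GetEntity() overload); `db` and `cf` are assumed to be an open DB and column family, with error handling elided:

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

void WideColumnSketch(ROCKSDB_NAMESPACE::DB* db,
                      ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf) {
  using namespace ROCKSDB_NAMESPACE;
  // Store an entity with two named columns under key "user1".
  WideColumns columns{{"name", "alice"}, {"level", "7"}};
  Status s = db->PutEntity(WriteOptions(), cf, "user1", columns);

  // Read it back; a plain key-value would come back as a single anonymous
  // column (see kDefaultWideColumnName).
  PinnableWideColumns result;
  s = db->GetEntity(ReadOptions(), cf, "user1", &result);
  for (const WideColumn& col : result.columns()) {
    // col.name() and col.value() are Slices
    (void)col;
  }
}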
The number of entries populated in `merge_operands` will be @@ -780,6 +823,96 @@ class DB { } } + // Batched MultiGet-like API that returns wide-column entities from a single + // column family. For any given "key[i]" in "keys" (where 0 <= "i" < + // "num_keys"), if the column family specified by "column_family" contains an + // entry, it is returned it as a wide-column entity in "results[i]". If the + // entry is a wide-column entity, it is returned as-is; if it is a plain + // key-value, it is returned as an entity with a single anonymous column (see + // kDefaultWideColumnName) which contains the value. + // + // "statuses[i]" is set to OK if "keys[i]" is successfully retrieved. It is + // set to NotFound and an empty wide-column entity is returned in "results[i]" + // if there is no entry for "keys[i]". Finally, "statuses[i]" is set to some + // other non-OK status on error. + // + // If "keys" are sorted according to the column family's comparator, the + // "sorted_input" flag can be set for a small performance improvement. + // + // Note that it is the caller's responsibility to ensure that "keys", + // "results", and "statuses" point to "num_keys" number of contiguous objects + // (Slices, PinnableWideColumns, and Statuses respectively). + virtual void MultiGetEntity(const ReadOptions& /* options */, + ColumnFamilyHandle* /* column_family */, + size_t num_keys, const Slice* /* keys */, + PinnableWideColumns* /* results */, + Status* statuses, + bool /* sorted_input */ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::NotSupported("MultiGetEntity not supported"); + } + } + + // Batched MultiGet-like API that returns wide-column entities potentially + // from multiple column families. For any given "key[i]" in "keys" (where 0 <= + // "i" < "num_keys"), if the column family specified by "column_families[i]" + // contains an entry, it is returned it as a wide-column entity in + // "results[i]". If the entry is a wide-column entity, it is returned as-is; + // if it is a plain key-value, it is returned as an entity with a single + // anonymous column (see kDefaultWideColumnName) which contains the value. + // + // "statuses[i]" is set to OK if "keys[i]" is successfully retrieved. It is + // set to NotFound and an empty wide-column entity is returned in "results[i]" + // if there is no entry for "keys[i]". Finally, "statuses[i]" is set to some + // other non-OK status on error. + // + // If "keys" are sorted by column family id and within each column family, + // according to the column family's comparator, the "sorted_input" flag can be + // set for a small performance improvement. + // + // Note that it is the caller's responsibility to ensure that + // "column_families", "keys", "results", and "statuses" point to "num_keys" + // number of contiguous objects (ColumnFamilyHandle pointers, Slices, + // PinnableWideColumns, and Statuses respectively). + virtual void MultiGetEntity(const ReadOptions& /* options */, size_t num_keys, + ColumnFamilyHandle** /* column_families */, + const Slice* /* keys */, + PinnableWideColumns* /* results */, + Status* statuses, + bool /* sorted_input */ = false) { + for (size_t i = 0; i < num_keys; ++i) { + statuses[i] = Status::NotSupported("MultiGetEntity not supported"); + } + } + + // Batched MultiGet-like API that returns attribute groups. + // An "attribute group" refers to a logical grouping of wide-column entities + // within RocksDB. These attribute groups are implemented using column + // families. 
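As a companion to the batched wide-column reads described above, a sketch of the single-column-family MultiGetEntity() overload (`db` and `cf` assumed as before):

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

void MultiGetEntitySketch(ROCKSDB_NAMESPACE::DB* db,
                          ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf) {
  using namespace ROCKSDB_NAMESPACE;
  constexpr size_t kNumKeys = 2;
  Slice keys[kNumKeys] = {"user1", "user2"};
  PinnableWideColumns results[kNumKeys];
  Status statuses[kNumKeys];
  db->MultiGetEntity(ReadOptions(), cf, kNumKeys, keys, results, statuses,
                     /*sorted_input=*/false);
  for (size_t i = 0; i < kNumKeys; ++i) {
    if (statuses[i].ok()) {
      // results[i].columns() holds the entity (or single anonymous column)
      // for keys[i]; NotFound yields an empty entity.
    }
  }
}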
Attribute group allows users to group wide-columns based on + // various criteria, such as similar access patterns or data types + // + // The input is a list of keys and PinnableAttributeGroups. For any given + // keys[i] (where 0 <= i < num_keys), results[i] will contain result for the + // ith key. Each result will be returned as PinnableAttributeGroups. + // PinnableAttributeGroups is a vector of PinnableAttributeGroup. Each + // PinnableAttributeGroup will contain a ColumnFamilyHandle pointer, Status + // and PinnableWideColumns. + // + // Note that it is the caller's responsibility to ensure that + // "keys" and "results" have the same "num_keys" number of objects. Also + // PinnableAttributeGroup needs to have ColumnFamilyHandle pointer set + // properly to get the corresponding wide columns from the column family. + virtual void MultiGetEntity(const ReadOptions& /* options */, size_t num_keys, + const Slice* /* keys */, + PinnableAttributeGroups* results) { + for (size_t i = 0; i < num_keys; ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { + results[i][j].SetStatus( + Status::NotSupported("MultiGetEntity not supported")); + } + } + } + // If the key definitely does not exist in the database, then this method // returns false, else true. If the caller wants to obtain value when the key // is found in memory, a bool for 'value_found' must be passed. 'value_found' @@ -869,7 +1002,6 @@ class DB { // use "snapshot" after this call. virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; -#ifndef ROCKSDB_LITE // Contains all valid property arguments for GetProperty() or // GetMapProperty(). Each is a "string" property for retrieval with // GetProperty() unless noted as a "map" property, for GetMapProperty(). @@ -911,6 +1043,18 @@ class DB { // level, as well as the histogram of latency of single requests. static const std::string kCFFileHistogram; + // "rocksdb.cf-write-stall-stats" - returns a multi-line string or + // map with statistics on CF-scope write stalls for a given CF + // See`WriteStallStatsMapKeys` for structured representation of keys + // available in the map form. + static const std::string kCFWriteStallStats; + + // "rocksdb.db-write-stall-stats" - returns a multi-line string or + // map with statistics on DB-scope write stalls + // See`WriteStallStatsMapKeys` for structured representation of keys + // available in the map form. + static const std::string kDBWriteStallStats; + // "rocksdb.dbstats" - As a string property, returns a multi-line string // with general database stats, both cumulative (over the db's // lifetime) and interval (since the last retrieval of kDBStats). @@ -1042,12 +1186,12 @@ class DB { static const std::string kMinObsoleteSstNumberToKeep; // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST - // files. + // files belonging to any of the CF's versions. // WARNING: may slow down online queries if there are too many files. static const std::string kTotalSstFilesSize; // "rocksdb.live-sst-files-size" - returns total size (bytes) of all SST - // files belong to the latest LSM tree. + // files belong to the CF's current version. static const std::string kLiveSstFilesSize; // "rocksdb.live-non-bottommost-sst-files-size" - returns total size @@ -1055,6 +1199,15 @@ class DB { // to the latest LSM tree. 
static const std::string kLiveNonBottommostSstFilesSize; + // "rocksdb.obsolete-sst-files-size" - returns total size (bytes) of all + // SST files that became obsolete but have not yet been deleted or + // scheduled for deletion. SST files can end up in this state when + // using `DisableFileDeletions()`, for example. + // + // N.B. Unlike the other "*SstFilesSize" properties, this property + // includes SST files that originated in any of the DB's CFs. + static const std::string kObsoleteSstFilesSize; + // "rocksdb.live_sst_files_size_at_temperature" - returns total size (bytes) // of SST files at all certain file temperature static const std::string kLiveSstFilesSizeAtTemperature; @@ -1140,7 +1293,6 @@ class DB { // entries being pinned in blob cache. static const std::string kBlobCachePinnedUsage; }; -#endif /* ROCKSDB_LITE */ // DB implementations export properties about their state via this method. // If "property" is a valid "string" property understood by this DB @@ -1189,6 +1341,7 @@ class DB { // "rocksdb.min-obsolete-sst-number-to-keep" // "rocksdb.total-sst-files-size" // "rocksdb.live-sst-files-size" + // "rocksdb.obsolete-sst-files-size" // "rocksdb.base-level" // "rocksdb.estimate-pending-compaction-bytes" // "rocksdb.num-running-compactions" @@ -1293,6 +1446,9 @@ class DB { // the files. In this case, client could set options.change_level to true, to // move the files back to the minimum level capable of holding the data set // or a given level (specified by non-negative options.target_level). + // + // In case of user-defined timestamp, if enabled, `begin` and `end` should + // not contain timestamp. virtual Status CompactRange(const CompactRangeOptions& options, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end) = 0; @@ -1466,6 +1622,18 @@ class DB { // DisableManualCompaction() has been called. virtual void EnableManualCompaction() = 0; + // Wait for all flush and compactions jobs to finish. Jobs to wait include the + // unscheduled (queued, but not scheduled yet). If the db is shutting down, + // Status::ShutdownInProgress will be returned. + // + // NOTE: This may also never return if there's sufficient ongoing writes that + // keeps flush and compaction going without stopping. The user would have to + // cease all the writes to DB to make this eventually return in a stable + // state. The user may also use timeout option in WaitForCompactOptions to + // make this stop waiting and return when timeout expires. + virtual Status WaitForCompact( + const WaitForCompactOptions& /* wait_for_compact_options */) = 0; + // Number of levels used for this DB. virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } @@ -1505,7 +1673,7 @@ class DB { virtual DBOptions GetDBOptions() const = 0; - // Flush all mem-table data. + // Flush all memtable data. // Flush a single column family, even when atomic flush is enabled. To flush // multiple column families, use Flush(options, column_families). virtual Status Flush(const FlushOptions& options, @@ -1513,7 +1681,7 @@ class DB { virtual Status Flush(const FlushOptions& options) { return Flush(options, DefaultColumnFamily()); } - // Flushes multiple column families. + // Flushes memtables of multiple column families. // If atomic flush is not enabled, Flush(options, column_families) is // equivalent to calling Flush(options, column_family) multiple times. 
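As a usage sketch for the WaitForCompact() API documented above: the close_db and timeout fields are the ones mentioned in the comments, the 60-second value is an arbitrary example, and the std::chrono-based timeout type is an assumption.

#include <chrono>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::Status DrainAndClose(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::WaitForCompactOptions wait_opts;
  wait_opts.close_db = true;  // close the DB once background work is finished
  wait_opts.timeout = std::chrono::seconds(60);  // stop waiting after 60s
  return db->WaitForCompact(wait_opts);
}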
// If atomic flush is enabled, Flush(options, column_families) will flush all @@ -1525,29 +1693,41 @@ class DB { const FlushOptions& options, const std::vector<ColumnFamilyHandle*>& column_families) = 0; - // Flush the WAL memory buffer to the file. If sync is true, it calls SyncWAL - // afterwards. + // When using the manual_wal_flush option, flushes RocksDB internal buffers + // of WAL data to the file, so that the data can survive process crash or be + // included in a Checkpoint or Backup. Without manual_wal_flush, there is no + // such internal buffer. If sync is true, it calls SyncWAL() afterwards. virtual Status FlushWAL(bool /*sync*/) { return Status::NotSupported("FlushWAL not implemented"); } - // Sync the wal. Note that Write() followed by SyncWAL() is not exactly the - // same as Write() with sync=true: in the latter case the changes won't be - // visible until the sync is done. - // Currently only works if allow_mmap_writes = false in Options. + + // Ensure all WAL writes have been synced to storage, so that (assuming OS + // and hardware support) data will survive power loss. This function does + // not imply FlushWAL, so `FlushWAL(true)` is recommended if using + // manual_wal_flush=true. Currently only works if allow_mmap_writes = false + // in Options. + // + // Note that Write() followed by SyncWAL() is not exactly the same as Write() + // with sync=true: in the latter case the changes won't be visible until the + // sync is done. virtual Status SyncWAL() = 0; - // Lock the WAL. Also flushes the WAL after locking. - // After this method returns ok, writes to the database will be stopped until - // UnlockWAL() is called. - // This method may internally acquire and release DB mutex and the WAL write - // mutex, but after it returns, neither mutex is held by caller. + // Freezes the logical state of the DB (by stopping writes), and if WAL is + // enabled, ensures that state has been flushed to DB files (as in + // FlushWAL()). This can be used for taking a Checkpoint at a known DB + // state, though the user must use options to ensure no DB flush is invoked + // in this frozen state. Other operations allowed on a "read only" DB should + // work while frozen. Each LockWAL() call that returns OK must eventually be + // followed by a corresponding call to UnlockWAL(). Where supported, non-OK + // status is generally only possible with some kind of corruption or I/O + // error. virtual Status LockWAL() { return Status::NotSupported("LockWAL not implemented"); } - // Unlock the WAL. - // The write stop on the database will be cleared. - // This method may internally acquire and release DB mutex. + // Unfreeze the DB state from a successful LockWAL(). + // The write stop on the database will be cleared when UnlockWAL() has been + // called for each successful LockWAL(). virtual Status UnlockWAL() { return Status::NotSupported("UnlockWAL not implemented"); } @@ -1571,7 +1751,17 @@ class DB { virtual Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family, std::string* ts_low) = 0; - // Allow compactions to delete obsolete files. + // Enable deleting obsolete files. + // Usually users should only need to call this if they have previously called + // `DisableFileDeletions`. + // File deletions disabling and enabling is not controlled by a binary flag, + // instead it's represented as a counter to allow different callers to + // independently disable file deletion. Disabling file deletion can be + // critical for operations like making a backup.
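A sketch of the LockWAL()/UnlockWAL() pairing described above: freeze writes, do work that needs a stable DB state, then unfreeze; each successful LockWAL() must be matched by an UnlockWAL().

#include "rocksdb/db.h"

ROCKSDB_NAMESPACE::Status FrozenStateWork(ROCKSDB_NAMESPACE::DB* db) {
  ROCKSDB_NAMESPACE::Status s = db->LockWAL();
  if (!s.ok()) {
    return s;
  }
  // ... read-only work at a known DB state, e.g. taking a Checkpoint ...
  return db->UnlockWAL();
}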
So the counter implementation + // makes the file deletion disabled as long as there is one caller requesting + // so, and only when every caller agrees to re-enable file deletion, it will + // be enabled. So be careful when calling this function with force = true as + // explained below. // If force == true, the call to EnableFileDeletions() will guarantee that // file deletions are enabled after the call, even if DisableFileDeletions() // was called multiple times before. @@ -1580,9 +1770,8 @@ class DB { // enabling the two methods to be called by two threads concurrently without // synchronization -- i.e., file deletions will be enabled only after both // threads call EnableFileDeletions() - virtual Status EnableFileDeletions(bool force = true) = 0; + virtual Status EnableFileDeletions(bool force) = 0; -#ifndef ROCKSDB_LITE // Retrieves the creation time of the oldest file in the DB. // This API only works if max_open_files = -1, if it is not then // Status returned is Status::NotSupported() @@ -1749,11 +1938,12 @@ class DB { const std::vector& args) = 0; // CreateColumnFamilyWithImport() will create a new column family with - // column_family_name and import external SST files specified in metadata into - // this column family. + // column_family_name and import external SST files specified in `metadata` + // into this column family. // (1) External SST files can be created using SstFileWriter. // (2) External SST files can be exported from a particular column family in - // an existing DB using Checkpoint::ExportColumnFamily. + // an existing DB using Checkpoint::ExportColumnFamily. `metadata` should + // be the output from Checkpoint::ExportColumnFamily. // Option in import_options specifies whether the external files are copied or // moved (default is copy). When option specifies copy, managing files at // external_file_path is caller's responsibility. When option specifies a @@ -1767,9 +1957,40 @@ class DB { virtual Status CreateColumnFamilyWithImport( const ColumnFamilyOptions& options, const std::string& column_family_name, const ImportColumnFamilyOptions& import_options, - const ExportImportFilesMetaData& metadata, + const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { + const std::vector& metadatas{&metadata}; + return CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadatas, handle); + } + + // EXPERIMENTAL + // Overload of the CreateColumnFamilyWithImport() that allows the caller to + // pass a list of ExportImportFilesMetaData pointers to support creating + // ColumnFamily by importing multiple ColumnFamilies. + // It should be noticed that if the user keys of the imported column families + // overlap with each other, an error will be returned. + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const std::vector& metadatas, ColumnFamilyHandle** handle) = 0; + // EXPERIMENTAL + // ClipColumnFamily() will clip the entries in the CF according to the range + // [begin_key, end_key). Returns OK on success, and a non-OK status on error. + // Any entries outside this range will be completely deleted (including + // tombstones). + // The main difference between ClipColumnFamily(begin, end) and + // DeleteRange(begin, end) + // is that the former physically deletes all keys outside the range, but is + // more heavyweight than the latter. 
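A sketch of the export/import flow referenced above: export a column family from a source DB via Checkpoint::ExportColumnFamily(), then import it into another DB with CreateColumnFamilyWithImport(). Paths and names are arbitrary examples; error handling is elided.

#include "rocksdb/db.h"
#include "rocksdb/utilities/checkpoint.h"

void ExportImportSketch(ROCKSDB_NAMESPACE::DB* src_db,
                        ROCKSDB_NAMESPACE::ColumnFamilyHandle* src_cf,
                        ROCKSDB_NAMESPACE::DB* dst_db) {
  using namespace ROCKSDB_NAMESPACE;
  Checkpoint* checkpoint = nullptr;
  Status s = Checkpoint::Create(src_db, &checkpoint);
  ExportImportFilesMetaData* metadata = nullptr;
  s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);

  ImportColumnFamilyOptions import_opts;
  import_opts.move_files = false;  // copy the exported files rather than move
  ColumnFamilyHandle* imported_cf = nullptr;
  s = dst_db->CreateColumnFamilyWithImport(ColumnFamilyOptions(),
                                           "imported_cf", import_opts,
                                           *metadata, &imported_cf);
  delete metadata;
  delete checkpoint;
}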
+ // This feature is mainly used to ensure that there is no overlapping Key when + // calling CreateColumnFamilyWithImport() to import multiple CFs. + // Note that: concurrent updates cannot be performed during Clip. + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of // table files are checked. virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { @@ -1782,8 +2003,6 @@ class DB { virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } -#endif // ROCKSDB_LITE - // Returns the unique ID which is read from IDENTITY file during the opening // of database by setting in the identity variable // Returns Status::OK if identity could be set properly @@ -1799,8 +2018,6 @@ class DB { // Returns default column family handle virtual ColumnFamilyHandle* DefaultColumnFamily() const = 0; -#ifndef ROCKSDB_LITE - virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) = 0; virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) { @@ -1866,8 +2083,6 @@ class DB { return Status::NotSupported("NewDefaultReplayer() is not implemented."); } -#endif // ROCKSDB_LITE - // Needed for StackableDB virtual DB* GetRootDB() { return this; } @@ -1880,7 +2095,6 @@ class DB { return Status::NotSupported("GetStatsHistory() is not implemented."); } -#ifndef ROCKSDB_LITE // Make the secondary instance catch up with the primary by tailing and // replaying the MANIFEST and WAL of the primary. // Column families created by the primary after the secondary instance starts @@ -1894,7 +2108,6 @@ class DB { virtual Status TryCatchUpWithPrimary() { return Status::NotSupported("Supported only by secondary instance"); } -#endif // !ROCKSDB_LITE // Generate new MANIFEST file during next kManifestWrite virtual void NewManifestOnNextUpdate() = 0; @@ -1904,6 +2117,24 @@ class DB { virtual uint64_t GetNextFileNumber() const = 0; }; +struct WriteStallStatsMapKeys { + static const std::string& TotalStops(); + static const std::string& TotalDelays(); + + static const std::string& CFL0FileCountLimitDelaysWithOngoingCompaction(); + static const std::string& CFL0FileCountLimitStopsWithOngoingCompaction(); + + // REQUIRES: + // `cause` isn't any of these: `WriteStallCause::kNone`, + // `WriteStallCause::kCFScopeWriteStallCauseEnumMax`, + // `WriteStallCause::kDBScopeWriteStallCauseEnumMax` + // + // REQUIRES: + // `condition` isn't any of these: `WriteStallCondition::kNormal` + static std::string CauseConditionCount(WriteStallCause cause, + WriteStallCondition condition); +}; + // Overloaded operators for enum class SizeApproximationFlags. inline DB::SizeApproximationFlags operator&(DB::SizeApproximationFlags lhs, DB::SizeApproximationFlags rhs) { @@ -1936,7 +2167,6 @@ Status DestroyDB(const std::string& name, const Options& options, const std::vector& column_families = std::vector()); -#ifndef ROCKSDB_LITE // If a DB cannot be opened, you may attempt to call this method to // resurrect as much of the contents of the database as possible. 
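A sketch of reading the new write-stall statistics through GetMapProperty(), combining the kCFWriteStallStats property with the WriteStallStatsMapKeys accessors introduced in this patch:

#include <map>
#include <string>
#include "rocksdb/db.h"

void WriteStallStatsSketch(ROCKSDB_NAMESPACE::DB* db) {
  using namespace ROCKSDB_NAMESPACE;
  std::map<std::string, std::string> stall_stats;
  if (db->GetMapProperty(db->DefaultColumnFamily(),
                         DB::Properties::kCFWriteStallStats, &stall_stats)) {
    // Counts are returned as strings keyed by the structured map keys.
    const std::string& total_stops =
        stall_stats[WriteStallStatsMapKeys::TotalStops()];
    const std::string& total_delays =
        stall_stats[WriteStallStatsMapKeys::TotalDelays()];
    (void)total_stops;
    (void)total_delays;
  }
}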
// Some data may be lost, so be careful when calling this function @@ -1958,7 +2188,6 @@ Status RepairDB(const std::string& dbname, const DBOptions& db_options, // @param options These options will be used for the database and for ALL column // families encountered during the repair Status RepairDB(const std::string& dbname, const Options& options); -#endif void SetThreadLogging(bool v); bool GetThreadLogging(); diff --git a/include/rocksdb/db_dump_tool.h b/include/rocksdb/db_dump_tool.h index b7d4766a2f0f..2c97bad75570 100644 --- a/include/rocksdb/db_dump_tool.h +++ b/include/rocksdb/db_dump_tool.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include @@ -42,4 +41,3 @@ class DbUndumpTool { ROCKSDB_NAMESPACE::Options options = ROCKSDB_NAMESPACE::Options()); }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bef60a2124d2..7b0220635ec7 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -27,6 +27,7 @@ #include "rocksdb/customizable.h" #include "rocksdb/functor_wrapper.h" +#include "rocksdb/port_defs.h" #include "rocksdb/status.h" #include "rocksdb/thread_status.h" @@ -69,13 +70,6 @@ struct ConfigOptions; const size_t kDefaultPageSize = 4 * 1024; -enum class CpuPriority { - kIdle = 0, - kLow = 1, - kNormal = 2, - kHigh = 3, -}; - // Options while opening a file to read/write struct EnvOptions { // Construct with default Options @@ -179,17 +173,6 @@ class Env : public Customizable { // should implement this method. const char* Name() const override { return ""; } - // Loads the environment specified by the input value into the result - // The CreateFromString alternative should be used; this method may be - // deprecated in a future release. - static Status LoadEnv(const std::string& value, Env** result); - - // Loads the environment specified by the input value into the result - // The CreateFromString alternative should be used; this method may be - // deprecated in a future release. - static Status LoadEnv(const std::string& value, Env** result, - std::shared_ptr* guard); - // Loads the environment specified by the input value into the result // @see Customizable for a more detailed description of the parameters and // return codes @@ -297,7 +280,7 @@ class Env : public Customizable { const EnvOptions& options); // Open `fname` for random read and write, if file doesn't exist the file - // will be created. On success, stores a pointer to the new file in + // will not be created. On success, stores a pointer to the new file in // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. @@ -453,6 +436,21 @@ class Env : public Customizable { IO_TOTAL = 4 }; + // EXPERIMENTAL + enum class IOActivity : uint8_t { + kFlush = 0, + kCompaction = 1, + kDBOpen = 2, + kGet = 3, + kMultiGet = 4, + kDBIterator = 5, + kVerifyDBChecksum = 6, + kVerifyFileChecksums = 7, + kGetEntity = 8, + kMultiGetEntity = 9, + kUnknown, // Keep last for easy array of non-unknowns + }; + // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' // priority thread pool. 
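For the background scheduling interface mentioned above, a small sketch that runs a callback on the default Env's LOW-priority thread pool (the std::string payload is an arbitrary example; it must outlive the scheduled callback):

#include <string>
#include "rocksdb/env.h"

static void BackgroundWork(void* arg) {
  auto* payload = static_cast<std::string*>(arg);
  // ... process *payload on the background thread ...
  (void)payload;
}

void ScheduleSketch(std::string* payload) {
  ROCKSDB_NAMESPACE::Env::Default()->Schedule(
      &BackgroundWork, payload, ROCKSDB_NAMESPACE::Env::Priority::LOW);
}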
@@ -884,6 +882,9 @@ class WritableFile { WritableFile(const WritableFile&) = delete; void operator=(const WritableFile&) = delete; + // For cases when Close() hasn't been called, many derived classes of + // WritableFile will need to call Close() non-virtually in their destructor, + // and ignore the result, to ensure resources are released. virtual ~WritableFile(); // Append data to the end of the file @@ -947,6 +948,12 @@ class WritableFile { // size due to whole pages writes. The behavior is undefined if called // with other writes to follow. virtual Status Truncate(uint64_t /*size*/) { return Status::OK(); } + + // The caller should call Close() before destroying the WritableFile to + // surface any errors associated with finishing writes to the file. + // The file is considered closed regardless of return status. + // (However, implementations must also clean up properly in the destructor + // even if Close() is not called.) virtual Status Close() = 0; virtual Status Flush() = 0; virtual Status Sync() = 0; // sync data @@ -1093,6 +1100,9 @@ class RandomRWFile { RandomRWFile(const RandomRWFile&) = delete; RandomRWFile& operator=(const RandomRWFile&) = delete; + // For cases when Close() hasn't been called, many derived classes of + // RandomRWFile will need to call Close() non-virtually in their destructor, + // and ignore the result, to ensure resources are released. virtual ~RandomRWFile() {} // Indicates if the class makes use of direct I/O @@ -1124,6 +1134,11 @@ class RandomRWFile { virtual Status Fsync() { return Sync(); } + // The caller should call Close() before destroying the RandomRWFile to + // surface any errors associated with finishing writes to the file. + // The file is considered closed regardless of return status. + // (However, implementations must also clean up properly in the destructor + // even if Close() is not called.) virtual Status Close() = 0; // If you're adding methods here, remember to add them to @@ -1156,10 +1171,14 @@ class MemoryMappedFileBuffer { // filesystem operations that can be executed on directories. class Directory { public: + // Many derived classes of Directory will need to call Close() in their + // destructor, when not called already, to ensure resources are released. virtual ~Directory() {} // Fsync directory. Can be called concurrently from multiple threads. virtual Status Fsync() = 0; - // Close directory. + // Calling Close() before destroying a Directory is recommended to surface + // any errors associated with finishing writes (in case of future features). + // The directory is considered closed regardless of return status. virtual Status Close() { return Status::NotSupported("Close"); } virtual size_t GetUniqueId(char* /*id*/, size_t /*max_size*/) const { @@ -1197,9 +1216,11 @@ class Logger { virtual ~Logger(); - // Close the log file. Must be called before destructor. If the return - // status is NotSupported(), it means the implementation does cleanup in - // the destructor + // Because Logger is typically a shared object, Close() may or may not be + // called before the object is destroyed, but is recommended to reveal any + // final errors in finishing outstanding writes. No other functions are + // supported after calling Close(), and the Logger is considered closed + // regardless of return status. 
virtual Status Close(); // Write a header to the log file with the specified format @@ -1660,10 +1681,8 @@ class EnvWrapper : public Env { target_.env->SanitizeEnvOptions(env_opts); } Status PrepareOptions(const ConfigOptions& options) override; -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const override; -#endif // ROCKSDB_LITE private: Target target_; diff --git a/include/rocksdb/env_encryption.h b/include/rocksdb/env_encryption.h index 282db6ed4138..6feae06811b9 100644 --- a/include/rocksdb/env_encryption.h +++ b/include/rocksdb/env_encryption.h @@ -5,7 +5,6 @@ #pragma once -#if !defined(ROCKSDB_LITE) #include @@ -68,7 +67,7 @@ class BlockAccessCipherStream { // including data loss, unreported corruption, deadlocks, and more. class BlockCipher : public Customizable { public: - virtual ~BlockCipher(){}; + virtual ~BlockCipher() {} // Creates a new BlockCipher from the input config_options and value // The value describes the type of provider (and potentially optional @@ -115,13 +114,13 @@ class BlockCipher : public Customizable { // including data loss, unreported corruption, deadlocks, and more. class EncryptionProvider : public Customizable { public: - virtual ~EncryptionProvider(){}; + virtual ~EncryptionProvider() {} - // Creates a new EncryptionProvider from the input config_options and value + // Creates a new EncryptionProvider from the input config_options and value. // The value describes the type of provider (and potentially optional // configuration parameters) used to create this provider. // For example, if the value is "CTR", a CTREncryptionProvider will be - // created. If the value is ends with "://test" (e.g CTR://test"), the + // created. If the value is end with "://test" (e.g CTR://test"), the // provider will be initialized in "TEST" mode prior to being returned. // // @param config_options Options to control how this provider is created @@ -154,7 +153,7 @@ class EncryptionProvider : public Customizable { size_t prefixLength) const = 0; // Method to add a new cipher key for use by the EncryptionProvider. - // @param description Descriptor for this key. + // @param descriptor Descriptor for this key // @param cipher The cryptographic key to use // @param len The length of the cipher key // @param for_write If true, this cipher should be used for writing files. @@ -166,15 +165,15 @@ class EncryptionProvider : public Customizable { size_t len, bool for_write) = 0; // CreateCipherStream creates a block access cipher stream for a file given - // given name and options. + // name and options. virtual Status CreateCipherStream( const std::string& fname, const EnvOptions& options, Slice& prefix, std::unique_ptr* result) = 0; // Returns a string representing an encryption marker prefix for this // provider. If a marker is provided, this marker can be used to tell whether - // or not a file is encrypted by this provider. The maker will also be part - // of any encryption prefix for this provider. + // a file is encrypted by this provider. The marker will also be part of any + // encryption prefix for this provider. virtual std::string GetMarker() const { return ""; } }; @@ -183,7 +182,7 @@ class EncryptedSequentialFile : public FSSequentialFile { std::unique_ptr file_; std::unique_ptr stream_; uint64_t offset_; - size_t prefixLength_; + const size_t prefixLength_; public: // Default ctor. 
Given underlying sequential file is supposed to be at @@ -196,47 +195,22 @@ class EncryptedSequentialFile : public FSSequentialFile { offset_(prefixLength), prefixLength_(prefixLength) {} - // Read up to "n" bytes from the file. "scratch[0..n-1]" may be - // written by this routine. Sets "*result" to the data that was - // read (including if fewer than "n" bytes were successfully read). - // May set "*result" to point at data in "scratch[0..n-1]", so - // "scratch[0..n-1]" must be live when "*result" is used. - // If an error was encountered, returns a non-OK status. - // - // REQUIRES: External synchronization IOStatus Read(size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) override; - // Skip "n" bytes from the file. This is guaranteed to be no - // slower that reading the same data, but may be faster. - // - // If end of file is reached, skipping will stop at the end of the - // file, and Skip will return OK. - // - // REQUIRES: External synchronization IOStatus Skip(uint64_t n) override; - // Indicates the upper layers if the current SequentialFile implementation - // uses direct IO. bool use_direct_io() const override; - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O size_t GetRequiredBufferAlignment() const override; - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. IOStatus InvalidateCache(size_t offset, size_t length) override; - // Positioned Read for direct I/O - // If Direct I/O enabled, offset, n, and scratch should be properly aligned IOStatus PositionedRead(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) override; }; -// A file abstraction for randomly reading the contents of a file. class EncryptedRandomAccessFile : public FSRandomAccessFile { protected: std::unique_ptr file_; @@ -251,60 +225,24 @@ class EncryptedRandomAccessFile : public FSRandomAccessFile { stream_(std::move(s)), prefixLength_(prefixLength) {} - // Read up to "n" bytes from the file starting at "offset". - // "scratch[0..n-1]" may be written by this routine. Sets "*result" - // to the data that was read (including if fewer than "n" bytes were - // successfully read). May set "*result" to point at data in - // "scratch[0..n-1]", so "scratch[0..n-1]" must be live when - // "*result" is used. If an error was encountered, returns a non-OK - // status. - // - // Safe for concurrent use by multiple threads. - // If Direct I/O enabled, offset, n, and scratch should be aligned properly. IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const override; - // Readahead the file starting from offset by n bytes for caching. IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) override; - // Tries to get an unique ID for this file that will be the same each time - // the file is opened (and will stay the same while the file is open). - // Furthermore, it tries to make this ID at most "max_size" bytes. If such an - // ID can be created this function returns the length of the ID and places it - // in "id"; otherwise, this function returns 0, in which case "id" - // may not have been modified. 
- // - // This function guarantees, for IDs from a given environment, two unique ids - // cannot be made equal to each other by adding arbitrary bytes to one of - // them. That is, no unique ID is the prefix of another. - // - // This function guarantees that the returned ID will not be interpretable as - // a single varint. - // - // Note: these IDs are only valid for the duration of the process. size_t GetUniqueId(char* id, size_t max_size) const override; void Hint(AccessPattern pattern) override; - // Indicates the upper layers if the current RandomAccessFile implementation - // uses direct IO. bool use_direct_io() const override; - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O size_t GetRequiredBufferAlignment() const override; - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. IOStatus InvalidateCache(size_t offset, size_t length) override; }; -// A file abstraction for sequential writing. The implementation -// must provide buffering since callers may append small fragments -// at a time to the file. class EncryptedWritableFile : public FSWritableFile { protected: std::unique_ptr file_; @@ -329,50 +267,22 @@ class EncryptedWritableFile : public FSWritableFile { const IOOptions& options, IODebugContext* dbg) override; - // true if Sync() and Fsync() are safe to call concurrently with Append() - // and Flush(). bool IsSyncThreadSafe() const override; - // Indicates the upper layers if the current WritableFile implementation - // uses direct IO. bool use_direct_io() const override; - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O size_t GetRequiredBufferAlignment() const override; - /* - * Get the size of valid data in the file. - */ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override; - // Truncate is necessary to trim the file to the correct size - // before closing. It is not always possible to keep track of the file - // size due to whole pages writes. The behavior is undefined if called - // with other writes to follow. IOStatus Truncate(uint64_t size, const IOOptions& options, IODebugContext* dbg) override; - // Remove any kind of caching of data from the offset to offset+length - // of this file. If the length is 0, then it refers to the end of file. - // If the system is not caching the file contents, then this is a noop. - // This call has no effect on dirty pages in the cache. IOStatus InvalidateCache(size_t offset, size_t length) override; - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options, IODebugContext* dbg) override; - // PrepareWrite performs any necessary preparation for a write - // before the write actually occurs. This allows for pre-allocation - // of space on devices where it can result in less file - // fragmentation and/or less waste from over-zealous filesystem - // pre-allocation. 
void PrepareWrite(size_t offset, size_t len, const IOOptions& options, IODebugContext* dbg) override; @@ -381,7 +291,6 @@ class EncryptedWritableFile : public FSWritableFile { void GetPreallocationStatus(size_t* block_size, size_t* last_allocated_block) override; - // Pre-allocates space for a file. IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options, IODebugContext* dbg) override; @@ -392,7 +301,6 @@ class EncryptedWritableFile : public FSWritableFile { IOStatus Close(const IOOptions& options, IODebugContext* dbg) override; }; -// A file abstraction for random reading and writing. class EncryptedRandomRWFile : public FSRandomRWFile { protected: std::unique_ptr file_; @@ -407,22 +315,13 @@ class EncryptedRandomRWFile : public FSRandomRWFile { stream_(std::move(s)), prefixLength_(prefixLength) {} - // Indicates if the class makes use of direct I/O - // If false you must pass aligned buffer to Write() bool use_direct_io() const override; - // Use the returned alignment value to allocate - // aligned buffer for Direct I/O size_t GetRequiredBufferAlignment() const override; - // Write bytes in `data` at offset `offset`, Returns Status::OK() on success. - // Pass aligned buffer when use_direct_io() returns true. IOStatus Write(uint64_t offset, const Slice& data, const IOOptions& options, IODebugContext* dbg) override; - // Read up to `n` bytes starting from offset `offset` and store them in - // result, provided `scratch` size should be at least `n`. - // Returns Status::OK() on success. IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const override; @@ -462,4 +361,3 @@ class EncryptedFileSystem : public FileSystemWrapper { }; } // namespace ROCKSDB_NAMESPACE -#endif // !defined(ROCKSDB_LITE) diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 91ad47218e68..647aad6c9455 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -79,6 +79,10 @@ enum class IOType : uint8_t { kInvalid, }; +// enum representing various operations supported by underlying FileSystem. +// These need to be set in SupportedOps API for RocksDB to use them. +enum FSSupportedOps { kAsyncIO, kFSBuffer }; + // Per-request options that can be passed down to the FileSystem // implementation. These are hints and are not necessarily guaranteed to be // honored. More hints can be added here in the future to indicate things like @@ -116,6 +120,9 @@ struct IOOptions { // directories and list only files in GetChildren API. bool do_not_recurse; + // EXPERIMENTAL + Env::IOActivity io_activity = Env::IOActivity::kUnknown; + IOOptions() : IOOptions(false) {} explicit IOOptions(bool force_dir_fsync_) @@ -272,12 +279,6 @@ class FileSystem : public Customizable { static const char* Type() { return "FileSystem"; } static const char* kDefaultName() { return "DefaultFileSystem"; } - // Loads the FileSystem specified by the input value into the result - // The CreateFromString alternative should be used; this method may be - // deprecated in a future release. - static Status Load(const std::string& value, - std::shared_ptr* result); - // Loads the FileSystem specified by the input value into the result // @see Customizable for a more detailed description of the parameters and // return codes @@ -390,7 +391,7 @@ class FileSystem : public Customizable { IODebugContext* dbg); // Open `fname` for random read and write, if file doesn't exist the file - // will be created. 
On success, stores a pointer to the new file in + // will not be created. On success, stores a pointer to the new file in // *result and returns OK. On failure returns non-OK. // // The returned file will only be accessed by one thread at a time. @@ -661,7 +662,6 @@ class FileSystem : public Customizable { const IOOptions& options, bool* is_dir, IODebugContext* /*dgb*/) = 0; - // EXPERIMENTAL // Poll for completion of read IO requests. The Poll() method should call the // callback functions to indicate completion of read requests. // Underlying FS is required to support Poll API. Poll implementation should // after the callback has been called. // If Poll returns partial results for any reads, its caller reponsibility to // call Read or ReadAsync in order to get the remaining bytes. - // - // Default implementation is to return IOStatus::OK. - virtual IOStatus Poll(std::vector& /*io_handles*/, size_t /*min_completions*/) { return IOStatus::OK(); } - // EXPERIMENTAL // Abort the read IO requests submitted asynchronously. Underlying FS is // required to support AbortIO API. AbortIO implementation should ensure that // the all the read requests related to io_handles should be aborted and // it shouldn't call the callback for these io_handles. - // - // Default implementation is to return IOStatus::OK. virtual IOStatus AbortIO(std::vector& /*io_handles*/) { return IOStatus::OK(); } + // Indicates to upper layers which FileSystem operations mentioned in + // FSSupportedOps are supported by underlying FileSystem. Each bit in + // supported_ops argument represents the corresponding FSSupportedOps operation. + // For example: + // If async_io is supported by the underlying FileSystem, then supported_ops + // will have the corresponding bit (i.e. FSSupportedOps::kAsyncIO) set to 1. + // + // By default, the async_io operation is set and FS should override this API and + // set all the operations it supports provided in FSSupportedOps (including + // async_io). + virtual void SupportedOps(int64_t& supported_ops) { + supported_ops = 0; + supported_ops |= (1 << FSSupportedOps::kAsyncIO); + } + + // If you're adding methods here, remember to add them to EnvWrapper too. private: @@ -791,6 +800,42 @@ struct FSReadRequest { // Output parameter set by underlying FileSystem that represents status of // read request. IOStatus status; + + // fs_scratch is a data buffer allocated and provided by underlying FileSystem + // to RocksDB during reads, when FS wants to provide its own buffer with data + // instead of using RocksDB provided FSReadRequest::scratch. + // + // FileSystem needs to provide a buffer and a custom delete function. The + // lifecycle of fs_scratch lasts until the data is used by RocksDB. The buffer + // should then be released by RocksDB using the custom delete function provided in + // unique_ptr fs_scratch. + // + // Optimization benefits: + // This is helpful in cases where the underlying FileSystem has to do an additional + // copy of data to the RocksDB provided buffer, which can consume CPU cycles. It + // can be optimized by avoiding the copy and directly using the + // FS provided buffer. + // + // How to enable: + // In order to enable this option, FS needs to override SupportedOps() API and + // set FSSupportedOps::kFSBuffer in SupportedOps() as: + // { + // supported_ops |= (1 << FSSupportedOps::kFSBuffer); + // } + + // Work in progress: + // Right now it's only enabled for MultiReads (sync and async + // both) with non-direct IO.
+ // If RocksDB provide its own buffer (scratch) during reads, that's a + // signal for FS to use RocksDB buffer. + // If FSSupportedOps::kFSBuffer is enabled and scratch == nullptr, + // then FS have to provide its own buffer in fs_scratch. + // + // NOTE: + // - FSReadRequest::result should point to fs_scratch. + // - This is needed only if FSSupportedOps::kFSBuffer support is provided by + // underlying FS. + std::unique_ptr> fs_scratch; }; // A file abstraction for randomly reading the contents of a file. @@ -885,7 +930,6 @@ class FSRandomAccessFile { return IOStatus::NotSupported("InvalidateCache not supported."); } - // EXPERIMENTAL // This API reads the requested data in FSReadRequest asynchronously. This is // a asynchronous call, i.e it should return after submitting the request. // @@ -907,6 +951,16 @@ class FSRandomAccessFile { // request and result and status fields are output parameter set by underlying // FileSystem. The data should always be read into scratch field. // + // How to enable: + // In order to enable ReadAsync, FS needs to override SupportedOps() API and + // set FSSupportedOps::kAsyncIO in SupportedOps() as: + // { + // supported_ops |= (1 << FSSupportedOps::kAsyncIO); + // } + // + // Note: If FS supports ReadAsync API, it should also override Poll and + // AbortIO API. + // // Default implementation is to read the data synchronously. virtual IOStatus ReadAsync( FSReadRequest& req, const IOOptions& opts, @@ -955,6 +1009,9 @@ class FSWritableFile { write_hint_(Env::WLTH_NOT_SET), strict_bytes_per_sync_(options.strict_bytes_per_sync) {} + // For cases when Close() hasn't been called, many derived classes of + // FSWritableFile will need to call Close() non-virtually in their destructor, + // and ignore the result, to ensure resources are released. virtual ~FSWritableFile() {} // Append data to the end of the file @@ -1028,6 +1085,12 @@ class FSWritableFile { IODebugContext* /*dbg*/) { return IOStatus::OK(); } + + // The caller should call Close() before destroying the FSWritableFile to + // surface any errors associated with finishing writes to the file. + // The file is considered closed regardless of return status. + // (However, implementations must also clean up properly in the destructor + // even if Close() is not called.) virtual IOStatus Close(const IOOptions& /*options*/, IODebugContext* /*dbg*/) = 0; @@ -1185,6 +1248,9 @@ class FSRandomRWFile { public: FSRandomRWFile() {} + // For cases when Close() hasn't been called, many derived classes of + // FSRandomRWFile will need to call Close() non-virtually in their destructor, + // and ignore the result, to ensure resources are released. virtual ~FSRandomRWFile() {} // Indicates if the class makes use of direct I/O @@ -1220,6 +1286,11 @@ class FSRandomRWFile { return Sync(options, dbg); } + // The caller should call Close() before destroying the FSRandomRWFile to + // surface any errors associated with finishing writes to the file. + // The file is considered closed regardless of return status. + // (However, implementations must also clean up properly in the destructor + // even if Close() is not called.) virtual IOStatus Close(const IOOptions& options, IODebugContext* dbg) = 0; // EXPERIMENTAL @@ -1263,6 +1334,9 @@ class FSMemoryMappedFileBuffer { // filesystem operations that can be executed on directories. 
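Putting the SupportedOps()/kAsyncIO/kFSBuffer notes above together, a hedged sketch of a FileSystem wrapper that advertises both capabilities (the wrapper class is hypothetical; a real implementation would also override ReadAsync()/Poll()/AbortIO() and fill fs_scratch in MultiRead()):

#include <memory>
#include "rocksdb/file_system.h"

// Hypothetical wrapper used only to illustrate the SupportedOps() contract.
class BufferProvidingFileSystem : public rocksdb::FileSystemWrapper {
 public:
  explicit BufferProvidingFileSystem(
      const std::shared_ptr<rocksdb::FileSystem>& target)
      : rocksdb::FileSystemWrapper(target) {}
  const char* Name() const override { return "BufferProvidingFileSystem"; }

  void SupportedOps(int64_t& supported_ops) override {
    supported_ops = 0;
    // Only meaningful if ReadAsync()/Poll()/AbortIO() are also overridden.
    supported_ops |= (1 << rocksdb::FSSupportedOps::kAsyncIO);
    // Signals that reads may return data in FSReadRequest::fs_scratch.
    supported_ops |= (1 << rocksdb::FSSupportedOps::kFSBuffer);
  }
};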
class FSDirectory { public: + // For cases when Close() hasn't been called, many derived classes of + // FSDirectory will need to call Close() non-virtually in their destructor, + // and ignore the result, to ensure resources are released. virtual ~FSDirectory() {} // Fsync directory. Can be called concurrently from multiple threads. virtual IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) = 0; @@ -1276,7 +1350,9 @@ class FSDirectory { return Fsync(options, dbg); } - // Close directory + // Calling Close() before destroying a FSDirectory is recommended to surface + // any errors associated with finishing writes (in case of future features). + // The directory is considered closed regardless of return status. virtual IOStatus Close(const IOOptions& /*options*/, IODebugContext* /*dbg*/) { return IOStatus::NotSupported("Close"); @@ -1516,10 +1592,8 @@ class FileSystemWrapper : public FileSystem { const Customizable* Inner() const override { return target_.get(); } Status PrepareOptions(const ConfigOptions& options) override; -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const override; -#endif // ROCKSDB_LITE virtual IOStatus Poll(std::vector& io_handles, size_t min_completions) override { @@ -1530,6 +1604,10 @@ class FileSystemWrapper : public FileSystem { return target_->AbortIO(io_handles); } + virtual void SupportedOps(int64_t& supported_ops) override { + return target_->SupportedOps(supported_ops); + } + protected: std::shared_ptr target_; }; diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 954d15b4a19c..039b826de798 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -162,7 +162,7 @@ class FilterPolicy : public Customizable { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. -extern const FilterPolicy* NewBloomFilterPolicy( +const FilterPolicy* NewBloomFilterPolicy( double bits_per_key, bool IGNORED_use_block_based_builder = false); // A new Bloom alternative that saves about 30% space compared to @@ -184,6 +184,11 @@ extern const FilterPolicy* NewBloomFilterPolicy( // flushes under Level and Universal compaction styles. // bloom_before_level=-1 -> Always generate Ribbon filters (except in // some extreme or exceptional cases). +// bloom_before_level=INT_MAX -> Always generate Bloom filters. +// +// The bloom_before_level option is mutable in the Configurable interface +// and through the SetOptions() API, as in +// db->SetOptions({{"table_factory.filter_policy.bloom_before_level", "3"}}); // // Ribbon filters are compatible with RocksDB >= 6.15.0. Earlier // versions reading the data will behave as if no filter was used @@ -200,7 +205,7 @@ extern const FilterPolicy* NewBloomFilterPolicy( // // Also consider using optimize_filters_for_memory to save filter // memory. -extern const FilterPolicy* NewRibbonFilterPolicy( - double bloom_equivalent_bits_per_key, int bloom_before_level = 0); +FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key, + int bloom_before_level = 0); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 9d4c9f73a105..8568dd2588c0 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -107,10 +107,16 @@ class Iterator : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). 
virtual Status status() const = 0; - // If supported, renew the iterator to represent the latest state. The - // iterator will be invalidated after the call. Not supported if - // ReadOptions.snapshot is given when creating the iterator. - virtual Status Refresh() { + // If supported, the DB state that the iterator reads from is updated to + // the latest state. The iterator will be invalidated after the call. + // Regardless of whether the iterator was created/refreshed previously + // with or without a snapshot, the iterator will be reading the + // latest DB state after this call. + virtual Status Refresh() { return Refresh(nullptr); } + + // Similar to Refresh() but the iterator will be reading the latest DB state + // under the given snapshot. + virtual Status Refresh(const class Snapshot*) { return Status::NotSupported("Refresh() is not supported"); } @@ -127,6 +133,16 @@ class Iterator : public Cleanable { // Property "rocksdb.iterator.internal-key": // Get the user-key portion of the internal key at which the iteration // stopped. + // Property "rocksdb.iterator.write-time": + // DO NOT USE, UNDER CONSTRUCTION + // Get the unix time of the best estimate of the write time of the entry. + // Returned as 64-bit raw value (8 bytes). It can be converted to uint64_t + // with util method `DecodeU64Ts`. The accuracy of the write time depends on + // settings like preserve_internal_time_seconds. If this feature is + // disabled, this property will always be empty. The actual write time of + // the entry should be the same or newer than the returned write time. So + // this property can be interpreted as the possible oldest write time for + // the entry. virtual Status GetProperty(std::string prop_name, std::string* prop); virtual Slice timestamp() const { diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h index 7408cbc8738a..b8f2e222fa81 100644 --- a/include/rocksdb/ldb_tool.h +++ b/include/rocksdb/ldb_tool.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -41,4 +40,3 @@ class LDBTool { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h index 853b587581bc..2cc30d871a49 100644 --- a/include/rocksdb/listener.h +++ b/include/rocksdb/listener.h @@ -161,6 +161,9 @@ enum class CompactionReason : int { kNumOfReasons, }; +const char* GetCompactionReasonString(CompactionReason compaction_reason); + +// When adding flush reason, make sure to also update `GetFlushReasonString()`. enum class FlushReason : int { kOthers = 0x00, kGetLiveFiles = 0x01, @@ -178,8 +181,12 @@ enum class FlushReason : int { // will not be called to avoid many small immutable memtables. kErrorRecoveryRetryFlush = 0xc, kWalFull = 0xd, + // SwitchMemtable will not be called for this flush reason. + kCatchUpAfterErrorRecovery = 0xe, }; +const char* GetFlushReasonString(FlushReason flush_reason); + // TODO: In the future, BackgroundErrorReason will only be used to indicate // why the BG Error is happening (e.g., flush, compaction). 
We may introduce // other data structure to indicate other essential information such as @@ -194,12 +201,6 @@ enum class BackgroundErrorReason { kManifestWriteNoWAL, }; -enum class WriteStallCondition { - kNormal, - kDelayed, - kStopped, -}; - struct WriteStallInfo { // the name of the column family std::string cf_name; @@ -210,7 +211,6 @@ struct WriteStallInfo { } condition; }; -#ifndef ROCKSDB_LITE struct FileDeletionInfo { FileDeletionInfo() = default; @@ -843,11 +843,5 @@ class EventListener : public Customizable { ~EventListener() override {} }; -#else - -class EventListener {}; -struct FlushJobInfo {}; - -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index d126abfe6d63..dc744d7d1a3b 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -81,7 +81,7 @@ struct JemallocAllocatorOptions { // The tcache normally incurs 0.5M extra memory usage per-thread. The usage // can be reduced by limiting allocation sizes to cache. extern Status NewJemallocNodumpAllocator( - JemallocAllocatorOptions& options, + const JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator); } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index cb5444dca356..be0f6cd1f189 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -359,7 +359,6 @@ class SkipListFactory : public MemTableRepFactory { size_t lookahead_; }; -#ifndef ROCKSDB_LITE // This creates MemTableReps that are backed by an std::vector. On iteration, // the vector is sorted. This is useful for workloads where iteration is very // rare and writes are generally not issued after reads begin. @@ -419,5 +418,4 @@ extern MemTableRepFactory* NewHashLinkListRepFactory( bool if_log_bucket_dist_when_flash = true, uint32_t threshold_use_skiplist = 256); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index 077130475dab..6be9e3962b22 100755 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -8,10 +8,13 @@ #include #include #include +#include +#include #include #include "rocksdb/customizable.h" #include "rocksdb/slice.h" +#include "rocksdb/wide_columns.h" namespace ROCKSDB_NAMESPACE { @@ -33,7 +36,7 @@ class Logger; // into rocksdb); numeric addition and string concatenation are examples; // // b) MergeOperator - the generic class for all the more abstract / complex -// operations; one method (FullMergeV2) to merge a Put/Delete value with a +// operations; one method (FullMergeV3) to merge a Put/Delete value with a // merge operand; and another method (PartialMerge) that merges multiple // operands together. this is especially useful if your key values have // complex structures but you would still like to support client-specific @@ -158,6 +161,54 @@ class MergeOperator : public Customizable { virtual bool FullMergeV2(const MergeOperationInput& merge_in, MergeOperationOutput* merge_out) const; + struct MergeOperationInputV3 { + using ExistingValue = std::variant; + using OperandList = std::vector; + + explicit MergeOperationInputV3(const Slice& _key, + ExistingValue&& _existing_value, + const OperandList& _operand_list, + Logger* _logger) + : key(_key), + existing_value(std::move(_existing_value)), + operand_list(_operand_list), + logger(_logger) {} + + // The user key, including the user-defined timestamp if applicable. 
+ const Slice& key; + // The base value of the merge operation. Can be one of three things (see + // the ExistingValue variant above): no existing value, plain existing + // value, or wide-column existing value. + ExistingValue existing_value; + // The list of operands to apply. + const OperandList& operand_list; + // The logger to use in case a failure happens during the merge operation. + Logger* logger; + }; + + struct MergeOperationOutputV3 { + using NewColumns = std::vector>; + using NewValue = std::variant; + + // The result of the merge operation. Can be one of three things (see the + // NewValue variant above): a new plain value, a new wide-column value, or + // an existing merge operand. + NewValue new_value; + // The scope of the failure if applicable. See above for more details. + OpFailureScope op_failure_scope = OpFailureScope::kDefault; + }; + + // An extended version of FullMergeV2() that supports wide columns on both the + // input and the output side, enabling the application to perform general + // transformations during merges. For backward compatibility, the default + // implementation calls FullMergeV2(). Specifically, if there is no base value + // or the base value is a plain key-value, the default implementation falls + // back to FullMergeV2(). If the base value is a wide-column entity, the + // default implementation invokes FullMergeV2() to perform the merge on the + // default column, and leaves any other columns unchanged. + virtual bool FullMergeV3(const MergeOperationInputV3& merge_in, + MergeOperationOutputV3* merge_out) const; + // This function performs merge(left_op, right_op) // when both the operands are themselves merge operation types // that you would have passed to a DB::Merge() call in the same order @@ -186,7 +237,7 @@ class MergeOperator : public Customizable { // TODO: Presently there is no way to differentiate between error/corruption // and simply "return false". For now, the client should simply return // false in any case it cannot perform partial-merge, regardless of reason. - // If there is corruption in the data, handle it in the FullMergeV2() function + // If there is corruption in the data, handle it in the FullMergeV3() function // and return false there. The default implementation of PartialMerge will // always return false. virtual bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, @@ -243,8 +294,8 @@ class MergeOperator : public Customizable { // Doesn't help with iterators. // // Note: the merge operands are passed to this function in the reversed order - // relative to how they were merged (passed to FullMerge or FullMergeV2) - // for performance reasons, see also: + // relative to how they were merged (passed to + // FullMerge/FullMergeV2/FullMergeV3) for performance reasons, see also: // https://github.com/facebook/rocksdb/issues/3865 virtual bool ShouldMerge(const std::vector& /*operands*/) const { return false; diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 3cdd8bd8a381..4ab3842dda80 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -148,6 +148,13 @@ struct SstFileMetaData : public FileStorageInfo { // For L0, larger `epoch_number` indicates newer L0 file. // 0 if the information is not available. uint64_t epoch_number = 0; + + // These bounds define the effective key range for range tombstones + // in this file. + // Currently only used by CreateColumnFamilyWithImport(). 
+ std::string smallest{}; // Smallest internal key served by table + std::string largest{}; // Largest internal key served by table + // DEPRECATED: The name of the file within its directory with a // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct // instead. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 1dec10a6c94f..76a2f3b1e6ed 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -84,7 +85,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls // - // Not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeForPointLookup(uint64_t block_cache_size_mb); // Default values for some parameters in ColumnFamilyOptions are not @@ -101,8 +101,6 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // biggest performance gains. // Note: we might use more memory than memtable_memory_budget during high // write rate period - // - // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeLevelStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); ColumnFamilyOptions* OptimizeUniversalStyleCompaction( @@ -211,6 +209,7 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // - kZSTD: 3 // - kZlibCompression: Z_DEFAULT_COMPRESSION (currently -1) // - kLZ4HCCompression: 0 + // - kLZ4: -1 (i.e., `acceleration=1`; see `CompressionOptions::level` doc) // - For all others, we do not specify a compression level // // Dynamically changeable through SetOptions() API @@ -349,6 +348,17 @@ struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // Default: false, write stall will be enabled bool disable_write_stall = false; + // RocksDB will try to flush the current memtable after the number of range + // deletions is >= this limit. For workloads with many range + // deletions, limiting the number of range deletions in memtable can help + // prevent performance degradation and/or OOM caused by too many range + // tombstones in a single memtable. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + uint32_t memtable_max_range_deletions = 0; + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -526,20 +536,19 @@ struct DBOptions { // memtable to cost to DBOptions* OptimizeForSmallDb(std::shared_ptr* cache = nullptr); -#ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and // compaction. Calling this function will set it up such that total of // `total_threads` is used. Good value for `total_threads` is the number of // cores. You almost definitely want to call this function if your system is // bottlenecked by RocksDB. DBOptions* IncreaseParallelism(int total_threads = 16); -#endif // ROCKSDB_LITE // If true, the database will be created if it is missing. // Default: false bool create_if_missing = false; - // If true, missing column families will be automatically created. + // If true, missing column families will be automatically created on + // DB::Open(). 
// Default: false bool create_missing_column_families = false; @@ -557,11 +566,29 @@ struct DBOptions { // If true, during memtable flush, RocksDB will validate total entries // read in flush, and compare with counter inserted into it. + // // The option is here to turn the feature off in case this new validation - // feature has a bug. + // feature has a bug. The option may be removed in the future once the + // feature is stable. + // // Default: true bool flush_verify_memtable_count = true; + // If true, during compaction, RocksDB will count the number of entries + // read and compare it against the number of entries in the compaction + // input files. This is intended to add protection against corruption + // during compaction. Note that + // - this verification is not done for compactions during which a compaction + // filter returns kRemoveAndSkipUntil, and + // - the number of range deletions is not verified. + // + // The option is here to turn the feature off in case this new validation + // feature has a bug. The option may be removed in the future once the + // feature is stable. + // + // Default: true + bool compaction_verify_record_count = true; + // If true, the log numbers and sizes of the synced WALs are tracked // in MANIFEST. During DB recovery, if a synced WAL is missing // from disk, or the WAL's size does not match the recorded size in @@ -824,7 +851,6 @@ struct DBOptions { // If specified with non-zero value, log file will be rolled // if it has been active longer than `log_file_time_to_roll`. // Default: 0 (disabled) - // Not supported in ROCKSDB_LITE mode! size_t log_file_time_to_roll = 0; // Maximal info log files to be kept. @@ -850,18 +876,23 @@ struct DBOptions { // Number of shards used for table cache. int table_cache_numshardbits = 6; - // The following two fields affect how archived logs will be deleted. - // 1. If both set to 0, logs will be deleted asap and will not get into - // the archive. - // 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - // WAL files will be checked every 10 min and if total size is greater - // then WAL_size_limit_MB, they will be deleted starting with the - // earliest until size_limit is met. All empty files will be deleted. - // 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - // WAL files will be checked every WAL_ttl_seconds / 2 and those that - // are older than WAL_ttl_seconds will be deleted. - // 4. If both are not 0, WAL files will be checked every 10 min and both - // checks will be performed with ttl being first. + // The following two fields affect when WALs will be archived and deleted. + // + // When both are zero, obsolete WALs will not be archived and will be deleted + // immediately. Otherwise, obsolete WALs will be archived prior to deletion. + // + // When `WAL_size_limit_MB` is nonzero, archived WALs starting with the + // earliest will be deleted until the total size of the archive falls below + // this limit. All empty WALs will be deleted. + // + // When `WAL_ttl_seconds` is nonzero, archived WALs older than + // `WAL_ttl_seconds` will be deleted. + // + // When only `WAL_ttl_seconds` is nonzero, the frequency at which archived + // WALs are deleted is every `WAL_ttl_seconds / 2` seconds. When only + // `WAL_size_limit_MB` is nonzero, the deletion frequency is every ten + // minutes. When both are nonzero, the deletion frequency is the minimum of + // those two values. 
uint64_t WAL_ttl_seconds = 0; uint64_t WAL_size_limit_MB = 0; @@ -899,12 +930,10 @@ struct DBOptions { // Use O_DIRECT for user and compaction reads. // Default: false - // Not supported in ROCKSDB_LITE mode! bool use_direct_reads = false; // Use O_DIRECT for writes in background flush and compactions. // Default: false - // Not supported in ROCKSDB_LITE mode! bool use_direct_io_for_flush_and_compaction = false; // If false, fallocate() calls are bypassed, which disables file @@ -983,6 +1012,9 @@ struct DBOptions { // Default: null std::shared_ptr write_buffer_manager = nullptr; + // DEPRECATED + // This flag has no effect on the behavior of compaction and we plan to delete + // it in the future. // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL @@ -993,10 +1025,10 @@ struct DBOptions { // running RocksDB on spinning disks, you should set this to at least 2MB. // That way RocksDB's compaction is doing sequential instead of random reads. // - // Default: 0 + // Default: 2MB // // Dynamically changeable through SetDBOptions() API. - size_t compaction_readahead_size = 0; + size_t compaction_readahead_size = 2 * 1024 * 1024; // This is a maximum buffer size that is used by WinMmapReadableFile in // unbuffered disk I/O mode. We need to maintain an aligned buffer for @@ -1141,7 +1173,7 @@ struct DBOptions { // // By default, i.e., when it is false, rocksdb does not advance the sequence // number for new snapshots unless all the writes with lower sequence numbers - // are already finished. This provides the immutability that we except from + // are already finished. This provides the immutability that we expect from // snapshots. Moreover, since Iterator and MultiGet internally depend on // snapshots, the snapshot immutability results into Iterator and MultiGet // offering consistent-point-in-time view. If set to true, although @@ -1227,23 +1259,20 @@ struct DBOptions { // A global cache for table-level rows. // Default: nullptr (disabled) - // Not supported in ROCKSDB_LITE mode! - std::shared_ptr row_cache = nullptr; + std::shared_ptr row_cache = nullptr; -#ifndef ROCKSDB_LITE // A filter object supplied to be invoked while processing write-ahead-logs // (WALs) during recovery. The filter provides a way to inspect log // records, ignoring a particular record or skipping replay. // The filter is invoked at startup and is invoked from a single-thread // currently. WalFilter* wal_filter = nullptr; -#endif // ROCKSDB_LITE - // If true, then DB::Open / CreateColumnFamily / DropColumnFamily + // If true, then DB::Open, CreateColumnFamily, DropColumnFamily, and // SetOptions will fail if options file is not properly persisted. // - // DEFAULT: false - bool fail_if_options_file_error = false; + // DEFAULT: true + bool fail_if_options_file_error = true; // If false, we won't use options file. // DEFAULT: true @@ -1275,10 +1304,13 @@ struct DBOptions { // Set this option to true during creation of database if you want // to be able to ingest behind (call IngestExternalFile() skipping keys // that already exist, rather than overwriting matching keys). - // Setting this option to true will affect 2 things: - // 1) Disable some internal optimizations around SST file compression - // 2) Reserve bottom-most level for ingested files only. - // 3) Note that num_levels should be >= 3 if this option is turned on. 
+ // Setting this option to true has the following effects: + // 1) Disable some internal optimizations around SST file compression. + // 2) Reserve the last level for ingested files only. + // 3) Compaction will not include any file from the last level. + // Note that only Universal Compaction supports allow_ingest_behind. + // `num_levels` should be >= 3 if this option is turned on. + // // // DEFAULT: false // Immutable. @@ -1351,46 +1383,46 @@ struct DBOptions { // Default: nullptr std::shared_ptr file_checksum_gen_factory = nullptr; - // By default, RocksDB recovery fails if any table/blob file referenced in the - // final version reconstructed from the - // MANIFEST are missing after scanning the MANIFEST pointed to by the - // CURRENT file. It can also fail if verification of unique SST id fails. - // Best-efforts recovery is another recovery mode that does not necessarily - // fail when certain table/blob files are missing/corrupted or have mismatched - // unique id table property. Instead, best-efforts recovery recovers each - // column family to a point in the MANIFEST that corresponds to a version. In - // such a version, all valid table/blob files referenced have the expected - // file size. For table files, their unique id table property match the - // MANIFEST. - // - // Best-efforts recovery does not need a valid CURRENT file, and tries to - // recover the database using one of the available MANIFEST files in the db - // directory. - // Best-efforts recovery tries the available MANIFEST files from high file - // numbers (newer) to low file numbers (older), and stops after finding the - // first MANIFEST file from which the db can be recovered to a state without - // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob - // files. It is possible that the database can be restored to an empty state - // with no table or blob files. - // - // Regardless of this option, the IDENTITY file - // is updated if needed during recovery to match the DB ID in the MANIFEST (if - // previously using write_dbid_to_manifest) or to be in some valid state - // (non-empty DB ID). Currently, not compatible with atomic flush. - // Furthermore, WAL files will not be used for recovery if - // best_efforts_recovery is true. Also requires either 1) LOCK file exists or - // 2) underlying env's LockFile() call returns ok even for non-existing LOCK - // file. + // By default, RocksDB will attempt to detect any data losses or corruptions + // in DB files and return an error to the user, either at DB::Open time or + // later during DB operation. The exception to this policy is the WAL file, + // whose recovery is controlled by the wal_recovery_mode option. + // + // Best-efforts recovery (this option set to true) signals a preference for + // opening the DB to any point-in-time valid state for each column family, + // including the empty/new state, versus the default of returning non-WAL + // data losses to the user as errors. In terms of RocksDB user data, this + // is like applying WALRecoveryMode::kPointInTimeRecovery to each column + // family rather than just the WAL. + // + // Best-efforts recovery (BER) is specifically designed to recover a DB with + // files that are missing or truncated to some smaller size, such as the + // result of an incomplete DB "physical" (FileSystem) copy. BER can also + // detect when an SST file has been replaced with a different one of the + // same size (assuming SST unique IDs are tracked in DB manifest). 
+ // BER is not yet designed to produce a usable DB from other corruptions to + // DB files (which should generally be detectable by DB::VerifyChecksum()), + // and BER does not yet attempt to recover any WAL files. + // + // For example, if an SST or blob file referenced by the MANIFEST is missing, + // BER might be able to find a set of files corresponding to an old "point in + // time" version of the column family, possibly from an older MANIFEST + // file. Some other kinds of DB files (e.g. CURRENT, LOCK, IDENTITY) are + // either ignored or replaced with BER, or quietly fixed regardless of BER + // setting. BER does require at least one valid MANIFEST to recover to a + // non-trivial DB state, unlike `ldb repair`. + // + // Currently, best_efforts_recovery=true is not compatible with atomic flush. // // Default: false bool best_efforts_recovery = false; - // It defines how many times db resume is called by a separate thread when + // It defines how many times DB::Resume() is called by a separate thread when // background retryable IO Error happens. When background retryable IO // Error happens, SetBGError is called to deal with the error. If the error // can be auto-recovered (e.g., retryable IO Error during Flush or WAL write), // then db resume is called in background to recover from the error. If this - // value is 0 or negative, db resume will not be called. + // value is 0 or negative, DB::Resume() will not be called automatically. // // Default: INT_MAX int max_bgerror_resume_count = INT_MAX; @@ -1485,6 +1517,24 @@ struct DBOptions { // // Default: false bool disable_delete_obsolete_files_on_open = false; + + // EXPERIMENTAL + // Implementing off-peak duration awareness in RocksDB. In this context, + // "off-peak time" signifies periods characterized by significantly less read + // and write activity compared to other times. By leveraging this knowledge, + // we can prevent low-priority tasks, such as TTL-based compactions, from + // competing with read and write operations during peak hours. Essentially, we + // preprocess these tasks during the preceding off-peak period, just before + // the next peak cycle begins. For example, if the TTL is configured for 25 + // days, we may compact the files during the off-peak hours of the 24th day. + // + // Time of the day in UTC, start_time-end_time inclusive. + // Format - HH:mm-HH:mm (00:00-23:59) + // If the start time > end time, it will be considered that the time period + // spans to the next day (e.g., 23:30-04:00). To make an entire day off-peak, + // use "0:00-23:59". To make an entire day have no offpeak period, leave + // this field blank. Default: Empty string (no offpeak). + std::string daily_offpeak_time_utc = ""; }; // Options to control the behavior of a database (passed to DB::Open) @@ -1549,12 +1599,123 @@ enum ReadTier { // Options that control read operations struct ReadOptions { + // *** BEGIN options relevant to point lookups as well as scans *** + // If "snapshot" is non-nullptr, read as of the supplied snapshot // (which must belong to the DB that is being read and which must // not have been released). If "snapshot" is nullptr, use an implicit // snapshot of the state at the beginning of this read operation. - // Default: nullptr - const Snapshot* snapshot; + const Snapshot* snapshot = nullptr; + + // Timestamp of operation. Read should return the latest data visible to the + // specified timestamp. All timestamps of the same database must be of the + // same length and format. 
The user is responsible for providing a customized + // compare function via Comparator to order tuples. + // For iterator, iter_start_ts is the lower bound (older) and timestamp + // serves as the upper bound. Versions of the same record that fall in + // the timestamp range will be returned. If iter_start_ts is nullptr, + // only the most recent version visible to timestamp is returned. + // The user-specified timestamp feature is still under active development, + // and the API is subject to change. + const Slice* timestamp = nullptr; + const Slice* iter_start_ts = nullptr; + + // Deadline for completing an API call (Get/MultiGet/Seek/Next for now) + // in microseconds. + // It should be set to microseconds since epoch, i.e, gettimeofday or + // equivalent plus allowed duration in microseconds. The best way is to use + // env->NowMicros() + some timeout. + // This is best efforts. The call may exceed the deadline if there is IO + // involved and the file system doesn't support deadlines, or due to + // checking for deadline periodically rather than for every key if + // processing a batch + std::chrono::microseconds deadline = std::chrono::microseconds::zero(); + + // A timeout in microseconds to be passed to the underlying FileSystem for + // reads. As opposed to deadline, this determines the timeout for each + // individual file read request. If a MultiGet/Get/Seek/Next etc call + // results in multiple reads, each read can last up to io_timeout us. + std::chrono::microseconds io_timeout = std::chrono::microseconds::zero(); + + // Specify if this read request should process data that ALREADY + // resides on a particular cache. If the required data is not + // found at the specified cache, then Status::Incomplete is returned. + ReadTier read_tier = kReadAllTier; + + // For file reads associated with this option, charge the internal rate + // limiter (see `DBOptions::rate_limiter`) at the specified priority. The + // special value `Env::IO_TOTAL` disables charging the rate limiter. + // + // The rate limiting is bypassed no matter this option's value for file reads + // on plain tables (these can exist when `ColumnFamilyOptions::table_factory` + // is a `PlainTableFactory`) and cuckoo tables (these can exist when + // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`). + // + // The bytes charged to rate limiter may not exactly match the file read bytes + // since there are some seemingly insignificant reads, like for file + // headers/footers, that we currently do not charge to rate limiter. + Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + + // It limits the maximum cumulative value size of the keys in batch while + // reading through MultiGet. Once the cumulative value size exceeds this + // soft limit then all the remaining keys are returned with status Aborted. + uint64_t value_size_soft_limit = std::numeric_limits::max(); + + // When the number of merge operands applied exceeds this threshold + // during a successful query, the operation will return a special OK + // Status with subcode kMergeOperandThresholdExceeded. Currently only applies + // to point lookups and is disabled by default. + std::optional merge_operand_count_threshold; + + // If true, all data read from underlying storage will be + // verified against corresponding checksums. + bool verify_checksums = true; + + // Should the "data block"/"index block" read for this iteration be placed in + // block cache? + // Callers may wish to set this field to false for bulk scans. 
+ // This would help not to the change eviction order of existing items in the + // block cache. + bool fill_cache = true; + + // If true, range tombstones handling will be skipped in key lookup paths. + // For DB instances that don't use DeleteRange() calls, this setting can + // be used to optimize the read performance. + // Note that, if this assumption (of no previous DeleteRange() calls) is + // broken, stale keys could be served in read paths. + bool ignore_range_deletions = false; + + // If async_io is enabled, RocksDB will prefetch some of data asynchronously. + // RocksDB apply it if reads are sequential and its internal automatic + // prefetching. + bool async_io = false; + + // Experimental + // + // If async_io is set, then this flag controls whether we read SST files + // in multiple levels asynchronously. Enabling this flag can help reduce + // MultiGet latency by maximizing the number of SST files read in + // parallel if the keys in the MultiGet batch are in different levels. It + // comes at the expense of slightly higher CPU overhead. + bool optimize_multiget_for_io = true; + + // *** END options relevant to point lookups (as well as scans) *** + // *** BEGIN options only relevant to iterators or scans *** + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file. The readahead starts at 8KB and doubles on every + // additional read up to 256KB. + // This option can help if most of the range scans are large, and if it is + // determined that a larger readahead than that enabled by auto-readahead is + // needed. + // Using a large readahead size (> 2MB) can typically improve the performance + // of forward iteration on spinning disks. + size_t readahead_size = 0; + + // A threshold for the number of keys that can be skipped before failing an + // iterator seek as incomplete. The default value of 0 should be used to + // never fail a request as incomplete, even on skipping too many keys. + uint64_t max_skippable_internal_keys = 0; // `iterate_lower_bound` defines the smallest key at which the backward // iterator can return an entry. Once the bound is passed, Valid() will be @@ -1567,8 +1728,7 @@ struct ReadOptions { // // In case of user_defined timestamp, if enabled, iterate_lower_bound should // point to key without timestamp part. - // Default: nullptr - const Slice* iterate_lower_bound; + const Slice* iterate_lower_bound = nullptr; // "iterate_upper_bound" defines the extent up to which the forward iterator // can return entries. Once the bound is reached, Valid() will be false. @@ -1588,70 +1748,28 @@ struct ReadOptions { // // In case of user_defined timestamp, if enabled, iterate_upper_bound should // point to key without timestamp part. - // Default: nullptr - const Slice* iterate_upper_bound; - - // RocksDB does auto-readahead for iterators on noticing more than two reads - // for a table file. The readahead starts at 8KB and doubles on every - // additional read up to 256KB. - // This option can help if most of the range scans are large, and if it is - // determined that a larger readahead than that enabled by auto-readahead is - // needed. - // Using a large readahead size (> 2MB) can typically improve the performance - // of forward iteration on spinning disks. - // Default: 0 - size_t readahead_size; - - // A threshold for the number of keys that can be skipped before failing an - // iterator seek as incomplete. 
The default value of 0 should be used to - // never fail a request as incomplete, even on skipping too many keys. - // Default: 0 - uint64_t max_skippable_internal_keys; - - // Specify if this read request should process data that ALREADY - // resides on a particular cache. If the required data is not - // found at the specified cache, then Status::Incomplete is returned. - // Default: kReadAllTier - ReadTier read_tier; - - // If true, all data read from underlying storage will be - // verified against corresponding checksums. - // Default: true - bool verify_checksums; - - // Should the "data block"/"index block" read for this iteration be placed in - // block cache? - // Callers may wish to set this field to false for bulk scans. - // This would help not to the change eviction order of existing items in the - // block cache. - // Default: true - bool fill_cache; + const Slice* iterate_upper_bound = nullptr; // Specify to create a tailing iterator -- a special iterator that has a // view of the complete database (i.e. it can also be used to read newly // added data) and is optimized for sequential reads. It will return records // that were inserted into the database after the creation of the iterator. - // Default: false - // Not supported in ROCKSDB_LITE mode! - bool tailing; + bool tailing = false; // This options is not used anymore. It was to turn on a functionality that - // has been removed. - bool managed; + // has been removed. DEPRECATED + bool managed = false; // Enable a total order seek regardless of index format (e.g. hash index) // used in the table. Some table format (e.g. plain table) may not support // this option. // If true when calling Get(), we also skip prefix bloom when reading from // block based table, which only affects Get() performance. - // Default: false - bool total_order_seek; + bool total_order_seek = false; // When true, by default use total_order_seek = true, and RocksDB can // selectively enable prefix seek mode if won't generate a different result // from total_order_seek, based on seek key, and iterator upper bound. - // Not supported in ROCKSDB_LITE mode, in the way that even with value true - // prefix mode is not used. // BUG: Using Comparator::IsSameLengthImmediateSuccessor and // SliceTransform::FullLengthEnabled to enable prefix mode in cases where // prefix of upper bound differs from prefix of seek key has a flaw. @@ -1663,38 +1781,37 @@ struct ReadOptions { // iterators. (We are also assuming the new condition on // IsSameLengthImmediateSuccessor is satisfied; see its BUG section). // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG". - // Default: false - bool auto_prefix_mode; + bool auto_prefix_mode = false; // Enforce that the iterator only iterates over the same prefix as the seek. // This option is effective only for prefix seeks, i.e. prefix_extractor is // non-null for the column family and total_order_seek is false. Unlike // iterate_upper_bound, prefix_same_as_start only works within a prefix // but in both directions. - // Default: false - bool prefix_same_as_start; + bool prefix_same_as_start = false; // Keep the blocks loaded by the iterator pinned in memory as long as the // iterator is not deleted, If used when reading from tables created with // BlockBasedTableOptions::use_delta_encoding = false, // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to // return 1. 
- // Default: false - bool pin_data; + bool pin_data = false; + + // For iterators, RocksDB does auto-readahead on noticing more than two + // sequential reads for a table file if user doesn't provide readahead_size. + // The readahead starts at 8KB and doubles on every additional read upto + // max_auto_readahead_size only when reads are sequential. However at each + // level, if iterator moves over next file, readahead_size starts again from + // 8KB. + // + // By enabling this option, RocksDB will do some enhancements for + // prefetching the data. + bool adaptive_readahead = false; // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we // schedule a background job in the flush job queue and delete obsolete files // in background. - // Default: false - bool background_purge_on_iterator_cleanup; - - // If true, range tombstones handling will be skipped in key lookup paths. - // For DB instances that don't use DeleteRange() calls, this setting can - // be used to optimize the read performance. - // Note that, if this assumption (of no previous DeleteRange() calls) is - // broken, stale keys could be served in read paths. - // Default: false - bool ignore_range_deletions; + bool background_purge_on_iterator_cleanup = false; // This flag specifies that the implementation should optimize reads // mainly for cases where keys are found rather than also optimize for keys @@ -1705,8 +1822,7 @@ struct ReadOptions { // which contains data of the LSM store. For keys which are hits, querying // filters in this level is not useful because we will search for the data // anyway. - // Default: false - bool optimize_for_hits; + bool optimize_for_hits = false; // A callback to determine whether relevant keys for this scan exist in a // given table based on the table's properties. The callback is passed the @@ -1716,95 +1832,36 @@ struct ReadOptions { // Default: empty (every table will be scanned) std::function table_filter; - // Timestamp of operation. Read should return the latest data visible to the - // specified timestamp. All timestamps of the same database must be of the - // same length and format. The user is responsible for providing a customized - // compare function via Comparator to order tuples. - // For iterator, iter_start_ts is the lower bound (older) and timestamp - // serves as the upper bound. Versions of the same record that fall in - // the timestamp range will be returned. If iter_start_ts is nullptr, - // only the most recent version visible to timestamp is returned. - // The user-specified timestamp feature is still under active development, - // and the API is subject to change. - // Default: nullptr - const Slice* timestamp; - const Slice* iter_start_ts; - - // Deadline for completing an API call (Get/MultiGet/Seek/Next for now) - // in microseconds. - // It should be set to microseconds since epoch, i.e, gettimeofday or - // equivalent plus allowed duration in microseconds. The best way is to use - // env->NowMicros() + some timeout. - // This is best efforts. The call may exceed the deadline if there is IO - // involved and the file system doesn't support deadlines, or due to - // checking for deadline periodically rather than for every key if - // processing a batch - std::chrono::microseconds deadline; - - // A timeout in microseconds to be passed to the underlying FileSystem for - // reads. As opposed to deadline, this determines the timeout for each - // individual file read request. 
If a MultiGet/Get/Seek/Next etc call - // results in multiple reads, each read can last up to io_timeout us. - std::chrono::microseconds io_timeout; - - // It limits the maximum cumulative value size of the keys in batch while - // reading through MultiGet. Once the cumulative value size exceeds this - // soft limit then all the remaining keys are returned with status Aborted. + // Experimental // - // Default: std::numeric_limits::max() - uint64_t value_size_soft_limit; - - // For iterators, RocksDB does auto-readahead on noticing more than two - // sequential reads for a table file if user doesn't provide readahead_size. - // The readahead starts at 8KB and doubles on every additional read upto - // max_auto_readahead_size only when reads are sequential. However at each - // level, if iterator moves over next file, readahead_size starts again from - // 8KB. + // If auto_readahead_size is set to true, it will auto tune the readahead_size + // during scans internally. + // For this feature to be enabled, iterate_upper_bound must also be specified. // - // By enabling this option, RocksDB will do some enhancements for - // prefetching the data. + // NOTE: - Recommended for forward scans only. + // - In case of backward scans like Prev or SeekForPrev, the + // cost of these backward operations might increase and affect the + // performance. So this option should not be enabled if workload + // contains backward scans. + // - If there is a backward scan, this option will be + // disabled internally and won't be reset if forward scan is done + // again. // // Default: false - bool adaptive_readahead; + bool auto_readahead_size = false; - // For file reads associated with this option, charge the internal rate - // limiter (see `DBOptions::rate_limiter`) at the specified priority. The - // special value `Env::IO_TOTAL` disables charging the rate limiter. - // - // The rate limiting is bypassed no matter this option's value for file reads - // on plain tables (these can exist when `ColumnFamilyOptions::table_factory` - // is a `PlainTableFactory`) and cuckoo tables (these can exist when - // `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`). - // - // The bytes charged to rate limiter may not exactly match the file read bytes - // since there are some seemingly insignificant reads, like for file - // headers/footers, that we currently do not charge to rate limiter. - // - // Default: `Env::IO_TOTAL`. - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; + // *** END options only relevant to iterators or scans *** - // Experimental - // - // If async_io is enabled, RocksDB will prefetch some of data asynchronously. - // RocksDB apply it if reads are sequential and its internal automatic - // prefetching. - // - // Default: false - bool async_io; + // *** BEGIN options for RocksDB internal use only *** - // Experimental - // - // If async_io is set, then this flag controls whether we read SST files - // in multiple levels asynchronously. Enabling this flag can help reduce - // MultiGet latency by maximizing the number of SST files read in - // parallel if the keys in the MultiGet batch are in different levels. It - // comes at the expense of slightly higher CPU overhead. 
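// A minimal usage sketch for auto_readahead_size together with
// iterate_upper_bound (which it requires), assuming an already-open
// rocksdb::DB* `db`; the key range "k000".."k999" is illustrative only.
#include <memory>
#include "rocksdb/db.h"

void ForwardScan(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  rocksdb::Slice upper("k999");   // must outlive the iterator
  ro.iterate_upper_bound = &upper;
  ro.auto_readahead_size = true;  // forward scans only, needs an upper bound
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek("k000"); it->Valid(); it->Next()) {
    // it->key() / it->value() stay valid until the next call on the iterator
  }
}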
- // - // Default: true - bool optimize_multiget_for_io; + // EXPERIMENTAL + Env::IOActivity io_activity = Env::IOActivity::kUnknown; - ReadOptions(); - ReadOptions(bool cksum, bool cache); + // *** END options for RocksDB internal use only *** + + ReadOptions() {} + ReadOptions(bool _verify_checksums, bool _fill_cache); + explicit ReadOptions(Env::IOActivity _io_activity); }; // Options that control write operations @@ -1943,15 +2000,18 @@ struct CompactionOptions { // For level based compaction, we can configure if we want to skip/force // bottommost level compaction. enum class BottommostLevelCompaction { - // Skip bottommost level compaction + // Skip bottommost level compaction. kSkip, - // Only compact bottommost level if there is a compaction filter - // This is the default option + // Only compact bottommost level if there is a compaction filter. + // This is the default option. + // Similar to kForceOptimized, when compacting bottommost level, avoid + // double-compacting files + // created in the same manual compaction. kIfHaveCompactionFilter, - // Always compact bottommost level + // Always compact bottommost level. kForce, // Always compact bottommost level but in bottommost level avoid - // double-compacting files created in the same compaction + // double-compacting files created in the same compaction. kForceOptimized, }; @@ -2034,7 +2094,8 @@ struct IngestExternalFileOptions { bool snapshot_consistency = true; // If set to false, IngestExternalFile() will fail if the file key range // overlaps with existing keys or tombstones or output of ongoing compaction - // during file ingestion in the DB. + // during file ingestion in the DB (the conditions under which a global_seqno + // must be assigned to the ingested file). bool allow_global_seqno = true; // If set to false and the file key range overlaps with the memtable key range // (memtable flush required), IngestExternalFile will fail. @@ -2047,18 +2108,14 @@ struct IngestExternalFileOptions { // with allow_ingest_behind=true since the dawn of time. // All files will be ingested at the bottommost level with seqno=0. bool ingest_behind = false; - // Set to true if you would like to write global_seqno to a given offset in - // the external SST file for backward compatibility. Older versions of - // RocksDB writes a global_seqno to a given offset within ingested SST files, - // and new versions of RocksDB do not. If you ingest an external SST using - // new version of RocksDB and would like to be able to downgrade to an - // older version of RocksDB, you should set 'write_global_seqno' to true. If - // your service is just starting to use the new RocksDB, we recommend that - // you set this option to false, which brings two benefits: - // 1. No extra random write for global_seqno during ingestion. - // 2. Without writing external SST file, it's possible to do checksum. - // We have a plan to set this option to false by default in the future. - bool write_global_seqno = true; + // DEPRECATED - Set to true if you would like to write global_seqno to + // the external SST file on ingestion for backward compatibility before + // RocksDB 5.16.0. Such old versions of RocksDB expect any global_seqno to + // be written to the SST file rather than recorded in the DB manifest. + // This functionality was deprecated because (a) random writes might be + // costly or unsupported on some FileSystems, and (b) the file checksum + // changes with such a write. 
+ bool write_global_seqno = false; // Set to true if you would like to verify the checksums of each block of the // external SST file before ingestion. // Warning: setting this to true causes slowdown in file ingestion because @@ -2208,7 +2265,6 @@ struct OpenAndCompactOptions { std::atomic* canceled = nullptr; }; -#ifndef ROCKSDB_LITE struct LiveFilesStorageInfoOptions { // Whether to populate FileStorageInfo::file_checksum* or leave blank bool include_checksum_info = false; @@ -2217,6 +2273,31 @@ struct LiveFilesStorageInfoOptions { // Default: always force a flush without checking sizes. uint64_t wal_size_for_flush = 0; }; -#endif // !ROCKSDB_LITE + +struct WaitForCompactOptions { + // A boolean to abort waiting in case of a pause (PauseBackgroundWork() + // called) If true, Status::Aborted will be returned immediately. If false, + // ContinueBackgroundWork() must be called to resume the background jobs. + // Otherwise, jobs that were queued, but not scheduled yet may never finish + // and WaitForCompact() may wait indefinitely (if timeout is set, it will + // expire and return Status::TimedOut). + bool abort_on_pause = false; + + // A boolean to flush all column families before starting to wait. + bool flush = false; + + // A boolean to call Close() after waiting is done. By the time Close() is + // called here, there should be no background jobs in progress and no new + // background jobs should be added. DB may not have been closed if Close() + // returned Aborted status due to unreleased snapshots in the system. See + // comments in DB::Close() for details. + bool close_db = false; + + // Timeout in microseconds for waiting for compaction to complete. + // Status::TimedOut will be returned if timeout expires. + // when timeout == 0, WaitForCompact() will wait as long as there's background + // work to finish. + std::chrono::microseconds timeout = std::chrono::microseconds::zero(); +}; } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index 6eb08eafdb96..216dd07d9bee 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -14,13 +14,19 @@ namespace ROCKSDB_NAMESPACE { -// A thread local context for gathering performance counter efficiently -// and transparently. -// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. +/* + * NOTE: + * Please do not reorder the fields in this structure. If you plan to do that or + * add/remove fields to this structure, builds would fail. The way to fix the + * builds would be to add the appropriate fields to the + * DEF_PERF_CONTEXT_LEVEL_METRICS() macro in the perf_context.cc file. + */ // Break down performance counters by level and store per-level perf context in // PerfContextByLevel -struct PerfContextByLevel { +struct PerfContextByLevelBase { + // These Bloom stats apply to point reads (Get/MultiGet) for whole key and + // prefix filters. // # of times bloom filter has avoided file reads, i.e., negatives. uint64_t bloom_filter_useful = 0; // # of times bloom FullFilter has not avoided the reads. 
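// A minimal usage sketch for the WaitForCompactOptions struct above, assuming
// it is passed to DB::WaitForCompact() on an already-open rocksdb::DB* `db`;
// the 60-second timeout and the helper name are illustrative only.
#include <chrono>
#include "rocksdb/db.h"

rocksdb::Status DrainBackgroundWork(rocksdb::DB* db) {
  rocksdb::WaitForCompactOptions opts;
  opts.flush = true;           // flush all column families before waiting
  opts.abort_on_pause = true;  // return Aborted instead of waiting forever
  opts.timeout = std::chrono::seconds(60);  // 0 would mean "wait indefinitely"
  return db->WaitForCompact(opts);  // OK, Aborted, or TimedOut
}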
@@ -38,37 +44,34 @@ struct PerfContextByLevel { uint64_t block_cache_hit_count = 0; // total number of block cache hits uint64_t block_cache_miss_count = 0; // total number of block cache misses - - void Reset(); // reset all performance counters to zero }; -struct PerfContext { - ~PerfContext(); - - PerfContext() {} - - PerfContext(const PerfContext&); - PerfContext& operator=(const PerfContext&); - PerfContext(PerfContext&&) noexcept; +// A thread local context for gathering performance counter efficiently +// and transparently. +// Use SetPerfLevel(PerfLevel::kEnableTime) to enable time stats. +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevel : public PerfContextByLevelBase { void Reset(); // reset all performance counters to zero +}; - std::string ToString(bool exclude_zero_counters = false) const; - - // enable per level perf context and allocate storage for PerfContextByLevel - void EnablePerLevelPerfContext(); - - // temporarily disable per level perf context by setting the flag to false - void DisablePerLevelPerfContext(); - - // free the space for PerfContextByLevel, also disable per level perf context - void ClearPerLevelPerfContext(); +/* + * NOTE: + * Please do not reorder the fields in this structure. If you plan to do that or + * add/remove fields to this structure, builds would fail. The way to fix the + * builds would be to add the appropriate fields to the + * DEF_PERF_CONTEXT_METRICS() macro in the perf_context.cc file. + */ +struct PerfContextBase { uint64_t user_key_comparison_count; // total number of user key comparisons uint64_t block_cache_hit_count; // total number of block cache hits uint64_t block_read_count; // total number of block reads (with IO) uint64_t block_read_byte; // total number of bytes from block reads uint64_t block_read_time; // total nanos spent on block reads + // total cpu time in nanos spent on block reads + uint64_t block_read_cpu_time; uint64_t block_cache_index_hit_count; // total number of index block hits // total number of standalone handles lookup from secondary cache uint64_t block_cache_standalone_handle_count; @@ -80,7 +83,7 @@ struct PerfContext { uint64_t filter_block_read_count; // total number of filter block reads uint64_t compression_dict_block_read_count; // total number of compression // dictionary block reads - + // RocksDB-Cloud contribution begin // Total number of files read in MultiGet operations @@ -144,9 +147,14 @@ struct PerfContext { // than the snapshot that iterator is using. // uint64_t internal_recent_skipped_count; - // How many values were fed into merge operator by iterators. + // How many merge operands were fed into the merge operator by iterators. + // Note: base values are not included in the count. // uint64_t internal_merge_count; + // How many merge operands were fed into the merge operator by point lookups. + // Note: base values are not included in the count. + // + uint64_t internal_merge_point_lookup_count; // Number of times we reseeked inside a merging iterator, specifically to skip // after or before a range of keys covered by a range deletion in a newer LSM // component. 
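// A minimal usage sketch for the thread-local perf context above (including
// the new internal_merge_point_lookup_count counter), assuming an already-open
// rocksdb::DB* `db`; the helper name ProfiledGet is illustrative only.
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

std::string ProfiledGet(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
  rocksdb::get_perf_context()->Reset();
  std::string value;
  db->Get(rocksdb::ReadOptions(), key, &value).PermitUncheckedError();
  // Individual counters can also be read directly, e.g.
  // get_perf_context()->internal_merge_point_lookup_count or
  // get_perf_context()->block_read_count.
  return rocksdb::get_perf_context()->ToString(/*exclude_zero_counters=*/true);
}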
@@ -220,9 +228,9 @@ struct PerfContext { uint64_t bloom_memtable_hit_count; // total number of mem table bloom misses uint64_t bloom_memtable_miss_count; - // total number of SST table bloom hits + // total number of SST bloom hits uint64_t bloom_sst_hit_count; - // total number of SST table bloom misses + // total number of SST bloom misses uint64_t bloom_sst_miss_count; // Time spent waiting on key locks in transaction lock manager. @@ -258,15 +266,47 @@ struct PerfContext { uint64_t iter_prev_cpu_nanos; uint64_t iter_seek_cpu_nanos; + // EXPERIMENTAL + // Total number of db iterator's Next(), Prev(), Seek-related APIs being + // called + uint64_t iter_next_count; + uint64_t iter_prev_count; + uint64_t iter_seek_count; + // Time spent in encrypting data. Populated when EncryptedEnv is used. uint64_t encrypt_data_nanos; // Time spent in decrypting data. Populated when EncryptedEnv is used. uint64_t decrypt_data_nanos; uint64_t number_async_seek; +}; + +struct PerfContext : public PerfContextBase { + ~PerfContext(); + + PerfContext() {} + + PerfContext(const PerfContext&); + PerfContext& operator=(const PerfContext&); + PerfContext(PerfContext&&) noexcept; + + void Reset(); // reset all performance counters to zero + + std::string ToString(bool exclude_zero_counters = false) const; + + // enable per level perf context and allocate storage for PerfContextByLevel + void EnablePerLevelPerfContext(); + + // temporarily disable per level perf context by setting the flag to false + void DisablePerLevelPerfContext(); + + // free the space for PerfContextByLevel, also disable per level perf context + void ClearPerLevelPerfContext(); std::map* level_to_perf_context = nullptr; bool per_level_perf_context_enabled = false; + + void copyMetrics(const PerfContext* other) noexcept; }; // If RocksDB is compiled with -DNPERF_CONTEXT, then a pointer to a global, diff --git a/include/rocksdb/port_defs.h b/include/rocksdb/port_defs.h new file mode 100644 index 000000000000..68f1d61d237a --- /dev/null +++ b/include/rocksdb/port_defs.h @@ -0,0 +1,26 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file includes the common definitions used in the port/, +// the public API (this directory), and other directories + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +namespace port { +class CondVar; +} + +enum class CpuPriority { + kIdle = 0, + kLow = 1, + kNormal = 2, + kHigh = 3, +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h index 9cad6edf4aae..3515b1e953ba 100644 --- a/include/rocksdb/rate_limiter.h +++ b/include/rocksdb/rate_limiter.h @@ -40,6 +40,15 @@ class RateLimiter { // REQUIRED: bytes_per_second > 0 virtual void SetBytesPerSecond(int64_t bytes_per_second) = 0; + // This API allows user to dynamically change the max bytes can be granted in + // a single refill period (i.e, burst) + // + // REQUIRED: single_burst_bytes > 0. Otherwise `Status::InvalidArgument` will + // be returned. + virtual Status SetSingleBurstBytes(int64_t /* single_burst_bytes */) { + return Status::NotSupported(); + } + // Deprecated. 
New RateLimiter derived classes should override // Request(const int64_t, const Env::IOPriority, Statistics*) or // Request(const int64_t, const Env::IOPriority, Statistics*, OpType) diff --git a/include/rocksdb/secondary_cache.h b/include/rocksdb/secondary_cache.h index cb6f74450a50..49792ca67a5b 100644 --- a/include/rocksdb/secondary_cache.h +++ b/include/rocksdb/secondary_cache.h @@ -9,19 +9,32 @@ #include #include -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/customizable.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" namespace ROCKSDB_NAMESPACE { -// A handle for lookup result. The handle may not be immediately ready or -// have a valid value. The caller must call isReady() to determine if its -// ready, and call Wait() in order to block until it becomes ready. -// The caller must call Value() after it becomes ready to determine if the -// handle successfullly read the item. +// A handle for lookup result. Immediately after SecondaryCache::Lookup() with +// wait=false (and depending on the implementation), the handle could be in any +// of the below states. It must not be destroyed while in the pending state. +// * Pending state (IsReady() == false): result is not ready. Value() and Size() +// must not be called. +// * Ready + not found state (IsReady() == true, Value() == nullptr): the lookup +// has completed, finding no match. Or an error occurred that prevented +// normal completion of the Lookup. +// * Ready + found state (IsReady() == false, Value() != nullptr): the lookup +// has completed, finding an entry that has been loaded into an object that is +// now owned by the caller. +// +// Wait() or SecondaryCache::WaitAll() may be skipped if IsReady() happens to +// return true, but (depending on the implementation) IsReady() might never +// return true without Wait() or SecondaryCache::WaitAll(). After the handle +// is known ready, calling Value() is required to avoid a memory leak in case +// of a cache hit. class SecondaryCacheResultHandle { public: virtual ~SecondaryCacheResultHandle() = default; @@ -36,7 +49,9 @@ class SecondaryCacheResultHandle { // the lookup was unsuccessful. virtual Cache::ObjectPtr Value() = 0; - // Return the size of value + // Return the out_charge from the helper->create_cb used to construct the + // object. + // WART: potentially confusing name virtual size_t Size() = 0; }; @@ -57,36 +72,31 @@ class SecondaryCache : public Customizable { const std::string& id, std::shared_ptr* result); - // Insert the given value into this cache. Ownership of `value` is - // transferred to the callee, who is reponsible for deleting the value - // with helper->del_cb if del_cb is not nullptr. Unlike Cache::Insert(), - // the callee is responsible for such cleanup even in case of non-OK - // Status. - // Typically, the value is not saved directly but the implementation - // uses the SaveToCallback provided by helper to extract value's - // persistable data (typically uncompressed block), which will be written - // to this tier. The implementation may or may not write it to cache - // depending on the admission control policy, even if the return status - // is success (OK). - // - // If the implementation is asynchronous or otherwise uses `value` after - // the call returns, then InsertSaved() must be overridden not to rely on - // Insert(). For example, there could be a "holding area" in memory where - // Lookup() might return the same parsed value back. 
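// A minimal usage sketch for the SecondaryCacheResultHandle states documented
// above: given a handle returned by SecondaryCache::Lookup() with wait=false,
// wait if it is still pending and then take ownership of the object on a hit.
// The helper name TakeLookupResult is illustrative only.
#include <memory>
#include "rocksdb/secondary_cache.h"

rocksdb::Cache::ObjectPtr TakeLookupResult(
    std::unique_ptr<rocksdb::SecondaryCacheResultHandle> handle) {
  if (!handle) {
    return nullptr;  // Lookup() produced no result handle at all
  }
  if (!handle->IsReady()) {
    handle->Wait();  // block until the pending lookup completes
  }
  // nullptr means "ready + not found" (or an error); otherwise the object is
  // now owned by the caller, who must eventually free it per the helper.
  return handle->Value();
}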
But more typically, if - // the implementation only uses `value` for getting persistable data during - // the call, then the default implementation of `InsertSaved()` suffices. + // Suggest inserting an entry into this cache. The caller retains ownership + // of `obj` (also called the "value"), so is only used directly by the + // SecondaryCache during Insert(). When the cache chooses to perform the + // suggested insertion, it uses the size_cb and saveto_cb provided by + // `helper` to extract the persistable data (typically an uncompressed block) + // and writes it to this cache tier. OK may be returned even if the insertion + // is not made. virtual Status Insert(const Slice& key, Cache::ObjectPtr obj, - const Cache::CacheItemHelper* helper) = 0; + const Cache::CacheItemHelper* helper, + bool force_insert) = 0; // Insert a value from its saved/persistable data (typically uncompressed - // block), as if generated by SaveToCallback/SizeCallback. This can be used - // in "warming up" the cache from some auxiliary source, and like Insert() - // may or may not write it to cache depending on the admission control - // policy, even if the return status is success. + // block), as if generated by SaveToCallback/SizeCallback. The data can be + // compressed, in which case the type argument should specify the + // compression algorithm used. Additionally, the source argument should + // be set to the appropriate tier that will be responsible for + // uncompressing the data. // - // The default implementation assumes synchronous, non-escaping Insert(), - // wherein `value` is not used after return of Insert(). See Insert(). - virtual Status InsertSaved(const Slice& key, const Slice& saved); + // This method can be used in "warming up" the cache from some auxiliary + // source, and like Insert() may or may not write it to cache depending on + // the admission control policy, even if the return status is success. + virtual Status InsertSaved( + const Slice& key, const Slice& saved, + CompressionType type = CompressionType::kNoCompression, + CacheTier source = CacheTier::kVolatileTier) = 0; // Lookup the data for the given key in this cache. The create_cb // will be used to create the object. The handle returned may not be @@ -99,12 +109,12 @@ class SecondaryCache : public Customizable { // needs to return true. // This hint can also be safely ignored. // - // is_in_sec_cache is to indicate whether the handle is possibly erased - // from the secondary cache after the Lookup. + // kept_in_sec_cache is to indicate whether the entry will be kept in the + // secondary cache after the Lookup (rather than erased because of Lookup) virtual std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& is_in_sec_cache) = 0; + bool& kept_in_sec_cache) = 0; // Indicate whether a handle can be erased in this secondary cache. [[nodiscard]] virtual bool SupportForceErase() const = 0; @@ -130,6 +140,83 @@ class SecondaryCache : public Customizable { virtual Status GetCapacity(size_t& /* capacity */) { return Status::NotSupported(); } + + // Temporarily decrease the cache capacity in RAM by the specified amount. + // The caller should call Inflate() to restore the cache capacity. This is + // intended to be lighter weight than SetCapacity(). 
The latter evenly + // distributes the new capacity across all shards and is meant for large + // changes in capacity, whereas the former is meant for relatively small + // changes and may be uneven by lowering capacity in a single shard. + virtual Status Deflate(size_t /*decrease*/) { return Status::NotSupported(); } + + // Restore the capacity reduced by a prior call to Deflate(). + virtual Status Inflate(size_t /*increase*/) { return Status::NotSupported(); } }; +// A wrapper around a SecondaryCache object. A derived class may selectively +// override methods to implement a different behavior. +class SecondaryCacheWrapper : public SecondaryCache { + public: + explicit SecondaryCacheWrapper(std::shared_ptr target) + : target_(std::move(target)) {} + + virtual Status Insert(const Slice& key, Cache::ObjectPtr obj, + const Cache::CacheItemHelper* helper, + bool force_insert) override { + return target()->Insert(key, obj, helper, force_insert); + } + + virtual Status InsertSaved( + const Slice& key, const Slice& saved, + CompressionType type = CompressionType::kNoCompression, + CacheTier source = CacheTier::kVolatileTier) override { + return target()->InsertSaved(key, saved, type, source); + } + + virtual std::unique_ptr Lookup( + const Slice& key, const Cache::CacheItemHelper* helper, + Cache::CreateContext* create_context, bool wait, bool advise_erase, + bool& kept_in_sec_cache) override { + return target()->Lookup(key, helper, create_context, wait, advise_erase, + kept_in_sec_cache); + } + + virtual bool SupportForceErase() const override { + return target()->SupportForceErase(); + } + + virtual void Erase(const Slice& key) override { target()->Erase(key); } + + virtual void WaitAll( + std::vector handles) override { + target()->WaitAll(handles); + } + + virtual Status SetCapacity(size_t capacity) override { + return target()->SetCapacity(capacity); + } + + virtual Status GetCapacity(size_t& capacity) override { + return target()->GetCapacity(capacity); + } + + virtual Status Deflate(size_t decrease) override { + return target()->Deflate(decrease); + } + + virtual Status Inflate(size_t increase) override { + return target()->Inflate(increase); + } + + protected: + SecondaryCache* target() const { return target_.get(); } + + private: + std::shared_ptr target_; +}; + +// Useful for cache entries that just need to be copied into a +// secondary cache, such as compressed blocks +extern const Cache::CacheItemHelper kSliceCacheItemHelper; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/sst_dump_tool.h b/include/rocksdb/sst_dump_tool.h index 9261ba47d0d3..0b81833f7d5f 100644 --- a/include/rocksdb/sst_dump_tool.h +++ b/include/rocksdb/sst_dump_tool.h @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #pragma once #include "rocksdb/options.h" @@ -16,4 +15,3 @@ class SSTDumpTool { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h index 613292151219..b4e5a9bafa4b 100644 --- a/include/rocksdb/sst_file_manager.h +++ b/include/rocksdb/sst_file_manager.h @@ -93,7 +93,8 @@ class SstFileManager { // // @param env: Pointer to Env object, please see "rocksdb/env.h". // @param fs: Pointer to FileSystem object (rocksdb/file_system.h" -// @param info_log: If not nullptr, info_log will be used to log errors. 
+// @param info_log: If not nullptr, info_log will be used to log messages of +// INFO, WARN or ERROR level with respect to info_log's info level. // // == Deletion rate limiting specific arguments == // @param trash_dir: Deprecated, this argument have no effect diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h index 4b8642480977..026ae66d036e 100644 --- a/include/rocksdb/sst_file_reader.h +++ b/include/rocksdb/sst_file_reader.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/iterator.h" #include "rocksdb/options.h" @@ -44,4 +43,3 @@ class SstFileReader { } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index e0c7c9fe713c..04d8bd0b8bc0 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -14,6 +13,7 @@ #include "rocksdb/options.h" #include "rocksdb/table_properties.h" #include "rocksdb/types.h" +#include "rocksdb/wide_columns.h" #if defined(__GNUC__) || defined(__clang__) #define ROCKSDB_DEPRECATED_FUNC __attribute__((__deprecated__)) @@ -46,7 +46,7 @@ struct ExternalSstFileInfo { const std::string& _smallest_key, const std::string& _largest_key, SequenceNumber _sequence_number, uint64_t _file_size, - int32_t _num_entries, int32_t _version) + uint64_t _num_entries, int32_t _version) : file_path(_file_path), smallest_key(_smallest_key), largest_key(_largest_key), @@ -118,47 +118,68 @@ class SstFileWriter { Status Open(const std::string& file_path); // Add a Put key with value to currently opened file (deprecated) - // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. // REQUIRES: comparator is *not* timestamp-aware. ROCKSDB_DEPRECATED_FUNC Status Add(const Slice& user_key, const Slice& value); // Add a Put key with value to currently opened file - // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. // REQUIRES: comparator is *not* timestamp-aware. Status Put(const Slice& user_key, const Slice& value); // Add a Put (key with timestamp, value) to the currently opened file - // REQUIRES: key is after any previously added key according to the - // comparator. - // REQUIRES: the timestamp's size is equal to what is expected by - // the comparator. + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: timestamp's size is equal to what is expected by the comparator. Status Put(const Slice& user_key, const Slice& timestamp, const Slice& value); + // Add a PutEntity (key with the wide-column entity defined by "columns") to + // the currently opened file + Status PutEntity(const Slice& user_key, const WideColumns& columns); + // Add a Merge key with value to currently opened file - // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. // REQUIRES: comparator is *not* timestamp-aware. Status Merge(const Slice& user_key, const Slice& value); // Add a deletion key to currently opened file - // REQUIRES: key is after any previously added key according to comparator. 
+ // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. // REQUIRES: comparator is *not* timestamp-aware. Status Delete(const Slice& user_key); // Add a deletion key with timestamp to the currently opened file - // REQUIRES: key is after any previously added key according to the - // comparator. - // REQUIRES: the timestamp's size is equal to what is expected by - // the comparator. + // REQUIRES: user_key is after any previously added point (Put/Merge/Delete) + // key according to the comparator. + // REQUIRES: timestamp's size is equal to what is expected by the comparator. Status Delete(const Slice& user_key, const Slice& timestamp); - // Add a range deletion tombstone to currently opened file + // Add a range deletion tombstone to currently opened file. Such a range + // deletion tombstone does NOT delete point (Put/Merge/Delete) keys in the + // same file. + // + // Range deletion tombstones may be added in any order, both with respect to + // each other and with respect to the point (Put/Merge/Delete) keys in the + // same file. + // + // REQUIRES: The comparator orders `begin_key` at or before `end_key` // REQUIRES: comparator is *not* timestamp-aware. Status DeleteRange(const Slice& begin_key, const Slice& end_key); - // Add a range deletion tombstone to currently opened file. + // Add a range deletion tombstone to currently opened file. Such a range + // deletion tombstone does NOT delete point (Put/Merge/Delete) keys in the + // same file. + // + // Range deletion tombstones may be added in any order, both with respect to + // each other and with respect to the point (Put/Merge/Delete) keys in the + // same file. + // // REQUIRES: begin_key and end_key are user keys without timestamp. - // REQUIRES: the timestamp's size is equal to what is expected by - // the comparator. + // REQUIRES: The comparator orders `begin_key` at or before `end_key` + // REQUIRES: timestamp's size is equal to what is expected by the comparator. Status DeleteRange(const Slice& begin_key, const Slice& end_key, const Slice& timestamp); @@ -177,5 +198,3 @@ class SstFileWriter { std::unique_ptr rep_; }; } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 42a938f30c4b..ecddf5c7a948 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -19,13 +19,14 @@ namespace ROCKSDB_NAMESPACE { /** - * Keep adding tickers here. - * 1. Any ticker should be added immediately before TICKER_ENUM_MAX, taking - * over its old value. + * Keep adding tickers here. Note that the C++ enum values, unlike the values in + * the Java bindings, are not guaranteed to be stable; also, the C++ and Java + * values for any given ticker are not guaranteed to match. + * 1. Add the new ticker before TICKER_ENUM_MAX. * 2. Add a readable string in TickersNameMap below for the newly added ticker. - * 3. Add a corresponding enum value to TickerType.java in the java API - * 4. Add the enum conversions from Java and C++ to portal.h's toJavaTickerType - * and toCppTickers + * 3. Add a corresponding enum value to TickerType.java in the Java API. + * 4. Add the enum conversions from/to Java/C++ to portal.h's toJavaTickerType + * and toCppTickers. 
*/ enum Tickers : uint32_t { // total block cache misses @@ -50,8 +51,6 @@ enum Tickers : uint32_t { BLOCK_CACHE_INDEX_ADD, // # of bytes of index blocks inserted into cache BLOCK_CACHE_INDEX_BYTES_INSERT, - // # of bytes of index block erased from cache - BLOCK_CACHE_INDEX_BYTES_EVICT, // # of times cache miss when accessing filter block from block cache. BLOCK_CACHE_FILTER_MISS, // # of times cache hit when accessing filter block from block cache. @@ -60,8 +59,6 @@ enum Tickers : uint32_t { BLOCK_CACHE_FILTER_ADD, // # of bytes of bloom filter blocks inserted into cache BLOCK_CACHE_FILTER_BYTES_INSERT, - // # of bytes of bloom filter block erased from cache - BLOCK_CACHE_FILTER_BYTES_EVICT, // # of times cache miss when accessing data block from block cache. BLOCK_CACHE_DATA_MISS, // # of times cache hit when accessing data block from block cache. @@ -83,8 +80,6 @@ enum Tickers : uint32_t { // exist. BLOOM_FILTER_FULL_TRUE_POSITIVE, - BLOOM_FILTER_MICROS, - // # persistent cache hit PERSISTENT_CACHE_HIT, // # persistent cache miss @@ -147,39 +142,31 @@ enum Tickers : uint32_t { // The number of uncompressed bytes read from an iterator. // Includes size of key and value. ITER_BYTES_READ, - NO_FILE_CLOSES, NO_FILE_OPENS, NO_FILE_ERRORS, - // DEPRECATED Time system had to wait to do LO-L1 compactions - STALL_L0_SLOWDOWN_MICROS, - // DEPRECATED Time system had to wait to move memtable to L1. - STALL_MEMTABLE_COMPACTION_MICROS, - // DEPRECATED write throttle because of too many files in L0 - STALL_L0_NUM_FILES_MICROS, // Writer has to wait for compaction or flush to finish. STALL_MICROS, // The wait time for db mutex. // Disabled by default. To enable it set stats level to kAll DB_MUTEX_WAIT_MICROS, - RATE_LIMIT_DELAY_MILLIS, - // DEPRECATED number of iterators currently open - NO_ITERATORS, // Number of MultiGet calls, keys read, and bytes read NUMBER_MULTIGET_CALLS, NUMBER_MULTIGET_KEYS_READ, NUMBER_MULTIGET_BYTES_READ, - // Number of deletes records that were not required to be - // written to storage because key does not exist - NUMBER_FILTERED_DELETES, NUMBER_MERGE_FAILURES, - // number of times bloom was checked before creating iterator on a - // file, and the number of times the check was useful in avoiding - // iterator creation (and thus likely IOPs). + // Prefix filter stats when used for point lookups (Get / MultiGet). + // (For prefix filter stats on iterators, see *_LEVEL_SEEK_*.) + // Checked: filter was queried BLOOM_FILTER_PREFIX_CHECKED, + // Useful: filter returned false so prevented accessing data+index blocks BLOOM_FILTER_PREFIX_USEFUL, + // True positive: found a key matching the point query. When another key + // with the same prefix matches, it is considered a false positive by + // these statistics even though the filter returned a true positive. + BLOOM_FILTER_PREFIX_TRUE_POSITIVE, // Number of times we had to reseek inside an iteration to skip // over large number of keys with same userkey. @@ -188,12 +175,6 @@ enum Tickers : uint32_t { // Record the number of calls to GetUpdatesSince. 
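// A minimal usage sketch for reading the prefix-filter point-lookup tickers
// above from a Statistics object, assuming statistics were enabled via
// Options::statistics when the DB was opened; the helper name is illustrative.
#include <cstdint>
#include <memory>
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"

void LogPrefixFilterEffectiveness(const rocksdb::Options& opts) {
  const std::shared_ptr<rocksdb::Statistics>& stats = opts.statistics;
  if (!stats) {
    return;  // e.g. set opts.statistics = rocksdb::CreateDBStatistics() at open
  }
  uint64_t checked =
      stats->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_CHECKED);
  uint64_t useful = stats->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_USEFUL);
  uint64_t true_pos =
      stats->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_TRUE_POSITIVE);
  // useful / checked approximates the fraction of point lookups where the
  // prefix filter avoided data+index block reads; true_pos counts the checks
  // that went on to find a matching key.
  (void)checked;
  (void)useful;
  (void)true_pos;
}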
Useful to keep track of // transaction log iterator refreshes GET_UPDATES_SINCE_CALLS, - BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache - BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache - // Number of blocks added to compressed block cache - BLOCK_CACHE_COMPRESSED_ADD, - // Number of failures when adding blocks to compressed block cache - BLOCK_CACHE_COMPRESSED_ADD_FAILURES, WAL_FILE_SYNCED, // Number of times WAL sync is done WAL_FILE_BYTES, // Number of bytes written to WAL @@ -201,7 +182,6 @@ enum Tickers : uint32_t { // head of the writers queue. WRITE_DONE_BY_SELF, WRITE_DONE_BY_OTHER, // Equivalent to writes done for others - WRITE_TIMEDOUT, // Number of writes ending up with timed-out. WRITE_WITH_WAL, // Number of Write calls that request WAL COMPACT_READ_BYTES, // Bytes read during compaction COMPACT_WRITE_BYTES, // Bytes written during compaction @@ -226,9 +206,13 @@ enum Tickers : uint32_t { NUMBER_BLOCK_COMPRESSED, NUMBER_BLOCK_DECOMPRESSED, + // DEPRECATED / unused (see NUMBER_BLOCK_COMPRESSION_*) NUMBER_BLOCK_NOT_COMPRESSED, + + // Tickers that record cumulative time. MERGE_OPERATION_TOTAL_TIME, FILTER_OPERATION_TOTAL_TIME, + COMPACTION_CPU_TOTAL_TIME, // Row cache. ROW_CACHE_HIT, @@ -312,20 +296,8 @@ enum Tickers : uint32_t { BLOB_DB_GC_NUM_NEW_FILES, // # of BlobDB garbage collection failures. Only applicable to legacy BlobDB. BLOB_DB_GC_FAILURES, - // # of keys dropped by BlobDB garbage collection because they had been - // overwritten. DEPRECATED. - BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, - // # of keys dropped by BlobDB garbage collection because of expiration. - // DEPRECATED. - BLOB_DB_GC_NUM_KEYS_EXPIRED, // # of keys relocated to new blob file by garbage collection. BLOB_DB_GC_NUM_KEYS_RELOCATED, - // # of bytes dropped by BlobDB garbage collection because they had been - // overwritten. DEPRECATED. - BLOB_DB_GC_BYTES_OVERWRITTEN, - // # of bytes dropped by BlobDB garbage collection because of expiration. - // DEPRECATED. - BLOB_DB_GC_BYTES_EXPIRED, // # of bytes relocated to new blob file by garbage collection. BLOB_DB_GC_BYTES_RELOCATED, // # of blob files evicted because of BlobDB is full. Only applicable to @@ -363,7 +335,6 @@ enum Tickers : uint32_t { BLOCK_CACHE_COMPRESSION_DICT_HIT, BLOCK_CACHE_COMPRESSION_DICT_ADD, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, - BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, // # of blocks redundantly inserted into block cache. // REQUIRES: BLOCK_CACHE_ADD_REDUNDANT <= BLOCK_CACHE_ADD @@ -385,14 +356,22 @@ enum Tickers : uint32_t { // # of files marked as trash by sst file manager and will be deleted // later by background thread. FILES_MARKED_TRASH, - // # of files deleted immediately by sst file manger through delete scheduler. + // # of trash files deleted by the background thread from the trash queue. + FILES_DELETED_FROM_TRASH_QUEUE, + // # of files deleted immediately by sst file manager through delete + // scheduler. FILES_DELETED_IMMEDIATELY, // The counters for error handler, not that, bg_io_error is the subset of - // bg_error and bg_retryable_io_error is the subset of bg_io_error + // bg_error and bg_retryable_io_error is the subset of bg_io_error. + // The misspelled versions are deprecated and only kept for compatibility. + // TODO: remove the misspelled tickers in the next major release. 
ERROR_HANDLER_BG_ERROR_COUNT, + ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED, ERROR_HANDLER_BG_IO_ERROR_COUNT, + ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED, ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED, ERROR_HANDLER_AUTORESUME_COUNT, ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT, @@ -431,7 +410,39 @@ enum Tickers : uint32_t { NON_LAST_LEVEL_READ_BYTES, NON_LAST_LEVEL_READ_COUNT, + // Statistics on iterator Seek() (and variants) for each sorted run. I.e. a + // single user Seek() can result in many sorted run Seek()s. + // The stats are split between last level and non-last level. + // Filtered: a filter such as prefix Bloom filter indicate the Seek() would + // not find anything relevant, so avoided a likely access to data+index + // blocks. + LAST_LEVEL_SEEK_FILTERED, + // Filter match: a filter such as prefix Bloom filter was queried but did + // not filter out the seek. + LAST_LEVEL_SEEK_FILTER_MATCH, + // At least one data block was accessed for a Seek() (or variant) on a + // sorted run. + LAST_LEVEL_SEEK_DATA, + // At least one value() was accessed for the seek (suggesting it was useful), + // and no filter such as prefix Bloom was queried. + LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + // At least one value() was accessed for the seek (suggesting it was useful), + // after querying a filter such as prefix Bloom. + LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + // The same set of stats, but for non-last level seeks. + NON_LAST_LEVEL_SEEK_FILTERED, + NON_LAST_LEVEL_SEEK_FILTER_MATCH, + NON_LAST_LEVEL_SEEK_DATA, + NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + + // Number of block checksum verifications BLOCK_CHECKSUM_COMPUTE_COUNT, + // Number of times RocksDB detected a corruption while verifying a block + // checksum. RocksDB does not remember corruptions that happened during user + // reads so the same block corruption may be detected multiple times. + BLOCK_CHECKSUM_MISMATCH_COUNT, + MULTIGET_COROUTINE_COUNT, // Integrated BlobDB specific stats @@ -453,6 +464,73 @@ enum Tickers : uint32_t { // Number of errors returned to the async read callback ASYNC_READ_ERROR_COUNT, + // Fine grained secondary cache stats + SECONDARY_CACHE_FILTER_HITS, + SECONDARY_CACHE_INDEX_HITS, + SECONDARY_CACHE_DATA_HITS, + + // Number of lookup into the prefetched tail (see + // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + // that can't find its data for table open + TABLE_OPEN_PREFETCH_TAIL_MISS, + // Number of lookup into the prefetched tail (see + // `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + // that finds its data for table open + TABLE_OPEN_PREFETCH_TAIL_HIT, + + // Statistics on the filtering by user-defined timestamps + // # of times timestamps are checked on accessing the table + TIMESTAMP_FILTER_TABLE_CHECKED, + // # of times timestamps can successfully help skip the table access + TIMESTAMP_FILTER_TABLE_FILTERED, + + // Number of input bytes (uncompressed) to compression for SST blocks that + // are stored compressed. + BYTES_COMPRESSED_FROM, + // Number of output bytes (compressed) from compression for SST blocks that + // are stored compressed. + BYTES_COMPRESSED_TO, + // Number of uncompressed bytes for SST blocks that are stored uncompressed + // because compression type is kNoCompression, or some error case caused + // compression not to run or produce an output. Index blocks are only counted + // if enable_index_compression is true. 
+ BYTES_COMPRESSION_BYPASSED, + // Number of input bytes (uncompressed) to compression for SST blocks that + // are stored uncompressed because the compression result was rejected, + // either because the ratio was not acceptable (see + // CompressionOptions::max_compressed_bytes_per_kb) or found invalid by the + // `verify_compression` option. + BYTES_COMPRESSION_REJECTED, + + // Like BYTES_COMPRESSION_BYPASSED but counting number of blocks + NUMBER_BLOCK_COMPRESSION_BYPASSED, + // Like BYTES_COMPRESSION_REJECTED but counting number of blocks + NUMBER_BLOCK_COMPRESSION_REJECTED, + + // Number of input bytes (compressed) to decompression in reading compressed + // SST blocks from storage. + BYTES_DECOMPRESSED_FROM, + // Number of output bytes (uncompressed) from decompression in reading + // compressed SST blocks from storage. + BYTES_DECOMPRESSED_TO, + + // Number of times readahead is trimmed during scans when + // ReadOptions.auto_readahead_size is set. + READAHEAD_TRIMMED, + + // Number of FIFO compactions that drop files based on different reasons + FIFO_MAX_SIZE_COMPACTIONS, + FIFO_TTL_COMPACTIONS, + + // Number of bytes prefetched during user initiated scan + PREFETCH_BYTES, + + // Number of prefetched bytes that were actually useful + PREFETCH_BYTES_USEFUL, + + // Number of FS reads avoided due to scan prefetching + PREFETCH_HITS, + TICKER_ENUM_MAX }; @@ -461,12 +539,15 @@ enum Tickers : uint32_t { extern const std::vector> TickersNameMap; /** - * Keep adding histogram's here. - * Any histogram should have value less than HISTOGRAM_ENUM_MAX - * Add a new Histogram by assigning it the current value of HISTOGRAM_ENUM_MAX - * Add a string representation in HistogramsNameMap below - * And increment HISTOGRAM_ENUM_MAX - * Add a corresponding enum value to HistogramType.java in the java API + * Keep adding histograms here. Note that the C++ enum values, unlike the values + * in the Java bindings, are not guaranteed to be stable; also, the C++ and Java + * values for any given histogram are not guaranteed to match. + * 1. Add the new histogram before HISTOGRAM_ENUM_MAX. + * 2. Add a readable string in HistogramsNameMap below for the newly added + * histogram. + * 3. Add a corresponding enum value to HistogramType.java in the Java API. + * 4. Add the enum conversions from/to Java/C++ to portal.h's + * toJavaHistogramsType and toCppHistograms. 
*/ enum Histograms : uint32_t { DB_GET = 0, @@ -484,15 +565,24 @@ enum Histograms : uint32_t { READ_BLOCK_COMPACTION_MICROS, READ_BLOCK_GET_MICROS, WRITE_RAW_BLOCK_MICROS, - STALL_L0_SLOWDOWN_COUNT, - STALL_MEMTABLE_COMPACTION_COUNT, - STALL_L0_NUM_FILES_COUNT, - HARD_RATE_LIMIT_DELAY_COUNT, - SOFT_RATE_LIMIT_DELAY_COUNT, NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, WRITE_STALL, + // Time spent in reading block-based or plain SST table SST_READ_MICROS, + // Time spent in reading SST table (currently only block-based table) or blob + // file corresponding to `Env::IOActivity` + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + FILE_READ_DB_OPEN_MICROS, + // The following `FILE_READ_*` require stats level greater than + // `StatsLevel::kExceptDetailedTimers` + FILE_READ_GET_MICROS, + FILE_READ_MULTIGET_MICROS, + FILE_READ_DB_ITERATOR_MICROS, + FILE_READ_VERIFY_DB_CHECKSUM_MICROS, + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS, + // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, // Value size distribution in each operation @@ -500,10 +590,8 @@ enum Histograms : uint32_t { BYTES_PER_WRITE, BYTES_PER_MULTIGET, - // number of bytes compressed/decompressed - // number of bytes is when uncompressed; i.e. before/after respectively - BYTES_COMPRESSED, - BYTES_DECOMPRESSED, + BYTES_COMPRESSED, // DEPRECATED / unused (see BYTES_COMPRESSED_{FROM,TO}) + BYTES_DECOMPRESSED, // DEPRECATED / unused (see BYTES_DECOMPRESSED_{FROM,TO}) COMPRESSION_TIMES_NANOS, DECOMPRESSION_TIMES_NANOS, // Number of merge operands passed to the merge operator in user read @@ -535,8 +623,6 @@ enum Histograms : uint32_t { BLOB_DB_BLOB_FILE_READ_MICROS, // Blob file sync latency. BLOB_DB_BLOB_FILE_SYNC_MICROS, - // BlobDB garbage collection time. DEPRECATED. - BLOB_DB_GC_MICROS, // BlobDB compression time. BLOB_DB_COMPRESSION_MICROS, // BlobDB decompression time. @@ -548,9 +634,6 @@ enum Histograms : uint32_t { // MultiGet stats logged per level // Num of index and filter blocks read from file system per level. NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, - // Num of data blocks read from file system per level. - // Obsolete - NUM_DATA_BLOCKS_READ_PER_LEVEL, // Num of sst files read from file system per level. 
NUM_SST_READ_PER_LEVEL, @@ -573,7 +656,11 @@ enum Histograms : uint32_t { // Wait time for aborting async read in FilePrefetchBuffer destructor ASYNC_PREFETCH_ABORT_MICROS, - HISTOGRAM_ENUM_MAX, + // Number of bytes read for RocksDB's prefetching contents (as opposed to file + // system's prefetch) from the end of SST table during block based table open + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + + HISTOGRAM_ENUM_MAX }; extern const std::vector> HistogramsNameMap; @@ -645,7 +732,7 @@ class Statistics : public Customizable { virtual void histogramData(uint32_t type, HistogramData* const data) const = 0; virtual std::string getHistogramString(uint32_t /*type*/) const { return ""; } - virtual void recordTick(uint32_t tickerType, uint64_t count = 0) = 0; + virtual void recordTick(uint32_t tickerType, uint64_t count = 1) = 0; virtual void setTickerCount(uint32_t tickerType, uint64_t count) = 0; virtual uint64_t getAndResetTickerCount(uint32_t tickerType) = 0; virtual void reportTimeToHistogram(uint32_t histogramType, uint64_t time) { @@ -672,9 +759,7 @@ class Statistics : public Customizable { // Resets all ticker and histogram stats virtual Status Reset() { return Status::NotSupported("Not implemented"); } -#ifndef ROCKSDB_LITE using Customizable::ToString; -#endif // ROCKSDB_LITE // String representation of the statistic object. Must be thread-safe. virtual std::string ToString() const { // Do nothing by default diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 39af9455991c..82597239fff7 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -113,6 +113,8 @@ class Status { kOverwritten = 12, kTxnNotPrepared = 13, kIOFenced = 14, + kMergeOperatorFailed = 15, + kMergeOperandThresholdExceeded = 16, kMaxSubCode }; @@ -149,6 +151,25 @@ class Status { return state_.get(); } + // Override this status with another, unless this status is already non-ok. + // Returns *this. Thus, the result of `a.UpdateIfOk(b).UpdateIfOk(c)` is + // non-ok (and `a` modified as such) iff any input was non-ok, with + // left-most taking precedence as far as the details. + Status& UpdateIfOk(Status&& s) { + if (code() == kOk) { + *this = std::move(s); + } else { + // Alright to ignore that status as long as this one is checked + s.PermitUncheckedError(); + } + MustCheck(); + return *this; + } + + Status& UpdateIfOk(const Status& s) { + return UpdateIfOk(std::forward(Status(s))); + } + // Return a success status. static Status OK() { return Status(); } @@ -158,6 +179,14 @@ class Status { // changing public APIs. static Status OkOverwritten() { return Status(kOk, kOverwritten); } + // Successful, though the number of operands merged during the query exceeded + // the threshold. Note: using variants of OK status for program logic is + // discouraged, but it can be useful for communicating statistical information + // without changing public APIs. + static Status OkMergeOperandThresholdExceeded() { + return Status(kOk, kMergeOperandThresholdExceeded); + } + // Return error status of an appropriate type. 
static Status NotFound(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kNotFound, msg, msg2); @@ -300,6 +329,13 @@ class Status { return code() == kOk && subcode() == kOverwritten; } + // Returns true iff the status indicates success *with* the number of operands + // merged exceeding the threshold + bool IsOkMergeOperandThresholdExceeded() const { + MarkChecked(); + return code() == kOk && subcode() == kMergeOperandThresholdExceeded; + } + // Returns true iff the status indicates a NotFound error. bool IsNotFound() const { MarkChecked(); diff --git a/include/rocksdb/system_clock.h b/include/rocksdb/system_clock.h index 486183d60e60..c4cfcecb552d 100644 --- a/include/rocksdb/system_clock.h +++ b/include/rocksdb/system_clock.h @@ -9,9 +9,11 @@ #pragma once #include +#include #include #include "rocksdb/customizable.h" +#include "rocksdb/port_defs.h" #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/status.h" @@ -68,6 +70,14 @@ class SystemClock : public Customizable { // Sleep/delay the thread for the prescribed number of micro-seconds. virtual void SleepForMicroseconds(int micros) = 0; + // For internal use/extension only. + // + // Issues a wait on `cv` that times out at `deadline`. May wakeup and return + // spuriously. + // + // Returns true if wait timed out, false otherwise + virtual bool TimedWait(port::CondVar* cv, std::chrono::microseconds deadline); + // Get the number of seconds since the Epoch, 1970-01-01 00:00:00 (UTC). // Only overwrites *unix_time on success. virtual Status GetCurrentTime(int64_t* unix_time) = 0; @@ -94,6 +104,11 @@ class SystemClockWrapper : public SystemClock { return target_->SleepForMicroseconds(micros); } + virtual bool TimedWait(port::CondVar* cv, + std::chrono::microseconds deadline) override { + return target_->TimedWait(cv, deadline); + } + Status GetCurrentTime(int64_t* unix_time) override { return target_->GetCurrentTime(unix_time); } @@ -103,10 +118,8 @@ class SystemClockWrapper : public SystemClock { } Status PrepareOptions(const ConfigOptions& options) override; -#ifndef ROCKSDB_LITE std::string SerializeOptions(const ConfigOptions& config_options, const std::string& header) const override; -#endif // ROCKSDB_LITE const Customizable* Inner() const override { return target_.get(); } protected: diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 3a2bf26299e3..d19a95fa8e4d 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -47,7 +47,10 @@ struct EnvOptions; // Types of checksums to use for checking integrity of logical blocks within // files. All checksums currently use 32 bits of checking power (1 in 4B -// chance of failing to detect random corruption). +// chance of failing to detect random corruption). Traditionally, the actual +// checking power can be far from ideal if the corruption is due to misplaced +// data (e.g. physical blocks out of order in a file, or from another file), +// which is fixed in format_version=6 (see below). enum ChecksumType : char { kNoChecksum = 0x0, kCRC32c = 0x1, @@ -259,22 +262,13 @@ struct BlockBasedTableOptions { bool no_block_cache = false; // If non-NULL use the specified cache for blocks. - // If NULL, rocksdb will automatically create and use an 8MB internal cache. + // If NULL, rocksdb will automatically create and use a 32MB internal cache. 
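The TimedWait() hook above is documented as being for internal use/extension. Purely as an illustration of the override point, a wrapper clock that forwards it might look like the following; it assumes the usual SystemClockWrapper constructor taking the wrapped clock:

#include <rocksdb/system_clock.h>

class InstrumentedClock : public rocksdb::SystemClockWrapper {
 public:
  explicit InstrumentedClock(const std::shared_ptr<rocksdb::SystemClock>& t)
      : SystemClockWrapper(t) {}
  const char* Name() const override { return "InstrumentedClock"; }
  bool TimedWait(rocksdb::port::CondVar* cv,
                 std::chrono::microseconds deadline) override {
    // Count or log the wait here, then defer to the wrapped clock.
    return SystemClockWrapper::TimedWait(cv, deadline);
  }
};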
std::shared_ptr block_cache = nullptr; // If non-NULL use the specified cache for pages read from device // IF NULL, no page cache is used std::shared_ptr persistent_cache = nullptr; - // DEPRECATED: This feature is planned for removal in a future release. - // Use SecondaryCache instead. - // - // If non-NULL use the specified cache for compressed blocks. - // If NULL, rocksdb will not use a compressed block cache. - // Note: though it looks similar to `block_cache`, RocksDB doesn't put the - // same type of object there. - std::shared_ptr block_cache_compressed = nullptr; - // Approximate size of user data packed per block. Note that the // block size specified here corresponds to uncompressed data. The // actual size of the unit read from disk may be smaller if @@ -521,6 +515,9 @@ struct BlockBasedTableOptions { // 5 -- Can be read by RocksDB's versions since 6.6.0. Full and partitioned // filters use a generally faster and more accurate Bloom filter // implementation, with a different schema. + // 6 -- Modified the file footer and checksum matching so that SST data + // misplaced within or between files is as likely to fail checksum + // verification as random corruption. Also checksum-protects SST footer. uint32_t format_version = 5; // Store index blocks on disk in compressed format. Changing this option to @@ -680,7 +677,6 @@ struct BlockBasedTablePropertyNames { extern TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options = BlockBasedTableOptions()); -#ifndef ROCKSDB_LITE enum EncodingType : char { // Always write full keys without any special encoding. @@ -837,7 +833,6 @@ struct CuckooTableOptions { extern TableFactory* NewCuckooTableFactory( const CuckooTableOptions& table_options = CuckooTableOptions()); -#endif // ROCKSDB_LITE class RandomAccessFileReader; @@ -919,7 +914,6 @@ class TableFactory : public Customizable { virtual bool IsDeleteRangeSupported() const { return false; } }; -#ifndef ROCKSDB_LITE // Create a special table factory that can open either of the supported // table formats, based on setting inside the SST files. It should be used to // convert a DB from one table format to another. @@ -935,6 +929,5 @@ extern TableFactory* NewAdaptiveTableFactory( std::shared_ptr plain_table_factory = nullptr, std::shared_ptr cuckoo_table_factory = nullptr); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index cbe87fa3af1a..052df35035b5 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -70,6 +70,8 @@ struct TablePropertiesNames { static const std::string kSlowCompressionEstimatedDataSize; static const std::string kFastCompressionEstimatedDataSize; static const std::string kSequenceNumberTimeMapping; + static const std::string kTailStartOffset; + static const std::string kUserDefinedTimestampsPersisted; }; // `TablePropertiesCollector` provides the mechanism for users to collect @@ -120,12 +122,15 @@ class TablePropertiesCollector { // Finish() will be called when a table has already been built and is ready // for writing the properties block. + // It will be called only once by RocksDB internal. + // // @params properties User will add their collected statistics to // `properties`. virtual Status Finish(UserCollectedProperties* properties) = 0; // Return the human-readable properties, where the key is property name and // the value is the human-readable form of value. 
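Reflecting the table changes above (the compressed block cache is gone, the internal default cache grew to 32MB, and format_version=6 adds misplacement-resistant checksums plus a checksum-protected footer), a minimal options sketch; the cache size is illustrative:

rocksdb::BlockBasedTableOptions table_opts;
table_opts.block_cache = rocksdb::NewLRUCache(512 << 20);  // explicit cache instead of the 32MB default
table_opts.format_version = 6;  // see the format_version comment above for compatibility implications
rocksdb::Options options;
options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));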
+ // It will only be called after Finish() has been called by RocksDB internal. virtual UserCollectedProperties GetReadableProperties() const = 0; // The name of the properties collector can be used for debugging purpose. @@ -217,9 +222,20 @@ struct TableProperties { // by column_family_name. uint64_t column_family_id = ROCKSDB_NAMESPACE:: TablePropertiesCollectorFactory::Context::kUnknownColumnFamily; - // Timestamp of the latest key. 0 means unknown. - // TODO(sagar0): Should be changed to latest_key_time ... but don't know the - // full implications of backward compatibility. Hence retaining for now. + + // Oldest ancester time. 0 means unknown. + // + // For flush output file, oldest ancestor time is the oldest key time in the + // file. If the oldest key time is not available, flush time is used. + // + // For compaction output file, oldest ancestor time is the oldest + // among all the oldest key time of its input files, since the file could be + // the compaction output from other SST files, which could in turn be outputs + // for compact older SST files. If that's not available, creation time of this + // compaction output file is used. + // + // TODO(sagar0): Should be changed to oldest_ancester_time ... but don't know + // the full implications of backward compatibility. Hence retaining for now. uint64_t creation_time = 0; // Timestamp of the earliest key. 0 means unknown. @@ -239,6 +255,15 @@ struct TableProperties { // 0 means not exists. uint64_t external_sst_file_global_seqno_offset = 0; + // Offset where the "tail" part of SST file starts + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_start_offset = 0; + + // Value of the `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` + // when the file is created. Default to be true, only when this flag is false, + // it's explicitly written to meta properties block. + uint64_t user_defined_timestamps_persisted = 1; + // DB identity // db_id is an identifier generated the first time the DB is created // If DB identity is unset or unassigned, `db_id` will be an empty string. diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index 1b5f8c046328..5bc6eeb2b440 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -22,7 +22,7 @@ #include "rocksdb/rocksdb_namespace.h" -#if !defined(ROCKSDB_LITE) && !defined(NROCKSDB_THREAD_STATUS) +#if !defined(NROCKSDB_THREAD_STATUS) #define ROCKSDB_USING_THREAD_STATUS #endif @@ -56,6 +56,14 @@ struct ThreadStatus { OP_UNKNOWN = 0, OP_COMPACTION, OP_FLUSH, + OP_DBOPEN, + OP_GET, + OP_MULTIGET, + OP_DBITERATOR, + OP_VERIFY_DB_CHECKSUM, + OP_VERIFY_FILE_CHECKSUMS, + OP_GETENTITY, + OP_MULTIGETENTITY, NUM_OP_TYPES }; diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index 6fb53d8466f3..c9c2146865a1 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -7,6 +7,9 @@ #include +#include +#include + #include "rocksdb/slice.h" namespace ROCKSDB_NAMESPACE { @@ -18,6 +21,10 @@ using ColumnFamilyId = uint32_t; // Represents a sequence number in a WAL file. 
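A small collector sketch that follows the contract spelled out above, namely that Finish() is called exactly once and GetReadableProperties() only after Finish(); the property name "my.key-count" is made up for illustration:

#include <string>
#include <rocksdb/table_properties.h>

class KeyCountCollector : public rocksdb::TablePropertiesCollector {
 public:
  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
                             const rocksdb::Slice& /*value*/,
                             rocksdb::EntryType /*type*/,
                             rocksdb::SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) override {
    ++count_;
    return rocksdb::Status::OK();
  }
  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    props->emplace("my.key-count", std::to_string(count_));
    return rocksdb::Status::OK();
  }
  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return {{"my.key-count", std::to_string(count_)}};
  }
  const char* Name() const override { return "KeyCountCollector"; }

 private:
  uint64_t count_ = 0;
};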
using SequenceNumber = uint64_t; +struct TableProperties; +using TablePropertiesCollection = + std::unordered_map>; + const SequenceNumber kMinUnCommittedSeq = 1; // 0 is always committed enum class TableFileCreationReason { @@ -63,4 +70,32 @@ enum EntryType { kEntryOther, }; +enum class WriteStallCause { + // Beginning of CF-scope write stall causes + // + // Always keep `kMemtableLimit` as the first stat in this section + kMemtableLimit, + kL0FileCountLimit, + kPendingCompactionBytes, + kCFScopeWriteStallCauseEnumMax, + // End of CF-scope write stall causes + + // Beginning of DB-scope write stall causes + // + // Always keep `kWriteBufferManagerLimit` as the first stat in this section + kWriteBufferManagerLimit, + kDBScopeWriteStallCauseEnumMax, + // End of DB-scope write stall causes + + // Always add new WriteStallCause before `kNone` + kNone, +}; + +enum class WriteStallCondition { + kDelayed, + kStopped, + // Always add new WriteStallCondition before `kNormal` + kNormal, +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/backup_engine.h b/include/rocksdb/utilities/backup_engine.h index 892c9493240a..204d12d6fea5 100644 --- a/include/rocksdb/utilities/backup_engine.h +++ b/include/rocksdb/utilities/backup_engine.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -688,4 +687,3 @@ class BackupEngineReadOnly : public BackupEngineReadOnlyBase { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/cache_dump_load.h b/include/rocksdb/utilities/cache_dump_load.h index fde03db7e68e..8b91bb7e15b6 100644 --- a/include/rocksdb/utilities/cache_dump_load.h +++ b/include/rocksdb/utilities/cache_dump_load.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include @@ -139,4 +138,3 @@ Status NewDefaultCacheDumpedLoader( std::unique_ptr* cache_dump_loader); } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index 6046513aba4b..6509f38d9693 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -6,7 +6,6 @@ // A checkpoint is an openable snapshot of a database at a point in time. #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -62,4 +61,3 @@ class Checkpoint { }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/utilities/customizable_util.h b/include/rocksdb/utilities/customizable_util.h index 62240763b5bb..adf254054079 100644 --- a/include/rocksdb/utilities/customizable_util.h +++ b/include/rocksdb/utilities/customizable_util.h @@ -13,7 +13,6 @@ // for more information on how to develop and use customizable objects #pragma once -#include #include #include @@ -24,24 +23,6 @@ #include "rocksdb/utilities/object_registry.h" namespace ROCKSDB_NAMESPACE { -// The FactoryFunc functions are used to create a new customizable object -// without going through the ObjectRegistry. This methodology is especially -// useful in LITE mode, where there is no ObjectRegistry. The methods take -// in an ID of the object to create and a pointer to store the created object. -// If the factory successfully recognized the input ID, the method should return -// success; otherwise false should be returned. On success, the object -// parameter contains the new object. 
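The write-stall enums newly exposed above group CF-scope and DB-scope causes around sentinel values. A typical consumer just maps them to strings for logging; a small sketch, with the default branch covering the sentinels and kNone:

const char* WriteStallCauseToString(rocksdb::WriteStallCause c) {
  switch (c) {
    case rocksdb::WriteStallCause::kMemtableLimit: return "memtable-limit";
    case rocksdb::WriteStallCause::kL0FileCountLimit: return "l0-file-count-limit";
    case rocksdb::WriteStallCause::kPendingCompactionBytes: return "pending-compaction-bytes";
    case rocksdb::WriteStallCause::kWriteBufferManagerLimit: return "write-buffer-manager-limit";
    default: return "none";
  }
}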
-template -using SharedFactoryFunc = - std::function*)>; - -template -using UniqueFactoryFunc = - std::function*)>; - -template -using StaticFactoryFunc = std::function; - // Creates a new shared customizable instance object based on the // input parameters using the object registry. // @@ -69,11 +50,7 @@ static Status NewSharedObject( std::shared_ptr* result) { if (!id.empty()) { Status status; -#ifndef ROCKSDB_LITE status = config_options.registry->NewSharedObject(id, result); -#else - status = Status::NotSupported("Cannot load object in LITE mode ", id); -#endif // ROCKSDB_LITE if (config_options.ignore_unsupported_options && status.IsNotSupported()) { status = Status::OK(); } else if (status.ok()) { @@ -123,16 +100,10 @@ static Status NewManagedObject( std::shared_ptr* result) { Status status; if (!id.empty()) { -#ifndef ROCKSDB_LITE status = config_options.registry->GetOrCreateManagedObject( id, result, [config_options, opt_map](T* object) { return object->ConfigureFromMap(config_options, opt_map); }); -#else - (void)result; - (void)opt_map; - status = Status::NotSupported("Cannot load object in LITE mode ", id); -#endif // ROCKSDB_LITE if (config_options.ignore_unsupported_options && status.IsNotSupported()) { return Status::OK(); } @@ -166,12 +137,10 @@ static Status NewManagedObject( // handled // @param value Either the simple name of the instance to create, or a set of // name-value pairs to create and initailize the object -// @param func Optional function to call to attempt to create an instance // @param result The newly created instance. template static Status LoadSharedObject(const ConfigOptions& config_options, const std::string& value, - const SharedFactoryFunc& func, std::shared_ptr* result) { std::string id; std::unordered_map opt_map; @@ -180,12 +149,8 @@ static Status LoadSharedObject(const ConfigOptions& config_options, value, &id, &opt_map); if (!status.ok()) { // GetOptionsMap failed return status; - } else if (func == nullptr || - !func(id, result)) { // No factory, or it failed - return NewSharedObject(config_options, id, opt_map, result); } else { - return Customizable::ConfigureNewObject(config_options, result->get(), - opt_map); + return NewSharedObject(config_options, id, opt_map, result); } } @@ -214,7 +179,6 @@ static Status LoadSharedObject(const ConfigOptions& config_options, // handled // @param value Either the simple name of the instance to create, or a set of // name-value pairs to create and initailize the object -// @param func Optional function to call to attempt to create an instance // @param result The newly created instance. template static Status LoadManagedObject(const ConfigOptions& config_options, @@ -254,11 +218,7 @@ static Status NewUniqueObject( std::unique_ptr* result) { if (!id.empty()) { Status status; -#ifndef ROCKSDB_LITE status = config_options.registry->NewUniqueObject(id, result); -#else - status = Status::NotSupported("Cannot load object in LITE mode ", id); -#endif // ROCKSDB_LITE if (config_options.ignore_unsupported_options && status.IsNotSupported()) { status = Status::OK(); } else if (status.ok()) { @@ -284,12 +244,10 @@ static Status NewUniqueObject( // handled // @param value Either the simple name of the instance to create, or a set of // name-value pairs to create and initailize the object -// @param func Optional function to call to attempt to create an instance // @param result The newly created instance. 
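With the factory-callback parameter removed above, LoadSharedObject() goes straight through the ObjectRegistry. A sketch of the new call shape follows; the id string "BlockBasedTable" is assumed here to be a registered TableFactory name, so treat it as illustrative rather than authoritative:

#include <rocksdb/utilities/customizable_util.h>

std::shared_ptr<rocksdb::TableFactory> factory;
rocksdb::ConfigOptions cfg;
rocksdb::Status s = rocksdb::LoadSharedObject<rocksdb::TableFactory>(
    cfg, "BlockBasedTable", &factory);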
template static Status LoadUniqueObject(const ConfigOptions& config_options, const std::string& value, - const UniqueFactoryFunc& func, std::unique_ptr* result) { std::string id; std::unordered_map opt_map; @@ -297,12 +255,8 @@ static Status LoadUniqueObject(const ConfigOptions& config_options, value, &id, &opt_map); if (!status.ok()) { // GetOptionsMap failed return status; - } else if (func == nullptr || - !func(id, result)) { // No factory, or it failed - return NewUniqueObject(config_options, id, opt_map, result); } else { - return Customizable::ConfigureNewObject(config_options, result->get(), - opt_map); + return NewUniqueObject(config_options, id, opt_map, result); } } @@ -325,11 +279,7 @@ static Status NewStaticObject( const std::unordered_map& opt_map, T** result) { if (!id.empty()) { Status status; -#ifndef ROCKSDB_LITE status = config_options.registry->NewStaticObject(id, result); -#else - status = Status::NotSupported("Cannot load object in LITE mode ", id); -#endif // ROCKSDB_LITE if (config_options.ignore_unsupported_options && status.IsNotSupported()) { status = Status::OK(); } else if (status.ok()) { @@ -355,23 +305,18 @@ static Status NewStaticObject( // handled // @param value Either the simple name of the instance to create, or a set of // name-value pairs to create and initailize the object -// @param func Optional function to call to attempt to create an instance // @param result The newly created instance. template static Status LoadStaticObject(const ConfigOptions& config_options, - const std::string& value, - const StaticFactoryFunc& func, T** result) { + const std::string& value, T** result) { std::string id; std::unordered_map opt_map; Status status = Customizable::GetOptionsMap(config_options, *result, value, &id, &opt_map); if (!status.ok()) { // GetOptionsMap failed return status; - } else if (func == nullptr || - !func(id, result)) { // No factory, or it failed - return NewStaticObject(config_options, id, opt_map, result); } else { - return Customizable::ConfigureNewObject(config_options, *result, opt_map); + return NewStaticObject(config_options, id, opt_map, result); } } } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/utilities/db_ttl.h b/include/rocksdb/utilities/db_ttl.h index d57e7473ad97..12f5cbac0f75 100644 --- a/include/rocksdb/utilities/db_ttl.h +++ b/include/rocksdb/utilities/db_ttl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -69,4 +68,3 @@ class DBWithTTL : public StackableDB { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/debug.h b/include/rocksdb/utilities/debug.h index 0e05265573c5..e1fc76e3e0e4 100644 --- a/include/rocksdb/utilities/debug.h +++ b/include/rocksdb/utilities/debug.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/db.h" #include "rocksdb/types.h" @@ -45,4 +44,3 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/env_mirror.h b/include/rocksdb/utilities/env_mirror.h index ffde5effad06..2a1261287087 100644 --- a/include/rocksdb/utilities/env_mirror.h +++ b/include/rocksdb/utilities/env_mirror.h @@ -18,7 +18,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -178,4 +177,3 @@ class EnvMirror : public EnvWrapper { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/ldb_cmd.h b/include/rocksdb/utilities/ldb_cmd.h index 00763819257e..ed4f5de7e862 100644 --- a/include/rocksdb/utilities/ldb_cmd.h +++ b/include/rocksdb/utilities/ldb_cmd.h @@ -5,7 +5,6 @@ // #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -227,6 +226,12 @@ class LDBCommand { static std::string PrintKeyValue(const std::string& key, const std::string& value, bool is_hex); + static std::string PrintKeyValueOrWideColumns(const Slice& key, + const Slice& value, + const WideColumns& wide_columns, + bool is_key_hex, + bool is_value_hex); + /** * Return true if the specified flag is present in the specified flags vector */ @@ -314,5 +319,3 @@ class LDBCommandRunner { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/memory_util.h b/include/rocksdb/utilities/memory_util.h index 4f1606b51945..b141b4ef0e80 100644 --- a/include/rocksdb/utilities/memory_util.h +++ b/include/rocksdb/utilities/memory_util.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #pragma once @@ -47,4 +46,3 @@ class MemoryUtil { std::map* usage_by_type); }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/utilities/object_registry.h b/include/rocksdb/utilities/object_registry.h index 3bafb837c8af..613ef1cd9b06 100644 --- a/include/rocksdb/utilities/object_registry.h +++ b/include/rocksdb/utilities/object_registry.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -582,4 +581,3 @@ class ObjectRegistry { mutable std::mutex library_mutex_; // Mutex for managed libraries }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index c070e49a309b..0925eaf0a345 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -4,8 +4,8 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE +#include #include #include @@ -44,11 +44,42 @@ enum class OccValidationPolicy { kValidateParallel = 1 }; +class OccLockBuckets { + public: + // Most details in internal derived class. + // Users should not derive from this class. 
+ virtual ~OccLockBuckets() {} + + virtual size_t ApproximateMemoryUsage() const = 0; + + private: + friend class OccLockBucketsImplBase; + OccLockBuckets() {} +}; + +// An object for sharing a pool of locks across DB instances. +// +// Making the locks cache-aligned avoids potential false sharing, at the +// potential cost of extra memory. The implementation has historically +// used cache_aligned = false. +std::shared_ptr MakeSharedOccLockBuckets( + size_t bucket_count, bool cache_aligned = false); + struct OptimisticTransactionDBOptions { OccValidationPolicy validate_policy = OccValidationPolicy::kValidateParallel; - // works only if validate_policy == OccValidationPolicy::kValidateParallel + // Number of striped/bucketed mutex locks for validating transactions. + // Used on only if validate_policy == OccValidationPolicy::kValidateParallel + // and shared_lock_buckets (below) is empty. Larger number potentially + // reduces contention but uses more memory. uint32_t occ_lock_buckets = (1 << 20); + + // A pool of mutex locks for validating transactions. Can be shared among + // DBs. Ignored if validate_policy != OccValidationPolicy::kValidateParallel. + // If empty and validate_policy == OccValidationPolicy::kValidateParallel, + // an OccLockBuckets will be created using the count in occ_lock_buckets. + // See MakeSharedOccLockBuckets() + std::shared_ptr shared_lock_buckets; }; // Range deletions (including those in `WriteBatch`es passed to `Write()`) are @@ -96,5 +127,3 @@ class OptimisticTransactionDB : public StackableDB { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/options_type.h b/include/rocksdb/utilities/options_type.h index cd340ed59678..782b14e652ad 100644 --- a/include/rocksdb/utilities/options_type.h +++ b/include/rocksdb/utilities/options_type.h @@ -40,8 +40,9 @@ enum class OptionType { kUInt32T, kUInt64T, kSizeT, - kString, kDouble, + kAtomicInt, + kString, kCompactionStyle, kCompactionPri, kCompressionType, diff --git a/include/rocksdb/utilities/options_util.h b/include/rocksdb/utilities/options_util.h index 064c087f05c8..8d9488f5f885 100644 --- a/include/rocksdb/utilities/options_util.h +++ b/include/rocksdb/utilities/options_util.h @@ -6,7 +6,6 @@ // This file contains utility functions for RocksDB Options. #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -50,15 +49,12 @@ struct ConfigOptions; // casting the return value of TableFactory::GetOptions() to // BlockBasedTableOptions and making necessary changes. // -// ignore_unknown_options can be set to true if you want to ignore options -// that are from a newer version of the db, essentially for forward -// compatibility. -// // config_options contains a set of options that controls the processing -// of the options. The LoadLatestOptions(ConfigOptions...) should be preferred; -// the alternative signature may be deprecated in a future release. The -// equivalent functionality can be achieved by setting the corresponding options -// in the ConfigOptions parameter. +// of the options. +// +// config_options.ignore_unknown_options can be set to true if you want to +// ignore options that are from a newer version of the db, essentially for +// forward compatibility. // // examples/options_file_example.cc demonstrates how to use this function // to open a RocksDB instance. @@ -70,11 +66,6 @@ struct ConfigOptions; // to the options file itself. 
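A sketch of sharing one validation lock pool across two optimistic DB instances via MakeSharedOccLockBuckets(). It assumes the Open() overload that accepts OptimisticTransactionDBOptions together with column family descriptors; the paths and the bucket count are illustrative:

auto buckets = rocksdb::MakeSharedOccLockBuckets(1 << 16);
rocksdb::OptimisticTransactionDBOptions occ_opts;
occ_opts.shared_lock_buckets = buckets;

rocksdb::DBOptions db_opts;
db_opts.create_if_missing = true;
std::vector<rocksdb::ColumnFamilyDescriptor> cfs{
    {rocksdb::kDefaultColumnFamilyName, rocksdb::ColumnFamilyOptions()}};
std::vector<rocksdb::ColumnFamilyHandle*> handles;
rocksdb::OptimisticTransactionDB* db1 = nullptr;
rocksdb::OptimisticTransactionDB* db2 = nullptr;
rocksdb::OptimisticTransactionDB::Open(db_opts, occ_opts, "/tmp/occ1", cfs, &handles, &db1);
handles.clear();
rocksdb::OptimisticTransactionDB::Open(db_opts, occ_opts, "/tmp/occ2", cfs, &handles, &db2);

Both DBs then validate their transactions against the same striped mutex pool, which bounds the total memory spent on OCC validation locks.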
// // @see LoadOptionsFromFile -Status LoadLatestOptions(const std::string& dbpath, Env* env, - DBOptions* db_options, - std::vector* cf_descs, - bool ignore_unknown_options = false, - std::shared_ptr* cache = {}); Status LoadLatestOptions(const ConfigOptions& config_options, const std::string& dbpath, DBOptions* db_options, std::vector* cf_descs, @@ -83,17 +74,7 @@ Status LoadLatestOptions(const ConfigOptions& config_options, // Similar to LoadLatestOptions, this function constructs the DBOptions // and ColumnFamilyDescriptors based on the specified RocksDB Options file. // -// The LoadOptionsFile(ConfigOptions...) should be preferred; -// the alternative signature may be deprecated in a future release. The -// equivalent functionality can be achieved by setting the corresponding -// options in the ConfigOptions parameter. -// // @see LoadLatestOptions -Status LoadOptionsFromFile(const std::string& options_file_name, Env* env, - DBOptions* db_options, - std::vector* cf_descs, - bool ignore_unknown_options = false, - std::shared_ptr* cache = {}); Status LoadOptionsFromFile(const ConfigOptions& config_options, const std::string& options_file_name, DBOptions* db_options, @@ -115,14 +96,10 @@ Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, // * prefix_extractor // * table_factory // * merge_operator -Status CheckOptionsCompatibility( - const std::string& dbpath, Env* env, const DBOptions& db_options, - const std::vector& cf_descs, - bool ignore_unknown_options = false); +// * persist_user_defined_timestamps Status CheckOptionsCompatibility( const ConfigOptions& config_options, const std::string& dbpath, const DBOptions& db_options, const std::vector& cf_descs); } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/utilities/replayer.h b/include/rocksdb/utilities/replayer.h index 4fdd8d73a7aa..fc5319989bc9 100644 --- a/include/rocksdb/utilities/replayer.h +++ b/include/rocksdb/utilities/replayer.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
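With the Env-based overload removed above, options loading goes through ConfigOptions, and ignore_unknown_options now lives there too. A minimal sketch, with the DB path illustrative:

rocksdb::ConfigOptions cfg;
cfg.ignore_unknown_options = true;  // forward compatibility with newer option files
rocksdb::DBOptions db_opts;
std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
rocksdb::Status s =
    rocksdb::LoadLatestOptions(cfg, "/path/to/db", &db_opts, &cf_descs);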
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -84,4 +83,3 @@ class Replayer { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/sim_cache.h b/include/rocksdb/utilities/sim_cache.h index a682c7748415..6c52453e7e94 100644 --- a/include/rocksdb/utilities/sim_cache.h +++ b/include/rocksdb/utilities/sim_cache.h @@ -10,7 +10,7 @@ #include #include -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/statistics.h" @@ -42,13 +42,10 @@ extern std::shared_ptr NewSimCache(std::shared_ptr sim_cache, std::shared_ptr cache, int num_shard_bits); -class SimCache : public Cache { +// An abstract base class (public interface) to the SimCache implementation +class SimCache : public CacheWrapper { public: - SimCache() {} - - ~SimCache() override {} - - const char* Name() const override { return "SimCache"; } + using CacheWrapper::CacheWrapper; // returns the maximum configured capacity of the simcache for simulation virtual size_t GetSimCapacity() const = 0; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 7b9b8ca29f19..6f4a8af00d73 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -92,6 +92,10 @@ class StackableDB : public DB { const WideColumns& columns) override { return db_->PutEntity(options, column_family, key, columns); } + Status PutEntity(const WriteOptions& options, const Slice& key, + const AttributeGroups& attribute_groups) override { + return db_->PutEntity(options, key, attribute_groups); + } using DB::Get; virtual Status Get(const ReadOptions& options, @@ -136,6 +140,24 @@ class StackableDB : public DB { statuses, sorted_input); } + using DB::MultiGetEntity; + + void MultiGetEntity(const ReadOptions& options, + ColumnFamilyHandle* column_family, size_t num_keys, + const Slice* keys, PinnableWideColumns* results, + Status* statuses, bool sorted_input) override { + db_->MultiGetEntity(options, column_family, num_keys, keys, results, + statuses, sorted_input); + } + + void MultiGetEntity(const ReadOptions& options, size_t num_keys, + ColumnFamilyHandle** column_families, const Slice* keys, + PinnableWideColumns* results, Status* statuses, + bool sorted_input) override { + db_->MultiGetEntity(options, num_keys, column_families, keys, results, + statuses, sorted_input); + } + using DB::IngestExternalFile; virtual Status IngestExternalFile( ColumnFamilyHandle* column_family, @@ -160,6 +182,22 @@ class StackableDB : public DB { import_options, metadata, handle); } + virtual Status CreateColumnFamilyWithImport( + const ColumnFamilyOptions& options, const std::string& column_family_name, + const ImportColumnFamilyOptions& import_options, + const std::vector& metadatas, + ColumnFamilyHandle** handle) override { + return db_->CreateColumnFamilyWithImport(options, column_family_name, + import_options, metadatas, handle); + } + + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override { + return db_->ClipColumnFamily(column_family, begin_key, end_key); + } + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_opts) override { return db_->VerifyFileChecksums(read_opts); @@ -336,6 +374,11 @@ class StackableDB : public DB { return db_->DisableManualCompaction(); } + virtual Status WaitForCompact( + const WaitForCompactOptions& 
wait_for_compact_options) override { + return db_->WaitForCompact(wait_for_compact_options); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); @@ -390,7 +433,6 @@ class StackableDB : public DB { virtual Status UnlockWAL() override { return db_->UnlockWAL(); } -#ifndef ROCKSDB_LITE virtual Status DisableFileDeletions() override { return db_->DisableFileDeletions(); @@ -462,7 +504,6 @@ class StackableDB : public DB { return db_->NewDefaultReplayer(handles, std::move(reader), replayer); } -#endif // ROCKSDB_LITE virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, bool flush_memtable = true) override { @@ -595,11 +636,11 @@ class StackableDB : public DB { return db_->GetNextFileNumber(); } -#ifndef ROCKSDB_LITE Status TryCatchUpWithPrimary() override { return db_->TryCatchUpWithPrimary(); } -#endif // ROCKSDB_LITE + + virtual Status Resume() override { return db_->Resume(); } protected: DB* db_; diff --git a/include/rocksdb/utilities/table_properties_collectors.h b/include/rocksdb/utilities/table_properties_collectors.h index f3a4ba0052fa..f9d8d5dcdd70 100644 --- a/include/rocksdb/utilities/table_properties_collectors.h +++ b/include/rocksdb/utilities/table_properties_collectors.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -87,4 +86,3 @@ NewCompactOnDeletionCollectorFactory(size_t sliding_window_size, double deletion_ratio = 0); } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 1d2822988fc7..e6452056a007 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -228,7 +227,8 @@ class Transaction { // Status::Busy() may be returned if the transaction could not guarantee // that there are no write conflicts. Status::TryAgain() may be returned // if the memtable history size is not large enough - // (See max_write_buffer_size_to_maintain). + // (see max_write_buffer_size_to_maintain). In either case, a Rollback() + // or new transaction is required to expect a different result. // // If this transaction was created by a TransactionDB(), Status::Expired() // may be returned if this transaction has lived for longer than @@ -260,6 +260,7 @@ class Transaction { std::shared_ptr* snapshot = nullptr); // Discard all batched writes in this transaction. + // FIXME: what happens if this isn't called before destruction? 
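The StackableDB forwarder above exposes DB::WaitForCompact() to wrappers. A rough usage sketch follows; the `flush` field on WaitForCompactOptions is an assumption about that struct, not something shown in this patch:

rocksdb::WaitForCompactOptions wfc;
wfc.flush = true;  // assumed option: flush memtables first, then wait for compactions
rocksdb::Status s = db->WaitForCompact(wfc);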
virtual Status Rollback() = 0; // Records the state of the transaction for future calls to @@ -334,8 +335,22 @@ class Transaction { const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool /*sorted_input*/ = false) { + if (options.io_activity != Env::IOActivity::kUnknown && + options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + for (size_t i = 0; i < num_keys; ++i) { - statuses[i] = Get(options, column_family, keys[i], &values[i]); + statuses[i] = GetImpl(options, column_family, keys[i], &values[i]); } } @@ -512,6 +527,15 @@ class Transaction { virtual Status SingleDeleteUntracked(const Slice& key) = 0; + // Collpase the merge chain for the given key. This is can be used by the + // application to trigger an on-demand collpase to a key that has a long + // merge chain to reduce read amplification, without waiting for compaction + // to kick in. + virtual Status CollapseKey(const ReadOptions&, const Slice&, + ColumnFamilyHandle* = nullptr) { + return Status::NotSupported("collpase not supported"); + } + // Similar to WriteBatch::PutLogData virtual void PutLogData(const Slice& blob) = 0; @@ -672,6 +696,21 @@ class Transaction { id_ = id; } + virtual Status GetImpl(const ReadOptions& /* options */, + ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, std::string* /* value */) { + return Status::NotSupported("Not implemented"); + } + + virtual Status GetImpl(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* pinnable_val) { + assert(pinnable_val != nullptr); + auto s = GetImpl(options, column_family, key, pinnable_val->GetSelf()); + pinnable_val->PinSelf(); + return s; + } + virtual uint64_t GetLastLogNumber() const { return log_number_; } private: @@ -682,5 +721,3 @@ class Transaction { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index 741c59574746..3c4b63068e6a 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -505,4 +504,3 @@ class TransactionDB : public StackableDB { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/transaction_db_mutex.h b/include/rocksdb/utilities/transaction_db_mutex.h index e352f325a022..4ef566dad92b 100644 --- a/include/rocksdb/utilities/transaction_db_mutex.h +++ b/include/rocksdb/utilities/transaction_db_mutex.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include @@ -88,4 +87,3 @@ class TransactionDBMutexFactory { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 84dc11a312cc..e0536712c4ae 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -10,7 +10,6 @@ // inserted. 
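A sketch of batched reads inside a transaction, showing the per-key statuses and the new io_activity guard added above (anything other than kUnknown or kMultiGet in ReadOptions::io_activity makes every status come back InvalidArgument). The `txn` and `db` handles are assumed to exist:

std::vector<rocksdb::Slice> keys{"k1", "k2"};
std::vector<rocksdb::PinnableSlice> values(keys.size());
std::vector<rocksdb::Status> statuses(keys.size());
txn->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(), keys.size(),
              keys.data(), values.data(), statuses.data());
for (size_t i = 0; i < keys.size(); ++i) {
  if (statuses[i].IsNotFound()) {
    // key absent in both the transaction's write set and the DB
  }
}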
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -113,13 +112,30 @@ class WriteBatchWithIndex : public WriteBatchBase { Status Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& ts, const Slice& value) override; + using WriteBatchBase::TimedPut; + Status TimedPut(ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, const Slice& /* value */, + uint64_t /* write_unix_time */) override { + return Status::NotSupported( + "TimedPut not supported by WriteBatchWithIndex"); + } + Status PutEntity(ColumnFamilyHandle* column_family, const Slice& /* key */, const WideColumns& /* columns */) override { if (!column_family) { return Status::InvalidArgument( "Cannot call this method without a column family handle"); } + return Status::NotSupported( + "PutEntity not supported by WriteBatchWithIndex"); + } + Status PutEntity(const Slice& /* key */, + const AttributeGroups& attribute_groups) override { + if (attribute_groups.empty()) { + return Status::InvalidArgument( + "Cannot call this method without attribute groups"); + } return Status::NotSupported( "PutEntity not supported by WriteBatchWithIndex"); } @@ -190,9 +206,6 @@ class WriteBatchWithIndex : public WriteBatchBase { // Will create a new Iterator that will use WBWIIterator as a delta and // base_iterator as base. // - // This function is only supported if the WriteBatchWithIndex was - // constructed with overwrite_key=true. - // // The returned iterator should be deleted by the caller. // The base_iterator is now 'owned' by the returned iterator. Deleting the // returned iterator will also delete the base_iterator. @@ -305,5 +318,3 @@ class WriteBatchWithIndex : public WriteBatchBase { }; } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 18f6e11190f2..b1ab4f460844 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -11,9 +11,9 @@ // NOTE: in 'main' development branch, this should be the *next* // minor or major version number planned for release. -#define ROCKSDB_MAJOR 7 -#define ROCKSDB_MINOR 10 -#define ROCKSDB_PATCH 2 +#define ROCKSDB_MAJOR 8 +#define ROCKSDB_MINOR 9 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these diff --git a/include/rocksdb/wide_columns.h b/include/rocksdb/wide_columns.h index 7ddc61f033e3..35b81268bed8 100644 --- a/include/rocksdb/wide_columns.h +++ b/include/rocksdb/wide_columns.h @@ -16,6 +16,8 @@ namespace ROCKSDB_NAMESPACE { +class ColumnFamilyHandle; + // Class representing a wide column, which is defined as a pair of column name // and column value. 
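Since the note restricting NewIteratorWithBase() to overwrite_key=true batches was dropped above, a default-constructed WriteBatchWithIndex can now be layered over a DB iterator for read-your-own-writes. A minimal sketch, with `db` assumed to be an open DB handle:

rocksdb::WriteBatchWithIndex wbwi;
rocksdb::Status s = wbwi.Put("k1", "new-value");
std::unique_ptr<rocksdb::Iterator> it(wbwi.NewIteratorWithBase(
    db->DefaultColumnFamily(), db->NewIterator(rocksdb::ReadOptions())));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  // sees committed data from the DB plus the uncommitted "k1" from the batch;
  // the returned iterator owns and deletes the base iterator
}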
class WideColumn { @@ -74,8 +76,19 @@ inline bool operator!=(const WideColumn& lhs, const WideColumn& rhs) { inline std::ostream& operator<<(std::ostream& os, const WideColumn& column) { const bool hex = (os.flags() & std::ios_base::basefield) == std::ios_base::hex; - os << column.name().ToString(hex) << ':' << column.value().ToString(hex); - + if (!column.name().empty()) { + if (hex) { + os << "0x"; + } + os << column.name().ToString(hex); + } + os << ':'; + if (!column.value().empty()) { + if (hex) { + os << "0x"; + } + os << column.value().ToString(hex); + } return os; } @@ -97,15 +110,22 @@ class PinnableWideColumns { void SetPlainValue(const Slice& value); void SetPlainValue(const Slice& value, Cleanable* cleanable); + void SetPlainValue(PinnableSlice&& value); + void SetPlainValue(std::string&& value); Status SetWideColumnValue(const Slice& value); Status SetWideColumnValue(const Slice& value, Cleanable* cleanable); + Status SetWideColumnValue(PinnableSlice&& value); + Status SetWideColumnValue(std::string&& value); void Reset(); private: void CopyValue(const Slice& value); void PinOrCopyValue(const Slice& value, Cleanable* cleanable); + void MoveValue(PinnableSlice&& value); + void MoveValue(std::string&& value); + void CreateIndexForPlainValue(); Status CreateIndexForWideColumns(); @@ -127,6 +147,18 @@ inline void PinnableWideColumns::PinOrCopyValue(const Slice& value, value_.PinSlice(value, cleanable); } +inline void PinnableWideColumns::MoveValue(PinnableSlice&& value) { + value_ = std::move(value); +} + +inline void PinnableWideColumns::MoveValue(std::string&& value) { + std::string* const buf = value_.GetSelf(); + assert(buf); + + *buf = std::move(value); + value_.PinSelf(); +} + inline void PinnableWideColumns::CreateIndexForPlainValue() { columns_ = WideColumns{{kDefaultWideColumnName, value_}}; } @@ -142,6 +174,16 @@ inline void PinnableWideColumns::SetPlainValue(const Slice& value, CreateIndexForPlainValue(); } +inline void PinnableWideColumns::SetPlainValue(PinnableSlice&& value) { + MoveValue(std::move(value)); + CreateIndexForPlainValue(); +} + +inline void PinnableWideColumns::SetPlainValue(std::string&& value) { + MoveValue(std::move(value)); + CreateIndexForPlainValue(); +} + inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value) { CopyValue(value); return CreateIndexForWideColumns(); @@ -153,6 +195,16 @@ inline Status PinnableWideColumns::SetWideColumnValue(const Slice& value, return CreateIndexForWideColumns(); } +inline Status PinnableWideColumns::SetWideColumnValue(PinnableSlice&& value) { + MoveValue(std::move(value)); + return CreateIndexForWideColumns(); +} + +inline Status PinnableWideColumns::SetWideColumnValue(std::string&& value) { + MoveValue(std::move(value)); + return CreateIndexForWideColumns(); +} + inline void PinnableWideColumns::Reset() { value_.Reset(); columns_.clear(); @@ -168,4 +220,61 @@ inline bool operator!=(const PinnableWideColumns& lhs, return !(lhs == rhs); } +// Class representing attribute group. Attribute group is a logical grouping of +// wide-column entities by leveraging Column Families. 
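A tiny sketch of the streaming change above: with std::hex on the stream, non-empty column names and values are now printed as hex with a "0x" prefix, while empty parts print nothing around the ':' separator. Column contents here are arbitrary examples:

#include <iostream>
#include <rocksdb/wide_columns.h>

rocksdb::WideColumn col("price", "100");
std::cout << col << "\n";              // prints name:value in plain form
std::cout << std::hex << col << "\n";  // prints 0x<name-hex>:0x<value-hex>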
+// Used in Write Path +class AttributeGroup { + public: + ColumnFamilyHandle* column_family() const { return column_family_; } + const WideColumns& columns() const { return columns_; } + WideColumns& columns() { return columns_; } + + explicit AttributeGroup(ColumnFamilyHandle* column_family, + const WideColumns& columns) + : column_family_(column_family), columns_(columns) {} + + private: + ColumnFamilyHandle* column_family_; + WideColumns columns_; +}; + +// A collection of Attribute Groups. +using AttributeGroups = std::vector; + +// Used in Read Path. Wide-columns returned from the query are pinnable. +class PinnableAttributeGroup { + public: + ColumnFamilyHandle* column_family() const { return column_family_; } + const Status& status() const { return status_; } + const WideColumns& columns() const { return columns_.columns(); } + + explicit PinnableAttributeGroup(ColumnFamilyHandle* column_family) + : column_family_(column_family), status_(Status::OK()) {} + + void SetStatus(const Status& status); + void SetColumns(PinnableWideColumns&& columns); + + void Reset(); + + private: + ColumnFamilyHandle* column_family_; + Status status_; + PinnableWideColumns columns_; +}; + +inline void PinnableAttributeGroup::SetStatus(const Status& status) { + status_ = status; +} +inline void PinnableAttributeGroup::SetColumns(PinnableWideColumns&& columns) { + columns_ = std::move(columns); +} + +inline void PinnableAttributeGroup::Reset() { + SetStatus(Status::OK()); + columns_.Reset(); +} + +// A collection of Pinnable Attribute Groups. +using PinnableAttributeGroups = std::vector; + } // namespace ROCKSDB_NAMESPACE diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index 6f6079a1276c..ee4402695e1f 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -43,13 +43,13 @@ struct SavePoints; struct SliceParts; struct SavePoint { - size_t size; // size of rep_ - int count; // count of elements in rep_ + size_t size; // size of rep_ + uint32_t count; // count of elements in rep_ uint32_t content_flags; SavePoint() : size(0), count(0), content_flags(0) {} - SavePoint(size_t _size, int _count, uint32_t _flags) + SavePoint(size_t _size, uint32_t _count, uint32_t _flags) : size(_size), count(_count), content_flags(_flags) {} void clear() { @@ -100,12 +100,28 @@ class WriteBatch : public WriteBatchBase { return Put(nullptr, key, value); } + using WriteBatchBase::TimedPut; + // DO NOT USE, UNDER CONSTRUCTION + // Stores the mapping "key->value" in the database with the specified write + // time in the column family. + Status TimedPut(ColumnFamilyHandle* /* column_family */, + const Slice& /* key */, const Slice& /* value */, + uint64_t /* write_unix_time */) override { + // TODO(yuzhangyu): implement take in the write time. + return Status::NotSupported("TimedPut is under construction"); + } + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the // column family specified by "column_family". using WriteBatchBase::PutEntity; Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, const WideColumns& columns) override; + // Split and store wide column entities in multiple column families (a.k.a. + // AttributeGroups) + Status PutEntity(const Slice& key, + const AttributeGroups& attribute_groups) override; + using WriteBatchBase::Delete; // If the database contains a mapping for "key", erase it. Else do nothing. 
// The following Delete(..., const Slice& key) can be used when user-defined @@ -357,6 +373,9 @@ class WriteBatch : public WriteBatchBase { const std::string& Data() const { return rep_; } std::string& Data() { return rep_; } + // Release the serialized data and clear this batch. + std::string Release(); + // Retrieve data size of the batch. size_t GetDataSize() const { return rep_.size(); } @@ -393,8 +412,6 @@ class WriteBatch : public WriteBatchBase { // Returns true if MarkRollback will be called during Iterate bool HasRollback() const; - // Experimental. - // // Update timestamps of existing entries in the write batch if // applicable. If a key is intended for a column family that disables // timestamp, then this API won't set the timestamp for this key. diff --git a/include/rocksdb/write_batch_base.h b/include/rocksdb/write_batch_base.h index f6f39ef0becc..5b26ee543b51 100644 --- a/include/rocksdb/write_batch_base.h +++ b/include/rocksdb/write_batch_base.h @@ -42,11 +42,32 @@ class WriteBatchBase { const SliceParts& value); virtual Status Put(const SliceParts& key, const SliceParts& value); + // Store the mapping "key->value" in the database with the specified write + // time in the column family. Using some write time that is in the past to + // fast track data to their correct placement and preservation is the intended + // usage of this API. The DB makes a reasonable best effort to treat the data + // as having the given write time for this purpose but doesn't currently make + // any guarantees. + // + // When a regular Put("foo", "v1") is followed by a + // TimedPut("foo", "v2", some_time_before_first_put), the behavior of read + // queries are undefined and can change over time, for example due to + // compactions. + // Note: this feature is currently not compatible with user-defined timestamps + // and wide columns. + virtual Status TimedPut(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, uint64_t write_unix_time) = 0; + // Store the mapping "key->{column1:value1, column2:value2, ...}" in the // column family specified by "column_family". virtual Status PutEntity(ColumnFamilyHandle* column_family, const Slice& key, const WideColumns& columns) = 0; + // Split and store wide column entities in multiple column families (a.k.a. + // AttributeGroups) + virtual Status PutEntity(const Slice& key, + const AttributeGroups& attribute_groups) = 0; + // Merge "value" with the existing value of "key" in the database. // "key->merge(existing, value)" virtual Status Merge(ColumnFamilyHandle* column_family, const Slice& key, diff --git a/include/rocksdb/write_buffer_manager.h b/include/rocksdb/write_buffer_manager.h index 7fb18196d7e5..61e75c8888e3 100644 --- a/include/rocksdb/write_buffer_manager.h +++ b/include/rocksdb/write_buffer_manager.h @@ -81,13 +81,20 @@ class WriteBufferManager final { return buffer_size_.load(std::memory_order_relaxed); } + // REQUIRED: `new_size` > 0 void SetBufferSize(size_t new_size) { + assert(new_size > 0); buffer_size_.store(new_size, std::memory_order_relaxed); mutable_limit_.store(new_size * 7 / 8, std::memory_order_relaxed); // Check if stall is active and can be ended. MaybeEndWriteStall(); } + void SetAllowStall(bool new_allow_stall) { + allow_stall_.store(new_allow_stall, std::memory_order_relaxed); + MaybeEndWriteStall(); + } + // Below functions should be called by RocksDB internally. 
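A sketch of the attribute-group form of PutEntity() introduced above, splitting one key's wide columns across two column families in a single batch; the handles cf_meta and cf_data and the column contents are hypothetical:

rocksdb::WriteBatch batch;
rocksdb::AttributeGroups groups;
groups.emplace_back(cf_meta, rocksdb::WideColumns{{"author", "alice"}});
groups.emplace_back(cf_data, rocksdb::WideColumns{{"body", "..."}});
rocksdb::Status s = batch.PutEntity("doc-1", groups);
if (s.ok()) {
  s = db->Write(rocksdb::WriteOptions(), &batch);
}

Relatedly, the WriteBufferManager change above lets callers toggle write stalling at runtime through SetAllowStall(), which now also re-checks whether an active stall can end.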
// Should only be called from write thread @@ -117,7 +124,7 @@ class WriteBufferManager final { // // Should only be called by RocksDB internally . bool ShouldStall() const { - if (!allow_stall_ || !enabled()) { + if (!allow_stall_.load(std::memory_order_relaxed) || !enabled()) { return false; } @@ -165,7 +172,7 @@ class WriteBufferManager final { std::list queue_; // Protects the queue_ and stall_active_. std::mutex mu_; - bool allow_stall_; + std::atomic allow_stall_; // Value should only be changed by BeginWriteStall() and MaybeEndWriteStall() // while holding mu_, but it can be read without a lock. std::atomic stall_active_; diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt index 5d62630fde88..0fc503e69ed7 100644 --- a/java/CMakeLists.txt +++ b/java/CMakeLists.txt @@ -1,10 +1,17 @@ cmake_minimum_required(VERSION 3.4) +set(JAVA_JUNIT_VERSION "4.13.1") +set(JAVA_HAMCR_VERSION "2.2") +set(JAVA_MOCKITO_VERSION "1.10.19") +set(JAVA_CGLIB_VERSION "3.3.0") +set(JAVA_ASSERTJ_VERSION "2.9.0") + + if(${CMAKE_VERSION} VERSION_LESS "3.11.4") message("Please consider switching to CMake 3.11.4 or newer") endif() -set(CMAKE_JAVA_COMPILE_FLAGS -source 7) +set(CMAKE_JAVA_COMPILE_FLAGS -source 8) set(JNI_NATIVE_SOURCES rocksjni/backup_engine_options.cc @@ -33,7 +40,10 @@ set(JNI_NATIVE_SOURCES rocksjni/env_options.cc rocksjni/event_listener.cc rocksjni/event_listener_jnicallback.cc + rocksjni/export_import_files_metadatajni.cc rocksjni/filter.cc + rocksjni/import_column_family_options.cc + rocksjni/hyper_clock_cache.cc rocksjni/ingest_external_file_options.cc rocksjni/iterator.cc rocksjni/jnicallback.cc @@ -48,6 +58,7 @@ set(JNI_NATIVE_SOURCES rocksjni/options.cc rocksjni/options_util.cc rocksjni/persistent_cache.cc + rocksjni/jni_perf_context.cc rocksjni/ratelimiterjni.cc rocksjni/remove_emptyvalue_compactionfilterjni.cc rocksjni/restorejni.cc @@ -149,8 +160,10 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/EnvOptions.java src/main/java/org/rocksdb/EventListener.java src/main/java/org/rocksdb/Experimental.java + src/main/java/org/rocksdb/ExportImportFilesMetaData.java src/main/java/org/rocksdb/ExternalFileIngestionInfo.java src/main/java/org/rocksdb/Filter.java + src/main/java/org/rocksdb/FilterPolicyType.java src/main/java/org/rocksdb/FileOperationInfo.java src/main/java/org/rocksdb/FlushJobInfo.java src/main/java/org/rocksdb/FlushReason.java @@ -160,6 +173,8 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/HistogramData.java src/main/java/org/rocksdb/HistogramType.java src/main/java/org/rocksdb/Holder.java + src/main/java/org/rocksdb/ImportColumnFamilyOptions.java + src/main/java/org/rocksdb/HyperClockCache.java src/main/java/org/rocksdb/IndexShorteningMode.java src/main/java/org/rocksdb/IndexType.java src/main/java/org/rocksdb/InfoLogLevel.java @@ -193,6 +208,8 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/OptionString.java src/main/java/org/rocksdb/OptionsUtil.java src/main/java/org/rocksdb/PersistentCache.java + src/main/java/org/rocksdb/PerfContext.java + src/main/java/org/rocksdb/PerfLevel.java src/main/java/org/rocksdb/PlainTableConfig.java src/main/java/org/rocksdb/PrepopulateBlobCache.java src/main/java/org/rocksdb/Priority.java @@ -275,20 +292,221 @@ set(JAVA_MAIN_CLASSES src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java src/main/java/org/rocksdb/util/SizeUnit.java src/main/java/org/rocksdb/UInt64AddOperator.java -) - -set(JAVA_TEST_CLASSES - src/test/java/org/rocksdb/BackupEngineTest.java - src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java 
src/test/java/org/rocksdb/NativeComparatorWrapperTest.java - src/test/java/org/rocksdb/PlatformRandomHelper.java src/test/java/org/rocksdb/RocksDBExceptionTest.java - src/test/java/org/rocksdb/RocksNativeLibraryResource.java - src/test/java/org/rocksdb/SnapshotTest.java + src/test/java/org/rocksdb/test/TestableEventListener.java src/test/java/org/rocksdb/WriteBatchTest.java + src/test/java/org/rocksdb/RocksNativeLibraryResource.java src/test/java/org/rocksdb/util/CapturingWriteBatchHandler.java src/test/java/org/rocksdb/util/WriteBatchGetter.java +) + +set(JAVA_TEST_CLASSES + src/test/java/org/rocksdb/ConcurrentTaskLimiterTest.java + src/test/java/org/rocksdb/EventListenerTest.java + src/test/java/org/rocksdb/CompactionOptionsTest.java + src/test/java/org/rocksdb/PlatformRandomHelper.java + src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java + src/test/java/org/rocksdb/MutableDBOptionsTest.java + src/test/java/org/rocksdb/WriteOptionsTest.java + src/test/java/org/rocksdb/SstPartitionerTest.java + src/test/java/org/rocksdb/RocksMemEnvTest.java + src/test/java/org/rocksdb/CompactionOptionsUniversalTest.java + src/test/java/org/rocksdb/ClockCacheTest.java + src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java + src/test/java/org/rocksdb/SnapshotTest.java + src/test/java/org/rocksdb/CompactionJobStatsTest.java + src/test/java/org/rocksdb/MemTableTest.java + src/test/java/org/rocksdb/CompactionFilterFactoryTest.java + src/test/java/org/rocksdb/DefaultEnvTest.java + src/test/java/org/rocksdb/DBOptionsTest.java + src/test/java/org/rocksdb/RocksIteratorTest.java + src/test/java/org/rocksdb/SliceTest.java + src/test/java/org/rocksdb/MultiGetTest.java + src/test/java/org/rocksdb/ComparatorOptionsTest.java + src/test/java/org/rocksdb/NativeLibraryLoaderTest.java + src/test/java/org/rocksdb/StatisticsTest.java + src/test/java/org/rocksdb/WALRecoveryModeTest.java + src/test/java/org/rocksdb/TransactionLogIteratorTest.java + src/test/java/org/rocksdb/ReadOptionsTest.java + src/test/java/org/rocksdb/SecondaryDBTest.java + src/test/java/org/rocksdb/KeyMayExistTest.java + src/test/java/org/rocksdb/BlobOptionsTest.java + src/test/java/org/rocksdb/InfoLogLevelTest.java + src/test/java/org/rocksdb/CompactionPriorityTest.java + src/test/java/org/rocksdb/FlushOptionsTest.java + src/test/java/org/rocksdb/VerifyChecksumsTest.java + src/test/java/org/rocksdb/MultiColumnRegressionTest.java + src/test/java/org/rocksdb/FlushTest.java + src/test/java/org/rocksdb/HyperClockCacheTest.java + src/test/java/org/rocksdb/PutMultiplePartsTest.java + src/test/java/org/rocksdb/StatisticsCollectorTest.java + src/test/java/org/rocksdb/LRUCacheTest.java + src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java + src/test/java/org/rocksdb/TransactionTest.java + src/test/java/org/rocksdb/CompactionOptionsFIFOTest.java + src/test/java/org/rocksdb/BackupEngineOptionsTest.java + src/test/java/org/rocksdb/CheckPointTest.java + src/test/java/org/rocksdb/PlainTableConfigTest.java + src/test/java/org/rocksdb/TransactionDBOptionsTest.java + src/test/java/org/rocksdb/ReadOnlyTest.java + src/test/java/org/rocksdb/EnvOptionsTest.java + src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java + src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java + src/test/java/org/rocksdb/test/TestableEventListener.java + src/test/java/org/rocksdb/test/RemoveEmptyValueCompactionFilterFactory.java src/test/java/org/rocksdb/test/TestableEventListener.java + 
src/test/java/org/rocksdb/test/RocksJunitRunner.java + src/test/java/org/rocksdb/LoggerTest.java + src/test/java/org/rocksdb/FilterTest.java + src/test/java/org/rocksdb/ByteBufferUnsupportedOperationTest.java + src/test/java/org/rocksdb/util/IntComparatorTest.java + src/test/java/org/rocksdb/util/JNIComparatorTest.java + src/test/java/org/rocksdb/util/ByteBufferAllocator.java + src/test/java/org/rocksdb/util/SizeUnitTest.java + src/test/java/org/rocksdb/util/BytewiseComparatorTest.java + src/test/java/org/rocksdb/util/EnvironmentTest.java + src/test/java/org/rocksdb/util/BytewiseComparatorIntTest.java + src/test/java/org/rocksdb/util/DirectByteBufferAllocator.java + src/test/java/org/rocksdb/util/HeapByteBufferAllocator.java + src/test/java/org/rocksdb/util/TestUtil.java + src/test/java/org/rocksdb/util/ReverseBytewiseComparatorIntTest.java + src/test/java/org/rocksdb/Types.java + src/test/java/org/rocksdb/MixedOptionsTest.java + src/test/java/org/rocksdb/CompactRangeOptionsTest.java + src/test/java/org/rocksdb/SstFileWriterTest.java + src/test/java/org/rocksdb/WalFilterTest.java + src/test/java/org/rocksdb/AbstractTransactionTest.java + src/test/java/org/rocksdb/MergeTest.java + src/test/java/org/rocksdb/OptionsTest.java + src/test/java/org/rocksdb/WriteBatchThreadedTest.java + src/test/java/org/rocksdb/MultiGetManyKeysTest.java + src/test/java/org/rocksdb/TimedEnvTest.java + src/test/java/org/rocksdb/CompactionStopStyleTest.java + src/test/java/org/rocksdb/CompactionJobInfoTest.java + src/test/java/org/rocksdb/BlockBasedTableConfigTest.java + src/test/java/org/rocksdb/BuiltinComparatorTest.java + src/test/java/org/rocksdb/RateLimiterTest.java + src/test/java/org/rocksdb/TransactionOptionsTest.java + src/test/java/org/rocksdb/WriteBatchWithIndexTest.java + src/test/java/org/rocksdb/WriteBatchHandlerTest.java + src/test/java/org/rocksdb/OptimisticTransactionDBTest.java + src/test/java/org/rocksdb/OptionsUtilTest.java + src/test/java/org/rocksdb/OptimisticTransactionTest.java + src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java + src/test/java/org/rocksdb/CompressionOptionsTest.java + src/test/java/org/rocksdb/ColumnFamilyTest.java + src/test/java/org/rocksdb/SstFileReaderTest.java + src/test/java/org/rocksdb/TransactionDBTest.java + src/test/java/org/rocksdb/RocksDBTest.java + src/test/java/org/rocksdb/MutableOptionsGetSetTest.java + src/test/java/org/rocksdb/OptimisticTransactionOptionsTest.java + src/test/java/org/rocksdb/SstFileManagerTest.java + src/test/java/org/rocksdb/BackupEngineTest.java + src/test/java/org/rocksdb/DirectSliceTest.java + src/test/java/org/rocksdb/StatsCallbackMock.java + src/test/java/org/rocksdb/CompressionTypesTest.java + src/test/java/org/rocksdb/MemoryUtilTest.java + src/test/java/org/rocksdb/TableFilterTest.java + src/test/java/org/rocksdb/TtlDBTest.java +) + +set(JAVA_TEST_RUNNING_CLASSES + org.rocksdb.ConcurrentTaskLimiterTest + org.rocksdb.EventListenerTest + org.rocksdb.CompactionOptionsTest + org.rocksdb.IngestExternalFileOptionsTest + org.rocksdb.MutableDBOptionsTest + org.rocksdb.WriteOptionsTest + org.rocksdb.SstPartitionerTest + org.rocksdb.RocksMemEnvTest + org.rocksdb.CompactionOptionsUniversalTest + org.rocksdb.ClockCacheTest + # org.rocksdb.BytewiseComparatorRegressionTest + org.rocksdb.SnapshotTest + org.rocksdb.CompactionJobStatsTest + org.rocksdb.MemTableTest + org.rocksdb.CompactionFilterFactoryTest + # org.rocksdb.DefaultEnvTest + org.rocksdb.DBOptionsTest + org.rocksdb.WriteBatchTest + org.rocksdb.RocksIteratorTest + 
org.rocksdb.SliceTest + org.rocksdb.MultiGetTest + org.rocksdb.ComparatorOptionsTest + # org.rocksdb.NativeLibraryLoaderTest + org.rocksdb.StatisticsTest + org.rocksdb.WALRecoveryModeTest + org.rocksdb.TransactionLogIteratorTest + org.rocksdb.ReadOptionsTest + org.rocksdb.SecondaryDBTest + org.rocksdb.KeyMayExistTest + org.rocksdb.BlobOptionsTest + org.rocksdb.InfoLogLevelTest + org.rocksdb.CompactionPriorityTest + org.rocksdb.FlushOptionsTest + org.rocksdb.VerifyChecksumsTest + org.rocksdb.MultiColumnRegressionTest + org.rocksdb.FlushTest + org.rocksdb.HyperClockCacheTest + org.rocksdb.PutMultiplePartsTest + org.rocksdb.StatisticsCollectorTest + org.rocksdb.LRUCacheTest + org.rocksdb.ColumnFamilyOptionsTest + org.rocksdb.TransactionTest + org.rocksdb.CompactionOptionsFIFOTest + org.rocksdb.BackupEngineOptionsTest + org.rocksdb.CheckPointTest + org.rocksdb.PlainTableConfigTest + # org.rocksdb.TransactionDBOptionsTest + org.rocksdb.ReadOnlyTest + org.rocksdb.EnvOptionsTest + org.rocksdb.LoggerTest + org.rocksdb.FilterTest + # org.rocksdb.ByteBufferUnsupportedOperationTest + # org.rocksdb.util.IntComparatorTest + # org.rocksdb.util.JNIComparatorTest + org.rocksdb.util.SizeUnitTest + # org.rocksdb.util.BytewiseComparatorTest + org.rocksdb.util.EnvironmentTest + # org.rocksdb.util.BytewiseComparatorIntTest + # org.rocksdb.util.ReverseBytewiseComparatorIntTest + org.rocksdb.MixedOptionsTest + org.rocksdb.CompactRangeOptionsTest + # org.rocksdb.SstFileWriterTest + org.rocksdb.WalFilterTest + # org.rocksdb.AbstractTransactionTest + org.rocksdb.MergeTest + org.rocksdb.OptionsTest + org.rocksdb.WriteBatchThreadedTest + org.rocksdb.MultiGetManyKeysTest + org.rocksdb.TimedEnvTest + org.rocksdb.CompactionStopStyleTest + org.rocksdb.CompactionJobInfoTest + org.rocksdb.BlockBasedTableConfigTest + org.rocksdb.BuiltinComparatorTest + org.rocksdb.RateLimiterTest + # org.rocksdb.TransactionOptionsTest + org.rocksdb.WriteBatchWithIndexTest + org.rocksdb.WriteBatchHandlerTest + org.rocksdb.OptimisticTransactionDBTest + org.rocksdb.OptionsUtilTest + org.rocksdb.OptimisticTransactionTest + org.rocksdb.MutableColumnFamilyOptionsTest + org.rocksdb.CompressionOptionsTest + org.rocksdb.ColumnFamilyTest + org.rocksdb.SstFileReaderTest + org.rocksdb.TransactionDBTest + org.rocksdb.RocksDBTest + org.rocksdb.MutableOptionsGetSetTest + # org.rocksdb.OptimisticTransactionOptionsTest + org.rocksdb.SstFileManagerTest + org.rocksdb.BackupEngineTest + org.rocksdb.DirectSliceTest + org.rocksdb.CompressionTypesTest + org.rocksdb.MemoryUtilTest + org.rocksdb.TableFilterTest + org.rocksdb.TtlDBTest ) include(FindJava) @@ -300,12 +518,20 @@ include_directories(${PROJECT_SOURCE_DIR}/java) set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) -set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) -set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) -set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) -set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) -set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-${JAVA_JUNIT_VERSION}.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-${JAVA_HAMCR_VERSION}.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-${JAVA_MOCKITO_VERSION}.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-${JAVA_CGLIB_VERSION}.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-${JAVA_ASSERTJ_VERSION}.jar) set(JAVA_TESTCLASSPATH ${JAVA_JUNIT_JAR} 
${JAVA_HAMCR_JAR} ${JAVA_MOCKITO_JAR} ${JAVA_CGLIB_JAR} ${JAVA_ASSERTJ_JAR}) +message("CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}") +message("MINGW: ${MINGW}") + +if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + set(JAVA_RUN_TESTCLASSPATH ${JAVA_JUNIT_JAR}$${JAVA_HAMCR_JAR}$${JAVA_MOCKITO_JAR}$${JAVA_CGLIB_JAR}$${JAVA_ASSERTJ_JAR}) +else() + set(JAVA_RUN_TESTCLASSPATH ${JAVA_JUNIT_JAR}:${JAVA_HAMCR_JAR}:${JAVA_MOCKITO_JAR}:${JAVA_CGLIB_JAR}:${JAVA_ASSERTJ_JAR}) +endif() set(JNI_OUTPUT_DIR ${PROJECT_SOURCE_DIR}/java/include) file(MAKE_DIRECTORY ${JNI_OUTPUT_DIR}) @@ -323,26 +549,31 @@ elseif(${CMAKE_VERSION} VERSION_LESS "3.11.4") message("Using an old CMAKE (${CMAKE_VERSION}) - JNI headers generated in separate step") add_jar( rocksdbjni_classes - SOURCES - ${JAVA_MAIN_CLASSES} - ${JAVA_TEST_CLASSES} - INCLUDE_JARS ${JAVA_TESTCLASSPATH} + SOURCES ${JAVA_MAIN_CLASSES} ) else () # Java 1.8 or newer prepare the JAR... message("Preparing Jar for JDK ${Java_VERSION_STRING}") + message("JAVA_TESTCLASSPATH=${JAVA_TESTCLASSPATH}") add_jar( rocksdbjni_classes - SOURCES - ${JAVA_MAIN_CLASSES} - ${JAVA_TEST_CLASSES} - INCLUDE_JARS ${JAVA_TESTCLASSPATH} + SOURCES ${JAVA_MAIN_CLASSES} + INCLUDE_JARS ${ROCKSDBJNI_CLASSES_JAR_FILE} ${JAVA_TESTCLASSPATH} GENERATE_NATIVE_HEADERS rocksdbjni_headers DESTINATION ${JNI_OUTPUT_DIR} ) endif() +add_jar( + rocksdbjni_test_classes + SOURCES + ${JAVA_MAIN_CLASSES} + ${JAVA_TEST_CLASSES} + INCLUDE_JARS ${JAVA_TESTCLASSPATH} + GENERATE_NATIVE_HEADERS rocksdbjni_test_headers DESTINATION ${JNI_OUTPUT_DIR} +) + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) file(MAKE_DIRECTORY ${PROJECT_SOURCE_DIR}/java/classes) endif() @@ -361,7 +592,7 @@ endif() if(NOT EXISTS ${JAVA_JUNIT_JAR}) message("Downloading ${JAVA_JUNIT_JAR}") - file(DOWNLOAD ${DEPS_URL}/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + file(DOWNLOAD ${DEPS_URL}/junit-${JAVA_JUNIT_VERSION}.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) @@ -371,7 +602,7 @@ if(NOT EXISTS ${JAVA_JUNIT_JAR}) endif() if(NOT EXISTS ${JAVA_HAMCR_JAR}) message("Downloading ${JAVA_HAMCR_JAR}") - file(DOWNLOAD ${DEPS_URL}/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + file(DOWNLOAD ${DEPS_URL}/hamcrest-${JAVA_HAMCR_VERSION}.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) @@ -381,7 +612,7 @@ if(NOT EXISTS ${JAVA_HAMCR_JAR}) endif() if(NOT EXISTS ${JAVA_MOCKITO_JAR}) message("Downloading ${JAVA_MOCKITO_JAR}") - file(DOWNLOAD ${DEPS_URL}/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + file(DOWNLOAD ${DEPS_URL}/mockito-all-${JAVA_MOCKITO_VERSION}.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) @@ -391,7 +622,7 @@ if(NOT EXISTS ${JAVA_MOCKITO_JAR}) endif() if(NOT EXISTS ${JAVA_CGLIB_JAR}) message("Downloading ${JAVA_CGLIB_JAR}") - file(DOWNLOAD ${DEPS_URL}/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + file(DOWNLOAD ${DEPS_URL}/cglib-${JAVA_CGLIB_VERSION}.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) @@ -401,7 +632,7 @@ if(NOT EXISTS ${JAVA_CGLIB_JAR}) endif() if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) message("Downloading ${JAVA_ASSERTJ_JAR}") - file(DOWNLOAD ${DEPS_URL}/assertj-core-1.7.1.jar 
${JAVA_TMP_JAR} STATUS downloadStatus) + file(DOWNLOAD ${DEPS_URL}/assertj-core-${JAVA_ASSERTJ_VERSION}.jar ${JAVA_TMP_JAR} STATUS downloadStatus) list(GET downloadStatus 0 error_code) list(GET downloadStatus 1 error_message) if(NOT error_code EQUAL 0) @@ -455,6 +686,7 @@ if(${CMAKE_VERSION} VERSION_LESS "3.11.4") org.rocksdb.FlushOptions org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.HyperClockCache org.rocksdb.IngestExternalFileOptions org.rocksdb.Logger org.rocksdb.LRUCache @@ -530,7 +762,12 @@ if(NOT MSVC) endif() set(ROCKSDBJNI_STATIC_LIB rocksdbjni${ARTIFACT_SUFFIX}) -add_library(${ROCKSDBJNI_STATIC_LIB} ${JNI_NATIVE_SOURCES}) +if(MINGW) + # Build mingw only as staic library. + add_library(${ROCKSDBJNI_STATIC_LIB} ${JNI_NATIVE_SOURCES}) +else() + add_library(${ROCKSDBJNI_STATIC_LIB} SHARED ${JNI_NATIVE_SOURCES}) +endif() add_dependencies(${ROCKSDBJNI_STATIC_LIB} rocksdbjni_headers) target_link_libraries(${ROCKSDBJNI_STATIC_LIB} ${ROCKSDB_STATIC_LIB} ${ROCKSDB_LIB}) @@ -547,3 +784,19 @@ if(NOT MINGW) COMPILE_PDB_NAME ${ROCKSDBJNI_STATIC_LIB}.pdb ) endif() + +enable_testing() +get_target_property(ROCKSDBJNI_CLASSES_TEST_JAR_FILE rocksdbjni_test_classes JAR_FILE) +foreach (CLAZZ ${JAVA_TEST_RUNNING_CLASSES}) + if(${CMAKE_SYSTEM_NAME} MATCHES "Windows") + add_test( + NAME jtest_${CLAZZ} + COMMAND ${Java_JAVA_EXECUTABLE} ${JVMARGS} -ea -Xcheck:jni -Djava.library.path=${PROJECT_BINARY_DIR}/java/${CMAKE_BUILD_TYPE} -classpath ${JAVA_RUN_TESTCLASSPATH}$${ROCKSDBJNI_CLASSES_TEST_JAR_FILE} org.rocksdb.test.RocksJunitRunner ${CLAZZ} + ) + else() + add_test( + NAME jtest_${CLAZZ} + COMMAND ${Java_JAVA_EXECUTABLE} ${JVMARGS} -ea -Xcheck:jni -Djava.library.path=${PROJECT_BINARY_DIR}/java -classpath ${JAVA_RUN_TESTCLASSPATH}:${ROCKSDBJNI_CLASSES_TEST_JAR_FILE} org.rocksdb.test.RocksJunitRunner ${CLAZZ} + ) + endif() +endforeach(CLAZZ) \ No newline at end of file diff --git a/java/Makefile b/java/Makefile index bc7e121c4120..e71589e9e18c 100644 --- a/java/Makefile +++ b/java/Makefile @@ -51,6 +51,8 @@ NATIVE_JAVA_CLASSES = \ org.rocksdb.Options\ org.rocksdb.OptionsUtil\ org.rocksdb.PersistentCache\ + org.rocksdb.PerfContext\ + org.rocksdb.PerfLevel\ org.rocksdb.PlainTableConfig\ org.rocksdb.RateLimiter\ org.rocksdb.ReadOptions\ @@ -110,6 +112,7 @@ JAVA_TESTS = \ org.rocksdb.BlobOptionsTest\ org.rocksdb.BlockBasedTableConfigTest\ org.rocksdb.BuiltinComparatorTest\ + org.rocksdb.ByteBufferUnsupportedOperationTest\ org.rocksdb.BytewiseComparatorRegressionTest\ org.rocksdb.util.BytewiseComparatorTest\ org.rocksdb.util.BytewiseComparatorIntTest\ @@ -138,7 +141,9 @@ JAVA_TESTS = \ org.rocksdb.util.JNIComparatorTest\ org.rocksdb.FilterTest\ org.rocksdb.FlushTest\ + org.rocksdb.ImportColumnFamilyTest\ org.rocksdb.InfoLogLevelTest\ + org.rocksdb.KeyExistsTest \ org.rocksdb.KeyMayExistTest\ org.rocksdb.ConcurrentTaskLimiterTest\ org.rocksdb.LoggerTest\ @@ -160,6 +165,8 @@ JAVA_TESTS = \ org.rocksdb.OptimisticTransactionOptionsTest\ org.rocksdb.OptionsUtilTest\ org.rocksdb.OptionsTest\ + org.rocksdb.PerfLevelTest \ + org.rocksdb.PerfContextTest \ org.rocksdb.PlainTableConfigTest\ org.rocksdb.RateLimiterTest\ org.rocksdb.ReadOnlyTest\ @@ -262,6 +269,8 @@ JAVADOC_CMD := javadoc endif endif +MAVEN_CMD := mvn + # Look for the Java version (1.6->6, 1.7->7, 1.8->8, 11.0->11, 13.0->13, 15.0->15 etc..) 
JAVAC_VERSION := $(shell $(JAVAC_CMD) -version 2>&1) JAVAC_MAJOR_VERSION := $(word 2,$(subst ., ,$(JAVAC_VERSION))) @@ -443,6 +452,7 @@ test: java java_test run_test: $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(ALL_JAVA_TESTS) + $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner org.rocksdb.StatisticsTest run_plugin_test: $(JAVA_CMD) $(JAVA_ARGS) -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH):target/*" org.rocksdb.test.RocksJunitRunner $(ROCKSDB_PLUGIN_JAVA_TESTS) @@ -450,3 +460,6 @@ run_plugin_test: db_bench: java $(AM_V_GEN)mkdir -p $(BENCHMARK_MAIN_CLASSES) $(AM_V_at)$(JAVAC_CMD) $(JAVAC_ARGS) -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java + +pmd: + $(MAVEN_CMD) pmd:pmd pmd:cpd pmd:check diff --git a/java/pmd-rules.xml b/java/pmd-rules.xml new file mode 100644 index 000000000000..b710277f1489 --- /dev/null +++ b/java/pmd-rules.xml @@ -0,0 +1,62 @@ + + + + + + Custom rules for checking RocksDB + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/java/pom.xml.template b/java/pom.xml.template index 8a1981c66de0..9dd9c74f3485 100644 --- a/java/pom.xml.template +++ b/java/pom.xml.template @@ -140,7 +140,44 @@ - + + com.github.spotbugs + spotbugs-maven-plugin + 4.7.2.1 + + spotbugs-exclude.xml + + + + + com.github.spotbugs + spotbugs + 4.7.3 + + + + + org.apache.maven.plugins + maven-pmd-plugin + 3.20.0 + + + + check + cpd-check + + + + + + + /pmd-rules.xml + + + + + + @@ -174,5 +211,15 @@ 1.10.19 test - + + + + + + org.apache.maven.plugins + maven-jxr-plugin + 3.3.0 + + + diff --git a/java/rocksjni/cache.cc b/java/rocksjni/cache.cc index 33c0a2f0be71..5ca1d5175974 100644 --- a/java/rocksjni/cache.cc +++ b/java/rocksjni/cache.cc @@ -6,11 +6,10 @@ // This file implements the "bridge" between Java and C++ for // ROCKSDB_NAMESPACE::Cache. 
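For orientation (not part of the change itself): the checkpoint.cc hunk further below adds a JNI bridge for Checkpoint::ExportColumnFamily, and the export_import_files_metadatajni.cc and import_column_family_options.cc files later in this diff manage the returned metadata and the import options. A minimal C++ sketch of the native export/re-import round trip those bridges call into; the export directory, the new column-family name, and the pre-opened db/cf_handle are illustrative assumptions.

#include <rocksdb/db.h>
#include <rocksdb/options.h>
#include <rocksdb/utilities/checkpoint.h>

// Sketch: export one column family's files, then re-import them as a new CF.
rocksdb::Status ExportAndReimport(rocksdb::DB* db,
                                  rocksdb::ColumnFamilyHandle* cf_handle) {
  rocksdb::Checkpoint* checkpoint = nullptr;
  rocksdb::Status s = rocksdb::Checkpoint::Create(db, &checkpoint);
  if (!s.ok()) return s;

  // Writes the CF's SST files plus a metadata description into the export dir.
  rocksdb::ExportImportFilesMetaData* metadata = nullptr;
  s = checkpoint->ExportColumnFamily(cf_handle, "/tmp/cf_export", &metadata);
  if (s.ok()) {
    rocksdb::ImportColumnFamilyOptions import_opts;
    import_opts.move_files = false;  // copy rather than move the exported SSTs
    rocksdb::ColumnFamilyHandle* imported = nullptr;
    s = db->CreateColumnFamilyWithImport(rocksdb::ColumnFamilyOptions(),
                                         "imported_cf", import_opts, *metadata,
                                         &imported);
    if (s.ok()) {
      s = db->DestroyColumnFamilyHandle(imported);
    }
    // Caller owns the metadata, mirroring ExportImportFilesMetaData.disposeInternal().
    delete metadata;
  }
  delete checkpoint;
  return s;
}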
-#include "rocksdb/cache.h" - #include #include "include/org_rocksdb_Cache.h" +#include "rocksdb/advanced_cache.h" /* * Class: org_rocksdb_Cache diff --git a/java/rocksjni/checkpoint.cc b/java/rocksjni/checkpoint.cc index d7cfd813bdb0..cef5f3ca88e0 100644 --- a/java/rocksjni/checkpoint.cc +++ b/java/rocksjni/checkpoint.cc @@ -69,3 +69,38 @@ void Java_org_rocksdb_Checkpoint_createCheckpoint(JNIEnv* env, jobject /*jobj*/, ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); } } + +/* + * Class: org_rocksdb_Checkpoint + * Method: exportColumnFamily + * Signature: (JJLjava/lang/String;)Lorg/rocksdb/ExportImportFilesMetaData; + */ +jlong Java_org_rocksdb_Checkpoint_exportColumnFamily( + JNIEnv* env, jobject /*jobj*/, jlong jcheckpoint_handle, + jlong jcolumn_family_handle, jstring jexport_path) { + const char* export_path = env->GetStringUTFChars(jexport_path, 0); + if (export_path == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + + auto* checkpoint = + reinterpret_cast(jcheckpoint_handle); + + auto* column_family_handle = + reinterpret_cast( + jcolumn_family_handle); + + ROCKSDB_NAMESPACE::ExportImportFilesMetaData* metadata = nullptr; + + ROCKSDB_NAMESPACE::Status s = checkpoint->ExportColumnFamily( + column_family_handle, export_path, &metadata); + + env->ReleaseStringUTFChars(jexport_path, export_path); + + if (!s.ok()) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + } + + return GET_CPLUSPLUS_POINTER(metadata); +} diff --git a/java/rocksjni/compact_range_options.cc b/java/rocksjni/compact_range_options.cc index 77fbb8890e23..d07263ab683f 100644 --- a/java/rocksjni/compact_range_options.cc +++ b/java/rocksjni/compact_range_options.cc @@ -12,6 +12,56 @@ #include "rocksdb/options.h" #include "rocksjni/cplusplus_to_java_convert.h" #include "rocksjni/portal.h" +#include "util/coding.h" + +/** + * @brief Class containing compact range options for Java API + * + * An object of this class is returned as the native handle for + * ROCKSDB_NAMESPACE::CompactRangeOptions It contains objects for various + * parameters which are passed by reference/pointer in CompactRangeOptions. We + * maintain the lifetime of these parameters (`full_history_ts_low`, `canceled`) + * by including their values in this class. 
+ */ +class Java_org_rocksdb_CompactRangeOptions { + public: + ROCKSDB_NAMESPACE::CompactRangeOptions compactRangeOptions; + + private: + std::string full_history_ts_low; + std::atomic canceled; + + public: + void set_full_history_ts_low(uint64_t start, uint64_t range) { + full_history_ts_low = ""; + ROCKSDB_NAMESPACE::PutFixed64(&full_history_ts_low, start); + ROCKSDB_NAMESPACE::PutFixed64(&full_history_ts_low, range); + compactRangeOptions.full_history_ts_low = + new ROCKSDB_NAMESPACE::Slice(full_history_ts_low); + } + + bool read_full_history_ts_low(uint64_t* start, uint64_t* range) { + if (compactRangeOptions.full_history_ts_low == nullptr) return false; + ROCKSDB_NAMESPACE::Slice read_slice( + compactRangeOptions.full_history_ts_low->ToStringView()); + if (!ROCKSDB_NAMESPACE::GetFixed64(&read_slice, start)) return false; + return ROCKSDB_NAMESPACE::GetFixed64(&read_slice, range); + } + + void set_canceled(bool value) { + if (compactRangeOptions.canceled == nullptr) { + canceled.store(value, std::memory_order_seq_cst); + compactRangeOptions.canceled = &canceled; + } else { + compactRangeOptions.canceled->store(value, std::memory_order_seq_cst); + } + } + + bool get_canceled() { + return compactRangeOptions.canceled && + compactRangeOptions.canceled->load(std::memory_order_seq_cst); + } +}; /* * Class: org_rocksdb_CompactRangeOptions @@ -20,8 +70,8 @@ */ jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( JNIEnv* /*env*/, jclass /*jclazz*/) { - auto* options = new ROCKSDB_NAMESPACE::CompactRangeOptions(); - return GET_CPLUSPLUS_POINTER(options); + auto* options = new Java_org_rocksdb_CompactRangeOptions(); + return GET_CPLUSPLUS_POINTER(&options->compactRangeOptions); } /* @@ -32,8 +82,9 @@ jlong Java_org_rocksdb_CompactRangeOptions_newCompactRangeOptions( jboolean Java_org_rocksdb_CompactRangeOptions_exclusiveManualCompaction( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); - return static_cast(options->exclusive_manual_compaction); + reinterpret_cast(jhandle); + return static_cast( + options->compactRangeOptions.exclusive_manual_compaction); } /* @@ -45,8 +96,8 @@ void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean exclusive_manual_compaction) { auto* options = - reinterpret_cast(jhandle); - options->exclusive_manual_compaction = + reinterpret_cast(jhandle); + options->compactRangeOptions.exclusive_manual_compaction = static_cast(exclusive_manual_compaction); } @@ -58,9 +109,10 @@ void Java_org_rocksdb_CompactRangeOptions_setExclusiveManualCompaction( jint Java_org_rocksdb_CompactRangeOptions_bottommostLevelCompaction( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); + reinterpret_cast(jhandle); return ROCKSDB_NAMESPACE::BottommostLevelCompactionJni:: - toJavaBottommostLevelCompaction(options->bottommost_level_compaction); + toJavaBottommostLevelCompaction( + options->compactRangeOptions.bottommost_level_compaction); } /* @@ -72,8 +124,8 @@ void Java_org_rocksdb_CompactRangeOptions_setBottommostLevelCompaction( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint bottommost_level_compaction) { auto* options = - reinterpret_cast(jhandle); - options->bottommost_level_compaction = + reinterpret_cast(jhandle); + options->compactRangeOptions.bottommost_level_compaction = ROCKSDB_NAMESPACE::BottommostLevelCompactionJni:: toCppBottommostLevelCompaction(bottommost_level_compaction); } @@ -87,8 +139,8 @@ 
jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); - return static_cast(options->change_level); + reinterpret_cast(jhandle); + return static_cast(options->compactRangeOptions.change_level); } /* @@ -99,8 +151,8 @@ jboolean Java_org_rocksdb_CompactRangeOptions_changeLevel(JNIEnv* /*env*/, void Java_org_rocksdb_CompactRangeOptions_setChangeLevel( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean change_level) { auto* options = - reinterpret_cast(jhandle); - options->change_level = static_cast(change_level); + reinterpret_cast(jhandle); + options->compactRangeOptions.change_level = static_cast(change_level); } /* @@ -112,8 +164,8 @@ jint Java_org_rocksdb_CompactRangeOptions_targetLevel(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); - return static_cast(options->target_level); + reinterpret_cast(jhandle); + return static_cast(options->compactRangeOptions.target_level); } /* @@ -126,8 +178,8 @@ void Java_org_rocksdb_CompactRangeOptions_setTargetLevel(JNIEnv* /*env*/, jlong jhandle, jint target_level) { auto* options = - reinterpret_cast(jhandle); - options->target_level = static_cast(target_level); + reinterpret_cast(jhandle); + options->compactRangeOptions.target_level = static_cast(target_level); } /* @@ -139,8 +191,8 @@ jint Java_org_rocksdb_CompactRangeOptions_targetPathId(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); - return static_cast(options->target_path_id); + reinterpret_cast(jhandle); + return static_cast(options->compactRangeOptions.target_path_id); } /* @@ -153,8 +205,9 @@ void Java_org_rocksdb_CompactRangeOptions_setTargetPathId(JNIEnv* /*env*/, jlong jhandle, jint target_path_id) { auto* options = - reinterpret_cast(jhandle); - options->target_path_id = static_cast(target_path_id); + reinterpret_cast(jhandle); + options->compactRangeOptions.target_path_id = + static_cast(target_path_id); } /* @@ -166,8 +219,8 @@ jboolean Java_org_rocksdb_CompactRangeOptions_allowWriteStall(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); - return static_cast(options->allow_write_stall); + reinterpret_cast(jhandle); + return static_cast(options->compactRangeOptions.allow_write_stall); } /* @@ -179,8 +232,9 @@ void Java_org_rocksdb_CompactRangeOptions_setAllowWriteStall( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jboolean allow_write_stall) { auto* options = - reinterpret_cast(jhandle); - options->allow_write_stall = static_cast(allow_write_stall); + reinterpret_cast(jhandle); + options->compactRangeOptions.allow_write_stall = + static_cast(allow_write_stall); } /* @@ -192,8 +246,8 @@ jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); - return static_cast(options->max_subcompactions); + reinterpret_cast(jhandle); + return static_cast(options->compactRangeOptions.max_subcompactions); } /* @@ -204,8 +258,70 @@ jint Java_org_rocksdb_CompactRangeOptions_maxSubcompactions(JNIEnv* /*env*/, void Java_org_rocksdb_CompactRangeOptions_setMaxSubcompactions( JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle, jint max_subcompactions) { auto* options = - reinterpret_cast(jhandle); - options->max_subcompactions = static_cast(max_subcompactions); + reinterpret_cast(jhandle); + options->compactRangeOptions.max_subcompactions = + 
static_cast(max_subcompactions); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setFullHistoryTSLow + * Signature: (JJJ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setFullHistoryTSLow(JNIEnv*, jobject, + jlong jhandle, + jlong start, + jlong range) { + auto* options = + reinterpret_cast(jhandle); + options->set_full_history_ts_low(start, range); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: fullHistoryTSLow + * Signature: (J)Lorg/rocksdb/CompactRangeOptions/Timestamp; + */ +jobject Java_org_rocksdb_CompactRangeOptions_fullHistoryTSLow(JNIEnv* env, + jobject, + jlong jhandle) { + auto* options = + reinterpret_cast(jhandle); + uint64_t start; + uint64_t range; + jobject result = nullptr; + if (options->read_full_history_ts_low(&start, &range)) { + result = + ROCKSDB_NAMESPACE::CompactRangeOptionsTimestampJni::fromCppTimestamp( + env, start, range); + } + + return result; +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: setCanceled + * Signature: (JZ)V + */ +void Java_org_rocksdb_CompactRangeOptions_setCanceled(JNIEnv*, jobject, + jlong jhandle, + jboolean jcanceled) { + auto* options = + reinterpret_cast(jhandle); + options->set_canceled(jcanceled); +} + +/* + * Class: org_rocksdb_CompactRangeOptions + * Method: canceled + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_CompactRangeOptions_canceled(JNIEnv*, jobject, + jlong jhandle) { + auto* options = + reinterpret_cast(jhandle); + return options->get_canceled(); } /* @@ -217,6 +333,6 @@ void Java_org_rocksdb_CompactRangeOptions_disposeInternal(JNIEnv* /*env*/, jobject /*jobj*/, jlong jhandle) { auto* options = - reinterpret_cast(jhandle); + reinterpret_cast(jhandle); delete options; } diff --git a/java/rocksjni/comparatorjnicallback.cc b/java/rocksjni/comparatorjnicallback.cc index 07ab9fa41cc8..d354b40b851e 100644 --- a/java/rocksjni/comparatorjnicallback.cc +++ b/java/rocksjni/comparatorjnicallback.cc @@ -14,7 +14,8 @@ namespace ROCKSDB_NAMESPACE { ComparatorJniCallback::ComparatorJniCallback( JNIEnv* env, jobject jcomparator, const ComparatorJniCallbackOptions* options) - : JniCallback(env, jcomparator), m_options(options) { + : JniCallback(env, jcomparator), + m_options(std::make_unique(*options)) { // cache the AbstractComparatorJniBridge class as we will reuse it many times // for each callback m_abstract_comparator_jni_bridge_clazz = static_cast( diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h index a983ce4b595b..034c0d5d7d54 100644 --- a/java/rocksjni/comparatorjnicallback.h +++ b/java/rocksjni/comparatorjnicallback.h @@ -45,15 +45,12 @@ enum ReusedSynchronisationType { struct ComparatorJniCallbackOptions { // Set the synchronisation type used to guard the reused buffers. // Only used if max_reused_buffer_size > 0. - // Default: ADAPTIVE_MUTEX - ReusedSynchronisationType reused_synchronisation_type = - ReusedSynchronisationType::ADAPTIVE_MUTEX; + ReusedSynchronisationType reused_synchronisation_type = ADAPTIVE_MUTEX; // Indicates if a direct byte buffer (i.e. outside of the normal // garbage-collected heap) is used for the callbacks to Java, // as opposed to a non-direct byte buffer which is a wrapper around // an on-heap byte[]. - // Default: true bool direct_buffer = true; // Maximum size of a buffer (in bytes) that will be reused. 
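The wrapper class above exists because CompactRangeOptions only holds pointers to full_history_ts_low and to the canceled flag, so the JNI object has to own storage whose lifetime matches the Java object. A minimal C++ sketch (not part of the diff) of the native cancellation contract that the setCanceled/canceled bridges drive; the whole-range compaction and the open db pointer are illustrative assumptions.

#include <atomic>
#include <rocksdb/db.h>
#include <rocksdb/options.h>

// Sketch: a manual compaction that another thread can abort via the canceled flag.
void CancellableCompactRange(rocksdb::DB* db) {
  std::atomic<bool> canceled{false};
  rocksdb::CompactRangeOptions opts;
  opts.canceled = &canceled;  // must stay alive for the whole CompactRange() call

  // Another thread can request early termination at any time:
  //   canceled.store(true, std::memory_order_seq_cst);
  rocksdb::Status s = db->CompactRange(opts, nullptr, nullptr);  // whole key space
  // A cancelled compaction surfaces as a non-OK (Incomplete) status.
  (void)s;
}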
@@ -63,7 +60,6 @@ struct ComparatorJniCallbackOptions { // if it requires less than max_reused_buffer_size, then an // existing buffer will be reused, else a new buffer will be // allocated just for that callback. -1 to disable. - // Default: 64 bytes int32_t max_reused_buffer_size = 64; }; @@ -92,7 +88,7 @@ class ComparatorJniCallback : public JniCallback, public Comparator { virtual void FindShortestSeparator(std::string* start, const Slice& limit) const; virtual void FindShortSuccessor(std::string* key) const; - const ComparatorJniCallbackOptions* m_options; + const std::unique_ptr m_options; private: struct ThreadLocalBuf { diff --git a/java/rocksjni/config_options.cc b/java/rocksjni/config_options.cc index e62111323676..55a9cbb663d5 100644 --- a/java/rocksjni/config_options.cc +++ b/java/rocksjni/config_options.cc @@ -36,6 +36,19 @@ jlong Java_org_rocksdb_ConfigOptions_newConfigOptions(JNIEnv *, jclass) { return GET_CPLUSPLUS_POINTER(cfg_opt); } +/* + * Class: org_rocksdb_ConfigOptions + * Method: setEnv + * Signature: (JJ;)V + */ +void Java_org_rocksdb_ConfigOptions_setEnv(JNIEnv *, jclass, jlong handle, + jlong rocksdb_env_handle) { + auto *cfg_opt = reinterpret_cast(handle); + auto *rocksdb_env = + reinterpret_cast(rocksdb_env_handle); + cfg_opt->env = rocksdb_env; +} + /* * Class: org_rocksdb_ConfigOptions * Method: setDelimiter diff --git a/java/rocksjni/export_import_files_metadatajni.cc b/java/rocksjni/export_import_files_metadatajni.cc new file mode 100644 index 000000000000..213977ac2074 --- /dev/null +++ b/java/rocksjni/export_import_files_metadatajni.cc @@ -0,0 +1,22 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "include/org_rocksdb_ExportImportFilesMetaData.h" +#include "include/org_rocksdb_LiveFileMetaData.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ExportImportFilesMetaData + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ExportImportFilesMetaData_disposeInternal( + JNIEnv* /*env*/, jobject /*jopt*/, jlong jhandle) { + auto* metadata = + reinterpret_cast(jhandle); + assert(metadata != nullptr); + delete metadata; +} diff --git a/java/rocksjni/hyper_clock_cache.cc b/java/rocksjni/hyper_clock_cache.cc new file mode 100644 index 000000000000..782f123a5e0e --- /dev/null +++ b/java/rocksjni/hyper_clock_cache.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// This file implements the "bridge" between Java and C++ for +// ROCKSDB_NAMESPACE::HyperClockCache. 
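The new hyper_clock_cache.cc below constructs a HyperClockCacheOptions and calls MakeSharedCache(), handing the resulting shared_ptr to Java as the native handle. A minimal C++ sketch (not part of the diff) of that native API wired into a block-based table; the capacity and estimated entry charge are illustrative assumptions.

#include <rocksdb/cache.h>
#include <rocksdb/options.h>
#include <rocksdb/table.h>

// Sketch: use a hyper clock cache as the block cache for block-based tables.
rocksdb::Options MakeOptionsWithHyperClockCache() {
  rocksdb::HyperClockCacheOptions cache_opts(
      /*_capacity=*/static_cast<size_t>(1) << 30,  // 1 GiB, illustrative
      /*_estimated_entry_charge=*/8 * 1024,        // rough average block charge
      /*_num_shard_bits=*/-1,                      // let RocksDB pick the shard count
      /*_strict_capacity_limit=*/false);
  std::shared_ptr<rocksdb::Cache> cache = cache_opts.MakeSharedCache();

  rocksdb::BlockBasedTableOptions table_opts;
  table_opts.block_cache = cache;

  rocksdb::Options options;
  options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_opts));
  return options;
}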
+ +#include + +#include "cache/clock_cache.h" +#include "include/org_rocksdb_HyperClockCache.h" +#include "rocksjni/cplusplus_to_java_convert.h" + +/* + * Class: org_rocksdb_HyperClockCache + * Method: newHyperClockCache + * Signature: (JJIZ)J + */ +jlong Java_org_rocksdb_HyperClockCache_newHyperClockCache( + JNIEnv*, jclass, jlong capacity, jlong estimatedEntryCharge, + jint numShardBits, jboolean strictCapacityLimit) { + ROCKSDB_NAMESPACE::HyperClockCacheOptions cacheOptions = + ROCKSDB_NAMESPACE::HyperClockCacheOptions( + capacity, estimatedEntryCharge, numShardBits, strictCapacityLimit); + + auto* cache = new std::shared_ptr( + cacheOptions.MakeSharedCache()); + return GET_CPLUSPLUS_POINTER(cache); +} + +/* + * Class: org_rocksdb_HyperClockCache + * Method: disposeInternalJni + * Signature: (J)V + */ +void Java_org_rocksdb_HyperClockCache_disposeInternalJni(JNIEnv*, jclass, + jlong jhandle) { + auto* hyper_clock_cache = + reinterpret_cast*>(jhandle); + delete hyper_clock_cache; // delete std::shared_ptr +} \ No newline at end of file diff --git a/java/rocksjni/import_column_family_options.cc b/java/rocksjni/import_column_family_options.cc new file mode 100644 index 000000000000..1a9bded516b1 --- /dev/null +++ b/java/rocksjni/import_column_family_options.cc @@ -0,0 +1,59 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include + +#include "include/org_rocksdb_ImportColumnFamilyOptions.h" +#include "rocksdb/options.h" +#include "rocksjni/cplusplus_to_java_convert.h" + +/* + * Class: org_rocksdb_ImportColumnFamilyOptions + * Method: newImportColumnFamilyOptions + * Signature: ()J + */ +jlong Java_org_rocksdb_ImportColumnFamilyOptions_newImportColumnFamilyOptions( + JNIEnv *, jclass) { + ROCKSDB_NAMESPACE::ImportColumnFamilyOptions *opts = + new ROCKSDB_NAMESPACE::ImportColumnFamilyOptions(); + return GET_CPLUSPLUS_POINTER(opts); +} + +/* + * Class: org_rocksdb_ImportColumnFamilyOptions + * Method: setMoveFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_ImportColumnFamilyOptions_setMoveFiles( + JNIEnv *, jobject, jlong jhandle, jboolean jmove_files) { + auto *options = + reinterpret_cast(jhandle); + options->move_files = static_cast(jmove_files); +} + +/* + * Class: org_rocksdb_ImportColumnFamilyOptions + * Method: moveFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ImportColumnFamilyOptions_moveFiles(JNIEnv *, jobject, + jlong jhandle) { + auto *options = + reinterpret_cast(jhandle); + return static_cast(options->move_files); +} + +/* + * Class: org_rocksdb_ImportColumnFamilyOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ImportColumnFamilyOptions_disposeInternal(JNIEnv *, + jobject, + jlong jhandle) { + delete reinterpret_cast( + jhandle); +} \ No newline at end of file diff --git a/java/rocksjni/jni_perf_context.cc b/java/rocksjni/jni_perf_context.cc new file mode 100644 index 000000000000..e0124fdaa282 --- /dev/null +++ b/java/rocksjni/jni_perf_context.cc @@ -0,0 +1,1188 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
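The new jni_perf_context.cc that follows exposes each counter of ROCKSDB_NAMESPACE::PerfContext through a getter keyed by the context's native handle, alongside a reset() entry point. A minimal C++ sketch (not part of the diff) of the native workflow those counters come from; the perf level, key, and open db pointer are illustrative assumptions.

#include <cstdint>
#include <string>
#include <rocksdb/db.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/perf_level.h>

// Sketch: collect per-thread perf counters around a single read.
void MeasureGet(rocksdb::DB* db) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
  rocksdb::get_perf_context()->Reset();  // thread-local PerfContext

  std::string value;
  db->Get(rocksdb::ReadOptions(), "some_key", &value).PermitUncheckedError();

  // The same fields the JNI getters read, e.g. block_read_count / block_read_byte.
  uint64_t block_reads = rocksdb::get_perf_context()->block_read_count;
  uint64_t block_bytes = rocksdb::get_perf_context()->block_read_byte;
  (void)block_reads;
  (void)block_bytes;

  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}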
+ +#include + +#include "include/org_rocksdb_PerfContext.h" +#include "rocksdb/db.h" +#include "rocksdb/perf_context.h" + +void Java_org_rocksdb_PerfContext_reset(JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + perf_context->Reset(); +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getUserKeyComparisonCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getUserKeyComparisonCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->user_key_comparison_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheHitCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockReadCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockReadCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_read_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheIndexHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheIndexHitCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_index_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheStandaloneHandleCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheStandaloneHandleCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_standalone_handle_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheRealHandleCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheRealHandleCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_real_handle_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getIndexBlockReadCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getIndexBlockReadCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->index_block_read_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockCacheFilterHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockCacheFilterHitCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_cache_filter_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getFilterBlockReadCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getFilterBlockReadCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->filter_block_read_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getCompressionDictBlockReadCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getCompressionDictBlockReadCount( + 
JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->compression_dict_block_read_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockReadByte + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockReadByte(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_read_byte; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockReadTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockReadTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_read_time; +} + +jlong Java_org_rocksdb_PerfContext_getBlockReadCpuTime(JNIEnv*, jobject, + jlong jpc_handler) { + // reinterpret_cast(jcf_handle); + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handler); + return perf_context->block_read_cpu_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSecondaryCacheHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSecondaryCacheHitCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->secondary_cache_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getCompressedSecCacheInsertRealCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getCompressedSecCacheInsertRealCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->compressed_sec_cache_insert_real_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getCompressedSecCacheInsertDummyCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getCompressedSecCacheInsertDummyCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->compressed_sec_cache_insert_dummy_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getCompressedSecCacheUncompressedBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getCompressedSecCacheUncompressedBytes( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->compressed_sec_cache_uncompressed_bytes; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getCompressedSecCacheCompressedBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getCompressedSecCacheCompressedBytes( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->compressed_sec_cache_compressed_bytes; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockChecksumTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockChecksumTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_checksum_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockDecompressTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockDecompressTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return 
perf_context->block_decompress_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getReadBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getReadBytes(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_read_bytes; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getMultigetReadBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getMultigetReadBytes(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->multiget_read_bytes; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getIterReadBytes + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getIterReadBytes(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->iter_read_bytes; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlobCacheHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlobCacheHitCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->blob_cache_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlobReadCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlobReadCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->blob_read_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlobReadByte + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlobReadByte(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->blob_read_byte; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlobReadTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlobReadTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->blob_read_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlobChecksumTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlobChecksumTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->blob_checksum_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlobDecompressTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlobDecompressTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->blob_decompress_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getInternal_key_skipped_count + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getInternalKeySkippedCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->internal_key_skipped_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getInternalDeleteSkippedCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getInternalDeleteSkippedCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return 
perf_context->internal_delete_skipped_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getInternalRecentSkippedCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getInternalRecentSkippedCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->internal_recent_skipped_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getInternalMergeCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getInternalMergeCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->internal_merge_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getInternalMergePointLookupCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getInternalMergePointLookupCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->internal_merge_point_lookup_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getInternalRangeDelReseekCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getInternalRangeDelReseekCount( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->internal_range_del_reseek_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSnapshotTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSnapshotTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_snapshot_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getFromMemtableTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getFromMemtableTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_from_memtable_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getFromMemtableCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getFromMemtableCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_from_memtable_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getPostProcessTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getPostProcessTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_post_process_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getFromOutputFilesTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getFromOutputFilesTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_from_output_files_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekOnMemtableTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSeekOnMemtableTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_on_memtable_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekOnMemtableCount + * Signature: (J)J + */ +jlong 
Java_org_rocksdb_PerfContext_getSeekOnMemtableCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_on_memtable_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getNextOnMemtableCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getNextOnMemtableCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->next_on_memtable_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getPrevOnMemtableCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getPrevOnMemtableCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->prev_on_memtable_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekChildSeekTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSeekChildSeekTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_child_seek_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekChildSeekCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSeekChildSeekCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_child_seek_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekMinHeapTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSeekMinHeapTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_min_heap_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekMaxHeapTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSeekMaxHeapTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_max_heap_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getSeekInternalSeekTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getSeekInternalSeekTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->seek_internal_seek_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getFindNextUserEntryTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getFindNextUserEntryTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->find_next_user_entry_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getWriteWalTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getWriteWalTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->write_wal_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getWriteMemtableTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getWriteMemtableTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->write_memtable_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getWriteDelayTime + 
* Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getWriteDelayTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->write_delay_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getWriteSchedulingFlushesCompactionsTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getWriteSchedulingFlushesCompactionsTime( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->write_scheduling_flushes_compactions_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getWritePreAndPostProcessTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getWritePreAndPostProcessTime( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->write_pre_and_post_process_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getWriteThreadWaitNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getWriteThreadWaitNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->write_thread_wait_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getDbMutexLockNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getDbMutexLockNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->db_mutex_lock_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getDbConditionWaitNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getDbConditionWaitNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->db_condition_wait_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getMergeOperatorTimeNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getMergeOperatorTimeNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->merge_operator_time_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getReadIndexBlockNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getReadIndexBlockNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->read_index_block_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getReadFilterBlockNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getReadFilterBlockNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->read_filter_block_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getNewTableBlockIterNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getNewTableBlockIterNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->new_table_block_iter_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getNewTableIteratorNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getNewTableIteratorNanos(JNIEnv*, jobject, + jlong jpc_handle) { + 
ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->new_table_iterator_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBlockSeekNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBlockSeekNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->block_seek_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getFindTableNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getFindTableNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->find_table_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBloomMemtableHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBloomMemtableHitCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->bloom_memtable_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBloomMemtableMissCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBloomMemtableMissCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->bloom_memtable_miss_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBloomSstHitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBloomSstHitCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->bloom_sst_hit_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getBloomSstMissCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getBloomSstMissCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->bloom_sst_miss_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getKeyLockWaitTime + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getKeyLockWaitTime(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->key_lock_wait_time; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getKeyLockWaitCount + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getKeyLockWaitCount(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->key_lock_wait_count; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvNewSequentialFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvNewSequentialFileNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_new_sequential_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvNewRandomAccessFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvNewRandomAccessFileNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_new_random_access_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvNewWritableFileNanos + * Signature: (J)J + */ +jlong 
Java_org_rocksdb_PerfContext_getEnvNewWritableFileNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_new_writable_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvReuseWritableFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvReuseWritableFileNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_reuse_writable_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvNewRandomRwFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvNewRandomRwFileNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_new_random_rw_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvNewDirectoryNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvNewDirectoryNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_new_directory_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvFileExistsNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvFileExistsNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_file_exists_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvGetChildrenNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvGetChildrenNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_get_children_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvGetChildrenFileAttributesNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvGetChildrenFileAttributesNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_get_children_file_attributes_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvDeleteFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvDeleteFileNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_delete_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvCreateDirNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvCreateDirNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_create_dir_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvCreateDirIfMissingNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvCreateDirIfMissingNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_create_dir_if_missing_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvDeleteDirNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvDeleteDirNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + 
reinterpret_cast(jpc_handle); + return perf_context->env_delete_dir_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvGetFileSizeNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvGetFileSizeNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_get_file_size_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvGetFileModificationTimeNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvGetFileModificationTimeNanos( + JNIEnv*, jobject, jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_get_file_modification_time_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvRenameFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvRenameFileNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_rename_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvLinkFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvLinkFileNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_link_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvLockFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvLockFileNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_lock_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvUnlockFileNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvUnlockFileNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_unlock_file_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEnvNewLoggerNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEnvNewLoggerNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->env_new_logger_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getCpuNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getGetCpuNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->get_cpu_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getIterNextCpuNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getIterNextCpuNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->iter_next_cpu_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getIterPrevCpuNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getIterPrevCpuNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->iter_prev_cpu_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getIterSeekCpuNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getIterSeekCpuNanos(JNIEnv*, jobject, + jlong jpc_handle) { + 
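The JNI getters above back the new org.rocksdb.PerfContext accessors, so per-operation timing counters collected in C++ become readable from Java. A minimal sketch of the intended flow follows; it assumes the Java-side class and enum names PerfContext and PerfLevel, and RocksDB#setPerfLevel()/getPerfContext() wrappers over the native entry points added later in this change, so exact signatures may differ.

// Sketch only (org.rocksdb.* imports assumed): enable perf accounting, do a read,
// then inspect the per-call counters exposed by the getters above.
try (final Options options = new Options().setCreateIfMissing(true);
     final RocksDB db = RocksDB.open(options, "/tmp/perf_context_demo")) {
  db.setPerfLevel(PerfLevel.ENABLE_TIME_EXCEPT_FOR_MUTEX); // assumed Java enum constant
  db.put("key".getBytes(), "value".getBytes());
  db.get("key".getBytes());
  final PerfContext perfContext = db.getPerfContext(); // assumed wrapper over getPerfContextNative
  System.out.println("get_cpu_nanos       = " + perfContext.getGetCpuNanos());
  System.out.println("block_seek_nanos    = " + perfContext.getBlockSeekNanos());
  System.out.println("db_mutex_lock_nanos = " + perfContext.getDbMutexLockNanos());
} catch (final RocksDBException e) {
  e.printStackTrace();
}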
ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->iter_seek_cpu_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getEncryptDataNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getEncryptDataNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->encrypt_data_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getDecryptDataNanos + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getDecryptDataNanos(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->decrypt_data_nanos; +} + +/* + * Class: org_rocksdb_PerfContext + * Method: getNumberAsyncSeek + * Signature: (J)J + */ +jlong Java_org_rocksdb_PerfContext_getNumberAsyncSeek(JNIEnv*, jobject, + jlong jpc_handle) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = + reinterpret_cast(jpc_handle); + return perf_context->number_async_seek; +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index b848ea9cffd4..0d84901c9176 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -3904,6 +3904,29 @@ jbyte Java_org_rocksdb_Options_prepopulateBlobCache(JNIEnv*, jobject, opts->prepopulate_blob_cache); } +/* + * Class: org_rocksdb_Options + * Method: setMemtableMaxRangeDeletions + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setMemtableMaxRangeDeletions( + JNIEnv*, jobject, jlong jhandle, jint jmemtable_max_range_deletions) { + auto* opts = reinterpret_cast(jhandle); + opts->memtable_max_range_deletions = + static_cast(jmemtable_max_range_deletions); +} + +/* + * Class: org_rocksdb_Options + * Method: memtableMaxRangeDeletions + * Signature: (J)I + */ +jint Java_org_rocksdb_Options_memtableMaxRangeDeletions(JNIEnv*, jobject, + jlong jhandle) { + auto* opts = reinterpret_cast(jhandle); + return static_cast(opts->memtable_max_range_deletions); +} + ////////////////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::ColumnFamilyOptions @@ -3990,9 +4013,13 @@ jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps__Ljav } auto* cf_options = new ROCKSDB_NAMESPACE::ColumnFamilyOptions(); + ROCKSDB_NAMESPACE::ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; ROCKSDB_NAMESPACE::Status status = ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString( - ROCKSDB_NAMESPACE::ColumnFamilyOptions(), opt_string, cf_options); + config_options, ROCKSDB_NAMESPACE::ColumnFamilyOptions(), opt_string, + cf_options); env->ReleaseStringUTFChars(jopt_string, opt_string); @@ -5766,6 +5793,30 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_prepopulateBlobCache(JNIEnv*, opts->prepopulate_blob_cache); } +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMemtableMaxRangeDeletions + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMemtableMaxRangeDeletions( + JNIEnv*, jobject, jlong jhandle, jint jmemtable_max_range_deletions) { + auto* opts = + reinterpret_cast(jhandle); + opts->memtable_max_range_deletions = jmemtable_max_range_deletions; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memtableMaxRangeDeletions + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_memtableMaxRangeDeletions( + JNIEnv*, jobject, jlong jhandle) { + auto* opts = + reinterpret_cast(jhandle); + 
return static_cast(opts->memtable_max_range_deletions); +} + ///////////////////////////////////////////////////////////////////// // ROCKSDB_NAMESPACE::DBOptions @@ -5848,9 +5899,13 @@ jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps__Ljava_lang_String_2( return 0; } + const ROCKSDB_NAMESPACE::DBOptions base_options; auto* db_options = new ROCKSDB_NAMESPACE::DBOptions(); + ROCKSDB_NAMESPACE::ConfigOptions config_options(base_options); + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; ROCKSDB_NAMESPACE::Status status = ROCKSDB_NAMESPACE::GetDBOptionsFromString( - ROCKSDB_NAMESPACE::DBOptions(), opt_string, db_options); + config_options, base_options, opt_string, db_options); env->ReleaseStringUTFChars(jopt_string, opt_string); diff --git a/java/rocksjni/options_util.cc b/java/rocksjni/options_util.cc index 1a5fb9bb5ac7..5ebdbba929f0 100644 --- a/java/rocksjni/options_util.cc +++ b/java/rocksjni/options_util.cc @@ -54,37 +54,9 @@ void build_column_family_descriptor_list( /* * Class: org_rocksdb_OptionsUtil * Method: loadLatestOptions - * Signature: (Ljava/lang/String;JLjava/util/List;Z)V - */ -void Java_org_rocksdb_OptionsUtil_loadLatestOptions__Ljava_lang_String_2JJLjava_util_List_2Z( - JNIEnv* env, jclass /*jcls*/, jstring jdbpath, jlong jenv_handle, - jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) { - jboolean has_exception = JNI_FALSE; - auto db_path = - ROCKSDB_NAMESPACE::JniUtil::copyStdString(env, jdbpath, &has_exception); - if (has_exception == JNI_TRUE) { - // exception occurred - return; - } - std::vector cf_descs; - ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadLatestOptions( - db_path, reinterpret_cast(jenv_handle), - reinterpret_cast(jdb_opts_handle), - &cf_descs, ignore_unknown_options); - if (!s.ok()) { - // error, raise an exception - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - } else { - build_column_family_descriptor_list(env, jcfds, cf_descs); - } -} - -/* - * Class: org_rocksdb_OptionsUtil - * Method: loadLatestOptions_1 * Signature: (JLjava/lang/String;JLjava/util/List;)V */ -void Java_org_rocksdb_OptionsUtil_loadLatestOptions__JLjava_lang_String_2JLjava_util_List_2( +void Java_org_rocksdb_OptionsUtil_loadLatestOptions( JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jdbpath, jlong jdb_opts_handle, jobject jcfds) { jboolean has_exception = JNI_FALSE; @@ -109,40 +81,12 @@ void Java_org_rocksdb_OptionsUtil_loadLatestOptions__JLjava_lang_String_2JLjava_ } } -/* - * Class: org_rocksdb_OptionsUtil - * Method: loadOptionsFromFile - * Signature: (Ljava/lang/String;JJLjava/util/List;Z)V - */ -void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__Ljava_lang_String_2JJLjava_util_List_2Z( - JNIEnv* env, jclass /*jcls*/, jstring jopts_file_name, jlong jenv_handle, - jlong jdb_opts_handle, jobject jcfds, jboolean ignore_unknown_options) { - jboolean has_exception = JNI_FALSE; - auto opts_file_name = ROCKSDB_NAMESPACE::JniUtil::copyStdString( - env, jopts_file_name, &has_exception); - if (has_exception == JNI_TRUE) { - // exception occurred - return; - } - std::vector cf_descs; - ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::LoadOptionsFromFile( - opts_file_name, reinterpret_cast(jenv_handle), - reinterpret_cast(jdb_opts_handle), - &cf_descs, ignore_unknown_options); - if (!s.ok()) { - // error, raise an exception - ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); - } else { - build_column_family_descriptor_list(env, jcfds, cf_descs); - } -} - /* * Class: 
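On the Java side the new handles above are reached through Options#setMemtableMaxRangeDeletions and ColumnFamilyOptions#setMemtableMaxRangeDeletions (names taken from the JNI symbols). A hedged sketch of the option in use, assuming the setter follows the usual fluent style of the other options:

// Mark a memtable for flush once it holds this many range tombstones;
// 0 (the default) disables the limit.
try (final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) {
  cfOptions.setMemtableMaxRangeDeletions(1000);
  assert cfOptions.memtableMaxRangeDeletions() == 1000;
}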
org_rocksdb_OptionsUtil * Method: loadOptionsFromFile * Signature: (JLjava/lang/String;JLjava/util/List;)V */ -void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile__JLjava_lang_String_2JLjava_util_List_2( +void Java_org_rocksdb_OptionsUtil_loadOptionsFromFile( JNIEnv* env, jclass /*jcls*/, jlong cfg_handle, jstring jopts_file_name, jlong jdb_opts_handle, jobject jcfds) { jboolean has_exception = JNI_FALSE; @@ -193,3 +137,51 @@ jstring Java_org_rocksdb_OptionsUtil_getLatestOptionsFileName( return env->NewStringUTF(options_file_name.c_str()); } } + +/* + * Class: org_rocksdb_OptionsUtil + * Method: readTableFormatConfig + * Signature: (J)Lorg/rocksdb/TableFormatConfig; + */ +jobject Java_org_rocksdb_OptionsUtil_readTableFormatConfig(JNIEnv* env, jclass, + jlong jcf_options) { + if (jcf_options == 0) { + env->ThrowNew( + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::getJClass(env), + "Null column family options handle supplied to " + "readNewTableFormatConfig"); + return nullptr; + } + + auto* cf_options = + reinterpret_cast(jcf_options); + auto* table_factory = cf_options->table_factory.get(); + if (table_factory == nullptr) { + env->ThrowNew( + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::getJClass(env), + "Column family options supplied to readNewTableFormatConfig has no " + "table options"); + return nullptr; + } + + if (strcmp(ROCKSDB_NAMESPACE::TableFactory::kBlockBasedTableName(), + table_factory->Name()) == 0) { + auto* table_factory_options = + table_factory->GetOptions(); + if (table_factory_options == nullptr) { + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, std::string("Null table format options supplied to " + "readNewTableFormatConfig() ") + + table_factory->Name()); + return nullptr; + } + return ROCKSDB_NAMESPACE::BlockBasedTableOptionsJni::construct( + env, table_factory_options); + } else { + ROCKSDB_NAMESPACE::IllegalArgumentExceptionJni::ThrowNew( + env, std::string("readNewTableFormatConfig() is not implemented for " + "this table format: ") + + table_factory->Name()); + return nullptr; + } +} diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 1a72507a9356..840956dae9be 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -27,6 +27,7 @@ #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/perf_level.h" #include "rocksdb/rate_limiter.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -207,6 +208,18 @@ class IllegalArgumentExceptionJni return JavaException::ThrowNew(env, s.ToString()); } + + /** + * Create and throw a Java IllegalArgumentException with the provided message + * + * @param env A pointer to the Java environment + * @param msg The message for the exception + * + * @return true if an exception was thrown, false otherwise + */ + static bool ThrowNew(JNIEnv* env, const std::string& msg) { + return JavaException::ThrowNew(env, msg); + } }; // The portal class for org.rocksdb.Status.Code @@ -3561,13 +3574,20 @@ class IteratorJni } }; -// The portal class for org.rocksdb.Filter -class FilterJni +// The portal class for org.rocksdb.FilterPolicy + +enum FilterPolicyTypeJni { + kUnknownFilterPolicy = 0x00, + kBloomFilterPolicy = 0x01, + kRibbonFilterPolicy = 0x02, +}; +class FilterPolicyJni : public RocksDBNativeClass< - std::shared_ptr*, FilterJni> { + std::shared_ptr*, FilterPolicyJni> { + private: public: /** - * Get the Java Class org.rocksdb.Filter + * Get the Java Class org.rocksdb.FilterPolicy * * @param env A pointer to the Java 
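The collapsed loadLatestOptions/loadOptionsFromFile overloads above now take a ConfigOptions handle instead of an Env plus an ignore-unknown-options flag, and readTableFormatConfig recovers a BlockBasedTableConfig from a ColumnFamilyOptions handle when the column family uses the block-based table factory. A rough usage sketch, assuming the Java wrappers mirror the JNI signatures:

// org.rocksdb.* and java.util.* imports assumed; unknown-option handling is governed by ConfigOptions.
final List<ColumnFamilyDescriptor> cfDescs = new ArrayList<>();
try (final ConfigOptions configOptions = new ConfigOptions();
     final DBOptions dbOptions = new DBOptions()) {
  OptionsUtil.loadLatestOptions(configOptions, "/path/to/db", dbOptions, cfDescs);
} catch (final RocksDBException e) {
  e.printStackTrace();
}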
environment * @@ -3576,7 +3596,19 @@ class FilterJni * OutOfMemoryError or ExceptionInInitializerError exceptions is thrown */ static jclass getJClass(JNIEnv* env) { - return RocksDBNativeClass::getJClass(env, "org/rocksdb/Filter"); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/FilterPolicy"); + } + + static jbyte toJavaIndexType(const FilterPolicyTypeJni& filter_policy_type) { + return static_cast(filter_policy_type); + } + + static FilterPolicyTypeJni getFilterPolicyType( + const std::string& policy_class_name) { + if (policy_class_name == "rocksdb.BuiltinBloomFilter") { + return kBloomFilterPolicy; + } + return kUnknownFilterPolicy; } }; @@ -4806,8 +4838,6 @@ class TickerTypeJni { return 0x6; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT: return 0x7; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT: - return 0x8; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS: return 0x9; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_HIT: @@ -4816,8 +4846,6 @@ class TickerTypeJni { return 0xB; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT: return 0xC; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT: - return 0xD; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS: return 0xE; case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_HIT: @@ -4884,34 +4912,20 @@ class TickerTypeJni { return 0x2D; case ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ: return 0x2E; - case ROCKSDB_NAMESPACE::Tickers::NO_FILE_CLOSES: - return 0x2F; case ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS: return 0x30; case ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS: return 0x31; - case ROCKSDB_NAMESPACE::Tickers::STALL_L0_SLOWDOWN_MICROS: - return 0x32; - case ROCKSDB_NAMESPACE::Tickers::STALL_MEMTABLE_COMPACTION_MICROS: - return 0x33; - case ROCKSDB_NAMESPACE::Tickers::STALL_L0_NUM_FILES_MICROS: - return 0x34; case ROCKSDB_NAMESPACE::Tickers::STALL_MICROS: return 0x35; case ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS: return 0x36; - case ROCKSDB_NAMESPACE::Tickers::RATE_LIMIT_DELAY_MILLIS: - return 0x37; - case ROCKSDB_NAMESPACE::Tickers::NO_ITERATORS: - return 0x38; case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS: return 0x39; case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ: return 0x3A; case ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ: return 0x3B; - case ROCKSDB_NAMESPACE::Tickers::NUMBER_FILTERED_DELETES: - return 0x3C; case ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES: return 0x3D; case ROCKSDB_NAMESPACE::Tickers::BLOOM_FILTER_PREFIX_CHECKED: @@ -4922,14 +4936,6 @@ class TickerTypeJni { return 0x40; case ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS: return 0x41; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_MISS: - return 0x42; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_HIT: - return 0x43; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD: - return 0x44; - case ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES: - return 0x45; case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED: return 0x46; case ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES: @@ -4938,8 +4944,6 @@ class TickerTypeJni { return 0x48; case ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER: return 0x49; - case ROCKSDB_NAMESPACE::Tickers::WRITE_TIMEDOUT: - return 0x4A; case ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL: return 0x4B; case ROCKSDB_NAMESPACE::Tickers::COMPACT_READ_BYTES: @@ -5043,16 +5047,8 @@ class TickerTypeJni { return 0x7C; case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES: 
return 0x7D; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_OVERWRITTEN: - return 0x7E; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_EXPIRED: - return 0x7F; case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED: return -0x02; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_OVERWRITTEN: - return -0x03; - case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_EXPIRED: - return -0x04; case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED: return -0x05; case ROCKSDB_NAMESPACE::Tickers::BLOB_DB_FIFO_NUM_FILES_EVICTED: @@ -5155,6 +5151,30 @@ class TickerTypeJni { return -0x35; case ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT: return -0x36; + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS: + return -0x37; + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS: + return -0x38; + case ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS: + return -0x39; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS: + return -0x3A; + case ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT: + return -0x3B; + case ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT: + return -0x3C; + case ROCKSDB_NAMESPACE::Tickers::READAHEAD_TRIMMED: + return -0x3D; + case ROCKSDB_NAMESPACE::Tickers::FIFO_MAX_SIZE_COMPACTIONS: + return -0x3E; + case ROCKSDB_NAMESPACE::Tickers::FIFO_TTL_COMPACTIONS: + return -0x3F; + case ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES: + return -0x40; + case ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES_USEFUL: + return -0x41; + case ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS: + return -0x42; case ROCKSDB_NAMESPACE::Tickers::TICKER_ENUM_MAX: // 0x5F was the max value in the initial copy of tickers to Java. // Since these values are exposed directly to Java clients, we keep @@ -5191,8 +5211,6 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_ADD; case 0x7: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_INSERT; - case 0x8: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_INDEX_BYTES_EVICT; case 0x9: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_MISS; case 0xA: @@ -5201,8 +5219,6 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_ADD; case 0xC: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_INSERT; - case 0xD: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_FILTER_BYTES_EVICT; case 0xE: return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_DATA_MISS; case 0xF: @@ -5269,34 +5285,20 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::NUMBER_DB_PREV_FOUND; case 0x2E: return ROCKSDB_NAMESPACE::Tickers::ITER_BYTES_READ; - case 0x2F: - return ROCKSDB_NAMESPACE::Tickers::NO_FILE_CLOSES; case 0x30: return ROCKSDB_NAMESPACE::Tickers::NO_FILE_OPENS; case 0x31: return ROCKSDB_NAMESPACE::Tickers::NO_FILE_ERRORS; - case 0x32: - return ROCKSDB_NAMESPACE::Tickers::STALL_L0_SLOWDOWN_MICROS; - case 0x33: - return ROCKSDB_NAMESPACE::Tickers::STALL_MEMTABLE_COMPACTION_MICROS; - case 0x34: - return ROCKSDB_NAMESPACE::Tickers::STALL_L0_NUM_FILES_MICROS; case 0x35: return ROCKSDB_NAMESPACE::Tickers::STALL_MICROS; case 0x36: return ROCKSDB_NAMESPACE::Tickers::DB_MUTEX_WAIT_MICROS; - case 0x37: - return ROCKSDB_NAMESPACE::Tickers::RATE_LIMIT_DELAY_MILLIS; - case 0x38: - return ROCKSDB_NAMESPACE::Tickers::NO_ITERATORS; case 0x39: return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_CALLS; case 0x3A: return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_KEYS_READ; case 0x3B: return ROCKSDB_NAMESPACE::Tickers::NUMBER_MULTIGET_BYTES_READ; - case 0x3C: - return 
ROCKSDB_NAMESPACE::Tickers::NUMBER_FILTERED_DELETES; case 0x3D: return ROCKSDB_NAMESPACE::Tickers::NUMBER_MERGE_FAILURES; case 0x3E: @@ -5307,14 +5309,6 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::NUMBER_OF_RESEEKS_IN_ITERATION; case 0x41: return ROCKSDB_NAMESPACE::Tickers::GET_UPDATES_SINCE_CALLS; - case 0x42: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_MISS; - case 0x43: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_HIT; - case 0x44: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD; - case 0x45: - return ROCKSDB_NAMESPACE::Tickers::BLOCK_CACHE_COMPRESSED_ADD_FAILURES; case 0x46: return ROCKSDB_NAMESPACE::Tickers::WAL_FILE_SYNCED; case 0x47: @@ -5323,8 +5317,6 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_SELF; case 0x49: return ROCKSDB_NAMESPACE::Tickers::WRITE_DONE_BY_OTHER; - case 0x4A: - return ROCKSDB_NAMESPACE::Tickers::WRITE_TIMEDOUT; case 0x4B: return ROCKSDB_NAMESPACE::Tickers::WRITE_WITH_WAL; case 0x4C: @@ -5429,16 +5421,8 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_NEW_FILES; case 0x7D: return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_FAILURES; - case 0x7E: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_OVERWRITTEN; - case 0x7F: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_EXPIRED; case -0x02: return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_NUM_KEYS_RELOCATED; - case -0x03: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_OVERWRITTEN; - case -0x04: - return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_EXPIRED; case -0x05: return ROCKSDB_NAMESPACE::Tickers::BLOB_DB_GC_BYTES_RELOCATED; case -0x06: @@ -5542,6 +5526,30 @@ class TickerTypeJni { return ROCKSDB_NAMESPACE::Tickers::READ_ASYNC_MICROS; case -0x36: return ROCKSDB_NAMESPACE::Tickers::ASYNC_READ_ERROR_COUNT; + case -0x37: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_FILTER_HITS; + case -0x38: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_INDEX_HITS; + case -0x39: + return ROCKSDB_NAMESPACE::Tickers::SECONDARY_CACHE_DATA_HITS; + case -0x3A: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_MISS; + case -0x3B: + return ROCKSDB_NAMESPACE::Tickers::TABLE_OPEN_PREFETCH_TAIL_HIT; + case -0x3C: + return ROCKSDB_NAMESPACE::Tickers::BLOCK_CHECKSUM_MISMATCH_COUNT; + case -0x3D: + return ROCKSDB_NAMESPACE::Tickers::READAHEAD_TRIMMED; + case -0x3E: + return ROCKSDB_NAMESPACE::Tickers::FIFO_MAX_SIZE_COMPACTIONS; + case -0x3F: + return ROCKSDB_NAMESPACE::Tickers::FIFO_TTL_COMPACTIONS; + case -0x40: + return ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES; + case -0x41: + return ROCKSDB_NAMESPACE::Tickers::PREFETCH_BYTES_USEFUL; + case -0x42: + return ROCKSDB_NAMESPACE::Tickers::PREFETCH_HITS; case 0x5F: // 0x5F was the max value in the initial copy of tickers to Java. 
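The mapping changes above retire tickers that no longer exist in C++ and assign new byte codes (continuing the negative range) to the secondary-cache, prefetch and FIFO-compaction tickers. Reading one of the new counters from Java might look like the sketch below; it assumes matching constants were added to the TickerType enum in the same change.

// Sketch only (org.rocksdb.* imports assumed).
try (final Statistics statistics = new Statistics();
     final Options options = new Options().setCreateIfMissing(true).setStatistics(statistics);
     final RocksDB db = RocksDB.open(options, "/tmp/ticker_demo")) {
  db.put("key".getBytes(), "value".getBytes());
  db.get("key".getBytes());
  System.out.println("readahead trimmed:     " + statistics.getTickerCount(TickerType.READAHEAD_TRIMMED));
  System.out.println("prefetch bytes useful: " + statistics.getTickerCount(TickerType.PREFETCH_BYTES_USEFUL));
} catch (final RocksDBException e) {
  e.printStackTrace();
}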
// Since these values are exposed directly to Java clients, we keep @@ -5594,16 +5602,6 @@ class HistogramTypeJni { return 0xB; case ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS: return 0xC; - case ROCKSDB_NAMESPACE::Histograms::STALL_L0_SLOWDOWN_COUNT: - return 0xD; - case ROCKSDB_NAMESPACE::Histograms::STALL_MEMTABLE_COMPACTION_COUNT: - return 0xE; - case ROCKSDB_NAMESPACE::Histograms::STALL_L0_NUM_FILES_COUNT: - return 0xF; - case ROCKSDB_NAMESPACE::Histograms::HARD_RATE_LIMIT_DELAY_COUNT: - return 0x10; - case ROCKSDB_NAMESPACE::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT: - return 0x11; case ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION: return 0x12; case ROCKSDB_NAMESPACE::Histograms::DB_SEEK: @@ -5656,8 +5654,6 @@ class HistogramTypeJni { return 0x2A; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS: return 0x2B; - case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GC_MICROS: - return 0x2C; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS: return 0x2D; case ROCKSDB_NAMESPACE::Histograms::BLOB_DB_DECOMPRESSION_MICROS: @@ -5665,8 +5661,6 @@ class HistogramTypeJni { case ROCKSDB_NAMESPACE::Histograms:: NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL: return 0x2F; - case ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL: - return 0x30; case ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL: return 0x31; case ROCKSDB_NAMESPACE::Histograms::ERROR_HANDLER_AUTORESUME_RETRY_COUNT: @@ -5683,6 +5677,25 @@ class HistogramTypeJni { return 0x37; case ASYNC_PREFETCH_ABORT_MICROS: return 0x38; + case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: + return 0x39; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS: + return 0x3A; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS: + return 0x3B; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS: + return 0x3C; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_GET_MICROS: + return 0x3D; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_MULTIGET_MICROS: + return 0x3E; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_ITERATOR_MICROS: + return 0x3F; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_VERIFY_DB_CHECKSUM_MICROS: + return 0x40; + case ROCKSDB_NAMESPACE::Histograms:: + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS: + return 0x41; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. 
return 0x1F; @@ -5723,16 +5736,6 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::READ_BLOCK_GET_MICROS; case 0xC: return ROCKSDB_NAMESPACE::Histograms::WRITE_RAW_BLOCK_MICROS; - case 0xD: - return ROCKSDB_NAMESPACE::Histograms::STALL_L0_SLOWDOWN_COUNT; - case 0xE: - return ROCKSDB_NAMESPACE::Histograms::STALL_MEMTABLE_COMPACTION_COUNT; - case 0xF: - return ROCKSDB_NAMESPACE::Histograms::STALL_L0_NUM_FILES_COUNT; - case 0x10: - return ROCKSDB_NAMESPACE::Histograms::HARD_RATE_LIMIT_DELAY_COUNT; - case 0x11: - return ROCKSDB_NAMESPACE::Histograms::SOFT_RATE_LIMIT_DELAY_COUNT; case 0x12: return ROCKSDB_NAMESPACE::Histograms::NUM_FILES_IN_SINGLE_COMPACTION; case 0x13: @@ -5785,8 +5788,6 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_READ_MICROS; case 0x2B: return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_BLOB_FILE_SYNC_MICROS; - case 0x2C: - return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_GC_MICROS; case 0x2D: return ROCKSDB_NAMESPACE::Histograms::BLOB_DB_COMPRESSION_MICROS; case 0x2E: @@ -5794,8 +5795,6 @@ class HistogramTypeJni { case 0x2F: return ROCKSDB_NAMESPACE::Histograms:: NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL; - case 0x30: - return ROCKSDB_NAMESPACE::Histograms::NUM_DATA_BLOCKS_READ_PER_LEVEL; case 0x31: return ROCKSDB_NAMESPACE::Histograms::NUM_SST_READ_PER_LEVEL; case 0x32: @@ -5813,6 +5812,27 @@ class HistogramTypeJni { return ROCKSDB_NAMESPACE::Histograms::NUM_LEVEL_READ_PER_MULTIGET; case 0x38: return ROCKSDB_NAMESPACE::Histograms::ASYNC_PREFETCH_ABORT_MICROS; + case 0x39: + return ROCKSDB_NAMESPACE::Histograms:: + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + case 0x3A: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS; + case 0x3B: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS; + case 0x3C: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_OPEN_MICROS; + case 0x3D: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_GET_MICROS; + case 0x3E: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_MULTIGET_MICROS; + case 0x3F: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_DB_ITERATOR_MICROS; + case 0x40: + return ROCKSDB_NAMESPACE::Histograms:: + FILE_READ_VERIFY_DB_CHECKSUM_MICROS; + case 0x41: + return ROCKSDB_NAMESPACE::Histograms:: + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS; case 0x1F: // 0x1F for backwards compatibility on current minor version. 
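Likewise, the new FILE_READ_* histograms are given byte codes 0x39 through 0x41 here. Reading one of them reuses the Statistics handle from the ticker sketch above and assumes matching HistogramType constants on the Java side:

// Per-context file-read latency, e.g. reads issued on behalf of flushes.
final HistogramData flushReadLatency =
    statistics.getHistogramData(HistogramType.FILE_READ_FLUSH_MICROS);
System.out.println("median flush-read micros: " + flushReadLatency.getMedian());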
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -5945,6 +5965,52 @@ class MemoryUsageTypeJni { } }; +class PerfLevelTypeJni { + public: + static jbyte toJavaPerfLevelType(const ROCKSDB_NAMESPACE::PerfLevel level) { + switch (level) { + case ROCKSDB_NAMESPACE::PerfLevel::kUninitialized: + return 0x0; + case ROCKSDB_NAMESPACE::PerfLevel::kDisable: + return 0x1; + case ROCKSDB_NAMESPACE::PerfLevel::kEnableCount: + return 0x2; + case ROCKSDB_NAMESPACE::PerfLevel::kEnableTimeExceptForMutex: + return 0x3; + case ROCKSDB_NAMESPACE::PerfLevel::kEnableTimeAndCPUTimeExceptForMutex: + return 0x4; + case ROCKSDB_NAMESPACE::PerfLevel::kEnableTime: + return 0x5; + case ROCKSDB_NAMESPACE::PerfLevel::kOutOfBounds: + return 0x6; + default: + return 0x6; + } + } + + static ROCKSDB_NAMESPACE::PerfLevel toCppPerfLevelType(const jbyte level) { + switch (level) { + case 0x0: + return ROCKSDB_NAMESPACE::PerfLevel::kUninitialized; + case 0x1: + return ROCKSDB_NAMESPACE::PerfLevel::kDisable; + case 0x2: + return ROCKSDB_NAMESPACE::PerfLevel::kEnableCount; + case 0x3: + return ROCKSDB_NAMESPACE::PerfLevel::kEnableTimeExceptForMutex; + case 0x4: + return ROCKSDB_NAMESPACE::PerfLevel:: + kEnableTimeAndCPUTimeExceptForMutex; + case 0x5: + return ROCKSDB_NAMESPACE::PerfLevel::kEnableTime; + case 0x6: + return ROCKSDB_NAMESPACE::PerfLevel::kOutOfBounds; + default: + return ROCKSDB_NAMESPACE::PerfLevel::kOutOfBounds; + } + } +}; + // The portal class for org.rocksdb.Transaction class TransactionJni : public JavaClass { public: @@ -6707,7 +6773,7 @@ class ChecksumTypeJni { return ROCKSDB_NAMESPACE::ChecksumType::kXXH3; default: // undefined/default - return ROCKSDB_NAMESPACE::ChecksumType::kCRC32c; + return ROCKSDB_NAMESPACE::ChecksumType::kXXH3; } } }; @@ -6852,6 +6918,8 @@ class OperationTypeJni { return 0x1; case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH: return 0x2; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN: + return 0x3; default: return 0x7F; // undefined } @@ -6868,6 +6936,8 @@ class OperationTypeJni { return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION; case 0x2: return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH; + case 0x3: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN; default: // undefined/default return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN; @@ -8757,5 +8827,124 @@ class FileOperationInfoJni : public JavaClass { "(Ljava/lang/String;JJJJLorg/rocksdb/Status;)V"); } }; + +class CompactRangeOptionsTimestampJni : public JavaClass { + public: + static jobject fromCppTimestamp(JNIEnv* env, const uint64_t start, + const uint64_t range) { + jclass jclazz = getJClass(env); + assert(jclazz != nullptr); + static jmethodID ctor = getConstructorMethodId(env, jclazz); + assert(ctor != nullptr); + return env->NewObject(jclazz, ctor, static_cast(start), + static_cast(range)); + } + + static jclass getJClass(JNIEnv* env) { + return JavaClass::getJClass(env, + "org/rocksdb/CompactRangeOptions$Timestamp"); + } + + static jmethodID getConstructorMethodId(JNIEnv* env, jclass clazz) { + return env->GetMethodID(clazz, "", "(JJ)V"); + } +}; + +// The portal class for org.rocksdb.BlockBasedTableOptions +class BlockBasedTableOptionsJni + : public RocksDBNativeClass { + public: + /** + * Get the Java Class org.rocksdb.BlockBasedTableConfig + * + * @param env A pointer to the Java environment + * + * @return The Java Class or nullptr if one of the + * ClassFormatError, ClassCircularityError, NoClassDefFoundError, + * 
OutOfMemoryError or ExceptionInInitializerError exceptions is thrown + */ + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/BlockBasedTableConfig"); + } + + /** + * Create a new Java org.rocksdb.BlockBasedTableConfig object with the + * properties as the provided C++ ROCKSDB_NAMESPACE::BlockBasedTableOptions + * object + * + * @param env A pointer to the Java environment + * @param cfoptions A pointer to ROCKSDB_NAMESPACE::ColumnFamilyOptions object + * + * @return A reference to a Java org.rocksdb.ColumnFamilyOptions object, or + * nullptr if an an exception occurs + */ + static jobject construct( + JNIEnv* env, const BlockBasedTableOptions* table_factory_options) { + jclass jclazz = getJClass(env); + if (jclazz == nullptr) { + // exception occurred accessing class + return nullptr; + } + + jmethodID method_id_init = + env->GetMethodID(jclazz, "", "(ZZZZBBDBZJIIIJZZZZZIIZZBBJD)V"); + if (method_id_init == nullptr) { + // exception thrown: NoSuchMethodException or OutOfMemoryError + return nullptr; + } + + FilterPolicyTypeJni filter_policy_type = + FilterPolicyTypeJni::kUnknownFilterPolicy; + jlong filter_policy_handle = 0L; + jdouble filter_policy_config_value = 0.0; + if (table_factory_options->filter_policy) { + auto filter_policy = table_factory_options->filter_policy.get(); + filter_policy_type = FilterPolicyJni::getFilterPolicyType( + filter_policy->CompatibilityName()); + if (FilterPolicyTypeJni::kUnknownFilterPolicy != filter_policy_type) { + filter_policy_handle = GET_CPLUSPLUS_POINTER(filter_policy); + } + } + + jobject jcfd = env->NewObject( + jclazz, method_id_init, + table_factory_options->cache_index_and_filter_blocks, + table_factory_options->cache_index_and_filter_blocks_with_high_priority, + table_factory_options->pin_l0_filter_and_index_blocks_in_cache, + table_factory_options->pin_top_level_index_and_filter, + IndexTypeJni::toJavaIndexType(table_factory_options->index_type), + DataBlockIndexTypeJni::toJavaDataBlockIndexType( + table_factory_options->data_block_index_type), + table_factory_options->data_block_hash_table_util_ratio, + ChecksumTypeJni::toJavaChecksumType(table_factory_options->checksum), + table_factory_options->no_block_cache, + static_cast(table_factory_options->block_size), + table_factory_options->block_size_deviation, + table_factory_options->block_restart_interval, + table_factory_options->index_block_restart_interval, + static_cast(table_factory_options->metadata_block_size), + table_factory_options->partition_filters, + table_factory_options->optimize_filters_for_memory, + table_factory_options->use_delta_encoding, + table_factory_options->whole_key_filtering, + table_factory_options->verify_compression, + table_factory_options->read_amp_bytes_per_bit, + table_factory_options->format_version, + table_factory_options->enable_index_compression, + table_factory_options->block_align, + IndexShorteningModeJni::toJavaIndexShorteningMode( + table_factory_options->index_shortening), + FilterPolicyJni::toJavaIndexType(filter_policy_type), + filter_policy_handle, filter_policy_config_value); + if (env->ExceptionCheck()) { + return nullptr; + } + + return jcfd; + } +}; + } // namespace ROCKSDB_NAMESPACE #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index ced72e841602..66eb2488b1bf 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -22,6 +22,7 @@ #include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/options.h" 
+#include "rocksdb/perf_context.h" #include "rocksdb/types.h" #include "rocksdb/version.h" #include "rocksjni/cplusplus_to_java_convert.h" @@ -489,6 +490,63 @@ jlongArray Java_org_rocksdb_RocksDB_createColumnFamilies__J_3J_3_3B( return jcf_handles; } +/* + * Class: org_rocksdb_RocksDB + * Method: createColumnFamilyWithImport + * Signature: (J[BIJJ[J)J + */ +jlong Java_org_rocksdb_RocksDB_createColumnFamilyWithImport( + JNIEnv* env, jobject, jlong jdb_handle, jbyteArray jcf_name, + jint jcf_name_len, jlong j_cf_options, jlong j_cf_import_options, + jlongArray j_metadata_handle_array) { + auto* db = reinterpret_cast(jdb_handle); + jboolean has_exception = JNI_FALSE; + const std::string cf_name = + ROCKSDB_NAMESPACE::JniUtil::byteString( + env, jcf_name, jcf_name_len, + [](const char* str, const size_t len) { + return std::string(str, len); + }, + &has_exception); + if (has_exception == JNI_TRUE) { + // exception occurred + return 0; + } + auto* cf_options = + reinterpret_cast(j_cf_options); + + auto* cf_import_options = + reinterpret_cast( + j_cf_import_options); + + std::vector metadatas; + jlong* ptr_metadata_handle_array = + env->GetLongArrayElements(j_metadata_handle_array, nullptr); + if (j_metadata_handle_array == nullptr) { + // exception thrown: OutOfMemoryError + return 0; + } + const jsize array_size = env->GetArrayLength(j_metadata_handle_array); + for (jsize i = 0; i < array_size; ++i) { + const ROCKSDB_NAMESPACE::ExportImportFilesMetaData* metadata_ptr = + reinterpret_cast( + ptr_metadata_handle_array[i]); + metadatas.push_back(metadata_ptr); + } + env->ReleaseLongArrayElements(j_metadata_handle_array, + ptr_metadata_handle_array, JNI_ABORT); + + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle = nullptr; + ROCKSDB_NAMESPACE::Status s = db->CreateColumnFamilyWithImport( + *cf_options, cf_name, *cf_import_options, metadatas, &cf_handle); + if (!s.ok()) { + // error occurred + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return 0; + } + return GET_CPLUSPLUS_POINTER(cf_handle); +} + /* * Class: org_rocksdb_RocksDB * Method: dropColumnFamily @@ -978,8 +1036,13 @@ bool rocksdb_delete_range_helper( ROCKSDB_NAMESPACE::Slice end_key_slice(reinterpret_cast(end_key), jend_key_len); - ROCKSDB_NAMESPACE::Status s = - db->DeleteRange(write_options, cf_handle, begin_key_slice, end_key_slice); + ROCKSDB_NAMESPACE::Status s; + if (cf_handle != nullptr) { + s = db->DeleteRange(write_options, cf_handle, begin_key_slice, + end_key_slice); + } else { + s = db->DeleteRange(write_options, begin_key_slice, end_key_slice); + } // cleanup delete[] begin_key; @@ -1165,6 +1228,61 @@ void Java_org_rocksdb_RocksDB_deleteRange__JJ_3BII_3BIIJ( } } +/* + * Class: org_rocksdb_RocksDB + * Method: clipColumnFamily + * Signature: (JJ[BII[BII)V + */ +void Java_org_rocksdb_RocksDB_clipColumnFamily( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jbyteArray jbegin_key, jint jbegin_key_off, jint jbegin_key_len, + jbyteArray jend_key, jint jend_key_off, jint jend_key_len) { + auto* db = reinterpret_cast(jdb_handle); + auto* cf_handle = + reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + jbyte* begin_key = new jbyte[jbegin_key_len]; + env->GetByteArrayRegion(jbegin_key, jbegin_key_off, jbegin_key_len, + begin_key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] begin_key; + return; + } + ROCKSDB_NAMESPACE::Slice begin_key_slice(reinterpret_cast(begin_key), + jbegin_key_len); + + jbyte* end_key = new jbyte[jend_key_len]; + 
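createColumnFamilyWithImport above wires RocksDB#createColumnFamilyWithImport through to DB::CreateColumnFamilyWithImport, taking the new column family's name and options, an import-options handle and an array of ExportImportFilesMetaData handles. A hedged helper sketch; the Java-side option and wrapper names are inferred from the JNI symbol and may differ in the published API:

// Hypothetical helper (org.rocksdb.* imports assumed): `metadata` is the
// ExportImportFilesMetaData produced by exporting a column family from a source database.
static ColumnFamilyHandle importColumnFamily(final RocksDB db,
    final ExportImportFilesMetaData metadata) throws RocksDBException {
  try (final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions();
       final ImportColumnFamilyOptions importOptions = new ImportColumnFamilyOptions()) {
    importOptions.setMoveFiles(true); // move, rather than copy, the exported SST files
    return db.createColumnFamilyWithImport(
        new ColumnFamilyDescriptor("imported_cf".getBytes(), cfOptions), importOptions, metadata);
  }
}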
env->GetByteArrayRegion(jend_key, jend_key_off, jend_key_len, end_key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] begin_key; + delete[] end_key; + return; + } + ROCKSDB_NAMESPACE::Slice end_key_slice(reinterpret_cast(end_key), + jend_key_len); + + ROCKSDB_NAMESPACE::Status s = + db->ClipColumnFamily(cf_handle, begin_key_slice, end_key_slice); + + // cleanup + delete[] begin_key; + delete[] end_key; + + if (s.ok()) { + return; + } + + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return; + } else { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, ROCKSDB_NAMESPACE::Status::InvalidArgument( + "Invalid ColumnFamilyHandle.")); + } +} + /* * Class: org_rocksdb_RocksDB * Method: getDirect @@ -2214,6 +2332,108 @@ bool key_may_exist_direct_helper(JNIEnv* env, jlong jdb_handle, return exists; } +jboolean key_exists_helper(JNIEnv* env, jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, char* key, jint jkey_len) { + std::string value; + bool value_found = false; + + auto* db = reinterpret_cast(jdb_handle); + + ROCKSDB_NAMESPACE::ColumnFamilyHandle* cf_handle; + if (jcf_handle == 0) { + cf_handle = db->DefaultColumnFamily(); + } else { + cf_handle = + reinterpret_cast(jcf_handle); + } + + ROCKSDB_NAMESPACE::ReadOptions read_opts = + jread_opts_handle == 0 + ? ROCKSDB_NAMESPACE::ReadOptions() + : *(reinterpret_cast( + jread_opts_handle)); + + ROCKSDB_NAMESPACE::Slice key_slice(key, jkey_len); + + const bool may_exist = + db->KeyMayExist(read_opts, cf_handle, key_slice, &value, &value_found); + + if (may_exist) { + ROCKSDB_NAMESPACE::Status s; + { + ROCKSDB_NAMESPACE::PinnableSlice pinnable_val; + s = db->Get(read_opts, cf_handle, key_slice, &pinnable_val); + } + if (s.IsNotFound()) { + return JNI_FALSE; + } else if (s.ok()) { + return JNI_TRUE; + } else { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew(env, s); + return JNI_FALSE; + } + } else { + return JNI_FALSE; + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyExist + * Signature: (JJJ[BII)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyExists(JNIEnv* env, jobject, + jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, + jbyteArray jkey, jint jkey_offset, + jint jkey_len) { + jbyte* key = new jbyte[jkey_len]; + env->GetByteArrayRegion(jkey, jkey_offset, jkey_len, key); + if (env->ExceptionCheck()) { + // exception thrown: ArrayIndexOutOfBoundsException + delete[] key; + return JNI_FALSE; + } else { + jboolean key_exists = + key_exists_helper(env, jdb_handle, jcf_handle, jread_opts_handle, + reinterpret_cast(key), jkey_len); + delete[] key; + return key_exists; + } +} + +/* + private native boolean keyExistDirect(final long handle, final long + cfHandle, final long readOptHandle, final ByteBuffer key, final int keyOffset, + final int keyLength); + + + * Class: org_rocksdb_RocksDB + * Method: keyExistDirect + * Signature: (JJJLjava/nio/ByteBuffer;II)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyExistsDirect( + JNIEnv* env, jobject, jlong jdb_handle, jlong jcf_handle, + jlong jread_opts_handle, jobject jkey, jint jkey_offset, jint jkey_len) { + char* key = reinterpret_cast(env->GetDirectBufferAddress(jkey)); + if (key == nullptr) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid key argument (argument is not a valid direct ByteBuffer)"); + return JNI_FALSE; + } + if (env->GetDirectBufferCapacity(jkey) < (jkey_offset + jkey_len)) { + ROCKSDB_NAMESPACE::RocksDBExceptionJni::ThrowNew( + env, + "Invalid key argument. 
Capacity is less than requested region (offset " + "+ length)."); + return JNI_FALSE; + } + + return key_exists_helper(env, jdb_handle, jcf_handle, jread_opts_handle, key, + jkey_len); +} + /* * Class: org_rocksdb_RocksDB * Method: keyMayExist @@ -3078,6 +3298,37 @@ jstring Java_org_rocksdb_RocksDB_getDBOptions(JNIEnv* env, jobject, return env->NewStringUTF(options_as_string.c_str()); } +/* + * Class: org_rocksdb_RocksDB + * Method: setPerfLevel + * Signature: (JB)V + */ +void Java_org_rocksdb_RocksDB_setPerfLevel(JNIEnv*, jobject, + jbyte jperf_level) { + rocksdb::SetPerfLevel( + ROCKSDB_NAMESPACE::PerfLevelTypeJni::toCppPerfLevelType(jperf_level)); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getPerfLevel + * Signature: (J)B + */ +jbyte Java_org_rocksdb_RocksDB_getPerfLevelNative(JNIEnv*, jobject) { + return ROCKSDB_NAMESPACE::PerfLevelTypeJni::toJavaPerfLevelType( + rocksdb::GetPerfLevel()); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getPerfContextNative + * Signature: ()J + */ +jlong Java_org_rocksdb_RocksDB_getPerfContextNative(JNIEnv*, jobject) { + ROCKSDB_NAMESPACE::PerfContext* perf_context = rocksdb::get_perf_context(); + return reinterpret_cast(perf_context); +} + /* * Class: org_rocksdb_RocksDB * Method: compactFiles diff --git a/java/rocksjni/statisticsjni.h b/java/rocksjni/statisticsjni.h index ce823f9b1280..3262b296cf51 100644 --- a/java/rocksjni/statisticsjni.h +++ b/java/rocksjni/statisticsjni.h @@ -13,7 +13,7 @@ #include #include -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/statistics.h" namespace ROCKSDB_NAMESPACE { diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index 0054e5c1fb6d..7f99900e4cb3 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -55,8 +55,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jbyte jdata_block_index_type_value, jdouble jdata_block_hash_table_util_ratio, jbyte jchecksum_type_value, jboolean jno_block_cache, jlong jblock_cache_handle, - jlong jpersistent_cache_handle, jlong jblock_cache_compressed_handle, - jlong jblock_size, jint jblock_size_deviation, jint jblock_restart_interval, + jlong jpersistent_cache_handle, jlong jblock_size, + jint jblock_size_deviation, jint jblock_restart_interval, jint jindex_block_restart_interval, jlong jmetadata_block_size, jboolean jpartition_filters, jboolean joptimize_filters_for_memory, jboolean juse_delta_encoding, jlong jfilter_policy_handle, @@ -64,8 +64,7 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jint jread_amp_bytes_per_bit, jint jformat_version, jboolean jenable_index_compression, jboolean jblock_align, jbyte jindex_shortening, jlong jblock_cache_size, - jint jblock_cache_num_shard_bits, jlong jblock_cache_compressed_size, - jint jblock_cache_compressed_num_shard_bits) { + jint jblock_cache_num_shard_bits) { ROCKSDB_NAMESPACE::BlockBasedTableOptions options; options.cache_index_and_filter_blocks = static_cast(jcache_index_and_filter_blocks); @@ -113,21 +112,6 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( jpersistent_cache_handle); options.persistent_cache = *pCache; } - if (jblock_cache_compressed_handle > 0) { - std::shared_ptr *pCache = - reinterpret_cast *>( - jblock_cache_compressed_handle); - options.block_cache_compressed = *pCache; - } else if (jblock_cache_compressed_size > 0) { - if (jblock_cache_compressed_num_shard_bits > 0) { - options.block_cache_compressed = ROCKSDB_NAMESPACE::NewLRUCache( - 
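keyExists/keyExistsDirect above give a definite (non-probabilistic) existence check that never copies the value into Java, and clipColumnFamily trims a column family to a key range. A short sketch, assuming an open RocksDB `db`, a ColumnFamilyHandle `cfHandle`, and public wrapper names matching the JNI symbols:

try {
  final boolean present = db.keyExists("user:42".getBytes()); // true only if the key really exists
  System.out.println("user:42 present? " + present);
  db.clipColumnFamily(cfHandle, "a".getBytes(), "m".getBytes()); // afterwards only keys in ["a", "m") remain
} catch (final RocksDBException e) {
  e.printStackTrace();
}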
static_cast(jblock_cache_compressed_size), - static_cast(jblock_cache_compressed_num_shard_bits)); - } else { - options.block_cache_compressed = ROCKSDB_NAMESPACE::NewLRUCache( - static_cast(jblock_cache_compressed_size)); - } - } options.block_size = static_cast(jblock_size); options.block_size_deviation = static_cast(jblock_size_deviation); options.block_restart_interval = static_cast(jblock_restart_interval); diff --git a/java/rocksjni/write_buffer_manager.cc b/java/rocksjni/write_buffer_manager.cc index b5b7d193b572..9ce697e10ab8 100644 --- a/java/rocksjni/write_buffer_manager.cc +++ b/java/rocksjni/write_buffer_manager.cc @@ -7,6 +7,8 @@ #include +#include + #include "include/org_rocksdb_WriteBufferManager.h" #include "rocksdb/cache.h" #include "rocksjni/cplusplus_to_java_convert.h" diff --git a/java/samples/src/main/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java index ea650b1414ea..8ab9b2de35e3 100644 --- a/java/samples/src/main/java/RocksDBSample.java +++ b/java/samples/src/main/java/RocksDBSample.java @@ -92,8 +92,7 @@ public static void main(final String[] args) { .setFilterPolicy(bloomFilter) .setBlockSizeDeviation(5) .setBlockRestartInterval(10) - .setCacheIndexAndFilterBlocks(true) - .setBlockCacheCompressed(new LRUCache(64 * 1000, 10)); + .setCacheIndexAndFilterBlocks(true); assert (table_options.blockSizeDeviation() == 5); assert (table_options.blockRestartInterval() == 10); diff --git a/java/spotbugs-exclude.xml b/java/spotbugs-exclude.xml new file mode 100644 index 000000000000..bc3d5ea9a62a --- /dev/null +++ b/java/spotbugs-exclude.xml @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java b/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java index 2f0d4f3ca483..fd7eef4d4cfb 100644 --- a/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java +++ b/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java @@ -7,8 +7,8 @@ /** * A CompactionFilter allows an application to modify/delete a key-value at * the time of compaction. - * - * At present we just permit an overriding Java class to wrap a C++ + *
<p>
+ * At present, we just permit an overriding Java class to wrap a C++ * implementation */ public abstract class AbstractCompactionFilter> @@ -49,10 +49,10 @@ protected AbstractCompactionFilter(final long nativeHandle) { /** * Deletes underlying C++ compaction pointer. - * + *
<p>
* Note that this function should be called only after all * RocksDB instances referencing the compaction filter are closed. - * Otherwise an undefined behavior will occur. + * Otherwise, an undefined behavior will occur. */ @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java b/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java index 380b4461d01a..728cda8c1d42 100644 --- a/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java +++ b/java/src/main/java/org/rocksdb/AbstractCompactionFilterFactory.java @@ -15,7 +15,7 @@ public abstract class AbstractCompactionFilterFactory * The name will be printed to the LOG file on start up for diagnosis * * @return name which identifies this compaction filter. diff --git a/java/src/main/java/org/rocksdb/AbstractComparator.java b/java/src/main/java/org/rocksdb/AbstractComparator.java index c08e9127c7b5..83e0f0676019 100644 --- a/java/src/main/java/org/rocksdb/AbstractComparator.java +++ b/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -20,8 +20,8 @@ public abstract class AbstractComparator super(); } - protected AbstractComparator(final ComparatorOptions copt) { - super(copt.nativeHandle_); + protected AbstractComparator(final ComparatorOptions comparatorOptions) { + super(comparatorOptions.nativeHandle_); } @Override @@ -31,7 +31,7 @@ protected long initializeNative(final long... nativeParameterHandles) { /** * Get the type of this comparator. - * + *
<p>
* Used for determining the correct C++ cast in native code. * * @return The type of the comparator. @@ -44,11 +44,11 @@ ComparatorType getComparatorType() { * The name of the comparator. Used to check for comparator * mismatches (i.e., a DB created with one comparator is * accessed using a different comparator). - * + *
<p>
* A new name should be used whenever * the comparator implementation changes in a way that will cause * the relative ordering of any two keys to change. - * + *
<p>
* Names starting with "rocksdb." are reserved and should not be used. * * @return The name of this comparator implementation diff --git a/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java b/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java index b732d2495687..d0ceef93d419 100644 --- a/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java +++ b/java/src/main/java/org/rocksdb/AbstractComparatorJniBridge.java @@ -18,108 +18,105 @@ * {@link org.rocksdb.AbstractComparator} clean. */ class AbstractComparatorJniBridge { + /** + * Only called from JNI. + *
<p>
+ * Simply a bridge to calling + * {@link AbstractComparator#compare(ByteBuffer, ByteBuffer)}, + * which ensures that the byte buffer lengths are correct + * before and after the call. + * + * @param comparator the comparator object on which to + * call {@link AbstractComparator#compare(ByteBuffer, ByteBuffer)} + * @param a buffer access to first key + * @param aLen the length of the a key, + * may be smaller than the buffer {@code a} + * @param b buffer access to second key + * @param bLen the length of the b key, + * may be smaller than the buffer {@code b} + * + * @return the result of the comparison + */ + @SuppressWarnings("PMD.UnusedPrivateMethod") + private static int compareInternal(final AbstractComparator comparator, final ByteBuffer a, + final int aLen, final ByteBuffer b, final int bLen) { + if (aLen != -1) { + a.mark(); + a.limit(aLen); + } + if (bLen != -1) { + b.mark(); + b.limit(bLen); + } - /** - * Only called from JNI. - * - * Simply a bridge to calling - * {@link AbstractComparator#compare(ByteBuffer, ByteBuffer)}, - * which ensures that the byte buffer lengths are correct - * before and after the call. - * - * @param comparator the comparator object on which to - * call {@link AbstractComparator#compare(ByteBuffer, ByteBuffer)} - * @param a buffer access to first key - * @param aLen the length of the a key, - * may be smaller than the buffer {@code a} - * @param b buffer access to second key - * @param bLen the length of the b key, - * may be smaller than the buffer {@code b} - * - * @return the result of the comparison - */ - private static int compareInternal( - final AbstractComparator comparator, - final ByteBuffer a, final int aLen, - final ByteBuffer b, final int bLen) { - if (aLen != -1) { - a.mark(); - a.limit(aLen); - } - if (bLen != -1) { - b.mark(); - b.limit(bLen); - } + final int c = comparator.compare(a, b); - final int c = comparator.compare(a, b); + if (aLen != -1) { + a.reset(); + } + if (bLen != -1) { + b.reset(); + } - if (aLen != -1) { - a.reset(); - } - if (bLen != -1) { - b.reset(); - } + return c; + } - return c; + /** + * Only called from JNI. + *

+ * Simply a bridge to calling + * {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)}, + * which ensures that the byte buffer lengths are correct + * before the call. + * + * @param comparator the comparator object on which to + * call {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)} + * @param start buffer access to the start key + * @param startLen the length of the start key, + * may be smaller than the buffer {@code start} + * @param limit buffer access to the limit key + * @param limitLen the length of the limit key, + * may be smaller than the buffer {@code limit} + * + * @return either {@code startLen} if the start key is unchanged, otherwise + * the new length of the start key + */ + @SuppressWarnings("PMD.UnusedPrivateMethod") + private static int findShortestSeparatorInternal(final AbstractComparator comparator, + final ByteBuffer start, final int startLen, final ByteBuffer limit, final int limitLen) { + if (startLen != -1) { + start.limit(startLen); } - - /** - * Only called from JNI. - * - * Simply a bridge to calling - * {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)}, - * which ensures that the byte buffer lengths are correct - * before the call. - * - * @param comparator the comparator object on which to - * call {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)} - * @param start buffer access to the start key - * @param startLen the length of the start key, - * may be smaller than the buffer {@code start} - * @param limit buffer access to the limit key - * @param limitLen the length of the limit key, - * may be smaller than the buffer {@code limit} - * - * @return either {@code startLen} if the start key is unchanged, otherwise - * the new length of the start key - */ - private static int findShortestSeparatorInternal( - final AbstractComparator comparator, - final ByteBuffer start, final int startLen, - final ByteBuffer limit, final int limitLen) { - if (startLen != -1) { - start.limit(startLen); - } - if (limitLen != -1) { - limit.limit(limitLen); - } - comparator.findShortestSeparator(start, limit); - return start.remaining(); + if (limitLen != -1) { + limit.limit(limitLen); } + comparator.findShortestSeparator(start, limit); + return start.remaining(); + } - /** - * Only called from JNI. - * - * Simply a bridge to calling - * {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)}, - * which ensures that the byte buffer length is correct - * before the call. - * - * @param comparator the comparator object on which to - * call {@link AbstractComparator#findShortSuccessor(ByteBuffer)} - * @param key buffer access to the key - * @param keyLen the length of the key, - * may be smaller than the buffer {@code key} - * - * @return either keyLen if the key is unchanged, otherwise the new length of the key - */ - private static int findShortSuccessorInternal( - final AbstractComparator comparator, - final ByteBuffer key, final int keyLen) { - if (keyLen != -1) { - key.limit(keyLen); - } - comparator.findShortSuccessor(key); - return key.remaining(); + /** + * Only called from JNI. + *

+ * Simply a bridge to calling + * {@link AbstractComparator#findShortestSeparator(ByteBuffer, ByteBuffer)}, + * which ensures that the byte buffer length is correct + * before the call. + * + * @param comparator the comparator object on which to + * call {@link AbstractComparator#findShortSuccessor(ByteBuffer)} + * @param key buffer access to the key + * @param keyLen the length of the key, + * may be smaller than the buffer {@code key} + * + * @return either keyLen if the key is unchanged, otherwise the new length of the key + */ + @SuppressWarnings("PMD.UnusedPrivateMethod") + private static int findShortSuccessorInternal( + final AbstractComparator comparator, final ByteBuffer key, final int keyLen) { + if (keyLen != -1) { + key.limit(keyLen); } + comparator.findShortSuccessor(key); + return key.remaining(); + } } diff --git a/java/src/main/java/org/rocksdb/AbstractEventListener.java b/java/src/main/java/org/rocksdb/AbstractEventListener.java index 6698acf88f2c..c9371c45eb0c 100644 --- a/java/src/main/java/org/rocksdb/AbstractEventListener.java +++ b/java/src/main/java/org/rocksdb/AbstractEventListener.java @@ -10,6 +10,7 @@ /** * Base class for Event Listeners. */ +@SuppressWarnings("PMD.AvoidDuplicateLiterals") public abstract class AbstractEventListener extends RocksCallbackObject implements EventListener { public enum EnabledEventCallback { ON_FLUSH_COMPLETED((byte) 0x0), @@ -58,7 +59,7 @@ byte getValue() { * @throws IllegalArgumentException if the value is unknown. */ static EnabledEventCallback fromValue(final byte value) { - for (final EnabledEventCallback enabledEventCallback : EnabledEventCallback.values()) { + for (final EnabledEventCallback enabledEventCallback : values()) { if (enabledEventCallback.value == value) { return enabledEventCallback; } @@ -71,8 +72,8 @@ static EnabledEventCallback fromValue(final byte value) { /** * Creates an Event Listener that will - * received all callbacks from C++. - * + * receive all callbacks from C++. + *

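For context on the callback plumbing below: a listener that only cares about one or two events can pass the corresponding EnabledEventCallback values to the protected constructor, so the native side only invokes those callbacks. A rough sketch; the class name, log message and the FlushJobInfo accessor are illustrative:

    import org.rocksdb.AbstractEventListener;
    import org.rocksdb.FlushJobInfo;
    import org.rocksdb.RocksDB;

    // Subscribes to flush-completed events only, instead of every callback.
    class FlushCompletedLogger extends AbstractEventListener {
      FlushCompletedLogger() {
        super(EnabledEventCallback.ON_FLUSH_COMPLETED);
      }

      @Override
      public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) {
        System.out.println("Flush completed for CF " + flushJobInfo.getColumnFamilyName());
      }
    }

Such a listener would typically be registered through the options' setListeners(...) before the database is opened.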
* If you don't need all callbacks, it is much more efficient to * just register for the ones you need by calling * {@link #AbstractEventListener(EnabledEventCallback...)} instead. @@ -106,8 +107,8 @@ protected AbstractEventListener(final EnabledEventCallback... enabledEventCallba */ private static long packToLong(final EnabledEventCallback... enabledEventCallbacks) { long l = 0; - for (int i = 0; i < enabledEventCallbacks.length; i++) { - l |= 1 << enabledEventCallbacks[i].getValue(); + for (final EnabledEventCallback enabledEventCallback : enabledEventCallbacks) { + l |= 1L << enabledEventCallback.getValue(); } return l; } @@ -124,8 +125,9 @@ public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) * @param dbHandle native handle of the database * @param flushJobInfo the flush job info */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private void onFlushCompletedProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { - final RocksDB db = new RocksDB(dbHandle); + final RocksDB db = new RocksDB(dbHandle); // NOPMD - CloseResource db.disOwnNativeHandle(); // we don't own this! onFlushCompleted(db, flushJobInfo); } @@ -142,8 +144,9 @@ public void onFlushBegin(final RocksDB db, final FlushJobInfo flushJobInfo) { * @param dbHandle native handle of the database * @param flushJobInfo the flush job info */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private void onFlushBeginProxy(final long dbHandle, final FlushJobInfo flushJobInfo) { - final RocksDB db = new RocksDB(dbHandle); + final RocksDB db = new RocksDB(dbHandle); // NOPMD - CloseResource db.disOwnNativeHandle(); // we don't own this! onFlushBegin(db, flushJobInfo); } @@ -165,9 +168,10 @@ public void onCompactionBegin(final RocksDB db, final CompactionJobInfo compacti * @param dbHandle native handle of the database * @param compactionJobInfo the flush job info */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private void onCompactionBeginProxy( final long dbHandle, final CompactionJobInfo compactionJobInfo) { - final RocksDB db = new RocksDB(dbHandle); + final RocksDB db = new RocksDB(dbHandle); // NOPMD - CloseResource db.disOwnNativeHandle(); // we don't own this! onCompactionBegin(db, compactionJobInfo); } @@ -184,9 +188,10 @@ public void onCompactionCompleted(final RocksDB db, final CompactionJobInfo comp * @param dbHandle native handle of the database * @param compactionJobInfo the flush job info */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private void onCompactionCompletedProxy( final long dbHandle, final CompactionJobInfo compactionJobInfo) { - final RocksDB db = new RocksDB(dbHandle); + final RocksDB db = new RocksDB(dbHandle); // NOPMD - CloseResource db.disOwnNativeHandle(); // we don't own this! onCompactionCompleted(db, compactionJobInfo); } @@ -225,9 +230,10 @@ public void onExternalFileIngested( * @param dbHandle native handle of the database * @param externalFileIngestionInfo the flush job info */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private void onExternalFileIngestedProxy( final long dbHandle, final ExternalFileIngestionInfo externalFileIngestionInfo) { - final RocksDB db = new RocksDB(dbHandle); + final RocksDB db = new RocksDB(dbHandle); // NOPMD - CloseResource db.disOwnNativeHandle(); // we don't own this! 
onExternalFileIngested(db, externalFileIngestionInfo); } @@ -245,6 +251,7 @@ public void onBackgroundError( * @param reasonByte byte value representing error reason * @param backgroundError status with error code */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private void onBackgroundErrorProxy(final byte reasonByte, final Status backgroundError) { onBackgroundError(BackgroundErrorReason.fromValue(reasonByte), backgroundError); } @@ -307,6 +314,7 @@ public boolean onErrorRecoveryBegin( * @param reasonByte byte value representing error reason * @param backgroundError status with error code */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private boolean onErrorRecoveryBeginProxy(final byte reasonByte, final Status backgroundError) { return onErrorRecoveryBegin(BackgroundErrorReason.fromValue(reasonByte), backgroundError); } diff --git a/java/src/main/java/org/rocksdb/AbstractMutableOptions.java b/java/src/main/java/org/rocksdb/AbstractMutableOptions.java index 7189272b8871..ff9b8569fd89 100644 --- a/java/src/main/java/org/rocksdb/AbstractMutableOptions.java +++ b/java/src/main/java/org/rocksdb/AbstractMutableOptions.java @@ -3,12 +3,18 @@ import java.util.*; -public abstract class AbstractMutableOptions { - +/** + * This class is not strictly abstract in Java language terms, so we do not declare it as such. + * The name remains {@code AbstractMutableOptions} to reflect the underlying C++ name. + * The constructor is protected, so it will always be used as a base class. + */ +public class AbstractMutableOptions { protected static final String KEY_VALUE_PAIR_SEPARATOR = ";"; protected static final char KEY_VALUE_SEPARATOR = '='; static final String INT_ARRAY_INT_SEPARATOR = ":"; + private static final String HAS_NOT_BEEN_SET = " has not been set"; + protected final String[] keys; private final String[] values; @@ -18,15 +24,18 @@ public abstract class AbstractMutableOptions { * @param keys the keys * @param values the values */ + @SuppressWarnings("PMD.ArrayIsStoredDirectly") protected AbstractMutableOptions(final String[] keys, final String[] values) { this.keys = keys; this.values = values; } + @SuppressWarnings("PMD.MethodReturnsInternalArray") String[] getKeys() { return keys; } + @SuppressWarnings("PMD.MethodReturnsInternalArray") String[] getValues() { return values; } @@ -53,25 +62,23 @@ public String toString() { return buffer.toString(); } - public static abstract class AbstractMutableOptionsBuilder< - T extends AbstractMutableOptions, - U extends AbstractMutableOptionsBuilder, - K extends MutableOptionKey> { - + public abstract static class AbstractMutableOptionsBuilder< + T extends AbstractMutableOptions, U extends AbstractMutableOptionsBuilder, K + extends MutableOptionKey> { private final Map> options = new LinkedHashMap<>(); private final List unknown = new ArrayList<>(); protected abstract U self(); /** - * Get all of the possible keys + * Get all the possible keys * * @return A map of all keys, indexed by name. */ protected abstract Map allKeys(); /** - * Construct a sub-class instance of {@link AbstractMutableOptions}. + * Construct a subclass instance of {@link AbstractMutableOptions}. 
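The builder machinery in this class exists to parse option strings of the form key1=value1;key2=value2 (int arrays use ':' as a separator). A small usage sketch through MutableColumnFamilyOptions; the option values are illustrative:

    import org.rocksdb.MutableColumnFamilyOptions;

    // Parse a semicolon-separated option string into a mutable options object.
    final MutableColumnFamilyOptions mutableOpts =
        MutableColumnFamilyOptions
            .parse("write_buffer_size=67108864;max_write_buffer_number=4")
            .build();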
* * @param keys the keys * @param values the values @@ -108,7 +115,7 @@ protected double getDouble(final K key) throws NoSuchElementException, NumberFormatException { final MutableOptionValue value = options.get(key); if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); + throw new NoSuchElementException(key.name() + HAS_NOT_BEEN_SET); } return value.asDouble(); } @@ -127,7 +134,7 @@ protected long getLong(final K key) throws NoSuchElementException, NumberFormatException { final MutableOptionValue value = options.get(key); if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); + throw new NoSuchElementException(key.name() + HAS_NOT_BEEN_SET); } return value.asLong(); } @@ -146,7 +153,7 @@ protected int getInt(final K key) throws NoSuchElementException, NumberFormatException { final MutableOptionValue value = options.get(key); if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); + throw new NoSuchElementException(key.name() + HAS_NOT_BEEN_SET); } return value.asInt(); } @@ -165,7 +172,7 @@ protected boolean getBoolean(final K key) throws NoSuchElementException, NumberFormatException { final MutableOptionValue value = options.get(key); if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); + throw new NoSuchElementException(key.name() + HAS_NOT_BEEN_SET); } return value.asBoolean(); } @@ -184,7 +191,7 @@ protected int[] getIntArray(final K key) throws NoSuchElementException, NumberFormatException { final MutableOptionValue value = options.get(key); if(value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); + throw new NoSuchElementException(key.name() + HAS_NOT_BEEN_SET); } return value.asIntArray(); } @@ -204,7 +211,7 @@ protected > N getEnum(final K key) throws NoSuchElementException, NumberFormatException { final MutableOptionValue value = options.get(key); if (value == null) { - throw new NoSuchElementException(key.name() + " has not been set"); + throw new NoSuchElementException(key.name() + HAS_NOT_BEEN_SET); } if (!(value instanceof MutableOptionValue.MutableOptionEnumValue)) { @@ -224,10 +231,10 @@ protected > N getEnum(final K key) private long parseAsLong(final String value) { try { return Long.parseLong(value); - } catch (NumberFormatException nfe) { + } catch (final NumberFormatException nfe) { final double doubleValue = Double.parseDouble(value); if (doubleValue != Math.round(doubleValue)) - throw new IllegalArgumentException("Unable to parse or round " + value + " to long"); + throw new IllegalArgumentException("Unable to parse or round " + value + " to long", nfe); return Math.round(doubleValue); } } @@ -242,10 +249,10 @@ private long parseAsLong(final String value) { private int parseAsInt(final String value) { try { return Integer.parseInt(value); - } catch (NumberFormatException nfe) { + } catch (final NumberFormatException nfe) { final double doubleValue = Double.parseDouble(value); if (doubleValue != Math.round(doubleValue)) - throw new IllegalArgumentException("Unable to parse or round " + value + " to int"); + throw new IllegalArgumentException("Unable to parse or round " + value + " to int", nfe); return (int) Math.round(doubleValue); } } @@ -271,7 +278,7 @@ protected U fromParsed(final List options, final boolean ign throw new IllegalArgumentException("options string is invalid: " + option); } fromOptionString(option, ignoreUnknown); - } catch (NumberFormatException nfe) { + } catch (final 
NumberFormatException nfe) { throw new IllegalArgumentException( "" + option.key + "=" + option.value + " - not a valid value for its type", nfe); } @@ -287,8 +294,9 @@ protected U fromParsed(final List options, final boolean ign * @param ignoreUnknown if this is not set, throw an exception when a key is not in the known * set * @return the same object, after adding options - * @throws IllegalArgumentException if the key is unkown, or a value has the wrong type/form + * @throws IllegalArgumentException if the key is unknown, or a value has the wrong type/form */ + @SuppressWarnings("PMD.AvoidLiteralsInIfCondition") private U fromOptionString(final OptionString.Entry option, final boolean ignoreUnknown) throws IllegalArgumentException { Objects.requireNonNull(option.key); @@ -299,7 +307,7 @@ private U fromOptionString(final OptionString.Entry option, final boolean ignore unknown.add(option); return self(); } else if (key == null) { - throw new IllegalArgumentException("Key: " + key + " is not a known option key"); + throw new IllegalArgumentException("Key: " + null + " is not a known option key"); } if (!option.value.isList()) { @@ -341,13 +349,13 @@ private U fromOptionString(final OptionString.Entry option, final boolean ignore return setIntArray(key, value); case ENUM: - String optionName = key.name(); - if (optionName.equals("prepopulate_blob_cache")) { + final String optionName = key.name(); + if ("prepopulate_blob_cache".equals(optionName)) { final PrepopulateBlobCache prepopulateBlobCache = PrepopulateBlobCache.getFromInternal(valueStr); return setEnum(key, prepopulateBlobCache); - } else if (optionName.equals("compression") - || optionName.equals("blob_compression_type")) { + } else if ("compression".equals(optionName) + || "blob_compression_type".equals(optionName)) { final CompressionType compressionType = CompressionType.getFromInternal(valueStr); return setEnum(key, compressionType); } else { diff --git a/java/src/main/java/org/rocksdb/AbstractNativeReference.java b/java/src/main/java/org/rocksdb/AbstractNativeReference.java index 88b2963b616e..1ce54fcba0fd 100644 --- a/java/src/main/java/org/rocksdb/AbstractNativeReference.java +++ b/java/src/main/java/org/rocksdb/AbstractNativeReference.java @@ -16,8 +16,9 @@ * try-with-resources * statement, when you are finished with the object. It is no longer - * called automatically during the regular Java GC process via + * called automatically during the regular Java GC process via finalization * {@link AbstractNativeReference#finalize()}.

+ * which is deprecated since Java 9. *

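Because finalization no longer releases native handles, owners must call close() explicitly, most conveniently with try-with-resources. A minimal sketch; the database path is illustrative:

    import org.rocksdb.Options;
    import org.rocksdb.RocksDB;
    import org.rocksdb.RocksDBException;

    public final class OpenCloseExample {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        // Both objects wrap native resources; close() runs at the end of the block.
        try (final Options options = new Options().setCreateIfMissing(true);
             final RocksDB db = RocksDB.open(options, "/tmp/rocksdb-example")) {
          db.put("key".getBytes(), "value".getBytes());
        }
      }
    }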
* Explanatory note - When or if the Garbage Collector calls {@link Object#finalize()} * depends on the JVM implementation and system conditions, which the programmer diff --git a/java/src/main/java/org/rocksdb/AbstractSlice.java b/java/src/main/java/org/rocksdb/AbstractSlice.java index 5a22e29562e7..f321b9910aeb 100644 --- a/java/src/main/java/org/rocksdb/AbstractSlice.java +++ b/java/src/main/java/org/rocksdb/AbstractSlice.java @@ -8,7 +8,7 @@ /** * Slices are used by RocksDB to provide * efficient access to keys and values. - * + *

* This class is package private, implementers * should extend either of the public abstract classes: * @see org.rocksdb.Slice @@ -119,14 +119,16 @@ public String toString() { */ public int compare(final AbstractSlice other) { assert (other != null); - if(!isOwningHandle()) { - return other.isOwningHandle() ? -1 : 0; + if (isOwningHandle() && other.isOwningHandle()) { + return compare0(getNativeHandle(), other.getNativeHandle()); + } + if (!isOwningHandle() && !other.isOwningHandle()) { + return 0; + } + if (isOwningHandle()) { + return 1; } else { - if(!other.isOwningHandle()) { - return 1; - } else { - return compare0(getNativeHandle(), other.getNativeHandle()); - } + return -1; } } @@ -147,7 +149,7 @@ public int hashCode() { */ @Override public boolean equals(final Object other) { - if (other != null && other instanceof AbstractSlice) { + if (other instanceof AbstractSlice) { return compare((AbstractSlice)other) == 0; } else { return false; @@ -172,7 +174,7 @@ public boolean startsWith(final AbstractSlice prefix) { } } - protected native static long createNewSliceFromString(final String str); + protected static native long createNewSliceFromString(final String str); private native int size0(long handle); private native boolean empty0(long handle); private native String toString0(long handle, boolean hex); @@ -183,7 +185,7 @@ public boolean startsWith(final AbstractSlice prefix) { * Deletes underlying C++ slice pointer. * Note that this function should be called only after all * RocksDB instances referencing the slice are closed. - * Otherwise an undefined behavior will occur. + * Otherwise, an undefined behavior will occur. */ @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/AbstractTraceWriter.java b/java/src/main/java/org/rocksdb/AbstractTraceWriter.java index 806709b1f78f..e235c9296232 100644 --- a/java/src/main/java/org/rocksdb/AbstractTraceWriter.java +++ b/java/src/main/java/org/rocksdb/AbstractTraceWriter.java @@ -25,6 +25,7 @@ protected long initializeNative(final long... nativeParameterHandles) { * {@link Status.Code#getValue()} and the second byte is the * {@link Status.SubCode#getValue()}. */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private short writeProxy(final long sliceHandle) { try { write(new Slice(sliceHandle)); @@ -41,6 +42,7 @@ private short writeProxy(final long sliceHandle) { * {@link Status.Code#getValue()} and the second byte is the * {@link Status.SubCode#getValue()}. */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private short closeWriterProxy() { try { closeWriter(); @@ -62,7 +64,7 @@ private static short statusToShort(/*@Nullable*/ final Status status) { private static short statusToShort(final Status.Code code, final Status.SubCode subCode) { - short result = (short)(code.getValue() << 8); + final short result = (short) (code.getValue() << 8); return (short)(result | subCode.getValue()); } diff --git a/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java b/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java index cbb49836d1cd..b117e5cc2ad4 100644 --- a/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java +++ b/java/src/main/java/org/rocksdb/AbstractTransactionNotifier.java @@ -41,10 +41,10 @@ protected long initializeNative(final long... nativeParameterHandles) { /** * Deletes underlying C++ TransactionNotifier pointer. - * + *

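On the AbstractSlice#compare rewrite above: ownership of the native handle only matters once one of the slices has been closed; two live slices still compare by their underlying bytes. A tiny sketch:

    import org.rocksdb.Slice;

    // Two live slices compare byte-wise: "abc" sorts before "abd".
    try (final Slice a = new Slice("abc");
         final Slice b = new Slice("abd")) {
      assert a.compare(b) < 0;
      assert a.compare(a) == 0;
    }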
* Note that this function should be called only after all * Transactions referencing the comparator are closed. - * Otherwise an undefined behavior will occur. + * Otherwise, an undefined behavior will occur. */ @Override protected void disposeInternal() { diff --git a/java/src/main/java/org/rocksdb/AbstractWalFilter.java b/java/src/main/java/org/rocksdb/AbstractWalFilter.java index d525045c6bb3..92180f90e64c 100644 --- a/java/src/main/java/org/rocksdb/AbstractWalFilter.java +++ b/java/src/main/java/org/rocksdb/AbstractWalFilter.java @@ -30,9 +30,9 @@ protected long initializeNative(final long... nativeParameterHandles) { * {@link WalFilter.LogRecordFoundResult#walProcessingOption} * {@link WalFilter.LogRecordFoundResult#batchChanged}. */ - private short logRecordFoundProxy(final long logNumber, - final String logFileName, final long batchHandle, - final long newBatchHandle) { + @SuppressWarnings("PMD.UnusedPrivateMethod") + private short logRecordFoundProxy(final long logNumber, final String logFileName, + final long batchHandle, final long newBatchHandle) { final LogRecordFoundResult logRecordFoundResult = logRecordFound( logNumber, logFileName, new WriteBatch(batchHandle), new WriteBatch(newBatchHandle)); @@ -41,7 +41,7 @@ logNumber, logFileName, new WriteBatch(batchHandle), private static short logRecordFoundResultToShort( final LogRecordFoundResult logRecordFoundResult) { - short result = (short)(logRecordFoundResult.walProcessingOption.getValue() << 8); + final short result = (short) (logRecordFoundResult.walProcessingOption.getValue() << 8); return (short)(result | (logRecordFoundResult.batchChanged ? 1 : 0)); } diff --git a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java index 9527a2fd9914..41d967f53179 100644 --- a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java +++ b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java @@ -20,25 +20,25 @@ public int count() { } @Override - public void put(byte[] key, byte[] value) throws RocksDBException { + public void put(final byte[] key, final byte[] value) throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length); } @Override - public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, - byte[] value) throws RocksDBException { + public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, final byte[] value) + throws RocksDBException { put(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } @Override - public void merge(byte[] key, byte[] value) throws RocksDBException { + public void merge(final byte[] key, final byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length); } @Override - public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, - byte[] value) throws RocksDBException { + public void merge(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final byte[] value) throws RocksDBException { merge(nativeHandle_, key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); } @@ -53,7 +53,7 @@ public void put(final ByteBuffer key, final ByteBuffer value) throws RocksDBExce } @Override - public void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, + public void put(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); putDirect(nativeHandle_, key, key.position(), key.remaining(), 
value, value.position(), @@ -63,12 +63,12 @@ public void put(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key, } @Override - public void delete(byte[] key) throws RocksDBException { + public void delete(final byte[] key) throws RocksDBException { delete(nativeHandle_, key, key.length); } @Override - public void delete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + public void delete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { delete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @@ -80,7 +80,7 @@ public void delete(final ByteBuffer key) throws RocksDBException { } @Override - public void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) + public void delete(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) throws RocksDBException { deleteDirect( nativeHandle_, key, key.position(), key.remaining(), columnFamilyHandle.nativeHandle_); @@ -88,31 +88,30 @@ public void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) } @Override - public void singleDelete(byte[] key) throws RocksDBException { + public void singleDelete(final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length); } @Override - public void singleDelete(ColumnFamilyHandle columnFamilyHandle, byte[] key) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @Override - public void deleteRange(byte[] beginKey, byte[] endKey) - throws RocksDBException { + public void deleteRange(final byte[] beginKey, final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length); } @Override - public void deleteRange(ColumnFamilyHandle columnFamilyHandle, - byte[] beginKey, byte[] endKey) throws RocksDBException { + public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, final byte[] beginKey, + final byte[] endKey) throws RocksDBException { deleteRange(nativeHandle_, beginKey, beginKey.length, endKey, endKey.length, columnFamilyHandle.nativeHandle_); } @Override - public void putLogData(byte[] blob) throws RocksDBException { + public void putLogData(final byte[] blob) throws RocksDBException { putLogData(nativeHandle_, blob, blob.length); } diff --git a/java/src/main/java/org/rocksdb/AccessHint.java b/java/src/main/java/org/rocksdb/AccessHint.java index 877c4ab39ae1..b7ccadd84a66 100644 --- a/java/src/main/java/org/rocksdb/AccessHint.java +++ b/java/src/main/java/org/rocksdb/AccessHint.java @@ -8,6 +8,7 @@ /** * File access pattern once a compaction has started */ +@Deprecated public enum AccessHint { NONE((byte)0x0), NORMAL((byte)0x1), diff --git a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java index 5338bc42d7e7..d1d1123dded4 100644 --- a/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedColumnFamilyOptionsInterface.java @@ -9,12 +9,12 @@ /** * Advanced Column Family Options which are not - * mutable (i.e. present in {@link AdvancedMutableColumnFamilyOptionsInterface} - * + * mutable (i.e. present in {@link AdvancedMutableColumnFamilyOptionsInterface}) + *

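The AbstractWriteBatch methods given final parameters above are normally exercised through WriteBatch. A short sketch of an atomic batched write; it assumes an already-open RocksDB instance named db:

    import org.rocksdb.RocksDBException;
    import org.rocksdb.WriteBatch;
    import org.rocksdb.WriteOptions;

    // Group several mutations and apply them with a single atomic write().
    try (final WriteBatch batch = new WriteBatch();
         final WriteOptions writeOpts = new WriteOptions()) {
      batch.put("k1".getBytes(), "v1".getBytes());
      batch.delete("k2".getBytes());
      db.write(writeOpts, batch);
    } catch (final RocksDBException e) {
      // the whole batch failed; none of its mutations were applied
    }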
* Taken from include/rocksdb/advanced_options.h */ public interface AdvancedColumnFamilyOptionsInterface< - T extends AdvancedColumnFamilyOptionsInterface> { + T extends AdvancedColumnFamilyOptionsInterface & ColumnFamilyOptionsInterface> { /** * The minimum number of write buffers that will be merged together * before writing to storage. If set to 1, then @@ -51,23 +51,23 @@ T setMinWriteBufferNumberToMerge( * this parameter does not affect flushing. * This controls the minimum amount of write history that will be available * in memory for conflict checking when Transactions are used. - * + *

* When using an OptimisticTransactionDB: * If this value is too low, some transactions may fail at commit time due * to not being able to determine whether there were any write conflicts. - * + *

* When using a TransactionDB: * If Transaction::SetSnapshot is used, TransactionDB will read either * in-memory write buffers or SST files to do write-conflict checking. * Increasing this value can reduce the number of reads to SST files * done for conflict detection. - * + *

* Setting this value to 0 will cause write buffers to be freed immediately * after they are flushed. * If this value is set to -1, * {@link AdvancedMutableColumnFamilyOptionsInterface#maxWriteBufferNumber()} * will be used. - * + *

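The write-history behaviour described above is controlled by the corresponding setter (setMaxWriteBufferNumberToMaintain) on the column family options. A sketch; the value 4 is illustrative:

    import org.rocksdb.ColumnFamilyOptions;

    // Keep up to 4 write buffers (including already-flushed ones) in memory as
    // write history for transaction conflict checking; 0 frees buffers straight
    // after flush, -1 falls back to maxWriteBufferNumber().
    final ColumnFamilyOptions cfOpts =
        new ColumnFamilyOptions().setMaxWriteBufferNumberToMaintain(4);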
* Default: * If using a TransactionDB/OptimisticTransactionDB, the default value will * be set to the value of @@ -336,14 +336,13 @@ T setMaxCompactionBytes( /** * Set compaction style for DB. - * + *

* Default: LEVEL. * * @param compactionStyle Compaction style. * @return the reference to the current options. */ - ColumnFamilyOptionsInterface setCompactionStyle( - CompactionStyle compactionStyle); + ColumnFamilyOptionsInterface setCompactionStyle(CompactionStyle compactionStyle); /** * Compaction style for DB. @@ -355,7 +354,7 @@ ColumnFamilyOptionsInterface setCompactionStyle( /** * If level {@link #compactionStyle()} == {@link CompactionStyle#LEVEL}, * for each level, which files are prioritized to be picked to compact. - * + *

* Default: {@link CompactionPriority#ByCompensatedSize} * * @param compactionPriority The compaction priority @@ -444,7 +443,7 @@ T setOptimizeFiltersForHits( * By default, RocksDB runs consistency checks on the LSM every time the LSM * changes (Flush, Compaction, AddFile). Use this option if you need to * disable them. - * + *

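The two compaction knobs documented in this hunk are usually set together on the column family options; a brief sketch with illustrative choices:

    import org.rocksdb.ColumnFamilyOptions;
    import org.rocksdb.CompactionPriority;
    import org.rocksdb.CompactionStyle;

    // Leveled compaction, picking files with the smallest overlapping ratio first.
    final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions();
    cfOpts.setCompactionStyle(CompactionStyle.LEVEL);
    cfOpts.setCompactionPriority(CompactionPriority.MinOverlappingRatio);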
* Default: true * * @param forceConsistencyChecks false to disable consistency checks diff --git a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java index 162d15d80bb7..c8fc841737dd 100644 --- a/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/AdvancedMutableColumnFamilyOptionsInterface.java @@ -7,7 +7,7 @@ /** * Advanced Column Family Options which are mutable - * + *

* Taken from include/rocksdb/advanced_options.h * and MutableCFOptions in util/cf_options.h */ @@ -58,8 +58,8 @@ T setInplaceUpdateNumLocks( * if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, * create prefix bloom for memtable with the size of * write_buffer_size * memtable_prefix_bloom_size_ratio. - * If it is larger than 0.25, it is santinized to 0.25. - * + * If it is larger than 0.25, it is sanitized to 0.25. + *

* Default: 0 (disabled) * * @param memtablePrefixBloomSizeRatio the ratio of memtable used by the @@ -73,8 +73,8 @@ T setMemtablePrefixBloomSizeRatio( * if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, * create prefix bloom for memtable with the size of * write_buffer_size * memtable_prefix_bloom_size_ratio. - * If it is larger than 0.25, it is santinized to 0.25. - * + * If it is larger than 0.25, it is sanitized to 0.25. + *

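The ratio above only has an effect when a prefix extractor is configured; a sketch combining the two, with an illustrative prefix length and ratio:

    import org.rocksdb.ColumnFamilyOptions;

    // Use an 8-byte fixed prefix extractor and spend ~5% of the write buffer on a
    // memtable prefix bloom filter (ratios above 0.25 are sanitized down to 0.25).
    final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
        .useFixedLengthPrefixExtractor(8)
        .setMemtablePrefixBloomSizeRatio(0.05);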
* Default: 0 (disabled) * * @return the ratio of memtable used by the bloom filter @@ -85,7 +85,7 @@ T setMemtablePrefixBloomSizeRatio( * Threshold used in the MemPurge (memtable garbage collection) * feature. A value of 0.0 corresponds to no MemPurge, * a value of 1.0 will trigger a MemPurge as often as possible. - * + *

* Default: 0.0 (disabled) * * @param experimentalMempurgeThreshold the threshold used by @@ -98,7 +98,7 @@ T setMemtablePrefixBloomSizeRatio( * Threshold used in the MemPurge (memtable garbage collection) * feature. A value of 0.0 corresponds to no MemPurge, * a value of 1.0 will trigger a MemPurge as often as possible. - * + *

* Default: 0 (disabled) * * @return the threshold used by the MemPurge decider @@ -109,7 +109,7 @@ T setMemtablePrefixBloomSizeRatio( * Enable whole key bloom filter in memtable. Note this will only take effect * if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering * can potentially reduce CPU usage for point-look-ups. - * + *

* Default: false (disabled) * * @param memtableWholeKeyFiltering true if whole key bloom filter is enabled @@ -154,12 +154,12 @@ T setMemtableHugePageSize( * The size of one block in arena memory allocation. * If ≤ 0, a proper value is automatically calculated (usually 1/10 of * writer_buffer_size). - * + *

* There are two additional restriction of the specified size: * (1) size should be in the range of [4096, 2 << 30] and * (2) be the multiple of the CPU word (which helps with the memory * alignment). - * + *

* We'll automatically check and adjust the size number to make sure it * conforms to the restrictions. * Default: 0 @@ -175,12 +175,12 @@ T setMemtableHugePageSize( * The size of one block in arena memory allocation. * If ≤ 0, a proper value is automatically calculated (usually 1/10 of * writer_buffer_size). - * + *

* There are two additional restriction of the specified size: * (1) size should be in the range of [4096, 2 << 30] and * (2) be the multiple of the CPU word (which helps with the memory * alignment). - * + *

* We'll automatically check and adjust the size number to make sure it * conforms to the restrictions. * Default: 0 @@ -294,7 +294,7 @@ T setTargetFileSizeMultiplier( * @param multiplier the ratio between the total size of level-(L+1) * files and the total size of level-L files for all L. * @return the reference to the current options. - * + *

* See {@link MutableColumnFamilyOptionsInterface#setMaxBytesForLevelBase(long)} */ T setMaxBytesForLevelMultiplier(double multiplier); @@ -306,7 +306,7 @@ T setTargetFileSizeMultiplier( * * @return the ratio between the total size of level-(L+1) files and * the total size of level-L files for all L. - * + *

* See {@link MutableColumnFamilyOptionsInterface#maxBytesForLevelBase()} */ double maxBytesForLevelMultiplier(); @@ -315,7 +315,7 @@ T setTargetFileSizeMultiplier( * Different max-size multipliers for different levels. * These are multiplied by max_bytes_for_level_multiplier to arrive * at the max-size of each level. - * + *

* Default: 1 * * @param maxBytesForLevelMultiplierAdditional The max-size multipliers @@ -329,7 +329,7 @@ T setMaxBytesForLevelMultiplierAdditional( * Different max-size multipliers for different levels. * These are multiplied by max_bytes_for_level_multiplier to arrive * at the max-size of each level. - * + *

* Default: 1 * * @return The max-size multipliers for each level @@ -339,7 +339,7 @@ T setMaxBytesForLevelMultiplierAdditional( /** * All writes will be slowed down to at least delayed_write_rate if estimated * bytes needed to be compaction exceed this threshold. - * + *

* Default: 64GB * * @param softPendingCompactionBytesLimit The soft limit to impose on @@ -352,7 +352,7 @@ T setSoftPendingCompactionBytesLimit( /** * All writes will be slowed down to at least delayed_write_rate if estimated * bytes needed to be compaction exceed this threshold. - * + *

* Default: 64GB * * @return The soft limit to impose on compaction @@ -362,7 +362,7 @@ T setSoftPendingCompactionBytesLimit( /** * All writes are stopped if estimated bytes needed to be compaction exceed * this threshold. - * + *

* Default: 256GB * * @param hardPendingCompactionBytesLimit The hard limit to impose on @@ -375,7 +375,7 @@ T setHardPendingCompactionBytesLimit( /** * All writes are stopped if estimated bytes needed to be compaction exceed * this threshold. - * + *

* Default: 256GB * * @return The hard limit to impose on compaction @@ -390,7 +390,7 @@ T setHardPendingCompactionBytesLimit( * Default: 8 * * @param maxSequentialSkipInIterations the number of keys could - * be skipped in a iteration. + * be skipped in an iteration. * @return the reference to the current options. */ T setMaxSequentialSkipInIterations( @@ -403,19 +403,19 @@ T setMaxSequentialSkipInIterations( * skipped before a reseek is issued. * Default: 8 * - * @return the number of keys could be skipped in a iteration. + * @return the number of keys could be skipped in an iteration. */ long maxSequentialSkipInIterations(); /** * Maximum number of successive merge operations on a key in the memtable. - * + *

* When a merge operation is added to the memtable and the maximum number of * successive merges is reached, the value of the key will be calculated and * inserted into the memtable instead of the merge operation. This will * ensure that there are never more than max_successive_merges merge * operations in the memtable. - * + *

* Default: 0 (disabled) * * @param maxSuccessiveMerges the maximum number of successive merges. @@ -428,13 +428,13 @@ T setMaxSuccessiveMerges( /** * Maximum number of successive merge operations on a key in the memtable. - * + *

* When a merge operation is added to the memtable and the maximum number of * successive merges is reached, the value of the key will be calculated and * inserted into the memtable instead of the merge operation. This will * ensure that there are never more than max_successive_merges merge * operations in the memtable. - * + *

* Default: 0 (disabled) * * @return the maximum number of successive merges. @@ -443,7 +443,7 @@ T setMaxSuccessiveMerges( /** * After writing every SST file, reopen it and read all the keys. - * + *

* Default: false * * @param paranoidFileChecks true to enable paranoid file checks @@ -454,7 +454,7 @@ T setParanoidFileChecks( /** * After writing every SST file, reopen it and read all the keys. - * + *

* Default: false * * @return true if paranoid file checks are enabled @@ -463,7 +463,7 @@ T setParanoidFileChecks( /** * Measure IO stats in compactions and flushes, if true. - * + *

* Default: false * * @param reportBgIoStats true to enable reporting @@ -483,11 +483,11 @@ T setReportBgIoStats( * Non-bottom-level files older than TTL will go through the compaction * process. This needs {@link MutableDBOptionsInterface#maxOpenFiles()} to be * set to -1. - * + *

* Enabled only for level compaction for now. - * + *

* Default: 0 (disabled) - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -500,7 +500,7 @@ T setReportBgIoStats( /** * Get the TTL for Non-bottom-level files that will go through the compaction * process. - * + *

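Options flagged above as dynamically changeable can be adjusted on a live database through RocksDB#setOptions. A sketch; db and cfHandle stand for an open database and one of its column family handles, and the 7-day value is illustrative:

    import org.rocksdb.MutableColumnFamilyOptions;

    // Adjust the compaction TTL of one column family without reopening the DB.
    // setOptions(...) throws RocksDBException, handled by the surrounding method.
    db.setOptions(cfHandle,
        MutableColumnFamilyOptions.builder()
            .setTtl(7L * 24 * 60 * 60) // 7 days, in seconds
            .build());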
* See {@link #setTtl(long)}. * * @return the time-to-live. @@ -513,18 +513,18 @@ T setReportBgIoStats( * One main use of the feature is to make sure a file goes through compaction * filters periodically. Users can also use the feature to clear up SST * files using old format. - * + *

* A file's age is computed by looking at file_creation_time or creation_time * table properties in order, if they have valid non-zero values; if not, the * age is based on the file's last modified time (given by the underlying * Env). - * + *

* Supported in Level and FIFO compaction. * In FIFO compaction, this option has the same meaning as TTL and whichever * stricter will be used. * Pre-req: max_open_file == -1. * unit: seconds. Ex: 7 days = 7 * 24 * 60 * 60 - * + *

* Values: * 0: Turn off Periodic compactions. * UINT64_MAX - 1 (i.e 0xfffffffffffffffe): Let RocksDB control this feature @@ -534,9 +534,9 @@ T setReportBgIoStats( * In FIFO compaction, since the option has the same meaning as ttl, * when this value is left default, and ttl is left to 0, 30 days will be * used. Otherwise, min(ttl, periodic_compaction_seconds) will be used. - * + *

* Default: 0xfffffffffffffffe (allow RocksDB to auto-tune) - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -548,7 +548,7 @@ T setReportBgIoStats( /** * Get the periodicCompactionSeconds. - * + *

* See {@link #setPeriodicCompactionSeconds(long)}. * * @return the periodic compaction in seconds. @@ -566,9 +566,9 @@ T setReportBgIoStats( * for reads. See also the options min_blob_size, blob_file_size, * blob_compression_type, enable_blob_garbage_collection, and * blob_garbage_collection_age_cutoff below. - * + *

* Default: false - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -585,9 +585,9 @@ T setReportBgIoStats( * for reads. See also the options min_blob_size, blob_file_size, * blob_compression_type, enable_blob_garbage_collection, and * blob_garbage_collection_age_cutoff below. - * + *

* Default: false - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -601,9 +601,9 @@ T setReportBgIoStats( * alongside the keys in SST files in the usual fashion. A value of zero for * this option means that all values are stored in blob files. Note that * enable_blob_files has to be set in order for this option to have any effect. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -618,9 +618,9 @@ T setReportBgIoStats( * alongside the keys in SST files in the usual fashion. A value of zero for * this option means that all values are stored in blob files. Note that * enable_blob_files has to be set in order for this option to have any effect. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -632,9 +632,9 @@ T setReportBgIoStats( * Set the size limit for blob files. When writing blob files, a new file is opened * once this limit is reached. Note that enable_blob_files has to be set in * order for this option to have any effect. - * + *

* Default: 256 MB - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -656,9 +656,9 @@ T setReportBgIoStats( * Set the compression algorithm to use for large values stored in blob files. Note * that enable_blob_files has to be set in order for this option to have any * effect. - * + *

* Default: no compression - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -683,7 +683,7 @@ T setReportBgIoStats( * relocated to new files as they are encountered during compaction, which makes * it possible to clean up blob files once they contain nothing but * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. - * + *

* Default: false * * @param enableBlobGarbageCollection the new enabled/disabled state of blob garbage collection @@ -698,7 +698,7 @@ T setReportBgIoStats( * relocated to new files as they are encountered during compaction, which makes * it possible to clean up blob files once they contain nothing but * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. - * + *

* Default: false * * @return true if blob garbage collection is currently enabled. @@ -711,7 +711,7 @@ T setReportBgIoStats( * where N = garbage_collection_cutoff * number_of_blob_files. Note that * enable_blob_garbage_collection has to be set in order for this option to have * any effect. - * + *

* Default: 0.25 * * @param blobGarbageCollectionAgeCutoff the new age cutoff @@ -725,7 +725,7 @@ T setReportBgIoStats( * where N = garbage_collection_cutoff * number_of_blob_files. Note that * enable_blob_garbage_collection has to be set in order for this option to have * any effect. - * + *

* Default: 0.25 * * @return the current age cutoff for garbage collection @@ -738,12 +738,12 @@ T setReportBgIoStats( * the blob files in question, assuming they are all eligible based on the * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is * currently only supported with leveled compactions. - * + *

* Note that {@link #enableBlobGarbageCollection} has to be set in order for this * option to have any effect. - * + *

* Default: 1.0 - * + *

* Dynamically changeable through the SetOptions() API * * @param blobGarbageCollectionForceThreshold new value for the threshold @@ -752,16 +752,16 @@ T setReportBgIoStats( T setBlobGarbageCollectionForceThreshold(double blobGarbageCollectionForceThreshold); /** - * Get the current value for the {@link #blobGarbageCollectionForceThreshold} + * Get the current value for the {@code #blobGarbageCollectionForceThreshold} * @return the current threshold at which garbage collection of blobs is forced */ double blobGarbageCollectionForceThreshold(); /** * Set compaction readahead for blob files. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -780,9 +780,9 @@ T setReportBgIoStats( /** * Set a certain LSM tree level to enable blob files. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -794,7 +794,7 @@ T setReportBgIoStats( /** * Get the starting LSM tree level to enable blob files. - * + *

* Default: 0 * * @return the current LSM tree level to enable blob files. @@ -803,13 +803,13 @@ T setReportBgIoStats( /** * Set a certain prepopulate blob cache option. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * - * @param prepopulateBlobCache the prepopulate blob cache option + * @param prepopulateBlobCache prepopulate the blob cache option * * @return the reference to the current options. */ @@ -817,7 +817,7 @@ T setReportBgIoStats( /** * Get the prepopulate blob cache option. - * + *

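Taken together, the blob-related options in this block switch on the integrated BlobDB. A configuration sketch; every value below is illustrative, and LZ4 assumes the native library was built with LZ4 support:

    import org.rocksdb.ColumnFamilyOptions;
    import org.rocksdb.CompressionType;

    // Store values of 4 KiB and larger in blob files, roll blob files at 256 MB,
    // compress them, and let compaction garbage-collect the oldest 25%.
    final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()
        .setEnableBlobFiles(true)
        .setMinBlobSize(4L * 1024)
        .setBlobFileSize(256L * 1024 * 1024)
        .setBlobCompressionType(CompressionType.LZ4_COMPRESSION)
        .setEnableBlobGarbageCollection(true)
        .setBlobGarbageCollectionAgeCutoff(0.25);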
* Default: 0 * * @return the current prepopulate blob cache option. diff --git a/java/src/main/java/org/rocksdb/BackupEngine.java b/java/src/main/java/org/rocksdb/BackupEngine.java index 515824a91b40..3ab2206830f3 100644 --- a/java/src/main/java/org/rocksdb/BackupEngine.java +++ b/java/src/main/java/org/rocksdb/BackupEngine.java @@ -9,7 +9,7 @@ /** * BackupEngine allows you to backup * and restore the database - * + *

* Be aware, that `new BackupEngine` takes time proportional to the amount * of backups. So if you have a slow filesystem to backup * and you have a lot of backups then restoring can take some time. @@ -39,12 +39,12 @@ public static BackupEngine open(final Env env, final BackupEngineOptions options /** * Captures the state of the database in the latest backup - * + *

* Just a convenience for {@link #createNewBackup(RocksDB, boolean)} with * the flushBeforeBackup parameter set to false * * @param db The database to backup - * + *

* Note - This method is not thread safe * * @throws RocksDBException thrown if a new backup could not be created @@ -72,7 +72,7 @@ public void createNewBackup(final RocksDB db) throws RocksDBException { * always be consistent with the current state of the * database regardless of the flushBeforeBackup * parameter. - * + *

* Note - This method is not thread safe * * @throws RocksDBException thrown if a new backup could not be created @@ -105,7 +105,7 @@ public void createNewBackup( * always be consistent with the current state of the * database regardless of the flushBeforeBackup * parameter. - * + *

* Note - This method is not thread safe * * @throws RocksDBException thrown if a new backup could not be created @@ -179,11 +179,11 @@ public void deleteBackup(final int backupId) throws RocksDBException { /** * Restore the database from a backup - * + *

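For orientation, the usual create-then-restore flow with this API looks roughly as follows; the paths are illustrative, db is an already-open RocksDB, and RocksDBException handling is left to the caller:

    import org.rocksdb.BackupEngine;
    import org.rocksdb.BackupEngineOptions;
    import org.rocksdb.Env;
    import org.rocksdb.RestoreOptions;

    try (final BackupEngineOptions backupOpts = new BackupEngineOptions("/tmp/rocksdb-backups");
         final BackupEngine backupEngine = BackupEngine.open(Env.getDefault(), backupOpts)) {
      // Flush the memtable first so the backup does not depend on WAL files.
      backupEngine.createNewBackup(db, true);

      // Later (possibly in another process): restore the most recent backup.
      try (final RestoreOptions restoreOpts = new RestoreOptions(false)) {
        backupEngine.restoreDbFromLatestBackup("/tmp/rocksdb-data", "/tmp/rocksdb-data", restoreOpts);
      }
    }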
* IMPORTANT: if options.share_table_files == true and you restore the DB * from some backup that is not the latest, and you start creating new * backups from the new DB, they will probably fail! - * + *

* Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. * If you add new data to the DB and try creating a new backup now, the * database will diverge from backups 4 and 5 and the new backup will fail. @@ -226,7 +226,7 @@ public void restoreDbFromLatestBackup( restoreOptions.nativeHandle_); } - private native static long open(final long env, final long backupEngineOptions) + private static native long open(final long env, final long backupEngineOptions) throws RocksDBException; private native void createNewBackup(final long handle, final long dbHandle, diff --git a/java/src/main/java/org/rocksdb/BackupEngineOptions.java b/java/src/main/java/org/rocksdb/BackupEngineOptions.java index 6e2dacc02786..7747b944f918 100644 --- a/java/src/main/java/org/rocksdb/BackupEngineOptions.java +++ b/java/src/main/java/org/rocksdb/BackupEngineOptions.java @@ -25,7 +25,7 @@ public class BackupEngineOptions extends RocksObject { /** *

BackupEngineOptions constructor.

* - * @param path Where to keep the backup files. Has to be different than db + * @param path Where to keep the backup files. Has to be different from db * name. Best to set this to {@code db name_ + "/backups"} * @throws java.lang.IllegalArgumentException if illegal path is used. */ @@ -55,9 +55,9 @@ public String backupDir() { /** * Backup Env object. It will be used for backup file I/O. If it's - * null, backups will be written out using DBs Env. Otherwise + * null, backups will be written out using DBs Env. Otherwise, * backup's I/O will be performed using this object. - * + *

* Default: null * * @param env The environment to use @@ -72,9 +72,9 @@ public BackupEngineOptions setBackupEnv(final Env env) { /** * Backup Env object. It will be used for backup file I/O. If it's - * null, backups will be written out using DBs Env. Otherwise + * null, backups will be written out using DBs Env. Otherwise, * backup's I/O will be performed using this object. - * + *

* Default: null * * @return The environment in use @@ -128,7 +128,7 @@ public BackupEngineOptions setInfoLog(final Logger logger) { /** * Set the logger to use for Backup info and error messages - * + *

* Default: null * * @return The logger in use for the backup @@ -143,7 +143,7 @@ public Logger infoLog() { * @param sync If {@code sync == true}, we can guarantee you'll get consistent * backup even on a machine crash/reboot. Backup process is slower with sync * enabled. If {@code sync == false}, we don't guarantee anything on machine - * reboot. However, chances are some of the backups are consistent. + * reboot. However, chances are some backups are consistent. * *

Default: true

* @@ -194,7 +194,7 @@ public boolean destroyOldData() { /** *

Set if log files shall be persisted.

* - * @param backupLogFiles If false, we won't backup log files. This option can + * @param backupLogFiles If false, we won't back up log files. This option can * be useful for backing up in-memory databases where log file are * persisted, but table files are in memory. * @@ -228,10 +228,9 @@ public boolean backupLogFiles() { * * @return instance of current BackupEngineOptions. */ - public BackupEngineOptions setBackupRateLimit(long backupRateLimit) { + public BackupEngineOptions setBackupRateLimit(final long backupRateLimit) { assert(isOwningHandle()); - backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit; - setBackupRateLimit(nativeHandle_, backupRateLimit); + setBackupRateLimit(nativeHandle_, (backupRateLimit <= 0) ? 0 : backupRateLimit); return this; } @@ -250,7 +249,7 @@ public long backupRateLimit() { /** * Backup rate limiter. Used to control transfer speed for backup. If this is * not null, {@link #backupRateLimit()} is ignored. - * + *

* Default: null * * @param backupRateLimiter The rate limiter to use for the backup @@ -266,7 +265,7 @@ public BackupEngineOptions setBackupRateLimiter(final RateLimiter backupRateLimi /** * Backup rate limiter. Used to control transfer speed for backup. If this is * not null, {@link #backupRateLimit()} is ignored. - * + *

* Default: null * * @return The rate limiter in use for the backup @@ -286,10 +285,9 @@ public RateLimiter backupRateLimiter() { * * @return instance of current BackupEngineOptions. */ - public BackupEngineOptions setRestoreRateLimit(long restoreRateLimit) { + public BackupEngineOptions setRestoreRateLimit(final long restoreRateLimit) { assert(isOwningHandle()); - restoreRateLimit = (restoreRateLimit <= 0) ? 0 : restoreRateLimit; - setRestoreRateLimit(nativeHandle_, restoreRateLimit); + setRestoreRateLimit(nativeHandle_, (restoreRateLimit <= 0) ? 0 : restoreRateLimit); return this; } @@ -308,7 +306,7 @@ public long restoreRateLimit() { /** * Restore rate limiter. Used to control transfer speed during restore. If * this is not null, {@link #restoreRateLimit()} is ignored. - * + *

* Default: null * * @param restoreRateLimiter The rate limiter to use during restore @@ -324,7 +322,7 @@ public BackupEngineOptions setRestoreRateLimiter(final RateLimiter restoreRateLi /** * Restore rate limiter. Used to control transfer speed during restore. If * this is not null, {@link #restoreRateLimit()} is ignored. - * + *

* Default: null * * @return The rate limiter in use during restore @@ -400,7 +398,7 @@ public int maxBackgroundOperations() { /** * During backup user can get callback every time next * {@link #callbackTriggerIntervalSize()} bytes being copied. - * + *

* Default: 4194304 * * @param callbackTriggerIntervalSize The interval size for the @@ -416,8 +414,8 @@ public BackupEngineOptions setCallbackTriggerIntervalSize( /** * During backup user can get callback every time next - * {@link #callbackTriggerIntervalSize()} bytes being copied. - * + * {@code #callbackTriggerIntervalSize()} bytes being copied. + *

* Default: 4194304 * * @return The interval size for the callback trigger @@ -427,7 +425,7 @@ public long callbackTriggerIntervalSize() { return callbackTriggerIntervalSize(nativeHandle_); } - private native static long newBackupEngineOptions(final String path); + private static native long newBackupEngineOptions(final String path); private native String backupDir(long handle); private native void setBackupEnv(final long handle, final long envHandle); private native void setShareTableFiles(long handle, boolean flag); diff --git a/java/src/main/java/org/rocksdb/BackupInfo.java b/java/src/main/java/org/rocksdb/BackupInfo.java index 9244e4eb19ee..9581b098fc4c 100644 --- a/java/src/main/java/org/rocksdb/BackupInfo.java +++ b/java/src/main/java/org/rocksdb/BackupInfo.java @@ -68,9 +68,9 @@ public String appMetadata() { return app_metadata_; } - private int backupId_; - private long timestamp_; - private long size_; - private int numberFiles_; - private String app_metadata_; + private final int backupId_; + private final long timestamp_; + private final long size_; + private final int numberFiles_; + private final String app_metadata_; } diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index 0404fc620c8d..c82c3ea10ee9 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -5,13 +5,13 @@ package org.rocksdb; /** - * The config for plain table sst format. - * + * The config for block based table sst format. + *

* BlockBasedTable is a RocksDB's default SST file format. */ -//TODO(AR) should be renamed BlockBasedTableOptions +// TODO(AR) should be renamed BlockBasedTableOptions public class BlockBasedTableConfig extends TableFormatConfig { - + @SuppressWarnings("PMD.NullAssignment") public BlockBasedTableConfig() { //TODO(AR) flushBlockPolicyFactory cacheIndexAndFilterBlocks = false; @@ -21,11 +21,10 @@ public BlockBasedTableConfig() { indexType = IndexType.kBinarySearch; dataBlockIndexType = DataBlockIndexType.kDataBlockBinarySearch; dataBlockHashTableUtilRatio = 0.75; - checksumType = ChecksumType.kCRC32c; + checksumType = ChecksumType.kXXH3; noBlockCache = false; blockCache = null; persistentCache = null; - blockCacheCompressed = null; blockSize = 4 * 1024; blockSizeDeviation = 10; blockRestartInterval = 16; @@ -46,10 +45,55 @@ public BlockBasedTableConfig() { // NOTE: ONLY used if blockCache == null blockCacheSize = 8 * 1024 * 1024; blockCacheNumShardBits = 0; + } - // NOTE: ONLY used if blockCacheCompressed == null - blockCacheCompressedSize = 0; - blockCacheCompressedNumShardBits = 0; + /** + * Constructor for use by C++ via JNI + */ + private BlockBasedTableConfig(final boolean cacheIndexAndFilterBlocks, + final boolean cacheIndexAndFilterBlocksWithHighPriority, + final boolean pinL0FilterAndIndexBlocksInCache, final boolean pinTopLevelIndexAndFilter, + final byte indexType, final byte dataBlockIndexType, final double dataBlockHashTableUtilRatio, + final byte checksumType, final boolean noBlockCache, final long blockSize, + final int blockSizeDeviation, final int blockRestartInterval, + final int indexBlockRestartInterval, final long metadataBlockSize, + final boolean partitionFilters, final boolean optimizeFiltersForMemory, + final boolean useDeltaEncoding, final boolean wholeKeyFiltering, + final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, + final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening, + final byte filterPolicyType, final long filterPolicyHandle, + final double filterPolicyConfigValue) { + this.cacheIndexAndFilterBlocks = cacheIndexAndFilterBlocks; + this.cacheIndexAndFilterBlocksWithHighPriority = cacheIndexAndFilterBlocksWithHighPriority; + this.pinL0FilterAndIndexBlocksInCache = pinL0FilterAndIndexBlocksInCache; + this.pinTopLevelIndexAndFilter = pinTopLevelIndexAndFilter; + this.indexType = IndexType.values()[indexType]; + this.dataBlockIndexType = DataBlockIndexType.values()[dataBlockIndexType]; + this.dataBlockHashTableUtilRatio = dataBlockHashTableUtilRatio; + this.checksumType = ChecksumType.values()[checksumType]; + this.noBlockCache = noBlockCache; + this.blockSize = blockSize; + this.blockSizeDeviation = blockSizeDeviation; + this.blockRestartInterval = blockRestartInterval; + this.indexBlockRestartInterval = indexBlockRestartInterval; + this.metadataBlockSize = metadataBlockSize; + this.partitionFilters = partitionFilters; + this.optimizeFiltersForMemory = optimizeFiltersForMemory; + this.useDeltaEncoding = useDeltaEncoding; + this.wholeKeyFiltering = wholeKeyFiltering; + this.verifyCompression = verifyCompression; + this.readAmpBytesPerBit = readAmpBytesPerBit; + this.formatVersion = formatVersion; + this.enableIndexCompression = enableIndexCompression; + this.blockAlign = blockAlign; + this.indexShortening = IndexShorteningMode.values()[indexShortening]; + try (Filter filterPolicy = FilterPolicyType.values()[filterPolicyType].createFilter( + filterPolicyHandle, filterPolicyConfigValue)) { + 
if (filterPolicy != null) { + filterPolicy.disOwnNativeHandle(); + this.setFilterPolicy(filterPolicy); + } + } } /** @@ -248,7 +292,7 @@ public boolean noBlockCache() { * Disable block cache. If this is set to true, * then no block cache should be used, and the {@link #setBlockCache(Cache)} * should point to a {@code null} object. - * + *
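Illustrative sketch (not part of the patch): the hunk above changes the default checksum from kCRC32c to kXXH3 and drops the compressed block cache fields, so a caller who wants the old checksum behaviour would now pin it explicitly. `options` is assumed to be an existing Options or ColumnFamilyOptions instance.

    // Sketch only: override the new kXXH3 default if CRC32c checksums are still wanted.
    // Note there is no longer a setBlockCacheCompressed(...) to call after this change.
    final BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
        .setChecksumType(ChecksumType.kCRC32c) // new default in this change is kXXH3
        .setBlockSize(16 * 1024)
        .setFilterPolicy(new BloomFilter(10));
    options.setTableFormatConfig(tableConfig); // 'options' is assumed to exist elsewhere
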

* Default: false * * @param noBlockCache if use block cache @@ -262,10 +306,10 @@ public BlockBasedTableConfig setNoBlockCache(final boolean noBlockCache) { /** * Use the specified cache for blocks. * When not null this take precedence even if the user sets a block cache size. - * + *

* {@link org.rocksdb.Cache} should not be disposed before options instances * using this cache is disposed. - * + *
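A quick illustration of the lifetime rule described here (illustrative only, not part of the patch): one Cache instance may back several options objects, and the options should be closed before the cache. Try-with-resources closes resources in reverse declaration order, which satisfies that rule.

    try (final Cache sharedCache = new LRUCache(64 * 1024 * 1024); // 64 MB, shared
         final Options optionsA = new Options().setTableFormatConfig(
             new BlockBasedTableConfig().setBlockCache(sharedCache));
         final Options optionsB = new Options().setTableFormatConfig(
             new BlockBasedTableConfig().setBlockCache(sharedCache))) {
      // open/use databases with optionsA and optionsB here;
      // the options are closed before the cache, as the javadoc requires
    }
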

* {@link org.rocksdb.Cache} instance can be re-used in multiple options * instances. * @@ -281,7 +325,7 @@ public BlockBasedTableConfig setBlockCache(final Cache blockCache) { /** * Use the specified persistent cache. - * + *

* If {@code !null} use the specified cache for pages read from device, * otherwise no page cache is used. * @@ -295,31 +339,6 @@ public BlockBasedTableConfig setPersistentCache( return this; } - /** - * Use the specified cache for compressed blocks. - * - * If {@code null}, RocksDB will not use a compressed block cache. - * - * Note: though it looks similar to {@link #setBlockCache(Cache)}, RocksDB - * doesn't put the same type of object there. - * - * {@link org.rocksdb.Cache} should not be disposed before options instances - * using this cache is disposed. - * - * {@link org.rocksdb.Cache} instance can be re-used in multiple options - * instances. - * - * @param blockCacheCompressed {@link org.rocksdb.Cache} Cache java instance - * (e.g. LRUCache). - * - * @return the reference to the current config. - */ - public BlockBasedTableConfig setBlockCacheCompressed( - final Cache blockCacheCompressed) { - this.blockCacheCompressed = blockCacheCompressed; - return this; - } - /** * Get the approximate size of user data packed per block. * @@ -357,7 +376,7 @@ public int blockSizeDeviation() { * is less than this specified number and adding a new record to the block * will exceed the configured block size, then this block will be closed and * the new record will be written to the next block. - * + *

* Default is 10. * * @param blockSizeDeviation the deviation to block size allowed @@ -444,7 +463,7 @@ public boolean partitionFilters() { /** * Use partitioned full filters for each SST file. This option is incompatible * with block-based filters. - * + *

* Defaults to false. * * @param partitionFilters use partition filters. @@ -458,7 +477,7 @@ public BlockBasedTableConfig setPartitionFilters(final boolean partitionFilters) /*** * Option to generate Bloom filters that minimize memory * internal fragmentation. - * + *

* See {@link #setOptimizeFiltersForMemory(boolean)}. * * @return true if bloom filters are used to minimize memory internal @@ -472,7 +491,7 @@ public boolean optimizeFiltersForMemory() { /** * Option to generate Bloom filters that minimize memory * internal fragmentation. - * + *

* When false, malloc_usable_size is not available, or format_version < 5, * filters are generated without regard to internal fragmentation when * loaded into memory (historical behavior). When true (and @@ -482,21 +501,21 @@ public boolean optimizeFiltersForMemory() { * the reading DB has the same memory allocation characteristics as the * generating DB. This option does not break forward or backward * compatibility. - * + *

* While individual filters will vary in bits/key and false positive rate * when setting is true, the implementation attempts to maintain a weighted * average FP rate for filters consistent with this option set to false. - * + *

* With Jemalloc for example, this setting is expected to save about 10% of * the memory footprint and block cache charge of filters, while increasing * disk usage of filters by about 1-2% due to encoding efficiency losses * with variance in bits/key. - * + *

* NOTE: Because some memory counted by block cache might be unmapped pages * within internal fragmentation, this option can increase observed RSS * memory usage. With {@link #cacheIndexAndFilterBlocks()} == true, * this option makes the block cache better at using space it is allowed. - * + *
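A hedged example of enabling this option (sketch only; the roughly 10% saving quoted above depends on the allocator in use, e.g. jemalloc):

    final BlockBasedTableConfig tableConfig = new BlockBasedTableConfig()
        .setFilterPolicy(new BloomFilter(10))   // bloom filters to optimize
        .setCacheIndexAndFilterBlocks(true)     // charge filters to the block cache
        .setOptimizeFiltersForMemory(true);     // round filter sizes to allocator bin sizes
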

* NOTE: Do not set to true if you do not trust malloc_usable_size. With * this option, RocksDB might access an allocated memory object beyond its * original size if malloc_usable_size says it is safe to do so. While this @@ -525,9 +544,9 @@ public boolean useDeltaEncoding() { /** * Use delta encoding to compress keys in blocks. - * + *

* NOTE: {@link ReadOptions#pinData()} requires this option to be disabled. - * + *

* Default: true * * @param useDeltaEncoding true to enable delta encoding @@ -551,10 +570,10 @@ public Filter filterPolicy() { /** * Use the specified filter policy to reduce disk reads. - * + *

* {@link org.rocksdb.Filter} should not be closed before options instances * using this filter are closed. - * + *

* {@link org.rocksdb.Filter} instance can be re-used in multiple options * instances. * @@ -606,7 +625,7 @@ public BlockBasedTableConfig setWholeKeyFiltering( /** * Returns true when compression verification is enabled. - * + *

* See {@link #setVerifyCompression(boolean)}. * * @return true if compression verification is enabled. @@ -632,7 +651,7 @@ public BlockBasedTableConfig setVerifyCompression( /** * Get the Read amplification bytes per-bit. - * + *

* See {@link #setReadAmpBytesPerBit(int)}. * * @return the bytes per-bit. @@ -643,27 +662,27 @@ public int readAmpBytesPerBit() { /** * Set the Read amplification bytes per-bit. - * + *

* If used, For every data block we load into memory, we will create a bitmap * of size ((block_size / `read_amp_bytes_per_bit`) / 8) bytes. This bitmap * will be used to figure out the percentage we actually read of the blocks. - * + *

* When this feature is used Tickers::READ_AMP_ESTIMATE_USEFUL_BYTES and * Tickers::READ_AMP_TOTAL_READ_BYTES can be used to calculate the * read amplification using this formula * (READ_AMP_TOTAL_READ_BYTES / READ_AMP_ESTIMATE_USEFUL_BYTES) - * + *

* value => memory usage (percentage of loaded blocks memory) * 1 => 12.50 % * 2 => 06.25 % * 4 => 03.12 % * 8 => 01.56 % * 16 => 00.78 % - * + *

* Note: This number must be a power of 2, if not it will be sanitized * to be the next lowest power of 2, for example a value of 7 will be * treated as 4, a value of 19 will be treated as 16. - * + *
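To make the formula above concrete, a small sketch (illustrative only; assumes a Statistics object is attached to the options and that 'dbOptions' and 'tableConfig' exist elsewhere): with a 4 KiB block and read_amp_bytes_per_bit = 32, the per-block bitmap costs (4096 / 32) / 8 = 16 bytes, roughly 0.4% of the block.

    final Statistics stats = new Statistics();
    dbOptions.setStatistics(stats);        // enables the read-amplification tickers below
    tableConfig.setReadAmpBytesPerBit(32); // must be a power of 2

    // Later, estimate read amplification as described above:
    final long useful = stats.getTickerCount(TickerType.READ_AMP_ESTIMATE_USEFUL_BYTES);
    final long total = stats.getTickerCount(TickerType.READ_AMP_TOTAL_READ_BYTES);
    final double readAmp = useful == 0 ? 0.0 : (double) total / useful;
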

* Default: 0 (disabled) * * @param readAmpBytesPerBit the bytes per-bit @@ -729,7 +748,7 @@ public BlockBasedTableConfig setFormatVersion( /** * Determine if index compression is enabled. - * + *

* See {@link #setEnableIndexCompression(boolean)}. * * @return true if index compression is enabled, false otherwise @@ -740,7 +759,7 @@ public boolean enableIndexCompression() { /** * Store index blocks on disk in compressed format. - * + *

* Changing this option to false will avoid the overhead of decompression * if index blocks are evicted and read back. * @@ -859,64 +878,6 @@ public BlockBasedTableConfig setCacheNumShardBits( return this; } - /** - * Size of compressed block cache. If 0, then block_cache_compressed is set - * to null. - * - * @return size of compressed block cache. - */ - @Deprecated - public long blockCacheCompressedSize() { - return blockCacheCompressedSize; - } - - /** - * Size of compressed block cache. If 0, then block_cache_compressed is set - * to null. - * - * @param blockCacheCompressedSize of compressed block cache. - * @return the reference to the current config. - * - * @deprecated Use {@link #setBlockCacheCompressed(Cache)}. - */ - @Deprecated - public BlockBasedTableConfig setBlockCacheCompressedSize( - final long blockCacheCompressedSize) { - this.blockCacheCompressedSize = blockCacheCompressedSize; - return this; - } - - /** - * Controls the number of shards for the block compressed cache. - * This is applied only if blockCompressedCacheSize is set to non-negative. - * - * @return numShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings. - */ - @Deprecated - public int blockCacheCompressedNumShardBits() { - return blockCacheCompressedNumShardBits; - } - - /** - * Controls the number of shards for the block compressed cache. - * This is applied only if blockCompressedCacheSize is set to non-negative. - * - * @param blockCacheCompressedNumShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings." - * @return the reference to the current option. - * - * @deprecated Use {@link #setBlockCacheCompressed(Cache)}. - */ - @Deprecated - public BlockBasedTableConfig setBlockCacheCompressedNumShardBits( - final int blockCacheCompressedNumShardBits) { - this.blockCacheCompressedNumShardBits = blockCacheCompressedNumShardBits; - return this; - } - /** * Influence the behavior when kHashSearch is used. 
* if false, stores a precise prefix to block range mapping @@ -977,23 +938,15 @@ public BlockBasedTableConfig setHashIndexAllowCollision( persistentCacheHandle = 0; } - final long blockCacheCompressedHandle; - if (blockCacheCompressed != null) { - blockCacheCompressedHandle = blockCacheCompressed.nativeHandle_; - } else { - blockCacheCompressedHandle = 0; - } - return newTableFactoryHandle(cacheIndexAndFilterBlocks, cacheIndexAndFilterBlocksWithHighPriority, pinL0FilterAndIndexBlocksInCache, pinTopLevelIndexAndFilter, indexType.getValue(), dataBlockIndexType.getValue(), dataBlockHashTableUtilRatio, checksumType.getValue(), noBlockCache, blockCacheHandle, - persistentCacheHandle, blockCacheCompressedHandle, blockSize, blockSizeDeviation, - blockRestartInterval, indexBlockRestartInterval, metadataBlockSize, partitionFilters, - optimizeFiltersForMemory, useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, - verifyCompression, readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign, - indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits, - blockCacheCompressedSize, blockCacheCompressedNumShardBits); + persistentCacheHandle, blockSize, blockSizeDeviation, blockRestartInterval, + indexBlockRestartInterval, metadataBlockSize, partitionFilters, optimizeFiltersForMemory, + useDeltaEncoding, filterPolicyHandle, wholeKeyFiltering, verifyCompression, + readAmpBytesPerBit, formatVersion, enableIndexCompression, blockAlign, + indexShortening.getValue(), blockCacheSize, blockCacheNumShardBits); } private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlocks, @@ -1002,18 +955,15 @@ private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlock final byte indexTypeValue, final byte dataBlockIndexTypeValue, final double dataBlockHashTableUtilRatio, final byte checksumTypeValue, final boolean noBlockCache, final long blockCacheHandle, final long persistentCacheHandle, - final long blockCacheCompressedHandle, final long blockSize, final int blockSizeDeviation, - final int blockRestartInterval, final int indexBlockRestartInterval, - final long metadataBlockSize, final boolean partitionFilters, - final boolean optimizeFiltersForMemory, final boolean useDeltaEncoding, - final long filterPolicyHandle, final boolean wholeKeyFiltering, - final boolean verifyCompression, final int readAmpBytesPerBit, final int formatVersion, - final boolean enableIndexCompression, final boolean blockAlign, final byte indexShortening, - - @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits, + final long blockSize, final int blockSizeDeviation, final int blockRestartInterval, + final int indexBlockRestartInterval, final long metadataBlockSize, + final boolean partitionFilters, final boolean optimizeFiltersForMemory, + final boolean useDeltaEncoding, final long filterPolicyHandle, + final boolean wholeKeyFiltering, final boolean verifyCompression, + final int readAmpBytesPerBit, final int formatVersion, final boolean enableIndexCompression, + final boolean blockAlign, final byte indexShortening, - @Deprecated final long blockCacheCompressedSize, - @Deprecated final int blockCacheCompressedNumShardBits); + @Deprecated final long blockCacheSize, @Deprecated final int blockCacheNumShardBits); //TODO(AR) flushBlockPolicyFactory private boolean cacheIndexAndFilterBlocks; @@ -1027,7 +977,6 @@ private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlock private boolean noBlockCache; private Cache blockCache; private 
PersistentCache persistentCache; - private Cache blockCacheCompressed; private long blockSize; private int blockSizeDeviation; private int blockRestartInterval; @@ -1048,8 +997,4 @@ private native long newTableFactoryHandle(final boolean cacheIndexAndFilterBlock // NOTE: ONLY used if blockCache == null @Deprecated private long blockCacheSize; @Deprecated private int blockCacheNumShardBits; - - // NOTE: ONLY used if blockCacheCompressed == null - @Deprecated private long blockCacheCompressedSize; - @Deprecated private int blockCacheCompressedNumShardBits; } diff --git a/java/src/main/java/org/rocksdb/BloomFilter.java b/java/src/main/java/org/rocksdb/BloomFilter.java index 8aff715b79c0..c08966c0ee32 100644 --- a/java/src/main/java/org/rocksdb/BloomFilter.java +++ b/java/src/main/java/org/rocksdb/BloomFilter.java @@ -5,6 +5,8 @@ package org.rocksdb; +import java.util.Objects; + /** * Bloom filter policy that uses a bloom filter with approximately * the specified number of bits per key. @@ -33,6 +35,9 @@ public BloomFilter() { this(DEFAULT_BITS_PER_KEY); } + // record this for comparison of filters. + private final double bitsPerKey; + /** * BloomFilter constructor * @@ -47,7 +52,17 @@ public BloomFilter() { * @param bitsPerKey number of bits to use */ public BloomFilter(final double bitsPerKey) { - super(createNewBloomFilter(bitsPerKey)); + this(createNewBloomFilter(bitsPerKey), bitsPerKey); + } + + /** + * + * @param nativeHandle handle to existing bloom filter at RocksDB C++ side + * @param bitsPerKey number of bits to use - recorded for comparison + */ + BloomFilter(final long nativeHandle, final double bitsPerKey) { + super(nativeHandle); + this.bitsPerKey = bitsPerKey; } /** @@ -65,9 +80,25 @@ public BloomFilter(final double bitsPerKey) { * @param bitsPerKey number of bits to use * @param IGNORED_useBlockBasedMode obsolete, ignored parameter */ + @SuppressWarnings("PMD.UnusedFormalParameter") public BloomFilter(final double bitsPerKey, final boolean IGNORED_useBlockBasedMode) { this(bitsPerKey); } - private native static long createNewBloomFilter(final double bitsKeyKey); + @SuppressWarnings("PMD.") + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + return bitsPerKey == ((BloomFilter) o).bitsPerKey; + } + + @Override + public int hashCode() { + return Objects.hash(bitsPerKey); + } + + private static native long createNewBloomFilter(final double bitsKeyKey); } diff --git a/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java b/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java index 8eef954478c3..4ab9e8475ce9 100644 --- a/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java +++ b/java/src/main/java/org/rocksdb/ByteBufferGetStatus.java @@ -12,7 +12,7 @@ /** * A ByteBuffer containing fetched data, together with a result for the fetch * and the total size of the object fetched. - * + *
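The BloomFilter change above records bitsPerKey so that two filter objects compare by value rather than by native handle. An illustrative consequence (not part of the patch):

    try (final BloomFilter a = new BloomFilter(10);
         final BloomFilter b = new BloomFilter(10);
         final BloomFilter c = new BloomFilter(20)) {
      assert a.equals(b);  // same bits-per-key, now considered equal
      assert !a.equals(c); // different bits-per-key, not equal
    }
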

* Used for the individual results of * {@link RocksDB#multiGetByteBuffers(List, List)} * {@link RocksDB#multiGetByteBuffers(List, List, List)} @@ -42,6 +42,7 @@ public class ByteBufferGetStatus { * * @param status the status of the request to fetch into the buffer */ + @SuppressWarnings("PMD.NullAssignment") ByteBufferGetStatus(final Status status) { this.status = status; this.requiredSize = 0; diff --git a/java/src/main/java/org/rocksdb/Cache.java b/java/src/main/java/org/rocksdb/Cache.java index 569a1df06cf5..04bd3fcaa398 100644 --- a/java/src/main/java/org/rocksdb/Cache.java +++ b/java/src/main/java/org/rocksdb/Cache.java @@ -35,6 +35,6 @@ public long getPinnedUsage() { return getPinnedUsage(this.nativeHandle_); } - private native static long getUsage(final long handle); - private native static long getPinnedUsage(final long handle); + private static native long getUsage(final long handle); + private static native long getPinnedUsage(final long handle); } diff --git a/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java b/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java index 6c87cc1884f9..12854c5102be 100644 --- a/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java +++ b/java/src/main/java/org/rocksdb/CassandraCompactionFilter.java @@ -10,10 +10,11 @@ */ public class CassandraCompactionFilter extends AbstractCompactionFilter { - public CassandraCompactionFilter(boolean purgeTtlOnExpiration, int gcGracePeriodInSeconds) { + public CassandraCompactionFilter( + final boolean purgeTtlOnExpiration, final int gcGracePeriodInSeconds) { super(createNewCassandraCompactionFilter0(purgeTtlOnExpiration, gcGracePeriodInSeconds)); } - private native static long createNewCassandraCompactionFilter0( + private static native long createNewCassandraCompactionFilter0( boolean purgeTtlOnExpiration, int gcGracePeriodInSeconds); } diff --git a/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java b/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java index 4b0c71ba5a5f..732faee207a6 100644 --- a/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java +++ b/java/src/main/java/org/rocksdb/CassandraValueMergeOperator.java @@ -10,16 +10,16 @@ * values. 
*/ public class CassandraValueMergeOperator extends MergeOperator { - public CassandraValueMergeOperator(int gcGracePeriodInSeconds) { + public CassandraValueMergeOperator(final int gcGracePeriodInSeconds) { super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, 0)); - } + } - public CassandraValueMergeOperator(int gcGracePeriodInSeconds, int operandsLimit) { - super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, operandsLimit)); - } + public CassandraValueMergeOperator(final int gcGracePeriodInSeconds, final int operandsLimit) { + super(newSharedCassandraValueMergeOperator(gcGracePeriodInSeconds, operandsLimit)); + } - private native static long newSharedCassandraValueMergeOperator( - int gcGracePeriodInSeconds, int limit); + private static native long newSharedCassandraValueMergeOperator( + int gcGracePeriodInSeconds, int limit); - @Override protected final native void disposeInternal(final long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/Checkpoint.java b/java/src/main/java/org/rocksdb/Checkpoint.java index 0009699325c0..347221df6ed6 100644 --- a/java/src/main/java/org/rocksdb/Checkpoint.java +++ b/java/src/main/java/org/rocksdb/Checkpoint.java @@ -31,8 +31,7 @@ public static Checkpoint create(final RocksDB db) { throw new IllegalStateException( "RocksDB instance must be initialized."); } - Checkpoint checkpoint = new Checkpoint(db); - return checkpoint; + return new Checkpoint(db); } /** @@ -51,16 +50,22 @@ public void createCheckpoint(final String checkpointPath) createCheckpoint(nativeHandle_, checkpointPath); } + public ExportImportFilesMetaData exportColumnFamily(final ColumnFamilyHandle columnFamilyHandle, + final String exportPath) throws RocksDBException { + return new ExportImportFilesMetaData( + exportColumnFamily(nativeHandle_, columnFamilyHandle.nativeHandle_, exportPath)); + } + private Checkpoint(final RocksDB db) { super(newCheckpoint(db.nativeHandle_)); - this.db_ = db; } - private final RocksDB db_; - private static native long newCheckpoint(long dbHandle); @Override protected final native void disposeInternal(final long handle); private native void createCheckpoint(long handle, String checkpointPath) throws RocksDBException; + + private native long exportColumnFamily(long handle, long columnFamilyHandle, String exportPath) + throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/ChecksumType.java b/java/src/main/java/org/rocksdb/ChecksumType.java index e03fa14bace8..5b3d2249250f 100644 --- a/java/src/main/java/org/rocksdb/ChecksumType.java +++ b/java/src/main/java/org/rocksdb/ChecksumType.java @@ -37,7 +37,7 @@ public byte getValue() { return value_; } - private ChecksumType(final byte value) { + ChecksumType(final byte value) { value_ = value; } diff --git a/java/src/main/java/org/rocksdb/ClockCache.java b/java/src/main/java/org/rocksdb/ClockCache.java index a66dc0e8a72b..f9f6da74c081 100644 --- a/java/src/main/java/org/rocksdb/ClockCache.java +++ b/java/src/main/java/org/rocksdb/ClockCache.java @@ -8,12 +8,18 @@ /** * Similar to {@link LRUCache}, but based on the CLOCK algorithm with * better concurrent performance in some cases + * + * @deprecated The old Clock Cache implementation had an unresolved bug and + * has been removed. The new HyperClockCache requires an additional + * configuration parameter that is not provided by this API. This function + * simply returns a new LRUCache for functional compatibility. 
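Given the deprecation note above (the Java ClockCache now simply allocates an LRUCache), a hedged migration sketch: callers can switch to LRUCache directly with no behavioural difference. 'tableConfig' is assumed to be an existing BlockBasedTableConfig.

    // Before: final Cache cache = new ClockCache(128 * 1024 * 1024);
    // After this change the line above returns an LRU cache anyway, so prefer:
    final Cache cache = new LRUCache(128 * 1024 * 1024);
    tableConfig.setBlockCache(cache);
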
*/ public class ClockCache extends Cache { - /** * Create a new cache with a fixed size capacity. * + * @deprecated The old Clock Cache implementation had an unresolved bug and has been removed. + * * @param capacity The fixed size capacity of the cache */ public ClockCache(final long capacity) { @@ -27,6 +33,8 @@ public ClockCache(final long capacity) { * numShardBits = -1 means it is automatically determined: every shard * will be at least 512KB and number of shard bits will not exceed 6. * + * @deprecated The old Clock Cache implementation had an unresolved bug and has been removed. + * * @param capacity The fixed size capacity of the cache * @param numShardBits The cache is sharded to 2^numShardBits shards, * by hash of the key @@ -43,6 +51,8 @@ public ClockCache(final long capacity, final int numShardBits) { * numShardBits = -1 means it is automatically determined: every shard * will be at least 512KB and number of shard bits will not exceed 6. * + * @deprecated The old Clock Cache implementation had an unresolved bug and has been removed. + * * @param capacity The fixed size capacity of the cache * @param numShardBits The cache is sharded to 2^numShardBits shards, * by hash of the key @@ -53,7 +63,7 @@ public ClockCache(final long capacity, final int numShardBits, super(newClockCache(capacity, numShardBits, strictCapacityLimit)); } - private native static long newClockCache(final long capacity, - final int numShardBits, final boolean strictCapacityLimit); + private static native long newClockCache( + final long capacity, final int numShardBits, final boolean strictCapacityLimit); @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java index 125a8dcf8513..dd9567829b42 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -33,8 +33,9 @@ public ColumnFamilyDescriptor(final byte[] columnFamilyName) { * column family. * @since 3.10.0 */ - public ColumnFamilyDescriptor(final byte[] columnFamilyName, - final ColumnFamilyOptions columnFamilyOptions) { + @SuppressWarnings("PMD.ArrayIsStoredDirectly") + public ColumnFamilyDescriptor( + final byte[] columnFamilyName, final ColumnFamilyOptions columnFamilyOptions) { columnFamilyName_ = columnFamilyName; columnFamilyOptions_ = columnFamilyOptions; } @@ -45,6 +46,7 @@ public ColumnFamilyDescriptor(final byte[] columnFamilyName, * @return column family name. * @since 3.10.0 */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public byte[] getName() { return columnFamilyName_; } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 1ac0a35bbd71..9fd63e768052 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -32,17 +32,17 @@ public class ColumnFamilyHandle extends RocksObject { /** * Constructor called only from JNI. - * + *

* NOTE: we are producing an additional Java Object here to represent the underlying native C++ * ColumnFamilyHandle object. The underlying object is not owned by ourselves. The Java API user * likely already had a ColumnFamilyHandle Java object which owns the underlying C++ object, as * they will have been presented it when they opened the database or added a Column Family. - * + *

* * TODO(AR) - Potentially a better design would be to cache the active Java Column Family Objects * in RocksDB, and return the same Java Object instead of instantiating a new one here. This could * also help us to improve the Java API semantics for Java users. See for example - * https://github.com/facebook/rocksdb/issues/2687. + * .... * * @param nativeHandle native handle to the column family. */ @@ -80,7 +80,7 @@ public int getID() { * information, this call might internally lock and release DB mutex to * access the up-to-date CF options. In addition, all the pointer-typed * options cannot be referenced any longer than the original options exist. - * + *

* Note that this function is not supported in RocksDBLite. * * @return the up-to-date descriptor. @@ -102,12 +102,12 @@ public boolean equals(final Object o) { return false; } - final ColumnFamilyHandle that = (ColumnFamilyHandle) o; + @SuppressWarnings("PMD.CloseResource") final ColumnFamilyHandle that = (ColumnFamilyHandle) o; try { return rocksDB_.nativeHandle_ == that.rocksDB_.nativeHandle_ && getID() == that.getID() && Arrays.equals(getName(), that.getName()); - } catch (RocksDBException e) { + } catch (final RocksDBException e) { throw new RuntimeException("Cannot compare column family handles", e); } } @@ -118,7 +118,7 @@ public int hashCode() { int result = Objects.hash(getID(), rocksDB_.nativeHandle_); result = 31 * result + Arrays.hashCode(getName()); return result; - } catch (RocksDBException e) { + } catch (final RocksDBException e) { throw new RuntimeException("Cannot calculate hash code of column family handle", e); } } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java b/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java index 1919040172d3..9b6d1a70cc80 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyMetaData.java @@ -11,6 +11,7 @@ /** * The metadata that describes a column family. */ +@SuppressWarnings("PMD.MissingStaticMethodInNonInstantiatableClass") public class ColumnFamilyMetaData { private final long size; private final long fileCount; @@ -55,6 +56,7 @@ public long fileCount() { * * @return the name */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public byte[] name() { return name; } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index a642cb6fabf4..607a17936e16 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -11,36 +11,32 @@ /** * ColumnFamilyOptions to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). - * - * As a descendent of {@link AbstractNativeReference}, this class is {@link AutoCloseable} + *

+ * As a descendant of {@link AbstractNativeReference}, this class is {@link AutoCloseable} * and will be automatically released if opened in the preamble of a try with resources block. */ -public class ColumnFamilyOptions extends RocksObject - implements ColumnFamilyOptionsInterface, - MutableColumnFamilyOptionsInterface { - static { - RocksDB.loadLibrary(); - } - +public class ColumnFamilyOptions + extends RocksObject implements ColumnFamilyOptionsInterface, + MutableColumnFamilyOptionsInterface { /** * Construct ColumnFamilyOptions. - * + *

* This constructor will create (by allocating a block of memory) * an {@code rocksdb::ColumnFamilyOptions} in the c++ side. */ public ColumnFamilyOptions() { - super(newColumnFamilyOptions()); + super(newColumnFamilyOptionsInstance()); } /** * Copy constructor for ColumnFamilyOptions. - * + *

* NOTE: This does a shallow copy, which means comparator, merge_operator, compaction_filter, * compaction_filter_factory and other pointers will be cloned! * * @param other The ColumnFamilyOptions to copy. */ - public ColumnFamilyOptions(ColumnFamilyOptions other) { + public ColumnFamilyOptions(final ColumnFamilyOptions other) { super(copyColumnFamilyOptions(other.nativeHandle_)); this.memTableConfig_ = other.memTableConfig_; this.tableFormatConfig_ = other.tableFormatConfig_; @@ -602,6 +598,10 @@ public ColumnFamilyOptions setTableFormatConfig( return this; } + void setFetchedTableFormatConfig(final TableFormatConfig tableFormatConfig) { + this.tableFormatConfig_ = tableFormatConfig; + } + @Override public String tableFactoryName() { assert(isOwningHandle()); @@ -707,7 +707,7 @@ public boolean memtableWholeKeyFiltering() { } @Override - public ColumnFamilyOptions setBloomLocality(int bloomLocality) { + public ColumnFamilyOptions setBloomLocality(final int bloomLocality) { setBloomLocality(nativeHandle_, bloomLocality); return this; } @@ -742,9 +742,7 @@ public boolean optimizeFiltersForHits() { } @Override - public ColumnFamilyOptions - setMemtableHugePageSize( - long memtableHugePageSize) { + public ColumnFamilyOptions setMemtableHugePageSize(final long memtableHugePageSize) { setMemtableHugePageSize(nativeHandle_, memtableHugePageSize); return this; @@ -756,7 +754,8 @@ public long memtableHugePageSize() { } @Override - public ColumnFamilyOptions setSoftPendingCompactionBytesLimit(long softPendingCompactionBytesLimit) { + public ColumnFamilyOptions setSoftPendingCompactionBytesLimit( + final long softPendingCompactionBytesLimit) { setSoftPendingCompactionBytesLimit(nativeHandle_, softPendingCompactionBytesLimit); return this; @@ -768,7 +767,8 @@ public long softPendingCompactionBytesLimit() { } @Override - public ColumnFamilyOptions setHardPendingCompactionBytesLimit(long hardPendingCompactionBytesLimit) { + public ColumnFamilyOptions setHardPendingCompactionBytesLimit( + final long hardPendingCompactionBytesLimit) { setHardPendingCompactionBytesLimit(nativeHandle_, hardPendingCompactionBytesLimit); return this; } @@ -779,7 +779,8 @@ public long hardPendingCompactionBytesLimit() { } @Override - public ColumnFamilyOptions setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger) { + public ColumnFamilyOptions setLevel0FileNumCompactionTrigger( + final int level0FileNumCompactionTrigger) { setLevel0FileNumCompactionTrigger(nativeHandle_, level0FileNumCompactionTrigger); return this; } @@ -790,7 +791,7 @@ public int level0FileNumCompactionTrigger() { } @Override - public ColumnFamilyOptions setLevel0SlowdownWritesTrigger(int level0SlowdownWritesTrigger) { + public ColumnFamilyOptions setLevel0SlowdownWritesTrigger(final int level0SlowdownWritesTrigger) { setLevel0SlowdownWritesTrigger(nativeHandle_, level0SlowdownWritesTrigger); return this; } @@ -801,7 +802,7 @@ public int level0SlowdownWritesTrigger() { } @Override - public ColumnFamilyOptions setLevel0StopWritesTrigger(int level0StopWritesTrigger) { + public ColumnFamilyOptions setLevel0StopWritesTrigger(final int level0StopWritesTrigger) { setLevel0StopWritesTrigger(nativeHandle_, level0StopWritesTrigger); return this; } @@ -812,7 +813,8 @@ public int level0StopWritesTrigger() { } @Override - public ColumnFamilyOptions setMaxBytesForLevelMultiplierAdditional(int[] maxBytesForLevelMultiplierAdditional) { + public ColumnFamilyOptions setMaxBytesForLevelMultiplierAdditional( + final int[] maxBytesForLevelMultiplierAdditional) { 
setMaxBytesForLevelMultiplierAdditional(nativeHandle_, maxBytesForLevelMultiplierAdditional); return this; } @@ -823,7 +825,7 @@ public int[] maxBytesForLevelMultiplierAdditional() { } @Override - public ColumnFamilyOptions setParanoidFileChecks(boolean paranoidFileChecks) { + public ColumnFamilyOptions setParanoidFileChecks(final boolean paranoidFileChecks) { setParanoidFileChecks(nativeHandle_, paranoidFileChecks); return this; } @@ -931,7 +933,8 @@ public boolean forceConsistencyChecks() { } @Override - public ColumnFamilyOptions setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) { + public ColumnFamilyOptions setSstPartitionerFactory( + final SstPartitionerFactory sstPartitionerFactory) { setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_); this.sstPartitionerFactory_ = sstPartitionerFactory; return this; @@ -956,6 +959,17 @@ public SstPartitionerFactory sstPartitionerFactory() { return sstPartitionerFactory_; } + @Override + public ColumnFamilyOptions setMemtableMaxRangeDeletions(final int count) { + setMemtableMaxRangeDeletions(nativeHandle_, count); + return this; + } + + @Override + public int memtableMaxRangeDeletions() { + return memtableMaxRangeDeletions(nativeHandle_); + } + // // BEGIN options for blobs (integrated BlobDB) // @@ -967,9 +981,9 @@ public SstPartitionerFactory sstPartitionerFactory() { * for reads. See also the options min_blob_size, blob_file_size, * blob_compression_type, enable_blob_garbage_collection, and * blob_garbage_collection_age_cutoff below. - * + *

* Default: false - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -990,14 +1004,15 @@ public ColumnFamilyOptions setEnableBlobFiles(final boolean enableBlobFiles) { * for reads. See also the options min_blob_size, blob_file_size, * blob_compression_type, enable_blob_garbage_collection, and * blob_garbage_collection_age_cutoff below. - * + *

* Default: false - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * * @return true iff blob files are currently enabled */ + @Override public boolean enableBlobFiles() { return enableBlobFiles(nativeHandle_); } @@ -1008,9 +1023,9 @@ public boolean enableBlobFiles() { * alongside the keys in SST files in the usual fashion. A value of zero for * this option means that all values are stored in blob files. Note that * enable_blob_files has to be set in order for this option to have any effect. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1029,9 +1044,9 @@ public ColumnFamilyOptions setMinBlobSize(final long minBlobSize) { * alongside the keys in SST files in the usual fashion. A value of zero for * this option means that all values are stored in blob files. Note that * enable_blob_files has to be set in order for this option to have any effect. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1046,9 +1061,9 @@ public long minBlobSize() { * Set the size limit for blob files. When writing blob files, a new file is opened * once this limit is reached. Note that enable_blob_files has to be set in * order for this option to have any effect. - * + *

* Default: 256 MB - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1066,9 +1081,9 @@ public ColumnFamilyOptions setBlobFileSize(final long blobFileSize) { * Get the size limit for blob files. When writing blob files, a new file is opened * once this limit is reached. Note that enable_blob_files has to be set in * order for this option to have any effect. - * + *

* Default: 256 MB - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1083,9 +1098,9 @@ public long blobFileSize() { * Set the compression algorithm to use for large values stored in blob files. Note * that enable_blob_files has to be set in order for this option to have any * effect. - * + *

* Default: no compression - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1103,9 +1118,9 @@ public ColumnFamilyOptions setBlobCompressionType(final CompressionType compress * Get the compression algorithm to use for large values stored in blob files. Note * that enable_blob_files has to be set in order for this option to have any * effect. - * + *

* Default: no compression - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1122,7 +1137,7 @@ public CompressionType blobCompressionType() { * relocated to new files as they are encountered during compaction, which makes * it possible to clean up blob files once they contain nothing but * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. - * + *

* Default: false * * @param enableBlobGarbageCollection true iff blob garbage collection is to be enabled @@ -1142,7 +1157,7 @@ public ColumnFamilyOptions setEnableBlobGarbageCollection( * relocated to new files as they are encountered during compaction, which makes * it possible to clean up blob files once they contain nothing but * obsolete/garbage blobs. See also blob_garbage_collection_age_cutoff below. - * + *

* Default: false * * @return true iff blob garbage collection is currently enabled @@ -1158,7 +1173,7 @@ public boolean enableBlobGarbageCollection() { * where N = garbage_collection_cutoff * number_of_blob_files. Note that * enable_blob_garbage_collection has to be set in order for this option to have * any effect. - * + *

* Default: 0.25 * * @param blobGarbageCollectionAgeCutoff the new blob garbage collection age cutoff @@ -1178,7 +1193,7 @@ public ColumnFamilyOptions setBlobGarbageCollectionAgeCutoff( * where N = garbage_collection_cutoff * number_of_blob_files. Note that * enable_blob_garbage_collection has to be set in order for this option to have * any effect. - * + *

* Default: 0.25 * * @return the current blob garbage collection age cutoff @@ -1194,12 +1209,12 @@ public double blobGarbageCollectionAgeCutoff() { * the blob files in question, assuming they are all eligible based on the * value of {@link #blobGarbageCollectionAgeCutoff} above. This option is * currently only supported with leveled compactions. - * + *

* Note that {@link #enableBlobGarbageCollection} has to be set in order for this * option to have any effect. - * + *

* Default: 1.0 - * + *
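Pulling the integrated BlobDB options above together, a minimal configuration sketch (the values are illustrative, not recommendations from this diff):

    final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()
        .setEnableBlobFiles(true)                     // required for the options below to matter
        .setMinBlobSize(4 * 1024)                     // values >= 4 KiB go to blob files
        .setBlobFileSize(256 * 1024 * 1024)           // roll blob files at 256 MB (the default)
        .setBlobCompressionType(CompressionType.LZ4_COMPRESSION)
        .setEnableBlobGarbageCollection(true)
        .setBlobGarbageCollectionAgeCutoff(0.25)      // GC the oldest quarter of blob files
        .setBlobGarbageCollectionForceThreshold(0.8); // force a rewrite at 80% garbage
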

* Dynamically changeable through the SetOptions() API * * @param blobGarbageCollectionForceThreshold new value for the threshold @@ -1223,9 +1238,9 @@ public double blobGarbageCollectionForceThreshold() { /** * Set compaction readahead for blob files. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1252,9 +1267,9 @@ public long blobCompactionReadaheadSize() { /** * Set a certain LSM tree level to enable blob files. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * @@ -1270,7 +1285,7 @@ public ColumnFamilyOptions setBlobFileStartingLevel(final int blobFileStartingLe /** * Get the starting LSM tree level to enable blob files. - * + *

* Default: 0 * * @return the current LSM tree level to enable blob files. @@ -1282,13 +1297,13 @@ public int blobFileStartingLevel() { /** * Set a certain prepopulate blob cache option. - * + *

* Default: 0 - * + *

* Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)}. * - * @param prepopulateBlobCache the prepopulate blob cache option + * @param prepopulateBlobCache prepopulate the blob cache option * * @return the reference to the current options. */ @@ -1301,7 +1316,7 @@ public ColumnFamilyOptions setPrepopulateBlobCache( /** * Get the prepopulate blob cache option. - * + *

* Default: 0 * * @return the current prepopulate blob cache option. @@ -1319,6 +1334,10 @@ private static native long getColumnFamilyOptionsFromProps( final long cfgHandle, String optString); private static native long getColumnFamilyOptionsFromProps(final String optString); + private static long newColumnFamilyOptionsInstance() { + RocksDB.loadLibrary(); + return newColumnFamilyOptions(); + } private static native long newColumnFamilyOptions(); private static native long copyColumnFamilyOptions(final long handle); private static native long newColumnFamilyOptionsFromOptions( @@ -1495,7 +1514,8 @@ private native void setForceConsistencyChecks(final long handle, private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); private static native void setCompactionThreadLimiter( final long nativeHandle_, final long compactionThreadLimiterHandle); - + private native void setMemtableMaxRangeDeletions(final long handle, final int count); + private native int memtableMaxRangeDeletions(final long handle); private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); private native boolean enableBlobFiles(final long nativeHandle_); private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java index 97357aacf90f..4776773bd8bd 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -121,9 +121,9 @@ T optimizeUniversalStyleCompaction( /** * Set {@link BuiltinComparator} to be used with RocksDB. - * + *

* Note: Comparator can be set once upon database creation. - * + *

* Default: BytewiseComparator. * @param builtinComparator a {@link BuiltinComparator} type. * @return the instance of the current object. @@ -133,11 +133,11 @@ T setComparator( /** * Use the specified comparator for key ordering. - * + *

* Comparator should not be disposed before options instances using this comparator is * disposed. If dispose() function is not called, then comparator object will be * GC'd automatically. - * + *

* Comparator instance can be re-used in multiple options instances. * * @param comparator java instance. @@ -176,17 +176,17 @@ T setComparator( * A single CompactionFilter instance to call into during compaction. * Allows an application to modify/delete a key-value during background * compaction. - * + *

* If the client requires a new compaction filter to be used for different * compaction runs, it can specify call * {@link #setCompactionFilterFactory(AbstractCompactionFilterFactory)} * instead. - * + *

* The client should specify only set one of the two. - * {@link #setCompactionFilter(AbstractCompactionFilter)} takes precedence + * {#setCompactionFilter(AbstractCompactionFilter)} takes precedence * over {@link #setCompactionFilterFactory(AbstractCompactionFilterFactory)} * if the client specifies both. - * + *

* If multithreaded compaction is being used, the supplied CompactionFilter * instance may be used from different threads concurrently and so should be thread-safe. * @@ -207,7 +207,7 @@ T setCompactionFilter( * This is a factory that provides {@link AbstractCompactionFilter} objects * which allow an application to modify/delete a key-value during background * compaction. - * + *

* A new filter will be created on each compaction run. If multithreaded * compaction is being used, each created CompactionFilter will only be used * from a single thread and so does not need to be thread-safe. @@ -228,7 +228,7 @@ T setCompactionFilterFactory( /** * This prefix-extractor uses the first n bytes of a key as its prefix. - * + *

* In some hash-based memtable representation such as HashLinkedList * and HashSkipList, prefixes are used to partition the keys into * several buckets. Prefix extractor is used to specify how to @@ -404,7 +404,7 @@ T setMaxTableFilesSizeFIFO( * families, it would have files and total size from all * the column families combined. User should provision for the * total size(from all the column families) in such cases. - * + *

* If left empty, db_paths will be used. * Default: empty * @@ -422,7 +422,7 @@ T setMaxTableFilesSizeFIFO( * Compression algorithm that will be used for the bottommost level that * contain files. If level-compaction is used, this option will only affect * levels after base level. - * + *

* Default: {@link CompressionType#DISABLE_COMPRESSION_OPTION} * * @param bottommostCompressionType The compression type to use for the @@ -437,7 +437,7 @@ T setBottommostCompressionType( * Compression algorithm that will be used for the bottommost level that * contain files. If level-compaction is used, this option will only affect * levels after base level. - * + *

* Default: {@link CompressionType#DISABLE_COMPRESSION_OPTION} * * @return The compression type used for the bottommost level @@ -447,7 +447,7 @@ T setBottommostCompressionType( /** * Set the options for compression algorithms used by * {@link #bottommostCompressionType()} if it is enabled. - * + *

* To enable it, please see the definition of * {@link CompressionOptions}. * @@ -460,7 +460,7 @@ T setBottommostCompressionOptions( /** * Get the bottom most compression options. - * + *

* See {@link #setBottommostCompressionOptions(CompressionOptions)}. * * @return the bottom most compression options. @@ -489,7 +489,7 @@ T setCompressionOptions( * partitioning of sst files. This helps compaction to split the files * on interesting boundaries (key prefixes) to make propagation of sst * files less write amplifying (covering the whole key space). - * + *

* Default: nullptr * * @param factory The factory reference @@ -506,6 +506,23 @@ T setCompressionOptions( @Experimental("Caution: this option is experimental") SstPartitionerFactory sstPartitionerFactory(); + /** + * Sets the maximum range delete calls, after which memtable is flushed. + * This applies to the mutable memtable. + * + * @param count a positive integer, 0 (default) to disable the feature. + * @return the reference of the current options. + */ + T setMemtableMaxRangeDeletions(final int count); + + /** + * Gets the current setting of maximum range deletes allowed + * 0(default) indicates that feature is disabled. + * + * @return current value of memtable_max_range_deletions + */ + int memtableMaxRangeDeletions(); + /** * Compaction concurrent thread limiter for the column family. * If non-nullptr, use given concurrent thread limiter to control diff --git a/java/src/main/java/org/rocksdb/CompactRangeOptions.java b/java/src/main/java/org/rocksdb/CompactRangeOptions.java index da023d3669fb..616a77572d41 100644 --- a/java/src/main/java/org/rocksdb/CompactRangeOptions.java +++ b/java/src/main/java/org/rocksdb/CompactRangeOptions.java @@ -5,15 +5,17 @@ package org.rocksdb; +import java.util.Objects; + /** * CompactRangeOptions is used by CompactRange() call. In the documentation of the methods "the compaction" refers to * any compaction that is using this CompactRangeOptions. */ public class CompactRangeOptions extends RocksObject { - - private final static byte VALUE_kSkip = 0; - private final static byte VALUE_kIfHaveCompactionFilter = 1; - private final static byte VALUE_kForce = 2; + private static final byte VALUE_kSkip = 0; + private static final byte VALUE_kIfHaveCompactionFilter = 1; + private static final byte VALUE_kForce = 2; + private static final byte VALUE_kForceOptimized = 3; // For level based compaction, we can configure if we want to skip/force bottommost level // compaction. The order of this enum MUST follow the C++ layer. See BottommostLevelCompaction in @@ -30,7 +32,12 @@ public enum BottommostLevelCompaction { /** * Always compact bottommost level */ - kForce(VALUE_kForce); + kForce(VALUE_kForce), + /** + * Always compact bottommost level but in bottommost level avoid + * double-compacting files created in the same compaction + */ + kForceOptimized(VALUE_kForceOptimized); private final byte value; @@ -57,11 +64,43 @@ public static BottommostLevelCompaction fromRocksId(final int bottommostLevelCom case VALUE_kSkip: return kSkip; case VALUE_kIfHaveCompactionFilter: return kIfHaveCompactionFilter; case VALUE_kForce: return kForce; + case VALUE_kForceOptimized: + return kForceOptimized; default: return null; } } } + public static class Timestamp { + public final long start; + public final long range; + + public Timestamp(final long start, final long duration) { + this.start = start; + this.range = duration; + } + + public Timestamp() { + this.start = 0; + this.range = 0; + } + + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Timestamp timestamp = (Timestamp) o; + return start == timestamp.start && range == timestamp.range; + } + + @Override + public int hashCode() { + return Objects.hash(start, range); + } + } + /** * Construct CompactRangeOptions. 
*/ @@ -211,7 +250,25 @@ public CompactRangeOptions setMaxSubcompactions(final int maxSubcompactions) { return this; } - private native static long newCompactRangeOptions(); + public CompactRangeOptions setFullHistoryTSLow(final Timestamp tsLow) { + setFullHistoryTSLow(nativeHandle_, tsLow.start, tsLow.range); + return this; + } + + public Timestamp fullHistoryTSLow() { + return fullHistoryTSLow(nativeHandle_); + } + + public CompactRangeOptions setCanceled(final boolean canceled) { + setCanceled(nativeHandle_, canceled); + return this; + } + + public boolean canceled() { + return canceled(nativeHandle_); + } + + private static native long newCompactRangeOptions(); @Override protected final native void disposeInternal(final long handle); private native boolean exclusiveManualCompaction(final long handle); @@ -235,4 +292,13 @@ private native void setAllowWriteStall(final long handle, private native void setMaxSubcompactions(final long handle, final int maxSubcompactions); private native int maxSubcompactions(final long handle); + + private native void setFullHistoryTSLow( + final long handle, final long timestampStart, final long timestampRange); + + private native Timestamp fullHistoryTSLow(final long handle); + + private native void setCanceled(final long handle, final boolean canceled); + + private native boolean canceled(final long handle); } diff --git a/java/src/main/java/org/rocksdb/CompactionJobInfo.java b/java/src/main/java/org/rocksdb/CompactionJobInfo.java index 4e3b8d68b827..cf04bde24930 100644 --- a/java/src/main/java/org/rocksdb/CompactionJobInfo.java +++ b/java/src/main/java/org/rocksdb/CompactionJobInfo.java @@ -98,7 +98,7 @@ public List outputFiles() { /** * Get the table properties for the input and output tables. - * + *
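For the CompactRangeOptions additions above (kForceOptimized, setCanceled and the fullHistoryTSLow timestamp pair), a small usage sketch; 'db' and 'cfHandle' are assumed to exist outside this diff, and compactRange may throw RocksDBException.

    try (final CompactRangeOptions cro = new CompactRangeOptions()
             .setBottommostLevelCompaction(
                 CompactRangeOptions.BottommostLevelCompaction.kForceOptimized)
             .setMaxSubcompactions(4)) {
      // A long-running manual compaction can be aborted from another thread
      // by flipping the new flag: cro.setCanceled(true);
      db.compactRange(cfHandle, null, null, cro); // null begin/end compacts the full key range
    }
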

* The map is keyed by values from {@link #inputFiles()} and * {@link #outputFiles()}. * diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java index 4c8d6545cb18..92b21fc50c30 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java +++ b/java/src/main/java/org/rocksdb/CompactionOptionsFIFO.java @@ -17,7 +17,7 @@ public CompactionOptionsFIFO() { /** * Once the total sum of table files reaches this, we will delete the oldest * table file - * + *

* Default: 1GB * * @param maxTableFilesSize The maximum size of the table files @@ -33,7 +33,7 @@ public CompactionOptionsFIFO setMaxTableFilesSize( /** * Once the total sum of table files reaches this, we will delete the oldest * table file - * + *

* Default: 1GB * * @return max table file size in bytes @@ -48,7 +48,7 @@ public long maxTableFilesSize() { * and compaction won't trigger if average compact bytes per del file is * larger than options.write_buffer_size. This is to protect large files * from being compacted again. - * + *

* Default: false * * @param allowCompaction true to allow intra-L0 compaction @@ -61,13 +61,12 @@ public CompactionOptionsFIFO setAllowCompaction( return this; } - /** * Check if intra-L0 compaction is enabled. * When enabled, we try to compact smaller files into larger ones. - * + *

* See {@link #setAllowCompaction(boolean)}. - * + *

* Default: false * * @return true if intra-L0 compaction is enabled, false otherwise. @@ -76,8 +75,7 @@ public boolean allowCompaction() { return allowCompaction(nativeHandle_); } - - private native static long newCompactionOptionsFIFO(); + private static native long newCompactionOptionsFIFO(); @Override protected final native void disposeInternal(final long handle); private native void setMaxTableFilesSize(final long handle, diff --git a/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java b/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java index d2dfa4eef1ae..4d2ebdb1f562 100644 --- a/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java +++ b/java/src/main/java/org/rocksdb/CompactionOptionsUniversal.java @@ -18,7 +18,7 @@ public CompactionOptionsUniversal() { * Percentage flexibility while comparing file size. If the candidate file(s) * size is 1% smaller than the next file's size, then include next file into * this candidate set. - * + *

* Default: 1 * * @param sizeRatio The size ratio to use @@ -34,7 +34,7 @@ public CompactionOptionsUniversal setSizeRatio(final int sizeRatio) { * Percentage flexibility while comparing file size. If the candidate file(s) * size is 1% smaller than the next file's size, then include next file into * this candidate set. - * + *

* Default: 1 * * @return The size ratio in use @@ -45,7 +45,7 @@ public int sizeRatio() { /** * The minimum number of files in a single compaction run. - * + *

* Default: 2 * * @param minMergeWidth minimum number of files in a single compaction run @@ -59,7 +59,7 @@ public CompactionOptionsUniversal setMinMergeWidth(final int minMergeWidth) { /** * The minimum number of files in a single compaction run. - * + *

* Default: 2 * * @return minimum number of files in a single compaction run @@ -70,7 +70,7 @@ public int minMergeWidth() { /** * The maximum number of files in a single compaction run. - * + *

* Default: {@link Long#MAX_VALUE} * * @param maxMergeWidth maximum number of files in a single compaction run @@ -84,7 +84,7 @@ public CompactionOptionsUniversal setMaxMergeWidth(final int maxMergeWidth) { /** * The maximum number of files in a single compaction run. - * + *

* Default: {@link Long#MAX_VALUE} * * @return maximum number of files in a single compaction run @@ -102,7 +102,7 @@ public int maxMergeWidth() { * a size amplification of 0%. Rocksdb uses the following heuristic * to calculate size amplification: it assumes that all files excluding * the earliest file contribute to the size amplification. - * + *

* Default: 200, which means that a 100 byte database could require upto * 300 bytes of storage. * @@ -126,7 +126,7 @@ public CompactionOptionsUniversal setMaxSizeAmplificationPercent( * a size amplification of 0%. Rocksdb uses the following heuristic * to calculate size amplification: it assumes that all files excluding * the earliest file contribute to the size amplification. - * + *

* Default: 200, which means that a 100 byte database could require upto * 300 bytes of storage. * @@ -140,11 +140,11 @@ public int maxSizeAmplificationPercent() { /** * If this option is set to be -1 (the default value), all the output files * will follow compression type specified. - * + *

* If this option is not negative, we will try to make sure compressed * size is just above this value. In normal cases, at least this percentage * of data will be compressed. - * + *

* When we are compacting to a new file, here is the criteria whether * it needs to be compressed: assuming here are the list of files sorted * by generation time: @@ -154,7 +154,7 @@ public int maxSizeAmplificationPercent() { * well as the total size of C1...Ct as total_C, the compaction output file * will be compressed iff * total_C / total_size < this percentage - * + *

* Default: -1 * * @param compressionSizePercent percentage of size for compression @@ -170,11 +170,11 @@ public CompactionOptionsUniversal setCompressionSizePercent( /** * If this option is set to be -1 (the default value), all the output files * will follow compression type specified. - * + *

* If this option is not negative, we will try to make sure compressed * size is just above this value. In normal cases, at least this percentage * of data will be compressed. - * + *

* When we are compacting to a new file, here is the criteria whether * it needs to be compressed: assuming here are the list of files sorted * by generation time: @@ -184,7 +184,7 @@ public CompactionOptionsUniversal setCompressionSizePercent( * well as the total size of C1...Ct as total_C, the compaction output file * will be compressed iff * total_C / total_size < this percentage - * + *
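To make the size-amplification and compression heuristics above concrete, a hedged sketch (values are illustrative): with maxSizeAmplificationPercent = 200, a logical 100-byte database may occupy up to 300 bytes on disk, and with compressionSizePercent = 50 roughly the older half of the data ends up compressed while the newest files stay uncompressed. 'columnFamilyOptions' is assumed to exist elsewhere.

    final CompactionOptionsUniversal universalOptions = new CompactionOptionsUniversal()
        .setSizeRatio(1)                     // include the next file if within 1% of candidate size
        .setMaxSizeAmplificationPercent(200) // tolerate up to 3x space for 1x data
        .setCompressionSizePercent(50);      // aim to keep at least ~50% of data compressed
    columnFamilyOptions
        .setCompactionStyle(CompactionStyle.UNIVERSAL)
        .setCompactionOptionsUniversal(universalOptions);
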

* Default: -1 * * @return percentage of size for compression @@ -195,7 +195,7 @@ public int compressionSizePercent() { /** * The algorithm used to stop picking files into a single compaction run - * + *

* Default: {@link CompactionStopStyle#CompactionStopStyleTotalSize} * * @param compactionStopStyle The compaction algorithm @@ -210,7 +210,7 @@ public CompactionOptionsUniversal setStopStyle( /** * The algorithm used to stop picking files into a single compaction run - * + *

* Default: {@link CompactionStopStyle#CompactionStopStyleTotalSize} * * @return The compaction algorithm @@ -222,7 +222,7 @@ public CompactionStopStyle stopStyle() { /** * Option to optimize the universal multi level compaction by enabling * trivial move for non overlapping files. - * + *

* Default: false * * @param allowTrivialMove true if trivial move is allowed @@ -238,7 +238,7 @@ public CompactionOptionsUniversal setAllowTrivialMove( /** * Option to optimize the universal multi level compaction by enabling * trivial move for non overlapping files. - * + *

* Default: false * * @return true if trivial move is allowed @@ -247,7 +247,7 @@ public boolean allowTrivialMove() { return allowTrivialMove(nativeHandle_); } - private native static long newCompactionOptionsUniversal(); + private static native long newCompactionOptionsUniversal(); @Override protected final native void disposeInternal(final long handle); private native void setSizeRatio(final long handle, final int sizeRatio); diff --git a/java/src/main/java/org/rocksdb/CompactionStyle.java b/java/src/main/java/org/rocksdb/CompactionStyle.java index b24bbf850958..7b955a7a248c 100644 --- a/java/src/main/java/org/rocksdb/CompactionStyle.java +++ b/java/src/main/java/org/rocksdb/CompactionStyle.java @@ -5,11 +5,9 @@ package org.rocksdb; -import java.util.List; - /** * Enum CompactionStyle - * + *
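As a usage sketch (outside this patch), the universal compaction knobs above are normally applied through Options before opening the database. The path and tuning values below are placeholders, not recommendations:

  import org.rocksdb.*;

  public final class UniversalCompactionExample {
    public static void main(final String[] args) throws RocksDBException {
      RocksDB.loadLibrary();
      try (final CompactionOptionsUniversal universal = new CompactionOptionsUniversal()
               .setSizeRatio(1)                     // default
               .setMinMergeWidth(2)                 // default
               .setMaxSizeAmplificationPercent(200) // default: up to ~3x space amplification
               .setStopStyle(CompactionStopStyle.CompactionStopStyleTotalSize)
               .setAllowTrivialMove(true);          // trivial moves for non-overlapping files
           final Options options = new Options()
               .setCreateIfMissing(true)
               .setCompactionStyle(CompactionStyle.UNIVERSAL)
               .setCompactionOptionsUniversal(universal);
           final RocksDB db = RocksDB.open(options, "/tmp/universal-example")) {
        db.put("key".getBytes(), "value".getBytes());
      }
    }
  }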

* RocksDB supports different styles of compaction. Available * compaction styles can be chosen using this enumeration. * @@ -25,7 +23,8 @@ * the old data, so it's basically a TTL compaction style. *

  • NONE - Disable background compaction. * Compaction jobs are submitted - * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, CompactionJobInfo)} ()}.
  • + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, java.util.List, int, int, + * CompactionJobInfo)} ()}. * * * @see * Note that dispose() must be called before a ComparatorOptions * instance becomes out-of-scope to release the allocated memory in C++. */ @@ -48,10 +48,10 @@ public ComparatorOptions setReusedSynchronisationType( } /** - * Indicates if a direct byte buffer (i.e. outside of the normal + * Indicates if a direct byte buffer (i.e. outside the normal * garbage-collected heap) is used, as opposed to a non-direct byte buffer * which is a wrapper around an on-heap byte[]. - * + *

    * Default: true * * @return true if a direct byte buffer will be used, false otherwise @@ -62,10 +62,10 @@ public boolean useDirectBuffer() { } /** - * Controls whether a direct byte buffer (i.e. outside of the normal + * Controls whether a direct byte buffer (i.e. outside the normal * garbage-collected heap) is used, as opposed to a non-direct byte buffer * which is a wrapper around an on-heap byte[]. - * + *

    * Default: true * * @param useDirectBuffer true if a direct byte buffer should be used, @@ -86,7 +86,7 @@ public ComparatorOptions setUseDirectBuffer(final boolean useDirectBuffer) { * if it requires less than {@code maxReuseBufferSize}, then an * existing buffer will be reused, else a new buffer will be * allocated just for that callback. - * + *

    * Default: 64 bytes * * @return the maximum size of a buffer which is reused, @@ -105,7 +105,7 @@ public int maxReusedBufferSize() { * if it requires less than {@code maxReuseBufferSize}, then an * existing buffer will be reused, else a new buffer will be * allocated just for that callback. - * + *

    * Default: 64 bytes * * @param maxReusedBufferSize the maximum size for a buffer to reuse, or 0 to @@ -119,7 +119,7 @@ public ComparatorOptions setMaxReusedBufferSize(final int maxReusedBufferSize) { return this; } - private native static long newComparatorOptions(); + private static native long newComparatorOptions(); private native byte reusedSynchronisationType(final long handle); private native void setReusedSynchronisationType(final long handle, final byte reusedSynchronisationType); diff --git a/java/src/main/java/org/rocksdb/CompressionOptions.java b/java/src/main/java/org/rocksdb/CompressionOptions.java index a9072bbb97f8..2e1ee57310b1 100644 --- a/java/src/main/java/org/rocksdb/CompressionOptions.java +++ b/java/src/main/java/org/rocksdb/CompressionOptions.java @@ -48,9 +48,9 @@ public int strategy() { * loaded into the compression library before compressing/uncompressing each * data block of subsequent files in the subcompaction. Effectively, this * improves compression ratios when there are repetitions across data blocks. - * + *
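A brief sketch of how the ComparatorOptions above are consumed, assuming import org.rocksdb.*; and the bundled org.rocksdb.util.ReverseBytewiseComparator; the buffer size is illustrative:

  try (final ComparatorOptions comparatorOptions = new ComparatorOptions()
           .setUseDirectBuffer(true)      // compare via direct ByteBuffers (the default)
           .setMaxReusedBufferSize(64)) { // reuse a 64-byte scratch buffer per callback
    final AbstractComparator comparator =
        new org.rocksdb.util.ReverseBytewiseComparator(comparatorOptions);
    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setComparator(comparator)) {
      // RocksDB.open(options, path) ...
    }
  }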

    * A value of 0 indicates the feature is disabled. - * + *

    * Default: 0. * * @param maxDictBytes Maximum bytes to use for the dictionary @@ -75,10 +75,10 @@ public int maxDictBytes() { * Maximum size of training data passed to zstd's dictionary trainer. Using * zstd's dictionary trainer can achieve even better compression ratio * improvements than using {@link #setMaxDictBytes(int)} alone. - * + *

    * The training data will be used to generate a dictionary * of {@link #maxDictBytes()}. - * + *

    * Default: 0. * * @param zstdMaxTrainBytes Maximum bytes to use for training ZStd. @@ -104,10 +104,10 @@ public int zstdMaxTrainBytes() { * For bottommost_compression_opts, to enable it, user must set enabled=true. * Otherwise, bottommost compression will use compression_opts as default * compression options. - * + *

    * For compression_opts, if compression_opts.enabled=false, it is still * used as compression options for compression process. - * + *

    * Default: false. * * @param enabled true to use these compression options @@ -131,8 +131,7 @@ public boolean enabled() { return enabled(nativeHandle_); } - - private native static long newCompressionOptions(); + private static native long newCompressionOptions(); @Override protected final native void disposeInternal(final long handle); private native void setWindowBits(final long handle, final int windowBits); diff --git a/java/src/main/java/org/rocksdb/CompressionType.java b/java/src/main/java/org/rocksdb/CompressionType.java index d1d73d51aaef..d1ecf0ac84c5 100644 --- a/java/src/main/java/org/rocksdb/CompressionType.java +++ b/java/src/main/java/org/rocksdb/CompressionType.java @@ -35,9 +35,9 @@ public enum CompressionType { * * @return CompressionType instance. */ - public static CompressionType getCompressionType(String libraryName) { + public static CompressionType getCompressionType(final String libraryName) { if (libraryName != null) { - for (CompressionType compressionType : CompressionType.values()) { + for (final CompressionType compressionType : CompressionType.values()) { if (compressionType.getLibraryName() != null && compressionType.getLibraryName().equals(libraryName)) { return compressionType; @@ -58,7 +58,7 @@ public static CompressionType getCompressionType(String libraryName) { * @throws IllegalArgumentException If CompressionType cannot be found for the * provided byteIdentifier */ - public static CompressionType getCompressionType(byte byteIdentifier) { + public static CompressionType getCompressionType(final byte byteIdentifier) { for (final CompressionType compressionType : CompressionType.values()) { if (compressionType.getValue() == byteIdentifier) { return compressionType; diff --git a/java/src/main/java/org/rocksdb/ConfigOptions.java b/java/src/main/java/org/rocksdb/ConfigOptions.java index 4d93f0c9929d..b3b5423c876d 100644 --- a/java/src/main/java/org/rocksdb/ConfigOptions.java +++ b/java/src/main/java/org/rocksdb/ConfigOptions.java @@ -7,15 +7,11 @@ package org.rocksdb; public class ConfigOptions extends RocksObject { - static { - RocksDB.loadLibrary(); - } - /** * Construct with default Options */ public ConfigOptions() { - super(newConfigOptions()); + super(newConfigOptionsInstance()); } public ConfigOptions setDelimiter(final String delimiter) { @@ -44,10 +40,14 @@ public ConfigOptions setSanityLevel(final SanityLevel level) { @Override protected final native void disposeInternal(final long handle); - private native static long newConfigOptions(); - private native static void setEnv(final long handle, final long envHandle); - private native static void setDelimiter(final long handle, final String delimiter); - private native static void setIgnoreUnknownOptions(final long handle, final boolean ignore); - private native static void setInputStringsEscaped(final long handle, final boolean escaped); - private native static void setSanityLevel(final long handle, final byte level); + private static long newConfigOptionsInstance() { + RocksDB.loadLibrary(); + return newConfigOptions(); + } + private static native long newConfigOptions(); + private static native void setEnv(final long handle, final long envHandle); + private static native void setDelimiter(final long handle, final String delimiter); + private static native void setIgnoreUnknownOptions(final long handle, final boolean ignore); + private static native void setInputStringsEscaped(final long handle, final boolean escaped); + private static native void setSanityLevel(final long handle, final byte 
level); } diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index 9eb5ca8738ee..de10c058501c 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -11,38 +11,33 @@ /** * DBOptions to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). - * + *
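For context, the dictionary-compression settings above are usually attached to the bottommost level. A minimal sketch; the byte counts are placeholders and ZSTD availability depends on how the native library was built:

  try (final CompressionOptions bottommostOpts = new CompressionOptions()
           .setMaxDictBytes(16 * 1024)            // 16 KiB dictionary per subcompaction
           .setZStdMaxTrainBytes(100 * 16 * 1024) // training data handed to the zstd trainer
           .setEnabled(true);                     // required for bottommost_compression_opts
       final Options options = new Options()
           .setCreateIfMissing(true)
           .setBottommostCompressionType(CompressionType.ZSTD_COMPRESSION)
           .setBottommostCompressionOptions(bottommostOpts)) {
    // RocksDB.open(options, path) ...
  }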

    * As a descendent of {@link AbstractNativeReference}, this class is {@link AutoCloseable} * and will be automatically released if opened in the preamble of a try with resources block. */ public class DBOptions extends RocksObject - implements DBOptionsInterface, - MutableDBOptionsInterface { - static { - RocksDB.loadLibrary(); - } - + implements DBOptionsInterface, MutableDBOptionsInterface { /** * Construct DBOptions. - * + *

    * This constructor will create (by allocating a block of memory) * an {@code rocksdb::DBOptions} in the c++ side. */ public DBOptions() { - super(newDBOptions()); + super(newDBOptionsInstance()); numShardBits_ = DEFAULT_NUM_SHARD_BITS; env_ = Env.getDefault(); } /** * Copy constructor for DBOptions. - * + *

    * NOTE: This does a shallow copy, which means env, rate_limiter, sst_file_manager, * info_log and other pointers will be cloned! * * @param other The DBOptions to copy. */ - public DBOptions(DBOptions other) { + public DBOptions(final DBOptions other) { super(copyDBOptions(other.nativeHandle_)); this.env_ = other.env_; this.numShardBits_ = other.numShardBits_; @@ -752,6 +747,7 @@ public long dbWriteBufferSize() { } @Override + @Deprecated public DBOptions setAccessHintOnCompactionStart(final AccessHint accessHint) { assert(isOwningHandle()); setAccessHintOnCompactionStart(nativeHandle_, accessHint.getValue()); @@ -759,6 +755,7 @@ public DBOptions setAccessHintOnCompactionStart(final AccessHint accessHint) { } @Override + @Deprecated public AccessHint accessHintOnCompactionStart() { assert(isOwningHandle()); return AccessHint.getAccessHint(accessHintOnCompactionStart(nativeHandle_)); @@ -1251,7 +1248,12 @@ private DBOptions(final long nativeHandle) { private static native long getDBOptionsFromProps(long cfgHandle, String optString); private static native long getDBOptionsFromProps(String optString); + private static long newDBOptionsInstance() { + RocksDB.loadLibrary(); + return newDBOptions(); + } private static native long newDBOptions(); + private static native long copyDBOptions(final long handle); private static native long newDBOptionsFromOptions(final long optionsHandle); @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index ef1b86bffa85..084a399cd03b 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -615,21 +615,24 @@ public interface DBOptionsInterface> { int tableCacheNumshardbits(); /** - * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs - * will be deleted. - *

      - *
- * <ol>
- * <li>If both set to 0, logs will be deleted asap and will not get into
- * the archive.</li>
- * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- * WAL files will be checked every 10 min and if total size is greater
- * then WAL_size_limit_MB, they will be deleted starting with the
- * earliest until size_limit is met. All empty files will be deleted.</li>
- * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- * WAL files will be checked every WAL_ttl_seconds / 2 and those that
- * are older than WAL_ttl_seconds will be deleted.</li>
- * <li>If both are not 0, WAL files will be checked every 10 min and both
- * checks will be performed with ttl being first.</li>
- * </ol>
    + * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect when WALs + * will be archived and deleted. + * + * When both are zero, obsolete WALs will not be archived and will be deleted + * immediately. Otherwise, obsolete WALs will be archived prior to deletion. + * + * When `WAL_size_limit_MB` is nonzero, archived WALs starting with the + * earliest will be deleted until the total size of the archive falls below + * this limit. All empty WALs will be deleted. + * + * When `WAL_ttl_seconds` is nonzero, archived WALs older than + * `WAL_ttl_seconds` will be deleted. + * + * When only `WAL_ttl_seconds` is nonzero, the frequency at which archived + * WALs are deleted is every `WAL_ttl_seconds / 2` seconds. When only + * `WAL_size_limit_MB` is nonzero, the deletion frequency is every ten + * minutes. When both are nonzero, the deletion frequency is the minimum of + * those two values. * * @param walTtlSeconds the ttl seconds * @return the instance of the current object. @@ -638,21 +641,24 @@ public interface DBOptionsInterface> { T setWalTtlSeconds(long walTtlSeconds); /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - *
      - *
- * <ol>
- * <li>If both set to 0, logs will be deleted asap and will not get into
- * the archive.</li>
- * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- * WAL files will be checked every 10 min and if total size is greater
- * then WAL_size_limit_MB, they will be deleted starting with the
- * earliest until size_limit is met. All empty files will be deleted.</li>
- * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- * WAL files will be checked every WAL_ttl_seconds / 2 and those that
- * are older than WAL_ttl_seconds will be deleted.</li>
- * <li>If both are not 0, WAL files will be checked every 10 min and both
- * checks will be performed with ttl being first.</li>
- * </ol>
    + * WalTtlSeconds() and walSizeLimitMB() affect when WALs will be archived and + * deleted. + * + * When both are zero, obsolete WALs will not be archived and will be deleted + * immediately. Otherwise, obsolete WALs will be archived prior to deletion. + * + * When `WAL_size_limit_MB` is nonzero, archived WALs starting with the + * earliest will be deleted until the total size of the archive falls below + * this limit. All empty WALs will be deleted. + * + * When `WAL_ttl_seconds` is nonzero, archived WALs older than + * `WAL_ttl_seconds` will be deleted. + * + * When only `WAL_ttl_seconds` is nonzero, the frequency at which archived + * WALs are deleted is every `WAL_ttl_seconds / 2` seconds. When only + * `WAL_size_limit_MB` is nonzero, the deletion frequency is every ten + * minutes. When both are nonzero, the deletion frequency is the minimum of + * those two values. * * @return the wal-ttl seconds * @see #walSizeLimitMB() @@ -662,19 +668,22 @@ public interface DBOptionsInterface> { /** * WalTtlSeconds() and walSizeLimitMB() affect how archived logs * will be deleted. - *
      - *
- * <ol>
- * <li>If both set to 0, logs will be deleted asap and will not get into
- * the archive.</li>
- * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- * WAL files will be checked every 10 min and if total size is greater
- * then WAL_size_limit_MB, they will be deleted starting with the
- * earliest until size_limit is met. All empty files will be deleted.</li>
- * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- * WAL files will be checked every WAL_ttl_secondsi / 2 and those that
- * are older than WAL_ttl_seconds will be deleted.</li>
- * <li>If both are not 0, WAL files will be checked every 10 min and both
- * checks will be performed with ttl being first.</li>
- * </ol>
    + * + * When both are zero, obsolete WALs will not be archived and will be deleted + * immediately. Otherwise, obsolete WALs will be archived prior to deletion. + * + * When `WAL_size_limit_MB` is nonzero, archived WALs starting with the + * earliest will be deleted until the total size of the archive falls below + * this limit. All empty WALs will be deleted. + * + * When `WAL_ttl_seconds` is nonzero, archived WALs older than + * `WAL_ttl_seconds` will be deleted. + * + * When only `WAL_ttl_seconds` is nonzero, the frequency at which archived + * WALs are deleted is every `WAL_ttl_seconds / 2` seconds. When only + * `WAL_size_limit_MB` is nonzero, the deletion frequency is every ten + * minutes. When both are nonzero, the deletion frequency is the minimum of + * those two values. * * @param sizeLimitMB size limit in mega-bytes. * @return the instance of the current object. @@ -683,21 +692,25 @@ public interface DBOptionsInterface> { T setWalSizeLimitMB(long sizeLimitMB); /** - * {@link #walTtlSeconds()} and {@code #walSizeLimitMB()} affect how archived logs - * will be deleted. - *
      - *
- * <ol>
- * <li>If both set to 0, logs will be deleted asap and will not get into
- * the archive.</li>
- * <li>If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0,
- * WAL files will be checked every 10 min and if total size is greater
- * then WAL_size_limit_MB, they will be deleted starting with the
- * earliest until size_limit is met. All empty files will be deleted.</li>
- * <li>If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then
- * WAL files will be checked every WAL_ttl_seconds i / 2 and those that
- * are older than WAL_ttl_seconds will be deleted.</li>
- * <li>If both are not 0, WAL files will be checked every 10 min and both
- * checks will be performed with ttl being first.</li>
- * </ol>
    + * WalTtlSeconds() and walSizeLimitMB() affect when WALs will be archived and + * deleted. + * + * When both are zero, obsolete WALs will not be archived and will be deleted + * immediately. Otherwise, obsolete WALs will be archived prior to deletion. + * + * When `WAL_size_limit_MB` is nonzero, archived WALs starting with the + * earliest will be deleted until the total size of the archive falls below + * this limit. All empty WALs will be deleted. + * + * When `WAL_ttl_seconds` is nonzero, archived WALs older than + * `WAL_ttl_seconds` will be deleted. + * + * When only `WAL_ttl_seconds` is nonzero, the frequency at which archived + * WALs are deleted is every `WAL_ttl_seconds / 2` seconds. When only + * `WAL_size_limit_MB` is nonzero, the deletion frequency is every ten + * minutes. When both are nonzero, the deletion frequency is the minimum of + * those two values. + * * @return size limit in mega-bytes. * @see #walSizeLimitMB() */ @@ -935,7 +948,7 @@ public interface DBOptionsInterface> { * * @return the reference to the current options. */ - T setAccessHintOnCompactionStart(final AccessHint accessHint); + @Deprecated T setAccessHintOnCompactionStart(final AccessHint accessHint); /** * Specify the file access pattern once a compaction is started. @@ -945,7 +958,7 @@ public interface DBOptionsInterface> { * * @return The access hint */ - AccessHint accessHintOnCompactionStart(); + @Deprecated AccessHint accessHintOnCompactionStart(); /** * This is a maximum buffer size that is used by WinMmapReadableFile in diff --git a/java/src/main/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java index 02fa3511fc04..5aa0866ffe29 100644 --- a/java/src/main/java/org/rocksdb/DirectSlice.java +++ b/java/src/main/java/org/rocksdb/DirectSlice.java @@ -10,13 +10,13 @@ /** * Base class for slices which will receive direct * ByteBuffer based access to the underlying data. - * + *
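The retention rules spelled out in the rewritten javadoc translate to configuration along these lines (values are placeholders):

  try (final DBOptions dbOptions = new DBOptions()
           .setCreateIfMissing(true)
           .setWalTtlSeconds(60 * 60) // archived WALs older than one hour become deletable
           .setWalSizeLimitMB(512)) { // and the archive is trimmed once it exceeds ~512 MB
    // pass dbOptions plus column family descriptors to RocksDB.open(...)
  }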

    * ByteBuffer backed slices typically perform better with * larger keys and values. When using smaller keys and * values consider using @see org.rocksdb.Slice */ public class DirectSlice extends AbstractSlice { - public final static DirectSlice NONE = new DirectSlice(); + public static final DirectSlice NONE = new DirectSlice(); /** * Indicates whether we have to free the memory pointed to by the Slice @@ -29,7 +29,7 @@ public class DirectSlice extends AbstractSlice { * Called from JNI to construct a new Java DirectSlice * without an underlying C++ object set * at creation time. - * + *

    * Note: You should be aware that it is intentionally marked as * package-private. This is so that developers cannot construct their own * default DirectSlice objects (at present). As developers cannot construct @@ -123,9 +123,8 @@ protected void disposeInternal() { disposeInternal(nativeHandle); } - private native static long createNewDirectSlice0(final ByteBuffer data, - final int length); - private native static long createNewDirectSlice1(final ByteBuffer data); + private static native long createNewDirectSlice0(final ByteBuffer data, final int length); + private static native long createNewDirectSlice1(final ByteBuffer data); @Override protected final native ByteBuffer data0(long handle); private native byte get0(long handle, int offset); private native void clear0(long handle, boolean internalBuffer, diff --git a/java/src/main/java/org/rocksdb/EncodingType.java b/java/src/main/java/org/rocksdb/EncodingType.java index 5ceeb54c8263..e93ffcc23ca9 100644 --- a/java/src/main/java/org/rocksdb/EncodingType.java +++ b/java/src/main/java/org/rocksdb/EncodingType.java @@ -47,7 +47,7 @@ public byte getValue() { return value_; } - private EncodingType(byte value) { + EncodingType(final byte value) { value_ = value; } diff --git a/java/src/main/java/org/rocksdb/Env.java b/java/src/main/java/org/rocksdb/Env.java index 07b5319bb8cf..6783d8158113 100644 --- a/java/src/main/java/org/rocksdb/Env.java +++ b/java/src/main/java/org/rocksdb/Env.java @@ -7,25 +7,13 @@ import java.util.Arrays; import java.util.List; +import java.util.concurrent.atomic.AtomicReference; /** * Base class for all Env implementations in RocksDB. */ public abstract class Env extends RocksObject { - - static { - RocksDB.loadLibrary(); - } - - private static final Env DEFAULT_ENV = new RocksEnv(getDefaultEnvInternal()); - static { - /** - * The Ownership of the Default Env belongs to C++ - * and so we disown the native handle here so that - * we cannot accidentally free it from Java. - */ - DEFAULT_ENV.disOwnNativeHandle(); - } + private static final AtomicReference SINGULAR_DEFAULT_ENV = new AtomicReference<>(null); /** *

    Returns the default environment suitable for the current operating @@ -38,8 +26,32 @@ public abstract class Env extends RocksObject { * * @return the default {@link org.rocksdb.RocksEnv} instance. */ + @SuppressWarnings({"PMD.CloseResource", "PMD.AssignmentInOperand"}) public static Env getDefault() { - return DEFAULT_ENV; + RocksEnv defaultEnv; + RocksEnv newDefaultEnv = null; + + while ((defaultEnv = SINGULAR_DEFAULT_ENV.get()) == null) { + // construct the RocksEnv only once in this thread + if (newDefaultEnv == null) { + // load the library just in-case it isn't already loaded! + RocksDB.loadLibrary(); + + newDefaultEnv = new RocksEnv(getDefaultEnvInternal()); + + /* + * The Ownership of the Default Env belongs to C++ + * and so we disown the native handle here so that + * we cannot accidentally free it from Java. + */ + newDefaultEnv.disOwnNativeHandle(); + } + + // use CAS to gracefully handle thread pre-emption + SINGULAR_DEFAULT_ENV.compareAndSet(null, newDefaultEnv); + } + + return defaultEnv; } /** diff --git a/java/src/main/java/org/rocksdb/EnvOptions.java b/java/src/main/java/org/rocksdb/EnvOptions.java index 6baddb310245..fd56bc49e523 100644 --- a/java/src/main/java/org/rocksdb/EnvOptions.java +++ b/java/src/main/java/org/rocksdb/EnvOptions.java @@ -9,15 +9,11 @@ * Options while opening a file to read/write */ public class EnvOptions extends RocksObject { - static { - RocksDB.loadLibrary(); - } - /** * Construct with default Options */ public EnvOptions() { - super(newEnvOptions()); + super(newEnvOptionsInstance()); } /** @@ -31,7 +27,7 @@ public EnvOptions(final DBOptions dbOptions) { /** * Enable/Disable memory mapped reads. - * + *
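Callers keep using the lazily created singleton exactly as before; a short sketch with illustrative thread counts:

  final Env env = Env.getDefault();           // safe from any thread, loads the library on first use
  env.setBackgroundThreads(4, Priority.LOW);  // compaction pool
  env.setBackgroundThreads(2, Priority.HIGH); // flush pool

  try (final Options options = new Options()
           .setCreateIfMissing(true)
           .setEnv(env)) {
    // RocksDB.open(options, path) ...
  }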

    * Default: false * * @param useMmapReads true to enable memory mapped reads, false to disable. @@ -55,7 +51,7 @@ public boolean useMmapReads() { /** * Enable/Disable memory mapped Writes. - * + *

    * Default: true * * @param useMmapWrites true to enable memory mapped writes, false to disable. @@ -79,7 +75,7 @@ public boolean useMmapWrites() { /** * Enable/Disable direct reads, i.e. {@code O_DIRECT}. - * + *

    * Default: false * * @param useDirectReads true to enable direct reads, false to disable. @@ -103,7 +99,7 @@ public boolean useDirectReads() { /** * Enable/Disable direct writes, i.e. {@code O_DIRECT}. - * + *

    * Default: false * * @param useDirectWrites true to enable direct writes, false to disable. @@ -127,9 +123,9 @@ public boolean useDirectWrites() { /** * Enable/Disable fallocate calls. - * + *

    * Default: true - * + *

    * If false, {@code fallocate()} calls are bypassed. * * @param allowFallocate true to enable fallocate calls, false to disable. @@ -153,7 +149,7 @@ public boolean allowFallocate() { /** * Enable/Disable the {@code FD_CLOEXEC} bit when opening file descriptors. - * + *

    * Default: true * * @param setFdCloexec true to enable the {@code FB_CLOEXEC} bit, @@ -181,7 +177,7 @@ public boolean setFdCloexec() { * Allows OS to incrementally sync files to disk while they are being * written, in the background. Issue one request for every * {@code bytesPerSync} written. - * + *

    * Default: 0 * * @param bytesPerSync 0 to disable, otherwise the number of bytes. @@ -323,8 +319,12 @@ public RateLimiter rateLimiter() { return rateLimiter; } - private native static long newEnvOptions(); - private native static long newEnvOptions(final long dboptions_handle); + private static long newEnvOptionsInstance() { + RocksDB.loadLibrary(); + return newEnvOptions(); + } + private static native long newEnvOptions(); + private static native long newEnvOptions(final long dboptions_handle); @Override protected final native void disposeInternal(final long handle); private native void setUseMmapReads(final long handle, diff --git a/java/src/main/java/org/rocksdb/EventListener.java b/java/src/main/java/org/rocksdb/EventListener.java index a12ab92ba1f3..a2632580631c 100644 --- a/java/src/main/java/org/rocksdb/EventListener.java +++ b/java/src/main/java/org/rocksdb/EventListener.java @@ -5,33 +5,31 @@ package org.rocksdb; -import java.util.List; - /** * EventListener class contains a set of callback functions that will * be called when specific RocksDB event happens such as flush. It can * be used as a building block for developing custom features such as * stats-collector or external compaction algorithm. - * + *
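In the Java API these EnvOptions are mainly consumed by SstFileWriter and SstFileReader. A hedged sketch with a placeholder output path, inside a method that declares throws RocksDBException:

  try (final EnvOptions envOptions = new EnvOptions()
           .setUseDirectWrites(true)   // O_DIRECT for the generated file
           .setBytesPerSync(1 << 20);  // ask for a background sync every 1 MiB
       final Options options = new Options();
       final SstFileWriter writer = new SstFileWriter(envOptions, options)) {
    writer.open("/tmp/example.sst");
    writer.put("key1".getBytes(), "value1".getBytes());
    writer.finish();
  }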

    * Note that callback functions should not run for an extended period of * time before the function returns, otherwise RocksDB may be blocked. * For example, it is not suggested to do - * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, java.util.List, int, int, * CompactionJobInfo)} (as it may run for a long while) or issue many of * {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} * (as Put may be blocked in certain cases) in the same thread in the * EventListener callback. - * + *

    * However, doing - * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, List, int, int, + * {@link RocksDB#compactFiles(CompactionOptions, ColumnFamilyHandle, java.util.List, int, int, * CompactionJobInfo)} and {@link RocksDB#put(ColumnFamilyHandle, WriteOptions, byte[], byte[])} in * another thread is considered safe. - * + *

    * [Threading] All EventListener callback will be called using the * actual thread that involves in that specific event. For example, it * is the RocksDB background flush thread that does the actual flush to * call {@link #onFlushCompleted(RocksDB, FlushJobInfo)}. - * + *

    * [Locking] All EventListener callbacks are designed to be called without * the current thread holding any DB mutex. This is to prevent potential * deadlock and performance issue when using EventListener callback @@ -41,7 +39,7 @@ public interface EventListener { /** * A callback function to RocksDB which will be called before a * RocksDB starts to flush memtables. - * + *

    * Note that the this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. @@ -55,7 +53,7 @@ public interface EventListener { /** * callback function to RocksDB which will be called whenever a * registered RocksDB flushes a file. - * + *

    * Note that the this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. @@ -77,7 +75,7 @@ public interface EventListener { * on file creations and deletions is suggested to implement * {@link #onFlushCompleted(RocksDB, FlushJobInfo)} and * {@link #onCompactionCompleted(RocksDB, CompactionJobInfo)}. - * + *

    * Note that if applications would like to use the passed reference * outside this function call, they should make copies from the * returned value. @@ -91,7 +89,7 @@ public interface EventListener { * A callback function to RocksDB which will be called before a * RocksDB starts to compact. The default implementation is * no-op. - * + *

    * Note that the this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. @@ -108,7 +106,7 @@ public interface EventListener { * A callback function for RocksDB which will be called whenever * a registered RocksDB compacts a file. The default implementation * is a no-op. - * + *

    * Note that this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. @@ -129,11 +127,11 @@ public interface EventListener { * of a pointer to DB. Applications that build logic basic based * on file creations and deletions is suggested to implement * OnFlushCompleted and OnCompactionCompleted. - * + *

    * Historically it will only be called if the file is successfully created. * Now it will also be called on failure case. User can check info.status * to see if it succeeded or not. - * + *

    * Note that if applications would like to use the passed reference * outside this function call, they should make copies from these * returned value. @@ -147,7 +145,7 @@ public interface EventListener { * A callback function for RocksDB which will be called before * a SST file is being created. It will follow by OnTableFileCreated after * the creation finishes. - * + *

    * Note that if applications would like to use the passed reference * outside this function call, they should make copies from these * returned value. @@ -160,11 +158,11 @@ public interface EventListener { /** * A callback function for RocksDB which will be called before * a memtable is made immutable. - * + *

    * Note that the this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. - * + *

    * Note that if applications would like to use the passed reference * outside this function call, they should make copies from these * returned value. @@ -177,7 +175,7 @@ public interface EventListener { /** * A callback function for RocksDB which will be called before * a column family handle is deleted. - * + *

    * Note that the this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. @@ -190,7 +188,7 @@ public interface EventListener { /** * A callback function for RocksDB which will be called after an external * file is ingested using IngestExternalFile. - * + *

    * Note that the this function will run on the same thread as * IngestExternalFile(), if this function is blocked, IngestExternalFile() * will be blocked from finishing. @@ -210,7 +208,7 @@ void onExternalFileIngested( * preventing the database from entering read-only mode. We do not provide any * guarantee when failed flushes/compactions will be rescheduled if the user * suppresses an error. - * + *

    * Note that this function can run on the same threads as flush, compaction, * and user writes. So, it is extremely important not to perform heavy * computations or blocking calls in this function. @@ -224,7 +222,7 @@ void onBackgroundError( /** * A callback function for RocksDB which will be called whenever a change * of superversion triggers a change of the stall conditions. - * + *

    * Note that the this function must be implemented in a way such that * it should not run for an extended period of time before the function * returns. Otherwise, RocksDB may be blocked. @@ -301,7 +299,7 @@ void onBackgroundError( * If true, the {@link #onFileReadFinish(FileOperationInfo)} * and {@link #onFileWriteFinish(FileOperationInfo)} will be called. If * false, then they won't be called. - * + *

    * Default: false * * @return whether to callback when file read/write is finished diff --git a/java/src/main/java/org/rocksdb/ExportImportFilesMetaData.java b/java/src/main/java/org/rocksdb/ExportImportFilesMetaData.java new file mode 100644 index 000000000000..1589f631c480 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ExportImportFilesMetaData.java @@ -0,0 +1,18 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * The metadata that describes a column family. + */ +public class ExportImportFilesMetaData extends RocksObject { + ExportImportFilesMetaData(final long nativeHandle) { + super(nativeHandle); + } + + @Override protected native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java b/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java index 6b14a80240ba..7a99dd6bfe2f 100644 --- a/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java +++ b/java/src/main/java/org/rocksdb/ExternalFileIngestionInfo.java @@ -74,12 +74,12 @@ public TableProperties getTableProperties() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o; + final ExternalFileIngestionInfo that = (ExternalFileIngestionInfo) o; return globalSeqno == that.globalSeqno && Objects.equals(columnFamilyName, that.columnFamilyName) && Objects.equals(externalFilePath, that.externalFilePath) diff --git a/java/src/main/java/org/rocksdb/FileOperationInfo.java b/java/src/main/java/org/rocksdb/FileOperationInfo.java index aa5743ed377b..fae9cd5de3bb 100644 --- a/java/src/main/java/org/rocksdb/FileOperationInfo.java +++ b/java/src/main/java/org/rocksdb/FileOperationInfo.java @@ -87,7 +87,7 @@ public Status getStatus() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) diff --git a/java/src/main/java/org/rocksdb/FilterPolicyType.java b/java/src/main/java/org/rocksdb/FilterPolicyType.java new file mode 100644 index 000000000000..6a693ee4039d --- /dev/null +++ b/java/src/main/java/org/rocksdb/FilterPolicyType.java @@ -0,0 +1,49 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * IndexType used in conjunction with BlockBasedTable. + */ +public enum FilterPolicyType { + kUnknownFilterPolicy((byte) 0), + + /** + * This is a user-facing policy that automatically choose between + * LegacyBloom and FastLocalBloom based on context at build time, + * including compatibility with format_version. + */ + kBloomFilterPolicy((byte) 1), + + /** + * This is a user-facing policy that chooses between Standard128Ribbon + * and FastLocalBloom based on context at build time (LSM level and other + * factors in extreme cases). 
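A minimal listener built on the EventListener callbacks above could look like the sketch below; it extends AbstractEventListener, the concrete helper behind this interface, and keeps the callback deliberately cheap:

  final AbstractEventListener listener = new AbstractEventListener() {
    @Override
    public void onFlushCompleted(final RocksDB db, final FlushJobInfo flushJobInfo) {
      // runs on the background flush thread, so stay short and non-blocking
      System.out.println("flushed " + flushJobInfo.getFilePath());
    }
  };

  try (final Options options = new Options()
           .setCreateIfMissing(true)
           .setListeners(java.util.Collections.singletonList(listener))) {
    // RocksDB.open(options, path) ...
  }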
+ */ + kRibbonFilterPolicy((byte) 2); + + public Filter createFilter(final long handle, final double param) { + if (this == kBloomFilterPolicy) { + return new BloomFilter(handle, param); + } + return null; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + FilterPolicyType(byte value) { + value_ = value; + } + + private final byte value_; +} diff --git a/java/src/main/java/org/rocksdb/FlushJobInfo.java b/java/src/main/java/org/rocksdb/FlushJobInfo.java index ca9aa05236b8..414d3a2f332e 100644 --- a/java/src/main/java/org/rocksdb/FlushJobInfo.java +++ b/java/src/main/java/org/rocksdb/FlushJobInfo.java @@ -90,7 +90,7 @@ public int getJobId() { * Determine if rocksdb is currently slowing-down all writes to prevent * creating too many Level 0 files as compaction seems not able to * catch up the write request speed. - * + *
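FilterPolicyType itself is plumbing; from Java a filter policy is still attached via BlockBasedTableConfig, for example (the bits-per-key value is illustrative):

  try (final Filter bloomFilter = new BloomFilter(10 /* bits per key */);
       final Options options = new Options()
           .setCreateIfMissing(true)
           .setTableFormatConfig(new BlockBasedTableConfig()
               .setFilterPolicy(bloomFilter))) {
    // RocksDB.open(options, path) ...
  }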

    * This indicates that there are too many files in Level 0. * * @return true if rocksdb is currently slowing-down all writes, @@ -103,7 +103,7 @@ public boolean isTriggeredWritesSlowdown() { /** * Determine if rocksdb is currently blocking any writes to prevent * creating more L0 files. - * + *

    * This indicates that there are too many files in level 0. * Compactions should try to compact L0 files down to lower levels as soon * as possible. @@ -151,12 +151,12 @@ public FlushReason getFlushReason() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - FlushJobInfo that = (FlushJobInfo) o; + final FlushJobInfo that = (FlushJobInfo) o; return columnFamilyId == that.columnFamilyId && threadId == that.threadId && jobId == that.jobId && triggeredWritesSlowdown == that.triggeredWritesSlowdown && triggeredWritesStop == that.triggeredWritesStop && smallestSeqno == that.smallestSeqno diff --git a/java/src/main/java/org/rocksdb/FlushOptions.java b/java/src/main/java/org/rocksdb/FlushOptions.java index 760b515fdff4..be8c4bc9460f 100644 --- a/java/src/main/java/org/rocksdb/FlushOptions.java +++ b/java/src/main/java/org/rocksdb/FlushOptions.java @@ -10,15 +10,11 @@ * {@link org.rocksdb.RocksDB}. */ public class FlushOptions extends RocksObject { - static { - RocksDB.loadLibrary(); - } - /** * Construct a new instance of FlushOptions. */ public FlushOptions(){ - super(newFlushOptions()); + super(newFlushOptionsInance()); } /** @@ -47,13 +43,13 @@ public boolean waitForFlush() { } /** - * Set to true so that flush would proceeds immediately even it it means + * Set to true so that flush would proceed immediately even if it means * writes will stall for the duration of the flush. - * + *

    * Set to false so that the operation will wait until it's possible to do * the flush without causing stall or until required flush is performed by * someone else (foreground call or background thread). - * + *

    * Default: false * * @param allowWriteStall true to allow writes to stall for flush, false @@ -77,8 +73,11 @@ public boolean allowWriteStall() { assert(isOwningHandle()); return allowWriteStall(nativeHandle_); } - - private native static long newFlushOptions(); + private static long newFlushOptionsInance() { + RocksDB.loadLibrary(); + return newFlushOptions(); + } + private static native long newFlushOptions(); @Override protected final native void disposeInternal(final long handle); private native void setWaitForFlush(final long handle, diff --git a/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java index 05cc2bb909fd..a9868df57d7b 100644 --- a/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -6,7 +6,7 @@ * Such memtable contains a fix-sized array of buckets, where * each bucket points to a sorted singly-linked * list (or null if the bucket is empty). - * + *
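The flush call these options feed into, as a fragment that assumes an open RocksDB handle named db and an enclosing method that throws RocksDBException:

  try (final FlushOptions flushOptions = new FlushOptions()
           .setWaitForFlush(true)        // block until the memtable flush completes
           .setAllowWriteStall(false)) { // but do not force a write stall to get there
    db.flush(flushOptions);
  }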

    * Note that since this mem-table representation relies on the * key prefix, it is required to invoke one of the usePrefixExtractor * functions to specify how to extract key prefix given a key. @@ -15,7 +15,7 @@ * and post a warning in the LOG. */ public class HashLinkedListMemTableConfig extends MemTableConfig { - public static final long DEFAULT_BUCKET_COUNT = 50000; + public static final long DEFAULT_BUCKET_COUNT = 50_000; public static final long DEFAULT_HUGE_PAGE_TLB_SIZE = 0; public static final int DEFAULT_BUCKET_ENTRIES_LOG_THRES = 4096; public static final boolean diff --git a/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java index efc78b14e626..80d6b7115182 100644 --- a/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -6,7 +6,7 @@ * Such mem-table representation contains a fix-sized array of * buckets, where each bucket points to a skiplist (or null if the * bucket is empty). - * + *

    * Note that since this mem-table representation relies on the * key prefix, it is required to invoke one of the usePrefixExtractor * functions to specify how to extract key prefix given a key. @@ -15,7 +15,7 @@ * and post a warning in the LOG. */ public class HashSkipListMemTableConfig extends MemTableConfig { - public static final int DEFAULT_BUCKET_COUNT = 1000000; + public static final int DEFAULT_BUCKET_COUNT = 1_000_000; public static final int DEFAULT_BRANCHING_FACTOR = 4; public static final int DEFAULT_HEIGHT = 4; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index d5f7da5e03c8..41fe241ad3ab 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -36,16 +36,6 @@ public enum HistogramType { WRITE_RAW_BLOCK_MICROS((byte) 0xC), - STALL_L0_SLOWDOWN_COUNT((byte) 0xD), - - STALL_MEMTABLE_COMPACTION_COUNT((byte) 0xE), - - STALL_L0_NUM_FILES_COUNT((byte) 0xF), - - HARD_RATE_LIMIT_DELAY_COUNT((byte) 0x10), - - SOFT_RATE_LIMIT_DELAY_COUNT((byte) 0x11), - NUM_FILES_IN_SINGLE_COMPACTION((byte) 0x12), DB_SEEK((byte) 0x13), @@ -73,7 +63,7 @@ public enum HistogramType { /** * number of bytes decompressed. - * + *
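Both hash-based memtables need a prefix extractor, so a typical (illustrative) configuration is:

  try (final Options options = new Options()
           .setCreateIfMissing(true)
           .useFixedLengthPrefixExtractor(8) // mandatory for the hash-based memtables
           .setMemTableConfig(new HashSkipListMemTableConfig()
               .setBucketCount(1_000_000)    // DEFAULT_BUCKET_COUNT
               .setHeight(4)
               .setBranchingFactor(4))) {
    // RocksDB.open(options, path) ...
  }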

    * number of bytes is when uncompressed; i.e. before/after respectively */ BYTES_DECOMPRESSED((byte) 0x1B), @@ -144,11 +134,6 @@ public enum HistogramType { */ BLOB_DB_BLOB_FILE_SYNC_MICROS((byte) 0x2B), - /** - * BlobDB garbage collection time. - */ - BLOB_DB_GC_MICROS((byte) 0x2C), - /** * BlobDB compression time. */ @@ -165,11 +150,6 @@ public enum HistogramType { */ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL((byte) 0x2F), - /** - * Num of Data blocks read from file system per level in MultiGet request. - */ - NUM_DATA_BLOCKS_READ_PER_LEVEL((byte) 0x30), - /** * Num of SST files read from file system per level in MultiGet request. */ @@ -182,6 +162,29 @@ public enum HistogramType { ASYNC_READ_BYTES((byte) 0x33), + /** + * Number of bytes read for RocksDB's prefetching contents + * (as opposed to file system's prefetch) + * from the end of SST table during block based table open + */ + TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39), + + FILE_READ_FLUSH_MICROS((byte) 0x3A), + + FILE_READ_COMPACTION_MICROS((byte) 0x3B), + + FILE_READ_DB_OPEN_MICROS((byte) 0x3C), + + FILE_READ_GET_MICROS((byte) 0x3D), + + FILE_READ_MULTIGET_MICROS((byte) 0x3E), + + FILE_READ_DB_ITERATOR_MICROS((byte) 0x3F), + + FILE_READ_VERIFY_DB_CHECKSUM_MICROS((byte) 0x40), + + FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS((byte) 0x41), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/HyperClockCache.java b/java/src/main/java/org/rocksdb/HyperClockCache.java new file mode 100644 index 000000000000..f8fe42be750e --- /dev/null +++ b/java/src/main/java/org/rocksdb/HyperClockCache.java @@ -0,0 +1,60 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * HyperClockCache - A lock-free Cache alternative for RocksDB block cache + * that offers much improved CPU efficiency vs. LRUCache under high parallel + * load or high contention, with some caveats: + *

      + *
+ * <ul>
+ * <li>
+ * Not a general Cache implementation: can only be used for
+ * BlockBasedTableOptions::block_cache, which RocksDB uses in a way that is
+ * compatible with HyperClockCache.
+ * </li>
+ * <li>
+ * Requires an extra tuning parameter: see estimated_entry_charge below.
+ * Similarly, substantially changing the capacity with SetCapacity could
+ * harm efficiency. -> EXPERIMENTAL: the tuning parameter can be set to 0
+ * to find the appropriate balance automatically.
+ * </li>
+ * <li>
+ * Cache priorities are less aggressively enforced, which could cause
+ * cache dilution from long range scans (unless they use fill_cache=false).
+ * </li>
+ * <li>
+ * Can be worse for small caches, because if almost all of a cache shard is
+ * pinned (more likely with non-partitioned filters), then CLOCK eviction
+ * becomes very CPU intensive.
+ * </li>
+ * </ul>
    + */ +@Experimental("HyperClockCache is still experimental and this API may change in future.") +public class HyperClockCache extends Cache { + /** + * + * @param capacity The fixed size capacity of the cache + * @param estimatedEntryCharge EXPERIMENTAL: the field can be set to 0 to size the table + * dynamically and automatically. See C++ Api for more info. + * @param numShardBits The cache is sharded to 2^numShardBits shards, by hash of the key + * @param strictCapacityLimit insert to the cache will fail when cache is full + */ + public HyperClockCache(final long capacity, final long estimatedEntryCharge, int numShardBits, + boolean strictCapacityLimit) { + super(newHyperClockCache(capacity, estimatedEntryCharge, numShardBits, strictCapacityLimit)); + } + + @Override + protected void disposeInternal(long handle) { + disposeInternalJni(handle); + } + + private static native void disposeInternalJni(long handle); + + private static native long newHyperClockCache(final long capacity, + final long estimatedEntryCharge, int numShardBits, boolean strictCapacityLimit); +} diff --git a/java/src/main/java/org/rocksdb/ImportColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ImportColumnFamilyOptions.java new file mode 100644 index 000000000000..652bd19dc8c1 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ImportColumnFamilyOptions.java @@ -0,0 +1,44 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +/** + * ImportColumnFamilyOptions is used by + * {@link RocksDB#createColumnFamilyWithImport(ColumnFamilyDescriptor, ImportColumnFamilyOptions, + * ExportImportFilesMetaData)}. + */ +public class ImportColumnFamilyOptions extends RocksObject { + public ImportColumnFamilyOptions() { + super(newImportColumnFamilyOptions()); + } + + /** + * Can be set to true to move the files instead of copying them. + * + * @return true if files will be moved + */ + public boolean moveFiles() { + return moveFiles(nativeHandle_); + } + + /** + * Can be set to true to move the files instead of copying them. + * + * @param moveFiles true if files should be moved instead of copied + * + * @return the reference to the current IngestExternalFileOptions. 
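Plugging the new cache in as a block cache looks roughly like this; the sizes are placeholders and, per the caveats above, it is only intended for BlockBasedTableOptions::block_cache:

  try (final Cache blockCache = new HyperClockCache(
           1L << 30, /* capacity: 1 GiB            */
           0,        /* estimatedEntryCharge: auto */
           -1,       /* numShardBits: auto         */
           false);   /* strictCapacityLimit        */
       final Options options = new Options()
           .setCreateIfMissing(true)
           .setTableFormatConfig(new BlockBasedTableConfig()
               .setBlockCache(blockCache))) {
    // RocksDB.open(options, path) ...
  }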
+ */ + public ImportColumnFamilyOptions setMoveFiles(final boolean moveFiles) { + setMoveFiles(nativeHandle_, moveFiles); + return this; + } + + private static native long newImportColumnFamilyOptions(); + private native boolean moveFiles(final long handle); + private native void setMoveFiles(final long handle, final boolean move_files); + @Override protected final native void disposeInternal(final long handle); +} diff --git a/java/src/main/java/org/rocksdb/IndexType.java b/java/src/main/java/org/rocksdb/IndexType.java index 162edad1bb4f..5615e929b300 100644 --- a/java/src/main/java/org/rocksdb/IndexType.java +++ b/java/src/main/java/org/rocksdb/IndexType.java @@ -47,7 +47,7 @@ public byte getValue() { return value_; } - IndexType(byte value) { + IndexType(final byte value) { value_ = value; } diff --git a/java/src/main/java/org/rocksdb/InfoLogLevel.java b/java/src/main/java/org/rocksdb/InfoLogLevel.java index b7c0f070028d..197bd89dab68 100644 --- a/java/src/main/java/org/rocksdb/InfoLogLevel.java +++ b/java/src/main/java/org/rocksdb/InfoLogLevel.java @@ -15,7 +15,7 @@ public enum InfoLogLevel { private final byte value_; - private InfoLogLevel(final byte value) { + InfoLogLevel(final byte value) { value_ = value; } diff --git a/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java b/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java index a6a308daa3c0..1a6a5fccd945 100644 --- a/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java +++ b/java/src/main/java/org/rocksdb/IngestExternalFileOptions.java @@ -136,15 +136,15 @@ public boolean ingestBehind() { /** * Set to true if you would like duplicate keys in the file being ingested * to be skipped rather than overwriting existing data under that key. - * + *
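A hedged sketch of how the new options class is meant to be used; the column family name is a placeholder, db is an open handle, and metadata stands for an ExportImportFilesMetaData obtained from an earlier export:

  try (final ImportColumnFamilyOptions importOptions = new ImportColumnFamilyOptions()
           .setMoveFiles(false)) { // copy rather than move the exported SST files
    final ColumnFamilyHandle imported = db.createColumnFamilyWithImport(
        new ColumnFamilyDescriptor("imported_cf".getBytes()), importOptions, metadata);
    // use 'imported' like any other handle and close it before the database
  }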

    * Usecase: back-fill of some historical data in the database without * over-writing existing newer version of data. - * + *

    * This option could only be used if the DB has been running * with DBOptions#allowIngestBehind() == true since the dawn of time. - * + *

    * All files will be ingested at the bottommost level with seqno=0. - * + *

    * Default: false * * @param ingestBehind true if you would like duplicate keys in the file being @@ -160,7 +160,7 @@ public IngestExternalFileOptions setIngestBehind(final boolean ingestBehind) { /** * Returns true write if the global_seqno is written to a given offset * in the external SST file for backward compatibility. - * + *

    * See {@link #setWriteGlobalSeqno(boolean)}. * * @return true if the global_seqno is written to a given offset, @@ -173,21 +173,21 @@ public boolean writeGlobalSeqno() { /** * Set to true if you would like to write the global_seqno to a given offset * in the external SST file for backward compatibility. - * + *

    * Older versions of RocksDB write the global_seqno to a given offset within * the ingested SST files, and new versions of RocksDB do not. - * + *

    * If you ingest an external SST using new version of RocksDB and would like * to be able to downgrade to an older version of RocksDB, you should set * {@link #writeGlobalSeqno()} to true. - * + *

    * If your service is just starting to use the new RocksDB, we recommend that * you set this option to false, which brings two benefits: * 1. No extra random write for global_seqno during ingestion. * 2. Without writing external SST file, it's possible to do checksum. - * + *

    * We have a plan to set this option to false by default in the future. - * + *

    * Default: true * * @param writeGlobalSeqno true to write the gloal_seqno to a given offset, @@ -201,10 +201,10 @@ public IngestExternalFileOptions setWriteGlobalSeqno( return this; } - private native static long newIngestExternalFileOptions(); - private native static long newIngestExternalFileOptions( - final boolean moveFiles, final boolean snapshotConsistency, - final boolean allowGlobalSeqNo, final boolean allowBlockingFlush); + private static native long newIngestExternalFileOptions(); + private static native long newIngestExternalFileOptions(final boolean moveFiles, + final boolean snapshotConsistency, final boolean allowGlobalSeqNo, + final boolean allowBlockingFlush); @Override protected final native void disposeInternal(final long handle); private native boolean moveFiles(final long handle); diff --git a/java/src/main/java/org/rocksdb/KeyMayExist.java b/java/src/main/java/org/rocksdb/KeyMayExist.java index 36185d8c9ab9..6149b85292aa 100644 --- a/java/src/main/java/org/rocksdb/KeyMayExist.java +++ b/java/src/main/java/org/rocksdb/KeyMayExist.java @@ -24,7 +24,6 @@ public int hashCode() { } public enum KeyMayExistEnum { kNotExist, kExistsWithoutValue, kExistsWithValue } - ; public KeyMayExist(final KeyMayExistEnum exists, final int valueLength) { this.exists = exists; diff --git a/java/src/main/java/org/rocksdb/LRUCache.java b/java/src/main/java/org/rocksdb/LRUCache.java index db90b17c5b21..0a9d02e878ee 100644 --- a/java/src/main/java/org/rocksdb/LRUCache.java +++ b/java/src/main/java/org/rocksdb/LRUCache.java @@ -99,7 +99,7 @@ public LRUCache(final long capacity, final int numShardBits, final boolean stric capacity, numShardBits, strictCapacityLimit, highPriPoolRatio, lowPriPoolRatio)); } - private native static long newLRUCache(final long capacity, final int numShardBits, + private static native long newLRUCache(final long capacity, final int numShardBits, final boolean strictCapacityLimit, final double highPriPoolRatio, final double lowPriPoolRatio); @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/LevelMetaData.java b/java/src/main/java/org/rocksdb/LevelMetaData.java index c5685098be12..424bcb026688 100644 --- a/java/src/main/java/org/rocksdb/LevelMetaData.java +++ b/java/src/main/java/org/rocksdb/LevelMetaData.java @@ -11,6 +11,7 @@ /** * The metadata that describes a level. */ +@SuppressWarnings("PMD.MissingStaticMethodInNonInstantiatableClass") public class LevelMetaData { private final int level; private final long size; diff --git a/java/src/main/java/org/rocksdb/LiveFileMetaData.java b/java/src/main/java/org/rocksdb/LiveFileMetaData.java index 35d883e180da..cb0f1a30225b 100644 --- a/java/src/main/java/org/rocksdb/LiveFileMetaData.java +++ b/java/src/main/java/org/rocksdb/LiveFileMetaData.java @@ -8,6 +8,7 @@ /** * The full set of metadata associated with each SST file. 
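Ingestion with these options then looks roughly like this (placeholder file path, db is an open handle):

  try (final IngestExternalFileOptions ingestOptions = new IngestExternalFileOptions()
           .setMoveFiles(true)            // link/move instead of copying the SST
           .setWriteGlobalSeqno(false)) { // skip the backward-compatibility seqno rewrite
    db.ingestExternalFile(java.util.Arrays.asList("/tmp/example.sst"), ingestOptions);
  }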
*/ +@SuppressWarnings("PMD.MissingStaticMethodInNonInstantiatableClass") public class LiveFileMetaData extends SstFileMetaData { private final byte[] columnFamilyName; private final int level; @@ -40,6 +41,7 @@ private LiveFileMetaData( * * @return the name of the column family */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public byte[] columnFamilyName() { return columnFamilyName; } @@ -52,4 +54,18 @@ public byte[] columnFamilyName() { public int level() { return level; } + + public long newLiveFileMetaDataHandle() { + return newLiveFileMetaDataHandle(columnFamilyName(), columnFamilyName().length, level(), + fileName(), path(), size(), smallestSeqno(), largestSeqno(), smallestKey(), + smallestKey().length, largestKey(), largestKey().length, numReadsSampled(), + beingCompacted(), numEntries(), numDeletions()); + } + + private native long newLiveFileMetaDataHandle(final byte[] columnFamilyName, + final int columnFamilyNameLength, final int level, final String fileName, final String path, + final long size, final long smallestSeqno, final long largestSeqno, final byte[] smallestKey, + final int smallestKeyLength, final byte[] largestKey, final int largestKeyLength, + final long numReadsSampled, final boolean beingCompacted, final long numEntries, + final long numDeletions); } diff --git a/java/src/main/java/org/rocksdb/LogFile.java b/java/src/main/java/org/rocksdb/LogFile.java index ef24a6427c18..5ee2c9fcc64a 100644 --- a/java/src/main/java/org/rocksdb/LogFile.java +++ b/java/src/main/java/org/rocksdb/LogFile.java @@ -5,6 +5,7 @@ package org.rocksdb; +@SuppressWarnings("PMD.MissingStaticMethodInNonInstantiatableClass") public class LogFile { private final String pathName; private final long logNumber; diff --git a/java/src/main/java/org/rocksdb/Logger.java b/java/src/main/java/org/rocksdb/Logger.java index 00a5d5674568..614a7fa502f1 100644 --- a/java/src/main/java/org/rocksdb/Logger.java +++ b/java/src/main/java/org/rocksdb/Logger.java @@ -36,9 +36,8 @@ *

    */ public abstract class Logger extends RocksCallbackObject { - - private final static long WITH_OPTIONS = 0; - private final static long WITH_DBOPTIONS = 1; + private static final long WITH_OPTIONS = 0; + private static final long WITH_DBOPTIONS = 1; /** *

    Logger constructor.

    @@ -68,7 +67,7 @@ public Logger(final DBOptions dboptions) { } @Override - protected long initializeNative(long... nativeParameterHandles) { + protected long initializeNative(final long... nativeParameterHandles) { if(nativeParameterHandles[1] == WITH_OPTIONS) { return createNewLoggerOptions(nativeParameterHandles[0]); } else if(nativeParameterHandles[1] == WITH_DBOPTIONS) { diff --git a/java/src/main/java/org/rocksdb/MemTableConfig.java b/java/src/main/java/org/rocksdb/MemTableConfig.java index 83cee974a757..17033d251735 100644 --- a/java/src/main/java/org/rocksdb/MemTableConfig.java +++ b/java/src/main/java/org/rocksdb/MemTableConfig.java @@ -8,7 +8,7 @@ * MemTableConfig is used to config the internal mem-table of a RocksDB. * It is required for each memtable to have one such sub-class to allow * Java developers to use it. - * + *
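The Logger changes above only reorder modifiers and finalize a parameter; as a sketch, a concrete Logger wired into Options might look like the following (the console formatting is an assumption, not part of this change):

    try (final Options options = new Options();
         final Logger logger = new Logger(options) {
           @Override
           protected void log(final InfoLogLevel infoLogLevel, final String logMsg) {
             System.out.println(infoLogLevel + ": " + logMsg);  // illustrative sink only
           }
         }) {
      options.setLogger(logger);
    }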

    * To make a RocksDB to use a specific MemTable format, its associated * MemTableConfig should be properly set and passed into Options * via Options.setMemTableFactory() and open the db using that Options. @@ -25,5 +25,5 @@ public abstract class MemTableConfig { * * @return native handle address to native memory table instance. */ - abstract protected long newMemTableFactoryHandle(); + protected abstract long newMemTableFactoryHandle(); } diff --git a/java/src/main/java/org/rocksdb/MemTableInfo.java b/java/src/main/java/org/rocksdb/MemTableInfo.java index f4fb577c3a93..3d429035a343 100644 --- a/java/src/main/java/org/rocksdb/MemTableInfo.java +++ b/java/src/main/java/org/rocksdb/MemTableInfo.java @@ -77,12 +77,12 @@ public long getNumDeletes() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - MemTableInfo that = (MemTableInfo) o; + final MemTableInfo that = (MemTableInfo) o; return firstSeqno == that.firstSeqno && earliestSeqno == that.earliestSeqno && numEntries == that.numEntries && numDeletes == that.numDeletes && Objects.equals(columnFamilyName, that.columnFamilyName); diff --git a/java/src/main/java/org/rocksdb/MemoryUsageType.java b/java/src/main/java/org/rocksdb/MemoryUsageType.java index 6010ce7af5a9..40e6d1716476 100644 --- a/java/src/main/java/org/rocksdb/MemoryUsageType.java +++ b/java/src/main/java/org/rocksdb/MemoryUsageType.java @@ -64,7 +64,7 @@ public static MemoryUsageType getMemoryUsageType(final byte byteIdentifier) { "Illegal value provided for MemoryUsageType."); } - MemoryUsageType(byte value) { + MemoryUsageType(final byte value) { value_ = value; } diff --git a/java/src/main/java/org/rocksdb/MemoryUtil.java b/java/src/main/java/org/rocksdb/MemoryUtil.java index 52b2175e6b19..dac6d9b84ef6 100644 --- a/java/src/main/java/org/rocksdb/MemoryUtil.java +++ b/java/src/main/java/org/rocksdb/MemoryUtil.java @@ -27,13 +27,15 @@ public class MemoryUtil { * @param caches Set of caches to collect memory usage for. * @return Map from {@link MemoryUsageType} to memory usage as a {@link Long}. */ - public static Map getApproximateMemoryUsageByType(final List dbs, final Set caches) { - int dbCount = (dbs == null) ? 0 : dbs.size(); - int cacheCount = (caches == null) ? 0 : caches.size(); - long[] dbHandles = new long[dbCount]; - long[] cacheHandles = new long[cacheCount]; + @SuppressWarnings("PMD.CloseResource") + public static Map getApproximateMemoryUsageByType( + final List dbs, final Set caches) { + final int dbCount = (dbs == null) ? 0 : dbs.size(); + final int cacheCount = (caches == null) ? 0 : caches.size(); + final long[] dbHandles = new long[dbCount]; + final long[] cacheHandles = new long[cacheCount]; if (dbCount > 0) { - ListIterator dbIter = dbs.listIterator(); + final ListIterator dbIter = dbs.listIterator(); while (dbIter.hasNext()) { dbHandles[dbIter.nextIndex()] = dbIter.next().nativeHandle_; } @@ -42,19 +44,19 @@ public static Map getApproximateMemoryUsageByType(final L // NOTE: This index handling is super ugly but I couldn't get a clean way to track both the // index and the iterator simultaneously within a Set. 
int i = 0; - for (Cache cache : caches) { + for (final Cache cache : caches) { cacheHandles[i] = cache.nativeHandle_; i++; } } - Map byteOutput = getApproximateMemoryUsageByType(dbHandles, cacheHandles); - Map output = new HashMap<>(); - for(Map.Entry longEntry : byteOutput.entrySet()) { + final Map byteOutput = getApproximateMemoryUsageByType(dbHandles, cacheHandles); + final Map output = new HashMap<>(); + for (final Map.Entry longEntry : byteOutput.entrySet()) { output.put(MemoryUsageType.getMemoryUsageType(longEntry.getKey()), longEntry.getValue()); } return output; } - private native static Map getApproximateMemoryUsageByType(final long[] dbHandles, - final long[] cacheHandles); + private static native Map getApproximateMemoryUsageByType( + final long[] dbHandles, final long[] cacheHandles); } diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java index af28fa8ce785..e54db7171e54 100644 --- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptions.java @@ -7,15 +7,13 @@ import java.util.*; -public class MutableColumnFamilyOptions - extends AbstractMutableOptions { - +public class MutableColumnFamilyOptions extends AbstractMutableOptions { /** * User must use builder pattern, or parser. * * @param keys the keys * @param values the values - * + *
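Apart from the final/formatting cleanups, the MemoryUtil signature is unchanged; a short usage sketch, assuming db is an open RocksDB and cache is the LRUCache configured for it:

    final Map<MemoryUsageType, Long> usage = MemoryUtil.getApproximateMemoryUsageByType(
        Collections.singletonList(db), Collections.singleton(cache));
    System.out.println("memtable bytes:     " + usage.get(MemoryUsageType.kMemTableTotal));
    System.out.println("table reader bytes: " + usage.get(MemoryUsageType.kTableReadersTotal));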

    * See {@link #builder()} and {@link #parse(String)}. */ private MutableColumnFamilyOptions(final String[] keys, @@ -36,11 +34,11 @@ public static MutableColumnFamilyOptionsBuilder builder() { /** * Parses a String representation of MutableColumnFamilyOptions - * + *

    * The format is: key1=value1;key2=value2;key3=value3 etc - * + *

    * For int[] values, each int should be separated by a colon, e.g. - * + *

    * key1=value1;intArrayKey1=1:2:3 * * @param str The string representation of the mutable column family options @@ -157,8 +155,8 @@ public ValueType getValueType() { public static class MutableColumnFamilyOptionsBuilder extends AbstractMutableOptionsBuilder implements MutableColumnFamilyOptionsInterface { - - private final static Map ALL_KEYS_LOOKUP = new HashMap<>(); + private static final Map ALL_KEYS_LOOKUP = + new HashMap<>(); static { for(final MutableColumnFamilyOptionKey key : MemtableOption.values()) { ALL_KEYS_LOOKUP.put(key.name(), key); @@ -476,7 +474,7 @@ public MutableColumnFamilyOptionsBuilder setCompressionType( @Override public CompressionType compressionType() { - return (CompressionType) getEnum(MiscOption.compression); + return getEnum(MiscOption.compression); } @Override @@ -549,7 +547,7 @@ public MutableColumnFamilyOptionsBuilder setBlobCompressionType( @Override public CompressionType blobCompressionType() { - return (CompressionType) getEnum(BlobOption.blob_compression_type); + return getEnum(BlobOption.blob_compression_type); } @Override @@ -617,7 +615,7 @@ public MutableColumnFamilyOptionsBuilder setPrepopulateBlobCache( @Override public PrepopulateBlobCache prepopulateBlobCache() { - return (PrepopulateBlobCache) getEnum(BlobOption.prepopulate_blob_cache); + return getEnum(BlobOption.prepopulate_blob_cache); } } } diff --git a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java index 0f5fe7d78705..729b0e882788 100644 --- a/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableColumnFamilyOptionsInterface.java @@ -11,15 +11,15 @@ public interface MutableColumnFamilyOptionsInterface< /** * Amount of data to build up in memory (backed by an unsorted log * on disk) before converting to a sorted on-disk file. - * + *
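A short sketch of the string format documented above, applied through RocksDB#setOptions; the db, the column family handle, and the chosen option values are assumptions:

    final MutableColumnFamilyOptions mutableCfOpts =
        MutableColumnFamilyOptions
            .parse("write_buffer_size=67108864;level0_file_num_compaction_trigger=4")
            .build();
    db.setOptions(columnFamilyHandle, mutableCfOpts);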

    * Larger values increase performance, especially during bulk loads. * Up to {@code max_write_buffer_number} write buffers may be held in memory * at the same time, so you may wish to adjust this parameter * to control memory usage. - * + *

    * Also, a larger write buffer will result in a longer recovery time * the next time the database is opened. - * + *

    * Default: 64MB * @param writeBufferSize the size of write buffer. * @return the instance of the current object. @@ -56,7 +56,7 @@ public interface MutableColumnFamilyOptionsInterface< /** * Number of files to trigger level-0 compaction. A value < 0 means that * level-0 compaction will not be triggered by number of files at all. - * + *

    * Default: 4 * * @param level0FileNumCompactionTrigger The number of files to trigger @@ -68,7 +68,7 @@ public interface MutableColumnFamilyOptionsInterface< /** * Number of files to trigger level-0 compaction. A value < 0 means that * level-0 compaction will not be triggered by number of files at all. - * + *

    * Default: 4 * * @return The number of files to trigger @@ -109,7 +109,7 @@ public interface MutableColumnFamilyOptionsInterface< * @param maxBytesForLevelBase maximum bytes for level base. * * @return the reference to the current option. - * + *

    * See {@link AdvancedMutableColumnFamilyOptionsInterface#setMaxBytesForLevelMultiplier(double)} */ T setMaxBytesForLevelBase( @@ -127,7 +127,7 @@ T setMaxBytesForLevelBase( * * @return the upper-bound of the total size of level-1 files * in bytes. - * + *

    * See {@link AdvancedMutableColumnFamilyOptionsInterface#maxBytesForLevelMultiplier()} */ long maxBytesForLevelBase(); @@ -135,7 +135,7 @@ T setMaxBytesForLevelBase( /** * Compress blocks using the specified compression algorithm. This * parameter can be changed dynamically. - * + *

    * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. * * @param compressionType Compression Type. @@ -147,7 +147,7 @@ T setCompressionType( /** * Compress blocks using the specified compression algorithm. This * parameter can be changed dynamically. - * + *

    * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. * * @return Compression type. diff --git a/java/src/main/java/org/rocksdb/MutableDBOptions.java b/java/src/main/java/org/rocksdb/MutableDBOptions.java index bfba1dab3c66..927e80522272 100644 --- a/java/src/main/java/org/rocksdb/MutableDBOptions.java +++ b/java/src/main/java/org/rocksdb/MutableDBOptions.java @@ -11,13 +11,12 @@ import java.util.Objects; public class MutableDBOptions extends AbstractMutableOptions { - /** * User must use builder pattern, or parser. * * @param keys the keys * @param values the values - * + *

    * See {@link #builder()} and {@link #parse(String)}. */ private MutableDBOptions(final String[] keys, final String[] values) { @@ -37,11 +36,11 @@ public static MutableDBOptionsBuilder builder() { /** * Parses a String representation of MutableDBOptions - * + *

    * The format is: key1=value1;key2=value2;key3=value3 etc - * + *

    * For int[] values, each int should be separated by a colon, e.g. - *

    * key1=value1;intArrayKey1=1:2:3 * * @param str The string representation of the mutable db options @@ -49,7 +48,7 @@ public static MutableDBOptionsBuilder builder() { * * @return A builder for the mutable db options */ - public static MutableDBOptionsBuilder parse(final String str, boolean ignoreUnknown) { + public static MutableDBOptionsBuilder parse(final String str, final boolean ignoreUnknown) { Objects.requireNonNull(str); final List parsedOptions = OptionString.Parser.parse(str); @@ -93,8 +92,7 @@ public ValueType getValueType() { public static class MutableDBOptionsBuilder extends AbstractMutableOptionsBuilder implements MutableDBOptionsInterface { - - private final static Map ALL_KEYS_LOOKUP = new HashMap<>(); + private static final Map ALL_KEYS_LOOKUP = new HashMap<>(); static { for(final MutableDBOptionKey key : DBOption.values()) { ALL_KEYS_LOOKUP.put(key.name(), key); diff --git a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java index bdf9d7bf600d..8bf7b0d64bea 100644 --- a/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/MutableDBOptionsInterface.java @@ -27,7 +27,7 @@ public interface MutableDBOptionsInterface * Specifies the maximum number of concurrent background compaction jobs, * submitted to the default LOW priority thread pool. * If you're increasing this, also consider increasing number of threads in @@ -52,7 +52,7 @@ public interface MutableDBOptionsInterface * Returns the maximum number of concurrent background compaction jobs, * submitted to the default LOW priority thread pool. * When increasing this number, we may also want to consider increasing @@ -72,9 +72,9 @@ public interface MutableDBOptionsInterface * DEFAULT: false - * + *
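Likewise for MutableDBOptions, a sketch of parse with ignoreUnknown set to true (useful when the string may contain keys from a newer RocksDB); db and the option values are assumptions:

    final MutableDBOptions mutableDbOpts =
        MutableDBOptions.parse("max_background_jobs=8;bytes_per_sync=1048576", true).build();
    db.setDBOptions(mutableDbOpts);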

    * Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} * API. @@ -90,9 +90,9 @@ public interface MutableDBOptionsInterface * DEFAULT: false - * + *

    * Dynamically changeable through * {@link RocksDB#setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} * API. @@ -105,7 +105,7 @@ public interface MutableDBOptionsInterface * Default: 1024 * 1024 (1 MB) * * @param writableFileMaxBufferSize the maximum buffer size @@ -118,7 +118,7 @@ public interface MutableDBOptionsInterface * Default: 1024 * 1024 (1 MB) * * @return the maximum buffer size @@ -137,11 +137,11 @@ public interface MutableDBOptionsInterface * Unit: bytes per second. - * + *

    * Default: 0 - * + *

    * Dynamically changeable through {@link RocksDB#setDBOptions(MutableDBOptions)}. * * @param delayedWriteRate the rate in bytes per second @@ -162,11 +162,11 @@ public interface MutableDBOptionsInterface * Unit: bytes per second. - * + *

    * Default: 0 - * + *

    * Dynamically changeable through {@link RocksDB#setDBOptions(MutableDBOptions)}. * * @return the rate in bytes per second @@ -358,7 +358,7 @@ public interface MutableDBOptionsInterface * Default: 0, turned off * * @param walBytesPerSync size in bytes @@ -368,7 +368,7 @@ public interface MutableDBOptionsInterface * Default: 0, turned off * * @return size in bytes @@ -383,7 +383,7 @@ public interface MutableDBOptionsInterface * - If `sync_file_range` is supported it achieves this by waiting for any * prior `sync_file_range`s to finish before proceeding. In this way, * processing (compression, etc.) can proceed uninhibited in the gap @@ -391,11 +391,11 @@ public interface MutableDBOptionsInterface * Note: Enabling this option does not provide any additional persistence * guarantees, as it may use `sync_file_range`, which does not write out * metadata. - * + *

    * Default: false * * @param strictBytesPerSync the bytes per sync @@ -405,7 +405,7 @@ public interface MutableDBOptionsInterface * See {@link #setStrictBytesPerSync(boolean)} * * @return the limit in bytes. @@ -415,9 +415,9 @@ public interface MutableDBOptionsInterface * That way RocksDB's compaction is doing sequential instead of random reads. - * + *

    * Default: 0 * * @param compactionReadaheadSize The compaction read-ahead size @@ -429,9 +429,9 @@ public interface MutableDBOptionsInterface * That way RocksDB's compaction is doing sequential instead of random reads. - * + *

    * Default: 0 * * @return The compaction read-ahead size diff --git a/java/src/main/java/org/rocksdb/MutableOptionValue.java b/java/src/main/java/org/rocksdb/MutableOptionValue.java index 7f69eeb9ea2a..fe689b5d01b0 100644 --- a/java/src/main/java/org/rocksdb/MutableOptionValue.java +++ b/java/src/main/java/org/rocksdb/MutableOptionValue.java @@ -13,8 +13,7 @@ public abstract class MutableOptionValue { abstract String asString(); abstract T asObject(); - private static abstract class MutableOptionValueObject - extends MutableOptionValue { + private abstract static class MutableOptionValueObject extends MutableOptionValue { protected final T value; protected MutableOptionValueObject(final T value) { diff --git a/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java b/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java index 6acc146f745d..b270b8d36699 100644 --- a/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java +++ b/java/src/main/java/org/rocksdb/NativeComparatorWrapper.java @@ -10,11 +10,13 @@ /** * A simple abstraction to allow a Java class to wrap a custom comparator * implemented in C++. - * + *

    * The native comparator must directly extend rocksdb::Comparator. */ public abstract class NativeComparatorWrapper extends AbstractComparator { + static final String NATIVE_CODE_IMPLEMENTATION_SHOULD_NOT_BE_CALLED = + "This should not be called. Implementation is in Native code"; @Override final ComparatorType getComparatorType() { @@ -23,26 +25,22 @@ final ComparatorType getComparatorType() { @Override public final String name() { - throw new IllegalStateException("This should not be called. " + - "Implementation is in Native code"); + throw new IllegalStateException(NATIVE_CODE_IMPLEMENTATION_SHOULD_NOT_BE_CALLED); } @Override public final int compare(final ByteBuffer s1, final ByteBuffer s2) { - throw new IllegalStateException("This should not be called. " + - "Implementation is in Native code"); + throw new IllegalStateException(NATIVE_CODE_IMPLEMENTATION_SHOULD_NOT_BE_CALLED); } @Override public final void findShortestSeparator(final ByteBuffer start, final ByteBuffer limit) { - throw new IllegalStateException("This should not be called. " + - "Implementation is in Native code"); + throw new IllegalStateException(NATIVE_CODE_IMPLEMENTATION_SHOULD_NOT_BE_CALLED); } @Override public final void findShortSuccessor(final ByteBuffer key) { - throw new IllegalStateException("This should not be called. " + - "Implementation is in Native code"); + throw new IllegalStateException(NATIVE_CODE_IMPLEMENTATION_SHOULD_NOT_BE_CALLED); } /** diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index b97cf28b913e..6fe97994d201 100644 --- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -16,13 +16,17 @@ public class NativeLibraryLoader { private static final NativeLibraryLoader instance = new NativeLibraryLoader(); private static boolean initialized = false; - private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); - private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final String ROCKSDB_LIBRARY_NAME = "rocksdb"; + + private static final String sharedLibraryName = + Environment.getSharedLibraryName(ROCKSDB_LIBRARY_NAME); + private static final String jniLibraryName = Environment.getJniLibraryName(ROCKSDB_LIBRARY_NAME); private static final /* @Nullable */ String fallbackJniLibraryName = - Environment.getFallbackJniLibraryName("rocksdb"); - private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); + Environment.getFallbackJniLibraryName(ROCKSDB_LIBRARY_NAME); + private static final String jniLibraryFileName = + Environment.getJniLibraryFileName(ROCKSDB_LIBRARY_NAME); private static final /* @Nullable */ String fallbackJniLibraryFileName = - Environment.getFallbackJniLibraryFileName("rocksdb"); + Environment.getFallbackJniLibraryFileName(ROCKSDB_LIBRARY_NAME); private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = Environment.getJniLibraryExtension(); @@ -51,6 +55,7 @@ public static NativeLibraryLoader getInstance() { * * @throws java.io.IOException if a filesystem operation fails. 
*/ + @SuppressWarnings("PMD.EmptyCatchBlock") public synchronized void loadLibrary(final String tmpDir) throws IOException { try { // try dynamic library @@ -104,64 +109,58 @@ void loadLibraryFromJar(final String tmpDir) } } - File loadLibraryFromJarToTemp(final String tmpDir) - throws IOException { - InputStream is = null; - try { - // attempt to look up the static library in the jar file - String libraryFileName = jniLibraryFileName; - is = getClass().getClassLoader().getResourceAsStream(libraryFileName); - - if (is == null) { - // is there a fallback we can try - if (fallbackJniLibraryFileName == null) { - throw new RuntimeException(libraryFileName + " was not found inside JAR."); - } - - // attempt to look up the fallback static library in the jar file - libraryFileName = fallbackJniLibraryFileName; - is = getClass().getClassLoader().getResourceAsStream(libraryFileName); - if (is == null) { - throw new RuntimeException(libraryFileName + " was not found inside JAR."); - } + private File createTemp(final String tmpDir, final String libraryFileName) throws IOException { + // create a temporary file to copy the library to + final File temp; + if (tmpDir == null || tmpDir.isEmpty()) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + final File parentDir = new File(tmpDir); + if (!parentDir.exists()) { + throw new RuntimeException( + "Directory: " + parentDir.getAbsolutePath() + " does not exist!"); } - - // create a temporary file to copy the library to - final File temp; - if (tmpDir == null || tmpDir.isEmpty()) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - final File parentDir = new File(tmpDir); - if (!parentDir.exists()) { - throw new RuntimeException( - "Directory: " + parentDir.getAbsolutePath() + " does not exist!"); - } - temp = new File(parentDir, libraryFileName); - if (temp.exists() && !temp.delete()) { - throw new RuntimeException( - "File: " + temp.getAbsolutePath() + " already exists and cannot be removed."); - } - if (!temp.createNewFile()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created."); - } + temp = new File(parentDir, libraryFileName); + if (temp.exists() && !temp.delete()) { + throw new RuntimeException( + "File: " + temp.getAbsolutePath() + " already exists and cannot be removed."); } - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); + if (!temp.createNewFile()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created."); } + } + if (temp.exists()) { + temp.deleteOnExit(); + return temp; + } else { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } + } - // copy the library from the Jar file to the temp destination - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + @SuppressWarnings({"PMD.UseProperClassLoader", "PMD.UseTryWithResources"}) + File loadLibraryFromJarToTemp(final String tmpDir) throws IOException { + try (InputStream is = getClass().getClassLoader().getResourceAsStream(jniLibraryFileName)) { + if (is != null) { + final File temp = createTemp(tmpDir, jniLibraryFileName); + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + return temp; + } + } - // return the temporary library file - return temp; + if (fallbackJniLibraryFileName == null) { + throw new RuntimeException(fallbackJniLibraryFileName + " was not found inside JAR."); + } - } finally { + try 
(InputStream is = + getClass().getClassLoader().getResourceAsStream(fallbackJniLibraryFileName)) { if (is != null) { - is.close(); + final File temp = createTemp(tmpDir, fallbackJniLibraryFileName); + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + return temp; } } + + throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); } /** diff --git a/java/src/main/java/org/rocksdb/OperationType.java b/java/src/main/java/org/rocksdb/OperationType.java index 7cc9b65cdf24..bf73534683cc 100644 --- a/java/src/main/java/org/rocksdb/OperationType.java +++ b/java/src/main/java/org/rocksdb/OperationType.java @@ -7,14 +7,15 @@ /** * The type used to refer to a thread operation. - * + *
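The loader refactor above does not change the public entry points; a minimal sketch of explicit extraction into a caller-chosen directory (the path is an assumption), which has to happen before any other RocksDB call:

    // Either rely on the default temp directory via RocksDB.loadLibrary(), or:
    NativeLibraryLoader.getInstance().loadLibrary("/var/tmp/rocksdb-native");
    // loadLibrary(String) throws IOException if the bundled library cannot be extracted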

    * A thread operation describes high-level action of a thread, * examples include compaction and flush. */ public enum OperationType { OP_UNKNOWN((byte)0x0), OP_COMPACTION((byte)0x1), - OP_FLUSH((byte)0x2); + OP_FLUSH((byte) 0x2), + OP_DBOPEN((byte) 0x3); private final byte value; diff --git a/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java b/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java index 5a2e1f3edf57..80d3c720bf6b 100644 --- a/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java +++ b/java/src/main/java/org/rocksdb/OptimisticTransactionDB.java @@ -94,20 +94,20 @@ public static OptimisticTransactionDB open(final DBOptions dbOptions, return otdb; } - /** * This is similar to {@link #close()} except that it * throws an exception if any error occurs. - * + *

    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *

    * See also {@link #close()}. * * @throws RocksDBException if an error occurs whilst closing. */ + @Override public void closeE() throws RocksDBException { if (owningHandle_.compareAndSet(true, false)) { try { @@ -121,14 +121,15 @@ public void closeE() throws RocksDBException { /** * This is similar to {@link #closeE()} except that it * silently ignores any errors. - * + *

    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *
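A sketch of the shutdown order this documentation implies, assuming otdb is an open OptimisticTransactionDB:

    otdb.syncWal();   // close()/closeE() do not fsync the WAL themselves
    otdb.closeE();    // like close(), but reports failures as RocksDBException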

    * See also {@link #close()}. */ + @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { if (owningHandle_.compareAndSet(true, false)) { @@ -209,8 +210,7 @@ protected static native long open(final long optionsHandle, final String path) throws RocksDBException; protected static native long[] open(final long handle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions); - private native static void closeDatabase(final long handle) - throws RocksDBException; + private static native void closeDatabase(final long handle) throws RocksDBException; private native long beginTransaction(final long handle, final long writeOptionsHandle); private native long beginTransaction(final long handle, diff --git a/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java b/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java index 250edf80683d..a2f5d85ab5c1 100644 --- a/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java +++ b/java/src/main/java/org/rocksdb/OptimisticTransactionOptions.java @@ -43,7 +43,7 @@ public OptimisticTransactionOptions setComparator( return this; } - private native static long newOptimisticTransactionOptions(); + private static native long newOptimisticTransactionOptions(); private native boolean isSetSnapshot(final long handle); private native void setSetSnapshot(final long handle, final boolean setSnapshot); diff --git a/java/src/main/java/org/rocksdb/OptionString.java b/java/src/main/java/org/rocksdb/OptionString.java index 7f97827cb889..bcbf1d152962 100644 --- a/java/src/main/java/org/rocksdb/OptionString.java +++ b/java/src/main/java/org/rocksdb/OptionString.java @@ -9,14 +9,15 @@ import java.util.List; import java.util.Objects; +@SuppressWarnings("PMD.AvoidStringBufferField") public class OptionString { - private final static char kvPairSeparator = ';'; - private final static char kvSeparator = '='; - private final static char complexValueBegin = '{'; - private final static char complexValueEnd = '}'; - private final static char wrappedValueBegin = '{'; - private final static char wrappedValueEnd = '}'; - private final static char arrayValueSeparator = ':'; + private static final char kvPairSeparator = ';'; + private static final char kvSeparator = '='; + private static final char complexValueBegin = '{'; + private static final char complexValueEnd = '}'; + private static final char wrappedValueBegin = '{'; + private static final char wrappedValueEnd = '}'; + private static final char arrayValueSeparator = ':'; static class Value { final List list; @@ -39,6 +40,7 @@ public static Value fromComplex(final List complex) { return new Value(null, complex); } + @Override public String toString() { final StringBuilder sb = new StringBuilder(); if (isList()) { @@ -68,6 +70,7 @@ private Entry(final String key, final Value value) { this.value = value; } + @Override public String toString() { return "" + key + "=" + value; } @@ -75,6 +78,8 @@ public String toString() { static class Parser { static class Exception extends RuntimeException { + private static final long serialVersionUID = 752283782841276408L; + public Exception(final String s) { super(s); } @@ -122,7 +127,7 @@ private boolean hasNext() { return (sb.length() > 0); } - private boolean is(final char c) { + private boolean isChar(final char c) { return (sb.length() > 0 && sb.charAt(0) == c); } @@ -151,10 +156,10 @@ private String parseKey() { } private String parseSimpleValue() { - if (is(wrappedValueBegin)) { + if (isChar(wrappedValueBegin)) { 
next(); final String result = parseSimpleValue(); - if (!is(wrappedValueEnd)) { + if (!isChar(wrappedValueEnd)) { exception("Expected to end a wrapped value with " + wrappedValueEnd); } next(); @@ -172,7 +177,7 @@ private List parseList() { final List list = new ArrayList<>(1); while (true) { list.add(parseSimpleValue()); - if (!is(arrayValueSeparator)) + if (!isChar(arrayValueSeparator)) break; next(); @@ -188,7 +193,7 @@ private Entry parseOption() { } final String key = parseKey(); skipWhite(); - if (is(kvSeparator)) { + if (isChar(kvSeparator)) { next(); } else { exception("Expected = separating key and value"); @@ -200,12 +205,12 @@ private Entry parseOption() { private Value parseValue() { skipWhite(); - if (is(complexValueBegin)) { + if (isChar(complexValueBegin)) { next(); skipWhite(); final Value value = Value.fromComplex(parseComplex()); skipWhite(); - if (is(complexValueEnd)) { + if (isChar(complexValueEnd)) { next(); skipWhite(); } else { @@ -214,6 +219,11 @@ private Value parseValue() { return value; } else if (isValueChar()) { return Value.fromList(parseList()); + } else if (isChar(kvPairSeparator)) { + // e.g. empty vector embedded in a struct option looks like + // struct_opt = {vector_opt=;...} + final List entries = new ArrayList<>(); + return Value.fromList(entries); } exception("No valid value character(s) for value in key=value"); @@ -227,7 +237,7 @@ private List parseComplex() { if (hasNext()) { entries.add(parseOption()); skipWhite(); - while (is(kvPairSeparator)) { + while (isChar(kvPairSeparator)) { next(); skipWhite(); if (!isKeyChar()) { diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 54f88262bd98..29f5e8e0d233 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -11,19 +11,13 @@ /** * Options to control the behavior of a database. It will be used * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). - * + *

    * As a descendent of {@link AbstractNativeReference}, this class is {@link AutoCloseable} * and will be automatically released if opened in the preamble of a try with resources block. */ public class Options extends RocksObject - implements DBOptionsInterface, - MutableDBOptionsInterface, - ColumnFamilyOptionsInterface, - MutableColumnFamilyOptionsInterface { - static { - RocksDB.loadLibrary(); - } - + implements DBOptionsInterface, MutableDBOptionsInterface, + ColumnFamilyOptionsInterface, MutableColumnFamilyOptionsInterface { /** * Converts the input properties into a Options-style formatted string * @param properties The set of properties to convert @@ -33,7 +27,7 @@ public static String getOptionStringFromProps(final Properties properties) { if (properties == null || properties.size() == 0) { throw new IllegalArgumentException("Properties value must contain at least one value."); } - StringBuilder stringBuilder = new StringBuilder(); + final StringBuilder stringBuilder = new StringBuilder(); for (final String name : properties.stringPropertyNames()) { stringBuilder.append(name); stringBuilder.append("="); @@ -45,12 +39,12 @@ public static String getOptionStringFromProps(final Properties properties) { /** * Construct options for opening a RocksDB. - * + *
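A brief sketch of getOptionStringFromProps; the property keys and values are illustrative only:

    final Properties props = new Properties();
    props.setProperty("create_if_missing", "true");
    props.setProperty("max_background_jobs", "4");
    final String optionString = Options.getOptionStringFromProps(props);
    // yields e.g. "create_if_missing=true;max_background_jobs=4;" (entry order not guaranteed)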

    * This constructor will create (by allocating a block of memory) * an {@code rocksdb::Options} in the c++ side. */ public Options() { - super(newOptions()); + super(newOptionsInstance()); env_ = Env.getDefault(); } @@ -71,13 +65,13 @@ public Options(final DBOptions dbOptions, /** * Copy constructor for ColumnFamilyOptions. - * + *

    * NOTE: This does a shallow copy, which means comparator, merge_operator * and other pointers will be cloned! * * @param other The Options to copy. */ - public Options(Options other) { + public Options(final Options other) { super(copyOptions(other.nativeHandle_)); this.env_ = other.env_; this.memTableConfig_ = other.memTableConfig_; @@ -179,8 +173,7 @@ public Options optimizeForSmallDb(final Cache cache) { } @Override - public Options optimizeForPointLookup( - long blockCacheSizeMb) { + public Options optimizeForPointLookup(final long blockCacheSizeMb) { optimizeForPointLookup(nativeHandle_, blockCacheSizeMb); return this; @@ -194,8 +187,7 @@ public Options optimizeLevelStyleCompaction() { } @Override - public Options optimizeLevelStyleCompaction( - long memtableMemoryBudget) { + public Options optimizeLevelStyleCompaction(final long memtableMemoryBudget) { optimizeLevelStyleCompaction(nativeHandle_, memtableMemoryBudget); return this; @@ -388,8 +380,8 @@ public Options setDbPaths(final Collection dbPaths) { assert(isOwningHandle()); final int len = dbPaths.size(); - final String paths[] = new String[len]; - final long targetSizes[] = new long[len]; + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; int i = 0; for(final DbPath dbPath : dbPaths) { @@ -407,8 +399,8 @@ public List dbPaths() { if(len == 0) { return Collections.emptyList(); } else { - final String paths[] = new String[len]; - final long targetSizes[] = new long[len]; + final String[] paths = new String[len]; + final long[] targetSizes = new long[len]; dbPaths(nativeHandle_, paths, targetSizes); @@ -651,7 +643,7 @@ public long walSizeLimitMB() { } @Override - public Options setMaxWriteBatchGroupSizeBytes(long maxWriteBatchGroupSizeBytes) { + public Options setMaxWriteBatchGroupSizeBytes(final long maxWriteBatchGroupSizeBytes) { setMaxWriteBatchGroupSizeBytes(nativeHandle_, maxWriteBatchGroupSizeBytes); return this; } @@ -842,6 +834,7 @@ public long dbWriteBufferSize() { } @Override + @Deprecated public Options setAccessHintOnCompactionStart(final AccessHint accessHint) { assert(isOwningHandle()); setAccessHintOnCompactionStart(nativeHandle_, accessHint.getValue()); @@ -849,6 +842,7 @@ public Options setAccessHintOnCompactionStart(final AccessHint accessHint) { } @Override + @Deprecated public AccessHint accessHintOnCompactionStart() { assert(isOwningHandle()); return AccessHint.getAccessHint(accessHintOnCompactionStart(nativeHandle_)); @@ -1066,7 +1060,8 @@ public boolean skipStatsUpdateOnDbOpen() { } @Override - public Options setSkipCheckingSstFileSizesOnDbOpen(boolean skipCheckingSstFileSizesOnDbOpen) { + public Options setSkipCheckingSstFileSizesOnDbOpen( + final boolean skipCheckingSstFileSizesOnDbOpen) { setSkipCheckingSstFileSizesOnDbOpen(nativeHandle_, skipCheckingSstFileSizesOnDbOpen); return this; } @@ -1377,12 +1372,11 @@ public List compressionPerLevel() { } @Override - public Options setCompressionType(CompressionType compressionType) { + public Options setCompressionType(final CompressionType compressionType) { setCompressionType(nativeHandle_, compressionType.getValue()); return this; } - @Override public Options setBottommostCompressionType( final CompressionType bottommostCompressionType) { @@ -1442,7 +1436,7 @@ public int numLevels() { } @Override - public Options setNumLevels(int numLevels) { + public Options setNumLevels(final int numLevels) { setNumLevels(nativeHandle_, numLevels); return this; } @@ -1490,7 +1484,7 @@ public long targetFileSizeBase() { } @Override - 
public Options setTargetFileSizeBase(long targetFileSizeBase) { + public Options setTargetFileSizeBase(final long targetFileSizeBase) { setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); return this; } @@ -1501,7 +1495,7 @@ public int targetFileSizeMultiplier() { } @Override - public Options setTargetFileSizeMultiplier(int multiplier) { + public Options setTargetFileSizeMultiplier(final int multiplier) { setTargetFileSizeMultiplier(nativeHandle_, multiplier); return this; } @@ -1662,7 +1656,7 @@ public long maxSuccessiveMerges() { } @Override - public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) { + public Options setMaxSuccessiveMerges(final long maxSuccessiveMerges) { setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); return this; } @@ -1692,9 +1686,7 @@ public boolean optimizeFiltersForHits() { } @Override - public Options - setMemtableHugePageSize( - long memtableHugePageSize) { + public Options setMemtableHugePageSize(final long memtableHugePageSize) { setMemtableHugePageSize(nativeHandle_, memtableHugePageSize); return this; @@ -1706,7 +1698,7 @@ public long memtableHugePageSize() { } @Override - public Options setSoftPendingCompactionBytesLimit(long softPendingCompactionBytesLimit) { + public Options setSoftPendingCompactionBytesLimit(final long softPendingCompactionBytesLimit) { setSoftPendingCompactionBytesLimit(nativeHandle_, softPendingCompactionBytesLimit); return this; @@ -1718,7 +1710,7 @@ public long softPendingCompactionBytesLimit() { } @Override - public Options setHardPendingCompactionBytesLimit(long hardPendingCompactionBytesLimit) { + public Options setHardPendingCompactionBytesLimit(final long hardPendingCompactionBytesLimit) { setHardPendingCompactionBytesLimit(nativeHandle_, hardPendingCompactionBytesLimit); return this; } @@ -1729,7 +1721,7 @@ public long hardPendingCompactionBytesLimit() { } @Override - public Options setLevel0FileNumCompactionTrigger(int level0FileNumCompactionTrigger) { + public Options setLevel0FileNumCompactionTrigger(final int level0FileNumCompactionTrigger) { setLevel0FileNumCompactionTrigger(nativeHandle_, level0FileNumCompactionTrigger); return this; } @@ -1740,7 +1732,7 @@ public int level0FileNumCompactionTrigger() { } @Override - public Options setLevel0SlowdownWritesTrigger(int level0SlowdownWritesTrigger) { + public Options setLevel0SlowdownWritesTrigger(final int level0SlowdownWritesTrigger) { setLevel0SlowdownWritesTrigger(nativeHandle_, level0SlowdownWritesTrigger); return this; } @@ -1751,7 +1743,7 @@ public int level0SlowdownWritesTrigger() { } @Override - public Options setLevel0StopWritesTrigger(int level0StopWritesTrigger) { + public Options setLevel0StopWritesTrigger(final int level0StopWritesTrigger) { setLevel0StopWritesTrigger(nativeHandle_, level0StopWritesTrigger); return this; } @@ -1762,7 +1754,8 @@ public int level0StopWritesTrigger() { } @Override - public Options setMaxBytesForLevelMultiplierAdditional(int[] maxBytesForLevelMultiplierAdditional) { + public Options setMaxBytesForLevelMultiplierAdditional( + final int[] maxBytesForLevelMultiplierAdditional) { setMaxBytesForLevelMultiplierAdditional(nativeHandle_, maxBytesForLevelMultiplierAdditional); return this; } @@ -1773,7 +1766,7 @@ public int[] maxBytesForLevelMultiplierAdditional() { } @Override - public Options setParanoidFileChecks(boolean paranoidFileChecks) { + public Options setParanoidFileChecks(final boolean paranoidFileChecks) { setParanoidFileChecks(nativeHandle_, paranoidFileChecks); return this; } @@ -1892,7 +1885,7 @@ 
public boolean atomicFlush() { } @Override - public Options setAvoidUnnecessaryBlockingIO(boolean avoidUnnecessaryBlockingIO) { + public Options setAvoidUnnecessaryBlockingIO(final boolean avoidUnnecessaryBlockingIO) { setAvoidUnnecessaryBlockingIO(nativeHandle_, avoidUnnecessaryBlockingIO); return this; } @@ -1904,7 +1897,7 @@ public boolean avoidUnnecessaryBlockingIO() { } @Override - public Options setPersistStatsToDisk(boolean persistStatsToDisk) { + public Options setPersistStatsToDisk(final boolean persistStatsToDisk) { setPersistStatsToDisk(nativeHandle_, persistStatsToDisk); return this; } @@ -1916,7 +1909,7 @@ public boolean persistStatsToDisk() { } @Override - public Options setWriteDbidToManifest(boolean writeDbidToManifest) { + public Options setWriteDbidToManifest(final boolean writeDbidToManifest) { setWriteDbidToManifest(nativeHandle_, writeDbidToManifest); return this; } @@ -1928,7 +1921,7 @@ public boolean writeDbidToManifest() { } @Override - public Options setLogReadaheadSize(long logReadaheadSize) { + public Options setLogReadaheadSize(final long logReadaheadSize) { setLogReadaheadSize(nativeHandle_, logReadaheadSize); return this; } @@ -1940,7 +1933,7 @@ public long logReadaheadSize() { } @Override - public Options setBestEffortsRecovery(boolean bestEffortsRecovery) { + public Options setBestEffortsRecovery(final boolean bestEffortsRecovery) { setBestEffortsRecovery(nativeHandle_, bestEffortsRecovery); return this; } @@ -1952,7 +1945,7 @@ public boolean bestEffortsRecovery() { } @Override - public Options setMaxBgErrorResumeCount(int maxBgerrorResumeCount) { + public Options setMaxBgErrorResumeCount(final int maxBgerrorResumeCount) { setMaxBgErrorResumeCount(nativeHandle_, maxBgerrorResumeCount); return this; } @@ -1964,7 +1957,7 @@ public int maxBgerrorResumeCount() { } @Override - public Options setBgerrorResumeRetryInterval(long bgerrorResumeRetryInterval) { + public Options setBgerrorResumeRetryInterval(final long bgerrorResumeRetryInterval) { setBgerrorResumeRetryInterval(nativeHandle_, bgerrorResumeRetryInterval); return this; } @@ -1976,7 +1969,7 @@ public long bgerrorResumeRetryInterval() { } @Override - public Options setSstPartitionerFactory(SstPartitionerFactory sstPartitionerFactory) { + public Options setSstPartitionerFactory(final SstPartitionerFactory sstPartitionerFactory) { setSstPartitionerFactory(nativeHandle_, sstPartitionerFactory.nativeHandle_); this.sstPartitionerFactory_ = sstPartitionerFactory; return this; @@ -1987,6 +1980,17 @@ public SstPartitionerFactory sstPartitionerFactory() { return sstPartitionerFactory_; } + @Override + public Options setMemtableMaxRangeDeletions(final int count) { + setMemtableMaxRangeDeletions(nativeHandle_, count); + return this; + } + + @Override + public int memtableMaxRangeDeletions() { + return memtableMaxRangeDeletions(nativeHandle_); + } + @Override public Options setCompactionThreadLimiter(final ConcurrentTaskLimiter compactionThreadLimiter) { setCompactionThreadLimiter(nativeHandle_, compactionThreadLimiter.nativeHandle_); @@ -2038,7 +2042,7 @@ public long blobFileSize() { } @Override - public Options setBlobCompressionType(CompressionType compressionType) { + public Options setBlobCompressionType(final CompressionType compressionType) { setBlobCompressionType(nativeHandle_, compressionType.getValue()); return this; } @@ -2119,10 +2123,13 @@ public PrepopulateBlobCache prepopulateBlobCache() { // END options for blobs (integrated BlobDB) // - private native static long newOptions(); - private native static 
long newOptions(long dbOptHandle, - long cfOptHandle); - private native static long copyOptions(long handle); + private static long newOptionsInstance() { + RocksDB.loadLibrary(); + return newOptions(); + } + private static native long newOptions(); + private static native long newOptions(long dbOptHandle, long cfOptHandle); + private static native long copyOptions(long handle); @Override protected final native void disposeInternal(final long handle); private native void setEnv(long optHandle, long envHandle); private native void prepareForBulkLoad(long handle); @@ -2506,6 +2513,8 @@ private native void setAtomicFlush(final long handle, final boolean atomicFlush); private native boolean atomicFlush(final long handle); private native void setSstPartitionerFactory(long nativeHandle_, long newFactoryHandle); + private native void setMemtableMaxRangeDeletions(final long handle, final int count); + private native int memtableMaxRangeDeletions(final long handle); private static native void setCompactionThreadLimiter( final long nativeHandle_, final long newLimiterHandle); private static native void setAvoidUnnecessaryBlockingIO( @@ -2528,7 +2537,6 @@ private static native void setMaxBgErrorResumeCount( private static native void setBgerrorResumeRetryInterval( final long handle, final long bgerrorResumeRetryInterval); private static native long bgerrorResumeRetryInterval(final long handle); - private native void setEnableBlobFiles(final long nativeHandle_, final boolean enableBlobFiles); private native boolean enableBlobFiles(final long nativeHandle_); private native void setMinBlobSize(final long nativeHandle_, final long minBlobSize); diff --git a/java/src/main/java/org/rocksdb/OptionsUtil.java b/java/src/main/java/org/rocksdb/OptionsUtil.java index 899996af9194..4168921f2a05 100644 --- a/java/src/main/java/org/rocksdb/OptionsUtil.java +++ b/java/src/main/java/org/rocksdb/OptionsUtil.java @@ -12,12 +12,12 @@ public class OptionsUtil { * A static method to construct the DBOptions and ColumnFamilyDescriptors by * loading the latest RocksDB options file stored in the specified rocksdb * database. - * + *
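The newly exposed memtableMaxRangeDeletions setter in ordinary Options usage, as a sketch; the path and the threshold of 1000 are illustrative, and the exact flush semantics are as documented for the underlying C++ option:

    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setMemtableMaxRangeDeletions(1000);  // option newly exposed by this change
         final RocksDB db = RocksDB.open(options, "/tmp/rocksdb-example")) {
      db.put("key".getBytes(), "value".getBytes());
    }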

    * Note that the all the pointer options (except table_factory, which will * be described in more details below) will be initialized with the default * values. Developers can further initialize them after this function call. * Below is an example list of pointer options which will be initialized. - * + *

    * - env * - memtable_factory * - compaction_filter_factory @@ -25,57 +25,16 @@ public class OptionsUtil { * - comparator * - merge_operator * - compaction_filter - * + *

    * For table_factory, this function further supports deserializing * BlockBasedTableFactory and its BlockBasedTableOptions except the * pointer options of BlockBasedTableOptions (flush_block_policy_factory, - * block_cache, and block_cache_compressed), which will be initialized with + * and block_cache), which will be initialized with * default values. Developers can further specify these three options by * casting the return value of TableFactoroy::GetOptions() to * BlockBasedTableOptions and making necessary changes. * * @param dbPath the path to the RocksDB. - * @param env {@link org.rocksdb.Env} instance. - * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be - * filled and returned. - * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be - * returned. - * - * @throws RocksDBException thrown if error happens in underlying - * native library. - */ - - public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions, - List cfDescs) throws RocksDBException { - loadLatestOptions(dbPath, env, dbOptions, cfDescs, false); - } - - /** - * @param dbPath the path to the RocksDB. - * @param env {@link org.rocksdb.Env} instance. - * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be - * filled and returned. - * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be - * returned. - * @param ignoreUnknownOptions this flag can be set to true if you want to - * ignore options that are from a newer version of the db, essentially for - * forward compatibility. - * - * @throws RocksDBException thrown if error happens in underlying - * native library. - */ - public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions, - List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException { - loadLatestOptions( - dbPath, env.nativeHandle_, dbOptions.nativeHandle_, cfDescs, ignoreUnknownOptions); - } - - /** - * Similar to LoadLatestOptions, this function constructs the DBOptions - * and ColumnFamilyDescriptors based on the specified RocksDB Options file. - * See LoadLatestOptions above. - * - * @param dbPath the path to the RocksDB. * @param configOptions {@link org.rocksdb.ConfigOptions} instance. * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be * filled and returned. @@ -84,49 +43,11 @@ public static void loadLatestOptions(String dbPath, Env env, DBOptions dbOptions * @throws RocksDBException thrown if error happens in underlying * native library. */ - public static void loadLatestOptions(ConfigOptions configOptions, String dbPath, - DBOptions dbOptions, List cfDescs) throws RocksDBException { + public static void loadLatestOptions(final ConfigOptions configOptions, final String dbPath, + final DBOptions dbOptions, final List cfDescs) + throws RocksDBException { loadLatestOptions(configOptions.nativeHandle_, dbPath, dbOptions.nativeHandle_, cfDescs); - } - - /** - * Similar to LoadLatestOptions, this function constructs the DBOptions - * and ColumnFamilyDescriptors based on the specified RocksDB Options file. - * See LoadLatestOptions above. - * - * @param optionsFileName the RocksDB options file path. - * @param env {@link org.rocksdb.Env} instance. - * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be - * filled and returned. - * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be - * returned. - * - * @throws RocksDBException thrown if error happens in underlying - * native library. 
- */ - public static void loadOptionsFromFile(String optionsFileName, Env env, DBOptions dbOptions, - List cfDescs) throws RocksDBException { - loadOptionsFromFile(optionsFileName, env, dbOptions, cfDescs, false); - } - - /** - * @param optionsFileName the RocksDB options file path. - * @param env {@link org.rocksdb.Env} instance. - * @param dbOptions {@link org.rocksdb.DBOptions} instance. This will be - * filled and returned. - * @param cfDescs A list of {@link org.rocksdb.ColumnFamilyDescriptor}'s be - * returned. - * @param ignoreUnknownOptions this flag can be set to true if you want to - * ignore options that are from a newer version of the db, esentially for - * forward compatibility. - * - * @throws RocksDBException thrown if error happens in underlying - * native library. - */ - public static void loadOptionsFromFile(String optionsFileName, Env env, DBOptions dbOptions, - List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException { - loadOptionsFromFile( - optionsFileName, env.nativeHandle_, dbOptions.nativeHandle_, cfDescs, ignoreUnknownOptions); + loadTableFormatConfig(cfDescs); } /** @@ -143,10 +64,12 @@ public static void loadOptionsFromFile(String optionsFileName, Env env, DBOption * @throws RocksDBException thrown if error happens in underlying * native library. */ - public static void loadOptionsFromFile(ConfigOptions configOptions, String optionsFileName, - DBOptions dbOptions, List cfDescs) throws RocksDBException { + public static void loadOptionsFromFile(final ConfigOptions configOptions, + final String optionsFileName, final DBOptions dbOptions, + final List cfDescs) throws RocksDBException { loadOptionsFromFile( configOptions.nativeHandle_, optionsFileName, dbOptions.nativeHandle_, cfDescs); + loadTableFormatConfig(cfDescs); } /** @@ -159,10 +82,20 @@ public static void loadOptionsFromFile(ConfigOptions configOptions, String optio * @throws RocksDBException thrown if error happens in underlying * native library. */ - public static String getLatestOptionsFileName(String dbPath, Env env) throws RocksDBException { + public static String getLatestOptionsFileName(final String dbPath, final Env env) + throws RocksDBException { return getLatestOptionsFileName(dbPath, env.nativeHandle_); } + private static void loadTableFormatConfig(final List cfDescs) { + for (final ColumnFamilyDescriptor columnFamilyDescriptor : cfDescs) { + @SuppressWarnings("PMD.CloseResource") + final ColumnFamilyOptions columnFamilyOptions = columnFamilyDescriptor.getOptions(); + columnFamilyOptions.setFetchedTableFormatConfig( + readTableFormatConfig(columnFamilyOptions.nativeHandle_)); + } + } + /** * Private constructor. * This class has only static methods and shouldn't be instantiated. 
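With the Env-based overloads removed, loading the latest OPTIONS file goes through ConfigOptions; a sketch, where the database path is an assumption:

    final DBOptions dbOptions = new DBOptions();
    final List<ColumnFamilyDescriptor> cfDescs = new ArrayList<>();
    try (final ConfigOptions configOptions = new ConfigOptions()) {
      OptionsUtil.loadLatestOptions(configOptions, "/path/to/db", dbOptions, cfDescs);
    }
    // dbOptions and cfDescs are now populated; loadTableFormatConfig above also attaches
    // the fetched table format config to each column family's options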
@@ -170,15 +103,12 @@ public static String getLatestOptionsFileName(String dbPath, Env env) throws Roc private OptionsUtil() {} // native methods - private native static void loadLatestOptions(String dbPath, long envHandle, long dbOptionsHandle, - List cfDescs, boolean ignoreUnknownOptions) throws RocksDBException; - private native static void loadLatestOptions(long cfgHandle, String dbPath, long dbOptionsHandle, + private static native void loadLatestOptions(long cfgHandle, String dbPath, long dbOptionsHandle, List cfDescs) throws RocksDBException; - private native static void loadOptionsFromFile(String optionsFileName, long envHandle, - long dbOptionsHandle, List cfDescs, boolean ignoreUnknownOptions) - throws RocksDBException; - private native static void loadOptionsFromFile(long cfgHandle, String optionsFileName, + private static native void loadOptionsFromFile(long cfgHandle, String optionsFileName, long dbOptionsHandle, List cfDescs) throws RocksDBException; - private native static String getLatestOptionsFileName(String dbPath, long envHandle) + private static native String getLatestOptionsFileName(String dbPath, long envHandle) throws RocksDBException; + + private native static TableFormatConfig readTableFormatConfig(final long nativeHandle_); } diff --git a/java/src/main/java/org/rocksdb/PerfContext.java b/java/src/main/java/org/rocksdb/PerfContext.java new file mode 100644 index 000000000000..3934e4115cf5 --- /dev/null +++ b/java/src/main/java/org/rocksdb/PerfContext.java @@ -0,0 +1,761 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +public class PerfContext extends RocksObject { + protected PerfContext(final long nativeHandle) { + super(nativeHandle); + } + + public void reset() { + reset(nativeHandle_); + } + + /** + * @return total number of user key comparisons + */ + public long getUserKeyComparisonCount() { + return getUserKeyComparisonCount(nativeHandle_); + } + + /** + * @return total number of block cache hits + */ + public long getBlockCacheHitCount() { + return getBlockCacheHitCount(nativeHandle_); + } + + /** + * @return total number of block reads (with IO) + */ + public long getBlockReadCount() { + return getBlockReadCount(nativeHandle_); + } + + /** + * @return total number of bytes from block reads + */ + public long getBlockReadByte() { + return getBlockReadByte(nativeHandle_); + } + + /* + @return total nanos spent on block reads + */ + public long getBlockReadTime() { + return getBlockReadTime(nativeHandle_); + } + + /** + * @return total cpu time in nanos spent on block reads + */ + public long getBlockReadCpuTime() { + return getBlockReadCpuTime(nativeHandle_); + } + + /** + * @return total number of index block hits + */ + public long getBlockCacheIndexHitCount() { + return getBlockCacheIndexHitCount(nativeHandle_); + } + + /** + * @return total number of standalone handles lookup from secondary cache + */ + public long getBlockCacheStandaloneHandleCount() { + return getBlockCacheStandaloneHandleCount(nativeHandle_); + } + + /** + * @return total number of real handles lookup from secondary cache that are inserted into + * primary cache + */ + public long getBlockCacheRealHandleCount() { + return getBlockCacheRealHandleCount(nativeHandle_); + } + + /** + * @return total number of index block reads + */ + public long 
getIndexBlockReadCount() { + return getIndexBlockReadCount(nativeHandle_); + } + + /** + * @return total number of filter block hits + */ + public long getBlockCacheFilterHitCount() { + return getBlockCacheFilterHitCount(nativeHandle_); + } + + /** + * @return total number of filter block reads + */ + public long getFilterBlockReadCount() { + return getFilterBlockReadCount(nativeHandle_); + } + + /** + * @return total number of compression dictionary block reads + */ + public long getCompressionDictBlockReadCount() { + return getCompressionDictBlockReadCount(nativeHandle_); + } + + /** + * @return total number of secondary cache hits + */ + public long getSecondaryCacheHitCount() { + return getSecondaryCacheHitCount(nativeHandle_); + } + + /** + * @return total number of real handles inserted into secondary cache + */ + public long getCompressedSecCacheInsertRealCount() { + return getCompressedSecCacheInsertRealCount(nativeHandle_); + } + + /** + * @return total number of dummy handles inserted into secondary cache + */ + public long getCompressedSecCacheInsertDummyCount() { + return getCompressedSecCacheInsertDummyCount(nativeHandle_); + } + + /** + * @return bytes for vals before compression in secondary cache + */ + public long getCompressedSecCacheUncompressedBytes() { + return getCompressedSecCacheUncompressedBytes(nativeHandle_); + } + + /** + * @return bytes for vals after compression in secondary cache + */ + public long getCompressedSecCacheCompressedBytes() { + return getCompressedSecCacheCompressedBytes(nativeHandle_); + } + + /** + * @return total nanos spent on block checksum + */ + public long getBlockChecksumTime() { + return getBlockChecksumTime(nativeHandle_); + } + + /** + * + * @return total nanos spent on block decompression + */ + public long getBlockDecompressTime() { + return getBlockDecompressTime(nativeHandle_); + } + + /** + * @return bytes for vals returned by Get + */ + public long getReadBytes() { + return getReadBytes(nativeHandle_); + } + + /** + * @return bytes for vals returned by MultiGet + */ + public long getMultigetReadBytes() { + return getMultigetReadBytes(nativeHandle_); + } + + /** + * @return bytes for keys/vals decoded by iterator + */ + public long getIterReadBytes() { + return getIterReadBytes(nativeHandle_); + } + + /** + * @return total number of blob cache hits + */ + public long getBlobCacheHitCount() { + return getBlobCacheHitCount(nativeHandle_); + } + + /** + * @return total number of blob reads (with IO) + */ + public long getBlobReadCount() { + return getBlobReadCount(nativeHandle_); + } + + /** + * @return total number of bytes from blob reads + */ + public long getBlobReadByte() { + return getBlobReadByte(nativeHandle_); + } + + /** + * @return total nanos spent on blob reads + */ + public long getBlobReadTime() { + return getBlobReadTime(nativeHandle_); + } + + /** + * @return total nanos spent on blob checksum + */ + public long getBlobChecksumTime() { + return getBlobChecksumTime(nativeHandle_); + } + + /** + * @return total nanos spent on blob decompression + */ + public long getBlobDecompressTime() { + return getBlobDecompressTime(nativeHandle_); + } + + /** + * total number of internal keys skipped over during iteration. + * There are several reasons for it: + * 1. when calling Next(), the iterator is in the position of the previous + * key, so that we'll need to skip it. It means this counter will always + * be incremented in Next(). + * 2. 
when calling Next(), we need to skip internal entries for the previous + * keys that are overwritten. + * 3. when calling Next(), Seek() or SeekToFirst(), after previous key + * before calling Next(), the seek key in Seek() or the beginning for + * SeekToFirst(), there may be one or more deleted keys before the next + * valid key that the operation should place the iterator to. We need + * to skip both of the tombstone and updates hidden by the tombstones. The + * tombstones are not included in this counter, while previous updates + * hidden by the tombstones will be included here. + * 4. symmetric cases for Prev() and SeekToLast() + * internal_recent_skipped_count is not included in this counter. + */ + public long getInternalKeySkippedCount() { + return getInternalKeySkippedCount(nativeHandle_); + } + + /** + * Total number of deletes and single deletes skipped over during iteration + * When calling Next(), Seek() or SeekToFirst(), after previous position + * before calling Next(), the seek key in Seek() or the beginning for + * SeekToFirst(), there may be one or more deleted keys before the next valid + * key. Every deleted key is counted once. We don't recount here if there are + * still older updates invalidated by the tombstones. + */ + public long getInternalDeleteSkippedCount() { + return getInternalDeleteSkippedCount(nativeHandle_); + } + + /** + * How many times iterators skipped over internal keys that are more recent + * than the snapshot that iterator is using. + */ + public long getInternalRecentSkippedCount() { + return getInternalRecentSkippedCount(nativeHandle_); + } + + /** + * How many merge operands were fed into the merge operator by iterators. + * Note: base values are not included in the count. + */ + public long getInternalMergeCount() { + return getInternalMergeCount(nativeHandle_); + } + + /** + * How many merge operands were fed into the merge operator by point lookups. + * Note: base values are not included in the count. + */ + public long getInternalMergePointLookupCount() { + return getInternalMergePointLookupCount(nativeHandle_); + } + + /** + * Number of times we reseeked inside a merging iterator, specifically to skip + * after or before a range of keys covered by a range deletion in a newer LSM + * component. 
+ */ + public long getInternalRangeDelReseekCount() { + return getInternalRangeDelReseekCount(nativeHandle_); + } + + /** + * @return total nanos spent on getting snapshot + */ + public long getSnapshotTime() { + return getSnapshotTime(nativeHandle_); + } + + /** + * @return total nanos spent on querying memtables + */ + public long getFromMemtableTime() { + return getFromMemtableTime(nativeHandle_); + } + + /** + * @return number of mem tables queried + */ + public long getFromMemtableCount() { + return getFromMemtableCount(nativeHandle_); + } + + /** + * @return total nanos spent after Get() finds a key + */ + public long getPostProcessTime() { + return getPostProcessTime(nativeHandle_); + } + + /** + * @return total nanos reading from output files + */ + public long getFromOutputFilesTime() { + return getFromOutputFilesTime(nativeHandle_); + } + + /** + * @return total nanos spent on seeking memtable + */ + public long getSeekOnMemtableTime() { + return getSeekOnMemtableTime(nativeHandle_); + } + + /** + * number of seeks issued on memtable + * (including SeekForPrev but not SeekToFirst and SeekToLast) + * @return number of seeks issued on memtable + */ + public long getSeekOnMemtableCount() { + return getSeekOnMemtableCount(nativeHandle_); + } + + /** + * @return number of Next()s issued on memtable + */ + public long getNextOnMemtableCount() { + return getNextOnMemtableCount(nativeHandle_); + } + + /** + * @return number of Prev()s issued on memtable + */ + public long getPrevOnMemtableCount() { + return getPrevOnMemtableCount(nativeHandle_); + } + + /** + * @return total nanos spent on seeking child iters + */ + public long getSeekChildSeekTime() { + return getSeekChildSeekTime(nativeHandle_); + } + + /** + * @return number of seek issued in child iterators + */ + public long getSeekChildSeekCount() { + return getSeekChildSeekCount(nativeHandle_); + } + + /** + * @return total nanos spent on the merge min heap + */ + public long getSeekMinHeapTime() { + return getSeekMinHeapTime(nativeHandle_); + } + + /** + * @return total nanos spent on the merge max heap + */ + public long getSeekMaxHeapTime() { + return getSeekMaxHeapTime(nativeHandle_); + } + + /** + * @return total nanos spent on seeking the internal entries + */ + public long getSeekInternalSeekTime() { + return getSeekInternalSeekTime(nativeHandle_); + } + + /** + * @return total nanos spent on iterating internal entries to find the next user entry + */ + public long getFindNextUserEntryTime() { + return getFindNextUserEntryTime(nativeHandle_); + } + + /** + * @return total nanos spent on writing to WAL + */ + public long getWriteWalTime() { + return getWriteWalTime(nativeHandle_); + } + + /** + * @return total nanos spent on writing to mem tables + */ + public long getWriteMemtableTime() { + return getWriteMemtableTime(nativeHandle_); + } + + /** + * @return total nanos spent on delaying or throttling write + */ + public long getWriteDelayTime() { + return getWriteDelayTime(nativeHandle_); + } + + /** + * @return total nanos spent on switching memtable/wal and scheduling flushes/compactions. 
+ */ + public long getWriteSchedulingFlushesCompactionsTime() { + return getWriteSchedulingFlushesCompactionsTime(nativeHandle_); + } + + /** + * @return total nanos spent on writing a record, excluding the above four things + */ + public long getWritePreAndPostProcessTime() { + return getWritePreAndPostProcessTime(nativeHandle_); + } + + /** + * @return time spent waiting for other threads of the batch group + */ + public long getWriteThreadWaitNanos() { + return getWriteThreadWaitNanos(nativeHandle_); + } + + /** + * @return time spent on acquiring DB mutex. + */ + public long getDbMutexLockNanos() { + return getDbMutexLockNanos(nativeHandle_); + } + + /** + * @return Time spent on waiting with a condition variable created with DB mutex. + */ + public long getDbConditionWaitNanos() { + return getDbConditionWaitNanos(nativeHandle_); + } + + /** + * @return Time spent on merge operator. + */ + public long getMergeOperatorTimeNanos() { + return getMergeOperatorTimeNanos(nativeHandle_); + } + + /** + * @return Time spent on reading index block from block cache or SST file + */ + public long getReadIndexBlockNanos() { + return getReadIndexBlockNanos(nativeHandle_); + } + + /** + * @return Time spent on reading filter block from block cache or SST file + */ + public long getReadFilterBlockNanos() { + return getReadFilterBlockNanos(nativeHandle_); + } + + /** + * @return Time spent on creating data block iterator + */ + public long getNewTableBlockIterNanos() { + return getNewTableBlockIterNanos(nativeHandle_); + } + + /** + * @return Time spent on creating a iterator of an SST file. + */ + public long getNewTableIteratorNanos() { + return getNewTableIteratorNanos(nativeHandle_); + } + + /** + * @return Time spent on seeking a key in data/index blocks + */ + public long getBlockSeekNanos() { + return getBlockSeekNanos(nativeHandle_); + } + /** + * @return Time spent on finding or creating a table reader + */ + public long getFindTableNanos() { + return getFindTableNanos(nativeHandle_); + } + + /** + * @return total number of mem table bloom hits + */ + public long getBloomMemtableHitCount() { + return getBloomMemtableHitCount(nativeHandle_); + } + + // total number of mem table bloom misses + public long getBloomMemtableMissCount() { + return getBloomMemtableMissCount(nativeHandle_); + } + + /** + * @return total number of SST bloom hits + */ + public long getBloomSstHitCount() { + return getBloomSstHitCount(nativeHandle_); + } + + /** + * @return total number of SST bloom misses + */ + public long getBloomSstMissCount() { + return getBloomSstMissCount(nativeHandle_); + } + + /** + * @return Time spent waiting on key locks in transaction lock manager. + */ + public long getKeyLockWaitTime() { + return getKeyLockWaitTime(nativeHandle_); + } + /** + * @return number of times acquiring a lock was blocked by another transaction. + */ + public long getKeyLockWaitCount() { + return getKeyLockWaitCount(nativeHandle_); + } + + /** + * @return Total time spent in Env filesystem operations. These are only populated when TimedEnv + * is used. 
+ */ + public long getEnvNewSequentialFileNanos() { + return getEnvNewSequentialFileNanos(nativeHandle_); + } + + public long getEnvNewRandomAccessFileNanos() { + return getEnvNewRandomAccessFileNanos(nativeHandle_); + } + + public long getEnvNewWritableFileNanos() { + return getEnvNewWritableFileNanos(nativeHandle_); + } + + public long getEnvReuseWritableFileNanos() { + return getEnvReuseWritableFileNanos(nativeHandle_); + } + + public long getEnvNewRandomRwFileNanos() { + return getEnvNewRandomRwFileNanos(nativeHandle_); + } + + public long getEnvNewDirectoryNanos() { + return getEnvNewDirectoryNanos(nativeHandle_); + } + + public long getEnvFileExistsNanos() { + return getEnvFileExistsNanos(nativeHandle_); + } + public long getEnvGetChildrenNanos() { + return getEnvGetChildrenNanos(nativeHandle_); + } + + public long getEnvGetChildrenFileAttributesNanos() { + return getEnvGetChildrenFileAttributesNanos(nativeHandle_); + } + + public long getEnvDeleteFileNanos() { + return getEnvDeleteFileNanos(nativeHandle_); + } + + public long getEnvCreateDirNanos() { + return getEnvCreateDirNanos(nativeHandle_); + } + public long getEnvCreateDirIfMissingNanos() { + return getEnvCreateDirIfMissingNanos(nativeHandle_); + } + + public long getEnvDeleteDirNanos() { + return getEnvDeleteDirNanos(nativeHandle_); + } + + public long getEnvGetFileSizeNanos() { + return getEnvGetFileSizeNanos(nativeHandle_); + } + + public long getEnvGetFileModificationTimeNanos() { + return getEnvGetFileModificationTimeNanos(nativeHandle_); + } + + public long getEnvRenameFileNanos() { + return getEnvRenameFileNanos(nativeHandle_); + } + + public long getEnvLinkFileNanos() { + return getEnvLinkFileNanos(nativeHandle_); + } + + public long getEnvLockFileNanos() { + return getEnvLockFileNanos(nativeHandle_); + } + + public long getEnvUnlockFileNanos() { + return getEnvUnlockFileNanos(nativeHandle_); + } + + public long getEnvNewLoggerNanos() { + return getEnvNewLoggerNanos(nativeHandle_); + } + + public long getGetCpuNanos() { + return getGetCpuNanos(nativeHandle_); + } + + public long getIterNextCpuNanos() { + return getIterNextCpuNanos(nativeHandle_); + } + public long getIterPrevCpuNanos() { + return getIterPrevCpuNanos(nativeHandle_); + } + + public long getIterSeekCpuNanos() { + return getIterSeekCpuNanos(nativeHandle_); + } + + /** + * @return Time spent in encrypting data. Populated when EncryptedEnv is used. + */ + public long getEncryptDataNanos() { + return getEncryptDataNanos(nativeHandle_); + } + + /** + * @return Time spent in decrypting data. Populated when EncryptedEnv is used. + */ + public long getDecryptDataNanos() { + return getDecryptDataNanos(nativeHandle_); + } + + public long getNumberAsyncSeek() { + return getNumberAsyncSeek(nativeHandle_); + } + + @Override + protected void disposeInternal(long handle) { + // Nothing to do. Perf context is valid for all the time of application is running. 
+ } + + private native void reset(final long nativeHandle); + + private native long getUserKeyComparisonCount(final long handle); + private native long getBlockCacheHitCount(final long handle); + private native long getBlockReadCount(final long handle); + private native long getBlockReadByte(final long handle); + private native long getBlockReadTime(final long handle); + private native long getBlockReadCpuTime(final long handle); + private native long getBlockCacheIndexHitCount(final long handle); + private native long getBlockCacheStandaloneHandleCount(final long handle); + private native long getBlockCacheRealHandleCount(final long handle); + private native long getIndexBlockReadCount(final long handle); + private native long getBlockCacheFilterHitCount(final long handle); + private native long getFilterBlockReadCount(final long handle); + private native long getCompressionDictBlockReadCount(final long handle); + + private native long getSecondaryCacheHitCount(long handle); + private native long getCompressedSecCacheInsertRealCount(long handle); + + private native long getCompressedSecCacheInsertDummyCount(final long handle); + private native long getCompressedSecCacheUncompressedBytes(final long handle); + private native long getCompressedSecCacheCompressedBytes(final long handle); + private native long getBlockChecksumTime(final long handle); + private native long getBlockDecompressTime(final long handle); + private native long getReadBytes(final long handle); + private native long getMultigetReadBytes(final long handle); + private native long getIterReadBytes(final long handle); + private native long getBlobCacheHitCount(final long handle); + private native long getBlobReadCount(final long handle); + private native long getBlobReadByte(final long handle); + private native long getBlobReadTime(final long handle); + private native long getBlobChecksumTime(final long handle); + private native long getBlobDecompressTime(final long handle); + private native long getInternalKeySkippedCount(final long handle); + private native long getInternalDeleteSkippedCount(final long handle); + private native long getInternalRecentSkippedCount(final long handle); + private native long getInternalMergeCount(final long handle); + private native long getInternalMergePointLookupCount(final long handle); + private native long getInternalRangeDelReseekCount(final long handle); + private native long getSnapshotTime(final long handle); + private native long getFromMemtableTime(final long handle); + private native long getFromMemtableCount(final long handle); + private native long getPostProcessTime(final long handle); + private native long getFromOutputFilesTime(final long handle); + private native long getSeekOnMemtableTime(final long handle); + private native long getSeekOnMemtableCount(final long handle); + private native long getNextOnMemtableCount(final long handle); + private native long getPrevOnMemtableCount(final long handle); + private native long getSeekChildSeekTime(final long handle); + private native long getSeekChildSeekCount(final long handle); + private native long getSeekMinHeapTime(final long handle); + private native long getSeekMaxHeapTime(final long handle); + private native long getSeekInternalSeekTime(final long handle); + private native long getFindNextUserEntryTime(final long handle); + private native long getWriteWalTime(long handle); + private native long getWriteMemtableTime(long handle); + private native long getWriteDelayTime(long handle); + private native long 
getWriteSchedulingFlushesCompactionsTime(long handle); + private native long getWritePreAndPostProcessTime(long handle); + private native long getWriteThreadWaitNanos(long handle); + private native long getDbMutexLockNanos(long handle); + private native long getDbConditionWaitNanos(long handle); + private native long getMergeOperatorTimeNanos(long handle); + private native long getReadIndexBlockNanos(long handle); + private native long getReadFilterBlockNanos(long handle); + private native long getNewTableBlockIterNanos(long handle); + private native long getNewTableIteratorNanos(long handle); + private native long getBlockSeekNanos(long handle); + private native long getFindTableNanos(long handle); + private native long getBloomMemtableHitCount(long handle); + private native long getBloomMemtableMissCount(long handle); + private native long getBloomSstHitCount(long handle); + private native long getBloomSstMissCount(long handle); + private native long getKeyLockWaitTime(long handle); + private native long getKeyLockWaitCount(long handle); + private native long getEnvNewSequentialFileNanos(long handle); + private native long getEnvNewRandomAccessFileNanos(long handle); + private native long getEnvNewWritableFileNanos(long handle); + private native long getEnvReuseWritableFileNanos(long handle); + private native long getEnvNewRandomRwFileNanos(long handle); + private native long getEnvNewDirectoryNanos(long handle); + private native long getEnvFileExistsNanos(long handle); + private native long getEnvGetChildrenNanos(long handle); + private native long getEnvGetChildrenFileAttributesNanos(long handle); + private native long getEnvDeleteFileNanos(long handle); + private native long getEnvCreateDirNanos(long handle); + private native long getEnvCreateDirIfMissingNanos(long handle); + private native long getEnvDeleteDirNanos(long handle); + private native long getEnvGetFileSizeNanos(long handle); + private native long getEnvGetFileModificationTimeNanos(long handle); + private native long getEnvRenameFileNanos(long handle); + private native long getEnvLinkFileNanos(long handle); + private native long getEnvLockFileNanos(long handle); + private native long getEnvUnlockFileNanos(long handle); + private native long getEnvNewLoggerNanos(long handle); + private native long getGetCpuNanos(long nativeHandle_); + private native long getIterNextCpuNanos(long nativeHandle_); + private native long getIterPrevCpuNanos(long nativeHandle_); + private native long getIterSeekCpuNanos(long nativeHandle_); + private native long getEncryptDataNanos(long nativeHandle_); + private native long getDecryptDataNanos(long nativeHandle_); + private native long getNumberAsyncSeek(long nativeHandle_); +} diff --git a/java/src/main/java/org/rocksdb/PerfLevel.java b/java/src/main/java/org/rocksdb/PerfLevel.java new file mode 100644 index 000000000000..332e6d7d977b --- /dev/null +++ b/java/src/main/java/org/rocksdb/PerfLevel.java @@ -0,0 +1,60 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +public enum PerfLevel { + /** + * Unknown setting + */ + UNINITIALIZED((byte) 0), + /** + * disable perf stats + */ + DISABLE((byte) 1), + /** + * enable only count stats + */ + ENABLE_COUNT((byte) 2), + /** + * Other than count stats, also enable time stats except for mutexes + */ + ENABLE_TIME_EXCEPT_FOR_MUTEX((byte) 3), + + /** + * Other than time, also measure CPU time counters. Still don't measure + * time (neither wall time nor CPU time) for mutexes + */ + ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX((byte) 4), + /** + * enable count and time stats + */ + ENABLE_TIME((byte) 5), + + /** + * Do not use + * @deprecated It's here to just keep parity with C++ API. + */ + @Deprecated OUT_OF_BOUNDS((byte) 6); + + PerfLevel(byte _value) { + this._value = _value; + } + + private final byte _value; + + public byte getValue() { + return _value; + } + + public static PerfLevel getPerfLevel(byte level) { + for (PerfLevel l : PerfLevel.values()) { + if (l.getValue() == level) { + return l; + } + } + throw new IllegalArgumentException("Uknknown PerfLevel constant : " + level); + } +} diff --git a/java/src/main/java/org/rocksdb/PersistentCache.java b/java/src/main/java/org/rocksdb/PersistentCache.java index aed565297315..5297111e6f98 100644 --- a/java/src/main/java/org/rocksdb/PersistentCache.java +++ b/java/src/main/java/org/rocksdb/PersistentCache.java @@ -18,9 +18,9 @@ public PersistentCache(final Env env, final String path, final long size, logger.nativeHandle_, optimizedForNvm)); } - private native static long newPersistentCache(final long envHandle, - final String path, final long size, final long loggerHandle, - final boolean optimizedForNvm) throws RocksDBException; + private static native long newPersistentCache(final long envHandle, final String path, + final long size, final long loggerHandle, final boolean optimizedForNvm) + throws RocksDBException; @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/PlainTableConfig.java b/java/src/main/java/org/rocksdb/PlainTableConfig.java index c099981678b7..46077ba56530 100644 --- a/java/src/main/java/org/rocksdb/PlainTableConfig.java +++ b/java/src/main/java/org/rocksdb/PlainTableConfig.java @@ -48,7 +48,7 @@ public PlainTableConfig() { * @param keySize the length of the user key. * @return the reference to the current config. */ - public PlainTableConfig setKeySize(int keySize) { + public PlainTableConfig setKeySize(final int keySize) { keySize_ = keySize; return this; } @@ -68,7 +68,7 @@ public int keySize() { * @param bitsPerKey the number of bits per key for bloom filer. * @return the reference to the current config. */ - public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) { + public PlainTableConfig setBloomBitsPerKey(final int bitsPerKey) { bloomBitsPerKey_ = bitsPerKey; return this; } @@ -89,7 +89,7 @@ public int bloomBitsPerKey() { * @param ratio the hash table ratio. * @return the reference to the current config. */ - public PlainTableConfig setHashTableRatio(double ratio) { + public PlainTableConfig setHashTableRatio(final double ratio) { hashTableRatio_ = ratio; return this; } @@ -110,7 +110,7 @@ public double hashTableRatio() { * @param sparseness the index sparseness. * @return the reference to the current config. 
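As a usage sketch for the new PerfContext and PerfLevel classes above: PerfContext is a read-only view over the native per-thread perf counters, and PerfLevel selects how much is collected. The accessors used to obtain them below (db.setPerfLevel(...) and db.getPerfContext()) are assumed to be provided by the companion RocksDB.java changes and are not part of this hunk, so treat them as placeholders for whatever entry point the final API exposes.

    import org.rocksdb.*;

    public class PerfContextSketch {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        try (final Options options = new Options().setCreateIfMissing(true);
             final RocksDB db = RocksDB.open(options, "/tmp/perf-context-sketch")) {
          db.setPerfLevel(PerfLevel.ENABLE_TIME_EXCEPT_FOR_MUTEX); // assumed accessor
          final PerfContext ctx = db.getPerfContext();             // assumed accessor
          ctx.reset();                                  // zero the per-thread counters
          db.put("k".getBytes(), "v".getBytes());
          db.get("k".getBytes());
          System.out.println("block cache hits: " + ctx.getBlockCacheHitCount());
          System.out.println("memtable lookups: " + ctx.getFromMemtableCount());
          // PerfLevel values round-trip through their byte encoding:
          assert PerfLevel.getPerfLevel((byte) 3) == PerfLevel.ENABLE_TIME_EXCEPT_FOR_MUTEX;
        }
      }
    }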
*/ - public PlainTableConfig setIndexSparseness(int sparseness) { + public PlainTableConfig setIndexSparseness(final int sparseness) { indexSparseness_ = sparseness; return this; } @@ -134,7 +134,7 @@ public long indexSparseness() { * @param hugePageTlbSize huge page tlb size * @return the reference to the current config. */ - public PlainTableConfig setHugePageTlbSize(int hugePageTlbSize) { + public PlainTableConfig setHugePageTlbSize(final int hugePageTlbSize) { this.hugePageTlbSize_ = hugePageTlbSize; return this; } @@ -166,7 +166,7 @@ public int hugePageTlbSize() { * @param encodingType {@link org.rocksdb.EncodingType} value. * @return the reference to the current config. */ - public PlainTableConfig setEncodingType(EncodingType encodingType) { + public PlainTableConfig setEncodingType(final EncodingType encodingType) { this.encodingType_ = encodingType; return this; } @@ -188,7 +188,7 @@ public EncodingType encodingType() { * scan mode shall be enabled. * @return the reference to the current config. */ - public PlainTableConfig setFullScanMode(boolean fullScanMode) { + public PlainTableConfig setFullScanMode(final boolean fullScanMode) { this.fullScanMode_ = fullScanMode; return this; } @@ -212,7 +212,7 @@ public boolean fullScanMode() { * be stored in a file * @return the reference to the current config. */ - public PlainTableConfig setStoreIndexInFile(boolean storeIndexInFile) { + public PlainTableConfig setStoreIndexInFile(final boolean storeIndexInFile) { this.storeIndexInFile_ = storeIndexInFile; return this; } diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java old mode 100755 new mode 100644 index 0836f0f18435..481101fc9320 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -7,7 +7,7 @@ /** * The class that controls the get behavior. - * + *

    * Note that dispose() must be called before an Options instance * become out-of-scope to release the allocated memory in c++. */ @@ -27,13 +27,13 @@ public ReadOptions(final boolean verifyChecksums, final boolean fillCache) { /** * Copy constructor. - * + *

    * NOTE: This does a shallow copy, which means snapshot, iterate_upper_bound * and other pointers will be cloned! * * @param other The ReadOptions to copy. */ - public ReadOptions(ReadOptions other) { + public ReadOptions(final ReadOptions other) { super(copyReadOptions(other.nativeHandle_)); this.iterateLowerBoundSlice_ = other.iterateLowerBoundSlice_; this.iterateUpperBoundSlice_ = other.iterateUpperBoundSlice_; @@ -106,7 +106,7 @@ public ReadOptions setFillCache(final boolean fillCache) { */ public Snapshot snapshot() { assert(isOwningHandle()); - long snapshotHandle = snapshot(nativeHandle_); + final long snapshotHandle = snapshot(nativeHandle_); if (snapshotHandle != 0) { return new Snapshot(snapshotHandle); } @@ -128,7 +128,7 @@ public ReadOptions setSnapshot(final Snapshot snapshot) { if (snapshot != null) { setSnapshot(nativeHandle_, snapshot.nativeHandle_); } else { - setSnapshot(nativeHandle_, 0l); + setSnapshot(nativeHandle_, 0L); } return this; } @@ -163,9 +163,6 @@ public ReadOptions setReadTier(final ReadTier readTier) { * added data) and is optimized for sequential reads. It will return records * that were inserted into the database after the creation of the iterator. * Default: false - * - * Not supported in {@code ROCKSDB_LITE} mode! - * * @return true if tailing iterator is enabled. */ public boolean tailing() { @@ -179,7 +176,6 @@ public boolean tailing() { * added data) and is optimized for sequential reads. It will return records * that were inserted into the database after the creation of the iterator. * Default: false - * Not supported in ROCKSDB_LITE mode! * * @param tailing if true, then tailing iterator will be enabled. * @return the reference to the current ReadOptions. @@ -260,7 +256,7 @@ public boolean prefixSameAsStart() { * Enforce that the iterator only iterates over the same prefix as the seek. * This option is effective only for prefix seeks, i.e. prefix_extractor is * non-null for the column family and {@link #totalOrderSeek()} is false. - * Unlike iterate_upper_bound, {@link #setPrefixSameAsStart(boolean)} only + * Unlike iterate_upper_bound, {@code #setPrefixSameAsStart(boolean)} only * works within a prefix but in both directions. * * @param prefixSameAsStart if true, then the iterator only iterates over the @@ -304,7 +300,7 @@ public ReadOptions setPinData(final boolean pinData) { * If true, when PurgeObsoleteFile is called in CleanupIteratorState, we * schedule a background job in the flush job queue and delete obsolete files * in background. - * + *

    * Default: false * * @return true when PurgeObsoleteFile is called in CleanupIteratorState @@ -318,7 +314,7 @@ public boolean backgroundPurgeOnIteratorCleanup() { * If true, when PurgeObsoleteFile is called in CleanupIteratorState, we * schedule a background job in the flush job queue and delete obsolete files * in background. - * + *

    * Default: false * * @param backgroundPurgeOnIteratorCleanup true when PurgeObsoleteFile is @@ -337,7 +333,7 @@ public ReadOptions setBackgroundPurgeOnIteratorCleanup( * If non-zero, NewIterator will create a new table reader which * performs reads of the given size. Using a large size (> 2MB) can * improve the performance of forward iteration on spinning disks. - * + *

    * Default: 0 * * @return The readahead size is bytes @@ -351,7 +347,7 @@ public long readaheadSize() { * If non-zero, NewIterator will create a new table reader which * performs reads of the given size. Using a large size (> 2MB) can * improve the performance of forward iteration on spinning disks. - * + *

    * Default: 0 * * @param readaheadSize The readahead size is bytes @@ -379,7 +375,7 @@ public long maxSkippableInternalKeys() { * A threshold for the number of keys that can be skipped before failing an * iterator seek as incomplete. The default value of 0 should be used to * never fail a request as incomplete, even on skipping too many keys. - * + *

    * Default: 0 * * @param maxSkippableInternalKeys the number of keys that can be skipped @@ -398,7 +394,7 @@ public ReadOptions setMaxSkippableInternalKeys( * If true, keys deleted using the DeleteRange() API will be visible to * readers until they are naturally deleted during compaction. This improves * read performance in DBs with many range deletions. - * + *

    * Default: false * * @return true if keys deleted using the DeleteRange() API will be visible @@ -412,7 +408,7 @@ public boolean ignoreRangeDeletions() { * If true, keys deleted using the DeleteRange() API will be visible to * readers until they are naturally deleted during compaction. This improves * read performance in DBs with many range deletions. - * + *

    * Default: false * * @param ignoreRangeDeletions true if keys deleted using the DeleteRange() @@ -429,14 +425,14 @@ public ReadOptions setIgnoreRangeDeletions(final boolean ignoreRangeDeletions) { * Defines the smallest key at which the backward * iterator can return an entry. Once the bound is passed, * {@link RocksIterator#isValid()} will be false. - * + *

    * The lower bound is inclusive i.e. the bound value is a valid * entry. - * + *

    * If prefix_extractor is not null, the Seek target and `iterate_lower_bound` * need to have the same prefix. This is because ordering is not guaranteed * outside of prefix domain. - * + *

    * Default: null * * @param iterateLowerBound Slice representing the lower bound @@ -454,7 +450,7 @@ public ReadOptions setIterateLowerBound(final AbstractSlice iterateLowerBound /** * Returns the smallest key at which the backward * iterator can return an entry. - * + *

    * The lower bound is inclusive i.e. the bound value is a valid entry. * * @return the smallest key, or null if there is no lower bound defined. @@ -472,15 +468,15 @@ public Slice iterateLowerBound() { /** * Defines the extent up to which the forward iterator - * can returns entries. Once the bound is reached, + * can return entries. Once the bound is reached, * {@link RocksIterator#isValid()} will be false. - * + *

    * The upper bound is exclusive i.e. the bound value is not a valid entry. - * + *

    * If prefix_extractor is not null, the Seek target and iterate_upper_bound * need to have the same prefix. This is because ordering is not guaranteed * outside of prefix domain. - * + *

    * Default: null * * @param iterateUpperBound Slice representing the upper bound @@ -498,7 +494,7 @@ public ReadOptions setIterateUpperBound(final AbstractSlice iterateUpperBound /** * Returns the largest key at which the forward * iterator can return an entry. - * + *

    * The upper bound is exclusive i.e. the bound value is not a valid entry. * * @return the largest key, or null if there is no upper bound defined. @@ -520,7 +516,7 @@ public Slice iterateUpperBound() { * properties of each table during iteration. If the callback returns false, * the table will not be scanned. This option only affects Iterators and has * no impact on point lookups. - * + *

    * Default: null (every table will be scanned) * * @param tableFilter the table filter for the callback. @@ -537,8 +533,6 @@ public ReadOptions setTableFilter(final AbstractTableFilter tableFilter) { * When true, by default use total_order_seek = true, and RocksDB can * selectively enable prefix seek mode if won't generate a different result * from total_order_seek, based on seek key, and iterator upper bound. - * Not supported in ROCKSDB_LITE mode, in the way that even with value true - * prefix mode is not used. * Default: false * * @return true if auto prefix mode is set. @@ -553,8 +547,6 @@ public boolean autoPrefixMode() { * When true, by default use total_order_seek = true, and RocksDB can * selectively enable prefix seek mode if won't generate a different result * from total_order_seek, based on seek key, and iterator upper bound. - * Not supported in ROCKSDB_LITE mode, in the way that even with value true - * prefix mode is not used. * Default: false * @param mode auto prefix mode * @return the reference to the current ReadOptions. @@ -576,19 +568,19 @@ public ReadOptions setAutoPrefixMode(final boolean mode) { * only the most recent version visible to timestamp is returned. * The user-specified timestamp feature is still under active development, * and the API is subject to change. - * + *

    * Default: null * @see #iterStartTs() * @return Reference to timestamp or null if there is no timestamp defined. */ + @SuppressWarnings("PMD.ConfusingTernary") public Slice timestamp() { assert (isOwningHandle()); final long timestampSliceHandle = timestamp(nativeHandle_); - if (timestampSliceHandle != 0) { - return new Slice(timestampSliceHandle); - } else { + if (timestampSliceHandle == 0) { return null; } + return new Slice(timestampSliceHandle); } /** @@ -602,7 +594,7 @@ public Slice timestamp() { * only the most recent version visible to timestamp is returned. * The user-specified timestamp feature is still under active development, * and the API is subject to change. - * + *

    * Default: null * @see #setIterStartTs(AbstractSlice) * @param timestamp Slice representing the timestamp @@ -626,19 +618,19 @@ public ReadOptions setTimestamp(final AbstractSlice timestamp) { * only the most recent version visible to timestamp is returned. * The user-specified timestamp feature is still under active development, * and the API is subject to change. - * + *

    * Default: null * @return Reference to lower bound timestamp or null if there is no lower bound timestamp * defined. */ + @SuppressWarnings("PMD.ConfusingTernary") public Slice iterStartTs() { assert (isOwningHandle()); final long iterStartTsHandle = iterStartTs(nativeHandle_); - if (iterStartTsHandle != 0) { - return new Slice(iterStartTsHandle); - } else { + if (iterStartTsHandle == 0) { return null; } + return new Slice(iterStartTsHandle); } /** @@ -652,7 +644,7 @@ public Slice iterStartTs() { * only the most recent version visible to timestamp is returned. * The user-specified timestamp feature is still under active development, * and the API is subject to change. - * + *

    * Default: null * * @param iterStartTs Reference to lower bound timestamp or null if there is no lower bound @@ -735,7 +727,7 @@ public ReadOptions setIoTimeout(final long ioTimeout) { * It limits the maximum cumulative value size of the keys in batch while * reading through MultiGet. Once the cumulative value size exceeds this * soft limit then all the remaining keys are returned with status Aborted. - * + *

    * Default: {@code std::numeric_limits::max()} * @return actual valueSizeSofLimit */ @@ -748,7 +740,7 @@ public long valueSizeSoftLimit() { * It limits the maximum cumulative value size of the keys in batch while * reading through MultiGet. Once the cumulative value size exceeds this * soft limit then all the remaining keys are returned with status Aborted. - * + *

    * Default: {@code std::numeric_limits::max()} * * @param valueSizeSoftLimit the maximum cumulative value size of the keys @@ -773,10 +765,9 @@ public ReadOptions setValueSizeSoftLimit(final long valueSizeSoftLimit) { private AbstractSlice timestampSlice_; private AbstractSlice iterStartTs_; - private native static long newReadOptions(); - private native static long newReadOptions(final boolean verifyChecksums, - final boolean fillCache); - private native static long copyReadOptions(long handle); + private static native long newReadOptions(); + private static native long newReadOptions(final boolean verifyChecksums, final boolean fillCache); + private static native long copyReadOptions(long handle); @Override protected final native void disposeInternal(final long handle); private native boolean verifyChecksums(long handle); diff --git a/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java b/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java index 6ee81d858c80..e96694313b4a 100644 --- a/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java +++ b/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java @@ -14,5 +14,5 @@ public RemoveEmptyValueCompactionFilter() { super(createNewRemoveEmptyValueCompactionFilter0()); } - private native static long createNewRemoveEmptyValueCompactionFilter0(); + private static native long createNewRemoveEmptyValueCompactionFilter0(); } diff --git a/java/src/main/java/org/rocksdb/RestoreOptions.java b/java/src/main/java/org/rocksdb/RestoreOptions.java index 54dc0e61c245..a6b43d476064 100644 --- a/java/src/main/java/org/rocksdb/RestoreOptions.java +++ b/java/src/main/java/org/rocksdb/RestoreOptions.java @@ -7,7 +7,7 @@ /** * RestoreOptions to control the behavior of restore. - * + *

    * Note that dispose() must be called before this instance become out-of-scope * to release the allocated memory in c++. * @@ -27,6 +27,6 @@ public RestoreOptions(final boolean keepLogFiles) { super(newRestoreOptions(keepLogFiles)); } - private native static long newRestoreOptions(boolean keepLogFiles); + private static native long newRestoreOptions(boolean keepLogFiles); @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/RocksCallbackObject.java b/java/src/main/java/org/rocksdb/RocksCallbackObject.java index 8d7a867ee7c3..2c4547b12918 100644 --- a/java/src/main/java/org/rocksdb/RocksCallbackObject.java +++ b/java/src/main/java/org/rocksdb/RocksCallbackObject.java @@ -11,10 +11,10 @@ * RocksCallbackObject is similar to {@link RocksObject} but varies * in its construction as it is designed for Java objects which have functions * which are called from C++ via JNI. - * + *

    * RocksCallbackObject is the base-class any RocksDB classes that acts as a * callback from some underlying underlying native C++ {@code rocksdb} object. - * + *

    * The use of {@code RocksObject} should always be preferred over * {@link RocksCallbackObject} if callbacks are not required. */ @@ -39,7 +39,7 @@ protected RocksCallbackObject(final long... nativeParameterHandles) { static /* @Nullable */ long[] toNativeHandleList( /* @Nullable */ final List objectList) { if (objectList == null) { - return null; + return new long[0]; } final int len = objectList.size(); final long[] handleList = new long[len]; diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 77484288f5a7..54e95e6e8a13 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -9,10 +9,7 @@ import java.io.IOException; import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.atomic.AtomicReference; import org.rocksdb.util.Environment; @@ -39,6 +36,9 @@ private enum LibraryState { RocksDB.loadLibrary(); } + static final String PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD = + "Performance optimization for a very specific workload"; + private final List ownedColumnFamilyHandles = new ArrayList<>(); /** @@ -48,6 +48,7 @@ private enum LibraryState { * java.io.tmpdir, however, you can override this temporary location by * setting the environment variable ROCKSDB_SHAREDLIB_DIR. */ + @SuppressWarnings("PMD.EmptyCatchBlock") public static void loadLibrary() { if (libraryLoaded.get() == LibraryState.LOADED) { return; @@ -97,6 +98,7 @@ public static void loadLibrary() { * @param paths a list of strings where each describes a directory * of a library. */ + @SuppressWarnings("PMD.EmptyCatchBlock") public static void loadLibrary(final List paths) { if (libraryLoaded.get() == LibraryState.LOADED) { return; @@ -178,9 +180,11 @@ protected RocksDB(final long nativeHandle) { * @see Options#setCreateIfMissing(boolean) */ public static RocksDB open(final String path) throws RocksDBException { - final Options options = new Options(); - options.setCreateIfMissing(true); - return open(options, path); + RocksDB.loadLibrary(); + try (Options options = new Options()) { + options.setCreateIfMissing(true); + return open(options, path); + } } /** @@ -216,8 +220,9 @@ public static RocksDB open(final String path, final List columnFamilyDescriptors, final List columnFamilyHandles) throws RocksDBException { - final DBOptions options = new DBOptions(); - return open(options, path, columnFamilyDescriptors, columnFamilyHandles); + try (DBOptions options = new DBOptions()) { + return open(options, path, columnFamilyDescriptors, columnFamilyHandles); + } } /** @@ -310,7 +315,8 @@ public static RocksDB open(final DBOptions options, final String path, db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + final ColumnFamilyHandle columnFamilyHandle = // NOPMD - CloseResource + new ColumnFamilyHandle(db, handles[i]); columnFamilyHandles.add(columnFamilyHandle); } @@ -333,17 +339,19 @@ public static RocksDB open(final DBOptions options, final String path, */ public static RocksDB openReadOnly(final String path) throws RocksDBException { + RocksDB.loadLibrary(); // This allows to use the rocksjni default Options instead of // the c++ one. 
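For orientation, the ReadOptions hunks above mostly touch Javadoc and parameter finality around the iterate-bound, timestamp and value-size-limit accessors. Below is a minimal, self-contained sketch of the bounded-iteration pattern that Javadoc describes; the path and keys are made up for illustration, and the bound slices are kept referenced by the ReadOptions instance and should not be closed while the iterator is in use.

    import org.rocksdb.*;

    public class IterateBoundsSketch {
      public static void main(final String[] args) throws RocksDBException {
        RocksDB.loadLibrary();
        try (final Options options = new Options().setCreateIfMissing(true);
             final RocksDB db = RocksDB.open(options, "/tmp/iterate-bounds-sketch")) {
          db.put("a1".getBytes(), "v".getBytes());
          db.put("a2".getBytes(), "v".getBytes());
          db.put("b1".getBytes(), "v".getBytes());
          // Lower bound is inclusive, upper bound is exclusive.
          try (final Slice lower = new Slice("a");
               final Slice upper = new Slice("b");
               final ReadOptions readOptions =
                   new ReadOptions().setIterateLowerBound(lower).setIterateUpperBound(upper);
               final RocksIterator it = db.newIterator(readOptions)) {
            for (it.seekToFirst(); it.isValid(); it.next()) {
              System.out.println(new String(it.key())); // prints a1 and a2, but not b1
            }
          }
        }
      }
    }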
- final Options options = new Options(); - return openReadOnly(options, path); + try (Options options = new Options()) { + return openReadOnly(options, path); + } } /** * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. - * + *

    * Options instance *should* not be disposed before all DBs using this options * instance have been closed. If user doesn't call options dispose explicitly, * then this options instance will be GC'd automatically. @@ -365,7 +373,7 @@ public static RocksDB openReadOnly(final Options options, final String path) * The factory constructor of RocksDB that opens a RocksDB instance in * Read-Only mode given the path to the database using the specified * options and db path. - * + *

    * Options instance *should* not be disposed before all DBs using this options * instance have been closed. If user doesn't call options dispose explicitly, * then this options instance will be GC'd automatically. @@ -411,8 +419,9 @@ public static RocksDB openReadOnly(final String path, throws RocksDBException { // This allows to use the rocksjni default Options instead of // the c++ one. - final DBOptions options = new DBOptions(); - return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); + try (DBOptions options = new DBOptions()) { + return openReadOnly(options, path, columnFamilyDescriptors, columnFamilyHandles, false); + } } /** @@ -490,7 +499,8 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + final ColumnFamilyHandle columnFamilyHandle = // NOPMD - CloseResource + new ColumnFamilyHandle(db, handles[i]); columnFamilyHandles.add(columnFamilyHandle); } @@ -501,7 +511,7 @@ public static RocksDB openReadOnly(final DBOptions options, final String path, /** * Open DB as secondary instance with only the default column family. - * + *

    * The secondary instance can dynamically tail the MANIFEST of * a primary that must have already been created. User can call * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up @@ -538,7 +548,7 @@ public static RocksDB openAsSecondary(final Options options, final String path, /** * Open DB as secondary instance with column families. * You can open a subset of column families in secondary mode. - * + *

    * The secondary instance can dynamically tail the MANIFEST of * a primary that must have already been created. User can call * {@link #tryCatchUpWithPrimary()} to make the secondary instance catch up @@ -586,7 +596,8 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path db.storeOptionsInstance(options); for (int i = 1; i < handles.length; i++) { - final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(db, handles[i]); + final ColumnFamilyHandle columnFamilyHandle = // NOPMD - CloseResource + new ColumnFamilyHandle(db, handles[i]); columnFamilyHandles.add(columnFamilyHandle); } @@ -598,18 +609,19 @@ public static RocksDB openAsSecondary(final DBOptions options, final String path /** * This is similar to {@link #close()} except that it * throws an exception if any error occurs. - * + *

    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *

    * See also {@link #close()}. * * @throws RocksDBException if an error occurs whilst closing. */ public void closeE() throws RocksDBException { - for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + for (final ColumnFamilyHandle columnFamilyHandle : // NOPMD - CloseResource + ownedColumnFamilyHandles) { columnFamilyHandle.close(); } ownedColumnFamilyHandles.clear(); @@ -626,17 +638,19 @@ public void closeE() throws RocksDBException { /** * This is similar to {@link #closeE()} except that it * silently ignores any errors. - * + *

    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *

    * See also {@link #close()}. */ + @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { - for (final ColumnFamilyHandle columnFamilyHandle : ownedColumnFamilyHandles) { + for (final ColumnFamilyHandle columnFamilyHandle : // NOPMD - CloseResource + ownedColumnFamilyHandles) { columnFamilyHandle.close(); } ownedColumnFamilyHandles.clear(); @@ -711,8 +725,8 @@ public List createColumnFamilies( columnFamilyOptions.nativeHandle_, cfNames); final List columnFamilyHandles = new ArrayList<>(cfHandles.length); - for (int i = 0; i < cfHandles.length; i++) { - final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + for (final long cfHandle : cfHandles) { + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandle); // NOPMD columnFamilyHandles.add(columnFamilyHandle); } ownedColumnFamilyHandles.addAll(columnFamilyHandles); @@ -744,14 +758,54 @@ public List createColumnFamilies( cfOptsHandles, cfNames); final List columnFamilyHandles = new ArrayList<>(cfHandles.length); - for (int i = 0; i < cfHandles.length; i++) { - final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandles[i]); + for (final long cfHandle : cfHandles) { + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, cfHandle); // NOPMD columnFamilyHandles.add(columnFamilyHandle); } ownedColumnFamilyHandles.addAll(columnFamilyHandles); return columnFamilyHandles; } + /** + * Creates a new column family with the name columnFamilyName and + * import external SST files specified in `metadata` allocates a + * ColumnFamilyHandle within an internal structure. + * The ColumnFamilyHandle is automatically disposed with DB disposal. + * + * @param columnFamilyDescriptor column family to be created. + * @return {@link org.rocksdb.ColumnFamilyHandle} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public ColumnFamilyHandle createColumnFamilyWithImport( + final ColumnFamilyDescriptor columnFamilyDescriptor, + final ImportColumnFamilyOptions importColumnFamilyOptions, + final ExportImportFilesMetaData metadata) throws RocksDBException { + List metadatas = new ArrayList<>(); + metadatas.add(metadata); + return createColumnFamilyWithImport( + columnFamilyDescriptor, importColumnFamilyOptions, metadatas); + } + + public ColumnFamilyHandle createColumnFamilyWithImport( + final ColumnFamilyDescriptor columnFamilyDescriptor, + final ImportColumnFamilyOptions importColumnFamilyOptions, + final List metadatas) throws RocksDBException { + final int metadataNum = metadatas.size(); + final long[] metadataHandleList = new long[metadataNum]; + for (int i = 0; i < metadataNum; i++) { + metadataHandleList[i] = metadatas.get(i).getNativeHandle(); + } + final ColumnFamilyHandle columnFamilyHandle = new ColumnFamilyHandle(this, + createColumnFamilyWithImport(nativeHandle_, columnFamilyDescriptor.getName(), + columnFamilyDescriptor.getName().length, + columnFamilyDescriptor.getOptions().nativeHandle_, + importColumnFamilyOptions.nativeHandle_, metadataHandleList)); + ownedColumnFamilyHandles.add(columnFamilyHandle); + return columnFamilyHandle; + } + /** * Drops the column family specified by {@code columnFamilyHandle}. 
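A sketch of the createColumnFamilyWithImport() overloads introduced above. It assumes the ExportImportFilesMetaData was produced by an earlier column-family export (obtaining it is outside this hunk) and that ImportColumnFamilyOptions, introduced elsewhere in this change, follows the usual RocksObject construction and lifecycle.

    import org.rocksdb.*;

    public class ImportColumnFamilySketch {
      // exportedMetadata is assumed to come from a prior column-family export.
      static ColumnFamilyHandle importFamily(final RocksDB db,
          final ExportImportFilesMetaData exportedMetadata) throws RocksDBException {
        try (final ImportColumnFamilyOptions importOptions = new ImportColumnFamilyOptions()) {
          final ColumnFamilyDescriptor descriptor =
              new ColumnFamilyDescriptor("imported_cf".getBytes());
          // Single-metadata overload; the List overload merges several exported
          // snapshots into one new column family. The returned handle is owned by
          // the DB and is disposed automatically when the DB is closed.
          return db.createColumnFamilyWithImport(descriptor, importOptions, exportedMetadata);
        }
      }
    }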
This call * only records a drop record in the manifest and prevents the column @@ -789,7 +843,7 @@ public void dropColumnFamilies( */ public void destroyColumnFamilyHandle(final ColumnFamilyHandle columnFamilyHandle) { for (int i = 0; i < ownedColumnFamilyHandles.size(); ++i) { - final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); + final ColumnFamilyHandle ownedHandle = ownedColumnFamilyHandles.get(i); // NOPMD if (ownedHandle.equals(columnFamilyHandle)) { columnFamilyHandle.close(); ownedColumnFamilyHandles.remove(i); @@ -846,7 +900,7 @@ public void put(final byte[] key, final int offset, final int len, * instance * @param key the specified key to be inserted. * @param value the value associated with the specified key. - * + *

    * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying @@ -943,7 +997,7 @@ public void put(final WriteOptions writeOpts, * @param writeOpts {@link org.rocksdb.WriteOptions} instance. * @param key the specified key to be inserted. * @param value the value associated with the specified key. - * + *

    * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying @@ -968,7 +1022,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, * Supports direct buffer only. * @param value the value associated with the specified key. Position and limit is used. * Supports direct buffer only. - * + *

    * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying @@ -992,7 +1046,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions * Supports direct buffer only. * @param value the value associated with the specified key. Position and limit is used. * Supports direct buffer only. - * + *

    * throws IllegalArgumentException if column family is not present * * @throws RocksDBException thrown if error happens in underlying @@ -1215,8 +1269,8 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, public int get(final ReadOptions opt, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); - int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), - value, value.position(), value.remaining(), 0); + final int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), + key.remaining(), value, value.position(), value.remaining(), 0); if (result != NOT_FOUND) { value.limit(Math.min(value.limit(), value.position() + result)); } @@ -1248,8 +1302,9 @@ public int get(final ReadOptions opt, final ByteBuffer key, final ByteBuffer val public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions opt, final ByteBuffer key, final ByteBuffer value) throws RocksDBException { assert key.isDirect() && value.isDirect(); - int result = getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), - value, value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); + final int result = + getDirect(nativeHandle_, opt.nativeHandle_, key, key.position(), key.remaining(), value, + value.position(), value.remaining(), columnFamilyHandle.nativeHandle_); if (result != NOT_FOUND) { value.limit(Math.min(value.limit(), value.position() + result)); } @@ -1261,12 +1316,12 @@ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions op * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. - * + *

    * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. - * + *

    * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or @@ -1278,7 +1333,7 @@ public int get(final ColumnFamilyHandle columnFamilyHandle, final ReadOptions op * @throws RocksDBException thrown if error happens in underlying * native library. */ - @Experimental("Performance optimization for a very specific workload") + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) public void singleDelete(final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, key, key.length); } @@ -1287,12 +1342,12 @@ public void singleDelete(final byte[] key) throws RocksDBException { * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. - * + *

    * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. - * + *

    * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or @@ -1305,9 +1360,9 @@ public void singleDelete(final byte[] key) throws RocksDBException { * @throws RocksDBException thrown if error happens in underlying * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) + throws RocksDBException { singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } @@ -1316,18 +1371,18 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. - * + *

    * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. - * + *

    * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or * written using Merge(). Mixing SingleDelete operations with Deletes and * Merges can result in undefined behavior. - * + *

    * Note: consider setting {@link WriteOptions#setSync(boolean)} true. * * @param writeOpt Write options for the delete @@ -1336,9 +1391,8 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException thrown if error happens in underlying * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final WriteOptions writeOpt, final byte[] key) - throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final WriteOptions writeOpt, final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length); } @@ -1346,18 +1400,18 @@ public void singleDelete(final WriteOptions writeOpt, final byte[] key) * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. - * + *

    * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple
    * times), then the result of calling SingleDelete() on this key is undefined.
    * SingleDelete() only behaves correctly if there has been only one Put()
    * for this key since the previous call to SingleDelete() for this key.
-   *
+   * <p>
    * This feature is currently an experimental performance optimization
    * for a very specific workload. It is up to the caller to ensure that
    * SingleDelete is only used for a key that is not deleted using Delete() or
    * written using Merge(). Mixing SingleDelete operations with Deletes and
    * Merges can result in undefined behavior.
-   *
+   * <p>
    * Note: consider setting {@link WriteOptions#setSync(boolean)} true. * * @param columnFamilyHandle The column family to delete the key from @@ -1367,19 +1421,18 @@ public void singleDelete(final WriteOptions writeOpt, final byte[] key) * @throws RocksDBException thrown if error happens in underlying * native library. */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final WriteOptions writeOpt, final byte[] key) throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final WriteOptions writeOpt, + final byte[] key) throws RocksDBException { singleDelete(nativeHandle_, writeOpt.nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); } - /** * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). - * + *

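A short usage sketch of the SingleDelete contract documented above (illustrative only, not part of the patch; it assumes an open RocksDB handle named db, a caller that declares throws RocksDBException, and java.nio.charset.StandardCharsets for the byte conversions):

    // One Put() per key between SingleDelete() calls, and no Delete()/Merge()
    // on the same key, is the pattern the Javadoc above requires.
    final byte[] sessionKey = "session-42".getBytes(StandardCharsets.UTF_8);
    db.put(sessionKey, "payload-1".getBytes(StandardCharsets.UTF_8)); // exactly one Put()
    // ... read the value while it is live ...
    db.singleDelete(sessionKey); // this tombstone can be dropped earlier than a regular Delete()
    // Re-using the key later is fine as long as the one-Put-per-cycle rule still holds:
    db.put(sessionKey, "payload-2".getBytes(StandardCharsets.UTF_8));
    db.singleDelete(sessionKey);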
    * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. @@ -1400,7 +1453,7 @@ public void deleteRange(final byte[] beginKey, final byte[] endKey) * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). - * + *

    * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. @@ -1422,7 +1475,7 @@ public void deleteRange(final ColumnFamilyHandle columnFamilyHandle, * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). - * + *

    * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. @@ -1444,7 +1497,7 @@ public void deleteRange(final WriteOptions writeOpt, final byte[] beginKey, * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). - * + *

    * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. @@ -1501,7 +1554,7 @@ public void merge(final byte[] key, final byte[] value) * native library. * @throws IndexOutOfBoundsException if an offset or length is out of bounds */ - public void merge(final byte[] key, int offset, int len, final byte[] value, + public void merge(final byte[] key, final int offset, final int len, final byte[] value, final int vOffset, final int vLen) throws RocksDBException { checkBounds(offset, len, key.length); checkBounds(vOffset, vLen, value.length); @@ -2137,7 +2190,7 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, */ public List multiGetAsList(final List keys) throws RocksDBException { - assert(keys.size() != 0); + assert (!keys.isEmpty()); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int[] keyOffsets = new int[keysArray.length]; @@ -2173,7 +2226,7 @@ public List multiGetAsList( final List columnFamilyHandleList, final List keys) throws RocksDBException, IllegalArgumentException { - assert(keys.size() != 0); + assert (!keys.isEmpty()); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandleList.size()) { @@ -2210,7 +2263,7 @@ public List multiGetAsList( */ public List multiGetAsList(final ReadOptions opt, final List keys) throws RocksDBException { - assert(keys.size() != 0); + assert (!keys.isEmpty()); final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); final int[] keyOffsets = new int[keysArray.length]; @@ -2246,7 +2299,7 @@ public List multiGetAsList(final ReadOptions opt, public List multiGetAsList(final ReadOptions opt, final List columnFamilyHandleList, final List keys) throws RocksDBException { - assert(keys.size() != 0); + assert (!keys.isEmpty()); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. 
if (keys.size()!=columnFamilyHandleList.size()){ @@ -2283,10 +2336,11 @@ public List multiGetAsList(final ReadOptions opt, */ public List multiGetByteBuffers( final List keys, final List values) throws RocksDBException { - final ReadOptions readOptions = new ReadOptions(); - final List columnFamilyHandleList = new ArrayList<>(1); - columnFamilyHandleList.add(getDefaultColumnFamily()); - return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + try (ReadOptions readOptions = new ReadOptions()) { + final List columnFamilyHandleList = new ArrayList<>(1); + columnFamilyHandleList.add(getDefaultColumnFamily()); + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } } /** @@ -2326,8 +2380,9 @@ public List multiGetByteBuffers(final ReadOptions readOptio public List multiGetByteBuffers( final List columnFamilyHandleList, final List keys, final List values) throws RocksDBException { - final ReadOptions readOptions = new ReadOptions(); - return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + try (ReadOptions readOptions = new ReadOptions()) { + return multiGetByteBuffers(readOptions, columnFamilyHandleList, keys, values); + } } /** @@ -2350,7 +2405,7 @@ public List multiGetByteBuffers( public List multiGetByteBuffers(final ReadOptions readOptions, final List columnFamilyHandleList, final List keys, final List values) throws RocksDBException { - assert (keys.size() != 0); + assert (!keys.isEmpty()); // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. @@ -2420,15 +2475,268 @@ public List multiGetByteBuffers(final ReadOptions readOptio return results; } + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * @param key byte array of a key to search for* + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final byte[] key) { + return keyExists(key, 0, key.length); + } + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final byte[] key, final int offset, final int len) { + return keyExists(null, null, key, offset, len); + } + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. 
+ * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) { + return keyExists(columnFamilyHandle, key, 0, key.length); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final int offset, final int len) { + return keyExists(columnFamilyHandle, null, key, offset, len); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ReadOptions readOptions, final byte[] key) { + return keyExists(readOptions, key, 0, key.length); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists( + final ReadOptions readOptions, final byte[] key, final int offset, final int len) { + return keyExists(null, readOptions, key, offset, len); + } + + /** + * Check if a key exists in the database. 
+ * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final byte[] key) { + return keyExists(columnFamilyHandle, readOptions, key, 0, key.length); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @param offset the offset of the "key" array to be used, must be + * non-negative and no larger than "key".length + * @param len the length of the "key" array to be used, must be non-negative + * and no larger than "key".length + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final byte[] key, final int offset, final int len) { + checkBounds(offset, len, key.length); + return keyExists(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + readOptions == null ? 0 : readOptions.nativeHandle_, key, offset, len); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param key ByteBuffer with key. Must be allocated as direct. + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ByteBuffer key) { + return keyExists(null, null, key); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key ByteBuffer with key. Must be allocated as direct. + * @return true if key exist in database, otherwise false. 
+ */ + public boolean keyExists(final ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) { + return keyExists(columnFamilyHandle, null, key); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param readOptions {@link ReadOptions} instance + * @param key ByteBuffer with key. Must be allocated as direct. + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ReadOptions readOptions, final ByteBuffer key) { + return keyExists(null, readOptions, key); + } + + /** + * Check if a key exists in the database. + * This method is not as lightweight as {@code keyMayExist} but it gives a 100% guarantee + * of a correct result, whether the key exists or not. + * + * Internally it checks if the key may exist and then double checks with read operation + * that confirms the key exists. This deals with the case where {@code keyMayExist} may return + * a false positive. + * + * The code crosses the Java/JNI boundary only once. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param readOptions {@link ReadOptions} instance + * @param key ByteBuffer with key. Must be allocated as direct. + * @return true if key exist in database, otherwise false. + */ + public boolean keyExists(final ColumnFamilyHandle columnFamilyHandle, + final ReadOptions readOptions, final ByteBuffer key) { + assert key != null : "key ByteBuffer parameter cannot be null"; + assert key.isDirect() : "key parameter must be a direct ByteBuffer"; + + return keyExistsDirect(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, + readOptions == null ? 0 : readOptions.nativeHandle_, key, key.position(), key.limit()); + } + /** * If the key definitely does not exist in the database, then this method * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a false negative. - * + *

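The new keyExists() overloads above can be exercised as follows (illustrative sketch, assuming an open RocksDB db; unlike keyMayExist(), the answer is definitive because an actual read backs up the filter check):

    final byte[] userKey = "user:1001".getBytes(StandardCharsets.UTF_8);
    if (db.keyExists(userKey)) {
      // the key is definitely present
    }
    // The ByteBuffer overloads require a direct buffer:
    try (final ReadOptions readOptions = new ReadOptions()) {
      final ByteBuffer directKey = ByteBuffer.allocateDirect(userKey.length);
      directKey.put(userKey).flip();
      final boolean present = db.keyExists(readOptions, directKey);
    }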
    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(byte[])}. One way to make this lighter weight is to avoid * doing any IOs. @@ -2451,10 +2759,10 @@ public boolean keyMayExist(final byte[] key, * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a false negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(byte[], int, int)}. One way to make this lighter weight is to * avoid doing any IOs. @@ -2482,10 +2790,10 @@ public boolean keyMayExist(final byte[] key, * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a false negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle,byte[])}. One way to make this lighter * weight is to avoid doing any IOs. @@ -2511,10 +2819,10 @@ public boolean keyMayExist( * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a false negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle, byte[], int, int)}. One way to make this * lighter weight is to avoid doing any IOs. @@ -2532,9 +2840,8 @@ public boolean keyMayExist( * @return false if the key definitely does not exist in the database, * otherwise true. */ - public boolean keyMayExist( - final ColumnFamilyHandle columnFamilyHandle, - final byte[] key, int offset, int len, + public boolean keyMayExist(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final int offset, final int len, /* @Nullable */ final Holder valueHolder) { return keyMayExist(columnFamilyHandle, null, key, offset, len, valueHolder); @@ -2545,10 +2852,10 @@ public boolean keyMayExist( * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a true negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(ReadOptions, byte[])}. One way to make this * lighter weight is to avoid doing any IOs. @@ -2574,10 +2881,10 @@ public boolean keyMayExist( * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a true negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(ReadOptions, byte[], int, int)}. One way to make this * lighter weight is to avoid doing any IOs. @@ -2608,10 +2915,10 @@ public boolean keyMayExist( * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a true negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle, ReadOptions, byte[])}. One way to make this * lighter weight is to avoid doing any IOs. @@ -2639,10 +2946,10 @@ public boolean keyMayExist( * returns false, otherwise it returns true if the key might exist. * That is to say that this method is probabilistic and may return false * positives, but never a false negative. - * + *

    * If the caller wants to obtain value when the key
    * is found in memory, then {@code valueHolder} must be set.
-   *
+   * <p>
    * This check is potentially lighter-weight than invoking * {@link #get(ColumnFamilyHandle, ReadOptions, byte[], int, int)}. * One way to make this lighter weight is to avoid doing any IOs. @@ -2985,7 +3292,7 @@ public List newIterators( * @return Snapshot {@link Snapshot} instance */ public Snapshot getSnapshot() { - long snapshotHandle = getSnapshot(nativeHandle_); + final long snapshotHandle = getSnapshot(nativeHandle_); if (snapshotHandle != 0) { return new Snapshot(snapshotHandle); } @@ -2994,7 +3301,7 @@ public Snapshot getSnapshot() { /** * Release a previously acquired snapshot. - * + *

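For contrast with keyExists(), the probabilistic keyMayExist() documented above can also hand back the value when it is found in memory; a hedged sketch, assuming an open RocksDB db and a byte[] key:

    final Holder<byte[]> valueHolder = new Holder<>();
    if (db.keyMayExist(key, valueHolder)) {
      if (valueHolder.getValue() != null) {
        // the value was already found in memory, no separate get() needed
      } else {
        final byte[] value = db.get(key); // may still be null: false positives are possible
      }
    } else {
      // definitely not present
    }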
    * The caller must not use "snapshot" after this call. * * @param snapshot {@link Snapshot} instance @@ -3161,7 +3468,7 @@ public long getLongProperty( /** * Reset internal stats for DB and all column families. - * + *

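Typical lifecycle of the snapshot returned by getSnapshot() and released by releaseSnapshot(), as a sketch (db and key are placeholders):

    final Snapshot snapshot = db.getSnapshot(); // may be null if snapshots are unsupported
    try (final ReadOptions readOptions = new ReadOptions().setSnapshot(snapshot)) {
      final byte[] valueAtSnapshot = db.get(readOptions, key); // reads as of the snapshot
    } finally {
      if (snapshot != null) {
        db.releaseSnapshot(snapshot); // the snapshot must not be used after this call
      }
    }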
    * Note this doesn't reset {@link Options#statistics()} as it is not * owned by DB. * @@ -3200,11 +3507,11 @@ public long getAggregatedLongProperty(final String property) /** * Get the approximate file system space used by keys in each range. - * + *

    * Note that the returned sizes measure file system space usage, so
    * if the user data compresses by a factor of ten, the returned
    * sizes will be one-tenth the size of the corresponding user data size.
-   *
+   * <p>
    * If {@code sizeApproximationFlags} defines whether the returned size * should include the recently written data in the mem-tables (if * the mem-table type supports it), data serialized to disk, or both. @@ -3236,11 +3543,11 @@ public long[] getApproximateSizes( /** * Get the approximate file system space used by keys in each range for * the default column family. - * + *

    * Note that the returned sizes measure file system space usage, so
    * if the user data compresses by a factor of ten, the returned
    * sizes will be one-tenth the size of the corresponding user data size.
-   *
+   * <p>
    * If {@code sizeApproximationFlags} defines whether the returned size * should include the recently written data in the mem-tables (if * the mem-table type supports it), data serialized to disk, or both. @@ -3420,6 +3727,26 @@ public void compactRange( columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); } + /** + * ClipColumnFamily() will clip the entries in the CF according to the range + * [begin_key, end_key). Returns OK on success, and a non-OK status on error. + * Any entries outside this range will be completely deleted (including + * tombstones). + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance + * @param beginKey First key to clip within database (inclusive) + * @param endKey Last key to clip within database (exclusive) + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void clipColumnFamily(final ColumnFamilyHandle columnFamilyHandle, final byte[] beginKey, + final byte[] endKey) throws RocksDBException { + clipColumnFamily(nativeHandle_, + columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_, beginKey, 0, + beginKey.length, endKey, 0, endKey.length); + } + /** * Change the options for the column family handle. * @@ -3437,6 +3764,40 @@ public void setOptions( mutableColumnFamilyOptions.getKeys(), mutableColumnFamilyOptions.getValues()); } + /** + * Set performance level for rocksdb performance measurement. + * @param level + * @throws IllegalArgumentException for UNINITIALIZED and OUT_OF_BOUNDS values + * as they can't be used for settings. + */ + public void setPerfLevel(final PerfLevel level) { + if (level == PerfLevel.UNINITIALIZED) { + throw new IllegalArgumentException("Unable to set UNINITIALIZED level"); + } else if (level == PerfLevel.OUT_OF_BOUNDS) { + throw new IllegalArgumentException("Unable to set OUT_OF_BOUNDS level"); + } else { + setPerfLevel(level.getValue()); + } + } + + /** + * Return current performance level measurement settings. + * @return + */ + public PerfLevel getPerfLevel() { + byte level = getPerfLevelNative(); + return PerfLevel.getPerfLevel(level); + } + + /** + * Return perf context bound to this thread. + * @return + */ + public PerfContext getPerfContext() { + long native_handle = getPerfContextNative(); + return new PerfContext(native_handle); + } + /** * Get the options for the column family handle * @@ -3450,7 +3811,7 @@ public void setOptions( */ public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions( /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { - String optionsString = getOptions( + final String optionsString = getOptions( nativeHandle_, columnFamilyHandle == null ? 0 : columnFamilyHandle.nativeHandle_); return MutableColumnFamilyOptions.parse(optionsString, true); } @@ -3477,7 +3838,7 @@ public MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder getOptions() * resulting options string into options */ public MutableDBOptions.MutableDBOptionsBuilder getDBOptions() throws RocksDBException { - String optionsString = getDBOptions(nativeHandle_); + final String optionsString = getDBOptions(nativeHandle_); return MutableDBOptions.parse(optionsString, true); } @@ -3511,7 +3872,7 @@ public void setDBOptions(final MutableDBOptions mutableDBoptions) /** * Takes a list of files specified by file names and * compacts them to the specified level. - * + *
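The new clipColumnFamily() in the hunk above keeps only a key range and drops everything else, tombstones included; an illustrative sketch (db, columnFamilyHandle and the date-style keys are placeholders):

    final byte[] beginKey = "2023-01-01".getBytes(StandardCharsets.UTF_8); // inclusive
    final byte[] endKey = "2024-01-01".getBytes(StandardCharsets.UTF_8);   // exclusive
    db.clipColumnFamily(columnFamilyHandle, beginKey, endKey);
    // Only entries in ["2023-01-01", "2024-01-01") remain in this column family.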

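The setPerfLevel()/getPerfLevel()/getPerfContext() additions above hook up per-thread perf measurement. A hedged sketch follows; PerfLevel.ENABLE_COUNT is an assumed constant name (only UNINITIALIZED and OUT_OF_BOUNDS are referenced by the patch itself) and the PerfContext accessors are left out:

    db.setPerfLevel(PerfLevel.ENABLE_COUNT); // assumed enum constant, see note above
    final byte[] ignored = db.get(key);      // the workload being measured
    final PerfLevel currentLevel = db.getPerfLevel();     // round-trips the configured level
    final PerfContext perfContext = db.getPerfContext();  // counters for the calling thread
    // UNINITIALIZED and OUT_OF_BOUNDS are rejected by setPerfLevel() with
    // IllegalArgumentException, as documented above.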
    * Note that the behavior is different from * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} * in that CompactFiles() performs the compaction job using the CURRENT @@ -3543,7 +3904,7 @@ public List compactFiles( /** * Takes a list of files specified by file names and * compacts them to the specified level. - * + *

    * Note that the behavior is different from * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} * in that CompactFiles() performs the compaction job using the CURRENT @@ -3586,7 +3947,7 @@ public List compactFiles( * returning. * */ - public void cancelAllBackgroundWork(boolean wait) { + public void cancelAllBackgroundWork(final boolean wait) { cancelAllBackgroundWork(nativeHandle_, wait); } @@ -3614,11 +3975,11 @@ public void continueBackgroundWork() throws RocksDBException { /** * Enable automatic compactions for the given column * families if they were previously disabled. - * + *

    * The function will first set the
    * {@link ColumnFamilyOptions#disableAutoCompactions()} option for each
    * column family to false, after which it will schedule a flush/compaction.
-   *
+   * <p>
    * NOTE: Setting disableAutoCompactions to 'false' through * {@link #setOptions(ColumnFamilyHandle, MutableColumnFamilyOptions)} * does NOT schedule a flush/compaction afterwards, and only changes the @@ -3742,7 +4103,7 @@ public Env getEnv() { */ public void flush(final FlushOptions flushOptions) throws RocksDBException { - flush(flushOptions, (List) null); + flush(flushOptions, Collections.singletonList(getDefaultColumnFamily())); } /** @@ -3761,15 +4122,15 @@ public void flush(final FlushOptions flushOptions, /* @Nullable */ final ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { flush(flushOptions, - columnFamilyHandle == null ? null : Arrays.asList(columnFamilyHandle)); + columnFamilyHandle == null ? null : Collections.singletonList(columnFamilyHandle)); } /** * Flushes multiple column families. - * + *

    * If atomic flush is not enabled, this is equivalent to calling
    * {@link #flush(FlushOptions, ColumnFamilyHandle)} multiple times.
-   *
+   * <p>
    * If atomic flush is enabled, this will flush all column families * specified up to the latest sequence number at the time when flush is * requested. @@ -3800,13 +4161,13 @@ public void flushWal(final boolean sync) throws RocksDBException { /** * Sync the WAL. - * + *

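Sketch of the flush()/flushWal() calls touched above, including the new behaviour of flush(FlushOptions) flushing the default column family explicitly (db, cf1 and cf2 are placeholder handles; java.util.Arrays is assumed imported):

    try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) {
      db.flush(flushOptions);                          // default column family
      db.flush(flushOptions, Arrays.asList(cf1, cf2)); // atomic across CFs if atomic flush is on
    }
    db.flushWal(true); // flush the WAL and fsync it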
    * Note that {@link #write(WriteOptions, WriteBatch)} followed by
-   * {@link #syncWal()} is not exactly the same as
+   * {@code #syncWal()} is not exactly the same as
    * {@link #write(WriteOptions, WriteBatch)} with
    * {@link WriteOptions#sync()} set to true; In the latter case the changes
    * won't be visible until the sync is done.
-   *
+   * <p>
    * Currently only works if {@link Options#allowMmapWrites()} is set to false. * * @throws RocksDBException if an error occurs whilst syncing @@ -3838,7 +4199,7 @@ public void disableFileDeletions() throws RocksDBException { } /** - *

    Allow compactions to delete obsolete files. + *

    Enable deleting obsolete files. * If force == true, the call to EnableFileDeletions() * will guarantee that file deletions are enabled after * the call, even if DisableFileDeletions() was called @@ -3884,7 +4245,7 @@ public static class LiveFiles { /** * Retrieve the list of all files in the database after flushing the memtable. - * + *

    * See {@link #getLiveFiles(boolean)}. * * @return the live files @@ -3898,14 +4259,14 @@ public LiveFiles getLiveFiles() throws RocksDBException { /** * Retrieve the list of all files in the database. - * + *

    * In case you have multiple column families, even if {@code flushMemtable}
    * is true, you still need to call {@link #getSortedWalFiles()}
-   * after {@link #getLiveFiles(boolean)} to compensate for new data that
+   * after {@code #getLiveFiles(boolean)} to compensate for new data that
    * arrived to already-flushed column families while other column families
    * were flushing.
-   *
-   * NOTE: Calling {@link #getLiveFiles(boolean)} followed by
+   * <p>
    + * NOTE: Calling {@code #getLiveFiles(boolean)} followed by * {@link #getSortedWalFiles()} can generate a lossless backup. * * @param flushMemtable set to true to flush before recoding the live @@ -4016,7 +4377,7 @@ public ColumnFamilyMetaData getColumnFamilyMetaData() { * ingest the file into this level (2). A file that have a key range that * overlap with the memtable key range will require us to Flush the memtable * first before ingesting the file. - * + *

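The lossless-backup recipe from the NOTE above, as a hedged sketch: file deletions are paused, getLiveFiles(true) flushes and lists the SSTs plus MANIFEST, and getSortedWalFiles() picks up WAL data for column families that flushed later. The LiveFiles.files field name and the LogFile element type are assumptions based on the surrounding API.

    db.disableFileDeletions(); // keep files stable while they are copied
    try {
      final RocksDB.LiveFiles liveFiles = db.getLiveFiles(true);
      for (final String relativePath : liveFiles.files) {
        // copy <dbPath>/<relativePath> to the backup location ...
      }
      for (final LogFile walFile : db.getSortedWalFiles()) {
        // copy the WAL file described by walFile ...
      }
    } finally {
      db.enableFileDeletions(false); // undo the matching disableFileDeletions() call
    }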
    * (1) External SST files can be created using {@link SstFileWriter} * (2) We will try to ingest the files to the lowest possible level * even if the file compression doesn't match the level compression @@ -4041,7 +4402,7 @@ public void ingestExternalFile(final List filePathList, * ingest the file into this level (2). A file that have a key range that * overlap with the memtable key range will require us to Flush the memtable * first before ingesting the file. - * + *

    * (1) External SST files can be created using {@link SstFileWriter} * (2) We will try to ingest the files to the lowest possible level * even if the file compression doesn't match the level compression @@ -4207,7 +4568,7 @@ public void promoteL0(final int targetLevel) /** * Trace DB operations. - * + *

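End-to-end sketch of the ingestion path documented above: build an external SST with SstFileWriter (keys must be added in ascending order) and hand it to ingestExternalFile(). The path is a placeholder, db is an open handle, and java.util.Collections is assumed imported.

    final String sstPath = "/tmp/bulk-load.sst"; // placeholder
    try (final EnvOptions envOptions = new EnvOptions();
         final Options sstOptions = new Options();
         final SstFileWriter writer = new SstFileWriter(envOptions, sstOptions)) {
      writer.open(sstPath);
      writer.put("k1".getBytes(StandardCharsets.UTF_8), "v1".getBytes(StandardCharsets.UTF_8));
      writer.put("k2".getBytes(StandardCharsets.UTF_8), "v2".getBytes(StandardCharsets.UTF_8));
      writer.finish();
    }
    try (final IngestExternalFileOptions ingestOptions = new IngestExternalFileOptions()) {
      db.ingestExternalFile(Collections.singletonList(sstPath), ingestOptions);
    }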
    * Use {@link #endTrace()} to stop tracing. * * @param traceOptions the options @@ -4219,7 +4580,7 @@ public void startTrace(final TraceOptions traceOptions, final AbstractTraceWriter traceWriter) throws RocksDBException { startTrace(nativeHandle_, traceOptions.getMaxTraceFileSize(), traceWriter.nativeHandle_); - /** + /* * NOTE: {@link #startTrace(long, long, long) transfers the ownership * from Java to C++, so we must disown the native handle here. */ @@ -4228,7 +4589,7 @@ public void startTrace(final TraceOptions traceOptions, /** * Stop tracing DB operations. - * + *

    * See {@link #startTrace(TraceOptions, AbstractTraceWriter)} * * @throws RocksDBException if an error occurs whilst ending the trace @@ -4268,10 +4629,9 @@ public void tryCatchUpWithPrimary() throws RocksDBException { * @throws RocksDBException thrown if error happens in underlying * native library. */ - public void deleteFilesInRanges(final ColumnFamilyHandle columnFamily, - final List ranges, final boolean includeEnd) - throws RocksDBException { - if (ranges.size() == 0) { + public void deleteFilesInRanges(final ColumnFamilyHandle columnFamily, final List ranges, + final boolean includeEnd) throws RocksDBException { + if (ranges.isEmpty()) { return; } if ((ranges.size() % 2) != 0) { @@ -4303,7 +4663,7 @@ public static void destroyDB(final String path, final Options options) private /* @Nullable */ long[] toNativeHandleList( /* @Nullable */ final List objectList) { if (objectList == null) { - return null; + return new long[0]; } final int len = objectList.size(); final long[] handleList = new long[len]; @@ -4313,8 +4673,9 @@ public static void destroyDB(final String path, final Options options) return handleList; } + @SuppressWarnings({"PMD.ForLoopVariableCount", "PMD.AvoidReassigningLoopVariables"}) private static long[] toRangeSliceHandles(final List ranges) { - final long rangeSliceHandles[] = new long [ranges.size() * 2]; + final long[] rangeSliceHandles = new long[ranges.size() * 2]; for (int i = 0, j = 0; i < ranges.size(); i++) { final Range range = ranges.get(i); rangeSliceHandles[j++] = range.start.getNativeHandle(); @@ -4323,25 +4684,19 @@ private static long[] toRangeSliceHandles(final List ranges) { return rangeSliceHandles; } - protected void storeOptionsInstance(DBOptionsInterface options) { + protected void storeOptionsInstance(final DBOptionsInterface options) { options_ = options; } - private static void checkBounds(int offset, int len, int size) { + private static void checkBounds(final int offset, final int len, final int size) { if ((offset | len | (offset + len) | (size - (offset + len))) < 0) { throw new IndexOutOfBoundsException(String.format("offset(%d), len(%d), size(%d)", offset, len, size)); } } - private static int computeCapacityHint(final int estimatedNumberOfItems) { - // Default load factor for HashMap is 0.75, so N * 1.5 will be at the load - // limit. We add +1 for a buffer. 
- return (int)Math.ceil(estimatedNumberOfItems * 1.5 + 1.0); - } - // native methods - private native static long open(final long optionsHandle, - final String path) throws RocksDBException; + private static native long open(final long optionsHandle, final String path) + throws RocksDBException; /** * @param optionsHandle Native handle pointing to an Options object @@ -4355,11 +4710,10 @@ private native static long open(final long optionsHandle, * * @throws RocksDBException thrown if the database could not be opened */ - private native static long[] open(final long optionsHandle, - final String path, final byte[][] columnFamilyNames, - final long[] columnFamilyOptions) throws RocksDBException; + private static native long[] open(final long optionsHandle, final String path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; - private native static long openROnly(final long optionsHandle, final String path, + private static native long openROnly(final long optionsHandle, final String path, final boolean errorIfWalFileExists) throws RocksDBException; /** @@ -4374,31 +4728,34 @@ private native static long openROnly(final long optionsHandle, final String path * * @throws RocksDBException thrown if the database could not be opened */ - private native static long[] openROnly(final long optionsHandle, final String path, + private static native long[] openROnly(final long optionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions, final boolean errorIfWalFileExists) throws RocksDBException; - private native static long openAsSecondary(final long optionsHandle, final String path, + private static native long openAsSecondary(final long optionsHandle, final String path, final String secondaryPath) throws RocksDBException; - private native static long[] openAsSecondary(final long optionsHandle, final String path, + private static native long[] openAsSecondary(final long optionsHandle, final String path, final String secondaryPath, final byte[][] columnFamilyNames, final long[] columnFamilyOptions) throws RocksDBException; @Override protected native void disposeInternal(final long handle); - private native static void closeDatabase(final long handle) + private static native void closeDatabase(final long handle) throws RocksDBException; + private static native byte[][] listColumnFamilies(final long optionsHandle, final String path) throws RocksDBException; - private native static byte[][] listColumnFamilies(final long optionsHandle, - final String path) throws RocksDBException; private native long createColumnFamily(final long handle, final byte[] columnFamilyName, final int columnFamilyNamelen, final long columnFamilyOptions) throws RocksDBException; private native long[] createColumnFamilies(final long handle, final long columnFamilyOptionsHandle, final byte[][] columnFamilyNames) throws RocksDBException; - private native long[] createColumnFamilies(final long handle, - final long columnFamilyOptionsHandles[], final byte[][] columnFamilyNames) + private native long[] createColumnFamilies( + final long handle, final long[] columnFamilyOptionsHandles, final byte[][] columnFamilyNames) + throws RocksDBException; + private native long createColumnFamilyWithImport(final long handle, final byte[] columnFamilyName, + final int columnFamilyNamelen, final long columnFamilyOptions, + final long importColumnFamilyOptions, final long[] metadataHandleList) throws RocksDBException; private native void dropColumnFamily( final long 
handle, final long cfHandle) throws RocksDBException; @@ -4458,6 +4815,9 @@ private native void deleteRange( final int beginKeyOffset, final int beginKeyLength, final byte[] endKey, final int endKeyOffset, final int endKeyLength, final long cfHandle) throws RocksDBException; + private native void clipColumnFamily(final long handle, final long cfHandle, + final byte[] beginKey, final int beginKeyOffset, final int beginKeyLength, + final byte[] endKey, final int endKeyOffset, final int endKeyLength) throws RocksDBException; private native void merge(final long handle, final byte[] key, final int keyOffset, final int keyLength, final byte[] value, final int valueOffset, final int valueLength) throws RocksDBException; @@ -4519,6 +4879,12 @@ private native void multiGet(final long dbHandle, final long rOptHandle, final int[] keyLengths, final ByteBuffer[] valuesArray, final int[] valuesSizeArray, final Status[] statusArray); + private native boolean keyExists(final long handle, final long cfHandle, final long readOptHandle, + final byte[] key, final int keyOffset, final int keyLength); + + private native boolean keyExistsDirect(final long handle, final long cfHandle, + final long readOptHandle, final ByteBuffer key, final int keyOffset, final int keyLength); + private native boolean keyMayExist( final long handle, final long cfHandle, final long readOptHandle, final byte[] key, final int keyOffset, final int keyLength); @@ -4579,6 +4945,11 @@ private native void setOptions(final long handle, final long cfHandle, private native void setDBOptions(final long handle, final String[] keys, final String[] values) throws RocksDBException; private native String getDBOptions(final long handle); + private native void setPerfLevel(final byte level); + private native byte getPerfLevelNative(); + + private native long getPerfContextNative(); + private native String[] compactFiles(final long handle, final long compactionOptionsHandle, final long columnFamilyHandle, @@ -4645,10 +5016,10 @@ private native void startTrace(final long handle, final long maxTraceFileSize, private native void deleteFilesInRanges(long handle, long cfHandle, final byte[][] ranges, boolean include_end) throws RocksDBException; - private native static void destroyDB(final String path, - final long optionsHandle) throws RocksDBException; + private static native void destroyDB(final String path, final long optionsHandle) + throws RocksDBException; - private native static int version(); + private static native int version(); protected DBOptionsInterface options_; private static Version version; @@ -4681,12 +5052,10 @@ public String toString() { return getMajor() + "." + getMinor() + "." 
+ getPatch(); } - private static Version fromEncodedVersion(int encodedVersion) { + private static Version fromEncodedVersion(final int encodedVersion) { final byte patch = (byte) (encodedVersion & 0xff); - encodedVersion >>= 8; - final byte minor = (byte) (encodedVersion & 0xff); - encodedVersion >>= 8; - final byte major = (byte) (encodedVersion & 0xff); + final byte minor = (byte) (encodedVersion >> 8 & 0xff); + final byte major = (byte) (encodedVersion >> 16 & 0xff); return new Version(major, minor, patch); } diff --git a/java/src/main/java/org/rocksdb/RocksDBException.java b/java/src/main/java/org/rocksdb/RocksDBException.java index 8b035f458f38..9df411d121cc 100644 --- a/java/src/main/java/org/rocksdb/RocksDBException.java +++ b/java/src/main/java/org/rocksdb/RocksDBException.java @@ -10,7 +10,7 @@ * type is used to describe an internal error from the c++ rocksdb library. */ public class RocksDBException extends Exception { - + private static final long serialVersionUID = -5187634878466267120L; /* @Nullable */ private final Status status; /** diff --git a/java/src/main/java/org/rocksdb/RocksEnv.java b/java/src/main/java/org/rocksdb/RocksEnv.java index b3681d77db4f..ca010c9f9c54 100644 --- a/java/src/main/java/org/rocksdb/RocksEnv.java +++ b/java/src/main/java/org/rocksdb/RocksEnv.java @@ -27,6 +27,5 @@ public class RocksEnv extends Env { super(handle); } - @Override - protected native final void disposeInternal(final long handle); + @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/RocksMutableObject.java b/java/src/main/java/org/rocksdb/RocksMutableObject.java index e92289dc0c59..eb3215290f84 100644 --- a/java/src/main/java/org/rocksdb/RocksMutableObject.java +++ b/java/src/main/java/org/rocksdb/RocksMutableObject.java @@ -71,7 +71,7 @@ protected synchronized long getNativeHandle() { } @Override - public synchronized final void close() { + public final synchronized void close() { if (isOwningHandle()) { disposeInternal(); this.owningHandle_ = false; diff --git a/java/src/main/java/org/rocksdb/Slice.java b/java/src/main/java/org/rocksdb/Slice.java index 50d9f7652586..6a01374d6556 100644 --- a/java/src/main/java/org/rocksdb/Slice.java +++ b/java/src/main/java/org/rocksdb/Slice.java @@ -125,9 +125,8 @@ protected void disposeInternal() { } @Override protected final native byte[] data0(long handle); - private native static long createNewSlice0(final byte[] data, - final int length); - private native static long createNewSlice1(final byte[] data); + private static native long createNewSlice0(final byte[] data, final int length); + private static native long createNewSlice1(final byte[] data); private native void clear0(long handle, boolean internalBuffer, long internalBufferOffset); private native void removePrefix0(long handle, int length); diff --git a/java/src/main/java/org/rocksdb/Snapshot.java b/java/src/main/java/org/rocksdb/Snapshot.java index 39cdf0c2d278..1f471bd31a40 100644 --- a/java/src/main/java/org/rocksdb/Snapshot.java +++ b/java/src/main/java/org/rocksdb/Snapshot.java @@ -29,7 +29,7 @@ public long getSequenceNumber() { @Override protected final void disposeInternal(final long handle) { - /** + /* * Nothing to release, we never own the pointer for a * Snapshot. 
The pointer * to the snapshot is released by the database diff --git a/java/src/main/java/org/rocksdb/SstFileManager.java b/java/src/main/java/org/rocksdb/SstFileManager.java index 8805410aa898..0b9a60061f0d 100644 --- a/java/src/main/java/org/rocksdb/SstFileManager.java +++ b/java/src/main/java/org/rocksdb/SstFileManager.java @@ -10,9 +10,9 @@ /** * SstFileManager is used to track SST files in the DB and control their * deletion rate. - * + *

  * All SstFileManager public functions are thread-safe.
- *
+ * <p>
    * SstFileManager is not extensible. */ //@ThreadSafe @@ -55,7 +55,7 @@ public SstFileManager(final Env env, /*@Nullable*/ final Logger logger) * * @param env the environment. * @param logger if not null, the logger will be used to log errors. - * + *

    * == Deletion rate limiting specific arguments == * @param rateBytesPerSec how many bytes should be deleted per second, If * this value is set to 1024 (1 Kb / sec) and we deleted a file of size @@ -75,7 +75,7 @@ public SstFileManager(final Env env, /*@Nullable*/ final Logger logger, * * @param env the environment. * @param logger if not null, the logger will be used to log errors. - * + *

    * == Deletion rate limiting specific arguments == * @param rateBytesPerSec how many bytes should be deleted per second, If * this value is set to 1024 (1 Kb / sec) and we deleted a file of size @@ -100,7 +100,7 @@ public SstFileManager(final Env env, /*@Nullable*/ final Logger logger, * * @param env the environment. * @param logger if not null, the logger will be used to log errors. - * + *

    * == Deletion rate limiting specific arguments == * @param rateBytesPerSec how many bytes should be deleted per second, If * this value is set to 1024 (1 Kb / sec) and we deleted a file of size @@ -123,12 +123,11 @@ public SstFileManager(final Env env, /*@Nullable*/final Logger logger, rateBytesPerSec, maxTrashDbRatio, bytesMaxDeleteChunk)); } - /** * Update the maximum allowed space that should be used by RocksDB, if * the total size of the SST files exceeds {@code maxAllowedSpace}, writes to * RocksDB will fail. - * + *

    * Setting {@code maxAllowedSpace} to 0 will disable this feature; * maximum allowed space will be infinite (Default value). * @@ -202,7 +201,7 @@ public long getDeleteRateBytesPerSecond() { /** * Set the delete rate limit. - * + *

    * Zero means disable delete rate limiting and delete files immediately. * * @param deleteRate the delete rate limit (in bytes per second). @@ -229,9 +228,8 @@ public void setMaxTrashDBRatio(final double ratio) { setMaxTrashDBRatio(nativeHandle_, ratio); } - private native static long newSstFileManager(final long handle, - final long logger_handle, final long rateBytesPerSec, - final double maxTrashDbRatio, final long bytesMaxDeleteChunk) + private static native long newSstFileManager(final long handle, final long logger_handle, + final long rateBytesPerSec, final double maxTrashDbRatio, final long bytesMaxDeleteChunk) throws RocksDBException; private native void setMaxAllowedSpaceUsage(final long handle, final long maxAllowedSpace); @@ -247,5 +245,5 @@ private native void setDeleteRateBytesPerSecond(final long handle, final long deleteRate); private native double getMaxTrashDBRatio(final long handle); private native void setMaxTrashDBRatio(final long handle, final double ratio); - @Override protected final native void disposeInternal(final long handle); + @Override protected native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/SstFileMetaData.java b/java/src/main/java/org/rocksdb/SstFileMetaData.java index a04d05cb5cfe..88ea8152a6a2 100644 --- a/java/src/main/java/org/rocksdb/SstFileMetaData.java +++ b/java/src/main/java/org/rocksdb/SstFileMetaData.java @@ -36,18 +36,11 @@ public class SstFileMetaData { * @param numEntries the number of entries * @param numDeletions the number of deletions */ - protected SstFileMetaData( - final String fileName, - final String path, - final long size, - final long smallestSeqno, - final long largestSeqno, - final byte[] smallestKey, - final byte[] largestKey, - final long numReadsSampled, - final boolean beingCompacted, - final long numEntries, - final long numDeletions) { + @SuppressWarnings("PMD.ArrayIsStoredDirectly") + protected SstFileMetaData(final String fileName, final String path, final long size, + final long smallestSeqno, final long largestSeqno, final byte[] smallestKey, + final byte[] largestKey, final long numReadsSampled, final boolean beingCompacted, + final long numEntries, final long numDeletions) { this.fileName = fileName; this.path = path; this.size = size; @@ -111,6 +104,7 @@ public long largestSeqno() { * * @return the smallest user defined key */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public byte[] smallestKey() { return smallestKey; } @@ -120,6 +114,7 @@ public byte[] smallestKey() { * * @return the largest user defined key */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public byte[] largestKey() { return largestKey; } diff --git a/java/src/main/java/org/rocksdb/SstFileReader.java b/java/src/main/java/org/rocksdb/SstFileReader.java index bb1e94ee08eb..939d3937536c 100644 --- a/java/src/main/java/org/rocksdb/SstFileReader.java +++ b/java/src/main/java/org/rocksdb/SstFileReader.java @@ -6,10 +6,6 @@ package org.rocksdb; public class SstFileReader extends RocksObject { - static { - RocksDB.loadLibrary(); - } - public SstFileReader(final Options options) { super(newSstFileReader(options.nativeHandle_)); } @@ -18,12 +14,12 @@ public SstFileReader(final Options options) { * Returns an iterator that will iterate on all keys in the default * column family including both keys in the DB and uncommitted keys in this * transaction. - * + *

    * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read
    * from the DB but will NOT change which keys are read from this transaction
    * (the keys in this transaction do not yet belong to any snapshot and will be
    * fetched regardless).
-   *
+   * <p>

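Reading the same external file back with SstFileReader, whose iterator Javadoc is being cleaned up here; a sketch (the path is a placeholder):

    try (final Options readerOptions = new Options();
         final SstFileReader reader = new SstFileReader(readerOptions);
         final ReadOptions readOptions = new ReadOptions()) {
      reader.open("/tmp/bulk-load.sst"); // placeholder
      reader.verifyChecksum();
      try (final SstFileReaderIterator it = reader.newIterator(readOptions)) {
        for (it.seekToFirst(); it.isValid(); it.next()) {
          // it.key() / it.value() ...
        }
      }
    }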
    * Caller is responsible for deleting the returned Iterator. * * @param readOptions Read options. @@ -32,7 +28,7 @@ public SstFileReader(final Options options) { */ public SstFileReaderIterator newIterator(final ReadOptions readOptions) { assert (isOwningHandle()); - long iter = newIterator(nativeHandle_, readOptions.nativeHandle_); + final long iter = newIterator(nativeHandle_, readOptions.nativeHandle_); return new SstFileReaderIterator(this, iter); } @@ -75,7 +71,7 @@ public TableProperties getTableProperties() throws RocksDBException { private native void open(final long handle, final String filePath) throws RocksDBException; - private native static long newSstFileReader(final long optionsHandle); + private static native long newSstFileReader(final long optionsHandle); private native void verifyChecksum(final long handle) throws RocksDBException; private native TableProperties getTableProperties(final long handle) throws RocksDBException; diff --git a/java/src/main/java/org/rocksdb/SstFileWriter.java b/java/src/main/java/org/rocksdb/SstFileWriter.java index fe00c1a12833..d5766bffb61c 100644 --- a/java/src/main/java/org/rocksdb/SstFileWriter.java +++ b/java/src/main/java/org/rocksdb/SstFileWriter.java @@ -13,10 +13,6 @@ * sequence number = 0. */ public class SstFileWriter extends RocksObject { - static { - RocksDB.loadLibrary(); - } - /** * SstFileWriter Constructor. * @@ -199,12 +195,13 @@ public long fileSize() throws RocksDBException { return fileSize(nativeHandle_); } - private native static long newSstFileWriter( - final long envOptionsHandle, final long optionsHandle, + @SuppressWarnings("PMD.UnusedPrivateMethod") + // (AP) Should we expose a constructor wrapping this ? + private static native long newSstFileWriter(final long envOptionsHandle, final long optionsHandle, final long userComparatorHandle, final byte comparatorType); - private native static long newSstFileWriter(final long envOptionsHandle, - final long optionsHandle); + private static native long newSstFileWriter( + final long envOptionsHandle, final long optionsHandle); private native void open(final long handle, final String filePath) throws RocksDBException; diff --git a/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java b/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java index d513c5f153f9..b1ccf08c1405 100644 --- a/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java +++ b/java/src/main/java/org/rocksdb/SstPartitionerFixedPrefixFactory.java @@ -9,11 +9,11 @@ * Fixed prefix factory. It partitions SST files using fixed prefix of the key. */ public class SstPartitionerFixedPrefixFactory extends SstPartitionerFactory { - public SstPartitionerFixedPrefixFactory(long prefixLength) { + public SstPartitionerFixedPrefixFactory(final long prefixLength) { super(newSstPartitionerFixedPrefixFactory0(prefixLength)); } - private native static long newSstPartitionerFixedPrefixFactory0(long prefixLength); + private static native long newSstPartitionerFixedPrefixFactory0(long prefixLength); @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/StateType.java b/java/src/main/java/org/rocksdb/StateType.java index 803456bb2d11..803fa37d91ec 100644 --- a/java/src/main/java/org/rocksdb/StateType.java +++ b/java/src/main/java/org/rocksdb/StateType.java @@ -7,7 +7,7 @@ /** * The type used to refer to a thread state. - * + *

    * A state describes lower-level action of a thread * such as reading / writing a file or waiting for a mutex. */ diff --git a/java/src/main/java/org/rocksdb/Statistics.java b/java/src/main/java/org/rocksdb/Statistics.java index 0938a6d58346..09e08ee5699c 100644 --- a/java/src/main/java/org/rocksdb/Statistics.java +++ b/java/src/main/java/org/rocksdb/Statistics.java @@ -14,7 +14,7 @@ public class Statistics extends RocksObject { public Statistics() { - super(newStatistics()); + super(newStatisticsInstance()); } public Statistics(final Statistics otherStatistics) { @@ -22,7 +22,7 @@ public Statistics(final Statistics otherStatistics) { } public Statistics(final EnumSet ignoreHistograms) { - super(newStatistics(toArrayValues(ignoreHistograms))); + super(newStatisticsInstance(toArrayValues(ignoreHistograms))); } public Statistics(final EnumSet ignoreHistograms, final Statistics otherStatistics) { @@ -31,7 +31,7 @@ public Statistics(final EnumSet ignoreHistograms, final Statistic /** * Intentionally package-private. - * + *

    * Used from {@link DBOptions#statistics()} * * @param existingStatisticsHandle The C++ pointer to an existing statistics object @@ -134,10 +134,19 @@ public String toString() { return toString(nativeHandle_); } - private native static long newStatistics(); - private native static long newStatistics(final long otherStatisticsHandle); - private native static long newStatistics(final byte[] ignoreHistograms); - private native static long newStatistics(final byte[] ignoreHistograms, final long otherStatisticsHandle); + private static long newStatisticsInstance() { + RocksDB.loadLibrary(); + return newStatistics(); + } + private static native long newStatistics(); + private static native long newStatistics(final long otherStatisticsHandle); + private static long newStatisticsInstance(final byte[] ignoreHistograms) { + RocksDB.loadLibrary(); + return newStatistics(ignoreHistograms); + } + private static native long newStatistics(final byte[] ignoreHistograms); + private static native long newStatistics( + final byte[] ignoreHistograms, final long otherStatisticsHandle); @Override protected final native void disposeInternal(final long handle); diff --git a/java/src/main/java/org/rocksdb/StatisticsCollector.java b/java/src/main/java/org/rocksdb/StatisticsCollector.java index fb3f57150f05..dd0d98fe5214 100644 --- a/java/src/main/java/org/rocksdb/StatisticsCollector.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollector.java @@ -61,49 +61,41 @@ public void shutDown(final int shutdownTimeout) throws InterruptedException { _executorService.awaitTermination(shutdownTimeout, TimeUnit.MILLISECONDS); } + @SuppressWarnings("PMD.CloseResource") private Runnable collectStatistics() { - return new Runnable() { - - @Override - public void run() { - while (_isRunning) { - try { - if(Thread.currentThread().isInterrupted()) { - break; - } - for(final StatsCollectorInput statsCollectorInput : - _statsCollectorInputList) { - Statistics statistics = statsCollectorInput.getStatistics(); - StatisticsCollectorCallback statsCallback = - statsCollectorInput.getCallback(); + return () -> { + while (_isRunning) { + try { + if (Thread.currentThread().isInterrupted()) { + break; + } + for (final StatsCollectorInput statsCollectorInput : _statsCollectorInputList) { + final Statistics statistics = statsCollectorInput.getStatistics(); + final StatisticsCollectorCallback statsCallback = statsCollectorInput.getCallback(); - // Collect ticker data - for(final TickerType ticker : TickerType.values()) { - if(ticker != TickerType.TICKER_ENUM_MAX) { - final long tickerValue = statistics.getTickerCount(ticker); - statsCallback.tickerCallback(ticker, tickerValue); - } + // Collect ticker data + for (final TickerType ticker : TickerType.values()) { + if (ticker != TickerType.TICKER_ENUM_MAX) { + final long tickerValue = statistics.getTickerCount(ticker); + statsCallback.tickerCallback(ticker, tickerValue); } + } - // Collect histogram data - for(final HistogramType histogramType : HistogramType.values()) { - if(histogramType != HistogramType.HISTOGRAM_ENUM_MAX) { - final HistogramData histogramData = - statistics.getHistogramData(histogramType); - statsCallback.histogramCallback(histogramType, histogramData); - } + // Collect histogram data + for (final HistogramType histogramType : HistogramType.values()) { + if (histogramType != HistogramType.HISTOGRAM_ENUM_MAX) { + final HistogramData histogramData = statistics.getHistogramData(histogramType); + statsCallback.histogramCallback(histogramType, histogramData); } } - - 
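The newStatisticsInstance() indirection above makes constructing a Statistics object load the native library even before any RocksDB.open() call. Typical usage, as a sketch (the path is a placeholder):

    try (final Statistics statistics = new Statistics();
         final Options options = new Options().setCreateIfMissing(true).setStatistics(statistics);
         final RocksDB db = RocksDB.open(options, "/tmp/stats-example-db")) {
      db.put("k".getBytes(StandardCharsets.UTF_8), "v".getBytes(StandardCharsets.UTF_8));
      db.get("k".getBytes(StandardCharsets.UTF_8));
      final long cacheHits = statistics.getTickerCount(TickerType.BLOCK_CACHE_HIT);
      final HistogramData getLatency = statistics.getHistogramData(HistogramType.DB_GET);
    }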
Thread.sleep(_statsCollectionInterval); - } - catch (final InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - catch (final Exception e) { - throw new RuntimeException("Error while calculating statistics", e); } + + Thread.sleep(_statsCollectionInterval); + } catch (final InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } catch (final Exception e) { + throw new RuntimeException("Error while calculating statistics", e); } } }; diff --git a/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java index f3785b15f6ce..bed7828e0560 100644 --- a/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java @@ -7,7 +7,7 @@ /** * Callback interface provided to StatisticsCollector. - * + *
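The `newStatisticsInstance()` wrappers introduced for `Statistics` above call `RocksDB.loadLibrary()` before the first static native call, so the object can be constructed even if the JNI library has not been loaded yet. A minimal sketch of that lazy-initialization pattern (the `NativeCounter`/`newCounter` names are illustrative, not part of this patch):

```java
import org.rocksdb.RocksDB;

// Hypothetical wrapper mirroring Statistics.newStatisticsInstance():
// ensure the JNI library is loaded before the first static native call.
public final class NativeCounter {
  private final long nativeHandle;

  public NativeCounter() {
    this(newCounterInstance());
  }

  private NativeCounter(final long handle) {
    this.nativeHandle = handle;
  }

  private static long newCounterInstance() {
    RocksDB.loadLibrary(); // idempotent; safe to call repeatedly
    return newCounter();   // the actual native constructor
  }

  private static native long newCounter();
}
```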

    * Thread safety: * StatisticsCollector doesn't make any guarantees about thread safety. * If the same reference of StatisticsCollectorCallback is passed to multiple diff --git a/java/src/main/java/org/rocksdb/StatsLevel.java b/java/src/main/java/org/rocksdb/StatsLevel.java index 58504b84a2b9..8190e503a2af 100644 --- a/java/src/main/java/org/rocksdb/StatsLevel.java +++ b/java/src/main/java/org/rocksdb/StatsLevel.java @@ -23,7 +23,7 @@ public enum StatsLevel { /** * Collect all stats, including measuring duration of mutex operations. - * + *
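For context on the callback contract above, this is roughly how a `StatisticsCollector` is wired up; a hedged sketch assuming a `Statistics` instance named `stats` is already attached to an open database:

```java
import java.util.Collections;
import org.rocksdb.*;

// Polls `stats` once per second and forwards values to a simple callback.
static StatisticsCollector startCollector(final Statistics stats) {
  final StatisticsCollectorCallback callback = new StatisticsCollectorCallback() {
    @Override public void tickerCallback(final TickerType type, final long count) {
      System.out.println(type + " = " + count);
    }
    @Override public void histogramCallback(final HistogramType type, final HistogramData data) {
      System.out.println(type + " median = " + data.getMedian());
    }
  };
  final StatisticsCollector collector = new StatisticsCollector(
      Collections.singletonList(new StatsCollectorInput(stats, callback)), 1000);
  collector.start();
  return collector; // call collector.shutDown(timeoutMillis) when finished
}
```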

    * If getting time is expensive on the platform to run, it can * reduce scalability to more threads, especially for writes. */ diff --git a/java/src/main/java/org/rocksdb/Status.java b/java/src/main/java/org/rocksdb/Status.java index 033ed3ea1c0a..5f751f422089 100644 --- a/java/src/main/java/org/rocksdb/Status.java +++ b/java/src/main/java/org/rocksdb/Status.java @@ -5,15 +5,17 @@ package org.rocksdb; +import java.io.Serializable; import java.util.Objects; /** * Represents the status returned by a function call in RocksDB. - * + *
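As the `StatsLevel` javadoc above warns, `ALL` also times mutex operations and can reduce write scalability. A hedged sketch of attaching statistics at a cheaper level (the path and keys are illustrative):

```java
import org.rocksdb.*;

static void openWithStats() throws RocksDBException {
  try (final Statistics stats = new Statistics();
       final Options options = new Options()
           .setCreateIfMissing(true)
           .setStatistics(stats);
       final RocksDB db = RocksDB.open(options, "/tmp/stats-example")) {
    // EXCEPT_TIME_FOR_MUTEX avoids the mutex-timing overhead described above.
    stats.setStatsLevel(StatsLevel.EXCEPT_TIME_FOR_MUTEX);
    db.put("k".getBytes(), "v".getBytes());
    System.out.println("keys written: " + stats.getTickerCount(TickerType.NUMBER_KEYS_WRITTEN));
  }
}
```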

    * Currently only used with {@link RocksDBException} when the * status is not {@link Code#Ok} */ -public class Status { +public class Status implements Serializable { + private static final long serialVersionUID = -3794191127754280439L; private final Code code; /* @Nullable */ private final SubCode subCode; /* @Nullable */ private final String state; @@ -139,12 +141,12 @@ public byte getValue() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - Status status = (Status) o; + final Status status = (Status) o; return code == status.code && subCode == status.subCode && Objects.equals(state, status.state); } diff --git a/java/src/main/java/org/rocksdb/StringAppendOperator.java b/java/src/main/java/org/rocksdb/StringAppendOperator.java index ddbccff46b5d..547371e7c08b 100644 --- a/java/src/main/java/org/rocksdb/StringAppendOperator.java +++ b/java/src/main/java/org/rocksdb/StringAppendOperator.java @@ -11,19 +11,19 @@ * two strings. */ public class StringAppendOperator extends MergeOperator { - public StringAppendOperator() { - this(','); - } + public StringAppendOperator() { + this(','); + } - public StringAppendOperator(char delim) { - super(newSharedStringAppendOperator(delim)); - } + public StringAppendOperator(final char delim) { + super(newSharedStringAppendOperator(delim)); + } - public StringAppendOperator(String delim) { - super(newSharedStringAppendOperator(delim)); - } + public StringAppendOperator(final String delim) { + super(newSharedStringAppendOperator(delim)); + } - private native static long newSharedStringAppendOperator(final char delim); - private native static long newSharedStringAppendOperator(final String delim); - @Override protected final native void disposeInternal(final long handle); + private static native long newSharedStringAppendOperator(final char delim); + private static native long newSharedStringAppendOperator(final String delim); + @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java b/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java index 5a383ade41dd..8dc56796a25d 100644 --- a/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java +++ b/java/src/main/java/org/rocksdb/TableFileCreationBriefInfo.java @@ -82,12 +82,12 @@ public TableFileCreationReason getReason() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o; + final TableFileCreationBriefInfo that = (TableFileCreationBriefInfo) o; return jobId == that.jobId && Objects.equals(dbName, that.dbName) && Objects.equals(columnFamilyName, that.columnFamilyName) && Objects.equals(filePath, that.filePath) && reason == that.reason; diff --git a/java/src/main/java/org/rocksdb/TableFileCreationInfo.java b/java/src/main/java/org/rocksdb/TableFileCreationInfo.java index 7742f32f19d0..5654603c3833 100644 --- a/java/src/main/java/org/rocksdb/TableFileCreationInfo.java +++ b/java/src/main/java/org/rocksdb/TableFileCreationInfo.java @@ -62,12 +62,12 @@ public Status getStatus() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - TableFileCreationInfo that = 
(TableFileCreationInfo) o; + final TableFileCreationInfo that = (TableFileCreationInfo) o; return fileSize == that.fileSize && Objects.equals(tableProperties, that.tableProperties) && Objects.equals(status, that.status); } diff --git a/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java b/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java index 8aad03ae8fa6..9a777e3336c2 100644 --- a/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java +++ b/java/src/main/java/org/rocksdb/TableFileDeletionInfo.java @@ -62,12 +62,12 @@ public Status getStatus() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - TableFileDeletionInfo that = (TableFileDeletionInfo) o; + final TableFileDeletionInfo that = (TableFileDeletionInfo) o; return jobId == that.jobId && Objects.equals(dbName, that.dbName) && Objects.equals(filePath, that.filePath) && Objects.equals(status, that.status); } diff --git a/java/src/main/java/org/rocksdb/TableFormatConfig.java b/java/src/main/java/org/rocksdb/TableFormatConfig.java index dbe524c4226b..726c6f122e21 100644 --- a/java/src/main/java/org/rocksdb/TableFormatConfig.java +++ b/java/src/main/java/org/rocksdb/TableFormatConfig.java @@ -18,5 +18,5 @@ public abstract class TableFormatConfig { * * @return native handle address to native table instance. */ - abstract protected long newTableFactoryHandle(); + protected abstract long newTableFactoryHandle(); } diff --git a/java/src/main/java/org/rocksdb/TableProperties.java b/java/src/main/java/org/rocksdb/TableProperties.java index 096341a4c13b..7fb1bcc774df 100644 --- a/java/src/main/java/org/rocksdb/TableProperties.java +++ b/java/src/main/java/org/rocksdb/TableProperties.java @@ -46,6 +46,7 @@ public class TableProperties { * Access is package private as this will only be constructed from * C++ via JNI and for testing. */ + @SuppressWarnings("PMD.ArrayIsStoredDirectly") TableProperties(final long dataSize, final long indexSize, final long indexPartitions, final long topLevelIndexSize, final long indexKeyIsUserKey, final long indexValueIsDeltaEncoded, final long filterSize, final long rawKeySize, @@ -116,6 +117,7 @@ public long getIndexSize() { * * @return the total number of index partitions. */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public long getIndexPartitions() { return indexPartitions; } @@ -299,6 +301,7 @@ public long getFastCompressionEstimatedDataSize() { * @return the name of the column family, or null if the * column family is unknown. 
*/ + @SuppressWarnings("PMD.MethodReturnsInternalArray") /*@Nullable*/ public byte[] getColumnFamilyName() { return columnFamilyName; } @@ -380,12 +383,12 @@ public Map getReadableProperties() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - TableProperties that = (TableProperties) o; + final TableProperties that = (TableProperties) o; return dataSize == that.dataSize && indexSize == that.indexSize && indexPartitions == that.indexPartitions && topLevelIndexSize == that.topLevelIndexSize && indexKeyIsUserKey == that.indexKeyIsUserKey diff --git a/java/src/main/java/org/rocksdb/ThreadStatus.java b/java/src/main/java/org/rocksdb/ThreadStatus.java index 062df5889e46..4211453d1a0b 100644 --- a/java/src/main/java/org/rocksdb/ThreadStatus.java +++ b/java/src/main/java/org/rocksdb/ThreadStatus.java @@ -15,7 +15,7 @@ public class ThreadStatus { private final OperationType operationType; private final long operationElapsedTime; // microseconds private final OperationStage operationStage; - private final long operationProperties[]; + private final long[] operationProperties; private final StateType stateType; /** @@ -113,11 +113,12 @@ public OperationStage getOperationStage() { /** * Get the list of properties that describe some details about the current * operation. - * + *
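Since `Status` (made `Serializable` in the change above) usually reaches application code wrapped in a `RocksDBException`, a small hedged sketch of inspecting it:

```java
import org.rocksdb.*;

static void report(final RocksDBException e) {
  final Status status = e.getStatus(); // may be null if no native status was attached
  if (status != null) {
    System.err.printf("code=%s subCode=%s state=%s%n",
        status.getCode(), status.getSubCode(), status.getState());
    if (status.getCode() == Status.Code.Busy) {
      // e.g. an optimistic transaction lost its conflict check and can be retried
    }
  }
}
```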

    * Each field might have different meanings for different operations. * * @return the properties */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public long[] getOperationProperties() { return operationProperties; } diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 0d00add5bbe4..f2ca42776e79 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -7,7 +7,7 @@ /** * The logical mapping of tickers defined in rocksdb::Tickers. - * + *
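The byte-mapping caveat discussed in the surrounding javadoc only matters at the JNI boundary; from Java the tickers and histograms are simply read through `Statistics`, as in this hedged sketch (assumes `stats` was registered on the `Options` used to open the database):

```java
import org.rocksdb.*;

static void dumpCacheStats(final Statistics stats) {
  final long misses = stats.getTickerCount(TickerType.BLOCK_CACHE_MISS);
  final long hits = stats.getTickerCount(TickerType.BLOCK_CACHE_HIT);
  final HistogramData getLatency = stats.getHistogramData(HistogramType.DB_GET);
  System.out.printf("block cache hit ratio=%.2f, p95 get latency (us)=%f%n",
      hits / (double) Math.max(1, hits + misses), getLatency.getPercentile95());
}
```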

    * Java byte value mappings don't align 1:1 to the c++ values. c++ rocksdb::Tickers enumeration type * is uint32_t and java org.rocksdb.TickerType is byte, this causes mapping issues when * rocksdb::Tickers value is greater then 127 (0x7F) for jbyte jni interface as range greater is not @@ -62,11 +62,6 @@ public enum TickerType { */ BLOCK_CACHE_INDEX_BYTES_INSERT((byte) 0x7), - /** - * # of bytes of index block erased from cache - */ - BLOCK_CACHE_INDEX_BYTES_EVICT((byte) 0x8), - /** * # of times cache miss when accessing filter block from block cache. */ @@ -87,11 +82,6 @@ public enum TickerType { */ BLOCK_CACHE_FILTER_BYTES_INSERT((byte) 0xC), - /** - * # of bytes of bloom filter block erased from cache - */ - BLOCK_CACHE_FILTER_BYTES_EVICT((byte) 0xD), - /** * # of times cache miss when accessing data block from block cache. */ @@ -269,36 +259,10 @@ public enum TickerType { */ ITER_BYTES_READ((byte) 0x2E), - NO_FILE_CLOSES((byte) 0x2F), - NO_FILE_OPENS((byte) 0x30), NO_FILE_ERRORS((byte) 0x31), - /** - * Time system had to wait to do LO-L1 compactions. - * - * @deprecated - */ - @Deprecated - STALL_L0_SLOWDOWN_MICROS((byte) 0x32), - - /** - * Time system had to wait to move memtable to L1. - * - * @deprecated - */ - @Deprecated - STALL_MEMTABLE_COMPACTION_MICROS((byte) 0x33), - - /** - * write throttle because of too many files in L0. - * - * @deprecated - */ - @Deprecated - STALL_L0_NUM_FILES_MICROS((byte) 0x34), - /** * Writer has to wait for compaction or flush to finish. */ @@ -311,14 +275,6 @@ public enum TickerType { */ DB_MUTEX_WAIT_MICROS((byte) 0x36), - RATE_LIMIT_DELAY_MILLIS((byte) 0x37), - - /** - * Number of iterators created. - * - */ - NO_ITERATORS((byte) 0x38), - /** * Number of MultiGet calls. */ @@ -334,11 +290,6 @@ public enum TickerType { */ NUMBER_MULTIGET_BYTES_READ((byte) 0x3B), - /** - * Number of deletes records that were not required to be - * written to storage because key does not exist. - */ - NUMBER_FILTERED_DELETES((byte) 0x3C), NUMBER_MERGE_FAILURES((byte) 0x3D), /** @@ -361,26 +312,6 @@ public enum TickerType { */ GET_UPDATES_SINCE_CALLS((byte) 0x41), - /** - * Miss in the compressed block cache. - */ - BLOCK_CACHE_COMPRESSED_MISS((byte) 0x42), - - /** - * Hit in the compressed block cache. - */ - BLOCK_CACHE_COMPRESSED_HIT((byte) 0x43), - - /** - * Number of blocks added to compressed block cache. - */ - BLOCK_CACHE_COMPRESSED_ADD((byte) 0x44), - - /** - * Number of failures when adding blocks to compressed block cache. - */ - BLOCK_CACHE_COMPRESSED_ADD_FAILURES((byte) 0x45), - /** * Number of times WAL sync is done. */ @@ -402,11 +333,6 @@ public enum TickerType { */ WRITE_DONE_BY_OTHER((byte) 0x49), - /** - * Number of writes ending up with timed-out. - */ - WRITE_TIMEDOUT((byte) 0x4A), - /** * Number of Write calls that request WAL. */ @@ -648,33 +574,11 @@ public enum TickerType { */ BLOB_DB_GC_FAILURES((byte) 0x7D), - /** - * # of keys drop by BlobDB garbage collection because they had been - * overwritten. - */ - BLOB_DB_GC_NUM_KEYS_OVERWRITTEN((byte) 0x7E), - - /** - * # of keys drop by BlobDB garbage collection because of expiration. - */ - BLOB_DB_GC_NUM_KEYS_EXPIRED((byte) 0x7F), - /** * # of keys relocated to new blob file by garbage collection. */ BLOB_DB_GC_NUM_KEYS_RELOCATED((byte) -0x02), - /** - * # of bytes drop by BlobDB garbage collection because they had been - * overwritten. - */ - BLOB_DB_GC_BYTES_OVERWRITTEN((byte) -0x03), - - /** - * # of bytes drop by BlobDB garbage collection because of expiration. 
- */ - BLOB_DB_GC_BYTES_EXPIRED((byte) -0x04), - /** * # of bytes relocated to new blob file by garbage collection. */ @@ -804,6 +708,9 @@ public enum TickerType { NON_LAST_LEVEL_READ_BYTES((byte) -0x2C), NON_LAST_LEVEL_READ_COUNT((byte) -0x2D), + /** + * Number of block checksum verifications + */ BLOCK_CHECKSUM_COMPUTE_COUNT((byte) -0x2E), /** @@ -836,6 +743,39 @@ public enum TickerType { */ BLOB_DB_CACHE_BYTES_WRITE((byte) -0x34), + /** + * Number of lookup into the prefetched tail (see + * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + * that can't find its data for table open + */ + TABLE_OPEN_PREFETCH_TAIL_MISS((byte) -0x3A), + + /** + * Number of lookup into the prefetched tail (see + * `TABLE_OPEN_PREFETCH_TAIL_READ_BYTES`) + * that finds its data for table open + */ + TABLE_OPEN_PREFETCH_TAIL_HIT((byte) -0x3B), + + /** + * Number of times RocksDB detected a corruption while verifying a block + * checksum. RocksDB does not remember corruptions that happened during user + * reads so the same block corruption may be detected multiple times. + */ + BLOCK_CHECKSUM_MISMATCH_COUNT((byte) -0x3C), + + READAHEAD_TRIMMED((byte) -0x3D), + + FIFO_MAX_SIZE_COMPACTIONS((byte) -0x3E), + + FIFO_TTL_COMPACTIONS((byte) -0x3F), + + PREFETCH_BYTES((byte) -0x40), + + PREFETCH_BYTES_USEFUL((byte) -0x41), + + PREFETCH_HITS((byte) -0x42), + TICKER_ENUM_MAX((byte) 0x5F); private final byte value; diff --git a/java/src/main/java/org/rocksdb/Transaction.java b/java/src/main/java/org/rocksdb/Transaction.java index b2cc8a9326a7..8ab968a3c6ed 100644 --- a/java/src/main/java/org/rocksdb/Transaction.java +++ b/java/src/main/java/org/rocksdb/Transaction.java @@ -5,13 +5,15 @@ package org.rocksdb; +import static org.rocksdb.RocksDB.PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Provides BEGIN/COMMIT/ROLLBACK transactions. - * + *
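To ground the `Transaction` API touched throughout this file, a hedged sketch of the basic begin/put/commit flow on a `TransactionDB` (the path and key names are illustrative):

```java
import static java.nio.charset.StandardCharsets.UTF_8;
import org.rocksdb.*;

static void transfer() throws RocksDBException {
  try (final Options options = new Options().setCreateIfMissing(true);
       final TransactionDBOptions txnDbOptions = new TransactionDBOptions();
       final TransactionDB db = TransactionDB.open(options, txnDbOptions, "/tmp/txn-example");
       final WriteOptions writeOptions = new WriteOptions();
       final Transaction txn = db.beginTransaction(writeOptions)) {
    txn.put("balance:alice".getBytes(UTF_8), "90".getBytes(UTF_8));
    txn.put("balance:bob".getBytes(UTF_8), "110".getBytes(UTF_8));
    txn.commit(); // the writes become visible atomically; rollback() would discard them
  }
}
```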

    * To use transactions, you must first create either an * {@link OptimisticTransactionDB} or a {@link TransactionDB} * @@ -20,12 +22,14 @@ * {@link TransactionDB#beginTransaction(org.rocksdb.WriteOptions)} * * It is up to the caller to synchronize access to this object. - * + *

    * See samples/src/main/java/OptimisticTransactionSample.java and * samples/src/main/java/TransactionSample.java for some simple * examples. */ public class Transaction extends RocksObject { + private static final String FOR_EACH_KEY_THERE_MUST_BE_A_COLUMNFAMILYHANDLE = + "For each key there must be a ColumnFamilyHandle."; private final RocksDB parent; @@ -50,22 +54,22 @@ public class Transaction extends RocksObject { * any keys successfully written (or fetched via {@link #getForUpdate}) have * not been modified outside of this transaction since the time the snapshot * was set. - * + *

    * If a snapshot has not been set, the transaction guarantees that keys have * not been modified since the time each key was first written (or fetched via * {@link #getForUpdate}). - * - * Using {@link #setSnapshot()} will provide stricter isolation guarantees + *

    + * Using {@code #setSnapshot()} will provide stricter isolation guarantees * at the expense of potentially more transaction failures due to conflicts * with other writes. - * - * Calling {@link #setSnapshot()} has no effect on keys written before this + *

    + * Calling {@code #setSnapshot()} has no effect on keys written before this * function has been called. - * - * {@link #setSnapshot()} may be called multiple times if you would like to + *

    + * {@code #setSnapshot()} may be called multiple times if you would like to * change the snapshot used for different operations in this transaction. - * - * Calling {@link #setSnapshot()} will not affect the version of Data returned + *

    + * Calling {@code #setSnapshot()} will not affect the version of Data returned * by get(...) methods. See {@link #get} for more details. */ public void setSnapshot() { @@ -79,19 +83,19 @@ public void setSnapshot() { * By calling this function, the transaction will essentially call * {@link #setSnapshot()} for you right before performing the next * write/getForUpdate. - * - * Calling {@link #setSnapshotOnNextOperation()} will not affect what + *

    + * Calling {@code #setSnapshotOnNextOperation()} will not affect what * snapshot is returned by {@link #getSnapshot} until the next * write/getForUpdate is executed. - * + *

    * When the snapshot is created the notifier's snapshotCreated method will * be called so that the caller can get access to the snapshot. - * + *

    * This is an optimization to reduce the likelihood of conflicts that * could occur in between the time {@link #setSnapshot()} is called and the * first write/getForUpdate operation. i.e. this prevents the following * race-condition: - * + *

    * txn1->setSnapshot(); * txn2->put("A", ...); * txn2->commit(); @@ -108,20 +112,20 @@ public void setSnapshotOnNextOperation() { * By calling this function, the transaction will essentially call * {@link #setSnapshot()} for you right before performing the next * write/getForUpdate. - * + *

    * Calling {@link #setSnapshotOnNextOperation()} will not affect what * snapshot is returned by {@link #getSnapshot} until the next * write/getForUpdate is executed. - * + *

    * When the snapshot is created the * {@link AbstractTransactionNotifier#snapshotCreated(Snapshot)} method will * be called so that the caller can get access to the snapshot. - * + *

    * This is an optimization to reduce the likelihood of conflicts that * could occur in between the time {@link #setSnapshot()} is called and the * first write/getForUpdate operation. i.e. this prevents the following * race-condition: - * + *

    * txn1->setSnapshot(); * txn2->put("A", ...); * txn2->commit(); @@ -137,38 +141,37 @@ public void setSnapshotOnNextOperation( setSnapshotOnNextOperation(nativeHandle_, transactionNotifier.nativeHandle_); } - /** - * Returns the Snapshot created by the last call to {@link #setSnapshot()}. - * - * REQUIRED: The returned Snapshot is only valid up until the next time - * {@link #setSnapshot()}/{@link #setSnapshotOnNextOperation()} is called, - * {@link #clearSnapshot()} is called, or the Transaction is deleted. - * - * @return The snapshot or null if there is no snapshot - */ + /** + * Returns the Snapshot created by the last call to {@link #setSnapshot()}. + *

    + * REQUIRED: The returned Snapshot is only valid up until the next time + * {@link #setSnapshot()}/{@link #setSnapshotOnNextOperation()} is called, + * {@link #clearSnapshot()} is called, or the Transaction is deleted. + * + * @return The snapshot or null if there is no snapshot + */ public Snapshot getSnapshot() { assert(isOwningHandle()); final long snapshotNativeHandle = getSnapshot(nativeHandle_); if(snapshotNativeHandle == 0) { return null; } else { - final Snapshot snapshot = new Snapshot(snapshotNativeHandle); - return snapshot; + return new Snapshot(snapshotNativeHandle); } } /** * Clears the current snapshot (i.e. no snapshot will be 'set') - * + *
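A hedged sketch of the snapshot lifecycle described above, assuming an already-open `TransactionDB` named `db`:

```java
import org.rocksdb.*;

static void snapshotExample(final TransactionDB db) throws RocksDBException {
  try (final WriteOptions wo = new WriteOptions();
       final Transaction txn = db.beginTransaction(wo)) {
    txn.setSnapshot();                           // conflict-check against the DB state from this point
    final Snapshot snapshot = txn.getSnapshot(); // valid only until the next setSnapshot()/clearSnapshot()
    try (final ReadOptions ro = new ReadOptions().setSnapshot(snapshot)) {
      final byte[] value = txn.get(ro, "key".getBytes()); // read as of the snapshot
    }
    txn.clearSnapshot();                         // revert to per-key "first written/fetched" checking
    txn.rollback();
  }
}
```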

    * This removes any snapshot that currently exists or is set to be created * on the next update operation ({@link #setSnapshotOnNextOperation()}). - * - * Calling {@link #clearSnapshot()} has no effect on keys written before this + *

    + * Calling {@code #clearSnapshot()} has no effect on keys written before this * function has been called. - * + *

    * If a reference to a snapshot was retrieved via {@link #getSnapshot()}, it * will no longer be valid and should be discarded after a call to - * {@link #clearSnapshot()}. + * {@code #clearSnapshot()}. */ public void clearSnapshot() { assert(isOwningHandle()); @@ -186,17 +189,17 @@ public void prepare() throws RocksDBException { /** * Write all batched keys to the db atomically. - * + *

    * Returns OK on success. - * + *

    * May return any error status that could be returned by DB:Write(). - * + *

    * If this transaction was created by an {@link OptimisticTransactionDB} * Status::Busy() may be returned if the transaction could not guarantee * that there are no write conflicts. Status::TryAgain() may be returned * if the memtable history size is not large enough * (See max_write_buffer_number_to_maintain). - * + *

    * If this transaction was created by a {@link TransactionDB}, * Status::Expired() may be returned if this transaction has lived for * longer than {@link TransactionOptions#getExpiration()}. @@ -221,7 +224,7 @@ public void rollback() throws RocksDBException { /** * Records the state of the transaction for future calls to * {@link #rollbackToSavePoint()}. - * + *

    * May be called multiple times to set multiple save points. * * @throws RocksDBException if an error occurs whilst setting a save point @@ -235,7 +238,7 @@ public void setSavePoint() throws RocksDBException { * Undo all operations in this transaction (put, merge, delete, putLogData) * since the most recent call to {@link #setSavePoint()} and removes the most * recent {@link #setSavePoint()}. - * + *

    * If there is no previous call to {@link #setSavePoint()}, * returns Status::NotFound() * @@ -252,11 +255,11 @@ public void rollbackToSavePoint() throws RocksDBException { * also read pending changes in this transaction. * Currently, this function will return Status::MergeInProgress if the most * recent write to the queried key in this batch is a Merge. - * + *
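A hedged sketch of the save-point mechanism documented above (again assuming an open `TransactionDB` named `db`):

```java
import static java.nio.charset.StandardCharsets.UTF_8;
import org.rocksdb.*;

static void savePointExample(final TransactionDB db) throws RocksDBException {
  try (final WriteOptions wo = new WriteOptions();
       final Transaction txn = db.beginTransaction(wo)) {
    txn.put("a".getBytes(UTF_8), "1".getBytes(UTF_8));
    txn.setSavePoint();                // remember the state after writing "a"
    txn.put("b".getBytes(UTF_8), "2".getBytes(UTF_8));
    txn.rollbackToSavePoint();         // undoes "b" but keeps "a"
    txn.commit();                      // only "a" is written to the DB
  }
}
```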

    * If {@link ReadOptions#snapshot()} is not set, the current version of the * key will be read. Calling {@link #setSnapshot()} does not affect the * version of the data returned. - * + *

    * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect * what is read from the DB but will NOT change which keys are read from this * transaction (the keys in this transaction do not yet belong to any snapshot @@ -285,11 +288,11 @@ public byte[] get(final ColumnFamilyHandle columnFamilyHandle, * also read pending changes in this transaction. * Currently, this function will return Status::MergeInProgress if the most * recent write to the queried key in this batch is a Merge. - * + *

    * If {@link ReadOptions#snapshot()} is not set, the current version of the * key will be read. Calling {@link #setSnapshot()} does not affect the * version of the data returned. - * + *

    * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect * what is read from the DB but will NOT change which keys are read from this * transaction (the keys in this transaction do not yet belong to any snapshot @@ -316,11 +319,11 @@ public byte[] get(final ReadOptions readOptions, final byte[] key) * also read pending changes in this transaction. * Currently, this function will return Status::MergeInProgress if the most * recent write to the queried key in this batch is a Merge. - * + *

    * If {@link ReadOptions#snapshot()} is not set, the current version of the * key will be read. Calling {@link #setSnapshot()} does not affect the * version of the data returned. - * + *

    * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect * what is read from the DB but will NOT change which keys are read from this * transaction (the keys in this transaction do not yet belong to any snapshot @@ -346,8 +349,7 @@ public byte[][] multiGet(final ReadOptions readOptions, // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.length != columnFamilyHandles.size()) { - throw new IllegalArgumentException( - "For each key there must be a ColumnFamilyHandle."); + throw new IllegalArgumentException(FOR_EACH_KEY_THERE_MUST_BE_A_COLUMNFAMILYHANDLE); } if(keys.length == 0) { return new byte[0][0]; @@ -367,11 +369,11 @@ public byte[][] multiGet(final ReadOptions readOptions, * also read pending changes in this transaction. * Currently, this function will return Status::MergeInProgress if the most * recent write to the queried key in this batch is a Merge. - * + *

    * If {@link ReadOptions#snapshot()} is not set, the current version of the * key will be read. Calling {@link #setSnapshot()} does not affect the * version of the data returned. - * + *

    * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect * what is read from the DB but will NOT change which keys are read from this * transaction (the keys in this transaction do not yet belong to any snapshot @@ -397,9 +399,9 @@ public List multiGetAsList(final ReadOptions readOptions, // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandles.size()) { - throw new IllegalArgumentException("For each key there must be a ColumnFamilyHandle."); + throw new IllegalArgumentException(FOR_EACH_KEY_THERE_MUST_BE_A_COLUMNFAMILYHANDLE); } - if (keys.size() == 0) { + if (keys.isEmpty()) { return new ArrayList<>(0); } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); @@ -417,11 +419,11 @@ public List multiGetAsList(final ReadOptions readOptions, * also read pending changes in this transaction. * Currently, this function will return Status::MergeInProgress if the most * recent write to the queried key in this batch is a Merge. - * + *

    * If {@link ReadOptions#snapshot()} is not set, the current version of the * key will be read. Calling {@link #setSnapshot()} does not affect the * version of the data returned. - * + *

    * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect * what is read from the DB but will NOT change which keys are read from this * transaction (the keys in this transaction do not yet belong to any snapshot @@ -454,11 +456,11 @@ public byte[][] multiGet(final ReadOptions readOptions, final byte[][] keys) * also read pending changes in this transaction. * Currently, this function will return Status::MergeInProgress if the most * recent write to the queried key in this batch is a Merge. - * + *

    * If {@link ReadOptions#snapshot()} is not set, the current version of the * key will be read. Calling {@link #setSnapshot()} does not affect the * version of the data returned. - * + *

    * Note that setting {@link ReadOptions#setSnapshot(Snapshot)} will affect * what is read from the DB but will NOT change which keys are read from this * transaction (the keys in this transaction do not yet belong to any snapshot @@ -475,7 +477,7 @@ public byte[][] multiGet(final ReadOptions readOptions, final byte[][] keys) */ public List multiGetAsList(final ReadOptions readOptions, final List keys) throws RocksDBException { - if (keys.size() == 0) { + if (keys.isEmpty()) { return new ArrayList<>(0); } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); @@ -489,22 +491,22 @@ public List multiGetAsList(final ReadOptions readOptions, final List * Note: Currently, this function will return Status::MergeInProgress * if the most recent write to the queried key in this batch is a Merge. - * + *
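The column-family variants of `multiGet*` above require exactly one `ColumnFamilyHandle` per key, hence the shared `IllegalArgumentException` message introduced in this change. A hedged sketch (handle and key names are illustrative):

```java
import java.util.Arrays;
import java.util.List;
import org.rocksdb.*;

// `cfDefault` and `cfMeta` are assumed to be handles returned when the DB was opened.
static void multiGetExample(final Transaction txn, final ReadOptions ro,
    final ColumnFamilyHandle cfDefault, final ColumnFamilyHandle cfMeta) throws RocksDBException {
  final List<ColumnFamilyHandle> handles = Arrays.asList(cfDefault, cfMeta);
  final List<byte[]> keys = Arrays.asList("user:1".getBytes(), "meta:1".getBytes());
  // handles.size() must equal keys.size(), otherwise IllegalArgumentException is thrown
  final List<byte[]> values = txn.multiGetAsList(ro, handles, keys);
  // values.get(i) is null when keys.get(i) was not found
}
```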

    * The values returned by this function are similar to * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}. * If value==nullptr, then this function will not read any data, but will * still ensure that this key cannot be written to by outside of this * transaction. - * + *

    * If this transaction was created by an {@link OptimisticTransactionDB}, * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} * could cause {@link #commit()} to fail. Otherwise, it could return any error * that could be returned by * {@link RocksDB#get(ColumnFamilyHandle, ReadOptions, byte[])}. - * + *

    * If this transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -570,22 +572,22 @@ public byte[] getForUpdate(final ReadOptions readOptions, * transaction after it has first been read (or after the snapshot if a * snapshot is set in this transaction). The transaction behavior is the * same regardless of whether the key exists or not. - * + *

    * Note: Currently, this function will return Status::MergeInProgress * if the most recent write to the queried key in this batch is a Merge. - * + *

    * The values returned by this function are similar to * {@link RocksDB#get(ReadOptions, byte[])}. * If value==nullptr, then this function will not read any data, but will * still ensure that this key cannot be written to by outside of this * transaction. - * + *

    * If this transaction was created on an {@link OptimisticTransactionDB}, * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)} * could cause {@link #commit()} to fail. Otherwise, it could return any error * that could be returned by * {@link RocksDB#get(ReadOptions, byte[])}. - * + *

    * If this transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -618,7 +620,7 @@ public byte[] getForUpdate(final ReadOptions readOptions, final byte[] key, /** * A multi-key version of * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}. - * + *
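A hedged sketch of the pessimistic read-modify-write pattern that `getForUpdate` enables on a `TransactionDB` (key encoding is illustrative):

```java
import static java.nio.charset.StandardCharsets.UTF_8;
import org.rocksdb.*;

static void increment(final TransactionDB db, final byte[] key) throws RocksDBException {
  try (final WriteOptions wo = new WriteOptions();
       final ReadOptions ro = new ReadOptions();
       final Transaction txn = db.beginTransaction(wo)) {
    // Locks `key` (exclusive = true) so no other transaction can write it before commit.
    final byte[] current = txn.getForUpdate(ro, key, true /* exclusive */);
    final long next = (current == null ? 0 : Long.parseLong(new String(current, UTF_8))) + 1;
    txn.put(key, Long.toString(next).getBytes(UTF_8));
    txn.commit();
  }
}
```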

    * * @param readOptions Read options. * @param columnFamilyHandles {@link org.rocksdb.ColumnFamilyHandle} @@ -638,8 +640,7 @@ public byte[][] multiGetForUpdate(final ReadOptions readOptions, // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.length != columnFamilyHandles.size()){ - throw new IllegalArgumentException( - "For each key there must be a ColumnFamilyHandle."); + throw new IllegalArgumentException(FOR_EACH_KEY_THERE_MUST_BE_A_COLUMNFAMILYHANDLE); } if(keys.length == 0) { return new byte[0][0]; @@ -655,7 +656,7 @@ public byte[][] multiGetForUpdate(final ReadOptions readOptions, /** * A multi-key version of * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}. - * + *

    * * @param readOptions Read options. * @param columnFamilyHandles {@link org.rocksdb.ColumnFamilyHandle} @@ -674,9 +675,9 @@ public List multiGetForUpdateAsList(final ReadOptions readOptions, // Check if key size equals cfList size. If not a exception must be // thrown. If not a Segmentation fault happens. if (keys.size() != columnFamilyHandles.size()) { - throw new IllegalArgumentException("For each key there must be a ColumnFamilyHandle."); + throw new IllegalArgumentException(FOR_EACH_KEY_THERE_MUST_BE_A_COLUMNFAMILYHANDLE); } - if (keys.size() == 0) { + if (keys.isEmpty()) { return new ArrayList<>(); } final byte[][] keysArray = keys.toArray(new byte[keys.size()][]); @@ -691,7 +692,7 @@ public List multiGetForUpdateAsList(final ReadOptions readOptions, /** * A multi-key version of {@link #getForUpdate(ReadOptions, byte[], boolean)}. - * + *

    * * @param readOptions Read options. * @param keys the keys to retrieve the values for. @@ -715,7 +716,7 @@ public byte[][] multiGetForUpdate(final ReadOptions readOptions, final byte[][] /** * A multi-key version of {@link #getForUpdate(ReadOptions, byte[], boolean)}. - * + *

    * * @param readOptions Read options. * @param keys the keys to retrieve the values for. @@ -728,7 +729,7 @@ public byte[][] multiGetForUpdate(final ReadOptions readOptions, final byte[][] public List multiGetForUpdateAsList( final ReadOptions readOptions, final List keys) throws RocksDBException { assert (isOwningHandle()); - if (keys.size() == 0) { + if (keys.isEmpty()) { return new ArrayList<>(0); } @@ -741,14 +742,14 @@ public List multiGetForUpdateAsList( * Returns an iterator that will iterate on all keys in the default * column family including both keys in the DB and uncommitted keys in this * transaction. - * + *

    * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction * (the keys in this transaction do not yet belong to any snapshot and will be * fetched regardless). - * + *

    * Caller is responsible for deleting the returned Iterator. - * + *

    * The returned iterator is only valid until {@link #commit()}, * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called. * @@ -766,15 +767,15 @@ public RocksIterator getIterator(final ReadOptions readOptions) { * Returns an iterator that will iterate on all keys in the column family * specified by {@code columnFamilyHandle} including both keys in the DB * and uncommitted keys in this transaction. - * + *

    * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is read * from the DB but will NOT change which keys are read from this transaction * (the keys in this transaction do not yet belong to any snapshot and will be * fetched regardless). - * + *

    * Caller is responsible for calling {@link RocksIterator#close()} on * the returned Iterator. - * + *

    * The returned iterator is only valid until {@link #commit()}, * {@link #rollback()}, or {@link #rollbackToSavePoint()} is called. * @@ -794,10 +795,10 @@ public RocksIterator getIterator(final ReadOptions readOptions, /** * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, but * will also perform conflict checking on the keys be written. - * + *
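A hedged sketch of iterating over both committed data and this transaction's uncommitted writes, as described above:

```java
import org.rocksdb.*;

static void scan(final Transaction txn, final ReadOptions ro) {
  // Sees committed keys plus keys written earlier in this transaction.
  try (final RocksIterator it = txn.getIterator(ro)) {
    for (it.seekToFirst(); it.isValid(); it.next()) {
      System.out.println(new String(it.key()) + " -> " + it.value().length + " bytes");
    }
  }
  // The iterator is only valid until commit()/rollback()/rollbackToSavePoint(), so don't cache it.
}
```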

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -829,12 +830,12 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, /** * Similar to {@link #put(ColumnFamilyHandle, byte[], byte[], boolean)} * but with {@code assumeTracked = false}. - * + *

    * Will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -861,10 +862,10 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, /** * Similar to {@link RocksDB#put(byte[], byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -915,7 +916,7 @@ public void put(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link #put(ColumnFamilyHandle, byte[][], byte[][], boolean)} * but with with {@code assumeTracked = false}. - * + *

    * Allows you to specify the key and value in several parts that will be * concatenated together. * @@ -956,10 +957,10 @@ public void put(final byte[][] keyParts, final byte[][] valueParts) /** * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -992,12 +993,12 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link #merge(ColumnFamilyHandle, byte[], byte[], boolean)} * but with {@code assumeTracked = false}. - * + *

    * Will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1024,10 +1025,10 @@ public void merge(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link RocksDB#merge(byte[], byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1052,10 +1053,10 @@ public void merge(final byte[] key, final byte[] value) /** * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1086,12 +1087,12 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link #delete(ColumnFamilyHandle, byte[], boolean)} * but with {@code assumeTracked = false}. - * + *

    * Will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1117,10 +1118,10 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link RocksDB#delete(byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1168,7 +1169,7 @@ public void delete(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to{@link #delete(ColumnFamilyHandle, byte[][], boolean)} * but with {@code assumeTracked = false}. - * + *

    * Allows you to specify the key in several parts that will be * concatenated together. * @@ -1204,10 +1205,10 @@ public void delete(final byte[][] keyParts) throws RocksDBException { /** * Similar to {@link RocksDB#singleDelete(ColumnFamilyHandle, byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1228,9 +1229,9 @@ public void delete(final byte[][] keyParts) throws RocksDBException { * @throws RocksDBException when one of the TransactionalDB conditions * described above occurs, or in the case of an unexpected error */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key, final boolean assumeTracked) throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key, + final boolean assumeTracked) throws RocksDBException { assert (isOwningHandle()); singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_, assumeTracked); @@ -1239,12 +1240,12 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link #singleDelete(ColumnFamilyHandle, byte[], boolean)} * but with {@code assumeTracked = false}. - * + *

    * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1260,9 +1261,9 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException when one of the TransactionalDB conditions * described above occurs, or in the case of an unexpected error */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[] key) throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) + throws RocksDBException { assert(isOwningHandle()); singleDelete(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_, false); @@ -1271,10 +1272,10 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to {@link RocksDB#singleDelete(byte[])}, but * will also perform conflict checking on the keys be written. - * + *

    * If this Transaction was created on an {@link OptimisticTransactionDB}, * these functions should always succeed. - * + *

    * If this Transaction was created on a {@link TransactionDB}, an * {@link RocksDBException} may be thrown with an accompanying {@link Status} * when: @@ -1289,7 +1290,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException when one of the TransactionalDB conditions * described above occurs, or in the case of an unexpected error */ - @Experimental("Performance optimization for a very specific workload") + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) public void singleDelete(final byte[] key) throws RocksDBException { assert(isOwningHandle()); singleDelete(nativeHandle_, key, key.length); @@ -1312,10 +1313,9 @@ public void singleDelete(final byte[] key) throws RocksDBException { * @throws RocksDBException when one of the TransactionalDB conditions * described above occurs, or in the case of an unexpected error */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[][] keyParts, final boolean assumeTracked) - throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[][] keyParts, + final boolean assumeTracked) throws RocksDBException { assert (isOwningHandle()); singleDelete(nativeHandle_, keyParts, keyParts.length, columnFamilyHandle.nativeHandle_, assumeTracked); @@ -1324,7 +1324,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, /** * Similar to{@link #singleDelete(ColumnFamilyHandle, byte[][], boolean)} * but with {@code assumeTracked = false}. - * + *

    * Allows you to specify the key in several parts that will be * concatenated together. * @@ -1334,9 +1334,9 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException when one of the TransactionalDB conditions * described above occurs, or in the case of an unexpected error */ - @Experimental("Performance optimization for a very specific workload") - public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, - final byte[][] keyParts) throws RocksDBException { + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) + public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[][] keyParts) + throws RocksDBException { assert(isOwningHandle()); singleDelete(nativeHandle_, keyParts, keyParts.length, columnFamilyHandle.nativeHandle_, false); @@ -1353,7 +1353,7 @@ public void singleDelete(final ColumnFamilyHandle columnFamilyHandle, * @throws RocksDBException when one of the TransactionalDB conditions * described above occurs, or in the case of an unexpected error */ - @Experimental("Performance optimization for a very specific workload") + @Experimental(PERFORMANCE_OPTIMIZATION_FOR_A_VERY_SPECIFIC_WORKLOAD) public void singleDelete(final byte[][] keyParts) throws RocksDBException { assert(isOwningHandle()); singleDelete(nativeHandle_, keyParts, keyParts.length); @@ -1363,10 +1363,10 @@ public void singleDelete(final byte[][] keyParts) throws RocksDBException { * Similar to {@link RocksDB#put(ColumnFamilyHandle, byte[], byte[])}, * but operates on the transactions write batch. This write will only happen * if this transaction gets committed successfully. - * + *

    * Unlike {@link #put(ColumnFamilyHandle, byte[], byte[])} no conflict * checking will be performed for this key. - * + *

    * If this Transaction was created on a {@link TransactionDB}, this function * will still acquire locks necessary to make sure this write doesn't cause * conflicts in other transactions; This may cause a {@link RocksDBException} @@ -1390,10 +1390,10 @@ public void putUntracked(final ColumnFamilyHandle columnFamilyHandle, * Similar to {@link RocksDB#put(byte[], byte[])}, * but operates on the transactions write batch. This write will only happen * if this transaction gets committed successfully. - * + *

    * Unlike {@link #put(byte[], byte[])} no conflict * checking will be performed for this key. - * + *

    * If this Transaction was created on a {@link TransactionDB}, this function * will still acquire locks necessary to make sure this write doesn't cause * conflicts in other transactions; This may cause a {@link RocksDBException} @@ -1455,10 +1455,10 @@ public void putUntracked(final byte[][] keyParts, final byte[][] valueParts) * Similar to {@link RocksDB#merge(ColumnFamilyHandle, byte[], byte[])}, * but operates on the transactions write batch. This write will only happen * if this transaction gets committed successfully. - * + *
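A hedged sketch contrasting tracked and untracked writes, following the semantics above (key names are illustrative):

```java
import static java.nio.charset.StandardCharsets.UTF_8;
import org.rocksdb.*;

static void trackedVsUntracked(final Transaction txn) throws RocksDBException {
  // Conflict-checked: commit() fails if another transaction changed this key first.
  txn.put("order:42".getBytes(UTF_8), "PAID".getBytes(UTF_8));

  // Not conflict-checked: last writer wins. Useful for counters or metadata where
  // overwriting a concurrent change is acceptable.
  txn.putUntracked("metrics:last_update".getBytes(UTF_8),
      Long.toString(System.currentTimeMillis()).getBytes(UTF_8));
}
```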

    * Unlike {@link #merge(ColumnFamilyHandle, byte[], byte[])} no conflict * checking will be performed for this key. - * + *

    * If this Transaction was created on a {@link TransactionDB}, this function * will still acquire locks necessary to make sure this write doesn't cause * conflicts in other transactions; This may cause a {@link RocksDBException} @@ -1481,10 +1481,10 @@ public void mergeUntracked(final ColumnFamilyHandle columnFamilyHandle, * Similar to {@link RocksDB#merge(byte[], byte[])}, * but operates on the transactions write batch. This write will only happen * if this transaction gets committed successfully. - * + *

    * Unlike {@link #merge(byte[], byte[])} no conflict * checking will be performed for this key. - * + *

    * If this Transaction was created on a {@link TransactionDB}, this function * will still acquire locks necessary to make sure this write doesn't cause * conflicts in other transactions; This may cause a {@link RocksDBException} @@ -1506,10 +1506,10 @@ public void mergeUntracked(final byte[] key, final byte[] value) * Similar to {@link RocksDB#delete(ColumnFamilyHandle, byte[])}, * but operates on the transactions write batch. This write will only happen * if this transaction gets committed successfully. - * + *

    * Unlike {@link #delete(ColumnFamilyHandle, byte[])} no conflict * checking will be performed for this key. - * + *

    * If this Transaction was created on a {@link TransactionDB}, this function * will still acquire locks necessary to make sure this write doesn't cause * conflicts in other transactions; This may cause a {@link RocksDBException} @@ -1532,10 +1532,10 @@ public void deleteUntracked(final ColumnFamilyHandle columnFamilyHandle, * Similar to {@link RocksDB#delete(byte[])}, * but operates on the transactions write batch. This write will only happen * if this transaction gets committed successfully. - * + *

    * Unlike {@link #delete(byte[])} no conflict * checking will be performed for this key. - * + *

    * If this Transaction was created on a {@link TransactionDB}, this function * will still acquire locks necessary to make sure this write doesn't cause * conflicts in other transactions; This may cause a {@link RocksDBException} @@ -1600,13 +1600,13 @@ public void putLogData(final byte[] blob) { * By default, all put/merge/delete operations will be indexed in the * transaction so that get/getForUpdate/getIterator can search for these * keys. - * + *

    * If the caller does not want to fetch the keys about to be written, * they may want to avoid indexing as a performance optimization. - * Calling {@link #disableIndexing()} will turn off indexing for all future + * Calling {@code #disableIndexing()} will turn off indexing for all future * put/merge/delete operations until {@link #enableIndexing()} is called. - * - * If a key is put/merge/deleted after {@link #disableIndexing()} is called + *

    + * If a key is put/merge/deleted after {@code #disableIndexing()} is called * and then is fetched via get/getForUpdate/getIterator, the result of the * fetch is undefined. */ @@ -1684,7 +1684,7 @@ public long getElapsedTime() { /** * Fetch the underlying write batch that contains all pending changes to be * committed. - * + *

    * Note: You should not write or delete anything from the batch directly and * should only use the functions in the {@link Transaction} class to * write to this transaction. @@ -1693,15 +1693,13 @@ public long getElapsedTime() { */ public WriteBatchWithIndex getWriteBatch() { assert(isOwningHandle()); - final WriteBatchWithIndex writeBatchWithIndex = - new WriteBatchWithIndex(getWriteBatch(nativeHandle_)); - return writeBatchWithIndex; + return new WriteBatchWithIndex(getWriteBatch(nativeHandle_)); } /** * Change the value of {@link TransactionOptions#getLockTimeout()} * (in milliseconds) for this transaction. - * + *

    * Has no effect on OptimisticTransactions. * * @param lockTimeout the timeout (in milliseconds) for locks used by this @@ -1719,9 +1717,7 @@ public void setLockTimeout(final long lockTimeout) { */ public WriteOptions getWriteOptions() { assert(isOwningHandle()); - final WriteOptions writeOptions = - new WriteOptions(getWriteOptions(nativeHandle_)); - return writeOptions; + return new WriteOptions(getWriteOptions(nativeHandle_)); } /** @@ -1738,28 +1734,28 @@ public void setWriteOptions(final WriteOptions writeOptions) { * If this key was previously fetched in this transaction using * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/ * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling - * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will tell + * {@code #undoGetForUpdate(ColumnFamilyHandle, byte[])} will tell * the transaction that it no longer needs to do any conflict checking * for this key. - * + *

    * If a key has been fetched N times via * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}/ * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then - * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will only have an + * {@code #undoGetForUpdate(ColumnFamilyHandle, byte[])} will only have an * effect if it is also called N times. If this key has been written to in - * this transaction, {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} + * this transaction, {@code #undoGetForUpdate(ColumnFamilyHandle, byte[])} * will have no effect. - * + *

    * If {@link #setSavePoint()} has been called after the * {@link #getForUpdate(ReadOptions, ColumnFamilyHandle, byte[], boolean)}, - * {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} will not have any + * {@code #undoGetForUpdate(ColumnFamilyHandle, byte[])} will not have any * effect. - * + *

    * If this Transaction was created by an {@link OptimisticTransactionDB}, - * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} can affect + * calling {@code #undoGetForUpdate(ColumnFamilyHandle, byte[])} can affect * whether this key is conflict checked at commit time. * If this Transaction was created by a {@link TransactionDB}, - * calling {@link #undoGetForUpdate(ColumnFamilyHandle, byte[])} may release + * calling {@code #undoGetForUpdate(ColumnFamilyHandle, byte[])} may release * any held locks for this key. * * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} @@ -1776,28 +1772,28 @@ public void undoGetForUpdate(final ColumnFamilyHandle columnFamilyHandle, * If this key was previously fetched in this transaction using * {@link #getForUpdate(ReadOptions, byte[], boolean)}/ * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, calling - * {@link #undoGetForUpdate(byte[])} will tell + * {@code #undoGetForUpdate(byte[])} will tell * the transaction that it no longer needs to do any conflict checking * for this key. - * + *

    * If a key has been fetched N times via * {@link #getForUpdate(ReadOptions, byte[], boolean)}/ * {@link #multiGetForUpdate(ReadOptions, List, byte[][])}, then - * {@link #undoGetForUpdate(byte[])} will only have an + * {@code #undoGetForUpdate(byte[])} will only have an * effect if it is also called N times. If this key has been written to in - * this transaction, {@link #undoGetForUpdate(byte[])} + * this transaction, {@code #undoGetForUpdate(byte[])} * will have no effect. - * + *

    * If {@link #setSavePoint()} has been called after the * {@link #getForUpdate(ReadOptions, byte[], boolean)}, - * {@link #undoGetForUpdate(byte[])} will not have any + * {@code #undoGetForUpdate(byte[])} will not have any * effect. - * + *

    * If this Transaction was created by an {@link OptimisticTransactionDB}, - * calling {@link #undoGetForUpdate(byte[])} can affect + * calling {@code #undoGetForUpdate(byte[])} can affect * whether this key is conflict checked at commit time. * If this Transaction was created by a {@link TransactionDB}, - * calling {@link #undoGetForUpdate(byte[])} may release + * calling {@code #undoGetForUpdate(byte[])} may release * any held locks for this key. * * @param key the key to retrieve the value for. @@ -1828,9 +1824,7 @@ public void rebuildFromWriteBatch(final WriteBatch writeBatch) */ public WriteBatch getCommitTimeWriteBatch() { assert(isOwningHandle()); - final WriteBatch writeBatch = - new WriteBatch(getCommitTimeWriteBatch(nativeHandle_)); - return writeBatch; + return new WriteBatch(getCommitTimeWriteBatch(nativeHandle_)); } /** @@ -1908,7 +1902,7 @@ public WaitingTransactions getWaitingTxns() { /** * Get the execution status of the transaction. - * + *
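A hedged sketch of releasing a speculative lock with `undoGetForUpdate`, per the rules above:

```java
import org.rocksdb.*;

static void inspectThenMaybeRelease(final Transaction txn, final ReadOptions ro,
    final byte[] key) throws RocksDBException {
  final byte[] value = txn.getForUpdate(ro, key, true); // key is now tracked/locked
  if (value == null) {
    // Decided not to touch this key after all: stop conflict checking / release the lock.
    txn.undoGetForUpdate(key);
  }
}
```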

    * NOTE: The execution status of an Optimistic Transaction * never changes. This is only useful for non-optimistic transactions! * @@ -1987,9 +1981,9 @@ public static TransactionState getTransactionState(final byte value) { * * @return The waiting transactions */ + @SuppressWarnings("PMD.UnusedPrivateMethod") private WaitingTransactions newWaitingTransactions( - final long columnFamilyId, final String key, - final long[] transactionIds) { + final long columnFamilyId, final String key, final long[] transactionIds) { return new WaitingTransactions(columnFamilyId, key, transactionIds); } @@ -2028,6 +2022,7 @@ public String getKey() { * * @return The IDs of the waiting transactions */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public long[] getTransactionIds() { return transactionIds; } @@ -2045,11 +2040,10 @@ private native void setSnapshotOnNextOperation(final long handle, private native void setSavePoint(final long handle) throws RocksDBException; private native void rollbackToSavePoint(final long handle) throws RocksDBException; - private native byte[] get(final long handle, final long readOptionsHandle, - final byte key[], final int keyLength, final long columnFamilyHandle) - throws RocksDBException; - private native byte[] get(final long handle, final long readOptionsHandle, - final byte key[], final int keyLen) throws RocksDBException; + private native byte[] get(final long handle, final long readOptionsHandle, final byte[] key, + final int keyLength, final long columnFamilyHandle) throws RocksDBException; + private native byte[] get(final long handle, final long readOptionsHandle, final byte[] key, + final int keyLen) throws RocksDBException; private native byte[][] multiGet(final long handle, final long readOptionsHandle, final byte[][] keys, final long[] columnFamilyHandles) throws RocksDBException; @@ -2057,10 +2051,10 @@ private native byte[][] multiGet(final long handle, final long readOptionsHandle, final byte[][] keys) throws RocksDBException; private native byte[] getForUpdate(final long handle, final long readOptionsHandle, - final byte key[], final int keyLength, final long columnFamilyHandle, final boolean exclusive, + final byte[] key, final int keyLength, final long columnFamilyHandle, final boolean exclusive, final boolean doValidate) throws RocksDBException; private native byte[] getForUpdate(final long handle, final long readOptionsHandle, - final byte key[], final int keyLen, final boolean exclusive, final boolean doValidate) + final byte[] key, final int keyLen, final boolean exclusive, final boolean doValidate) throws RocksDBException; private native byte[][] multiGetForUpdate(final long handle, final long readOptionsHandle, final byte[][] keys, diff --git a/java/src/main/java/org/rocksdb/TransactionDB.java b/java/src/main/java/org/rocksdb/TransactionDB.java index 86f25fe155f2..134a0c8a13e2 100644 --- a/java/src/main/java/org/rocksdb/TransactionDB.java +++ b/java/src/main/java/org/rocksdb/TransactionDB.java @@ -14,8 +14,8 @@ */ public class TransactionDB extends RocksDB implements TransactionalDB { - - private TransactionDBOptions transactionDbOptions_; + // Field is "used" to prevent GC of the + @SuppressWarnings("PMD.UnusedPrivateField") private TransactionDBOptions transactionDbOptions_; /** * Private constructor. @@ -106,16 +106,17 @@ public static TransactionDB open(final DBOptions dbOptions, /** * This is similar to {@link #close()} except that it * throws an exception if any error occurs. - * + *
<p>
    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *
<p>
    * See also {@link #close()}. * * @throws RocksDBException if an error occurs whilst closing. */ + @Override public void closeE() throws RocksDBException { if (owningHandle_.compareAndSet(true, false)) { try { @@ -129,14 +130,15 @@ public void closeE() throws RocksDBException { /** * This is similar to {@link #closeE()} except that it * silently ignores any errors. - * + *
<p>
    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *
<p>
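Since neither close() nor closeE() fsyncs the WAL (as the javadoc above states), a caller that needs recent unsynced writes to survive a power failure would sync first. A minimal sketch, assuming an already open db and RocksDBException handling:

    db.syncWal();  // or write an empty WriteBatch with WriteOptions.setSync(true)
    db.closeE();   // unlike close(), reports any failure as a RocksDBException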
    * See also {@link #close()}. */ + @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { if (owningHandle_.compareAndSet(true, false)) { @@ -218,7 +220,7 @@ public List getAllPreparedTransactions() { final List txns = new ArrayList<>(); for(final long jtxnHandle : jtxnHandles) { - final Transaction txn = new Transaction(this, jtxnHandle); + final Transaction txn = new Transaction(this, jtxnHandle); // NOPMD - CloseResource // this instance doesn't own the underlying C++ object txn.disOwnNativeHandle(); @@ -233,8 +235,8 @@ public static class KeyLockInfo { private final long[] transactionIDs; private final boolean exclusive; - public KeyLockInfo(final String key, final long transactionIDs[], - final boolean exclusive) { + @SuppressWarnings("PMD.ArrayIsStoredDirectly") + public KeyLockInfo(final String key, final long[] transactionIDs, final boolean exclusive) { this.key = key; this.transactionIDs = transactionIDs; this.exclusive = exclusive; @@ -254,6 +256,7 @@ public String getKey() { * * @return the Transaction IDs. */ + @SuppressWarnings("PMD.MethodReturnsInternalArray") public long[] getTransactionIDs() { return transactionIDs; } @@ -288,8 +291,8 @@ public Map getLockStatusData() { * * @return The waiting transactions */ - private DeadlockInfo newDeadlockInfo( - final long transactionID, final long columnFamilyId, + @SuppressWarnings("PMD.UnusedPrivateMethod") + private DeadlockInfo newDeadlockInfo(final long transactionID, final long columnFamilyId, final String waitingKey, final boolean exclusive) { return new DeadlockInfo(transactionID, columnFamilyId, waitingKey, exclusive); @@ -350,6 +353,7 @@ public static class DeadlockPath { final DeadlockInfo[] path; final boolean limitExceeded; + @SuppressWarnings("PMD.ArrayIsStoredDirectly") public DeadlockPath(final DeadlockInfo[] path, final boolean limitExceeded) { this.path = path; this.limitExceeded = limitExceeded; @@ -381,8 +385,7 @@ private static native long open(final long optionsHandle, private static native long[] open(final long dbOptionsHandle, final long transactionDbOptionsHandle, final String path, final byte[][] columnFamilyNames, final long[] columnFamilyOptions); - private native static void closeDatabase(final long handle) - throws RocksDBException; + private static native void closeDatabase(final long handle) throws RocksDBException; private native long beginTransaction(final long handle, final long writeOptionsHandle); private native long beginTransaction(final long handle, diff --git a/java/src/main/java/org/rocksdb/TransactionDBOptions.java b/java/src/main/java/org/rocksdb/TransactionDBOptions.java index 7f4296a7c988..391025d6ae9d 100644 --- a/java/src/main/java/org/rocksdb/TransactionDBOptions.java +++ b/java/src/main/java/org/rocksdb/TransactionDBOptions.java @@ -14,8 +14,8 @@ public TransactionDBOptions() { /** * Specifies the maximum number of keys that can be locked at the same time * per column family. - * - * If the number of locked keys is greater than {@link #getMaxNumLocks()}, + *
<p>
    + * If the number of locked keys is greater than {@code #getMaxNumLocks()}, * transaction writes (or GetForUpdate) will return an error. * * @return The maximum number of keys that can be locked @@ -28,7 +28,7 @@ public long getMaxNumLocks() { /** * Specifies the maximum number of keys that can be locked at the same time * per column family. - * + *
<p>
    * If the number of locked keys is greater than {@link #getMaxNumLocks()}, * transaction writes (or GetForUpdate) will return an error. * @@ -57,7 +57,7 @@ public long getNumStripes() { * Increasing this value will increase the concurrency by dividing the lock * table (per column family) into more sub-tables, each with their own * separate mutex. - * + *
<p>
    * Default: 16 * * @param numStripes The number of sub-tables @@ -94,7 +94,7 @@ public long getTransactionLockTimeout() { * If negative, there is no timeout. Not using a timeout is not recommended * as it can lead to deadlocks. Currently, there is no deadlock-detection to * recover from a deadlock. - * + *
<p>
    * Default: 1000 * * @param transactionLockTimeout the default wait timeout in milliseconds @@ -113,7 +113,7 @@ public TransactionDBOptions setTransactionLockTimeout( * OUTSIDE of a transaction (ie by calling {@link RocksDB#put}, * {@link RocksDB#merge}, {@link RocksDB#delete} or {@link RocksDB#write} * directly). - * + *
<p>
    * If 0, no waiting is done if a lock cannot instantly be acquired. * If negative, there is no timeout and will block indefinitely when acquiring * a lock. @@ -131,29 +131,28 @@ public long getDefaultLockTimeout() { * OUTSIDE of a transaction (ie by calling {@link RocksDB#put}, * {@link RocksDB#merge}, {@link RocksDB#delete} or {@link RocksDB#write} * directly). - * + *
<p>
    * If 0, no waiting is done if a lock cannot instantly be acquired. * If negative, there is no timeout and will block indefinitely when acquiring * a lock. - * + *
<p>
    * Not using a timeout can lead to deadlocks. Currently, there * is no deadlock-detection to recover from a deadlock. While DB writes * cannot deadlock with other DB writes, they can deadlock with a transaction. * A negative timeout should only be used if all transactions have a small * expiration set. - * + *
<p>
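The lock-table knobs discussed in this stretch of TransactionDBOptions javadoc fit together roughly as below; the numeric values are illustrative only, not recommendations taken from the change itself.

    try (final TransactionDBOptions txnDbOptions = new TransactionDBOptions()
             .setMaxNumLocks(10_000)           // per-CF cap; transaction writes fail once exceeded
             .setNumStripes(32)                // more stripes, more lock-table concurrency
             .setTransactionLockTimeout(1_000) // default per-transaction lock wait, in ms
             .setDefaultLockTimeout(1_000)) {  // lock wait for writes made outside a transaction
      // pass txnDbOptions to TransactionDB.open(...)
    }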
    * Default: 1000 * * @param defaultLockTimeout the timeout in milliseconds when writing a key * OUTSIDE of a transaction * @return this TransactionDBOptions instance */ - public TransactionDBOptions setDefaultLockTimeout( - final long defaultLockTimeout) { - assert(isOwningHandle()); - setDefaultLockTimeout(nativeHandle_, defaultLockTimeout); - return this; - } + public TransactionDBOptions setDefaultLockTimeout(final long defaultLockTimeout) { + assert (isOwningHandle()); + setDefaultLockTimeout(nativeHandle_, defaultLockTimeout); + return this; + } // /** // * If set, the {@link TransactionDB} will use this implementation of a mutex @@ -199,7 +198,7 @@ public TransactionDBOptions setWritePolicy( return this; } - private native static long newTransactionDBOptions(); + private static native long newTransactionDBOptions(); private native long getMaxNumLocks(final long handle); private native void setMaxNumLocks(final long handle, final long maxNumLocks); diff --git a/java/src/main/java/org/rocksdb/TransactionOptions.java b/java/src/main/java/org/rocksdb/TransactionOptions.java index 195fc85e489b..f93d3cb3cbb8 100644 --- a/java/src/main/java/org/rocksdb/TransactionOptions.java +++ b/java/src/main/java/org/rocksdb/TransactionOptions.java @@ -54,7 +54,7 @@ public TransactionOptions setDeadlockDetect(final boolean deadlockDetect) { /** * The wait timeout in milliseconds when a transaction attempts to lock a key. - * + *
<p>
    * If 0, no waiting is done if a lock cannot instantly be acquired. * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)} * will be used @@ -69,11 +69,11 @@ public long getLockTimeout() { /** * If positive, specifies the wait timeout in milliseconds when * a transaction attempts to lock a key. - * + *
<p>
    * If 0, no waiting is done if a lock cannot instantly be acquired. * If negative, {@link TransactionDBOptions#getTransactionLockTimeout(long)} * will be used - * + *
<p>
    * Default: -1 * * @param lockTimeout the lock timeout in milliseconds @@ -88,7 +88,7 @@ public TransactionOptions setLockTimeout(final long lockTimeout) { /** * Expiration duration in milliseconds. - * + *
<p>
    * If non-negative, transactions that last longer than this many milliseconds * will fail to commit. If not set, a forgotten transaction that is never * committed, rolled back, or deleted will never relinquish any locks it @@ -103,12 +103,12 @@ public long getExpiration() { /** * Expiration duration in milliseconds. - * + *
<p>
    * If non-negative, transactions that last longer than this many milliseconds * will fail to commit. If not set, a forgotten transaction that is never * committed, rolled back, or deleted will never relinquish any locks it * holds. This could prevent keys from being written by other writers. - * + *
<p>
    * Default: -1 * * @param expiration the expiration duration in milliseconds @@ -133,7 +133,7 @@ public long getDeadlockDetectDepth() { /** * Sets the number of traversals to make during deadlock detection. - * + *
<p>
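A sketch of how the per-transaction options described here (snapshot, lock timeout, expiration, deadlock detection) might be combined; txnDb stands for an already open TransactionDB and the numbers are illustrative.

    try (final TransactionOptions txnOptions = new TransactionOptions()
             .setSetSnapshot(true)         // same effect as calling Transaction#setSnapshot()
             .setLockTimeout(500)          // ms to wait for a key lock; -1 falls back to the DB default
             .setExpiration(60_000)        // transactions older than this fail to commit
             .setDeadlockDetect(true)
             .setDeadlockDetectDepth(50);  // traversals per detection pass
         final WriteOptions writeOptions = new WriteOptions();
         final Transaction txn = txnDb.beginTransaction(writeOptions, txnOptions)) {
      // ... reads and writes through txn ...
      txn.commit();
    }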
    * Default: 50 * * @param deadlockDetectDepth the number of traversals to make during @@ -168,7 +168,7 @@ public TransactionOptions setMaxWriteBatchSize(final long maxWriteBatchSize) { return this; } - private native static long newTransactionOptions(); + private static native long newTransactionOptions(); private native boolean isSetSnapshot(final long handle); private native void setSetSnapshot(final long handle, final boolean setSnapshot); diff --git a/java/src/main/java/org/rocksdb/TransactionalDB.java b/java/src/main/java/org/rocksdb/TransactionalDB.java index 740181989629..1ba9554965ca 100644 --- a/java/src/main/java/org/rocksdb/TransactionalDB.java +++ b/java/src/main/java/org/rocksdb/TransactionalDB.java @@ -8,7 +8,7 @@ interface TransactionalDB> extends AutoCloseable { /** * Starts a new Transaction. - * + *
<p>
    * Caller is responsible for calling {@link #close()} on the returned * transaction when it is no longer needed. * @@ -19,7 +19,7 @@ interface TransactionalDB> extends AutoCloseab /** * Starts a new Transaction. - * + *
<p>
    * Caller is responsible for calling {@link #close()} on the returned * transaction when it is no longer needed. * @@ -32,7 +32,7 @@ Transaction beginTransaction(final WriteOptions writeOptions, /** * Starts a new Transaction. - * + *
<p>
    * Caller is responsible for calling {@link #close()} on the returned * transaction when it is no longer needed. * @@ -48,7 +48,7 @@ Transaction beginTransaction(final WriteOptions writeOptions, /** * Starts a new Transaction. - * + *
<p>
    * Caller is responsible for calling {@link #close()} on the returned * transaction when it is no longer needed. * diff --git a/java/src/main/java/org/rocksdb/TransactionalOptions.java b/java/src/main/java/org/rocksdb/TransactionalOptions.java index d55ee900c80e..2175693fdede 100644 --- a/java/src/main/java/org/rocksdb/TransactionalOptions.java +++ b/java/src/main/java/org/rocksdb/TransactionalOptions.java @@ -20,7 +20,7 @@ interface TransactionalOptions> /** * Setting the setSnapshot to true is the same as calling * {@link Transaction#setSnapshot()}. - * + *
<p>
    * Default: false * * @param setSnapshot Whether to set a snapshot diff --git a/java/src/main/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java index a7adaf4b222f..9a90ba3586bb 100644 --- a/java/src/main/java/org/rocksdb/TtlDB.java +++ b/java/src/main/java/org/rocksdb/TtlDB.java @@ -125,7 +125,7 @@ public static TtlDB open(final DBOptions options, final String db_path, cfOptionHandles[i] = cfDescriptor.getOptions().nativeHandle_; } - final int ttlVals[] = new int[ttlValues.size()]; + final int[] ttlVals = new int[ttlValues.size()]; for(int i = 0; i < ttlValues.size(); i++) { ttlVals[i] = ttlValues.get(i); } @@ -144,16 +144,17 @@ public static TtlDB open(final DBOptions options, final String db_path, * * This is similar to {@link #close()} except that it * throws an exception if any error occurs. - * + *
<p>
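For reference, the single column family TtlDB open path behind the native open(...) binding above is used along these lines; the path and TTL are illustrative and RocksDBException handling is assumed.

    try (final Options options = new Options().setCreateIfMissing(true);
         final TtlDB ttlDb = TtlDB.open(options, "/tmp/ttl-example-db", 3600 /* seconds */, false /* readOnly */)) {
      ttlDb.put("session".getBytes(UTF_8), "data".getBytes(UTF_8));
      // entries become eligible for removal during compaction once older than the TTL
    }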
    * This will not fsync the WAL files. * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *
<p>
    * See also {@link #close()}. * * @throws RocksDBException if an error occurs whilst closing. */ + @Override public void closeE() throws RocksDBException { if (owningHandle_.compareAndSet(true, false)) { try { @@ -172,9 +173,10 @@ public void closeE() throws RocksDBException { * If syncing is required, the caller must first call {@link #syncWal()} * or {@link #write(WriteOptions, WriteBatch)} using an empty write batch * with {@link WriteOptions#setSync(boolean)} set to true. - * + *
<p>
    * See also {@link #close()}. */ + @SuppressWarnings("PMD.EmptyCatchBlock") @Override public void close() { if (owningHandle_.compareAndSet(true, false)) { @@ -230,16 +232,13 @@ protected TtlDB(final long nativeHandle) { @Override protected native void disposeInternal(final long handle); - private native static long open(final long optionsHandle, - final String db_path, final int ttl, final boolean readOnly) - throws RocksDBException; - private native static long[] openCF(final long optionsHandle, - final String db_path, final byte[][] columnFamilyNames, - final long[] columnFamilyOptions, final int[] ttlValues, + private static native long open(final long optionsHandle, final String db_path, final int ttl, + final boolean readOnly) throws RocksDBException; + private static native long[] openCF(final long optionsHandle, final String db_path, + final byte[][] columnFamilyNames, final long[] columnFamilyOptions, final int[] ttlValues, final boolean readOnly) throws RocksDBException; private native long createColumnFamilyWithTtl(final long handle, final byte[] columnFamilyName, final long columnFamilyOptions, int ttl) throws RocksDBException; - private native static void closeDatabase(final long handle) - throws RocksDBException; + private static native void closeDatabase(final long handle) throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java b/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java index 837ce6157f7e..28cb8556b2cf 100644 --- a/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java +++ b/java/src/main/java/org/rocksdb/TxnDBWritePolicy.java @@ -23,7 +23,7 @@ public enum TxnDBWritePolicy { */ WRITE_UNPREPARED((byte)0x2); - private byte value; + private final byte value; TxnDBWritePolicy(final byte value) { this.value = value; diff --git a/java/src/main/java/org/rocksdb/UInt64AddOperator.java b/java/src/main/java/org/rocksdb/UInt64AddOperator.java index cce9b298d8a7..0cffdce8c117 100644 --- a/java/src/main/java/org/rocksdb/UInt64AddOperator.java +++ b/java/src/main/java/org/rocksdb/UInt64AddOperator.java @@ -14,6 +14,6 @@ public UInt64AddOperator() { super(newSharedUInt64AddOperator()); } - private native static long newSharedUInt64AddOperator(); + private static native long newSharedUInt64AddOperator(); @Override protected final native void disposeInternal(final long handle); } diff --git a/java/src/main/java/org/rocksdb/WALRecoveryMode.java b/java/src/main/java/org/rocksdb/WALRecoveryMode.java index d8b9eeceda0a..b8c098f94afa 100644 --- a/java/src/main/java/org/rocksdb/WALRecoveryMode.java +++ b/java/src/main/java/org/rocksdb/WALRecoveryMode.java @@ -9,10 +9,9 @@ * The WAL Recover Mode */ public enum WALRecoveryMode { - /** * Original levelDB recovery - * + *
<p>
    * We tolerate incomplete record in trailing data on all logs * Use case : This is legacy behavior (default) */ @@ -20,7 +19,7 @@ public enum WALRecoveryMode { /** * Recover from clean shutdown - * + *
<p>
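The recovery modes enumerated here are selected through the DB options; a minimal sketch, where the choice of PointInTimeRecovery and the path are illustrative:

    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setWalRecoveryMode(WALRecoveryMode.PointInTimeRecovery); // stop replaying at the first corruption
         final RocksDB db = RocksDB.open(options, "/tmp/wal-mode-example-db")) {
      // ...
    }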
    * We don't expect to find any corruption in the WAL * Use case : This is ideal for unit tests and rare applications that * can require high consistency guarantee @@ -44,7 +43,7 @@ public enum WALRecoveryMode { */ SkipAnyCorruptedRecords((byte)0x03); - private byte value; + private final byte value; WALRecoveryMode(final byte value) { this.value = value; diff --git a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java index ce146eb3f922..25d6e6f9d666 100644 --- a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java +++ b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -18,12 +18,12 @@ protected WBWIRocksIterator(final WriteBatchWithIndex wbwi, /** * Get the current entry - * + *
<p>
    * The WriteEntry is only valid * until the iterator is repositioned. * If you want to keep the WriteEntry across iterator * movements, you must make a copy of its data! - * + *
<p>
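Because a WriteEntry is only valid until the iterator moves (per the note above), anything needed later has to be copied out during the loop. A sketch assuming an existing WriteBatchWithIndex named wbwi:

    try (final WBWIRocksIterator it = wbwi.newIterator()) {
      for (it.seekToFirst(); it.isValid(); it.next()) {
        final WBWIRocksIterator.WriteEntry entry = it.entry();
        System.out.println(entry.getType()); // PUT, MERGE, DELETE, ...
        // copy the contents of entry.getKey()/entry.getValue() here if they must outlive this step
      }
    }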
    * Note - This method is not thread-safe with respect to the WriteEntry * as it performs a non-atomic update across the fields of the WriteEntry * @@ -159,10 +159,10 @@ public DirectSlice getKey() { * no value */ public DirectSlice getValue() { - if(!value.isOwningHandle()) { - return null; //TODO(AR) migrate to JDK8 java.util.Optional#empty() - } else { + if (value.isOwningHandle()) { return value; + } else { + return null; // TODO(AR) migrate to JDK8 java.util.Optional#empty() } } @@ -178,6 +178,7 @@ public int hashCode() { return (key == null) ? 0 : key.hashCode(); } + @SuppressWarnings("PMD.CloseResource") @Override public boolean equals(final Object other) { if(other == null) { diff --git a/java/src/main/java/org/rocksdb/WalFilter.java b/java/src/main/java/org/rocksdb/WalFilter.java index 37e36213ae89..a2836634af65 100644 --- a/java/src/main/java/org/rocksdb/WalFilter.java +++ b/java/src/main/java/org/rocksdb/WalFilter.java @@ -12,13 +12,12 @@ * records or modify their processing on recovery. */ public interface WalFilter { - /** * Provide ColumnFamily->LogNumber map to filter * so that filter can determine whether a log number applies to a given * column family (i.e. that log hasn't been flushed to SST already for the * column family). - * + *
<p>
    * We also pass in name>id map as only name is known during * recovery (as handles are opened post-recovery). * while write batch callbacks happen in terms of column family id. diff --git a/java/src/main/java/org/rocksdb/WalProcessingOption.java b/java/src/main/java/org/rocksdb/WalProcessingOption.java index 889602edc94f..3a9c2be0e3b5 100644 --- a/java/src/main/java/org/rocksdb/WalProcessingOption.java +++ b/java/src/main/java/org/rocksdb/WalProcessingOption.java @@ -6,7 +6,7 @@ package org.rocksdb; public enum WalProcessingOption { - /** + /* * Continue processing as usual. */ CONTINUE_PROCESSING((byte)0x0), diff --git a/java/src/main/java/org/rocksdb/WriteBatch.java b/java/src/main/java/org/rocksdb/WriteBatch.java index 9b46108d07b5..49e1f7f204a2 100644 --- a/java/src/main/java/org/rocksdb/WriteBatch.java +++ b/java/src/main/java/org/rocksdb/WriteBatch.java @@ -9,16 +9,16 @@ /** * WriteBatch holds a collection of updates to apply atomically to a DB. - * + *
<p>
    * The updates are applied in the order in which they are added * to the WriteBatch. For example, the value of "key" will be "v3" * after the following batch is written: - * + *
<p>
    * batch.put("key", "v1"); * batch.remove("key"); * batch.put("key", "v2"); * batch.put("key", "v3"); - * + *
<p>
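The ordering example from the javadoc above, written out as a runnable fragment; db is an open RocksDB and the literals are illustrative.

    try (final WriteBatch batch = new WriteBatch();
         final WriteOptions writeOptions = new WriteOptions()) {
      batch.put("key".getBytes(UTF_8), "v1".getBytes(UTF_8));
      batch.delete("key".getBytes(UTF_8));
      batch.put("key".getBytes(UTF_8), "v2".getBytes(UTF_8));
      batch.put("key".getBytes(UTF_8), "v3".getBytes(UTF_8));
      db.write(writeOptions, batch); // applied atomically, in insertion order, so "key" ends up as "v3"
    }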
    * Multiple threads can invoke const methods on a WriteBatch without * external synchronization, but if any of the threads may call a * non-const method, all threads accessing the same WriteBatch must use @@ -180,7 +180,7 @@ public void markWalTerminationPoint() { /** * Gets the WAL termination point. - * + *
<p>
    * See {@link #markWalTerminationPoint()} * * @return the WAL termination point @@ -260,9 +260,8 @@ final native void deleteRange(final long handle, final byte[] beginKey, final in @Override final native void setMaxBytes(final long nativeHandle, final long maxBytes); - private native static long newWriteBatch(final int reserved_bytes); - private native static long newWriteBatch(final byte[] serialized, - final int serializedLength); + private static native long newWriteBatch(final int reserved_bytes); + private static native long newWriteBatch(final byte[] serialized, final int serializedLength); private native void iterate(final long handle, final long handlerHandle) throws RocksDBException; private native byte[] data(final long nativeHandle) throws RocksDBException; @@ -282,10 +281,9 @@ private native void iterate(final long handle, final long handlerHandle) /** * Handler callback for iterating over the contents of a batch. */ - public static abstract class Handler - extends RocksCallbackObject { + public abstract static class Handler extends RocksCallbackObject { public Handler() { - super(null); + super(0L); } @Override diff --git a/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/java/src/main/java/org/rocksdb/WriteBatchInterface.java index 92caa22b30ef..32cd8d1e70bf 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchInterface.java +++ b/java/src/main/java/org/rocksdb/WriteBatchInterface.java @@ -136,12 +136,12 @@ void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. - * + *
<p>
    * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. - * + *
<p>
    * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or @@ -160,12 +160,12 @@ void delete(ColumnFamilyHandle columnFamilyHandle, final ByteBuffer key) * Remove the database entry for {@code key}. Requires that the key exists * and was not overwritten. It is not an error if the key did not exist * in the database. - * + *
<p>
    * If a key is overwritten (by calling {@link #put(byte[], byte[])} multiple * times), then the result of calling SingleDelete() on this key is undefined. * SingleDelete() only behaves correctly if there has been only one Put() * for this key since the previous call to SingleDelete() for this key. - * + *
<p>
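A sketch of the only pattern for which SingleDelete is well defined, namely exactly one Put for the key since the previous SingleDelete; db is an open RocksDB, and the DB-level singleDelete call (an experimental API) is used here as the counterpart of the batch method documented above.

    final byte[] key = "one-shot-key".getBytes(UTF_8);
    db.put(key, "payload".getBytes(UTF_8)); // exactly one Put for this key
    // ... later, after the value has been consumed ...
    db.singleDelete(key);                   // undefined if the key was Put more than once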
    * This feature is currently an experimental performance optimization * for a very specific workload. It is up to the caller to ensure that * SingleDelete is only used for a key that is not deleted using Delete() or @@ -186,7 +186,7 @@ void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). - * + *
<p>
    * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. @@ -203,7 +203,7 @@ void singleDelete(final ColumnFamilyHandle columnFamilyHandle, final byte[] key) * Removes the database entries in the range ["beginKey", "endKey"), i.e., * including "beginKey" and excluding "endKey". a non-OK status on error. It * is not an error if no keys exist in the range ["beginKey", "endKey"). - * + *
<p>
    * Delete the database entry (if any) for "key". Returns OK on success, and a * non-OK status on error. It is not an error if "key" did not exist in the * database. @@ -224,9 +224,9 @@ void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] * it will not be persisted to the SST files. When iterating over this * WriteBatch, WriteBatch::Handler::LogData will be called with the contents * of the blob as it is encountered. Blobs, puts, deletes, and merges will be - * encountered in the same order in thich they were inserted. The blob will + * encountered in the same order in which they were inserted. The blob will * NOT consume sequence number(s) and will NOT increase the count of the batch - * + *
<p>
    * Example application: add timestamps to the transaction log for use in * replication. * @@ -257,7 +257,7 @@ void deleteRange(ColumnFamilyHandle columnFamilyHandle, byte[] beginKey, byte[] /** * Pop the most recent save point. - * + *
<p>
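The save point calls referenced here compose on a batch roughly as follows; db is an open RocksDB and the keys are illustrative.

    try (final WriteBatch batch = new WriteBatch();
         final WriteOptions writeOptions = new WriteOptions()) {
      batch.put("a".getBytes(UTF_8), "1".getBytes(UTF_8));
      batch.setSavePoint();
      batch.put("b".getBytes(UTF_8), "2".getBytes(UTF_8));
      batch.rollbackToSavePoint(); // undoes the write of "b", keeps "a"
      batch.setSavePoint();
      batch.put("c".getBytes(UTF_8), "3".getBytes(UTF_8));
      batch.popSavePoint();        // discards the save point but keeps the write of "c"
      db.write(writeOptions, batch);
    }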
    * That is to say that it removes the last save point, * which was set by {@link #setSavePoint()}. * diff --git a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java index c73bd7dda621..d41be5856ce7 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java +++ b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -10,10 +10,10 @@ /** * Similar to {@link org.rocksdb.WriteBatch} but with a binary searchable * index built for all the keys inserted. - * + *
<p>
    * Calling put, merge, remove or putLogData calls the same function * as with {@link org.rocksdb.WriteBatch} whilst also building an index. - * + *
<p>
    * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator()} to * create an iterator over the write batch or * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)} @@ -22,7 +22,7 @@ public class WriteBatchWithIndex extends AbstractWriteBatch { /** * Creates a WriteBatchWithIndex where no bytes - * are reserved up-front, bytewise comparison is + * are reserved up-front, byte wise comparison is * used for fallback key comparisons, * and duplicate keys operations are retained */ @@ -30,10 +30,9 @@ public WriteBatchWithIndex() { super(newWriteBatchWithIndex()); } - /** * Creates a WriteBatchWithIndex where no bytes - * are reserved up-front, bytewise comparison is + * are reserved up-front, byte wise comparison is * used for fallback key comparisons, and duplicate key * assignment is determined by the constructor argument * @@ -48,9 +47,9 @@ public WriteBatchWithIndex(final boolean overwriteKey) { /** * Creates a WriteBatchWithIndex * - * @param fallbackIndexComparator We fallback to this comparator + * @param fallbackIndexComparator We fall back to this comparator * to compare keys within a column family if we cannot determine - * the column family and so look up it's comparator. + * the column family and so look up its comparator. * * @param reservedBytes reserved bytes in underlying WriteBatch * @@ -115,7 +114,7 @@ public WBWIRocksIterator newIterator() { * Provides Read-Your-Own-Writes like functionality by * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} * as a delta and baseIterator as a base - * + *
<p>
    * Updating write batch with the current key of the iterator is not safe. * We strongly recommend users not to do it. It will invalidate the current * key() and value() of the iterator. This invalidation happens even before @@ -138,7 +137,7 @@ public RocksIterator newIteratorWithBase( * Provides Read-Your-Own-Writes like functionality by * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} * as a delta and baseIterator as a base - * + *
<p>
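Read-your-own-writes iteration as described above might look like the following sketch; db is an open RocksDB and wbwi a WriteBatchWithIndex holding pending updates.

    try (final RocksIterator it = wbwi.newIteratorWithBase(db.newIterator())) {
      for (it.seekToFirst(); it.isValid(); it.next()) {
        // keys and values reflect the DB contents overlaid with the batch's pending updates
        System.out.println(new String(it.key(), UTF_8) + " = " + new String(it.value(), UTF_8));
      }
    }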
    * Updating write batch with the current key of the iterator is not safe. * We strongly recommend users not to do it. It will invalidate the current * key() and value() of the iterator. This invalidation happens even before @@ -173,7 +172,7 @@ public RocksIterator newIteratorWithBase(final ColumnFamilyHandle columnFamilyHa * @param baseIterator The base iterator, * e.g. {@link org.rocksdb.RocksDB#newIterator()} * @return An iterator which shows a view comprised of both the database - * point-in-timefrom baseIterator and modifications made in this write batch. + * point-in-time from baseIterator and modifications made in this write batch. */ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) { return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator, null); @@ -189,7 +188,7 @@ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator) { * e.g. {@link org.rocksdb.RocksDB#newIterator()} * @param readOptions the read options, or null * @return An iterator which shows a view comprised of both the database - * point-in-timefrom baseIterator and modifications made in this write batch. + * point-in-time from baseIterator and modifications made in this write batch. */ public RocksIterator newIteratorWithBase(final RocksIterator baseIterator, /* @Nullable */ final ReadOptions readOptions) { @@ -238,11 +237,11 @@ public byte[] getFromBatch(final DBOptions options, final byte[] key) /** * Similar to {@link RocksDB#get(ColumnFamilyHandle, byte[])} but will also * read writes from this batch. - * + *
<p>
    * This function will query both this batch and the DB and then merge * the results using the DB's merge operator (if the batch contains any * merge requests). - * + *
<p>
    * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is * read from the DB but will NOT change which keys are read from the batch * (the keys in this batch do not yet belong to any snapshot and will be @@ -268,11 +267,11 @@ public byte[] getFromBatchAndDB(final RocksDB db, final ColumnFamilyHandle colum /** * Similar to {@link RocksDB#get(byte[])} but will also * read writes from this batch. - * + *
<p>
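A sketch of the batch-plus-DB read described here; db and wbwi are assumed to exist and the key literal is illustrative.

    try (final ReadOptions readOptions = new ReadOptions()) {
      // merges the DB value with any pending update for this key in the batch
      final byte[] value = wbwi.getFromBatchAndDB(db, readOptions, "user:42".getBytes(UTF_8));
    }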
    * This function will query both this batch and the DB and then merge * the results using the DB's merge operator (if the batch contains any * merge requests). - * + *
<p>
    * Setting {@link ReadOptions#setSnapshot(Snapshot)} will affect what is * read from the DB but will NOT change which keys are read from the batch * (the keys in this batch do not yet belong to any snapshot and will be @@ -338,16 +337,14 @@ final native void deleteRange(final long handle, final byte[] beginKey, final in final long maxBytes); @Override final native WriteBatch getWriteBatch(final long handle); - private native static long newWriteBatchWithIndex(); - private native static long newWriteBatchWithIndex(final boolean overwriteKey); - private native static long newWriteBatchWithIndex( - final long fallbackIndexComparatorHandle, - final byte comparatorType, final int reservedBytes, - final boolean overwriteKey); + private static native long newWriteBatchWithIndex(); + private static native long newWriteBatchWithIndex(final boolean overwriteKey); + private static native long newWriteBatchWithIndex(final long fallbackIndexComparatorHandle, + final byte comparatorType, final int reservedBytes, final boolean overwriteKey); private native long iterator0(final long handle); private native long iterator1(final long handle, final long cfHandle); - private native long iteratorWithBase(final long handle, final long baseIteratorHandle, - final long cfHandle, final long readOptionsHandle); + private native long iteratorWithBase(final long handle, final long cfHandle, + final long baseIteratorHandle, final long readOptionsHandle); private native byte[] getFromBatch(final long handle, final long optHandle, final byte[] key, final int keyLen); private native byte[] getFromBatch(final long handle, final long optHandle, diff --git a/java/src/main/java/org/rocksdb/WriteBufferManager.java b/java/src/main/java/org/rocksdb/WriteBufferManager.java index 8ec9639586ca..40176aba42fb 100644 --- a/java/src/main/java/org/rocksdb/WriteBufferManager.java +++ b/java/src/main/java/org/rocksdb/WriteBufferManager.java @@ -9,13 +9,9 @@ * Java wrapper over native write_buffer_manager class */ public class WriteBufferManager extends RocksObject { - static { - RocksDB.loadLibrary(); - } - /** * Construct a new instance of WriteBufferManager. - * + *
<p>
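The constructor being reworked here is normally wired into the DB options together with a shared cache; the sizes and path below are illustrative.

    try (final Cache cache = new LRUCache(512L * 1024 * 1024);
         // cap total memtable memory at 256 MB, charged against the cache, without stalling writers
         final WriteBufferManager writeBufferManager =
             new WriteBufferManager(256L * 1024 * 1024, cache, false /* allowStall */);
         final Options options = new Options()
             .setCreateIfMissing(true)
             .setWriteBufferManager(writeBufferManager);
         final RocksDB db = RocksDB.open(options, "/tmp/wbm-example-db")) {
      // ...
    }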
    * Check * https://github.com/facebook/rocksdb/wiki/Write-Buffer-Manager * for more details on when to use it @@ -28,7 +24,7 @@ public class WriteBufferManager extends RocksObject { */ public WriteBufferManager( final long bufferSizeBytes, final Cache cache, final boolean allowStall) { - super(newWriteBufferManager(bufferSizeBytes, cache.nativeHandle_, allowStall)); + super(newWriteBufferManagerInstance(bufferSizeBytes, cache.nativeHandle_, allowStall)); this.allowStall_ = allowStall; } @@ -40,11 +36,16 @@ public boolean allowStall() { return allowStall_; } - private native static long newWriteBufferManager( + private static long newWriteBufferManagerInstance( + final long bufferSizeBytes, final long cacheHandle, final boolean allowStall) { + RocksDB.loadLibrary(); + return newWriteBufferManager(bufferSizeBytes, cacheHandle, allowStall); + } + private static native long newWriteBufferManager( final long bufferSizeBytes, final long cacheHandle, final boolean allowStall); @Override protected native void disposeInternal(final long handle); - private boolean allowStall_; + private final boolean allowStall_; } diff --git a/java/src/main/java/org/rocksdb/WriteOptions.java b/java/src/main/java/org/rocksdb/WriteOptions.java index 5a3ffa6c503d..7c184b094926 100644 --- a/java/src/main/java/org/rocksdb/WriteOptions.java +++ b/java/src/main/java/org/rocksdb/WriteOptions.java @@ -7,7 +7,7 @@ /** * Options that control write operations. - * + *
<p>
    * Note that developers should call WriteOptions.dispose() to release the * c++ side memory before a WriteOptions instance runs out of scope. */ @@ -28,33 +28,32 @@ public WriteOptions() { /** * Copy constructor for WriteOptions. - * + *
<p>
    * NOTE: This does a shallow copy, which means comparator, merge_operator, compaction_filter, * compaction_filter_factory and other pointers will be cloned! * * @param other The ColumnFamilyOptions to copy. */ - public WriteOptions(WriteOptions other) { + public WriteOptions(final WriteOptions other) { super(copyWriteOptions(other.nativeHandle_)); } - /** * If true, the write will be flushed from the operating system * buffer cache (by calling WritableFile::Sync()) before the write * is considered complete. If this flag is true, writes will be * slower. - * + *
<p>
    * If this flag is false, and the machine crashes, some recent * writes may be lost. Note that if it is just the process that * crashes (i.e., the machine does not reboot), no writes will be * lost even if sync==false. - * + *
<p>
    * In other words, a DB write with sync==false has similar * crash semantics as the "write()" system call. A DB write * with sync==true has similar crash semantics to a "write()" * system call followed by "fdatasync()". - * + *
<p>
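The sync semantics spelled out above come down to a per-write choice; a sketch assuming an open db and illustrative keys.

    try (final WriteOptions durable = new WriteOptions().setSync(true);   // survives machine crash, slower
         final WriteOptions fast = new WriteOptions().setSync(false)) {   // survives process crash only
      db.put(durable, "checkpoint".getBytes(UTF_8), "state".getBytes(UTF_8));
      db.put(fast, "metrics".getBytes(UTF_8), "sample".getBytes(UTF_8));
    }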
    * Default: false * * @param flag a boolean flag to indicate whether a write @@ -71,12 +70,12 @@ public WriteOptions setSync(final boolean flag) { * buffer cache (by calling WritableFile::Sync()) before the write * is considered complete. If this flag is true, writes will be * slower. - * + *
<p>
    * If this flag is false, and the machine crashes, some recent * writes may be lost. Note that if it is just the process that * crashes (i.e., the machine does not reboot), no writes will be * lost even if sync==false. - * + *
<p>
    * In other words, a DB write with sync==false has similar * crash semantics as the "write()" system call. A DB write * with sync==true has similar crash semantics to a "write()" @@ -121,7 +120,7 @@ public boolean disableWAL() { * If true and if user is trying to write to column families that don't exist * (they were dropped), ignore the write (don't return an error). If there * are multiple writes in a WriteBatch, other writes will succeed. - * + *
<p>
    * Default: false * * @param ignoreMissingColumnFamilies true to ignore writes to column families @@ -138,7 +137,7 @@ public WriteOptions setIgnoreMissingColumnFamilies( * If true and if user is trying to write to column families that don't exist * (they were dropped), ignore the write (don't return an error). If there * are multiple writes in a WriteBatch, other writes will succeed. - * + *
<p>
    * Default: false * * @return true if writes to column families which don't exist are ignored @@ -175,7 +174,7 @@ public boolean noSlowdown() { * will be cancelled immediately with {@link Status.Code#Incomplete} returned. * Otherwise, it will be slowed down. The slowdown value is determined by * RocksDB to guarantee it introduces minimum impacts to high priority writes. - * + *
<p>
    * Default: false * * @param lowPri true if the write request should be of lower priority than @@ -191,7 +190,7 @@ public WriteOptions setLowPri(final boolean lowPri) { /** * Returns true if this write request is of lower priority if compaction is * behind. - * + *
<p>
    * See {@link #setLowPri(boolean)}. * * @return true if this write request is of lower priority, false otherwise. @@ -206,7 +205,7 @@ public boolean lowPri() { * in concurrent writes if keys in one writebatch are sequential. In * non-concurrent writes (when {@code concurrent_memtable_writes} is false) this * option will be ignored. - * + *
<p>
    * Default: false * * @return true if writebatch will maintain the last insert positions of each memtable as hints in @@ -222,7 +221,7 @@ public boolean memtableInsertHintPerBatch() { * in concurrent writes if keys in one writebatch are sequential. In * non-concurrent writes (when {@code concurrent_memtable_writes} is false) this * option will be ignored. - * + *
<p>
    * Default: false * * @param memtableInsertHintPerBatch true if writebatch should maintain the last insert positions @@ -234,8 +233,8 @@ public WriteOptions setMemtableInsertHintPerBatch(final boolean memtableInsertHi return this; } - private native static long newWriteOptions(); - private native static long copyWriteOptions(long handle); + private static native long newWriteOptions(); + private static native long copyWriteOptions(long handle); @Override protected final native void disposeInternal(final long handle); private native void setSync(long handle, boolean flag); diff --git a/java/src/main/java/org/rocksdb/WriteStallCondition.java b/java/src/main/java/org/rocksdb/WriteStallCondition.java index 3bc9d410431d..98d9e2ce4adf 100644 --- a/java/src/main/java/org/rocksdb/WriteStallCondition.java +++ b/java/src/main/java/org/rocksdb/WriteStallCondition.java @@ -6,9 +6,9 @@ package org.rocksdb; public enum WriteStallCondition { - NORMAL((byte) 0x0), - DELAYED((byte) 0x1), - STOPPED((byte) 0x2); + DELAYED((byte) 0x0), + STOPPED((byte) 0x1), + NORMAL((byte) 0x2); private final byte value; diff --git a/java/src/main/java/org/rocksdb/WriteStallInfo.java b/java/src/main/java/org/rocksdb/WriteStallInfo.java index 4aef0eda9ad6..1cade0acb8ed 100644 --- a/java/src/main/java/org/rocksdb/WriteStallInfo.java +++ b/java/src/main/java/org/rocksdb/WriteStallInfo.java @@ -51,12 +51,12 @@ public WriteStallCondition getPreviousCondition() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - WriteStallInfo that = (WriteStallInfo) o; + final WriteStallInfo that = (WriteStallInfo) o; return Objects.equals(columnFamilyName, that.columnFamilyName) && currentCondition == that.currentCondition && previousCondition == that.previousCondition; } diff --git a/java/src/main/java/org/rocksdb/util/BytewiseComparator.java b/java/src/main/java/org/rocksdb/util/BytewiseComparator.java index 9561b0a317b6..202241d3bad9 100644 --- a/java/src/main/java/org/rocksdb/util/BytewiseComparator.java +++ b/java/src/main/java/org/rocksdb/util/BytewiseComparator.java @@ -52,9 +52,9 @@ static int _compare(final ByteBuffer a, final ByteBuffer b) { return r; } + @SuppressWarnings("PMD.EmptyControlStatement") @Override - public void findShortestSeparator(final ByteBuffer start, - final ByteBuffer limit) { + public void findShortestSeparator(final ByteBuffer start, final ByteBuffer limit) { // Find length of common prefix final int minLength = Math.min(start.remaining(), limit.remaining()); int diffIndex = 0; diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index 9ad51c7c7369..53ff65d26377 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -3,12 +3,20 @@ import java.io.File; import java.io.IOException; +import java.util.Locale; public class Environment { - private static String OS = System.getProperty("os.name").toLowerCase(); - private static String ARCH = System.getProperty("os.arch").toLowerCase(); + @SuppressWarnings("FieldMayBeFinal") + private static String OS = System.getProperty("os.name").toLowerCase(Locale.getDefault()); + @SuppressWarnings("FieldMayBeFinal") + private static String ARCH = System.getProperty("os.arch").toLowerCase(Locale.getDefault()); + @SuppressWarnings("FieldMayBeFinal") private static String MUSL_ENVIRONMENT = 
System.getenv("ROCKSDB_MUSL_LIBC"); + private static final String LIBC_MUSL_PREFIX = "libc.musl"; + + private static final String SPARCV9 = "sparcv9"; + /** * Will be lazily initialised by {@link #isMuslLibc()} instead of the previous static * initialisation. The lazy initialisation prevents Windows from reporting suspicious behaviour of @@ -70,6 +78,7 @@ public static boolean isMuslLibc() { * * @return true if the environment has a musl libc, false otherwise. */ + @SuppressWarnings("PMD.EmptyCatchBlock") static boolean initIsMuslLibc() { // consider explicit user setting from environment first if ("true".equalsIgnoreCase(MUSL_ENVIRONMENT)) { @@ -114,7 +123,7 @@ static boolean initIsMuslLibc() { return false; } for (final File f : libFiles) { - if (f.getName().startsWith("libc.musl")) { + if (f.getName().startsWith(LIBC_MUSL_PREFIX)) { return true; } } @@ -132,7 +141,7 @@ public static boolean isOpenBSD() { } public static boolean is64Bit() { - if (ARCH.indexOf("sparcv9") >= 0) { + if (ARCH.contains(SPARCV9)) { return true; } return (ARCH.indexOf("64") > 0); diff --git a/java/src/main/java/org/rocksdb/util/IntComparator.java b/java/src/main/java/org/rocksdb/util/IntComparator.java index cc096cd14973..2caf0c601572 100644 --- a/java/src/main/java/org/rocksdb/util/IntComparator.java +++ b/java/src/main/java/org/rocksdb/util/IntComparator.java @@ -48,7 +48,7 @@ public int compare(final ByteBuffer a, final ByteBuffer b) { * * @return negative if a < b, 0 if a == b, positive otherwise */ - private final int compareIntKeys(final ByteBuffer a, final ByteBuffer b) { + private int compareIntKeys(final ByteBuffer a, final ByteBuffer b) { final int iA = a.getInt(); final int iB = b.getInt(); diff --git a/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java b/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java index 4c06f80aacd7..3d3c429416b0 100644 --- a/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java +++ b/java/src/main/java/org/rocksdb/util/ReverseBytewiseComparator.java @@ -38,9 +38,9 @@ public int compare(final ByteBuffer a, final ByteBuffer b) { return -BytewiseComparator._compare(a, b); } + @SuppressWarnings("PMD.EmptyControlStatement") @Override - public void findShortestSeparator(final ByteBuffer start, - final ByteBuffer limit) { + public void findShortestSeparator(final ByteBuffer start, final ByteBuffer limit) { // Find length of common prefix final int minLength = Math.min(start.remaining(), limit.remaining()); int diffIndex = 0; diff --git a/java/src/test/java/org/rocksdb/AbstractTransactionTest.java b/java/src/test/java/org/rocksdb/AbstractTransactionTest.java index 46685f9fd611..d57258009027 100644 --- a/java/src/test/java/org/rocksdb/AbstractTransactionTest.java +++ b/java/src/test/java/org/rocksdb/AbstractTransactionTest.java @@ -5,26 +5,22 @@ package org.rocksdb; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; - -import static java.nio.charset.StandardCharsets.UTF_8; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.fail; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; /** * Base class of {@link TransactionTest} and {@link OptimisticTransactionTest} */ public abstract class 
AbstractTransactionTest { - - protected final static byte[] TXN_TEST_COLUMN_FAMILY = "txn_test_cf" - .getBytes(); + protected static final byte[] TXN_TEST_COLUMN_FAMILY = "txn_test_cf".getBytes(); protected static final Random rand = PlatformRandomHelper. getPlatformSpecificRandomFactory(); @@ -107,8 +103,8 @@ public void clearSnapshot_none() throws RocksDBException { @Test public void commit() throws RocksDBException { - final byte k1[] = "rollback-key1".getBytes(UTF_8); - final byte v1[] = "rollback-value1".getBytes(UTF_8); + final byte[] k1 = "rollback-key1".getBytes(UTF_8); + final byte[] v1 = "rollback-value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb()) { try(final Transaction txn = dbContainer.beginTransaction()) { txn.put(k1, v1); @@ -124,8 +120,8 @@ public void commit() throws RocksDBException { @Test public void rollback() throws RocksDBException { - final byte k1[] = "rollback-key1".getBytes(UTF_8); - final byte v1[] = "rollback-value1".getBytes(UTF_8); + final byte[] k1 = "rollback-key1".getBytes(UTF_8); + final byte[] v1 = "rollback-value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb()) { try(final Transaction txn = dbContainer.beginTransaction()) { txn.put(k1, v1); @@ -141,10 +137,10 @@ public void rollback() throws RocksDBException { @Test public void savePoint() throws RocksDBException { - final byte k1[] = "savePoint-key1".getBytes(UTF_8); - final byte v1[] = "savePoint-value1".getBytes(UTF_8); - final byte k2[] = "savePoint-key2".getBytes(UTF_8); - final byte v2[] = "savePoint-value2".getBytes(UTF_8); + final byte[] k1 = "savePoint-key1".getBytes(UTF_8); + final byte[] v1 = "savePoint-value1".getBytes(UTF_8); + final byte[] k2 = "savePoint-key2".getBytes(UTF_8); + final byte[] v2 = "savePoint-value2".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions()) { @@ -179,8 +175,8 @@ public void savePoint() throws RocksDBException { @Test public void getPut_cf() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -193,8 +189,8 @@ public void getPut_cf() throws RocksDBException { @Test public void getPut() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -279,8 +275,8 @@ public void multiGetPutAsList() throws RocksDBException { @Test public void getForUpdate_cf() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -293,8 +289,8 @@ public void getForUpdate_cf() throws RocksDBException { @Test public void getForUpdate() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = 
"key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -306,12 +302,8 @@ public void getForUpdate() throws RocksDBException { @Test public void multiGetForUpdate_cf() throws RocksDBException { - final byte keys[][] = new byte[][] { - "key1".getBytes(UTF_8), - "key2".getBytes(UTF_8)}; - final byte values[][] = new byte[][] { - "value1".getBytes(UTF_8), - "value2".getBytes(UTF_8)}; + final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)}; + final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)}; try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); @@ -331,12 +323,8 @@ public void multiGetForUpdate_cf() throws RocksDBException { @Test public void multiGetForUpdate() throws RocksDBException { - final byte keys[][] = new byte[][]{ - "key1".getBytes(UTF_8), - "key2".getBytes(UTF_8)}; - final byte values[][] = new byte[][]{ - "value1".getBytes(UTF_8), - "value2".getBytes(UTF_8)}; + final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)}; + final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)}; try (final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); @@ -349,6 +337,53 @@ public void multiGetForUpdate() throws RocksDBException { } } + @Test + public void multiGetForUpdateAsList_cf() throws RocksDBException { + final List keys = Arrays.asList("key1".getBytes(UTF_8), "key2".getBytes(UTF_8)); + final List values = Arrays.asList("value1".getBytes(UTF_8), "value2".getBytes(UTF_8)); + + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final ColumnFamilyHandle testCf = dbContainer.getTestColumnFamily(); + final List cfList = Arrays.asList(testCf, testCf); + + assertThat(txn.multiGetForUpdateAsList(readOptions, cfList, keys)) + .isEqualTo(Arrays.asList(null, null)); + + txn.put(testCf, keys.get(0), values.get(0)); + txn.put(testCf, keys.get(1), values.get(1)); + final List result = txn.multiGetForUpdateAsList(readOptions, cfList, keys); + assertThat(result.size()).isEqualTo(values.size()); + for (int i = 0; i < result.size(); i++) { + assertThat(result.get(i)).isEqualTo(values.get(i)); + } + } + } + + @Test + public void multiGetForUpdateAsList() throws RocksDBException { + final List keys = Arrays.asList("key1".getBytes(UTF_8), "key2".getBytes(UTF_8)); + final List values = Arrays.asList("value1".getBytes(UTF_8), "value2".getBytes(UTF_8)); + + try (final DBContainer dbContainer = startDb(); + final ReadOptions readOptions = new ReadOptions(); + final Transaction txn = dbContainer.beginTransaction()) { + final List nulls = new ArrayList<>(); + nulls.add(null); + nulls.add(null); + assertThat(txn.multiGetForUpdateAsList(readOptions, keys)).isEqualTo(nulls); + + txn.put(keys.get(0), values.get(0)); + txn.put(keys.get(1), values.get(1)); + final List result = txn.multiGetForUpdateAsList(readOptions, keys); + assertThat(result.size()).isEqualTo(values.size()); + for (int i = 0; i < result.size(); i++) { + assertThat(result.get(i)).isEqualTo(values.get(i)); + } + } + } + @Test public void getIterator() throws RocksDBException { try(final DBContainer dbContainer = startDb(); @@ -449,12 +484,8 @@ public void delete() 
throws RocksDBException { @Test public void delete_parts_cf() throws RocksDBException { - final byte keyParts[][] = new byte[][] { - "ke".getBytes(UTF_8), - "y1".getBytes(UTF_8)}; - final byte valueParts[][] = new byte[][] { - "val".getBytes(UTF_8), - "ue1".getBytes(UTF_8)}; + final byte[][] keyParts = new byte[][] {"ke".getBytes(UTF_8), "y1".getBytes(UTF_8)}; + final byte[][] valueParts = new byte[][] {"val".getBytes(UTF_8), "ue1".getBytes(UTF_8)}; final byte[] key = concat(keyParts); final byte[] value = concat(valueParts); @@ -474,12 +505,8 @@ public void delete_parts_cf() throws RocksDBException { @Test public void delete_parts() throws RocksDBException { - final byte keyParts[][] = new byte[][] { - "ke".getBytes(UTF_8), - "y1".getBytes(UTF_8)}; - final byte valueParts[][] = new byte[][] { - "val".getBytes(UTF_8), - "ue1".getBytes(UTF_8)}; + final byte[][] keyParts = new byte[][] {"ke".getBytes(UTF_8), "y1".getBytes(UTF_8)}; + final byte[][] valueParts = new byte[][] {"val".getBytes(UTF_8), "ue1".getBytes(UTF_8)}; final byte[] key = concat(keyParts); final byte[] value = concat(valueParts); @@ -499,8 +526,8 @@ public void delete_parts() throws RocksDBException { @Test public void getPutUntracked_cf() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -513,8 +540,8 @@ public void getPutUntracked_cf() throws RocksDBException { @Test public void getPutUntracked() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -527,12 +554,8 @@ public void getPutUntracked() throws RocksDBException { @Deprecated @Test public void multiGetPutUntracked_cf() throws RocksDBException { - final byte keys[][] = new byte[][] { - "key1".getBytes(UTF_8), - "key2".getBytes(UTF_8)}; - final byte values[][] = new byte[][] { - "value1".getBytes(UTF_8), - "value2".getBytes(UTF_8)}; + final byte[][] keys = new byte[][] {"key1".getBytes(UTF_8), "key2".getBytes(UTF_8)}; + final byte[][] values = new byte[][] {"value1".getBytes(UTF_8), "value2".getBytes(UTF_8)}; try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); @@ -659,12 +682,8 @@ public void deleteUntracked() throws RocksDBException { @Test public void deleteUntracked_parts_cf() throws RocksDBException { - final byte keyParts[][] = new byte[][] { - "ke".getBytes(UTF_8), - "y1".getBytes(UTF_8)}; - final byte valueParts[][] = new byte[][] { - "val".getBytes(UTF_8), - "ue1".getBytes(UTF_8)}; + final byte[][] keyParts = new byte[][] {"ke".getBytes(UTF_8), "y1".getBytes(UTF_8)}; + final byte[][] valueParts = new byte[][] {"val".getBytes(UTF_8), "ue1".getBytes(UTF_8)}; final byte[] key = concat(keyParts); final byte[] value = concat(valueParts); @@ -682,12 +701,8 @@ public void deleteUntracked_parts_cf() throws RocksDBException { @Test public void deleteUntracked_parts() throws RocksDBException { - final byte keyParts[][] = new byte[][] { - "ke".getBytes(UTF_8), - "y1".getBytes(UTF_8)}; - final byte 
valueParts[][] = new byte[][] { - "val".getBytes(UTF_8), - "ue1".getBytes(UTF_8)}; + final byte[][] keyParts = new byte[][] {"ke".getBytes(UTF_8), "y1".getBytes(UTF_8)}; + final byte[][] valueParts = new byte[][] {"val".getBytes(UTF_8), "ue1".getBytes(UTF_8)}; final byte[] key = concat(keyParts); final byte[] value = concat(valueParts); @@ -724,12 +739,12 @@ public void enabledDisableIndexing() throws RocksDBException { @Test public void numKeys() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); - final byte k2[] = "key2".getBytes(UTF_8); - final byte v2[] = "value2".getBytes(UTF_8); - final byte k3[] = "key3".getBytes(UTF_8); - final byte v3[] = "value3".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + final byte[] k2 = "key2".getBytes(UTF_8); + final byte[] v2 = "value2".getBytes(UTF_8); + final byte[] k3 = "key3".getBytes(UTF_8); + final byte[] v3 = "value3".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { @@ -761,8 +776,8 @@ public void elapsedTime() throws RocksDBException, InterruptedException { @Test public void getWriteBatch() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { @@ -786,8 +801,8 @@ public void setLockTimeout() throws RocksDBException { @Test public void writeOptions() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final WriteOptions writeOptions = new WriteOptions() @@ -816,8 +831,8 @@ public void writeOptions() throws RocksDBException { @Test public void undoGetForUpdate_cf() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -831,8 +846,8 @@ public void undoGetForUpdate_cf() throws RocksDBException { @Test public void undoGetForUpdate() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); final Transaction txn = dbContainer.beginTransaction()) { @@ -845,12 +860,12 @@ public void undoGetForUpdate() throws RocksDBException { @Test public void rebuildFromWriteBatch() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); - final byte k2[] = "key2".getBytes(UTF_8); - final byte v2[] = "value2".getBytes(UTF_8); - final byte k3[] = "key3".getBytes(UTF_8); - final byte v3[] = "value3".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); + final byte[] k2 = "key2".getBytes(UTF_8); + final byte[] v2 = "value2".getBytes(UTF_8); + final byte[] k3 = 
"key3".getBytes(UTF_8); + final byte[] v3 = "value3".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final ReadOptions readOptions = new ReadOptions(); @@ -876,8 +891,8 @@ public void rebuildFromWriteBatch() throws RocksDBException { @Test public void getCommitTimeWriteBatch() throws RocksDBException { - final byte k1[] = "key1".getBytes(UTF_8); - final byte v1[] = "value1".getBytes(UTF_8); + final byte[] k1 = "key1".getBytes(UTF_8); + final byte[] v1 = "value1".getBytes(UTF_8); try(final DBContainer dbContainer = startDb(); final Transaction txn = dbContainer.beginTransaction()) { @@ -933,8 +948,7 @@ public List getCreatedSnapshots() { } } - protected static abstract class DBContainer - implements AutoCloseable { + protected abstract static class DBContainer implements AutoCloseable { protected final WriteOptions writeOptions; protected final List columnFamilyHandles; protected final ColumnFamilyOptions columnFamilyOptions; diff --git a/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java b/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java index 794bf04fb0be..b07f8d33c1d8 100644 --- a/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java +++ b/java/src/test/java/org/rocksdb/BackupEngineOptionsTest.java @@ -15,8 +15,7 @@ import org.junit.rules.ExpectedException; public class BackupEngineOptionsTest { - private final static String ARBITRARY_PATH = - System.getProperty("java.io.tmpdir"); + private static final String ARBITRARY_PATH = System.getProperty("java.io.tmpdir"); @ClassRule public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = @@ -61,13 +60,10 @@ public void infoLog() { try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) { assertThat(backupEngineOptions.infoLog()).isNull(); - try(final Options options = new Options(); - final Logger logger = new Logger(options){ - @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { - - } - }) { + try (final Options options = new Options(); final Logger logger = new Logger(options) { + @Override + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) {} + }) { backupEngineOptions.setInfoLog(logger); assertThat(backupEngineOptions.infoLog()).isEqualTo(logger); } @@ -85,7 +81,7 @@ public void sync() { @Test public void destroyOldData() { - try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH);) { + try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) { final boolean value = rand.nextBoolean(); backupEngineOptions.setDestroyOldData(value); assertThat(backupEngineOptions.destroyOldData()).isEqualTo(value); @@ -154,7 +150,7 @@ public void restoreRateLimiter() { @Test public void shareFilesWithChecksum() { try (final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH)) { - boolean value = rand.nextBoolean(); + final boolean value = rand.nextBoolean(); backupEngineOptions.setShareFilesWithChecksum(value); assertThat(backupEngineOptions.shareFilesWithChecksum()).isEqualTo(value); } @@ -181,7 +177,7 @@ public void callbackTriggerIntervalSize() { @Test public void failBackupDirIsNull() { exception.expect(IllegalArgumentException.class); - try (final BackupEngineOptions opts = new BackupEngineOptions(null)) { + try (final BackupEngineOptions ignored = new BackupEngineOptions(null)) { //no-op } } @@ -202,7 +198,7 @@ public void failSetShareTableFilesIfDisposed() { @Test public void 
failShareTableFilesIfDisposed() { - try (BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) { + try (final BackupEngineOptions options = setupUninitializedBackupEngineOptions(exception)) { options.shareTableFiles(); } } @@ -291,7 +287,8 @@ public void failShareFilesWithChecksumIfDisposed() { } } - private BackupEngineOptions setupUninitializedBackupEngineOptions(ExpectedException exception) { + private BackupEngineOptions setupUninitializedBackupEngineOptions( + final ExpectedException exception) { final BackupEngineOptions backupEngineOptions = new BackupEngineOptions(ARBITRARY_PATH); backupEngineOptions.close(); exception.expect(AssertionError.class); diff --git a/java/src/test/java/org/rocksdb/BlobOptionsTest.java b/java/src/test/java/org/rocksdb/BlobOptionsTest.java index fe3d9b246a57..a0a2af84a858 100644 --- a/java/src/test/java/org/rocksdb/BlobOptionsTest.java +++ b/java/src/test/java/org/rocksdb/BlobOptionsTest.java @@ -7,8 +7,6 @@ import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; -import java.io.File; -import java.io.FilenameFilter; import java.util.*; import org.junit.ClassRule; import org.junit.Rule; @@ -34,35 +32,29 @@ public class BlobOptionsTest { */ @SuppressWarnings("CallToStringConcatCanBeReplacedByOperator") private int countDBFiles(final String endsWith) { - return Objects - .requireNonNull(dbFolder.getRoot().list(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - return name.endsWith(endsWith); - } - })) + return Objects.requireNonNull(dbFolder.getRoot().list((dir, name) -> name.endsWith(endsWith))) .length; } @SuppressWarnings("SameParameterValue") - private byte[] small_key(String suffix) { + private byte[] small_key(final String suffix) { return ("small_key_" + suffix).getBytes(UTF_8); } @SuppressWarnings("SameParameterValue") - private byte[] small_value(String suffix) { + private byte[] small_value(final String suffix) { return ("small_value_" + suffix).getBytes(UTF_8); } - private byte[] large_key(String suffix) { + private byte[] large_key(final String suffix) { return ("large_key_" + suffix).getBytes(UTF_8); } - private byte[] large_value(String repeat) { + private byte[] large_value(final String repeat) { final byte[] large_value = ("" + repeat + "_" + largeBlobSize + "b").getBytes(UTF_8); final byte[] large_buffer = new byte[largeBlobSize]; for (int pos = 0; pos < largeBlobSize; pos += large_value.length) { - int numBytes = Math.min(large_value.length, large_buffer.length - pos); + final int numBytes = Math.min(large_value.length, large_buffer.length - pos); System.arraycopy(large_value, 0, large_buffer, pos, numBytes); } return large_buffer; @@ -232,14 +224,18 @@ public void testBlobWriteAboveThreshold() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { db.put(small_key("default"), small_value("default")); - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } // check there are no blobs in the database assertThat(countDBFiles(".sst")).isEqualTo(1); assertThat(countDBFiles(".blob")).isEqualTo(0); db.put(large_key("default"), large_value("default")); - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } // wrote and flushed a value larger than the 
blobbing threshold // check there is a single blob in the database @@ -277,7 +273,9 @@ public void testBlobWriteAboveThresholdCF() throws RocksDBException { final RocksDB db = RocksDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) { db.put(columnFamilyHandles.get(0), small_key("default"), small_value("default")); - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } assertThat(countDBFiles(".blob")).isEqualTo(0); @@ -338,12 +336,16 @@ public void testBlobWriteAboveThresholdCF() throws RocksDBException { db.put(columnFamilyHandles.get(1), large_key("column_family_1_k2"), large_value("column_family_1_k2")); - db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(1)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions, columnFamilyHandles.get(1)); + } assertThat(countDBFiles(".blob")).isEqualTo(1); db.put(columnFamilyHandles.get(2), large_key("column_family_2_k2"), large_value("column_family_2_k2")); - db.flush(new FlushOptions().setWaitForFlush(true), columnFamilyHandles.get(2)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions, columnFamilyHandles.get(2)); + } assertThat(countDBFiles(".blob")).isEqualTo(1); } } diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index 330881764dff..13247d1e6635 100644 --- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -110,7 +110,7 @@ public void jniPortal() throws Exception { tableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinarySearch); tableConfig.setChecksumType(ChecksumType.kNoChecksum); try (final Options options = new Options().setTableFormatConfig(tableConfig)) { - String opts = getOptionAsString(options); + final String opts = getOptionAsString(options); assertThat(opts).contains("index_type=kBinarySearch"); assertThat(opts).contains("data_block_index_type=kDataBlockBinarySearch"); assertThat(opts).contains("checksum=kNoChecksum"); @@ -121,7 +121,7 @@ public void jniPortal() throws Exception { tableConfig.setChecksumType(ChecksumType.kCRC32c); try (final Options options = new Options().setTableFormatConfig(tableConfig)) { options.useCappedPrefixExtractor(1); // Needed to use kHashSearch - String opts = getOptionAsString(options); + final String opts = getOptionAsString(options); assertThat(opts).contains("index_type=kHashSearch"); assertThat(opts).contains("data_block_index_type=kDataBlockBinaryAndHash"); assertThat(opts).contains("checksum=kCRC32c"); @@ -130,7 +130,7 @@ public void jniPortal() throws Exception { tableConfig.setIndexType(IndexType.kTwoLevelIndexSearch); tableConfig.setChecksumType(ChecksumType.kxxHash); try (final Options options = new Options().setTableFormatConfig(tableConfig)) { - String opts = getOptionAsString(options); + final String opts = getOptionAsString(options); assertThat(opts).contains("index_type=kTwoLevelIndexSearch"); assertThat(opts).contains("checksum=kxxHash"); } @@ -138,30 +138,29 @@ public void jniPortal() throws Exception { tableConfig.setIndexType(IndexType.kBinarySearchWithFirstKey); tableConfig.setChecksumType(ChecksumType.kxxHash64); try (final Options options = new Options().setTableFormatConfig(tableConfig)) { - 
String opts = getOptionAsString(options); + final String opts = getOptionAsString(options); assertThat(opts).contains("index_type=kBinarySearchWithFirstKey"); assertThat(opts).contains("checksum=kxxHash64"); } tableConfig.setChecksumType(ChecksumType.kXXH3); try (final Options options = new Options().setTableFormatConfig(tableConfig)) { - String opts = getOptionAsString(options); + final String opts = getOptionAsString(options); assertThat(opts).contains("checksum=kXXH3"); } } - private String getOptionAsString(Options options) throws Exception { + private String getOptionAsString(final Options options) throws Exception { options.setCreateIfMissing(true); - String dbPath = dbFolder.getRoot().getAbsolutePath(); - String result; - try (final RocksDB db = RocksDB.open(options, dbPath); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + final String result; + try (final RocksDB ignored = RocksDB.open(options, dbPath); final Stream pathStream = Files.walk(Paths.get(dbPath))) { - Path optionsPath = - pathStream - .filter(p -> p.getFileName().toString().startsWith("OPTIONS")) + final Path optionsPath = + pathStream.filter(p -> p.getFileName().toString().startsWith("OPTIONS")) .findAny() .orElseThrow(() -> new AssertionError("Missing options file")); - byte[] optionsData = Files.readAllBytes(optionsPath); + final byte[] optionsData = Files.readAllBytes(optionsPath); result = new String(optionsData, StandardCharsets.UTF_8); } RocksDB.destroyDB(dbPath, options); @@ -230,63 +229,6 @@ protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { } } - @Test - public void blockCacheCompressed() { - try (final Cache cache = new LRUCache(17 * 1024 * 1024); - final Options options = new Options().setTableFormatConfig( - new BlockBasedTableConfig().setBlockCacheCompressed(cache))) { - assertThat(options.tableFactoryName()).isEqualTo("BlockBasedTable"); - } - } - - @Ignore("See issue: https://github.com/facebook/rocksdb/issues/4822") - @Test - public void blockCacheCompressedIntegration() throws RocksDBException { - final byte[] key1 = "some-key1".getBytes(StandardCharsets.UTF_8); - final byte[] key2 = "some-key1".getBytes(StandardCharsets.UTF_8); - final byte[] key3 = "some-key1".getBytes(StandardCharsets.UTF_8); - final byte[] key4 = "some-key1".getBytes(StandardCharsets.UTF_8); - final byte[] value = "some-value".getBytes(StandardCharsets.UTF_8); - - try (final Cache compressedCache = new LRUCache(8 * 1024 * 1024); - final Statistics statistics = new Statistics()) { - - final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig() - .setNoBlockCache(true) - .setBlockCache(null) - .setBlockCacheCompressed(compressedCache) - .setFormatVersion(4); - - try (final Options options = new Options() - .setCreateIfMissing(true) - .setStatistics(statistics) - .setTableFormatConfig(blockBasedTableConfig)) { - - for (int shard = 0; shard < 8; shard++) { - try (final FlushOptions flushOptions = new FlushOptions(); - final WriteOptions writeOptions = new WriteOptions(); - final ReadOptions readOptions = new ReadOptions(); - final RocksDB db = - RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "/" + shard)) { - - db.put(writeOptions, key1, value); - db.put(writeOptions, key2, value); - db.put(writeOptions, key3, value); - db.put(writeOptions, key4, value); - db.flush(flushOptions); - - db.get(readOptions, key1); - db.get(readOptions, key2); - db.get(readOptions, key3); - db.get(readOptions, key4); - - 
assertThat(statistics.getTickerCount(TickerType.BLOCK_CACHE_COMPRESSED_ADD)).isEqualTo(shard + 1); - } - } - } - } - } - @Test public void blockSize() { final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); @@ -414,7 +356,7 @@ public void invalidFormatVersion() throws RocksDBException { new BlockBasedTableConfig().setFormatVersion(99999); try (final Options options = new Options().setTableFormatConfig(blockBasedTableConfig); - final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { fail("Opening the database with an invalid format_version should have raised an exception"); } } @@ -470,21 +412,4 @@ public void blockCacheNumShardBits() { isEqualTo(5); } - @Deprecated - @Test - public void blockCacheCompressedSize() { - final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockCacheCompressedSize(40); - assertThat(blockBasedTableConfig.blockCacheCompressedSize()). - isEqualTo(40); - } - - @Deprecated - @Test - public void blockCacheCompressedNumShardBits() { - final BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); - blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); - assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()). - isEqualTo(4); - } } diff --git a/java/src/test/java/org/rocksdb/ByteBufferUnsupportedOperationTest.java b/java/src/test/java/org/rocksdb/ByteBufferUnsupportedOperationTest.java new file mode 100644 index 000000000000..f596f573f29c --- /dev/null +++ b/java/src/test/java/org/rocksdb/ByteBufferUnsupportedOperationTest.java @@ -0,0 +1,132 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +package org.rocksdb; + +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.ReverseBytewiseComparator; + +public class ByteBufferUnsupportedOperationTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + public static class Handler { + private final RocksDB database; + private final Map columnFamilies; + + public Handler(final String path, final Options options) throws RocksDBException { + RocksDB.destroyDB(path, options); + this.database = RocksDB.open(options, path); + this.columnFamilies = new ConcurrentHashMap<>(); + } + + public void addTable(final UUID streamID) throws RocksDBException { + final ColumnFamilyOptions tableOptions = new ColumnFamilyOptions(); + tableOptions.optimizeUniversalStyleCompaction(); + try (final ComparatorOptions comparatorOptions = new ComparatorOptions()) { + // comparatorOptions.setReusedSynchronisationType(ReusedSynchronisationType.ADAPTIVE_MUTEX); + tableOptions.setComparator(new ReverseBytewiseComparator(comparatorOptions)); + final ColumnFamilyDescriptor tableDescriptor = new ColumnFamilyDescriptor( + streamID.toString().getBytes(StandardCharsets.UTF_8), tableOptions); + final ColumnFamilyHandle tableHandle = database.createColumnFamily(tableDescriptor); + columnFamilies.put(streamID, tableHandle); + } + } + + public void updateAll(final List keyValuePairs, final UUID streamID) + throws RocksDBException { + final ColumnFamilyHandle currTable = columnFamilies.get(streamID); + try (final WriteBatch batchedWrite = new WriteBatch(); + final WriteOptions writeOptions = new WriteOptions()) { + for (final byte[][] pair : keyValuePairs) { + final byte[] keyBytes = pair[0]; + final byte[] valueBytes = pair[1]; + batchedWrite.put(currTable, keyBytes, valueBytes); + } + database.write(writeOptions, batchedWrite); + } + } + public boolean containsValue(final byte[] encodedValue, final UUID streamID) { + try (final RocksIterator iter = database.newIterator(columnFamilies.get(streamID))) { + iter.seekToFirst(); + while (iter.isValid()) { + final byte[] val = iter.value(); + if (Arrays.equals(val, encodedValue)) { + return true; + } + iter.next(); + } + } + return false; + } + + public void close() { + for (final ColumnFamilyHandle handle : columnFamilies.values()) { + handle.close(); + } + database.close(); + } + } + + private void inner(final int numRepeats) throws RocksDBException { + final Options opts = new Options(); + opts.setCreateIfMissing(true); + final Handler handler = new Handler("testDB", opts); + final UUID stream1 = UUID.randomUUID(); + + final List entries = new ArrayList<>(); + for (int i = 0; i < numRepeats; i++) { + final byte[] value = value(i); + final byte[] key = key(i); + entries.add(new byte[][] {key, value}); + } + handler.addTable(stream1); + handler.updateAll(entries, stream1); + + for (int i = 0; i < numRepeats; i++) { + final byte[] val = value(i); + final boolean hasValue = handler.containsValue(val, stream1); + if (!hasValue) { + throw new IllegalStateException("not has value " + i); + } + } + + handler.close(); + } + + private static byte[] key(final int i) { + return ("key" + i).getBytes(StandardCharsets.UTF_8); + } + + private static byte[] value(final int i) { + return ("value" + 
i).getBytes(StandardCharsets.UTF_8);
+  }
+
+  @Test
+  public void unsupportedOperation() throws RocksDBException {
+    final int numRepeats = 1000;
+    final int repeatTest = 10;
+
+    // the error is not always reproducible... let's try to increase the odds by repeating the main
+    // test body
+    for (int i = 0; i < repeatTest; i++) {
+      try {
+        inner(numRepeats);
+      } catch (final RuntimeException runtimeException) {
+        System.out.println("Exception on repeat " + i);
+        throw runtimeException;
+      }
+    }
+  }
+}
diff --git a/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java b/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java
index fe950362b7da..13aa6c2bdbb5 100644
--- a/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java
+++ b/java/src/test/java/org/rocksdb/BytewiseComparatorRegressionTest.java
@@ -21,7 +21,7 @@
  * by a change made between 6.2.2 and 6.22.1,
  * to wit {@link ...}
  * which as part of its effect, changed the Java bytewise comparators.
- *
+ *
    * {@link ...} * {@link ...} */ @@ -34,8 +34,8 @@ public class BytewiseComparatorRegressionTest { @Rule public TemporaryFolder temporarySSTFolder = new TemporaryFolder(); - private final static byte[][] testData = {{10, -11, 13}, {10, 11, 12}, {10, 11, 14}}; - private final static byte[][] orderedData = {{10, 11, 12}, {10, 11, 14}, {10, -11, 13}}; + private static final byte[][] testData = {{10, -11, 13}, {10, 11, 12}, {10, 11, 14}}; + private static final byte[][] orderedData = {{10, 11, 12}, {10, 11, 14}, {10, -11, 13}}; /** * {@link ...} @@ -43,12 +43,16 @@ public class BytewiseComparatorRegressionTest { @Test public void testJavaComparator() throws RocksDBException { final BytewiseComparator comparator = new BytewiseComparator(new ComparatorOptions()); - performTest(new Options().setCreateIfMissing(true).setComparator(comparator)); + try (final Options options = new Options().setCreateIfMissing(true).setComparator(comparator)) { + performTest(options); + } } @Test public void testDefaultComparator() throws RocksDBException { - performTest(new Options().setCreateIfMissing(true)); + try (final Options options = new Options().setCreateIfMissing(true)) { + performTest(options); + } } /** @@ -56,8 +60,10 @@ public void testDefaultComparator() throws RocksDBException { */ @Test public void testCppComparator() throws RocksDBException { - performTest(new Options().setCreateIfMissing(true).setComparator( - BuiltinComparator.BYTEWISE_COMPARATOR)); + try (final Options options = new Options().setCreateIfMissing(true).setComparator( + BuiltinComparator.BYTEWISE_COMPARATOR)) { + performTest(options); + } } private void performTest(final Options options) throws RocksDBException { diff --git a/java/src/test/java/org/rocksdb/CheckPointTest.java b/java/src/test/java/org/rocksdb/CheckPointTest.java index c2cc6fc623d7..3b0b5d86a154 100644 --- a/java/src/test/java/org/rocksdb/CheckPointTest.java +++ b/java/src/test/java/org/rocksdb/CheckPointTest.java @@ -57,10 +57,27 @@ public void checkPoint() throws RocksDBException { } } + @Test + public void exportColumnFamily() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + db.put("key".getBytes(), "value".getBytes()); + try (final Checkpoint checkpoint = Checkpoint.create(db)) { + ExportImportFilesMetaData metadata1 = + checkpoint.exportColumnFamily(db.getDefaultColumnFamily(), + checkpointFolder.getRoot().getAbsolutePath() + "/export_column_family1"); + db.put("key2".getBytes(), "value2".getBytes()); + ExportImportFilesMetaData metadata2 = + checkpoint.exportColumnFamily(db.getDefaultColumnFamily(), + checkpointFolder.getRoot().getAbsolutePath() + "/export_column_family2"); + } + } + } + } + @Test(expected = IllegalArgumentException.class) public void failIfDbIsNull() { - try (final Checkpoint checkpoint = Checkpoint.create(null)) { - + try (final Checkpoint ignored = Checkpoint.create(null)) { } } diff --git a/java/src/test/java/org/rocksdb/ClockCacheTest.java b/java/src/test/java/org/rocksdb/ClockCacheTest.java index d1241ac75b82..718c24f70a36 100644 --- a/java/src/test/java/org/rocksdb/ClockCacheTest.java +++ b/java/src/test/java/org/rocksdb/ClockCacheTest.java @@ -18,8 +18,7 @@ public void newClockCache() { final long capacity = 1000; final int numShardBits = 16; final boolean strictCapacityLimit = true; - try(final Cache clockCache = new ClockCache(capacity, - numShardBits, strictCapacityLimit)) { + try (final Cache 
ignored = new ClockCache(capacity, numShardBits, strictCapacityLimit)) { //no op } } diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java index 7d758104882f..35a04a697f84 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java @@ -9,7 +9,6 @@ import static org.junit.Assert.assertEquals; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; import org.junit.ClassRule; @@ -27,11 +26,11 @@ public class ColumnFamilyOptionsTest { @Test public void copyConstructor() { - ColumnFamilyOptions origOpts = new ColumnFamilyOptions(); + final ColumnFamilyOptions origOpts = new ColumnFamilyOptions(); origOpts.setNumLevels(rand.nextInt(8)); origOpts.setTargetFileSizeMultiplier(rand.nextInt(100)); origOpts.setLevel0StopWritesTrigger(rand.nextInt(50)); - ColumnFamilyOptions copyOpts = new ColumnFamilyOptions(origOpts); + final ColumnFamilyOptions copyOpts = new ColumnFamilyOptions(origOpts); assertThat(origOpts.numLevels()).isEqualTo(copyOpts.numLevels()); assertThat(origOpts.targetFileSizeMultiplier()).isEqualTo(copyOpts.targetFileSizeMultiplier()); assertThat(origOpts.level0StopWritesTrigger()).isEqualTo(copyOpts.level0StopWritesTrigger()); @@ -39,7 +38,7 @@ public void copyConstructor() { @Test public void getColumnFamilyOptionsFromProps() { - Properties properties = new Properties(); + final Properties properties = new Properties(); properties.put("write_buffer_size", "112"); properties.put("max_write_buffer_number", "13"); @@ -90,16 +89,15 @@ public void failColumnFamilyOptionsFromPropsWithIllegalValue() { @Test(expected = IllegalArgumentException.class) public void failColumnFamilyOptionsFromPropsWithNullValue() { - try (final ColumnFamilyOptions opt = + try (final ColumnFamilyOptions ignored = ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null)) { } } @Test(expected = IllegalArgumentException.class) public void failColumnFamilyOptionsFromPropsWithEmptyProps() { - try (final ColumnFamilyOptions opt = - ColumnFamilyOptions.getColumnFamilyOptionsFromProps( - new Properties())) { + try (final ColumnFamilyOptions ignored = + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(new Properties())) { } } @@ -455,7 +453,7 @@ public void compressionPerLevel() { } columnFamilyOptions.setCompressionPerLevel(compressionTypeList); compressionTypeList = columnFamilyOptions.compressionPerLevel(); - for (CompressionType compressionType : compressionTypeList) { + for (final CompressionType compressionType : compressionTypeList) { assertThat(compressionType).isEqualTo( CompressionType.NO_COMPRESSION); } @@ -711,4 +709,14 @@ public void cfPaths() throws IOException { assertThat(options.cfPaths()).isEqualTo(paths); } } + + @Test + public void memtableMaxRangeDeletions() { + try (final ColumnFamilyOptions options = new ColumnFamilyOptions()) { + assertThat(options.memtableMaxRangeDeletions()).isEqualTo(0); + final int val = 32; + assertThat(options.setMemtableMaxRangeDeletions(val)).isEqualTo(options); + assertThat(options.memtableMaxRangeDeletions()).isEqualTo(val); + } + } } diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index e98327d93df0..fb8a45085505 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -22,16 +22,14 @@ public class ColumnFamilyTest { 
public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = new RocksNativeLibraryResource(); - @Rule - public TemporaryFolder dbFolder = new TemporaryFolder(); + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); @Test public void columnFamilyDescriptorName() throws RocksDBException { final byte[] cfName = "some_name".getBytes(UTF_8); try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions()) { - final ColumnFamilyDescriptor cfDescriptor = - new ColumnFamilyDescriptor(cfName, cfOptions); + final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, cfOptions); assertThat(cfDescriptor.getName()).isEqualTo(cfName); } } @@ -40,24 +38,23 @@ public void columnFamilyDescriptorName() throws RocksDBException { public void columnFamilyDescriptorOptions() throws RocksDBException { final byte[] cfName = "some_name".getBytes(UTF_8); - try(final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions() - .setCompressionType(CompressionType.BZLIB2_COMPRESSION)) { + try (final ColumnFamilyOptions cfOptions = + new ColumnFamilyOptions().setCompressionType(CompressionType.BZLIB2_COMPRESSION)) { final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, cfOptions); - assertThat(cfDescriptor.getOptions().compressionType()) - .isEqualTo(CompressionType.BZLIB2_COMPRESSION); + assertThat(cfDescriptor.getOptions().compressionType()) + .isEqualTo(CompressionType.BZLIB2_COMPRESSION); } } @Test public void listColumnFamilies() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { // Test listColumnFamilies - final List columnFamilyNames = RocksDB.listColumnFamilies(options, - dbFolder.getRoot().getAbsolutePath()); + final List columnFamilyNames = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); assertThat(columnFamilyNames).isNotNull(); assertThat(columnFamilyNames.size()).isGreaterThan(0); assertThat(columnFamilyNames.size()).isEqualTo(1); @@ -70,8 +67,7 @@ public void defaultColumnFamily() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { - final ColumnFamilyHandle cfh = db.getDefaultColumnFamily(); - try { + try (final ColumnFamilyHandle cfh = db.getDefaultColumnFamily()) { assertThat(cfh).isNotNull(); assertThat(cfh.getName()).isEqualTo("default".getBytes(UTF_8)); @@ -87,8 +83,6 @@ public void defaultColumnFamily() throws RocksDBException { assertThat(cfh).isNotNull(); assertThat(actualValue).isEqualTo(value); - } finally { - cfh.close(); } } } @@ -96,31 +90,25 @@ public void defaultColumnFamily() throws RocksDBException { @Test public void createColumnFamily() throws RocksDBException { final byte[] cfName = "new_cf".getBytes(UTF_8); - final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName, - new ColumnFamilyOptions()); + final ColumnFamilyDescriptor cfDescriptor = + new ColumnFamilyDescriptor(cfName, new ColumnFamilyOptions()); try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { - - final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor); - - try { + final RocksDB db = RocksDB.open(options, 
dbFolder.getRoot().getAbsolutePath())) { + try (final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor)) { assertThat(columnFamilyHandle.getName()).isEqualTo(cfName); assertThat(columnFamilyHandle.getID()).isEqualTo(1); final ColumnFamilyDescriptor latestDescriptor = columnFamilyHandle.getDescriptor(); assertThat(latestDescriptor.getName()).isEqualTo(cfName); - final List columnFamilyNames = RocksDB.listColumnFamilies( - options, dbFolder.getRoot().getAbsolutePath()); + final List columnFamilyNames = + RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); assertThat(columnFamilyNames).isNotNull(); assertThat(columnFamilyNames.size()).isGreaterThan(0); assertThat(columnFamilyNames.size()).isEqualTo(2); assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf"); - } finally { - columnFamilyHandle.close(); } } } @@ -147,7 +135,8 @@ public void openWithColumnFamilies() throws RocksDBException { db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), "dfvalue".getBytes()); db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), "newcfvalue".getBytes()); - String retVal = new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); + final String retVal = + new String(db.get(columnFamilyHandleList.get(1), "newcfkey1".getBytes())); assertThat(retVal).isEqualTo("newcfvalue"); assertThat((db.get(columnFamilyHandleList.get(1), "dfkey1".getBytes()))).isNull(); db.delete(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); @@ -160,8 +149,8 @@ public void openWithColumnFamilies() throws RocksDBException { @Test public void getWithOutValueAndCf() throws RocksDBException { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List cfDescriptors = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); final List columnFamilyHandleList = new ArrayList<>(); // Test open database with column family names @@ -202,7 +191,7 @@ public void createWriteDropColumnFamily() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - ColumnFamilyHandle tmpColumnFamilyHandle; + final ColumnFamilyHandle tmpColumnFamilyHandle; tmpColumnFamilyHandle = db.createColumnFamily( new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); @@ -223,8 +212,8 @@ public void createWriteDropColumnFamilies() throws RocksDBException { final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList)) { - ColumnFamilyHandle tmpColumnFamilyHandle = null; - ColumnFamilyHandle tmpColumnFamilyHandle2 = null; + final ColumnFamilyHandle tmpColumnFamilyHandle; + final ColumnFamilyHandle tmpColumnFamilyHandle2; tmpColumnFamilyHandle = db.createColumnFamily( new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); tmpColumnFamilyHandle2 = db.createColumnFamily( @@ -264,7 +253,7 @@ public void writeBatch() throws RocksDBException { writeBatch.delete(columnFamilyHandleList.get(1), "xyz".getBytes()); db.write(writeOpt, writeBatch); - assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes()) == null); + assertThat(db.get(columnFamilyHandleList.get(1), "xyz".getBytes())).isNull(); assertThat(new String(db.get(columnFamilyHandleList.get(1), 
"newcfkey".getBytes()))) .isEqualTo("value"); assertThat(new String(db.get(columnFamilyHandleList.get(1), "newcfkey2".getBytes()))) @@ -293,7 +282,7 @@ public void iteratorOnColumnFamily() throws RocksDBException { db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), "value2".getBytes()); try (final RocksIterator rocksIterator = db.newIterator(columnFamilyHandleList.get(1))) { rocksIterator.seekToFirst(); - Map refMap = new HashMap<>(); + final Map refMap = new HashMap<>(); refMap.put("newcfkey", "value"); refMap.put("newcfkey2", "value2"); int i = 0; @@ -323,8 +312,7 @@ public void multiGet() throws RocksDBException { db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); - final List keys = - Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + final List keys = Arrays.asList("key".getBytes(), "newcfkey".getBytes()); List retValues = db.multiGetAsList(columnFamilyHandleList, keys); assertThat(retValues.size()).isEqualTo(2); @@ -352,8 +340,7 @@ public void multiGetAsList() throws RocksDBException { db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); - final List keys = - Arrays.asList(new byte[][] {"key".getBytes(), "newcfkey".getBytes()}); + final List keys = Arrays.asList("key".getBytes(), "newcfkey".getBytes()); List retValues = db.multiGetAsList(columnFamilyHandleList, keys); assertThat(retValues.size()).isEqualTo(2); assertThat(new String(retValues.get(0))).isEqualTo("value"); @@ -528,15 +515,12 @@ public void testByteCreateFolumnFamily() throws RocksDBException { @Test public void testCFNamesWithZeroBytes() throws RocksDBException { - ColumnFamilyHandle cf1 = null, cf2 = null; try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - ) { + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { final byte[] b0 = new byte[] {0, 0}; final byte[] b1 = new byte[] {0, 1}; - cf1 = db.createColumnFamily(new ColumnFamilyDescriptor(b0)); - cf2 = db.createColumnFamily(new ColumnFamilyDescriptor(b1)); + db.createColumnFamily(new ColumnFamilyDescriptor(b0)); + db.createColumnFamily(new ColumnFamilyDescriptor(b1)); final List families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); assertThat(families).contains("default".getBytes(), b0, b1); @@ -545,14 +529,10 @@ public void testCFNamesWithZeroBytes() throws RocksDBException { @Test public void testCFNameSimplifiedChinese() throws RocksDBException { - ColumnFamilyHandle columnFamilyHandle = null; try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath()); - ) { + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { final String simplifiedChinese = "\u7b80\u4f53\u5b57"; - columnFamilyHandle = - db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); + db.createColumnFamily(new ColumnFamilyDescriptor(simplifiedChinese.getBytes())); final List families = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); @@ -563,7 +543,7 @@ public void testCFNameSimplifiedChinese() throws RocksDBException { @Test public void testDestroyColumnFamilyHandle() throws RocksDBException { try (final Options options = 
new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath());) { + final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { final byte[] name1 = "cf1".getBytes(); final byte[] name2 = "cf2".getBytes(); final ColumnFamilyDescriptor desc1 = new ColumnFamilyDescriptor(name1); diff --git a/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java b/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java index 18c187ddbab8..549b74beb1cc 100644 --- a/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java +++ b/java/src/test/java/org/rocksdb/CompactRangeOptionsTest.java @@ -18,81 +18,121 @@ public class CompactRangeOptionsTest { @Test public void exclusiveManualCompaction() { - CompactRangeOptions opt = new CompactRangeOptions(); - boolean value = false; - opt.setExclusiveManualCompaction(value); - assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); - value = true; - opt.setExclusiveManualCompaction(value); - assertThat(opt.exclusiveManualCompaction()).isEqualTo(value); + try (final CompactRangeOptions opt = new CompactRangeOptions()) { + opt.setExclusiveManualCompaction(false); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(false); + opt.setExclusiveManualCompaction(true); + assertThat(opt.exclusiveManualCompaction()).isEqualTo(true); + } } @Test public void bottommostLevelCompaction() { - CompactRangeOptions opt = new CompactRangeOptions(); - BottommostLevelCompaction value = BottommostLevelCompaction.kSkip; - opt.setBottommostLevelCompaction(value); - assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); - value = BottommostLevelCompaction.kForce; - opt.setBottommostLevelCompaction(value); - assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); - value = BottommostLevelCompaction.kIfHaveCompactionFilter; - opt.setBottommostLevelCompaction(value); - assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + try (final CompactRangeOptions opt = new CompactRangeOptions()) { + BottommostLevelCompaction value = BottommostLevelCompaction.kSkip; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kForce; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kIfHaveCompactionFilter; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + value = BottommostLevelCompaction.kForceOptimized; + opt.setBottommostLevelCompaction(value); + assertThat(opt.bottommostLevelCompaction()).isEqualTo(value); + } } @Test public void changeLevel() { - CompactRangeOptions opt = new CompactRangeOptions(); - boolean value = false; - opt.setChangeLevel(value); - assertThat(opt.changeLevel()).isEqualTo(value); - value = true; - opt.setChangeLevel(value); - assertThat(opt.changeLevel()).isEqualTo(value); + try (final CompactRangeOptions opt = new CompactRangeOptions()) { + opt.setChangeLevel(false); + assertThat(opt.changeLevel()).isEqualTo(false); + opt.setChangeLevel(true); + assertThat(opt.changeLevel()).isEqualTo(true); + } } @Test public void targetLevel() { - CompactRangeOptions opt = new CompactRangeOptions(); - int value = 2; - opt.setTargetLevel(value); - assertThat(opt.targetLevel()).isEqualTo(value); - value = 3; - opt.setTargetLevel(value); - assertThat(opt.targetLevel()).isEqualTo(value); + try (final CompactRangeOptions opt = new 
CompactRangeOptions()) { + int value = 2; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + value = 3; + opt.setTargetLevel(value); + assertThat(opt.targetLevel()).isEqualTo(value); + } } @Test public void targetPathId() { - CompactRangeOptions opt = new CompactRangeOptions(); - int value = 2; - opt.setTargetPathId(value); - assertThat(opt.targetPathId()).isEqualTo(value); - value = 3; - opt.setTargetPathId(value); - assertThat(opt.targetPathId()).isEqualTo(value); + try (final CompactRangeOptions opt = new CompactRangeOptions()) { + int value = 2; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + value = 3; + opt.setTargetPathId(value); + assertThat(opt.targetPathId()).isEqualTo(value); + } } @Test public void allowWriteStall() { - CompactRangeOptions opt = new CompactRangeOptions(); - boolean value = false; - opt.setAllowWriteStall(value); - assertThat(opt.allowWriteStall()).isEqualTo(value); - value = true; - opt.setAllowWriteStall(value); - assertThat(opt.allowWriteStall()).isEqualTo(value); + try (final CompactRangeOptions opt = new CompactRangeOptions()) { + opt.setAllowWriteStall(false); + assertThat(opt.allowWriteStall()).isEqualTo(false); + opt.setAllowWriteStall(true); + assertThat(opt.allowWriteStall()).isEqualTo(true); + } } @Test public void maxSubcompactions() { + try (final CompactRangeOptions opt = new CompactRangeOptions()) { + int value = 2; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + value = 3; + opt.setMaxSubcompactions(value); + assertThat(opt.maxSubcompactions()).isEqualTo(value); + } + } + + @Test + public void fullHistoryTSLow() { + CompactRangeOptions opt = new CompactRangeOptions(); + CompactRangeOptions.Timestamp timestamp = new CompactRangeOptions.Timestamp(18, 1); + opt.setFullHistoryTSLow(timestamp); + + for (int times = 1; times <= 2; times++) { + // worried slightly about destructive reads, so read it twice + CompactRangeOptions.Timestamp timestampResult = opt.fullHistoryTSLow(); + assertThat(timestamp.start).isEqualTo(timestampResult.start); + assertThat(timestamp.range).isEqualTo(timestampResult.range); + assertThat(timestamp).isEqualTo(timestampResult); + } + } + + @Test + public void fullHistoryTSLowDefault() { + CompactRangeOptions opt = new CompactRangeOptions(); + CompactRangeOptions.Timestamp timestampResult = opt.fullHistoryTSLow(); + assertThat(timestampResult).isNull(); + } + + @Test + public void canceled() { CompactRangeOptions opt = new CompactRangeOptions(); - int value = 2; - opt.setMaxSubcompactions(value); - assertThat(opt.maxSubcompactions()).isEqualTo(value); - value = 3; - opt.setMaxSubcompactions(value); - assertThat(opt.maxSubcompactions()).isEqualTo(value); + assertThat(opt.canceled()).isEqualTo(false); + opt.setCanceled(true); + assertThat(opt.canceled()).isEqualTo(true); + opt.setCanceled(false); + assertThat(opt.canceled()).isEqualTo(false); + opt.setCanceled(true); + assertThat(opt.canceled()).isEqualTo(true); + opt.setCanceled(true); + assertThat(opt.canceled()).isEqualTo(true); } } diff --git a/java/src/test/java/org/rocksdb/CompressionTypesTest.java b/java/src/test/java/org/rocksdb/CompressionTypesTest.java index e26cc0aca0f5..a983f471a58f 100644 --- a/java/src/test/java/org/rocksdb/CompressionTypesTest.java +++ b/java/src/test/java/org/rocksdb/CompressionTypesTest.java @@ -5,16 +5,21 @@ package org.rocksdb; -import org.junit.Test; +import static org.assertj.core.api.Assertions.assertThat; +import org.junit.Test; 
public class CompressionTypesTest { @Test public void getCompressionType() { for (final CompressionType compressionType : CompressionType.values()) { - String libraryName = compressionType.getLibraryName(); - compressionType.equals(CompressionType.getCompressionType( - libraryName)); + final String libraryName = compressionType.getLibraryName(); + if (compressionType == CompressionType.DISABLE_COMPRESSION_OPTION) { + assertThat(CompressionType.getCompressionType(libraryName)) + .isEqualTo(CompressionType.NO_COMPRESSION); + } else { + assertThat(CompressionType.getCompressionType(libraryName)).isEqualTo(compressionType); + } } } -} +} \ No newline at end of file diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index d55ceebcf778..cb7eabcfb118 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -27,11 +27,11 @@ public class DBOptionsTest { @Test public void copyConstructor() { - DBOptions origOpts = new DBOptions(); + final DBOptions origOpts = new DBOptions(); origOpts.setCreateIfMissing(rand.nextBoolean()); origOpts.setAllow2pc(rand.nextBoolean()); origOpts.setMaxBackgroundJobs(rand.nextInt(10)); - DBOptions copyOpts = new DBOptions(origOpts); + final DBOptions copyOpts = new DBOptions(origOpts); assertThat(origOpts.createIfMissing()).isEqualTo(copyOpts.createIfMissing()); assertThat(origOpts.allow2pc()).isEqualTo(copyOpts.allow2pc()); } @@ -437,9 +437,8 @@ public void dbWriteBufferSize() { @Test public void setWriteBufferManager() throws RocksDBException { - try (final DBOptions opt = new DBOptions(); - final Cache cache = new LRUCache(1 * 1024 * 1024); - final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache)) { + try (final DBOptions opt = new DBOptions(); final Cache cache = new LRUCache(1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000L, cache)) { opt.setWriteBufferManager(writeBufferManager); assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); } @@ -447,14 +446,14 @@ public void setWriteBufferManager() throws RocksDBException { @Test public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { - try (final DBOptions opt = new DBOptions(); - final Cache cache = new LRUCache(1 * 1024 * 1024); - final WriteBufferManager writeBufferManager = new WriteBufferManager(0l, cache)) { + try (final DBOptions opt = new DBOptions(); final Cache cache = new LRUCache(1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(0L, cache)) { opt.setWriteBufferManager(writeBufferManager); assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); } } + @SuppressWarnings("deprecated") @Test public void accessHintOnCompactionStart() { try(final DBOptions opt = new DBOptions()) { @@ -887,16 +886,18 @@ public void onMemTableSealed(final MemTableInfo memTableInfo) { wasCalled2.set(true); } }) { + assertThat(options.setListeners(null)).isEqualTo(options); + assertThat(options.listeners().size()).isEqualTo(0); assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); - List listeners = options.listeners(); + final List listeners = options.listeners(); assertEquals(el1, listeners.get(0)); assertEquals(el2, listeners.get(1)); - options.setListeners(Collections.emptyList()); + options.setListeners(Collections.emptyList()); listeners.get(0).onTableFileDeleted(null); assertTrue(wasCalled1.get()); 
listeners.get(1).onMemTableSealed(null); assertTrue(wasCalled2.get()); - List listeners2 = options.listeners(); + final List listeners2 = options.listeners(); assertNotNull(listeners2); assertEquals(0, listeners2.size()); } diff --git a/java/src/test/java/org/rocksdb/EventListenerTest.java b/java/src/test/java/org/rocksdb/EventListenerTest.java index aec0af617edf..84be232f972d 100644 --- a/java/src/test/java/org/rocksdb/EventListenerTest.java +++ b/java/src/test/java/org/rocksdb/EventListenerTest.java @@ -181,7 +181,7 @@ void deleteColumnFamilyHandle(final AbstractEventListener el, final AtomicBoolea final byte[] value = new byte[24]; rand.nextBytes(value); db.put("testKey".getBytes(), value); - ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily(); + final ColumnFamilyHandle columnFamilyHandle = db.getDefaultColumnFamily(); columnFamilyHandle.close(); assertThat(wasCbCalled.get()).isTrue(); } @@ -266,7 +266,7 @@ public void testAllCallbacksInvocation() { final FileOperationInfo fileOperationInfoTestData = new FileOperationInfo("/file/path", TEST_LONG_VAL, TEST_LONG_VAL, 1_600_699_420_000_000_000L, 5_000_000_000L, statusTestData); final WriteStallInfo writeStallInfoTestData = - new WriteStallInfo("columnFamilyName", (byte) 0x1, (byte) 0x2); + new WriteStallInfo("columnFamilyName", (byte) 0x0, (byte) 0x1); final ExternalFileIngestionInfo externalFileIngestionInfoTestData = new ExternalFileIngestionInfo("columnFamilyName", "/external/file/path", "/internal/file/path", TEST_LONG_VAL, tablePropertiesTestData); @@ -475,7 +475,7 @@ private static void assertEventsCalled( private static void assertNoCallbackErrors( final CapturingTestableEventListener capturingTestableEventListener) { - for (AssertionError error : capturingTestableEventListener.capturedAssertionErrors) { + for (final AssertionError error : capturingTestableEventListener.capturedAssertionErrors) { throw new Error("An assertion failed in callback", error); } } @@ -565,16 +565,16 @@ private static class ListenerEvents { private static class CapturingObjectAssert extends ObjectAssert { private final List assertionErrors; - public CapturingObjectAssert(T t, List assertionErrors) { + public CapturingObjectAssert(final T t, final List assertionErrors) { super(t); this.assertionErrors = assertionErrors; } @Override - public ObjectAssert isEqualTo(Object other) { + public ObjectAssert isEqualTo(final Object other) { try { return super.isEqualTo(other); - } catch (AssertionError error) { + } catch (final AssertionError error) { assertionErrors.add(error); throw error; } @@ -584,7 +584,7 @@ public ObjectAssert isEqualTo(Object other) { public ObjectAssert isNotNull() { try { return super.isNotNull(); - } catch (AssertionError error) { + } catch (final AssertionError error) { assertionErrors.add(error); throw error; } @@ -596,8 +596,8 @@ private static class CapturingTestableEventListener extends TestableEventListene final List capturedAssertionErrors = new ArrayList<>(); - protected AbstractObjectAssert assertThat(T actual) { - return new CapturingObjectAssert(actual, capturedAssertionErrors); + protected AbstractObjectAssert assertThat(final T actual) { + return new CapturingObjectAssert<>(actual, capturedAssertionErrors); } public CapturingTestableEventListener() {} diff --git a/java/src/test/java/org/rocksdb/HyperClockCacheTest.java b/java/src/test/java/org/rocksdb/HyperClockCacheTest.java new file mode 100644 index 000000000000..132d69351560 --- /dev/null +++ b/java/src/test/java/org/rocksdb/HyperClockCacheTest.java @@ 
-0,0 +1,36 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class HyperClockCacheTest {
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void newHyperClockCache() throws RocksDBException {
+    RocksDB.loadLibrary();
+    try (Cache cache = new HyperClockCache(1024 * 1024, 0, 8, false)) {
+      BlockBasedTableConfig tableConfing = new BlockBasedTableConfig();
+      tableConfing.setBlockCache(cache);
+      try (Options options = new Options()) {
+        options.setTableFormatConfig(tableConfing);
+        options.setCreateIfMissing(true);
+        try (RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+          db.put("testKey".getBytes(), "testData".getBytes());
+          // no op
+          assertThat(cache.getUsage()).isGreaterThanOrEqualTo(0);
+          assertThat(cache.getPinnedUsage()).isGreaterThanOrEqualTo(0);
+        }
+      }
+    }
+  }
+}
diff --git a/java/src/test/java/org/rocksdb/ImportColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ImportColumnFamilyTest.java
new file mode 100644
index 000000000000..ee569d497561
--- /dev/null
+++ b/java/src/test/java/org/rocksdb/ImportColumnFamilyTest.java
@@ -0,0 +1,98 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+package org.rocksdb;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+import org.rocksdb.util.BytewiseComparator;
+
+public class ImportColumnFamilyTest {
+  private static final String SST_FILE_NAME = "test.sst";
+  private static final String DB_DIRECTORY_NAME = "test_db";
+
+  @ClassRule
+  public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE =
+      new RocksNativeLibraryResource();
+
+  @Rule public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Rule public TemporaryFolder checkpointFolder = new TemporaryFolder();
+
+  @Test
+  public void testImportColumnFamily() throws RocksDBException {
+    try (final Options options = new Options().setCreateIfMissing(true)) {
+      try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) {
+        db.put("key".getBytes(), "value".getBytes());
+        db.put("key1".getBytes(), "value1".getBytes());
+
+        try (final Checkpoint checkpoint = Checkpoint.create(db);
+             final ImportColumnFamilyOptions importColumnFamilyOptions =
+                 new ImportColumnFamilyOptions()) {
+          ExportImportFilesMetaData default_cf_metadata =
+              checkpoint.exportColumnFamily(db.getDefaultColumnFamily(),
+                  checkpointFolder.getRoot().getAbsolutePath() + "/default_cf_metadata");
+          ColumnFamilyDescriptor columnFamilyDescriptor =
+              new ColumnFamilyDescriptor("new_cf".getBytes());
+          final ColumnFamilyHandle importCfHandle = db.createColumnFamilyWithImport(
+              columnFamilyDescriptor,
importColumnFamilyOptions, default_cf_metadata); + assertThat(db.get(importCfHandle, "key".getBytes())).isEqualTo("value".getBytes()); + assertThat(db.get(importCfHandle, "key1".getBytes())).isEqualTo("value1".getBytes()); + } + } + } + } + + @Test + public void ImportMultiColumnFamilyTest() throws RocksDBException { + try (final Options options = new Options().setCreateIfMissing(true)) { + try (final RocksDB db1 = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "db1"); + final RocksDB db2 = + RocksDB.open(options, dbFolder.getRoot().getAbsolutePath() + "db2");) { + db1.put("key".getBytes(), "value".getBytes()); + db1.put("key1".getBytes(), "value1".getBytes()); + db2.put("key2".getBytes(), "value2".getBytes()); + db2.put("key3".getBytes(), "value3".getBytes()); + try (final Checkpoint checkpoint1 = Checkpoint.create(db1); + final Checkpoint checkpoint2 = Checkpoint.create(db2); + final ImportColumnFamilyOptions importColumnFamilyOptions = + new ImportColumnFamilyOptions()) { + ExportImportFilesMetaData default_cf_metadata1 = + checkpoint1.exportColumnFamily(db1.getDefaultColumnFamily(), + checkpointFolder.getRoot().getAbsolutePath() + "/default_cf_metadata1"); + ExportImportFilesMetaData default_cf_metadata2 = + checkpoint2.exportColumnFamily(db2.getDefaultColumnFamily(), + checkpointFolder.getRoot().getAbsolutePath() + "/default_cf_metadata2"); + + ColumnFamilyDescriptor columnFamilyDescriptor = + new ColumnFamilyDescriptor("new_cf".getBytes()); + + List importMetaDatas = new ArrayList(); + importMetaDatas.add(default_cf_metadata1); + importMetaDatas.add(default_cf_metadata2); + + final ColumnFamilyHandle importCfHandle = db1.createColumnFamilyWithImport( + columnFamilyDescriptor, importColumnFamilyOptions, importMetaDatas); + assertThat(db1.get(importCfHandle, "key".getBytes())).isEqualTo("value".getBytes()); + assertThat(db1.get(importCfHandle, "key1".getBytes())).isEqualTo("value1".getBytes()); + assertThat(db1.get(importCfHandle, "key2".getBytes())).isEqualTo("value2".getBytes()); + assertThat(db1.get(importCfHandle, "key3".getBytes())).isEqualTo("value3".getBytes()); + } + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java index 12ee537d9cf8..90b0b4e2deab 100644 --- a/java/src/test/java/org/rocksdb/InfoLogLevelTest.java +++ b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java @@ -95,12 +95,12 @@ private String getLogContentsWithoutHeader() throws IOException { int first_non_header = lines.length; // Identify the last line of the header for (int i = lines.length - 1; i >= 0; --i) { - if (lines[i].indexOf("DB pointer") >= 0) { + if (lines[i].contains("DB pointer")) { first_non_header = i + 1; break; } } - StringBuilder builder = new StringBuilder(); + final StringBuilder builder = new StringBuilder(); for (int i = first_non_header; i < lines.length; ++i) { builder.append(lines[i]).append(separator); } diff --git a/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java b/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java index ab7e21568204..2306946152d3 100644 --- a/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java +++ b/java/src/test/java/org/rocksdb/IngestExternalFileOptionsTest.java @@ -99,9 +99,9 @@ public void ingestBehind() { public void writeGlobalSeqno() { try (final IngestExternalFileOptions options = new IngestExternalFileOptions()) { - assertThat(options.writeGlobalSeqno()).isTrue(); - options.setWriteGlobalSeqno(false); 
assertThat(options.writeGlobalSeqno()).isFalse(); + options.setWriteGlobalSeqno(true); + assertThat(options.writeGlobalSeqno()).isTrue(); } } } diff --git a/java/src/test/java/org/rocksdb/KeyExistsTest.java b/java/src/test/java/org/rocksdb/KeyExistsTest.java new file mode 100644 index 000000000000..1ee9bdce23fb --- /dev/null +++ b/java/src/test/java/org/rocksdb/KeyExistsTest.java @@ -0,0 +1,229 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +package org.rocksdb; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.*; +import org.junit.rules.ExpectedException; +import org.junit.rules.TemporaryFolder; + +public class KeyExistsTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule public ExpectedException exceptionRule = ExpectedException.none(); + + List cfDescriptors; + List columnFamilyHandleList = new ArrayList<>(); + RocksDB db; + @Before + public void before() throws RocksDBException { + cfDescriptors = Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + + db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + } + + @After + public void after() { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + db.close(); + } + + @Test + public void keyExists() throws RocksDBException { + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + boolean exists = db.keyExists("key".getBytes(UTF_8)); + assertThat(exists).isTrue(); + exists = db.keyExists("key2".getBytes(UTF_8)); + assertThat(exists).isFalse(); + } + + @Test + public void keyExistsColumnFamily() throws RocksDBException { + byte[] key1 = "keyBBCF0".getBytes(UTF_8); + byte[] key2 = "keyBBCF1".getBytes(UTF_8); + db.put(columnFamilyHandleList.get(0), key1, "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), key2, "valueBBCF1".getBytes(UTF_8)); + + assertThat(db.keyExists(columnFamilyHandleList.get(0), key1)).isTrue(); + assertThat(db.keyExists(columnFamilyHandleList.get(0), key2)).isFalse(); + + assertThat(db.keyExists(columnFamilyHandleList.get(1), key1)).isFalse(); + assertThat(db.keyExists(columnFamilyHandleList.get(1), key2)).isTrue(); + } + + @Test + public void keyExistsColumnFamilyReadOptions() throws RocksDBException { + try (final ReadOptions readOptions = new ReadOptions()) { + byte[] key1 = "keyBBCF0".getBytes(UTF_8); + byte[] key2 = "keyBBCF1".getBytes(UTF_8); + db.put(columnFamilyHandleList.get(0), key1, "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), key2, "valueBBCF1".getBytes(UTF_8)); + + assertThat(db.keyExists(columnFamilyHandleList.get(0), readOptions, key1)).isTrue(); + assertThat(db.keyExists(columnFamilyHandleList.get(0), readOptions, key2)).isFalse(); + + assertThat(db.keyExists(columnFamilyHandleList.get(1), readOptions, 
key1)).isFalse(); + assertThat(db.keyExists(columnFamilyHandleList.get(1), readOptions, key2)).isTrue(); + } + } + + @Test + public void keyExistsReadOptions() throws RocksDBException { + try (final ReadOptions readOptions = new ReadOptions()) { + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + boolean exists = db.keyExists(readOptions, "key".getBytes(UTF_8)); + assertThat(exists).isTrue(); + exists = db.keyExists("key2".getBytes(UTF_8)); + assertThat(exists).isFalse(); + } + } + + @Test + public void keyExistsAfterDelete() throws RocksDBException { + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + boolean exists = db.keyExists(null, null, "key".getBytes(UTF_8), 0, 3); + assertThat(exists).isTrue(); + db.delete("key".getBytes(UTF_8)); + exists = db.keyExists(null, null, "key".getBytes(UTF_8), 0, 3); + assertThat(exists).isFalse(); + } + + @Test + public void keyExistsArrayIndexOutOfBoundsException() throws RocksDBException { + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + exceptionRule.expect(IndexOutOfBoundsException.class); + db.keyExists(null, null, "key".getBytes(UTF_8), 0, 5); + } + + @Test() + public void keyExistsArrayIndexOutOfBoundsExceptionWrongOffset() throws RocksDBException { + db.put("key".getBytes(UTF_8), "value".getBytes(UTF_8)); + exceptionRule.expect(IndexOutOfBoundsException.class); + db.keyExists(null, null, "key".getBytes(UTF_8), 6, 2); + } + + @Test + public void keyExistsDirectByteBuffer() throws RocksDBException { + byte[] key = "key".getBytes(UTF_8); + + db.put(key, "value".getBytes(UTF_8)); + ByteBuffer buff = ByteBuffer.allocateDirect(key.length); + buff.put(key); + buff.flip(); + boolean exists = db.keyExists(buff); + assertThat(exists).isTrue(); + } + + @Test + public void keyExistsDirectByteBufferReadOptions() throws RocksDBException { + try (final ReadOptions readOptions = new ReadOptions()) { + byte[] key = "key".getBytes(UTF_8); + + db.put(key, "value".getBytes(UTF_8)); + ByteBuffer buff = ByteBuffer.allocateDirect(key.length); + buff.put(key); + buff.flip(); + + boolean exists = db.keyExists(buff); + assertThat(exists).isTrue(); + } + } + + @Test + public void keyExistsDirectByteBufferAfterDelete() throws RocksDBException { + byte[] key = "key".getBytes(UTF_8); + + db.put(key, "value".getBytes(UTF_8)); + ByteBuffer buff = ByteBuffer.allocateDirect(key.length); + buff.put(key); + buff.flip(); + boolean exists = db.keyExists(buff); + assertThat(exists).isTrue(); + db.delete(key); + exists = db.keyExists(buff); + assertThat(exists).isFalse(); + } + + @Test + public void keyExistsDirectByteBufferColumnFamily() throws RocksDBException { + byte[] key1 = "keyBBCF0".getBytes(UTF_8); + byte[] key2 = "keyBBCF1".getBytes(UTF_8); + db.put(columnFamilyHandleList.get(0), key1, "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), key2, "valueBBCF1".getBytes(UTF_8)); + + ByteBuffer key1Buff = ByteBuffer.allocateDirect(key1.length); + key1Buff.put(key1); + key1Buff.flip(); + + ByteBuffer key2Buff = ByteBuffer.allocateDirect(key2.length); + key2Buff.put(key2); + key2Buff.flip(); + + assertThat(db.keyExists(columnFamilyHandleList.get(0), key1Buff)).isTrue(); + assertThat(db.keyExists(columnFamilyHandleList.get(0), key2Buff)).isFalse(); + + assertThat(db.keyExists(columnFamilyHandleList.get(1), key1Buff)).isFalse(); + assertThat(db.keyExists(columnFamilyHandleList.get(1), key2Buff)).isTrue(); + } + + @Test + public void keyExistsDirectByteBufferColumnFamilyReadOptions() throws RocksDBException { + try (final ReadOptions 
readOptions = new ReadOptions()) { + byte[] key1 = "keyBBCF0".getBytes(UTF_8); + byte[] key2 = "keyBBCF1".getBytes(UTF_8); + db.put(columnFamilyHandleList.get(0), key1, "valueBBCF0".getBytes(UTF_8)); + db.put(columnFamilyHandleList.get(1), key2, "valueBBCF1".getBytes(UTF_8)); + + ByteBuffer key1Buff = ByteBuffer.allocateDirect(key1.length); + key1Buff.put(key1); + key1Buff.flip(); + + ByteBuffer key2Buff = ByteBuffer.allocateDirect(key2.length); + key2Buff.put(key2); + key2Buff.flip(); + + assertThat(db.keyExists(columnFamilyHandleList.get(0), readOptions, key1Buff)).isTrue(); + assertThat(db.keyExists(columnFamilyHandleList.get(0), readOptions, key2Buff)).isFalse(); + + assertThat(db.keyExists(columnFamilyHandleList.get(1), readOptions, key1Buff)).isFalse(); + assertThat(db.keyExists(columnFamilyHandleList.get(1), readOptions, key2Buff)).isTrue(); + } + } + + @Test + public void keyExistsDirectReadOptions() throws RocksDBException { + try (final ReadOptions readOptions = new ReadOptions()) { + byte[] key = "key1".getBytes(UTF_8); + db.put(key, "value".getBytes(UTF_8)); + ByteBuffer buff = ByteBuffer.allocateDirect(key.length); + buff.put(key); + buff.flip(); + boolean exists = db.keyExists(readOptions, key); + assertThat(exists).isTrue(); + buff.clear(); + + buff.put("key2".getBytes(UTF_8)); + buff.flip(); + exists = db.keyExists("key2".getBytes(UTF_8)); + assertThat(exists).isFalse(); + } + } +} diff --git a/java/src/test/java/org/rocksdb/LoggerTest.java b/java/src/test/java/org/rocksdb/LoggerTest.java index 5bc299f110c3..b6a7be55e7f0 100644 --- a/java/src/test/java/org/rocksdb/LoggerTest.java +++ b/java/src/test/java/org/rocksdb/LoggerTest.java @@ -1,17 +1,16 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. package org.rocksdb; -import org.junit.ClassRule; -import org.junit.Rule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import static org.assertj.core.api.Assertions.assertThat; import java.util.ArrayList; -import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; - -import static org.assertj.core.api.Assertions.assertThat; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; public class LoggerTest { @ClassRule @@ -30,7 +29,7 @@ public void customLogger() throws RocksDBException { final Logger logger = new Logger(options) { // Create new logger with max log level passed by options @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); @@ -59,7 +58,7 @@ public void warnLogger() throws RocksDBException { final Logger logger = new Logger(options) { // Create new logger with max log level passed by options @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); @@ -90,7 +89,7 @@ public void fatalLogger() throws RocksDBException { final Logger logger = new Logger(options) { // Create new logger with max log level passed by options @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { 
assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); @@ -119,7 +118,7 @@ public void dbOptionsLogger() throws RocksDBException { final Logger logger = new Logger(options) { // Create new logger with max log level passed by options @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); @@ -130,8 +129,7 @@ protected void log(InfoLogLevel infoLogLevel, String logMsg) { options.setLogger(logger); final List cfDescriptors = - Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); final List cfHandles = new ArrayList<>(); try (final RocksDB db = RocksDB.open(options, @@ -159,7 +157,7 @@ public void setWarnLogLevel() { final Logger logger = new Logger(options) { // Create new logger with max log level passed by options @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); @@ -183,7 +181,7 @@ public void setInfoLogLevel() { final Logger logger = new Logger(options) { // Create new logger with max log level passed by options @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); @@ -201,20 +199,18 @@ protected void log(InfoLogLevel infoLogLevel, String logMsg) { @Test public void changeLogLevelAtRuntime() throws RocksDBException { final AtomicInteger logMessageCounter = new AtomicInteger(); - try (final Options options = new Options(). - setInfoLogLevel(InfoLogLevel.FATAL_LEVEL). - setCreateIfMissing(true); + try (final Options options = + new Options().setInfoLogLevel(InfoLogLevel.FATAL_LEVEL).setCreateIfMissing(true); // Create new logger with max log level passed by options final Logger logger = new Logger(options) { @Override - protected void log(InfoLogLevel infoLogLevel, String logMsg) { + protected void log(final InfoLogLevel infoLogLevel, final String logMsg) { assertThat(logMsg).isNotNull(); assertThat(logMsg.length()).isGreaterThan(0); logMessageCounter.incrementAndGet(); } - } - ) { + }) { // Set custom logger to options options.setLogger(logger); diff --git a/java/src/test/java/org/rocksdb/MemTableTest.java b/java/src/test/java/org/rocksdb/MemTableTest.java index 73ac589a90ef..6ebf9ef51e5b 100644 --- a/java/src/test/java/org/rocksdb/MemTableTest.java +++ b/java/src/test/java/org/rocksdb/MemTableTest.java @@ -20,8 +20,7 @@ public class MemTableTest { public void hashSkipListMemTable() throws RocksDBException { try(final Options options = new Options()) { // Test HashSkipListMemTableConfig - HashSkipListMemTableConfig memTableConfig = - new HashSkipListMemTableConfig(); + final HashSkipListMemTableConfig memTableConfig = new HashSkipListMemTableConfig(); assertThat(memTableConfig.bucketCount()). 
isEqualTo(1000000); memTableConfig.setBucketCount(2000000); @@ -44,8 +43,7 @@ public void hashSkipListMemTable() throws RocksDBException { @Test public void skipListMemTable() throws RocksDBException { try(final Options options = new Options()) { - SkipListMemTableConfig skipMemTableConfig = - new SkipListMemTableConfig(); + final SkipListMemTableConfig skipMemTableConfig = new SkipListMemTableConfig(); assertThat(skipMemTableConfig.lookahead()). isEqualTo(0); skipMemTableConfig.setLookahead(20); @@ -58,7 +56,7 @@ public void skipListMemTable() throws RocksDBException { @Test public void hashLinkedListMemTable() throws RocksDBException { try(final Options options = new Options()) { - HashLinkedListMemTableConfig hashLinkedListMemTableConfig = + final HashLinkedListMemTableConfig hashLinkedListMemTableConfig = new HashLinkedListMemTableConfig(); assertThat(hashLinkedListMemTableConfig.bucketCount()). isEqualTo(50000); @@ -98,8 +96,7 @@ public void hashLinkedListMemTable() throws RocksDBException { @Test public void vectorMemTable() throws RocksDBException { try(final Options options = new Options()) { - VectorMemTableConfig vectorMemTableConfig = - new VectorMemTableConfig(); + final VectorMemTableConfig vectorMemTableConfig = new VectorMemTableConfig(); assertThat(vectorMemTableConfig.reservedSize()). isEqualTo(0); vectorMemTableConfig.setReservedSize(123); diff --git a/java/src/test/java/org/rocksdb/MemoryUtilTest.java b/java/src/test/java/org/rocksdb/MemoryUtilTest.java index 1bea023797b3..bfdcb9fe1320 100644 --- a/java/src/test/java/org/rocksdb/MemoryUtilTest.java +++ b/java/src/test/java/org/rocksdb/MemoryUtilTest.java @@ -45,10 +45,9 @@ public void getApproximateMemoryUsageByType() throws RocksDBException { new FlushOptions().setWaitForFlush(true); final RocksDB db = RocksDB.open(options, dbFolder1.getRoot().getAbsolutePath())) { - - List dbs = new ArrayList<>(1); + final List dbs = new ArrayList<>(1); dbs.add(db); - Set caches = new HashSet<>(1); + final Set caches = new HashSet<>(1); caches.add(cache); Map usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); @@ -85,7 +84,7 @@ public void getApproximateMemoryUsageByType() throws RocksDBException { */ @Test public void getApproximateMemoryUsageByTypeNulls() throws RocksDBException { - Map usage = MemoryUtil.getApproximateMemoryUsageByType(null, null); + final Map usage = MemoryUtil.getApproximateMemoryUsageByType(null, null); assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo(null); assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo(null); @@ -98,38 +97,32 @@ public void getApproximateMemoryUsageByTypeNulls() throws RocksDBException { */ @Test public void getApproximateMemoryUsageByTypeMultiple() throws RocksDBException { - try (final Cache cache1 = new LRUCache(1 * 1024 * 1024); - final Options options1 = - new Options() - .setCreateIfMissing(true) - .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache1)); - final RocksDB db1 = - RocksDB.open(options1, dbFolder1.getRoot().getAbsolutePath()); - final Cache cache2 = new LRUCache(1 * 1024 * 1024); - final Options options2 = - new Options() - .setCreateIfMissing(true) - .setTableFormatConfig(new BlockBasedTableConfig().setBlockCache(cache2)); - final RocksDB db2 = - RocksDB.open(options2, dbFolder2.getRoot().getAbsolutePath()); - final FlushOptions flushOptions = - new FlushOptions().setWaitForFlush(true); + try (final Cache cache1 = new LRUCache(1024 * 1024); + final Options options1 = new 
Options().setCreateIfMissing(true).setTableFormatConfig( + new BlockBasedTableConfig().setBlockCache(cache1)); + final RocksDB db1 = RocksDB.open(options1, dbFolder1.getRoot().getAbsolutePath()); + final Cache cache2 = new LRUCache(1024 * 1024); + final Options options2 = new Options().setCreateIfMissing(true).setTableFormatConfig( + new BlockBasedTableConfig().setBlockCache(cache2)); + final RocksDB db2 = RocksDB.open(options2, dbFolder2.getRoot().getAbsolutePath()); + final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true) ) { - List dbs = new ArrayList<>(1); + final List dbs = new ArrayList<>(1); dbs.add(db1); dbs.add(db2); - Set caches = new HashSet<>(1); + final Set caches = new HashSet<>(1); caches.add(cache1); caches.add(cache2); - for (RocksDB db: dbs) { + for (final RocksDB db : dbs) { db.put(key, value); db.flush(flushOptions); db.get(key); } - Map usage = MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); + final Map usage = + MemoryUtil.getApproximateMemoryUsageByType(dbs, caches); assertThat(usage.get(MemoryUsageType.kMemTableTotal)).isEqualTo( db1.getAggregatedLongProperty(MEMTABLE_SIZE) + db2.getAggregatedLongProperty(MEMTABLE_SIZE)); assertThat(usage.get(MemoryUsageType.kMemTableUnFlushed)).isEqualTo( @@ -137,7 +130,6 @@ public void getApproximateMemoryUsageByTypeMultiple() throws RocksDBException { assertThat(usage.get(MemoryUsageType.kTableReadersTotal)).isEqualTo( db1.getAggregatedLongProperty(TABLE_READERS) + db2.getAggregatedLongProperty(TABLE_READERS)); assertThat(usage.get(MemoryUsageType.kCacheTotal)).isGreaterThan(0); - } } diff --git a/java/src/test/java/org/rocksdb/MergeTest.java b/java/src/test/java/org/rocksdb/MergeTest.java index a840eb104693..f99ac49d3dd6 100644 --- a/java/src/test/java/org/rocksdb/MergeTest.java +++ b/java/src/test/java/org/rocksdb/MergeTest.java @@ -45,14 +45,16 @@ public void stringOption() } } - private byte[] longToByteArray(long l) { - ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); + private byte[] longToByteArray(final long l) { + final ByteBuffer buf = + ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.putLong(l); return buf.array(); } - private long longFromByteArray(byte[] a) { - ByteBuffer buf = ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); + private long longFromByteArray(final byte[] a) { + final ByteBuffer buf = + ByteBuffer.allocate(Long.SIZE / Byte.SIZE).order(ByteOrder.LITTLE_ENDIAN); buf.put(a); buf.flip(); return buf.getLong(); @@ -106,9 +108,8 @@ public void cFStringOption() db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), "bb".getBytes()); - byte[] value = db.get(columnFamilyHandleList.get(1), - "cfkey".getBytes()); - String strValue = new String(value); + final byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + final String strValue = new String(value); assertThat(strValue).isEqualTo("aa,bb"); } finally { for (final ColumnFamilyHandle handle : columnFamilyHandleList) { @@ -147,9 +148,8 @@ public void cFUInt64AddOption() // merge (long)157 under key db.merge(columnFamilyHandleList.get(1), "cfkey".getBytes(), longToByteArray(157)); - byte[] value = db.get(columnFamilyHandleList.get(1), - "cfkey".getBytes()); - long longValue = longFromByteArray(value); + final byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + final long longValue = longFromByteArray(value); assertThat(longValue).isEqualTo(257); } finally { for (final ColumnFamilyHandle 
handle : columnFamilyHandleList) { @@ -234,7 +234,7 @@ public void cFOperatorOption() "cfkey".getBytes(), "bb".getBytes()); byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); - String strValue = new String(value); + final String strValue = new String(value); // Test also with createColumnFamily try (final ColumnFamilyOptions cfHandleOpts = @@ -251,7 +251,7 @@ public void cFOperatorOption() db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), "yy".getBytes()); value = db.get(cfHandle, "cfkey2".getBytes()); - String strValueTmpCf = new String(value); + final String strValueTmpCf = new String(value); assertThat(strValue).isEqualTo("aa,bb"); assertThat(strValueTmpCf).isEqualTo("xx,yy"); @@ -296,7 +296,7 @@ public void cFUInt64AddOperatorOption() "cfkey".getBytes(), longToByteArray(1)); byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); - long longValue = longFromByteArray(value); + final long longValue = longFromByteArray(value); // Test also with createColumnFamily try (final ColumnFamilyOptions cfHandleOpts = @@ -313,7 +313,7 @@ public void cFUInt64AddOperatorOption() db.merge(cfHandle, new WriteOptions(), "cfkey2".getBytes(), longToByteArray(50)); value = db.get(cfHandle, "cfkey2".getBytes()); - long longValueTmpCf = longFromByteArray(value); + final long longValueTmpCf = longFromByteArray(value); assertThat(longValue).isEqualTo(101); assertThat(longValueTmpCf).isEqualTo(250); diff --git a/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java b/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java index cdfd9d3a9f18..6087b0260b37 100644 --- a/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java +++ b/java/src/test/java/org/rocksdb/MultiColumnRegressionTest.java @@ -51,15 +51,14 @@ public MultiColumnRegressionTest(final Params params) { public void transactionDB() throws RocksDBException { final List columnFamilyDescriptors = new ArrayList<>(); for (int i = 0; i < params.numColumns; i++) { - StringBuilder sb = new StringBuilder(); + final StringBuilder sb = new StringBuilder(); sb.append("cf" + i); for (int j = 0; j < params.keySize; j++) sb.append("_cf"); columnFamilyDescriptors.add(new ColumnFamilyDescriptor(sb.toString().getBytes())); } try (final Options opt = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { - final List columnFamilyHandles = - db.createColumnFamilies(columnFamilyDescriptors); + db.createColumnFamilies(columnFamilyDescriptors); } columnFamilyDescriptors.add(new ColumnFamilyDescriptor("default".getBytes())); @@ -68,7 +67,7 @@ public void transactionDB() throws RocksDBException { new TransactionDBOptions(), dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) { final WriteOptions writeOptions = new WriteOptions(); - try (Transaction transaction = tdb.beginTransaction(writeOptions)) { + try (final Transaction transaction = tdb.beginTransaction(writeOptions)) { for (int i = 0; i < params.numColumns; i++) { transaction.put( columnFamilyHandles.get(i), ("key" + i).getBytes(), ("value" + (i - 7)).getBytes()); @@ -76,7 +75,7 @@ public void transactionDB() throws RocksDBException { transaction.put("key".getBytes(), "value".getBytes()); transaction.commit(); } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { columnFamilyHandle.close(); } } @@ -85,7 +84,7 @@ public void transactionDB() throws RocksDBException { 
try (final TransactionDB tdb = TransactionDB.open(new DBOptions().setCreateIfMissing(true), new TransactionDBOptions(), dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles2)) { - try (Transaction transaction = tdb.beginTransaction(new WriteOptions())) { + try (final Transaction transaction = tdb.beginTransaction(new WriteOptions())) { final ReadOptions readOptions = new ReadOptions(); for (int i = 0; i < params.numColumns; i++) { final byte[] value = @@ -94,7 +93,7 @@ public void transactionDB() throws RocksDBException { } transaction.commit(); } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) { columnFamilyHandle.close(); } } @@ -112,7 +111,7 @@ public void optimisticDB() throws RocksDBException { try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( new DBOptions().setCreateIfMissing(true), dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles)) { - try (Transaction transaction = otdb.beginTransaction(new WriteOptions())) { + try (final Transaction transaction = otdb.beginTransaction(new WriteOptions())) { for (int i = 0; i < params.numColumns; i++) { transaction.put( columnFamilyHandles.get(i), ("key" + i).getBytes(), ("value" + (i - 7)).getBytes()); @@ -120,7 +119,7 @@ public void optimisticDB() throws RocksDBException { transaction.put("key".getBytes(), "value".getBytes()); transaction.commit(); } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { columnFamilyHandle.close(); } } @@ -129,7 +128,7 @@ public void optimisticDB() throws RocksDBException { try (final OptimisticTransactionDB otdb = OptimisticTransactionDB.open( new DBOptions().setCreateIfMissing(true), dbFolder.getRoot().getAbsolutePath(), columnFamilyDescriptors, columnFamilyHandles2)) { - try (Transaction transaction = otdb.beginTransaction(new WriteOptions())) { + try (final Transaction transaction = otdb.beginTransaction(new WriteOptions())) { final ReadOptions readOptions = new ReadOptions(); for (int i = 0; i < params.numColumns; i++) { final byte[] value = @@ -138,7 +137,7 @@ public void optimisticDB() throws RocksDBException { } transaction.commit(); } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles2) { columnFamilyHandle.close(); } } diff --git a/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java b/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java index 90a13e1da05c..e66eef6229a7 100644 --- a/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java +++ b/java/src/test/java/org/rocksdb/MultiGetManyKeysTest.java @@ -114,7 +114,7 @@ public void multiGetAsListLargeTransactionalCF() throws RocksDBException { transaction.multiGetAsList(new ReadOptions(), columnFamilyHandlesForMultiGet, keys); assertKeysAndValues(keys, keyValues, values); } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { columnFamilyHandle.close(); } } @@ -148,7 +148,7 @@ public void multiGetForUpdateAsListLargeTransactionalCF() throws RocksDBExceptio new ReadOptions(), columnFamilyHandlesForMultiGet, keys); assertKeysAndValues(keys, keyValues, values); } - for (ColumnFamilyHandle columnFamilyHandle : columnFamilyHandles) { + for (final ColumnFamilyHandle columnFamilyHandle : 
columnFamilyHandles) { columnFamilyHandle.close(); } } @@ -178,22 +178,22 @@ private Map generateRandomKeyValues(final List keys, final return keyValues; } - private void putKeysAndValues(Map keyValues) throws RocksDBException { + private void putKeysAndValues(final Map keyValues) throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { - for (Map.Entry keyValue : keyValues.entrySet()) { + for (final Map.Entry keyValue : keyValues.entrySet()) { db.put(keyValue.getKey().get(), keyValue.getValue()); } } } - private void putKeysAndValues(ColumnFamilyDescriptor columnFamilyDescriptor, - Map keyValues) throws RocksDBException { + private void putKeysAndValues(final ColumnFamilyDescriptor columnFamilyDescriptor, + final Map keyValues) throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(columnFamilyDescriptor)) { - for (Map.Entry keyValue : keyValues.entrySet()) { + for (final Map.Entry keyValue : keyValues.entrySet()) { db.put(columnFamilyHandle, keyValue.getKey().get(), keyValue.getValue()); } } @@ -213,9 +213,9 @@ private void assertKeysAndValues( } } - static private class Key { + private static class Key { private final byte[] bytes; - public Key(byte[] bytes) { + public Key(final byte[] bytes) { this.bytes = bytes; } @@ -224,12 +224,12 @@ public byte[] get() { } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - Key key = (Key) o; + final Key key = (Key) o; return Arrays.equals(bytes, key.bytes); } diff --git a/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java index b2b2599a7fcd..d858a150dfc9 100644 --- a/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java +++ b/java/src/test/java/org/rocksdb/MutableColumnFamilyOptionsTest.java @@ -122,7 +122,7 @@ public void mutableColumnFamilyOptions_parse_getOptions_output() { + "max_write_buffer_size_to_maintain=0; memtable_insert_with_hint_prefix_extractor=nullptr; level_compaction_dynamic_level_bytes=false; " + "inplace_update_support=false; experimental_mempurge_threshold=0.003"; - MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf = + final MutableColumnFamilyOptions.MutableColumnFamilyOptionsBuilder cf = MutableColumnFamilyOptions.parse(optionsString, true); // Check the values from the parsed string which are column family options diff --git a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java index 970e58c0c2e8..1e0ded816f31 100644 --- a/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java +++ b/java/src/test/java/org/rocksdb/NativeComparatorWrapperTest.java @@ -5,15 +5,15 @@ package org.rocksdb; +import static org.junit.Assert.assertEquals; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.Random; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.util.*; -import java.util.Comparator; - -import static org.junit.Assert.assertEquals; - public class NativeComparatorWrapperTest { static { RocksDB.loadLibrary(); @@ -39,7 +39,7 @@ public void 
rountrip() throws RocksDBException { try (final RocksDB db = RocksDB.open(opt, dbPath)) { for (int i = 0; i < ITERATIONS; i++) { final String strKey = randomString(); - final byte key[] = strKey.getBytes(); + final byte[] key = strKey.getBytes(); // does key already exist (avoid duplicates) if (i > 0 && db.get(key) != null) { i--; // generate a different key @@ -51,12 +51,7 @@ public void rountrip() throws RocksDBException { } // sort the stored keys into ascending alpha-numeric order - Arrays.sort(storedKeys, new Comparator() { - @Override - public int compare(final String o1, final String o2) { - return o1.compareTo(o2); - } - }); + Arrays.sort(storedKeys, Comparator.naturalOrder()); // re-open db and read from start to end // string keys should be in ascending diff --git a/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java b/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java index ab60081a0760..6b954f67e2b3 100644 --- a/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java +++ b/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java @@ -32,7 +32,7 @@ public void tempFolder() throws IOException { @Test public void overridesExistingLibrary() throws IOException { - File first = NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( + final File first = NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( temporaryFolder.getRoot().getAbsolutePath()); NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( temporaryFolder.getRoot().getAbsolutePath()); diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 129f1c39ae19..4b59464b1e30 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -8,8 +8,6 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.junit.Assert.*; -import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; @@ -28,11 +26,11 @@ public class OptionsTest { @Test public void copyConstructor() { - Options origOpts = new Options(); + final Options origOpts = new Options(); origOpts.setNumLevels(rand.nextInt(8)); origOpts.setTargetFileSizeMultiplier(rand.nextInt(100)); origOpts.setLevel0StopWritesTrigger(rand.nextInt(50)); - Options copyOpts = new Options(origOpts); + final Options copyOpts = new Options(origOpts); assertThat(origOpts.numLevels()).isEqualTo(copyOpts.numLevels()); assertThat(origOpts.targetFileSizeMultiplier()).isEqualTo(copyOpts.targetFileSizeMultiplier()); assertThat(origOpts.level0StopWritesTrigger()).isEqualTo(copyOpts.level0StopWritesTrigger()); @@ -675,9 +673,8 @@ public void dbWriteBufferSize() { @Test public void setWriteBufferManager() throws RocksDBException { - try (final Options opt = new Options(); - final Cache cache = new LRUCache(1 * 1024 * 1024); - final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache)) { + try (final Options opt = new Options(); final Cache cache = new LRUCache(1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000L, cache)) { opt.setWriteBufferManager(writeBufferManager); assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); } @@ -685,9 +682,8 @@ public void setWriteBufferManager() throws RocksDBException { @Test public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { - try (final Options opt = new Options(); - final Cache cache = new LRUCache(1 * 
1024 * 1024); - final WriteBufferManager writeBufferManager = new WriteBufferManager(0l, cache)) { + try (final Options opt = new Options(); final Cache cache = new LRUCache(1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(0L, cache)) { opt.setWriteBufferManager(writeBufferManager); assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); } @@ -695,14 +691,15 @@ public void setWriteBufferManagerWithZeroBufferSize() throws RocksDBException { @Test public void setWriteBufferManagerWithAllowStall() throws RocksDBException { - try (final Options opt = new Options(); final Cache cache = new LRUCache(1 * 1024 * 1024); - final WriteBufferManager writeBufferManager = new WriteBufferManager(2000l, cache, true)) { + try (final Options opt = new Options(); final Cache cache = new LRUCache(1024 * 1024); + final WriteBufferManager writeBufferManager = new WriteBufferManager(2000L, cache, true)) { opt.setWriteBufferManager(writeBufferManager); assertThat(opt.writeBufferManager()).isEqualTo(writeBufferManager); assertThat(opt.writeBufferManager().allowStall()).isEqualTo(true); } } + @SuppressWarnings("deprecated") @Test public void accessHintOnCompactionStart() { try (final Options opt = new Options()) { @@ -1456,6 +1453,16 @@ public void skipCheckingSstFileSizesOnDbOpen() { } } + @Test + public void memtableMaxRangeDeletions() { + try (final Options options = new Options()) { + assertThat(options.memtableMaxRangeDeletions()).isEqualTo(0); + final int val = 32; + assertThat(options.setMemtableMaxRangeDeletions(val)).isEqualTo(options); + assertThat(options.memtableMaxRangeDeletions()).isEqualTo(val); + } + } + @Test public void eventListeners() { final AtomicBoolean wasCalled1 = new AtomicBoolean(); @@ -1476,15 +1483,15 @@ public void onMemTableSealed(final MemTableInfo memTableInfo) { } }) { assertThat(options.setListeners(Arrays.asList(el1, el2))).isEqualTo(options); - List listeners = options.listeners(); + final List listeners = options.listeners(); assertEquals(el1, listeners.get(0)); assertEquals(el2, listeners.get(1)); - options.setListeners(Collections.emptyList()); + options.setListeners(Collections.emptyList()); listeners.get(0).onTableFileDeleted(null); assertTrue(wasCalled1.get()); listeners.get(1).onMemTableSealed(null); assertTrue(wasCalled2.get()); - List listeners2 = options.listeners(); + final List listeners2 = options.listeners(); assertNotNull(listeners2); assertEquals(0, listeners2.size()); } diff --git a/java/src/test/java/org/rocksdb/OptionsUtilTest.java b/java/src/test/java/org/rocksdb/OptionsUtilTest.java index b84314eecd27..23949ac06626 100644 --- a/java/src/test/java/org/rocksdb/OptionsUtilTest.java +++ b/java/src/test/java/org/rocksdb/OptionsUtilTest.java @@ -20,16 +20,146 @@ public class OptionsUtilTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); - enum TestAPI { LOAD_LATEST_OPTIONS, LOAD_OPTIONS_FROM_FILE } - @Test public void loadLatestOptions() throws RocksDBException { - verifyOptions(TestAPI.LOAD_LATEST_OPTIONS); + verifyOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + try (final ConfigOptions configOptions = new ConfigOptions() + .setIgnoreUnknownOptions(false) + .setInputStringsEscaped(true) + .setEnv(Env.getDefault())) { + final List cfDescs = new ArrayList<>(); + OptionsUtil.loadLatestOptions(configOptions, dbPath, dbOptions, cfDescs); + return cfDescs; + } + } + }); } @Test public void loadOptionsFromFile() 
throws RocksDBException { - verifyOptions(TestAPI.LOAD_OPTIONS_FROM_FILE); + verifyOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + try (final ConfigOptions configOptions = new ConfigOptions() + .setIgnoreUnknownOptions(false) + .setInputStringsEscaped(true) + .setEnv(Env.getDefault())) { + final List cfDescs = new ArrayList<>(); + final String path = + dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + OptionsUtil.loadOptionsFromFile(configOptions, path, dbOptions, cfDescs); + return cfDescs; + } + } + }); + } + + @Test + public void loadLatestTableFormatOptions() throws RocksDBException { + verifyTableFormatOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + try (final ConfigOptions configOptions = new ConfigOptions() + .setIgnoreUnknownOptions(false) + .setInputStringsEscaped(true) + .setEnv(Env.getDefault())) { + final List cfDescs = new ArrayList<>(); + OptionsUtil.loadLatestOptions(configOptions, dbPath, dbOptions, cfDescs); + return cfDescs; + } + } + }); + } + + @Test + public void loadLatestTableFormatOptions2() throws RocksDBException { + verifyTableFormatOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + try (final ConfigOptions configOptions = new ConfigOptions() + .setIgnoreUnknownOptions(false) + .setInputStringsEscaped(true) + .setEnv(Env.getDefault())) { + final List cfDescs = new ArrayList<>(); + OptionsUtil.loadLatestOptions(configOptions, dbPath, dbOptions, cfDescs); + return cfDescs; + } + } + }); + } + + @Test + public void loadLatestTableFormatOptions3() throws RocksDBException { + verifyTableFormatOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + final List cfDescs = new ArrayList<>(); + OptionsUtil.loadLatestOptions(new ConfigOptions(), dbPath, dbOptions, cfDescs); + return cfDescs; + } + }); + } + + @Test + public void loadTableFormatOptionsFromFile() throws RocksDBException { + verifyTableFormatOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + try (final ConfigOptions configOptions = new ConfigOptions() + .setIgnoreUnknownOptions(false) + .setInputStringsEscaped(true) + .setEnv(Env.getDefault())) { + final List cfDescs = new ArrayList<>(); + final String path = + dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + OptionsUtil.loadOptionsFromFile(configOptions, path, dbOptions, cfDescs); + return cfDescs; + } + } + }); + } + + @Test + public void loadTableFormatOptionsFromFile2() throws RocksDBException { + verifyTableFormatOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + try (final ConfigOptions configOptions = new ConfigOptions() + .setIgnoreUnknownOptions(false) + .setInputStringsEscaped(true) + .setEnv(Env.getDefault())) { + final List cfDescs = new ArrayList<>(); + final String path = + dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + OptionsUtil.loadOptionsFromFile(configOptions, path, dbOptions, cfDescs); + return cfDescs; + } + } + }); + } + + @Test + public void loadTableFormatOptionsFromFile3() throws RocksDBException { + 
verifyTableFormatOptions(new LoaderUnderTest() { + @Override + List loadOptions(final String dbPath, final DBOptions dbOptions) + throws RocksDBException { + final List cfDescs = new ArrayList<>(); + final String path = + dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + OptionsUtil.loadOptionsFromFile(new ConfigOptions(), path, dbOptions, cfDescs); + return cfDescs; + } + }); } @Test @@ -40,13 +170,18 @@ public void getLatestOptionsFileName() throws RocksDBException { assertThat(db).isNotNull(); } - String fName = OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); + final String fName = OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); assertThat(fName).isNotNull(); - assert(fName.startsWith("OPTIONS-") == true); + assert (fName.startsWith("OPTIONS-")); // System.out.println("latest options fileName: " + fName); } - private void verifyOptions(TestAPI apiType) throws RocksDBException { + static abstract class LoaderUnderTest { + abstract List loadOptions(final String path, final DBOptions dbOptions) + throws RocksDBException; + } + + private void verifyOptions(final LoaderUnderTest loaderUnderTest) throws RocksDBException { final String dbPath = dbFolder.getRoot().getAbsolutePath(); final Options options = new Options() .setCreateIfMissing(true) @@ -76,15 +211,113 @@ private void verifyOptions(TestAPI apiType) throws RocksDBException { } // Read the options back and verify - DBOptions dbOptions = new DBOptions(); - final List cfDescs = new ArrayList<>(); - String path = dbPath; - if (apiType == TestAPI.LOAD_LATEST_OPTIONS) { - OptionsUtil.loadLatestOptions(path, Env.getDefault(), dbOptions, cfDescs, false); - } else if (apiType == TestAPI.LOAD_OPTIONS_FROM_FILE) { - path = dbPath + "/" + OptionsUtil.getLatestOptionsFileName(dbPath, Env.getDefault()); - OptionsUtil.loadOptionsFromFile(path, Env.getDefault(), dbOptions, cfDescs, false); + try (DBOptions dbOptions = new DBOptions()) { + final List cfDescs = loaderUnderTest.loadOptions(dbPath, dbOptions); + + assertThat(dbOptions.createIfMissing()).isEqualTo(options.createIfMissing()); + assertThat(dbOptions.paranoidChecks()).isEqualTo(options.paranoidChecks()); + assertThat(dbOptions.maxOpenFiles()).isEqualTo(options.maxOpenFiles()); + assertThat(dbOptions.delayedWriteRate()).isEqualTo(options.delayedWriteRate()); + + assertThat(cfDescs.size()).isEqualTo(2); + assertThat(cfDescs.get(0)).isNotNull(); + assertThat(cfDescs.get(1)).isNotNull(); + assertThat(cfDescs.get(0).getName()).isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY); + assertThat(cfDescs.get(1).getName()).isEqualTo(secondCFName); + + final ColumnFamilyOptions defaultCFOpts = cfDescs.get(0).getOptions(); + assertThat(defaultCFOpts.writeBufferSize()).isEqualTo(baseDefaultCFOpts.writeBufferSize()); + assertThat(defaultCFOpts.maxWriteBufferNumber()) + .isEqualTo(baseDefaultCFOpts.maxWriteBufferNumber()); + assertThat(defaultCFOpts.maxBytesForLevelBase()) + .isEqualTo(baseDefaultCFOpts.maxBytesForLevelBase()); + assertThat(defaultCFOpts.level0FileNumCompactionTrigger()) + .isEqualTo(baseDefaultCFOpts.level0FileNumCompactionTrigger()); + assertThat(defaultCFOpts.level0SlowdownWritesTrigger()) + .isEqualTo(baseDefaultCFOpts.level0SlowdownWritesTrigger()); + assertThat(defaultCFOpts.bottommostCompressionType()) + .isEqualTo(baseDefaultCFOpts.bottommostCompressionType()); + + final ColumnFamilyOptions secondCFOpts = cfDescs.get(1).getOptions(); + assertThat(secondCFOpts.writeBufferSize()).isEqualTo(baseSecondCFOpts.writeBufferSize()); + 
assertThat(secondCFOpts.maxWriteBufferNumber()) + .isEqualTo(baseSecondCFOpts.maxWriteBufferNumber()); + assertThat(secondCFOpts.maxBytesForLevelBase()) + .isEqualTo(baseSecondCFOpts.maxBytesForLevelBase()); + assertThat(secondCFOpts.level0FileNumCompactionTrigger()) + .isEqualTo(baseSecondCFOpts.level0FileNumCompactionTrigger()); + assertThat(secondCFOpts.level0SlowdownWritesTrigger()) + .isEqualTo(baseSecondCFOpts.level0SlowdownWritesTrigger()); + assertThat(secondCFOpts.bottommostCompressionType()) + .isEqualTo(baseSecondCFOpts.bottommostCompressionType()); } + } + + private void verifyTableFormatOptions(final LoaderUnderTest loaderUnderTest) + throws RocksDBException { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + final Options options = new Options() + .setCreateIfMissing(true) + .setParanoidChecks(false) + .setMaxOpenFiles(478) + .setDelayedWriteRate(1234567L); + final ColumnFamilyOptions defaultCFOptions = new ColumnFamilyOptions(); + defaultCFOptions.setTableFormatConfig(new BlockBasedTableConfig()); + final byte[] altCFName = "alt_cf".getBytes(); + final ColumnFamilyOptions altCFOptions = + new ColumnFamilyOptions() + .setWriteBufferSize(70 * 1024) + .setMaxWriteBufferNumber(7) + .setMaxBytesForLevelBase(53 * 1024 * 1024) + .setLevel0FileNumCompactionTrigger(3) + .setLevel0SlowdownWritesTrigger(51) + .setBottommostCompressionType(CompressionType.ZSTD_COMPRESSION); + + final BlockBasedTableConfig altCFTableConfig = new BlockBasedTableConfig(); + altCFTableConfig.setCacheIndexAndFilterBlocks(true); + altCFTableConfig.setCacheIndexAndFilterBlocksWithHighPriority(false); + altCFTableConfig.setPinL0FilterAndIndexBlocksInCache(true); + altCFTableConfig.setPinTopLevelIndexAndFilter(false); + altCFTableConfig.setIndexType(IndexType.kTwoLevelIndexSearch); + altCFTableConfig.setDataBlockIndexType(DataBlockIndexType.kDataBlockBinaryAndHash); + altCFTableConfig.setDataBlockHashTableUtilRatio(0.65); + altCFTableConfig.setChecksumType(ChecksumType.kxxHash64); + altCFTableConfig.setNoBlockCache(true); + altCFTableConfig.setBlockSize(35 * 1024); + altCFTableConfig.setBlockSizeDeviation(20); + altCFTableConfig.setBlockRestartInterval(12); + altCFTableConfig.setIndexBlockRestartInterval(6); + altCFTableConfig.setMetadataBlockSize(12 * 1024); + altCFTableConfig.setPartitionFilters(true); + altCFTableConfig.setOptimizeFiltersForMemory(true); + altCFTableConfig.setUseDeltaEncoding(false); + altCFTableConfig.setFilterPolicy(new BloomFilter(7.5)); + altCFTableConfig.setWholeKeyFiltering(false); + altCFTableConfig.setVerifyCompression(true); + altCFTableConfig.setReadAmpBytesPerBit(2); + altCFTableConfig.setFormatVersion(8); + altCFTableConfig.setEnableIndexCompression(false); + altCFTableConfig.setBlockAlign(true); + altCFTableConfig.setIndexShortening(IndexShorteningMode.kShortenSeparatorsAndSuccessor); + altCFTableConfig.setBlockCacheSize(3 * 1024 * 1024); + // Note cache objects are not set here, as they are not read back when reading config. 
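+ // The table config above sets a broad range of BlockBasedTableConfig fields to non-default
+ // values; once the database below has been opened and closed, the loader under test re-reads
+ // the generated OPTIONS file and verifyBlockBasedTableConfig() checks each persisted field
+ // against this expected configuration.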
+ + altCFOptions.setTableFormatConfig(altCFTableConfig); + + // Create a database with a new column family + try (final RocksDB db = RocksDB.open(options, dbPath)) { + assertThat(db).isNotNull(); + + // create column family + try (final ColumnFamilyHandle columnFamilyHandle = + db.createColumnFamily(new ColumnFamilyDescriptor(altCFName, altCFOptions))) { + assert (columnFamilyHandle != null); + } + } + + // Read the options back and verify + final DBOptions dbOptions = new DBOptions(); + final List cfDescs = loaderUnderTest.loadOptions(dbPath, dbOptions); assertThat(dbOptions.createIfMissing()).isEqualTo(options.createIfMissing()); assertThat(dbOptions.paranoidChecks()).isEqualTo(options.paranoidChecks()); @@ -95,32 +328,51 @@ private void verifyOptions(TestAPI apiType) throws RocksDBException { assertThat(cfDescs.get(0)).isNotNull(); assertThat(cfDescs.get(1)).isNotNull(); assertThat(cfDescs.get(0).getName()).isEqualTo(RocksDB.DEFAULT_COLUMN_FAMILY); - assertThat(cfDescs.get(1).getName()).isEqualTo(secondCFName); - - ColumnFamilyOptions defaultCFOpts = cfDescs.get(0).getOptions(); - assertThat(defaultCFOpts.writeBufferSize()).isEqualTo(baseDefaultCFOpts.writeBufferSize()); - assertThat(defaultCFOpts.maxWriteBufferNumber()) - .isEqualTo(baseDefaultCFOpts.maxWriteBufferNumber()); - assertThat(defaultCFOpts.maxBytesForLevelBase()) - .isEqualTo(baseDefaultCFOpts.maxBytesForLevelBase()); - assertThat(defaultCFOpts.level0FileNumCompactionTrigger()) - .isEqualTo(baseDefaultCFOpts.level0FileNumCompactionTrigger()); - assertThat(defaultCFOpts.level0SlowdownWritesTrigger()) - .isEqualTo(baseDefaultCFOpts.level0SlowdownWritesTrigger()); - assertThat(defaultCFOpts.bottommostCompressionType()) - .isEqualTo(baseDefaultCFOpts.bottommostCompressionType()); - - ColumnFamilyOptions secondCFOpts = cfDescs.get(1).getOptions(); - assertThat(secondCFOpts.writeBufferSize()).isEqualTo(baseSecondCFOpts.writeBufferSize()); - assertThat(secondCFOpts.maxWriteBufferNumber()) - .isEqualTo(baseSecondCFOpts.maxWriteBufferNumber()); - assertThat(secondCFOpts.maxBytesForLevelBase()) - .isEqualTo(baseSecondCFOpts.maxBytesForLevelBase()); - assertThat(secondCFOpts.level0FileNumCompactionTrigger()) - .isEqualTo(baseSecondCFOpts.level0FileNumCompactionTrigger()); - assertThat(secondCFOpts.level0SlowdownWritesTrigger()) - .isEqualTo(baseSecondCFOpts.level0SlowdownWritesTrigger()); - assertThat(secondCFOpts.bottommostCompressionType()) - .isEqualTo(baseSecondCFOpts.bottommostCompressionType()); + assertThat(cfDescs.get(1).getName()).isEqualTo(altCFName); + + verifyBlockBasedTableConfig( + cfDescs.get(0).getOptions().tableFormatConfig(), new BlockBasedTableConfig()); + verifyBlockBasedTableConfig(cfDescs.get(1).getOptions().tableFormatConfig(), altCFTableConfig); + } + + private void verifyBlockBasedTableConfig( + final TableFormatConfig actualTableConfig, final BlockBasedTableConfig expected) { + assertThat(actualTableConfig).isNotNull(); + assertThat(actualTableConfig).isInstanceOf(BlockBasedTableConfig.class); + final BlockBasedTableConfig actual = (BlockBasedTableConfig) actualTableConfig; + assertThat(actual.cacheIndexAndFilterBlocks()).isEqualTo(expected.cacheIndexAndFilterBlocks()); + assertThat(actual.cacheIndexAndFilterBlocksWithHighPriority()) + .isEqualTo(expected.cacheIndexAndFilterBlocksWithHighPriority()); + assertThat(actual.pinL0FilterAndIndexBlocksInCache()) + .isEqualTo(expected.pinL0FilterAndIndexBlocksInCache()); + assertThat(actual.indexType()).isEqualTo(expected.indexType()); + 
assertThat(actual.dataBlockIndexType()).isEqualTo(expected.dataBlockIndexType()); + assertThat(actual.dataBlockHashTableUtilRatio()) + .isEqualTo(expected.dataBlockHashTableUtilRatio()); + assertThat(actual.checksumType()).isEqualTo(expected.checksumType()); + assertThat(actual.noBlockCache()).isEqualTo(expected.noBlockCache()); + assertThat(actual.blockSize()).isEqualTo(expected.blockSize()); + assertThat(actual.blockSizeDeviation()).isEqualTo(expected.blockSizeDeviation()); + assertThat(actual.blockRestartInterval()).isEqualTo(expected.blockRestartInterval()); + assertThat(actual.indexBlockRestartInterval()).isEqualTo(expected.indexBlockRestartInterval()); + assertThat(actual.metadataBlockSize()).isEqualTo(expected.metadataBlockSize()); + assertThat(actual.partitionFilters()).isEqualTo(expected.partitionFilters()); + assertThat(actual.optimizeFiltersForMemory()).isEqualTo(expected.optimizeFiltersForMemory()); + assertThat(actual.wholeKeyFiltering()).isEqualTo(expected.wholeKeyFiltering()); + assertThat(actual.verifyCompression()).isEqualTo(expected.verifyCompression()); + assertThat(actual.readAmpBytesPerBit()).isEqualTo(expected.readAmpBytesPerBit()); + assertThat(actual.formatVersion()).isEqualTo(expected.formatVersion()); + assertThat(actual.enableIndexCompression()).isEqualTo(expected.enableIndexCompression()); + assertThat(actual.blockAlign()).isEqualTo(expected.blockAlign()); + assertThat(actual.indexShortening()).isEqualTo(expected.indexShortening()); + if (expected.filterPolicy() == null) { + assertThat(actual.filterPolicy()).isNull(); + } else { + assertThat(expected.filterPolicy().equals(actual.filterPolicy())); + } + + // not currently persisted - always true when read from options + // this test will fail, and need repaired, if and when "useDeltaEncoding" is persisted. + assertThat(actual.useDeltaEncoding()).isEqualTo(true); } } diff --git a/java/src/test/java/org/rocksdb/PerfContextTest.java b/java/src/test/java/org/rocksdb/PerfContextTest.java new file mode 100644 index 000000000000..3145b59e4390 --- /dev/null +++ b/java/src/test/java/org/rocksdb/PerfContextTest.java @@ -0,0 +1,102 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
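+// PerfContextTest exercises the Java PerfContext API: each test raises the perf level to
+// ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX and inspects the PerfContext returned by
+// db.getPerfContext(). Covered are reset(), all bean-style getters invoked via Introspector
+// (guarding against UnsatisfiedLinkError in the native bindings), and, after a
+// put / compactRange / get cycle, timers such as getBlockReadCpuTime() and getPostProcessTime(),
+// which are expected to be greater than zero.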
+ +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.beans.BeanInfo; +import java.beans.IntrospectionException; +import java.beans.Introspector; +import java.beans.PropertyDescriptor; +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.*; +import org.junit.rules.TemporaryFolder; + +public class PerfContextTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + List cfDescriptors; + List columnFamilyHandleList = new ArrayList<>(); + RocksDB db; + + @Before + public void before() throws RocksDBException { + cfDescriptors = Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + + db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + } + + @After + public void after() { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + db.close(); + } + + @Test + public void testReset() { + db.setPerfLevel(PerfLevel.ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX); + PerfContext ctx = db.getPerfContext(); + assertThat(ctx).isNotNull(); + ctx.reset(); + } + + /** + * Call all properties to check that we don't have problem with UnsatisfiedLinkError. + */ + @Test + public void testAllGetters() throws RocksDBException, IntrospectionException, + InvocationTargetException, IllegalAccessException { + db.setPerfLevel(PerfLevel.ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX); + db.put("key".getBytes(), "value".getBytes()); + db.compactRange(); + db.get("key".getBytes()); + PerfContext ctx = db.getPerfContext(); + + BeanInfo info = Introspector.getBeanInfo(ctx.getClass(), RocksObject.class); + for (PropertyDescriptor property : info.getPropertyDescriptors()) { + if (property.getReadMethod() != null) { + Object result = property.getReadMethod().invoke(ctx); + assertThat(result).isNotNull(); + assertThat(result).isInstanceOf(Long.class); + } + } + } + + @Test + public void testGetBlockReadCpuTime() throws RocksDBException { + db.setPerfLevel(PerfLevel.ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX); + db.put("key".getBytes(), "value".getBytes()); + db.compactRange(); + db.get("key".getBytes()); + PerfContext ctx = db.getPerfContext(); + assertThat(ctx).isNotNull(); + assertThat(ctx.getBlockReadCpuTime()).isGreaterThan(0); + } + + @Test + public void testGetPostProcessTime() throws RocksDBException { + db.setPerfLevel(PerfLevel.ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX); + db.put("key".getBytes(), "value".getBytes()); + db.compactRange(); + db.get("key".getBytes()); + PerfContext ctx = db.getPerfContext(); + assertThat(ctx).isNotNull(); + assertThat(ctx.getPostProcessTime()).isGreaterThan(0); + } +} diff --git a/java/src/test/java/org/rocksdb/PerfLevelTest.java b/java/src/test/java/org/rocksdb/PerfLevelTest.java new file mode 100644 index 000000000000..bb766cbd4333 --- /dev/null +++ b/java/src/test/java/org/rocksdb/PerfLevelTest.java @@ -0,0 +1,65 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +package org.rocksdb; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.rocksdb.PerfLevel.*; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.*; +import org.junit.rules.ExpectedException; +import org.junit.rules.TemporaryFolder; + +public class PerfLevelTest { + @ClassRule + public static final RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = + new RocksNativeLibraryResource(); + + @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + + List cfDescriptors; + List columnFamilyHandleList = new ArrayList<>(); + RocksDB db; + + @Before + public void before() throws RocksDBException { + cfDescriptors = Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes())); + final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + + db = RocksDB.open( + options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + } + + @After + public void after() { + for (final ColumnFamilyHandle columnFamilyHandle : columnFamilyHandleList) { + columnFamilyHandle.close(); + } + db.close(); + } + @Test + public void testForInvalidValues() { + assertThatThrownBy(() -> db.setPerfLevel(UNINITIALIZED)) + .isInstanceOf(IllegalArgumentException.class); + assertThatThrownBy(() -> db.setPerfLevel(OUT_OF_BOUNDS)) + .isInstanceOf(IllegalArgumentException.class); + } + + @Test + public void testAllPerfLevels() { + for (PerfLevel level : new PerfLevel[] {DISABLE, ENABLE_COUNT, ENABLE_TIME_EXCEPT_FOR_MUTEX, + ENABLE_TIME_AND_CPU_TIME_EXCEPT_FOR_MUTEX, ENABLE_TIME}) { + db.setPerfLevel(level); + assertThat(db.getPerfLevel()).isEqualTo(level); + } + db.setPerfLevel(DISABLE); + } +} diff --git a/java/src/test/java/org/rocksdb/PlainTableConfigTest.java b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java index c813dbbb438e..827eb79f9e8d 100644 --- a/java/src/test/java/org/rocksdb/PlainTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java @@ -18,7 +18,7 @@ public class PlainTableConfigTest { @Test public void keySize() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setKeySize(5); assertThat(plainTableConfig.keySize()). isEqualTo(5); @@ -26,7 +26,7 @@ public void keySize() { @Test public void bloomBitsPerKey() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setBloomBitsPerKey(11); assertThat(plainTableConfig.bloomBitsPerKey()). isEqualTo(11); @@ -34,7 +34,7 @@ public void bloomBitsPerKey() { @Test public void hashTableRatio() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setHashTableRatio(0.95); assertThat(plainTableConfig.hashTableRatio()). 
isEqualTo(0.95); @@ -42,7 +42,7 @@ public void hashTableRatio() { @Test public void indexSparseness() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setIndexSparseness(18); assertThat(plainTableConfig.indexSparseness()). isEqualTo(18); @@ -50,7 +50,7 @@ public void indexSparseness() { @Test public void hugePageTlbSize() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setHugePageTlbSize(1); assertThat(plainTableConfig.hugePageTlbSize()). isEqualTo(1); @@ -58,7 +58,7 @@ public void hugePageTlbSize() { @Test public void encodingType() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setEncodingType(EncodingType.kPrefix); assertThat(plainTableConfig.encodingType()).isEqualTo( EncodingType.kPrefix); @@ -66,13 +66,13 @@ public void encodingType() { @Test public void fullScanMode() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setFullScanMode(true); assertThat(plainTableConfig.fullScanMode()).isTrue(); } @Test public void storeIndexInFile() { - PlainTableConfig plainTableConfig = new PlainTableConfig(); + final PlainTableConfig plainTableConfig = new PlainTableConfig(); plainTableConfig.setStoreIndexInFile(true); assertThat(plainTableConfig.storeIndexInFile()). isTrue(); diff --git a/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java b/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java index 471ef07287d8..7835737ae314 100644 --- a/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java +++ b/java/src/test/java/org/rocksdb/PutMultiplePartsTest.java @@ -113,12 +113,12 @@ private void validateResults() throws RocksDBException { final List keys = generateItemsAsList("key", ":", numParts); final byte[][] values = generateItems("value", "", numParts); - StringBuilder singleKey = new StringBuilder(); + final StringBuilder singleKey = new StringBuilder(); for (int i = 0; i < numParts; i++) { singleKey.append(new String(keys.get(i), StandardCharsets.UTF_8)); } final byte[] result = db.get(singleKey.toString().getBytes()); - StringBuilder singleValue = new StringBuilder(); + final StringBuilder singleValue = new StringBuilder(); for (int i = 0; i < numParts; i++) { singleValue.append(new String(values[i], StandardCharsets.UTF_8)); } @@ -136,12 +136,12 @@ private void validateResultsCF() throws RocksDBException { final List keys = generateItemsAsList("key", ":", numParts); final byte[][] values = generateItems("value", "", numParts); - StringBuilder singleKey = new StringBuilder(); + final StringBuilder singleKey = new StringBuilder(); for (int i = 0; i < numParts; i++) { singleKey.append(new String(keys.get(i), StandardCharsets.UTF_8)); } final byte[] result = db.get(columnFamilyHandles.get(0), singleKey.toString().getBytes()); - StringBuilder singleValue = new StringBuilder(); + final StringBuilder singleValue = new StringBuilder(); for (int i = 0; i < numParts; i++) { singleValue.append(new String(values[i], StandardCharsets.UTF_8)); } diff --git a/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/java/src/test/java/org/rocksdb/ReadOnlyTest.java index 5b40a5df1fad..99549b61ba25 100644 --- a/java/src/test/java/org/rocksdb/ReadOnlyTest.java +++ b/java/src/test/java/org/rocksdb/ReadOnlyTest.java 
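The PlainTableConfigTest hunks above only add final modifiers, but they enumerate the PlainTableConfig setters. A hedged sketch of applying such a config to an Options instance; the setTableFormatConfig setter is assumed here (it is not part of this patch), and plain-table SSTs additionally need a prefix extractor in practice:

    final PlainTableConfig plainTableConfig = new PlainTableConfig();
    plainTableConfig.setKeySize(5);
    plainTableConfig.setBloomBitsPerKey(11);
    plainTableConfig.setHashTableRatio(0.95);
    plainTableConfig.setStoreIndexInFile(true);

    // setTableFormatConfig is assumed; configure a prefix extractor before opening.
    try (final Options options = new Options()
             .setCreateIfMissing(true)
             .setTableFormatConfig(plainTableConfig)) {
      // open the database with these options as usual
    }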
@@ -4,17 +4,16 @@ // (found in the LICENSE.Apache file in the root directory). package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import static org.assertj.core.api.Assertions.assertThat; - public class ReadOnlyTest { @ClassRule @@ -71,14 +70,14 @@ public void readOnlyOpen() throws RocksDBException { @Test(expected = RocksDBException.class) public void failToWriteInReadOnly() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true)) { - try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + try (final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { // no-op } } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = - Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); try (final RocksDB rDb = RocksDB.openReadOnly(dbFolder.getRoot().getAbsolutePath(), @@ -92,15 +91,13 @@ public void failToWriteInReadOnly() throws RocksDBException { @Test(expected = RocksDBException.class) public void failToCFWriteInReadOnly() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { //no-op } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); try (final RocksDB rDb = RocksDB.openReadOnly( @@ -114,15 +111,13 @@ public void failToCFWriteInReadOnly() throws RocksDBException { @Test(expected = RocksDBException.class) public void failToRemoveInReadOnly() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { //no-op } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); @@ -138,23 +133,20 @@ public void failToRemoveInReadOnly() throws RocksDBException { @Test(expected = RocksDBException.class) public void failToCFRemoveInReadOnly() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { 
//no-op } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); try (final RocksDB rDb = RocksDB.openReadOnly( dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList)) { - rDb.delete(readOnlyColumnFamilyHandleList.get(0), - "key".getBytes()); + rDb.delete(readOnlyColumnFamilyHandleList.get(0), "key".getBytes()); } } } @@ -162,15 +154,13 @@ public void failToCFRemoveInReadOnly() throws RocksDBException { @Test(expected = RocksDBException.class) public void failToWriteBatchReadOnly() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { //no-op } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); @@ -179,8 +169,8 @@ public void failToWriteBatchReadOnly() throws RocksDBException { readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - wb.put("key".getBytes(), "value".getBytes()); - rDb.write(wOpts, wb); + wb.put("key".getBytes(), "value".getBytes()); + rDb.write(wOpts, wb); } } } @@ -188,15 +178,13 @@ public void failToWriteBatchReadOnly() throws RocksDBException { @Test(expected = RocksDBException.class) public void failToCFWriteBatchReadOnly() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, - dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { //no-op } try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts) - ); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); @@ -205,9 +193,8 @@ public void failToCFWriteBatchReadOnly() throws RocksDBException { readOnlyColumnFamilyHandleList); final WriteBatch wb = new WriteBatch(); final WriteOptions wOpts = new WriteOptions()) { - wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), - "value".getBytes()); - rDb.write(wOpts, wb); + wb.put(readOnlyColumnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + rDb.write(wOpts, wb); } } } @@ -215,18 +202,19 @@ public void failToCFWriteBatchReadOnly() throws RocksDBException { @Test(expected = RocksDBException.class) public void errorIfWalFileExists() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true); - final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { + final RocksDB ignored = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath())) { // no-op 
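The ReadOnlyTest hunks above capture the read-only contract: reads work against a database opened with openReadOnly, while any put, delete, or write-batch write fails with RocksDBException. A minimal sketch of that behaviour (the path and key are illustrative):

    final List<ColumnFamilyDescriptor> cfDescriptors = Collections.singletonList(
        new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY));
    final List<ColumnFamilyHandle> cfHandles = new ArrayList<>();

    try (final RocksDB roDb =
             RocksDB.openReadOnly("/path/to/existing/db", cfDescriptors, cfHandles)) {
      roDb.get("key".getBytes());                      // allowed
      roDb.put("key".getBytes(), "value".getBytes());  // throws RocksDBException
    } catch (final RocksDBException e) {
      // expected for any mutation through a read-only handle
    } finally {
      for (final ColumnFamilyHandle handle : cfHandles) {
        handle.close();
      }
    }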
} try (final ColumnFamilyOptions cfOpts = new ColumnFamilyOptions()) { - final List cfDescriptors = - Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); + final List cfDescriptors = Collections.singletonList( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, cfOpts)); final List readOnlyColumnFamilyHandleList = new ArrayList<>(); try (final DBOptions options = new DBOptions(); - final RocksDB rDb = RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(), - cfDescriptors, readOnlyColumnFamilyHandleList, true);) { + final RocksDB ignored = + RocksDB.openReadOnly(options, dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList, true)) { // no-op... should have raised an error as errorIfWalFileExists=true } } diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java index 156dd3730028..1bc24b98449c 100644 --- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -160,7 +160,7 @@ public void ignoreRangeDeletions() { @Test public void iterateUpperBound() { try (final ReadOptions opt = new ReadOptions()) { - Slice upperBound = buildRandomSlice(); + final Slice upperBound = buildRandomSlice(); opt.setIterateUpperBound(upperBound); assertThat(Arrays.equals(upperBound.data(), opt.iterateUpperBound().data())).isTrue(); opt.setIterateUpperBound(null); @@ -178,7 +178,7 @@ public void iterateUpperBoundNull() { @Test public void iterateLowerBound() { try (final ReadOptions opt = new ReadOptions()) { - Slice lowerBound = buildRandomSlice(); + final Slice lowerBound = buildRandomSlice(); opt.setIterateLowerBound(lowerBound); assertThat(Arrays.equals(lowerBound.data(), opt.iterateLowerBound().data())).isTrue(); opt.setIterateLowerBound(null); @@ -212,7 +212,7 @@ public void autoPrefixMode() { @Test public void timestamp() { try (final ReadOptions opt = new ReadOptions()) { - Slice timestamp = buildRandomSlice(); + final Slice timestamp = buildRandomSlice(); opt.setTimestamp(timestamp); assertThat(Arrays.equals(timestamp.data(), opt.timestamp().data())).isTrue(); opt.setTimestamp(null); @@ -223,7 +223,7 @@ public void timestamp() { @Test public void iterStartTs() { try (final ReadOptions opt = new ReadOptions()) { - Slice itertStartTsSlice = buildRandomSlice(); + final Slice itertStartTsSlice = buildRandomSlice(); opt.setIterStartTs(itertStartTsSlice); assertThat(Arrays.equals(itertStartTsSlice.data(), opt.iterStartTs().data())).isTrue(); opt.setIterStartTs(null); @@ -234,24 +234,24 @@ public void iterStartTs() { @Test public void deadline() { try (final ReadOptions opt = new ReadOptions()) { - opt.setDeadline(1999l); - assertThat(opt.deadline()).isEqualTo(1999l); + opt.setDeadline(1999L); + assertThat(opt.deadline()).isEqualTo(1999L); } } @Test public void ioTimeout() { try (final ReadOptions opt = new ReadOptions()) { - opt.setIoTimeout(34555l); - assertThat(opt.ioTimeout()).isEqualTo(34555l); + opt.setIoTimeout(34555L); + assertThat(opt.ioTimeout()).isEqualTo(34555L); } } @Test public void valueSizeSoftLimit() { try (final ReadOptions opt = new ReadOptions()) { - opt.setValueSizeSoftLimit(12134324l); - assertThat(opt.valueSizeSoftLimit()).isEqualTo(12134324l); + opt.setValueSizeSoftLimit(12134324L); + assertThat(opt.valueSizeSoftLimit()).isEqualTo(12134324L); } } @@ -351,8 +351,7 @@ public void failIterateLowerBoundUninitialized() { } } - private ReadOptions setupUninitializedReadOptions( - 
ExpectedException exception) { + private ReadOptions setupUninitializedReadOptions(final ExpectedException exception) { final ReadOptions readOptions = new ReadOptions(); readOptions.close(); exception.expect(AssertionError.class); @@ -361,7 +360,7 @@ private ReadOptions setupUninitializedReadOptions( private Slice buildRandomSlice() { final Random rand = new Random(); - byte[] sliceBytes = new byte[rand.nextInt(100) + 1]; + final byte[] sliceBytes = new byte[rand.nextInt(100) + 1]; rand.nextBytes(sliceBytes); return new Slice(sliceBytes); } diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index 488dbafe802c..ed6e989a8d44 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -48,8 +48,8 @@ public void open_opt() throws RocksDBException { public void openWhenOpen() throws RocksDBException { final String dbPath = dbFolder.getRoot().getAbsolutePath(); - try (final RocksDB db1 = RocksDB.open(dbPath)) { - try (final RocksDB db2 = RocksDB.open(dbPath)) { + try (final RocksDB ignored = RocksDB.open(dbPath)) { + try (final RocksDB ignored1 = RocksDB.open(dbPath)) { fail("Should have thrown an exception when opening the same db twice"); } catch (final RocksDBException e) { assertThat(e.getStatus().getCode()).isEqualTo(Status.Code.IOError); @@ -74,11 +74,10 @@ public void createColumnFamily() throws RocksDBException { } final List cfHandles = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor(col1Name)), - cfHandles)) { + try (final RocksDB ignored = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(col1Name)), + cfHandles)) { try { assertThat(cfHandles.size()).isEqualTo(2); assertThat(cfHandles.get(1)).isNotNull(); @@ -117,12 +116,10 @@ public void createColumnFamilies() throws RocksDBException { } cfHandles = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor(col1Name), - new ColumnFamilyDescriptor(col2Name)), - cfHandles)) { + try (final RocksDB ignored = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(col1Name), new ColumnFamilyDescriptor(col2Name)), + cfHandles)) { try { assertThat(cfHandles.size()).isEqualTo(3); assertThat(cfHandles.get(1)).isNotNull(); @@ -163,12 +160,10 @@ public void createColumnFamiliesfromDescriptors() throws RocksDBException { } cfHandles = new ArrayList<>(); - try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), - Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), - new ColumnFamilyDescriptor(col1Name), - new ColumnFamilyDescriptor(col2Name)), - cfHandles)) { + try (final RocksDB ignored = RocksDB.open(dbFolder.getRoot().getAbsolutePath(), + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor(col1Name), new ColumnFamilyDescriptor(col2Name)), + cfHandles)) { try { assertThat(cfHandles.size()).isEqualTo(3); assertThat(cfHandles.get(1)).isNotNull(); @@ -194,8 +189,8 @@ public void put() throws RocksDBException { 
assertThat(db.get("key2".getBytes())).isEqualTo( "12345678".getBytes()); - ByteBuffer key = ByteBuffer.allocateDirect(12); - ByteBuffer value = ByteBuffer.allocateDirect(12); + final ByteBuffer key = ByteBuffer.allocateDirect(12); + final ByteBuffer value = ByteBuffer.allocateDirect(12); key.position(4); key.put("key3".getBytes()); key.position(4).limit(8); @@ -213,14 +208,14 @@ public void put() throws RocksDBException { key.position(4); - ByteBuffer result = ByteBuffer.allocateDirect(12); + final ByteBuffer result = ByteBuffer.allocateDirect(12); assertThat(db.get(optr, key, result)).isEqualTo(4); assertThat(result.position()).isEqualTo(0); assertThat(result.limit()).isEqualTo(4); assertThat(key.position()).isEqualTo(8); assertThat(key.limit()).isEqualTo(8); - byte[] tmp = new byte[4]; + final byte[] tmp = new byte[4]; result.get(tmp); assertThat(tmp).isEqualTo("val3".getBytes()); @@ -232,15 +227,15 @@ public void put() throws RocksDBException { assertThat(result.limit()).isEqualTo(12); assertThat(key.position()).isEqualTo(8); assertThat(key.limit()).isEqualTo(8); - byte[] tmp2 = new byte[3]; + final byte[] tmp2 = new byte[3]; result.get(tmp2); assertThat(tmp2).isEqualTo("val".getBytes()); // put - Segment key3 = sliceSegment("key3"); - Segment key4 = sliceSegment("key4"); - Segment value0 = sliceSegment("value 0"); - Segment value1 = sliceSegment("value 1"); + final Segment key3 = sliceSegment("key3"); + final Segment key4 = sliceSegment("key4"); + final Segment value0 = sliceSegment("value 0"); + final Segment value1 = sliceSegment("value 1"); db.put(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); db.put(opt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); @@ -250,8 +245,8 @@ public void put() throws RocksDBException { } } - private static Segment sliceSegment(String key) { - ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4); + private static Segment sliceSegment(final String key) { + final ByteBuffer rawKey = ByteBuffer.allocate(key.length() + 4); rawKey.put((byte)0); rawKey.put((byte)0); rawKey.put(key.getBytes()); @@ -264,7 +259,7 @@ private static class Segment { final int offset; final int len; - public boolean isSamePayload(byte[] value) { + public boolean isSamePayload(final byte[] value) { if (value == null) { return false; } @@ -281,7 +276,7 @@ public boolean isSamePayload(byte[] value) { return true; } - public Segment(byte[] value, int offset, int len) { + public Segment(final byte[] value, final int offset, final int len) { this.data = value; this.offset = offset; this.len = len; @@ -323,7 +318,7 @@ public void getWithOutValue() throws RocksDBException { RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); - byte[] outValue = new byte[5]; + final byte[] outValue = new byte[5]; // not found value int getResult = db.get("keyNotFound".getBytes(), outValue); assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); @@ -344,7 +339,7 @@ public void getWithOutValueReadOptions() throws RocksDBException { final ReadOptions rOpt = new ReadOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); - byte[] outValue = new byte[5]; + final byte[] outValue = new byte[5]; // not found value int getResult = db.get(rOpt, "keyNotFound".getBytes(), outValue); @@ -368,9 +363,9 @@ public void getOutOfArrayMaxSizeValue() throws RocksDBException { final int numberOfValueSplits = 10; final int 
splitSize = Integer.MAX_VALUE / numberOfValueSplits; - Runtime runtime = Runtime.getRuntime(); - long neededMemory = ((long)(splitSize)) * (((long)numberOfValueSplits) + 3); - boolean isEnoughMemory = runtime.maxMemory() - runtime.totalMemory() > neededMemory; + final Runtime runtime = Runtime.getRuntime(); + final long neededMemory = ((long) (splitSize)) * (((long) numberOfValueSplits) + 3); + final boolean isEnoughMemory = runtime.maxMemory() - runtime.totalMemory() > neededMemory; Assume.assumeTrue(isEnoughMemory); final byte[] valueSplit = new byte[splitSize]; @@ -399,7 +394,7 @@ public void multiGetAsList() throws RocksDBException { final ReadOptions rOpt = new ReadOptions()) { db.put("key1".getBytes(), "value".getBytes()); db.put("key2".getBytes(), "12345678".getBytes()); - List lookupKeys = new ArrayList<>(); + final List lookupKeys = new ArrayList<>(); lookupKeys.add("key1".getBytes()); lookupKeys.add("key2".getBytes()); List results = db.multiGetAsList(lookupKeys); @@ -454,10 +449,10 @@ public void merge() throws RocksDBException { assertThat(db.get("key2".getBytes())).isEqualTo( "xxxx".getBytes()); - Segment key3 = sliceSegment("key3"); - Segment key4 = sliceSegment("key4"); - Segment value0 = sliceSegment("value 0"); - Segment value1 = sliceSegment("value 1"); + final Segment key3 = sliceSegment("key3"); + final Segment key4 = sliceSegment("key4"); + final Segment value0 = sliceSegment("value 0"); + final Segment value1 = sliceSegment("value 1"); db.merge(key3.data, key3.offset, key3.len, value0.data, value0.offset, value0.len); db.merge(wOpt, key4.data, key4.offset, key4.len, value1.data, value1.offset, value1.len); @@ -482,7 +477,7 @@ public void delete() throws RocksDBException { assertThat(db.get("key3".getBytes())).isEqualTo("33".getBytes()); db.delete("key1".getBytes()); db.delete(wOpt, "key2".getBytes()); - ByteBuffer key = ByteBuffer.allocateDirect(16); + final ByteBuffer key = ByteBuffer.allocateDirect(16); key.put("key3".getBytes()).flip(); db.delete(wOpt, key); assertThat(key.position()).isEqualTo(4); @@ -491,8 +486,8 @@ public void delete() throws RocksDBException { assertThat(db.get("key1".getBytes())).isNull(); assertThat(db.get("key2".getBytes())).isNull(); - Segment key3 = sliceSegment("key3"); - Segment key4 = sliceSegment("key4"); + final Segment key3 = sliceSegment("key3"); + final Segment key4 = sliceSegment("key4"); db.put("key3".getBytes(), "key3 value".getBytes()); db.put("key4".getBytes(), "key4 value".getBytes()); @@ -551,6 +546,28 @@ public void deleteRange() throws RocksDBException { } } + @Test + public void clipColumnFamily() throws RocksDBException { + try (final RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath())) { + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + db.put("key3".getBytes(), "abcdefg".getBytes()); + db.put("key4".getBytes(), "xyz".getBytes()); + db.put("key5".getBytes(), "qwer".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo("value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes()); + assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes()); + assertThat(db.get("key4".getBytes())).isEqualTo("xyz".getBytes()); + assertThat(db.get("key5".getBytes())).isEqualTo("qwer".getBytes()); + db.clipColumnFamily(db.getDefaultColumnFamily(), "key2".getBytes(), "key4".getBytes()); + assertThat(db.get("key1".getBytes())).isNull(); + assertThat(db.get("key2".getBytes())).isEqualTo("12345678".getBytes()); + 
assertThat(db.get("key3".getBytes())).isEqualTo("abcdefg".getBytes()); + assertThat(db.get("key4".getBytes())).isNull(); + assertThat(db.get("key5".getBytes())).isNull(); + } + } + @Test public void getIntProperty() throws RocksDBException { try ( @@ -590,7 +607,7 @@ public void fullCompactRange() throws RocksDBException { final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs - byte[] b = new byte[10000]; + final byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { rand.nextBytes(b); db.put((String.valueOf(i)).getBytes(), b); @@ -631,7 +648,7 @@ public void fullCompactRangeColumnFamily() columnFamilyHandles)) { try { // fill database with key/value pairs - byte[] b = new byte[10000]; + final byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { rand.nextBytes(b); db.put(columnFamilyHandles.get(1), @@ -665,7 +682,7 @@ public void compactRangeWithKeys() final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs - byte[] b = new byte[10000]; + final byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { rand.nextBytes(b); db.put((String.valueOf(i)).getBytes(), b); @@ -693,12 +710,14 @@ public void compactRangeWithKeysReduce() final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs - byte[] b = new byte[10000]; + final byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { rand.nextBytes(b); db.put((String.valueOf(i)).getBytes(), b); } - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } try (final CompactRangeOptions compactRangeOpts = new CompactRangeOptions() .setChangeLevel(true) .setTargetLevel(-1) @@ -742,7 +761,7 @@ public void compactRangeWithKeysColumnFamily() columnFamilyHandles)) { try { // fill database with key/value pairs - byte[] b = new byte[10000]; + final byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { rand.nextBytes(b); db.put(columnFamilyHandles.get(1), @@ -794,7 +813,7 @@ public void compactRangeWithKeysReduceColumnFamily() .setTargetLevel(-1) .setTargetPathId(0)) { // fill database with key/value pairs - byte[] b = new byte[10000]; + final byte[] b = new byte[10000]; for (int i = 0; i < 200; i++) { rand.nextBytes(b); db.put(columnFamilyHandles.get(1), @@ -812,8 +831,7 @@ public void compactRangeWithKeysReduceColumnFamily() } @Test - public void compactRangeToLevel() - throws RocksDBException, InterruptedException { + public void compactRangeToLevel() throws RocksDBException { final int NUM_KEYS_PER_L0_FILE = 100; final int KEY_SIZE = 20; final int VALUE_SIZE = 300; @@ -822,30 +840,32 @@ public void compactRangeToLevel() final int NUM_L0_FILES = 10; final int TEST_SCALE = 5; final int KEY_INTERVAL = 100; - try (final Options opt = new Options(). - setCreateIfMissing(true). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(5). - // a slightly bigger write buffer than L0 file - // so that we can ensure manual flush always - // go before background flush happens. - setWriteBufferSize(L0_FILE_SIZE * 2). - // Disable auto L0 -> L1 compaction - setLevelZeroFileNumCompactionTrigger(20). - setTargetFileSizeBase(L0_FILE_SIZE * 100). - setTargetFileSizeMultiplier(1). - // To disable auto compaction - setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100). - setMaxBytesForLevelMultiplier(2). 
- setDisableAutoCompactions(true); - final RocksDB db = RocksDB.open(opt, - dbFolder.getRoot().getAbsolutePath()) - ) { + try (final Options opt = new Options() + .setCreateIfMissing(true) + .setCompactionStyle(CompactionStyle.LEVEL) + .setLevelCompactionDynamicLevelBytes(false) + .setNumLevels(5) + . + // a slightly bigger write buffer than L0 file + // so that we can ensure manual flush always + // go before background flush happens. + setWriteBufferSize(L0_FILE_SIZE * 2) + . + // Disable auto L0 -> L1 compaction + setLevelZeroFileNumCompactionTrigger(20) + .setTargetFileSizeBase(L0_FILE_SIZE * 100) + .setTargetFileSizeMultiplier(1) + . + // To disable auto compaction + setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100) + .setMaxBytesForLevelMultiplier(2) + .setDisableAutoCompactions(true); + final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { // fill database with key/value pairs - byte[] value = new byte[VALUE_SIZE]; + final byte[] value = new byte[VALUE_SIZE]; int int_key = 0; for (int round = 0; round < 5; ++round) { - int initial_key = int_key; + final int initial_key = int_key; for (int f = 1; f <= NUM_L0_FILES; ++f) { for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) { int_key += KEY_INTERVAL; @@ -854,7 +874,9 @@ public void compactRangeToLevel() db.put(String.format("%020d", int_key).getBytes(), value); } - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } // Make sure we do create one more L0 files. assertThat( db.getProperty("rocksdb.num-files-at-level0")). @@ -887,7 +909,7 @@ public void compactRangeToLevel() } @Test - public void deleteFilesInRange() throws RocksDBException, InterruptedException { + public void deleteFilesInRange() throws RocksDBException { final int KEY_SIZE = 20; final int VALUE_SIZE = 1000; final int FILE_SIZE = 64000; @@ -899,19 +921,20 @@ public void deleteFilesInRange() throws RocksDBException, InterruptedException { * we will be deleting using deleteFilesInRange. * It is writing roughly number of keys that will fit in 10 files (target size) * It is writing interleaved so that files from memory on L0 will overlap - * Then compaction cleans everything and we should end up with 10 files + * Then compaction cleans everything, and we should end up with 10 files */ try (final Options opt = new Options() .setCreateIfMissing(true) .setCompressionType(CompressionType.NO_COMPRESSION) .setTargetFileSizeBase(FILE_SIZE) .setWriteBufferSize(FILE_SIZE / 2) - .setDisableAutoCompactions(true); + .setDisableAutoCompactions(true) + .setLevelCompactionDynamicLevelBytes(false); final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { - int records = FILE_SIZE / (KEY_SIZE + VALUE_SIZE); + final int records = FILE_SIZE / (KEY_SIZE + VALUE_SIZE); // fill database with key/value pairs - byte[] value = new byte[VALUE_SIZE]; + final byte[] value = new byte[VALUE_SIZE]; int key_init = 0; for (int o = 0; o < NUM_FILES; ++o) { int int_key = key_init++; @@ -922,7 +945,9 @@ public void deleteFilesInRange() throws RocksDBException, InterruptedException { db.put(String.format("%020d", int_key).getBytes(), value); } } - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } db.compactRange(); // Make sure we do create one more L0 files. 
assertThat(db.getProperty("rocksdb.num-files-at-level0")).isEqualTo("0"); @@ -954,25 +979,28 @@ public void compactRangeToLevelColumnFamily() final int TEST_SCALE = 5; final int KEY_INTERVAL = 100; - try (final DBOptions opt = new DBOptions(). - setCreateIfMissing(true). - setCreateMissingColumnFamilies(true); - final ColumnFamilyOptions new_cf_opts = new ColumnFamilyOptions(). - setCompactionStyle(CompactionStyle.LEVEL). - setNumLevels(5). + try (final DBOptions opt = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true); + final ColumnFamilyOptions new_cf_opts = + new ColumnFamilyOptions() + .setCompactionStyle(CompactionStyle.LEVEL) + .setLevelCompactionDynamicLevelBytes(false) + .setNumLevels(5) + . // a slightly bigger write buffer than L0 file // so that we can ensure manual flush always // go before background flush happens. - setWriteBufferSize(L0_FILE_SIZE * 2). + setWriteBufferSize(L0_FILE_SIZE * 2) + . // Disable auto L0 -> L1 compaction - setLevelZeroFileNumCompactionTrigger(20). - setTargetFileSizeBase(L0_FILE_SIZE * 100). - setTargetFileSizeMultiplier(1). + setLevelZeroFileNumCompactionTrigger(20) + .setTargetFileSizeBase(L0_FILE_SIZE * 100) + .setTargetFileSizeMultiplier(1) + . // To disable auto compaction - setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100). - setMaxBytesForLevelMultiplier(2). - setDisableAutoCompactions(true) - ) { + setMaxBytesForLevelBase(NUM_L0_FILES * L0_FILE_SIZE * 100) + .setMaxBytesForLevelMultiplier(2) + .setDisableAutoCompactions(true)) { final List columnFamilyDescriptors = Arrays.asList( new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), @@ -987,10 +1015,10 @@ public void compactRangeToLevelColumnFamily() columnFamilyHandles)) { try { // fill database with key/value pairs - byte[] value = new byte[VALUE_SIZE]; + final byte[] value = new byte[VALUE_SIZE]; int int_key = 0; for (int round = 0; round < 5; ++round) { - int initial_key = int_key; + final int initial_key = int_key; for (int f = 1; f <= NUM_L0_FILES; ++f) { for (int i = 0; i < NUM_KEYS_PER_L0_FILE; ++i) { int_key += KEY_INTERVAL; @@ -1000,8 +1028,9 @@ public void compactRangeToLevelColumnFamily() String.format("%020d", int_key).getBytes(), value); } - db.flush(new FlushOptions().setWaitForFlush(true), - columnFamilyHandles.get(1)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions, columnFamilyHandles.get(1)); + } // Make sure we do create one more L0 files. 
assertThat( db.getProperty(columnFamilyHandles.get(1), @@ -1069,10 +1098,13 @@ public void continueBackgroundWorkAfterCancelAllBackgroundWork() throws RocksDBE db.cancelAllBackgroundWork(true); try { db.put(new byte[KEY_SIZE], new byte[VALUE_SIZE]); - db.flush(new FlushOptions().setWaitForFlush(true)); + try (final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { + db.flush(flushOptions); + } fail("Expected RocksDBException to be thrown if we attempt to trigger a flush after" + " all background work is cancelled."); - } catch (RocksDBException ignored) { } + } catch (final RocksDBException ignored) { + } } finally { for (final ColumnFamilyHandle handle : columnFamilyHandles) { handle.close(); @@ -1158,14 +1190,16 @@ public void setOptions() throws RocksDBException { @Test public void destroyDB() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true)) { - String dbPath = dbFolder.getRoot().getAbsolutePath(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath)) { db.put("key1".getBytes(), "value".getBytes()); } - assertThat(dbFolder.getRoot().exists() && dbFolder.getRoot().listFiles().length != 0) + assertThat(dbFolder.getRoot().exists() + && Objects.requireNonNull(dbFolder.getRoot().listFiles()).length != 0) .isTrue(); RocksDB.destroyDB(dbPath, options); - assertThat(dbFolder.getRoot().exists() && dbFolder.getRoot().listFiles().length != 0) + assertThat(dbFolder.getRoot().exists() + && Objects.requireNonNull(dbFolder.getRoot().listFiles()).length != 0) .isFalse(); } } @@ -1173,8 +1207,8 @@ public void destroyDB() throws RocksDBException { @Test(expected = RocksDBException.class) public void destroyDBFailIfOpen() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true)) { - String dbPath = dbFolder.getRoot().getAbsolutePath(); - try (final RocksDB db = RocksDB.open(options, dbPath)) { + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB ignored = RocksDB.open(options, dbPath)) { // Fails as the db is open and locked. 
RocksDB.destroyDB(dbPath, options); } @@ -1183,9 +1217,9 @@ public void destroyDBFailIfOpen() throws RocksDBException { @Test public void getApproximateSizes() throws RocksDBException { - final byte key1[] = "key1".getBytes(UTF_8); - final byte key2[] = "key2".getBytes(UTF_8); - final byte key3[] = "key3".getBytes(UTF_8); + final byte[] key1 = "key1".getBytes(UTF_8); + final byte[] key2 = "key2".getBytes(UTF_8); + final byte[] key3 = "key3".getBytes(UTF_8); try (final Options options = new Options().setCreateIfMissing(true)) { final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath)) { @@ -1210,9 +1244,9 @@ public void getApproximateSizes() throws RocksDBException { @Test public void getApproximateMemTableStats() throws RocksDBException { - final byte key1[] = "key1".getBytes(UTF_8); - final byte key2[] = "key2".getBytes(UTF_8); - final byte key3[] = "key3".getBytes(UTF_8); + final byte[] key1 = "key1".getBytes(UTF_8); + final byte[] key2 = "key2".getBytes(UTF_8); + final byte[] key3 = "key3".getBytes(UTF_8); try (final Options options = new Options().setCreateIfMissing(true)) { final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath)) { @@ -1233,9 +1267,8 @@ public void getApproximateMemTableStats() throws RocksDBException { @Test public void getApproximateMemTableStatsSingleKey() throws RocksDBException { - final byte key1[] = "key1".getBytes(UTF_8); - final byte key2[] = "key2".getBytes(UTF_8); - final byte key3[] = "key3".getBytes(UTF_8); + final byte[] key1 = "key1".getBytes(UTF_8); + final byte[] key3 = "key3".getBytes(UTF_8); try (final Options options = new Options().setCreateIfMissing(true)) { final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath)) { @@ -1262,15 +1295,16 @@ public void compactFiles() throws RocksDBException { final byte[] cfName = "pikachu".getBytes(UTF_8); try (final Options options = new Options() - .setCreateIfMissing(true) - .setWriteBufferSize(writeBufferSize) - .setCompactionStyle(CompactionStyle.LEVEL) - .setTargetFileSizeBase(writeBufferSize) - .setMaxBytesForLevelBase(writeBufferSize * 2) - .setLevel0StopWritesTrigger(2) - .setMaxBytesForLevelMultiplier(2) - .setCompressionType(CompressionType.NO_COMPRESSION) - .setMaxSubcompactions(4)) { + .setCreateIfMissing(true) + .setWriteBufferSize(writeBufferSize) + .setCompactionStyle(CompactionStyle.LEVEL) + .setLevelCompactionDynamicLevelBytes(false) + .setTargetFileSizeBase(writeBufferSize) + .setMaxBytesForLevelBase(writeBufferSize * 2) + .setLevel0StopWritesTrigger(2) + .setMaxBytesForLevelMultiplier(2) + .setCompressionType(CompressionType.NO_COMPRESSION) + .setMaxSubcompactions(4)) { final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath); final ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(options)) { @@ -1285,9 +1319,7 @@ public void compactFiles() throws RocksDBException { ); final List cfHandles = new ArrayList<>(); try (final DBOptions dbOptions = new DBOptions(options); - final RocksDB db = RocksDB.open(dbOptions, dbPath, cfDescriptors, - cfHandles); - ) { + final RocksDB db = RocksDB.open(dbOptions, dbPath, cfDescriptors, cfHandles)) { try (final FlushOptions flushOptions = new FlushOptions() .setWaitForFlush(true) .setAllowWriteStall(true); @@ -1320,9 +1352,8 @@ public void compactFiles() throws RocksDBException { public void enableAutoCompaction() throws 
RocksDBException { try (final DBOptions options = new DBOptions() .setCreateIfMissing(true)) { - final List cfDescs = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) - ); + final List cfDescs = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); final List cfHandles = new ArrayList<>(); final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { @@ -1337,6 +1368,25 @@ public void enableAutoCompaction() throws RocksDBException { } } + @Test + public void enableAutoCompactionNull() throws RocksDBException { + try (final DBOptions options = new DBOptions().setCreateIfMissing(true)) { + final List cfDescs = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + try { + db.enableAutoCompaction(null); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + @Test public void numberLevels() throws RocksDBException { try (final Options options = new Options().setCreateIfMissing(true)) { @@ -1425,7 +1475,7 @@ public void getLiveFiles() throws RocksDBException { try (final RocksDB db = RocksDB.open(options, dbPath)) { final RocksDB.LiveFiles livefiles = db.getLiveFiles(true); assertThat(livefiles).isNotNull(); - assertThat(livefiles.manifestFileSize).isEqualTo(66); + assertThat(livefiles.manifestFileSize).isEqualTo(70); assertThat(livefiles.files.size()).isEqualTo(3); assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT"); assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000005"); @@ -1476,9 +1526,8 @@ public void getLiveFilesMetaData() throws RocksDBException { public void getColumnFamilyMetaData() throws RocksDBException { try (final DBOptions options = new DBOptions() .setCreateIfMissing(true)) { - final List cfDescs = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) - ); + final List cfDescs = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); final List cfHandles = new ArrayList<>(); final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { @@ -1512,9 +1561,8 @@ public void verifyChecksum() throws RocksDBException { public void getPropertiesOfAllTables() throws RocksDBException { try (final DBOptions options = new DBOptions() .setCreateIfMissing(true)) { - final List cfDescs = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) - ); + final List cfDescs = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); final List cfHandles = new ArrayList<>(); final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { @@ -1536,9 +1584,8 @@ public void getPropertiesOfAllTables() throws RocksDBException { public void getPropertiesOfTablesInRange() throws RocksDBException { try (final DBOptions options = new DBOptions() .setCreateIfMissing(true)) { - final List cfDescs = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) - ); + final List cfDescs = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); final List cfHandles = new ArrayList<>(); final String dbPath = 
dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { @@ -1550,8 +1597,7 @@ public void getPropertiesOfTablesInRange() throws RocksDBException { new Slice("key1".getBytes(UTF_8)), new Slice("key3".getBytes(UTF_8))); final Map properties = - db.getPropertiesOfTablesInRange( - cfHandles.get(0), Arrays.asList(range)); + db.getPropertiesOfTablesInRange(cfHandles.get(0), Collections.singletonList(range)); assertThat(properties).isNotNull(); } finally { for (final ColumnFamilyHandle cfHandle : cfHandles) { @@ -1566,18 +1612,51 @@ public void getPropertiesOfTablesInRange() throws RocksDBException { public void suggestCompactRange() throws RocksDBException { try (final DBOptions options = new DBOptions() .setCreateIfMissing(true)) { - final List cfDescs = Arrays.asList( - new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY) - ); + final List cfDescs = + Collections.singletonList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + final List cfHandles = new ArrayList<>(); + final String dbPath = dbFolder.getRoot().getAbsolutePath(); + try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { + db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); + db.put(cfHandles.get(0), "key2".getBytes(UTF_8), "value2".getBytes(UTF_8)); + db.put(cfHandles.get(0), "key3".getBytes(UTF_8), "value3".getBytes(UTF_8)); + try { + final Range range = db.suggestCompactRange(); + assertThat(range).isNotNull(); + } finally { + for (final ColumnFamilyHandle cfHandle : cfHandles) { + cfHandle.close(); + } + } + } + } + } + + @Test + public void suggestCompactRangeCF() throws RocksDBException { + try (final DBOptions options = + new DBOptions().setCreateIfMissing(true).setCreateMissingColumnFamilies(true)) { + final List cfDescs = + Arrays.asList(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY), + new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions()), + new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions())); + final List cfHandles = new ArrayList<>(); final String dbPath = dbFolder.getRoot().getAbsolutePath(); try (final RocksDB db = RocksDB.open(options, dbPath, cfDescs, cfHandles)) { db.put(cfHandles.get(0), "key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); db.put(cfHandles.get(0), "key2".getBytes(UTF_8), "value2".getBytes(UTF_8)); db.put(cfHandles.get(0), "key3".getBytes(UTF_8), "value3".getBytes(UTF_8)); + db.put(cfHandles.get(1), "key1_new_cf".getBytes(UTF_8), "value1".getBytes(UTF_8)); + db.put(cfHandles.get(1), "key2_new_cf".getBytes(UTF_8), "value2".getBytes(UTF_8)); + db.put(cfHandles.get(1), "key3_new_cf".getBytes(UTF_8), "value3".getBytes(UTF_8)); try { final Range range = db.suggestCompactRange(cfHandles.get(0)); assertThat(range).isNotNull(); + final Range rangeCF = db.suggestCompactRange(cfHandles.get(1)); + assertThat(rangeCF).isNotNull(); + final Range rangeCFEmpty = db.suggestCompactRange(cfHandles.get(2)); + assertThat(rangeCFEmpty).isNotNull(); } finally { for (final ColumnFamilyHandle cfHandle : cfHandles) { cfHandle.close(); @@ -1682,8 +1761,8 @@ public void closeWriter() { @Override public long getFileSize() { long size = 0; - for (int i = 0; i < writes.size(); i++) { - size += writes.get(i).length; + for (final byte[] write : writes) { + size += write.length; } return size; } diff --git a/java/src/test/java/org/rocksdb/RocksMemEnvTest.java b/java/src/test/java/org/rocksdb/RocksMemEnvTest.java index cce0c61e0ed6..40b24ffa3804 100644 --- 
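The suggestCompactRange tests above cover both the whole-database overload and the per-column-family overload, and show that a Range is returned even for an empty column family. A hedged usage sketch (cfHandles stands for the handles returned when the database was opened):

    // Ask RocksDB which key range would benefit from a manual compaction.
    final Range wholeDb = db.suggestCompactRange();

    // The same query, scoped to a single column family.
    final Range forOneCf = db.suggestCompactRange(cfHandles.get(1));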
a/java/src/test/java/org/rocksdb/RocksMemEnvTest.java +++ b/java/src/test/java/org/rocksdb/RocksMemEnvTest.java @@ -32,12 +32,8 @@ public void memEnvFillAndReopen() throws RocksDBException { }; try (final Env env = new RocksMemEnv(Env.getDefault()); - final Options options = new Options() - .setCreateIfMissing(true) - .setEnv(env); - final FlushOptions flushOptions = new FlushOptions() - .setWaitForFlush(true); - ) { + final Options options = new Options().setCreateIfMissing(true).setEnv(env); + final FlushOptions flushOptions = new FlushOptions().setWaitForFlush(true)) { try (final RocksDB db = RocksDB.open(options, "/dir/db")) { // write key/value pairs using MemEnv for (int i = 0; i < keys.length; i++) { diff --git a/java/src/test/java/org/rocksdb/SstFileReaderTest.java b/java/src/test/java/org/rocksdb/SstFileReaderTest.java index e29df99f2a60..ef74b08a72ab 100644 --- a/java/src/test/java/org/rocksdb/SstFileReaderTest.java +++ b/java/src/test/java/org/rocksdb/SstFileReaderTest.java @@ -58,7 +58,7 @@ public static Iterable parameters() { {"direct", ByteBufferAllocator.DIRECT}, {"indirect", ByteBufferAllocator.HEAP}}); } - @Parameterized.Parameter(0) public String name; + @Parameterized.Parameter() public String name; @Parameterized.Parameter(1) public ByteBufferAllocator byteBufferAllocator; diff --git a/java/src/test/java/org/rocksdb/SstFileWriterTest.java b/java/src/test/java/org/rocksdb/SstFileWriterTest.java index 87165bfe1f11..c0f4ed9f1a4a 100644 --- a/java/src/test/java/org/rocksdb/SstFileWriterTest.java +++ b/java/src/test/java/org/rocksdb/SstFileWriterTest.java @@ -12,7 +12,7 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Arrays; +import java.util.Collections; import java.util.List; import org.junit.ClassRule; import org.junit.Rule; @@ -33,7 +33,7 @@ public class SstFileWriterTest { enum OpType { PUT, PUT_BYTES, PUT_DIRECT, MERGE, MERGE_BYTES, DELETE, DELETE_BYTES } static class KeyValueWithOp { - KeyValueWithOp(String key, String value, OpType opType) { + KeyValueWithOp(final String key, final String value, final OpType opType) { this.key = key; this.value = value; this.opType = opType; @@ -54,14 +54,14 @@ OpType getOpType() { private final String key; private final String value; private final OpType opType; - }; + } private File newSstFile(final List keyValues, - boolean useJavaBytewiseComparator) throws IOException, RocksDBException { + final boolean useJavaBytewiseComparator) throws IOException, RocksDBException { final EnvOptions envOptions = new EnvOptions(); final StringAppendOperator stringAppendOperator = new StringAppendOperator(); final Options options = new Options().setMergeOperator(stringAppendOperator); - SstFileWriter sstFileWriter = null; + final SstFileWriter sstFileWriter; ComparatorOptions comparatorOptions = null; BytewiseComparator comparator = null; if (useJavaBytewiseComparator) { @@ -77,15 +77,15 @@ private File newSstFile(final List keyValues, try { sstFileWriter.open(sstFile.getAbsolutePath()); assertThat(sstFileWriter.fileSize()).isEqualTo(0); - for (KeyValueWithOp keyValue : keyValues) { - Slice keySlice = new Slice(keyValue.getKey()); - Slice valueSlice = new Slice(keyValue.getValue()); - byte[] keyBytes = keyValue.getKey().getBytes(); - byte[] valueBytes = keyValue.getValue().getBytes(); - ByteBuffer keyDirect = ByteBuffer.allocateDirect(keyBytes.length); + for (final KeyValueWithOp keyValue : keyValues) { + final Slice keySlice = new Slice(keyValue.getKey()); + final Slice valueSlice = new 
Slice(keyValue.getValue()); + final byte[] keyBytes = keyValue.getKey().getBytes(); + final byte[] valueBytes = keyValue.getValue().getBytes(); + final ByteBuffer keyDirect = ByteBuffer.allocateDirect(keyBytes.length); keyDirect.put(keyBytes); keyDirect.flip(); - ByteBuffer valueDirect = ByteBuffer.allocateDirect(valueBytes.length); + final ByteBuffer valueDirect = ByteBuffer.allocateDirect(valueBytes.length); valueDirect.put(valueBytes); valueDirect.flip(); switch (keyValue.getOpType()) { @@ -185,8 +185,8 @@ public void ingestSstFile() throws RocksDBException, IOException { final RocksDB db = RocksDB.open(options, dbFolder.getAbsolutePath()); final IngestExternalFileOptions ingestExternalFileOptions = new IngestExternalFileOptions()) { - db.ingestExternalFile(Arrays.asList(sstFile.getAbsolutePath()), - ingestExternalFileOptions); + db.ingestExternalFile( + Collections.singletonList(sstFile.getAbsolutePath()), ingestExternalFileOptions); assertThat(db.get("key1".getBytes())).isEqualTo("value1".getBytes()); assertThat(db.get("key2".getBytes())).isEqualTo("value2".getBytes()); @@ -222,9 +222,7 @@ public void ingestSstFile_cf() throws RocksDBException, IOException { .setMergeOperator(stringAppendOperator); final ColumnFamilyHandle cf_handle = db.createColumnFamily( new ColumnFamilyDescriptor("new_cf".getBytes(), cf_opts))) { - - db.ingestExternalFile(cf_handle, - Arrays.asList(sstFile.getAbsolutePath()), + db.ingestExternalFile(cf_handle, Collections.singletonList(sstFile.getAbsolutePath()), ingestExternalFileOptions); assertThat(db.get(cf_handle, diff --git a/java/src/test/java/org/rocksdb/SstPartitionerTest.java b/java/src/test/java/org/rocksdb/SstPartitionerTest.java index 74816db932d8..3ee739053794 100644 --- a/java/src/test/java/org/rocksdb/SstPartitionerTest.java +++ b/java/src/test/java/org/rocksdb/SstPartitionerTest.java @@ -23,7 +23,7 @@ public class SstPartitionerTest { @Test public void sstFixedPrefix() throws RocksDBException { - try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4); + try (final SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4); final Options opt = new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory); final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) { @@ -38,7 +38,7 @@ public void sstFixedPrefix() throws RocksDBException { db.compactRange(); - List metadata = db.getLiveFilesMetaData(); + final List metadata = db.getLiveFilesMetaData(); assertThat(metadata.size()).isEqualTo(2); } } @@ -65,7 +65,7 @@ public void sstFixedPrefixFamily() throws RocksDBException { db.compactRange(columnFamilyHandle); - List metadata = db.getLiveFilesMetaData(); + final List metadata = db.getLiveFilesMetaData(); assertThat(metadata.size()).isEqualTo(2); } } diff --git a/java/src/test/java/org/rocksdb/StatisticsTest.java b/java/src/test/java/org/rocksdb/StatisticsTest.java index de92102ec297..269cc56a0e63 100644 --- a/java/src/test/java/org/rocksdb/StatisticsTest.java +++ b/java/src/test/java/org/rocksdb/StatisticsTest.java @@ -5,24 +5,28 @@ package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import java.nio.charset.StandardCharsets; +import java.util.EnumSet; import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.nio.charset.StandardCharsets; - -import static org.assertj.core.api.Assertions.assertThat; - public class StatisticsTest { - - @ClassRule - public static final 
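The SstFileWriterTest changes above swap Arrays.asList for Collections.singletonList when a single SST file is ingested. The ingestion call itself, shown in isolation (the file path is illustrative):

    try (final IngestExternalFileOptions ingestOptions = new IngestExternalFileOptions()) {
      // Ingest one externally written SST file into the default column family.
      db.ingestExternalFile(Collections.singletonList("/tmp/data.sst"), ingestOptions);
    }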
RocksNativeLibraryResource ROCKS_NATIVE_LIBRARY_RESOURCE = - new RocksNativeLibraryResource(); - @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); + @Test + public void createStatistics() throws RocksDBException { + final Statistics statistics = new Statistics(); + statistics.setStatsLevel(StatsLevel.EXCEPT_DETAILED_TIMERS); + final Statistics statisticsWithHistogramOptions = + new Statistics(EnumSet.of(HistogramType.DB_WRITE, HistogramType.COMPACTION_TIME)); + statisticsWithHistogramOptions.reset(); + } + @Test public void statsLevel() throws RocksDBException { final Statistics statistics = new Statistics(); diff --git a/java/src/test/java/org/rocksdb/StatsCallbackMock.java b/java/src/test/java/org/rocksdb/StatsCallbackMock.java index af8db0caabd0..c6a7294c97f1 100644 --- a/java/src/test/java/org/rocksdb/StatsCallbackMock.java +++ b/java/src/test/java/org/rocksdb/StatsCallbackMock.java @@ -9,12 +9,11 @@ public class StatsCallbackMock implements StatisticsCollectorCallback { public int tickerCallbackCount = 0; public int histCallbackCount = 0; - public void tickerCallback(TickerType tickerType, long tickerCount) { + public void tickerCallback(final TickerType tickerType, final long tickerCount) { tickerCallbackCount++; } - public void histogramCallback(HistogramType histType, - HistogramData histData) { + public void histogramCallback(final HistogramType histType, final HistogramData histData) { histCallbackCount++; } } diff --git a/java/src/test/java/org/rocksdb/TimedEnvTest.java b/java/src/test/java/org/rocksdb/TimedEnvTest.java index c958f96b2036..31bad2e2edd8 100644 --- a/java/src/test/java/org/rocksdb/TimedEnvTest.java +++ b/java/src/test/java/org/rocksdb/TimedEnvTest.java @@ -31,10 +31,7 @@ public void construct() throws RocksDBException { @Test public void construct_integration() throws RocksDBException { try (final Env env = new TimedEnv(Env.getDefault()); - final Options options = new Options() - .setCreateIfMissing(true) - .setEnv(env); - ) { + final Options options = new Options().setCreateIfMissing(true).setEnv(env)) { try (final RocksDB db = RocksDB.open(options, dbFolder.getRoot().getPath())) { db.put("key1".getBytes(UTF_8), "value1".getBytes(UTF_8)); } diff --git a/java/src/test/java/org/rocksdb/TransactionDBTest.java b/java/src/test/java/org/rocksdb/TransactionDBTest.java index b0ea813ff5e0..56acb21c7252 100644 --- a/java/src/test/java/org/rocksdb/TransactionDBTest.java +++ b/java/src/test/java/org/rocksdb/TransactionDBTest.java @@ -130,9 +130,8 @@ public void lockStatusData() throws RocksDBException { final ReadOptions readOptions = new ReadOptions()) { try (final Transaction txn = tdb.beginTransaction(writeOptions)) { - - final byte key[] = "key".getBytes(UTF_8); - final byte value[] = "value".getBytes(UTF_8); + final byte[] key = "key".getBytes(UTF_8); + final byte[] value = "value".getBytes(UTF_8); txn.put(key, value); assertThat(txn.getForUpdate(readOptions, key, true)).isEqualTo(value); diff --git a/java/src/test/java/org/rocksdb/TransactionTest.java b/java/src/test/java/org/rocksdb/TransactionTest.java index 8a3067de9ac8..b80445c5c807 100644 --- a/java/src/test/java/org/rocksdb/TransactionTest.java +++ b/java/src/test/java/org/rocksdb/TransactionTest.java @@ -116,7 +116,7 @@ public void prepare_read_prepared_commit() throws RocksDBException { txn.commit(); } - Transaction txnPrepare; + final Transaction txnPrepare; txnPrepare = dbContainer.beginTransaction(); txnPrepare.setName("txnPrepare1"); txnPrepare.put(k1, v12); @@ -147,7 +147,7 @@ public void 
prepare_read_prepared_rollback() throws RocksDBException { txn.commit(); } - Transaction txnPrepare; + final Transaction txnPrepare; txnPrepare = dbContainer.beginTransaction(); txnPrepare.setName("txnPrepare1"); txnPrepare.put(k1, v12); diff --git a/java/src/test/java/org/rocksdb/TtlDBTest.java b/java/src/test/java/org/rocksdb/TtlDBTest.java index ffa15e768ed4..ebf9e9eaa3b4 100644 --- a/java/src/test/java/org/rocksdb/TtlDBTest.java +++ b/java/src/test/java/org/rocksdb/TtlDBTest.java @@ -40,7 +40,7 @@ public void ttlDBOpen() throws RocksDBException, InterruptedException { @Test public void ttlDBOpenWithTtl() throws RocksDBException, InterruptedException { try (final Options options = new Options().setCreateIfMissing(true).setMaxCompactionBytes(0); - final TtlDB ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), 1, false);) { + final TtlDB ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), 1, false)) { ttlDB.put("key".getBytes(), "value".getBytes()); assertThat(ttlDB.get("key".getBytes())). isEqualTo("value".getBytes()); diff --git a/java/src/test/java/org/rocksdb/Types.java b/java/src/test/java/org/rocksdb/Types.java index c3c1de833a51..a6abdecbca05 100644 --- a/java/src/test/java/org/rocksdb/Types.java +++ b/java/src/test/java/org/rocksdb/Types.java @@ -18,7 +18,7 @@ public class Types { * * @return An integer */ - public static int byteToInt(final byte data[]) { + public static int byteToInt(final byte[] data) { return (data[0] & 0xff) | ((data[1] & 0xff) << 8) | ((data[2] & 0xff) << 16) | diff --git a/java/src/test/java/org/rocksdb/WalFilterTest.java b/java/src/test/java/org/rocksdb/WalFilterTest.java index adeb959d1e18..08bc6eef5058 100644 --- a/java/src/test/java/org/rocksdb/WalFilterTest.java +++ b/java/src/test/java/org/rocksdb/WalFilterTest.java @@ -63,10 +63,10 @@ public void walFilter() throws RocksDBException { cfDescriptors, cfHandles)) { try (final WriteOptions writeOptions = new WriteOptions()) { // Write given keys in given batches - for (int i = 0; i < batchKeys.length; i++) { + for (final byte[][] batchKey : batchKeys) { final WriteBatch batch = new WriteBatch(); - for (int j = 0; j < batchKeys[i].length; j++) { - batch.put(cfHandles.get(0), batchKeys[i][j], dummyString(1024)); + for (final byte[] bytes : batchKey) { + batch.put(cfHandles.get(0), bytes, dummyString(1024)); } db.write(writeOptions, batch); } diff --git a/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java b/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java index c5090dbceba3..0321da3fac44 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java @@ -23,7 +23,7 @@ public class WriteBatchThreadedTest { @Parameters(name = "WriteBatchThreadedTest(threadCount={0})") public static Iterable data() { - return Arrays.asList(new Integer[]{1, 10, 50, 100}); + return Arrays.asList(1, 10, 50, 100); } @Parameter @@ -56,18 +56,15 @@ public void threadedWrites() throws InterruptedException, ExecutionException { final List> callables = new ArrayList<>(); for (int i = 0; i < 100; i++) { final int offset = i * 100; - callables.add(new Callable() { - @Override - public Void call() throws RocksDBException { - try (final WriteBatch wb = new WriteBatch(); - final WriteOptions w_opt = new WriteOptions()) { - for (int i = offset; i < offset + 100; i++) { - wb.put(ByteBuffer.allocate(4).putInt(i).array(), "parallel rocks test".getBytes()); - } - db.write(w_opt, wb); + callables.add(() -> { + try (final 
WriteBatch wb = new WriteBatch(); + final WriteOptions w_opt = new WriteOptions()) { + for (int i1 = offset; i1 < offset + 100; i1++) { + wb.put(ByteBuffer.allocate(4).putInt(i1).array(), "parallel rocks test".getBytes()); } - return null; + db.write(w_opt, wb); } + return null; }); } diff --git a/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/java/src/test/java/org/rocksdb/WriteOptionsTest.java index 735677cb78a9..1e1c93fb5e22 100644 --- a/java/src/test/java/org/rocksdb/WriteOptionsTest.java +++ b/java/src/test/java/org/rocksdb/WriteOptionsTest.java @@ -59,12 +59,12 @@ public void writeOptions() { @Test public void copyConstructor() { - WriteOptions origOpts = new WriteOptions(); + final WriteOptions origOpts = new WriteOptions(); origOpts.setDisableWAL(rand.nextBoolean()); origOpts.setIgnoreMissingColumnFamilies(rand.nextBoolean()); origOpts.setSync(rand.nextBoolean()); origOpts.setMemtableInsertHintPerBatch(true); - WriteOptions copyOpts = new WriteOptions(origOpts); + final WriteOptions copyOpts = new WriteOptions(origOpts); assertThat(origOpts.disableWAL()).isEqualTo(copyOpts.disableWAL()); assertThat(origOpts.ignoreMissingColumnFamilies()).isEqualTo( copyOpts.ignoreMissingColumnFamilies()); diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index ae340e06d5d7..5e53692171b9 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -5,7 +5,6 @@ package org.rocksdb.util; import static org.assertj.core.api.Assertions.assertThat; -import static org.hamcrest.Matchers.is; import java.lang.reflect.Field; import org.junit.AfterClass; @@ -13,11 +12,11 @@ import org.junit.Test; public class EnvironmentTest { - private final static String ARCH_FIELD_NAME = "ARCH"; - private final static String OS_FIELD_NAME = "OS"; + private static final String ARCH_FIELD_NAME = "ARCH"; + private static final String OS_FIELD_NAME = "OS"; - private final static String MUSL_ENVIRONMENT_FIELD_NAME = "MUSL_ENVIRONMENT"; - private final static String MUSL_LIBC_FIELD_NAME = "MUSL_LIBC"; + private static final String MUSL_ENVIRONMENT_FIELD_NAME = "MUSL_ENVIRONMENT"; + private static final String MUSL_LIBC_FIELD_NAME = "MUSL_LIBC"; private static String INITIAL_OS; private static String INITIAL_ARCH; @@ -255,8 +254,7 @@ public void resolveIsMuslLibc() { assertThat(Environment.initIsMuslLibc()).isFalse(); } - private void setEnvironmentClassFields(String osName, - String osArch) { + private void setEnvironmentClassFields(final String osName, final String osArch) { setEnvironmentClassField(OS_FIELD_NAME, osName); setEnvironmentClassField(ARCH_FIELD_NAME, osArch); } @@ -270,7 +268,7 @@ public static void restoreState() { } @SuppressWarnings("unchecked") - private static T getEnvironmentClassField(String fieldName) { + private static T getEnvironmentClassField(final String fieldName) { final Field field; try { field = Environment.class.getDeclaredField(fieldName); @@ -286,7 +284,7 @@ private static T getEnvironmentClassField(String fieldName) { } } - private static void setEnvironmentClassField(String fieldName, Object value) { + private static void setEnvironmentClassField(final String fieldName, final Object value) { final Field field; try { field = Environment.class.getDeclaredField(fieldName); diff --git a/logging/auto_roll_logger.cc b/logging/auto_roll_logger.cc index fe0958479127..9e9ad45aee0f 100644 --- a/logging/auto_roll_logger.cc +++ 
b/logging/auto_roll_logger.cc @@ -16,7 +16,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE // -- AutoRollLogger AutoRollLogger::AutoRollLogger(const std::shared_ptr& fs, @@ -269,7 +268,6 @@ bool AutoRollLogger::LogExpired() { ++cached_now_access_count; return cached_now >= ctime_ + kLogFileTimeToRoll; } -#endif // !ROCKSDB_LITE Status CreateLoggerFromOptions(const std::string& dbname, const DBOptions& options, @@ -312,7 +310,6 @@ Status CreateLoggerFromOptions(const std::string& dbname, return s; } } -#ifndef ROCKSDB_LITE // Currently we only support roll by time-to-roll and log size if (options.log_file_time_to_roll > 0 || options.max_log_file_size > 0) { AutoRollLogger* result = new AutoRollLogger( @@ -327,7 +324,6 @@ Status CreateLoggerFromOptions(const std::string& dbname, } return s; } -#endif // !ROCKSDB_LITE // Open a log file in the same directory as the db s = env->FileExists(fname); if (s.ok()) { diff --git a/logging/auto_roll_logger.h b/logging/auto_roll_logger.h index 805925e5a8ad..dca9996fea0f 100644 --- a/logging/auto_roll_logger.h +++ b/logging/auto_roll_logger.h @@ -21,7 +21,6 @@ namespace ROCKSDB_NAMESPACE { class FileSystem; class SystemClock; -#ifndef ROCKSDB_LITE // Rolls the log file by size and/or time class AutoRollLogger : public Logger { public: @@ -158,7 +157,6 @@ class AutoRollLogger : public Logger { IODebugContext io_context_; mutable port::Mutex mutex_; }; -#endif // !ROCKSDB_LITE // Facade to craete logger automatically Status CreateLoggerFromOptions(const std::string& dbname, diff --git a/logging/auto_roll_logger_test.cc b/logging/auto_roll_logger_test.cc index 8e94a78c8244..3d0ec1763f28 100644 --- a/logging/auto_roll_logger_test.cc +++ b/logging/auto_roll_logger_test.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "logging/auto_roll_logger.h" @@ -730,13 +729,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as AutoRollLogger is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/logging/env_logger.h b/logging/env_logger.h index 8164945cfa9b..fc9b245504f8 100644 --- a/logging/env_logger.h +++ b/logging/env_logger.h @@ -76,6 +76,7 @@ class EnvLogger : public Logger { if (flush_pending_) { flush_pending_ = false; file_.Flush().PermitUncheckedError(); + file_.reset_seen_error(); } last_flush_micros_ = clock_->NowMicros(); } @@ -162,6 +163,7 @@ class EnvLogger : public Logger { FileOpGuard guard(*this); // We will ignore any error returned by Append(). 
file_.Append(Slice(base, p - base)).PermitUncheckedError(); + file_.reset_seen_error(); flush_pending_ = true; const uint64_t now_micros = clock_->NowMicros(); if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { diff --git a/memory/arena_test.cc b/memory/arena_test.cc index 21bf7ed6282d..592bbd723f59 100644 --- a/memory/arena_test.cc +++ b/memory/arena_test.cc @@ -219,21 +219,28 @@ size_t PopMinorPageFaultCount() { TEST(MmapTest, AllocateLazyZeroed) { // Doesn't have to be page aligned - constexpr size_t len = 1234567; - MemMapping m = MemMapping::AllocateLazyZeroed(len); - auto arr = static_cast(m.Get()); + constexpr size_t len = 1234567; // in bytes + constexpr size_t count = len / 8; // in uint64_t objects + // Implicit conversion move + TypedMemMapping pre_arr = MemMapping::AllocateLazyZeroed(len); + // Move from same type + TypedMemMapping arr = std::move(pre_arr); - // Should generally work - ASSERT_NE(arr, nullptr); + ASSERT_NE(arr.Get(), nullptr); + ASSERT_EQ(arr.Get(), &arr[0]); + ASSERT_EQ(arr.Get(), arr.MemMapping::Get()); + + ASSERT_EQ(arr.Length(), len); + ASSERT_EQ(arr.Count(), count); // Start counting page faults PopMinorPageFaultCount(); // Access half of the allocation size_t i = 0; - for (; i < len / 2; ++i) { + for (; i < count / 2; ++i) { ASSERT_EQ(arr[i], 0); - arr[i] = static_cast(i & 255); + arr[i] = i; } // Appropriate page faults (maybe more) @@ -241,9 +248,9 @@ TEST(MmapTest, AllocateLazyZeroed) { ASSERT_GE(faults, len / 2 / port::kPageSize); // Access rest of the allocation - for (; i < len; ++i) { + for (; i < count; ++i) { ASSERT_EQ(arr[i], 0); - arr[i] = static_cast(i & 255); + arr[i] = i; } // Appropriate page faults (maybe more) @@ -251,8 +258,8 @@ TEST(MmapTest, AllocateLazyZeroed) { ASSERT_GE(faults, len / 2 / port::kPageSize); // Verify data - for (i = 0; i < len; ++i) { - ASSERT_EQ(arr[i], static_cast(i & 255)); + for (i = 0; i < count; ++i) { + ASSERT_EQ(arr[i], i); } } diff --git a/memory/jemalloc_nodump_allocator.cc b/memory/jemalloc_nodump_allocator.cc index 6540242f8c17..7d9dc7ea0c5c 100644 --- a/memory/jemalloc_nodump_allocator.cc +++ b/memory/jemalloc_nodump_allocator.cc @@ -25,7 +25,6 @@ std::atomic JemallocNodumpAllocator::original_alloc_{nullptr}; #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR static std::unordered_map jemalloc_type_info = { -#ifndef ROCKSDB_LITE {"limit_tcache_size", {offsetof(struct JemallocAllocatorOptions, limit_tcache_size), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -41,7 +40,6 @@ static std::unordered_map jemalloc_type_info = { {"num_arenas", {offsetof(struct JemallocAllocatorOptions, num_arenas), OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; bool JemallocNodumpAllocator::IsSupported(std::string* why) { #ifndef ROCKSDB_JEMALLOC @@ -65,7 +63,7 @@ bool JemallocNodumpAllocator::IsSupported(std::string* why) { } JemallocNodumpAllocator::JemallocNodumpAllocator( - JemallocAllocatorOptions& options) + const JemallocAllocatorOptions& options) : options_(options) #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR , @@ -286,7 +284,7 @@ void JemallocNodumpAllocator::DestroyThreadSpecificCache(void* ptr) { #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR Status NewJemallocNodumpAllocator( - JemallocAllocatorOptions& options, + const JemallocAllocatorOptions& options, std::shared_ptr* memory_allocator) { if (memory_allocator == nullptr) { return Status::InvalidArgument("memory_allocator must be non-null."); diff --git a/memory/jemalloc_nodump_allocator.h 
b/memory/jemalloc_nodump_allocator.h index 4481fd50621c..2e9e81b39a10 100644 --- a/memory/jemalloc_nodump_allocator.h +++ b/memory/jemalloc_nodump_allocator.h @@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE { // arena mutexes. class JemallocNodumpAllocator : public BaseMemoryAllocator { public: - explicit JemallocNodumpAllocator(JemallocAllocatorOptions& options); + explicit JemallocNodumpAllocator(const JemallocAllocatorOptions& options); #ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR ~JemallocNodumpAllocator(); #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR @@ -89,7 +89,7 @@ class JemallocNodumpAllocator : public BaseMemoryAllocator { std::vector> per_arena_hooks_; // Hold thread-local tcache index. - mutable ThreadLocalPtr tcache_; + ThreadLocalPtr tcache_; std::vector arena_indexes_; #endif // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR diff --git a/memory/memory_allocator.cc b/memory/memory_allocator.cc index 34dce9bb66d5..d0de26b94d52 100644 --- a/memory/memory_allocator.cc +++ b/memory/memory_allocator.cc @@ -15,13 +15,10 @@ namespace ROCKSDB_NAMESPACE { namespace { static std::unordered_map ma_wrapper_type_info = { -#ifndef ROCKSDB_LITE {"target", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}, -#endif // ROCKSDB_LITE }; -#ifndef ROCKSDB_LITE static int RegisterBuiltinAllocators(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -61,7 +58,6 @@ static int RegisterBuiltinAllocators(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE } // namespace MemoryAllocatorWrapper::MemoryAllocatorWrapper( @@ -73,17 +69,10 @@ MemoryAllocatorWrapper::MemoryAllocatorWrapper( Status MemoryAllocator::CreateFromString( const ConfigOptions& options, const std::string& value, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinAllocators(*(ObjectLibrary::Default().get()), ""); }); -#else - if (value == DefaultMemoryAllocator::kClassName()) { - result->reset(new DefaultMemoryAllocator()); - return Status::OK(); - } -#endif // ROCKSDB_LITE ConfigOptions copy = options; copy.invoke_prepare_options = true; return LoadManagedObject(copy, value, result); diff --git a/memory/memory_allocator.h b/memory/memory_allocator_impl.h similarity index 100% rename from memory/memory_allocator.h rename to memory/memory_allocator_impl.h diff --git a/memory/memory_allocator_test.cc b/memory/memory_allocator_test.cc index 19fa72f58d84..6060bdd7858d 100644 --- a/memory/memory_allocator_test.cc +++ b/memory/memory_allocator_test.cc @@ -20,7 +20,6 @@ namespace ROCKSDB_NAMESPACE { // TODO: the tests do not work in LITE mode due to relying on // `CreateFromString()` to create non-default memory allocators. 
-#ifndef ROCKSDB_LITE class MemoryAllocatorTest : public testing::Test, @@ -63,11 +62,9 @@ TEST_P(MemoryAllocatorTest, CreateAllocator) { } else { ASSERT_OK(s); ASSERT_NE(orig, nullptr); -#ifndef ROCKSDB_LITE std::string str = orig->ToString(config_options); ASSERT_OK(MemoryAllocator::CreateFromString(config_options, str, ©)); ASSERT_EQ(orig, copy); -#endif // ROCKSDB_LITE } } @@ -241,7 +238,6 @@ INSTANTIATE_TEST_CASE_P( JemallocNodumpAllocator::IsSupported()))); #endif // ROCKSDB_JEMALLOC -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index a717683048ce..9e60f9be3784 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include #include @@ -923,4 +922,3 @@ MemTableRepFactory* NewHashLinkListRepFactory( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index 9d093829ba8c..15ff4f0719b9 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include #include "db/memtable.h" @@ -390,4 +389,3 @@ MemTableRepFactory* NewHashSkipListRepFactory( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index f856440649bb..930574ec7264 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -576,6 +576,7 @@ static void ConcurrentReader(void* arg) { state->t_.ReadStep(&rnd); ++reads; } + (void)reads; state->Change(TestState::DONE); } diff --git a/memtable/memtablerep_bench.cc b/memtable/memtablerep_bench.cc index a915abed7863..83db461581ba 100644 --- a/memtable/memtablerep_bench.cc +++ b/memtable/memtablerep_bench.cc @@ -579,7 +579,6 @@ int main(int argc, char** argv) { std::unique_ptr factory; if (FLAGS_memtablerep == "skiplist") { factory.reset(new ROCKSDB_NAMESPACE::SkipListFactory); -#ifndef ROCKSDB_LITE } else if (FLAGS_memtablerep == "vector") { factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory); } else if (FLAGS_memtablerep == "hashskiplist" || @@ -597,7 +596,6 @@ int main(int argc, char** argv) { FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist)); options.prefix_extractor.reset( ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_length)); -#endif // ROCKSDB_LITE } else { ROCKSDB_NAMESPACE::ConfigOptions config_options; config_options.ignore_unsupported_options = false; diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index a070885110f0..868c51876e9d 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -348,6 +348,7 @@ static void ConcurrentReader(void* arg) { state->t_.ReadStep(&rnd); ++reads; } + (void)reads; state->Change(TestState::DONE); } diff --git a/memtable/skiplistrep.cc b/memtable/skiplistrep.cc index 40f13a2c17dd..c3b4c785d38e 100644 --- a/memtable/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -341,11 +341,9 @@ class SkipListRep : public MemTableRep { } // namespace static std::unordered_map skiplist_factory_info = { -#ifndef ROCKSDB_LITE {"lookahead", {0, OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kDontSerialize /*Since it is part of the ID*/}}, -#endif }; SkipListFactory::SkipListFactory(size_t lookahead) : lookahead_(lookahead) { diff --git 
a/memtable/vectorrep.cc b/memtable/vectorrep.cc index 29316334999e..e42ae4439c84 100644 --- a/memtable/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include #include #include @@ -306,4 +305,3 @@ MemTableRep* VectorRepFactory::CreateMemTableRep( return new VectorRep(compare, allocator, count_); } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/memtable/write_buffer_manager.cc b/memtable/write_buffer_manager.cc index 8db9816bef71..ce1789c20d69 100644 --- a/memtable/write_buffer_manager.cc +++ b/memtable/write_buffer_manager.cc @@ -28,7 +28,6 @@ WriteBufferManager::WriteBufferManager(size_t _buffer_size, cache_res_mgr_(nullptr), allow_stall_(allow_stall), stall_active_(false) { -#ifndef ROCKSDB_LITE if (cache) { // Memtable's memory usage tends to fluctuate frequently // therefore we set delayed_decrease = true to save some dummy entry @@ -37,9 +36,6 @@ WriteBufferManager::WriteBufferManager(size_t _buffer_size, CacheReservationManagerImpl>( cache, true /* delayed_decrease */); } -#else - (void)cache; -#endif // ROCKSDB_LITE } WriteBufferManager::~WriteBufferManager() { @@ -70,7 +66,6 @@ void WriteBufferManager::ReserveMem(size_t mem) { // Should only be called from write thread void WriteBufferManager::ReserveMemWithCache(size_t mem) { -#ifndef ROCKSDB_LITE assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. @@ -86,9 +81,6 @@ void WriteBufferManager::ReserveMemWithCache(size_t mem) { // [TODO] We'll need to improve it in the future and figure out what to do on // error s.PermitUncheckedError(); -#else - (void)mem; -#endif // ROCKSDB_LITE } void WriteBufferManager::ScheduleFreeMem(size_t mem) { @@ -108,7 +100,6 @@ void WriteBufferManager::FreeMem(size_t mem) { } void WriteBufferManager::FreeMemWithCache(size_t mem) { -#ifndef ROCKSDB_LITE assert(cache_res_mgr_ != nullptr); // Use a mutex to protect various data structures. Can be optimized to a // lock-free solution if it ends up with a performance bottleneck. @@ -122,14 +113,10 @@ void WriteBufferManager::FreeMemWithCache(size_t mem) { // [TODO] We'll need to improve it in the future and figure out what to do on // error s.PermitUncheckedError(); -#else - (void)mem; -#endif // ROCKSDB_LITE } void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { assert(wbm_stall != nullptr); - assert(allow_stall_); // Allocate outside of the lock. std::list new_node = {wbm_stall}; @@ -152,16 +139,12 @@ void WriteBufferManager::BeginWriteStall(StallInterface* wbm_stall) { // Called when memory is freed in FreeMem or the buffer size has changed. void WriteBufferManager::MaybeEndWriteStall() { - // Cannot early-exit on !enabled() because SetBufferSize(0) needs to unblock - // the writers. - if (!allow_stall_) { + // Stall conditions have not been resolved. + if (allow_stall_.load(std::memory_order_relaxed) && + IsStallThresholdExceeded()) { return; } - if (IsStallThresholdExceeded()) { - return; // Stall conditions have not resolved. - } - // Perform all deallocations outside of the lock. std::list cleanup; @@ -186,7 +169,7 @@ void WriteBufferManager::RemoveDBFromQueue(StallInterface* wbm_stall) { // Deallocate the removed nodes outside of the lock. 
std::list cleanup; - if (enabled() && allow_stall_) { + if (enabled() && allow_stall_.load(std::memory_order_relaxed)) { std::unique_lock lock(mu_); for (auto it = queue_.begin(); it != queue_.end();) { auto next = std::next(it); diff --git a/memtable/write_buffer_manager_test.cc b/memtable/write_buffer_manager_test.cc index 1cc4c2cc5764..c992d2eabcb8 100644 --- a/memtable/write_buffer_manager_test.cc +++ b/memtable/write_buffer_manager_test.cc @@ -9,12 +9,12 @@ #include "rocksdb/write_buffer_manager.h" +#include "rocksdb/advanced_cache.h" #include "test_util/testharness.h" namespace ROCKSDB_NAMESPACE { class WriteBufferManagerTest : public testing::Test {}; -#ifndef ROCKSDB_LITE const size_t kSizeDummyEntry = 256 * 1024; TEST_F(WriteBufferManagerTest, ShouldFlush) { @@ -295,7 +295,6 @@ TEST_F(ChargeWriteBufferTest, BasicWithCacheFull) { 46 * kSizeDummyEntry + kMetaDataChargeOverhead); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc index 6c70ad21d0fb..c2e547f607bf 100644 --- a/microbench/db_basic_bench.cc +++ b/microbench/db_basic_bench.cc @@ -303,7 +303,7 @@ static void DBPut(benchmark::State& state) { if (state.thread_index() == 0) { auto db_full = static_cast_with_check(db.get()); - Status s = db_full->WaitForCompact(true); + Status s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -410,7 +410,7 @@ static void ManualCompaction(benchmark::State& state) { if (state.thread_index() == 0) { auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -436,6 +436,8 @@ static void ManualCompaction(benchmark::State& state) { static_cast(get_perf_context()->block_read_count); state.counters["block_read_time"] = static_cast(get_perf_context()->block_read_time); + state.counters["block_read_cpu_time"] = + static_cast(get_perf_context()->block_read_cpu_time); state.counters["block_checksum_time"] = static_cast(get_perf_context()->block_checksum_time); state.counters["new_table_block_iter_nanos"] = @@ -508,7 +510,7 @@ static void ManualFlush(benchmark::State& state) { if (state.thread_index() == 0) { auto db_full = static_cast_with_check(db.get()); - Status s = db_full->WaitForCompact(true); + Status s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -536,6 +538,23 @@ static void ManualFlushArguments(benchmark::internal::Benchmark* b) { BENCHMARK(ManualFlush)->Iterations(1)->Apply(ManualFlushArguments); +// Copied from test_util.cc to not depend on rocksdb_test_lib +// when building microbench binaries. 
+static Slice CompressibleString(Random* rnd, double compressed_fraction, + int len, std::string* dst) { + int raw = static_cast(len * compressed_fraction); + if (raw < 1) raw = 1; + std::string raw_data = rnd->RandomBinaryString(raw); + + // Duplicate the random data until we have filled "len" bytes + dst->clear(); + while (dst->size() < (unsigned int)len) { + dst->append(raw_data); + } + dst->resize(len); + return Slice(*dst); +} + static void DBGet(benchmark::State& state) { auto compaction_style = static_cast(state.range(0)); uint64_t max_data = state.range(1); @@ -544,6 +563,9 @@ static void DBGet(benchmark::State& state) { bool negative_query = state.range(4); bool enable_filter = state.range(5); bool mmap = state.range(6); + auto compression_type = static_cast(state.range(7)); + bool compression_checksum = static_cast(state.range(8)); + bool no_blockcache = state.range(9); uint64_t key_num = max_data / per_key_size; // setup DB @@ -566,39 +588,45 @@ static void DBGet(benchmark::State& state) { table_options.no_block_cache = true; table_options.block_restart_interval = 1; } + options.compression = compression_type; + options.compression_opts.checksum = compression_checksum; + if (no_blockcache) { + table_options.no_block_cache = true; + } else { + table_options.block_cache = NewLRUCache(100 << 20); + } options.table_factory.reset(NewBlockBasedTableFactory(table_options)); auto rnd = Random(301 + state.thread_index()); - KeyGenerator kg(&rnd, key_num); if (state.thread_index() == 0) { + KeyGenerator kg_seq(key_num /* max_key */); SetupDB(state, options, &db, "DBGet"); - // load db + // Load all valid keys into DB. That way, iterations in `!negative_query` + // runs can always find the key even though it is generated from a random + // number. auto wo = WriteOptions(); wo.disableWAL = true; + std::string val; for (uint64_t i = 0; i < key_num; i++) { - Status s = db->Put(wo, kg.Next(), - rnd.RandomString(static_cast(per_key_size))); + CompressibleString(&rnd, 0.5, static_cast(per_key_size), &val); + Status s = db->Put(wo, kg_seq.Next(), val); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); } } - FlushOptions fo; - Status s = db->Flush(fo); - if (!s.ok()) { - state.SkipWithError(s.ToString().c_str()); - } - - auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + // Compact whole DB into one level, so each iteration will consider the same + // number of files (one). 
+ Status s = db->CompactRange(CompactRangeOptions(), nullptr /* begin */, + nullptr /* end */); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); - return; } } + KeyGenerator kg_rnd(&rnd, key_num /* max_key */); auto ro = ReadOptions(); if (mmap) { ro.verify_checksums = false; @@ -607,7 +635,7 @@ static void DBGet(benchmark::State& state) { if (negative_query) { for (auto _ : state) { std::string val; - Status s = db->Get(ro, kg.NextNonExist(), &val); + Status s = db->Get(ro, kg_rnd.NextNonExist(), &val); if (s.IsNotFound()) { not_found++; } @@ -615,7 +643,7 @@ static void DBGet(benchmark::State& state) { } else { for (auto _ : state) { std::string val; - Status s = db->Get(ro, kg.Next(), &val); + Status s = db->Get(ro, kg_rnd.Next(), &val); if (s.IsNotFound()) { not_found++; } @@ -634,21 +662,30 @@ static void DBGet(benchmark::State& state) { state.counters["get_p99"] = histogram_data.percentile99 * std::milli::den; } - TeardownDB(state, db, options, kg); + TeardownDB(state, db, options, kg_rnd); } } static void DBGetArguments(benchmark::internal::Benchmark* b) { for (int comp_style : {kCompactionStyleLevel, kCompactionStyleUniversal, kCompactionStyleFIFO}) { - for (int64_t max_data : {128l << 20, 512l << 20}) { + for (int64_t max_data : {1l << 20, 128l << 20, 512l << 20}) { for (int64_t per_key_size : {256, 1024}) { for (bool enable_statistics : {false, true}) { for (bool negative_query : {false, true}) { for (bool enable_filter : {false, true}) { for (bool mmap : {false, true}) { - b->Args({comp_style, max_data, per_key_size, enable_statistics, - negative_query, enable_filter, mmap}); + for (int compression_type : + {kNoCompression /* 0x0 */, kZSTD /* 0x7 */}) { + for (bool compression_checksum : {false, true}) { + for (bool no_blockcache : {false, true}) { + b->Args({comp_style, max_data, per_key_size, + enable_statistics, negative_query, enable_filter, + mmap, compression_type, compression_checksum, + no_blockcache}); + } + } + } } } } @@ -657,12 +694,13 @@ static void DBGetArguments(benchmark::internal::Benchmark* b) { } } b->ArgNames({"comp_style", "max_data", "per_key_size", "enable_statistics", - "negative_query", "enable_filter", "mmap"}); + "negative_query", "enable_filter", "mmap", "compression_type", + "compression_checksum", "no_blockcache"}); } -static constexpr uint64_t kDBGetNum = 1l << 20; -BENCHMARK(DBGet)->Threads(1)->Iterations(kDBGetNum)->Apply(DBGetArguments); -BENCHMARK(DBGet)->Threads(8)->Iterations(kDBGetNum / 8)->Apply(DBGetArguments); +static const uint64_t DBGetNum = 10000l; +BENCHMARK(DBGet)->Threads(1)->Iterations(DBGetNum)->Apply(DBGetArguments); +BENCHMARK(DBGet)->Threads(8)->Iterations(DBGetNum / 8)->Apply(DBGetArguments); static void SimpleGetWithPerfContext(benchmark::State& state) { // setup DB @@ -705,7 +743,7 @@ static void SimpleGetWithPerfContext(benchmark::State& state) { } } auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -721,6 +759,7 @@ static void SimpleGetWithPerfContext(benchmark::State& state) { size_t not_found = 0; uint64_t user_key_comparison_count = 0; uint64_t block_read_time = 0; + uint64_t block_read_cpu_time = 0; uint64_t block_checksum_time = 0; uint64_t get_snapshot_time = 0; uint64_t get_post_process_time = 0; @@ -740,6 +779,7 @@ static void SimpleGetWithPerfContext(benchmark::State& state) { } user_key_comparison_count += 
get_perf_context()->user_key_comparison_count; block_read_time += get_perf_context()->block_read_time; + block_read_cpu_time += get_perf_context()->block_read_cpu_time; block_checksum_time += get_perf_context()->block_checksum_time; get_snapshot_time += get_perf_context()->get_snapshot_time; get_post_process_time += get_perf_context()->get_post_process_time; @@ -760,6 +800,9 @@ static void SimpleGetWithPerfContext(benchmark::State& state) { benchmark::Counter::kAvgIterations); state.counters["block_read_time"] = benchmark::Counter( static_cast(block_read_time), benchmark::Counter::kAvgIterations); + state.counters["block_read_cpu_time"] = + benchmark::Counter(static_cast(block_read_cpu_time), + benchmark::Counter::kAvgIterations); state.counters["block_checksum_time"] = benchmark::Counter(static_cast(block_checksum_time), benchmark::Counter::kAvgIterations); @@ -1108,7 +1151,7 @@ static void IteratorSeek(benchmark::State& state) { } auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -1199,7 +1242,7 @@ static void IteratorNext(benchmark::State& state) { } auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -1263,7 +1306,7 @@ static void IteratorNextWithPerfContext(benchmark::State& state) { } } auto db_full = static_cast_with_check(db.get()); - Status s = db_full->WaitForCompact(true); + Status s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -1361,7 +1404,7 @@ static void IteratorPrev(benchmark::State& state) { } auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -1453,7 +1496,7 @@ static void PrefixSeek(benchmark::State& state) { } auto db_full = static_cast_with_check(db.get()); - s = db_full->WaitForCompact(true); + s = db_full->WaitForCompact(WaitForCompactOptions()); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); return; @@ -1541,7 +1584,8 @@ static void RandomAccessFileReaderRead(benchmark::State& state) { : Temperature::kCold; readers.emplace_back(new RandomAccessFileReader( std::move(f), fname, env->GetSystemClock().get(), nullptr, statistics, - 0, nullptr, nullptr, {}, temperature, rand_num == 1)); + Histograms::HISTOGRAM_ENUM_MAX, nullptr, nullptr, {}, temperature, + rand_num == 1)); } IOOptions io_options; @@ -1550,8 +1594,7 @@ static void RandomAccessFileReaderRead(benchmark::State& state) { uint64_t idx = 0; for (auto _ : state) { s = readers[idx++ % kFileNum]->Read(io_options, 0, kDefaultPageSize / 3, - &result, scratch.get(), nullptr, - Env::IO_TOTAL); + &result, scratch.get(), nullptr); if (!s.ok()) { state.SkipWithError(s.ToString().c_str()); } diff --git a/monitoring/instrumented_mutex.h b/monitoring/instrumented_mutex.h index e5aae34dfb85..33e2427593e2 100644 --- a/monitoring/instrumented_mutex.h +++ b/monitoring/instrumented_mutex.h @@ -5,7 +5,7 @@ #pragma once -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/port.h" #include "rocksdb/statistics.h" #include "rocksdb/system_clock.h" @@ -46,7 +46,7 @@ class InstrumentedMutex { void Unlock() { mutex_.Unlock(); } - void 
AssertHeld() { mutex_.AssertHeld(); } + void AssertHeld() const { mutex_.AssertHeld(); } private: void LockInternal(); diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 9a760d2881fe..eda3f0ddc012 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -10,6 +10,156 @@ namespace ROCKSDB_NAMESPACE { +/* + * To add a new per-level metric, add it to the macro below and enclose its + * name within defCmd(). + * The corresponding field will then be copied and/or emitted when the + * per-level context is converted to string. + * The position of the field is dictated by the order in which the macro + * entries are listed, and the offsets of the fields are matched against + * ''PerfContextByLevelBase'' declared in perf_context.h. + */ +// clang-format off +#define DEF_PERF_CONTEXT_LEVEL_METRICS(defCmd) \ + defCmd(bloom_filter_useful) \ + defCmd(bloom_filter_full_positive) \ + defCmd(bloom_filter_full_true_positive) \ + defCmd(user_key_return_count) \ + defCmd(get_from_table_nanos) \ + defCmd(block_cache_hit_count) \ + defCmd(block_cache_miss_count) +// clang-format on + +// Break down performance counters by level and store per-level perf context in +// PerfContextByLevel +struct PerfContextByLevelInt { +#define EMIT_FIELDS(x) uint64_t x = 0; + DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS) +#undef EMIT_FIELDS +}; + +/* + * To add a new metric, add it to the macro below and enclose its name + * within defCmd(). + * The corresponding field will then be copied and/or emitted when converted + * to string. The position of the field is dictated by the order in which + * the macro entries are listed, and the offsets of the fields are matched + * against ''PerfContextBase'' declared in perf_context.h. 
+ */ + +// clang-format off +#define DEF_PERF_CONTEXT_METRICS(defCmd) \ + defCmd(user_key_comparison_count) \ + defCmd(block_cache_hit_count) \ + defCmd(block_read_count) \ + defCmd(block_read_byte) \ + defCmd(block_read_time) \ + defCmd(block_read_cpu_time) \ + defCmd(block_cache_index_hit_count) \ + defCmd(block_cache_standalone_handle_count) \ + defCmd(block_cache_real_handle_count) \ + defCmd(index_block_read_count) \ + defCmd(block_cache_filter_hit_count) \ + defCmd(filter_block_read_count) \ + defCmd(compression_dict_block_read_count) \ + defCmd(multiget_sst_file_read_count) \ + defCmd(multiget_sst_serialized_file_read_count) \ + defCmd(secondary_cache_hit_count) \ + defCmd(compressed_sec_cache_insert_real_count) \ + defCmd(compressed_sec_cache_insert_dummy_count) \ + defCmd(compressed_sec_cache_uncompressed_bytes) \ + defCmd(compressed_sec_cache_compressed_bytes) \ + defCmd(block_checksum_time) \ + defCmd(block_decompress_time) \ + defCmd(get_read_bytes) \ + defCmd(multiget_read_bytes) \ + defCmd(iter_read_bytes) \ + defCmd(blob_cache_hit_count) \ + defCmd(blob_read_count) \ + defCmd(blob_read_byte) \ + defCmd(blob_read_time) \ + defCmd(blob_checksum_time) \ + defCmd(blob_decompress_time) \ + defCmd(internal_key_skipped_count) \ + defCmd(internal_delete_skipped_count) \ + defCmd(internal_recent_skipped_count) \ + defCmd(internal_merge_count) \ + defCmd(internal_merge_point_lookup_count) \ + defCmd(internal_range_del_reseek_count) \ + defCmd(get_snapshot_time) \ + defCmd(get_from_memtable_time) \ + defCmd(get_from_memtable_count) \ + defCmd(get_post_process_time) \ + defCmd(get_from_output_files_time) \ + defCmd(seek_on_memtable_time) \ + defCmd(seek_on_memtable_count) \ + defCmd(next_on_memtable_count) \ + defCmd(prev_on_memtable_count) \ + defCmd(seek_child_seek_time) \ + defCmd(seek_child_seek_count) \ + defCmd(seek_min_heap_time) \ + defCmd(seek_max_heap_time) \ + defCmd(seek_internal_seek_time) \ + defCmd(find_next_user_entry_time) \ + defCmd(write_wal_time) \ + defCmd(write_memtable_time) \ + defCmd(write_delay_time) \ + defCmd(write_scheduling_flushes_compactions_time)\ + defCmd(write_pre_and_post_process_time) \ + defCmd(write_thread_wait_nanos) \ + defCmd(db_mutex_lock_nanos) \ + defCmd(db_condition_wait_nanos) \ + defCmd(merge_operator_time_nanos) \ + defCmd(read_index_block_nanos) \ + defCmd(read_filter_block_nanos) \ + defCmd(new_table_block_iter_nanos) \ + defCmd(new_table_iterator_nanos) \ + defCmd(block_seek_nanos) \ + defCmd(find_table_nanos) \ + defCmd(bloom_memtable_hit_count) \ + defCmd(bloom_memtable_miss_count) \ + defCmd(bloom_sst_hit_count) \ + defCmd(bloom_sst_miss_count) \ + defCmd(key_lock_wait_time) \ + defCmd(key_lock_wait_count) \ + defCmd(env_new_sequential_file_nanos) \ + defCmd(env_new_random_access_file_nanos) \ + defCmd(env_new_writable_file_nanos) \ + defCmd(env_reuse_writable_file_nanos) \ + defCmd(env_new_random_rw_file_nanos) \ + defCmd(env_new_directory_nanos) \ + defCmd(env_file_exists_nanos) \ + defCmd(env_get_children_nanos) \ + defCmd(env_get_children_file_attributes_nanos) \ + defCmd(env_delete_file_nanos) \ + defCmd(env_create_dir_nanos) \ + defCmd(env_create_dir_if_missing_nanos) \ + defCmd(env_delete_dir_nanos) \ + defCmd(env_get_file_size_nanos) \ + defCmd(env_get_file_modification_time_nanos) \ + defCmd(env_rename_file_nanos) \ + defCmd(env_link_file_nanos) \ + defCmd(env_lock_file_nanos) \ + defCmd(env_unlock_file_nanos) \ + defCmd(env_new_logger_nanos) \ + defCmd(get_cpu_nanos) \ + defCmd(iter_next_cpu_nanos) \ + 
defCmd(iter_prev_cpu_nanos) \ + defCmd(iter_seek_cpu_nanos) \ + defCmd(iter_next_count) \ + defCmd(iter_prev_count) \ + defCmd(iter_seek_count) \ + defCmd(encrypt_data_nanos) \ + defCmd(decrypt_data_nanos) \ + defCmd(number_async_seek) +// clang-format on + +struct PerfContextInt { +#define EMIT_FIELDS(x) uint64_t x; + DEF_PERF_CONTEXT_METRICS(EMIT_FIELDS) +#undef EMIT_FIELDS +}; + #if defined(NPERF_CONTEXT) // Should not be used because the counters are not thread-safe. // Put here just to make get_perf_context() simple without ifdef. @@ -18,7 +168,30 @@ PerfContext perf_context; thread_local PerfContext perf_context; #endif -PerfContext* get_perf_context() { return &perf_context; } +PerfContext* get_perf_context() { + static_assert(sizeof(PerfContextBase) == sizeof(PerfContextInt)); + static_assert(sizeof(PerfContextByLevelBase) == + sizeof(PerfContextByLevelInt)); + /* + * Validate that we have the same fields and offsets between the external user + * facing + * ''PerfContextBase'' and ''PerfContextByLevelBase' structures with the + * internal structures that we generate from the DEF_* macros above. This way + * if people add metrics to the user-facing header file, they will have a + * build failure and need to add similar fields to the macros in this file. + * These are compile-time validations and don't impose any run-time penalties. + */ +#define EMIT_OFFSET_ASSERTION(x) \ + static_assert(offsetof(PerfContextBase, x) == offsetof(PerfContextInt, x)); + DEF_PERF_CONTEXT_METRICS(EMIT_OFFSET_ASSERTION) +#undef EMIT_OFFSET_ASSERTION +#define EMIT_OFFSET_ASSERTION(x) \ + static_assert(offsetof(PerfContextByLevelBase, x) == \ + offsetof(PerfContextByLevelInt, x)); + DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_OFFSET_ASSERTION) +#undef EMIT_OFFSET_ASSERTION + return &perf_context; +} PerfContext::~PerfContext() { #if !defined(NPERF_CONTEXT) && !defined(OS_SOLARIS) @@ -30,125 +203,7 @@ PerfContext::PerfContext(const PerfContext& other) { #ifdef NPERF_CONTEXT (void)other; #else - user_key_comparison_count = other.user_key_comparison_count; - block_cache_hit_count = other.block_cache_hit_count; - block_read_count = other.block_read_count; - block_read_byte = other.block_read_byte; - block_read_time = other.block_read_time; - block_cache_index_hit_count = other.block_cache_index_hit_count; - block_cache_standalone_handle_count = - other.block_cache_standalone_handle_count; - block_cache_real_handle_count = other.block_cache_real_handle_count; - index_block_read_count = other.index_block_read_count; - block_cache_filter_hit_count = other.block_cache_filter_hit_count; - filter_block_read_count = other.filter_block_read_count; - compression_dict_block_read_count = other.compression_dict_block_read_count; - - // RocksDB-Cloud contribution begin - multiget_sst_file_read_count = other.multiget_sst_file_read_count; - multiget_sst_serialized_file_read_count = - other.multiget_sst_serialized_file_read_count; - // RocksDB-Cloud contribution end - - secondary_cache_hit_count = other.secondary_cache_hit_count; - compressed_sec_cache_insert_real_count = - other.compressed_sec_cache_insert_real_count; - compressed_sec_cache_insert_dummy_count = - other.compressed_sec_cache_insert_dummy_count; - compressed_sec_cache_uncompressed_bytes = - other.compressed_sec_cache_uncompressed_bytes; - compressed_sec_cache_compressed_bytes = - other.compressed_sec_cache_compressed_bytes; - block_checksum_time = other.block_checksum_time; - block_decompress_time = other.block_decompress_time; - get_read_bytes = other.get_read_bytes; - 
multiget_read_bytes = other.multiget_read_bytes; - iter_read_bytes = other.iter_read_bytes; - - blob_cache_hit_count = other.blob_cache_hit_count; - blob_read_count = other.blob_read_count; - blob_read_byte = other.blob_read_byte; - blob_read_time = other.blob_read_time; - blob_checksum_time = other.blob_checksum_time; - blob_decompress_time = other.blob_decompress_time; - - internal_key_skipped_count = other.internal_key_skipped_count; - internal_delete_skipped_count = other.internal_delete_skipped_count; - internal_recent_skipped_count = other.internal_recent_skipped_count; - internal_merge_count = other.internal_merge_count; - internal_range_del_reseek_count = other.internal_range_del_reseek_count; - write_wal_time = other.write_wal_time; - get_snapshot_time = other.get_snapshot_time; - get_from_memtable_time = other.get_from_memtable_time; - get_from_memtable_count = other.get_from_memtable_count; - get_post_process_time = other.get_post_process_time; - get_from_output_files_time = other.get_from_output_files_time; - seek_on_memtable_time = other.seek_on_memtable_time; - seek_on_memtable_count = other.seek_on_memtable_count; - next_on_memtable_count = other.next_on_memtable_count; - prev_on_memtable_count = other.prev_on_memtable_count; - seek_child_seek_time = other.seek_child_seek_time; - seek_child_seek_count = other.seek_child_seek_count; - seek_min_heap_time = other.seek_min_heap_time; - seek_internal_seek_time = other.seek_internal_seek_time; - find_next_user_entry_time = other.find_next_user_entry_time; - write_pre_and_post_process_time = other.write_pre_and_post_process_time; - write_memtable_time = other.write_memtable_time; - write_delay_time = other.write_delay_time; - write_thread_wait_nanos = other.write_thread_wait_nanos; - write_scheduling_flushes_compactions_time = - other.write_scheduling_flushes_compactions_time; - db_mutex_lock_nanos = other.db_mutex_lock_nanos; - db_condition_wait_nanos = other.db_condition_wait_nanos; - merge_operator_time_nanos = other.merge_operator_time_nanos; - read_index_block_nanos = other.read_index_block_nanos; - read_filter_block_nanos = other.read_filter_block_nanos; - new_table_block_iter_nanos = other.new_table_block_iter_nanos; - new_table_iterator_nanos = other.new_table_iterator_nanos; - block_seek_nanos = other.block_seek_nanos; - find_table_nanos = other.find_table_nanos; - bloom_memtable_hit_count = other.bloom_memtable_hit_count; - bloom_memtable_miss_count = other.bloom_memtable_miss_count; - bloom_sst_hit_count = other.bloom_sst_hit_count; - bloom_sst_miss_count = other.bloom_sst_miss_count; - key_lock_wait_time = other.key_lock_wait_time; - key_lock_wait_count = other.key_lock_wait_count; - - env_new_sequential_file_nanos = other.env_new_sequential_file_nanos; - env_new_random_access_file_nanos = other.env_new_random_access_file_nanos; - env_new_writable_file_nanos = other.env_new_writable_file_nanos; - env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos; - env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos; - env_new_directory_nanos = other.env_new_directory_nanos; - env_file_exists_nanos = other.env_file_exists_nanos; - env_get_children_nanos = other.env_get_children_nanos; - env_get_children_file_attributes_nanos = - other.env_get_children_file_attributes_nanos; - env_delete_file_nanos = other.env_delete_file_nanos; - env_create_dir_nanos = other.env_create_dir_nanos; - env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos; - env_delete_dir_nanos = other.env_delete_dir_nanos; 
- env_get_file_size_nanos = other.env_get_file_size_nanos; - env_get_file_modification_time_nanos = - other.env_get_file_modification_time_nanos; - env_rename_file_nanos = other.env_rename_file_nanos; - env_link_file_nanos = other.env_link_file_nanos; - env_lock_file_nanos = other.env_lock_file_nanos; - env_unlock_file_nanos = other.env_unlock_file_nanos; - env_new_logger_nanos = other.env_new_logger_nanos; - get_cpu_nanos = other.get_cpu_nanos; - iter_next_cpu_nanos = other.iter_next_cpu_nanos; - iter_prev_cpu_nanos = other.iter_prev_cpu_nanos; - iter_seek_cpu_nanos = other.iter_seek_cpu_nanos; - number_async_seek = other.number_async_seek; - if (per_level_perf_context_enabled && level_to_perf_context != nullptr) { - ClearPerLevelPerfContext(); - } - if (other.level_to_perf_context != nullptr) { - level_to_perf_context = new std::map(); - *level_to_perf_context = *other.level_to_perf_context; - } - per_level_perf_context_enabled = other.per_level_perf_context_enabled; + copyMetrics(&other); #endif } @@ -156,362 +211,42 @@ PerfContext::PerfContext(PerfContext&& other) noexcept { #ifdef NPERF_CONTEXT (void)other; #else - user_key_comparison_count = other.user_key_comparison_count; - block_cache_hit_count = other.block_cache_hit_count; - block_read_count = other.block_read_count; - block_read_byte = other.block_read_byte; - block_read_time = other.block_read_time; - block_cache_index_hit_count = other.block_cache_index_hit_count; - block_cache_standalone_handle_count = - other.block_cache_standalone_handle_count; - block_cache_real_handle_count = other.block_cache_real_handle_count; - index_block_read_count = other.index_block_read_count; - block_cache_filter_hit_count = other.block_cache_filter_hit_count; - filter_block_read_count = other.filter_block_read_count; - compression_dict_block_read_count = other.compression_dict_block_read_count; - - // RocksDB-Cloud contribution begin - multiget_sst_file_read_count = other.multiget_sst_file_read_count; - multiget_sst_serialized_file_read_count = - other.multiget_sst_serialized_file_read_count; - // RocksDB-Cloud contribution end - - secondary_cache_hit_count = other.secondary_cache_hit_count; - compressed_sec_cache_insert_real_count = - other.compressed_sec_cache_insert_real_count; - compressed_sec_cache_insert_dummy_count = - other.compressed_sec_cache_insert_dummy_count; - compressed_sec_cache_uncompressed_bytes = - other.compressed_sec_cache_uncompressed_bytes; - compressed_sec_cache_compressed_bytes = - other.compressed_sec_cache_compressed_bytes; - block_checksum_time = other.block_checksum_time; - block_decompress_time = other.block_decompress_time; - get_read_bytes = other.get_read_bytes; - multiget_read_bytes = other.multiget_read_bytes; - iter_read_bytes = other.iter_read_bytes; - - blob_cache_hit_count = other.blob_cache_hit_count; - blob_read_count = other.blob_read_count; - blob_read_byte = other.blob_read_byte; - blob_read_time = other.blob_read_time; - blob_checksum_time = other.blob_checksum_time; - blob_decompress_time = other.blob_decompress_time; - - internal_key_skipped_count = other.internal_key_skipped_count; - internal_delete_skipped_count = other.internal_delete_skipped_count; - internal_recent_skipped_count = other.internal_recent_skipped_count; - internal_merge_count = other.internal_merge_count; - internal_range_del_reseek_count = other.internal_range_del_reseek_count; - write_wal_time = other.write_wal_time; - get_snapshot_time = other.get_snapshot_time; - get_from_memtable_time = other.get_from_memtable_time; - 
get_from_memtable_count = other.get_from_memtable_count; - get_post_process_time = other.get_post_process_time; - get_from_output_files_time = other.get_from_output_files_time; - seek_on_memtable_time = other.seek_on_memtable_time; - seek_on_memtable_count = other.seek_on_memtable_count; - next_on_memtable_count = other.next_on_memtable_count; - prev_on_memtable_count = other.prev_on_memtable_count; - seek_child_seek_time = other.seek_child_seek_time; - seek_child_seek_count = other.seek_child_seek_count; - seek_min_heap_time = other.seek_min_heap_time; - seek_internal_seek_time = other.seek_internal_seek_time; - find_next_user_entry_time = other.find_next_user_entry_time; - write_pre_and_post_process_time = other.write_pre_and_post_process_time; - write_memtable_time = other.write_memtable_time; - write_delay_time = other.write_delay_time; - write_thread_wait_nanos = other.write_thread_wait_nanos; - write_scheduling_flushes_compactions_time = - other.write_scheduling_flushes_compactions_time; - db_mutex_lock_nanos = other.db_mutex_lock_nanos; - db_condition_wait_nanos = other.db_condition_wait_nanos; - merge_operator_time_nanos = other.merge_operator_time_nanos; - read_index_block_nanos = other.read_index_block_nanos; - read_filter_block_nanos = other.read_filter_block_nanos; - new_table_block_iter_nanos = other.new_table_block_iter_nanos; - new_table_iterator_nanos = other.new_table_iterator_nanos; - block_seek_nanos = other.block_seek_nanos; - find_table_nanos = other.find_table_nanos; - bloom_memtable_hit_count = other.bloom_memtable_hit_count; - bloom_memtable_miss_count = other.bloom_memtable_miss_count; - bloom_sst_hit_count = other.bloom_sst_hit_count; - bloom_sst_miss_count = other.bloom_sst_miss_count; - key_lock_wait_time = other.key_lock_wait_time; - key_lock_wait_count = other.key_lock_wait_count; - - env_new_sequential_file_nanos = other.env_new_sequential_file_nanos; - env_new_random_access_file_nanos = other.env_new_random_access_file_nanos; - env_new_writable_file_nanos = other.env_new_writable_file_nanos; - env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos; - env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos; - env_new_directory_nanos = other.env_new_directory_nanos; - env_file_exists_nanos = other.env_file_exists_nanos; - env_get_children_nanos = other.env_get_children_nanos; - env_get_children_file_attributes_nanos = - other.env_get_children_file_attributes_nanos; - env_delete_file_nanos = other.env_delete_file_nanos; - env_create_dir_nanos = other.env_create_dir_nanos; - env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos; - env_delete_dir_nanos = other.env_delete_dir_nanos; - env_get_file_size_nanos = other.env_get_file_size_nanos; - env_get_file_modification_time_nanos = - other.env_get_file_modification_time_nanos; - env_rename_file_nanos = other.env_rename_file_nanos; - env_link_file_nanos = other.env_link_file_nanos; - env_lock_file_nanos = other.env_lock_file_nanos; - env_unlock_file_nanos = other.env_unlock_file_nanos; - env_new_logger_nanos = other.env_new_logger_nanos; - get_cpu_nanos = other.get_cpu_nanos; - iter_next_cpu_nanos = other.iter_next_cpu_nanos; - iter_prev_cpu_nanos = other.iter_prev_cpu_nanos; - iter_seek_cpu_nanos = other.iter_seek_cpu_nanos; - number_async_seek = other.number_async_seek; - if (per_level_perf_context_enabled && level_to_perf_context != nullptr) { - ClearPerLevelPerfContext(); - } - if (other.level_to_perf_context != nullptr) { - level_to_perf_context = 
other.level_to_perf_context; - other.level_to_perf_context = nullptr; - } - per_level_perf_context_enabled = other.per_level_perf_context_enabled; + copyMetrics(&other); #endif } -// TODO(Zhongyi): reduce code duplication between copy constructor and -// assignment operator PerfContext& PerfContext::operator=(const PerfContext& other) { #ifdef NPERF_CONTEXT (void)other; #else - user_key_comparison_count = other.user_key_comparison_count; - block_cache_hit_count = other.block_cache_hit_count; - block_read_count = other.block_read_count; - block_read_byte = other.block_read_byte; - block_read_time = other.block_read_time; - block_cache_index_hit_count = other.block_cache_index_hit_count; - block_cache_standalone_handle_count = - other.block_cache_standalone_handle_count; - block_cache_real_handle_count = other.block_cache_real_handle_count; - index_block_read_count = other.index_block_read_count; - block_cache_filter_hit_count = other.block_cache_filter_hit_count; - filter_block_read_count = other.filter_block_read_count; - compression_dict_block_read_count = other.compression_dict_block_read_count; - - // RocksDB-Cloud contribution begin - multiget_sst_file_read_count = other.multiget_sst_file_read_count; - multiget_sst_serialized_file_read_count = - other.multiget_sst_serialized_file_read_count; - // RocksDB-Cloud contribution end - - secondary_cache_hit_count = other.secondary_cache_hit_count; - compressed_sec_cache_insert_real_count = - other.compressed_sec_cache_insert_real_count; - compressed_sec_cache_insert_dummy_count = - other.compressed_sec_cache_insert_dummy_count; - compressed_sec_cache_uncompressed_bytes = - other.compressed_sec_cache_uncompressed_bytes; - compressed_sec_cache_compressed_bytes = - other.compressed_sec_cache_compressed_bytes; - block_checksum_time = other.block_checksum_time; - block_decompress_time = other.block_decompress_time; - get_read_bytes = other.get_read_bytes; - multiget_read_bytes = other.multiget_read_bytes; - iter_read_bytes = other.iter_read_bytes; - - blob_cache_hit_count = other.blob_cache_hit_count; - blob_read_count = other.blob_read_count; - blob_read_byte = other.blob_read_byte; - blob_read_time = other.blob_read_time; - blob_checksum_time = other.blob_checksum_time; - blob_decompress_time = other.blob_decompress_time; - - internal_key_skipped_count = other.internal_key_skipped_count; - internal_delete_skipped_count = other.internal_delete_skipped_count; - internal_recent_skipped_count = other.internal_recent_skipped_count; - internal_merge_count = other.internal_merge_count; - internal_range_del_reseek_count = other.internal_range_del_reseek_count; - write_wal_time = other.write_wal_time; - get_snapshot_time = other.get_snapshot_time; - get_from_memtable_time = other.get_from_memtable_time; - get_from_memtable_count = other.get_from_memtable_count; - get_post_process_time = other.get_post_process_time; - get_from_output_files_time = other.get_from_output_files_time; - seek_on_memtable_time = other.seek_on_memtable_time; - seek_on_memtable_count = other.seek_on_memtable_count; - next_on_memtable_count = other.next_on_memtable_count; - prev_on_memtable_count = other.prev_on_memtable_count; - seek_child_seek_time = other.seek_child_seek_time; - seek_child_seek_count = other.seek_child_seek_count; - seek_min_heap_time = other.seek_min_heap_time; - seek_internal_seek_time = other.seek_internal_seek_time; - find_next_user_entry_time = other.find_next_user_entry_time; - write_pre_and_post_process_time = other.write_pre_and_post_process_time; - 
write_memtable_time = other.write_memtable_time; - write_delay_time = other.write_delay_time; - write_thread_wait_nanos = other.write_thread_wait_nanos; - write_scheduling_flushes_compactions_time = - other.write_scheduling_flushes_compactions_time; - db_mutex_lock_nanos = other.db_mutex_lock_nanos; - db_condition_wait_nanos = other.db_condition_wait_nanos; - merge_operator_time_nanos = other.merge_operator_time_nanos; - read_index_block_nanos = other.read_index_block_nanos; - read_filter_block_nanos = other.read_filter_block_nanos; - new_table_block_iter_nanos = other.new_table_block_iter_nanos; - new_table_iterator_nanos = other.new_table_iterator_nanos; - block_seek_nanos = other.block_seek_nanos; - find_table_nanos = other.find_table_nanos; - bloom_memtable_hit_count = other.bloom_memtable_hit_count; - bloom_memtable_miss_count = other.bloom_memtable_miss_count; - bloom_sst_hit_count = other.bloom_sst_hit_count; - bloom_sst_miss_count = other.bloom_sst_miss_count; - key_lock_wait_time = other.key_lock_wait_time; - key_lock_wait_count = other.key_lock_wait_count; + copyMetrics(&other); +#endif + return *this; +} - env_new_sequential_file_nanos = other.env_new_sequential_file_nanos; - env_new_random_access_file_nanos = other.env_new_random_access_file_nanos; - env_new_writable_file_nanos = other.env_new_writable_file_nanos; - env_reuse_writable_file_nanos = other.env_reuse_writable_file_nanos; - env_new_random_rw_file_nanos = other.env_new_random_rw_file_nanos; - env_new_directory_nanos = other.env_new_directory_nanos; - env_file_exists_nanos = other.env_file_exists_nanos; - env_get_children_nanos = other.env_get_children_nanos; - env_get_children_file_attributes_nanos = - other.env_get_children_file_attributes_nanos; - env_delete_file_nanos = other.env_delete_file_nanos; - env_create_dir_nanos = other.env_create_dir_nanos; - env_create_dir_if_missing_nanos = other.env_create_dir_if_missing_nanos; - env_delete_dir_nanos = other.env_delete_dir_nanos; - env_get_file_size_nanos = other.env_get_file_size_nanos; - env_get_file_modification_time_nanos = - other.env_get_file_modification_time_nanos; - env_rename_file_nanos = other.env_rename_file_nanos; - env_link_file_nanos = other.env_link_file_nanos; - env_lock_file_nanos = other.env_lock_file_nanos; - env_unlock_file_nanos = other.env_unlock_file_nanos; - env_new_logger_nanos = other.env_new_logger_nanos; - get_cpu_nanos = other.get_cpu_nanos; - iter_next_cpu_nanos = other.iter_next_cpu_nanos; - iter_prev_cpu_nanos = other.iter_prev_cpu_nanos; - iter_seek_cpu_nanos = other.iter_seek_cpu_nanos; - number_async_seek = other.number_async_seek; +void PerfContext::copyMetrics(const PerfContext* other) noexcept { +#ifdef NPERF_CONTEXT + (void)other; +#else +#define EMIT_COPY_FIELDS(x) x = other->x; + DEF_PERF_CONTEXT_METRICS(EMIT_COPY_FIELDS) +#undef EMIT_COPY_FIELDS if (per_level_perf_context_enabled && level_to_perf_context != nullptr) { ClearPerLevelPerfContext(); } - if (other.level_to_perf_context != nullptr) { + if (other->level_to_perf_context != nullptr) { level_to_perf_context = new std::map(); - *level_to_perf_context = *other.level_to_perf_context; + *level_to_perf_context = *other->level_to_perf_context; } - per_level_perf_context_enabled = other.per_level_perf_context_enabled; + per_level_perf_context_enabled = other->per_level_perf_context_enabled; #endif - return *this; } void PerfContext::Reset() { #ifndef NPERF_CONTEXT - user_key_comparison_count = 0; - block_cache_hit_count = 0; - block_read_count = 0; - block_read_byte = 0; - 
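The hand-written field-by-field copy above is replaced by a single `copyMetrics(&other)` call driven by an X-macro list (`DEF_PERF_CONTEXT_METRICS`) declared in the corresponding header; the same list later drives `Reset()` and `ToString()`. A minimal, self-contained sketch of the idiom follows — `DEF_DEMO_METRICS` and `DemoPerfContext` are made-up names standing in for the real RocksDB metric list:

```cpp
#include <cstdint>
#include <iostream>

// Hypothetical three-entry metric list; the real DEF_PERF_CONTEXT_METRICS(V)
// enumerates every PerfContext counter the same way.
#define DEF_DEMO_METRICS(V)    \
  V(user_key_comparison_count) \
  V(block_read_count)          \
  V(block_read_byte)

struct DemoPerfContext {
  // Declare one uint64_t member per listed metric.
#define DECLARE_FIELD(x) uint64_t x = 0;
  DEF_DEMO_METRICS(DECLARE_FIELD)
#undef DECLARE_FIELD

  void CopyFrom(const DemoPerfContext& other) {
    // Expands to: user_key_comparison_count = other.user_key_comparison_count; ...
#define EMIT_COPY_FIELDS(x) x = other.x;
    DEF_DEMO_METRICS(EMIT_COPY_FIELDS)
#undef EMIT_COPY_FIELDS
  }

  void Reset() {
    // Expands to: user_key_comparison_count = 0; block_read_count = 0; ...
#define EMIT_FIELDS(x) x = 0;
    DEF_DEMO_METRICS(EMIT_FIELDS)
#undef EMIT_FIELDS
  }
};

int main() {
  DemoPerfContext a, b;
  a.block_read_count = 7;
  b.CopyFrom(a);
  std::cout << b.block_read_count << "\n";  // prints 7
  b.Reset();
  std::cout << b.block_read_count << "\n";  // prints 0
  return 0;
}
```

The payoff is that adding a new counter now means adding one line to the metric list instead of touching the copy constructor, move constructor, assignment operator, `Reset()`, and `ToString()` separately.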
block_read_time = 0; - block_cache_index_hit_count = 0; - block_cache_standalone_handle_count = 0; - block_cache_real_handle_count = 0; - index_block_read_count = 0; - block_cache_filter_hit_count = 0; - filter_block_read_count = 0; - compression_dict_block_read_count = 0; - - // RocksDB-Cloud contribution begin - multiget_sst_file_read_count = 0; - multiget_sst_serialized_file_read_count = 0; - // RocksDB-Cloud contribution end - - secondary_cache_hit_count = 0; - compressed_sec_cache_insert_real_count = 0; - compressed_sec_cache_insert_dummy_count = 0; - compressed_sec_cache_uncompressed_bytes = 0; - compressed_sec_cache_compressed_bytes = 0; - block_checksum_time = 0; - block_decompress_time = 0; - get_read_bytes = 0; - multiget_read_bytes = 0; - iter_read_bytes = 0; - - blob_cache_hit_count = 0; - blob_read_count = 0; - blob_read_byte = 0; - blob_read_time = 0; - blob_checksum_time = 0; - blob_decompress_time = 0; - - internal_key_skipped_count = 0; - internal_delete_skipped_count = 0; - internal_recent_skipped_count = 0; - internal_merge_count = 0; - internal_range_del_reseek_count = 0; - write_wal_time = 0; - - get_snapshot_time = 0; - get_from_memtable_time = 0; - get_from_memtable_count = 0; - get_post_process_time = 0; - get_from_output_files_time = 0; - seek_on_memtable_time = 0; - seek_on_memtable_count = 0; - next_on_memtable_count = 0; - prev_on_memtable_count = 0; - seek_child_seek_time = 0; - seek_child_seek_count = 0; - seek_min_heap_time = 0; - seek_internal_seek_time = 0; - find_next_user_entry_time = 0; - write_pre_and_post_process_time = 0; - write_memtable_time = 0; - write_delay_time = 0; - write_thread_wait_nanos = 0; - write_scheduling_flushes_compactions_time = 0; - db_mutex_lock_nanos = 0; - db_condition_wait_nanos = 0; - merge_operator_time_nanos = 0; - read_index_block_nanos = 0; - read_filter_block_nanos = 0; - new_table_block_iter_nanos = 0; - new_table_iterator_nanos = 0; - block_seek_nanos = 0; - find_table_nanos = 0; - bloom_memtable_hit_count = 0; - bloom_memtable_miss_count = 0; - bloom_sst_hit_count = 0; - bloom_sst_miss_count = 0; - key_lock_wait_time = 0; - key_lock_wait_count = 0; - - env_new_sequential_file_nanos = 0; - env_new_random_access_file_nanos = 0; - env_new_writable_file_nanos = 0; - env_reuse_writable_file_nanos = 0; - env_new_random_rw_file_nanos = 0; - env_new_directory_nanos = 0; - env_file_exists_nanos = 0; - env_get_children_nanos = 0; - env_get_children_file_attributes_nanos = 0; - env_delete_file_nanos = 0; - env_create_dir_nanos = 0; - env_create_dir_if_missing_nanos = 0; - env_delete_dir_nanos = 0; - env_get_file_size_nanos = 0; - env_get_file_modification_time_nanos = 0; - env_rename_file_nanos = 0; - env_link_file_nanos = 0; - env_lock_file_nanos = 0; - env_unlock_file_nanos = 0; - env_new_logger_nanos = 0; - get_cpu_nanos = 0; - iter_next_cpu_nanos = 0; - iter_prev_cpu_nanos = 0; - iter_seek_cpu_nanos = 0; - number_async_seek = 0; +#define EMIT_FIELDS(x) x = 0; + DEF_PERF_CONTEXT_METRICS(EMIT_FIELDS) +#undef EMIT_FIELDS if (per_level_perf_context_enabled && level_to_perf_context) { for (auto& kv : *level_to_perf_context) { kv.second.Reset(); @@ -520,28 +255,11 @@ void PerfContext::Reset() { #endif } -#define PERF_CONTEXT_OUTPUT(counter) \ - if (!exclude_zero_counters || (counter > 0)) { \ - ss << #counter << " = " << counter << ", "; \ - } - -#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter) \ - if (per_level_perf_context_enabled && level_to_perf_context) { \ - ss << #counter << " = "; \ - for (auto& kv : 
*level_to_perf_context) { \ - if (!exclude_zero_counters || (kv.second.counter > 0)) { \ - ss << kv.second.counter << "@level" << kv.first << ", "; \ - } \ - } \ - } - void PerfContextByLevel::Reset() { #ifndef NPERF_CONTEXT - bloom_filter_useful = 0; - bloom_filter_full_positive = 0; - bloom_filter_full_true_positive = 0; - block_cache_hit_count = 0; - block_cache_miss_count = 0; +#define EMIT_FIELDS(x) x = 0; + DEF_PERF_CONTEXT_LEVEL_METRICS(EMIT_FIELDS) +#undef EMIT_FIELDS #endif } @@ -551,111 +269,23 @@ std::string PerfContext::ToString(bool exclude_zero_counters) const { return ""; #else std::ostringstream ss; - PERF_CONTEXT_OUTPUT(user_key_comparison_count); - PERF_CONTEXT_OUTPUT(block_cache_hit_count); - PERF_CONTEXT_OUTPUT(block_read_count); - PERF_CONTEXT_OUTPUT(block_read_byte); - PERF_CONTEXT_OUTPUT(block_read_time); - PERF_CONTEXT_OUTPUT(block_cache_index_hit_count); - PERF_CONTEXT_OUTPUT(block_cache_standalone_handle_count); - PERF_CONTEXT_OUTPUT(block_cache_real_handle_count); - PERF_CONTEXT_OUTPUT(index_block_read_count); - PERF_CONTEXT_OUTPUT(block_cache_filter_hit_count); - PERF_CONTEXT_OUTPUT(filter_block_read_count); - PERF_CONTEXT_OUTPUT(compression_dict_block_read_count); - - // RocksDB-Cloud contribution begin - PERF_CONTEXT_OUTPUT(multiget_sst_file_read_count); - PERF_CONTEXT_OUTPUT(multiget_sst_serialized_file_read_count); - // RocksDB-Cloud contribution end - - PERF_CONTEXT_OUTPUT(secondary_cache_hit_count); - PERF_CONTEXT_OUTPUT(compressed_sec_cache_insert_real_count); - PERF_CONTEXT_OUTPUT(compressed_sec_cache_insert_dummy_count); - PERF_CONTEXT_OUTPUT(compressed_sec_cache_uncompressed_bytes); - PERF_CONTEXT_OUTPUT(compressed_sec_cache_compressed_bytes); - PERF_CONTEXT_OUTPUT(block_checksum_time); - PERF_CONTEXT_OUTPUT(block_decompress_time); - PERF_CONTEXT_OUTPUT(get_read_bytes); - PERF_CONTEXT_OUTPUT(multiget_read_bytes); - PERF_CONTEXT_OUTPUT(iter_read_bytes); - PERF_CONTEXT_OUTPUT(blob_cache_hit_count); - PERF_CONTEXT_OUTPUT(blob_read_count); - PERF_CONTEXT_OUTPUT(blob_read_byte); - PERF_CONTEXT_OUTPUT(blob_read_time); - PERF_CONTEXT_OUTPUT(blob_checksum_time); - PERF_CONTEXT_OUTPUT(blob_decompress_time); - PERF_CONTEXT_OUTPUT(internal_key_skipped_count); - PERF_CONTEXT_OUTPUT(internal_delete_skipped_count); - PERF_CONTEXT_OUTPUT(internal_recent_skipped_count); - PERF_CONTEXT_OUTPUT(internal_merge_count); - PERF_CONTEXT_OUTPUT(internal_range_del_reseek_count); - PERF_CONTEXT_OUTPUT(write_wal_time); - PERF_CONTEXT_OUTPUT(get_snapshot_time); - PERF_CONTEXT_OUTPUT(get_from_memtable_time); - PERF_CONTEXT_OUTPUT(get_from_memtable_count); - PERF_CONTEXT_OUTPUT(get_post_process_time); - PERF_CONTEXT_OUTPUT(get_from_output_files_time); - PERF_CONTEXT_OUTPUT(seek_on_memtable_time); - PERF_CONTEXT_OUTPUT(seek_on_memtable_count); - PERF_CONTEXT_OUTPUT(next_on_memtable_count); - PERF_CONTEXT_OUTPUT(prev_on_memtable_count); - PERF_CONTEXT_OUTPUT(seek_child_seek_time); - PERF_CONTEXT_OUTPUT(seek_child_seek_count); - PERF_CONTEXT_OUTPUT(seek_min_heap_time); - PERF_CONTEXT_OUTPUT(seek_internal_seek_time); - PERF_CONTEXT_OUTPUT(find_next_user_entry_time); - PERF_CONTEXT_OUTPUT(write_pre_and_post_process_time); - PERF_CONTEXT_OUTPUT(write_memtable_time); - PERF_CONTEXT_OUTPUT(write_thread_wait_nanos); - PERF_CONTEXT_OUTPUT(write_scheduling_flushes_compactions_time); - PERF_CONTEXT_OUTPUT(db_mutex_lock_nanos); - PERF_CONTEXT_OUTPUT(db_condition_wait_nanos); - PERF_CONTEXT_OUTPUT(merge_operator_time_nanos); - PERF_CONTEXT_OUTPUT(write_delay_time); - 
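The removals above drop the hand-maintained output list; the macro-driven replacement that follows produces the same `name = value, ` format, with per-level counters rendered as `value@levelN` when per-level tracking is enabled. A small usage sketch of reading that string around a point lookup — it assumes a `db` handle opened elsewhere and elides error handling:

```cpp
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"
#include "rocksdb/perf_level.h"

// Illustrative only: dump the perf context accumulated by a single Get().
void DumpPerfForGet(rocksdb::DB* db, const std::string& key) {
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableTimeExceptForMutex);
  rocksdb::get_perf_context()->Reset();
  rocksdb::get_perf_context()->EnablePerLevelPerfContext();

  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), key, &value);
  s.PermitUncheckedError();  // perf counters are populated either way

  // With exclude_zero_counters=true only non-zero counters are printed, e.g.
  // "block_read_count = 2, ..., bloom_filter_useful = 1@level0, 2@level1, "
  std::cout << rocksdb::get_perf_context()->ToString(/*exclude_zero_counters=*/true)
            << std::endl;

  rocksdb::get_perf_context()->DisablePerLevelPerfContext();
  rocksdb::SetPerfLevel(rocksdb::PerfLevel::kDisable);
}
```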
PERF_CONTEXT_OUTPUT(read_index_block_nanos); - PERF_CONTEXT_OUTPUT(read_filter_block_nanos); - PERF_CONTEXT_OUTPUT(new_table_block_iter_nanos); - PERF_CONTEXT_OUTPUT(new_table_iterator_nanos); - PERF_CONTEXT_OUTPUT(block_seek_nanos); - PERF_CONTEXT_OUTPUT(find_table_nanos); - PERF_CONTEXT_OUTPUT(bloom_memtable_hit_count); - PERF_CONTEXT_OUTPUT(bloom_memtable_miss_count); - PERF_CONTEXT_OUTPUT(bloom_sst_hit_count); - PERF_CONTEXT_OUTPUT(bloom_sst_miss_count); - PERF_CONTEXT_OUTPUT(key_lock_wait_time); - PERF_CONTEXT_OUTPUT(key_lock_wait_count); - PERF_CONTEXT_OUTPUT(env_new_sequential_file_nanos); - PERF_CONTEXT_OUTPUT(env_new_random_access_file_nanos); - PERF_CONTEXT_OUTPUT(env_new_writable_file_nanos); - PERF_CONTEXT_OUTPUT(env_reuse_writable_file_nanos); - PERF_CONTEXT_OUTPUT(env_new_random_rw_file_nanos); - PERF_CONTEXT_OUTPUT(env_new_directory_nanos); - PERF_CONTEXT_OUTPUT(env_file_exists_nanos); - PERF_CONTEXT_OUTPUT(env_get_children_nanos); - PERF_CONTEXT_OUTPUT(env_get_children_file_attributes_nanos); - PERF_CONTEXT_OUTPUT(env_delete_file_nanos); - PERF_CONTEXT_OUTPUT(env_create_dir_nanos); - PERF_CONTEXT_OUTPUT(env_create_dir_if_missing_nanos); - PERF_CONTEXT_OUTPUT(env_delete_dir_nanos); - PERF_CONTEXT_OUTPUT(env_get_file_size_nanos); - PERF_CONTEXT_OUTPUT(env_get_file_modification_time_nanos); - PERF_CONTEXT_OUTPUT(env_rename_file_nanos); - PERF_CONTEXT_OUTPUT(env_link_file_nanos); - PERF_CONTEXT_OUTPUT(env_lock_file_nanos); - PERF_CONTEXT_OUTPUT(env_unlock_file_nanos); - PERF_CONTEXT_OUTPUT(env_new_logger_nanos); - PERF_CONTEXT_OUTPUT(get_cpu_nanos); - PERF_CONTEXT_OUTPUT(iter_next_cpu_nanos); - PERF_CONTEXT_OUTPUT(iter_prev_cpu_nanos); - PERF_CONTEXT_OUTPUT(iter_seek_cpu_nanos); - PERF_CONTEXT_OUTPUT(number_async_seek); - PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_useful); - PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_positive); - PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(bloom_filter_full_true_positive); - PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_hit_count); - PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(block_cache_miss_count); - +#define PERF_CONTEXT_OUTPUT(counter) \ + if (!exclude_zero_counters || (counter > 0)) { \ + ss << #counter << " = " << counter << ", "; \ + } + DEF_PERF_CONTEXT_METRICS(PERF_CONTEXT_OUTPUT) +#undef PERF_CONTEXT_OUTPUT + if (per_level_perf_context_enabled && level_to_perf_context) { +#define PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER(counter) \ + ss << #counter << " = "; \ + for (auto& kv : *level_to_perf_context) { \ + if (!exclude_zero_counters || (kv.second.counter > 0)) { \ + ss << kv.second.counter << "@level" << kv.first << ", "; \ + } \ + } + DEF_PERF_CONTEXT_LEVEL_METRICS(PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER) +#undef PERF_CONTEXT_BY_LEVEL_OUTPUT_ONE_COUNTER + } std::string str = ss.str(); str.erase(str.find_last_not_of(", ") + 1); return str; diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index 8deb312527f9..f6c45d773cf9 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -5,7 +5,7 @@ // #pragma once #include "monitoring/perf_level_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index e01eed3f381b..ebfd443002ff 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -3,14 +3,14 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file 
in the root directory). // -#include "monitoring/statistics.h" +#include "rocksdb/statistics.h" #include #include #include +#include "monitoring/statistics_impl.h" #include "rocksdb/convenience.h" -#include "rocksdb/statistics.h" #include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/options_type.h" #include "util/string_util.h" @@ -28,13 +28,11 @@ const std::vector> TickersNameMap = { {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, {BLOCK_CACHE_INDEX_ADD, "rocksdb.block.cache.index.add"}, {BLOCK_CACHE_INDEX_BYTES_INSERT, "rocksdb.block.cache.index.bytes.insert"}, - {BLOCK_CACHE_INDEX_BYTES_EVICT, "rocksdb.block.cache.index.bytes.evict"}, {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"}, {BLOCK_CACHE_FILTER_ADD, "rocksdb.block.cache.filter.add"}, {BLOCK_CACHE_FILTER_BYTES_INSERT, "rocksdb.block.cache.filter.bytes.insert"}, - {BLOCK_CACHE_FILTER_BYTES_EVICT, "rocksdb.block.cache.filter.bytes.evict"}, {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"}, {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"}, {BLOCK_CACHE_DATA_ADD, "rocksdb.block.cache.data.add"}, @@ -45,7 +43,6 @@ const std::vector> TickersNameMap = { {BLOOM_FILTER_FULL_POSITIVE, "rocksdb.bloom.filter.full.positive"}, {BLOOM_FILTER_FULL_TRUE_POSITIVE, "rocksdb.bloom.filter.full.true.positive"}, - {BLOOM_FILTER_MICROS, "rocksdb.bloom.filter.micros"}, {PERSISTENT_CACHE_HIT, "rocksdb.persistent.cache.hit"}, {PERSISTENT_CACHE_MISS, "rocksdb.persistent.cache.miss"}, {SIM_BLOCK_CACHE_HIT, "rocksdb.sim.block.cache.hit"}, @@ -76,35 +73,24 @@ const std::vector> TickersNameMap = { {NUMBER_DB_NEXT_FOUND, "rocksdb.number.db.next.found"}, {NUMBER_DB_PREV_FOUND, "rocksdb.number.db.prev.found"}, {ITER_BYTES_READ, "rocksdb.db.iter.bytes.read"}, - {NO_FILE_CLOSES, "rocksdb.no.file.closes"}, {NO_FILE_OPENS, "rocksdb.no.file.opens"}, {NO_FILE_ERRORS, "rocksdb.no.file.errors"}, - {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, - {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, - {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, {STALL_MICROS, "rocksdb.stall.micros"}, {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, - {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, - {NO_ITERATORS, "rocksdb.num.iterators"}, {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, {NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read"}, {NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read"}, - {NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered"}, {NUMBER_MERGE_FAILURES, "rocksdb.number.merge.failures"}, {BLOOM_FILTER_PREFIX_CHECKED, "rocksdb.bloom.filter.prefix.checked"}, {BLOOM_FILTER_PREFIX_USEFUL, "rocksdb.bloom.filter.prefix.useful"}, + {BLOOM_FILTER_PREFIX_TRUE_POSITIVE, + "rocksdb.bloom.filter.prefix.true.positive"}, {NUMBER_OF_RESEEKS_IN_ITERATION, "rocksdb.number.reseeks.iteration"}, {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, - {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, - {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, - {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, - {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, - "rocksdb.block.cachecompressed.add.failures"}, {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, {WRITE_DONE_BY_OTHER, "rocksdb.write.other"}, - {WRITE_TIMEDOUT, "rocksdb.write.timeout"}, 
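The `TickersNameMap` edits above remove long-deprecated counters (the `*.bytes.evict` block-cache tickers, `rocksdb.bloom.filter.micros`, the old stall/rate-limit tickers, the compressed-block-cache tickers) and add `BLOOM_FILTER_PREFIX_TRUE_POSITIVE`. A short, hedged sketch of how callers typically consume these tickers; it assumes `stats` is the same object installed in `Options::statistics`:

```cpp
#include <cstdint>
#include <iostream>
#include <memory>

#include "rocksdb/statistics.h"

// Report how effective the prefix bloom filter has been so far.
void ReportPrefixBloomEffectiveness(
    const std::shared_ptr<rocksdb::Statistics>& stats) {
  const uint64_t checked =
      stats->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_CHECKED);
  const uint64_t useful =
      stats->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_USEFUL);
  // New in this change: filter positives that turned out to be real hits.
  const uint64_t true_positive =
      stats->getTickerCount(rocksdb::BLOOM_FILTER_PREFIX_TRUE_POSITIVE);

  std::cout << "prefix bloom checked=" << checked << " useful=" << useful
            << " true_positive=" << true_positive << std::endl;
}
```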
{WRITE_WITH_WAL, "rocksdb.write.wal"}, {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, @@ -125,6 +111,7 @@ const std::vector> TickersNameMap = { {NUMBER_BLOCK_NOT_COMPRESSED, "rocksdb.number.block.not_compressed"}, {MERGE_OPERATION_TOTAL_TIME, "rocksdb.merge.operation.time.nanos"}, {FILTER_OPERATION_TOTAL_TIME, "rocksdb.filter.operation.time.nanos"}, + {COMPACTION_CPU_TOTAL_TIME, "rocksdb.compaction.total.time.cpu_micros"}, {ROW_CACHE_HIT, "rocksdb.row.cache.hit"}, {ROW_CACHE_MISS, "rocksdb.row.cache.miss"}, {READ_AMP_ESTIMATE_USEFUL_BYTES, "rocksdb.read.amp.estimate.useful.bytes"}, @@ -158,11 +145,7 @@ const std::vector> TickersNameMap = { {BLOB_DB_GC_NUM_FILES, "rocksdb.blobdb.gc.num.files"}, {BLOB_DB_GC_NUM_NEW_FILES, "rocksdb.blobdb.gc.num.new.files"}, {BLOB_DB_GC_FAILURES, "rocksdb.blobdb.gc.failures"}, - {BLOB_DB_GC_NUM_KEYS_OVERWRITTEN, "rocksdb.blobdb.gc.num.keys.overwritten"}, - {BLOB_DB_GC_NUM_KEYS_EXPIRED, "rocksdb.blobdb.gc.num.keys.expired"}, {BLOB_DB_GC_NUM_KEYS_RELOCATED, "rocksdb.blobdb.gc.num.keys.relocated"}, - {BLOB_DB_GC_BYTES_OVERWRITTEN, "rocksdb.blobdb.gc.bytes.overwritten"}, - {BLOB_DB_GC_BYTES_EXPIRED, "rocksdb.blobdb.gc.bytes.expired"}, {BLOB_DB_GC_BYTES_RELOCATED, "rocksdb.blobdb.gc.bytes.relocated"}, {BLOB_DB_FIFO_NUM_FILES_EVICTED, "rocksdb.blobdb.fifo.num.files.evicted"}, {BLOB_DB_FIFO_NUM_KEYS_EVICTED, "rocksdb.blobdb.fifo.num.keys.evicted"}, @@ -184,8 +167,6 @@ const std::vector> TickersNameMap = { "rocksdb.block.cache.compression.dict.add"}, {BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT, "rocksdb.block.cache.compression.dict.bytes.insert"}, - {BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT, - "rocksdb.block.cache.compression.dict.bytes.evict"}, {BLOCK_CACHE_ADD_REDUNDANT, "rocksdb.block.cache.add.redundant"}, {BLOCK_CACHE_INDEX_ADD_REDUNDANT, "rocksdb.block.cache.index.add.redundant"}, @@ -195,11 +176,18 @@ const std::vector> TickersNameMap = { {BLOCK_CACHE_COMPRESSION_DICT_ADD_REDUNDANT, "rocksdb.block.cache.compression.dict.add.redundant"}, {FILES_MARKED_TRASH, "rocksdb.files.marked.trash"}, + {FILES_DELETED_FROM_TRASH_QUEUE, "rocksdb.files.marked.trash.deleted"}, {FILES_DELETED_IMMEDIATELY, "rocksdb.files.deleted.immediately"}, - {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.errro.count"}, + {ERROR_HANDLER_BG_ERROR_COUNT, "rocksdb.error.handler.bg.error.count"}, + {ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED, + "rocksdb.error.handler.bg.errro.count"}, {ERROR_HANDLER_BG_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.io.error.count"}, + {ERROR_HANDLER_BG_IO_ERROR_COUNT_MISSPELLED, "rocksdb.error.handler.bg.io.errro.count"}, {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT, + "rocksdb.error.handler.bg.retryable.io.error.count"}, + {ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT_MISSPELLED, "rocksdb.error.handler.bg.retryable.io.errro.count"}, {ERROR_HANDLER_AUTORESUME_COUNT, "rocksdb.error.handler.autoresume.count"}, {ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT, @@ -226,7 +214,23 @@ const std::vector> TickersNameMap = { {LAST_LEVEL_READ_COUNT, "rocksdb.last.level.read.count"}, {NON_LAST_LEVEL_READ_BYTES, "rocksdb.non.last.level.read.bytes"}, {NON_LAST_LEVEL_READ_COUNT, "rocksdb.non.last.level.read.count"}, + {LAST_LEVEL_SEEK_FILTERED, "rocksdb.last.level.seek.filtered"}, + {LAST_LEVEL_SEEK_FILTER_MATCH, "rocksdb.last.level.seek.filter.match"}, + {LAST_LEVEL_SEEK_DATA, "rocksdb.last.level.seek.data"}, + {LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + "rocksdb.last.level.seek.data.useful.no.filter"}, + 
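Note the error-handler entries above: the misspelled `"...errro.count"` strings are kept under new `*_MISSPELLED` enum values so existing dashboards keep scraping a valid name, while the corrected spellings take over the primary enum names. A hedged migration snippet — it assumes both counters are maintained in parallel during the transition, which should be verified against the error handler before relying on it:

```cpp
#include <cstdint>

#include "rocksdb/statistics.h"

// Returns the background error count, preferring the corrected ticker but
// falling back to the legacy misspelled one if only that is populated.
uint64_t BgErrorCount(rocksdb::Statistics* stats) {
  const uint64_t corrected =
      stats->getTickerCount(rocksdb::ERROR_HANDLER_BG_ERROR_COUNT);
  const uint64_t legacy =
      stats->getTickerCount(rocksdb::ERROR_HANDLER_BG_ERROR_COUNT_MISSPELLED);
  return corrected != 0 ? corrected : legacy;
}
```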
{LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + "rocksdb.last.level.seek.data.useful.filter.match"}, + {NON_LAST_LEVEL_SEEK_FILTERED, "rocksdb.non.last.level.seek.filtered"}, + {NON_LAST_LEVEL_SEEK_FILTER_MATCH, + "rocksdb.non.last.level.seek.filter.match"}, + {NON_LAST_LEVEL_SEEK_DATA, "rocksdb.non.last.level.seek.data"}, + {NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER, + "rocksdb.non.last.level.seek.data.useful.no.filter"}, + {NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH, + "rocksdb.non.last.level.seek.data.useful.filter.match"}, {BLOCK_CHECKSUM_COMPUTE_COUNT, "rocksdb.block.checksum.compute.count"}, + {BLOCK_CHECKSUM_MISMATCH_COUNT, "rocksdb.block.checksum.mismatch.count"}, {MULTIGET_COROUTINE_COUNT, "rocksdb.multiget.coroutine.count"}, {BLOB_DB_CACHE_MISS, "rocksdb.blobdb.cache.miss"}, {BLOB_DB_CACHE_HIT, "rocksdb.blobdb.cache.hit"}, @@ -235,7 +239,32 @@ const std::vector> TickersNameMap = { {BLOB_DB_CACHE_BYTES_READ, "rocksdb.blobdb.cache.bytes.read"}, {BLOB_DB_CACHE_BYTES_WRITE, "rocksdb.blobdb.cache.bytes.write"}, {READ_ASYNC_MICROS, "rocksdb.read.async.micros"}, - {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}}; + {ASYNC_READ_ERROR_COUNT, "rocksdb.async.read.error.count"}, + {SECONDARY_CACHE_FILTER_HITS, "rocksdb.secondary.cache.filter.hits"}, + {SECONDARY_CACHE_INDEX_HITS, "rocksdb.secondary.cache.index.hits"}, + {SECONDARY_CACHE_DATA_HITS, "rocksdb.secondary.cache.data.hits"}, + {TABLE_OPEN_PREFETCH_TAIL_MISS, "rocksdb.table.open.prefetch.tail.miss"}, + {TABLE_OPEN_PREFETCH_TAIL_HIT, "rocksdb.table.open.prefetch.tail.hit"}, + {TIMESTAMP_FILTER_TABLE_CHECKED, "rocksdb.timestamp.filter.table.checked"}, + {TIMESTAMP_FILTER_TABLE_FILTERED, + "rocksdb.timestamp.filter.table.filtered"}, + {BYTES_COMPRESSED_FROM, "rocksdb.bytes.compressed.from"}, + {BYTES_COMPRESSED_TO, "rocksdb.bytes.compressed.to"}, + {BYTES_COMPRESSION_BYPASSED, "rocksdb.bytes.compression_bypassed"}, + {BYTES_COMPRESSION_REJECTED, "rocksdb.bytes.compression.rejected"}, + {NUMBER_BLOCK_COMPRESSION_BYPASSED, + "rocksdb.number.block_compression_bypassed"}, + {NUMBER_BLOCK_COMPRESSION_REJECTED, + "rocksdb.number.block_compression_rejected"}, + {BYTES_DECOMPRESSED_FROM, "rocksdb.bytes.decompressed.from"}, + {BYTES_DECOMPRESSED_TO, "rocksdb.bytes.decompressed.to"}, + {READAHEAD_TRIMMED, "rocksdb.readahead.trimmed"}, + {FIFO_MAX_SIZE_COMPACTIONS, "rocksdb.fifo.max.size.compactions"}, + {FIFO_TTL_COMPACTIONS, "rocksdb.fifo.ttl.compactions"}, + {PREFETCH_BYTES, "rocksdb.prefetch.bytes"}, + {PREFETCH_BYTES_USEFUL, "rocksdb.prefetch.bytes.useful"}, + {PREFETCH_HITS, "rocksdb.prefetch.hits"}, +}; const std::vector> HistogramsNameMap = { {DB_GET, "rocksdb.db.get.micros"}, @@ -252,15 +281,20 @@ const std::vector> HistogramsNameMap = { {READ_BLOCK_COMPACTION_MICROS, "rocksdb.read.block.compaction.micros"}, {READ_BLOCK_GET_MICROS, "rocksdb.read.block.get.micros"}, {WRITE_RAW_BLOCK_MICROS, "rocksdb.write.raw.block.micros"}, - {STALL_L0_SLOWDOWN_COUNT, "rocksdb.l0.slowdown.count"}, - {STALL_MEMTABLE_COMPACTION_COUNT, "rocksdb.memtable.compaction.count"}, - {STALL_L0_NUM_FILES_COUNT, "rocksdb.num.files.stall.count"}, - {HARD_RATE_LIMIT_DELAY_COUNT, "rocksdb.hard.rate.limit.delay.count"}, - {SOFT_RATE_LIMIT_DELAY_COUNT, "rocksdb.soft.rate.limit.delay.count"}, {NUM_FILES_IN_SINGLE_COMPACTION, "rocksdb.numfiles.in.singlecompaction"}, {DB_SEEK, "rocksdb.db.seek.micros"}, {WRITE_STALL, "rocksdb.db.write.stall"}, {SST_READ_MICROS, "rocksdb.sst.read.micros"}, + {FILE_READ_FLUSH_MICROS, "rocksdb.file.read.flush.micros"}, + 
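The `FILE_READ_*_MICROS` histograms added here and continued below split SST read latency by the I/O activity that issued the read (flush, compaction, DB open, Get, MultiGet, iterator, checksum verification). A small sketch of pulling one of them out of a `Statistics` object:

```cpp
#include <iostream>

#include "rocksdb/statistics.h"

// Print latency percentiles for file reads issued by compactions.
void PrintCompactionReadLatency(rocksdb::Statistics* stats) {
  rocksdb::HistogramData h;
  stats->histogramData(rocksdb::FILE_READ_COMPACTION_MICROS, &h);
  std::cout << "compaction read micros: count=" << h.count
            << " p50=" << h.median << " p99=" << h.percentile99
            << " max=" << h.max << std::endl;
}
```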
{FILE_READ_COMPACTION_MICROS, "rocksdb.file.read.compaction.micros"}, + {FILE_READ_DB_OPEN_MICROS, "rocksdb.file.read.db.open.micros"}, + {FILE_READ_GET_MICROS, "rocksdb.file.read.get.micros"}, + {FILE_READ_MULTIGET_MICROS, "rocksdb.file.read.multiget.micros"}, + {FILE_READ_DB_ITERATOR_MICROS, "rocksdb.file.read.db.iterator.micros"}, + {FILE_READ_VERIFY_DB_CHECKSUM_MICROS, + "rocksdb.file.read.verify.db.checksum.micros"}, + {FILE_READ_VERIFY_FILE_CHECKSUMS_MICROS, + "rocksdb.file.read.verify.file.checksums.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, {BYTES_PER_READ, "rocksdb.bytes.per.read"}, {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, @@ -281,14 +315,12 @@ const std::vector> HistogramsNameMap = { {BLOB_DB_BLOB_FILE_WRITE_MICROS, "rocksdb.blobdb.blob.file.write.micros"}, {BLOB_DB_BLOB_FILE_READ_MICROS, "rocksdb.blobdb.blob.file.read.micros"}, {BLOB_DB_BLOB_FILE_SYNC_MICROS, "rocksdb.blobdb.blob.file.sync.micros"}, - {BLOB_DB_GC_MICROS, "rocksdb.blobdb.gc.micros"}, {BLOB_DB_COMPRESSION_MICROS, "rocksdb.blobdb.compression.micros"}, {BLOB_DB_DECOMPRESSION_MICROS, "rocksdb.blobdb.decompression.micros"}, {FLUSH_TIME, "rocksdb.db.flush.micros"}, {SST_BATCH_SIZE, "rocksdb.sst.batch.size"}, {NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, "rocksdb.num.index.and.filter.blocks.read.per.level"}, - {NUM_DATA_BLOCKS_READ_PER_LEVEL, "rocksdb.num.data.blocks.read.per.level"}, {NUM_SST_READ_PER_LEVEL, "rocksdb.num.sst.read.per.level"}, {ERROR_HANDLER_AUTORESUME_RETRY_COUNT, "rocksdb.error.handler.autoresume.retry.count"}, @@ -298,13 +330,14 @@ const std::vector> HistogramsNameMap = { {MULTIGET_IO_BATCH_SIZE, "rocksdb.multiget.io.batch.size"}, {NUM_LEVEL_READ_PER_MULTIGET, "rocksdb.num.level.read.per.multiget"}, {ASYNC_PREFETCH_ABORT_MICROS, "rocksdb.async.prefetch.abort.micros"}, + {TABLE_OPEN_PREFETCH_TAIL_READ_BYTES, + "rocksdb.table.open.prefetch.tail.read.bytes"}, }; std::shared_ptr CreateDBStatistics() { return std::make_shared(nullptr); } -#ifndef ROCKSDB_LITE static int RegisterBuiltinStatistics(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -316,34 +349,29 @@ static int RegisterBuiltinStatistics(ObjectLibrary& library, }); return 1; } -#endif // ROCKSDB_LITE Status Statistics::CreateFromString(const ConfigOptions& config_options, const std::string& id, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinStatistics(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE Status s; if (id == "" || id == StatisticsImpl::kClassName()) { result->reset(new StatisticsImpl(nullptr)); } else if (id == kNullptrString) { result->reset(); } else { - s = LoadSharedObject(config_options, id, nullptr, result); + s = LoadSharedObject(config_options, id, result); } return s; } static std::unordered_map stats_type_info = { -#ifndef ROCKSDB_LITE {"inner", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByNameAllowFromNull, OptionTypeFlags::kCompareNever)}, -#endif // !ROCKSDB_LITE }; StatisticsImpl::StatisticsImpl(std::shared_ptr stats) diff --git a/monitoring/statistics.h b/monitoring/statistics_impl.h similarity index 100% rename from monitoring/statistics.h rename to monitoring/statistics_impl.h diff --git a/monitoring/statistics_test.cc b/monitoring/statistics_test.cc index cffa5054a97b..98aae0c82621 100644 --- a/monitoring/statistics_test.cc +++ b/monitoring/statistics_test.cc @@ -42,12 +42,10 @@ TEST_F(StatisticsTest, SanityHistograms) { 
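With the LITE guards gone, `Statistics::CreateFromString` always registers the built-in implementation with the `ObjectLibrary` on first use (the `std::call_once` above) and then resolves the id to the default `StatisticsImpl`, to `nullptr`, or to a dynamically loaded plugin. A hedged usage sketch; `"BasicStatistics"` is assumed here to be `StatisticsImpl::kClassName()`:

```cpp
#include <memory>

#include "rocksdb/convenience.h"
#include "rocksdb/statistics.h"

rocksdb::Status MakeStats(std::shared_ptr<rocksdb::Statistics>* out) {
  rocksdb::ConfigOptions cfg;
  // An empty id (or the built-in class name) yields StatisticsImpl;
  // "nullptr" clears *out; anything else is looked up via the ObjectLibrary.
  return rocksdb::Statistics::CreateFromString(cfg, "BasicStatistics", out);
}
```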
TEST_F(StatisticsTest, NoNameStats) { static std::unordered_map no_name_opt_info = { -#ifndef ROCKSDB_LITE {"inner", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kByName, OptionTypeFlags::kAllowNull | OptionTypeFlags::kCompareNever)}, -#endif // ROCKSDB_LITE }; class DefaultNameStatistics : public Statistics { @@ -73,7 +71,6 @@ TEST_F(StatisticsTest, NoNameStats) { options.ignore_unsupported_options = false; auto stats = std::make_shared(); ASSERT_STREQ(stats->Name(), ""); -#ifndef ROCKSDB_LITE ASSERT_EQ("", stats->ToString( options)); // A stats with no name with have no options... ASSERT_OK(stats->ConfigureFromString(options, "inner=")); @@ -81,7 +78,6 @@ TEST_F(StatisticsTest, NoNameStats) { options)); // A stats with no name with have no options... ASSERT_NE(stats->inner, nullptr); ASSERT_NE("", stats->inner->ToString(options)); // ... even if it does... -#endif // ROCKSDB_LITE } } // namespace ROCKSDB_NAMESPACE diff --git a/monitoring/stats_history_test.cc b/monitoring/stats_history_test.cc index fed8535f4fb9..37db0cfe1840 100644 --- a/monitoring/stats_history_test.cc +++ b/monitoring/stats_history_test.cc @@ -29,7 +29,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE class StatsHistoryTest : public DBTestBase { public: StatsHistoryTest() : DBTestBase("stats_history_test", /*env_do_fsync=*/true) { @@ -186,6 +185,8 @@ TEST_F(StatsHistoryTest, GetStatsHistoryInMemory) { TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { constexpr int kPeriodSec = 1; + constexpr int kEstimatedOneSliceSize = 16000; + Options options; options.create_if_missing = true; options.statistics = CreateDBStatistics(); @@ -207,6 +208,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { ASSERT_TRUE(iterator->key() == iterator->value()); } + ASSERT_OK(iterator->status()); delete iterator; ASSERT_OK(Flush()); ASSERT_OK(Delete("sol")); @@ -220,6 +222,7 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) { ASSERT_TRUE(iterator->key() == iterator->value()); } + ASSERT_OK(iterator->status()); delete iterator; ASSERT_OK(Flush()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); @@ -243,10 +246,12 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { } size_t stats_history_size = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); ASSERT_GE(slice_count, kIterations - 1); - ASSERT_GE(stats_history_size, 15000); - // capping memory cost at 15000 bytes since one slice is around 10000~15000 - ASSERT_OK(dbfull()->SetDBOptions({{"stats_history_buffer_size", "15000"}})); - ASSERT_EQ(15000, dbfull()->GetDBOptions().stats_history_buffer_size); + ASSERT_GE(stats_history_size, kEstimatedOneSliceSize); + // capping memory cost to roughly one slice's size + ASSERT_OK(dbfull()->SetDBOptions( + {{"stats_history_buffer_size", std::to_string(kEstimatedOneSliceSize)}})); + ASSERT_EQ(kEstimatedOneSliceSize, + dbfull()->GetDBOptions().stats_history_buffer_size); // Wait for stats persist to finish for (int i = 0; i < kIterations; ++i) { @@ -266,9 +271,13 @@ TEST_F(StatsHistoryTest, InMemoryStatsHistoryPurging) { } size_t stats_history_size_reopen = dbfull()->TEST_EstimateInMemoryStatsHistorySize(); - // only one slice can fit under the new stats_history_buffer_size - ASSERT_LT(slice_count, 2); - ASSERT_TRUE(stats_history_size_reopen < 15000 && + + // Only one slice can fit under the new stats_history_buffer_size + // + // If `slice_count == 
0` when new statistics are added, consider increasing + // `kEstimatedOneSliceSize` + ASSERT_EQ(slice_count, 1); + ASSERT_TRUE(stats_history_size_reopen < 16000 && stats_history_size_reopen > 0); ASSERT_TRUE(stats_count_reopen < stats_count && stats_count_reopen > 0); Close(); @@ -281,6 +290,7 @@ int countkeys(Iterator* iter) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { count++; } + EXPECT_OK(iter->status()); return count; } @@ -654,7 +664,6 @@ TEST_F(StatsHistoryTest, ForceManualFlushStatsCF) { Close(); } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index 9707d22656c6..37fcef62b0f9 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -47,15 +47,19 @@ void ThreadStatusUpdater::ResetThreadStatus() { SetColumnFamilyInfoKey(nullptr); } +void ThreadStatusUpdater::SetEnableTracking(bool enable_tracking) { + auto* data = Get(); + if (data == nullptr) { + return; + } + data->enable_tracking.store(enable_tracking, std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* cf_key) { auto* data = Get(); if (data == nullptr) { return; } - // set the tracking flag based on whether cf_key is non-null or not. - // If enable_thread_tracking is set to false, the input cf_key - // would be nullptr. - data->enable_tracking = (cf_key != nullptr); data->cf_key.store(const_cast(cf_key), std::memory_order_relaxed); } @@ -86,6 +90,14 @@ void ThreadStatusUpdater::SetThreadOperation( } } +ThreadStatus::OperationType ThreadStatusUpdater::GetThreadOperation() { + ThreadStatusData* data = GetLocalThreadStatus(); + if (data == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return data->operation_type.load(std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetThreadOperationProperty(int i, uint64_t value) { auto* data = GetLocalThreadStatus(); if (data == nullptr) { @@ -211,9 +223,7 @@ ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() { if (thread_status_data_ == nullptr) { return nullptr; } - if (!thread_status_data_->enable_tracking) { - assert(thread_status_data_->cf_key.load(std::memory_order_relaxed) == - nullptr); + if (!thread_status_data_->enable_tracking.load(std::memory_order_relaxed)) { return nullptr; } return thread_status_data_; diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h index 762c73ae2bb3..696063cb46cd 100644 --- a/monitoring/thread_status_updater.h +++ b/monitoring/thread_status_updater.h @@ -62,7 +62,8 @@ struct ConstantColumnFamilyInfo { // status of a thread using a set of atomic pointers. struct ThreadStatusData { #ifdef ROCKSDB_USING_THREAD_STATUS - explicit ThreadStatusData() : enable_tracking(false) { + explicit ThreadStatusData() { + enable_tracking.store(false); thread_id.store(0); thread_type.store(ThreadStatus::USER); cf_key.store(nullptr); @@ -72,13 +73,10 @@ struct ThreadStatusData { } // A flag to indicate whether the thread tracking is enabled - // in the current thread. This value will be updated based on whether - // the associated Options::enable_thread_tracking is set to true - // in ThreadStatusUtil::SetColumnFamily(). - // + // in the current thread. // If set to false, then SetThreadOperation and SetThreadState // will be no-op. 
- bool enable_tracking; + std::atomic enable_tracking; std::atomic thread_id; std::atomic thread_type; @@ -119,8 +117,10 @@ class ThreadStatusUpdater { // Register the current thread for tracking. void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id); + void SetEnableTracking(bool enable_tracking); + // Update the column-family info of the current thread by setting - // its thread-local pointer of ThreadStateInfo to the correct entry. + // its thread-local pointer of ThreadStatusData to the correct entry. void SetColumnFamilyInfoKey(const void* cf_key); // returns the column family info key. @@ -129,6 +129,9 @@ class ThreadStatusUpdater { // Update the thread operation of the current thread. void SetThreadOperation(const ThreadStatus::OperationType type); + // Return the thread operation of the current thread. + ThreadStatus::OperationType GetThreadOperation(); + // The start time of the current thread operation. It is in the format // of micro-seconds since some fixed point in time. void SetOperationStartTime(const uint64_t start_time); diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index c07b85fa8c10..9b66dc28e860 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -33,27 +33,23 @@ void ThreadStatusUtil::UnregisterThread() { } } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd, - const Env* env, - bool enable_thread_tracking) { - if (!MaybeInitThreadLocalUpdater(env)) { +void ThreadStatusUtil::SetEnableTracking(bool enable_tracking) { + if (thread_updater_local_cache_ == nullptr) { return; } - assert(thread_updater_local_cache_); - if (cfd != nullptr && enable_thread_tracking) { - thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); - } else { - // When cfd == nullptr or enable_thread_tracking == false, we set - // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation - // and SetThreadState become no-op. - thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr); + thread_updater_local_cache_->SetEnableTracking(enable_tracking); +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; } + assert(cfd); + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); } void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { if (thread_updater_local_cache_ == nullptr) { - // thread_updater_local_cache_ must be set in SetColumnFamily - // or other ThreadStatusUtil functions. 
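The tracking flag is now an explicitly set `std::atomic` toggled by `ThreadStatusUpdater::SetEnableTracking()` rather than being inferred from whether a column-family key was passed. From the embedder's side, nothing changes in how the resulting statuses are observed; a sketch, assuming `DBOptions::enable_thread_tracking` was turned on:

```cpp
#include <iostream>
#include <vector>

#include "rocksdb/env.h"
#include "rocksdb/thread_status.h"

// List the operations currently running on RocksDB background/user threads.
void DumpThreadOps() {
  std::vector<rocksdb::ThreadStatus> threads;
  rocksdb::Status s = rocksdb::Env::Default()->GetThreadList(&threads);
  if (!s.ok()) {
    return;
  }
  for (const auto& t : threads) {
    std::cout << rocksdb::ThreadStatus::GetThreadTypeName(t.thread_type) << " "
              << rocksdb::ThreadStatus::GetOperationName(t.operation_type)
              << std::endl;
  }
}
```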
return; } @@ -68,6 +64,13 @@ void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { thread_updater_local_cache_->SetThreadOperation(op); } +ThreadStatus::OperationType ThreadStatusUtil::GetThreadOperation() { + if (thread_updater_local_cache_ == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return thread_updater_local_cache_->GetThreadOperation(); +} + ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage( ThreadStatus::OperationStage stage) { if (thread_updater_local_cache_ == nullptr) { @@ -172,9 +175,7 @@ bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { return false; } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/, - const Env* /*env*/, - bool /*enable_thread_tracking*/) {} +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/) {} void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {} @@ -189,7 +190,7 @@ void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {} void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/, const ColumnFamilyData* /*cfd*/, const std::string& /*cf_name*/, - const Env* /*env*/) {} + const Env* env) {} void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {} diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h index 0137d26823f0..df148a039565 100644 --- a/monitoring/thread_status_util.h +++ b/monitoring/thread_status_util.h @@ -52,13 +52,18 @@ class ThreadStatusUtil { // the current thread does not hold db_mutex. static void EraseDatabaseInfo(const DB* db); + static void SetEnableTracking(bool enable_tracking); + // Update the thread status to indicate the current thread is doing // something related to the specified column family. 
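The contract on the `ThreadStatusUtil` side changes accordingly: tracking must be switched on explicitly with `SetEnableTracking()`, `SetColumnFamily()` now requires a non-null `cfd`, and the new `GetThreadOperation()` lets callers read back the current operation (the debug helper further down maps it to an `Env::IOActivity`). An illustrative-only sketch of the call order an internal job might follow — these are internal headers and types, not public API:

```cpp
#include "monitoring/thread_status_util.h"
#include "rocksdb/thread_status.h"

// Illustrative only: mirrors how a flush job could annotate its thread.
// `cfd` and `enable_thread_tracking` come from the DB internals.
void AnnotateFlushThread(rocksdb::ColumnFamilyData* cfd,
                         bool enable_thread_tracking) {
  // Tracking is now an explicit switch, no longer implied by a non-null cfd.
  rocksdb::ThreadStatusUtil::SetEnableTracking(enable_thread_tracking);
  if (cfd != nullptr) {
    rocksdb::ThreadStatusUtil::SetColumnFamily(cfd);  // REQUIRES: cfd != nullptr
  }
  rocksdb::ThreadStatusUtil::SetThreadOperation(rocksdb::ThreadStatus::OP_FLUSH);

  // ... do the work; other code can query the operation back:
  auto op = rocksdb::ThreadStatusUtil::GetThreadOperation();
  (void)op;

  rocksdb::ThreadStatusUtil::ResetThreadStatus();
}
```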
- static void SetColumnFamily(const ColumnFamilyData* cfd, const Env* env, - bool enable_thread_tracking); + // + // REQUIRES: cfd != nullptr + static void SetColumnFamily(const ColumnFamilyData* cfd); static void SetThreadOperation(ThreadStatus::OperationType type); + static ThreadStatus::OperationType GetThreadOperation(); + static ThreadStatus::OperationStage SetThreadOperationStage( ThreadStatus::OperationStage stage); @@ -74,6 +79,9 @@ class ThreadStatusUtil { static void TEST_SetStateDelay(const ThreadStatus::StateType state, int micro); static void TEST_StateDelay(const ThreadStatus::StateType state); + + static Env::IOActivity TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op); #endif protected: diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index f7a94355dbee..24d269cbbb0c 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -27,6 +27,34 @@ void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { } } +Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op) { + switch (thread_op) { + case ThreadStatus::OperationType::OP_FLUSH: + return Env::IOActivity::kFlush; + case ThreadStatus::OperationType::OP_COMPACTION: + return Env::IOActivity::kCompaction; + case ThreadStatus::OperationType::OP_DBOPEN: + return Env::IOActivity::kDBOpen; + case ThreadStatus::OperationType::OP_GET: + return Env::IOActivity::kGet; + case ThreadStatus::OperationType::OP_MULTIGET: + return Env::IOActivity::kMultiGet; + case ThreadStatus::OperationType::OP_DBITERATOR: + return Env::IOActivity::kDBIterator; + case ThreadStatus::OperationType::OP_VERIFY_DB_CHECKSUM: + return Env::IOActivity::kVerifyDBChecksum; + case ThreadStatus::OperationType::OP_VERIFY_FILE_CHECKSUMS: + return Env::IOActivity::kVerifyFileChecksums; + case ThreadStatus::OperationType::OP_GETENTITY: + return Env::IOActivity::kGetEntity; + case ThreadStatus::OperationType::OP_MULTIGETENTITY: + return Env::IOActivity::kMultiGetEntity; + default: + return Env::IOActivity::kUnknown; + } +} + #endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/options/cf_options.cc b/options/cf_options.cc index c7eae9e1ec67..7ae7c8ca4b1a 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -16,6 +16,7 @@ #include "options/options_helper.h" #include "options/options_parser.h" #include "port/port.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/concurrent_task_limiter.h" #include "rocksdb/configurable.h" @@ -36,7 +37,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE static Status ParseCompressionOptions(const std::string& value, const std::string& name, CompressionOptions& compression_opts) { @@ -147,6 +147,10 @@ static std::unordered_map {"strategy", {offsetof(struct CompressionOptions, strategy), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"max_compressed_bytes_per_kb", + {offsetof(struct CompressionOptions, max_compressed_bytes_per_kb), + OptionType::kInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"max_dict_bytes", {offsetof(struct CompressionOptions, max_dict_bytes), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, @@ -169,6 +173,20 @@ static std::unordered_map {offsetof(struct CompressionOptions, use_zstd_dict_trainer), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"checksum", + 
{offsetof(struct CompressionOptions, checksum), OptionType::kBoolean, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, +}; + +static std::unordered_map + file_temperature_age_type_info = { + {"temperature", + {offsetof(struct FileTemperatureAge, temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"age", + {offsetof(struct FileTemperatureAge, age), OptionType::kUInt64T, + OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, }; static std::unordered_map @@ -188,7 +206,15 @@ static std::unordered_map {offsetof(struct CompactionOptionsFIFO, allow_compaction), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, -}; + {"file_temperature_age_thresholds", + OptionTypeInfo::Vector( + offsetof(struct CompactionOptionsFIFO, + file_temperature_age_thresholds), + OptionVerificationType::kNormal, OptionTypeFlags::kMutable, + OptionTypeInfo::Struct("file_temperature_age_thresholds", + &file_temperature_age_type_info, 0, + OptionVerificationType::kNormal, + OptionTypeFlags::kMutable))}}; static std::unordered_map universal_compaction_options_type_info = { @@ -484,6 +510,14 @@ static std::unordered_map {offsetof(struct MutableCFOptions, memtable_protection_bytes_per_key), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"bottommost_file_compaction_delay", + {offsetof(struct MutableCFOptions, bottommost_file_compaction_delay), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + {"block_protection_bytes_per_key", + {offsetof(struct MutableCFOptions, block_protection_bytes_per_key), + OptionType::kUInt8T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {kOptNameCompOpts, OptionTypeInfo::Struct( kOptNameCompOpts, &compression_options_type_info, @@ -531,8 +565,13 @@ static std::unordered_map {"disable_write_stall", {offsetof(struct MutableCFOptions, disable_write_stall), OptionType::kBoolean, OptionVerificationType::kNormal, - OptionTypeFlags::kMutable}} + OptionTypeFlags::kMutable}}, // End special case properties + {"memtable_max_range_deletions", + {offsetof(struct MutableCFOptions, memtable_max_range_deletions), + OptionType::kUInt32T, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + }; static std::unordered_map @@ -576,6 +615,10 @@ static std::unordered_map {offsetof(struct ImmutableCFOptions, force_consistency_checks), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"default_temperature", + {offsetof(struct ImmutableCFOptions, default_temperature), + OptionType::kTemperature, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareNever}}, {"preclude_last_level_data_seconds", {offsetof(struct ImmutableCFOptions, preclude_last_level_data_seconds), OptionType::kUInt64T, OptionVerificationType::kNormal, @@ -623,19 +666,12 @@ static std::unordered_map // it's a const pointer of const Comparator* const auto* ptr = static_cast(addr); - // Since the user-specified comparator will be wrapped by - // InternalKeyComparator, we should persist the - // user-specified one instead of InternalKeyComparator. 
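The option names registered above ("block_protection_bytes_per_key", "memtable_max_range_deletions", "bottommost_file_compaction_delay", the FIFO "file_temperature_age_thresholds" struct, and so on) become settable through the usual string-based options plumbing. A hedged sketch using `GetColumnFamilyOptionsFromString`, restricted to the simple scalar options whose spellings appear in the table:

```cpp
#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

rocksdb::Status BuildCfOptions(rocksdb::ColumnFamilyOptions* out) {
  rocksdb::ConfigOptions cfg;
  rocksdb::ColumnFamilyOptions base;
  // Names match the OptionTypeInfo entries registered in cf_options.cc.
  return rocksdb::GetColumnFamilyOptionsFromString(
      cfg, base,
      "block_protection_bytes_per_key=8;"
      "memtable_max_range_deletions=1000;"
      "bottommost_file_compaction_delay=3600",
      out);
}
```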
if (*ptr == nullptr) { *value = kNullptrString; } else if (opts.mutable_options_only) { *value = ""; } else { - const Comparator* root_comp = (*ptr)->GetRootComparator(); - if (root_comp == nullptr) { - root_comp = (*ptr); - } - *value = root_comp->ToString(opts); + *value = (*ptr)->ToString(opts); } return Status::OK(); })}, @@ -783,6 +819,10 @@ static std::unordered_map auto* cache = static_cast*>(addr); return Cache::CreateFromString(opts, value, cache); }}}, + {"persist_user_defined_timestamps", + {offsetof(struct ImmutableCFOptions, persist_user_defined_timestamps), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kCompareLoose}}, }; const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; @@ -887,7 +927,6 @@ std::unique_ptr CFOptionsAsConfigurable( std::unique_ptr ptr(new ConfigurableCFOptions(opts, opt_map)); return ptr; } -#endif // ROCKSDB_LITE ImmutableCFOptions::ImmutableCFOptions() : ImmutableCFOptions(Options()) {} @@ -919,6 +958,7 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) num_levels(cf_options.num_levels), optimize_filters_for_hits(cf_options.optimize_filters_for_hits), force_consistency_checks(cf_options.force_consistency_checks), + default_temperature(cf_options.default_temperature), preclude_last_level_data_seconds( cf_options.preclude_last_level_data_seconds), preserve_internal_time_seconds(cf_options.preserve_internal_time_seconds), @@ -927,7 +967,9 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) cf_paths(cf_options.cf_paths), compaction_thread_limiter(cf_options.compaction_thread_limiter), sst_partitioner_factory(cf_options.sst_partitioner_factory), - blob_cache(cf_options.blob_cache) {} + blob_cache(cf_options.blob_cache), + persist_user_defined_timestamps( + cf_options.persist_user_defined_timestamps) {} ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} @@ -1088,6 +1130,8 @@ void MutableCFOptions::Dump(Logger* log) const { ROCKS_LOG_INFO(log, " experimental_mempurge_threshold: %f", experimental_mempurge_threshold); + ROCKS_LOG_INFO(log, " bottommost_file_compaction_delay: %" PRIu32, + bottommost_file_compaction_delay); // Universal Compaction Options ROCKS_LOG_INFO(log, "compaction_options_universal.size_ratio : %d", @@ -1148,7 +1192,6 @@ void MutableCFOptions::Dump(Logger* log) const { MutableCFOptions::MutableCFOptions(const Options& options) : MutableCFOptions(ColumnFamilyOptions(options)) {} -#ifndef ROCKSDB_LITE Status GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, @@ -1172,5 +1215,4 @@ Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, return OptionTypeInfo::SerializeType( config_options, cf_mutable_options_type_info, &mutable_opts, opt_string); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/cf_options.h b/options/cf_options.h index 4edb8a7e108b..f61a2a5460b7 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -72,6 +72,8 @@ struct ImmutableCFOptions { bool force_consistency_checks; + Temperature default_temperature; + uint64_t preclude_last_level_data_seconds; uint64_t preserve_internal_time_seconds; @@ -86,6 +88,8 @@ struct ImmutableCFOptions { std::shared_ptr sst_partitioner_factory; std::shared_ptr blob_cache; + + bool persist_user_defined_timestamps; }; struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { @@ -170,9 +174,13 @@ struct MutableCFOptions { : 
options.last_level_temperature), memtable_protection_bytes_per_key( options.memtable_protection_bytes_per_key), + block_protection_bytes_per_key(options.block_protection_bytes_per_key), sample_for_compression( options.sample_for_compression), // TODO: is 0 fine here? compression_per_level(options.compression_per_level), + memtable_max_range_deletions(options.memtable_max_range_deletions), + bottommost_file_compaction_delay( + options.bottommost_file_compaction_delay), disable_auto_flush(options.disable_auto_flush), disable_write_stall(options.disable_write_stall) { RefreshDerivedOptions(options.num_levels, options.compaction_style); @@ -222,7 +230,9 @@ struct MutableCFOptions { bottommost_compression(kDisableCompressionOption), last_level_temperature(Temperature::kUnknown), memtable_protection_bytes_per_key(0), + block_protection_bytes_per_key(0), sample_for_compression(0), + memtable_max_range_deletions(0), disable_auto_flush(false), disable_write_stall(false) {} @@ -314,9 +324,12 @@ struct MutableCFOptions { CompressionOptions bottommost_compression_opts; Temperature last_level_temperature; uint32_t memtable_protection_bytes_per_key; + uint8_t block_protection_bytes_per_key; uint64_t sample_for_compression; std::vector compression_per_level; + uint32_t memtable_max_range_deletions; + uint32_t bottommost_file_compaction_delay; // Derived options // Per-level target file size. @@ -337,7 +350,6 @@ uint64_t MaxFileSizeForLevel(const MutableCFOptions& cf_options, // `pin_l0_filter_and_index_blocks_in_cache` is set. size_t MaxFileSizeForL0MetaPin(const MutableCFOptions& cf_options); -#ifndef ROCKSDB_LITE Status GetStringFromMutableCFOptions(const ConfigOptions& config_options, const MutableCFOptions& mutable_opts, std::string* opt_string); @@ -346,6 +358,5 @@ Status GetMutableOptionsFromStrings( const MutableCFOptions& base_options, const std::unordered_map& options_map, Logger* info_log, MutableCFOptions* new_options); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable.cc b/options/configurable.cc index 08aff10fd2c3..5491336e0a78 100644 --- a/options/configurable.cc +++ b/options/configurable.cc @@ -22,11 +22,7 @@ void Configurable::RegisterOptions( const std::unordered_map* type_map) { RegisteredOptions opts; opts.name = name; -#ifndef ROCKSDB_LITE opts.type_map = type_map; -#else - (void)type_map; -#endif // ROCKSDB_LITE opts.opt_ptr = opt_ptr; options_.emplace_back(opts); } @@ -41,7 +37,6 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { // We ignore the invoke_prepare_options here intentionally, // as if you are here, you must have called PrepareOptions explicitly. 
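Because the new members land in `MutableCFOptions` and their `OptionTypeInfo` entries are flagged `kMutable` in the earlier hunk, they can also be changed on a live database. A minimal sketch, assuming `db` and `cf` were opened elsewhere:

```cpp
#include "rocksdb/db.h"

// Adjust two of the newly mutable column-family options at runtime.
rocksdb::Status TuneLive(rocksdb::DB* db, rocksdb::ColumnFamilyHandle* cf) {
  return db->SetOptions(cf, {{"memtable_max_range_deletions", "500"},
                             {"bottommost_file_compaction_delay", "600"}});
}
```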
Status status = Status::OK(); -#ifndef ROCKSDB_LITE for (auto opt_iter : options_) { if (opt_iter.type_map != nullptr) { for (auto map_iter : *(opt_iter.type_map)) { @@ -55,16 +50,12 @@ Status Configurable::PrepareOptions(const ConfigOptions& opts) { } } } -#else - (void)opts; -#endif // ROCKSDB_LITE return status; } Status Configurable::ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { Status status; -#ifndef ROCKSDB_LITE for (auto opt_iter : options_) { if (opt_iter.type_map != nullptr) { for (auto map_iter : *(opt_iter.type_map)) { @@ -79,10 +70,6 @@ Status Configurable::ValidateOptions(const DBOptions& db_opts, } } } -#else - (void)db_opts; - (void)cf_opts; -#endif // ROCKSDB_LITE return status; } @@ -105,7 +92,6 @@ std::string Configurable::GetOptionName(const std::string& opt_name) const { return opt_name; } -#ifndef ROCKSDB_LITE const OptionTypeInfo* ConfigurableHelper::FindOption( const std::vector& options, const std::string& short_name, std::string* opt_name, void** opt_ptr) { @@ -121,7 +107,6 @@ const OptionTypeInfo* ConfigurableHelper::FindOption( } return nullptr; } -#endif // ROCKSDB_LITE //************************************************************************* // @@ -156,7 +141,6 @@ Status Configurable::ConfigureOptions( // the configuration is complete. ConfigOptions copy = config_options; copy.invoke_prepare_options = false; -#ifndef ROCKSDB_LITE if (!config_options.ignore_unknown_options) { // If we are not ignoring unused, get the defaults in case we need to // reset @@ -164,14 +148,12 @@ Status Configurable::ConfigureOptions( copy.delimiter = "; "; GetOptionString(copy, &curr_opts).PermitUncheckedError(); } -#endif // ROCKSDB_LITE s = ConfigurableHelper::ConfigureOptions(copy, *this, opts_map, unused); } if (config_options.invoke_prepare_options && s.ok()) { s = PrepareOptions(config_options); } -#ifndef ROCKSDB_LITE if (!s.ok() && !curr_opts.empty()) { ConfigOptions reset = config_options; reset.ignore_unknown_options = true; @@ -180,7 +162,6 @@ Status Configurable::ConfigureOptions( // There are some options to reset from this current error ConfigureFromString(reset, curr_opts).PermitUncheckedError(); } -#endif // ROCKSDB_LITE return s; } @@ -193,7 +174,6 @@ Status Configurable::ConfigureFromString(const ConfigOptions& config_options, const std::string& opts_str) { Status s; if (!opts_str.empty()) { -#ifndef ROCKSDB_LITE if (opts_str.find(';') != std::string::npos || opts_str.find('=') != std::string::npos) { std::unordered_map opt_map; @@ -202,14 +182,11 @@ Status Configurable::ConfigureFromString(const ConfigOptions& config_options, s = ConfigureFromMap(config_options, opt_map, nullptr); } } else { -#endif // ROCKSDB_LITE s = ParseStringOptions(config_options, opts_str); if (s.ok() && config_options.invoke_prepare_options) { s = PrepareOptions(config_options); } -#ifndef ROCKSDB_LITE } -#endif // ROCKSDB_LITE } else if (config_options.invoke_prepare_options) { s = PrepareOptions(config_options); } else { @@ -218,7 +195,6 @@ Status Configurable::ConfigureFromString(const ConfigOptions& config_options, return s; } -#ifndef ROCKSDB_LITE /** * Sets the value of the named property to the input value, returning OK on * succcess. 
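With the `ROCKSDB_LITE` branches removed, map- and string-based configuration is always available. A minimal sketch of the `Configurable` pattern these functions serve, using a hypothetical `DemoOpts`/`DemoConfigurable` pair that mirrors the structures exercised by configurable_test.cc:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>

#include "rocksdb/configurable.h"
#include "rocksdb/utilities/options_type.h"

struct DemoOpts {
  int window = 0;
  bool enabled = false;
};

// Hypothetical option table, same shape as simple_option_info in the tests.
static std::unordered_map<std::string, rocksdb::OptionTypeInfo> demo_info = {
    {"window",
     {offsetof(struct DemoOpts, window), rocksdb::OptionType::kInt,
      rocksdb::OptionVerificationType::kNormal,
      rocksdb::OptionTypeFlags::kMutable}},
    {"enabled",
     {offsetof(struct DemoOpts, enabled), rocksdb::OptionType::kBoolean,
      rocksdb::OptionVerificationType::kNormal,
      rocksdb::OptionTypeFlags::kMutable}},
};

class DemoConfigurable : public rocksdb::Configurable {
 public:
  DemoConfigurable() { RegisterOptions("demo", &opts_, &demo_info); }
  DemoOpts opts_;
};

// Usage sketch:
//   DemoConfigurable d;
//   rocksdb::ConfigOptions cfg;
//   d.ConfigureFromString(cfg, "window=10;enabled=true");
//   // d.opts_.window == 10, d.opts_.enabled == true
```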
@@ -257,7 +233,6 @@ Status Configurable::ParseOption(const ConfigOptions& config_options, } } -#endif // ROCKSDB_LITE Status ConfigurableHelper::ConfigureOptions( const ConfigOptions& config_options, Configurable& configurable, @@ -266,7 +241,6 @@ Status ConfigurableHelper::ConfigureOptions( std::unordered_map remaining = opts_map; Status s = Status::OK(); if (!opts_map.empty()) { -#ifndef ROCKSDB_LITE for (const auto& iter : configurable.options_) { if (iter.type_map != nullptr) { s = ConfigureSomeOptions(config_options, configurable, *(iter.type_map), @@ -278,12 +252,6 @@ Status ConfigurableHelper::ConfigureOptions( } } } -#else - (void)configurable; - if (!config_options.ignore_unknown_options) { - s = Status::NotSupported("ConfigureFromMap not supported in LITE mode"); - } -#endif // ROCKSDB_LITE } if (unused != nullptr && !remaining.empty()) { unused->insert(remaining.begin(), remaining.end()); @@ -296,7 +264,6 @@ Status ConfigurableHelper::ConfigureOptions( return s; } -#ifndef ROCKSDB_LITE /** * Updates the object with the named-value property values, returning OK on * succcess. Any properties that were found are removed from the options list; @@ -480,7 +447,6 @@ Status ConfigurableHelper::ConfigureOption( return Status::NotFound("Could not find option: ", name); } } -#endif // ROCKSDB_LITE //******************************************************************************* // @@ -492,16 +458,10 @@ Status Configurable::GetOptionString(const ConfigOptions& config_options, std::string* result) const { assert(result); result->clear(); -#ifndef ROCKSDB_LITE return ConfigurableHelper::SerializeOptions(config_options, *this, "", result); -#else - (void)config_options; - return Status::NotSupported("GetOptionString not supported in LITE mode"); -#endif // ROCKSDB_LITE } -#ifndef ROCKSDB_LITE std::string Configurable::ToString(const ConfigOptions& config_options, const std::string& prefix) const { std::string result = SerializeOptions(config_options, prefix); @@ -601,14 +561,12 @@ Status ConfigurableHelper::SerializeOptions(const ConfigOptions& config_options, } return Status::OK(); } -#endif // ROCKSDB_LITE //******************************************************************************** // // Methods for listing the options from Configurables // //******************************************************************************** -#ifndef ROCKSDB_LITE Status Configurable::GetOptionNames( const ConfigOptions& config_options, std::unordered_set* result) const { @@ -639,7 +597,6 @@ Status ConfigurableHelper::ListOptions( } return status; } -#endif // ROCKSDB_LITE //******************************************************************************* // @@ -655,18 +612,13 @@ bool Configurable::AreEquivalent(const ConfigOptions& config_options, if (this == other || config_options.IsCheckDisabled()) { return true; } else if (other != nullptr) { -#ifndef ROCKSDB_LITE return ConfigurableHelper::AreEquivalent(config_options, *this, *other, name); -#else - return true; -#endif // ROCKSDB_LITE } else { return false; } } -#ifndef ROCKSDB_LITE bool Configurable::OptionsAreEqual(const ConfigOptions& config_options, const OptionTypeInfo& opt_info, const std::string& opt_name, @@ -722,7 +674,6 @@ bool ConfigurableHelper::AreEquivalent(const ConfigOptions& config_options, } return true; } -#endif // ROCKSDB_LITE Status Configurable::GetOptionsMap( const std::string& value, const std::string& default_id, std::string* id, @@ -734,7 +685,6 @@ Status Configurable::GetOptionsMap( *id = default_id; } else if 
(value.find('=') == std::string::npos) { *id = value; -#ifndef ROCKSDB_LITE } else { status = StringToMap(value, props); if (!status.ok()) { // There was an error creating the map. @@ -756,11 +706,6 @@ Status Configurable::GetOptionsMap( props->clear(); // Clear the properties } } -#else - } else { - *id = value; - props->clear(); -#endif } return status; } diff --git a/options/configurable_helper.h b/options/configurable_helper.h index 0f5f918cb600..5d409f82a45b 100644 --- a/options/configurable_helper.h +++ b/options/configurable_helper.h @@ -46,7 +46,6 @@ class ConfigurableHelper { const std::unordered_map& options, std::unordered_map* unused); -#ifndef ROCKSDB_LITE // Internal method to configure a set of options for this object. // Classes may override this value to change its behavior. // @param config_options Controls how the options are being configured @@ -181,7 +180,6 @@ class ConfigurableHelper { const ConfigOptions& config_options, Configurable& configurable, const OptionTypeInfo& opt_info, const std::string& opt_name, const std::string& name, const std::string& value, void* opt_ptr); -#endif // ROCKSDB_LITE }; } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable_test.cc b/options/configurable_test.cc index 6ec02cf3a48f..a03d8f0a52fd 100644 --- a/options/configurable_test.cc +++ b/options/configurable_test.cc @@ -46,20 +46,16 @@ class StringLogger : public Logger { std::string string_; }; static std::unordered_map struct_option_info = { -#ifndef ROCKSDB_LITE {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, OptionVerificationType::kNormal, OptionTypeFlags::kMutable)}, -#endif // ROCKSDB_LITE }; static std::unordered_map imm_struct_option_info = { -#ifndef ROCKSDB_LITE {"struct", OptionTypeInfo::Struct("struct", &simple_option_info, 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, -#endif // ROCKSDB_LITE }; class SimpleConfigurable : public TestConfigurable { @@ -113,14 +109,12 @@ TEST_F(ConfigurableTest, ConfigureFromMapTest) { auto* opts = configurable->GetOptions("simple"); ASSERT_OK(configurable->ConfigureFromMap(config_options_, {})); ASSERT_NE(opts, nullptr); -#ifndef ROCKSDB_LITE std::unordered_map options_map = { {"int", "1"}, {"bool", "true"}, {"string", "string"}}; ASSERT_OK(configurable->ConfigureFromMap(config_options_, options_map)); ASSERT_EQ(opts->i, 1); ASSERT_EQ(opts->b, true); ASSERT_EQ(opts->s, "string"); -#endif } TEST_F(ConfigurableTest, ConfigureFromStringTest) { @@ -128,16 +122,13 @@ TEST_F(ConfigurableTest, ConfigureFromStringTest) { auto* opts = configurable->GetOptions("simple"); ASSERT_OK(configurable->ConfigureFromString(config_options_, "")); ASSERT_NE(opts, nullptr); -#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE ASSERT_OK(configurable->ConfigureFromString(config_options_, "int=1;bool=true;string=s")); ASSERT_EQ(opts->i, 1); ASSERT_EQ(opts->b, true); ASSERT_EQ(opts->s, "s"); -#endif } -#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE TEST_F(ConfigurableTest, ConfigureIgnoreTest) { std::unique_ptr configurable(SimpleConfigurable::Create()); std::unordered_map options_map = {{"unused", "u"}}; @@ -217,27 +208,21 @@ TEST_F(ConfigurableTest, InvalidOptionTest) { } static std::unordered_map validated_option_info = { -#ifndef ROCKSDB_LITE {"validated", {0, OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; static std::unordered_map prepared_option_info = { -#ifndef ROCKSDB_LITE {"prepared", {0, 
OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, -#endif // ROCKSDB_LITE }; static std::unordered_map dont_prepare_option_info = { -#ifndef ROCKSDB_LITE {"unique", {0, OptionType::kConfigurable, OptionVerificationType::kNormal, (OptionTypeFlags::kUnique | OptionTypeFlags::kDontPrepare)}}, -#endif // ROCKSDB_LITE }; class ValidatedConfigurable : public SimpleConfigurable { @@ -367,11 +352,9 @@ TEST_F(ConfigurableTest, CopyObjectTest) { TEST_F(ConfigurableTest, MutableOptionsTest) { static std::unordered_map imm_option_info = { -#ifndef ROCKSDB_LITE {"imm", OptionTypeInfo::Struct("imm", &simple_option_info, 0, OptionVerificationType::kNormal, OptionTypeFlags::kNone)}, -#endif // ROCKSDB_LITE }; class MutableConfigurable : public SimpleConfigurable { @@ -610,7 +593,6 @@ TEST_F(ConfigurableTest, ConfigurableEnumTest) { ASSERT_NOK(base->ConfigureOption(config_options_, "unknown", "bad")); } -#ifndef ROCKSDB_LITE static std::unordered_map noserialize_option_info = { {"int", @@ -680,7 +662,6 @@ TEST_F(ConfigurableTest, NullOptionMapTest) { ASSERT_OK(copy->ConfigureFromString(config_options_, str)); ASSERT_TRUE(base->AreEquivalent(config_options_, copy.get(), &str)); } -#endif static std::unordered_map TestFactories = { {"Simple", []() { return SimpleConfigurable::Create("simple"); }}, @@ -867,7 +848,6 @@ INSTANTIATE_TEST_CASE_P( std::pair("BlockBased", "block_size=1024;" "no_block_cache=true;"))); -#endif // ROCKSDB_LITE } // namespace test } // namespace ROCKSDB_NAMESPACE diff --git a/options/configurable_test.h b/options/configurable_test.h index cf9d06678373..3d6fe84108b7 100644 --- a/options/configurable_test.h +++ b/options/configurable_test.h @@ -38,7 +38,6 @@ struct TestOptions { }; static std::unordered_map simple_option_info = { -#ifndef ROCKSDB_LITE {"int", {offsetof(struct TestOptions, i), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, @@ -48,37 +47,28 @@ static std::unordered_map simple_option_info = { {"string", {offsetof(struct TestOptions, s), OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; static std::unordered_map enum_option_info = { -#ifndef ROCKSDB_LITE {"enum", OptionTypeInfo::Enum(offsetof(struct TestOptions, e), &test_enum_map)} -#endif }; static std::unordered_map unique_option_info = { -#ifndef ROCKSDB_LITE {"unique", {0, OptionType::kConfigurable, OptionVerificationType::kNormal, (OptionTypeFlags::kUnique | OptionTypeFlags::kMutable)}}, -#endif // ROCKSDB_LITE }; static std::unordered_map shared_option_info = { -#ifndef ROCKSDB_LITE {"shared", {0, OptionType::kConfigurable, OptionVerificationType::kNormal, (OptionTypeFlags::kShared)}}, -#endif // ROCKSDB_LITE }; static std::unordered_map pointer_option_info = { -#ifndef ROCKSDB_LITE {"pointer", {0, OptionType::kConfigurable, OptionVerificationType::kNormal, OptionTypeFlags::kRawPointer}}, -#endif // ROCKSDB_LITE }; enum TestConfigMode { diff --git a/options/customizable.cc b/options/customizable.cc index cd39550e524f..2f154d84c576 100644 --- a/options/customizable.cc +++ b/options/customizable.cc @@ -35,7 +35,6 @@ std::string Customizable::GenerateIndividualId() const { return ostr.str(); } -#ifndef ROCKSDB_LITE Status Customizable::GetOption(const ConfigOptions& config_options, const std::string& opt_name, std::string* value) const { @@ -68,7 +67,6 @@ std::string Customizable::SerializeOptions(const ConfigOptions& config_options, return result; } -#endif // ROCKSDB_LITE bool 
Customizable::AreEquivalent(const ConfigOptions& config_options, const Configurable* other, @@ -102,9 +100,6 @@ Status Customizable::GetOptionsMap( } else if (customizable != nullptr) { status = Configurable::GetOptionsMap(value, customizable->GetId(), id, props); -#ifdef ROCKSDB_LITE - (void)config_options; -#else if (status.ok() && customizable->IsInstanceOf(*id)) { // The new ID and the old ID match, so the objects are the same type. // Try to get the existing options, ignoring any errors @@ -118,7 +113,6 @@ Status Customizable::GetOptionsMap( } } } -#endif // ROCKSDB_LITE } else { status = Configurable::GetOptionsMap(value, "", id, props); } diff --git a/options/customizable_test.cc b/options/customizable_test.cc index 2ed4eeb9e7ec..0e614ed16087 100644 --- a/options/customizable_test.cc +++ b/options/customizable_test.cc @@ -35,7 +35,7 @@ #include "rocksdb/utilities/object_registry.h" #include "rocksdb/utilities/options_type.h" #include "table/block_based/filter_policy_internal.h" -#include "table/block_based/flush_block_policy.h" +#include "table/block_based/flush_block_policy_impl.h" #include "table/mock_table.h" #include "test_util/mock_time_env.h" #include "test_util/testharness.h" @@ -84,7 +84,6 @@ class TestCustomizable : public Customizable { const char* Name() const override { return name_.c_str(); } static const char* Type() { return "test.custom"; } -#ifndef ROCKSDB_LITE static Status CreateFromString(const ConfigOptions& opts, const std::string& value, std::unique_ptr* result); @@ -94,7 +93,6 @@ class TestCustomizable : public Customizable { static Status CreateFromString(const ConfigOptions& opts, const std::string& value, TestCustomizable** result); -#endif // ROCKSDB_LITE bool IsInstanceOf(const std::string& name) const override { if (name == kClassName()) { return true; @@ -114,14 +112,12 @@ struct AOptions { }; static std::unordered_map a_option_info = { -#ifndef ROCKSDB_LITE {"int", {offsetof(struct AOptions, i), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, {"bool", {offsetof(struct AOptions, b), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; class ACustomizable : public TestCustomizable { @@ -144,14 +140,12 @@ struct BOptions { }; static std::unordered_map b_option_info = { -#ifndef ROCKSDB_LITE {"string", {offsetof(struct BOptions, s), OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, {"bool", {offsetof(struct BOptions, b), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; class BCustomizable : public TestCustomizable { @@ -166,20 +160,6 @@ class BCustomizable : public TestCustomizable { BOptions opts_; }; -#ifndef ROCKSDB_LITE -static bool LoadSharedB(const std::string& id, - std::shared_ptr* result) { - if (id == "B") { - result->reset(new BCustomizable(id)); - return true; - } else if (id.empty()) { - result->reset(); - return true; - } else { - return false; - } -} - static int A_count = 0; static int RegisterCustomTestObjects(ObjectLibrary& library, const std::string& /*arg*/) { @@ -191,6 +171,12 @@ static int RegisterCustomTestObjects(ObjectLibrary& library, A_count++; return guard->get(); }); + library.AddFactory( + "B", [](const std::string& name, std::unique_ptr* guard, + std::string* /* msg */) { + guard->reset(new BCustomizable(name)); + return guard->get(); + }); library.AddFactory( "S", [](const std::string& name, @@ -199,7 +185,6 @@ static int 
RegisterCustomTestObjects(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE struct SimpleOptions { static const char* kName() { return "simple"; } @@ -210,7 +195,6 @@ struct SimpleOptions { }; static std::unordered_map simple_option_info = { -#ifndef ROCKSDB_LITE {"bool", {offsetof(struct SimpleOptions, b), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, @@ -226,7 +210,6 @@ static std::unordered_map simple_option_info = { OptionTypeInfo::AsCustomRawPtr( offsetof(struct SimpleOptions, cp), OptionVerificationType::kNormal, OptionTypeFlags::kAllowNull)}, -#endif // ROCKSDB_LITE }; class SimpleConfigurable : public Configurable { @@ -242,7 +225,6 @@ class SimpleConfigurable : public Configurable { } }; -#ifndef ROCKSDB_LITE static void GetMapFromProperties( const std::string& props, std::unordered_map* map) { @@ -258,71 +240,37 @@ static void GetMapFromProperties( (*map)[name] = value; } } -#endif // ROCKSDB_LITE } // namespace -#ifndef ROCKSDB_LITE Status TestCustomizable::CreateFromString( const ConfigOptions& config_options, const std::string& value, std::shared_ptr* result) { - return LoadSharedObject(config_options, value, LoadSharedB, - result); + return LoadSharedObject(config_options, value, result); } Status TestCustomizable::CreateFromString( const ConfigOptions& config_options, const std::string& value, std::unique_ptr* result) { - return LoadUniqueObject( - config_options, value, - [](const std::string& id, std::unique_ptr* u) { - if (id == "B") { - u->reset(new BCustomizable(id)); - return true; - } else if (id.empty()) { - u->reset(); - return true; - } else { - return false; - } - }, - result); + return LoadUniqueObject(config_options, value, result); } Status TestCustomizable::CreateFromString(const ConfigOptions& config_options, const std::string& value, TestCustomizable** result) { - return LoadStaticObject( - config_options, value, - [](const std::string& id, TestCustomizable** ptr) { - if (id == "B") { - *ptr = new BCustomizable(id); - return true; - } else if (id.empty()) { - *ptr = nullptr; - return true; - } else { - return false; - } - }, - result); + return LoadStaticObject(config_options, value, result); } -#endif // ROCKSDB_LITE class CustomizableTest : public testing::Test { public: CustomizableTest() { config_options_.invoke_prepare_options = false; -#ifndef ROCKSDB_LITE - // GetOptionsFromMap is not supported in ROCKSDB_LITE config_options_.registry->AddLibrary("CustomizableTest", RegisterCustomTestObjects, ""); -#endif // ROCKSDB_LITE } ConfigOptions config_options_; }; -#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE // Tests that a Customizable can be created by: // - a simple name // - a XXX.id option @@ -584,11 +532,9 @@ TEST_F(CustomizableTest, IsInstanceOfTest) { TEST_F(CustomizableTest, PrepareOptionsTest) { static std::unordered_map p_option_info = { -#ifndef ROCKSDB_LITE {"can_prepare", {0, OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; class PrepareCustomizable : public TestCustomizable { @@ -665,11 +611,9 @@ TEST_F(CustomizableTest, PrepareOptionsTest) { namespace { static std::unordered_map inner_option_info = { -#ifndef ROCKSDB_LITE {"inner", OptionTypeInfo::AsCustomSharedPtr( 0, OptionVerificationType::kNormal, OptionTypeFlags::kStringNameOnly)} -#endif // ROCKSDB_LITE }; struct InnerOptions { @@ -947,7 +891,6 @@ TEST_F(CustomizableTest, NewEmptyStaticTest) 
{ } namespace { -#ifndef ROCKSDB_LITE static std::unordered_map vector_option_info = { {"vector", OptionTypeInfo::Vector>( @@ -996,7 +939,6 @@ TEST_F(CustomizableTest, NoNameTest) { ASSERT_EQ(copts->cu, nullptr); } -#endif // ROCKSDB_LITE TEST_F(CustomizableTest, IgnoreUnknownObjects) { ConfigOptions ignore = config_options_; @@ -1004,70 +946,34 @@ TEST_F(CustomizableTest, IgnoreUnknownObjects) { std::unique_ptr unique; TestCustomizable* pointer = nullptr; ignore.ignore_unsupported_options = false; - ASSERT_NOK( - LoadSharedObject(ignore, "Unknown", nullptr, &shared)); - ASSERT_NOK( - LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); - ASSERT_NOK( - LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_NOK(LoadSharedObject(ignore, "Unknown", &shared)); + ASSERT_NOK(LoadUniqueObject(ignore, "Unknown", &unique)); + ASSERT_NOK(LoadStaticObject(ignore, "Unknown", &pointer)); ASSERT_EQ(shared.get(), nullptr); ASSERT_EQ(unique.get(), nullptr); ASSERT_EQ(pointer, nullptr); ignore.ignore_unsupported_options = true; - ASSERT_OK( - LoadSharedObject(ignore, "Unknown", nullptr, &shared)); - ASSERT_OK( - LoadUniqueObject(ignore, "Unknown", nullptr, &unique)); - ASSERT_OK( - LoadStaticObject(ignore, "Unknown", nullptr, &pointer)); + ASSERT_OK(LoadSharedObject(ignore, "Unknown", &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "Unknown", &unique)); + ASSERT_OK(LoadStaticObject(ignore, "Unknown", &pointer)); ASSERT_EQ(shared.get(), nullptr); ASSERT_EQ(unique.get(), nullptr); ASSERT_EQ(pointer, nullptr); - ASSERT_OK(LoadSharedObject(ignore, "id=Unknown", nullptr, - &shared)); - ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown", nullptr, - &unique)); - ASSERT_OK(LoadStaticObject(ignore, "id=Unknown", nullptr, - &pointer)); + ASSERT_OK(LoadSharedObject(ignore, "id=Unknown", &shared)); + ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown", &unique)); + ASSERT_OK(LoadStaticObject(ignore, "id=Unknown", &pointer)); ASSERT_EQ(shared.get(), nullptr); ASSERT_EQ(unique.get(), nullptr); ASSERT_EQ(pointer, nullptr); ASSERT_OK(LoadSharedObject(ignore, "id=Unknown;option=bad", - nullptr, &shared)); + &shared)); ASSERT_OK(LoadUniqueObject(ignore, "id=Unknown;option=bad", - nullptr, &unique)); + &unique)); ASSERT_OK(LoadStaticObject(ignore, "id=Unknown;option=bad", - nullptr, &pointer)); - ASSERT_EQ(shared.get(), nullptr); - ASSERT_EQ(unique.get(), nullptr); - ASSERT_EQ(pointer, nullptr); -} - -TEST_F(CustomizableTest, FactoryFunctionTest) { - std::shared_ptr shared; - std::unique_ptr unique; - TestCustomizable* pointer = nullptr; - ConfigOptions ignore = config_options_; - ignore.ignore_unsupported_options = false; - ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &shared)); - ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &unique)); - ASSERT_OK(TestCustomizable::CreateFromString(ignore, "B", &pointer)); - ASSERT_NE(shared.get(), nullptr); - ASSERT_NE(unique.get(), nullptr); - ASSERT_NE(pointer, nullptr); - delete pointer; - pointer = nullptr; - ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &shared)); - ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &unique)); - ASSERT_OK(TestCustomizable::CreateFromString(ignore, "id=", &pointer)); + &pointer)); ASSERT_EQ(shared.get(), nullptr); ASSERT_EQ(unique.get(), nullptr); ASSERT_EQ(pointer, nullptr); - ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &shared)); - ASSERT_NOK(TestCustomizable::CreateFromString(ignore, "option=bad", &unique)); - ASSERT_NOK( - TestCustomizable::CreateFromString(ignore, 
"option=bad", &pointer)); - ASSERT_EQ(pointer, nullptr); } TEST_F(CustomizableTest, URLFactoryTest) { @@ -1317,7 +1223,6 @@ TEST_F(CustomizableTest, CreateManagedObjects) { ASSERT_EQ(mc1, obj); } -#endif // !ROCKSDB_LITE namespace { class TestSecondaryCache : public SecondaryCache { @@ -1325,14 +1230,19 @@ class TestSecondaryCache : public SecondaryCache { static const char* kClassName() { return "Test"; } const char* Name() const override { return kClassName(); } Status Insert(const Slice& /*key*/, Cache::ObjectPtr /*value*/, - const Cache::CacheItemHelper* /*helper*/) override { + const Cache::CacheItemHelper* /*helper*/, + bool /*force_insert*/) override { return Status::NotSupported(); } + Status InsertSaved(const Slice& /*key*/, const Slice& /*saved*/, + CompressionType /*type*/, CacheTier /*source*/) override { + return Status::OK(); + } std::unique_ptr Lookup( const Slice& /*key*/, const Cache::CacheItemHelper* /*helper*/, Cache::CreateContext* /*create_context*/, bool /*wait*/, - bool /*advise_erase*/, bool& is_in_sec_cache) override { - is_in_sec_cache = true; + bool /*advise_erase*/, bool& kept_in_sec_cache) override { + kept_in_sec_cache = true; return nullptr; } @@ -1385,7 +1295,6 @@ class MockMemoryAllocator : public BaseMemoryAllocator { const char* Name() const override { return kClassName(); } }; -#ifndef ROCKSDB_LITE class MockEncryptionProvider : public EncryptionProvider { public: explicit MockEncryptionProvider(const std::string& id) : id_(id) {} @@ -1428,7 +1337,6 @@ class MockCipher : public BlockCipher { Status Encrypt(char* /*data*/) override { return Status::NotSupported(); } Status Decrypt(char* data) override { return Encrypt(data); } }; -#endif // ROCKSDB_LITE class DummyFileSystem : public FileSystemWrapper { public: @@ -1438,9 +1346,7 @@ class DummyFileSystem : public FileSystemWrapper { const char* Name() const override { return kClassName(); } }; -#ifndef ROCKSDB_LITE -#endif // ROCKSDB_LITE class MockTablePropertiesCollectorFactory : public TablePropertiesCollectorFactory { @@ -1489,7 +1395,6 @@ class MockFilterPolicy : public FilterPolicy { } }; -#ifndef ROCKSDB_LITE static int RegisterLocalObjects(ObjectLibrary& library, const std::string& /*arg*/) { size_t num_types; @@ -1615,7 +1520,6 @@ static int RegisterLocalObjects(ObjectLibrary& library, return static_cast(library.GetFactoryCount(&num_types)); } -#endif // !ROCKSDB_LITE } // namespace class LoadCustomizableTest : public testing::Test { @@ -1625,16 +1529,11 @@ class LoadCustomizableTest : public testing::Test { config_options_.invoke_prepare_options = false; } bool RegisterTests(const std::string& arg) { -#ifndef ROCKSDB_LITE config_options_.registry->AddLibrary("custom-tests", test::RegisterTestObjects, arg); config_options_.registry->AddLibrary("local-tests", RegisterLocalObjects, arg); return true; -#else - (void)arg; - return false; -#endif // !ROCKSDB_LITE } template @@ -1676,11 +1575,9 @@ class LoadCustomizableTest : public testing::Test { std::unordered_set factories = expected; Status s = T::CreateFromString(config_options_, mock, object); EXPECT_NOK(s); -#ifndef ROCKSDB_LITE std::vector builtins; ObjectLibrary::Default()->GetFactoryNames(T::Type(), &builtins); factories.insert(builtins.begin(), builtins.end()); -#endif // ROCKSDB_LITE Status result; int created = 0; for (const auto& name : factories) { @@ -1702,7 +1599,6 @@ class LoadCustomizableTest : public testing::Test { EXPECT_TRUE(object->get()->IsInstanceOf(name)); } } -#ifndef ROCKSDB_LITE std::vector plugins; 
ObjectRegistry::Default()->GetFactoryNames(T::Type(), &plugins); if (plugins.size() > builtins.size()) { @@ -1737,10 +1633,6 @@ class LoadCustomizableTest : public testing::Test { T::Type(), created, (int)expected.size(), (int)(factories.size() - expected.size()), (int)(plugins.size() - builtins.size()), (int)failed->size()); -#else - printf("%s: Created %d (expected %d) %d Failed\n", T::Type(), created, - (int)expected.size(), (int)failed->size()); -#endif // ROCKSDB_LITE return result; } @@ -1771,11 +1663,9 @@ class LoadCustomizableTest : public testing::Test { std::unordered_set factories = expected; Status s = TestCreateStatic(mock, object, delete_objects); EXPECT_NOK(s); -#ifndef ROCKSDB_LITE std::vector builtins; ObjectLibrary::Default()->GetFactoryNames(T::Type(), &builtins); factories.insert(builtins.begin(), builtins.end()); -#endif // ROCKSDB_LITE int created = 0; Status result; for (const auto& name : factories) { @@ -1786,7 +1676,6 @@ class LoadCustomizableTest : public testing::Test { failed->push_back(name); } } -#ifndef ROCKSDB_LITE std::vector plugins; ObjectRegistry::Default()->GetFactoryNames(T::Type(), &plugins); if (plugins.size() > builtins.size()) { @@ -1814,10 +1703,6 @@ class LoadCustomizableTest : public testing::Test { T::Type(), created, (int)expected.size(), (int)(factories.size() - expected.size()), (int)(plugins.size() - builtins.size()), (int)failed->size()); -#else - printf("%s: Created %d (expected %d) %d Failed\n", T::Type(), created, - (int)expected.size(), (int)failed->size()); -#endif // ROCKSDB_LITE return result; } @@ -1831,7 +1716,6 @@ TEST_F(LoadCustomizableTest, LoadTableFactoryTest) { ASSERT_OK( TestSharedBuiltins(mock::MockTableFactory::kClassName(), TableFactory::kBlockBasedTableName())); -#ifndef ROCKSDB_LITE std::string opts_str = "table_factory="; ASSERT_OK(GetColumnFamilyOptionsFromString( config_options_, cf_opts_, @@ -1839,17 +1723,14 @@ TEST_F(LoadCustomizableTest, LoadTableFactoryTest) { ASSERT_NE(cf_opts_.table_factory.get(), nullptr); ASSERT_STREQ(cf_opts_.table_factory->Name(), TableFactory::kBlockBasedTableName()); -#endif // ROCKSDB_LITE if (RegisterTests("Test")) { ExpectCreateShared(mock::MockTableFactory::kClassName()); -#ifndef ROCKSDB_LITE ASSERT_OK(GetColumnFamilyOptionsFromString( config_options_, cf_opts_, opts_str + mock::MockTableFactory::kClassName(), &cf_opts_)); ASSERT_NE(cf_opts_.table_factory.get(), nullptr); ASSERT_STREQ(cf_opts_.table_factory->Name(), mock::MockTableFactory::kClassName()); -#endif // ROCKSDB_LITE } } @@ -1870,7 +1751,6 @@ TEST_F(LoadCustomizableTest, LoadSecondaryCacheTest) { } } -#ifndef ROCKSDB_LITE TEST_F(LoadCustomizableTest, LoadSstPartitionerFactoryTest) { ASSERT_OK(TestSharedBuiltins( "Mock", SstPartitionerFixedPrefixFactory::kClassName())); @@ -1878,7 +1758,6 @@ TEST_F(LoadCustomizableTest, LoadSstPartitionerFactoryTest) { ExpectCreateShared("Mock"); } } -#endif // ROCKSDB_LITE TEST_F(LoadCustomizableTest, LoadChecksumGenFactoryTest) { ASSERT_OK(TestSharedBuiltins("Mock", "")); @@ -1945,7 +1824,6 @@ TEST_F(LoadCustomizableTest, LoadStatisticsTest) { ASSERT_NE(db_opts_.statistics, nullptr); ASSERT_STREQ(db_opts_.statistics->Name(), "BasicStatistics"); -#ifndef ROCKSDB_LITE ASSERT_NOK(GetDBOptionsFromString(config_options_, db_opts_, "statistics=Test", &db_opts_)); ASSERT_OK(GetDBOptionsFromString(config_options_, db_opts_, @@ -1981,7 +1859,6 @@ TEST_F(LoadCustomizableTest, LoadStatisticsTest) { ASSERT_NE(inner->get(), nullptr); ASSERT_STREQ(inner->get()->Name(), TestStatistics::kClassName()); 
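// The loading helpers exercised above no longer take an explicit factory function;
// creation is driven entirely by factories registered with the ObjectLibrary /
// ObjectRegistry. A hedged sketch of that pattern; MyCustomizable, the "mine" id,
// and the placement of LoadSharedObject are assumptions, not part of this patch.
#include <memory>
#include <string>

#include "rocksdb/customizable.h"
#include "rocksdb/utilities/object_registry.h"

class MyCustomizable : public ROCKSDB_NAMESPACE::Customizable {
 public:
  explicit MyCustomizable(const std::string& id) : id_(id) {}
  static const char* Type() { return "MyCustomizable"; }
  const char* Name() const override { return id_.c_str(); }

 private:
  std::string id_;
};

static int RegisterMine(ROCKSDB_NAMESPACE::ObjectLibrary& library,
                        const std::string& /*arg*/) {
  library.AddFactory<MyCustomizable>(
      "mine",
      [](const std::string& name, std::unique_ptr<MyCustomizable>* guard,
         std::string* /*errmsg*/) {
        guard->reset(new MyCustomizable(name));
        return guard->get();
      });
  return 1;
}

// With the library registered (config_options.registry->AddLibrary("mine-lib",
// RegisterMine, "")), objects can then be resolved by id alone, e.g.:
//   std::shared_ptr<MyCustomizable> obj;
//   Status s = LoadSharedObject<MyCustomizable>(config_options, "mine", &obj);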
} -#endif } TEST_F(LoadCustomizableTest, LoadMemTableRepFactoryTest) { @@ -2011,7 +1888,6 @@ TEST_F(LoadCustomizableTest, LoadMergeOperatorTest) { "put", "put_v1", "PutOperator", "uint64add", "UInt64AddOperator", "max", "MaxOperator", }; -#ifndef ROCKSDB_LITE expected.insert({ StringAppendOperator::kClassName(), StringAppendOperator::kNickName(), @@ -2022,7 +1898,6 @@ TEST_F(LoadCustomizableTest, LoadMergeOperatorTest) { BytesXOROperator::kClassName(), BytesXOROperator::kNickName(), }); -#endif // ROCKSDB_LITE ASSERT_OK(TestExpectedBuiltins("Changling", expected, &result, &failed)); @@ -2048,7 +1923,6 @@ TEST_F(LoadCustomizableTest, LoadCompactionFilterTest) { } } -#ifndef ROCKSDB_LITE TEST_F(LoadCustomizableTest, LoadEventListenerTest) { ASSERT_OK(TestSharedBuiltins( OnFileDeletionListener::kClassName(), "")); @@ -2092,7 +1966,6 @@ TEST_F(LoadCustomizableTest, LoadEncryptionCipherTest) { ExpectCreateShared("Mock"); } } -#endif // !ROCKSDB_LITE TEST_F(LoadCustomizableTest, LoadSystemClockTest) { ASSERT_OK(TestSharedBuiltins(MockSystemClock::kClassName(), @@ -2138,20 +2011,17 @@ TEST_F(LoadCustomizableTest, LoadFilterPolicyTest) { ReadOnlyBuiltinFilterPolicy::kClassName(), }; -#ifndef ROCKSDB_LITE expected.insert({ kAutoBloom, BloomFilterPolicy::kNickName(), kAutoRibbon, RibbonFilterPolicy::kNickName(), }); -#endif // ROCKSDB_LITE ASSERT_OK(TestExpectedBuiltins( "Mock", expected, &result, &failures, [](const std::string& name) { std::vector names = {name + ":1.234"}; return names; })); -#ifndef ROCKSDB_LITE ASSERT_OK(FilterPolicy::CreateFromString( config_options_, kAutoBloom + ":1.234:false", &result)); ASSERT_NE(result.get(), nullptr); @@ -2168,7 +2038,6 @@ TEST_F(LoadCustomizableTest, LoadFilterPolicyTest) { kAutoRibbon + ":1.234:56", &result)); ASSERT_NE(result.get(), nullptr); ASSERT_TRUE(result->IsInstanceOf(kAutoRibbon)); -#endif // ROCKSDB_LITE if (RegisterTests("Test")) { ExpectCreateShared(MockFilterPolicy::kClassName(), &result); @@ -2176,7 +2045,6 @@ TEST_F(LoadCustomizableTest, LoadFilterPolicyTest) { std::shared_ptr table; -#ifndef ROCKSDB_LITE std::string table_opts = "id=BlockBasedTable; filter_policy="; ASSERT_OK(TableFactory::CreateFromString(config_options_, table_opts + "nullptr", &table)); @@ -2199,7 +2067,6 @@ TEST_F(LoadCustomizableTest, LoadFilterPolicyTest) { ASSERT_NE(bbto->filter_policy.get(), nullptr); ASSERT_TRUE( bbto->filter_policy->IsInstanceOf(MockFilterPolicy::kClassName())); -#endif // ROCKSDB_LITE } TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) { @@ -2220,7 +2087,6 @@ TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) { ASSERT_NE(result, nullptr); ASSERT_STREQ(result->Name(), FlushBlockBySizePolicyFactory::kClassName()); -#ifndef ROCKSDB_LITE std::string table_opts = "id=BlockBasedTable; flush_block_policy_factory="; ASSERT_OK(TableFactory::CreateFromString( config_options_, @@ -2242,7 +2108,6 @@ TEST_F(LoadCustomizableTest, LoadFlushBlockPolicyFactoryTest) { ASSERT_STREQ(bbto->flush_block_policy_factory->Name(), TestFlushBlockPolicyFactory::kClassName()); } -#endif // ROCKSDB_LITE } } // namespace ROCKSDB_NAMESPACE diff --git a/options/db_options.cc b/options/db_options.cc index 453f199cd454..bd18416b5bc8 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -12,6 +12,7 @@ #include "options/options_helper.h" #include "options/options_parser.h" #include "port/port.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/configurable.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" @@ -25,7 +26,6 @@ 
#include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE static std::unordered_map wal_recovery_mode_string_map = { {"kTolerateCorruptedTailRecords", @@ -129,6 +129,10 @@ static std::unordered_map {offsetof(struct MutableDBOptions, max_background_flushes), OptionType::kInt, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, + {"daily_offpeak_time_utc", + {offsetof(struct MutableDBOptions, daily_offpeak_time_utc), + OptionType::kString, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, }; static std::unordered_map @@ -222,6 +226,10 @@ static std::unordered_map {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"compaction_verify_record_count", + {offsetof(struct ImmutableDBOptions, compaction_verify_record_count), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"track_and_verify_wals_in_manifest", {offsetof(struct ImmutableDBOptions, track_and_verify_wals_in_manifest), @@ -683,7 +691,6 @@ std::unique_ptr DBOptionsAsConfigurable( std::unique_ptr ptr(new DBOptionsConfigurable(opts, opt_map)); return ptr; } -#endif // ROCKSDB_LITE ImmutableDBOptions::ImmutableDBOptions() : ImmutableDBOptions(Options()) {} @@ -694,6 +701,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) error_if_exists(options.error_if_exists), paranoid_checks(options.paranoid_checks), flush_verify_memtable_count(options.flush_verify_memtable_count), + compaction_verify_record_count(options.compaction_verify_record_count), track_and_verify_wals_in_manifest( options.track_and_verify_wals_in_manifest), verify_sst_unique_id_in_manifest( @@ -748,9 +756,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) wal_recovery_mode(options.wal_recovery_mode), allow_2pc(options.allow_2pc), row_cache(options.row_cache), -#ifndef ROCKSDB_LITE wal_filter(options.wal_filter), -#endif // ROCKSDB_LITE fail_if_options_file_error(options.fail_if_options_file_error), use_options_file(options.use_options_file), dump_malloc_stats(options.dump_malloc_stats), @@ -791,6 +797,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { paranoid_checks); ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", flush_verify_memtable_count); + ROCKS_LOG_HEADER(log, " Options.compaction_verify_record_count: %d", + compaction_verify_record_count); ROCKS_LOG_HEADER(log, " " "Options.track_and_verify_wals_in_manifest: %d", @@ -807,6 +815,11 @@ void ImmutableDBOptions::Dump(Logger* log) const { max_file_opening_threads); ROCKS_LOG_HEADER(log, " Options.statistics: %p", stats); + if (stats) { + ROCKS_LOG_HEADER( + log, " Options.statistics stats level: %u", + stats->get_stats_level()); + } ROCKS_LOG_HEADER(log, " Options.use_fsync: %d", use_fsync); ROCKS_LOG_HEADER( @@ -905,10 +918,8 @@ void ImmutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.row_cache: None"); } -#ifndef ROCKSDB_LITE ROCKS_LOG_HEADER(log, " Options.wal_filter: %s", wal_filter ? 
wal_filter->Name() : "None"); -#endif // ROCKDB_LITE ROCKS_LOG_HEADER(log, " Options.avoid_flush_during_recovery: %d", avoid_flush_during_recovery); @@ -1005,7 +1016,8 @@ MutableDBOptions::MutableDBOptions() wal_bytes_per_sync(0), strict_bytes_per_sync(false), compaction_readahead_size(0), - max_background_flushes(-1) {} + max_background_flushes(-1), + daily_offpeak_time_utc("") {} MutableDBOptions::MutableDBOptions(const DBOptions& options) : max_background_jobs(options.max_background_jobs), @@ -1025,7 +1037,8 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options) wal_bytes_per_sync(options.wal_bytes_per_sync), strict_bytes_per_sync(options.strict_bytes_per_sync), compaction_readahead_size(options.compaction_readahead_size), - max_background_flushes(options.max_background_flushes) {} + max_background_flushes(options.max_background_flushes), + daily_offpeak_time_utc(options.daily_offpeak_time_utc) {} void MutableDBOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.max_background_jobs: %d", @@ -1070,9 +1083,10 @@ void MutableDBOptions::Dump(Logger* log) const { compaction_readahead_size); ROCKS_LOG_HEADER(log, " Options.max_background_flushes: %d", max_background_flushes); + ROCKS_LOG_HEADER(log, "Options.daily_offpeak_time_utc: %s", + daily_offpeak_time_utc.c_str()); } -#ifndef ROCKSDB_LITE Status GetMutableDBOptionsFromStrings( const MutableDBOptions& base_options, const std::unordered_map& options_map, @@ -1103,5 +1117,4 @@ Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, return OptionTypeInfo::SerializeType( config_options, db_mutable_options_type_info, &mutable_opts, opt_string); } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/db_options.h b/options/db_options.h index 9f6e23a6d4fd..8e323f192c9c 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -26,6 +26,7 @@ struct ImmutableDBOptions { bool error_if_exists; bool paranoid_checks; bool flush_verify_memtable_count; + bool compaction_verify_record_count; bool track_and_verify_wals_in_manifest; bool verify_sst_unique_id_in_manifest; Env* env; @@ -77,9 +78,7 @@ struct ImmutableDBOptions { WALRecoveryMode wal_recovery_mode; bool allow_2pc; std::shared_ptr row_cache; -#ifndef ROCKSDB_LITE WalFilter* wal_filter; -#endif // ROCKSDB_LITE bool fail_if_options_file_error; bool use_options_file; bool dump_malloc_stats; @@ -141,9 +140,9 @@ struct MutableDBOptions { bool strict_bytes_per_sync; size_t compaction_readahead_size; int max_background_flushes; + std::string daily_offpeak_time_utc; }; -#ifndef ROCKSDB_LITE Status GetStringFromMutableDBOptions(const ConfigOptions& config_options, const MutableDBOptions& mutable_opts, std::string* opt_string); @@ -155,6 +154,5 @@ Status GetMutableDBOptionsFromStrings( bool MutableDBOptionsAreEqual(const MutableDBOptions& this_options, const MutableDBOptions& that_options); -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/offpeak_time_info.cc b/options/offpeak_time_info.cc new file mode 100644 index 000000000000..4eaeb6e27c29 --- /dev/null +++ b/options/offpeak_time_info.cc @@ -0,0 +1,59 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
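// The db_options.* hunks above introduce two new DBOptions fields: the immutable
// compaction_verify_record_count flag and the mutable daily_offpeak_time_utc
// string ("HH:MM-HH:MM" in UTC, matching the settable test later in this patch).
// A minimal sketch; the concrete values are illustrative.
#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

ROCKSDB_NAMESPACE::DBOptions OffpeakDbOptionsExample() {
  ROCKSDB_NAMESPACE::DBOptions db_opts;
  db_opts.compaction_verify_record_count = true;   // verify compaction output record counts
  db_opts.daily_offpeak_time_utc = "23:30-04:30";  // offpeak window spanning midnight

  // The same mutable option through the string interface:
  ROCKSDB_NAMESPACE::ConfigOptions cfg;
  ROCKSDB_NAMESPACE::DBOptions parsed;
  ROCKSDB_NAMESPACE::Status s = ROCKSDB_NAMESPACE::GetDBOptionsFromString(
      cfg, db_opts, "daily_offpeak_time_utc=08:30-19:00", &parsed);
  assert(s.ok());
  return parsed;
}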
+ +#include "options/offpeak_time_info.h" + +#include "rocksdb/system_clock.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { +OffpeakTimeOption::OffpeakTimeOption() : OffpeakTimeOption("") {} +OffpeakTimeOption::OffpeakTimeOption(const std::string& offpeak_time_string) { + SetFromOffpeakTimeString(offpeak_time_string); +} + +void OffpeakTimeOption::SetFromOffpeakTimeString( + const std::string& offpeak_time_string) { + const int old_start_time = daily_offpeak_start_time_utc; + const int old_end_time = daily_offpeak_end_time_utc; + if (TryParseTimeRangeString(offpeak_time_string, daily_offpeak_start_time_utc, + daily_offpeak_end_time_utc)) { + daily_offpeak_time_utc = offpeak_time_string; + } else { + daily_offpeak_start_time_utc = old_start_time; + daily_offpeak_end_time_utc = old_end_time; + } +} + +OffpeakTimeInfo OffpeakTimeOption::GetOffpeakTimeInfo( + const int64_t& current_time) const { + OffpeakTimeInfo offpeak_time_info; + if (daily_offpeak_start_time_utc == daily_offpeak_end_time_utc) { + return offpeak_time_info; + } + int seconds_since_midnight = static_cast(current_time % kSecondsPerDay); + int seconds_since_midnight_to_nearest_minute = + (seconds_since_midnight / kSecondsPerMinute) * kSecondsPerMinute; + // if the offpeak duration spans overnight (i.e. 23:30 - 4:30 next day) + if (daily_offpeak_start_time_utc > daily_offpeak_end_time_utc) { + offpeak_time_info.is_now_offpeak = + daily_offpeak_start_time_utc <= + seconds_since_midnight_to_nearest_minute || + seconds_since_midnight_to_nearest_minute <= daily_offpeak_end_time_utc; + } else { + offpeak_time_info.is_now_offpeak = + daily_offpeak_start_time_utc <= + seconds_since_midnight_to_nearest_minute && + seconds_since_midnight_to_nearest_minute <= daily_offpeak_end_time_utc; + } + offpeak_time_info.seconds_till_next_offpeak_start = + seconds_since_midnight < daily_offpeak_start_time_utc + ? daily_offpeak_start_time_utc - seconds_since_midnight + : ((daily_offpeak_start_time_utc + kSecondsPerDay) - + seconds_since_midnight); + return offpeak_time_info; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/options/offpeak_time_info.h b/options/offpeak_time_info.h new file mode 100644 index 000000000000..75d61abb49be --- /dev/null +++ b/options/offpeak_time_info.h @@ -0,0 +1,36 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
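// A small usage sketch for OffpeakTimeOption as implemented above (its declaration
// follows in offpeak_time_info.h); `now_unix_seconds` would normally come from
// SystemClock::GetCurrentTime(), and the window string here is illustrative.
#include <cstdint>

#include "options/offpeak_time_info.h"

bool IsOffpeakNow(int64_t now_unix_seconds) {
  ROCKSDB_NAMESPACE::OffpeakTimeOption offpeak("23:30-04:30");  // spans midnight UTC
  ROCKSDB_NAMESPACE::OffpeakTimeInfo info =
      offpeak.GetOffpeakTimeInfo(now_unix_seconds);
  // info.seconds_till_next_offpeak_start reports how long until the window reopens.
  return info.is_now_offpeak;
}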
+ +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { +class SystemClock; + +struct OffpeakTimeInfo { + bool is_now_offpeak = false; + int seconds_till_next_offpeak_start = 0; +}; + +struct OffpeakTimeOption { + static constexpr int kSecondsPerDay = 86400; + static constexpr int kSecondsPerHour = 3600; + static constexpr int kSecondsPerMinute = 60; + + OffpeakTimeOption(); + explicit OffpeakTimeOption(const std::string& offpeak_time_string); + std::string daily_offpeak_time_utc = ""; + int daily_offpeak_start_time_utc = 0; + int daily_offpeak_end_time_utc = 0; + + void SetFromOffpeakTimeString(const std::string& offpeak_time_string); + + OffpeakTimeInfo GetOffpeakTimeInfo(const int64_t& current_time) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/options/options.cc b/options/options.cc index c84a5a88c35e..2615018680a7 100644 --- a/options/options.cc +++ b/options/options.cc @@ -13,7 +13,7 @@ #include #include "logging/logging.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "options/db_options.h" #include "options/options_helper.h" #include "rocksdb/cache.h" @@ -94,6 +94,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) ttl(options.ttl), periodic_compaction_seconds(options.periodic_compaction_seconds), sample_for_compression(options.sample_for_compression), + default_temperature(options.default_temperature), preclude_last_level_data_seconds( options.preclude_last_level_data_seconds), preserve_internal_time_seconds(options.preserve_internal_time_seconds), @@ -109,7 +110,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) blob_compaction_readahead_size(options.blob_compaction_readahead_size), blob_file_starting_level(options.blob_file_starting_level), blob_cache(options.blob_cache), - prepopulate_blob_cache(options.prepopulate_blob_cache) { + prepopulate_blob_cache(options.prepopulate_blob_cache), + persist_user_defined_timestamps(options.persist_user_defined_timestamps) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -137,6 +139,11 @@ void DBOptions::Dump(Logger* log) const { void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.comparator: %s", comparator->Name()); + if (comparator->timestamp_size() > 0) { + ROCKS_LOG_HEADER( + log, " Options.persist_user_defined_timestamps: %s", + persist_user_defined_timestamps ? "true" : "false"); + } ROCKS_LOG_HEADER(log, " Options.merge_operator: %s", merge_operator ? merge_operator->Name() : "None"); ROCKS_LOG_HEADER(log, " Options.compaction_filter: %s", @@ -406,6 +413,17 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER(log, " Options.periodic_compaction_seconds: %" PRIu64, periodic_compaction_seconds); + const auto& it_temp = temperature_to_string.find(default_temperature); + std::string str_default_temperature; + if (it_temp == temperature_to_string.end()) { + assert(false); + str_default_temperature = "unknown_temperature"; + } else { + str_default_temperature = it_temp->second; + } + ROCKS_LOG_HEADER(log, + " Options.default_temperature: %s", + str_default_temperature.c_str()); ROCKS_LOG_HEADER(log, " Options.preclude_last_level_data_seconds: %" PRIu64, preclude_last_level_data_seconds); ROCKS_LOG_HEADER(log, " Options.preserve_internal_time_seconds: %" PRIu64, @@ -446,8 +464,10 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ? 
"flush only" : "disabled"); } - ROCKS_LOG_HEADER(log, "Options.experimental_mempurge_threshold: %f", + ROCKS_LOG_HEADER(log, " Options.experimental_mempurge_threshold: %f", experimental_mempurge_threshold); + ROCKS_LOG_HEADER(log, " Options.memtable_max_range_deletions: %d", + memtable_max_range_deletions); } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { @@ -610,7 +630,6 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForSmallDb( return this; } -#ifndef ROCKSDB_LITE ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( uint64_t block_cache_size_mb) { BlockBasedTableOptions block_based_options; @@ -682,60 +701,10 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) { return this; } -#endif // !ROCKSDB_LITE - -ReadOptions::ReadOptions() - : snapshot(nullptr), - iterate_lower_bound(nullptr), - iterate_upper_bound(nullptr), - readahead_size(0), - max_skippable_internal_keys(0), - read_tier(kReadAllTier), - verify_checksums(true), - fill_cache(true), - tailing(false), - managed(false), - total_order_seek(false), - auto_prefix_mode(false), - prefix_same_as_start(false), - pin_data(false), - background_purge_on_iterator_cleanup(false), - ignore_range_deletions(false), - optimize_for_hits(false), - timestamp(nullptr), - iter_start_ts(nullptr), - deadline(std::chrono::microseconds::zero()), - io_timeout(std::chrono::microseconds::zero()), - value_size_soft_limit(std::numeric_limits::max()), - adaptive_readahead(false), - async_io(false), - optimize_multiget_for_io(true) {} - -ReadOptions::ReadOptions(bool cksum, bool cache) - : snapshot(nullptr), - iterate_lower_bound(nullptr), - iterate_upper_bound(nullptr), - readahead_size(0), - max_skippable_internal_keys(0), - read_tier(kReadAllTier), - verify_checksums(cksum), - fill_cache(cache), - tailing(false), - managed(false), - total_order_seek(false), - auto_prefix_mode(false), - prefix_same_as_start(false), - pin_data(false), - background_purge_on_iterator_cleanup(false), - ignore_range_deletions(false), - optimize_for_hits(false), - timestamp(nullptr), - iter_start_ts(nullptr), - deadline(std::chrono::microseconds::zero()), - io_timeout(std::chrono::microseconds::zero()), - value_size_soft_limit(std::numeric_limits::max()), - adaptive_readahead(false), - async_io(false), - optimize_multiget_for_io(true) {} +ReadOptions::ReadOptions(bool _verify_checksums, bool _fill_cache) + : verify_checksums(_verify_checksums), fill_cache(_fill_cache) {} + +ReadOptions::ReadOptions(Env::IOActivity _io_activity) + : io_activity(_io_activity) {} } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_helper.cc b/options/options_helper.cc index 9cdd6bdd83e8..8932c5c8dade 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -4,6 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
#include "options/options_helper.h" +#include #include #include #include @@ -30,30 +31,22 @@ namespace ROCKSDB_NAMESPACE { ConfigOptions::ConfigOptions() -#ifndef ROCKSDB_LITE : registry(ObjectRegistry::NewInstance()) -#endif { env = Env::Default(); } ConfigOptions::ConfigOptions(const DBOptions& db_opts) : env(db_opts.env) { -#ifndef ROCKSDB_LITE registry = ObjectRegistry::NewInstance(); -#endif } Status ValidateOptions(const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) { Status s; -#ifndef ROCKSDB_LITE auto db_cfg = DBOptionsAsConfigurable(db_opts); auto cf_cfg = CFOptionsAsConfigurable(cf_opts); s = db_cfg->ValidateOptions(db_opts, cf_opts); if (s.ok()) s = cf_cfg->ValidateOptions(db_opts, cf_opts); -#else - s = cf_opts.table_factory->ValidateOptions(db_opts, cf_opts); -#endif return s; } @@ -68,6 +61,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.paranoid_checks = immutable_db_options.paranoid_checks; options.flush_verify_memtable_count = immutable_db_options.flush_verify_memtable_count; + options.compaction_verify_record_count = + immutable_db_options.compaction_verify_record_count; options.track_and_verify_wals_in_manifest = immutable_db_options.track_and_verify_wals_in_manifest; options.verify_sst_unique_id_in_manifest = @@ -154,9 +149,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.wal_recovery_mode = immutable_db_options.wal_recovery_mode; options.allow_2pc = immutable_db_options.allow_2pc; options.row_cache = immutable_db_options.row_cache; -#ifndef ROCKSDB_LITE options.wal_filter = immutable_db_options.wal_filter; -#endif // ROCKSDB_LITE options.fail_if_options_file_error = immutable_db_options.fail_if_options_file_error; options.use_options_file = immutable_db_options.use_options_file; @@ -190,6 +183,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, immutable_db_options.enforce_single_del_contracts; options.disable_delete_obsolete_files_on_open = immutable_db_options.disable_delete_obsolete_files_on_open; + options.daily_offpeak_time_utc = mutable_db_options.daily_offpeak_time_utc; return options; } @@ -222,6 +216,10 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, moptions.experimental_mempurge_threshold; cf_opts->memtable_protection_bytes_per_key = moptions.memtable_protection_bytes_per_key; + cf_opts->block_protection_bytes_per_key = + moptions.block_protection_bytes_per_key; + cf_opts->bottommost_file_compaction_delay = + moptions.bottommost_file_compaction_delay; // Compaction related options cf_opts->disable_auto_compactions = moptions.disable_auto_compactions; @@ -284,6 +282,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->compression_per_level = moptions.compression_per_level; cf_opts->last_level_temperature = moptions.last_level_temperature; cf_opts->bottommost_temperature = moptions.last_level_temperature; + cf_opts->memtable_max_range_deletions = moptions.memtable_max_range_deletions; } void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, @@ -324,6 +323,9 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, ioptions.preclude_last_level_data_seconds; cf_opts->preserve_internal_time_seconds = ioptions.preserve_internal_time_seconds; + cf_opts->persist_user_defined_timestamps = + ioptions.persist_user_defined_timestamps; + cf_opts->default_temperature = ioptions.default_temperature; // TODO(yhchiang): find some way to handle the following derived options // * max_file_size @@ 
-408,7 +410,6 @@ std::vector GetSupportedChecksums() { checksum_types.end()); } -#ifndef ROCKSDB_LITE static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, const std::string& value) { switch (opt_type) { @@ -439,6 +440,10 @@ static bool ParseOptionHelper(void* opt_address, const OptionType& opt_type, case OptionType::kSizeT: PutUnaligned(static_cast(opt_address), ParseSizeT(value)); break; + case OptionType::kAtomicInt: + static_cast*>(opt_address) + ->store(ParseInt(value), std::memory_order_release); + break; case OptionType::kString: *static_cast(opt_address) = value; break; @@ -528,6 +533,10 @@ bool SerializeSingleOptionHelper(const void* opt_address, case OptionType::kDouble: *value = std::to_string(*(static_cast(opt_address))); break; + case OptionType::kAtomicInt: + *value = std::to_string(static_cast*>(opt_address) + ->load(std::memory_order_acquire)); + break; case OptionType::kString: *value = EscapeOptionString(*(static_cast(opt_address))); @@ -675,18 +684,6 @@ Status GetStringFromCompressionType(std::string* compression_str, } } -Status GetColumnFamilyOptionsFromMap( - const ColumnFamilyOptions& base_options, - const std::unordered_map& opts_map, - ColumnFamilyOptions* new_options, bool input_strings_escaped, - bool ignore_unknown_options) { - ConfigOptions config_options; - config_options.ignore_unknown_options = ignore_unknown_options; - config_options.input_strings_escaped = input_strings_escaped; - return GetColumnFamilyOptionsFromMap(config_options, base_options, opts_map, - new_options); -} - Status GetColumnFamilyOptionsFromMap( const ConfigOptions& config_options, const ColumnFamilyOptions& base_options, @@ -708,17 +705,6 @@ Status GetColumnFamilyOptionsFromMap( } } -Status GetColumnFamilyOptionsFromString( - const ColumnFamilyOptions& base_options, - const std::string& opts_str, - ColumnFamilyOptions* new_options) { - ConfigOptions config_options; - config_options.input_strings_escaped = false; - config_options.ignore_unknown_options = false; - return GetColumnFamilyOptionsFromString(config_options, base_options, - opts_str, new_options); -} - Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, const ColumnFamilyOptions& base_options, const std::string& opts_str, @@ -733,18 +719,6 @@ Status GetColumnFamilyOptionsFromString(const ConfigOptions& config_options, new_options); } -Status GetDBOptionsFromMap( - const DBOptions& base_options, - const std::unordered_map& opts_map, - DBOptions* new_options, bool input_strings_escaped, - bool ignore_unknown_options) { - ConfigOptions config_options(base_options); - config_options.input_strings_escaped = input_strings_escaped; - config_options.ignore_unknown_options = ignore_unknown_options; - return GetDBOptionsFromMap(config_options, base_options, opts_map, - new_options); -} - Status GetDBOptionsFromMap( const ConfigOptions& config_options, const DBOptions& base_options, const std::unordered_map& opts_map, @@ -763,17 +737,6 @@ Status GetDBOptionsFromMap( } } -Status GetDBOptionsFromString(const DBOptions& base_options, - const std::string& opts_str, - DBOptions* new_options) { - ConfigOptions config_options(base_options); - config_options.input_strings_escaped = false; - config_options.ignore_unknown_options = false; - - return GetDBOptionsFromString(config_options, base_options, opts_str, - new_options); -} - Status GetDBOptionsFromString(const ConfigOptions& config_options, const DBOptions& base_options, const std::string& opts_str, @@ -1222,6 +1185,8 @@ static bool 
AreOptionsEqual(OptionType type, const void* this_offset, GetUnaligned(static_cast(that_offset), &v2); return (v1 == v2); } + case OptionType::kAtomicInt: + return IsOptionEqual>(this_offset, that_offset); case OptionType::kString: return IsOptionEqual(this_offset, that_offset); case OptionType::kDouble: @@ -1479,6 +1444,5 @@ const OptionTypeInfo* OptionTypeInfo::Find( } return nullptr; } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_helper.h b/options/options_helper.h index 7c751fc25260..76e312a63cf5 100644 --- a/options/options_helper.h +++ b/options/options_helper.h @@ -54,7 +54,6 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, ColumnFamilyOptions* cf_opts); -#ifndef ROCKSDB_LITE std::unique_ptr DBOptionsAsConfigurable( const MutableDBOptions& opts); std::unique_ptr DBOptionsAsConfigurable( @@ -69,7 +68,6 @@ std::unique_ptr CFOptionsAsConfigurable( extern Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); -#endif // !ROCKSDB_LITE struct OptionsHelper { static const std::string kCFOptionsName /*= "ColumnFamilyOptions"*/; @@ -84,7 +82,6 @@ struct OptionsHelper { compression_type_string_map; static std::unordered_map prepopulate_blob_cache_string_map; -#ifndef ROCKSDB_LITE static std::unordered_map compaction_stop_style_string_map; static std::unordered_map encoding_type_string_map; @@ -93,7 +90,6 @@ struct OptionsHelper { static std::unordered_map compaction_pri_string_map; static std::unordered_map temperature_string_map; -#endif // !ROCKSDB_LITE }; // Some aliasing @@ -104,7 +100,6 @@ static auto& compaction_stop_style_to_string = OptionsHelper::compaction_stop_style_to_string; static auto& temperature_to_string = OptionsHelper::temperature_to_string; static auto& checksum_type_string_map = OptionsHelper::checksum_type_string_map; -#ifndef ROCKSDB_LITE static auto& compaction_stop_style_string_map = OptionsHelper::compaction_stop_style_string_map; static auto& compression_type_string_map = @@ -117,6 +112,5 @@ static auto& compaction_pri_string_map = static auto& temperature_string_map = OptionsHelper::temperature_string_map; static auto& prepopulate_blob_cache_string_map = OptionsHelper::prepopulate_blob_cache_string_map; -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_parser.cc b/options/options_parser.cc index 562a7b214b13..a8c855d6e22a 100644 --- a/options/options_parser.cc +++ b/options/options_parser.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
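// The kAtomicInt handling added above (parse, serialize, equality) means an options
// struct can expose a std::atomic<int> field through the usual OptionTypeInfo
// tables. A hedged sketch; MyOpts and "refresh_period" are illustrative names,
// not part of this patch.
#include <atomic>
#include <cstddef>
#include <string>
#include <unordered_map>

#include "rocksdb/utilities/options_type.h"

struct MyOpts {
  std::atomic<int> refresh_period{0};
};

static std::unordered_map<std::string, ROCKSDB_NAMESPACE::OptionTypeInfo>
    my_opts_type_info = {
        {"refresh_period",
         {offsetof(struct MyOpts, refresh_period),
          ROCKSDB_NAMESPACE::OptionType::kAtomicInt,
          ROCKSDB_NAMESPACE::OptionVerificationType::kNormal,
          ROCKSDB_NAMESPACE::OptionTypeFlags::kMutable}},
};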
-#ifndef ROCKSDB_LITE #include "options/options_parser.h" @@ -76,6 +75,7 @@ Status PersistRocksDBOptions(const ConfigOptions& config_options_in, std::unique_ptr writable; writable.reset(new WritableFileWriter(std::move(wf), file_name, EnvOptions(), nullptr /* statistics */)); + TEST_SYNC_POINT("PersistRocksDBOptions:create"); std::string options_file_content; @@ -136,6 +136,7 @@ Status PersistRocksDBOptions(const ConfigOptions& config_options_in, if (s.ok()) { s = writable->Close(); } + TEST_SYNC_POINT("PersistRocksDBOptions:written"); if (s.ok()) { return RocksDBOptionsParser::VerifyRocksDBOptionsFromFile( config_options, db_opt, cf_names, cf_opts, file_name, fs); @@ -680,6 +681,15 @@ Status RocksDBOptionsParser::VerifyCFOptions( Status s = base_config->GetOption(config_options, mismatch, &base_value); if (s.ok()) { s = file_config->GetOption(config_options, mismatch, &file_value); + // In file_opt, certain options like MergeOperator may be nullptr due to + // factor methods not available. So we use opt_map to get + // option value to use in the error message below. + if (s.ok() && file_value == kNullptrString && opt_map) { + auto const& opt_val_str = (opt_map->find(mismatch)); + if (opt_val_str != opt_map->end()) { + file_value = opt_val_str->second; + } + } } int offset = snprintf(buffer, sizeof(buffer), "[RocksDBOptionsParser]: " @@ -724,4 +734,3 @@ Status RocksDBOptionsParser::VerifyTableFactory( } } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/options/options_parser.h b/options/options_parser.h index 20e3d772dab1..4268051f3404 100644 --- a/options/options_parser.h +++ b/options/options_parser.h @@ -14,7 +14,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE struct ConfigOptions; class OptionTypeInfo; class TableFactory; @@ -146,6 +145,5 @@ class RocksDBOptionsParser { int opt_file_version[3]; }; -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 5fa5aca40fa2..73548304e5b5 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -31,7 +31,6 @@ namespace ROCKSDB_NAMESPACE { // As a result, we only run the tests to verify new fields in options are // settable through string on limited platforms as it depends on behavior of // compilers. -#ifndef ROCKSDB_LITE #if defined OS_LINUX || defined OS_WIN #ifndef __clang__ #ifndef ROCKSDB_UBSAN_RUN @@ -126,8 +125,6 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { sizeof(std::shared_ptr)}, {offsetof(struct BlockBasedTableOptions, persistent_cache), sizeof(std::shared_ptr)}, - {offsetof(struct BlockBasedTableOptions, block_cache_compressed), - sizeof(std::shared_ptr)}, {offsetof(struct BlockBasedTableOptions, cache_usage_options), sizeof(CacheUsageOptions)}, {offsetof(struct BlockBasedTableOptions, filter_policy), @@ -170,8 +167,13 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { kBbtoExcluded); // Need to update the option string if a new option is added. 
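The TEST_SYNC_POINT hooks added to PersistRocksDBOptions ("PersistRocksDBOptions:create" and "PersistRocksDBOptions:written") give tests a handle on the OPTIONS file between creation and the final write-out. A hedged sketch of how a test might arm them (the helper and counters are hypothetical, not part of this change):

#include <atomic>

#include "test_util/sync_point.h"

// Hypothetical test helper: count how often each phase is reached. A real
// test could instead inject a fault or inspect the partially written OPTIONS
// file from inside the callback.
void ArmPersistOptionsSyncPoints(std::atomic<int>* created,
                                 std::atomic<int>* written) {
  auto* sp = ROCKSDB_NAMESPACE::SyncPoint::GetInstance();
  sp->SetCallBack("PersistRocksDBOptions:create",
                  [created](void* /*arg*/) { created->fetch_add(1); });
  sp->SetCallBack("PersistRocksDBOptions:written",
                  [written](void* /*arg*/) { written->fetch_add(1); });
  sp->EnableProcessing();
}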
+ ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + config_options.ignore_unsupported_options = false; ASSERT_OK(GetBlockBasedTableOptionsFromString( - *bbto, + config_options, *bbto, "cache_index_and_filter_blocks=1;" "cache_index_and_filter_blocks_with_high_priority=true;" "metadata_cache_options={top_level_index_pinning=kFallback;" @@ -207,7 +209,6 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { kBbtoExcluded)); ASSERT_TRUE(new_bbto->block_cache.get() != nullptr); - ASSERT_TRUE(new_bbto->block_cache_compressed.get() != nullptr); ASSERT_TRUE(new_bbto->filter_policy.get() != nullptr); bbto->~BlockBasedTableOptions(); @@ -251,6 +252,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { sizeof(FileTypeSet)}, {offsetof(struct DBOptions, compaction_service), sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, daily_offpeak_time_utc), sizeof(std::string)}, }; char* options_ptr = new char[sizeof(DBOptions)]; @@ -277,8 +279,11 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsExcluded); // Need to update the option string if a new option is added. + ConfigOptions config_options(*options); + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; ASSERT_OK( - GetDBOptionsFromString(*options, + GetDBOptionsFromString(config_options, *options, "wal_bytes_per_sync=4295048118;" "delete_obsolete_files_period_micros=4294967758;" "WAL_ttl_seconds=4295008036;" @@ -304,6 +309,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "writable_file_max_buffer_size=1048576;" "paranoid_checks=true;" "flush_verify_memtable_count=true;" + "compaction_verify_record_count=true;" "track_and_verify_wals_in_manifest=true;" "verify_sst_unique_id_in_manifest=true;" "is_fd_close_on_exec=false;" @@ -362,7 +368,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "db_host_id=hostname;" "lowest_used_cache_tier=kNonVolatileBlockTier;" "allow_data_in_errors=false;" - "enforce_single_del_contracts=false;", + "enforce_single_del_contracts=false;" + "daily_offpeak_time_utc=08:30-19:00;", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), @@ -375,6 +382,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { delete[] new_options_ptr; } +// status check adds CXX flag -fno-elide-constructors which fails this test. +#ifndef ROCKSDB_ASSERT_STATUS_CHECKED // If the test fails, likely a new option is added to ColumnFamilyOptions // but it cannot be set through GetColumnFamilyOptionsFromString(), or the // test is not updated accordingly. @@ -398,6 +407,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { {offsetof(struct ColumnFamilyOptions, max_bytes_for_level_multiplier_additional), sizeof(std::vector)}, + {offsetof(struct ColumnFamilyOptions, compaction_options_fifo), + sizeof(struct CompactionOptionsFIFO)}, {offsetof(struct ColumnFamilyOptions, memtable_factory), sizeof(std::shared_ptr)}, {offsetof(struct ColumnFamilyOptions, @@ -467,8 +478,11 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { kColumnFamilyOptionsExcluded); // Need to update the option string if a new option is added. 
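With the bool-flag convenience overloads removed, every map/string entry point now takes an explicit ConfigOptions, and the DB options string grows the new daily_offpeak_time_utc field. A minimal caller-side sketch of that pattern (the helper name is hypothetical; the flags mirror what the updated tests set):

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

// Hypothetical helper: parse the new daily_offpeak_time_utc option through the
// ConfigOptions-based GetDBOptionsFromString.
ROCKSDB_NAMESPACE::Status SetOffpeakWindow(ROCKSDB_NAMESPACE::DBOptions* out) {
  ROCKSDB_NAMESPACE::DBOptions base;
  ROCKSDB_NAMESPACE::ConfigOptions config_options(base);
  config_options.input_strings_escaped = false;
  config_options.ignore_unknown_options = false;
  return ROCKSDB_NAMESPACE::GetDBOptionsFromString(
      config_options, base, "daily_offpeak_time_utc=08:30-19:00;", out);
}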
+ ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; ASSERT_OK(GetColumnFamilyOptionsFromString( - *options, + config_options, *options, "compaction_filter_factory=mpudlojcujCompactionFilterFactory;" "table_factory=PlainTable;" "prefix_extractor=rocksdb.CappedPrefix.13;" @@ -492,8 +506,14 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "max_bytes_for_level_multiplier=60;" "memtable_factory=SkipListFactory;" "compression=kNoCompression;" - "compression_opts=5:6:7:8:9:10:true:11:false;" - "bottommost_compression_opts=4:5:6:7:8:9:true:10:true;" + "compression_opts={max_dict_buffer_bytes=5;use_zstd_dict_trainer=true;" + "enabled=false;parallel_threads=6;zstd_max_train_bytes=7;strategy=8;max_" + "dict_bytes=9;level=10;window_bits=11;max_compressed_bytes_per_kb=987;" + "checksum=true};" + "bottommost_compression_opts={max_dict_buffer_bytes=4;use_zstd_dict_" + "trainer=true;enabled=true;parallel_threads=5;zstd_max_train_bytes=6;" + "strategy=7;max_dict_bytes=8;level=9;window_bits=10;max_compressed_bytes_" + "per_kb=876;checksum=true};" "bottommost_compression=kDisableCompressionOption;" "level0_stop_writes_trigger=33;" "num_levels=99;" @@ -536,12 +556,18 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "prepopulate_blob_cache=kDisable;" "bottommost_temperature=kWarm;" "last_level_temperature=kWarm;" + "default_temperature=kHot;" "preclude_last_level_data_seconds=86400;" "preserve_internal_time_seconds=86400;" "compaction_options_fifo={max_table_files_size=3;allow_" - "compaction=false;age_for_warm=1;};" + "compaction=true;age_for_warm=0;file_temperature_age_thresholds={{" + "temperature=kCold;age=12345}};};" "blob_cache=1M;" - "memtable_protection_bytes_per_key=2;", + "memtable_protection_bytes_per_key=2;" + "persist_user_defined_timestamps=true;" + "block_protection_bytes_per_key=1;" + "memtable_max_range_deletions=999999;" + "bottommost_file_compaction_delay=7200;", new_options)); ASSERT_NE(new_options->blob_cache.get(), nullptr); @@ -550,6 +576,22 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions), kColumnFamilyOptionsExcluded)); + // Custom verification since compaction_options_fifo was in + // kColumnFamilyOptionsExcluded + ASSERT_EQ(new_options->compaction_options_fifo.max_table_files_size, 3); + ASSERT_EQ(new_options->compaction_options_fifo.allow_compaction, true); + ASSERT_EQ(new_options->compaction_options_fifo.file_temperature_age_thresholds + .size(), + 1); + ASSERT_EQ( + new_options->compaction_options_fifo.file_temperature_age_thresholds[0] + .temperature, + Temperature::kCold); + ASSERT_EQ( + new_options->compaction_options_fifo.file_temperature_age_thresholds[0] + .age, + 12345); + ColumnFamilyOptions rnd_filled_options = *new_options; options->~ColumnFamilyOptions(); @@ -566,6 +608,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { {offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier_additional), sizeof(std::vector)}, + {offsetof(struct MutableCFOptions, compaction_options_fifo), + sizeof(struct CompactionOptionsFIFO)}, {offsetof(struct MutableCFOptions, compression_per_level), sizeof(std::vector)}, {offsetof(struct MutableCFOptions, max_file_size), @@ -606,10 +650,10 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { delete[] mcfo2_ptr; delete[] cfo_clean_ptr; } +#endif // !ROCKSDB_ASSERT_STATUS_CHECKED #endif // 
!ROCKSDB_UBSAN_RUN #endif // !__clang__ #endif // OS_LINUX || OS_WIN -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/options/options_test.cc b/options/options_test.cc index dfe592c273cc..af031422fa12 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -63,7 +63,6 @@ class UnregisteredTableFactory : public TableFactory { } }; -#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE TEST_F(OptionsTest, GetOptionsFromMapTest) { std::unordered_map cf_options_map = { {"write_buffer_size", "1"}, @@ -102,7 +101,9 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"compaction_style", "kCompactionStyleLevel"}, {"compaction_pri", "kOldestSmallestSeqFirst"}, {"verify_checksums_in_compaction", "false"}, - {"compaction_options_fifo", "23"}, + {"compaction_options_fifo", + "{allow_compaction=true;max_table_files_size=11002244;" + "file_temperature_age_thresholds={{temperature=kCold;age=12345}}}"}, {"max_sequential_skip_in_iterations", "24"}, {"inplace_update_support", "true"}, {"report_bg_io_stats", "true"}, @@ -130,6 +131,9 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"blob_file_starting_level", "1"}, {"prepopulate_blob_cache", "kDisable"}, {"last_level_temperature", "kWarm"}, + {"default_temperature", "kHot"}, + {"persist_user_defined_timestamps", "true"}, + {"memtable_max_range_deletions", "0"}, }; std::unordered_map db_options_map = { @@ -175,6 +179,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"wal_bytes_per_sync", "48"}, {"strict_bytes_per_sync", "true"}, {"preserve_deletes", "false"}, + {"daily_offpeak_time_utc", ""}, }; ColumnFamilyOptions base_cf_opt; @@ -245,7 +250,18 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel); ASSERT_EQ(new_cf_opt.compaction_pri, kOldestSmallestSeqFirst); ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size, - static_cast(23)); + static_cast(11002244)); + ASSERT_EQ(new_cf_opt.compaction_options_fifo.allow_compaction, true); + ASSERT_EQ( + new_cf_opt.compaction_options_fifo.file_temperature_age_thresholds.size(), + 1); + ASSERT_EQ( + new_cf_opt.compaction_options_fifo.file_temperature_age_thresholds[0] + .temperature, + Temperature::kCold); + ASSERT_EQ( + new_cf_opt.compaction_options_fifo.file_temperature_age_thresholds[0].age, + 12345); ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations, static_cast(24)); ASSERT_EQ(new_cf_opt.inplace_update_support, true); @@ -273,6 +289,9 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm); ASSERT_EQ(new_cf_opt.disable_auto_flush, false); ASSERT_EQ(new_cf_opt.disable_write_stall, false); + ASSERT_EQ(new_cf_opt.default_temperature, Temperature::kHot); + ASSERT_EQ(new_cf_opt.persist_user_defined_timestamps, true); + ASSERT_EQ(new_cf_opt.memtable_max_range_deletions, 0); cf_options_map["write_buffer_size"] = "hello"; ASSERT_NOK(GetColumnFamilyOptionsFromMap(exact, base_cf_opt, cf_options_map, @@ -343,6 +362,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast(47)); ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast(48)); ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true); + ASSERT_EQ(new_db_opt.daily_offpeak_time_utc, ""); db_options_map["max_open_files"] = "hello"; Status s = @@ -370,10 +390,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_NOK( RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // 
GetColumnFamilyOptionsFromString is not supported in - // ROCKSDB_LITE TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; @@ -818,9 +835,11 @@ TEST_F(OptionsTest, OldInterfaceTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; ConfigOptions exact; - + ConfigOptions cf_config_options; + cf_config_options.input_strings_escaped = false; + cf_config_options.ignore_unknown_options = false; ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, + cf_config_options, base_cf_opt, "write_buffer_size=18;prefix_extractor=capped:8;" "arena_block_size=19", &new_cf_opt)); @@ -831,7 +850,7 @@ TEST_F(OptionsTest, OldInterfaceTest) { // And with a bad option ASSERT_NOK(GetColumnFamilyOptionsFromString( - base_cf_opt, + cf_config_options, base_cf_opt, "write_buffer_size=10;max_write_buffer_number=16;" "block_based_table_factory={xx_block_size=4;}", &new_cf_opt)); @@ -843,15 +862,17 @@ TEST_F(OptionsTest, OldInterfaceTest) { {"max_write_buffer_number", "2"}, {"min_write_buffer_number_to_merge", "3"}, }; - ASSERT_OK( - GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); cf_options_map["unknown_option"] = "1"; - ASSERT_NOK( - GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); ASSERT_OK( RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, - &new_cf_opt, true, true)); + cf_config_options.input_strings_escaped = true; + cf_config_options.ignore_unknown_options = true; + ASSERT_OK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); DBOptions base_db_opt; DBOptions new_db_opt; @@ -863,8 +884,14 @@ TEST_F(OptionsTest, OldInterfaceTest) { {"track_and_verify_wals_in_manifest", "true"}, {"verify_sst_unique_id_in_manifest", "true"}, {"max_open_files", "32"}, + {"daily_offpeak_time_utc", "06:30-23:30"}, }; - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + + ConfigOptions db_config_options(base_db_opt); + db_config_options.input_strings_escaped = false; + db_config_options.ignore_unknown_options = false; + ASSERT_OK(GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt)); ASSERT_EQ(new_db_opt.create_if_missing, false); ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); @@ -873,23 +900,30 @@ TEST_F(OptionsTest, OldInterfaceTest) { ASSERT_EQ(new_db_opt.verify_sst_unique_id_in_manifest, true); ASSERT_EQ(new_db_opt.max_open_files, 32); db_options_map["unknown_option"] = "1"; - Status s = GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt); + Status s = GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt); ASSERT_NOK(s); ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_OK( RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, true, - true)); + db_config_options.input_strings_escaped = true; + db_config_options.ignore_unknown_options = true; + ASSERT_OK(GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt)); + db_config_options.input_strings_escaped = false; + 
db_config_options.ignore_unknown_options = false; ASSERT_OK(GetDBOptionsFromString( - base_db_opt, - "create_if_missing=false;error_if_exists=false;max_open_files=42;", + db_config_options, base_db_opt, + "create_if_missing=false;error_if_exists=false;max_open_files=42;" + "daily_offpeak_time_utc=08:30-19:00;", &new_db_opt)); ASSERT_EQ(new_db_opt.create_if_missing, false); ASSERT_EQ(new_db_opt.error_if_exists, false); ASSERT_EQ(new_db_opt.max_open_files, 42); + ASSERT_EQ(new_db_opt.daily_offpeak_time_utc, "08:30-19:00"); s = GetDBOptionsFromString( - base_db_opt, + db_config_options, base_db_opt, "create_if_missing=false;error_if_exists=false;max_open_files=42;" "unknown_option=1;", &new_db_opt); @@ -899,9 +933,7 @@ TEST_F(OptionsTest, OldInterfaceTest) { RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // GetBlockBasedTableOptionsFromString is not supported TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; @@ -931,8 +963,6 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL); ASSERT_EQ(new_opt.block_size, 1024UL); ASSERT_EQ(new_opt.block_size_deviation, 8); ASSERT_EQ(new_opt.block_restart_interval, 4); @@ -1073,16 +1103,6 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - 4); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.5); // Set only block cache capacity. Check other values are // reset to default values. @@ -1101,18 +1121,6 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); - // Default values - ASSERT_EQ( - std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity())); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) - ->GetHighPriPoolRatio(), - 0.5); // Set couple of block cache options. 
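With block_cache_compressed removed from BlockBasedTableOptions, the assertions on it drop out and only the primary block cache remains to configure. A hedged programmatic sketch equivalent to the string form the tests keep using (the helper is hypothetical and assumes the public LRUCacheOptions/NewLRUCache API):

#include "rocksdb/cache.h"
#include "rocksdb/table.h"

// Sketch: build the one remaining cache with the same parameters the test
// string specifies (1MB capacity, 4 shard bits, strict limit, 0.5 high-pri
// ratio). There is no block_cache_compressed field left to set.
ROCKSDB_NAMESPACE::BlockBasedTableOptions MakeTableOptions() {
  ROCKSDB_NAMESPACE::LRUCacheOptions cache_opts(
      /*_capacity=*/1 << 20, /*_num_shard_bits=*/4,
      /*_strict_capacity_limit=*/true, /*_high_pri_pool_ratio=*/0.5);
  ROCKSDB_NAMESPACE::BlockBasedTableOptions bbto;
  bbto.block_cache = ROCKSDB_NAMESPACE::NewLRUCache(cache_opts);
  return bbto;
}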
ASSERT_OK(GetBlockBasedTableOptionsFromString( @@ -1128,16 +1136,6 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - 5); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) - ->GetHighPriPoolRatio(), - 0.0); // Set couple of block cache options. ASSERT_OK(GetBlockBasedTableOptionsFromString( @@ -1156,16 +1154,6 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - 4); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) - ->GetHighPriPoolRatio(), - 0.5); ASSERT_OK(GetBlockBasedTableOptionsFromString( config_options, table_opt, "filter_policy=rocksdb.BloomFilter:1.234", @@ -1186,10 +1174,8 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) { ASSERT_TRUE( new_opt.filter_policy->IsInstanceOf(RibbonFilterPolicy::kNickName())); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // GetPlainTableOptionsFromString is not supported TEST_F(OptionsTest, GetPlainTableOptionsFromString) { PlainTableOptions table_opt; PlainTableOptions new_opt; @@ -1230,9 +1216,7 @@ TEST_F(OptionsTest, GetPlainTableOptionsFromString) { ASSERT_NOK(s); ASSERT_TRUE(s.IsInvalidArgument()); } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE // GetMemTableRepFactoryFromString is not supported TEST_F(OptionsTest, GetMemTableRepFactoryFromString) { std::unique_ptr new_mem_factory = nullptr; @@ -1269,7 +1253,6 @@ TEST_F(OptionsTest, GetMemTableRepFactoryFromString) { ASSERT_NOK(GetMemTableRepFactoryFromString("bad_factory", &new_mem_factory)); } -#endif // !ROCKSDB_LITE TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) { std::unique_ptr new_mem_factory = nullptr; @@ -1297,7 +1280,6 @@ TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) { ASSERT_NOK(MemTableRepFactory::CreateFromString( config_options, "invalid_opt=10", &new_mem_factory)); -#ifndef ROCKSDB_LITE ASSERT_OK(MemTableRepFactory::CreateFromString( config_options, "id=skip_list; lookahead=32", &new_mem_factory)); ASSERT_OK(MemTableRepFactory::CreateFromString(config_options, "prefix_hash", @@ -1353,7 +1335,6 @@ TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) { config_options, "id=vector; count=42", &new_mem_factory)); ASSERT_NOK(MemTableRepFactory::CreateFromString( config_options, "id=vector; invalid=unknown", &new_mem_factory)); -#endif // ROCKSDB_LITE ASSERT_NOK(MemTableRepFactory::CreateFromString(config_options, "cuckoo", &new_mem_factory)); // CuckooHash memtable is already removed. 
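Elsewhere in these tests, compaction_options_fifo is no longer a bare number; it is a nested struct that can carry the new file_temperature_age_thresholds list. A hedged sketch of parsing that form through the ConfigOptions-based column family API (the helper is hypothetical; the option string is the one the tests use):

#include "rocksdb/convenience.h"
#include "rocksdb/options.h"

// Hypothetical helper: parse nested FIFO compaction options, including one
// temperature/age threshold entry, into a ColumnFamilyOptions.
ROCKSDB_NAMESPACE::Status ParseFifoOptions(
    ROCKSDB_NAMESPACE::ColumnFamilyOptions* out) {
  ROCKSDB_NAMESPACE::ConfigOptions config_options;
  ROCKSDB_NAMESPACE::ColumnFamilyOptions base;
  return ROCKSDB_NAMESPACE::GetColumnFamilyOptionsFromString(
      config_options, base,
      "compaction_options_fifo={allow_compaction=true;"
      "max_table_files_size=11002244;"
      "file_temperature_age_thresholds={{temperature=kCold;age=12345}}};",
      out);
}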
@@ -1364,7 +1345,6 @@ TEST_F(OptionsTest, MemTableRepFactoryCreateFromString) { &new_mem_factory)); } -#ifndef ROCKSDB_LITE // GetOptionsFromString is not supported in RocksDB Lite class CustomEnv : public EnvWrapper { public: explicit CustomEnv(Env* _target) : EnvWrapper(_target) {} @@ -1436,7 +1416,7 @@ TEST_F(OptionsTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); Env* newEnv = new_options.env; - ASSERT_OK(Env::LoadEnv(CustomEnv::kClassName(), &newEnv)); + ASSERT_OK(Env::CreateFromString({}, CustomEnv::kClassName(), &newEnv)); ASSERT_EQ(newEnv, new_options.env); config_options.ignore_unknown_options = false; @@ -1611,6 +1591,7 @@ TEST_F(OptionsTest, GetMutableCFOptions) { TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { Options options; ColumnFamilyOptions base_opt, new_opt; + base_opt.comparator = test::BytewiseComparatorWithU64TsWrapper(); Random rnd(302); ConfigOptions config_options; config_options.input_strings_escaped = false; @@ -1631,6 +1612,7 @@ TEST_F(OptionsTest, ColumnFamilyOptionsSerialization) { base_options_file_content, &new_opt)); ASSERT_OK( RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt)); + ASSERT_EQ(base_opt.comparator, new_opt.comparator); if (base_opt.compaction_filter) { delete base_opt.compaction_filter; } @@ -1778,13 +1760,11 @@ TEST_F(OptionsTest, MutableCFOptions) { ASSERT_EQ(bbto->block_size, 32768); } -#endif // !ROCKSDB_LITE Status StringToMap( const std::string& opts_str, std::unordered_map* opts_map); -#ifndef ROCKSDB_LITE // StringToMap is not supported in ROCKSDB_LITE TEST_F(OptionsTest, StringToMapTest) { std::unordered_map opts_map; // Regular options @@ -1901,9 +1881,7 @@ TEST_F(OptionsTest, StringToMapTest) { ASSERT_NOK(StringToMap("k1=v1;k2={{}}{}", &opts_map)); ASSERT_NOK(StringToMap("k1=v1;k2={{dfdl}adfa}{}", &opts_map)); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE // StringToMap is not supported in ROCKSDB_LITE TEST_F(OptionsTest, StringToMapRandomTest) { std::unordered_map opts_map; // Make sure segfault is not hit by semi-random strings @@ -2194,7 +2172,6 @@ TEST_F(OptionsTest, OptionTablePropertiesTest) { ASSERT_EQ(copy.table_properties_collector_factories.size(), 2); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(cfg_opts, orig, copy)); } -#endif // !ROCKSDB_LITE TEST_F(OptionsTest, ConvertOptionsTest) { LevelDBOptions leveldb_opt; @@ -2213,13 +2190,12 @@ TEST_F(OptionsTest, ConvertOptionsTest) { const auto table_opt = table_factory->GetOptions(); ASSERT_NE(table_opt, nullptr); - ASSERT_EQ(table_opt->block_cache->GetCapacity(), 8UL << 20); + ASSERT_EQ(table_opt->block_cache->GetCapacity(), 32UL << 20); ASSERT_EQ(table_opt->block_size, leveldb_opt.block_size); ASSERT_EQ(table_opt->block_restart_interval, leveldb_opt.block_restart_interval); ASSERT_EQ(table_opt->filter_policy.get(), leveldb_opt.filter_policy); } -#ifndef ROCKSDB_LITE class TestEventListener : public EventListener { private: std::string id_; @@ -2286,9 +2262,7 @@ TEST_F(OptionsTest, OptionsListenerTest) { 2); // The Test{Config}1 Listeners could be loaded but not the others ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_opts, orig, copy)); } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE const static std::string kCustomEnvName = "Custom"; const static std::string kCustomEnvProp = "env=" + kCustomEnvName; @@ -2348,7 +2322,9 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"compaction_style", "kCompactionStyleLevel"}, {"compaction_pri", 
"kOldestSmallestSeqFirst"}, {"verify_checksums_in_compaction", "false"}, - {"compaction_options_fifo", "23"}, + {"compaction_options_fifo", + "{allow_compaction=true;max_table_files_size=11002244;" + "file_temperature_age_thresholds={{temperature=kCold;age=12345}}}"}, {"max_sequential_skip_in_iterations", "24"}, {"inplace_update_support", "true"}, {"report_bg_io_stats", "true"}, @@ -2375,6 +2351,9 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { {"blob_file_starting_level", "1"}, {"prepopulate_blob_cache", "kDisable"}, {"last_level_temperature", "kWarm"}, + {"default_temperature", "kHot"}, + {"persist_user_defined_timestamps", "true"}, + {"memtable_max_range_deletions", "0"}, }; std::unordered_map db_options_map = { @@ -2424,8 +2403,11 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; - ASSERT_OK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ConfigOptions cf_config_options; + cf_config_options.ignore_unknown_options = false; + cf_config_options.input_strings_escaped = false; + ASSERT_OK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); @@ -2485,7 +2467,18 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel); ASSERT_EQ(new_cf_opt.compaction_pri, kOldestSmallestSeqFirst); ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size, - static_cast(23)); + static_cast(11002244)); + ASSERT_EQ(new_cf_opt.compaction_options_fifo.allow_compaction, true); + ASSERT_EQ( + new_cf_opt.compaction_options_fifo.file_temperature_age_thresholds.size(), + 1); + ASSERT_EQ( + new_cf_opt.compaction_options_fifo.file_temperature_age_thresholds[0] + .temperature, + Temperature::kCold); + ASSERT_EQ( + new_cf_opt.compaction_options_fifo.file_temperature_age_thresholds[0].age, + 12345); ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations, static_cast(24)); ASSERT_EQ(new_cf_opt.inplace_update_support, true); @@ -2511,10 +2504,13 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_cf_opt.prepopulate_blob_cache, PrepopulateBlobCache::kDisable); ASSERT_EQ(new_cf_opt.last_level_temperature, Temperature::kWarm); ASSERT_EQ(new_cf_opt.bottommost_temperature, Temperature::kWarm); + ASSERT_EQ(new_cf_opt.default_temperature, Temperature::kHot); + ASSERT_EQ(new_cf_opt.persist_user_defined_timestamps, true); + ASSERT_EQ(new_cf_opt.memtable_max_range_deletions, 0); cf_options_map["write_buffer_size"] = "hello"; - ASSERT_NOK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); ConfigOptions exact, loose; exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; loose.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; @@ -2522,18 +2518,18 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); cf_options_map["write_buffer_size"] = "1"; - ASSERT_OK(GetColumnFamilyOptionsFromMap( - base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); cf_options_map["unknown_option"] = "1"; - ASSERT_NOK(GetColumnFamilyOptionsFromMap( - base_cf_opt, 
cf_options_map, &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromMap(base_cf_opt, cf_options_map, - &new_cf_opt, - false, /* input_strings_escaped */ - true /* ignore_unknown_options */)); + cf_config_options.input_strings_escaped = false; + cf_config_options.ignore_unknown_options = true; + ASSERT_OK(GetColumnFamilyOptionsFromMap(cf_config_options, base_cf_opt, + cf_options_map, &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions( loose, base_cf_opt, new_cf_opt, nullptr /* new_opt_map */)); ASSERT_NOK(RocksDBOptionsParser::VerifyCFOptions( @@ -2541,7 +2537,11 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { DBOptions base_db_opt; DBOptions new_db_opt; - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ConfigOptions db_config_options(base_db_opt); + db_config_options.input_strings_escaped = false; + db_config_options.ignore_unknown_options = false; + ASSERT_OK(GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt)); ASSERT_EQ(new_db_opt.create_if_missing, false); ASSERT_EQ(new_db_opt.create_missing_column_families, true); ASSERT_EQ(new_db_opt.error_if_exists, false); @@ -2584,18 +2584,21 @@ TEST_F(OptionsOldApiTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true); db_options_map["max_open_files"] = "hello"; - ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_NOK(GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); // unknow options should fail parsing without ignore_unknown_options = true db_options_map["unknown_db_option"] = "1"; - ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_NOK(GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); - ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt, - false, /* input_strings_escaped */ - true /* ignore_unknown_options */)); + db_config_options.input_strings_escaped = false; + db_config_options.ignore_unknown_options = true; + ASSERT_OK(GetDBOptionsFromMap(db_config_options, base_db_opt, db_options_map, + &new_db_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(loose, base_db_opt, new_db_opt)); ASSERT_NOK(RocksDBOptionsParser::VerifyDBOptions(exact, base_db_opt, new_db_opt)); } @@ -2604,33 +2607,38 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; base_cf_opt.table_factory.reset(); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=5", &new_cf_opt)); + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, "", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=5", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); ASSERT_TRUE(new_cf_opt.table_factory == nullptr); - 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=6;", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=6;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, " write_buffer_size = 7 ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, " write_buffer_size = 8 ; ", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=11; max_write_buffer_number = 12 ;", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); // Wrong name "max_write_buffer_number_" - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number_=14;", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", &new_cf_opt)); ConfigOptions exact; exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); @@ -2643,30 +2651,34 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { std::unique_ptr* /*guard*/, std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); - ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "comparator=" + kCompName + ";", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "comparator=" + kCompName + ";", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.comparator, ReverseBytewiseComparator()); // MergeOperator from object registry std::unique_ptr bxo(new BytesXOROperator()); std::string kMoName = bxo->Name(); - ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "merge_operator=" + kMoName + ";", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "merge_operator=" + kMoName + ";", + &new_cf_opt)); ASSERT_EQ(kMoName, std::string(new_cf_opt.merge_operator->Name())); // Wrong key/value pair - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); // Error Paring value - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + 
"write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); // Missing option name - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=13; =100;", &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=13; =100;", &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); const uint64_t kilo = 1024UL; @@ -2676,17 +2688,17 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { // Units (k) ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); + config_options, base_cf_opt, "max_write_buffer_number=15K", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 15 * kilo); // Units (m) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "max_write_buffer_number=16m;inplace_update_num_locks=17M", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", &new_cf_opt)); ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16 * mega); ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17u * mega); // Units (g) ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, + config_options, base_cf_opt, "write_buffer_size=18g;prefix_extractor=capped:8;" "arena_block_size=19G", &new_cf_opt)); @@ -2697,107 +2709,119 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { ASSERT_EQ(new_cf_opt.prefix_extractor->AsString(), "rocksdb.CappedPrefix.8"); // Units (t) - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, "write_buffer_size=20t;arena_block_size=21T", + &new_cf_opt)); ASSERT_EQ(new_cf_opt.write_buffer_size, 20 * tera); ASSERT_EQ(new_cf_opt.arena_block_size, 21 * tera); // Nested block based table options // Empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={};arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Non-empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_cache=1M;block_size=4;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Last one - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_cache=1M;block_size=4;}", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); // Mismatch curly braces - 
ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={{{block_size=4;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); // Unexpected chars after closing curly brace - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}xdfa;" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={block_size=4;}xdfa", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); // Invalid block based table option - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "block_based_table_factory={xx_block_size=4;}", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=true", - &new_cf_opt)); - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=false", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=true", + &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=false", + &new_cf_opt)); - ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, - "optimize_filters_for_hits=junk", - &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(config_options, base_cf_opt, + "optimize_filters_for_hits=junk", + &new_cf_opt)); ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(exact, base_cf_opt, new_cf_opt)); // Nested plain table options // Empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "plain_table_factory={};arena_block_size=1024", - &new_cf_opt)); + 
ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={};arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); // Non-empty - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" - "arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "plain_table_factory={user_key_len=66;bloom_bits_per_key=20;};" + "arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.table_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.table_factory->Name()), "PlainTable"); // memtable factory - ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, - "write_buffer_size=10;max_write_buffer_number=16;" - "memtable=skip_list:10;arena_block_size=1024", - &new_cf_opt)); + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "memtable=skip_list:10;arena_block_size=1024", + &new_cf_opt)); ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); // blob cache ASSERT_OK(GetColumnFamilyOptionsFromString( - base_cf_opt, + config_options, base_cf_opt, "blob_cache={capacity=1M;num_shard_bits=4;" "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};", &new_cf_opt)); @@ -2873,7 +2897,6 @@ TEST_F(OptionsTest, SliceTransformCreateFromString) { ASSERT_NOK( SliceTransform::CreateFromString(config_options, "invalid", &transform)); -#ifndef ROCKSDB_LITE ASSERT_OK(SliceTransform::CreateFromString( config_options, "rocksdb.CappedPrefix.11", &transform)); ASSERT_NE(transform, nullptr); @@ -2897,15 +2920,20 @@ TEST_F(OptionsTest, SliceTransformCreateFromString) { ASSERT_FALSE(transform->IsInstanceOf("capped:11")); ASSERT_FALSE(transform->IsInstanceOf("rocksdb.CappedPrefix")); ASSERT_FALSE(transform->IsInstanceOf("rocksdb.CappedPrefix.11")); -#endif // ROCKSDB_LITE } TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { BlockBasedTableOptions table_opt; BlockBasedTableOptions new_opt; + ConfigOptions config_options; + config_options.input_strings_escaped = false; + config_options.ignore_unknown_options = false; + config_options.invoke_prepare_options = false; + config_options.ignore_unsupported_options = false; + // make sure default values are overwritten by something else ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, + config_options, table_opt, "cache_index_and_filter_blocks=1;index_type=kHashSearch;" "checksum=kxxHash;no_block_cache=1;" "block_cache=1M;block_cache_compressed=1k;block_size=1024;" @@ -2919,8 +2947,6 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_TRUE(new_opt.no_block_cache); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL); ASSERT_EQ(new_opt.block_size, 1024UL); ASSERT_EQ(new_opt.block_size_deviation, 8); ASSERT_EQ(new_opt.block_restart_interval, 4); @@ -2933,54 +2959,57 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { EXPECT_EQ(bfp->GetWholeBitsPerKey(), 5); // unknown 
option - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" - "bad_option=1", - &new_opt)); + ASSERT_NOK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt)); ASSERT_EQ(static_cast(table_opt.cache_index_and_filter_blocks), new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized index type - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", - &new_opt)); + ASSERT_NOK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", &new_opt)); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized checksum type - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;checksum=kxxHashXX", - &new_opt)); + ASSERT_NOK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", &new_opt)); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.index_type, new_opt.index_type); // unrecognized filter policy name - ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, - "cache_index_and_filter_blocks=1;" - "filter_policy=bloomfilterxx:4:true", - &new_opt)); + ASSERT_NOK( + GetBlockBasedTableOptionsFromString(config_options, table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilterxx:4:true", + &new_opt)); ASSERT_EQ(table_opt.cache_index_and_filter_blocks, new_opt.cache_index_and_filter_blocks); ASSERT_EQ(table_opt.filter_policy, new_opt.filter_policy); // Used to be rejected, now accepted ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, "filter_policy=bloomfilter:4", &new_opt)); + config_options, table_opt, "filter_policy=bloomfilter:4", &new_opt)); bfp = dynamic_cast(new_opt.filter_policy.get()); EXPECT_EQ(bfp->GetMillibitsPerKey(), 4000); EXPECT_EQ(bfp->GetWholeBitsPerKey(), 4); // Check block cache options are overwritten when specified // in new format as a struct. 
- ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" - "block_cache_compressed={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) @@ -2989,23 +3018,14 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), true); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - 4); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed)->GetHighPriPoolRatio(), - 0.5); // Set only block cache capacity. Check other values are // reset to default values. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=2M};" - "block_cache_compressed={capacity=2M}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=2M};" + "block_cache_compressed={capacity=2M}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 2*1024UL*1024UL); // Default values @@ -3016,22 +3036,10 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 2*1024UL*1024UL); - // Default values - ASSERT_EQ( - std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - GetDefaultCacheShardBits(new_opt.block_cache_compressed->GetCapacity())); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) - ->GetHighPriPoolRatio(), - 0.5); // Set couple of block cache options. 
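These tests also migrate from the removed Env::LoadEnv to Env::CreateFromString, which takes a ConfigOptions. A hedged sketch of resolving an Env by its registered name (the name is just the illustrative one from the tests and must actually be registered for the lookup to succeed):

#include "rocksdb/convenience.h"
#include "rocksdb/env.h"

// Sketch: look up an Env instance by name; a default-constructed ConfigOptions
// matches how the updated tests call it.
ROCKSDB_NAMESPACE::Status LookupEnv(ROCKSDB_NAMESPACE::Env** env) {
  return ROCKSDB_NAMESPACE::Env::CreateFromString(
      ROCKSDB_NAMESPACE::ConfigOptions(), "CustomEnvDefault", env);
}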
ASSERT_OK(GetBlockBasedTableOptionsFromString( - table_opt, + config_options, table_opt, "block_cache={num_shard_bits=5;high_pri_pool_ratio=0.5;};" "block_cache_compressed={num_shard_bits=5;" "high_pri_pool_ratio=0.0;}", @@ -3043,24 +3051,15 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(new_opt.block_cache->HasStrictCapacityLimit(), false); ASSERT_EQ(std::dynamic_pointer_cast( new_opt.block_cache)->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 0); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - 5); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), false); - ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) - ->GetHighPriPoolRatio(), - 0.0); // Set couple of block cache options. - ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, - "block_cache={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;};" - "block_cache_compressed={capacity=1M;num_shard_bits=4;" - "strict_capacity_limit=true;}", - &new_opt)); + ASSERT_OK(GetBlockBasedTableOptionsFromString( + config_options, table_opt, + "block_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;};" + "block_cache_compressed={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;}", + &new_opt)); ASSERT_TRUE(new_opt.block_cache != nullptr); ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) @@ -3070,27 +3069,22 @@ TEST_F(OptionsOldApiTest, GetBlockBasedTableOptionsFromString) { ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache) ->GetHighPriPoolRatio(), 0.5); - ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); - ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL*1024UL); - ASSERT_EQ(std::dynamic_pointer_cast( - new_opt.block_cache_compressed) - ->GetNumShardBits(), - 4); - ASSERT_EQ(new_opt.block_cache_compressed->HasStrictCapacityLimit(), true); - ASSERT_EQ(std::dynamic_pointer_cast(new_opt.block_cache_compressed) - ->GetHighPriPoolRatio(), - 0.5); } TEST_F(OptionsOldApiTest, GetPlainTableOptionsFromString) { PlainTableOptions table_opt; PlainTableOptions new_opt; // make sure default values are overwritten by something else - ASSERT_OK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" - "full_scan_mode=true;store_index_in_file=true", - &new_opt)); + ConfigOptions config_options_from_string; + config_options_from_string.input_strings_escaped = false; + config_options_from_string.ignore_unknown_options = false; + config_options_from_string.invoke_prepare_options = false; + ASSERT_OK(GetPlainTableOptionsFromString( + config_options_from_string, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "index_sparseness=8;huge_page_tlb_size=4;encoding_type=kPrefix;" + "full_scan_mode=true;store_index_in_file=true", + &new_opt)); ASSERT_EQ(new_opt.user_key_len, 66u); ASSERT_EQ(new_opt.bloom_bits_per_key, 20); ASSERT_EQ(new_opt.hash_table_ratio, 0.5); @@ -3103,22 +3097,28 @@ TEST_F(OptionsOldApiTest, GetPlainTableOptionsFromString) { std::unordered_map opt_map; ASSERT_OK(StringToMap( "user_key_len=55;bloom_bits_per_key=10;huge_page_tlb_size=8;", &opt_map)); - ASSERT_OK(GetPlainTableOptionsFromMap(table_opt, opt_map, &new_opt)); + ConfigOptions 
config_options_from_map; + config_options_from_map.input_strings_escaped = false; + config_options_from_map.ignore_unknown_options = false; + ASSERT_OK(GetPlainTableOptionsFromMap(config_options_from_map, table_opt, + opt_map, &new_opt)); ASSERT_EQ(new_opt.user_key_len, 55u); ASSERT_EQ(new_opt.bloom_bits_per_key, 10); ASSERT_EQ(new_opt.huge_page_tlb_size, 8); // unknown option - ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "bad_option=1", - &new_opt)); + ASSERT_NOK(GetPlainTableOptionsFromString( + config_options_from_string, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "bad_option=1", + &new_opt)); // unrecognized EncodingType - ASSERT_NOK(GetPlainTableOptionsFromString(table_opt, - "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" - "encoding_type=kPrefixXX", - &new_opt)); + ASSERT_NOK(GetPlainTableOptionsFromString( + config_options_from_string, table_opt, + "user_key_len=66;bloom_bits_per_key=20;hash_table_ratio=0.5;" + "encoding_type=kPrefixXX", + &new_opt)); } TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { @@ -3183,7 +3183,7 @@ TEST_F(OptionsOldApiTest, GetOptionsFromStringTest) { ASSERT_EQ(new_options.max_open_files, 1); ASSERT_TRUE(new_options.rate_limiter.get() != nullptr); Env* newEnv = new_options.env; - ASSERT_OK(Env::LoadEnv("CustomEnvDefault", &newEnv)); + ASSERT_OK(Env::CreateFromString({}, "CustomEnvDefault", &newEnv)); ASSERT_EQ(newEnv, new_options.env); } @@ -3200,10 +3200,15 @@ TEST_F(OptionsOldApiTest, DBOptionsSerialization) { // Phase 3: Set new_options from the derived string and expect // new_options == base_options - ASSERT_OK(GetDBOptionsFromString(DBOptions(), base_options_file_content, - &new_options)); - ConfigOptions config_options; - ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(config_options, base_options, new_options)); + const DBOptions base_db_options; + ConfigOptions db_config_options(base_db_options); + db_config_options.input_strings_escaped = false; + db_config_options.ignore_unknown_options = false; + ASSERT_OK(GetDBOptionsFromString(db_config_options, base_db_options, + base_options_file_content, &new_options)); + ConfigOptions verify_db_config_options; + ASSERT_OK(RocksDBOptionsParser::VerifyDBOptions(verify_db_config_options, + base_options, new_options)); } TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) { @@ -3221,17 +3226,20 @@ TEST_F(OptionsOldApiTest, ColumnFamilyOptionsSerialization) { // Phase 3: Set new_opt from the derived string and expect // new_opt == base_opt - ASSERT_OK(GetColumnFamilyOptionsFromString( - ColumnFamilyOptions(), base_options_file_content, &new_opt)); - ConfigOptions config_options; - ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(config_options, base_opt, new_opt)); + ConfigOptions cf_config_options; + cf_config_options.input_strings_escaped = false; + cf_config_options.ignore_unknown_options = false; + ASSERT_OK( + GetColumnFamilyOptionsFromString(cf_config_options, ColumnFamilyOptions(), + base_options_file_content, &new_opt)); + ConfigOptions verify_cf_config_options; + ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(verify_cf_config_options, + base_opt, new_opt)); if (base_opt.compaction_filter) { delete base_opt.compaction_filter; } } -#endif // !ROCKSDB_LITE -#ifndef ROCKSDB_LITE class OptionsParserTest : public testing::Test { public: OptionsParserTest() { fs_.reset(new test::StringFS(FileSystem::Default())); } @@ -3963,6 +3971,36 @@ class OptionsSanityCheckTest : public 
OptionsParserTest, const std::string kOptionsFileName = "OPTIONS"; }; +TEST_P(OptionsSanityCheckTest, MergeOperatorErrorMessage) { + ColumnFamilyOptions opts; + Random rnd(301); + opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); + std::string merge_op_name = opts.merge_operator->Name(); + ASSERT_OK(PersistCFOptions(opts)); + + // Test when going from merge operator -> nullptr + opts.merge_operator = nullptr; + Status s = + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelLooselyCompatible); + ASSERT_TRUE(s.IsInvalidArgument()); + std::string err_msg = s.ToString(); + std::string specified = "The specified one is " + kNullptrString; + std::string persisted = "the persisted one is " + merge_op_name; + ASSERT_TRUE(err_msg.find(specified) != std::string::npos); + ASSERT_TRUE(err_msg.find(persisted) != std::string::npos); + + // Test when using a different merge operator + opts.merge_operator.reset(test::RandomMergeOperator(&rnd)); + s = SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelLooselyCompatible); + ASSERT_TRUE(s.IsInvalidArgument()); + err_msg = s.ToString(); + specified = + "The specified one is " + std::string(opts.merge_operator->Name()); + persisted = "the persisted one is " + merge_op_name; + ASSERT_TRUE(err_msg.find(specified) != std::string::npos); + ASSERT_TRUE(err_msg.find(persisted) != std::string::npos); +} + TEST_P(OptionsSanityCheckTest, CFOptionsSanityCheck) { ColumnFamilyOptions opts; Random rnd(301); @@ -4108,6 +4146,30 @@ TEST_P(OptionsSanityCheckTest, CFOptionsSanityCheck) { SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); } } + + // persist_user_defined_timestamps + { + // Test change from true to false not allowed in loose and exact mode. + opts.persist_user_defined_timestamps = false; + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + + // persist the change + ASSERT_OK(PersistCFOptions(opts)); + SanityCheckCFOptions(opts, config_options_.ignore_unsupported_options); + + // Test change from false to true not allowed in loose and exact mode. + opts.persist_user_defined_timestamps = true; + ASSERT_NOK(SanityCheckCFOptions( + opts, ConfigOptions::kSanityLevelLooselyCompatible)); + ASSERT_NOK( + SanityCheckCFOptions(opts, ConfigOptions::kSanityLevelExactMatch)); + + // persist the change + ASSERT_OK(PersistCFOptions(opts)); + } } TEST_P(OptionsSanityCheckTest, DBOptionsSanityCheck) { @@ -5003,7 +5065,6 @@ TEST_F(ConfigOptionsTest, ConfiguringOptionsDoesNotRevertRateLimiterBandwidth) { INSTANTIATE_TEST_CASE_P(OptionsSanityCheckTest, OptionsSanityCheckTest, ::testing::Bool()); -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/port/lang.h b/port/lang.h index 52c597acdc0f..a4201ca3b282 100644 --- a/port/lang.h +++ b/port/lang.h @@ -68,3 +68,30 @@ constexpr bool kMustFreeHeapAllocations = false; #else #define TSAN_SUPPRESSION #endif // TSAN_SUPPRESSION + +// Compile-time CPU feature testing compatibility +// +// A way to be extra sure these defines have been included. +#define ASSERT_FEATURE_COMPAT_HEADER() /* empty */ + +// MSVC doesn't support the same defines that gcc and clang provide +// but does some like __AVX__. Here we can infer some features from others. 
+#ifdef __AVX__ +#define __SSE4_2__ 1 +#define __PCLMUL__ 1 +#endif // __AVX__ + +// A way to disable PCLMUL +#ifdef NO_PCLMUL +#undef __PCLMUL__ +#endif + +// popcnt is generally implied by SSE4.2 +#if defined(__SSE4_2__) +#define __POPCNT__ 1 +#endif + +// A way to disable POPCNT +#ifdef NO_POPCNT +#undef __POPCNT__ +#endif diff --git a/port/mmap.h b/port/mmap.h index 7342a13f967f..0f385522fc35 100644 --- a/port/mmap.h +++ b/port/mmap.h @@ -14,6 +14,7 @@ #endif // OS_WIN #include +#include #include "rocksdb/rocksdb_namespace.h" @@ -67,4 +68,23 @@ class MemMapping { static MemMapping AllocateAnonymous(size_t length, bool huge); }; +// Simple MemMapping wrapper that presents the memory as an array of T. +// For example, +// TypedMemMapping arr = MemMapping::AllocateLazyZeroed(num_bytes); +template +class TypedMemMapping : public MemMapping { + public: + /*implicit*/ TypedMemMapping(MemMapping&& v) noexcept + : MemMapping(std::move(v)) {} + TypedMemMapping& operator=(MemMapping&& v) noexcept { + MemMapping& base = *this; + base = std::move(v); + } + + inline T* Get() const { return static_cast(MemMapping::Get()); } + inline size_t Count() const { return MemMapping::Length() / sizeof(T); } + + inline T& operator[](size_t index) const { return Get()[index]; } +}; + } // namespace ROCKSDB_NAMESPACE diff --git a/port/port_example.h b/port/port_example.h index 794149a6906d..2a19ffee0557 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -43,7 +43,7 @@ class Mutex { // Optionally crash if this thread does not hold this mutex. // The implementation must be fast, especially if NDEBUG is // defined. The implementation is allowed to skip all checks. - void AssertHeld(); + void AssertHeld() const; }; class CondVar { diff --git a/port/port_posix.cc b/port/port_posix.cc index 3872293b8177..749ad5d607d1 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -100,7 +100,7 @@ bool Mutex::TryLock() { return ret; } -void Mutex::AssertHeld() { +void Mutex::AssertHeld() const { #ifndef NDEBUG assert(locked_); #endif diff --git a/port/port_posix.h b/port/port_posix.h index 417fbf4f6114..95641c0c54b4 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -13,7 +13,7 @@ #include -#include "rocksdb/options.h" +#include "rocksdb/port_defs.h" #include "rocksdb/rocksdb_namespace.h" // size_t printf formatting named in the manner of C99 standard formatting @@ -109,9 +109,9 @@ class Mutex { bool TryLock(); - // this will assert if the mutex is not locked - // it does NOT verify that mutex is held by a calling thread - void AssertHeld(); + // This will fail assertion if the mutex is not locked. + // It does NOT verify that mutex is held by a calling thread. + void AssertHeld() const; // Also implement std Lockable inline void lock() { Lock(); } @@ -139,7 +139,7 @@ class RWMutex { void WriteLock(); void ReadUnlock(); void WriteUnlock(); - void AssertHeld() {} + void AssertHeld() const {} private: pthread_rwlock_t mu_; // the underlying platform mutex @@ -149,6 +149,9 @@ class CondVar { public: explicit CondVar(Mutex* mu); ~CondVar(); + + Mutex* GetMutex() const { return mu_; } + void Wait(); // Timed condition wait. Returns true if timeout occurred. 
bool TimedWait(uint64_t abs_time_us); diff --git a/port/stack_trace.cc b/port/stack_trace.cc index ef7144947fb1..a5a6d2e77c84 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -5,8 +5,7 @@ // #include "port/stack_trace.h" -#if defined(ROCKSDB_LITE) || \ - !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ +#if !(defined(ROCKSDB_BACKTRACE) || defined(OS_MACOSX)) || defined(CYGWIN) || \ defined(OS_SOLARIS) || defined(OS_WIN) // noop @@ -32,12 +31,22 @@ void* SaveStack(int* /*num_frames*/, int /*first_frames_to_skip*/) { #include #include -#if defined(OS_FREEBSD) +#ifdef OS_OPENBSD +#include #include -#endif +#endif // OS_OPENBSD +#ifdef OS_FREEBSD +#include +#endif // OS_FREEBSD #ifdef OS_LINUX #include -#endif +#include +#include +#if __GLIBC__ < 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ < 30) +#include +#define gettid() syscall(SYS_gettid) +#endif // GLIBC version +#endif // OS_LINUX #include "port/lang.h" @@ -46,28 +55,39 @@ namespace port { namespace { -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_GNU_KFREEBSD) const char* GetExecutableName() { static char name[1024]; -#if !defined(OS_FREEBSD) - char link[1024]; - snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); - auto read = readlink(link, name, sizeof(name) - 1); - if (-1 == read) { +#if defined(OS_FREEBSD) + int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; + size_t namesz = sizeof(name); + + auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + if (-1 == ret) { return nullptr; } else { - name[read] = 0; return name; } -#else - int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; +#elif defined(OS_OPENBSD) + int mib[4] = {CTL_KERN, KERN_PROC_ARGS, getpid(), KERN_PROC_ARGV}; size_t namesz = sizeof(name); + char* bin[namesz]; - auto ret = sysctl(mib, 4, name, &namesz, nullptr, 0); + auto ret = sysctl(mib, 4, bin, &namesz, nullptr, 0); if (-1 == ret) { return nullptr; } else { + return bin[0]; + } +#else + char link[1024]; + snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); + auto read = readlink(link, name, sizeof(name) - 1); + if (-1 == read) { + return nullptr; + } else { + name[read] = 0; return name; } #endif @@ -123,6 +143,14 @@ void PrintStackTraceLine(const char* symbol, void* frame) { #endif +const char* GetLldbScriptSelectThread(long long tid) { + // NOTE: called from a signal handler, so no heap allocation + static char script[80]; + snprintf(script, sizeof(script), + "script -l python -- lldb.process.SetSelectedThreadByID(%lld)", tid); + return script; +} + } // namespace void PrintStack(void* frames[], int num_frames) { @@ -136,10 +164,134 @@ void PrintStack(void* frames[], int num_frames) { } void PrintStack(int first_frames_to_skip) { + // Default to getting stack traces with GDB, at least on Linux where we + // know how to attach to a particular thread. + // + // * Address space layout randomization (ASLR) interferes with getting good + // stack information from backtrace+addr2line. This is more likely to show + // up with LIB_MODE=shared builds (when kernel.randomize_va_space >= 1) + // but can also show up with LIB_MODE=static builds ((when + // kernel.randomize_va_space == 2). + // * It doesn't appear easy to detect when ASLR is in use. + // * With DEBUG_LEVEL < 2, backtrace() can skip frames that are not skipped + // in GDB. 
+ // + // LLDB also available as an option + bool lldb_stack_trace = getenv("ROCKSDB_LLDB_STACK") != nullptr; +#if defined(OS_LINUX) + // Default true, override with ROCKSDB_BACKTRACE_STACK=1 + bool gdb_stack_trace = + !lldb_stack_trace && getenv("ROCKSDB_BACKTRACE_STACK") == nullptr; +#else + // Default false, override with ROCKSDB_GDB_STACK=1 + bool gdb_stack_trace = getenv("ROCKSDB_GDB_STACK") != nullptr; +#endif + // Also support invoking interactive debugger on stack trace, with this + // envvar set to non-empty + char* debug_env = getenv("ROCKSDB_DEBUG"); + bool debug = debug_env != nullptr && strlen(debug_env) > 0; + + if (!debug && getenv("ROCKSDB_NO_STACK") != nullptr) { + // Skip stack trace + return; + } + + if (lldb_stack_trace || gdb_stack_trace || debug) { + // Allow ouside debugger to attach, even with Yama security restrictions +#ifdef PR_SET_PTRACER_ANY + (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); +#endif + // Try to invoke GDB, either for stack trace or debugging. + long long attach_pid = getpid(); + // NOTE: we're in a signal handler, so no heap allocation + char attach_pid_str[20]; + snprintf(attach_pid_str, sizeof(attach_pid_str), "%lld", attach_pid); + + // `gdb -p PID` seems to always attach to main thread, but `gdb -p TID` + // seems to be able to attach to a particular thread in a process, which + // makes sense as the main thread TID == PID of the process. + // But I haven't found that gdb capability documented anywhere, so leave + // a back door to attach to main thread. + long long gdb_attach_id = attach_pid; + // Save current thread id before fork + long long attach_tid = 0; +#ifdef OS_LINUX + attach_tid = gettid(); + if (getenv("ROCKSDB_DEBUG_USE_PID") == nullptr) { + gdb_attach_id = attach_tid; + } +#endif + + char gdb_attach_id_str[20]; + snprintf(gdb_attach_id_str, sizeof(gdb_attach_id_str), "%lld", + gdb_attach_id); + + pid_t child_pid = fork(); + if (child_pid == 0) { + // child process + if (debug) { + if (strcmp(debug_env, "lldb") == 0) { + fprintf(stderr, "Invoking LLDB for debugging (ROCKSDB_DEBUG=%s)...\n", + debug_env); + execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, + /*"-Q",*/ "-o", GetLldbScriptSelectThread(attach_tid), + (char*)nullptr); + return; + } else { + fprintf(stderr, "Invoking GDB for debugging (ROCKSDB_DEBUG=%s)...\n", + debug_env); + execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-p", gdb_attach_id_str, + (char*)nullptr); + return; + } + } else { + // Redirect child stdout to original stderr + dup2(2, 1); + // No child stdin (don't use pager) + close(0); + if (lldb_stack_trace) { + fprintf(stderr, "Invoking LLDB for stack trace...\n"); + + // Skip top ~8 frames here in PrintStack + auto bt_in_lldb = + "script -l python -- for f in lldb.thread.frames[8:]: print(f)"; + execlp(/*cmd in PATH*/ "lldb", /*arg0*/ "lldb", "-p", attach_pid_str, + "-b", "-Q", "-o", GetLldbScriptSelectThread(attach_tid), "-o", + bt_in_lldb, (char*)nullptr); + } else { + // gdb_stack_trace + fprintf(stderr, "Invoking GDB for stack trace...\n"); + + // Skip top ~4 frames here in PrintStack + // See https://stackoverflow.com/q/40991943/454544 + auto bt_in_gdb = + "frame apply level 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 " + "21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 " + "42 43 44 -q frame"; + // -n : Loading config files can apparently cause failures with the + // other options here. 
+ // -batch : non-interactive; suppress banners as much as possible + execlp(/*cmd in PATH*/ "gdb", /*arg0*/ "gdb", "-n", "-batch", "-p", + gdb_attach_id_str, "-ex", bt_in_gdb, (char*)nullptr); + } + return; + } + } else { + // parent process; wait for child + int wstatus; + waitpid(child_pid, &wstatus, 0); + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == EXIT_SUCCESS) { + // Good + return; + } + } + fprintf(stderr, "GDB failed; falling back on backtrace+addr2line...\n"); + } + const int kMaxFrames = 100; void* frames[kMaxFrames]; - auto num_frames = backtrace(frames, kMaxFrames); + int num_frames = (int) backtrace(frames, kMaxFrames); PrintStack(&frames[first_frames_to_skip], num_frames - first_frames_to_skip); } @@ -152,7 +304,7 @@ void* SaveStack(int* num_frames, int first_frames_to_skip) { const int kMaxFrames = 100; void* frames[kMaxFrames]; - auto count = backtrace(frames, kMaxFrames); + int count = (int) backtrace(frames, kMaxFrames); *num_frames = count - first_frames_to_skip; void* callstack = malloc(sizeof(void*) * *num_frames); memcpy(callstack, &frames[first_frames_to_skip], sizeof(void*) * *num_frames); @@ -190,7 +342,9 @@ void InstallStackTraceHandler() { signal(SIGSEGV, StackTraceHandler); signal(SIGBUS, StackTraceHandler); signal(SIGABRT, StackTraceHandler); - // Allow ouside debugger to attach, even with Yama security restrictions + // Allow ouside debugger to attach, even with Yama security restrictions. + // This is needed even outside of PrintStack() so that external mechanisms + // can dump stacks if they suspect that a test has hung. #ifdef PR_SET_PTRACER_ANY (void)prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); #endif diff --git a/port/win/env_win.h b/port/win/env_win.h index 8fbfb8246cdf..b6482ba923ee 100644 --- a/port/win/env_win.h +++ b/port/win/env_win.h @@ -97,7 +97,7 @@ class WinClock : public SystemClock { Status GetCurrentTime(int64_t* unix_time) override; // Converts seconds-since-Jan-01-1970 to a printable string - virtual std::string TimeToString(uint64_t time); + std::string TimeToString(uint64_t time) override; uint64_t GetPerfCounterFrequency() const { return perf_counter_frequency_; } @@ -116,7 +116,7 @@ class WinFileSystem : public FileSystem { ~WinFileSystem() {} static const char* kClassName() { return "WinFS"; } const char* Name() const override { return kClassName(); } - const char* NickName() const { return kDefaultName(); } + const char* NickName() const override { return kDefaultName(); } static size_t GetSectorSize(const std::string& fname); size_t GetPageSize() const { return page_size_; } @@ -227,6 +227,7 @@ class WinFileSystem : public FileSystem { const FileOptions& file_options) const override; FileOptions OptimizeForManifestWrite( const FileOptions& file_options) const override; + void SupportedOps(int64_t& supported_ops) override { supported_ops = 0; } protected: static uint64_t FileTimeToUnixTime(const FILETIME& ftTime); diff --git a/port/win/port_win.h b/port/win/port_win.h index 989b5620b9bb..4aa10d0052b0 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -17,21 +17,22 @@ #endif #include -#include -#include -#include -#include -#include -#include -#include +//^^ should be included first before other system lib #include +#include #include - #include +#include -#include "port/win/win_thread.h" +#include +#include +#include +#include +#include +#include -#include "rocksdb/options.h" +#include "port/win/win_thread.h" +#include "rocksdb/port_defs.h" #undef min #undef max @@ -116,7 +117,7 @@ class Mutex { // this will 
assert if the mutex is not locked // it does NOT verify that mutex is held by a calling thread - void AssertHeld() { + void AssertHeld() const { #ifndef NDEBUG assert(locked_); #endif @@ -158,7 +159,7 @@ class RWMutex { void WriteUnlock() { ReleaseSRWLockExclusive(&srwLock_); } // Empty as in POSIX - void AssertHeld() {} + void AssertHeld() const {} private: SRWLOCK srwLock_; @@ -169,6 +170,9 @@ class CondVar { explicit CondVar(Mutex* mu) : mu_(mu) {} ~CondVar(); + + Mutex* GetMutex() const { return mu_; } + void Wait(); bool TimedWait(uint64_t expiration_time); void Signal(); diff --git a/src.mk b/src.mk index f52df4ac0545..0dc9f713075f 100644 --- a/src.mk +++ b/src.mk @@ -10,7 +10,9 @@ LIB_SOURCES = \ cache/lru_cache.cc \ cache/compressed_secondary_cache.cc \ cache/secondary_cache.cc \ + cache/secondary_cache_adapter.cc \ cache/sharded_cache.cc \ + cache/tiered_secondary_cache.cc \ cloud/aws/aws_file_system.cc \ cloud/aws/aws_kafka.cc \ cloud/aws/aws_kinesis.cc \ @@ -109,9 +111,11 @@ LIB_SOURCES = \ db/wal_manager.cc \ db/wide/wide_column_serialization.cc \ db/wide/wide_columns.cc \ + db/wide/wide_columns_helper.cc \ db/write_batch.cc \ db/write_batch_base.cc \ db/write_controller.cc \ + db/write_stall_stats.cc \ db/write_thread.cc \ env/composite_env.cc \ env/env.cc \ @@ -168,6 +172,7 @@ LIB_SOURCES = \ options/configurable.cc \ options/customizable.cc \ options/db_options.cc \ + options/offpeak_time_info.cc \ options/options.cc \ options/options_helper.cc \ options/options_parser.cc \ @@ -214,6 +219,7 @@ LIB_SOURCES = \ table/get_context.cc \ table/iterator.cc \ table/merging_iterator.cc \ + table/compaction_merging_iterator.cc \ table/meta_blocks.cc \ table/persistent_cache_helper.cc \ table/plain/plain_table_bloom.cc \ @@ -250,6 +256,7 @@ LIB_SOURCES = \ util/concurrent_task_limiter_impl.cc \ util/crc32c.cc \ util/crc32c_arm64.cc \ + util/data_structure.cc \ util/dynamic_bloom.cc \ util/hash.cc \ util/murmurhash.cc \ @@ -263,6 +270,8 @@ LIB_SOURCES = \ util/string_util.cc \ util/thread_local.cc \ util/threadpool_imp.cc \ + util/udt_util.cc \ + util/write_batch_util.cc \ util/xxhash.cc \ utilities/agg_merge/agg_merge.cc \ utilities/backup/backup_engine.cc \ @@ -388,7 +397,9 @@ STRESS_LIB_SOURCES = \ db_stress_tool/db_stress_stat.cc \ db_stress_tool/db_stress_test_base.cc \ db_stress_tool/db_stress_tool.cc \ + db_stress_tool/db_stress_wide_merge_operator.cc \ db_stress_tool/expected_state.cc \ + db_stress_tool/expected_value.cc \ db_stress_tool/no_batched_ops_stress.cc \ db_stress_tool/multi_ops_txns_stress.cc \ @@ -396,6 +407,7 @@ TEST_LIB_SOURCES = \ db/db_test_util.cc \ db/db_with_timestamp_test_util.cc \ test_util/mock_time_env.cc \ + test_util/secondary_cache_test_util.cc \ test_util/testharness.cc \ test_util/testutil.cc \ utilities/agg_merge/test_agg_merge.cc \ @@ -439,13 +451,14 @@ BENCH_MAIN_SOURCES = \ TEST_MAIN_SOURCES = \ cache/cache_test.cc \ cache/cache_reservation_manager_test.cc \ - cache/lru_cache_test.cc \ cloud/db_cloud_test.cc \ cloud/cloud_file_system_test.cc \ cloud/cloud_manifest_test.cc \ cloud/cloud_scheduler_test.cc \ cloud/replication_test.cc \ cache/compressed_secondary_cache_test.cc \ + cache/lru_cache_test.cc \ + cache/tiered_secondary_cache_test.cc \ db/blob/blob_counting_iterator_test.cc \ db/blob/blob_file_addition_test.cc \ db/blob/blob_file_builder_test.cc \ @@ -475,6 +488,7 @@ TEST_MAIN_SOURCES = \ db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ db/db_compaction_test.cc \ + db/db_clip_test.cc \ db/db_dynamic_level_test.cc \ 
db/db_encryption_test.cc \ db/db_flush_test.cc \ @@ -540,6 +554,7 @@ TEST_MAIN_SOURCES = \ db/wal_manager_test.cc \ db/wide/db_wide_basic_test.cc \ db/wide/wide_column_serialization_test.cc \ + db/wide/wide_columns_helper_test.cc \ db/write_batch_test.cc \ db/write_callback_test.cc \ db/write_controller_test.cc \ @@ -607,6 +622,7 @@ TEST_MAIN_SOURCES = \ util/timer_test.cc \ util/thread_list_test.cc \ util/thread_local_test.cc \ + util/udt_util_test.cc \ util/work_queue_test.cc \ utilities/agg_merge/agg_merge_test.cc \ utilities/backup/backup_engine_test.cc \ @@ -667,13 +683,17 @@ JNI_NATIVE_SOURCES = \ java/rocksjni/compression_options.cc \ java/rocksjni/concurrent_task_limiter.cc \ java/rocksjni/config_options.cc \ + java/rocksjni/export_import_files_metadatajni.cc \ java/rocksjni/env.cc \ java/rocksjni/env_options.cc \ java/rocksjni/event_listener.cc \ java/rocksjni/event_listener_jnicallback.cc \ + java/rocksjni/import_column_family_options.cc \ java/rocksjni/ingest_external_file_options.cc \ java/rocksjni/filter.cc \ + java/rocksjni/hyper_clock_cache.cc \ java/rocksjni/iterator.cc \ + java/rocksjni/jni_perf_context.cc \ java/rocksjni/jnicallback.cc \ java/rocksjni/loggerjnicallback.cc \ java/rocksjni/lru_cache.cc \ diff --git a/table/adaptive/adaptive_table_factory.cc b/table/adaptive/adaptive_table_factory.cc index bbea91b54269..5a573ca992aa 100644 --- a/table/adaptive/adaptive_table_factory.cc +++ b/table/adaptive/adaptive_table_factory.cc @@ -3,7 +3,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #include "table/adaptive/adaptive_table_factory.h" #include "port/port.h" @@ -124,4 +123,3 @@ extern TableFactory* NewAdaptiveTableFactory( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/adaptive/adaptive_table_factory.h b/table/adaptive/adaptive_table_factory.h index 3b631942d235..55c8bca1f42d 100644 --- a/table/adaptive/adaptive_table_factory.h +++ b/table/adaptive/adaptive_table_factory.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -55,4 +54,3 @@ class AdaptiveTableFactory : public TableFactory { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/block_based/binary_search_index_reader.cc b/table/block_based/binary_search_index_reader.cc index 21787cc1aa52..50e2ca894510 100644 --- a/table/block_based/binary_search_index_reader.cc +++ b/table/block_based/binary_search_index_reader.cc @@ -46,9 +46,8 @@ InternalIteratorBase* BinarySearchIndexReader::NewIterator( const BlockBasedTable::Rep* rep = table()->get_rep(); const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority, - get_context, lookup_context, &index_block); + const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context, + &index_block, read_options); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -64,7 +63,8 @@ InternalIteratorBase* BinarySearchIndexReader::NewIterator( auto it = index_block.GetValue()->NewIndexIterator( internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, true, - index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + index_has_first_key(), index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, user_defined_timestamps_persisted()); assert(it != nullptr); 
index_block.TransferTo(it); diff --git a/table/block_based/block.cc b/table/block_based/block.cc index 7eb0b010f2ad..9bebdfbdcabe 100644 --- a/table/block_based/block.cc +++ b/table/block_based/block.cc @@ -30,7 +30,7 @@ namespace ROCKSDB_NAMESPACE { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and -// "*value_length", respectively. Will not derefence past "limit". +// "*value_length", respectively. Will not dereference past "limit". // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). @@ -137,17 +137,26 @@ struct DecodeEntryV4 { return DecodeKeyV4()(p, limit, shared, non_shared); } }; + void DataBlockIter::NextImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::NextImpl")) return; +#endif bool is_shared = false; ParseNextDataKey(&is_shared); + ++cur_entry_idx_; } void MetaBlockIter::NextImpl() { bool is_shared = false; ParseNextKey(&is_shared); + ++cur_entry_idx_; } -void IndexBlockIter::NextImpl() { ParseNextIndexKey(); } +void IndexBlockIter::NextImpl() { + ParseNextIndexKey(); + ++cur_entry_idx_; +} void IndexBlockIter::PrevImpl() { assert(Valid()); @@ -166,6 +175,7 @@ void IndexBlockIter::PrevImpl() { // Loop until end of current entry hits the start of original entry while (ParseNextIndexKey() && NextEntryOffset() < original) { } + --cur_entry_idx_; } void MetaBlockIter::PrevImpl() { @@ -187,6 +197,7 @@ void MetaBlockIter::PrevImpl() { while (ParseNextKey(&is_shared) && NextEntryOffset() < original) { } + --cur_entry_idx_; } // Similar to IndexBlockIter::PrevImpl but also caches the prev entries @@ -195,6 +206,7 @@ void DataBlockIter::PrevImpl() { assert(prev_entries_idx_ == -1 || static_cast(prev_entries_idx_) < prev_entries_.size()); + --cur_entry_idx_; // Check if we can use cached prev_entries_ if (prev_entries_idx_ > 0 && prev_entries_[prev_entries_idx_].offset == current_) { @@ -307,11 +319,11 @@ void MetaBlockIter::SeekImpl(const Slice& target) { // target = "seek_user_key @ type | seqno". // // For any type other than kTypeValue, kTypeDeletion, kTypeSingleDeletion, -// kTypeBlobIndex, or kTypeWideColumnEntity, this function behaves identically -// to Seek(). +// kTypeBlobIndex, kTypeWideColumnEntity or kTypeMerge, this function behaves +// identically to Seek(). // // For any type in kTypeValue, kTypeDeletion, kTypeSingleDeletion, -// kTypeBlobIndex, or kTypeWideColumnEntity: +// kTypeBlobIndex, kTypeWideColumnEntity, or kTypeMerge: // // If the return value is FALSE, iter location is undefined, and it means: // 1) there is no key in this block falling into the range: @@ -319,10 +331,10 @@ void MetaBlockIter::SeekImpl(const Slice& target) { // inclusive; AND // 2) the last key of this block has a greater user_key from seek_user_key // -// If the return value is TRUE, iter location has two possibilies: -// 1) If iter is valid, it is set to a location as if set by BinarySeek. In -// this case, it points to the first key with a larger user_key or a matching -// user_key with a seqno no greater than the seeking seqno. +// If the return value is TRUE, iter location has two possibilities: +// 1) If iter is valid, it is set to a location as if set by SeekImpl(target). +// In this case, it points to the first key with a larger user_key or a +// matching user_key with a seqno no greater than the seeking seqno. 
// 2) If the iter is invalid, it means that either all the user_key is less // than the seek_user_key, or the block ends with a matching user_key but // with a smaller [ type | seqno ] (i.e. a larger seqno, or the same seqno @@ -347,11 +359,11 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // boundary key: axy@50 (we make minimal assumption about a boundary key) // Block N+1: [axy@10, ... ] // - // If seek_key = axy@60, the search will starts from Block N. + // If seek_key = axy@60, the search will start from Block N. // Even if the user_key is not found in the hash map, the caller still // have to continue searching the next block. // - // In this case, we pretend the key is the the last restart interval. + // In this case, we pretend the key is in the last restart interval. // The while-loop below will search the last restart interval for the // key. It will stop at the first key that is larger than the seek_key, // or to the end of the block if no one is larger. @@ -364,12 +376,15 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { assert(restart_index < num_restarts_); SeekToRestartPoint(restart_index); current_ = GetRestartPoint(restart_index); + cur_entry_idx_ = + static_cast(restart_index * block_restart_interval_) - 1; uint32_t limit = restarts_; if (restart_index + 1 < num_restarts_) { limit = GetRestartPoint(restart_index + 1); } while (current_ < limit) { + ++cur_entry_idx_; bool shared; // Here we only linear seek the target key inside the restart interval. // If a key does not exist inside a restart interval, we avoid @@ -381,14 +396,20 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { // we stop at the first potential matching user key. break; } + // If the loop exits due to CompareCurrentKey(target) >= 0, then current key + // exists, and its checksum verification will be done in UpdateKey() called + // in SeekForGet(). + // TODO(cbi): If this loop exits with current_ == restart_, per key-value + // checksum will not be verified in UpdateKey() since Valid() + // will return false. } if (current_ == restarts_) { - // Search reaches to the end of the block. There are three possibilites: - // 1) there is only one user_key match in the block (otherwise collsion). + // Search reaches to the end of the block. There are three possibilities: + // 1) there is only one user_key match in the block (otherwise collision). // the matching user_key resides in the last restart interval, and it // is the last key of the restart interval and of the block as well. - // ParseNextKey() skiped it as its [ type | seqno ] is smaller. + // ParseNextKey() skipped it as its [ type | seqno ] is smaller. // // 2) The seek_key is not found in the HashIndex Lookup(), i.e. kNoEntry, // AND all existing user_keys in the restart interval are smaller than @@ -412,11 +433,11 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { ValueType value_type = ExtractValueType(raw_key_.GetInternalKey()); if (value_type != ValueType::kTypeValue && value_type != ValueType::kTypeDeletion && + value_type != ValueType::kTypeMerge && value_type != ValueType::kTypeSingleDeletion && value_type != ValueType::kTypeBlobIndex && value_type != ValueType::kTypeWideColumnEntity) { SeekImpl(target); - return true; } // Result found, and the iter is correctly set. 
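Aside on this hunk: the cur_entry_idx_ bookkeeping threaded through SeekForGetImpl, NextImpl and PrevImpl exists so that UpdateKey() can look up the checksum slot belonging to the entry about to be exposed. As a rough, standalone illustration of that "one checksum slot per entry, verified by entry index" idea only — the patch itself uses ProtectionInfo64, and ToyBlock, ToyKvChecksum and the FNV-style mix below are invented for this sketch, not part of the change:

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for a per key-value checksum. The patch uses ProtectionInfo64;
// this FNV-1a style mix is only for illustration.
uint64_t ToyKvChecksum(const std::string& key, const std::string& value) {
  uint64_t h = 1469598103934665603ULL;
  auto mix = [&h](const std::string& s) {
    for (unsigned char c : s) {
      h ^= c;
      h *= 1099511628211ULL;
    }
  };
  mix(key);
  mix(value);
  return h;
}

struct ToyBlock {
  std::vector<std::pair<std::string, std::string>> entries;
  std::vector<uint64_t> kv_checksum;  // one slot per entry, in entry order

  // Analogous in spirit to the protection-info initialization: walk the block
  // once and record a checksum per (key, value) pair.
  void InitializeProtection() {
    kv_checksum.clear();
    for (const auto& kv : entries) {
      kv_checksum.push_back(ToyKvChecksum(kv.first, kv.second));
    }
  }

  // Analogous to the check done when a key is surfaced: the iterator knows
  // which entry it is positioned on (its cur_entry_idx_) and verifies that
  // slot before exposing the key. Returns false on a mismatch (corruption).
  bool VerifyEntry(size_t cur_entry_idx) const {
    const auto& kv = entries[cur_entry_idx];
    return ToyKvChecksum(kv.first, kv.second) == kv_checksum[cur_entry_idx];
  }
};

Tracking the entry index explicitly, rather than deriving it from byte offsets, is what lets the same verification work for both forward and backward iteration, which is why this hunk increments the index in NextImpl and decrements it in PrevImpl.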
@@ -424,6 +445,9 @@ bool DataBlockIter::SeekForGetImpl(const Slice& target) { } void IndexBlockIter::SeekImpl(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekImpl")) return; +#endif TEST_SYNC_POINT("IndexBlockIter::Seek:0"); PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet @@ -478,7 +502,9 @@ void DataBlockIter::SeekForPrevImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLastImpl(); + if (status_.ok()) { + SeekToLastImpl(); + } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); @@ -502,7 +528,9 @@ void MetaBlockIter::SeekForPrevImpl(const Slice& target) { FindKeyAfterBinarySeek(seek_key, index, skip_linear_scan); if (!Valid()) { - SeekToLastImpl(); + if (status_.ok()) { + SeekToLastImpl(); + } } else { while (Valid() && CompareCurrentKey(seek_key) > 0) { PrevImpl(); @@ -517,6 +545,7 @@ void DataBlockIter::SeekToFirstImpl() { SeekToRestartPoint(0); bool is_shared = false; ParseNextDataKey(&is_shared); + cur_entry_idx_ = 0; } void MetaBlockIter::SeekToFirstImpl() { @@ -526,15 +555,20 @@ void MetaBlockIter::SeekToFirstImpl() { SeekToRestartPoint(0); bool is_shared = false; ParseNextKey(&is_shared); + cur_entry_idx_ = 0; } void IndexBlockIter::SeekToFirstImpl() { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("IndexBlockIter::SeekToFirstImpl")) return; +#endif if (data_ == nullptr) { // Not init yet return; } status_ = Status::OK(); SeekToRestartPoint(0); ParseNextIndexKey(); + cur_entry_idx_ = 0; } void DataBlockIter::SeekToLastImpl() { @@ -543,8 +577,10 @@ void DataBlockIter::SeekToLastImpl() { } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = false; + cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; while (ParseNextDataKey(&is_shared) && NextEntryOffset() < restarts_) { // Keep skipping + ++cur_entry_idx_; } } @@ -554,9 +590,13 @@ void MetaBlockIter::SeekToLastImpl() { } SeekToRestartPoint(num_restarts_ - 1); bool is_shared = false; + assert(num_restarts_ >= 1); + cur_entry_idx_ = + static_cast((num_restarts_ - 1) * block_restart_interval_); while (ParseNextKey(&is_shared) && NextEntryOffset() < restarts_) { - // Keep skipping + // Will probably never reach here since restart_interval is always 1 + ++cur_entry_idx_; } } @@ -566,20 +606,12 @@ void IndexBlockIter::SeekToLastImpl() { } status_ = Status::OK(); SeekToRestartPoint(num_restarts_ - 1); + cur_entry_idx_ = (num_restarts_ - 1) * block_restart_interval_; while (ParseNextIndexKey() && NextEntryOffset() < restarts_) { - // Keep skipping + ++cur_entry_idx_; } } -template -void BlockIter::CorruptionError() { - current_ = restarts_; - restart_index_ = num_restarts_; - status_ = Status::Corruption("bad entry in block"); - raw_key_.Clear(); - value_.clear(); -} - template template bool BlockIter::ParseNextKey(bool* is_shared) { @@ -602,13 +634,22 @@ bool BlockIter::ParseNextKey(bool* is_shared) { } else { if (shared == 0) { *is_shared = false; - // If this key doesn't share any bytes with prev key then we don't need - // to decode it and can use its address in the block directly. - raw_key_.SetKey(Slice(p, non_shared), false /* copy */); + // If this key doesn't share any bytes with prev key, and no min timestamp + // needs to be padded to the key, then we don't need to decode it and + // can use its address in the block directly (no copy). 
+ UpdateRawKeyAndMaybePadMinTimestamp(Slice(p, non_shared)); } else { // This key share `shared` bytes with prev key, we need to decode it *is_shared = true; - raw_key_.TrimAppend(shared, p, non_shared); + // If user-defined timestamp is stripped from user key before keys are + // delta encoded, the decoded key consisting of the shared and non shared + // bytes do not have user-defined timestamp yet. We need to pad min + // timestamp to it. + if (pad_min_timestamp_) { + raw_key_.TrimAppendWithTimestamp(shared, p, non_shared, ts_sz_); + } else { + raw_key_.TrimAppend(shared, p, non_shared); + } } value_ = Slice(p + non_shared, value_length); if (shared == 0) { @@ -630,7 +671,8 @@ bool DataBlockIter::ParseNextDataKey(bool* is_shared) { // If we are reading a file with a global sequence number we should // expect that all encoded sequence numbers are zeros and any value // type is kTypeValue, kTypeMerge, kTypeDeletion, - // kTypeDeletionWithTimestamp, or kTypeRangeDeletion. + // kTypeDeletionWithTimestamp, kTypeRangeDeletion, or + // kTypeWideColumnEntity. uint64_t packed = ExtractInternalKeyFooter(raw_key_.GetKey()); SequenceNumber seqno; ValueType value_type; @@ -639,7 +681,8 @@ bool DataBlockIter::ParseNextDataKey(bool* is_shared) { value_type == ValueType::kTypeMerge || value_type == ValueType::kTypeDeletion || value_type == ValueType::kTypeDeletionWithTimestamp || - value_type == ValueType::kTypeRangeDeletion); + value_type == ValueType::kTypeRangeDeletion || + value_type == ValueType::kTypeWideColumnEntity); assert(seqno == 0); } #endif // NDEBUG @@ -654,7 +697,8 @@ bool IndexBlockIter::ParseNextIndexKey() { bool ok = (value_delta_encoded_) ? ParseNextKey(&is_shared) : ParseNextKey(&is_shared); if (ok) { - if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr || + pad_min_timestamp_) { DecodeCurrentValue(is_shared); } } @@ -666,12 +710,12 @@ bool IndexBlockIter::ParseNextIndexKey() { // restart_point 1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) // ... // restart_point n-1: k, v (off, sz), k, v (delta-sz), ..., k, v (delta-sz) -// where, k is key, v is value, and its encoding is in parenthesis. +// where, k is key, v is value, and its encoding is in parentheses. // The format of each key is (shared_size, non_shared_size, shared, non_shared) // The format of each value, i.e., block handle, is (offset, size) whenever the // is_shared is false, which included the first entry in each restart point. -// Otherwise the format is delta-size = block handle size - size of last block -// handle. +// Otherwise, the format is delta-size = the size of current block - the size o +// last block. void IndexBlockIter::DecodeCurrentValue(bool is_shared) { Slice v(value_.data(), data_ + restarts_ - value_.data()); // Delta encoding is used if `shared` != 0. 
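Aside on the restart-point comment just above: within a restart interval, index entries store only a signed delta-size for the block handle, because adjacent data blocks (plus a fixed trailer) are laid out back to back, so the next handle's offset can be recomputed from the previous one. A minimal standalone sketch of that reconstruction, assuming a contiguous layout; ToyHandle, DecodeDeltaEncodedHandles and kToyBlockTrailerSize are names invented here, and the trailer size is only a placeholder for whatever the table format actually uses:

#include <cstdint>
#include <vector>

// Placeholder for the per-block trailer (compression byte + checksum) that
// separates adjacent blocks on disk; the real size is a table-format detail.
constexpr uint64_t kToyBlockTrailerSize = 5;

struct ToyHandle {
  uint64_t offset;
  uint64_t size;
};

// Rebuild full (offset, size) handles from the first full handle plus signed
// per-entry size deltas, following the scheme described in the comment above:
//   offset[i+1] = offset[i] + size[i] + trailer
//   size[i+1]   = size[i] + delta[i]
std::vector<ToyHandle> DecodeDeltaEncodedHandles(
    ToyHandle first, const std::vector<int64_t>& size_deltas) {
  std::vector<ToyHandle> handles{first};
  for (int64_t delta : size_deltas) {
    const ToyHandle& prev = handles.back();
    ToyHandle next;
    next.offset = prev.offset + prev.size + kToyBlockTrailerSize;
    next.size = static_cast<uint64_t>(static_cast<int64_t>(prev.size) + delta);
    handles.push_back(next);
  }
  return handles;
}

The design choice this mirrors is that only the first entry after each restart point needs a full (offset, size); every following entry in the interval can be decoded incrementally, which is exactly what DecodeCurrentValue does in the hunk below.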
@@ -694,12 +738,19 @@ void IndexBlockIter::DecodeCurrentValue(bool is_shared) { assert(value_type == ValueType::kTypeValue || value_type == ValueType::kTypeMerge || value_type == ValueType::kTypeDeletion || - value_type == ValueType::kTypeRangeDeletion); + value_type == ValueType::kTypeRangeDeletion || + value_type == ValueType::kTypeWideColumnEntity); first_internal_key.UpdateInternalKey(global_seqno_state_->global_seqno, value_type); decoded_value_.first_internal_key = first_internal_key.GetKey(); } + if (pad_min_timestamp_ && !decoded_value_.first_internal_key.empty()) { + first_internal_key_with_ts_.clear(); + PadInternalKeyWithMinTimestamp(&first_internal_key_with_ts_, + decoded_value_.first_internal_key, ts_sz_); + decoded_value_.first_internal_key = first_internal_key_with_ts_; + } } template @@ -710,6 +761,7 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, // to follow it up with NextImpl() to position the iterator at the restart // key. SeekToRestartPoint(index); + cur_entry_idx_ = static_cast(index * block_restart_interval_) - 1; NextImpl(); if (!skip_linear_scan) { @@ -728,6 +780,8 @@ void BlockIter::FindKeyAfterBinarySeek(const Slice& target, while (true) { NextImpl(); if (!Valid()) { + // TODO(cbi): per key-value checksum will not be verified in UpdateKey() + // since Valid() will returns false. break; } if (current_ == max_offset) { @@ -782,7 +836,7 @@ bool BlockIter::BinarySeek(const Slice& target, uint32_t* index, return false; } Slice mid_key(key_ptr, non_shared); - raw_key_.SetKey(mid_key, false /* copy */); + UpdateRawKeyAndMaybePadMinTimestamp(mid_key); int cmp = CompareCurrentKey(target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all @@ -825,7 +879,7 @@ int IndexBlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { return 1; // Return target is smaller } Slice block_key(key_ptr, non_shared); - raw_key_.SetKey(block_key, false /* copy */); + UpdateRawKeyAndMaybePadMinTimestamp(block_key); return CompareCurrentKey(target); } @@ -976,6 +1030,7 @@ Block::~Block() { // This sync point can be re-enabled if RocksDB can control the // initialization order of any/all static options created by the user. // TEST_SYNC_POINT("Block::~Block"); + delete[] kv_checksum_; } Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, @@ -1010,10 +1065,8 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, uint16_t map_offset; data_block_hash_index_.Initialize( - contents.data.data(), - static_cast(contents.data.size() - - sizeof(uint32_t)), /*chop off - NUM_RESTARTS*/ + data_, static_cast(size_ - sizeof(uint32_t)), /*chop off + NUM_RESTARTS*/ &map_offset); restart_offset_ = map_offset - num_restarts_ * sizeof(uint32_t); @@ -1035,6 +1088,132 @@ Block::Block(BlockContents&& contents, size_t read_amp_bytes_per_bit, } } +void Block::InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp) { + protection_bytes_per_key_ = 0; + if (protection_bytes_per_key > 0 && num_restarts_ > 0) { + // NewDataIterator() is called with protection_bytes_per_key_ = 0. + // This is intended since checksum is not constructed yet. + // + // We do not know global_seqno yet, so checksum computation and + // verification all assume global_seqno = 0. + // TODO(yuzhangyu): handle the implication of padding timestamp for kv + // protection. 
+ std::unique_ptr iter{NewDataIterator( + raw_ucmp, kDisableGlobalSequenceNumber, nullptr /* iter */, + nullptr /* stats */, true /* block_contents_pinned */, + true /* user_defined_timestamps_persisted */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + size_t i = 0; + iter->SeekToFirst(); + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +void Block::InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp, + bool value_is_full, + bool index_has_first_key) { + protection_bytes_per_key_ = 0; + if (num_restarts_ > 0 && protection_bytes_per_key > 0) { + // Note that `global_seqno` and `key_includes_seq` are hardcoded here. They + // do not impact how the index block is parsed. During checksum + // construction/verification, we use the entire key buffer from + // raw_key_.GetKey() returned by iter->key() as the `key` part of key-value + // checksum, and the content of this buffer do not change for different + // values of `global_seqno` or `key_includes_seq`. + // TODO(yuzhangyu): handle the implication of padding timestamp for kv + // protection. + std::unique_ptr iter{NewIndexIterator( + raw_ucmp, kDisableGlobalSequenceNumber /* global_seqno */, nullptr, + nullptr /* Statistics */, true /* total_order_seek */, + index_has_first_key /* have_first_key */, false /* key_includes_seq */, + value_is_full, true /* block_contents_pinned */, + true /* user_defined_timestamps_persisted*/, + nullptr /* prefix_index */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + iter->SeekToFirst(); + size_t i = 0; + while (iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->raw_value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + +void Block::InitializeMetaIndexBlockProtectionInfo( + uint8_t protection_bytes_per_key) { + protection_bytes_per_key_ = 0; + if (num_restarts_ > 0 && protection_bytes_per_key > 0) { + std::unique_ptr iter{ + NewMetaIterator(true /* block_contents_pinned */)}; + if (iter->status().ok()) { + block_restart_interval_ = iter->GetRestartInterval(); + } + uint32_t num_keys = 0; + if (iter->status().ok()) { + num_keys = iter->NumberOfKeys(block_restart_interval_); + } + if (iter->status().ok()) { + checksum_size_ = num_keys * protection_bytes_per_key; + kv_checksum_ = new char[(size_t)checksum_size_]; + iter->SeekToFirst(); + size_t i = 0; + while 
(iter->Valid()) { + GenerateKVChecksum(kv_checksum_ + i, protection_bytes_per_key, + iter->key(), iter->value()); + iter->Next(); + i += protection_bytes_per_key; + } + assert(!iter->status().ok() || i == num_keys * protection_bytes_per_key); + } + if (!iter->status().ok()) { + size_ = 0; // Error marker + return; + } + protection_bytes_per_key_ = protection_bytes_per_key; + } +} + MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { MetaBlockIter* iter = new MetaBlockIter(); if (size_ < 2 * sizeof(uint32_t)) { @@ -1045,7 +1224,8 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { iter->Invalidate(Status::OK()); } else { iter->Initialize(data_, restart_offset_, num_restarts_, - block_contents_pinned); + block_contents_pinned, protection_bytes_per_key_, + kv_checksum_, block_restart_interval_); } return iter; } @@ -1053,7 +1233,8 @@ MetaBlockIter* Block::NewMetaIterator(bool block_contents_pinned) { DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, SequenceNumber global_seqno, DataBlockIter* iter, Statistics* stats, - bool block_contents_pinned) { + bool block_contents_pinned, + bool user_defined_timestamps_persisted) { DataBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -1072,7 +1253,9 @@ DataBlockIter* Block::NewDataIterator(const Comparator* raw_ucmp, ret_iter->Initialize( raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, read_amp_bitmap_.get(), block_contents_pinned, - data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr); + user_defined_timestamps_persisted, + data_block_hash_index_.Valid() ? &data_block_hash_index_ : nullptr, + protection_bytes_per_key_, kv_checksum_, block_restart_interval_); if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ @@ -1088,7 +1271,8 @@ IndexBlockIter* Block::NewIndexIterator( const Comparator* raw_ucmp, SequenceNumber global_seqno, IndexBlockIter* iter, Statistics* /*stats*/, bool total_order_seek, bool have_first_key, bool key_includes_seq, bool value_is_full, - bool block_contents_pinned, BlockPrefixIndex* prefix_index) { + bool block_contents_pinned, bool user_defined_timestamps_persisted, + BlockPrefixIndex* prefix_index) { IndexBlockIter* ret_iter; if (iter != nullptr) { ret_iter = iter; @@ -1106,10 +1290,11 @@ IndexBlockIter* Block::NewIndexIterator( } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? 
nullptr : prefix_index; - ret_iter->Initialize(raw_ucmp, data_, restart_offset_, num_restarts_, - global_seqno, prefix_index_ptr, have_first_key, - key_includes_seq, value_is_full, - block_contents_pinned); + ret_iter->Initialize( + raw_ucmp, data_, restart_offset_, num_restarts_, global_seqno, + prefix_index_ptr, have_first_key, key_includes_seq, value_is_full, + block_contents_pinned, user_defined_timestamps_persisted, + protection_bytes_per_key_, kv_checksum_, block_restart_interval_); } return ret_iter; @@ -1125,6 +1310,7 @@ size_t Block::ApproximateMemoryUsage() const { if (read_amp_bitmap_) { usage += read_amp_bitmap_->ApproximateMemoryUsage(); } + usage += checksum_size_; return usage; } diff --git a/table/block_based/block.h b/table/block_based/block.h index 90f9aa397bf2..dcd83aa6e4a9 100644 --- a/table/block_based/block.h +++ b/table/block_based/block.h @@ -14,8 +14,10 @@ #include #include +#include "db/kv_checksum.h" #include "db/pinned_iterators_manager.h" #include "port/malloc.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" @@ -186,6 +188,9 @@ class Block { // will not go away (for example, it's from mmapped file which will not be // closed). // + // `user_defined_timestamps_persisted` controls whether a min timestamp is + // padded while key is being parsed from the block. + // // NOTE: for the hash based lookup, if a key prefix doesn't match any key, // the iterator will simply be set as "invalid", rather than returning // the key that is just pass the target key. @@ -193,7 +198,8 @@ class Block { SequenceNumber global_seqno, DataBlockIter* iter = nullptr, Statistics* stats = nullptr, - bool block_contents_pinned = false); + bool block_contents_pinned = false, + bool user_defined_timestamps_persisted = true); // Returns an MetaBlockIter for iterating over blocks containing metadata // (like Properties blocks). Unlike data blocks, the keys for these blocks @@ -225,13 +231,15 @@ class Block { // first_internal_key. It affects data serialization format, so the same value // have_first_key must be used when writing and reading index. // It is determined by IndexType property of the table. - IndexBlockIter* NewIndexIterator(const Comparator* raw_ucmp, - SequenceNumber global_seqno, - IndexBlockIter* iter, Statistics* stats, - bool total_order_seek, bool have_first_key, - bool key_includes_seq, bool value_is_full, - bool block_contents_pinned = false, - BlockPrefixIndex* prefix_index = nullptr); + // `user_defined_timestamps_persisted` controls whether a min timestamp is + // padded while key is being parsed from the block. + IndexBlockIter* NewIndexIterator( + const Comparator* raw_ucmp, SequenceNumber global_seqno, + IndexBlockIter* iter, Statistics* stats, bool total_order_seek, + bool have_first_key, bool key_includes_seq, bool value_is_full, + bool block_contents_pinned = false, + bool user_defined_timestamps_persisted = true, + BlockPrefixIndex* prefix_index = nullptr); // Report an approximation of how much memory has been used. size_t ApproximateMemoryUsage() const; @@ -239,6 +247,34 @@ class Block { // For TypedCacheInterface const Slice& ContentSlice() const { return contents_.data; } + // Initializes per key-value checksum protection. + // After this method is called, each DataBlockIterator returned + // by NewDataIterator will verify per key-value checksum for any key it read. 
+ void InitializeDataBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp); + + // Initializes per key-value checksum protection. + // After this method is called, each IndexBlockIterator returned + // by NewIndexIterator will verify per key-value checksum for any key it read. + // value_is_full and index_has_first_key are needed to be able to parse + // the index block content and construct checksums. + void InitializeIndexBlockProtectionInfo(uint8_t protection_bytes_per_key, + const Comparator* raw_ucmp, + bool value_is_full, + bool index_has_first_key); + + // Initializes per key-value checksum protection. + // After this method is called, each MetaBlockIter returned + // by NewMetaIterator will verify per key-value checksum for any key it read. + void InitializeMetaIndexBlockProtectionInfo(uint8_t protection_bytes_per_key); + + static void GenerateKVChecksum(char* checksum_ptr, uint8_t checksum_len, + const Slice& key, const Slice& value) { + ProtectionInfo64().ProtectKV(key, value).Encode(checksum_len, checksum_ptr); + } + + const char* TEST_GetKVChecksum() const { return kv_checksum_; } + private: BlockContents contents_; const char* data_; // contents_.data.data() @@ -246,6 +282,11 @@ class Block { uint32_t restart_offset_; // Offset in data_ of restart array uint32_t num_restarts_; std::unique_ptr read_amp_bitmap_; + char* kv_checksum_{nullptr}; + uint32_t checksum_size_{0}; + // Used by block iterators to calculate current key index within a block + uint32_t block_restart_interval_{0}; + uint8_t protection_bytes_per_key_{0}; DataBlockHashIndex data_block_hash_index_; }; @@ -268,6 +309,14 @@ class Block { // `Seek()` logic would be implemented by subclasses in `SeekImpl()`. These // "Impl" functions are responsible for positioning `raw_key_` but not // invoking `UpdateKey()`. +// +// Per key-value checksum is enabled if relevant states are passed in during +// `InitializeBase()`. The checksum verification is done in each call to +// UpdateKey() for the current key. Each subclass is responsible for keeping +// track of cur_entry_idx_, the index of the current key within the block. +// BlockIter uses this index to get the corresponding checksum for current key. +// Additional checksum verification may be done in subclasses if they read keys +// other than the key being processed in UpdateKey(). template class BlockIter : public InternalIteratorBase { public: @@ -285,9 +334,16 @@ class BlockIter : public InternalIteratorBase { Cleanable::Reset(); } - bool Valid() const override { return current_ < restarts_; } + bool Valid() const override { + // When status_ is not ok, iter should be invalid. 
+ assert(status_.ok() || current_ >= restarts_); + return current_ < restarts_; + } virtual void SeekToFirst() override final { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("BlockIter::SeekToFirst")) return; +#endif SeekToFirstImpl(); UpdateKey(); } @@ -324,6 +380,7 @@ class BlockIter : public InternalIteratorBase { } Status status() const override { return status_; } + Slice key() const override { assert(Valid()); return key_; @@ -336,10 +393,22 @@ class BlockIter : public InternalIteratorBase { (pinned_iters_mgr_ && !pinned_iters_mgr_->PinningEnabled())); status_.PermitUncheckedError(); } + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { pinned_iters_mgr_ = pinned_iters_mgr; } + PinnedIteratorsManager* pinned_iters_mgr_ = nullptr; + + bool TEST_Corrupt_Callback(const std::string& sync_point) { + bool corrupt = false; + TEST_SYNC_POINT_CALLBACK(sync_point, static_cast(&corrupt)); + + if (corrupt) { + CorruptionError(); + } + return corrupt; + } #endif bool IsKeyPinned() const override { @@ -376,27 +445,89 @@ class BlockIter : public InternalIteratorBase { Status status_; // Key to be exposed to users. Slice key_; + SequenceNumber global_seqno_; + // Size of the user-defined timestamp. + size_t ts_sz_ = 0; + // If user-defined timestamp is enabled but not persisted. A min timestamp + // will be padded to the key during key parsing where it applies. Such as when + // parsing keys from data block, index block, parsing the first internal + // key from IndexValue entry. Min timestamp padding is different for when + // `raw_key_` is a user key vs is an internal key. + // + // This only applies to data block and index blocks including index block for + // data blocks, index block for partitioned filter blocks, index block for + // partitioned index blocks. In summary, this only applies to block whose key + // are real user keys or internal keys created from user keys. + bool pad_min_timestamp_; + + // Per key-value checksum related states + const char* kv_checksum_; + int32_t cur_entry_idx_; + uint32_t block_restart_interval_; + uint8_t protection_bytes_per_key_; + bool key_pinned_; // Whether the block data is guaranteed to outlive this iterator, and // as long as the cleanup functions are transferred to another class, // e.g. PinnableSlice, the pointer to the bytes will still be valid. bool block_contents_pinned_; - SequenceNumber global_seqno_; virtual void SeekToFirstImpl() = 0; virtual void SeekToLastImpl() = 0; virtual void SeekImpl(const Slice& target) = 0; virtual void SeekForPrevImpl(const Slice& target) = 0; virtual void NextImpl() = 0; - virtual void PrevImpl() = 0; + // Returns the restart interval of this block. + // Returns 0 if num_restarts_ <= 1 or if the BlockIter is not initialized. + virtual uint32_t GetRestartInterval() { + if (num_restarts_ <= 1 || data_ == nullptr) { + return 0; + } + SeekToFirstImpl(); + uint32_t end_index = GetRestartPoint(1); + uint32_t count = 1; + while (NextEntryOffset() < end_index && status_.ok()) { + assert(Valid()); + NextImpl(); + ++count; + } + return count; + } + + // Returns the number of keys in this block. 
+ virtual uint32_t NumberOfKeys(uint32_t block_restart_interval) { + if (num_restarts_ == 0 || data_ == nullptr) { + return 0; + } + uint32_t count = (num_restarts_ - 1) * block_restart_interval; + // Add number of keys from the last restart interval + SeekToRestartPoint(num_restarts_ - 1); + while (NextEntryOffset() < restarts_ && status_.ok()) { + NextImpl(); + ++count; + } + return count; + } + + // Stores whether the current key has a shared bytes with prev key in + // *is_shared. + // Sets raw_key_, value_ to the current parsed key and value. + // Sets restart_index_ to point to the restart interval that contains + // the current key. template inline bool ParseNextKey(bool* is_shared); + // protection_bytes_per_key, kv_checksum, and block_restart_interval + // are needed only for per kv checksum verification. void InitializeBase(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, - SequenceNumber global_seqno, bool block_contents_pinned) { + SequenceNumber global_seqno, bool block_contents_pinned, + bool user_defined_timestamp_persisted, + + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { assert(data_ == nullptr); // Ensure it is called only once assert(num_restarts > 0); // Ensure the param is valid @@ -407,13 +538,61 @@ class BlockIter : public InternalIteratorBase { current_ = restarts_; restart_index_ = num_restarts_; global_seqno_ = global_seqno; + if (raw_ucmp != nullptr) { + ts_sz_ = raw_ucmp->timestamp_size(); + } + pad_min_timestamp_ = ts_sz_ > 0 && !user_defined_timestamp_persisted; block_contents_pinned_ = block_contents_pinned; cache_handle_ = nullptr; + cur_entry_idx_ = -1; + protection_bytes_per_key_ = protection_bytes_per_key; + kv_checksum_ = kv_checksum; + block_restart_interval_ = block_restart_interval; + // Checksum related states are either all 0/nullptr or all non-zero. + // One exception is when num_restarts == 0, block_restart_interval can be 0 + // since we are not able to compute it. + assert((protection_bytes_per_key == 0 && kv_checksum == nullptr) || + (protection_bytes_per_key > 0 && kv_checksum != nullptr && + (block_restart_interval > 0 || num_restarts == 1))); + } + + void CorruptionError(const std::string& error_msg = "bad entry in block") { + current_ = restarts_; + restart_index_ = num_restarts_; + status_ = Status::Corruption(error_msg); + raw_key_.Clear(); + value_.clear(); + } + + void PerKVChecksumCorruptionError() { + std::string error_msg{ + "Corrupted block entry: per key-value checksum verification " + "failed."}; + error_msg.append(" Offset: " + std::to_string(current_) + "."); + error_msg.append(" Entry index: " + std::to_string(cur_entry_idx_) + "."); + CorruptionError(error_msg); + } + + void UpdateRawKeyAndMaybePadMinTimestamp(const Slice& key) { + if (pad_min_timestamp_) { + std::string buf; + if (raw_key_.IsUserKey()) { + AppendKeyWithMinTimestamp(&buf, key, ts_sz_); + } else { + PadInternalKeyWithMinTimestamp(&buf, key, ts_sz_); + } + raw_key_.SetKey(buf, true /* copy */); + } else { + raw_key_.SetKey(key, false /* copy */); + } } // Must be called every time a key is found that needs to be returned to user, // and may be called when no key is found (as a no-op). Updates `key_`, // `key_buf_`, and `key_pinned_` with info about the found key. + // Per key-value checksum verification is done if available for the key to be + // returned. Iterator is invalidated with corruption status if checksum + // verification fails. 
void UpdateKey() { key_buf_.Clear(); if (!Valid()) { @@ -432,6 +611,19 @@ class BlockIter : public InternalIteratorBase { key_ = key_buf_.GetInternalKey(); key_pinned_ = false; } + TEST_SYNC_POINT_CALLBACK("BlockIter::UpdateKey::value", + (void*)value_.data()); + TEST_SYNC_POINT_CALLBACK("Block::VerifyChecksum::checksum_len", + &protection_bytes_per_key_); + if (protection_bytes_per_key_ > 0) { + if (!ProtectionInfo64() + .ProtectKV(raw_key_.GetKey(), value_) + .Verify( + protection_bytes_per_key_, + kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_)) { + PerKVChecksumCorruptionError(); + } + } } // Returns the result of `Comparator::Compare()`, where the appropriate @@ -463,7 +655,7 @@ class BlockIter : public InternalIteratorBase { return static_cast((value_.data() + value_.size()) - data_); } - uint32_t GetRestartPoint(uint32_t index) { + uint32_t GetRestartPoint(uint32_t index) const { assert(index < num_restarts_); return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t)); } @@ -478,13 +670,20 @@ class BlockIter : public InternalIteratorBase { value_ = Slice(data_ + offset, 0); } - void CorruptionError(); - protected: template inline bool BinarySeek(const Slice& target, uint32_t* index, bool* is_index_key_result); + // Find the first key in restart interval `index` that is >= `target`. + // If there is no such key, iterator is positioned at the first key in + // restart interval `index + 1`. + // If is_index_key_result is true, it positions the iterator at the first key + // in this restart interval. + // Per key-value checksum verification is done for all keys scanned + // up to but not including the last key (the key that current_ points to + // when this function returns). This key's checksum is verified in + // UpdateKey(). void FindKeyAfterBinarySeek(const Slice& target, uint32_t index, bool is_index_key_result); }; @@ -493,22 +692,19 @@ class DataBlockIter final : public BlockIter { public: DataBlockIter() : BlockIter(), read_amp_bitmap_(nullptr), last_bitmap_offset_(0) {} - DataBlockIter(const Comparator* raw_ucmp, const char* data, uint32_t restarts, - uint32_t num_restarts, SequenceNumber global_seqno, - BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, - DataBlockHashIndex* data_block_hash_index) - : DataBlockIter() { - Initialize(raw_ucmp, data, restarts, num_restarts, global_seqno, - read_amp_bitmap, block_contents_pinned, data_block_hash_index); - } void Initialize(const Comparator* raw_ucmp, const char* data, uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockReadAmpBitmap* read_amp_bitmap, bool block_contents_pinned, - DataBlockHashIndex* data_block_hash_index) { + bool user_defined_timestamps_persisted, + DataBlockHashIndex* data_block_hash_index, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { InitializeBase(raw_ucmp, data, restarts, num_restarts, global_seqno, - block_contents_pinned); + block_contents_pinned, user_defined_timestamps_persisted, + protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(false); read_amp_bitmap_ = read_amp_bitmap; last_bitmap_offset_ = current_ + 1; @@ -526,7 +722,11 @@ class DataBlockIter final : public BlockIter { return value_; } + // Returns if `target` may exist. 
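Before SeekForGet() below, a simplified sketch of the per key-value checksum layout that the UpdateKey() verification above indexes into: a flat array holding protection_bytes_per_key_ bytes per entry, read at offset protection_bytes_per_key_ * cur_entry_idx_. The one-byte XOR here is a deliberate stand-in for RocksDB's ProtectionInfo machinery and is illustrative only.

#include <cassert>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for a per-entry protection value (1 byte per key here).
uint8_t ToyChecksum(const std::string& key, const std::string& value) {
  uint8_t c = 0;
  for (char ch : key) c ^= static_cast<uint8_t>(ch);
  for (char ch : value) c ^= static_cast<uint8_t>(ch);
  return c;
}

int main() {
  const size_t protection_bytes_per_key = 1;
  std::vector<std::pair<std::string, std::string>> entries = {
      {"apple", "1"}, {"banana", "2"}, {"cherry", "3"}};

  // Build the flat checksum array: one fixed-size slot per entry.
  std::vector<uint8_t> kv_checksum(entries.size() * protection_bytes_per_key);
  for (size_t i = 0; i < entries.size(); ++i) {
    kv_checksum[i * protection_bytes_per_key] =
        ToyChecksum(entries[i].first, entries[i].second);
  }

  // Verification for entry i reads the slot at i * protection_bytes_per_key,
  // mirroring kv_checksum_ + protection_bytes_per_key_ * cur_entry_idx_.
  size_t cur_entry_idx = 2;
  assert(kv_checksum[cur_entry_idx * protection_bytes_per_key] ==
         ToyChecksum(entries[cur_entry_idx].first,
                     entries[cur_entry_idx].second));
  return 0;
}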
inline bool SeekForGet(const Slice& target) { +#ifndef NDEBUG + if (TEST_Corrupt_Callback("DataBlockIter::SeekForGet")) return true; +#endif if (!data_block_hash_index_) { SeekImpl(target); UpdateKey(); @@ -598,11 +798,15 @@ class MetaBlockIter final : public BlockIter { public: MetaBlockIter() : BlockIter() { raw_key_.SetIsUserKey(true); } void Initialize(const char* data, uint32_t restarts, uint32_t num_restarts, - bool block_contents_pinned) { + bool block_contents_pinned, uint8_t protection_bytes_per_key, + const char* kv_checksum, uint32_t block_restart_interval) { // Initializes the iterator with a BytewiseComparator and // the raw key being a user key. InitializeBase(BytewiseComparator(), data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + kDisableGlobalSequenceNumber, block_contents_pinned, + /* user_defined_timestamps_persisted */ true, + protection_bytes_per_key, kv_checksum, + block_restart_interval); raw_key_.SetIsUserKey(true); } @@ -612,12 +816,17 @@ class MetaBlockIter final : public BlockIter { } protected: + friend Block; void SeekToFirstImpl() override; void SeekToLastImpl() override; void SeekImpl(const Slice& target) override; void SeekForPrevImpl(const Slice& target) override; void NextImpl() override; void PrevImpl() override; + // Meta index block's restart interval is always 1. See + // MetaIndexBuilder::MetaIndexBuilder() for hard-coded restart interval. + uint32_t GetRestartInterval() override { return 1; } + uint32_t NumberOfKeys(uint32_t) override { return num_restarts_; } }; class IndexBlockIter final : public BlockIter { @@ -632,9 +841,14 @@ class IndexBlockIter final : public BlockIter { uint32_t restarts, uint32_t num_restarts, SequenceNumber global_seqno, BlockPrefixIndex* prefix_index, bool have_first_key, bool key_includes_seq, - bool value_is_full, bool block_contents_pinned) { + bool value_is_full, bool block_contents_pinned, + bool user_defined_timestamps_persisted, + uint8_t protection_bytes_per_key, const char* kv_checksum, + uint32_t block_restart_interval) { InitializeBase(raw_ucmp, data, restarts, num_restarts, - kDisableGlobalSequenceNumber, block_contents_pinned); + kDisableGlobalSequenceNumber, block_contents_pinned, + user_defined_timestamps_persisted, protection_bytes_per_key, + kv_checksum, block_restart_interval); raw_key_.SetIsUserKey(!key_includes_seq); prefix_index_ = prefix_index; value_delta_encoded_ = !value_is_full; @@ -653,7 +867,8 @@ class IndexBlockIter final : public BlockIter { IndexValue value() const override { assert(Valid()); - if (value_delta_encoded_ || global_seqno_state_ != nullptr) { + if (value_delta_encoded_ || global_seqno_state_ != nullptr || + pad_min_timestamp_) { return decoded_value_; } else { IndexValue entry; @@ -665,11 +880,17 @@ class IndexBlockIter final : public BlockIter { } } + Slice raw_value() const { + assert(Valid()); + return value_; + } + bool IsValuePinned() const override { return global_seqno_state_ != nullptr ? false : BlockIter::IsValuePinned(); } protected: + friend Block; // IndexBlockIter follows a different contract for prefix iterator // from data iterators. 
// If prefix of the seek key `target` exists in the file, it must @@ -691,11 +912,8 @@ class IndexBlockIter final : public BlockIter { } void PrevImpl() override; - void NextImpl() override; - void SeekToFirstImpl() override; - void SeekToLastImpl() override; private: @@ -725,6 +943,10 @@ class IndexBlockIter final : public BlockIter { std::unique_ptr global_seqno_state_; + // Buffers the `first_internal_key` referred by `decoded_value_` when + // `pad_min_timestamp_` is true. + std::string first_internal_key_with_ts_; + // Set *prefix_may_exist to false if no key possibly share the same prefix // as `target`. If not set, the result position should be the same as total // order Seek. diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 81113c9c7a8f..e66c4939a09e 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -29,7 +29,7 @@ #include "db/dbformat.h" #include "index_builder.h" #include "logging/logging.h" -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "rocksdb/cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" @@ -70,7 +70,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( const ImmutableCFOptions& /*opt*/, const MutableCFOptions& mopt, const FilterBuildingContext& context, const bool use_delta_encoding_for_index_values, - PartitionedIndexBuilder* const p_index_builder) { + PartitionedIndexBuilder* const p_index_builder, size_t ts_sz, + const bool persist_user_defined_timestamps) { const BlockBasedTableOptions& table_opt = context.table_options; assert(table_opt.filter_policy); // precondition @@ -95,7 +96,8 @@ FilterBlockBuilder* CreateFilterBlockBuilder( return new PartitionedFilterBlockBuilder( mopt.prefix_extractor.get(), table_opt.whole_key_filtering, filter_bits_builder, table_opt.index_block_restart_interval, - use_delta_encoding_for_index_values, p_index_builder, partition_size); + use_delta_encoding_for_index_values, p_index_builder, partition_size, + ts_sz, persist_user_defined_timestamps); } else { return new FullFilterBlockBuilder(mopt.prefix_extractor.get(), table_opt.whole_key_filtering, @@ -104,9 +106,12 @@ FilterBlockBuilder* CreateFilterBlockBuilder( } } -bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size) { - // Check to see if compressed less than 12.5% - return compressed_size < uncomp_size - (uncomp_size / 8u); +bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size, + int max_compressed_bytes_per_kb) { + // For efficiency, avoid floating point and division + return compressed_size <= + (static_cast(max_compressed_bytes_per_kb) * uncomp_size) >> + 10; } } // namespace @@ -114,7 +119,7 @@ bool GoodCompressionRatio(size_t compressed_size, size_t uncomp_size) { // format_version is the block format as defined in include/rocksdb/table.h Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, CompressionType* type, uint32_t format_version, - bool do_sample, std::string* compressed_output, + bool allow_sample, std::string* compressed_output, std::string* sampled_output_fast, std::string* sampled_output_slow) { assert(type); @@ -126,15 +131,15 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, // The users can use these stats to decide if it is worthwhile // enabling compression and they also get a hint about which // compression algorithm wil be beneficial. 
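A worked restatement of the reworked GoodCompressionRatio() threshold above, before the sampling code that follows. This is a sketch of the arithmetic only, assuming max_compressed_bytes_per_kb = 896, the value that matches the old "save at least 1/8" rule (896/1024 = 87.5%).

#include <cassert>
#include <cstddef>
#include <cstdint>

// Illustrative restatement of the integer-only ratio check: a compressed
// block is accepted when it is no larger than
// uncomp_size * max_compressed_bytes_per_kb / 1024 bytes.
bool GoodRatio(size_t compressed_size, size_t uncomp_size,
               int max_compressed_bytes_per_kb) {
  return compressed_size <=
         ((static_cast<uint64_t>(max_compressed_bytes_per_kb) * uncomp_size) >>
          10);
}

int main() {
  // With a 4096-byte block and a limit of 896 bytes per KB, the cutoff is
  // (896 * 4096) >> 10 = 3584 bytes, i.e. the block must shrink to 87.5%
  // or less of its original size to be kept compressed.
  assert(GoodRatio(3584, 4096, 896));
  assert(!GoodRatio(3585, 4096, 896));
  // A limit of 0 or less disables compression in the caller; CompressBlock()
  // checks that before the ratio test is ever reached.
  return 0;
}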
- if (do_sample && info.SampleForCompression() && + if (allow_sample && info.SampleForCompression() && Random::GetTLSInstance()->OneIn( static_cast(info.SampleForCompression()))) { // Sampling with a fast compression algorithm if (sampled_output_fast && (LZ4_Supported() || Snappy_Supported())) { CompressionType c = LZ4_Supported() ? kLZ4Compression : kSnappyCompression; - CompressionContext context(c); CompressionOptions options; + CompressionContext context(c, options); CompressionInfo info_tmp(options, context, CompressionDict::GetEmptyDict(), c, info.SampleForCompression()); @@ -147,8 +152,8 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, // Sampling with a slow but high-compression algorithm if (sampled_output_slow && (ZSTD_Supported() || Zlib_Supported())) { CompressionType c = ZSTD_Supported() ? kZSTD : kZlibCompression; - CompressionContext context(c); CompressionOptions options; + CompressionContext context(c, options); CompressionInfo info_tmp(options, context, CompressionDict::GetEmptyDict(), c, info.SampleForCompression()); @@ -159,7 +164,8 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, } } - if (info.type() == kNoCompression) { + int max_compressed_bytes_per_kb = info.options().max_compressed_bytes_per_kb; + if (info.type() == kNoCompression || max_compressed_bytes_per_kb <= 0) { *type = kNoCompression; return uncompressed_data; } @@ -175,8 +181,8 @@ Slice CompressBlock(const Slice& uncompressed_data, const CompressionInfo& info, // Check the compression ratio; if it's not good enough, just fall back to // uncompressed - if (!GoodCompressionRatio(compressed_output->size(), - uncompressed_data.size())) { + if (!GoodCompressionRatio(compressed_output->size(), uncompressed_data.size(), + max_compressed_bytes_per_kb)) { *type = kNoCompression; return uncompressed_data; } @@ -257,9 +263,25 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector struct BlockBasedTableBuilder::Rep { const ImmutableOptions ioptions; - const MutableCFOptions moptions; + // BEGIN from MutableCFOptions + std::shared_ptr prefix_extractor; + // END from MutableCFOptions const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; + // Size in bytes for the user-defined timestamps. + size_t ts_sz; + // When `ts_sz` > 0 and this flag is false, the user-defined timestamp in the + // user key will be stripped when creating the block based table. This + // stripping happens for all user keys, including the keys in data block, + // index block for data block, index block for index block (if index type is + // `kTwoLevelIndexSearch`), index for filter blocks (if using partitioned + // filters), the `first_internal_key` in `IndexValue`, the `end_key` for range + // deletion entries. + // As long as the user keys are sorted when added via `Add` API, their logic + // ordering won't change after timestamps are stripped. However, for each user + // key to be logically equivalent before and after timestamp is stripped, the + // user key should contain the minimum timestamp. + bool persist_user_defined_timestamps; WritableFileWriter* file; std::atomic offset; size_t alignment; @@ -337,6 +359,13 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr pc_rep; BlockCreateContext create_context; + // The size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. 
+ uint64_t tail_size; + + // See class Footer + uint32_t base_context_checksum; + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } @@ -365,6 +394,12 @@ struct BlockBasedTableBuilder::Rep { // to false, and this is ensured by io_status_mutex, so no special memory // order for io_status_ok is required. if (io_status_ok.load(std::memory_order_relaxed)) { +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED // Avoid unnecessary lock acquisition + auto ios = CopyIOStatus(); + ios.PermitUncheckedError(); + // Assume no races in unit tests + assert(ios.ok()); +#endif // ROCKSDB_ASSERT_STATUS_CHECKED return IOStatus::OK(); } else { return CopyIOStatus(); @@ -405,9 +440,12 @@ struct BlockBasedTableBuilder::Rep { Rep(const BlockBasedTableOptions& table_opt, const TableBuilderOptions& tbo, WritableFileWriter* f) : ioptions(tbo.ioptions), - moptions(tbo.moptions), + prefix_extractor(tbo.moptions.prefix_extractor), table_options(table_opt), internal_comparator(tbo.internal_comparator), + ts_sz(tbo.internal_comparator.user_comparator()->timestamp_size()), + persist_user_defined_timestamps( + tbo.ioptions.persist_user_defined_timestamps), file(f), offset(0), alignment(table_options.block_align @@ -421,9 +459,15 @@ struct BlockBasedTableBuilder::Rep { ->CanKeysWithDifferentByteContentsBeEqual() ? BlockBasedTableOptions::kDataBlockBinarySearch : table_options.data_block_index_type, - table_options.data_block_hash_table_util_ratio), - range_del_block(1 /* block_restart_interval */), - internal_prefix_transform(tbo.moptions.prefix_extractor.get()), + table_options.data_block_hash_table_util_ratio, ts_sz, + persist_user_defined_timestamps), + range_del_block( + 1 /* block_restart_interval */, true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps), + internal_prefix_transform(prefix_extractor.get()), compression_type(tbo.compression_type), sample_for_compression(tbo.moptions.sample_for_compression), compressible_input_data_bytes(0), @@ -444,9 +488,15 @@ struct BlockBasedTableBuilder::Rep { flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( table_options, data_block)), - create_context(&table_options, ioptions.stats, + create_context(&table_options, &ioptions, ioptions.stats, compression_type == kZSTD || - compression_type == kZSTDNotFinalCompression), + compression_type == kZSTDNotFinalCompression, + tbo.moptions.block_protection_bytes_per_key, + tbo.internal_comparator.user_comparator(), + !use_delta_encoding_for_index_values, + table_opt.index_type == + BlockBasedTableOptions::kBinarySearchWithFirstKey), + tail_size(0), status_ok(true), io_status_ok(true) { if (tbo.target_file_size == 0) { @@ -475,20 +525,22 @@ struct BlockBasedTableBuilder::Rep { compression_dict_buffer_cache_res_mgr = nullptr; } + assert(compression_ctxs.size() >= compression_opts.parallel_threads); for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { - compression_ctxs[i].reset(new CompressionContext(compression_type)); + compression_ctxs[i].reset( + new CompressionContext(compression_type, compression_opts)); } if (table_options.index_type == BlockBasedTableOptions::kTwoLevelIndexSearch) { p_index_builder_ = PartitionedIndexBuilder::CreateIndexBuilder( &internal_comparator, use_delta_encoding_for_index_values, - table_options); + 
table_options, ts_sz, persist_user_defined_timestamps); index_builder.reset(p_index_builder_); } else { index_builder.reset(IndexBuilder::CreateIndexBuilder( table_options.index_type, &internal_comparator, &this->internal_prefix_transform, use_delta_encoding_for_index_values, - table_options)); + table_options, ts_sz, persist_user_defined_timestamps)); } if (ioptions.optimize_filters_for_hits && tbo.is_bottommost) { // Apply optimize_filters_for_hits setting here when applicable by @@ -518,8 +570,9 @@ struct BlockBasedTableBuilder::Rep { } filter_builder.reset(CreateFilterBlockBuilder( - ioptions, moptions, filter_context, - use_delta_encoding_for_index_values, p_index_builder_)); + ioptions, tbo.moptions, filter_context, + use_delta_encoding_for_index_values, p_index_builder_, ts_sz, + persist_user_defined_timestamps)); } assert(tbo.int_tbl_prop_collector_factories); @@ -533,12 +586,11 @@ struct BlockBasedTableBuilder::Rep { table_properties_collectors.emplace_back( new BlockBasedTablePropertiesCollector( table_options.index_type, table_options.whole_key_filtering, - moptions.prefix_extractor != nullptr)); - const Comparator* ucmp = tbo.internal_comparator.user_comparator(); - assert(ucmp); - if (ucmp->timestamp_size() > 0) { + prefix_extractor != nullptr)); + if (ts_sz > 0 && persist_user_defined_timestamps) { table_properties_collectors.emplace_back( - new TimestampTablePropertiesCollector(ucmp)); + new TimestampTablePropertiesCollector( + tbo.internal_comparator.user_comparator())); } if (table_options.verify_compression) { for (uint32_t i = 0; i < compression_opts.parallel_threads; i++) { @@ -558,6 +610,17 @@ struct BlockBasedTableBuilder::Rep { if (!ReifyDbHostIdProperty(ioptions.env, &props.db_host_id).ok()) { ROCKS_LOG_INFO(ioptions.logger, "db_host_id property will not be set"); } + + if (FormatVersionUsesContextChecksum(table_options.format_version)) { + // Must be non-zero and semi- or quasi-random + // TODO: ideally guaranteed different for related files (e.g. use file + // number and db_session, for benefit of SstFileWriter) + do { + base_context_checksum = Random::GetTLSInstance()->Next(); + } while (UNLIKELY(base_context_checksum == 0)); + } else { + base_context_checksum = 0; + } } Rep(const Rep&) = delete; @@ -896,7 +959,9 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( // behavior sanitized_table_options.format_version = 1; } - + auto ucmp = tbo.internal_comparator.user_comparator(); + assert(ucmp); + (void)ucmp; // avoids unused variable error. rep_ = new Rep(sanitized_table_options, tbo, file); TEST_SYNC_POINT_CALLBACK( @@ -980,9 +1045,8 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->pc_rep->curr_block_keys->PushBack(key); } else { if (r->filter_builder != nullptr) { - size_t ts_sz = - r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add( + ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); } } } @@ -1003,6 +1067,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->ioptions.logger); } else if (value_type == kTypeRangeDeletion) { + // TODO(yuzhangyu): handle range deletion entries for UDT in memtable only. 
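Before the range-deletion path continues below, a simplified stand-in for the timestamp stripping done by ExtractUserKeyAndStripTimestamp(key, r->ts_sz) in the Add() path above. Only the suffix arithmetic is shown under the usual internal-key layout (user key, then a ts_sz-byte timestamp, then the 8-byte sequence/type footer); the real helper lives in db/dbformat.h.

#include <cassert>
#include <cstddef>
#include <string>
#include <string_view>

// Sketch: given an internal key laid out as
//   user_key | timestamp (ts_sz bytes) | 8-byte footer (seqno + type),
// return just the user key, which is what the filter builder is fed.
std::string_view UserKeyWithoutTimestamp(std::string_view internal_key,
                                         size_t ts_sz) {
  constexpr size_t kFooterSize = 8;
  assert(internal_key.size() >= ts_sz + kFooterSize);
  return internal_key.substr(0, internal_key.size() - ts_sz - kFooterSize);
}

int main() {
  // "key1" + an 8-byte timestamp + the 8-byte footer.
  std::string ikey =
      std::string("key1") + std::string(8, '\1') + std::string(8, '\2');
  assert(UserKeyWithoutTimestamp(ikey, /*ts_sz=*/8) == "key1");
  return 0;
}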
r->range_del_block.Add(key, value); // TODO offset passed in is not accurate for parallel compression case NotifyCollectTableCollectorsOnAdd(key, value, r->get_offset(), @@ -1014,6 +1079,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->props.num_entries++; r->props.raw_key_size += key.size(); + if (!r->persist_user_defined_timestamps) { + r->props.raw_key_size -= r->ts_sz; + } r->props.raw_value_size += value.size(); if (value_type == kTypeDeletion || value_type == kTypeSingleDeletion || value_type == kTypeDeletionWithTimestamp) { @@ -1079,6 +1147,9 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& uncompressed_block_data, return; } + TEST_SYNC_POINT_CALLBACK( + "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData", + &r->compressed_output); WriteMaybeCompressedBlock(block_contents, type, handle, block_type, &uncompressed_block_data); r->compressed_output.clear(); @@ -1108,25 +1179,17 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( const CompressionContext& compression_ctx, UncompressionContext* verify_ctx, std::string* compressed_output, Slice* block_contents, CompressionType* type, Status* out_status) { - // File format contains a sequence of blocks where each block has: - // block_data: uint8[n] - // type: uint8 - // crc: uint32 Rep* r = rep_; bool is_status_ok = ok(); if (!r->IsParallelCompressionEnabled()) { assert(is_status_ok); } - *type = r->compression_type; - uint64_t sample_for_compression = r->sample_for_compression; - bool abort_compression = false; - - StopWatchNano timer( - r->ioptions.clock, - ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); - if (is_status_ok && uncompressed_block_data.size() < kCompressionSizeLimit) { + StopWatchNano timer( + r->ioptions.clock, + ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)); + if (is_data_block) { r->compressible_input_data_bytes.fetch_add(uncompressed_block_data.size(), std::memory_order_relaxed); @@ -1139,14 +1202,14 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( } assert(compression_dict != nullptr); CompressionInfo compression_info(r->compression_opts, compression_ctx, - *compression_dict, *type, - sample_for_compression); + *compression_dict, r->compression_type, + r->sample_for_compression); std::string sampled_output_fast; std::string sampled_output_slow; *block_contents = CompressBlock( uncompressed_block_data, compression_info, type, - r->table_options.format_version, is_data_block /* do_sample */, + r->table_options.format_version, is_data_block /* allow_sample */, compressed_output, &sampled_output_fast, &sampled_output_slow); if (sampled_output_slow.size() > 0 || sampled_output_fast.size() > 0) { @@ -1179,35 +1242,38 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( BlockContents contents; UncompressionInfo uncompression_info(*verify_ctx, *verify_dict, r->compression_type); - Status stat = UncompressBlockData( + Status uncompress_status = UncompressBlockData( uncompression_info, block_contents->data(), block_contents->size(), &contents, r->table_options.format_version, r->ioptions); - if (stat.ok()) { - bool compressed_ok = - contents.data.compare(uncompressed_block_data) == 0; - if (!compressed_ok) { + if (uncompress_status.ok()) { + bool data_match = contents.data.compare(uncompressed_block_data) == 0; + if (!data_match) { // The result of the compression was invalid. abort. 
- abort_compression = true; const char* const msg = "Decompressed block did not match pre-compression block"; ROCKS_LOG_ERROR(r->ioptions.logger, "%s", msg); *out_status = Status::Corruption(msg); + *type = kNoCompression; } } else { // Decompression reported an error. abort. *out_status = Status::Corruption(std::string("Could not decompress: ") + - stat.getState()); - abort_compression = true; + uncompress_status.getState()); + *type = kNoCompression; } } + if (timer.IsStarted()) { + RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, + timer.ElapsedNanos()); + } } else { - // Block is too big to be compressed. + // Status is not OK, or block is too big to be compressed. if (is_data_block) { r->uncompressible_input_data_bytes.fetch_add( uncompressed_block_data.size(), std::memory_order_relaxed); } - abort_compression = true; + *type = kNoCompression; } if (is_data_block) { r->uncompressible_input_data_bytes.fetch_add(kBlockTrailerSize, @@ -1216,37 +1282,44 @@ void BlockBasedTableBuilder::CompressAndVerifyBlock( // Abort compression if the block is too big, or did not pass // verification. - if (abort_compression) { - RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); - *type = kNoCompression; + if (*type == kNoCompression) { *block_contents = uncompressed_block_data; - } else if (*type != kNoCompression) { - if (ShouldReportDetailedTime(r->ioptions.env, r->ioptions.stats)) { - RecordTimeToHistogram(r->ioptions.stats, COMPRESSION_TIMES_NANOS, - timer.ElapsedNanos()); - } - RecordInHistogram(r->ioptions.stats, BYTES_COMPRESSED, - uncompressed_block_data.size()); + bool compression_attempted = !compressed_output->empty(); + RecordTick(r->ioptions.stats, compression_attempted + ? NUMBER_BLOCK_COMPRESSION_REJECTED + : NUMBER_BLOCK_COMPRESSION_BYPASSED); + RecordTick(r->ioptions.stats, + compression_attempted ? 
BYTES_COMPRESSION_REJECTED + : BYTES_COMPRESSION_BYPASSED, + uncompressed_block_data.size()); + } else { RecordTick(r->ioptions.stats, NUMBER_BLOCK_COMPRESSED); - } else if (*type != r->compression_type) { - RecordTick(r->ioptions.stats, NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.stats, BYTES_COMPRESSED_FROM, + uncompressed_block_data.size()); + RecordTick(r->ioptions.stats, BYTES_COMPRESSED_TO, + compressed_output->size()); } } void BlockBasedTableBuilder::WriteMaybeCompressedBlock( - const Slice& block_contents, CompressionType type, BlockHandle* handle, + const Slice& block_contents, CompressionType comp_type, BlockHandle* handle, BlockType block_type, const Slice* uncompressed_block_data) { + // File format contains a sequence of blocks where each block has: + // block_data: uint8[n] + // compression_type: uint8 + // checksum: uint32 Rep* r = rep_; bool is_data_block = block_type == BlockType::kData; // Old, misleading name of this function: WriteRawBlock StopWatch sw(r->ioptions.clock, r->ioptions.stats, WRITE_RAW_BLOCK_MICROS); - handle->set_offset(r->get_offset()); + const uint64_t offset = r->get_offset(); + handle->set_offset(offset); handle->set_size(block_contents.size()); assert(status().ok()); assert(io_status().ok()); if (uncompressed_block_data == nullptr) { uncompressed_block_data = &block_contents; - assert(type == kNoCompression); + assert(comp_type == kNoCompression); } { @@ -1258,10 +1331,11 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( } std::array trailer; - trailer[0] = type; + trailer[0] = comp_type; uint32_t checksum = ComputeBuiltinChecksumWithLastByte( r->table_options.checksum, block_contents.data(), block_contents.size(), - /*last_byte*/ type); + /*last_byte*/ comp_type); + checksum += ChecksumModifierForContext(r->base_context_checksum, offset); if (block_type == BlockType::kFilter) { Status s = r->filter_builder->MaybePostVerifyFilter(block_contents); @@ -1284,7 +1358,6 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( } { - Status s = Status::OK(); bool warm_cache; switch (r->table_options.prepopulate_block_cache) { case BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly: @@ -1299,18 +1372,13 @@ void BlockBasedTableBuilder::WriteMaybeCompressedBlock( warm_cache = false; } if (warm_cache) { - s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, - block_type); + Status s = InsertBlockInCacheHelper(*uncompressed_block_data, handle, + block_type); if (!s.ok()) { r->SetStatus(s); return; } } - s = InsertBlockInCompressedCache(block_contents, type, handle); - if (!s.ok()) { - r->SetStatus(s); - return; - } } r->set_offset(r->get_offset() + block_contents.size() + kBlockTrailerSize); @@ -1358,9 +1426,7 @@ void BlockBasedTableBuilder::BGWorkWriteMaybeCompressedBlock() { for (size_t i = 0; i < block_rep->keys->Size(); i++) { auto& key = (*block_rep->keys)[i]; if (r->filter_builder != nullptr) { - size_t ts_sz = - r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); } r->index_builder->OnKeyAdded(key); } @@ -1422,47 +1488,6 @@ IOStatus BlockBasedTableBuilder::io_status() const { return rep_->GetIOStatus(); } -// -// Make a copy of the block contents and insert into compressed block cache -// -Status BlockBasedTableBuilder::InsertBlockInCompressedCache( - const Slice& block_contents, const CompressionType type, - const BlockHandle* handle) { - Rep* r = rep_; - 
CompressedBlockCacheInterface block_cache_compressed{ - r->table_options.block_cache_compressed.get()}; - Status s; - if (type != kNoCompression && block_cache_compressed) { - size_t size = block_contents.size(); - - auto ubuf = AllocateBlock(size + 1, - block_cache_compressed.get()->memory_allocator()); - memcpy(ubuf.get(), block_contents.data(), size); - ubuf[size] = type; - - BlockContents* block_contents_to_cache = - new BlockContents(std::move(ubuf), size); -#ifndef NDEBUG - block_contents_to_cache->has_trailer = true; -#endif // NDEBUG - - CacheKey key = BlockBasedTable::GetCacheKey(rep_->base_cache_key, *handle); - - s = block_cache_compressed.Insert( - key.AsSlice(), block_contents_to_cache, - block_contents_to_cache->ApproximateMemoryUsage()); - if (s.ok()) { - RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD); - } else { - RecordTick(rep_->ioptions.stats, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - } - // Invalidate OS cache. - r->file->InvalidateCache(static_cast(r->get_offset()), size) - .PermitUncheckedError(); - } - return s; -} - Status BlockBasedTableBuilder::InsertBlockInCacheHelper( const Slice& block_contents, const BlockHandle* handle, BlockType block_type) { @@ -1604,6 +1629,11 @@ void BlockBasedTableBuilder::WriteIndexBlock( // The last index_block_handle will be for the partition index block } } + // If success and need to record in metaindex rather than footer... + if (!FormatVersionUsesIndexHandleInFooter( + rep_->table_options.format_version)) { + meta_index_builder->Add(kIndexBlockName, *index_block_handle); + } } void BlockBasedTableBuilder::WritePropertiesBlock( @@ -1629,9 +1659,7 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->props.compression_options = CompressionOptionsToString(rep_->compression_opts); rep_->props.prefix_extractor_name = - rep_->moptions.prefix_extractor != nullptr - ? rep_->moptions.prefix_extractor->AsString() - : "nullptr"; + rep_->prefix_extractor ? 
rep_->prefix_extractor->AsString() : "nullptr"; std::string property_collectors_names = "["; for (size_t i = 0; i < rep_->ioptions.table_properties_collector_factories.size(); ++i) { @@ -1675,14 +1703,17 @@ void BlockBasedTableBuilder::WritePropertiesBlock( rep_->compressible_input_data_bytes + rep_->uncompressible_input_data_bytes; } + rep_->props.user_defined_timestamps_persisted = + rep_->persist_user_defined_timestamps; // Add basic properties property_block_builder.AddTableProperty(rep_->props); // Add use collected properties - NotifyCollectTableCollectorsOnFinish(rep_->table_properties_collectors, - rep_->ioptions.logger, - &property_block_builder); + NotifyCollectTableCollectorsOnFinish( + rep_->table_properties_collectors, rep_->ioptions.logger, + &property_block_builder, rep_->props.user_collected_properties, + rep_->props.readable_properties); Slice block_data = property_block_builder.Finish(); TEST_SYNC_POINT_CALLBACK( @@ -1748,16 +1779,20 @@ void BlockBasedTableBuilder::WriteRangeDelBlock( void BlockBasedTableBuilder::WriteFooter(BlockHandle& metaindex_block_handle, BlockHandle& index_block_handle) { + assert(ok()); Rep* r = rep_; // this is guaranteed by BlockBasedTableBuilder's constructor assert(r->table_options.checksum == kCRC32c || r->table_options.format_version != 0); - assert(ok()); - FooterBuilder footer; - footer.Build(kBlockBasedTableMagicNumber, r->table_options.format_version, - r->get_offset(), r->table_options.checksum, - metaindex_block_handle, index_block_handle); + Status s = footer.Build(kBlockBasedTableMagicNumber, + r->table_options.format_version, r->get_offset(), + r->table_options.checksum, metaindex_block_handle, + index_block_handle, r->base_context_checksum); + if (!s.ok()) { + r->SetStatus(s); + return; + } IOStatus ios = r->file->Append(footer.GetSlice()); if (ios.ok()) { r->set_offset(r->get_offset() + footer.GetSlice().size()); @@ -1843,7 +1878,9 @@ void BlockBasedTableBuilder::EnterUnbuffered() { Block reader{BlockContents{data_block}}; DataBlockIter* iter = reader.NewDataIterator( - r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber); + r->internal_comparator.user_comparator(), kDisableGlobalSequenceNumber, + nullptr /* iter */, nullptr /* stats */, + false /* block_contents_pinned */, r->persist_user_defined_timestamps); iter->SeekToFirst(); assert(iter->Valid()); @@ -1889,9 +1926,8 @@ void BlockBasedTableBuilder::EnterUnbuffered() { for (; iter->Valid(); iter->Next()) { Slice key = iter->key(); if (r->filter_builder != nullptr) { - size_t ts_sz = - r->internal_comparator.user_comparator()->timestamp_size(); - r->filter_builder->Add(ExtractUserKeyAndStripTimestamp(key, ts_sz)); + r->filter_builder->Add( + ExtractUserKeyAndStripTimestamp(key, r->ts_sz)); } r->index_builder->OnKeyAdded(key); } @@ -1945,6 +1981,8 @@ Status BlockBasedTableBuilder::Finish() { } } + r->props.tail_start_offset = r->offset; + // Write meta blocks, metaindex block and footer in the following order. // 1. [meta block: filter] // 2. 
[meta block: index] @@ -1969,9 +2007,14 @@ Status BlockBasedTableBuilder::Finish() { WriteFooter(metaindex_block_handle, index_block_handle); } r->state = Rep::State::kClosed; - r->SetStatus(r->CopyIOStatus()); + r->tail_size = r->offset - r->props.tail_start_offset; + Status ret_status = r->CopyStatus(); - assert(!ret_status.ok() || io_status().ok()); + IOStatus ios = r->GetIOStatus(); + if (!ios.ok() && ret_status.ok()) { + // Let io_status supersede ok status (otherwise status takes precedennce) + ret_status = ios; + } return ret_status; } @@ -1981,8 +2024,10 @@ void BlockBasedTableBuilder::Abandon() { StopParallelCompression(); } rep_->state = Rep::State::kClosed; +#ifdef ROCKSDB_ASSERT_STATUS_CHECKED // Avoid unnecessary lock acquisition rep_->CopyStatus().PermitUncheckedError(); rep_->CopyIOStatus().PermitUncheckedError(); +#endif // ROCKSDB_ASSERT_STATUS_CHECKED } uint64_t BlockBasedTableBuilder::NumEntries() const { @@ -2005,6 +2050,8 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { } } +uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; } + bool BlockBasedTableBuilder::NeedCompact() const { for (const auto& collector : rep_->table_properties_collectors) { if (collector->NeedCompact()) { @@ -2015,14 +2062,7 @@ bool BlockBasedTableBuilder::NeedCompact() const { } TableProperties BlockBasedTableBuilder::GetTableProperties() const { - TableProperties ret = rep_->props; - for (const auto& collector : rep_->table_properties_collectors) { - for (const auto& prop : collector->GetReadableProperties()) { - ret.readable_properties.insert(prop); - } - collector->Finish(&ret.user_collected_properties).PermitUncheckedError(); - } - return ret; + return rep_->props; } std::string BlockBasedTableBuilder::GetFileChecksum() const { diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 7cf33953a12f..3949474c5809 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -53,7 +53,9 @@ class BlockBasedTableBuilder : public TableBuilder { ~BlockBasedTableBuilder(); // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Unless key has type kTypeRangeDeletion, key is after any + // previously added non-kTypeRangeDeletion key according to + // comparator. // REQUIRES: Finish(), Abandon() have not been called void Add(const Slice& key, const Slice& value) override; @@ -89,6 +91,10 @@ class BlockBasedTableBuilder : public TableBuilder { // is enabled. uint64_t EstimatedFileSize() const override; + // Get the size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. + uint64_t GetTailSize() const override; + bool NeedCompact() const override; // Get table properties diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index b3f76a731a95..25299ecab44e 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -83,7 +83,7 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { // // and we use every of the value as a candidate, and estimate how much we // wasted, compared to read. For example, when we use the 3rd record - // as candiate. This area is what we read: + // as candidate. 
This area is what we read: // +---+ // +---+ | | // | | | | @@ -123,7 +123,7 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { // +---+ +---+ +---+ +---+ +---+ // // Which can be calculated iteratively. - // The difference between wasted using 4st and 3rd record, will + // The difference between wasted using 4th and 3rd record, will // be following area: // +---+ // +--+ +-+ ++ +-+ +-+ +---+ | | @@ -143,8 +143,8 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { // | | | | | | | | | | // +---+ +---+ +---+ +---+ +---+ // - // which will be the size difference between 4st and 3rd record, - // times 3, which is number of records before the 4st. + // which will be the size difference between 4th and 3rd record, + // times 3, which is number of records before the 4th. // Here we assume that all data within the prefetch range will be useful. In // reality, it may not be the case when a partial block is inside the range, // or there are data in the middle that is not read. We ignore those cases @@ -165,7 +165,6 @@ size_t TailPrefetchStats::GetSuggestedPrefetchSize() { return std::min(kMaxPrefetchSize, max_qualified_size); } -#ifndef ROCKSDB_LITE const std::string kOptNameMetadataCacheOpts = "metadata_cache_options"; @@ -226,14 +225,10 @@ static std::unordered_map block_based_table_type_info = { -#ifndef ROCKSDB_LITE /* currently not supported std::shared_ptr block_cache = nullptr; - std::shared_ptr block_cache_compressed = nullptr; CacheUsageOptions cache_usage_options; */ {"flush_block_policy_factory", @@ -314,6 +309,9 @@ static std::unordered_map {offsetof(struct BlockBasedTableOptions, optimize_filters_for_memory), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + // TODO "use_delta_encoding" has not been persisted - + // this may have been an omission, but changing this now might be a + // breaker {"filter_policy", OptionTypeInfo::AsCustomSharedPtr( offsetof(struct BlockBasedTableOptions, filter_policy), @@ -394,15 +392,8 @@ static std::unordered_map return Cache::CreateFromString(opts, value, cache); }}}, {"block_cache_compressed", - {offsetof(struct BlockBasedTableOptions, block_cache_compressed), - OptionType::kUnknown, OptionVerificationType::kNormal, - (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), - // Parses the input value as a Cache - [](const ConfigOptions& opts, const std::string&, - const std::string& value, void* addr) { - auto* cache = static_cast*>(addr); - return Cache::CreateFromString(opts, value, cache); - }}}, + {0, OptionType::kUnknown, OptionVerificationType::kDeprecated, + OptionTypeFlags::kNone}}, {"max_auto_readahead_size", {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size), OptionType::kSizeT, OptionVerificationType::kNormal, @@ -422,7 +413,6 @@ static std::unordered_map OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kMutable}}, -#endif // ROCKSDB_LITE }; // TODO(myabandeh): We should return an error instead of silently changing the @@ -455,11 +445,8 @@ void BlockBasedTableFactory::InitializeOptions() { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { LRUCacheOptions co; - co.capacity = 8 << 20; - // It makes little sense to pay overhead for mid-point insertion while the - // block size is only 8MB. 
- co.high_pri_pool_ratio = 0.0; - co.low_pri_pool_ratio = 0.0; + // 32MB, the recommended minimum size for 64 shards, to reduce contention + co.capacity = 32 << 20; table_options_.block_cache = NewLRUCache(co); } if (table_options_.block_size_deviation < 0 || @@ -509,20 +496,12 @@ namespace { // they must not share an underlying key space with each other. Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { int cache_count = (bbto.block_cache != nullptr) + - (bbto.block_cache_compressed != nullptr) + (bbto.persistent_cache != nullptr); if (cache_count <= 1) { // Nothing to share / overlap return Status::OK(); } - // Simple pointer equality - if (bbto.block_cache == bbto.block_cache_compressed) { - return Status::InvalidArgument( - "block_cache same as block_cache_compressed not currently supported, " - "and would be bad for performance anyway"); - } - // More complex test of shared key space, in case the instances are wrappers // for some shared underlying cache. static Cache::CacheItemHelper kHelper{CacheEntryRole::kMisc}; @@ -532,19 +511,12 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { char c; }; static SentinelValue kRegularBlockCacheMarker{'b'}; - static SentinelValue kCompressedBlockCacheMarker{'c'}; static char kPersistentCacheMarker{'p'}; if (bbto.block_cache) { bbto.block_cache ->Insert(sentinel_key.AsSlice(), &kRegularBlockCacheMarker, &kHelper, 1) .PermitUncheckedError(); } - if (bbto.block_cache_compressed) { - bbto.block_cache_compressed - ->Insert(sentinel_key.AsSlice(), &kCompressedBlockCacheMarker, &kHelper, - 1) - .PermitUncheckedError(); - } if (bbto.persistent_cache) { // Note: persistent cache copies the data, not keeping the pointer bbto.persistent_cache @@ -559,11 +531,7 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { auto v = static_cast(bbto.block_cache->Value(handle)); char c = v->c; bbto.block_cache->Release(handle); - if (v == &kCompressedBlockCacheMarker) { - return Status::InvalidArgument( - "block_cache and block_cache_compressed share the same key space, " - "which is not supported"); - } else if (c == kPersistentCacheMarker) { + if (c == kPersistentCacheMarker) { return Status::InvalidArgument( "block_cache and persistent_cache share the same key space, " "which is not supported"); @@ -572,28 +540,7 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { } } } - if (bbto.block_cache_compressed) { - auto handle = bbto.block_cache_compressed->Lookup(sentinel_key.AsSlice()); - if (handle) { - auto v = static_cast( - bbto.block_cache_compressed->Value(handle)); - char c = v->c; - bbto.block_cache_compressed->Release(handle); - if (v == &kRegularBlockCacheMarker) { - return Status::InvalidArgument( - "block_cache_compressed and block_cache share the same key space, " - "which is not supported"); - } else if (c == kPersistentCacheMarker) { - return Status::InvalidArgument( - "block_cache_compressed and persistent_cache share the same key " - "space, " - "which is not supported"); - } else if (v != &kCompressedBlockCacheMarker) { - return Status::Corruption( - "Unexpected mutation to block_cache_compressed"); - } - } - } + if (bbto.persistent_cache) { std::unique_ptr data; size_t size = 0; @@ -604,11 +551,6 @@ Status CheckCacheOptionCompatibility(const BlockBasedTableOptions& bbto) { return Status::InvalidArgument( "persistent_cache and block_cache share the same key space, " "which is not supported"); - } else if (data[0] == kCompressedBlockCacheMarker.c) { - 
return Status::InvalidArgument( - "persistent_cache and block_cache_compressed share the same key " - "space, " - "which is not supported"); } else if (data[0] != kPersistentCacheMarker) { return Status::Corruption("Unexpected mutation to persistent_cache"); } @@ -627,7 +569,8 @@ Status BlockBasedTableFactory::NewTableReader( return BlockBasedTable::Open( ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), - file_size, table_reader, table_reader_cache_res_mgr_, + file_size, table_reader_options.block_protection_bytes_per_key, + table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, @@ -635,7 +578,8 @@ Status BlockBasedTableFactory::NewTableReader( table_reader_options.block_cache_tracer, table_reader_options.max_file_size_for_l0_meta_pin, table_reader_options.cur_db_session_id, table_reader_options.cur_file_num, - table_reader_options.unique_id); + table_reader_options.unique_id, + table_reader_options.user_defined_timestamps_persisted); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( @@ -828,20 +772,6 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { ret.append(" block_cache_options:\n"); ret.append(table_options_.block_cache->GetPrintableOptions()); } - snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", - static_cast(table_options_.block_cache_compressed.get())); - ret.append(buffer); - if (table_options_.block_cache_compressed) { - const char* block_cache_compressed_name = - table_options_.block_cache_compressed->Name(); - if (block_cache_compressed_name != nullptr) { - snprintf(buffer, kBufferSize, " block_cache_name: %s\n", - block_cache_compressed_name); - ret.append(buffer); - } - ret.append(" block_cache_compressed_options:\n"); - ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); - } snprintf(buffer, kBufferSize, " persistent_cache: %p\n", static_cast(table_options_.persistent_cache.get())); ret.append(buffer); @@ -925,7 +855,6 @@ const void* BlockBasedTableFactory::GetOptionsPtr( } } -#ifndef ROCKSDB_LITE // Take a default BlockBasedTableOptions "table_options" in addition to a // map "opts_map" of option name to option value to construct the new // BlockBasedTableOptions "new_table_options". 
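The next hunk drops the overloads that constructed a ConfigOptions internally, so callers now supply one explicitly. A minimal usage sketch of the surviving overload follows; the option values are arbitrary examples and the header locations are assumed to be the usual public ones.

#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/table.h"

int main() {
  using namespace ROCKSDB_NAMESPACE;
  BlockBasedTableOptions base;
  BlockBasedTableOptions patched;
  ConfigOptions config_options;
  config_options.ignore_unknown_options = false;
  // Option string syntax is "name=value;name=value"; both names below are
  // existing BlockBasedTableOptions fields.
  Status s = GetBlockBasedTableOptionsFromString(
      config_options, base,
      "block_size=16384;cache_index_and_filter_blocks=true", &patched);
  assert(s.ok());
  assert(patched.block_size == 16384);
  assert(patched.cache_index_and_filter_blocks);
  return 0;
}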
@@ -981,18 +910,6 @@ Status BlockBasedTableFactory::ParseOption(const ConfigOptions& config_options, return status; } -Status GetBlockBasedTableOptionsFromString( - const BlockBasedTableOptions& table_options, const std::string& opts_str, - BlockBasedTableOptions* new_table_options) { - ConfigOptions config_options; - config_options.input_strings_escaped = false; - config_options.ignore_unknown_options = false; - config_options.invoke_prepare_options = false; - config_options.ignore_unsupported_options = false; - - return GetBlockBasedTableOptionsFromString(config_options, table_options, - opts_str, new_table_options); -} Status GetBlockBasedTableOptionsFromString( const ConfigOptions& config_options, const BlockBasedTableOptions& table_options, const std::string& opts_str, @@ -1012,20 +929,6 @@ Status GetBlockBasedTableOptionsFromString( } } -Status GetBlockBasedTableOptionsFromMap( - const BlockBasedTableOptions& table_options, - const std::unordered_map& opts_map, - BlockBasedTableOptions* new_table_options, bool input_strings_escaped, - bool ignore_unknown_options) { - ConfigOptions config_options; - config_options.input_strings_escaped = input_strings_escaped; - config_options.ignore_unknown_options = ignore_unknown_options; - config_options.invoke_prepare_options = false; - - return GetBlockBasedTableOptionsFromMap(config_options, table_options, - opts_map, new_table_options); -} - Status GetBlockBasedTableOptionsFromMap( const ConfigOptions& config_options, const BlockBasedTableOptions& table_options, @@ -1041,7 +944,6 @@ Status GetBlockBasedTableOptionsFromMap( } return s; } -#endif // !ROCKSDB_LITE TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& _table_options) { diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index 3166cd3cc90c..1f787697772e 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -28,6 +28,9 @@ class BlockBasedTableBuilder; class RandomAccessFileReader; class WritableFileWriter; +// TODO: deprecate this class as it can be replaced with +// `FileMetaData::tail_size` +// // A class used to track actual bytes written from the tail in the recent SST // file opens, and provide a suggestion for following open. 
class TailPrefetchStats { @@ -80,12 +83,10 @@ class BlockBasedTableFactory : public TableFactory { protected: const void* GetOptionsPtr(const std::string& name) const override; -#ifndef ROCKSDB_LITE Status ParseOption(const ConfigOptions& config_options, const OptionTypeInfo& opt_info, const std::string& opt_name, const std::string& opt_value, void* opt_ptr) override; -#endif void InitializeOptions(); private: diff --git a/table/block_based/block_based_table_iterator.cc b/table/block_based/block_based_table_iterator.cc index d2605670fc3e..8107e58f24bd 100644 --- a/table/block_based/block_based_table_iterator.cc +++ b/table/block_based/block_based_table_iterator.cc @@ -18,21 +18,48 @@ void BlockBasedTableIterator::Seek(const Slice& target) { void BlockBasedTableIterator::SeekImpl(const Slice* target, bool async_prefetch) { - bool is_first_pass = true; + ResetBlockCacheLookupVar(); + bool is_first_pass = !async_read_in_progress_; + bool autotune_readaheadsize = is_first_pass && + read_options_.auto_readahead_size && + read_options_.iterate_upper_bound; + + if (autotune_readaheadsize && + table_->get_rep()->table_options.block_cache.get() && + !read_options_.async_io && direction_ == IterDirection::kForward) { + readahead_cache_lookup_ = true; + } + + // Second pass. if (async_read_in_progress_) { AsyncInitDataBlock(false); - is_first_pass = false; } is_out_of_bound_ = false; is_at_first_key_from_index_ = false; - if (target && !CheckPrefixMayMatch(*target, IterDirection::kForward)) { + seek_stat_state_ = kNone; + bool filter_checked = false; + if (target && + !CheckPrefixMayMatch(*target, IterDirection::kForward, &filter_checked)) { ResetDataIter(); + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_FILTERED + : NON_LAST_LEVEL_SEEK_FILTERED); return; } + if (filter_checked) { + seek_stat_state_ = kFilterUsed; + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_FILTER_MATCH + : NON_LAST_LEVEL_SEEK_FILTER_MATCH); + } bool need_seek_index = true; - if (block_iter_points_to_real_block_ && block_iter_.Valid()) { + + // In case of readahead_cache_lookup_, index_iter_ could change to find the + // readahead size in BlockCacheLookupForReadAheadSize so it needs to reseek. + if (IsIndexAtCurr() && block_iter_points_to_real_block_ && + block_iter_.Valid()) { // Reseek. prev_block_offset_ = index_iter_->value().handle.offset(); @@ -60,13 +87,31 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, } else { index_iter_->SeekToFirst(); } + is_index_at_curr_block_ = true; + if (!index_iter_->Valid()) { + ResetDataIter(); + return; + } + } + if (autotune_readaheadsize) { + FindReadAheadSizeUpperBound(); + if (target) { + index_iter_->Seek(*target); + } else { + index_iter_->SeekToFirst(); + } + + // Check for IO error. if (!index_iter_->Valid()) { ResetDataIter(); return; } } + // After reseek, index_iter_ point to the right key i.e. target in + // case of readahead_cache_lookup_. So index_iter_ can be used directly. 
+ IndexValue v = index_iter_->value(); const bool same_block = block_iter_points_to_real_block_ && v.handle.offset() == prev_block_offset_; @@ -123,14 +168,27 @@ void BlockBasedTableIterator::SeekImpl(const Slice* target, } void BlockBasedTableIterator::SeekForPrev(const Slice& target) { + direction_ = IterDirection::kBackward; + ResetBlockCacheLookupVar(); is_out_of_bound_ = false; is_at_first_key_from_index_ = false; + seek_stat_state_ = kNone; + bool filter_checked = false; // For now totally disable prefix seek in auto prefix mode because we don't // have logic - if (!CheckPrefixMayMatch(target, IterDirection::kBackward)) { + if (!CheckPrefixMayMatch(target, IterDirection::kBackward, &filter_checked)) { ResetDataIter(); + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_FILTERED + : NON_LAST_LEVEL_SEEK_FILTERED); return; } + if (filter_checked) { + seek_stat_state_ = kFilterUsed; + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_FILTER_MATCH + : NON_LAST_LEVEL_SEEK_FILTER_MATCH); + } SavePrevIndexValue(); @@ -148,6 +206,7 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) { // to distinguish the two unless we read the second block. In this case, we'll // end up with reading two blocks. index_iter_->Seek(target); + is_index_at_curr_block_ = true; if (!index_iter_->Valid()) { auto seek_status = index_iter_->status(); @@ -183,14 +242,22 @@ void BlockBasedTableIterator::SeekForPrev(const Slice& target) { } void BlockBasedTableIterator::SeekToLast() { + direction_ = IterDirection::kBackward; + ResetBlockCacheLookupVar(); is_out_of_bound_ = false; is_at_first_key_from_index_ = false; + seek_stat_state_ = kNone; + SavePrevIndexValue(); + index_iter_->SeekToLast(); + is_index_at_curr_block_ = true; + if (!index_iter_->Valid()) { ResetDataIter(); return; } + InitDataBlock(); block_iter_.SeekToLast(); FindKeyBackward(); @@ -219,6 +286,14 @@ bool BlockBasedTableIterator::NextAndGetResult(IterateResult* result) { } void BlockBasedTableIterator::Prev() { + // Return Error. + if (readahead_cache_lookup_) { + block_iter_.Invalidate(Status::NotSupported( + "auto tuning of readahead_size is not supported with Prev operation.")); + return; + } + + ResetBlockCacheLookupVar(); if (is_at_first_key_from_index_) { is_at_first_key_from_index_ = false; @@ -238,7 +313,18 @@ void BlockBasedTableIterator::Prev() { } void BlockBasedTableIterator::InitDataBlock() { - BlockHandle data_block_handle = index_iter_->value().handle; + BlockHandle data_block_handle; + bool is_in_cache = false; + bool use_block_cache_for_lookup = true; + + if (DoesContainBlockHandles()) { + data_block_handle = block_handles_.front().handle_; + is_in_cache = block_handles_.front().is_cache_hit_; + use_block_cache_for_lookup = false; + } else { + data_block_handle = index_iter_->value().handle; + } + if (!block_iter_points_to_real_block_ || data_block_handle.offset() != prev_block_offset_ || // if previous attempt of reading the block missed cache, try again @@ -246,26 +332,59 @@ void BlockBasedTableIterator::InitDataBlock() { if (block_iter_points_to_real_block_) { ResetDataIter(); } - auto* rep = table_->get_rep(); bool is_for_compaction = lookup_context_.caller == TableReaderCaller::kCompaction; - // Prefetch additional data for range scans (iterators). - // Implicit auto readahead: - // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. - // Explicit user requested readahead: - // Enabled from the very first IO when ReadOptions.readahead_size is set. 
- block_prefetcher_.PrefetchIfNeeded( - rep, data_block_handle, read_options_.readahead_size, is_for_compaction, - /*no_sequential_checking=*/false, read_options_.rate_limiter_priority); - Status s; - table_->NewDataBlockIterator( - read_options_, data_block_handle, &block_iter_, BlockType::kData, - /*get_context=*/nullptr, &lookup_context_, - block_prefetcher_.prefetch_buffer(), - /*for_compaction=*/is_for_compaction, /*async_read=*/false, s); + + // Initialize Data Block From CacheableEntry. + if (is_in_cache) { + Status s; + block_iter_.Invalidate(Status::OK()); + table_->NewDataBlockIterator( + read_options_, (block_handles_.front().cachable_entry_).As(), + &block_iter_, s); + } else { + auto* rep = table_->get_rep(); + + std::function readaheadsize_cb = + nullptr; + if (readahead_cache_lookup_) { + readaheadsize_cb = std::bind( + &BlockBasedTableIterator::BlockCacheLookupForReadAheadSize, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + } + + // Prefetch additional data for range scans (iterators). + // Implicit auto readahead: + // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. + // Explicit user requested readahead: + // Enabled from the very first IO when ReadOptions.readahead_size is + // set. + block_prefetcher_.PrefetchIfNeeded( + rep, data_block_handle, read_options_.readahead_size, + is_for_compaction, + /*no_sequential_checking=*/false, read_options_, readaheadsize_cb); + + Status s; + table_->NewDataBlockIterator( + read_options_, data_block_handle, &block_iter_, BlockType::kData, + /*get_context=*/nullptr, &lookup_context_, + block_prefetcher_.prefetch_buffer(), + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s, + use_block_cache_for_lookup); + } block_iter_points_to_real_block_ = true; + CheckDataBlockWithinUpperBound(); + if (!is_for_compaction && + (seek_stat_state_ & kDataBlockReadSinceLastSeek) == 0) { + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_DATA + : NON_LAST_LEVEL_SEEK_DATA); + seek_stat_state_ = static_cast( + seek_stat_state_ | kDataBlockReadSinceLastSeek | kReportOnUseful); + } } } @@ -282,6 +401,16 @@ void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { ResetDataIter(); } auto* rep = table_->get_rep(); + + std::function readaheadsize_cb = + nullptr; + if (readahead_cache_lookup_) { + readaheadsize_cb = std::bind( + &BlockBasedTableIterator::BlockCacheLookupForReadAheadSize, this, + std::placeholders::_1, std::placeholders::_2, + std::placeholders::_3); + } + // Prefetch additional data for range scans (iterators). // Implicit auto readahead: // Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0. 
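Both InitDataBlock() above and AsyncInitDataBlock() in the next hunk hand the prefetcher an optional sizing callback built with std::bind over a three-argument member function, so readahead sizing can consult the iterator's block-cache lookup. The toy sketch below only illustrates that adapter pattern; every name in it is invented, and the real callback signature is whatever BlockPrefetcher::PrefetchIfNeeded declares.

#include <cassert>
#include <cstdint>
#include <functional>

// Invented stand-in for the iterator that owns the lookup logic.
class ToyIterator {
 public:
  void LookupForReadAheadSize(uint64_t offset, size_t len,
                              uint64_t& readahead) {
    // Pretend the block cache told us half of the range is already cached.
    (void)offset;
    readahead = len / 2;
  }
};

// A prefetcher that accepts an optional sizing callback, mirroring how the
// iterator passes readaheadsize_cb down to the prefetch helper.
uint64_t Prefetch(uint64_t offset, size_t len,
                  const std::function<void(uint64_t, size_t, uint64_t&)>& cb) {
  uint64_t readahead = len;  // default: read the whole range ahead
  if (cb) {
    cb(offset, len, readahead);
  }
  return readahead;
}

int main() {
  ToyIterator it;
  auto cb = std::bind(&ToyIterator::LookupForReadAheadSize, &it,
                      std::placeholders::_1, std::placeholders::_2,
                      std::placeholders::_3);
  assert(Prefetch(/*offset=*/0, /*len=*/4096, cb) == 2048);
  return 0;
}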
@@ -294,14 +423,15 @@ void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { block_prefetcher_.PrefetchIfNeeded( rep, data_block_handle, read_options_.readahead_size, is_for_compaction, /*no_sequential_checking=*/read_options_.async_io, - read_options_.rate_limiter_priority); + read_options_, readaheadsize_cb); Status s; table_->NewDataBlockIterator( read_options_, data_block_handle, &block_iter_, BlockType::kData, /*get_context=*/nullptr, &lookup_context_, block_prefetcher_.prefetch_buffer(), - /*for_compaction=*/is_for_compaction, /*async_read=*/true, s); + /*for_compaction=*/is_for_compaction, /*async_read=*/true, s, + /*use_block_cache_for_lookup=*/true); if (s.IsTryAgain()) { async_read_in_progress_ = true; @@ -316,10 +446,20 @@ void BlockBasedTableIterator::AsyncInitDataBlock(bool is_first_pass) { read_options_, data_block_handle, &block_iter_, BlockType::kData, /*get_context=*/nullptr, &lookup_context_, block_prefetcher_.prefetch_buffer(), - /*for_compaction=*/is_for_compaction, /*async_read=*/false, s); + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s, + /*use_block_cache_for_lookup=*/false); } block_iter_points_to_real_block_ = true; CheckDataBlockWithinUpperBound(); + + if (!is_for_compaction && + (seek_stat_state_ & kDataBlockReadSinceLastSeek) == 0) { + RecordTick(table_->GetStatistics(), is_last_level_ + ? LAST_LEVEL_SEEK_DATA + : NON_LAST_LEVEL_SEEK_DATA); + seek_stat_state_ = static_cast( + seek_stat_state_ | kDataBlockReadSinceLastSeek | kReportOnUseful); + } async_read_in_progress_ = false; } @@ -338,20 +478,29 @@ bool BlockBasedTableIterator::MaterializeCurrentBlock() { block_iter_.SeekToFirst(); + // MaterializeCurrentBlock is called when block is actually read by + // calling InitDataBlock. is_at_first_key_from_index_ will be false for block + // handles placed in blockhandle. So index_ will be pointing to current block. + // After InitDataBlock, index_iter_ can point to different block if + // BlockCacheLookupForReadAheadSize is called. + Slice first_internal_key; + if (DoesContainBlockHandles()) { + first_internal_key = block_handles_.front().first_internal_key_; + } else { + first_internal_key = index_iter_->value().first_internal_key; + } + if (!block_iter_.Valid() || - icomp_.Compare(block_iter_.key(), - index_iter_->value().first_internal_key) != 0) { + icomp_.Compare(block_iter_.key(), first_internal_key) != 0) { block_iter_.Invalidate(Status::Corruption( "first key in index doesn't match first key in block")); return false; } - return true; } void BlockBasedTableIterator::FindKeyForward() { // This method's code is kept short to make it likely to be inlined. - assert(!is_out_of_bound_); assert(block_iter_points_to_real_block_); @@ -374,40 +523,72 @@ void BlockBasedTableIterator::FindBlockForward() { return; } // Whether next data block is out of upper bound, if there is one. - const bool next_block_is_out_of_bound = - read_options_.iterate_upper_bound != nullptr && + // index_iter_ can point to different block in case of + // readahead_cache_lookup_. readahead_cache_lookup_ will be handle the + // upper_bound check. 
+  bool next_block_is_out_of_bound =
+      IsIndexAtCurr() && read_options_.iterate_upper_bound != nullptr &&
       block_iter_points_to_real_block_ &&
       block_upper_bound_check_ == BlockUpperBound::kUpperBoundInCurBlock;
+
   assert(!next_block_is_out_of_bound ||
          user_comparator_.CompareWithoutTimestamp(
              *read_options_.iterate_upper_bound, /*a_has_ts=*/false,
              index_iter_->user_key(), /*b_has_ts=*/true) <= 0);
+
   ResetDataIter();
-  index_iter_->Next();
-  if (next_block_is_out_of_bound) {
-    // The next block is out of bound. No need to read it.
-    TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound", nullptr);
-    // We need to make sure this is not the last data block before setting
-    // is_out_of_bound_, since the index key for the last data block can be
-    // larger than smallest key of the next file on the same level.
-    if (index_iter_->Valid()) {
-      is_out_of_bound_ = true;
-    }
-    return;
-  }
-  if (!index_iter_->Valid()) {
-    return;
+  if (DoesContainBlockHandles()) {
+    // Advance to the next block handle in the queue and make it the current
+    // one.
+    block_handles_.pop_front();
   }
-  IndexValue v = index_iter_->value();

+  if (!DoesContainBlockHandles()) {
+    // For the readahead_cache_lookup_ enabled scenario -
+    // 1. In case of Seek, block_handles_ will be empty and the iterator
+    // should advance as usual with index_iter_->Next().
+    // 2. If block_handles_ is empty and the index is not at the current block
+    // because of the lookahead (during Next), index_iter_->Next() is skipped,
+    // as index_iter_ already points to the next block.
+    // 3. The last block could be out of bound and the lookahead in
+    // BlockCacheLookup does not iterate over it, so next_block_is_out_of_bound
+    // is set for that block here.
+    if (IsIndexAtCurr() || is_index_out_of_bound_) {
+      index_iter_->Next();
+      if (is_index_out_of_bound_) {
+        next_block_is_out_of_bound = is_index_out_of_bound_;
+        is_index_out_of_bound_ = false;
+      }
+    } else {
+      // Skip Next() since index_iter_ was already advanced while it iterated
+      // in BlockCacheLookupForReadAheadSize.
+      is_index_at_curr_block_ = true;
+    }

-  if (!v.first_internal_key.empty() && allow_unprepared_value_) {
-    // Index contains the first key of the block. Defer reading the block.
-    is_at_first_key_from_index_ = true;
-    return;
-  }
+    if (next_block_is_out_of_bound) {
+      // The next block is out of bound. No need to read it.
+      TEST_SYNC_POINT_CALLBACK("BlockBasedTableIterator:out_of_bound",
+                               nullptr);
+      // We need to make sure this is not the last data block before setting
+      // is_out_of_bound_, since the index key for the last data block can be
+      // larger than smallest key of the next file on the same level.
+      if (index_iter_->Valid()) {
+        is_out_of_bound_ = true;
+      }
+      return;
+    }
+
+    if (!index_iter_->Valid()) {
+      return;
+    }
+    IndexValue v = index_iter_->value();
+    if (!v.first_internal_key.empty() && allow_unprepared_value_) {
+      // Index contains the first key of the block. Defer reading the block.
+ is_at_first_key_from_index_ = true; + return; + } + } InitDataBlock(); block_iter_.SeekToFirst(); } while (!block_iter_.Valid()); @@ -446,7 +627,7 @@ void BlockBasedTableIterator::CheckOutOfBound() { } void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { - if (read_options_.iterate_upper_bound != nullptr && + if (IsIndexAtCurr() && read_options_.iterate_upper_bound != nullptr && block_iter_points_to_real_block_) { block_upper_bound_check_ = (user_comparator_.CompareWithoutTimestamp( *read_options_.iterate_upper_bound, @@ -456,4 +637,136 @@ void BlockBasedTableIterator::CheckDataBlockWithinUpperBound() { : BlockUpperBound::kUpperBoundInCurBlock; } } + +void BlockBasedTableIterator::FindReadAheadSizeUpperBound() { + size_t total_bytes_till_upper_bound = 0; + size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); + uint64_t start_offset = index_iter_->value().handle.offset(); + + do { + BlockHandle block_handle = index_iter_->value().handle; + total_bytes_till_upper_bound += block_handle.size(); + total_bytes_till_upper_bound += footer; + + // Can't figure out for current block if current block + // is out of bound. But for next block we can find that. + // If curr block's index key >= iterate_upper_bound, it + // means all the keys in next block or above are out of + // bound. + if (IsNextBlockOutOfBound()) { + break; + } + + // Since next block is not out of bound, iterate to that + // index block and add it's Data block size to + // readahead_size. + index_iter_->Next(); + + if (!index_iter_->Valid()) { + break; + } + + } while (true); + + block_prefetcher_.SetUpperBoundOffset(start_offset + + total_bytes_till_upper_bound); +} + +void BlockBasedTableIterator::BlockCacheLookupForReadAheadSize( + uint64_t offset, size_t readahead_size, size_t& updated_readahead_size) { + updated_readahead_size = readahead_size; + + // readahead_cache_lookup_ can be set false after Seek, if after Seek or Next + // there is SeekForPrev or any other backward operation. + if (!readahead_cache_lookup_) { + return; + } + + assert(!DoesContainBlockHandles()); + assert(index_iter_->value().handle.offset() == offset); + + // Error. current offset should be equal to what's requested for prefetching. + if (index_iter_->value().handle.offset() != offset) { + return; + } + + if (IsNextBlockOutOfBound()) { + updated_readahead_size = 0; + return; + } + + size_t current_readahead_size = 0; + size_t footer = table_->get_rep()->footer.GetBlockTrailerSize(); + + // Add the current block to block_handles_. + { + BlockHandleInfo block_handle_info; + block_handle_info.handle_ = index_iter_->value().handle; + block_handle_info.SetFirstInternalKey( + index_iter_->value().first_internal_key); + block_handles_.emplace_back(std::move(block_handle_info)); + } + + // Current block is included in length. Readahead should start from next + // block. + index_iter_->Next(); + is_index_at_curr_block_ = false; + + while (index_iter_->Valid()) { + BlockHandle block_handle = index_iter_->value().handle; + + // Adding this data block exceeds passed down readahead_size. So this data + // block won't be added. + if (current_readahead_size + block_handle.size() + footer > + readahead_size) { + break; + } + + current_readahead_size += block_handle.size(); + current_readahead_size += footer; + + // For current data block, do the lookup in the cache. Lookup should pin the + // data block and add the placeholder for cache. 
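// --- Illustrative sketch (standalone, simplified; not part of this patch) ---
// Size accounting used in the lookahead loop above: each data block
// contributes its payload size plus the block trailer, and accumulation stops
// once adding the next block would exceed the readahead budget (the
// FindReadAheadSizeUpperBound() variant instead stops at the upper bound).
// `kTrailerSize` is a stand-in for footer.GetBlockTrailerSize().
#include <cstddef>
#include <vector>

constexpr size_t kTrailerSize = 5;  // stand-in for the per-block trailer

size_t AccumulateWithinBudget(const std::vector<size_t>& block_sizes,
                              size_t readahead_budget) {
  size_t total = 0;
  for (size_t block_size : block_sizes) {
    if (total + block_size + kTrailerSize > readahead_budget) {
      break;  // adding this block would exceed the budget
    }
    total += block_size + kTrailerSize;
  }
  return total;
}
// --- End of sketch ---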
+ BlockHandleInfo block_handle_info; + block_handle_info.handle_ = index_iter_->value().handle; + block_handle_info.SetFirstInternalKey( + index_iter_->value().first_internal_key); + + Status s = table_->LookupAndPinBlocksInCache( + read_options_, block_handle, + &(block_handle_info.cachable_entry_).As()); + if (!s.ok()) { + break; + } + + block_handle_info.is_cache_hit_ = + (block_handle_info.cachable_entry_.GetValue() || + block_handle_info.cachable_entry_.GetCacheHandle()); + + // Add the handle to the queue. + block_handles_.emplace_back(std::move(block_handle_info)); + + // Can't figure out for current block if current block + // is out of bound. But for next block we can find that. + // If curr block's index key >= iterate_upper_bound, it + // means all the keys in next block or above are out of + // bound. + if (IsNextBlockOutOfBound()) { + is_index_out_of_bound_ = true; + break; + } + index_iter_->Next(); + }; + + // Iterate cache hit block handles from the end till a Miss is there, to + // update the readahead_size. + for (auto it = block_handles_.rbegin(); + it != block_handles_.rend() && (*it).is_cache_hit_ == true; ++it) { + current_readahead_size -= (*it).handle_.size(); + current_readahead_size -= footer; + } + updated_readahead_size = current_readahead_size; + ResetPreviousBlockOffset(); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index a2918b248666..7ed7e3375a1d 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include + #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_based_table_reader_impl.h" #include "table/block_based/block_prefetcher.h" @@ -41,9 +43,10 @@ class BlockBasedTableIterator : public InternalIteratorBase { block_iter_points_to_real_block_(false), check_filter_(check_filter), need_upper_bound_check_(need_upper_bound_check), - async_read_in_progress_(false) {} + async_read_in_progress_(false), + is_last_level_(table->IsLastLevel()) {} - ~BlockBasedTableIterator() {} + ~BlockBasedTableIterator() override { ClearBlockHandles(); } void Seek(const Slice& target) override; void SeekForPrev(const Slice& target) override; @@ -57,6 +60,11 @@ class BlockBasedTableIterator : public InternalIteratorBase { (is_at_first_key_from_index_ || (block_iter_points_to_real_block_ && block_iter_.Valid())); } + + // For block cache readahead lookup scenario - + // If is_at_first_key_from_index_ is true, InitDataBlock hasn't been + // called. It means block_handles is empty and index_ point to current block. + // So index_iter_ can be accessed directly. Slice key() const override { assert(Valid()); if (is_at_first_key_from_index_) { @@ -73,6 +81,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { return block_iter_.user_key(); } } + bool PrepareValue() override { assert(Valid()); @@ -88,11 +97,27 @@ class BlockBasedTableIterator : public InternalIteratorBase { assert(!is_at_first_key_from_index_); assert(Valid()); + if (seek_stat_state_ & kReportOnUseful) { + bool filter_used = (seek_stat_state_ & kFilterUsed) != 0; + RecordTick( + table_->GetStatistics(), + filter_used + ? (is_last_level_ ? 
LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH + : NON_LAST_LEVEL_SEEK_DATA_USEFUL_FILTER_MATCH) + : (is_last_level_ ? LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER + : NON_LAST_LEVEL_SEEK_DATA_USEFUL_NO_FILTER)); + seek_stat_state_ = kDataBlockReadSinceLastSeek; + } + return block_iter_.value(); } Status status() const override { - // Prefix index set status to NotFound when the prefix does not exist - if (!index_iter_->status().ok() && !index_iter_->status().IsNotFound()) { + // In case of block cache readahead lookup, it won't add the block to + // block_handles if it's index is invalid. So index_iter_->status check can + // be skipped. + // Prefix index set status to NotFound when the prefix does not exist. + if (IsIndexAtCurr() && !index_iter_->status().ok() && + !index_iter_->status().IsNotFound()) { return index_iter_->status(); } else if (block_iter_points_to_real_block_) { return block_iter_.status(); @@ -146,7 +171,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { } void SavePrevIndexValue() { - if (block_iter_points_to_real_block_) { + if (block_iter_points_to_real_block_ && IsIndexAtCurr()) { // Reseek. If they end up with the same data block, we shouldn't re-fetch // the same data block. prev_block_offset_ = index_iter_->value().handle.offset(); @@ -204,12 +229,46 @@ class BlockBasedTableIterator : public InternalIteratorBase { // bound. // If the boundary key hasn't been checked against the upper bound, // kUnknown can be used. - enum class BlockUpperBound { + enum class BlockUpperBound : uint8_t { kUpperBoundInCurBlock, kUpperBoundBeyondCurBlock, kUnknown, }; + // State bits for collecting stats on seeks and whether they returned useful + // results. + enum SeekStatState : uint8_t { + kNone = 0, + // Most recent seek checked prefix filter (or similar future feature) + kFilterUsed = 1 << 0, + // Already recorded that a data block was accessed since the last seek. + kDataBlockReadSinceLastSeek = 1 << 1, + // Have not yet recorded that a value() was accessed. + kReportOnUseful = 1 << 2, + }; + + // BlockHandleInfo is used to store the info needed when block cache lookup + // ahead is enabled to tune readahead_size. + struct BlockHandleInfo { + void SetFirstInternalKey(const Slice& key) { + if (key.empty()) { + return; + } + size_t size = key.size(); + buf_ = std::unique_ptr(new char[size]); + memcpy(buf_.get(), key.data(), size); + first_internal_key_ = Slice(buf_.get(), size); + } + + BlockHandle handle_; + bool is_cache_hit_ = false; + CachableEntry cachable_entry_; + Slice first_internal_key_; + std::unique_ptr buf_; + }; + + bool IsIndexAtCurr() const { return is_index_at_curr_block_; } + const BlockBasedTable* table_; const ReadOptions& read_options_; const InternalKeyComparator& icomp_; @@ -240,6 +299,32 @@ class BlockBasedTableIterator : public InternalIteratorBase { bool async_read_in_progress_; + mutable SeekStatState seek_stat_state_ = SeekStatState::kNone; + bool is_last_level_; + + // If set to true, it'll lookup in the cache ahead to estimate the readahead + // size based on cache hit and miss. + bool readahead_cache_lookup_ = false; + + // It stores all the block handles that are lookuped in cache ahead when + // BlockCacheLookupForReadAheadSize is called. Since index_iter_ may point to + // different blocks when readahead_size is calculated in + // BlockCacheLookupForReadAheadSize, to avoid index_iter_ reseek, + // block_handles_ is used. 
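// --- Illustrative sketch (standalone, simplified; not part of this patch) ---
// Why BlockHandleInfo::SetFirstInternalKey() above copies the key bytes into
// an owned buffer: the index iterator keeps advancing during the cache
// lookahead that fills the block_handles_ deque declared just below, so a
// Slice taken from index_iter_->value() would dangle. Each queued handle
// therefore owns a private copy of its first internal key. `SliceLike` is a
// stand-in for rocksdb::Slice.
#include <cstddef>
#include <cstring>
#include <memory>

struct SliceLike {
  const char* data = nullptr;
  size_t size = 0;
};

struct OwnedKey {
  SliceLike view;
  std::unique_ptr<char[]> buf;

  void Set(const SliceLike& key) {
    if (key.size == 0) {
      return;
    }
    buf.reset(new char[key.size]);
    std::memcpy(buf.get(), key.data, key.size);
    view = {buf.get(), key.size};  // view now points into owned storage
  }
};
// --- End of sketch ---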
+ std::deque block_handles_; + + // During cache lookup to find readahead size, index_iter_ is iterated and it + // can point to a different block. is_index_at_curr_block_ keeps track of + // that. + bool is_index_at_curr_block_ = true; + bool is_index_out_of_bound_ = false; + + // Used in case of auto_readahead_size to disable the block_cache lookup if + // direction is reversed from forward to backward. In case of backward + // direction, SeekForPrev or Prev might call Seek from db_iter. So direction + // is used to disable the lookup. + IterDirection direction_ = IterDirection::kForward; + // If `target` is null, seek to first. void SeekImpl(const Slice* target, bool async_prefetch); @@ -257,16 +342,18 @@ class BlockBasedTableIterator : public InternalIteratorBase { // we need to check and update data_block_within_upper_bound_ accordingly. void CheckDataBlockWithinUpperBound(); - bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction) { + bool CheckPrefixMayMatch(const Slice& ikey, IterDirection direction, + bool* filter_checked) { if (need_upper_bound_check_ && direction == IterDirection::kBackward) { // Upper bound check isn't sufficient for backward direction to // guarantee the same result as total order, so disable prefix // check. return true; } - if (check_filter_ && !table_->PrefixRangeMayMatch( - ikey, read_options_, prefix_extractor_, - need_upper_bound_check_, &lookup_context_)) { + if (check_filter_ && + !table_->PrefixRangeMayMatch(ikey, read_options_, prefix_extractor_, + need_upper_bound_check_, &lookup_context_, + filter_checked)) { // TODO remember the iterator is invalidated because of prefix // match. This can avoid the upper level file iterator to falsely // believe the position is the end of the SST file and move to @@ -276,5 +363,42 @@ class BlockBasedTableIterator : public InternalIteratorBase { } return true; } + + // *** BEGIN APIs relevant to auto tuning of readahead_size *** + void FindReadAheadSizeUpperBound(); + + // This API is called to lookup the data blocks ahead in the cache to estimate + // the current readahead_size. + void BlockCacheLookupForReadAheadSize(uint64_t offset, size_t readahead_size, + size_t& updated_readahead_size); + + void ResetBlockCacheLookupVar() { + is_index_out_of_bound_ = false; + readahead_cache_lookup_ = false; + ClearBlockHandles(); + } + + bool IsNextBlockOutOfBound() { + // If curr block's index key >= iterate_upper_bound, it means all the keys + // in next block or above are out of bound. + return (user_comparator_.CompareWithoutTimestamp( + index_iter_->user_key(), + /*a_has_ts=*/true, *read_options_.iterate_upper_bound, + /*b_has_ts=*/false) >= 0 + ? true + : false); + } + + void ClearBlockHandles() { block_handles_.clear(); } + + // Reset prev_block_offset_. If index_iter_ has moved ahead, it won't get + // accurate prev_block_offset_. 
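// --- Illustrative sketch (standalone, simplified; not part of this patch) ---
// The comparison behind IsNextBlockOutOfBound() above: the index key of the
// current block sits at or above every key in that block and below every key
// of the next block, so once it compares >= iterate_upper_bound, the next
// block and everything after it is entirely out of range (the current block
// itself may still contain in-range keys). A plain std::string comparison
// stands in for the user comparator here.
#include <string>

bool NextBlockIsOutOfBound(const std::string& index_user_key,
                           const std::string& iterate_upper_bound) {
  return index_user_key >= iterate_upper_bound;
}
// --- End of sketch ---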
+ void ResetPreviousBlockOffset() { + prev_block_offset_ = std::numeric_limits::max(); + } + + bool DoesContainBlockHandles() { return !block_handles_.empty(); } + + // *** END APIs relevant to auto tuning of readahead_size *** }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index d0d911a2e944..678cdf469e6d 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -88,21 +88,31 @@ CacheAllocationPtr CopyBufferToHeap(MemoryAllocator* allocator, Slice& buf) { // Explicitly instantiate templates for each "blocklike" type we use (and // before implicit specialization). // This makes it possible to keep the template definitions in the .cc file. -#define INSTANTIATE_RETRIEVE_BLOCK(T) \ - template Status BlockBasedTable::RetrieveBlock( \ - FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ - const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ - CachableEntry* out_parsed_block, GetContext* get_context, \ - BlockCacheLookupContext* lookup_context, bool for_compaction, \ - bool use_cache, bool wait_for_cache, bool async_read) const; - -INSTANTIATE_RETRIEVE_BLOCK(ParsedFullFilterBlock); -INSTANTIATE_RETRIEVE_BLOCK(UncompressionDict); -INSTANTIATE_RETRIEVE_BLOCK(Block_kData); -INSTANTIATE_RETRIEVE_BLOCK(Block_kIndex); -INSTANTIATE_RETRIEVE_BLOCK(Block_kFilterPartitionIndex); -INSTANTIATE_RETRIEVE_BLOCK(Block_kRangeDeletion); -INSTANTIATE_RETRIEVE_BLOCK(Block_kMetaIndex); +#define INSTANTIATE_BLOCKLIKE_TEMPLATES(T) \ + template Status BlockBasedTable::RetrieveBlock( \ + FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ + const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ + CachableEntry* out_parsed_block, GetContext* get_context, \ + BlockCacheLookupContext* lookup_context, bool for_compaction, \ + bool use_cache, bool async_read, bool use_block_cache_for_lookup) const; \ + template Status BlockBasedTable::MaybeReadBlockAndLoadToCache( \ + FilePrefetchBuffer * prefetch_buffer, const ReadOptions& ro, \ + const BlockHandle& handle, const UncompressionDict& uncompression_dict, \ + bool for_compaction, CachableEntry* block_entry, \ + GetContext* get_context, BlockCacheLookupContext* lookup_context, \ + BlockContents* contents, bool async_read, \ + bool use_block_cache_for_lookup) const; \ + template Status BlockBasedTable::LookupAndPinBlocksInCache( \ + const ReadOptions& ro, const BlockHandle& handle, \ + CachableEntry* out_parsed_block) const; + +INSTANTIATE_BLOCKLIKE_TEMPLATES(ParsedFullFilterBlock); +INSTANTIATE_BLOCKLIKE_TEMPLATES(UncompressionDict); +INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kData); +INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kIndex); +INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kFilterPartitionIndex); +INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kRangeDeletion); +INSTANTIATE_BLOCKLIKE_TEMPLATES(Block_kMetaIndex); } // namespace ROCKSDB_NAMESPACE @@ -560,7 +570,8 @@ Status BlockBasedTable::Open( const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, - std::unique_ptr* table_reader, + uint8_t block_protection_bytes_per_key, + std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr, const std::shared_ptr& prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, @@ -569,22 +580,22 @@ Status 
BlockBasedTable::Open( TailPrefetchStats* tail_prefetch_stats, BlockCacheTracer* const block_cache_tracer, size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id, - uint64_t cur_file_num, UniqueId64x2 expected_unique_id) { + uint64_t cur_file_num, UniqueId64x2 expected_unique_id, + const bool user_defined_timestamps_persisted) { table_reader->reset(); Status s; Footer footer; std::unique_ptr prefetch_buffer; - // From read_options, retain deadline, io_timeout, and rate_limiter_priority. - // In future, we may retain more - // options. Specifically, we ignore verify_checksums and default to - // checksum verification anyway when creating the index and filter - // readers. + // From read_options, retain deadline, io_timeout, rate_limiter_priority, and + // verify_checksums. In future, we may retain more options. ReadOptions ro; ro.deadline = read_options.deadline; ro.io_timeout = read_options.io_timeout; ro.rate_limiter_priority = read_options.rate_limiter_priority; + ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; @@ -593,7 +604,8 @@ Status BlockBasedTable::Open( if (!ioptions.allow_mmap_reads) { s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, - &prefetch_buffer); + &prefetch_buffer, ioptions.stats, tail_size, + ioptions.logger); // Return error in prefetch path to users. if (!s.ok()) { return s; @@ -630,9 +642,9 @@ Status BlockBasedTable::Open( } BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; - Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options, - internal_comparator, skip_filters, - file_size, level, immortal_table); + Rep* rep = new BlockBasedTable::Rep( + ioptions, env_options, table_options, internal_comparator, skip_filters, + file_size, level, immortal_table, user_defined_timestamps_persisted); rep->file = std::move(file); rep->footer = footer; @@ -646,6 +658,7 @@ Status BlockBasedTable::Open( // meta-block reads. 
rep->compression_dict_handle = BlockHandle::NullBlockHandle(); + rep->create_context.protection_bytes_per_key = block_protection_bytes_per_key; // Read metaindex std::unique_ptr new_table( new BlockBasedTable(rep, block_cache_tracer)); @@ -672,9 +685,11 @@ Status BlockBasedTable::Open( CompressionTypeToString(kZSTD) || rep->table_properties->compression_name == CompressionTypeToString(kZSTDNotFinalCompression)); - rep->create_context = - BlockCreateContext(&rep->table_options, rep->ioptions.stats, - blocks_definitely_zstd_compressed); + rep->create_context = BlockCreateContext( + &rep->table_options, &rep->ioptions, rep->ioptions.stats, + blocks_definitely_zstd_compressed, block_protection_bytes_per_key, + rep->internal_comparator.user_comparator(), rep->index_value_is_full, + rep->index_has_first_key); // Check expected unique id if provided if (expected_unique_id != kNullUniqueId64x2) { @@ -734,7 +749,6 @@ Status BlockBasedTable::Open( rep->table_prefix_extractor = prefix_extractor; } else { // Current prefix_extractor doesn't match table -#ifndef ROCKSDB_LITE if (rep->table_properties) { //**TODO: If/When the DBOptions has a registry in it, the ConfigOptions // will need to use it @@ -750,7 +764,6 @@ Status BlockBasedTable::Open( st.ToString().c_str()); } } -#endif // ROCKSDB_LITE } // With properties loaded, we can set up portable/stable cache keys @@ -761,12 +774,14 @@ Status BlockBasedTable::Open( PersistentCacheOptions(rep->table_options.persistent_cache, rep->base_cache_key, rep->ioptions.stats); + // TODO(yuzhangyu): handle range deletion entries for UDT in memtable only. s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(), metaindex_iter.get(), internal_comparator, &lookup_context); if (!s.ok()) { return s; } + rep->verify_checksum_set_on_open = ro.verify_checksums; s = new_table->PrefetchIndexAndFilterBlocks( ro, prefetch_buffer.get(), metaindex_iter.get(), new_table.get(), prefetch_all, table_options, level, file_size, @@ -806,21 +821,37 @@ Status BlockBasedTable::PrefetchTail( const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer) { + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger) { + assert(tail_size <= file_size); + size_t tail_prefetch_size = 0; - if (tail_prefetch_stats != nullptr) { - // Multiple threads may get a 0 (no history) when running in parallel, - // but it will get cleared after the first of them finishes. - tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); - } - if (tail_prefetch_size == 0) { - // Before read footer, readahead backwards to prefetch data. Do more - // readahead if we're going to read index/filter. - // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. - // That's because we need to issue readahead before we read the properties, - // at which point we don't yet know the index type. - tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + if (tail_size != 0) { + tail_prefetch_size = tail_size; + } else { + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. 
+ tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the + // properties, at which point we don't yet know the index type. + tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + + ROCKS_LOG_WARN(logger, + "Tail prefetch size %zu is calculated based on heuristics", + tail_prefetch_size); + } else { + ROCKS_LOG_WARN( + logger, + "Tail prefetch size %zu is calculated based on TailPrefetchStats", + tail_prefetch_size); + } } size_t prefetch_off; size_t prefetch_len; @@ -831,13 +862,19 @@ Status BlockBasedTable::PrefetchTail( prefetch_off = static_cast(file_size - tail_prefetch_size); prefetch_len = tail_prefetch_size; } + +#ifndef NDEBUG + std::pair prefetch_off_len_pair = {&prefetch_off, + &prefetch_len}; TEST_SYNC_POINT_CALLBACK("BlockBasedTable::Open::TailPrefetchLen", - &tail_prefetch_size); + &prefetch_off_len_pair); +#endif // NDEBUG + IOOptions opts; + Status s = file->PrepareIOOptions(ro, opts); // Try file system prefetch - if (!file->use_direct_io() && !force_direct_prefetch) { - if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority) - .IsNotSupported()) { + if (s.ok() && !file->use_direct_io() && !force_direct_prefetch) { + if (!file->Prefetch(opts, prefetch_off, prefetch_len).IsNotSupported()) { prefetch_buffer->reset(new FilePrefetchBuffer( 0 /* readahead_size */, 0 /* max_readahead_size */, false /* enable */, true /* track_min_offset */)); @@ -846,16 +883,16 @@ Status BlockBasedTable::PrefetchTail( } // Use `FilePrefetchBuffer` - prefetch_buffer->reset( - new FilePrefetchBuffer(0 /* readahead_size */, 0 /* max_readahead_size */, - true /* enable */, true /* track_min_offset */)); + prefetch_buffer->reset(new FilePrefetchBuffer( + 0 /* readahead_size */, 0 /* max_readahead_size */, true /* enable */, + true /* track_min_offset */, false /* implicit_auto_readahead */, + 0 /* num_file_reads */, 0 /* num_file_reads_for_auto_readahead */, + 0 /* upper_bound_offset */, nullptr /* fs */, nullptr /* clock */, stats, + /* readahead_cb */ nullptr, + FilePrefetchBufferUsage::kTableOpenPrefetchTail)); - IOOptions opts; - Status s = file->PrepareIOOptions(ro, opts); if (s.ok()) { - s = (*prefetch_buffer) - ->Prefetch(opts, file, prefetch_off, prefetch_len, - ro.rate_limiter_priority); + s = (*prefetch_buffer)->Prefetch(opts, file, prefetch_off, prefetch_len); } return s; } @@ -917,10 +954,18 @@ Status BlockBasedTable::ReadPropertiesBlock( // If table properties don't contain index type, we assume that the table // is in very old format and has kBinarySearch index type. 
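// --- Illustrative sketch (standalone, simplified; not part of this patch) ---
// Order of preference used above when sizing the tail prefetch: an explicitly
// known tail_size (when available) wins, then the suggestion from
// TailPrefetchStats, then the fixed heuristic (512KB when index/filter will
// be prefetched, 4KB otherwise). The prefetch offset is clamped so it never
// underflows when the tail is larger than the file.
#include <cstddef>
#include <cstdint>
#include <utility>

size_t PickTailPrefetchSize(uint64_t tail_size, size_t stats_suggestion,
                            bool prefetch_all_or_preload_all) {
  if (tail_size != 0) {
    return static_cast<size_t>(tail_size);
  }
  if (stats_suggestion != 0) {
    return stats_suggestion;
  }
  return prefetch_all_or_preload_all ? 512 * 1024 : 4 * 1024;
}

// Returns {prefetch_off, prefetch_len}.
std::pair<size_t, size_t> TailPrefetchRange(uint64_t file_size,
                                            size_t tail_prefetch_size) {
  if (tail_prefetch_size > file_size) {
    return {0, static_cast<size_t>(file_size)};  // whole file fits in the tail
  }
  return {static_cast<size_t>(file_size - tail_prefetch_size),
          tail_prefetch_size};
}
// --- End of sketch ---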
auto& props = rep_->table_properties->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { + auto index_type_pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (index_type_pos != props.end()) { rep_->index_type = static_cast( - DecodeFixed32(pos->second.c_str())); + DecodeFixed32(index_type_pos->second.c_str())); + } + auto min_ts_pos = props.find("rocksdb.timestamp_min"); + if (min_ts_pos != props.end()) { + rep_->min_timestamp = Slice(min_ts_pos->second); + } + auto max_ts_pos = props.find("rocksdb.timestamp_max"); + if (max_ts_pos != props.end()) { + rep_->max_timestamp = Slice(max_ts_pos->second); } rep_->index_has_first_key = @@ -954,7 +999,8 @@ Status BlockBasedTable::ReadRangeDelBlock( read_options, range_del_handle, /*input_iter=*/nullptr, BlockType::kRangeDeletion, /*get_context=*/nullptr, lookup_context, prefetch_buffer, - /*for_compaction= */ false, /*async_read= */ false, tmp_status)); + /*for_compaction= */ false, /*async_read= */ false, tmp_status, + /*use_block_cache_for_lookup=*/true)); assert(iter != nullptr); s = iter->status(); if (!s.ok()) { @@ -1128,7 +1174,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks if (prefetch_all || pin_partition) { - s = rep_->index_reader->CacheDependencies(ro, pin_partition); + s = rep_->index_reader->CacheDependencies(ro, pin_partition, + prefetch_buffer); } if (!s.ok()) { return s; @@ -1152,7 +1199,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (filter) { // Refer to the comment above about paritioned indexes always being cached if (prefetch_all || pin_partition) { - s = filter->CacheDependencies(ro, pin_partition); + s = filter->CacheDependencies(ro, pin_partition, prefetch_buffer); if (!s.ok()) { return s; } @@ -1177,23 +1224,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( return s; } -void BlockBasedTable::SetupForCompaction() { - switch (rep_->ioptions.access_hint_on_compaction_start) { - case Options::NONE: - break; - case Options::NORMAL: - rep_->file->file()->Hint(FSRandomAccessFile::kNormal); - break; - case Options::SEQUENTIAL: - rep_->file->file()->Hint(FSRandomAccessFile::kSequential); - break; - case Options::WILLNEED: - rep_->file->file()->Hint(FSRandomAccessFile::kWillNeed); - break; - default: - assert(false); - } -} +void BlockBasedTable::SetupForCompaction() {} std::shared_ptr BlockBasedTable::GetTableProperties() const { @@ -1255,36 +1286,42 @@ Status BlockBasedTable::ReadMetaIndexBlock( } template -WithBlocklikeCheck BlockBasedTable::GetDataBlockFromCache( - const Slice& cache_key, BlockCacheInterface block_cache, - CompressedBlockCacheInterface block_cache_compressed, - const ReadOptions& read_options, - CachableEntry* out_parsed_block, - const UncompressionDict& uncompression_dict, const bool wait, - GetContext* get_context) const { - assert(out_parsed_block); - assert(out_parsed_block->IsEmpty()); +Cache::Priority BlockBasedTable::GetCachePriority() const { // Here we treat the legacy name "...index_and_filter_blocks..." to mean all // metadata blocks that might go into block cache, EXCEPT only those needed // for the read path (Get, etc.). 
TableProperties should not be needed on the // read path (prefix extractor setting is an O(1) size special case that we // are working not to require from TableProperties), so it is not given // high-priority treatment if it should go into BlockCache. - const Cache::Priority priority = - rep_->table_options.cache_index_and_filter_blocks_with_high_priority && - TBlocklike::kBlockType != BlockType::kData && - TBlocklike::kBlockType != BlockType::kProperties - ? Cache::Priority::HIGH - : Cache::Priority::LOW; + if constexpr (TBlocklike::kBlockType == BlockType::kData || + TBlocklike::kBlockType == BlockType::kProperties) { + return Cache::Priority::LOW; + } else if (rep_->table_options + .cache_index_and_filter_blocks_with_high_priority) { + return Cache::Priority::HIGH; + } else { + return Cache::Priority::LOW; + } +} + +template +WithBlocklikeCheck BlockBasedTable::GetDataBlockFromCache( + const Slice& cache_key, BlockCacheInterface block_cache, + CachableEntry* out_parsed_block, GetContext* get_context, + const UncompressionDict* dict) const { + assert(out_parsed_block); + assert(out_parsed_block->IsEmpty()); Status s; Statistics* statistics = rep_->ioptions.statistics.get(); // Lookup uncompressed cache first if (block_cache) { + BlockCreateContext create_ctx = rep_->create_context; + create_ctx.dict = dict; assert(!cache_key.empty()); auto cache_handle = block_cache.LookupFull( - cache_key, &rep_->create_context, priority, wait, statistics, + cache_key, &create_ctx, GetCachePriority(), statistics, rep_->ioptions.lowest_used_cache_tier); // Avoid updating metrics here if the handle is not complete yet. This @@ -1306,84 +1343,19 @@ WithBlocklikeCheck BlockBasedTable::GetDataBlockFromCache( // If not found, search from the compressed block cache. assert(out_parsed_block->IsEmpty()); - if (!block_cache_compressed) { - return s; - } - - assert(!cache_key.empty()); - BlockContents contents; - auto block_cache_compressed_handle = - block_cache_compressed.Lookup(cache_key, statistics); - - // if we found in the compressed cache, then uncompress and insert into - // uncompressed cache - if (block_cache_compressed_handle == nullptr) { - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); - return s; - } - - // found compressed block - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT); - BlockContents* compressed_block = - block_cache_compressed.Value(block_cache_compressed_handle); - CompressionType compression_type = GetBlockCompressionType(*compressed_block); - assert(compression_type != kNoCompression); - - // Retrieve the uncompressed contents into a new buffer - UncompressionContext context(compression_type); - UncompressionInfo info(context, uncompression_dict, compression_type); - s = UncompressSerializedBlock( - info, compressed_block->data.data(), compressed_block->data.size(), - &contents, rep_->table_options.format_version, rep_->ioptions, - GetMemoryAllocator(rep_->table_options)); - - // Insert parsed block into block cache, the priority is based on the - // data block type. 
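// --- Illustrative sketch (standalone; not part of this patch) ---
// The priority rule factored into GetCachePriority() above: data and
// properties blocks always get LOW priority; other block types get HIGH only
// when cache_index_and_filter_blocks_with_high_priority is set. The enums and
// template tag below are simplified stand-ins.
enum class BlockTypeTag { kData, kProperties, kIndex, kFilter };
enum class PriorityTag { kLow, kHigh };

template <BlockTypeTag kBlockType>
PriorityTag GetCachePrioritySketch(bool index_and_filter_high_priority) {
  if constexpr (kBlockType == BlockTypeTag::kData ||
                kBlockType == BlockTypeTag::kProperties) {
    return PriorityTag::kLow;
  } else {
    return index_and_filter_high_priority ? PriorityTag::kHigh
                                          : PriorityTag::kLow;
  }
}
// --- End of sketch ---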
- if (s.ok()) { - std::unique_ptr block_holder; - rep_->create_context.Create(&block_holder, std::move(contents)); - - if (block_cache && block_holder->own_bytes() && read_options.fill_cache) { - size_t charge = block_holder->ApproximateMemoryUsage(); - BlockCacheTypedHandle* cache_handle = nullptr; - s = block_cache.InsertFull(cache_key, block_holder.get(), charge, - &cache_handle, priority, - rep_->ioptions.lowest_used_cache_tier); - if (s.ok()) { - assert(cache_handle != nullptr); - out_parsed_block->SetCachedValue(block_holder.release(), - block_cache.get(), cache_handle); - - UpdateCacheInsertionMetrics(TBlocklike::kBlockType, get_context, charge, - s.IsOkOverwritten(), rep_->ioptions.stats); - } else { - RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); - } - } else { - out_parsed_block->SetOwnedValue(std::move(block_holder)); - } - } - - // Release hold on compressed cache entry - block_cache_compressed.Release(block_cache_compressed_handle); return s; } template WithBlocklikeCheck BlockBasedTable::PutDataBlockToCache( const Slice& cache_key, BlockCacheInterface block_cache, - CompressedBlockCacheInterface block_cache_compressed, - CachableEntry* out_parsed_block, BlockContents&& block_contents, - CompressionType block_comp_type, + CachableEntry* out_parsed_block, + BlockContents&& uncompressed_block_contents, + BlockContents&& compressed_block_contents, CompressionType block_comp_type, const UncompressionDict& uncompression_dict, MemoryAllocator* memory_allocator, GetContext* get_context) const { const ImmutableOptions& ioptions = rep_->ioptions; const uint32_t format_version = rep_->table_options.format_version; - const Cache::Priority priority = - rep_->table_options.cache_index_and_filter_blocks_with_high_priority && - TBlocklike::kBlockType != BlockType::kData - ? Cache::Priority::HIGH - : Cache::Priority::LOW; assert(out_parsed_block); assert(out_parsed_block->IsEmpty()); @@ -1391,57 +1363,31 @@ WithBlocklikeCheck BlockBasedTable::PutDataBlockToCache( Statistics* statistics = ioptions.stats; std::unique_ptr block_holder; - if (block_comp_type != kNoCompression) { + if (block_comp_type != kNoCompression && + uncompressed_block_contents.data.empty()) { + assert(compressed_block_contents.data.data()); // Retrieve the uncompressed contents into a new buffer - BlockContents uncompressed_block_contents; UncompressionContext context(block_comp_type); UncompressionInfo info(context, uncompression_dict, block_comp_type); - s = UncompressBlockData(info, block_contents.data.data(), - block_contents.data.size(), + s = UncompressBlockData(info, compressed_block_contents.data.data(), + compressed_block_contents.data.size(), &uncompressed_block_contents, format_version, ioptions, memory_allocator); if (!s.ok()) { return s; } - rep_->create_context.Create(&block_holder, - std::move(uncompressed_block_contents)); - } else { - rep_->create_context.Create(&block_holder, std::move(block_contents)); - } - - // Insert compressed block into compressed block cache. - // Release the hold on the compressed cache entry immediately. - if (block_cache_compressed && block_comp_type != kNoCompression && - block_contents.own_bytes()) { - assert(block_contents.has_trailer); - assert(!cache_key.empty()); - - // We cannot directly put block_contents because this could point to - // an object in the stack. 
- auto block_cont_for_comp_cache = - std::make_unique(std::move(block_contents)); - size_t charge = block_cont_for_comp_cache->ApproximateMemoryUsage(); - - s = block_cache_compressed.Insert(cache_key, - block_cont_for_comp_cache.get(), charge, - nullptr /*handle*/, Cache::Priority::LOW); - - if (s.ok()) { - // Cache took ownership - block_cont_for_comp_cache.release(); - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); - } else { - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); - } } + rep_->create_context.Create(&block_holder, + std::move(uncompressed_block_contents)); // insert into uncompressed block cache if (block_cache && block_holder->own_bytes()) { size_t charge = block_holder->ApproximateMemoryUsage(); BlockCacheTypedHandle* cache_handle = nullptr; s = block_cache.InsertFull(cache_key, block_holder.get(), charge, - &cache_handle, priority, - rep_->ioptions.lowest_used_cache_tier); + &cache_handle, GetCachePriority(), + rep_->ioptions.lowest_used_cache_tier, + compressed_block_contents.data, block_comp_type); if (s.ok()) { assert(cache_handle != nullptr); @@ -1511,7 +1457,8 @@ DataBlockIter* BlockBasedTable::InitBlockIterator( DataBlockIter* input_iter, bool block_contents_pinned) { return block->NewDataIterator(rep->internal_comparator.user_comparator(), rep->get_global_seqno(block_type), input_iter, - rep->ioptions.stats, block_contents_pinned); + rep->ioptions.stats, block_contents_pinned, + rep->user_defined_timestamps_persisted); } // TODO? @@ -1524,7 +1471,63 @@ IndexBlockIter* BlockBasedTable::InitBlockIterator( rep->get_global_seqno(block_type), input_iter, rep->ioptions.stats, /* total_order_seek */ true, rep->index_has_first_key, rep->index_key_includes_seq, rep->index_value_is_full, - block_contents_pinned); + block_contents_pinned, rep->user_defined_timestamps_persisted); +} + +// Right now only called for Data blocks. +template +Status BlockBasedTable::LookupAndPinBlocksInCache( + const ReadOptions& ro, const BlockHandle& handle, + CachableEntry* out_parsed_block) const { + BlockCacheInterface block_cache{ + rep_->table_options.block_cache.get()}; + + assert(block_cache); + + Status s; + CachableEntry uncompression_dict; + if (rep_->uncompression_dict_reader) { + const bool no_io = (ro.read_tier == kBlockCacheTier); + s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( + /* prefetch_buffer= */ nullptr, ro, no_io, ro.verify_checksums, + /* get_context= */ nullptr, /* lookup_context= */ nullptr, + &uncompression_dict); + if (!s.ok()) { + return s; + } + } + + // Do the lookup. + CacheKey key_data = GetCacheKey(rep_->base_cache_key, handle); + const Slice key = key_data.AsSlice(); + + Statistics* statistics = rep_->ioptions.statistics.get(); + + BlockCreateContext create_ctx = rep_->create_context; + create_ctx.dict = uncompression_dict.GetValue() + ? uncompression_dict.GetValue() + : &UncompressionDict::GetEmptyDict(); + + auto cache_handle = + block_cache.LookupFull(key, &create_ctx, GetCachePriority(), + statistics, rep_->ioptions.lowest_used_cache_tier); + + if (!cache_handle) { + UpdateCacheMissMetrics(TBlocklike::kBlockType, /* get_context = */ nullptr); + return s; + } + + // Found in Cache. 
+ TBlocklike* value = block_cache.Value(cache_handle); + if (value) { + UpdateCacheHitMetrics(TBlocklike::kBlockType, /* get_context = */ nullptr, + block_cache.get()->GetUsage(cache_handle)); + } + out_parsed_block->SetCachedValue(value, block_cache.get(), cache_handle); + + assert(!out_parsed_block->IsEmpty()); + + return s; } // If contents is nullptr, this function looks up the block caches for the @@ -1537,17 +1540,14 @@ WithBlocklikeCheck BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - const bool wait, const bool for_compaction, - CachableEntry* out_parsed_block, GetContext* get_context, - BlockCacheLookupContext* lookup_context, BlockContents* contents, - bool async_read) const { + bool for_compaction, CachableEntry* out_parsed_block, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents, bool async_read, + bool use_block_cache_for_lookup) const { assert(out_parsed_block != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); BlockCacheInterface block_cache{ rep_->table_options.block_cache.get()}; - CompressedBlockCacheInterface block_cache_compressed{ - rep_->table_options.block_cache_compressed.get()}; - // First, try to get the block from the cache // // If either block cache is enabled, we'll try to read from it. @@ -1555,28 +1555,31 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( CacheKey key_data; Slice key; bool is_cache_hit = false; - if (block_cache || block_cache_compressed) { + if (block_cache) { // create key for block cache key_data = GetCacheKey(rep_->base_cache_key, handle); key = key_data.AsSlice(); if (!contents) { - s = GetDataBlockFromCache(key, block_cache, block_cache_compressed, ro, - out_parsed_block, uncompression_dict, wait, - get_context); - // Value could still be null at this point, so check the cache handle - // and update the read pattern for prefetching - if (out_parsed_block->GetValue() || out_parsed_block->GetCacheHandle()) { - // TODO(haoyu): Differentiate cache hit on uncompressed block cache and - // compressed block cache. - is_cache_hit = true; - if (prefetch_buffer) { - // Update the block details so that PrefetchBuffer can use the read - // pattern to determine if reads are sequential or not for - // prefetching. It should also take in account blocks read from cache. - prefetch_buffer->UpdateReadPattern( - handle.offset(), BlockSizeWithTrailer(handle), - ro.adaptive_readahead /*decrease_readahead_size*/); + if (use_block_cache_for_lookup) { + s = GetDataBlockFromCache(key, block_cache, out_parsed_block, + get_context, &uncompression_dict); + // Value could still be null at this point, so check the cache handle + // and update the read pattern for prefetching + if (out_parsed_block->GetValue() || + out_parsed_block->GetCacheHandle()) { + // TODO(haoyu): Differentiate cache hit on uncompressed block cache + // and compressed block cache. + is_cache_hit = true; + if (prefetch_buffer) { + // Update the block details so that PrefetchBuffer can use the read + // pattern to determine if reads are sequential or not for + // prefetching. It should also take in account blocks read from + // cache. 
+ prefetch_buffer->UpdateReadPattern( + handle.offset(), BlockSizeWithTrailer(handle), + ro.adaptive_readahead /*decrease_readahead_size*/); + } } } } @@ -1591,21 +1594,33 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( TBlocklike::kBlockType != BlockType::kFilter && TBlocklike::kBlockType != BlockType::kCompressionDictionary && rep_->blocks_maybe_compressed; - const bool do_uncompress = maybe_compressed && !block_cache_compressed; + // This flag, if true, tells BlockFetcher to return the uncompressed + // block when ReadBlockContents() is called. + const bool do_uncompress = maybe_compressed; CompressionType contents_comp_type; // Maybe serialized or uncompressed BlockContents tmp_contents; + BlockContents uncomp_contents; + BlockContents comp_contents; if (!contents) { Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; StopWatch sw(rep_->ioptions.clock, statistics, histogram); + // Setting do_uncompress to false may cause an extra mempcy in the + // following cases - + // 1. Compression is enabled, but block is not actually compressed + // 2. Compressed block is in the prefetch buffer + // 3. Direct IO + // + // It would also cause a memory allocation to be used rather than + // stack if the compressed block size is < 5KB BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &tmp_contents, rep_->ioptions, do_uncompress, maybe_compressed, TBlocklike::kBlockType, uncompression_dict, rep_->persistent_cache_options, GetMemoryAllocator(rep_->table_options), - GetMemoryAllocatorForCompressedBlock(rep_->table_options)); + /*allocator=*/nullptr); // If prefetch_buffer is not allocated, it will fallback to synchronous // reading of block contents. @@ -1619,7 +1634,6 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( } contents_comp_type = block_fetcher.get_compression_type(); - contents = &tmp_contents; if (get_context) { switch (TBlocklike::kBlockType) { case BlockType::kIndex: @@ -1633,106 +1647,168 @@ BlockBasedTable::MaybeReadBlockAndLoadToCache( break; } } + if (s.ok()) { + if (do_uncompress && contents_comp_type != kNoCompression) { + comp_contents = BlockContents(block_fetcher.GetCompressedBlock()); + uncomp_contents = std::move(tmp_contents); + } else if (contents_comp_type != kNoCompression) { + // do_uncompress must be false, so output of BlockFetcher is + // compressed + comp_contents = std::move(tmp_contents); + } else { + uncomp_contents = std::move(tmp_contents); + } + + // If filling cache is allowed and a cache is configured, try to put + // the block to the cache. Do this here while block_fetcher is in + // scope, since comp_contents will be a reference to the compressed + // block in block_fetcher + s = PutDataBlockToCache( + key, block_cache, out_parsed_block, std::move(uncomp_contents), + std::move(comp_contents), contents_comp_type, uncompression_dict, + GetMemoryAllocator(rep_->table_options), get_context); + } } else { contents_comp_type = GetBlockCompressionType(*contents); - } + if (contents_comp_type != kNoCompression) { + comp_contents = std::move(*contents); + } else { + uncomp_contents = std::move(*contents); + } - if (s.ok()) { - // If filling cache is allowed and a cache is configured, try to put the - // block to the cache. 
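// --- Illustrative sketch (standalone, simplified; not part of this patch) ---
// The routing done above after the block is fetched: decide which buffer
// holds the uncompressed payload and which (if any) holds the still-compressed
// bytes, so both can be handed to the cache insertion path. Types and field
// names are stand-ins.
#include <string>
#include <utility>

struct FetchedBlock {
  std::string fetched;           // what the fetcher returned
  std::string compressed_copy;   // kept only when the fetcher uncompressed it
  bool was_compressed_on_disk;
  bool fetcher_uncompressed_it;  // i.e. do_uncompress was true
};

struct RoutedContents {
  std::string uncompressed;
  std::string compressed;  // empty when the block was not compressed
};

RoutedContents RouteContents(FetchedBlock&& b) {
  RoutedContents out;
  if (b.was_compressed_on_disk && b.fetcher_uncompressed_it) {
    out.uncompressed = std::move(b.fetched);
    out.compressed = std::move(b.compressed_copy);
  } else if (b.was_compressed_on_disk) {
    out.compressed = std::move(b.fetched);  // uncompressed later, on demand
  } else {
    out.uncompressed = std::move(b.fetched);
  }
  return out;
}
// --- End of sketch ---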
- s = PutDataBlockToCache( - key, block_cache, block_cache_compressed, out_parsed_block, - std::move(*contents), contents_comp_type, uncompression_dict, - GetMemoryAllocator(rep_->table_options), get_context); + if (s.ok()) { + // If filling cache is allowed and a cache is configured, try to put + // the block to the cache. + s = PutDataBlockToCache( + key, block_cache, out_parsed_block, std::move(uncomp_contents), + std::move(comp_contents), contents_comp_type, uncompression_dict, + GetMemoryAllocator(rep_->table_options), get_context); + } } } } - // Fill lookup_context. + // TODO: optimize so that lookup_context != nullptr implies the others if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && lookup_context) { - size_t usage = 0; - uint64_t nkeys = 0; - if (out_parsed_block->GetValue()) { - // Approximate the number of keys in the block using restarts. - // FIXME: Should this only apply to data blocks? - nkeys = rep_->table_options.block_restart_interval * - GetBlockNumRestarts(*out_parsed_block->GetValue()); - usage = out_parsed_block->GetValue()->ApproximateMemoryUsage(); - } - TraceType trace_block_type = TraceType::kTraceMax; - switch (TBlocklike::kBlockType) { - case BlockType::kData: - trace_block_type = TraceType::kBlockTraceDataBlock; - break; - case BlockType::kFilter: - case BlockType::kFilterPartitionIndex: - trace_block_type = TraceType::kBlockTraceFilterBlock; - break; - case BlockType::kCompressionDictionary: - trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; - break; - case BlockType::kRangeDeletion: - trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; - break; - case BlockType::kIndex: - trace_block_type = TraceType::kBlockTraceIndexBlock; - break; - default: - // This cannot happen. - assert(false); - break; - } - bool no_insert = no_io || !ro.fill_cache; - if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( - trace_block_type, lookup_context->caller)) { - // Defer logging the access to Get() and MultiGet() to trace additional - // information, e.g., referenced_key_exist_in_block. - - // Make a copy of the block key here since it will be logged later. - lookup_context->FillLookupContext( - is_cache_hit, no_insert, trace_block_type, - /*block_size=*/usage, /*block_key=*/key.ToString(), nkeys); - } else { - // Avoid making copy of block_key and cf_name when constructing the access - // record. - BlockCacheTraceRecord access_record( - rep_->ioptions.clock->NowMicros(), - /*block_key=*/"", trace_block_type, - /*block_size=*/usage, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_context->caller, is_cache_hit, - no_insert, lookup_context->get_id, - lookup_context->get_from_user_specified_snapshot, - /*referenced_key=*/""); - // TODO: Should handle this error? 
- block_cache_tracer_ - ->WriteBlockAccess(access_record, key, rep_->cf_name_for_tracing(), - lookup_context->referenced_key) - .PermitUncheckedError(); - } + SaveLookupContextOrTraceRecord( + key, is_cache_hit, ro, out_parsed_block->GetValue(), lookup_context); } assert(s.ok() || out_parsed_block->GetValue() == nullptr); return s; } +template +WithBlocklikeCheck +BlockBasedTable::SaveLookupContextOrTraceRecord( + const Slice& block_key, bool is_cache_hit, const ReadOptions& ro, + const TBlocklike* parsed_block_value, + BlockCacheLookupContext* lookup_context) const { + assert(lookup_context); + size_t usage = 0; + uint64_t nkeys = 0; + if (parsed_block_value) { + // Approximate the number of keys in the block using restarts. + int interval = rep_->table_options.block_restart_interval; + nkeys = interval * GetBlockNumRestarts(*parsed_block_value); + // On average, the last restart should be just over half utilized. + // Specifically, 1..N should be N/2 + 0.5. For example, 7 -> 4, 8 -> 4.5. + // Use the get_id to alternate between rounding up vs. down. + if (nkeys > 0) { + bool rounding = static_cast(lookup_context->get_id) & 1; + nkeys -= (interval - rounding) / 2; + } + usage = parsed_block_value->ApproximateMemoryUsage(); + } + TraceType trace_block_type = TraceType::kTraceMax; + switch (TBlocklike::kBlockType) { + case BlockType::kData: + trace_block_type = TraceType::kBlockTraceDataBlock; + break; + case BlockType::kFilter: + case BlockType::kFilterPartitionIndex: + trace_block_type = TraceType::kBlockTraceFilterBlock; + break; + case BlockType::kCompressionDictionary: + trace_block_type = TraceType::kBlockTraceUncompressionDictBlock; + break; + case BlockType::kRangeDeletion: + trace_block_type = TraceType::kBlockTraceRangeDeletionBlock; + break; + case BlockType::kIndex: + trace_block_type = TraceType::kBlockTraceIndexBlock; + break; + default: + // This cannot happen. + assert(false); + break; + } + const bool no_io = ro.read_tier == kBlockCacheTier; + bool no_insert = no_io || !ro.fill_cache; + if (BlockCacheTraceHelper::IsGetOrMultiGetOnDataBlock( + trace_block_type, lookup_context->caller)) { + // Make a copy of the block key here since it will be logged later. 
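// --- Illustrative sketch (standalone; not part of this patch) ---
// The key-count estimate used above: restarts * block_restart_interval counts
// the last restart group as full, so on average it over-counts by about half
// an interval. Subtracting (interval - rounding) / 2, with `rounding`
// alternating on get_id parity, centers the estimate. E.g. interval 16,
// 3 restarts: 48 raw, corrected to 40 or 41 depending on get_id.
#include <cstdint>
#include <cstdio>

uint64_t EstimateKeysInBlock(int interval, uint32_t num_restarts,
                             uint64_t get_id) {
  uint64_t nkeys = static_cast<uint64_t>(interval) * num_restarts;
  if (nkeys > 0) {
    bool rounding = static_cast<bool>(get_id & 1);
    nkeys -= (interval - rounding) / 2;
  }
  return nkeys;
}

int main() {
  std::printf("%llu\n",
              (unsigned long long)EstimateKeysInBlock(16, 3, 0));  // 40
  std::printf("%llu\n",
              (unsigned long long)EstimateKeysInBlock(16, 3, 1));  // 41
  return 0;
}
// --- End of sketch ---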
+ lookup_context->FillLookupContext(is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, + block_key.ToString(), nkeys); + + // Defer logging the access to Get() and MultiGet() to trace additional + // information, e.g., referenced_key + } else { + // Avoid making copy of block_key if it doesn't need to be saved in + // BlockCacheLookupContext + lookup_context->FillLookupContext(is_cache_hit, no_insert, trace_block_type, + /*block_size=*/usage, + /*block_key=*/{}, nkeys); + + // Fill in default values for irrelevant/unknown fields + FinishTraceRecord(*lookup_context, block_key, + lookup_context->referenced_key, + /*does_referenced_key_exist*/ false, + /*referenced_data_size*/ 0); + } +} + +void BlockBasedTable::FinishTraceRecord( + const BlockCacheLookupContext& lookup_context, const Slice& block_key, + const Slice& referenced_key, bool does_referenced_key_exist, + uint64_t referenced_data_size) const { + // Avoid making copy of referenced_key if it doesn't need to be saved in + // BlockCacheLookupContext + BlockCacheTraceRecord access_record( + rep_->ioptions.clock->NowMicros(), + /*block_key=*/"", lookup_context.block_type, lookup_context.block_size, + rep_->cf_id_for_tracing(), + /*cf_name=*/"", rep_->level_for_tracing(), rep_->sst_number_for_tracing(), + lookup_context.caller, lookup_context.is_cache_hit, + lookup_context.no_insert, lookup_context.get_id, + lookup_context.get_from_user_specified_snapshot, + /*referenced_key=*/"", referenced_data_size, + lookup_context.num_keys_in_block, does_referenced_key_exist); + // TODO: Should handle status here? + block_cache_tracer_ + ->WriteBlockAccess(access_record, block_key, rep_->cf_name_for_tracing(), + referenced_key) + .PermitUncheckedError(); +} + template WithBlocklikeCheck BlockBasedTable::RetrieveBlock( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* out_parsed_block, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction, - bool use_cache, bool wait_for_cache, bool async_read) const { + bool use_cache, bool async_read, bool use_block_cache_for_lookup) const { assert(out_parsed_block); assert(out_parsed_block->IsEmpty()); Status s; if (use_cache) { s = MaybeReadBlockAndLoadToCache( - prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, - for_compaction, out_parsed_block, get_context, lookup_context, - /*contents=*/nullptr, async_read); + prefetch_buffer, ro, handle, uncompression_dict, for_compaction, + out_parsed_block, get_context, lookup_context, + /*contents=*/nullptr, async_read, use_block_cache_for_lookup); if (!s.ok()) { return s; @@ -1819,7 +1895,8 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( rep->internal_comparator.user_comparator(), rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, rep->index_has_first_key, rep->index_key_includes_seq, - rep->index_value_is_full); + rep->index_value_is_full, /*block_contents_pinned=*/false, + rep->user_defined_timestamps_persisted); } // This will be broken if the user specifies an unusual implementation @@ -1839,8 +1916,8 @@ BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator( bool BlockBasedTable::PrefixRangeMayMatch( const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, - const bool need_upper_bound_check, - BlockCacheLookupContext* lookup_context) const { + const bool need_upper_bound_check, BlockCacheLookupContext* 
lookup_context, + bool* filter_checked) const { if (!rep_->filter_policy) { return true; } @@ -1865,7 +1942,7 @@ bool BlockBasedTable::PrefixRangeMayMatch( bool may_match = true; FilterBlockReader* const filter = rep_->filter.get(); - bool filter_checked = false; + *filter_checked = false; if (filter != nullptr) { const bool no_io = read_options.read_tier == kBlockCacheTier; @@ -1873,16 +1950,8 @@ bool BlockBasedTable::PrefixRangeMayMatch( may_match = filter->RangeMayExist( read_options.iterate_upper_bound, user_key_without_ts, prefix_extractor, rep_->internal_comparator.user_comparator(), const_ikey_ptr, - &filter_checked, need_upper_bound_check, no_io, lookup_context, - read_options.rate_limiter_priority); - } - - if (filter_checked) { - Statistics* statistics = rep_->ioptions.stats; - RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); - if (!may_match) { - RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); - } + filter_checked, need_upper_bound_check, no_io, lookup_context, + read_options); } return may_match; @@ -1900,6 +1969,13 @@ bool BlockBasedTable::PrefixExtractorChanged( } } +Statistics* BlockBasedTable::GetStatistics() const { + return rep_->ioptions.stats; +} +bool BlockBasedTable::IsLastLevel() const { + return rep_->level == rep_->ioptions.num_levels - 1; +} + InternalIterator* BlockBasedTable::NewIterator( const ReadOptions& read_options, const SliceTransform* prefix_extractor, Arena* arena, bool skip_filters, TableReaderCaller caller, @@ -1944,11 +2020,21 @@ FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( snapshot, read_options.timestamp); } +FragmentedRangeTombstoneIterator* BlockBasedTable::NewRangeTombstoneIterator( + SequenceNumber read_seqno, const Slice* timestamp) { + if (rep_->fragmented_range_dels == nullptr) { + return nullptr; + } + return new FragmentedRangeTombstoneIterator(rep_->fragmented_range_dels, + rep_->internal_comparator, + read_seqno, timestamp); +} + bool BlockBasedTable::FullFilterKeyMayMatch( FilterBlockReader* filter, const Slice& internal_key, const bool no_io, const SliceTransform* prefix_extractor, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const { + const ReadOptions& read_options) const { if (filter == nullptr) { return true; } @@ -1958,22 +2044,31 @@ bool BlockBasedTable::FullFilterKeyMayMatch( size_t ts_sz = rep_->internal_comparator.user_comparator()->timestamp_size(); Slice user_key_without_ts = StripTimestampFromUserKey(user_key, ts_sz); if (rep_->whole_key_filtering) { - may_match = - filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr, - get_context, lookup_context, rate_limiter_priority); + may_match = filter->KeyMayMatch(user_key_without_ts, no_io, const_ikey_ptr, + get_context, lookup_context, read_options); + if (may_match) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL); + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } } else if (!PrefixExtractorChanged(prefix_extractor) && - prefix_extractor->InDomain(user_key_without_ts) && - !filter->PrefixMayMatch( - prefix_extractor->Transform(user_key_without_ts), no_io, - const_ikey_ptr, get_context, lookup_context, - rate_limiter_priority)) { + prefix_extractor->InDomain(user_key_without_ts)) { // FIXME ^^^: there should be no reason for Get() to depend on current // prefix_extractor at all. 
It should always use table_prefix_extractor. - may_match = false; - } - if (may_match) { - RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE); - PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + may_match = filter->PrefixMayMatch( + prefix_extractor->Transform(user_key_without_ts), no_io, const_ikey_ptr, + get_context, lookup_context, read_options); + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED); + if (may_match) { + // Includes prefix stats + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, rep_->level); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL); + // Includes prefix stats + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level); + } } return may_match; } @@ -1982,14 +2077,14 @@ void BlockBasedTable::FullFilterKeysMayMatch( FilterBlockReader* filter, MultiGetRange* range, const bool no_io, const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const { + const ReadOptions& read_options) const { if (filter == nullptr) { return; } uint64_t before_keys = range->KeysLeft(); assert(before_keys > 0); // Caller should ensure if (rep_->whole_key_filtering) { - filter->KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority); + filter->KeysMayMatch(range, no_io, lookup_context, read_options); uint64_t after_keys = range->KeysLeft(); if (after_keys) { RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_POSITIVE, after_keys); @@ -2006,13 +2101,21 @@ void BlockBasedTable::FullFilterKeysMayMatch( // FIXME ^^^: there should be no reason for MultiGet() to depend on current // prefix_extractor at all. It should always use table_prefix_extractor. filter->PrefixesMayMatch(range, prefix_extractor, false, lookup_context, - rate_limiter_priority); + read_options); RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_CHECKED, before_keys); uint64_t after_keys = range->KeysLeft(); + if (after_keys) { + // Includes prefix stats + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, after_keys, + rep_->level); + } uint64_t filtered_keys = before_keys - after_keys; if (filtered_keys) { RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_USEFUL, filtered_keys); + // Includes prefix stats + PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, filtered_keys, + rep_->level); } } } @@ -2026,6 +2129,16 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options, // likely not to be a problem. We are compacting the whole file, so all // keys will be read out anyway. An extra read to index block might be // a small share of the overhead. We can try to optimize if needed. + // + // `CacheDependencies()` brings all the blocks into cache using one I/O. That + // way the full index scan usually finds the index data it is looking for in + // cache rather than doing an I/O for each "dependency" (partition). 
+  Status s = rep_->index_reader->CacheDependencies(
+      read_options, false /* pin */, nullptr /* prefetch_buffer */);
+  if (!s.ok()) {
+    return s;
+  }
+
   IndexBlockIter iiter_on_stack;
   auto iiter = NewIndexIterator(
       read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
@@ -2068,10 +2181,28 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options,
   return Status::OK();
 }
 
+bool BlockBasedTable::TimestampMayMatch(const ReadOptions& read_options) const {
+  if (read_options.timestamp != nullptr && !rep_->min_timestamp.empty()) {
+    RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_CHECKED);
+    auto read_ts = read_options.timestamp;
+    auto comparator = rep_->internal_comparator.user_comparator();
+    if (comparator->CompareTimestamp(*read_ts, rep_->min_timestamp) < 0) {
+      RecordTick(rep_->ioptions.stats, TIMESTAMP_FILTER_TABLE_FILTERED);
+      return false;
+    }
+  }
+  return true;
+}
+
 Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
                             GetContext* get_context,
                             const SliceTransform* prefix_extractor,
                             bool skip_filters) {
+  // Similar to Bloom filter !may_match
+  // If timestamp is beyond the range of the table, skip
+  if (!TimestampMayMatch(read_options)) {
+    return Status::OK();
+  }
   assert(key.size() >= 8);  // key must be internal key
   assert(get_context != nullptr);
   Status s;
@@ -2093,14 +2224,11 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
           read_options.snapshot != nullptr;
     }
     TEST_SYNC_POINT("BlockBasedTable::Get:BeforeFilterMatch");
-    const bool may_match = FullFilterKeyMayMatch(
-        filter, key, no_io, prefix_extractor, get_context, &lookup_context,
-        read_options.rate_limiter_priority);
+    const bool may_match =
+        FullFilterKeyMayMatch(filter, key, no_io, prefix_extractor, get_context,
+                              &lookup_context, read_options);
     TEST_SYNC_POINT("BlockBasedTable::Get:AfterFilterMatch");
-    if (!may_match) {
-      RecordTick(rep_->ioptions.stats, BLOOM_FILTER_USEFUL);
-      PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, rep_->level);
-    } else {
+    if (may_match) {
       IndexBlockIter iiter_on_stack;
       // if prefix_extractor found in block differs from options, disable
       // BlockPrefixIndex. Only do this check when index_type is kHashSearch.
@@ -2144,7 +2272,8 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
         NewDataBlockIterator(
             read_options, v.handle, &biter, BlockType::kData, get_context,
             &lookup_data_block_context, /*prefetch_buffer=*/nullptr,
-            /*for_compaction=*/false, /*async_read=*/false, tmp_status);
+            /*for_compaction=*/false, /*async_read=*/false, tmp_status,
+            /*use_block_cache_for_lookup=*/true);
 
         if (no_io && biter.status().IsIncomplete()) {
           // couldn't get block from block_cache
@@ -2199,6 +2328,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
           }
         }
         s = biter.status();
+        if (!s.ok()) {
+          break;
+        }
       }
       // Write the block cache access record.
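For context, the new TimestampMayMatch() check above only fires when the read carries a timestamp and the table recorded a minimum key timestamp. A minimal caller-side sketch, assuming a column family opened with a timestamp-aware comparator such as BytewiseComparatorWithU64Ts(); the key and the timestamp encoding below are illustrative:

#include <cstdint>
#include <cstring>
#include <string>
#include "rocksdb/db.h"

// Sketch only: a point read whose ReadOptions carry a read timestamp, so a
// table whose minimum key timestamp is newer than read_ts can be skipped by
// the per-table timestamp filter before any filter/index block is touched.
rocksdb::Status GetAtTimestamp(rocksdb::DB* db, const rocksdb::Slice& key,
                               uint64_t ts, std::string* value) {
  std::string ts_buf(sizeof(uint64_t), '\0');
  std::memcpy(&ts_buf[0], &ts, sizeof(uint64_t));  // illustrative encoding
  rocksdb::Slice read_ts(ts_buf);
  rocksdb::ReadOptions ropts;
  ropts.timestamp = &read_ts;  // leaving this nullptr bypasses the check
  return db->Get(ropts, key, value);
}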
if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) { @@ -2210,25 +2342,9 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } else { referenced_key = key; } - BlockCacheTraceRecord access_record( - rep_->ioptions.clock->NowMicros(), - /*block_key=*/"", lookup_data_block_context.block_type, - lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), - /*cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_data_block_context.caller, - lookup_data_block_context.is_cache_hit, - lookup_data_block_context.no_insert, - lookup_data_block_context.get_id, - lookup_data_block_context.get_from_user_specified_snapshot, - /*referenced_key=*/"", referenced_data_size, - lookup_data_block_context.num_keys_in_block, - does_referenced_key_exist); - // TODO: Should handle status here? - block_cache_tracer_ - ->WriteBlockAccess(access_record, - lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), referenced_key) - .PermitUncheckedError(); + FinishTraceRecord(lookup_data_block_context, + lookup_data_block_context.block_key, referenced_key, + does_referenced_key_exist, referenced_data_size); } if (done) { @@ -2237,10 +2353,16 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } if (matched && filter != nullptr) { - RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + if (rep_->whole_key_filtering) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_TRUE_POSITIVE); + } + // Includes prefix stats PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } + if (s.ok() && !iiter->status().IsNotFound()) { s = iiter->status(); } @@ -2274,12 +2396,13 @@ Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options, TableReaderCaller::kUserMultiGet, tracing_mget_id, /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr}; FullFilterKeysMayMatch(filter, mget_range, no_io, prefix_extractor, - &lookup_context, read_options.rate_limiter_priority); + &lookup_context, read_options); return Status::OK(); } -Status BlockBasedTable::Prefetch(const Slice* const begin, +Status BlockBasedTable::Prefetch(const ReadOptions& read_options, + const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; UserComparatorWrapper user_comparator(comparator.user_comparator()); @@ -2289,7 +2412,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, } BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); std::unique_ptr> iiter_unique_ptr; @@ -2326,10 +2449,10 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, DataBlockIter biter; Status tmp_status; NewDataBlockIterator( - ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + read_options, block_handle, &biter, /*type=*/BlockType::kData, /*get_context=*/nullptr, &lookup_context, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, - /*async_read=*/false, tmp_status); + /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true); if (!biter.status().ok()) { // there was an unexpected error while pre-fetching @@ -2346,11 +2469,10 @@ Status BlockBasedTable::VerifyChecksum(const 
ReadOptions& read_options, // Check Meta blocks std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - ReadOptions ro; - s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex, - &metaindex_iter); + s = ReadMetaIndexBlock(read_options, nullptr /* prefetch buffer */, + &metaindex, &metaindex_iter); if (s.ok()) { - s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + s = VerifyChecksumInMetaBlocks(read_options, metaindex_iter.get()); if (!s.ok()) { return s; } @@ -2447,6 +2569,10 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( return BlockType::kHashIndexMetadata; } + if (meta_block_name == kIndexBlockName) { + return BlockType::kIndex; + } + if (meta_block_name.starts_with(kObsoleteFilterBlockPrefix)) { // Obsolete but possible in old files return BlockType::kInvalid; @@ -2457,7 +2583,7 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( } Status BlockBasedTable::VerifyChecksumInMetaBlocks( - InternalIteratorBase* index_iter) { + const ReadOptions& read_options, InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); @@ -2467,20 +2593,29 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( BlockHandle handle; Slice input = index_iter->value(); s = handle.DecodeFrom(&input); + if (!s.ok()) { + break; + } BlockContents contents; const Slice meta_block_name = index_iter->key(); if (meta_block_name == kPropertiesBlockName) { // Unfortunate special handling for properties block checksum w/ // global seqno std::unique_ptr table_properties; - s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(), + s = ReadTablePropertiesHelper(read_options, handle, rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, rep_->ioptions, &table_properties, nullptr /* memory_allocator */); + } else if (rep_->verify_checksum_set_on_open && + meta_block_name == kIndexBlockName) { + // WART: For now, to maintain similar I/O behavior as before + // format_version=6, we skip verifying index block checksum--but only + // if it was checked on open. } else { + // FIXME? Need to verify checksums of index and filter partitions? 
s = BlockFetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, + read_options, handle, &contents, rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, GetBlockTypeForMetaBlockByName(meta_block_name), UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) @@ -2519,6 +2654,7 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_context=*/nullptr)); iiter->Seek(key); + assert(iiter->status().ok()); assert(iiter->Valid()); return TEST_BlockInCache(iiter->value().handle); @@ -2535,6 +2671,15 @@ Status BlockBasedTable::CreateIndexReader( InternalIterator* meta_iter, bool use_cache, bool prefetch, bool pin, BlockCacheLookupContext* lookup_context, std::unique_ptr* index_reader) { + if (FormatVersionUsesIndexHandleInFooter(rep_->footer.format_version())) { + rep_->index_handle = rep_->footer.index_handle(); + } else { + Status s = FindMetaBlock(meta_iter, kIndexBlockName, &rep_->index_handle); + if (!s.ok()) { + return s; + } + } + switch (rep_->index_type) { case BlockBasedTableOptions::kTwoLevelIndexSearch: { return PartitionIndexReader::Create(this, ro, prefetch_buffer, use_cache, @@ -2592,7 +2737,8 @@ uint64_t BlockBasedTable::GetApproximateDataSize() { return rep_->footer.metaindex_handle().offset(); } -uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, +uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) { uint64_t data_size = GetApproximateDataSize(); if (UNLIKELY(data_size == 0)) { @@ -2606,6 +2752,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, IndexBlockIter iiter_on_stack; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; auto index_iter = NewIndexIterator(ro, /*disable_prefix_seek=*/true, /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, @@ -2634,7 +2781,8 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, static_cast(rep_->file_size)); } -uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, +uint64_t BlockBasedTable::ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, TableReaderCaller caller) { assert(rep_->internal_comparator.Compare(start, end) <= 0); @@ -2651,6 +2799,7 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, IndexBlockIter iiter_on_stack; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; auto index_iter = NewIndexIterator(ro, /*disable_prefix_seek=*/true, /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, @@ -2698,13 +2847,13 @@ bool BlockBasedTable::TEST_FilterBlockInCache() const { bool BlockBasedTable::TEST_IndexBlockInCache() const { assert(rep_ != nullptr); - return TEST_BlockInCache(rep_->footer.index_handle()); + return TEST_BlockInCache(rep_->index_handle); } Status BlockBasedTable::GetKVPairsFromDataBlocks( - std::vector* kv_pair_blocks) { + const ReadOptions& read_options, std::vector* kv_pair_blocks) { std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -2725,11 +2874,11 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr 
datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value().handle, + read_options, blockhandles_iter->value().handle, /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, - /*async_read=*/false, tmp_status)); + /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true)); s = datablock_iter->status(); if (!s.ok()) { @@ -2771,7 +2920,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - ReadOptions ro; + // TODO: plumb Env::IOActivity + const ReadOptions ro; Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); if (s.ok()) { @@ -2827,7 +2977,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { if (rep_->uncompression_dict_reader) { CachableEntry uncompression_dict; s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* prefetch_buffer */, ro, false /* no_io */, false, /* verify_checksums */ nullptr /* get_context */, nullptr /* lookup_context */, &uncompression_dict); @@ -2845,7 +2995,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } // Output range deletions block - auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + auto* range_del_iter = NewRangeTombstoneIterator(ro); if (range_del_iter != nullptr) { range_del_iter->SeekToFirst(); if (range_del_iter->Valid()) { @@ -2875,8 +3025,10 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { out_stream << "Index Details:\n" "--------------------------------------\n"; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); @@ -2924,8 +3076,10 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { } Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); @@ -2959,11 +3113,11 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { std::unique_ptr datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value().handle, + read_options, blockhandles_iter->value().handle, /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, - /*async_read=*/false, tmp_status)); + /*async_read=*/false, tmp_status, /*use_block_cache_for_lookup=*/true)); s = datablock_iter->status(); if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 55ef76c45fb7..22361b505d4b 100644 --- a/table/block_based/block_based_table_reader.h +++ 
b/table/block_based/block_based_table_reader.h @@ -98,7 +98,8 @@ class BlockBasedTable : public TableReader { const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, std::unique_ptr&& file, uint64_t file_size, - std::unique_ptr* table_reader, + uint8_t block_protection_bytes_per_key, + std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr = nullptr, const std::shared_ptr& prefix_extractor = nullptr, @@ -110,13 +111,15 @@ class BlockBasedTable : public TableReader { BlockCacheTracer* const block_cache_tracer = nullptr, size_t max_file_size_for_l0_meta_pin = 0, const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0, - UniqueId64x2 expected_unique_id = {}); + UniqueId64x2 expected_unique_id = {}, + const bool user_defined_timestamps_persisted = true); bool PrefixRangeMayMatch(const Slice& internal_key, const ReadOptions& read_options, const SliceTransform* options_prefix_extractor, const bool need_upper_bound_check, - BlockCacheLookupContext* lookup_context) const; + BlockCacheLookupContext* lookup_context, + bool* filter_checked) const; // Returns a new iterator over the table contents. // The result of NewIterator() is initially invalid (caller must @@ -135,6 +138,9 @@ class BlockBasedTable : public TableReader { FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& read_options) override; + FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + SequenceNumber read_seqno, const Slice* timestamp) override; + // @param skip_filters Disables loading/accessing the filter block Status Get(const ReadOptions& readOptions, const Slice& key, GetContext* get_context, const SliceTransform* prefix_extractor, @@ -153,7 +159,8 @@ class BlockBasedTable : public TableReader { // Pre-fetch the disk blocks that correspond to the key range specified by // (kbegin, kend). The call will return error status in the event of // IO or iteration error. - Status Prefetch(const Slice* begin, const Slice* end) override; + Status Prefetch(const ReadOptions& read_options, const Slice* begin, + const Slice* end) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -161,15 +168,16 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key, + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) override; // Given start and end keys, return the approximate data size in the file // between the keys. The returned value is in terms of file bytes, and so // includes effects like compression of the underlying data. // The start key must not be greater than the end key. - uint64_t ApproximateSize(const Slice& start, const Slice& end, - TableReaderCaller caller) override; + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, TableReaderCaller caller) override; Status ApproximateKeyAnchors(const ReadOptions& read_options, std::vector& anchors) override; @@ -222,8 +230,9 @@ class BlockBasedTable : public TableReader { virtual size_t ApproximateMemoryUsage() const = 0; // Cache the dependencies of the index reader (e.g. the partitions // of a partitioned index). 
- virtual Status CacheDependencies(const ReadOptions& /*ro*/, - bool /* pin */) { + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /* pin */, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { return Status::OK(); } }; @@ -244,6 +253,9 @@ class BlockBasedTable : public TableReader { bool redundant, Statistics* const statistics); + Statistics* GetStatistics() const; + bool IsLastLevel() const; + // Get the size to read from storage for a BlockHandle. size_t because we // are about to load into memory. static inline size_t BlockSizeWithTrailer(const BlockHandle& handle) { @@ -265,7 +277,13 @@ class BlockBasedTable : public TableReader { // Retrieve all key value pairs from data blocks in the table. // The key retrieved are internal keys. - Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); + Status GetKVPairsFromDataBlocks(const ReadOptions& read_options, + std::vector* kv_pair_blocks); + + template + Status LookupAndPinBlocksInCache( + const ReadOptions& ro, const BlockHandle& handle, + CachableEntry* out_parsed_block) const; struct Rep; @@ -274,14 +292,12 @@ class BlockBasedTable : public TableReader { // input_iter: if it is not null, update this one and return it as Iterator template - TBlockIter* NewDataBlockIterator(const ReadOptions& ro, - const BlockHandle& block_handle, - TBlockIter* input_iter, BlockType block_type, - GetContext* get_context, - BlockCacheLookupContext* lookup_context, - FilePrefetchBuffer* prefetch_buffer, - bool for_compaction, bool async_read, - Status& s) const; + TBlockIter* NewDataBlockIterator( + const ReadOptions& ro, const BlockHandle& block_handle, + TBlockIter* input_iter, BlockType block_type, GetContext* get_context, + BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read, + Status& s, bool use_block_cache_for_lookup) const; // input_iter: if it is not null, update this one and return it as Iterator template @@ -336,10 +352,10 @@ class BlockBasedTable : public TableReader { WithBlocklikeCheck MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - const bool wait, const bool for_compaction, - CachableEntry* block_entry, GetContext* get_context, - BlockCacheLookupContext* lookup_context, BlockContents* contents, - bool async_read) const; + bool for_compaction, CachableEntry* block_entry, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents, bool async_read, + bool use_block_cache_for_lookup) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the @@ -350,16 +366,25 @@ class BlockBasedTable : public TableReader { const BlockHandle& handle, const UncompressionDict& uncompression_dict, CachableEntry* block_entry, GetContext* get_context, BlockCacheLookupContext* lookup_context, bool for_compaction, - bool use_cache, bool wait_for_cache, bool async_read) const; + bool use_cache, bool async_read, bool use_block_cache_for_lookup) const; + + template + WithBlocklikeCheck SaveLookupContextOrTraceRecord( + const Slice& block_key, bool is_cache_hit, const ReadOptions& ro, + const TBlocklike* parsed_block_value, + BlockCacheLookupContext* lookup_context) const; + + void FinishTraceRecord(const BlockCacheLookupContext& lookup_context, + const Slice& block_key, const Slice& referenced_key, + bool does_referenced_key_exist, + 
uint64_t referenced_data_size) const; DECLARE_SYNC_AND_ASYNC_CONST( void, RetrieveMultipleBlocks, const ReadOptions& options, const MultiGetRange* batch, const autovector* handles, - autovector* statuses, - autovector, MultiGetContext::MAX_BATCH_SIZE>* - results, - char* scratch, const UncompressionDict& uncompression_dict); + Status* statuses, CachableEntry* results, char* scratch, + const UncompressionDict& uncompression_dict, bool use_fs_scratch); // Get the iterator from the index reader. // @@ -379,8 +404,10 @@ class BlockBasedTable : public TableReader { IndexBlockIter* input_iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) const; - // Read block cache from block caches (if set): block_cache and - // block_cache_compressed. + template + Cache::Priority GetCachePriority() const; + + // Read block cache from block caches (if set): block_cache. // On success, Status::OK with be returned and @block will be populated with // pointer to the block as well as its block handle. // @param uncompression_dict Data for presetting the compression library's @@ -388,10 +415,8 @@ class BlockBasedTable : public TableReader { template WithBlocklikeCheck GetDataBlockFromCache( const Slice& cache_key, BlockCacheInterface block_cache, - CompressedBlockCacheInterface block_cache_compressed, - const ReadOptions& read_options, CachableEntry* block, - const UncompressionDict& uncompression_dict, const bool wait, - GetContext* get_context) const; + CachableEntry* block, GetContext* get_context, + const UncompressionDict* dict) const; // Put a maybe compressed block to the corresponding block caches. // This method will perform decompression against block_contents if needed @@ -406,8 +431,9 @@ class BlockBasedTable : public TableReader { template WithBlocklikeCheck PutDataBlockToCache( const Slice& cache_key, BlockCacheInterface block_cache, - CompressedBlockCacheInterface block_cache_compressed, - CachableEntry* cached_block, BlockContents&& block_contents, + CachableEntry* cached_block, + BlockContents&& uncompressed_block_contents, + BlockContents&& compressed_block_contents, CompressionType block_comp_type, const UncompressionDict& uncompression_dict, MemoryAllocator* memory_allocator, GetContext* get_context) const; @@ -434,13 +460,13 @@ class BlockBasedTable : public TableReader { const SliceTransform* prefix_extractor, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const; + const ReadOptions& read_options) const; void FullFilterKeysMayMatch(FilterBlockReader* filter, MultiGetRange* range, const bool no_io, const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const; + const ReadOptions& read_options) const; // If force_direct_prefetch is true, always prefetching to RocksDB // buffer, rather than calling RandomAccessFile::Prefetch(). 
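Several of the declarations above now take a full const ReadOptions& where they previously took a bare Env::IOPriority; the rate-limiter priority still reaches the readers, just as a field of ReadOptions. A minimal sketch of a caller supplying it (public API; the chosen priority is illustrative):

#include <string>
#include "rocksdb/db.h"

// Sketch: instead of threading a separate rate_limiter_priority argument,
// callers set the priority on the ReadOptions that get passed down.
void LowPriorityRead(rocksdb::DB* db, const rocksdb::Slice& key,
                     std::string* value) {
  rocksdb::ReadOptions ropts;
  ropts.rate_limiter_priority = rocksdb::Env::IO_LOW;  // default is IO_TOTAL
  db->Get(ropts, key, value).PermitUncheckedError();
}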
@@ -448,7 +474,8 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer); + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger); Status ReadMetaIndexBlock(const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, @@ -471,7 +498,8 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); - Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInMetaBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(const ReadOptions& read_options, InternalIteratorBase* index_iter); @@ -499,6 +527,8 @@ class BlockBasedTable : public TableReader { // in building the table file, otherwise true. bool PrefixExtractorChanged(const SliceTransform* prefix_extractor) const; + bool TimestampMayMatch(const ReadOptions& read_options) const; + // A cumulative data block file read in MultiGet lower than this size will // use a stack buffer static constexpr size_t kMultiGetReadStackBufSize = 8192; @@ -530,7 +560,8 @@ struct BlockBasedTable::Rep { Rep(const ImmutableOptions& _ioptions, const EnvOptions& _env_options, const BlockBasedTableOptions& _table_opt, const InternalKeyComparator& _internal_comparator, bool skip_filters, - uint64_t _file_size, int _level, const bool _immortal_table) + uint64_t _file_size, int _level, const bool _immortal_table, + const bool _user_defined_timestamps_persisted = true) : ioptions(_ioptions), env_options(_env_options), table_options(_table_opt), @@ -543,7 +574,8 @@ struct BlockBasedTable::Rep { global_seqno(kDisableGlobalSequenceNumber), file_size(_file_size), level(_level), - immortal_table(_immortal_table) {} + immortal_table(_immortal_table), + user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {} ~Rep() { status.PermitUncheckedError(); } const ImmutableOptions& ioptions; const EnvOptions& env_options; @@ -572,6 +604,7 @@ struct BlockBasedTable::Rep { BlockHandle compression_dict_handle; std::shared_ptr table_properties; + BlockHandle index_handle; BlockBasedTableOptions::IndexType index_type; bool whole_key_filtering; bool prefix_filtering; @@ -600,6 +633,12 @@ struct BlockBasedTable::Rep { // move is involved int level; + // the timestamp range of table + // Points into memory owned by TableProperties. This would need to change if + // TableProperties become subject to cache eviction. + Slice min_timestamp; + Slice max_timestamp; + // If false, blocks in this file are definitely all uncompressed. Knowing this // before reading individual blocks enables certain optimizations. bool blocks_maybe_compressed = true; @@ -609,7 +648,22 @@ struct BlockBasedTable::Rep { bool index_key_includes_seq = true; bool index_value_is_full = true; + // Whether block checksums in metadata blocks were verified on open. + // This is only to mostly maintain current dubious behavior of VerifyChecksum + // with respect to index blocks, but only when the checksum was previously + // verified. + bool verify_checksum_set_on_open = false; + const bool immortal_table; + // Whether the user key contains user-defined timestamps. 
If this is false and
+  // the running user comparator has a non-zero timestamp size, a min timestamp
+  // of this size will be padded to each user key while parsing blocks whenever
+  // it applies. This includes the keys in data block, index block for data
+  // block, top-level index for index partitions (if index type is
+  // `kTwoLevelIndexSearch`), top-level index for filter partitions (if using
+  // partitioned filters), the `first_internal_key` in `IndexValue`, the
+  // `end_key` for range deletion entries.
+  const bool user_defined_timestamps_persisted;
 
   std::unique_ptr table_reader_cache_res_handle = nullptr;
@@ -641,25 +695,31 @@ struct BlockBasedTable::Rep {
   void CreateFilePrefetchBuffer(
       size_t readahead_size, size_t max_readahead_size,
       std::unique_ptr* fpb, bool implicit_auto_readahead,
-      uint64_t num_file_reads,
-      uint64_t num_file_reads_for_auto_readahead) const {
+      uint64_t num_file_reads, uint64_t num_file_reads_for_auto_readahead,
+      uint64_t upper_bound_offset,
+      const std::function& readaheadsize_cb,
+      FilePrefetchBufferUsage usage) const {
     fpb->reset(new FilePrefetchBuffer(
         readahead_size, max_readahead_size,
         !ioptions.allow_mmap_reads /* enable */, false /* track_min_offset */,
        implicit_auto_readahead, num_file_reads,
-        num_file_reads_for_auto_readahead, ioptions.fs.get(), ioptions.clock,
-        ioptions.stats));
+        num_file_reads_for_auto_readahead, upper_bound_offset,
+        ioptions.fs.get(), ioptions.clock, ioptions.stats, readaheadsize_cb,
+        usage));
   }
 
   void CreateFilePrefetchBufferIfNotExists(
       size_t readahead_size, size_t max_readahead_size,
       std::unique_ptr* fpb, bool implicit_auto_readahead,
-      uint64_t num_file_reads,
-      uint64_t num_file_reads_for_auto_readahead) const {
+      uint64_t num_file_reads, uint64_t num_file_reads_for_auto_readahead,
+      uint64_t upper_bound_offset,
+      const std::function& readaheadsize_cb,
+      FilePrefetchBufferUsage usage = FilePrefetchBufferUsage::kUnknown) const {
     if (!(*fpb)) {
       CreateFilePrefetchBuffer(readahead_size, max_readahead_size, fpb,
                                implicit_auto_readahead, num_file_reads,
-                               num_file_reads_for_auto_readahead);
+                               num_file_reads_for_auto_readahead,
+                               upper_bound_offset, readaheadsize_cb, usage);
     }
   }
diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h
index 105a479f36c0..fedccd5eec59 100644
--- a/table/block_based/block_based_table_reader_impl.h
+++ b/table/block_based/block_based_table_reader_impl.h
@@ -49,7 +49,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
     BlockType block_type, GetContext* get_context,
     BlockCacheLookupContext* lookup_context,
     FilePrefetchBuffer* prefetch_buffer, bool for_compaction, bool async_read,
-    Status& s) const {
+    Status& s, bool use_block_cache_for_lookup) const {
   using IterBlocklike = typename IterTraits::IterBlocklike;
   PERF_TIMER_GUARD(new_table_block_iter_nanos);
 
@@ -67,9 +67,13 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
       // might already be under way and this would invalidate it. Also, the
       // uncompression dict is typically at the end of the file and would
       // most likely break the sequentiality of the access pattern.
+      // Same is with auto_readahead_size. It iterates over index to lookup for
+      // data blocks. And this could break the sequentiality of the access
+      // pattern.
       s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary(
-          ro.async_io ? nullptr : prefetch_buffer, no_io, ro.verify_checksums,
-          get_context, lookup_context, &uncompression_dict);
+          ((ro.async_io || ro.auto_readahead_size) ?
nullptr : prefetch_buffer), + ro, no_io, ro.verify_checksums, get_context, lookup_context, + &uncompression_dict); if (!s.ok()) { iter->Invalidate(s); return iter; @@ -80,12 +84,12 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( s = RetrieveBlock( prefetch_buffer, ro, handle, dict, &block.As(), get_context, lookup_context, for_compaction, - /* use_cache */ true, /* wait_for_cache */ true, async_read); + /* use_cache */ true, async_read, use_block_cache_for_lookup); } else { s = RetrieveBlock( prefetch_buffer, ro, handle, UncompressionDict::GetEmptyDict(), &block.As(), get_context, lookup_context, for_compaction, - /* use_cache */ true, /* wait_for_cache */ true, async_read); + /* use_cache */ true, async_read, use_block_cache_for_lookup); } if (s.IsTryAgain() && async_read) { diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index ea75f631d976..e7621909cc73 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -32,9 +32,8 @@ namespace ROCKSDB_NAMESPACE { DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) (const ReadOptions& options, const MultiGetRange* batch, const autovector* handles, - autovector* statuses, - autovector, MultiGetContext::MAX_BATCH_SIZE>* results, - char* scratch, const UncompressionDict& uncompression_dict) const { + Status* statuses, CachableEntry* results, char* scratch, + const UncompressionDict& uncompression_dict, bool use_fs_scratch) const { RandomAccessFileReader* file = rep_->file.get(); const Footer& footer = rep_->footer; const ImmutableOptions& ioptions = rep_->ioptions; @@ -45,20 +44,20 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) size_t idx_in_batch = 0; for (auto mget_iter = batch->begin(); mget_iter != batch->end(); ++mget_iter, ++idx_in_batch) { - BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet); const BlockHandle& handle = (*handles)[idx_in_batch]; if (handle.IsNull()) { continue; } - (*statuses)[idx_in_batch] = - RetrieveBlock(nullptr, options, handle, uncompression_dict, - &(*results)[idx_in_batch].As(), - mget_iter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true, - /* wait_for_cache */ true, /* async_read */ false); + // XXX: use_cache=true means double cache query? + statuses[idx_in_batch] = RetrieveBlock( + nullptr, options, handle, uncompression_dict, + &results[idx_in_batch].As(), mget_iter->get_context, + /* lookup_context */ nullptr, + /* for_compaction */ false, /* use_cache */ true, + /* async_read */ false, /* use_block_cache_for_lookup */ true); } + assert(idx_in_batch == handles->size()); CO_RETURN; } @@ -89,7 +88,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // We don't combine block reads here in direct IO mode, because when doing // direct IO read, the block requests will be realigned and merged when // necessary. 
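The comment above refers to the request-combining step that follows: reads for blocks that sit next to each other in the file are merged into a single request unless direct I/O will re-align and merge them anyway. A stripped-down sketch of that merging idea, not the RocksDB implementation, just the shape of the logic:

#include <cstddef>
#include <cstdint>
#include <vector>

struct ReadReq {
  uint64_t offset;
  size_t len;
};

// Merge block reads whose byte ranges are contiguous into one larger request.
std::vector<ReadReq> CoalesceReads(const std::vector<ReadReq>& blocks) {
  std::vector<ReadReq> reqs;
  for (const ReadReq& b : blocks) {
    if (!reqs.empty() && reqs.back().offset + reqs.back().len == b.offset) {
      reqs.back().len += b.len;  // contiguous: extend the previous request
    } else {
      reqs.push_back(b);  // gap: start a new request
    }
  }
  return reqs;
}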
- if (use_shared_buffer && !file->use_direct_io() && + if ((use_shared_buffer || use_fs_scratch) && !file->use_direct_io() && prev_end == handle.offset()) { req_offset_for_block.emplace_back(prev_len); prev_len += BlockSizeWithTrailer(handle); @@ -100,7 +99,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) FSReadRequest req; req.offset = prev_offset; req.len = prev_len; - if (file->use_direct_io()) { + if (file->use_direct_io() || use_fs_scratch) { req.scratch = nullptr; } else if (use_shared_buffer) { req.scratch = scratch + buf_offset; @@ -108,10 +107,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) } else { req.scratch = new char[req.len]; } - read_reqs.emplace_back(req); + read_reqs.emplace_back(std::move(req)); } - // Step 2, remeber the previous block info + // Step 2, remember the previous block info prev_offset = handle.offset(); prev_len = BlockSizeWithTrailer(handle); req_offset_for_block.emplace_back(0); @@ -126,14 +125,14 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) FSReadRequest req; req.offset = prev_offset; req.len = prev_len; - if (file->use_direct_io()) { + if (file->use_direct_io() || use_fs_scratch) { req.scratch = nullptr; } else if (use_shared_buffer) { req.scratch = scratch + buf_offset; } else { req.scratch = new char[req.len]; } - read_reqs.emplace_back(req); + read_reqs.emplace_back(std::move(req)); } AlignedBuf direct_io_buf; @@ -145,7 +144,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) if (file->use_direct_io()) { #endif // WITH_COROUTINES s = file->MultiRead(opts, &read_reqs[0], read_reqs.size(), - &direct_io_buf, options.rate_limiter_priority); + &direct_io_buf); #if defined(WITH_COROUTINES) } else { co_await batch->context()->reader().MultiReadAsync( @@ -193,7 +192,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) BlockContents serialized_block; if (s.ok()) { - if (!use_shared_buffer) { + if (use_fs_scratch) { + serialized_block = + BlockContents(Slice(req.result.data() + req_offset, handle.size())); + } else if (!use_shared_buffer) { // We allocated a buffer for this block. Give ownership of it to // BlockContents so it can free the memory assert(req.result.data() == req.scratch); @@ -220,9 +222,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // begin address of each read request, we need to add the offset // in each read request. Checksum is stored in the block trailer, // beyond the payload size. - s = VerifyBlockChecksum(footer.checksum_type(), data + req_offset, - handle.size(), rep_->file->file_name(), - handle.offset()); + s = VerifyBlockChecksum(footer, data + req_offset, handle.size(), + rep_->file->file_name(), handle.offset()); TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); } } else if (!use_shared_buffer) { @@ -244,9 +245,8 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) // heap buffer or there is no cache at all. 
CompressionType compression_type = GetBlockCompressionType(serialized_block); - if (use_shared_buffer && (compression_type == kNoCompression || - (compression_type != kNoCompression && - rep_->table_options.block_cache_compressed))) { + if ((use_fs_scratch || use_shared_buffer) && + compression_type == kNoCompression) { Slice serialized = Slice(req.result.data() + req_offset, BlockSizeWithTrailer(handle)); serialized_block = BlockContents( @@ -261,17 +261,15 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) if (s.ok()) { if (options.fill_cache) { - BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet); - CachableEntry* block_entry = &(*results)[idx_in_batch]; + CachableEntry* block_entry = &results[idx_in_batch]; // MaybeReadBlockAndLoadToCache will insert into the block caches if // necessary. Since we're passing the serialized block contents, it // will avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( - nullptr, options, handle, uncompression_dict, /*wait=*/true, - /*for_compaction=*/false, &block_entry->As(), - mget_iter->get_context, &lookup_data_block_context, - &serialized_block, /*async_read=*/false); + nullptr, options, handle, uncompression_dict, + /*for_compaction=*/false, block_entry, mget_iter->get_context, + /*lookup_context=*/nullptr, &serialized_block, + /*async_read=*/false, /*use_block_cache_for_lookup=*/true); // block_entry value could be null if no block cache is present, i.e // BlockBasedTableOptions::no_block_cache is true and no compressed @@ -303,11 +301,22 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::RetrieveMultipleBlocks) contents = std::move(serialized_block); } if (s.ok()) { - (*results)[idx_in_batch].SetOwnedValue(std::make_unique( + results[idx_in_batch].SetOwnedValue(std::make_unique( std::move(contents), read_amp_bytes_per_bit, ioptions.stats)); } } - (*statuses)[idx_in_batch] = s; + statuses[idx_in_batch] = s; + } + + if (use_fs_scratch) { + // Free the allocated scratch buffer by fs here as read requests might have + // been combined into one. + for (FSReadRequest& req : read_reqs) { + if (req.fs_scratch != nullptr) { + req.fs_scratch.reset(); + req.fs_scratch = nullptr; + } + } } } @@ -333,11 +342,13 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) if (sst_file_range.begin()->get_context) { tracing_mget_id = sst_file_range.begin()->get_context->get_tracing_get_id(); } - BlockCacheLookupContext lookup_context{ + // TODO: need more than one lookup_context here to track individual filter + // and index partition hits and misses. 
+  BlockCacheLookupContext metadata_lookup_context{
       TableReaderCaller::kUserMultiGet, tracing_mget_id,
       /*_get_from_user_specified_snapshot=*/read_options.snapshot != nullptr};
   FullFilterKeysMayMatch(filter, &sst_file_range, no_io, prefix_extractor,
-                         &lookup_context, read_options.rate_limiter_priority);
+                         &metadata_lookup_context, read_options);
 
   if (!sst_file_range.empty()) {
     IndexBlockIter iiter_on_stack;
@@ -347,9 +358,9 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
     if (rep_->index_type == BlockBasedTableOptions::kHashSearch) {
       need_upper_bound_check = PrefixExtractorChanged(prefix_extractor);
     }
-    auto iiter =
-        NewIndexIterator(read_options, need_upper_bound_check, &iiter_on_stack,
-                         sst_file_range.begin()->get_context, &lookup_context);
+    auto iiter = NewIndexIterator(
+        read_options, need_upper_bound_check, &iiter_on_stack,
+        sst_file_range.begin()->get_context, &metadata_lookup_context);
     std::unique_ptr> iiter_unique_ptr;
     if (iiter != &iiter_on_stack) {
       iiter_unique_ptr.reset(iiter);
@@ -357,161 +368,182 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet)
     uint64_t prev_offset = std::numeric_limits::max();
     autovector block_handles;
-    autovector, MultiGetContext::MAX_BATCH_SIZE> results;
-    autovector statuses;
+    std::array, MultiGetContext::MAX_BATCH_SIZE>
+        results;
+    std::array statuses;
+    // Empty data_lookup_contexts means "unused," when block cache tracing is
+    // disabled. (Limited options as element type is not default constructible.)
+    std::vector data_lookup_contexts;
     MultiGetContext::Mask reused_mask = 0;
     char stack_buf[kMultiGetReadStackBufSize];
     std::unique_ptr block_buf;
+    if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled()) {
+      // Awkward because BlockCacheLookupContext is not CopyAssignable
+      data_lookup_contexts.reserve(MultiGetContext::MAX_BATCH_SIZE);
+      for (size_t i = 0; i < MultiGetContext::MAX_BATCH_SIZE; ++i) {
+        data_lookup_contexts.push_back(metadata_lookup_context);
+      }
+    }
     {
       MultiGetRange data_block_range(sst_file_range, sst_file_range.begin(),
                                      sst_file_range.end());
-      std::vector cache_handles;
-      bool wait_for_cache_results = false;
-
       CachableEntry uncompression_dict;
       Status uncompression_dict_status;
       uncompression_dict_status.PermitUncheckedError();
       bool uncompression_dict_inited = false;
       size_t total_len = 0;
-      ReadOptions ro = read_options;
-      ro.read_tier = kBlockCacheTier;
-
-      for (auto miter = data_block_range.begin();
-           miter != data_block_range.end(); ++miter) {
-        const Slice& key = miter->ikey;
-        iiter->Seek(miter->ikey);
-        IndexValue v;
-        if (iiter->Valid()) {
-          v = iiter->value();
-        }
-        if (!iiter->Valid() ||
-            (!v.first_internal_key.empty() && !skip_filters &&
-             UserComparatorWrapper(rep_->internal_comparator.user_comparator())
-                 .CompareWithoutTimestamp(
-                     ExtractUserKey(key),
-                     ExtractUserKey(v.first_internal_key)) < 0)) {
-          // The requested key falls between highest key in previous block and
-          // lowest key in current block.
- if (!iiter->status().IsNotFound()) { - *(miter->s) = iiter->status(); + // GetContext for any key will do, as the stats will be aggregated + // anyway + GetContext* get_context = sst_file_range.begin()->get_context; + + { + using BCI = BlockCacheInterface; + BCI block_cache{rep_->table_options.block_cache.get()}; + std::array + async_handles; + BlockCreateContext create_ctx = rep_->create_context; + std::array cache_keys; + size_t cache_lookup_count = 0; + + for (auto miter = data_block_range.begin(); + miter != data_block_range.end(); ++miter) { + const Slice& key = miter->ikey; + iiter->Seek(miter->ikey); + + IndexValue v; + if (iiter->Valid()) { + v = iiter->value(); + } + if (!iiter->Valid() || + (!v.first_internal_key.empty() && !skip_filters && + UserComparatorWrapper( + rep_->internal_comparator.user_comparator()) + .CompareWithoutTimestamp( + ExtractUserKey(key), + ExtractUserKey(v.first_internal_key)) < 0)) { + // The requested key falls between highest key in previous block and + // lowest key in current block. + if (!iiter->status().IsNotFound()) { + *(miter->s) = iiter->status(); + } + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; } - data_block_range.SkipKey(miter); - sst_file_range.SkipKey(miter); - continue; - } - - if (!uncompression_dict_inited && rep_->uncompression_dict_reader) { - uncompression_dict_status = - rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, no_io, - read_options.verify_checksums, - sst_file_range.begin()->get_context, &lookup_context, - &uncompression_dict); - uncompression_dict_inited = true; - } - if (!uncompression_dict_status.ok()) { - assert(!uncompression_dict_status.IsNotFound()); - *(miter->s) = uncompression_dict_status; - data_block_range.SkipKey(miter); - sst_file_range.SkipKey(miter); - continue; - } + if (!uncompression_dict_inited && rep_->uncompression_dict_reader) { + uncompression_dict_status = + rep_->uncompression_dict_reader + ->GetOrReadUncompressionDictionary( + nullptr /* prefetch_buffer */, read_options, no_io, + read_options.verify_checksums, get_context, + &metadata_lookup_context, &uncompression_dict); + uncompression_dict_inited = true; + } - statuses.emplace_back(); - results.emplace_back(); - if (v.handle.offset() == prev_offset) { - // This key can reuse the previous block (later on). - // Mark previous as "reused" - reused_mask |= MultiGetContext::Mask{1} << (block_handles.size() - 1); - // Use null handle to indicate this one reuses same block as - // previous. - block_handles.emplace_back(BlockHandle::NullBlockHandle()); - continue; - } - // Lookup the cache for the given data block referenced by an index - // iterator value (i.e BlockHandle). If it exists in the cache, - // initialize block to the contents of the data block. - prev_offset = v.handle.offset(); - BlockHandle handle = v.handle; - BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet); - const UncompressionDict& dict = uncompression_dict.GetValue() - ? *uncompression_dict.GetValue() - : UncompressionDict::GetEmptyDict(); - Status s = RetrieveBlock( - nullptr, ro, handle, dict, &(results.back()).As(), - miter->get_context, &lookup_data_block_context, - /* for_compaction */ false, /* use_cache */ true, - /* wait_for_cache */ false, /* async_read */ false); - if (s.IsIncomplete()) { - s = Status::OK(); - } - if (s.ok() && !results.back().IsEmpty()) { - // Since we have a valid handle, check the value. 
If its nullptr, - // it means the cache is waiting for the final result and we're - // supposed to call WaitAll() to wait for the result. - if (results.back().GetValue() != nullptr) { - // Found it in the cache. Add NULL handle to indicate there is - // nothing to read from disk. - if (results.back().GetCacheHandle()) { - results.back().UpdateCachedValue(); - } + if (!uncompression_dict_status.ok()) { + assert(!uncompression_dict_status.IsNotFound()); + *(miter->s) = uncompression_dict_status; + data_block_range.SkipKey(miter); + sst_file_range.SkipKey(miter); + continue; + } + create_ctx.dict = uncompression_dict.GetValue() + ? uncompression_dict.GetValue() + : &UncompressionDict::GetEmptyDict(); + + if (v.handle.offset() == prev_offset) { + // This key can reuse the previous block (later on). + // Mark previous as "reused" + reused_mask |= MultiGetContext::Mask{1} + << (block_handles.size() - 1); + // Use null handle to indicate this one reuses same block as + // previous. block_handles.emplace_back(BlockHandle::NullBlockHandle()); - } else { - // We have to wait for the cache lookup to finish in the - // background, and then we may have to read the block from disk - // anyway - assert(results.back().GetCacheHandle()); - wait_for_cache_results = true; - block_handles.emplace_back(handle); - cache_handles.emplace_back(results.back().GetCacheHandle()); + continue; + } + prev_offset = v.handle.offset(); + block_handles.emplace_back(v.handle); + + if (block_cache) { + // Lookup the cache for the given data block referenced by an index + // iterator value (i.e BlockHandle). If it exists in the cache, + // initialize block to the contents of the data block. + + // An async version of MaybeReadBlockAndLoadToCache / + // GetDataBlockFromCache + BCI::TypedAsyncLookupHandle& async_handle = + async_handles[cache_lookup_count]; + cache_keys[cache_lookup_count] = + GetCacheKey(rep_->base_cache_key, v.handle); + async_handle.key = cache_keys[cache_lookup_count].AsSlice(); + // NB: StartAsyncLookupFull populates async_handle.helper + async_handle.create_context = &create_ctx; + async_handle.priority = GetCachePriority(); + async_handle.stats = rep_->ioptions.statistics.get(); + + block_cache.StartAsyncLookupFull( + async_handle, rep_->ioptions.lowest_used_cache_tier); + ++cache_lookup_count; + // TODO: stats? } - } else { - block_handles.emplace_back(handle); - total_len += BlockSizeWithTrailer(handle); } - } - if (wait_for_cache_results) { - Cache* block_cache = rep_->table_options.block_cache.get(); - block_cache->WaitAll(cache_handles); + if (block_cache) { + block_cache.get()->WaitAll(&async_handles[0], cache_lookup_count); + } + size_t lookup_idx = 0; for (size_t i = 0; i < block_handles.size(); ++i) { // If this block was a success or failure or not needed because // the corresponding key is in the same block as a prior key, skip - if (block_handles[i] == BlockHandle::NullBlockHandle() || - results[i].IsEmpty()) { + if (block_handles[i] == BlockHandle::NullBlockHandle()) { continue; } - results[i].UpdateCachedValue(); - void* val = results[i].GetValue(); - Cache::Handle* handle = results[i].GetCacheHandle(); - // GetContext for any key will do, as the stats will be aggregated - // anyway - GetContext* get_context = sst_file_range.begin()->get_context; - if (!val) { - // The async cache lookup failed - could be due to an error - // or a false positive. 
We need to read the data block from - // the SST file - results[i].Reset(); + if (!block_cache) { total_len += BlockSizeWithTrailer(block_handles[i]); - UpdateCacheMissMetrics(BlockType::kData, get_context); } else { - block_handles[i] = BlockHandle::NullBlockHandle(); - UpdateCacheHitMetrics(BlockType::kData, get_context, - block_cache->GetUsage(handle)); + BCI::TypedHandle* h = async_handles[lookup_idx].Result(); + if (h) { + // Cache hit + results[i].SetCachedValue(block_cache.Value(h), block_cache.get(), + h); + // Don't need to fetch + block_handles[i] = BlockHandle::NullBlockHandle(); + UpdateCacheHitMetrics(BlockType::kData, get_context, + block_cache.get()->GetUsage(h)); + } else { + // Cache miss + total_len += BlockSizeWithTrailer(block_handles[i]); + UpdateCacheMissMetrics(BlockType::kData, get_context); + } + if (!data_lookup_contexts.empty()) { + // Populate cache key before it's discarded + data_lookup_contexts[i].block_key = + async_handles[lookup_idx].key.ToString(); + } + ++lookup_idx; } } + assert(lookup_idx == cache_lookup_count); } if (total_len) { char* scratch = nullptr; + bool use_fs_scratch = false; const UncompressionDict& dict = uncompression_dict.GetValue() ? *uncompression_dict.GetValue() : UncompressionDict::GetEmptyDict(); assert(uncompression_dict_inited || !rep_->uncompression_dict_reader); assert(uncompression_dict_status.ok()); + + if (!rep_->file->use_direct_io()) { + if (CheckFSFeatureSupport(rep_->ioptions.fs.get(), + FSSupportedOps::kFSBuffer)) { + use_fs_scratch = true; + } + } + // If using direct IO, then scratch is not used, so keep it nullptr. // If the blocks need to be uncompressed and we don't need the // compressed blocks, then we can use a contiguous block of @@ -522,8 +554,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) // 2. If blocks are uncompressed, alloc heap bufs // 3. If blocks are compressed and no compressed block cache, use // stack buf - if (!rep_->file->use_direct_io() && - rep_->table_options.block_cache_compressed == nullptr && + if (!use_fs_scratch && !rep_->file->use_direct_io() && rep_->blocks_maybe_compressed) { if (total_len <= kMultiGetReadStackBufSize) { scratch = stack_buf; @@ -533,11 +564,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) } } CO_AWAIT(RetrieveMultipleBlocks) - (read_options, &data_block_range, &block_handles, &statuses, &results, - scratch, dict); - if (sst_file_range.begin()->get_context) { - ++(sst_file_range.begin() - ->get_context->get_context_stats_.num_sst_read); + (read_options, &data_block_range, &block_handles, &statuses[0], + &results[0], scratch, dict, use_fs_scratch); + if (get_context) { + ++(get_context->get_context_stats_.num_sst_read); } } } @@ -556,24 +586,26 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) bool first_block = true; do { DataBlockIter* biter = nullptr; + uint64_t referenced_data_size = 0; + Block_kData* parsed_block_value = nullptr; bool reusing_prev_block; bool later_reused; - uint64_t referenced_data_size = 0; bool does_referenced_key_exist = false; - BlockCacheLookupContext lookup_data_block_context( - TableReaderCaller::kUserMultiGet, tracing_mget_id, - /*_get_from_user_specified_snapshot=*/read_options.snapshot != - nullptr); + bool handle_present = false; + BlockCacheLookupContext* lookup_data_block_context = + data_lookup_contexts.empty() ? 
nullptr + : &data_lookup_contexts[idx_in_batch]; if (first_block) { - if (!block_handles[idx_in_batch].IsNull() || - !results[idx_in_batch].IsEmpty()) { + handle_present = !block_handles[idx_in_batch].IsNull(); + parsed_block_value = results[idx_in_batch].GetValue(); + if (handle_present || parsed_block_value) { first_biter.Invalidate(Status::OK()); NewDataBlockIterator( - read_options, results[idx_in_batch], &first_biter, + read_options, results[idx_in_batch].As(), &first_biter, statuses[idx_in_batch]); reusing_prev_block = false; } else { - // If handler is null and result is empty, then the status is never + // If handle is null and result is empty, then the status is never // set, which should be the initial value: ok(). assert(statuses[idx_in_batch].ok()); reusing_prev_block = true; @@ -598,9 +630,10 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) Status tmp_s; NewDataBlockIterator( read_options, iiter->value().handle, &next_biter, - BlockType::kData, get_context, &lookup_data_block_context, + BlockType::kData, get_context, lookup_data_block_context, /* prefetch_buffer= */ nullptr, /* for_compaction = */ false, - /*async_read = */ false, tmp_s); + /*async_read = */ false, tmp_s, + /* use_block_cache_for_lookup = */ true); biter = &next_biter; reusing_prev_block = false; later_reused = false; @@ -693,35 +726,23 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) // Write the block cache access. // XXX: There appear to be 'break' statements above that bypass this // writing of the block cache trace record - if (block_cache_tracer_ && block_cache_tracer_->is_tracing_enabled() && - !reusing_prev_block) { - // Avoid making copy of block_key, cf_name, and referenced_key when - // constructing the access record. + if (lookup_data_block_context && !reusing_prev_block && first_block) { Slice referenced_key; if (does_referenced_key_exist) { referenced_key = biter->key(); } else { referenced_key = key; } - BlockCacheTraceRecord access_record( - rep_->ioptions.clock->NowMicros(), - /*_block_key=*/"", lookup_data_block_context.block_type, - lookup_data_block_context.block_size, rep_->cf_id_for_tracing(), - /*_cf_name=*/"", rep_->level_for_tracing(), - rep_->sst_number_for_tracing(), lookup_data_block_context.caller, - lookup_data_block_context.is_cache_hit, - lookup_data_block_context.no_insert, - lookup_data_block_context.get_id, - lookup_data_block_context.get_from_user_specified_snapshot, - /*_referenced_key=*/"", referenced_data_size, - lookup_data_block_context.num_keys_in_block, - does_referenced_key_exist); - // TODO: Should handle status here? 
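Aside on the hunk above: the old per-key, synchronous cache probe is replaced by one asynchronous lookup per distinct data block (repeats are marked reused and get a null handle), resolved with a single WaitAll before any file reads are issued. The following is a minimal sketch of that batching pattern with illustrative stand-in types (MiniBlockCache and AsyncHandle are not the RocksDB API); it shows only the batch-and-wait structure, not checksums, statistics, or tracing.

#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

struct AsyncHandle {
  uint64_t block_offset = 0;            // cache key in this sketch
  const std::string* result = nullptr;  // filled in by WaitAll()
};

class MiniBlockCache {
 public:
  void Insert(uint64_t offset, std::string block) {
    blocks_[offset] = std::move(block);
  }
  // Begin a lookup; a real implementation could hand this off to a
  // secondary cache and resolve it later.
  void StartLookup(AsyncHandle* handle) { pending_.push_back(handle); }
  // Resolve every pending lookup in one batch.
  void WaitAll() {
    for (AsyncHandle* handle : pending_) {
      auto it = blocks_.find(handle->block_offset);
      handle->result = (it == blocks_.end()) ? nullptr : &it->second;
    }
    pending_.clear();
  }

 private:
  std::unordered_map<uint64_t, std::string> blocks_;
  std::vector<AsyncHandle*> pending_;
};

// Offsets are assumed to be already de-duplicated, as in the reader above;
// handles that resolve to null become file reads afterwards.
inline std::vector<uint64_t> MissedOffsets(MiniBlockCache& cache,
                                           const std::vector<uint64_t>& offsets) {
  std::vector<AsyncHandle> handles(offsets.size());
  for (std::size_t i = 0; i < offsets.size(); ++i) {
    handles[i].block_offset = offsets[i];
    cache.StartLookup(&handles[i]);
  }
  cache.WaitAll();
  std::vector<uint64_t> misses;
  for (const AsyncHandle& handle : handles) {
    if (handle.result == nullptr) {
      misses.push_back(handle.block_offset);
    }
  }
  return misses;
}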
- block_cache_tracer_ - ->WriteBlockAccess(access_record, - lookup_data_block_context.block_key, - rep_->cf_name_for_tracing(), referenced_key) - .PermitUncheckedError(); + + // block_key is self-assigned here (previously assigned from + // cache_keys / async_handles, now out of scope) + SaveLookupContextOrTraceRecord(lookup_data_block_context->block_key, + /*is_cache_hit=*/!handle_present, + read_options, parsed_block_value, + lookup_data_block_context); + FinishTraceRecord( + *lookup_data_block_context, lookup_data_block_context->block_key, + referenced_key, does_referenced_key_exist, referenced_data_size); } s = biter->status(); if (done) { @@ -739,7 +760,12 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) } while (iiter->Valid()); if (matched && filter != nullptr) { - RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + if (rep_->whole_key_filtering) { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_FULL_TRUE_POSITIVE); + } else { + RecordTick(rep_->ioptions.stats, BLOOM_FILTER_PREFIX_TRUE_POSITIVE); + } + // Includes prefix stats PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, rep_->level); } diff --git a/table/block_based/block_based_table_reader_test.cc b/table/block_based/block_based_table_reader_test.cc index 4a2ef7ed5fca..254546893f3f 100644 --- a/table/block_based/block_based_table_reader_test.cc +++ b/table/block_based/block_based_table_reader_test.cc @@ -30,22 +30,42 @@ namespace ROCKSDB_NAMESPACE { class BlockBasedTableReaderBaseTest : public testing::Test { + public: + static constexpr int kBytesPerEntry = 256; + // 16 = (default block size) 4 * 1024 / kBytesPerEntry + static constexpr int kEntriesPerBlock = 16; + protected: // Prepare key-value pairs to occupy multiple blocks. - // Each value is 256B, every 16 pairs constitute 1 block. + // Each (key, value) pair is `kBytesPerEntry` byte, every kEntriesPerBlock + // pairs constitute 1 block. // If mixed_with_human_readable_string_value == true, // then adjacent blocks contain values with different compression // complexity: human readable strings are easier to compress than random - // strings. - static std::map GenerateKVMap( - int num_block = 100, - bool mixed_with_human_readable_string_value = false) { - std::map kv; - + // strings. key is an internal key. + // When ts_sz > 0 and `same_key_diff_ts` is true, this + // function generate keys with the same user provided key, with different + // user defined timestamps and different sequence number to differentiate them + static std::vector> GenerateKVMap( + int num_block = 2, bool mixed_with_human_readable_string_value = false, + size_t ts_sz = 0, bool same_key_diff_ts = false) { + std::vector> kv; + + SequenceNumber seq_no = 0; + uint64_t current_udt = 0; + if (same_key_diff_ts) { + // These numbers are based on the number of keys to create + an arbitrary + // buffer number (100) to avoid overflow. + current_udt = kEntriesPerBlock * num_block + 100; + seq_no = kEntriesPerBlock * num_block + 100; + } Random rnd(101); uint32_t key = 0; + // To make each (key, value) pair occupy exactly kBytesPerEntry bytes. 
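A quick note on the sizing arithmetic that follows: each generated entry is an 8-character "%08u" user key, an optional fixed-width timestamp, an 8-byte sequence/type footer, and the value, so the value length is whatever remains of kBytesPerEntry. A minimal restatement with illustrative constant names (not the test's identifiers):

// 8 (user key) + ts_sz (timestamp, if any) + 8 (seqno/type footer) + value
// must add up to kBytesPerEntry.
constexpr int kBytesPerEntrySketch = 256;
constexpr int kInternalFooterBytes = 8;  // packed sequence number + value type

constexpr int ExampleValueSize(int ts_sz) {
  return kBytesPerEntrySketch - (8 + ts_sz + kInternalFooterBytes);
}
static_assert(ExampleValueSize(0) == 240, "no user-defined timestamp");
static_assert(ExampleValueSize(8) == 232, "uint64 user-defined timestamp");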
+ int value_size = kBytesPerEntry - (8 + static_cast(ts_sz) + + static_cast(kNumInternalBytes)); for (int block = 0; block < num_block; block++) { - for (int i = 0; i < 16; i++) { + for (int i = 0; i < kEntriesPerBlock; i++) { char k[9] = {0}; // Internal key is constructed directly from this key, // and internal key size is required to be >= 8 bytes, @@ -53,13 +73,27 @@ class BlockBasedTableReaderBaseTest : public testing::Test { snprintf(k, sizeof(k), "%08u", key); std::string v; if (mixed_with_human_readable_string_value) { - v = (block % 2) ? rnd.HumanReadableString(256) - : rnd.RandomString(256); + v = (block % 2) ? rnd.HumanReadableString(value_size) + : rnd.RandomString(value_size); + } else { + v = rnd.RandomString(value_size); + } + std::string user_key = std::string(k); + if (ts_sz > 0) { + if (same_key_diff_ts) { + PutFixed64(&user_key, current_udt); + current_udt -= 1; + } else { + PutFixed64(&user_key, 0); + } + } + InternalKey internal_key(user_key, seq_no, ValueType::kTypeValue); + kv.emplace_back(internal_key.Encode().ToString(), v); + if (same_key_diff_ts) { + seq_no -= 1; } else { - v = rnd.RandomString(256); + key++; } - kv[std::string(k)] = v; - key++; } } return kv; @@ -80,30 +114,37 @@ class BlockBasedTableReaderBaseTest : public testing::Test { // Creates a table with the specificied key value pairs (kv). void CreateTable(const std::string& table_name, + const ImmutableOptions& ioptions, const CompressionType& compression_type, - const std::map& kv) { + const std::vector>& kv, + uint32_t compression_parallel_threads = 1, + uint32_t compression_dict_bytes = 0) { std::unique_ptr writer; NewFileWriter(table_name, &writer); - // Create table builder. - ImmutableOptions ioptions(options_); - InternalKeyComparator comparator(options_.comparator); + InternalKeyComparator comparator(ioptions.user_comparator); ColumnFamilyOptions cf_options; + cf_options.prefix_extractor = options_.prefix_extractor; MutableCFOptions moptions(cf_options); + CompressionOptions compression_opts; + compression_opts.parallel_threads = compression_parallel_threads; + // Enable compression dictionary and set a buffering limit that is the same + // as each block's size. + compression_opts.max_dict_bytes = compression_dict_bytes; + compression_opts.max_dict_buffer_bytes = compression_dict_bytes; IntTblPropCollectorFactories factories; std::unique_ptr table_builder( options_.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, moptions, comparator, &factories, - compression_type, CompressionOptions(), + compression_type, compression_opts, 0 /* column_family_id */, kDefaultColumnFamilyName, -1 /* level */), writer.get())); // Build table. 
for (auto it = kv.begin(); it != kv.end(); it++) { - std::string k = ToInternalKey(it->first); std::string v = it->second; - table_builder->Add(k, v); + table_builder->Add(it->first, v); } ASSERT_OK(table_builder->Finish()); } @@ -114,10 +155,17 @@ class BlockBasedTableReaderBaseTest : public testing::Test { const std::string& table_name, std::unique_ptr* table, bool prefetch_index_and_filter_in_cache = true, - Status* status = nullptr) { + Status* status = nullptr, + bool user_defined_timestamps_persisted = true) { const MutableCFOptions moptions(options_); TableReaderOptions table_reader_options = TableReaderOptions( - ioptions, moptions.prefix_extractor, EnvOptions(), comparator); + ioptions, moptions.prefix_extractor, EnvOptions(), comparator, + 0 /* block_protection_bytes_per_key */, false /* _skip_filters */, + false /* _immortal */, false /* _force_direct_prefetch */, + -1 /* _level */, nullptr /* _block_cache_tracer */, + 0 /* _max_file_size_for_l0_meta_pin */, "" /* _cur_db_session_id */, + 0 /* _cur_file_num */, {} /* _unique_id */, 0 /* _largest_seqno */, + 0 /* _tail_size */, user_defined_timestamps_persisted); std::unique_ptr file; NewFileReader(table_name, foptions, &file); @@ -125,9 +173,11 @@ class BlockBasedTableReaderBaseTest : public testing::Test { uint64_t file_size = 0; ASSERT_OK(env_->GetFileSize(Path(table_name), &file_size)); + ReadOptions read_opts; + read_opts.verify_checksums = true; std::unique_ptr general_table; Status s = options_.table_factory->NewTableReader( - ReadOptions(), table_reader_options, std::move(file), file_size, + read_opts, table_reader_options, std::move(file), file_size, &general_table, prefetch_index_and_filter_in_cache); if (s.ok()) { @@ -172,21 +222,39 @@ class BlockBasedTableReaderBaseTest : public testing::Test { reader->reset(new RandomAccessFileReader(std::move(f), path, env_->GetSystemClock().get())); } - - std::string ToInternalKey(const std::string& key) { - InternalKey internal_key(key, 0, ValueType::kTypeValue); - return internal_key.Encode().ToString(); - } }; +// Param 1: compression type +// Param 2: whether to use direct reads +// Param 3: Block Based Table Index type +// Param 4: BBTO no_block_cache option +// Param 5: test mode for the user-defined timestamp feature +// Param 6: number of parallel compression threads +// Param 7: CompressionOptions.max_dict_bytes and +// CompressionOptions.max_dict_buffer_bytes to enable/disable +// compression dictionary. +// Param 8: test mode to specify the pattern for generating key / value. When +// true, generate keys with the same user provided key, different +// user-defined timestamps (if udt enabled), different sequence +// numbers. This test mode is used for testing `Get`. When false, +// generate keys with different user provided key, same user-defined +// timestamps (if udt enabled), same sequence number. This test mode is +// used for testing `Get`, `MultiGet`, and `NewIterator`. 
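Param 5 above folds two booleans into one test mode. A rough sketch of the three modes with an illustrative enum (the real helpers used in this file are test::GetUDTTestModes(), test::IsUDTEnabled() and test::ShouldPersistUDT(); the enumerator names below are assumptions for exposition only):

// Illustrative only: user-defined timestamp (UDT) test modes as
// (timestamps enabled, timestamps persisted in the SST file) pairs.
enum class UdtTestModeSketch { kNoUdt, kUdtPersisted, kUdtStripped };

inline bool UdtEnabled(UdtTestModeSketch m) {
  return m != UdtTestModeSketch::kNoUdt;
}
inline bool UdtPersisted(UdtTestModeSketch m) {
  // When timestamps are stripped on write, keys land on disk with the
  // minimum timestamp, so reads must also use the minimum timestamp.
  return m != UdtTestModeSketch::kUdtStripped;
}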
class BlockBasedTableReaderTest : public BlockBasedTableReaderBaseTest, public testing::WithParamInterface> { + CompressionType, bool, BlockBasedTableOptions::IndexType, bool, + test::UserDefinedTimestampTestMode, uint32_t, uint32_t, bool>> { protected: void SetUp() override { compression_type_ = std::get<0>(GetParam()); use_direct_reads_ = std::get<1>(GetParam()); + test::UserDefinedTimestampTestMode udt_test_mode = std::get<4>(GetParam()); + udt_enabled_ = test::IsUDTEnabled(udt_test_mode); + persist_udt_ = test::ShouldPersistUDT(udt_test_mode); + compression_parallel_threads_ = std::get<5>(GetParam()); + compression_dict_bytes_ = std::get<6>(GetParam()); + same_key_diff_ts_ = std::get<7>(GetParam()); BlockBasedTableReaderBaseTest::SetUp(); } @@ -194,53 +262,155 @@ class BlockBasedTableReaderTest BlockBasedTableOptions opts; opts.index_type = std::get<2>(GetParam()); opts.no_block_cache = std::get<3>(GetParam()); + opts.filter_policy.reset(NewBloomFilterPolicy(10, false)); + opts.partition_filters = + opts.index_type == + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + opts.metadata_cache_options.partition_pinning = PinningTier::kAll; options_.table_factory.reset( static_cast(NewBlockBasedTableFactory(opts))); + options_.prefix_extractor = + std::shared_ptr(NewFixedPrefixTransform(3)); } CompressionType compression_type_; bool use_direct_reads_; + bool udt_enabled_; + bool persist_udt_; + uint32_t compression_parallel_threads_; + uint32_t compression_dict_bytes_; + bool same_key_diff_ts_; }; +class BlockBasedTableReaderGetTest : public BlockBasedTableReaderTest {}; + +TEST_P(BlockBasedTableReaderGetTest, Get) { + Options options; + if (udt_enabled_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } + options.persist_user_defined_timestamps = persist_udt_; + size_t ts_sz = options.comparator->timestamp_size(); + std::vector> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 100 /* num_block */, + true /* mixed_with_human_readable_string_value */, ts_sz, + same_key_diff_ts_); + + std::string table_name = "BlockBasedTableReaderGetTest_Get" + + CompressionTypeToString(compression_type_); + + ImmutableOptions ioptions(options); + CreateTable(table_name, ioptions, compression_type_, kv, + compression_parallel_threads_, compression_dict_bytes_); + + std::unique_ptr table; + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + + ReadOptions read_opts; + ASSERT_OK( + table->VerifyChecksum(read_opts, TableReaderCaller::kUserVerifyChecksum)); + + for (size_t i = 0; i < kv.size(); i += 1) { + Slice key = kv[i].first; + Slice lkey = key; + std::string lookup_ikey; + if (udt_enabled_ && !persist_udt_) { + // When user-defined timestamps are collapsed to be the minimum timestamp, + // we also read with the minimum timestamp to be able to retrieve each + // value. + ReplaceInternalKeyWithMinTimestamp(&lookup_ikey, key, ts_sz); + lkey = lookup_ikey; + } + // Reading the first entry in a block caches the whole block. 
+ if (i % kEntriesPerBlock == 0) { + ASSERT_FALSE(table->TEST_KeyInCache(read_opts, lkey.ToString())); + } else { + ASSERT_TRUE(table->TEST_KeyInCache(read_opts, lkey.ToString())); + } + PinnableSlice value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, ExtractUserKey(key), &value, + nullptr, nullptr, nullptr, nullptr, + true /* do_merge */, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr); + ASSERT_OK(table->Get(read_opts, lkey, &get_context, nullptr)); + ASSERT_EQ(value.ToString(), kv[i].second); + ASSERT_TRUE(table->TEST_KeyInCache(read_opts, lkey.ToString())); + } +} + // Tests MultiGet in both direct IO and non-direct IO mode. // The keys should be in cache after MultiGet. TEST_P(BlockBasedTableReaderTest, MultiGet) { - std::map kv = + Options options; + ReadOptions read_opts; + std::string dummy_ts(sizeof(uint64_t), '\0'); + Slice read_timestamp = dummy_ts; + if (udt_enabled_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + read_opts.timestamp = &read_timestamp; + } + options.persist_user_defined_timestamps = persist_udt_; + size_t ts_sz = options.comparator->timestamp_size(); + std::vector> kv = BlockBasedTableReaderBaseTest::GenerateKVMap( 100 /* num_block */, - true /* mixed_with_human_readable_string_value */); + true /* mixed_with_human_readable_string_value */, ts_sz); // Prepare keys, values, and statuses for MultiGet. autovector keys; + autovector keys_without_timestamps; autovector values; autovector statuses; + autovector + expected_values; { const int step = static_cast(kv.size()) / MultiGetContext::MAX_BATCH_SIZE; auto it = kv.begin(); for (int i = 0; i < MultiGetContext::MAX_BATCH_SIZE; i++) { keys.emplace_back(it->first); + if (ts_sz > 0) { + Slice ukey_without_ts = + ExtractUserKeyAndStripTimestamp(it->first, ts_sz); + keys_without_timestamps.push_back(ukey_without_ts); + } else { + keys_without_timestamps.emplace_back(ExtractUserKey(it->first)); + } values.emplace_back(); statuses.emplace_back(); + expected_values.push_back(&(it->second)); std::advance(it, step); } } - std::string table_name = - "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); - CreateTable(table_name, compression_type_, kv); + std::string table_name = "BlockBasedTableReaderTest_MultiGet" + + CompressionTypeToString(compression_type_); - std::unique_ptr table; - Options options; ImmutableOptions ioptions(options); + CreateTable(table_name, ioptions, compression_type_, kv, + compression_parallel_threads_, compression_dict_bytes_); + + std::unique_ptr table; FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; InternalKeyComparator comparator(options.comparator); - NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + + ASSERT_OK( + table->VerifyChecksum(read_opts, TableReaderCaller::kUserVerifyChecksum)); // Ensure that keys are not in cache before MultiGet. for (auto& key : keys) { - ASSERT_FALSE(table->TEST_KeyInCache(ReadOptions(), key)); + ASSERT_FALSE(table->TEST_KeyInCache(read_opts, key.ToString())); } // Prepare MultiGetContext. 
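As used above, building the no-timestamp lookup keys is purely positional: the test keys are internal keys (user key, optional fixed-width timestamp, 8-byte footer), and ExtractUserKeyAndStripTimestamp drops the footer and then the timestamp suffix. A minimal illustrative version of the suffix step only (not the RocksDB helper):

#include <cassert>
#include <cstddef>
#include <string_view>

// Drop a fixed-width timestamp suffix from a user key.
inline std::string_view StripTimestampSketch(std::string_view user_key,
                                             std::size_t ts_sz) {
  assert(user_key.size() >= ts_sz);
  return user_key.substr(0, user_key.size() - ts_sz);
}
// e.g. the 8-byte user key "00000042" followed by an 8-byte timestamp maps
// back to "00000042" once the trailing 8 timestamp bytes are removed.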
@@ -248,26 +418,26 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) { autovector key_context; autovector sorted_keys; for (size_t i = 0; i < keys.size(); ++i) { - get_context.emplace_back(BytewiseComparator(), nullptr, nullptr, nullptr, - GetContext::kNotFound, keys[i], &values[i], - nullptr, nullptr, nullptr, nullptr, + get_context.emplace_back(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, ExtractUserKey(keys[i]), + &values[i], nullptr, nullptr, nullptr, nullptr, true /* do_merge */, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr); - key_context.emplace_back(nullptr, keys[i], &values[i], nullptr, - &statuses.back()); + key_context.emplace_back(nullptr, keys_without_timestamps[i], &values[i], + nullptr, nullptr, &statuses.back()); key_context.back().get_context = &get_context.back(); } for (auto& key_ctx : key_context) { sorted_keys.emplace_back(&key_ctx); } - MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, ReadOptions(), + MultiGetContext ctx(&sorted_keys, 0, sorted_keys.size(), 0, read_opts, fs_.get(), nullptr); // Execute MultiGet. MultiGetContext::Range range = ctx.GetMultiGetRange(); PerfContext* perf_ctx = get_perf_context(); perf_ctx->Reset(); - table->MultiGet(ReadOptions(), &range, nullptr); + table->MultiGet(read_opts, &range, nullptr); ASSERT_GE(perf_ctx->block_read_count - perf_ctx->index_block_read_count - perf_ctx->filter_block_read_count - @@ -280,9 +450,73 @@ TEST_P(BlockBasedTableReaderTest, MultiGet) { } // Check that keys are in cache after MultiGet. for (size_t i = 0; i < keys.size(); i++) { - ASSERT_TRUE(table->TEST_KeyInCache(ReadOptions(), keys[i])); - ASSERT_EQ(values[i].ToString(), kv[keys[i].ToString()]); + ASSERT_TRUE(table->TEST_KeyInCache(read_opts, keys[i])); + ASSERT_EQ(values[i].ToString(), *expected_values[i]); + } +} + +TEST_P(BlockBasedTableReaderTest, NewIterator) { + Options options; + ReadOptions read_opts; + std::string dummy_ts(sizeof(uint64_t), '\0'); + Slice read_timestamp = dummy_ts; + if (udt_enabled_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + read_opts.timestamp = &read_timestamp; + } + options.persist_user_defined_timestamps = persist_udt_; + size_t ts_sz = options.comparator->timestamp_size(); + std::vector> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 100 /* num_block */, + true /* mixed_with_human_readable_string_value */, ts_sz); + + std::string table_name = "BlockBasedTableReaderTest_NewIterator" + + CompressionTypeToString(compression_type_); + + ImmutableOptions ioptions(options); + CreateTable(table_name, ioptions, compression_type_, kv, + compression_parallel_threads_, compression_dict_bytes_); + + std::unique_ptr table; + FileOptions foptions; + foptions.use_direct_reads = use_direct_reads_; + InternalKeyComparator comparator(options.comparator); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + ASSERT_OK( + table->VerifyChecksum(read_opts, TableReaderCaller::kUserVerifyChecksum)); + + std::unique_ptr iter; + iter.reset(table->NewIterator( + read_opts, options_.prefix_extractor.get(), /*arena=*/nullptr, + /*skip_filters=*/false, TableReaderCaller::kUncategorized)); + + // Test forward scan. 
+ ASSERT_TRUE(!iter->Valid()); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + for (auto kv_iter = kv.begin(); kv_iter != kv.end(); kv_iter++) { + ASSERT_EQ(iter->key().ToString(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + iter->Next(); + ASSERT_OK(iter->status()); + } + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); + + // Test backward scan. + iter->SeekToLast(); + ASSERT_OK(iter->status()); + for (auto kv_iter = kv.rbegin(); kv_iter != kv.rend(); kv_iter++) { + ASSERT_EQ(iter->key().ToString(), kv_iter->first); + ASSERT_EQ(iter->value().ToString(), kv_iter->second); + iter->Prev(); + ASSERT_OK(iter->status()); } + ASSERT_TRUE(!iter->Valid()); + ASSERT_OK(iter->status()); } class ChargeTableReaderTest @@ -365,7 +599,7 @@ class ChargeTableReaderTest TargetCacheChargeTrackingCache> table_reader_charge_tracking_cache_; std::size_t approx_table_reader_mem_; - std::map kv_; + std::vector> kv_; CompressionType compression_type_; private: @@ -373,7 +607,8 @@ class ChargeTableReaderTest std::size_t approx_table_reader_mem = 0; std::string table_name = "table_for_approx_table_reader_mem"; - CreateTable(table_name, compression_type_, kv_); + ImmutableOptions ioptions(options_); + CreateTable(table_name, ioptions, compression_type_, kv_); std::unique_ptr table; Status s; @@ -423,13 +658,14 @@ TEST_P(ChargeTableReaderTest, Basic) { std::size_t opened_table_reader_num = 0; std::string table_name; std::vector> tables; + ImmutableOptions ioptions(options_); // Keep creating BlockBasedTableReader till hiting the memory limit based on // cache capacity and creation fails (when charge_table_reader_ == // kEnabled) or reaching a specfied big number of table readers (when // charge_table_reader_ == kDisabled) while (s.ok() && opened_table_reader_num < max_table_reader_num_uncapped) { table_name = "table_" + std::to_string(opened_table_reader_num); - CreateTable(table_name, compression_type_, kv_); + CreateTable(table_name, ioptions, compression_type_, kv_); tables.push_back(std::unique_ptr()); NewBlockBasedTableReader( FileOptions(), ImmutableOptions(options_), @@ -464,7 +700,7 @@ TEST_P(ChargeTableReaderTest, Basic) { --opened_table_reader_num; } table_name = "table_for_successful_table_reader_open"; - CreateTable(table_name, compression_type_, kv_); + CreateTable(table_name, ioptions, compression_type_, kv_); tables.push_back(std::unique_ptr()); NewBlockBasedTableReader( FileOptions(), ImmutableOptions(options_), @@ -490,27 +726,42 @@ class BlockBasedTableReaderTestVerifyChecksum }; TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { - std::map kv = - BlockBasedTableReaderBaseTest::GenerateKVMap(800 /* num_block */); + Options options; + ReadOptions read_opts; + std::string dummy_ts(sizeof(uint64_t), '\0'); + Slice read_timestamp = dummy_ts; + if (udt_enabled_) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + read_opts.timestamp = &read_timestamp; + } + options.persist_user_defined_timestamps = persist_udt_; + size_t ts_sz = options.comparator->timestamp_size(); + std::vector> kv = + BlockBasedTableReaderBaseTest::GenerateKVMap( + 800 /* num_block */, + false /* mixed_with_human_readable_string_value=*/, ts_sz); + options.statistics = CreateDBStatistics(); + ImmutableOptions ioptions(options); std::string table_name = "BlockBasedTableReaderTest" + CompressionTypeToString(compression_type_); - CreateTable(table_name, compression_type_, kv); + CreateTable(table_name, ioptions, compression_type_, kv, + 
compression_parallel_threads_, compression_dict_bytes_); std::unique_ptr table; - Options options; - ImmutableOptions ioptions(options); FileOptions foptions; foptions.use_direct_reads = use_direct_reads_; InternalKeyComparator comparator(options.comparator); - NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); // Use the top level iterator to find the offset/size of the first // 2nd level index block and corrupt the block IndexBlockIter iiter_on_stack; BlockCacheLookupContext context{TableReaderCaller::kUserVerifyChecksum}; InternalIteratorBase* iiter = table->NewIndexIterator( - ReadOptions(), /*disable_prefix_seek=*/false, &iiter_on_stack, + read_opts, /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { @@ -527,33 +778,53 @@ TEST_P(BlockBasedTableReaderTestVerifyChecksum, ChecksumMismatch) { ASSERT_OK(test::CorruptFile(options.env, Path(table_name), static_cast(handle.offset()), 128)); - NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table); - Status s = table->VerifyChecksum(ReadOptions(), - TableReaderCaller::kUserVerifyChecksum); + NewBlockBasedTableReader(foptions, ioptions, comparator, table_name, &table, + true /* bool prefetch_index_and_filter_in_cache */, + nullptr /* status */, persist_udt_); + ASSERT_EQ(0, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); + Status s = + table->VerifyChecksum(read_opts, TableReaderCaller::kUserVerifyChecksum); + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CHECKSUM_MISMATCH_COUNT)); ASSERT_EQ(s.code(), Status::kCorruption); } // Param 1: compression type // Param 2: whether to use direct reads -// Param 3: Block Based Table Index type +// Param 3: Block Based Table Index type, partitioned filters are also enabled +// when index type is kTwoLevelIndexSearch // Param 4: BBTO no_block_cache option -#ifdef ROCKSDB_LITE -// Skip direct I/O tests in lite mode since direct I/O is unsupported. +// Param 5: test mode for the user-defined timestamp feature +// Param 6: number of parallel compression threads +// Param 7: CompressionOptions.max_dict_bytes and +// CompressionOptions.max_dict_buffer_bytes. This enable/disables +// compression dictionary. +// Param 8: test mode to specify the pattern for generating key / value pairs. 
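As a rough size check on the two suites instantiated next (assuming test::GetUDTTestModes() yields three modes and N supported compression types): BlockBasedTableReaderTest combines N x 2 (direct reads) x 4 (index types) x 1 x 3 (UDT modes) x 2 (compression threads) x 2 (dictionary sizes) x 1 = 96 * N cases, and BlockBasedTableReaderGetTest doubles that to 192 * N by also varying the key-generation pattern (Param 8).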
INSTANTIATE_TEST_CASE_P( - MultiGet, BlockBasedTableReaderTest, + BlockBasedTableReaderTest, BlockBasedTableReaderTest, ::testing::Combine( - ::testing::ValuesIn(GetSupportedCompressions()), - ::testing::Values(false), - ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), + ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey), + ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()), + ::testing::Values(1, 2), ::testing::Values(0, 4096), ::testing::Values(false))); -#else // ROCKSDB_LITE INSTANTIATE_TEST_CASE_P( - MultiGet, BlockBasedTableReaderTest, + BlockBasedTableReaderGetTest, BlockBasedTableReaderGetTest, ::testing::Combine( ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Bool(), - ::testing::Values(BlockBasedTableOptions::IndexType::kBinarySearch), - ::testing::Values(false))); -#endif // ROCKSDB_LITE + ::testing::Values( + BlockBasedTableOptions::IndexType::kBinarySearch, + BlockBasedTableOptions::IndexType::kHashSearch, + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch, + BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey), + ::testing::Values(false), ::testing::ValuesIn(test::GetUDTTestModes()), + ::testing::Values(1, 2), ::testing::Values(0, 4096), + ::testing::Values(false, true))); INSTANTIATE_TEST_CASE_P( VerifyChecksum, BlockBasedTableReaderTestVerifyChecksum, ::testing::Combine( @@ -561,7 +832,9 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(false), ::testing::Values( BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch), - ::testing::Values(true))); + ::testing::Values(true), ::testing::ValuesIn(test::GetUDTTestModes()), + ::testing::Values(1, 2), ::testing::Values(0), + ::testing::Values(false))); } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_builder.cc b/table/block_based/block_builder.cc index 92702b17d0bc..877df81c1299 100644 --- a/table/block_based/block_builder.cc +++ b/table/block_based/block_builder.cc @@ -48,10 +48,13 @@ BlockBuilder::BlockBuilder( int block_restart_interval, bool use_delta_encoding, bool use_value_delta_encoding, BlockBasedTableOptions::DataBlockIndexType index_type, - double data_block_hash_table_util_ratio) + double data_block_hash_table_util_ratio, size_t ts_sz, + bool persist_user_defined_timestamps, bool is_user_key) : block_restart_interval_(block_restart_interval), use_delta_encoding_(use_delta_encoding), use_value_delta_encoding_(use_value_delta_encoding), + strip_ts_sz_(persist_user_defined_timestamps ? 0 : ts_sz), + is_user_key_(is_user_key), restarts_(1, 0), // First restart point is at offset 0 counter_(0), finished_(false) { @@ -96,6 +99,9 @@ size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, // Note: this is an imprecise estimate as it accounts for the whole key size // instead of non-shared key size. estimate += key.size(); + if (strip_ts_sz_ > 0) { + estimate -= strip_ts_sz_; + } // In value delta encoding we estimate the value delta size as half the full // value size since only the size field of block handle is encoded. estimate += @@ -168,13 +174,13 @@ void BlockBuilder::AddWithLastKey(const Slice& key, const Slice& value, // or Reset. This is more convenient for the caller and we can be more // clever inside BlockBuilder. 
On this hot code path, we want to avoid // conditional jumps like `buffer_.empty() ? ... : ...` so we can use a - // fast min operation instead, with an assertion to be sure our logic is - // sound. + // fast arithmetic operation instead, with an assertion to be sure our logic + // is sound. size_t buffer_size = buffer_.size(); size_t last_key_size = last_key_param.size(); - assert(buffer_size == 0 || buffer_size >= last_key_size); + assert(buffer_size == 0 || buffer_size >= last_key_size - strip_ts_sz_); - Slice last_key(last_key_param.data(), std::min(buffer_size, last_key_size)); + Slice last_key(last_key_param.data(), last_key_size * (buffer_size > 0)); AddWithLastKeyImpl(key, value, last_key, delta_value, buffer_size); } @@ -187,6 +193,15 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, assert(!finished_); assert(counter_ <= block_restart_interval_); assert(!use_value_delta_encoding_ || delta_value); + std::string key_buf; + std::string last_key_buf; + const Slice key_to_persist = MaybeStripTimestampFromKey(&key_buf, key); + // For delta key encoding, the first key in each restart interval doesn't have + // a last key to share bytes with. + const Slice last_key_persisted = + last_key.size() == 0 + ? last_key + : MaybeStripTimestampFromKey(&last_key_buf, last_key); size_t shared = 0; // number of bytes shared with prev key if (counter_ >= block_restart_interval_) { // Restart compression @@ -195,10 +210,10 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, counter_ = 0; } else if (use_delta_encoding_) { // See how much sharing to do with previous string - shared = key.difference_offset(last_key); + shared = key_to_persist.difference_offset(last_key_persisted); } - const size_t non_shared = key.size() - shared; + const size_t non_shared = key_to_persist.size() - shared; if (use_value_delta_encoding_) { // Add "" to buffer_ @@ -212,7 +227,7 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, } // Add string delta to buffer_ followed by value - buffer_.append(key.data() + shared, non_shared); + buffer_.append(key_to_persist.data() + shared, non_shared); // Use value delta encoding only when the key has shared bytes. This would // simplify the decoding, where it can figure which decoding to use simply by // looking at the shared bytes size. @@ -222,7 +237,12 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, buffer_.append(value.data(), value.size()); } + // TODO(yuzhangyu): make user defined timestamp work with block hash index. if (data_block_hash_index_builder_.Valid()) { + // Only data blocks should be using `kDataBlockBinaryAndHash` index type. + // And data blocks should always be built with internal keys instead of + // user keys. 
+ assert(!is_user_key_); data_block_hash_index_builder_.Add(ExtractUserKey(key), restarts_.size() - 1); } @@ -231,4 +251,17 @@ inline void BlockBuilder::AddWithLastKeyImpl(const Slice& key, estimate_ += buffer_.size() - buffer_size; } +const Slice BlockBuilder::MaybeStripTimestampFromKey(std::string* key_buf, + const Slice& key) { + Slice stripped_key = key; + if (strip_ts_sz_ > 0) { + if (is_user_key_) { + stripped_key.remove_suffix(strip_ts_sz_); + } else { + StripTimestampFromInternalKey(key_buf, key, strip_ts_sz_); + stripped_key = *key_buf; + } + } + return stripped_key; +} } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_builder.h b/table/block_based/block_builder.h index 5f68b449bf3c..f167470bb5f5 100644 --- a/table/block_based/block_builder.h +++ b/table/block_based/block_builder.h @@ -28,7 +28,10 @@ class BlockBuilder { bool use_value_delta_encoding = false, BlockBasedTableOptions::DataBlockIndexType index_type = BlockBasedTableOptions::kDataBlockBinarySearch, - double data_block_hash_table_util_ratio = 0.75); + double data_block_hash_table_util_ratio = 0.75, + size_t ts_sz = 0, + bool persist_user_defined_timestamps = true, + bool is_user_key = false); // Reset the contents as if the BlockBuilder was just constructed. void Reset(); @@ -37,7 +40,8 @@ class BlockBuilder { void SwapAndReset(std::string& buffer); // REQUIRES: Finish() has not been called since the last call to Reset(). - // REQUIRES: key is larger than any previously added key + // REQUIRES: Unless a range tombstone block, key is larger than any previously + // added key // DO NOT mix with AddWithLastKey() between Resets. For efficiency, use // AddWithLastKey() in contexts where previous added key is already known // and delta encoding might be used. @@ -47,7 +51,8 @@ class BlockBuilder { // A faster version of Add() if the previous key is already known for all // Add()s. // REQUIRES: Finish() has not been called since the last call to Reset(). - // REQUIRES: key is larger than any previously added key + // REQUIRES: Unless a range tombstone block, key is larger than any previously + // added key // REQUIRES: if AddWithLastKey has been called since last Reset(), last_key // is the key from most recent AddWithLastKey. (For convenience, last_key // is ignored on first call after creation or Reset().) @@ -81,11 +86,32 @@ class BlockBuilder { const Slice* const delta_value, size_t buffer_size); + inline const Slice MaybeStripTimestampFromKey(std::string* key_buf, + const Slice& key); + const int block_restart_interval_; // TODO(myabandeh): put it into a separate IndexBlockBuilder const bool use_delta_encoding_; // Refer to BlockIter::DecodeCurrentValue for format of delta encoded values const bool use_value_delta_encoding_; + // Size in bytes for the user-defined timestamp to strip in a user key. + // This is non-zero if there is user-defined timestamp in the user key and it + // should not be persisted. + const size_t strip_ts_sz_; + // Whether the keys provided to build this block are user keys. If not, + // the keys are internal keys. This will affect how timestamp stripping is + // done for the key if `persisted_user_defined_timestamps_` is false and + // `ts_sz_` is non-zero. + // The timestamp stripping only applies to the keys added to the block. 
If the + // value contains user defined timestamp that needed to be stripped too, such + // as the `first_internal_key` in an `IndexValue` for an index block, the + // value part for a range deletion entry, their timestamp should be stripped + // before calling `BlockBuilder::Add`. + // Timestamp stripping only applies to data block and index blocks including + // index block for data blocks, index block for partitioned filter blocks, + // index block for partitioned index blocks. In summary, this only applies to + // block whose key are real user keys or internal keys created from user keys. + const bool is_user_key_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points diff --git a/table/block_based/block_cache.cc b/table/block_based/block_cache.cc index 86a3918448d0..08f5d2158dc5 100644 --- a/table/block_based/block_cache.cc +++ b/table/block_based/block_cache.cc @@ -5,23 +5,33 @@ #include "table/block_based/block_cache.h" +#include "table/block_based/block_based_table_reader.h" + namespace ROCKSDB_NAMESPACE { void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kData( std::move(block), table_options->read_amp_bytes_per_bit, statistics)); + parsed_out->get()->InitializeDataBlockProtectionInfo(protection_bytes_per_key, + raw_ucmp); } void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kIndex(std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeIndexBlockProtectionInfo( + protection_bytes_per_key, raw_ucmp, index_value_is_full, + index_has_first_key); } void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kFilterPartitionIndex( std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeIndexBlockProtectionInfo( + protection_bytes_per_key, raw_ucmp, index_value_is_full, + index_has_first_key); } void BlockCreateContext::Create( std::unique_ptr* parsed_out, BlockContents&& block) { @@ -32,6 +42,8 @@ void BlockCreateContext::Create(std::unique_ptr* parsed_out, BlockContents&& block) { parsed_out->reset(new Block_kMetaIndex( std::move(block), /*read_amp_bytes_per_bit*/ 0, statistics)); + parsed_out->get()->InitializeMetaIndexBlockProtectionInfo( + protection_bytes_per_key); } void BlockCreateContext::Create( @@ -50,43 +62,43 @@ namespace { // For getting SecondaryCache-compatible helpers from a BlockType. This is // useful for accessing block cache in untyped contexts, such as for generic // cache warming in table builder. 
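Related to the helper tables that follow: the updated selection rule in GetCacheItemHelper is simply "if any tier below the volatile tier may be used, hand out the SecondaryCache-capable helper, otherwise the basic one". A minimal sketch of that decision with stand-in types (not the RocksDB Cache API):

// Illustrative stand-ins only.
struct HelperSketch {};
enum class TierSketch { kVolatile = 0, kVolatileCompressed = 1, kNonVolatileBlock = 2 };

inline const HelperSketch* PickHelper(const HelperSketch* basic,
                                      const HelperSketch* full,
                                      TierSketch lowest_used_tier) {
  // Any tier beyond the volatile one needs the full helper so blocks can be
  // saved to and loaded from the lower tier.
  return lowest_used_tier > TierSketch::kVolatile ? full : basic;
}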
-constexpr std::array(BlockType::kInvalid) + 1> +const std::array(BlockType::kInvalid) + 1> kCacheItemFullHelperForBlockType{{ - &BlockCacheInterface::kFullHelper, - &BlockCacheInterface::kFullHelper, - &BlockCacheInterface::kFullHelper, + BlockCacheInterface::GetFullHelper(), + BlockCacheInterface::GetFullHelper(), + BlockCacheInterface::GetFullHelper(), nullptr, // kProperties - &BlockCacheInterface::kFullHelper, - &BlockCacheInterface::kFullHelper, + BlockCacheInterface::GetFullHelper(), + BlockCacheInterface::GetFullHelper(), nullptr, // kHashIndexPrefixes nullptr, // kHashIndexMetadata nullptr, // kMetaIndex (not yet stored in block cache) - &BlockCacheInterface::kFullHelper, + BlockCacheInterface::GetFullHelper(), nullptr, // kInvalid }}; // For getting basic helpers from a BlockType (no SecondaryCache support) -constexpr std::array(BlockType::kInvalid) + 1> +const std::array(BlockType::kInvalid) + 1> kCacheItemBasicHelperForBlockType{{ - &BlockCacheInterface::kBasicHelper, - &BlockCacheInterface::kBasicHelper, - &BlockCacheInterface::kBasicHelper, + BlockCacheInterface::GetBasicHelper(), + BlockCacheInterface::GetBasicHelper(), + BlockCacheInterface::GetBasicHelper(), nullptr, // kProperties - &BlockCacheInterface::kBasicHelper, - &BlockCacheInterface::kBasicHelper, + BlockCacheInterface::GetBasicHelper(), + BlockCacheInterface::GetBasicHelper(), nullptr, // kHashIndexPrefixes nullptr, // kHashIndexMetadata nullptr, // kMetaIndex (not yet stored in block cache) - &BlockCacheInterface::kBasicHelper, + BlockCacheInterface::GetBasicHelper(), nullptr, // kInvalid }}; } // namespace const Cache::CacheItemHelper* GetCacheItemHelper( BlockType block_type, CacheTier lowest_used_cache_tier) { - if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) { + if (lowest_used_cache_tier > CacheTier::kVolatileTier) { return kCacheItemFullHelperForBlockType[static_cast(block_type)]; } else { return kCacheItemBasicHelperForBlockType[static_cast(block_type)]; diff --git a/table/block_based/block_cache.h b/table/block_based/block_cache.h index 8a881595baf2..06ba50566e10 100644 --- a/table/block_based/block_cache.h +++ b/table/block_based/block_cache.h @@ -70,22 +70,53 @@ class Block_kMetaIndex : public Block { struct BlockCreateContext : public Cache::CreateContext { BlockCreateContext() {} BlockCreateContext(const BlockBasedTableOptions* _table_options, - Statistics* _statistics, bool _using_zstd) + const ImmutableOptions* _ioptions, Statistics* _statistics, + bool _using_zstd, uint8_t _protection_bytes_per_key, + const Comparator* _raw_ucmp, + bool _index_value_is_full = false, + bool _index_has_first_key = false) : table_options(_table_options), + ioptions(_ioptions), statistics(_statistics), - using_zstd(_using_zstd) {} + raw_ucmp(_raw_ucmp), + using_zstd(_using_zstd), + protection_bytes_per_key(_protection_bytes_per_key), + index_value_is_full(_index_value_is_full), + index_has_first_key(_index_has_first_key) {} const BlockBasedTableOptions* table_options = nullptr; + const ImmutableOptions* ioptions = nullptr; Statistics* statistics = nullptr; + const Comparator* raw_ucmp = nullptr; + const UncompressionDict* dict = nullptr; + uint32_t format_version; bool using_zstd = false; + uint8_t protection_bytes_per_key = 0; + bool index_value_is_full; + bool index_has_first_key; // For TypedCacheInterface template inline void Create(std::unique_ptr* parsed_out, size_t* charge_out, const Slice& data, - MemoryAllocator* alloc) { - Create(parsed_out, - BlockContents(AllocateAndCopyBlock(data, alloc), 
data.size())); + CompressionType type, MemoryAllocator* alloc) { + BlockContents uncompressed_block_contents; + if (type != CompressionType::kNoCompression) { + assert(dict != nullptr); + UncompressionContext context(type); + UncompressionInfo info(context, *dict, type); + Status s = UncompressBlockData( + info, data.data(), data.size(), &uncompressed_block_contents, + table_options->format_version, *ioptions, alloc); + if (!s.ok()) { + parsed_out->reset(); + return; + } + } else { + uncompressed_block_contents = + BlockContents(AllocateAndCopyBlock(data, alloc), data.size()); + } + Create(parsed_out, std::move(uncompressed_block_contents)); *charge_out = parsed_out->get()->ApproximateMemoryUsage(); } @@ -103,10 +134,6 @@ struct BlockCreateContext : public Cache::CreateContext { BlockContents&& block); }; -// Convenient cache interface to use with block_cache_compressed -using CompressedBlockCacheInterface = - BasicTypedCacheInterface; - // Convenient cache interface to use for block_cache, with support for // SecondaryCache. template diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index 83ec2cb060ba..54848b785bab 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -15,15 +15,40 @@ namespace ROCKSDB_NAMESPACE { void BlockPrefetcher::PrefetchIfNeeded( const BlockBasedTable::Rep* rep, const BlockHandle& handle, const size_t readahead_size, bool is_for_compaction, - const bool no_sequential_checking, - const Env::IOPriority rate_limiter_priority) { - // num_file_reads is used by FilePrefetchBuffer only when - // implicit_auto_readahead is set. + const bool no_sequential_checking, const ReadOptions& read_options, + const std::function& readaheadsize_cb) { + const size_t len = BlockBasedTable::BlockSizeWithTrailer(handle); + const size_t offset = handle.offset(); if (is_for_compaction) { + if (!rep->file->use_direct_io() && compaction_readahead_size_ > 0) { + // If FS supports prefetching (readahead_limit_ will be non zero in that + // case) and current block exists in prefetch buffer then return. + if (offset + len <= readahead_limit_) { + return; + } + IOOptions opts; + Status s = rep->file->PrepareIOOptions(read_options, opts); + if (!s.ok()) { + return; + } + s = rep->file->Prefetch(opts, offset, len + compaction_readahead_size_); + if (s.ok()) { + readahead_limit_ = offset + len + compaction_readahead_size_; + return; + } else if (!s.IsNotSupported()) { + return; + } + } + // If FS prefetch is not supported, fall back to use internal prefetch + // buffer. + // + // num_file_reads is used by FilePrefetchBuffer only when + // implicit_auto_readahead is set. 
rep->CreateFilePrefetchBufferIfNotExists( compaction_readahead_size_, compaction_readahead_size_, &prefetch_buffer_, /*implicit_auto_readahead=*/false, - /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0); + /*num_file_reads=*/0, /*num_file_reads_for_auto_readahead=*/0, + /*upper_bound_offset=*/0, /*readaheadsize_cb=*/nullptr); return; } @@ -32,7 +57,9 @@ void BlockPrefetcher::PrefetchIfNeeded( rep->CreateFilePrefetchBufferIfNotExists( readahead_size, readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/false, /*num_file_reads=*/0, - /*num_file_reads_for_auto_readahead=*/0); + /*num_file_reads_for_auto_readahead=*/0, upper_bound_offset_, + readaheadsize_cb, + /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -56,13 +83,12 @@ void BlockPrefetcher::PrefetchIfNeeded( initial_auto_readahead_size_, max_auto_readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/true, /*num_file_reads=*/0, - rep->table_options.num_file_reads_for_auto_readahead); + rep->table_options.num_file_reads_for_auto_readahead, + upper_bound_offset_, readaheadsize_cb, + /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } - size_t len = BlockBasedTable::BlockSizeWithTrailer(handle); - size_t offset = handle.offset(); - // If FS supports prefetching (readahead_limit_ will be non zero in that case) // and current block exists in prefetch buffer then return. if (offset + len <= readahead_limit_) { @@ -89,7 +115,9 @@ void BlockPrefetcher::PrefetchIfNeeded( rep->CreateFilePrefetchBufferIfNotExists( initial_auto_readahead_size_, max_auto_readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, - rep->table_options.num_file_reads_for_auto_readahead); + rep->table_options.num_file_reads_for_auto_readahead, + upper_bound_offset_, readaheadsize_cb, + /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } @@ -98,17 +126,21 @@ void BlockPrefetcher::PrefetchIfNeeded( } // If prefetch is not supported, fall back to use internal prefetch buffer. - // Discarding other return status of Prefetch calls intentionally, as - // we can fallback to reading from disk if Prefetch fails. 
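The prefetcher changes around here follow one pattern in both the compaction and user-read paths: ask the file system to prefetch first, and only fall back to the internal FilePrefetchBuffer when that call reports NotSupported (other errors simply leave readahead to the normal read path). A compact sketch of that decision with illustrative types, not the RocksDB classes:

#include <cstdint>

struct StatusSketch {
  bool ok = false;
  bool not_supported = false;
};

struct FileSketch {
  StatusSketch Prefetch(uint64_t /*offset*/, uint64_t /*len*/) {
    return {false, true};  // pretend this file system cannot prefetch
  }
};

enum class PrefetchPlan { kFsReadahead, kInternalBuffer, kPlainReads };

inline PrefetchPlan PlanPrefetch(FileSketch& file, uint64_t offset,
                                 uint64_t len) {
  StatusSketch s = file.Prefetch(offset, len);
  if (s.ok) {
    return PrefetchPlan::kFsReadahead;  // FS handles readahead from here
  }
  if (s.not_supported) {
    return PrefetchPlan::kInternalBuffer;  // build the internal buffer
  }
  return PrefetchPlan::kPlainReads;  // other errors: no readahead at all
}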
- Status s = rep->file->Prefetch( - handle.offset(), - BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_, - rate_limiter_priority); + IOOptions opts; + Status s = rep->file->PrepareIOOptions(read_options, opts); + if (!s.ok()) { + return; + } + s = rep->file->Prefetch( + opts, handle.offset(), + BlockBasedTable::BlockSizeWithTrailer(handle) + readahead_size_); if (s.IsNotSupported()) { rep->CreateFilePrefetchBufferIfNotExists( initial_auto_readahead_size_, max_auto_readahead_size, &prefetch_buffer_, /*implicit_auto_readahead=*/true, num_file_reads_, - rep->table_options.num_file_reads_for_auto_readahead); + rep->table_options.num_file_reads_for_auto_readahead, + upper_bound_offset_, readaheadsize_cb, + /*usage=*/FilePrefetchBufferUsage::kUserScanPrefetch); return; } diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index 518868a301d6..7e075c08e2d0 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -18,11 +18,11 @@ class BlockPrefetcher { readahead_size_(initial_auto_readahead_size), initial_auto_readahead_size_(initial_auto_readahead_size) {} - void PrefetchIfNeeded(const BlockBasedTable::Rep* rep, - const BlockHandle& handle, size_t readahead_size, - bool is_for_compaction, - const bool no_sequential_checking, - Env::IOPriority rate_limiter_priority); + void PrefetchIfNeeded( + const BlockBasedTable::Rep* rep, const BlockHandle& handle, + size_t readahead_size, bool is_for_compaction, + const bool no_sequential_checking, const ReadOptions& read_options, + const std::function& readaheadsize_cb); FilePrefetchBuffer* prefetch_buffer() { return prefetch_buffer_.get(); } void UpdateReadPattern(const uint64_t& offset, const size_t& len) { @@ -53,20 +53,32 @@ class BlockPrefetcher { &initial_auto_readahead_size_); } + void SetUpperBoundOffset(uint64_t upper_bound_offset) { + upper_bound_offset_ = upper_bound_offset; + if (prefetch_buffer() != nullptr) { + // Upper bound can be changed on reseek. So update that in + // FilePrefetchBuffer. + prefetch_buffer()->ResetUpperBoundOffset(upper_bound_offset); + } + } + private: // Readahead size used in compaction, its value is used only if // lookup_context_.caller = kCompaction. size_t compaction_readahead_size_; - // readahead_size_ is used if underlying FS supports prefetching. + // readahead_size_ is used in non-compaction read if underlying FS supports + // prefetching. size_t readahead_size_; size_t readahead_limit_ = 0; - // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch - // buffer. + // initial_auto_readahead_size_ is used in non-compaction read if RocksDB uses + // internal prefetch buffer. 
uint64_t initial_auto_readahead_size_; uint64_t num_file_reads_ = 0; uint64_t prev_offset_ = 0; size_t prev_len_ = 0; std::unique_ptr prefetch_buffer_; + + uint64_t upper_bound_offset_ = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_test.cc b/table/block_based/block_test.cc index 83b87fe79e83..9082a08e9f82 100644 --- a/table/block_based/block_test.cc +++ b/table/block_based/block_test.cc @@ -15,6 +15,7 @@ #include #include +#include "db/db_test_util.h" #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" @@ -33,7 +34,8 @@ namespace ROCKSDB_NAMESPACE { std::string GenerateInternalKey(int primary_key, int secondary_key, - int padding_size, Random *rnd) { + int padding_size, Random *rnd, + size_t ts_sz = 0) { char buf[50]; char *p = &buf[0]; snprintf(buf, sizeof(buf), "%6d%4d", primary_key, secondary_key); @@ -42,6 +44,11 @@ std::string GenerateInternalKey(int primary_key, int secondary_key, k += rnd->RandomString(padding_size); } AppendInternalKeyFooter(&k, 0 /* seqno */, kTypeValue); + std::string key_with_ts; + if (ts_sz > 0) { + PadInternalKeyWithMinTimestamp(&key_with_ts, k, ts_sz); + return key_with_ts; + } return k; } @@ -53,7 +60,7 @@ void GenerateRandomKVs(std::vector *keys, std::vector *values, const int from, const int len, const int step = 1, const int padding_size = 0, - const int keys_share_prefix = 1) { + const int keys_share_prefix = 1, size_t ts_sz = 0) { Random rnd(302); // generate different prefix @@ -61,7 +68,7 @@ void GenerateRandomKVs(std::vector *keys, // generating keys that shares the prefix for (int j = 0; j < keys_share_prefix; ++j) { // `DataBlockIter` assumes it reads only internal keys. - keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd)); + keys->emplace_back(GenerateInternalKey(i, j, padding_size, &rnd, ts_sz)); // 100 bytes values values->emplace_back(rnd.RandomString(100)); @@ -69,19 +76,49 @@ void GenerateRandomKVs(std::vector *keys, } } -class BlockTest : public testing::Test {}; +// Test Param 1): key use delta encoding. +// Test Param 2): user-defined timestamp test mode. +// Test Param 3): data block index type. +class BlockTest : public testing::Test, + public testing::WithParamInterface< + std::tuple> { + public: + bool keyUseDeltaEncoding() const { return std::get<0>(GetParam()); } + bool isUDTEnabled() const { + return test::IsUDTEnabled(std::get<1>(GetParam())); + } + bool shouldPersistUDT() const { + return test::ShouldPersistUDT(std::get<1>(GetParam())); + } + + BlockBasedTableOptions::DataBlockIndexType dataBlockIndexType() const { + return std::get<2>(GetParam()); + } +}; // block test -TEST_F(BlockTest, SimpleTest) { +TEST_P(BlockTest, SimpleTest) { Random rnd(301); Options options = Options(); + if (isUDTEnabled()) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } + size_t ts_sz = options.comparator->timestamp_size(); std::vector keys; std::vector values; - BlockBuilder builder(16); + BlockBasedTableOptions::DataBlockIndexType index_type = + isUDTEnabled() ? 
BlockBasedTableOptions::kDataBlockBinarySearch + : dataBlockIndexType(); + BlockBuilder builder(16, keyUseDeltaEncoding(), + false /* use_value_delta_encoding */, index_type, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + shouldPersistUDT(), false /* is_user_key */); int num_records = 100000; - GenerateRandomKVs(&keys, &values, 0, num_records); + GenerateRandomKVs(&keys, &values, 0, num_records, 1 /* step */, + 0 /* padding_size */, 1 /* keys_share_prefix */, ts_sz); // add a bunch of records to a block for (int i = 0; i < num_records; i++) { builder.Add(keys[i], values[i]); @@ -97,8 +134,10 @@ TEST_F(BlockTest, SimpleTest) { // read contents of block sequentially int count = 0; - InternalIterator *iter = - reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); + InternalIterator *iter = reader.NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr /* iter */, + nullptr /* stats */, false /* block_contents_pinned */, + shouldPersistUDT()); for (iter->SeekToFirst(); iter->Valid(); count++, iter->Next()) { // read kv from block Slice k = iter->key(); @@ -111,8 +150,10 @@ TEST_F(BlockTest, SimpleTest) { delete iter; // read block contents randomly - iter = - reader.NewDataIterator(options.comparator, kDisableGlobalSequenceNumber); + iter = reader.NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr /* iter */, + nullptr /* stats */, false /* block_contents_pinned */, + shouldPersistUDT()); for (int i = 0; i < num_records; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -128,11 +169,18 @@ TEST_F(BlockTest, SimpleTest) { } // return the block contents -BlockContents GetBlockContents(std::unique_ptr *builder, - const std::vector &keys, - const std::vector &values, - const int /*prefix_group_size*/ = 1) { - builder->reset(new BlockBuilder(1 /* restart interval */)); +BlockContents GetBlockContents( + std::unique_ptr *builder, + const std::vector &keys, + const std::vector &values, bool key_use_delta_encoding, + size_t ts_sz, bool should_persist_udt, const int /*prefix_group_size*/ = 1, + BlockBasedTableOptions::DataBlockIndexType dblock_index_type = + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch) { + builder->reset( + new BlockBuilder(1 /* restart interval */, key_use_delta_encoding, + false /* use_value_delta_encoding */, dblock_index_type, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + should_persist_udt, false /* is_user_key */)); // Add only half of the keys for (size_t i = 0; i < keys.size(); ++i) { @@ -148,7 +196,8 @@ BlockContents GetBlockContents(std::unique_ptr *builder, void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &keys, - const std::vector &values) { + const std::vector &values, + bool is_udt_enabled, bool should_persist_udt) { const size_t prefix_size = 6; // create block reader BlockContents contents_ref(contents.data); @@ -159,7 +208,10 @@ void CheckBlockContents(BlockContents contents, const int max_key, NewFixedPrefixTransform(prefix_size)); std::unique_ptr regular_iter(reader2.NewDataIterator( - BytewiseComparator(), kDisableGlobalSequenceNumber)); + is_udt_enabled ? 
test::BytewiseComparatorWithU64TsWrapper() + : BytewiseComparator(), + kDisableGlobalSequenceNumber, nullptr /* iter */, nullptr /* stats */, + false /* block_contents_pinned */, should_persist_udt)); // Seek existent keys for (size_t i = 0; i < keys.size(); i++) { @@ -177,46 +229,79 @@ void CheckBlockContents(BlockContents contents, const int max_key, // return the one that is closest. for (int i = 1; i < max_key - 1; i += 2) { // `DataBlockIter` assumes its APIs receive only internal keys. - auto key = GenerateInternalKey(i, 0, 0, nullptr); + auto key = GenerateInternalKey(i, 0, 0, nullptr, + is_udt_enabled ? 8 : 0 /* ts_sz */); regular_iter->Seek(key); ASSERT_TRUE(regular_iter->Valid()); } } // In this test case, no two key share same prefix. -TEST_F(BlockTest, SimpleIndexHash) { +TEST_P(BlockTest, SimpleIndexHash) { const int kMaxKey = 100000; + size_t ts_sz = isUDTEnabled() ? 8 : 0; std::vector keys; std::vector values; GenerateRandomKVs(&keys, &values, 0 /* first key id */, kMaxKey /* last key id */, 2 /* step */, - 8 /* padding size (8 bytes randomly generated suffix) */); + 8 /* padding size (8 bytes randomly generated suffix) */, + 1 /* keys_share_prefix */, ts_sz); std::unique_ptr builder; - auto contents = GetBlockContents(&builder, keys, values); - CheckBlockContents(std::move(contents), kMaxKey, keys, values); + auto contents = GetBlockContents( + &builder, keys, values, keyUseDeltaEncoding(), ts_sz, shouldPersistUDT(), + 1 /* prefix_group_size */, + isUDTEnabled() + ? BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch + : dataBlockIndexType()); + + CheckBlockContents(std::move(contents), kMaxKey, keys, values, isUDTEnabled(), + shouldPersistUDT()); } -TEST_F(BlockTest, IndexHashWithSharedPrefix) { +TEST_P(BlockTest, IndexHashWithSharedPrefix) { const int kMaxKey = 100000; // for each prefix, there will be 5 keys starts with it. const int kPrefixGroup = 5; + size_t ts_sz = isUDTEnabled() ? 8 : 0; std::vector keys; std::vector values; // Generate keys with same prefix. GenerateRandomKVs(&keys, &values, 0, // first key id kMaxKey, // last key id - 2, // step - 10, // padding size, - kPrefixGroup); + 2 /* step */, + 10 /* padding size (8 bytes randomly generated suffix) */, + kPrefixGroup /* keys_share_prefix */, ts_sz); std::unique_ptr builder; - auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); - CheckBlockContents(std::move(contents), kMaxKey, keys, values); + auto contents = GetBlockContents( + &builder, keys, values, keyUseDeltaEncoding(), isUDTEnabled(), + shouldPersistUDT(), kPrefixGroup, + isUDTEnabled() + ? BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch + : dataBlockIndexType()); + + CheckBlockContents(std::move(contents), kMaxKey, keys, values, isUDTEnabled(), + shouldPersistUDT()); } +// Param 0: key use delta encoding +// Param 1: user-defined timestamp test mode +// Param 2: data block index type. User-defined timestamp feature is not +// compatible with `kDataBlockBinaryAndHash` data block index type because the +// user comparator doesn't provide a `CanKeysWithDifferentByteContentsBeEqual` +// override. This combination is disabled. 
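// Editorial aside (illustrative sketch, not part of the patch): the data block
// hash index hashes raw user-key bytes, so it is only sound when the comparator
// guarantees that byte-wise different keys never compare equal. A comparator
// advertises that guarantee by overriding
// Comparator::CanKeysWithDifferentByteContentsBeEqual() to return false; the
// UDT wrapper comparator used above keeps the default (true), which is why the
// kDataBlockBinaryAndHash combination is skipped by falling back to
// kDataBlockBinarySearch in the test bodies. A minimal hypothetical comparator
// showing the override (all names here are illustrative):
namespace {
class HashIndexFriendlyComparator : public Comparator {
 public:
  const char* Name() const override { return "rocksdb.test.HashIndexFriendly"; }
  int Compare(const Slice& a, const Slice& b) const override {
    return a.compare(b);  // pure byte-wise ordering
  }
  void FindShortestSeparator(std::string* /*start*/,
                             const Slice& /*limit*/) const override {}
  void FindShortSuccessor(std::string* /*key*/) const override {}
  // Byte-wise comparison implies equal keys have identical bytes, so the data
  // block hash index may safely hash raw key bytes.
  bool CanKeysWithDifferentByteContentsBeEqual() const override { return false; }
};
}  // namespace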
+INSTANTIATE_TEST_CASE_P( + P, BlockTest, + ::testing::Combine( + ::testing::Bool(), ::testing::ValuesIn(test::GetUDTTestModes()), + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash))); + // A slow and accurate version of BlockReadAmpBitmap that simply store // all the marked ranges in a set. class BlockReadAmpBitmapSlowAndAccurate { @@ -361,7 +446,7 @@ TEST_F(BlockTest, BlockWithReadAmpBitmap) { BlockBuilder builder(16); int num_records = 10000; - GenerateRandomKVs(&keys, &values, 0, num_records, 1); + GenerateRandomKVs(&keys, &values, 0, num_records, 1 /* step */); // add a bunch of records to a block for (int i = 0; i < num_records; i++) { builder.Add(keys[i], values[i]); @@ -494,19 +579,28 @@ TEST_F(BlockTest, ReadAmpBitmapPow2) { class IndexBlockTest : public testing::Test, - public testing::WithParamInterface> { + public testing::WithParamInterface< + std::tuple> { public: IndexBlockTest() = default; - bool useValueDeltaEncoding() const { return std::get<0>(GetParam()); } - bool includeFirstKey() const { return std::get<1>(GetParam()); } + bool keyIncludesSeq() const { return std::get<0>(GetParam()); } + bool useValueDeltaEncoding() const { return std::get<1>(GetParam()); } + bool includeFirstKey() const { return std::get<2>(GetParam()); } + bool isUDTEnabled() const { + return test::IsUDTEnabled(std::get<3>(GetParam())); + } + bool shouldPersistUDT() const { + return test::ShouldPersistUDT(std::get<3>(GetParam())); + } }; // Similar to GenerateRandomKVs but for index block contents. void GenerateRandomIndexEntries(std::vector *separators, std::vector *block_handles, std::vector *first_keys, - const int len) { + const int len, size_t ts_sz = 0, + bool zero_seqno = false) { Random rnd(42); // For each of `len` blocks, we need to generate a first and last key. @@ -514,7 +608,17 @@ void GenerateRandomIndexEntries(std::vector *separators, std::set keys; while ((int)keys.size() < len * 2) { // Keys need to be at least 8 bytes long to look like internal keys. 
- keys.insert(test::RandomKey(&rnd, 12)); + std::string new_key = test::RandomKey(&rnd, 12); + if (zero_seqno) { + AppendInternalKeyFooter(&new_key, 0 /* seqno */, kTypeValue); + } + if (ts_sz > 0) { + std::string key; + PadInternalKeyWithMinTimestamp(&key, new_key, ts_sz); + keys.insert(std::move(key)); + } else { + keys.insert(std::move(new_key)); + } } uint64_t offset = 0; @@ -531,19 +635,34 @@ void GenerateRandomIndexEntries(std::vector *separators, TEST_P(IndexBlockTest, IndexValueEncodingTest) { Random rnd(301); Options options = Options(); + if (isUDTEnabled()) { + options.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } + size_t ts_sz = options.comparator->timestamp_size(); std::vector separators; std::vector block_handles; std::vector first_keys; const bool kUseDeltaEncoding = true; - BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding()); + BlockBuilder builder(16, kUseDeltaEncoding, useValueDeltaEncoding(), + BlockBasedTableOptions::kDataBlockBinarySearch, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + shouldPersistUDT(), !keyIncludesSeq()); + int num_records = 100; GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, - num_records); + num_records, ts_sz, false /* zero_seqno */); BlockHandle last_encoded_handle; for (int i = 0; i < num_records; i++) { - IndexValue entry(block_handles[i], first_keys[i]); + std::string first_key_to_persist_buf; + Slice first_internal_key = first_keys[i]; + if (ts_sz > 0 && !shouldPersistUDT()) { + StripTimestampFromInternalKey(&first_key_to_persist_buf, first_keys[i], + ts_sz); + first_internal_key = first_key_to_persist_buf; + } + IndexValue entry(block_handles[i], first_internal_key); std::string encoded_entry; std::string delta_encoded_entry; entry.EncodeTo(&encoded_entry, includeFirstKey(), nullptr); @@ -553,7 +672,13 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { } last_encoded_handle = entry.handle; const Slice delta_encoded_entry_slice(delta_encoded_entry); - builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + + if (keyIncludesSeq()) { + builder.Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } else { + const Slice user_key = ExtractUserKey(separators[i]); + builder.Add(user_key, encoded_entry, &delta_encoded_entry_slice); + } } // read serialized contents of the block @@ -565,14 +690,14 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { Block reader(std::move(contents)); const bool kTotalOrderSeek = true; - const bool kIncludesSeq = true; - const bool kValueIsFull = !useValueDeltaEncoding(); IndexBlockIter *kNullIter = nullptr; Statistics *kNullStats = nullptr; // read contents of block sequentially InternalIteratorBase *iter = reader.NewIndexIterator( options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, - kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(), + !useValueDeltaEncoding(), false /* block_contents_pinned */, + shouldPersistUDT()); iter->SeekToFirst(); for (int index = 0; index < num_records; ++index) { ASSERT_TRUE(iter->Valid()); @@ -580,7 +705,12 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { Slice k = iter->key(); IndexValue v = iter->value(); - EXPECT_EQ(separators[index], k.ToString()); + if (keyIncludesSeq()) { + EXPECT_EQ(separators[index], k.ToString()); + } else { + const Slice user_key = ExtractUserKey(separators[index]); + EXPECT_EQ(user_key, k); + } EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); 
EXPECT_EQ(block_handles[index].size(), v.handle.size()); EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", @@ -593,7 +723,9 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { // read block contents randomly iter = reader.NewIndexIterator( options.comparator, kDisableGlobalSequenceNumber, kNullIter, kNullStats, - kTotalOrderSeek, includeFirstKey(), kIncludesSeq, kValueIsFull); + kTotalOrderSeek, includeFirstKey(), keyIncludesSeq(), + !useValueDeltaEncoding(), false /* block_contents_pinned */, + shouldPersistUDT()); for (int i = 0; i < num_records * 2; i++) { // find a random key in the lookaside array int index = rnd.Uniform(num_records); @@ -603,7 +735,12 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { iter->Seek(k); ASSERT_TRUE(iter->Valid()); IndexValue v = iter->value(); - EXPECT_EQ(separators[index], iter->key().ToString()); + if (keyIncludesSeq()) { + EXPECT_EQ(separators[index], iter->key().ToString()); + } else { + const Slice user_key = ExtractUserKey(separators[index]); + EXPECT_EQ(user_key, iter->key()); + } EXPECT_EQ(block_handles[index].offset(), v.handle.offset()); EXPECT_EQ(block_handles[index].size(), v.handle.size()); EXPECT_EQ(includeFirstKey() ? first_keys[index] : "", @@ -612,12 +749,952 @@ TEST_P(IndexBlockTest, IndexValueEncodingTest) { delete iter; } -INSTANTIATE_TEST_CASE_P(P, IndexBlockTest, - ::testing::Values(std::make_tuple(false, false), - std::make_tuple(false, true), - std::make_tuple(true, false), - std::make_tuple(true, true))); +// Param 0: key includes sequence number (whether to use user key or internal +// key as key entry in index block). +// Param 1: use value delta encoding +// Param 2: include first key +// Param 3: user-defined timestamp test mode +INSTANTIATE_TEST_CASE_P( + P, IndexBlockTest, + ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), + ::testing::ValuesIn(test::GetUDTTestModes()))); + +class BlockPerKVChecksumTest : public DBTestBase { + public: + BlockPerKVChecksumTest() + : DBTestBase("block_per_kv_checksum", /*env_do_fsync=*/false) {} + + template + void TestIterateForward(std::unique_ptr &biter, + size_t &verification_count) { + while (biter->Valid()) { + verification_count = 0; + biter->Next(); + if (biter->Valid()) { + ASSERT_GE(verification_count, 1); + } + } + } + + template + void TestIterateBackward(std::unique_ptr &biter, + size_t &verification_count) { + while (biter->Valid()) { + verification_count = 0; + biter->Prev(); + if (biter->Valid()) { + ASSERT_GE(verification_count, 1); + } + } + } + + template + void TestSeekToFirst(std::unique_ptr &biter, + size_t &verification_count) { + verification_count = 0; + biter->SeekToFirst(); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + } + + template + void TestSeekToLast(std::unique_ptr &biter, + size_t &verification_count) { + verification_count = 0; + biter->SeekToLast(); + ASSERT_GE(verification_count, 1); + TestIterateBackward(biter, verification_count); + } + + template + void TestSeekForPrev(std::unique_ptr &biter, + size_t &verification_count, std::string k) { + verification_count = 0; + biter->SeekForPrev(k); + ASSERT_GE(verification_count, 1); + TestIterateBackward(biter, verification_count); + } + + template + void TestSeek(std::unique_ptr &biter, size_t &verification_count, + std::string k) { + verification_count = 0; + biter->Seek(k); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + } + + bool VerifyChecksum(uint32_t checksum_len, const char *checksum_ptr, + 
const Slice &key, const Slice &val) { + if (!checksum_len) { + return checksum_ptr == nullptr; + } + return ProtectionInfo64().ProtectKV(key, val).Verify( + static_cast(checksum_len), checksum_ptr); + } +}; + +TEST_F(BlockPerKVChecksumTest, EmptyBlock) { + // Tests that empty block code path is not broken by per kv checksum. + BlockBuilder builder( + 16 /* block_restart_interval */, true /* use_delta_encoding */, + false /* use_value_delta_encoding */, + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch); + Slice raw_block = builder.Finish(); + BlockContents contents; + contents.data = raw_block; + + std::unique_ptr data_block; + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext create_context{&tbo, + nullptr, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator}; + create_context.Create(&data_block, std::move(contents)); + std::unique_ptr biter{data_block->NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber)}; + biter->SeekToFirst(); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + Random rnd(33); + biter->SeekForGet(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->SeekToLast(); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->Seek(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); + biter->SeekForPrev(GenerateInternalKey(1, 1, 10, &rnd)); + ASSERT_FALSE(biter->Valid()); + ASSERT_OK(biter->status()); +} + +TEST_F(BlockPerKVChecksumTest, UnsupportedOptionValue) { + Options options = Options(); + options.block_protection_bytes_per_key = 128; + Destroy(options); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); +} +TEST_F(BlockPerKVChecksumTest, InitializeProtectionInfo) { + // Make sure that the checksum construction code path does not break + // when the block is itself already corrupted. 
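  // Editorial aside (sketch, not part of the patch): per key-value checksums
  // reuse the ProtectionInfo helpers. Conceptually, for each parsed entry the
  // block stores `protection_bytes_per_key` bytes derived from
  //   ProtectionInfo64().ProtectKV(key, value)
  // in a side array (exposed to tests via TEST_GetKVChecksum(), indexed as
  // i * protection_bytes_per_key), and the block iterators re-derive and
  // compare that value on every key update, along the lines of:
  //   bool ok = ProtectionInfo64()
  //                 .ProtectKV(iter->key(), iter->raw_value())
  //                 .Verify(protection_bytes_per_key, checksum_ptr);
  // The VerifyChecksum() helper above asserts the same relationship from the
  // test side; a mismatch surfaces to readers as a Corruption status.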
+ Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext create_context{&tbo, + nullptr /* ioptions */, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator}; + + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr data_block; + create_context.Create(&data_block, std::move(contents)); + std::unique_ptr iter{data_block->NewDataIterator( + options.comparator, kDisableGlobalSequenceNumber)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr index_block; + create_context.Create(&index_block, std::move(contents)); + std::unique_ptr iter{index_block->NewIndexIterator( + options.comparator, kDisableGlobalSequenceNumber, nullptr, nullptr, + true, false, true, true)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } + { + std::string invalid_content = "1"; + Slice raw_block = invalid_content; + BlockContents contents; + contents.data = raw_block; + std::unique_ptr meta_block; + create_context.Create(&meta_block, std::move(contents)); + std::unique_ptr iter{meta_block->NewMetaIterator(true)}; + ASSERT_TRUE(iter->status().IsCorruption()); + } +} + +TEST_F(BlockPerKVChecksumTest, ApproximateMemory) { + // Tests that ApproximateMemoryUsage() includes memory used by block kv + // checksum. + const int kNumRecords = 20; + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords, 1 /* step */, + 24 /* padding_size */); + std::unique_ptr builder; + auto generate_block_content = [&]() { + builder = std::make_unique(16 /* restart_interval */); + for (int i = 0; i < kNumRecords; ++i) { + builder->Add(keys[i], values[i]); + } + Slice raw_block = builder->Finish(); + BlockContents contents; + contents.data = raw_block; + return contents; + }; + + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = 8; + BlockCreateContext with_checksum_create_context{ + &tbo, + nullptr /* ioptions */, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator, + true /* index_value_is_full */}; + BlockCreateContext create_context{&tbo, + nullptr /* ioptions */, + nullptr /* statistics */, + false /* using_zstd */, + 0, + options.comparator, + true /* index_value_is_full */}; + + { + std::unique_ptr data_block; + create_context.Create(&data_block, generate_block_content()); + size_t block_memory = data_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_data_block; + with_checksum_create_context.Create(&with_checksum_data_block, + generate_block_content()); + ASSERT_GT(with_checksum_data_block->ApproximateMemoryUsage() - block_memory, + 100); + } + + { + std::unique_ptr meta_block; + create_context.Create(&meta_block, generate_block_content()); + size_t block_memory = meta_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_meta_block; + with_checksum_create_context.Create(&with_checksum_meta_block, + generate_block_content()); + // Rough comparison to avoid flaky test due to memory allocation alignment. + ASSERT_GT(with_checksum_meta_block->ApproximateMemoryUsage() - block_memory, + 100); + } + + { + // Index block has different contents. 
+ std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords); + auto generate_index_content = [&]() { + builder = std::make_unique(16 /* restart_interval */); + BlockHandle last_encoded_handle; + for (int i = 0; i < kNumRecords; ++i) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, false, nullptr); + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder->Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + Slice raw_block = builder->Finish(); + BlockContents contents; + contents.data = raw_block; + return contents; + }; + + std::unique_ptr index_block; + create_context.Create(&index_block, generate_index_content()); + size_t block_memory = index_block->ApproximateMemoryUsage(); + std::unique_ptr with_checksum_index_block; + with_checksum_create_context.Create(&with_checksum_index_block, + generate_index_content()); + ASSERT_GT( + with_checksum_index_block->ApproximateMemoryUsage() - block_memory, + 100); + } +} + +std::string GetDataBlockIndexTypeStr( + BlockBasedTableOptions::DataBlockIndexType t) { + return t == BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch + ? "BinarySearch" + : "BinaryAndHash"; +} + +class DataBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface> { + public: + DataBlockKVChecksumTest() = default; + + BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const { + return std::get<0>(GetParam()); + } + uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); } + uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); } + bool GetUseDeltaEncoding() const { return std::get<3>(GetParam()); } + + std::unique_ptr GenerateDataBlock( + std::vector &keys, std::vector &values, + int num_record) { + BlockBasedTableOptions tbo; + BlockCreateContext create_context{&tbo, + nullptr /* statistics */, + nullptr /* ioptions */, + false /* using_zstd */, + GetChecksumLen(), + Options().comparator}; + builder_ = std::make_unique( + static_cast(GetRestartInterval()), + GetUseDeltaEncoding() /* use_delta_encoding */, + false /* use_value_delta_encoding */, GetDataBlockIndexType()); + for (int i = 0; i < num_record; i++) { + builder_->Add(keys[i], values[i]); + } + Slice raw_block = builder_->Finish(); + BlockContents contents; + contents.data = raw_block; + std::unique_ptr data_block; + create_context.Create(&data_block, std::move(contents)); + return data_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P( + P, DataBlockKVChecksumTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(0, 1, 2, 4, 8) /* protection_bytes_per_key */, + ::testing::Values(1, 2, 3, 8, 16) /* restart_interval */, + ::testing::Values(false, true)) /* delta_encoding */, + [](const testing::TestParamInfo> + &args) { + std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) + << "ProtectionPerKey" << std::to_string(std::get<1>(args.param)) + << "RestartInterval" << std::to_string(std::get<2>(args.param)) + << "DeltaEncode" << std::to_string(std::get<3>(args.param)); + return oss.str(); + }); + +TEST_P(DataBlockKVChecksumTest, 
ChecksumConstructionAndVerification) { + uint8_t protection_bytes_per_key = GetChecksumLen(); + std::vector num_restart_intervals = {1, 16}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr data_block = + GenerateDataBlock(keys, values, kNumRecords); + + const char *checksum_ptr = data_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, + checksum_ptr + i * protection_bytes_per_key, + keys[i], values[i])); + } + std::vector seqnos{kDisableGlobalSequenceNumber, 0}; + + // Could just use a boolean flag. Use a counter here just to keep open the + // possibility of checking the exact number of verifications in the future. + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). So verification count is incremented even with + // protection_bytes_per_key = 0. No actual checksum computation is done in + // that case (see Block::VerifyChecksum()). + SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + for (const auto seqno : seqnos) { + std::unique_ptr biter{ + data_block->NewDataIterator(Options().comparator, seqno)}; + + // SeekForGet() some key that does not exist + biter->SeekForGet(keys[kNumRecords]); + TestIterateForward(biter, verification_count); + + verification_count = 0; + biter->SeekForGet(keys[kNumRecords / 2]); + ASSERT_GE(verification_count, 1); + TestIterateForward(biter, verification_count); + + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]); + TestSeek(biter, verification_count, keys[kNumRecords / 2]); + } + } +} + +class IndexBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface< + std::tuple> { + public: + IndexBlockKVChecksumTest() = default; + + BlockBasedTableOptions::DataBlockIndexType GetDataBlockIndexType() const { + return std::get<0>(GetParam()); + } + uint8_t GetChecksumLen() const { return std::get<1>(GetParam()); } + uint32_t GetRestartInterval() const { return std::get<2>(GetParam()); } + bool UseValueDeltaEncoding() const { return std::get<3>(GetParam()); } + bool IncludeFirstKey() const { return std::get<4>(GetParam()); } + + std::unique_ptr GenerateIndexBlock( + std::vector &separators, + std::vector &block_handles, + std::vector &first_keys, int num_record) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{ + &tbo, + nullptr /* ioptions */, + nullptr /* statistics */, + false /* _using_zstd */, + protection_bytes_per_key, + options.comparator, + !UseValueDeltaEncoding() /* value_is_full */, + IncludeFirstKey()}; + builder_ = std::make_unique( + static_cast(GetRestartInterval()), true /* use_delta_encoding */, + UseValueDeltaEncoding() /* use_value_delta_encoding */, + 
GetDataBlockIndexType()); + BlockHandle last_encoded_handle; + for (int i = 0; i < num_record; i++) { + IndexValue entry(block_handles[i], first_keys[i]); + std::string encoded_entry; + std::string delta_encoded_entry; + entry.EncodeTo(&encoded_entry, IncludeFirstKey(), nullptr); + if (UseValueDeltaEncoding() && i > 0) { + entry.EncodeTo(&delta_encoded_entry, IncludeFirstKey(), + &last_encoded_handle); + } + + last_encoded_handle = entry.handle; + const Slice delta_encoded_entry_slice(delta_encoded_entry); + builder_->Add(separators[i], encoded_entry, &delta_encoded_entry_slice); + } + // read serialized contents of the block + Slice raw_block = builder_->Finish(); + // create block reader + BlockContents contents; + contents.data = raw_block; + std::unique_ptr index_block; + + create_context.Create(&index_block, std::move(contents)); + return index_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P( + P, IndexBlockKVChecksumTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(0, 1, 2, 4, 8), ::testing::Values(1, 3, 8, 16), + ::testing::Values(true, false), ::testing::Values(true, false)), + [](const testing::TestParamInfo< + std::tuple> &args) { + std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode" + << std::to_string(std::get<3>(args.param)) << "IncludeFirstKey" + << std::to_string(std::get<4>(args.param)); + return oss.str(); + }); + +TEST_P(IndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { + Options options = Options(); + uint8_t protection_bytes_per_key = GetChecksumLen(); + std::vector num_restart_intervals = {1, 16}; + std::vector seqnos{kDisableGlobalSequenceNumber, 10001}; + + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + for (const auto seqno : seqnos) { + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords, 0 /* ts_sz */, + seqno != kDisableGlobalSequenceNumber); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr index_block = GenerateIndexBlock( + separators, block_handles, first_keys, kNumRecords); + IndexBlockIter *kNullIter = nullptr; + Statistics *kNullStats = nullptr; + // read contents of block sequentially + std::unique_ptr biter{index_block->NewIndexIterator( + options.comparator, seqno, kNullIter, kNullStats, + true /* total_order_seek */, IncludeFirstKey() /* have_first_key */, + true /* key_includes_seq */, + !UseValueDeltaEncoding() /* value_is_full */, + true /* block_contents_pinned*/, + true /* user_defined_timestamps_persisted */, + nullptr /* prefix_index */)}; + biter->SeekToFirst(); + const char *checksum_ptr = index_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + // Obtaining the actual content written as value to index block is not + // trivial: delta-encoded value is only persisted when not at block + // restart point and that keys share some byte (see more in + // BlockBuilder::AddWithLastKeyImpl()). 
So here we just do verification + // using value from iterator unlike tests for DataBlockIter or + // MetaBlockIter. + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, checksum_ptr, + biter->key(), biter->raw_value())); + } + + size_t verification_count = 0; + // The SyncPoint is placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). To make the testing code below simpler and not + // having to differentiate 0 vs non-0 checksum_len, we do an explicit + // assert checking on checksum_len here. + SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeek(biter, verification_count, first_keys[kNumRecords / 2]); + } + } +} + +class MetaIndexBlockKVChecksumTest + : public BlockPerKVChecksumTest, + public testing::WithParamInterface< + uint8_t /* block_protection_bytes_per_key */> { + public: + MetaIndexBlockKVChecksumTest() = default; + uint8_t GetChecksumLen() const { return GetParam(); } + uint32_t GetRestartInterval() const { return 1; } + + std::unique_ptr GenerateMetaIndexBlock( + std::vector &keys, std::vector &values, + int num_record) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{&tbo, + nullptr /* ioptions */, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator}; + builder_ = + std::make_unique(static_cast(GetRestartInterval())); + // add a bunch of records to a block + for (int i = 0; i < num_record; i++) { + builder_->Add(keys[i], values[i]); + } + Slice raw_block = builder_->Finish(); + BlockContents contents; + contents.data = raw_block; + std::unique_ptr meta_block; + create_context.Create(&meta_block, std::move(contents)); + return meta_block; + } + + std::unique_ptr builder_; +}; + +INSTANTIATE_TEST_CASE_P(P, MetaIndexBlockKVChecksumTest, + ::testing::Values(0, 1, 2, 4, 8), + [](const testing::TestParamInfo &args) { + std::ostringstream oss; + oss << "ProtBytes" << std::to_string(args.param); + return oss.str(); + }); + +TEST_P(MetaIndexBlockKVChecksumTest, ChecksumConstructionAndVerification) { + Options options = Options(); + BlockBasedTableOptions tbo; + uint8_t protection_bytes_per_key = GetChecksumLen(); + BlockCreateContext create_context{&tbo, + nullptr /* ioptions */, + nullptr /* statistics */, + false /* using_zstd */, + protection_bytes_per_key, + options.comparator}; + std::vector num_restart_intervals = {1, 16}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = num_restart_interval * GetRestartInterval(); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->DisableProcessing(); + std::unique_ptr meta_block = + GenerateMetaIndexBlock(keys, values, kNumRecords); + const char *checksum_ptr = meta_block->TEST_GetKVChecksum(); + // Check checksum of correct length is generated + for (int i = 0; i < kNumRecords; i++) { + ASSERT_TRUE(VerifyChecksum(protection_bytes_per_key, + checksum_ptr + i * protection_bytes_per_key, + keys[i], values[i])); + } + + size_t verification_count = 0; + // The SyncPoint is 
placed before checking checksum_len == 0 in + // Block::VerifyChecksum(). To make the testing code below simpler and not + // having to differentiate 0 vs non-0 checksum_len, we do an explicit assert + // checking on checksum_len here. + SyncPoint::GetInstance()->SetCallBack( + "Block::VerifyChecksum::checksum_len", + [&verification_count, protection_bytes_per_key](void *checksum_len) { + ASSERT_EQ((*static_cast(checksum_len)), + protection_bytes_per_key); + ++verification_count; + }); + SyncPoint::GetInstance()->EnableProcessing(); + // Check that block iterator does checksum verification + std::unique_ptr biter{ + meta_block->NewMetaIterator(true /* block_contents_pinned */)}; + TestSeekToFirst(biter, verification_count); + TestSeekToLast(biter, verification_count); + TestSeek(biter, verification_count, keys[kNumRecords / 2]); + TestSeekForPrev(biter, verification_count, keys[kNumRecords / 2]); + } +} + +class DataBlockKVChecksumCorruptionTest : public DataBlockKVChecksumTest { + public: + DataBlockKVChecksumCorruptionTest() = default; + + std::unique_ptr GenerateDataBlockIter( + std::vector &keys, std::vector &values, + int num_record) { + // During Block construction, we may create block iter to initialize per kv + // checksum. Disable syncpoint that may be created for block iter methods. + SyncPoint::GetInstance()->DisableProcessing(); + block_ = GenerateDataBlock(keys, values, num_record); + std::unique_ptr biter{block_->NewDataIterator( + Options().comparator, kDisableGlobalSequenceNumber)}; + SyncPoint::GetInstance()->EnableProcessing(); + return biter; + } + + protected: + std::unique_ptr block_; +}; + +TEST_P(DataBlockKVChecksumCorruptionTest, CorruptEntry) { + std::vector num_restart_intervals = {1, 3}; + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->SetCallBack( + "BlockIter::UpdateKey::value", [](void *arg) { + char *value = static_cast(arg); + // values generated by GenerateRandomKVs are of length 100 + ++value[10]; + }); + + // Purely for reducing the number of lines of code. 
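    // Editorial aside (not part of the patch): the typedefs below let one
    // helper exercise every positioning API. Seek-style APIs are passed as
    // plain lambdas taking the iterator and a key, while the step APIs
    // (Next/Prev) are passed as pointer-to-member functions and dispatched via
    // std::invoke, which dereferences the smart pointer for us, e.g.:
    //   std::invoke(&DataBlockIter::Next, biter);  // equivalent to biter->Next()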
+ typedef std::unique_ptr IterPtr; + typedef void(IterAPI)(IterPtr & iter, std::string &); + + std::string seek_key = keys[kNumRecords / 2]; + auto test_seek = [&](IterAPI iter_api) { + IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords); + ASSERT_OK(biter->status()); + iter_api(biter, seek_key); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); + test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); + test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); }); + test_seek([](IterPtr &iter, std::string &k) { iter->SeekForGet(k); }); + + typedef void (DataBlockIter::*IterStepAPI)(); + auto test_step = [&](IterStepAPI iter_api, std::string &k) { + IterPtr biter = GenerateDataBlockIter(keys, values, kNumRecords); + SyncPoint::GetInstance()->DisableProcessing(); + biter->Seek(k); + ASSERT_TRUE(biter->Valid()); + ASSERT_OK(biter->status()); + SyncPoint::GetInstance()->EnableProcessing(); + std::invoke(iter_api, biter); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + if (kNumRecords > 1) { + test_step(&DataBlockIter::Prev, seek_key); + test_step(&DataBlockIter::Next, seek_key); + } + } +} + +INSTANTIATE_TEST_CASE_P( + P, DataBlockKVChecksumCorruptionTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(4, 8) /* block_protection_bytes_per_key */, + ::testing::Values(1, 3, 8, 16) /* restart_interval */, + ::testing::Values(false, true)), + [](const testing::TestParamInfo> + &args) { + std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "DeltaEncode" + << std::to_string(std::get<3>(args.param)); + return oss.str(); + }); + +class IndexBlockKVChecksumCorruptionTest : public IndexBlockKVChecksumTest { + public: + IndexBlockKVChecksumCorruptionTest() = default; + + std::unique_ptr GenerateIndexBlockIter( + std::vector &separators, + std::vector &block_handles, + std::vector &first_keys, int num_record, + SequenceNumber seqno) { + SyncPoint::GetInstance()->DisableProcessing(); + block_ = + GenerateIndexBlock(separators, block_handles, first_keys, num_record); + std::unique_ptr biter{block_->NewIndexIterator( + Options().comparator, seqno, nullptr, nullptr, + true /* total_order_seek */, IncludeFirstKey() /* have_first_key */, + true /* key_includes_seq */, + !UseValueDeltaEncoding() /* value_is_full */, + true /* block_contents_pinned */, + true /* user_defined_timestamps_persisted */, + nullptr /* prefix_index */)}; + SyncPoint::GetInstance()->EnableProcessing(); + return biter; + } + + protected: + std::unique_ptr block_; +}; + +INSTANTIATE_TEST_CASE_P( + P, IndexBlockKVChecksumCorruptionTest, + ::testing::Combine( + ::testing::Values( + BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinarySearch, + BlockBasedTableOptions::DataBlockIndexType:: + kDataBlockBinaryAndHash), + ::testing::Values(4, 8) /* block_protection_bytes_per_key */, + ::testing::Values(1, 3, 8, 16) /* restart_interval */, + ::testing::Values(true, false), ::testing::Values(true, false)), + [](const testing::TestParamInfo< + std::tuple> &args) { + 
std::ostringstream oss; + oss << GetDataBlockIndexTypeStr(std::get<0>(args.param)) << "ProtBytes" + << std::to_string(std::get<1>(args.param)) << "RestartInterval" + << std::to_string(std::get<2>(args.param)) << "ValueDeltaEncode" + << std::to_string(std::get<3>(args.param)) << "IncludeFirstKey" + << std::to_string(std::get<4>(args.param)); + return oss.str(); + }); + +TEST_P(IndexBlockKVChecksumCorruptionTest, CorruptEntry) { + std::vector num_restart_intervals = {1, 3}; + std::vector seqnos{kDisableGlobalSequenceNumber, 10001}; + + for (const auto num_restart_interval : num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + for (const auto seqno : seqnos) { + std::vector separators; + std::vector block_handles; + std::vector first_keys; + GenerateRandomIndexEntries(&separators, &block_handles, &first_keys, + kNumRecords, 0 /* ts_sz */, + seqno != kDisableGlobalSequenceNumber); + SyncPoint::GetInstance()->SetCallBack( + "BlockIter::UpdateKey::value", [](void *arg) { + char *value = static_cast(arg); + // value can be delta-encoded with different lengths, so we corrupt + // first bytes here to be safe + ++value[0]; + }); + + typedef std::unique_ptr IterPtr; + typedef void(IterAPI)(IterPtr & iter, std::string &); + std::string seek_key = first_keys[kNumRecords / 2]; + auto test_seek = [&](IterAPI iter_api) { + std::unique_ptr biter = GenerateIndexBlockIter( + separators, block_handles, first_keys, kNumRecords, seqno); + ASSERT_OK(biter->status()); + iter_api(biter, seek_key); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); + test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); + test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + + typedef void (IndexBlockIter::*IterStepAPI)(); + auto test_step = [&](IterStepAPI iter_api, std::string &k) { + std::unique_ptr biter = GenerateIndexBlockIter( + separators, block_handles, first_keys, kNumRecords, seqno); + SyncPoint::GetInstance()->DisableProcessing(); + biter->Seek(k); + ASSERT_TRUE(biter->Valid()); + ASSERT_OK(biter->status()); + SyncPoint::GetInstance()->EnableProcessing(); + std::invoke(iter_api, biter); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + if (kNumRecords > 1) { + test_step(&IndexBlockIter::Prev, seek_key); + test_step(&IndexBlockIter::Next, seek_key); + } + } + } +} + +class MetaIndexBlockKVChecksumCorruptionTest + : public MetaIndexBlockKVChecksumTest { + public: + MetaIndexBlockKVChecksumCorruptionTest() = default; + + std::unique_ptr GenerateMetaIndexBlockIter( + std::vector &keys, std::vector &values, + int num_record) { + SyncPoint::GetInstance()->DisableProcessing(); + block_ = GenerateMetaIndexBlock(keys, values, num_record); + std::unique_ptr biter{ + block_->NewMetaIterator(true /* block_contents_pinned */)}; + SyncPoint::GetInstance()->EnableProcessing(); + return biter; + } + + protected: + std::unique_ptr block_; +}; + +INSTANTIATE_TEST_CASE_P( + P, MetaIndexBlockKVChecksumCorruptionTest, + ::testing::Values(4, 8) /* block_protection_bytes_per_key */, + [](const testing::TestParamInfo &args) { + std::ostringstream oss; + oss << "ProtBytes" << std::to_string(args.param); + return oss.str(); + }); + +TEST_P(MetaIndexBlockKVChecksumCorruptionTest, CorruptEntry) { + Options options = Options(); + std::vector num_restart_intervals = {1, 3}; + for (const auto num_restart_interval : 
num_restart_intervals) { + const int kNumRecords = + num_restart_interval * static_cast(GetRestartInterval()); + std::vector keys; + std::vector values; + GenerateRandomKVs(&keys, &values, 0, kNumRecords + 1, 1 /* step */, + 24 /* padding_size */); + SyncPoint::GetInstance()->SetCallBack( + "BlockIter::UpdateKey::value", [](void *arg) { + char *value = static_cast(arg); + // values generated by GenerateRandomKVs are of length 100 + ++value[10]; + }); + + typedef std::unique_ptr IterPtr; + typedef void(IterAPI)(IterPtr & iter, std::string &); + typedef void (MetaBlockIter::*IterStepAPI)(); + std::string seek_key = keys[kNumRecords / 2]; + auto test_seek = [&](IterAPI iter_api) { + IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords); + ASSERT_OK(biter->status()); + iter_api(biter, seek_key); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + test_seek([](IterPtr &iter, std::string &) { iter->SeekToFirst(); }); + test_seek([](IterPtr &iter, std::string &) { iter->SeekToLast(); }); + test_seek([](IterPtr &iter, std::string &k) { iter->Seek(k); }); + test_seek([](IterPtr &iter, std::string &k) { iter->SeekForPrev(k); }); + + auto test_step = [&](IterStepAPI iter_api, const std::string &k) { + IterPtr biter = GenerateMetaIndexBlockIter(keys, values, kNumRecords); + SyncPoint::GetInstance()->DisableProcessing(); + biter->Seek(k); + ASSERT_TRUE(biter->Valid()); + ASSERT_OK(biter->status()); + SyncPoint::GetInstance()->EnableProcessing(); + std::invoke(iter_api, biter); + ASSERT_FALSE(biter->Valid()); + ASSERT_TRUE(biter->status().IsCorruption()); + }; + + if (kNumRecords > 1) { + test_step(&MetaBlockIter::Prev, seek_key); + test_step(&MetaBlockIter::Next, seek_key); + } + } +} } // namespace ROCKSDB_NAMESPACE int main(int argc, char **argv) { diff --git a/table/block_based/cachable_entry.h b/table/block_based/cachable_entry.h index 464dc8ebaf41..3cd1bb807ae4 100644 --- a/table/block_based/cachable_entry.h +++ b/table/block_based/cachable_entry.h @@ -13,7 +13,7 @@ #include #include "port/likely.h" -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/cleanable.h" namespace ROCKSDB_NAMESPACE { @@ -176,22 +176,6 @@ class CachableEntry { assert(!own_value_); } - void UpdateCachedValue() { - assert(cache_ != nullptr); - assert(cache_handle_ != nullptr); - - value_ = static_cast(cache_->Value(cache_handle_)); - } - - bool IsReady() { - if (!own_value_) { - assert(cache_ != nullptr); - assert(cache_handle_ != nullptr); - return cache_->IsReady(cache_handle_); - } - return true; - } - // Since this class is essentially an elaborate pointer, it's sometimes // useful to be able to upcast or downcast the base type of the pointer, // especially when interacting with typed_cache.h. 
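For orientation, the per key-value checksum pieces exercised by the block_test.cc changes above compose into one small flow: build a block, parse it through a BlockCreateContext that carries protection_bytes_per_key, then let the block iterator re-verify each entry as it is positioned. The condensed sketch below restates that flow outside the test fixtures; it is illustrative only (not part of the patch) and assumes the Block_kData parsed-block type, the BlockCreateContext field order, and the headers used by those tests.

// Illustrative only; mirrors the block_test.cc changes above.
void PerKVChecksumRoundTripSketch(const std::vector<std::string>& keys,
                                  const std::vector<std::string>& values) {
  BlockBuilder builder(16 /* block_restart_interval */);
  for (size_t i = 0; i < keys.size(); ++i) {
    builder.Add(keys[i], values[i]);  // keys are internal keys
  }
  Slice raw_block = builder.Finish();
  BlockContents contents;
  contents.data = raw_block;

  BlockBasedTableOptions tbo;
  uint8_t protection_bytes_per_key = 8;  // 0 would disable per-KV checksums
  BlockCreateContext create_context{&tbo,
                                    nullptr /* ioptions */,
                                    nullptr /* statistics */,
                                    false /* using_zstd */,
                                    protection_bytes_per_key,
                                    Options().comparator};
  std::unique_ptr<Block_kData> data_block;
  create_context.Create(&data_block, std::move(contents));

  std::unique_ptr<DataBlockIter> iter{data_block->NewDataIterator(
      Options().comparator, kDisableGlobalSequenceNumber)};
  for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
    // Each positioning call re-derives the entry's checksum; a mismatch makes
    // the iterator invalid with a Corruption status instead of returning data.
  }
  // iter->status() is either OK or Corruption here.
}

The corruption tests above then flip a byte through the "BlockIter::UpdateKey::value" sync point and expect exactly this loop shape to stop with status().IsCorruption().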
diff --git a/table/block_based/data_block_hash_index_test.cc b/table/block_based/data_block_hash_index_test.cc index cd2e30833ddc..2841b271dea2 100644 --- a/table/block_based/data_block_hash_index_test.cc +++ b/table/block_based/data_block_hash_index_test.cc @@ -581,8 +581,9 @@ void TestBoundary(InternalKey& ik1, std::string& v1, InternalKey& ik2, const bool kImmortal = true; ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, - internal_comparator, !kSkipFilters, !kImmortal, - level_), + internal_comparator, + 0 /* block_protection_bytes_per_key */, !kSkipFilters, + !kImmortal, level_), std::move(file_reader), sink->contents().size(), &table_reader)); // Search using Get() ReadOptions ro; diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index e1e206990646..b14858c02093 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -113,17 +113,17 @@ class FilterBlockReader { const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) = 0; + const ReadOptions& read_options) = 0; virtual void KeysMayMatch(MultiGetRange* range, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; if (!KeyMayMatch(ukey_without_ts, no_io, &ikey, get_context, - lookup_context, rate_limiter_priority)) { + lookup_context, read_options)) { range->SkipKey(iter); } } @@ -136,21 +136,20 @@ class FilterBlockReader { const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) = 0; + const ReadOptions& read_options) = 0; virtual void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options) { for (auto iter = range->begin(); iter != range->end(); ++iter) { const Slice ukey_without_ts = iter->ukey_without_ts; const Slice ikey = iter->ikey; GetContext* const get_context = iter->get_context; if (prefix_extractor->InDomain(ukey_without_ts) && !PrefixMayMatch(prefix_extractor->Transform(ukey_without_ts), no_io, - &ikey, get_context, lookup_context, - rate_limiter_priority)) { + &ikey, get_context, lookup_context, read_options)) { range->SkipKey(iter); } } @@ -164,7 +163,9 @@ class FilterBlockReader { return error_msg; } - virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /*pin*/, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { return Status::OK(); } @@ -176,7 +177,7 @@ class FilterBlockReader { bool* filter_checked, bool need_upper_bound_check, bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) = 0; + const ReadOptions& read_options) = 0; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/filter_block_reader_common.cc b/table/block_based/filter_block_reader_common.cc index 838fb5296a1f..e459b00ad5c0 100644 --- a/table/block_based/filter_block_reader_common.cc +++ b/table/block_based/filter_block_reader_common.cc @@ -28,12 +28,12 @@ Status 
FilterBlockReaderCommon::ReadFilterBlock( const BlockBasedTable::Rep* const rep = table->get_rep(); assert(rep); - const Status s = - table->RetrieveBlock(prefetch_buffer, read_options, rep->filter_handle, - UncompressionDict::GetEmptyDict(), filter_block, - get_context, lookup_context, - /* for_compaction */ false, use_cache, - /* wait_for_cache */ true, /* async_read */ false); + const Status s = table->RetrieveBlock( + prefetch_buffer, read_options, rep->filter_handle, + UncompressionDict::GetEmptyDict(), filter_block, get_context, + lookup_context, + /* for_compaction */ false, use_cache, + /* async_read */ false, /* use_block_cache_for_lookup */ true); return s; } @@ -70,7 +70,7 @@ Status FilterBlockReaderCommon::GetOrReadFilterBlock( bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* filter_block, - Env::IOPriority rate_limiter_priority) const { + const ReadOptions& read_options) const { assert(filter_block); if (!filter_block_.IsEmpty()) { @@ -78,13 +78,12 @@ Status FilterBlockReaderCommon::GetOrReadFilterBlock( return Status::OK(); } - ReadOptions read_options; - read_options.rate_limiter_priority = rate_limiter_priority; + ReadOptions ro = read_options; if (no_io) { - read_options.read_tier = kBlockCacheTier; + ro.read_tier = kBlockCacheTier; } - return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, read_options, + return ReadFilterBlock(table_, nullptr /* prefetch_buffer */, ro, cache_filter_blocks(), get_context, lookup_context, filter_block); } @@ -104,8 +103,7 @@ bool FilterBlockReaderCommon::RangeMayExist( const SliceTransform* prefix_extractor, const Comparator* comparator, const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, bool no_io, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) { if (!prefix_extractor || !prefix_extractor->InDomain(user_key_without_ts)) { *filter_checked = false; return true; @@ -119,7 +117,7 @@ bool FilterBlockReaderCommon::RangeMayExist( *filter_checked = true; return PrefixMayMatch(prefix, no_io, const_ikey_ptr, /* get_context */ nullptr, lookup_context, - rate_limiter_priority); + read_options); } } diff --git a/table/block_based/filter_block_reader_common.h b/table/block_based/filter_block_reader_common.h index 5c2fbdcea7be..62335b30be9c 100644 --- a/table/block_based/filter_block_reader_common.h +++ b/table/block_based/filter_block_reader_common.h @@ -40,7 +40,7 @@ class FilterBlockReaderCommon : public FilterBlockReader { const Slice* const const_ikey_ptr, bool* filter_checked, bool need_upper_bound_check, bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; protected: static Status ReadFilterBlock(const BlockBasedTable* table, @@ -58,7 +58,7 @@ class FilterBlockReaderCommon : public FilterBlockReader { Status GetOrReadFilterBlock(bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, CachableEntry* filter_block, - Env::IOPriority rate_limiter_priority) const; + const ReadOptions& read_options) const; size_t ApproximateFilterBlockMemoryUsage() const; diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc index f84f804dd6cc..19b880a900ae 100644 --- a/table/block_based/filter_policy.cc +++ b/table/block_based/filter_policy.cc @@ -10,6 +10,7 @@ #include "rocksdb/filter_policy.h" #include +#include #include 
#include #include @@ -24,6 +25,7 @@ #include "rocksdb/rocksdb_namespace.h" #include "rocksdb/slice.h" #include "rocksdb/utilities/object_registry.h" +#include "rocksdb/utilities/options_type.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/filter_policy_internal.h" #include "table/block_based/full_filter_block.h" @@ -1730,7 +1732,15 @@ const FilterPolicy* NewBloomFilterPolicy(double bits_per_key, RibbonFilterPolicy::RibbonFilterPolicy(double bloom_equivalent_bits_per_key, int bloom_before_level) : BloomLikeFilterPolicy(bloom_equivalent_bits_per_key), - bloom_before_level_(bloom_before_level) {} + bloom_before_level_(bloom_before_level) { + static const std::unordered_map type_info = { + {"bloom_before_level", + {offsetof(class RibbonFilterPolicy, bloom_before_level_), + OptionType::kAtomicInt, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, + }; + RegisterOptions(this, &type_info); +} FilterBitsBuilder* RibbonFilterPolicy::GetBuilderWithContext( const FilterBuildingContext& context) const { @@ -1738,31 +1748,38 @@ FilterBitsBuilder* RibbonFilterPolicy::GetBuilderWithContext( // "No filter" special case return nullptr; } - // Treat unknown same as bottommost - int levelish = INT_MAX; - - switch (context.compaction_style) { - case kCompactionStyleLevel: - case kCompactionStyleUniversal: { - if (context.reason == TableFileCreationReason::kFlush) { - // Treat flush as level -1 - assert(context.level_at_creation == 0); - levelish = -1; - } else if (context.level_at_creation == -1) { - // Unknown level - assert(levelish == INT_MAX); - } else { - levelish = context.level_at_creation; + // Treat unknown same as bottommost, INT_MAX - 1. + // INT_MAX is reserved for "always use Bloom". + int levelish = INT_MAX - 1; + + int bloom_before_level = bloom_before_level_.load(std::memory_order_relaxed); + if (bloom_before_level < INT_MAX) { + switch (context.compaction_style) { + case kCompactionStyleLevel: + case kCompactionStyleUniversal: { + if (context.reason == TableFileCreationReason::kFlush) { + // Treat flush as level -1 + assert(context.level_at_creation == 0); + levelish = -1; + } else if (context.level_at_creation == -1) { + // Unknown level + assert(levelish == INT_MAX - 1); + } else { + levelish = context.level_at_creation; + } + break; } - break; + case kCompactionStyleFIFO: + case kCompactionStyleNone: + // Treat as bottommost + assert(levelish == INT_MAX - 1); + break; } - case kCompactionStyleFIFO: - case kCompactionStyleNone: - // Treat as bottommost - assert(levelish == INT_MAX); - break; + } else { + // INT_MAX == always Bloom + assert(levelish < bloom_before_level); } - if (levelish < bloom_before_level_) { + if (levelish < bloom_before_level) { return GetFastLocalBloomBuilderWithContext(context); } else { return GetStandard128RibbonBuilderWithContext(context); @@ -1771,14 +1788,15 @@ FilterBitsBuilder* RibbonFilterPolicy::GetBuilderWithContext( const char* RibbonFilterPolicy::kClassName() { return "ribbonfilter"; } const char* RibbonFilterPolicy::kNickName() { return "rocksdb.RibbonFilter"; } +const char* RibbonFilterPolicy::kName() { return "RibbonFilterPolicy"; } std::string RibbonFilterPolicy::GetId() const { return BloomLikeFilterPolicy::GetId() + ":" + - std::to_string(bloom_before_level_); + std::to_string(bloom_before_level_.load(std::memory_order_acquire)); } -const FilterPolicy* NewRibbonFilterPolicy(double bloom_equivalent_bits_per_key, - int bloom_before_level) { +FilterPolicy* NewRibbonFilterPolicy(double 
bloom_equivalent_bits_per_key, + int bloom_before_level) { return new RibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level); } @@ -1809,7 +1827,6 @@ std::shared_ptr BloomLikeFilterPolicy::Create( } } -#ifndef ROCKSDB_LITE namespace { static ObjectLibrary::PatternEntry FilterPatternEntryWithBits( const char* name) { @@ -1918,7 +1935,6 @@ static int RegisterBuiltinFilterPolicies(ObjectLibrary& library, return static_cast(library.GetFactoryCount(&num_types)); } } // namespace -#endif // ROCKSDB_LITE Status FilterPolicy::CreateFromString( const ConfigOptions& options, const std::string& value, @@ -1940,16 +1956,11 @@ Status FilterPolicy::CreateFromString( } else if (id.empty()) { // We have no Id but have options. Not good return Status::NotSupported("Cannot reset object ", id); } else { -#ifndef ROCKSDB_LITE static std::once_flag loaded; std::call_once(loaded, [&]() { RegisterBuiltinFilterPolicies(*(ObjectLibrary::Default().get()), ""); }); status = options.registry->NewSharedObject(id, policy); -#else - status = - Status::NotSupported("Cannot load filter policy in LITE mode ", value); -#endif // ROCKSDB_LITE } if (options.ignore_unsupported_options && status.IsNotSupported()) { return Status::OK(); diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h index 9bc3a24829b1..3919c8c6d285 100644 --- a/table/block_based/filter_policy_internal.h +++ b/table/block_based/filter_policy_internal.h @@ -290,10 +290,11 @@ class RibbonFilterPolicy : public BloomLikeFilterPolicy { const char* Name() const override { return kClassName(); } static const char* kNickName(); const char* NickName() const override { return kNickName(); } + static const char* kName(); std::string GetId() const override; private: - const int bloom_before_level_; + std::atomic bloom_before_level_; }; // For testing only, but always constructable with internal names diff --git a/table/block_based/flush_block_policy.cc b/table/block_based/flush_block_policy.cc index 9bb1f334b348..d5cc310013f2 100644 --- a/table/block_based/flush_block_policy.cc +++ b/table/block_based/flush_block_policy.cc @@ -13,7 +13,7 @@ #include "rocksdb/utilities/customizable_util.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" -#include "table/block_based/flush_block_policy.h" +#include "table/block_based/flush_block_policy_impl.h" #include "table/format.h" namespace ROCKSDB_NAMESPACE { @@ -89,7 +89,6 @@ FlushBlockPolicy* FlushBlockBySizePolicyFactory::NewFlushBlockPolicy( return new FlushBlockBySizePolicy(size, deviation, false, data_block_builder); } -#ifndef ROCKSDB_LITE static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -110,23 +109,6 @@ static int RegisterFlushBlockPolicyFactories(ObjectLibrary& library, }); return 2; } -#endif // ROCKSDB_LITE - -static bool LoadFlushPolicyFactory( - const std::string& id, std::shared_ptr* result) { - if (id.empty()) { - result->reset(new FlushBlockBySizePolicyFactory()); -#ifdef ROCKSDB_LITE - } else if (id == FlushBlockBySizePolicyFactory::kClassName()) { - result->reset(new FlushBlockBySizePolicyFactory()); - } else if (id == FlushBlockEveryKeyPolicyFactory::kClassName()) { - result->reset(new FlushBlockEveryKeyPolicyFactory()); -#endif // ROCKSDB_LITE - } else { - return false; - } - return true; -} FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory() : FlushBlockPolicyFactory() {} @@ -134,13 +116,17 @@ 
FlushBlockBySizePolicyFactory::FlushBlockBySizePolicyFactory() Status FlushBlockPolicyFactory::CreateFromString( const ConfigOptions& config_options, const std::string& value, std::shared_ptr* factory) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterFlushBlockPolicyFactories(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE - return LoadSharedObject( - config_options, value, LoadFlushPolicyFactory, factory); + + if (value.empty()) { + factory->reset(new FlushBlockBySizePolicyFactory()); + return Status::OK(); + } else { + return LoadSharedObject(config_options, value, + factory); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/flush_block_policy.h b/table/block_based/flush_block_policy_impl.h similarity index 100% rename from table/block_based/flush_block_policy.h rename to table/block_based/flush_block_policy_impl.h diff --git a/table/block_based/full_filter_block.cc b/table/block_based/full_filter_block.cc index a7680e494de9..60ff7c44f39f 100644 --- a/table/block_based/full_filter_block.cc +++ b/table/block_based/full_filter_block.cc @@ -127,12 +127,11 @@ bool FullFilterBlockReader::KeyMayMatch(const Slice& key, const bool no_io, const Slice* const /*const_ikey_ptr*/, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options) { if (!whole_key_filtering()) { return true; } - return MayMatch(key, no_io, get_context, lookup_context, - rate_limiter_priority); + return MayMatch(key, no_io, get_context, lookup_context, read_options); } std::unique_ptr FullFilterBlockReader::Create( @@ -165,20 +164,18 @@ std::unique_ptr FullFilterBlockReader::Create( bool FullFilterBlockReader::PrefixMayMatch( const Slice& prefix, const bool no_io, const Slice* const /*const_ikey_ptr*/, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { - return MayMatch(prefix, no_io, get_context, lookup_context, - rate_limiter_priority); + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) { + return MayMatch(prefix, no_io, get_context, lookup_context, read_options); } -bool FullFilterBlockReader::MayMatch( - const Slice& entry, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const { +bool FullFilterBlockReader::MayMatch(const Slice& entry, bool no_io, + GetContext* get_context, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) const { CachableEntry filter_block; const Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, - &filter_block, rate_limiter_priority); + &filter_block, read_options); if (!s.ok()) { IGNORE_STATUS_IF_ERROR(s); return true; @@ -203,33 +200,31 @@ bool FullFilterBlockReader::MayMatch( void FullFilterBlockReader::KeysMayMatch( MultiGetRange* range, const bool no_io, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) { if (!whole_key_filtering()) { // Simply return. 
Don't skip any key - consider all keys as likely to be // present return; } - MayMatch(range, no_io, nullptr, lookup_context, rate_limiter_priority); + MayMatch(range, no_io, nullptr, lookup_context, read_options); } void FullFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* prefix_extractor, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { - MayMatch(range, no_io, prefix_extractor, lookup_context, - rate_limiter_priority); + const ReadOptions& read_options) { + MayMatch(range, no_io, prefix_extractor, lookup_context, read_options); } -void FullFilterBlockReader::MayMatch( - MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const { +void FullFilterBlockReader::MayMatch(MultiGetRange* range, bool no_io, + const SliceTransform* prefix_extractor, + BlockCacheLookupContext* lookup_context, + const ReadOptions& read_options) const { CachableEntry filter_block; const Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context, - &filter_block, rate_limiter_priority); + &filter_block, read_options); if (!s.ok()) { IGNORE_STATUS_IF_ERROR(s); return; diff --git a/table/block_based/full_filter_block.h b/table/block_based/full_filter_block.h index cd1771a388c7..7b0890d10c53 100644 --- a/table/block_based/full_filter_block.h +++ b/table/block_based/full_filter_block.h @@ -108,40 +108,40 @@ class FullFilterBlockReader bool KeyMayMatch(const Slice& key, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; bool PrefixMayMatch(const Slice& prefix, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; void KeysMayMatch(MultiGetRange* range, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; // Used in partitioned filter code void KeysMayMatch2(MultiGetRange* range, const SliceTransform* /*prefix_extractor*/, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { - KeysMayMatch(range, no_io, lookup_context, rate_limiter_priority); + const ReadOptions& read_options) { + KeysMayMatch(range, no_io, lookup_context, read_options); } void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; size_t ApproximateMemoryUsage() const override; private: bool MayMatch(const Slice& entry, bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const; + const ReadOptions& read_options) const; void MayMatch(MultiGetRange* range, bool no_io, const SliceTransform* prefix_extractor, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) const; + const ReadOptions& read_options) const; }; } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index bd98638e5b64..0268b7b27150 100644 --- 
a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -117,7 +117,7 @@ TEST_F(PluginFullFilterBlockTest, PluginEmptyBuilder) { ASSERT_TRUE(reader.KeyMayMatch("foo", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, Env::IO_TOTAL)); + /*lookup_context=*/nullptr, ReadOptions())); } TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { @@ -135,42 +135,34 @@ TEST_F(PluginFullFilterBlockTest, PluginSingleChunk) { nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); FullFilterBlockReader reader(table_.get(), std::move(block)); - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; ASSERT_TRUE(reader.KeyMayMatch("foo", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("bar", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("box", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("hello", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("foo", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(!reader.KeyMayMatch("missing", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(!reader.KeyMayMatch("other", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); } class FullFilterBlockTest : public mock::MockBlockBasedTableTester, @@ -195,7 +187,7 @@ TEST_F(FullFilterBlockTest, EmptyBuilder) { ASSERT_TRUE(reader.KeyMayMatch("foo", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, Env::IO_TOTAL)); + /*lookup_context=*/nullptr, ReadOptions())); } class CountUniqueFilterBitsBuilderWrapper : public FilterBitsBuilder { @@ -292,42 +284,34 @@ TEST_F(FullFilterBlockTest, SingleChunk) { nullptr /* cache */, nullptr /* cache_handle */, true /* own_value */); FullFilterBlockReader reader(table_.get(), std::move(block)); - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; ASSERT_TRUE(reader.KeyMayMatch("foo", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("bar", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("box", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("hello", /*no_io=*/false, 
/*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(reader.KeyMayMatch("foo", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(!reader.KeyMayMatch("missing", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); ASSERT_TRUE(!reader.KeyMayMatch("other", /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + /*lookup_context=*/nullptr, ReadOptions())); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index bcaba17a2514..5b710a768e5a 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -74,17 +74,17 @@ Status HashIndexReader::Create(const BlockBasedTable* table, // Read contents for the blocks BlockContents prefixes_contents; BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents, + ioptions, true /*decompress*/, true /*maybe_compressed*/, + BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } BlockContents prefixes_meta_contents; BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + file, prefetch_buffer, footer, ro, prefixes_meta_handle, &prefixes_meta_contents, ioptions, true /*decompress*/, true /*maybe_compressed*/, BlockType::kHashIndexMetadata, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); @@ -116,9 +116,8 @@ InternalIteratorBase* HashIndexReader::NewIterator( const BlockBasedTable::Rep* rep = table()->get_rep(); const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority, - get_context, lookup_context, &index_block); + const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context, + &index_block, read_options); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -138,7 +137,7 @@ InternalIteratorBase* HashIndexReader::NewIterator( rep->get_global_seqno(BlockType::kIndex), iter, kNullStats, total_order_seek, index_has_first_key(), index_key_includes_seq(), index_value_is_full(), false /* block_contents_pinned */, - prefix_index_.get()); + user_defined_timestamps_persisted(), prefix_index_.get()); assert(it != nullptr); index_block.TransferTo(it); diff --git a/table/block_based/index_builder.cc b/table/block_based/index_builder.cc index 02473017823b..a9e02a287274 100644 --- a/table/block_based/index_builder.cc +++ b/table/block_based/index_builder.cc @@ -29,14 +29,16 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, const bool use_value_delta_encoding, - const BlockBasedTableOptions& table_opt) 
{ + const BlockBasedTableOptions& table_opt, size_t ts_sz, + const bool persist_user_defined_timestamps) { IndexBuilder* result = nullptr; switch (index_type) { case BlockBasedTableOptions::kBinarySearch: { result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening, /* include_first_key */ false); + table_opt.index_shortening, /* include_first_key */ false, ts_sz, + persist_user_defined_timestamps); break; } case BlockBasedTableOptions::kHashSearch: { @@ -46,19 +48,22 @@ IndexBuilder* IndexBuilder::CreateIndexBuilder( result = new HashIndexBuilder( comparator, int_key_slice_transform, table_opt.index_block_restart_interval, table_opt.format_version, - use_value_delta_encoding, table_opt.index_shortening); + use_value_delta_encoding, table_opt.index_shortening, ts_sz, + persist_user_defined_timestamps); break; } case BlockBasedTableOptions::kTwoLevelIndexSearch: { result = PartitionedIndexBuilder::CreateIndexBuilder( - comparator, use_value_delta_encoding, table_opt); + comparator, use_value_delta_encoding, table_opt, ts_sz, + persist_user_defined_timestamps); break; } case BlockBasedTableOptions::kBinarySearchWithFirstKey: { result = new ShortenedIndexBuilder( comparator, table_opt.index_block_restart_interval, table_opt.format_version, use_value_delta_encoding, - table_opt.index_shortening, /* include_first_key */ true); + table_opt.index_shortening, /* include_first_key */ true, ts_sz, + persist_user_defined_timestamps); break; } default: { @@ -106,22 +111,31 @@ void ShortenedIndexBuilder::FindShortInternalKeySuccessor( PartitionedIndexBuilder* PartitionedIndexBuilder::CreateIndexBuilder( const InternalKeyComparator* comparator, const bool use_value_delta_encoding, - const BlockBasedTableOptions& table_opt) { + const BlockBasedTableOptions& table_opt, size_t ts_sz, + const bool persist_user_defined_timestamps) { return new PartitionedIndexBuilder(comparator, table_opt, - use_value_delta_encoding); + use_value_delta_encoding, ts_sz, + persist_user_defined_timestamps); } PartitionedIndexBuilder::PartitionedIndexBuilder( const InternalKeyComparator* comparator, const BlockBasedTableOptions& table_opt, - const bool use_value_delta_encoding) - : IndexBuilder(comparator), - index_block_builder_(table_opt.index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - index_block_builder_without_seq_(table_opt.index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), + const bool use_value_delta_encoding, size_t ts_sz, + const bool persist_user_defined_timestamps) + : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps), + index_block_builder_( + table_opt.index_block_restart_interval, true /*use_delta_encoding*/, + use_value_delta_encoding, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps, false /* is_user_key */), + index_block_builder_without_seq_( + table_opt.index_block_restart_interval, true /*use_delta_encoding*/, + use_value_delta_encoding, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps, true /* is_user_key */), sub_index_builder_(nullptr), table_opt_(table_opt), // We start by false. 
After each partition we revise the value based on @@ -142,7 +156,8 @@ void PartitionedIndexBuilder::MakeNewSubIndexBuilder() { sub_index_builder_ = new ShortenedIndexBuilder( comparator_, table_opt_.index_block_restart_interval, table_opt_.format_version, use_value_delta_encoding_, - table_opt_.index_shortening, /* include_first_key */ false); + table_opt_.index_shortening, /* include_first_key */ false, ts_sz_, + persist_user_defined_timestamps_); // Set sub_index_builder_->seperator_is_key_plus_seq_ to true if // seperator_is_key_plus_seq_ is true (internal-key mode) (set to false by diff --git a/table/block_based/index_builder.h b/table/block_based/index_builder.h index dd3be03316a8..be690d7997f0 100644 --- a/table/block_based/index_builder.h +++ b/table/block_based/index_builder.h @@ -9,13 +9,12 @@ #pragma once -#include - #include #include #include #include +#include "db/dbformat.h" #include "rocksdb/comparator.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_builder.h" @@ -36,10 +35,10 @@ class IndexBuilder { public: static IndexBuilder* CreateIndexBuilder( BlockBasedTableOptions::IndexType index_type, - const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, + const InternalKeyComparator* comparator, const InternalKeySliceTransform* int_key_slice_transform, - const bool use_value_delta_encoding, - const BlockBasedTableOptions& table_opt); + bool use_value_delta_encoding, const BlockBasedTableOptions& table_opt, + size_t ts_sz, bool persist_user_defined_timestamps); // Index builder will construct a set of blocks which contain: // 1. One primary index block. @@ -49,10 +48,13 @@ class IndexBuilder { Slice index_block_contents; std::unordered_map meta_blocks; }; - explicit IndexBuilder(const InternalKeyComparator* comparator) - : comparator_(comparator) {} + IndexBuilder(const InternalKeyComparator* comparator, size_t ts_sz, + bool persist_user_defined_timestamps) + : comparator_(comparator), + ts_sz_(ts_sz), + persist_user_defined_timestamps_(persist_user_defined_timestamps) {} - virtual ~IndexBuilder() {} + virtual ~IndexBuilder() = default; // Add a new index entry to index block. // To allow further optimization, we provide `last_key_in_current_block` and @@ -104,7 +106,33 @@ class IndexBuilder { virtual bool seperator_is_key_plus_seq() { return true; } protected: + // Given the last key in current block and the first key in the next block, + // return true if internal key should be used as separator, false if user key + // can be used as separator. + inline bool ShouldUseKeyPlusSeqAsSeparator( + const Slice& last_key_in_current_block, + const Slice& first_key_in_next_block) { + Slice l_user_key = ExtractUserKey(last_key_in_current_block); + Slice r_user_key = ExtractUserKey(first_key_in_next_block); + // If user defined timestamps are not persisted. All the user keys will + // act like they have minimal timestamp. Only having user key is not + // sufficient, even if they are different user keys for now, they have to be + // different user keys without the timestamp part. + return persist_user_defined_timestamps_ + ? comparator_->user_comparator()->Compare(l_user_key, + r_user_key) == 0 + : comparator_->user_comparator()->CompareWithoutTimestamp( + l_user_key, r_user_key) == 0; + } + const InternalKeyComparator* comparator_; + // Size of user-defined timestamp in bytes. + size_t ts_sz_; + // Whether user-defined timestamp in the user key should be persisted when + // creating index block. 
If this flag is false, user-defined timestamp will + // be stripped from user key for each index entry, and the + // `first_internal_key` in `IndexValue` if it's included. + bool persist_user_defined_timestamps_; // Set after ::Finish is called size_t index_size_ = 0; }; @@ -120,19 +148,26 @@ class IndexBuilder { // substitute key that serves the same function. class ShortenedIndexBuilder : public IndexBuilder { public: - explicit ShortenedIndexBuilder( + ShortenedIndexBuilder( const InternalKeyComparator* comparator, const int index_block_restart_interval, const uint32_t format_version, const bool use_value_delta_encoding, BlockBasedTableOptions::IndexShorteningMode shortening_mode, - bool include_first_key) - : IndexBuilder(comparator), - index_block_builder_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - index_block_builder_without_seq_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), + bool include_first_key, size_t ts_sz, + const bool persist_user_defined_timestamps) + : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps), + index_block_builder_( + index_block_restart_interval, true /*use_delta_encoding*/, + use_value_delta_encoding, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps, false /* is_user_key */), + index_block_builder_without_seq_( + index_block_restart_interval, true /*use_delta_encoding*/, + use_value_delta_encoding, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps, true /* is_user_key */), use_value_delta_encoding_(use_value_delta_encoding), include_first_key_(include_first_key), shortening_mode_(shortening_mode) { @@ -140,15 +175,15 @@ class ShortenedIndexBuilder : public IndexBuilder { seperator_is_key_plus_seq_ = (format_version <= 2); } - virtual void OnKeyAdded(const Slice& key) override { + void OnKeyAdded(const Slice& key) override { if (include_first_key_ && current_block_first_internal_key_.empty()) { current_block_first_internal_key_.assign(key.data(), key.size()); } } - virtual void AddIndexEntry(std::string* last_key_in_current_block, - const Slice* first_key_in_next_block, - const BlockHandle& block_handle) override { + void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { if (first_key_in_next_block != nullptr) { if (shortening_mode_ != BlockBasedTableOptions::IndexShorteningMode::kNoShortening) { @@ -157,9 +192,8 @@ class ShortenedIndexBuilder : public IndexBuilder { *first_key_in_next_block); } if (!seperator_is_key_plus_seq_ && - comparator_->user_comparator()->Compare( - ExtractUserKey(*last_key_in_current_block), - ExtractUserKey(*first_key_in_next_block)) == 0) { + ShouldUseKeyPlusSeqAsSeparator(*last_key_in_current_block, + *first_key_in_next_block)) { seperator_is_key_plus_seq_ = true; } } else { @@ -172,7 +206,19 @@ class ShortenedIndexBuilder : public IndexBuilder { auto sep = Slice(*last_key_in_current_block); assert(!include_first_key_ || !current_block_first_internal_key_.empty()); - IndexValue entry(block_handle, current_block_first_internal_key_); + // When UDT should not be persisted, the index block builders take care of + // stripping UDT from the key, for the first internal key contained in the + // IndexValue, we need to explicitly do the 
stripping here before passing + // it to the block builders. + std::string first_internal_key_buf; + Slice first_internal_key = current_block_first_internal_key_; + if (!current_block_first_internal_key_.empty() && ts_sz_ > 0 && + !persist_user_defined_timestamps_) { + StripTimestampFromInternalKey(&first_internal_key_buf, + current_block_first_internal_key_, ts_sz_); + first_internal_key = first_internal_key_buf; + } + IndexValue entry(block_handle, first_internal_key); std::string encoded_entry; std::string delta_encoded_entry; entry.EncodeTo(&encoded_entry, include_first_key_, nullptr); @@ -185,6 +231,16 @@ class ShortenedIndexBuilder : public IndexBuilder { } last_encoded_handle_ = block_handle; const Slice delta_encoded_entry_slice(delta_encoded_entry); + + // TODO(yuzhangyu): fix this when "FindShortInternalKeySuccessor" + // optimization is available. + // Timestamp aware comparator currently doesn't provide override for + // "FindShortInternalKeySuccessor" optimization. So the actual + // last key in current block is used as the key for indexing the current + // block. As a result, when UDTs should not be persisted, it's safe to strip + // away the UDT from key in index block as data block does the same thing. + // What are the implications if a "FindShortInternalKeySuccessor" + // optimization is provided. index_block_builder_.Add(sep, encoded_entry, &delta_encoded_entry_slice); if (!seperator_is_key_plus_seq_) { index_block_builder_without_seq_.Add(ExtractUserKey(sep), encoded_entry, @@ -195,9 +251,8 @@ class ShortenedIndexBuilder : public IndexBuilder { } using IndexBuilder::Finish; - virtual Status Finish( - IndexBlocks* index_blocks, - const BlockHandle& /*last_partition_block_handle*/) override { + Status Finish(IndexBlocks* index_blocks, + const BlockHandle& /*last_partition_block_handle*/) override { if (seperator_is_key_plus_seq_) { index_blocks->index_block_contents = index_block_builder_.Finish(); } else { @@ -208,9 +263,9 @@ class ShortenedIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t IndexSize() const override { return index_size_; } + size_t IndexSize() const override { return index_size_; } - virtual bool seperator_is_key_plus_seq() override { + bool seperator_is_key_plus_seq() override { return seperator_is_key_plus_seq_; } @@ -265,27 +320,28 @@ class ShortenedIndexBuilder : public IndexBuilder { // data copy or small heap allocations for prefixes. 
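The ShortenedIndexBuilder change above strips the user-defined timestamp (UDT) from the block's first internal key when persist_user_defined_timestamps is false, so index entries match the timestamp-stripped keys written to the data blocks. A self-contained toy of that layout, not RocksDB's StripTimestampFromInternalKey; the 8-byte footer is the standard packed sequence-number/type trailer, the rest is illustrative:

    #include <cassert>
    #include <cstddef>
    #include <string>

    // Toy internal-key layout: user_key | timestamp (ts_sz bytes) | 8-byte
    // footer (sequence number + value type). Stripping the UDT removes the
    // ts_sz bytes sitting just before the footer.
    std::string StripTsFromInternalKeyToy(const std::string& ikey, size_t ts_sz) {
      constexpr size_t kFooterSize = 8;
      assert(ikey.size() >= ts_sz + kFooterSize);
      std::string out = ikey.substr(0, ikey.size() - kFooterSize - ts_sz);
      out += ikey.substr(ikey.size() - kFooterSize);  // keep the footer intact
      return out;
    }
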
class HashIndexBuilder : public IndexBuilder { public: - explicit HashIndexBuilder( - const InternalKeyComparator* comparator, - const SliceTransform* hash_key_extractor, - int index_block_restart_interval, int format_version, - bool use_value_delta_encoding, - BlockBasedTableOptions::IndexShorteningMode shortening_mode) - : IndexBuilder(comparator), + HashIndexBuilder(const InternalKeyComparator* comparator, + const SliceTransform* hash_key_extractor, + int index_block_restart_interval, int format_version, + bool use_value_delta_encoding, + BlockBasedTableOptions::IndexShorteningMode shortening_mode, + size_t ts_sz, const bool persist_user_defined_timestamps) + : IndexBuilder(comparator, ts_sz, persist_user_defined_timestamps), primary_index_builder_(comparator, index_block_restart_interval, format_version, use_value_delta_encoding, - shortening_mode, /* include_first_key */ false), + shortening_mode, /* include_first_key */ false, + ts_sz, persist_user_defined_timestamps), hash_key_extractor_(hash_key_extractor) {} - virtual void AddIndexEntry(std::string* last_key_in_current_block, - const Slice* first_key_in_next_block, - const BlockHandle& block_handle) override { + void AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override { ++current_restart_index_; primary_index_builder_.AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); } - virtual void OnKeyAdded(const Slice& key) override { + void OnKeyAdded(const Slice& key) override { auto key_prefix = hash_key_extractor_->Transform(key); bool is_first_entry = pending_block_num_ == 0; @@ -312,9 +368,8 @@ class HashIndexBuilder : public IndexBuilder { } } - virtual Status Finish( - IndexBlocks* index_blocks, - const BlockHandle& last_partition_block_handle) override { + Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override { if (pending_block_num_ != 0) { FlushPendingPrefix(); } @@ -327,12 +382,12 @@ class HashIndexBuilder : public IndexBuilder { return s; } - virtual size_t IndexSize() const override { + size_t IndexSize() const override { return primary_index_builder_.IndexSize() + prefix_block_.size() + prefix_meta_block_.size(); } - virtual bool seperator_is_key_plus_seq() override { + bool seperator_is_key_plus_seq() override { return primary_index_builder_.seperator_is_key_plus_seq(); } @@ -377,25 +432,25 @@ class HashIndexBuilder : public IndexBuilder { class PartitionedIndexBuilder : public IndexBuilder { public: static PartitionedIndexBuilder* CreateIndexBuilder( - const ROCKSDB_NAMESPACE::InternalKeyComparator* comparator, - const bool use_value_delta_encoding, - const BlockBasedTableOptions& table_opt); + const InternalKeyComparator* comparator, bool use_value_delta_encoding, + const BlockBasedTableOptions& table_opt, size_t ts_sz, + bool persist_user_defined_timestamps); - explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, - const BlockBasedTableOptions& table_opt, - const bool use_value_delta_encoding); + PartitionedIndexBuilder(const InternalKeyComparator* comparator, + const BlockBasedTableOptions& table_opt, + bool use_value_delta_encoding, size_t ts_sz, + bool persist_user_defined_timestamps); - virtual ~PartitionedIndexBuilder(); + ~PartitionedIndexBuilder() override; - virtual void AddIndexEntry(std::string* last_key_in_current_block, - const Slice* first_key_in_next_block, - const BlockHandle& block_handle) override; + void 
AddIndexEntry(std::string* last_key_in_current_block, + const Slice* first_key_in_next_block, + const BlockHandle& block_handle) override; - virtual Status Finish( - IndexBlocks* index_blocks, - const BlockHandle& last_partition_block_handle) override; + Status Finish(IndexBlocks* index_blocks, + const BlockHandle& last_partition_block_handle) override; - virtual size_t IndexSize() const override { return index_size_; } + size_t IndexSize() const override { return index_size_; } size_t TopLevelIndexSize(uint64_t) const { return top_level_index_size_; } size_t NumPartitions() const; @@ -414,11 +469,13 @@ class PartitionedIndexBuilder : public IndexBuilder { // cutting the next partition void RequestPartitionCut(); - virtual bool seperator_is_key_plus_seq() override { + bool seperator_is_key_plus_seq() override { return seperator_is_key_plus_seq_; } - bool get_use_value_delta_encoding() { return use_value_delta_encoding_; } + bool get_use_value_delta_encoding() const { + return use_value_delta_encoding_; + } private: // Set after ::Finish is called diff --git a/table/block_based/index_reader_common.cc b/table/block_based/index_reader_common.cc index 46c276e6be09..a1b05c2d69ff 100644 --- a/table/block_based/index_reader_common.cc +++ b/table/block_based/index_reader_common.cc @@ -26,18 +26,18 @@ Status BlockBasedTable::IndexReaderCommon::ReadIndexBlock( assert(rep != nullptr); const Status s = table->RetrieveBlock( - prefetch_buffer, read_options, rep->footer.index_handle(), + prefetch_buffer, read_options, rep->index_handle, UncompressionDict::GetEmptyDict(), &index_block->As(), get_context, lookup_context, /* for_compaction */ false, use_cache, - /* wait_for_cache */ true, /* async_read */ false); + /* async_read */ false, /* use_block_cache_for_lookup */ true); return s; } Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( - bool no_io, Env::IOPriority rate_limiter_priority, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) const { + bool no_io, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* index_block, + const ReadOptions& ro) const { assert(index_block != nullptr); if (!index_block_.IsEmpty()) { @@ -45,8 +45,7 @@ Status BlockBasedTable::IndexReaderCommon::GetOrReadIndexBlock( return Status::OK(); } - ReadOptions read_options; - read_options.rate_limiter_priority = rate_limiter_priority; + ReadOptions read_options = ro; if (no_io) { read_options.read_tier = kBlockCacheTier; } diff --git a/table/block_based/index_reader_common.h b/table/block_based/index_reader_common.h index 5627b0eeb379..1aa7cbb2db9f 100644 --- a/table/block_based/index_reader_common.h +++ b/table/block_based/index_reader_common.h @@ -65,10 +65,16 @@ class BlockBasedTable::IndexReaderCommon : public BlockBasedTable::IndexReader { return table_->get_rep()->table_options.cache_index_and_filter_blocks; } - Status GetOrReadIndexBlock(bool no_io, Env::IOPriority rate_limiter_priority, - GetContext* get_context, + bool user_defined_timestamps_persisted() const { + assert(table_ != nullptr); + assert(table_->get_rep() != nullptr); + return table_->get_rep()->user_defined_timestamps_persisted; + } + + Status GetOrReadIndexBlock(bool no_io, GetContext* get_context, BlockCacheLookupContext* lookup_context, - CachableEntry* index_block) const; + CachableEntry* index_block, + const ReadOptions& read_options) const; size_t ApproximateIndexBlockMemoryUsage() const { assert(!index_block_.GetOwnValue() || index_block_.GetValue() != 
nullptr); diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 092446f022d0..c908db41d354 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -26,15 +26,22 @@ PartitionedFilterBlockBuilder::PartitionedFilterBlockBuilder( FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, - const uint32_t partition_size) + const uint32_t partition_size, size_t ts_sz, + const bool persist_user_defined_timestamps) : FullFilterBlockBuilder(_prefix_extractor, whole_key_filtering, filter_bits_builder), - index_on_filter_block_builder_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), - index_on_filter_block_builder_without_seq_(index_block_restart_interval, - true /*use_delta_encoding*/, - use_value_delta_encoding), + index_on_filter_block_builder_( + index_block_restart_interval, true /*use_delta_encoding*/, + use_value_delta_encoding, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps, false /* is_user_key */), + index_on_filter_block_builder_without_seq_( + index_block_restart_interval, true /*use_delta_encoding*/, + use_value_delta_encoding, + BlockBasedTableOptions::kDataBlockBinarySearch /* index_type */, + 0.75 /* data_block_hash_table_util_ratio */, ts_sz, + persist_user_defined_timestamps, true /* is_user_key */), p_index_builder_(p_index_builder), keys_added_to_partition_(0), total_added_in_built_(0) { @@ -220,45 +227,43 @@ std::unique_ptr PartitionedFilterBlockReader::Create( bool PartitionedFilterBlockReader::KeyMayMatch( const Slice& key, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options) { assert(const_ikey_ptr != nullptr); if (!whole_key_filtering()) { return true; } return MayMatch(key, no_io, const_ikey_ptr, get_context, lookup_context, - rate_limiter_priority, &FullFilterBlockReader::KeyMayMatch); + read_options, &FullFilterBlockReader::KeyMayMatch); } void PartitionedFilterBlockReader::KeysMayMatch( MultiGetRange* range, const bool no_io, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options) { if (!whole_key_filtering()) { return; // Any/all may match } - MayMatch(range, nullptr, no_io, lookup_context, rate_limiter_priority, + MayMatch(range, nullptr, no_io, lookup_context, read_options, &FullFilterBlockReader::KeysMayMatch2); } bool PartitionedFilterBlockReader::PrefixMayMatch( const Slice& prefix, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options) { assert(const_ikey_ptr != nullptr); return MayMatch(prefix, no_io, const_ikey_ptr, get_context, lookup_context, - rate_limiter_priority, - &FullFilterBlockReader::PrefixMayMatch); + read_options, &FullFilterBlockReader::PrefixMayMatch); } void PartitionedFilterBlockReader::PrefixesMayMatch( MultiGetRange* range, const SliceTransform* prefix_extractor, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) { + const ReadOptions& read_options) { 
assert(prefix_extractor); - MayMatch(range, prefix_extractor, no_io, lookup_context, - rate_limiter_priority, &FullFilterBlockReader::PrefixesMayMatch); + MayMatch(range, prefix_extractor, no_io, lookup_context, read_options, + &FullFilterBlockReader::PrefixesMayMatch); } BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( @@ -272,7 +277,8 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( table()->get_rep()->get_global_seqno(BlockType::kFilterPartitionIndex), &iter, kNullStats, true /* total_order_seek */, false /* have_first_key */, index_key_includes_seq(), - index_value_is_full()); + index_value_is_full(), false /* block_contents_pinned */, + user_defined_timestamps_persisted()); iter.Seek(entry); if (UNLIKELY(!iter.Valid())) { // entry is larger than all the keys. However its prefix might still be @@ -290,8 +296,7 @@ BlockHandle PartitionedFilterBlockReader::GetFilterPartitionHandle( Status PartitionedFilterBlockReader::GetFilterPartitionBlock( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& fltr_blk_handle, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + BlockCacheLookupContext* lookup_context, const ReadOptions& _read_options, CachableEntry* filter_block) const { assert(table()); assert(filter_block); @@ -307,18 +312,17 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock( } } - ReadOptions read_options; - read_options.rate_limiter_priority = rate_limiter_priority; + ReadOptions read_options = _read_options; if (no_io) { read_options.read_tier = kBlockCacheTier; } - const Status s = - table()->RetrieveBlock(prefetch_buffer, read_options, fltr_blk_handle, - UncompressionDict::GetEmptyDict(), filter_block, - get_context, lookup_context, - /* for_compaction */ false, /* use_cache */ true, - /* wait_for_cache */ true, /* async_read */ false); + const Status s = table()->RetrieveBlock( + prefetch_buffer, read_options, fltr_blk_handle, + UncompressionDict::GetEmptyDict(), filter_block, get_context, + lookup_context, + /* for_compaction */ false, /* use_cache */ true, + /* async_read */ false, /* use_block_cache_for_lookup */ true); return s; } @@ -326,11 +330,10 @@ Status PartitionedFilterBlockReader::GetFilterPartitionBlock( bool PartitionedFilterBlockReader::MayMatch( const Slice& slice, bool no_io, const Slice* const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, - FilterFunction filter_function) const { + const ReadOptions& read_options, FilterFunction filter_function) const { CachableEntry filter_block; Status s = GetOrReadFilterBlock(no_io, get_context, lookup_context, - &filter_block, rate_limiter_priority); + &filter_block, read_options); if (UNLIKELY(!s.ok())) { IGNORE_STATUS_IF_ERROR(s); return true; @@ -347,8 +350,8 @@ bool PartitionedFilterBlockReader::MayMatch( CachableEntry filter_partition_block; s = GetFilterPartitionBlock(nullptr /* prefetch_buffer */, filter_handle, - no_io, get_context, lookup_context, - rate_limiter_priority, &filter_partition_block); + no_io, get_context, lookup_context, read_options, + &filter_partition_block); if (UNLIKELY(!s.ok())) { IGNORE_STATUS_IF_ERROR(s); return true; @@ -356,20 +359,17 @@ bool PartitionedFilterBlockReader::MayMatch( FullFilterBlockReader filter_partition(table(), std::move(filter_partition_block)); - return (filter_partition.*filter_function)(slice, no_io, const_ikey_ptr, - get_context, lookup_context, - rate_limiter_priority); + 
return (filter_partition.*filter_function)( + slice, no_io, const_ikey_ptr, get_context, lookup_context, read_options); } void PartitionedFilterBlockReader::MayMatch( MultiGetRange* range, const SliceTransform* prefix_extractor, bool no_io, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, FilterManyFunction filter_function) const { CachableEntry filter_block; - Status s = - GetOrReadFilterBlock(no_io, range->begin()->get_context, lookup_context, - &filter_block, rate_limiter_priority); + Status s = GetOrReadFilterBlock(no_io, range->begin()->get_context, + lookup_context, &filter_block, read_options); if (UNLIKELY(!s.ok())) { IGNORE_STATUS_IF_ERROR(s); return; // Any/all may match @@ -393,7 +393,7 @@ void PartitionedFilterBlockReader::MayMatch( this_filter_handle != prev_filter_handle) { MultiGetRange subrange(*range, start_iter_same_handle, iter); MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io, - lookup_context, rate_limiter_priority, filter_function); + lookup_context, read_options, filter_function); range->AddSkipsFrom(subrange); start_iter_same_handle = iter; } @@ -409,7 +409,7 @@ void PartitionedFilterBlockReader::MayMatch( if (!prev_filter_handle.IsNull()) { MultiGetRange subrange(*range, start_iter_same_handle, range->end()); MayMatchPartition(&subrange, prefix_extractor, prev_filter_handle, no_io, - lookup_context, rate_limiter_priority, filter_function); + lookup_context, read_options, filter_function); range->AddSkipsFrom(subrange); } } @@ -417,13 +417,12 @@ void PartitionedFilterBlockReader::MayMatch( void PartitionedFilterBlockReader::MayMatchPartition( MultiGetRange* range, const SliceTransform* prefix_extractor, BlockHandle filter_handle, bool no_io, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, FilterManyFunction filter_function) const { CachableEntry filter_partition_block; Status s = GetFilterPartitionBlock( nullptr /* prefetch_buffer */, filter_handle, no_io, - range->begin()->get_context, lookup_context, rate_limiter_priority, + range->begin()->get_context, lookup_context, read_options, &filter_partition_block); if (UNLIKELY(!s.ok())) { IGNORE_STATUS_IF_ERROR(s); @@ -433,7 +432,7 @@ void PartitionedFilterBlockReader::MayMatchPartition( FullFilterBlockReader filter_partition(table(), std::move(filter_partition_block)); (filter_partition.*filter_function)(range, prefix_extractor, no_io, - lookup_context, rate_limiter_priority); + lookup_context, read_options); } size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { @@ -448,8 +447,8 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -460,8 +459,7 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, CachableEntry filter_block; Status s = GetOrReadFilterBlock(false /* no_io */, nullptr /* get_context */, - &lookup_context, &filter_block, - ro.rate_limiter_priority); + &lookup_context, &filter_block, ro); if (!s.ok()) { 
ROCKS_LOG_ERROR(rep->ioptions.logger, "Error retrieving top-level filter block while trying to " @@ -480,7 +478,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, comparator->user_comparator(), rep->get_global_seqno(BlockType::kFilterPartitionIndex), &biter, kNullStats, true /* total_order_seek */, false /* have_first_key */, - index_key_includes_seq(), index_value_is_full()); + index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, user_defined_timestamps_persisted()); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -494,21 +493,24 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); - - IOOptions opts; - s = rep->file->PrepareIOOptions(ro, opts); - if (s.ok()) { - s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, - static_cast(prefetch_len), - ro.rate_limiter_priority); - } - if (!s.ok()) { - return s; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled() || + tail_prefetch_buffer->GetPrefetchOffset() > prefetch_off) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/, + /*upper_bound_offset*/ 0, /*readaheadsize_cb*/ nullptr, + /*usage=*/FilePrefetchBufferUsage::kUnknown); + + IOOptions opts; + s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + } + if (!s.ok()) { + return s; + } } - // After prefetch, read the partitions one by one for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; @@ -517,10 +519,11 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - /* wait */ true, /* for_compaction */ false, &block, - nullptr /* get_context */, &lookup_context, nullptr /* contents */, - false); + prefetch_buffer ? 
prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), + /* for_compaction */ false, &block, nullptr /* get_context */, + &lookup_context, nullptr /* contents */, false, + /* use_block_cache_for_lookup */ true); if (!s.ok()) { return s; } @@ -559,4 +562,10 @@ bool PartitionedFilterBlockReader::index_value_is_full() const { return table()->get_rep()->index_value_is_full; } +bool PartitionedFilterBlockReader::user_defined_timestamps_persisted() const { + assert(table()); + assert(table()->get_rep()); + + return table()->get_rep()->user_defined_timestamps_persisted; +} } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index e810c01eeb34..817fe94245a0 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -31,7 +31,8 @@ class PartitionedFilterBlockBuilder : public FullFilterBlockBuilder { FilterBitsBuilder* filter_bits_builder, int index_block_restart_interval, const bool use_value_delta_encoding, PartitionedIndexBuilder* const p_index_builder, - const uint32_t partition_size); + const uint32_t partition_size, size_t ts_sz, + const bool persist_user_defined_timestamps); virtual ~PartitionedFilterBlockBuilder(); @@ -115,21 +116,21 @@ class PartitionedFilterBlockReader bool KeyMayMatch(const Slice& key, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; void KeysMayMatch(MultiGetRange* range, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; bool PrefixMayMatch(const Slice& prefix, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; void PrefixesMayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority) override; + const ReadOptions& read_options) override; size_t ApproximateMemoryUsage() const override; @@ -140,38 +141,39 @@ class PartitionedFilterBlockReader Status GetFilterPartitionBlock( FilePrefetchBuffer* prefetch_buffer, const BlockHandle& handle, bool no_io, GetContext* get_context, - BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, CachableEntry* filter_block) const; using FilterFunction = bool (FullFilterBlockReader::*)( const Slice& slice, const bool no_io, const Slice* const const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority); + const ReadOptions& read_options); bool MayMatch(const Slice& slice, bool no_io, const Slice* const_ikey_ptr, GetContext* get_context, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + const ReadOptions& read_options, FilterFunction filter_function) const; using FilterManyFunction = void (FullFilterBlockReader::*)( MultiGetRange* range, const SliceTransform* prefix_extractor, const bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority); + const ReadOptions& read_options); void 
MayMatch(MultiGetRange* range, const SliceTransform* prefix_extractor, bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + const ReadOptions& read_options, FilterManyFunction filter_function) const; void MayMatchPartition(MultiGetRange* range, const SliceTransform* prefix_extractor, BlockHandle filter_handle, bool no_io, BlockCacheLookupContext* lookup_context, - Env::IOPriority rate_limiter_priority, + const ReadOptions& read_options, FilterManyFunction filter_function) const; - Status CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; bool index_value_is_full() const; + bool user_defined_timestamps_persisted() const; protected: // For partition blocks pinned in cache. Can be a subset of blocks diff --git a/table/block_based/partitioned_filter_block_test.cc b/table/block_based/partitioned_filter_block_test.cc index 59445c45e0cd..1d6e2fced841 100644 --- a/table/block_based/partitioned_filter_block_test.cc +++ b/table/block_based/partitioned_filter_block_test.cc @@ -57,7 +57,8 @@ class MyPartitionedFilterBlockReader : public PartitionedFilterBlockReader { class PartitionedFilterBlockTest : public testing::Test, - virtual public ::testing::WithParamInterface { + virtual public ::testing::WithParamInterface< + std::tuple> { public: Options options_; ImmutableOptions ioptions_; @@ -67,38 +68,64 @@ class PartitionedFilterBlockTest std::unique_ptr table_; std::shared_ptr cache_; int bits_per_key_; + size_t ts_sz_; + bool user_defined_timestamps_persisted_; - PartitionedFilterBlockTest() - : ioptions_(options_), - env_options_(options_), - icomp_(options_.comparator), - bits_per_key_(10) { + PartitionedFilterBlockTest() : bits_per_key_(10) { + auto udt_test_mode = std::get<1>(GetParam()); + if (test::IsUDTEnabled(udt_test_mode)) { + options_.comparator = test::BytewiseComparatorWithU64TsWrapper(); + } + ts_sz_ = options_.comparator->timestamp_size(); + user_defined_timestamps_persisted_ = test::ShouldPersistUDT(udt_test_mode); + icomp_ = InternalKeyComparator(options_.comparator); + env_options_ = EnvOptions(options_); + ioptions_ = ImmutableOptions(options_); table_options_.filter_policy.reset( NewBloomFilterPolicy(bits_per_key_, false)); - table_options_.format_version = GetParam(); + table_options_.format_version = std::get<0>(GetParam()); table_options_.index_block_restart_interval = 3; } ~PartitionedFilterBlockTest() override {} - const std::string keys[4] = {"afoo", "bar", "box", "hello"}; - const std::string missing_keys[2] = {"missing", "other"}; + static constexpr int kKeyNum = 4; + static constexpr int kMissingKeyNum = 2; + const std::string keys_without_ts[kKeyNum] = {"afoo", "bar", "box", "hello"}; + const std::string missing_keys_without_ts[kMissingKeyNum] = {"missing", + "other"}; + + std::vector PrepareKeys(const std::string* orig_keys, + int number_of_keys) { + std::vector user_keys; + if (ts_sz_ == 0) { + user_keys.assign(orig_keys, orig_keys + number_of_keys); + } else { + for (int i = 0; i < number_of_keys; i++) { + std::string key_with_ts; + AppendKeyWithMinTimestamp(&key_with_ts, orig_keys[i], ts_sz_); + user_keys.push_back(std::move(key_with_ts)); + } + } + return user_keys; + } uint64_t MaxIndexSize() { - int num_keys = sizeof(keys) / sizeof(*keys); uint64_t max_key_size = 0; - for (int i = 1; i < num_keys; i++) { - 
max_key_size = - std::max(max_key_size, static_cast(keys[i].size())); + for (int i = 0; i < kKeyNum; i++) { + // If UDT is enabled, the size of each key would be increased by a + // timestamp size. + max_key_size = std::max( + max_key_size, static_cast(keys_without_ts[i].size()) + + ts_sz_ * sizeof(static_cast(0))); } - uint64_t max_index_size = num_keys * (max_key_size + 8 /*handle*/); + uint64_t max_index_size = kKeyNum * (max_key_size + 8 /*handle*/); return max_index_size; } uint64_t MaxFilterSize() { - int num_keys = sizeof(keys) / sizeof(*keys); // General, rough over-approximation - return num_keys * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); + return kKeyNum * bits_per_key_ + (CACHE_LINE_SIZE * 8 + /*metadata*/ 5); } uint64_t last_offset = 10; @@ -112,7 +139,8 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* NewIndexBuilder() { const bool kValueDeltaEncoded = true; return PartitionedIndexBuilder::CreateIndexBuilder( - &icomp_, !kValueDeltaEncoded, table_options_); + &icomp_, !kValueDeltaEncoded, table_options_, ts_sz_, + user_defined_timestamps_persisted_); } PartitionedFilterBlockBuilder* NewBuilder( @@ -131,7 +159,8 @@ class PartitionedFilterBlockTest BloomFilterPolicy::GetBuilderFromContext( FilterBuildingContext(table_options_)), table_options_.index_block_restart_interval, !kValueDeltaEncoded, - p_index_builder, partition_size); + p_index_builder, partition_size, ts_sz_, + user_defined_timestamps_persisted_); } PartitionedFilterBlockReader* NewReader( @@ -152,7 +181,8 @@ class PartitionedFilterBlockTest table_.reset(new MockedBlockBasedTable( new BlockBasedTable::Rep(ioptions_, env_options_, table_options_, icomp_, skip_filters, file_size, level, - immortal_table), + immortal_table, + user_defined_timestamps_persisted_), pib)); BlockContents contents(slice); CachableEntry block( @@ -167,41 +197,43 @@ class PartitionedFilterBlockTest PartitionedIndexBuilder* pib, bool empty = false) { std::unique_ptr reader( NewReader(builder, pib)); - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; // Querying added keys const bool no_io = true; + std::vector keys = PrepareKeys(keys_without_ts, kKeyNum); for (auto key : keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(key, !no_io, &ikey_slice, - /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + ASSERT_TRUE(reader->KeyMayMatch( + StripTimestampFromUserKey(key, ts_sz_), !no_io, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, ReadOptions())); } { // querying a key twice auto ikey = InternalKey(keys[0], 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); - ASSERT_TRUE(reader->KeyMayMatch(keys[0], !no_io, &ikey_slice, - /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + ASSERT_TRUE(reader->KeyMayMatch( + StripTimestampFromUserKey(keys[0], ts_sz_), !no_io, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, ReadOptions())); } // querying missing keys + std::vector missing_keys = + PrepareKeys(missing_keys_without_ts, kMissingKeyNum); for (auto key : missing_keys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); if (empty) { - ASSERT_TRUE(reader->KeyMayMatch(key, !no_io, &ikey_slice, - /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + ASSERT_TRUE(reader->KeyMayMatch( + StripTimestampFromUserKey(key, ts_sz_), !no_io, 
&ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, ReadOptions())); } else { // assuming a good hash function - ASSERT_FALSE(reader->KeyMayMatch(key, !no_io, &ikey_slice, - /*get_context=*/nullptr, - /*lookup_context=*/nullptr, - rate_limiter_priority)); + ASSERT_FALSE(reader->KeyMayMatch( + StripTimestampFromUserKey(key, ts_sz_), !no_io, &ikey_slice, + /*get_context=*/nullptr, + /*lookup_context=*/nullptr, ReadOptions())); } } } @@ -211,17 +243,18 @@ class PartitionedFilterBlockTest std::unique_ptr builder( NewBuilder(pib.get())); int i = 0; - builder->Add(keys[i]); + std::vector keys = PrepareKeys(keys_without_ts, kKeyNum); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i], keys[i + 1]); i++; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i], keys[i + 1]); i++; - builder->Add(keys[i]); - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i], keys[i + 1]); i++; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i]); VerifyReader(builder.get(), pib.get()); @@ -232,16 +265,17 @@ class PartitionedFilterBlockTest std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder( NewBuilder(pib.get(), prefix_extractor)); + std::vector keys = PrepareKeys(keys_without_ts, kKeyNum); int i = 0; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); i++; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i], keys[i + 1]); i++; - builder->Add(keys[i]); - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); i++; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i]); VerifyReader(builder.get(), pib.get(), prefix_extractor); @@ -251,15 +285,16 @@ class PartitionedFilterBlockTest std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr builder( NewBuilder(pib.get())); + std::vector keys = PrepareKeys(keys_without_ts, kKeyNum); int i = 0; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); i++; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); i++; - builder->Add(keys[i]); - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); i++; - builder->Add(keys[i]); + builder->Add(StripTimestampFromUserKey(keys[i], ts_sz_)); CutABlock(pib.get(), keys[i]); VerifyReader(builder.get(), pib.get()); @@ -300,10 +335,12 @@ class PartitionedFilterBlockTest }; // Format versions potentially intersting to partitioning -INSTANTIATE_TEST_CASE_P(FormatVersions, PartitionedFilterBlockTest, - testing::ValuesIn(std::set{ - 2, 3, 4, test::kDefaultFormatVersion, - kLatestFormatVersion})); +INSTANTIATE_TEST_CASE_P( + FormatVersions, PartitionedFilterBlockTest, + testing::Combine(testing::ValuesIn(std::set{ + 2, 3, 4, test::kDefaultFormatVersion, + kLatestFormatVersion}), + testing::ValuesIn(test::GetUDTTestModes()))); TEST_P(PartitionedFilterBlockTest, EmptyBuilder) { std::unique_ptr pib(NewIndexBuilder()); @@ -338,12 +375,14 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { std::unique_ptr pib(NewIndexBuilder()); std::unique_ptr 
builder( NewBuilder(pib.get(), prefix_extractor.get())); - const std::string pkeys[3] = {"p-key10", "p-key20", "p-key30"}; - builder->Add(pkeys[0]); + const std::string pkeys_without_ts[3] = {"p-key10", "p-key20", "p-key30"}; + std::vector pkeys = + PrepareKeys(pkeys_without_ts, 3 /* number_of_keys */); + builder->Add(StripTimestampFromUserKey(pkeys[0], ts_sz_)); CutABlock(pib.get(), pkeys[0], pkeys[1]); - builder->Add(pkeys[1]); + builder->Add(StripTimestampFromUserKey(pkeys[1], ts_sz_)); CutABlock(pib.get(), pkeys[1], pkeys[2]); - builder->Add(pkeys[2]); + builder->Add(StripTimestampFromUserKey(pkeys[2], ts_sz_)); CutABlock(pib.get(), pkeys[2]); std::unique_ptr reader( NewReader(builder.get(), pib.get())); @@ -354,10 +393,13 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, /*lookup_context=*/nullptr, - Env::IO_TOTAL)); + ReadOptions())); } // Non-existent keys but with the same prefix - const std::string pnonkeys[4] = {"p-key9", "p-key11", "p-key21", "p-key31"}; + const std::string pnonkeys_without_ts[4] = {"p-key9", "p-key11", "p-key21", + "p-key31"}; + std::vector pnonkeys = + PrepareKeys(pnonkeys_without_ts, 4 /* number_of_keys */); for (auto key : pnonkeys) { auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); @@ -365,7 +407,7 @@ TEST_P(PartitionedFilterBlockTest, SamePrefixInMultipleBlocks) { /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, /*lookup_context=*/nullptr, - Env::IO_TOTAL)); + ReadOptions())); } } @@ -382,30 +424,31 @@ TEST_P(PartitionedFilterBlockTest, PrefixInWrongPartitionBug) { // In the bug, searching for prefix "p3" on an index with format version 3, // will give the key "p3" and the partition of the keys that are <= p3, i.e., // p2-keys, where the filter for prefix "p3" does not exist. 
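These tests now run with and without user-defined timestamps (UDT). A minimal sketch, not part of the change, of the key round trip that PrepareKeys() and StripTimestampFromUserKey() perform when UDT is enabled (the literal key below is illustrative; the u64 timestamp comparator from the fixture is assumed):

// Sketch only. With UDT on, ts_sz_ > 0 and every test key gets a minimum
// (all-zero) timestamp suffix appended; filter entries are still added on the
// stripped user key, while internal keys and index cut points use the full
// timestamped key.
std::string key_with_ts;
AppendKeyWithMinTimestamp(&key_with_ts, "p3-key3", ts_sz_);
// key_with_ts == "p3-key3" followed by ts_sz_ zero bytes
Slice stripped = StripTimestampFromUserKey(key_with_ts, ts_sz_);
// stripped == "p3-key3" again; this is what builder->Add() receives below,
// while CutABlock() and InternalKey() still see key_with_ts.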
- const std::string pkeys[] = {"p1-key1", "p2-key2", "p3-key3", "p4-key3", - "p5-key3"}; - builder->Add(pkeys[0]); + const std::string pkeys_without_ts[] = {"p1-key1", "p2-key2", "p3-key3", + "p4-key3", "p5-key3"}; + std::vector pkeys = + PrepareKeys(pkeys_without_ts, 5 /* number_of_keys */); + builder->Add(StripTimestampFromUserKey(pkeys[0], ts_sz_)); CutABlock(pib.get(), pkeys[0], pkeys[1]); - builder->Add(pkeys[1]); + builder->Add(StripTimestampFromUserKey(pkeys[1], ts_sz_)); CutABlock(pib.get(), pkeys[1], pkeys[2]); - builder->Add(pkeys[2]); + builder->Add(StripTimestampFromUserKey(pkeys[2], ts_sz_)); CutABlock(pib.get(), pkeys[2], pkeys[3]); - builder->Add(pkeys[3]); + builder->Add(StripTimestampFromUserKey(pkeys[3], ts_sz_)); CutABlock(pib.get(), pkeys[3], pkeys[4]); - builder->Add(pkeys[4]); + builder->Add(StripTimestampFromUserKey(pkeys[4], ts_sz_)); CutABlock(pib.get(), pkeys[4]); std::unique_ptr reader( NewReader(builder.get(), pib.get())); - Env::IOPriority rate_limiter_priority = Env::IO_TOTAL; for (auto key : pkeys) { auto prefix = prefix_extractor->Transform(key); - auto ikey = InternalKey(prefix, 0, ValueType::kTypeValue); + auto ikey = InternalKey(key, 0, ValueType::kTypeValue); const Slice ikey_slice = Slice(*ikey.rep()); ASSERT_TRUE(reader->PrefixMayMatch(prefix, /*no_io=*/false, &ikey_slice, /*get_context=*/nullptr, /*lookup_context=*/nullptr, - rate_limiter_priority)); + ReadOptions())); } } @@ -418,7 +461,6 @@ TEST_P(PartitionedFilterBlockTest, OneBlockPerKey) { } TEST_P(PartitionedFilterBlockTest, PartitionCount) { - int num_keys = sizeof(keys) / sizeof(*keys); table_options_.metadata_block_size = std::max(MaxIndexSize(), MaxFilterSize()); int partitions = TestBlockPerKey(); @@ -426,7 +468,7 @@ TEST_P(PartitionedFilterBlockTest, PartitionCount) { // A low number ensures cutting a block after each key table_options_.metadata_block_size = 1; partitions = TestBlockPerKey(); - ASSERT_EQ(partitions, num_keys - 1 /* last two keys make one flush */); + ASSERT_EQ(partitions, kKeyNum - 1 /* last two keys make one flush */); } } // namespace ROCKSDB_NAMESPACE @@ -435,4 +477,4 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -} +} \ No newline at end of file diff --git a/table/block_based/partitioned_index_iterator.cc b/table/block_based/partitioned_index_iterator.cc index b9bc2155a126..cc6f70130927 100644 --- a/table/block_based/partitioned_index_iterator.cc +++ b/table/block_based/partitioned_index_iterator.cc @@ -91,15 +91,16 @@ void PartitionedIndexIterator::InitPartitionedIndexBlock() { // Enabled from the very first IO when ReadOptions.readahead_size is set. block_prefetcher_.PrefetchIfNeeded( rep, partitioned_index_handle, read_options_.readahead_size, - is_for_compaction, /*no_sequential_checking=*/false, - read_options_.rate_limiter_priority); + is_for_compaction, /*no_sequential_checking=*/false, read_options_, + /*readaheadsize_cb=*/nullptr); Status s; table_->NewDataBlockIterator( read_options_, partitioned_index_handle, &block_iter_, BlockType::kIndex, /*get_context=*/nullptr, &lookup_context_, block_prefetcher_.prefetch_buffer(), - /*for_compaction=*/is_for_compaction, /*async_read=*/false, s); + /*for_compaction=*/is_for_compaction, /*async_read=*/false, s, + /*use_block_cache_for_lookup=*/true); block_iter_points_to_real_block_ = true; // We could check upper bound here but it is complicated to reason about // upper bound in index iterator. 
On the other than, in large scans, index diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index 705223c90acf..f825907180a8 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -49,9 +49,8 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( BlockCacheLookupContext* lookup_context) { const bool no_io = (read_options.read_tier == kBlockCacheTier); CachableEntry index_block; - const Status s = - GetOrReadIndexBlock(no_io, read_options.rate_limiter_priority, - get_context, lookup_context, &index_block); + const Status s = GetOrReadIndexBlock(no_io, get_context, lookup_context, + &index_block, read_options); if (!s.ok()) { if (iter != nullptr) { iter->Invalidate(s); @@ -76,7 +75,8 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, index_has_first_key(), index_key_includes_seq(), - index_value_is_full())); + index_value_is_full(), false /* block_contents_pinned */, + user_defined_timestamps_persisted())); } else { ReadOptions ro; ro.fill_cache = read_options.fill_cache; @@ -85,6 +85,8 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( ro.adaptive_readahead = read_options.adaptive_readahead; ro.async_io = read_options.async_io; ro.rate_limiter_priority = read_options.rate_limiter_priority; + ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. @@ -93,7 +95,8 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), nullptr, kNullStats, true, index_has_first_key(), index_key_includes_seq(), - index_value_is_full())); + index_value_is_full(), false /* block_contents_pinned */, + user_defined_timestamps_persisted())); it = new PartitionedIndexIterator( table(), ro, *internal_comparator(), std::move(index_iter), @@ -111,8 +114,13 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( // the first level iter is always on heap and will attempt to delete it // in its destructor. } -Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionIndexReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { + if (!partition_map_.empty()) { + // The dependencies are already cached since `partition_map_` is filled in + // an all-or-nothing manner. 
+ return Status::OK(); + } // Before read partitions, prefetch them to avoid lots of IOs BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; const BlockBasedTable::Rep* rep = table()->rep_; @@ -122,9 +130,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, CachableEntry index_block; { - Status s = GetOrReadIndexBlock(false /* no_io */, ro.rate_limiter_priority, - nullptr /* get_context */, &lookup_context, - &index_block); + Status s = GetOrReadIndexBlock(false /* no_io */, nullptr /* get_context */, + &lookup_context, &index_block, ro); if (!s.ok()) { return s; } @@ -135,7 +142,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, index_block.GetValue()->NewIndexIterator( internal_comparator()->user_comparator(), rep->get_global_seqno(BlockType::kIndex), &biter, kNullStats, true, - index_has_first_key(), index_key_includes_seq(), index_value_is_full()); + index_has_first_key(), index_key_includes_seq(), index_value_is_full(), + false /* block_contents_pinned */, user_defined_timestamps_persisted()); // Index partitions are assumed to be consecuitive. Prefetch them all. // Read the first block offset biter.SeekToFirst(); @@ -157,22 +165,25 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); - IOOptions opts; - { - Status s = rep->file->PrepareIOOptions(ro, opts); - if (s.ok()) { - s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, - static_cast(prefetch_len), - ro.rate_limiter_priority); - } - if (!s.ok()) { - return s; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled() || + tail_prefetch_buffer->GetPrefetchOffset() > prefetch_off) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/, + /*upper_bound_offset*/ 0, /*readaheadsize_cb*/ nullptr, + /*usage=*/FilePrefetchBufferUsage::kUnknown); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len)); + } + if (!s.ok()) { + return s; + } } } - // For saving "all or nothing" to partition_map_ UnorderedMap> map_in_progress; @@ -186,10 +197,11 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, // TODO: Support counter batch update for partitioned index and // filter blocks Status s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - /*wait=*/true, /*for_compaction=*/false, &block.As(), + prefetch_buffer ? 
prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), + /*for_compaction=*/false, &block.As(), /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr, - /*async_read=*/false); + /*async_read=*/false, /*use_block_cache_for_lookup=*/true); if (!s.ok()) { return s; diff --git a/table/block_based/partitioned_index_reader.h b/table/block_based/partitioned_index_reader.h index 58a7877ab5db..9482fd6b44c4 100644 --- a/table/block_based/partitioned_index_reader.h +++ b/table/block_based/partitioned_index_reader.h @@ -30,7 +30,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; - Status CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE diff --git a/table/block_based/reader_common.cc b/table/block_based/reader_common.cc index 0ff43e9b4e63..7d0c97c717d4 100644 --- a/table/block_based/reader_common.cc +++ b/table/block_based/reader_common.cc @@ -23,10 +23,14 @@ void ForceReleaseCachedEntry(void* arg, void* h) { } // WART: this is specific to block-based table -Status VerifyBlockChecksum(ChecksumType type, const char* data, +Status VerifyBlockChecksum(const Footer& footer, const char* data, size_t block_size, const std::string& file_name, uint64_t offset) { PERF_TIMER_GUARD(block_checksum_time); + + assert(footer.GetBlockTrailerSize() == 5); + ChecksumType type = footer.checksum_type(); + // After block_size bytes is compression type (1 byte), which is part of // the checksummed section. size_t len = block_size + 1; @@ -34,6 +38,13 @@ Status VerifyBlockChecksum(ChecksumType type, const char* data, uint32_t stored = DecodeFixed32(data + len); uint32_t computed = ComputeBuiltinChecksum(type, data, len); + + // Unapply context to 'stored' rather than apply to 'computed, for people + // who might look for reference crc value in error message + uint32_t modifier = + ChecksumModifierForContext(footer.base_context_checksum(), offset); + stored -= modifier; + if (stored == computed) { return Status::OK(); } else { @@ -43,8 +54,9 @@ Status VerifyBlockChecksum(ChecksumType type, const char* data, computed = crc32c::Unmask(computed); } return Status::Corruption( - "block checksum mismatch: stored = " + std::to_string(stored) + - ", computed = " + std::to_string(computed) + + "block checksum mismatch: stored" + + std::string(modifier ? "(context removed)" : "") + " = " + + std::to_string(stored) + ", computed = " + std::to_string(computed) + ", type = " + std::to_string(type) + " in " + file_name + " offset " + std::to_string(offset) + " size " + std::to_string(block_size)); } diff --git a/table/block_based/reader_common.h b/table/block_based/reader_common.h index 5bb199f28497..08c2a756bb01 100644 --- a/table/block_based/reader_common.h +++ b/table/block_based/reader_common.h @@ -8,10 +8,12 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/table.h" namespace ROCKSDB_NAMESPACE { +class Footer; + // Release the cached entry and decrement its ref count. 
extern void ForceReleaseCachedEntry(void* arg, void* h); @@ -22,16 +24,13 @@ inline MemoryAllocator* GetMemoryAllocator( : nullptr; } -inline MemoryAllocator* GetMemoryAllocatorForCompressedBlock( - const BlockBasedTableOptions& table_options) { - return table_options.block_cache_compressed.get() - ? table_options.block_cache_compressed->memory_allocator() - : nullptr; -} - -// Assumes block has a trailer as in format.h. file_name and offset provided -// for generating a diagnostic message in returned status. -extern Status VerifyBlockChecksum(ChecksumType type, const char* data, +// Assumes block has a trailer past `data + block_size` as in format.h. +// `file_name` provided for generating diagnostic message in returned status. +// `offset` might be required for proper verification (also used for message). +// +// Returns Status::OK() on checksum match, or Status::Corruption() on checksum +// mismatch. +extern Status VerifyBlockChecksum(const Footer& footer, const char* data, size_t block_size, const std::string& file_name, uint64_t offset); diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc index 7b0b7c94352a..3656b35d504c 100644 --- a/table/block_based/uncompression_dict_reader.cc +++ b/table/block_based/uncompression_dict_reader.cc @@ -62,8 +62,8 @@ Status UncompressionDictReader::ReadUncompressionDictionary( prefetch_buffer, read_options, rep->compression_dict_handle, UncompressionDict::GetEmptyDict(), uncompression_dict, get_context, lookup_context, - /* for_compaction */ false, use_cache, /* wait_for_cache */ true, - /* async_read */ false); + /* for_compaction */ false, use_cache, + /* async_read */ false, /* use_block_cache_for_lookup */ true); if (!s.ok()) { ROCKS_LOG_WARN( @@ -77,8 +77,9 @@ Status UncompressionDictReader::ReadUncompressionDictionary( } Status UncompressionDictReader::GetOrReadUncompressionDictionary( - FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, - GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io, + bool verify_checksums, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* uncompression_dict) const { assert(uncompression_dict); @@ -92,6 +93,7 @@ Status UncompressionDictReader::GetOrReadUncompressionDictionary( read_options.read_tier = kBlockCacheTier; } read_options.verify_checksums = verify_checksums; + read_options.io_activity = ro.io_activity; return ReadUncompressionDictionary(table_, prefetch_buffer, read_options, cache_dictionary_blocks(), get_context, diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h index 416d25e2d965..c78800d8acb4 100644 --- a/table/block_based/uncompression_dict_reader.h +++ b/table/block_based/uncompression_dict_reader.h @@ -32,8 +32,9 @@ class UncompressionDictReader { std::unique_ptr* uncompression_dict_reader); Status GetOrReadUncompressionDictionary( - FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, - GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io, + bool verify_checksums, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* uncompression_dict) const; size_t ApproximateMemoryUsage() const; diff --git a/table/block_fetcher.cc b/table/block_fetcher.cc index 8df0850b3dfc..257a1a42ea8c 100644 --- a/table/block_fetcher.cc +++ 
b/table/block_fetcher.cc @@ -14,7 +14,7 @@ #include #include "logging/logging.h" -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "monitoring/perf_context_imp.h" #include "rocksdb/compression_type.h" #include "rocksdb/env.h" @@ -33,10 +33,14 @@ inline void BlockFetcher::ProcessTrailerIfPresent() { if (footer_.GetBlockTrailerSize() > 0) { assert(footer_.GetBlockTrailerSize() == BlockBasedTable::kBlockTrailerSize); if (read_options_.verify_checksums) { - io_status_ = status_to_io_status(VerifyBlockChecksum( - footer_.checksum_type(), slice_.data(), block_size_, - file_->file_name(), handle_.offset())); + io_status_ = status_to_io_status( + VerifyBlockChecksum(footer_, slice_.data(), block_size_, + file_->file_name(), handle_.offset())); RecordTick(ioptions_.stats, BLOCK_CHECKSUM_COMPUTE_COUNT); + if (!io_status_.ok()) { + assert(io_status_.IsCorruption()); + RecordTick(ioptions_.stats, BLOCK_CHECKSUM_MISMATCH_COUNT); + } } compression_type_ = BlockBasedTable::GetBlockCompressionType(slice_.data(), block_size_); @@ -76,11 +80,11 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() { if (read_options_.async_io && !for_compaction_) { read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCacheAsync( opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, - &io_s, read_options_.rate_limiter_priority); + &io_s); } else { read_from_prefetch_buffer = prefetch_buffer_->TryReadFromCache( opts, file_, handle_.offset(), block_size_with_trailer_, &slice_, - &io_s, read_options_.rate_limiter_priority, for_compaction_); + &io_s, for_compaction_); } if (read_from_prefetch_buffer) { ProcessTrailerIfPresent(); @@ -254,17 +258,19 @@ IOStatus BlockFetcher::ReadBlockContents() { if (io_status_.ok()) { if (file_->use_direct_io()) { PERF_TIMER_GUARD(block_read_time); - io_status_ = file_->Read( - opts, handle_.offset(), block_size_with_trailer_, &slice_, nullptr, - &direct_io_buf_, read_options_.rate_limiter_priority); + PERF_CPU_TIMER_GUARD(block_read_cpu_time, nullptr); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, nullptr, &direct_io_buf_); PERF_COUNTER_ADD(block_read_count, 1); used_buf_ = const_cast(slice_.data()); } else { PrepareBufferForBlockFromFile(); PERF_TIMER_GUARD(block_read_time); - io_status_ = file_->Read(opts, handle_.offset(), - block_size_with_trailer_, &slice_, used_buf_, - nullptr, read_options_.rate_limiter_priority); + PERF_CPU_TIMER_GUARD(block_read_cpu_time, nullptr); + io_status_ = + file_->Read(opts, handle_.offset(), block_size_with_trailer_, + &slice_, used_buf_, nullptr); PERF_COUNTER_ADD(block_read_count, 1); #ifndef NDEBUG if (slice_.data() == &stack_buf_[0]) { @@ -330,9 +336,11 @@ IOStatus BlockFetcher::ReadBlockContents() { #ifndef NDEBUG num_heap_buf_memcpy_++; #endif - compression_type_ = kNoCompression; + // Save the compressed block without trailer + slice_ = Slice(slice_.data(), block_size_); } else { GetBlockContents(); + slice_ = Slice(); } InsertUncompressedBlockToPersistentCacheIfNeeded(); @@ -381,7 +389,6 @@ IOStatus BlockFetcher::ReadAsyncBlockContents() { #ifndef NDEBUG num_heap_buf_memcpy_++; #endif - compression_type_ = kNoCompression; } else { GetBlockContents(); } diff --git a/table/block_fetcher.h b/table/block_fetcher.h index 72adced30e3b..e5a51e3eb25f 100644 --- a/table/block_fetcher.h +++ b/table/block_fetcher.h @@ -8,7 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
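The checksum-related hunks above (reader_common.cc and block_fetcher.cc) hinge on one relation worth spelling out. A hedged sketch, assuming the write path adds the same per-file context modifier that is now removed on read; `type`, `data`, `len`, `offset`, and `stored` stand for the local values in VerifyBlockChecksum():

// Sketch only. On write, the block trailer appears to store
//   stored = ComputeBuiltinChecksum(type, data, len)
//            + ChecksumModifierForContext(base_context_checksum, block_offset);
// VerifyBlockChecksum() now takes the Footer so it can undo that modifier on
// the *stored* side, keeping the value printed in a Corruption message equal
// to the plain checksum a reader could recompute by hand.
uint32_t computed = ComputeBuiltinChecksum(type, data, len);
uint32_t modifier =
    ChecksumModifierForContext(footer.base_context_checksum(), offset);
stored -= modifier;                 // reported as "(context removed)"
bool match = (stored == computed);  // a mismatch also bumps BLOCK_CHECKSUM_MISMATCH_COUNT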
#pragma once -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "table/block_based/block.h" #include "table/block_based/block_type.h" #include "table/format.h" @@ -79,6 +79,10 @@ class BlockFetcher { inline size_t GetBlockSizeWithTrailer() const { return block_size_with_trailer_; } + inline Slice& GetCompressedBlock() { + assert(compression_type_ != kNoCompression); + return slice_; + } #ifndef NDEBUG int TEST_GetNumStackBufMemcpy() const { return num_stack_buf_memcpy_; } diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index f87b23c3a4c2..d738fa3df8a5 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -107,6 +107,9 @@ class BlockFetcherTest : public testing::Test { Footer footer; ReadFooter(file.get(), &footer); const BlockHandle& index_handle = footer.index_handle(); + // FIXME: index handle will need to come from metaindex for + // format_version >= 6 when that becomes the default + ASSERT_FALSE(index_handle.IsNull()); CompressionType compression_type; FetchBlock(file.get(), index_handle, BlockType::kIndex, @@ -268,7 +271,8 @@ class BlockFetcherTest : public testing::Test { ASSERT_NE(table_options, nullptr); ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, comparator, std::move(file), file_size, - &table_reader)); + 0 /* block_protection_bytes_per_key */, + &table_reader, 0 /* tail_size */)); table->reset(reinterpret_cast(table_reader.release())); } @@ -295,7 +299,7 @@ class BlockFetcherTest : public testing::Test { MemoryAllocator* heap_buf_allocator, MemoryAllocator* compressed_buf_allocator, BlockContents* contents, MemcpyStats* stats, - CompressionType* compresstion_type) { + CompressionType* compression_type) { ImmutableOptions ioptions(options_); ReadOptions roptions; PersistentCacheOptions persistent_cache_options; @@ -314,7 +318,11 @@ class BlockFetcherTest : public testing::Test { stats->num_compressed_buf_memcpy = fetcher->TEST_GetNumCompressedBufMemcpy(); - *compresstion_type = fetcher->get_compression_type(); + if (do_uncompress) { + *compression_type = kNoCompression; + } else { + *compression_type = fetcher->get_compression_type(); + } } // NOTE: expected_compression_type is the expected compression @@ -363,7 +371,6 @@ class BlockFetcherTest : public testing::Test { }; // Skip the following tests in lite mode since direct I/O is unsupported. -#ifndef ROCKSDB_LITE // Fetch index block under both direct IO and non-direct IO. // Expects: @@ -509,7 +516,6 @@ TEST_F(BlockFetcherTest, FetchAndUncompressCompressedDataBlock) { expected_stats_by_mode); } -#endif // ROCKSDB_LITE } // namespace } // namespace ROCKSDB_NAMESPACE diff --git a/table/compaction_merging_iterator.cc b/table/compaction_merging_iterator.cc new file mode 100644 index 000000000000..98581b16d762 --- /dev/null +++ b/table/compaction_merging_iterator.cc @@ -0,0 +1,371 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
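The new table/compaction_merging_iterator.{cc,h} files that follow add a compaction-only merging iterator that also surfaces range tombstone start keys. A hedged usage sketch, not code from the change; the setup of `children`, `num_children`, `internal_comparator`, and `range_tombstone_iters` is assumed to happen elsewhere in the compaction code:

// Sketch only: how a caller might drive the new iterator.
InternalIterator* iter = NewCompactionMergingIterator(
    &internal_comparator, children, num_children, range_tombstone_iters);
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
  if (iter->IsDeleteRangeSentinelKey()) {
    // key() is a range tombstone start key (start@seqno, kTypeRangeDeletion);
    // value() is a dummy placeholder.
    continue;
  }
  // Otherwise key()/value() come from one of the child point iterators.
}
// Heap-allocated only when no Arena is supplied to the factory function.
delete iter;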
+#include "table/compaction_merging_iterator.h" + +namespace ROCKSDB_NAMESPACE { +class CompactionMergingIterator : public InternalIterator { + public: + CompactionMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, + int n, bool is_arena_mode, + std::vector< + std::pair> + range_tombstones) + : is_arena_mode_(is_arena_mode), + comparator_(comparator), + current_(nullptr), + minHeap_(CompactionHeapItemComparator(comparator_)), + pinned_iters_mgr_(nullptr) { + children_.resize(n); + for (int i = 0; i < n; i++) { + children_[i].level = i; + children_[i].iter.Set(children[i]); + assert(children_[i].type == HeapItem::ITERATOR); + } + assert(range_tombstones.size() == static_cast(n)); + for (auto& p : range_tombstones) { + range_tombstone_iters_.push_back(p.first); + } + pinned_heap_item_.resize(n); + for (int i = 0; i < n; ++i) { + if (range_tombstones[i].second) { + // for LevelIterator + *range_tombstones[i].second = &range_tombstone_iters_[i]; + } + pinned_heap_item_[i].level = i; + pinned_heap_item_[i].type = HeapItem::DELETE_RANGE_START; + } + } + + void considerStatus(const Status& s) { + if (!s.ok() && status_.ok()) { + status_ = s; + } + } + + ~CompactionMergingIterator() override { + // TODO: use unique_ptr for range_tombstone_iters_ + for (auto child : range_tombstone_iters_) { + delete child; + } + + for (auto& child : children_) { + child.iter.DeleteIter(is_arena_mode_); + } + status_.PermitUncheckedError(); + } + + bool Valid() const override { return current_ != nullptr && status_.ok(); } + + Status status() const override { return status_; } + + void SeekToFirst() override; + + void Seek(const Slice& target) override; + + void Next() override; + + Slice key() const override { + assert(Valid()); + return current_->key(); + } + + Slice value() const override { + assert(Valid()); + if (LIKELY(current_->type == HeapItem::ITERATOR)) { + return current_->iter.value(); + } else { + return dummy_tombstone_val; + } + } + + // Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result + // from current child iterator. Potentially as long as one of child iterator + // report out of bound is not possible, we know current key is within bound. + bool MayBeOutOfLowerBound() override { + assert(Valid()); + return current_->type == HeapItem::DELETE_RANGE_START || + current_->iter.MayBeOutOfLowerBound(); + } + + IterBoundCheck UpperBoundCheckResult() override { + assert(Valid()); + return current_->type == HeapItem::DELETE_RANGE_START + ? IterBoundCheck::kUnknown + : current_->iter.UpperBoundCheckResult(); + } + + void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override { + pinned_iters_mgr_ = pinned_iters_mgr; + for (auto& child : children_) { + child.iter.SetPinnedItersMgr(pinned_iters_mgr); + } + } + + bool IsDeleteRangeSentinelKey() const override { + assert(Valid()); + return current_->type == HeapItem::DELETE_RANGE_START; + } + + // Compaction uses the above subset of InternalIterator interface. 
+ void SeekToLast() override { assert(false); } + + void SeekForPrev(const Slice&) override { assert(false); } + + void Prev() override { assert(false); } + + bool NextAndGetResult(IterateResult*) override { + assert(false); + return false; + } + + bool IsKeyPinned() const override { + assert(false); + return false; + } + + bool IsValuePinned() const override { + assert(false); + return false; + } + + bool PrepareValue() override { + assert(false); + return false; + } + + private: + struct HeapItem { + HeapItem() = default; + + IteratorWrapper iter; + size_t level = 0; + std::string tombstone_str; + enum Type { ITERATOR, DELETE_RANGE_START }; + Type type = ITERATOR; + + explicit HeapItem(size_t _level, InternalIteratorBase* _iter) + : level(_level), type(Type::ITERATOR) { + iter.Set(_iter); + } + + void SetTombstoneForCompaction(const ParsedInternalKey&& pik) { + tombstone_str.clear(); + AppendInternalKey(&tombstone_str, pik); + } + + [[nodiscard]] Slice key() const { + return type == ITERATOR ? iter.key() : tombstone_str; + } + }; + + class CompactionHeapItemComparator { + public: + explicit CompactionHeapItemComparator( + const InternalKeyComparator* comparator) + : comparator_(comparator) {} + + bool operator()(HeapItem* a, HeapItem* b) const { + int r = comparator_->Compare(a->key(), b->key()); + // For each file, we assume all range tombstone start keys come before + // its file boundary sentinel key (file's meta.largest key). + // In the case when meta.smallest = meta.largest and range tombstone start + // key is truncated at meta.smallest, the start key will have op_type = + // kMaxValid to make it smaller (see TruncatedRangeDelIterator + // constructor). The following assertion validates this assumption. + assert(a->type == b->type || r != 0); + return r > 0; + } + + private: + const InternalKeyComparator* comparator_; + }; + + using CompactionMinHeap = BinaryHeap; + bool is_arena_mode_; + const InternalKeyComparator* comparator_; + // HeapItem for all child point iterators. + std::vector children_; + // HeapItem for range tombstones. pinned_heap_item_[i] corresponds to the + // current range tombstone from range_tombstone_iters_[i]. + std::vector pinned_heap_item_; + // range_tombstone_iters_[i] contains range tombstones in the sorted run that + // corresponds to children_[i]. range_tombstone_iters_[i] == + // nullptr means the sorted run of children_[i] does not have range + // tombstones (or the current SSTable does not have range tombstones in the + // case of LevelIterator). + std::vector range_tombstone_iters_; + // Used as value for range tombstone keys + std::string dummy_tombstone_val{}; + + // Skip file boundary sentinel keys. + void FindNextVisibleKey(); + + // top of minHeap_ + HeapItem* current_; + // If any of the children have non-ok status, this is one of them. + Status status_; + CompactionMinHeap minHeap_; + PinnedIteratorsManager* pinned_iters_mgr_; + // Process a child that is not in the min heap. + // If valid, add to the min heap. Otherwise, check status. + void AddToMinHeapOrCheckStatus(HeapItem*); + + HeapItem* CurrentForward() const { + return !minHeap_.empty() ? 
minHeap_.top() : nullptr; + } + + void InsertRangeTombstoneAtLevel(size_t level) { + if (range_tombstone_iters_[level]->Valid()) { + pinned_heap_item_[level].SetTombstoneForCompaction( + range_tombstone_iters_[level]->start_key()); + minHeap_.push(&pinned_heap_item_[level]); + } + } +}; + +void CompactionMergingIterator::SeekToFirst() { + minHeap_.clear(); + status_ = Status::OK(); + for (auto& child : children_) { + child.iter.SeekToFirst(); + AddToMinHeapOrCheckStatus(&child); + } + + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + if (range_tombstone_iters_[i]) { + range_tombstone_iters_[i]->SeekToFirst(); + InsertRangeTombstoneAtLevel(i); + } + } + + FindNextVisibleKey(); + current_ = CurrentForward(); +} + +void CompactionMergingIterator::Seek(const Slice& target) { + minHeap_.clear(); + status_ = Status::OK(); + for (auto& child : children_) { + child.iter.Seek(target); + AddToMinHeapOrCheckStatus(&child); + } + + ParsedInternalKey pik; + ParseInternalKey(target, &pik, false /* log_err_key */) + .PermitUncheckedError(); + for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { + if (range_tombstone_iters_[i]) { + range_tombstone_iters_[i]->Seek(pik.user_key); + // For compaction, output keys should all be after seek target. + while (range_tombstone_iters_[i]->Valid() && + comparator_->Compare(range_tombstone_iters_[i]->start_key(), pik) < + 0) { + range_tombstone_iters_[i]->Next(); + } + InsertRangeTombstoneAtLevel(i); + } + } + + FindNextVisibleKey(); + current_ = CurrentForward(); +} + +void CompactionMergingIterator::Next() { + assert(Valid()); + // For the heap modifications below to be correct, current_ must be the + // current top of the heap. + assert(current_ == CurrentForward()); + // as the current points to the current record. move the iterator forward. + if (current_->type == HeapItem::ITERATOR) { + current_->iter.Next(); + if (current_->iter.Valid()) { + // current is still valid after the Next() call above. Call + // replace_top() to restore the heap property. When the same child + // iterator yields a sequence of keys, this is cheap. + assert(current_->iter.status().ok()); + minHeap_.replace_top(current_); + } else { + // current stopped being valid, remove it from the heap. + considerStatus(current_->iter.status()); + minHeap_.pop(); + } + } else { + assert(current_->type == HeapItem::DELETE_RANGE_START); + size_t level = current_->level; + assert(range_tombstone_iters_[level]); + range_tombstone_iters_[level]->Next(); + if (range_tombstone_iters_[level]->Valid()) { + pinned_heap_item_[level].SetTombstoneForCompaction( + range_tombstone_iters_[level]->start_key()); + minHeap_.replace_top(&pinned_heap_item_[level]); + } else { + minHeap_.pop(); + } + } + FindNextVisibleKey(); + current_ = CurrentForward(); +} + +void CompactionMergingIterator::FindNextVisibleKey() { + while (!minHeap_.empty()) { + HeapItem* current = minHeap_.top(); + // IsDeleteRangeSentinelKey() here means file boundary sentinel keys. + if (current->type != HeapItem::ITERATOR || + !current->iter.IsDeleteRangeSentinelKey()) { + return; + } + // range tombstone start keys from the same SSTable should have been + // exhausted + assert(!range_tombstone_iters_[current->level] || + !range_tombstone_iters_[current->level]->Valid()); + // current->iter is a LevelIterator, and it enters a new SST file in the + // Next() call here. 
+ current->iter.Next(); + if (current->iter.Valid()) { + assert(current->iter.status().ok()); + minHeap_.replace_top(current); + } else { + considerStatus(current->iter.status()); + minHeap_.pop(); + } + if (range_tombstone_iters_[current->level]) { + InsertRangeTombstoneAtLevel(current->level); + } + } +} + +void CompactionMergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) { + if (child->iter.Valid()) { + assert(child->iter.status().ok()); + minHeap_.push(child); + } else { + considerStatus(child->iter.status()); + } +} + +InternalIterator* NewCompactionMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, int n, + std::vector>& range_tombstone_iters, + Arena* arena) { + assert(n >= 0); + if (n == 0) { + return NewEmptyInternalIterator(arena); + } else { + if (arena == nullptr) { + return new CompactionMergingIterator(comparator, children, n, + false /* is_arena_mode */, + range_tombstone_iters); + } else { + auto mem = arena->AllocateAligned(sizeof(CompactionMergingIterator)); + return new (mem) CompactionMergingIterator(comparator, children, n, + true /* is_arena_mode */, + range_tombstone_iters); + } + } +} +} // namespace ROCKSDB_NAMESPACE diff --git a/table/compaction_merging_iterator.h b/table/compaction_merging_iterator.h new file mode 100644 index 000000000000..e3fd7797fd81 --- /dev/null +++ b/table/compaction_merging_iterator.h @@ -0,0 +1,44 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "db/range_del_aggregator.h" +#include "rocksdb/slice.h" +#include "rocksdb/types.h" +#include "table/merging_iterator.h" + +namespace ROCKSDB_NAMESPACE { + +/* + * This is a simplified version of MergingIterator and is specifically used for + * compaction. It merges the input `children` iterators into a sorted stream of + * keys. Range tombstone start keys are also emitted to prevent oversize + * compactions. For example, consider an L1 file with content [a, b), y, z, + * where [a, b) is a range tombstone and y and z are point keys. This could + * cause an oversize compaction as it can overlap with a wide range of key space + * in L2. + * + * CompactionMergingIterator emits range tombstone start keys from each LSM + * level's range tombstone iterator, and for each range tombstone + * [start,end)@seqno, the key will be start@seqno with op_type + * kTypeRangeDeletion unless truncated at file boundary (see detail in + * TruncatedRangeDelIterator::start_key()). + * + * Caller should use CompactionMergingIterator::IsDeleteRangeSentinelKey() to + * check if the current key is a range tombstone key. + * TODO(cbi): IsDeleteRangeSentinelKey() is used for two kinds of keys at + * different layers: file boundary and range tombstone keys. Separate them into + * two APIs for clarity. 
+ */ +class CompactionMergingIterator; + +InternalIterator* NewCompactionMergingIterator( + const InternalKeyComparator* comparator, InternalIterator** children, int n, + std::vector>& range_tombstone_iters, + Arena* arena = nullptr); +} // namespace ROCKSDB_NAMESPACE diff --git a/table/cuckoo/cuckoo_table_builder.cc b/table/cuckoo/cuckoo_table_builder.cc index 296825d94809..0cf6834af81b 100644 --- a/table/cuckoo/cuckoo_table_builder.cc +++ b/table/cuckoo/cuckoo_table_builder.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "table/cuckoo/cuckoo_table_builder.h" #include @@ -404,8 +403,12 @@ Status CuckooTableBuilder::Finish() { } FooterBuilder footer; - footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, offset, - kNoChecksum, meta_index_block_handle); + Status s = footer.Build(kCuckooTableMagicNumber, /* format_version */ 1, + offset, kNoChecksum, meta_index_block_handle); + if (!s.ok()) { + status_ = s; + return status_; + } io_status_ = file_->Append(footer.GetSlice()); status_ = io_status_; return status_; @@ -550,4 +553,3 @@ const char* CuckooTableBuilder::GetFileChecksumFuncName() const { } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/cuckoo/cuckoo_table_builder.h b/table/cuckoo/cuckoo_table_builder.h index a125e1f4c5e3..3a19dd6f997c 100644 --- a/table/cuckoo/cuckoo_table_builder.h +++ b/table/cuckoo/cuckoo_table_builder.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -135,4 +134,3 @@ class CuckooTableBuilder : public TableBuilder { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index be1c62117da7..1a0d58c76d12 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "table/cuckoo/cuckoo_table_builder.h" @@ -71,8 +70,10 @@ class CuckooBuilderTest : public testing::Test { // Assert Table Properties. std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, - kCuckooTableMagicNumber, ioptions, &props)); + kCuckooTableMagicNumber, ioptions, + read_options, &props)); // Check unused bucket. 
std::string unused_key = props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey]; @@ -121,8 +122,7 @@ class CuckooBuilderTest : public testing::Test { for (uint32_t i = 0; i + 1 < table_size + cuckoo_block_size; ++i) { Slice read_slice; ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size, - &read_slice, nullptr, nullptr, - Env::IO_TOTAL /* rate_limiter_priority */)); + &read_slice, nullptr, nullptr)); size_t key_idx = std::find(expected_locations.begin(), expected_locations.end(), i) - expected_locations.begin(); @@ -628,13 +628,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/table/cuckoo/cuckoo_table_factory.cc b/table/cuckoo/cuckoo_table_factory.cc index 1253c92dd6a5..774e00212d97 100644 --- a/table/cuckoo/cuckoo_table_factory.cc +++ b/table/cuckoo/cuckoo_table_factory.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "table/cuckoo/cuckoo_table_factory.h" #include "db/dbformat.h" @@ -67,7 +66,6 @@ std::string CuckooTableFactory::GetPrintableOptions() const { static std::unordered_map cuckoo_table_type_info = { -#ifndef ROCKSDB_LITE {"hash_table_ratio", {offsetof(struct CuckooTableOptions, hash_table_ratio), OptionType::kDouble, OptionVerificationType::kNormal, @@ -88,7 +86,6 @@ static std::unordered_map cuckoo_table_type_info = {offsetof(struct CuckooTableOptions, use_module_hash), OptionType::kBoolean, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; CuckooTableFactory::CuckooTableFactory(const CuckooTableOptions& table_options) @@ -101,4 +98,3 @@ TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/cuckoo/cuckoo_table_factory.h b/table/cuckoo/cuckoo_table_factory.h index 9937c28dd29f..7132cec659ab 100644 --- a/table/cuckoo/cuckoo_table_factory.h +++ b/table/cuckoo/cuckoo_table_factory.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include @@ -79,4 +78,3 @@ class CuckooTableFactory : public TableFactory { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index 1d70909a6013..a4479ab60cda 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
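A recurring mechanical change in this diff, visible again in the cuckoo reader below, is that per-call Env::IOPriority arguments disappear from the read paths. A hedged sketch of the before/after shape; the priority presumably travels inside the IOOptions prepared from ReadOptions rather than as a separate argument, and `file`, `read_options`, `off`, `n`, `slice`, `scratch` are placeholders for the members used in the surrounding readers:

// Before: every read carried its own priority argument.
//   file->Read(opts, off, n, &slice, scratch, nullptr,
//              read_options.rate_limiter_priority);
// After: the extra argument is gone; options derived from ReadOptions carry
// rate limiting and io_activity down to the FileSystem.
IOOptions opts;
Status s = file->PrepareIOOptions(read_options, opts);
if (s.ok()) {
  s = file->Read(opts, off, n, &slice, scratch, nullptr);
}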
-#ifndef ROCKSDB_LITE #include "table/cuckoo/cuckoo_table_reader.h" #include @@ -60,8 +59,11 @@ CuckooTableReader::CuckooTableReader( } { std::unique_ptr props; - status_ = ReadTableProperties(file_.get(), file_size, - kCuckooTableMagicNumber, ioptions, &props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status_ = + ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, + ioptions, read_options, &props); if (!status_.ok()) { return; } @@ -142,9 +144,8 @@ CuckooTableReader::CuckooTableReader( *reinterpret_cast(cuckoo_block_size->second.data()); cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; // TODO: rate limit reads of whole cuckoo tables. - status_ = - file_->Read(IOOptions(), 0, static_cast(file_size), &file_data_, - nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + status_ = file_->Read(IOOptions(), 0, static_cast(file_size), + &file_data_, nullptr, nullptr); } Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, @@ -408,4 +409,3 @@ InternalIterator* CuckooTableReader::NewIterator( size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index f6c599ae8087..d17011ed83e9 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#ifndef ROCKSDB_LITE #include #include #include @@ -59,12 +58,14 @@ class CuckooTableReader : public TableReader { size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/, + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, TableReaderCaller /*caller*/) override { return 0; } - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + uint64_t ApproximateSize(const ReadOptions& /* read_options */, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) override { return 0; } @@ -97,4 +98,3 @@ class CuckooTableReader : public TableReader { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/cuckoo/cuckoo_table_reader_test.cc b/table/cuckoo/cuckoo_table_reader_test.cc index d3d1490c6ef1..e83baa107793 100644 --- a/table/cuckoo/cuckoo_table_reader_test.cc +++ b/table/cuckoo/cuckoo_table_reader_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include @@ -573,12 +572,3 @@ int main(int argc, char** argv) { #endif // GFLAGS. 
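The table/format.cc hunks below introduce the format_version 6 footer, whose Part2 carries its own checksum plus a per-file base_context_checksum. A hedged sketch of the verification relation, mirroring the DecodeFrom() logic that follows; `footer_bytes`, `checksum_type`, `base_context_checksum`, and `footer_offset` are placeholders for the values parsed there, and the internals of ChecksumModifierForContext are not reproduced:

// Sketch only. The v6 footer is checksummed with its own checksum field
// zeroed, then offset/file context is mixed in, so a footer copied to a
// different position no longer verifies.
char copy[Footer::kNewVersionsEncodedLength];
memcpy(copy, footer_bytes, sizeof(copy));
EncodeFixed32(copy + 5, 0);  // zero the embedded footer_checksum field
uint32_t expected =
    ComputeBuiltinChecksum(checksum_type, copy, sizeof(copy)) +
    ChecksumModifierForContext(base_context_checksum, footer_offset);
// `expected` must equal the stored footer_checksum. Under v6 the metaindex
// handle is then reconstructed from its stored size plus the assumption that
// the block ends immediately before the footer, and index_handle_ is left
// unpopulated.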
-#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/table/format.cc b/table/format.cc index d3347cdb8c1a..27ecce54724e 100644 --- a/table/format.cc +++ b/table/format.cc @@ -10,20 +10,23 @@ #include "table/format.h" #include +#include #include #include "block_fetcher.h" #include "file/random_access_file_reader.h" -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "options/options_helper.h" +#include "port/likely.h" #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based/block.h" #include "table/block_based/block_based_table_reader.h" #include "table/persistent_cache_helper.h" +#include "unique_id_impl.h" #include "util/cast_util.h" #include "util/coding.h" #include "util/compression.h" @@ -38,14 +41,8 @@ namespace ROCKSDB_NAMESPACE { extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t kBlockBasedTableMagicNumber; -#ifndef ROCKSDB_LITE extern const uint64_t kLegacyPlainTableMagicNumber; extern const uint64_t kPlainTableMagicNumber; -#else -// ROCKSDB_LITE doesn't have plain table -const uint64_t kLegacyPlainTableMagicNumber = 0; -const uint64_t kPlainTableMagicNumber = 0; -#endif const char* kHostnameForDbHostId = "__hostname__"; bool ShouldReportDetailedTime(Env* env, Statistics* stats) { @@ -201,25 +198,41 @@ inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) { // -> format_version >= 1 // checksum type (char, 1 byte) // * Part2 +// -> format_version <= 5 // metaindex handle (varint64 offset, varint64 size) // index handle (varint64 offset, varint64 size) // for part2 size = 2 * BlockHandle::kMaxEncodedLength = 40 +// - This padding is unchecked/ignored +// -> format_version >= 6 +// extended magic number (4 bytes) = 0x3e 0x00 0x7a 0x00 +// - Also surely invalid (size 0) handles if interpreted as older version +// - (Helps ensure a corrupted format_version doesn't get us far with no +// footer checksum.) +// footer_checksum (uint32LE, 4 bytes) +// - Checksum of above checksum type of whole footer, with this field +// set to all zeros. 
+// base_context_checksum (uint32LE, 4 bytes) +// metaindex block size (uint32LE, 4 bytes) +// - Assumed to be immediately before footer, < 4GB +// (24 bytes, reserved for future use) +// - Brings part2 size also to 40 bytes +// - Checked that last eight bytes == 0, so reserved for a future +// incompatible feature (but under format_version=6) // * Part3 // -> format_version == 0 (inferred from legacy magic number) // legacy magic number (8 bytes) // -> format_version >= 1 (inferred from NOT legacy magic number) // format_version (uint32LE, 4 bytes), also called "footer version" // newer magic number (8 bytes) - +const std::array kExtendedMagic{{0x3e, 0x00, 0x7a, 0x00}}; constexpr size_t kFooterPart2Size = 2 * BlockHandle::kMaxEncodedLength; } // namespace -void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, - uint64_t footer_offset, ChecksumType checksum_type, - const BlockHandle& metaindex_handle, - const BlockHandle& index_handle) { - (void)footer_offset; // Future use - +Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, + uint64_t footer_offset, ChecksumType checksum_type, + const BlockHandle& metaindex_handle, + const BlockHandle& index_handle, + uint32_t base_context_checksum) { assert(magic_number != Footer::kNullTableMagicNumber); assert(IsSupportedFormatVersion(format_version)); @@ -255,19 +268,71 @@ void FooterBuilder::Build(uint64_t magic_number, uint32_t format_version, assert(cur + 8 == slice_.data() + slice_.size()); } - { + if (format_version >= 6) { + if (BlockTrailerSizeForMagicNumber(magic_number) != 0) { + // base context checksum required for table formats with block checksums + assert(base_context_checksum != 0); + assert(ChecksumModifierForContext(base_context_checksum, 0) != 0); + } else { + // base context checksum not used + assert(base_context_checksum == 0); + assert(ChecksumModifierForContext(base_context_checksum, 0) == 0); + } + + // Start populating Part 2 + char* cur = data_.data() + /* part 1 size */ 1; + // Set extended magic of part2 + std::copy(kExtendedMagic.begin(), kExtendedMagic.end(), cur); + cur += kExtendedMagic.size(); + // Fill checksum data with zeros (for later computing checksum) + char* checksum_data = cur; + EncodeFixed32(cur, 0); + cur += 4; + // Save base context checksum + EncodeFixed32(cur, base_context_checksum); + cur += 4; + // Compute and save metaindex size + uint32_t metaindex_size = static_cast(metaindex_handle.size()); + if (metaindex_size != metaindex_handle.size()) { + return Status::NotSupported("Metaindex block size > 4GB"); + } + // Metaindex must be adjacent to footer + assert(metaindex_size == 0 || + metaindex_handle.offset() + metaindex_handle.size() == + footer_offset - BlockTrailerSizeForMagicNumber(magic_number)); + EncodeFixed32(cur, metaindex_size); + cur += 4; + + // Zero pad remainder (for future use) + std::fill_n(cur, 24U, char{0}); + assert(cur + 24 == part3); + + // Compute checksum, add context + uint32_t checksum = ComputeBuiltinChecksum( + checksum_type, data_.data(), Footer::kNewVersionsEncodedLength); + checksum += + ChecksumModifierForContext(base_context_checksum, footer_offset); + // Store it + EncodeFixed32(checksum_data, checksum); + } else { + // Base context checksum not used + assert(!FormatVersionUsesContextChecksum(format_version)); + // Should be left empty + assert(base_context_checksum == 0); + assert(ChecksumModifierForContext(base_context_checksum, 0) == 0); + + // Populate all of part 2 char* cur = part2; cur = metaindex_handle.EncodeTo(cur); 
cur = index_handle.EncodeTo(cur);
// Zero pad remainder
std::fill(cur, part3, char{0});
}
+ return Status::OK();
}
Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
uint64_t enforce_table_magic_number) {
- (void)input_offset; // Future use
-
// Only decode to unused Footer
assert(table_magic_number_ == kNullTableMagicNumber);
assert(input != nullptr);
@@ -290,6 +355,9 @@ Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);
// Parse Part3
+ const char* part3_ptr = magic_ptr;
+ uint32_t computed_checksum = 0;
+ uint64_t footer_offset = 0;
if (legacy) {
// The size is already asserted to be at least kMinEncodedLength
// at the beginning of the function
format_version_ = 0 /* legacy */;
checksum_type_ = kCRC32c;
} else {
- const char* part3_ptr = magic_ptr - 4;
+ part3_ptr = magic_ptr - 4;
format_version_ = DecodeFixed32(part3_ptr);
- if (!IsSupportedFormatVersion(format_version_)) {
+ if (UNLIKELY(!IsSupportedFormatVersion(format_version_))) {
return Status::Corruption("Corrupt or unsupported format_version: " +
std::to_string(format_version_));
}
// All known format versions >= 1 occupy exactly this many bytes.
- if (input.size() < kNewVersionsEncodedLength) {
+ if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) {
return Status::Corruption("Input is too short to be an SST file");
}
uint64_t adjustment = input.size() - kNewVersionsEncodedLength;
input.remove_prefix(adjustment);
+ footer_offset = input_offset + adjustment;
// Parse Part1
char chksum = input.data()[0];
checksum_type_ = lossless_cast<uint8_t>(chksum);
- if (!IsSupportedChecksumType(checksum_type())) {
+ if (UNLIKELY(!IsSupportedChecksumType(checksum_type()))) {
return Status::Corruption("Corrupt or unsupported checksum type: " +
std::to_string(lossless_cast<uint8_t>(chksum)));
}
+ // This is the most convenient place to compute the checksum
+ if (checksum_type_ != kNoChecksum && format_version_ >= 6) {
+ std::array<char, kNewVersionsEncodedLength> copy_without_checksum;
+ std::copy_n(input.data(), kNewVersionsEncodedLength,
+ &copy_without_checksum[0]);
+ EncodeFixed32(&copy_without_checksum[5], 0); // Clear embedded checksum
+ computed_checksum =
+ ComputeBuiltinChecksum(checksum_type(), copy_without_checksum.data(),
+ kNewVersionsEncodedLength);
+ }
// Consume checksum type field
input.remove_prefix(1);
}
// Parse Part2
- Status result = metaindex_handle_.DecodeFrom(&input);
- if (result.ok()) {
- result = index_handle_.DecodeFrom(&input);
+ if (format_version_ >= 6) {
+ Slice ext_magic(input.data(), 4);
+ if (UNLIKELY(ext_magic.compare(Slice(kExtendedMagic.data(),
+ kExtendedMagic.size())) != 0)) {
+ return Status::Corruption("Bad extended magic number: 0x" +
+ ext_magic.ToString(/*hex*/ true));
+ }
+ input.remove_prefix(4);
+ uint32_t stored_checksum = 0, metaindex_size = 0;
+ bool success;
+ success = GetFixed32(&input, &stored_checksum);
+ assert(success);
+ success = GetFixed32(&input, &base_context_checksum_);
+ assert(success);
+ if (UNLIKELY(ChecksumModifierForContext(base_context_checksum_, 0) == 0)) {
+ return Status::Corruption("Invalid base context checksum");
+ }
+ computed_checksum +=
+ ChecksumModifierForContext(base_context_checksum_, footer_offset);
+ if (UNLIKELY(computed_checksum != stored_checksum)) {
+ return Status::Corruption("Footer at " + std::to_string(footer_offset) +
+ " checksum mismatch");
+ }
+ success = GetFixed32(&input, &metaindex_size);
+ assert(success);
+ (void)success;
+
uint64_t metaindex_end = footer_offset - GetBlockTrailerSize(); + metaindex_handle_ = + BlockHandle(metaindex_end - metaindex_size, metaindex_size); + + // Mark unpopulated + index_handle_ = BlockHandle::NullBlockHandle(); + + // 16 bytes of unchecked reserved padding + input.remove_prefix(16U); + + // 8 bytes of checked reserved padding (expected to be zero unless using a + // future feature). + uint64_t reserved = 0; + success = GetFixed64(&input, &reserved); + assert(success); + if (UNLIKELY(reserved != 0)) { + return Status::NotSupported( + "File uses a future feature not supported in this version"); + } + // End of part 2 + assert(input.data() == part3_ptr); + } else { + // format_version_ < 6 + Status result = metaindex_handle_.DecodeFrom(&input); + if (result.ok()) { + result = index_handle_.DecodeFrom(&input); + } + if (!result.ok()) { + return result; + } + // Padding in part2 is ignored } - return result; - // Padding in part2 is ignored + return Status::OK(); } std::string Footer::ToString() const { @@ -377,18 +509,16 @@ Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file, // need to pass a timeout at that point // TODO: rate limit footer reads. if (prefetch_buffer == nullptr || - !prefetch_buffer->TryReadFromCache( - opts, file, read_offset, Footer::kMaxEncodedLength, &footer_input, - nullptr, opts.rate_limiter_priority)) { + !prefetch_buffer->TryReadFromCache(opts, file, read_offset, + Footer::kMaxEncodedLength, + &footer_input, nullptr)) { if (file->use_direct_io()) { s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, - &footer_input, nullptr, &internal_buf, - opts.rate_limiter_priority); + &footer_input, nullptr, &internal_buf); } else { footer_buf.reserve(Footer::kMaxEncodedLength); s = file->Read(opts, read_offset, Footer::kMaxEncodedLength, - &footer_input, &footer_buf[0], nullptr, - opts.rate_limiter_priority); + &footer_input, &footer_buf[0], nullptr); } if (!s.ok()) return s; } @@ -515,19 +645,25 @@ Status UncompressBlockData(const UncompressionInfo& uncompression_info, StopWatchNano timer(ioptions.clock, ShouldReportDetailedTime(ioptions.env, ioptions.stats)); size_t uncompressed_size = 0; - CacheAllocationPtr ubuf = - UncompressData(uncompression_info, data, size, &uncompressed_size, - GetCompressFormatForVersion(format_version), allocator); + const char* error_msg = nullptr; + CacheAllocationPtr ubuf = UncompressData( + uncompression_info, data, size, &uncompressed_size, + GetCompressFormatForVersion(format_version), allocator, &error_msg); if (!ubuf) { if (!CompressionTypeSupported(uncompression_info.type())) { - return Status::NotSupported( + ret = Status::NotSupported( "Unsupported compression method for this build", CompressionTypeToString(uncompression_info.type())); } else { - return Status::Corruption( - "Corrupted compressed block contents", - CompressionTypeToString(uncompression_info.type())); + std::ostringstream oss; + oss << "Corrupted compressed block contents"; + if (error_msg) { + oss << ": " << error_msg; + } + ret = Status::Corruption( + oss.str(), CompressionTypeToString(uncompression_info.type())); } + return ret; } *out_contents = BlockContents(std::move(ubuf), uncompressed_size); @@ -536,8 +672,8 @@ Status UncompressBlockData(const UncompressionInfo& uncompression_info, RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS, timer.ElapsedNanos()); } - RecordTimeToHistogram(ioptions.stats, BYTES_DECOMPRESSED, - out_contents->data.size()); + RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM, size); + 
RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size()); RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED); TEST_SYNC_POINT_CALLBACK("UncompressBlockData:TamperWithReturnValue", diff --git a/table/format.h b/table/format.h index 71d3706c42db..73675381edb0 100644 --- a/table/format.h +++ b/table/format.h @@ -15,7 +15,7 @@ #include "file/file_prefetch_buffer.h" #include "file/random_access_file_reader.h" -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "options/cf_options.h" #include "port/malloc.h" #include "port/port.h" // noexcept @@ -111,6 +111,40 @@ struct IndexValue { std::string ToString(bool hex, bool have_first_key) const; }; +// Given a file's base_context_checksum and an offset of a block within that +// file, choose a 32-bit value that is as unique as possible. This value will +// be added to the standard checksum to get a checksum "with context," or can +// be subtracted to "remove" context. Returns zero (no modifier) if feature is +// disabled with base_context_checksum == 0. +inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum, + uint64_t offset) { + // To disable on base_context_checksum == 0, we could write + // `if (base_context_checksum == 0) return 0;` but benchmarking shows + // measurable performance penalty vs. this: compute the modifier + // unconditionally and use an "all or nothing" bit mask to enable + // or disable. + uint32_t all_or_nothing = uint32_t{0} - (base_context_checksum != 0); + + // Desired properties: + // (call this function f(b, o) where b = base and o = offset) + // 1. Fast + // 2. f(b1, o) == f(b2, o) iff b1 == b2 + // (Perfectly preserve base entropy) + // 3. f(b, o1) == f(b, o2) only if o1 == o2 or |o1-o2| >= 4 billion + // (Guaranteed uniqueness for nearby offsets) + // 3. f(b, o + j * 2**32) == f(b, o + k * 2**32) only if j == k + // (Upper bits matter, and *aligned* misplacement fails check) + // 4. f(b1, o) == f(b2, o + x) then preferably not + // f(b1, o + y) == f(b2, o + x + y) + // (Avoid linearly correlated matches) + // 5. f(b, o) == 0 depends on both b and o + // (No predictable overlap with non-context checksums) + uint32_t modifier = + base_context_checksum ^ (Lower32of64(offset) + Upper32of64(offset)); + + return modifier & all_or_nothing; +} + inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { // As of format_version 2, we encode compressed block with // compress_format_version == 2. Before that, the version is 1. @@ -118,18 +152,27 @@ inline uint32_t GetCompressFormatForVersion(uint32_t format_version) { return format_version >= 2 ? 2 : 1; } -constexpr uint32_t kLatestFormatVersion = 5; +constexpr uint32_t kLatestFormatVersion = 6; inline bool IsSupportedFormatVersion(uint32_t version) { return version <= kLatestFormatVersion; } +// Same as having a unique id in footer. +inline bool FormatVersionUsesContextChecksum(uint32_t version) { + return version >= 6; +} + +inline bool FormatVersionUsesIndexHandleInFooter(uint32_t version) { + return version < 6; +} + // Footer encapsulates the fixed information stored at the tail end of every // SST file. In general, it should only include things that cannot go // elsewhere under the metaindex block. For example, checksum_type is // required for verifying metaindex block checksum (when applicable), but -// index block handle can easily go in metaindex block (possible future). -// See also FooterBuilder below. +// index block handle can easily go in metaindex block. 
See also FooterBuilder
+// below.
class Footer {
public:
// Create empty. Populate using DecodeFrom.
@@ -137,7 +180,7 @@ class Footer {
// Deserialize a footer (populate fields) from `input` and check for various
// corruptions. `input_offset` is the offset within the target file of
- // `input` buffer (future use).
+ // `input` buffer, which is needed for verifying format_version >= 6 footer.
// If enforce_table_magic_number != 0, will return corruption if table magic
// number is not equal to enforce_table_magic_number.
Status DecodeFrom(Slice input, uint64_t input_offset,
@@ -152,13 +195,17 @@ class Footer {
// BBTO::format_version.)
uint32_t format_version() const { return format_version_; }
+ // See ChecksumModifierForContext()
+ uint32_t base_context_checksum() const { return base_context_checksum_; }
+
// Block handle for metaindex block.
const BlockHandle& metaindex_handle() const { return metaindex_handle_; }
// Block handle for (top-level) index block.
+ // TODO? remove from this struct and only read on decode for legacy cases
const BlockHandle& index_handle() const { return index_handle_; }
- // Checksum type used in the file.
+ // Checksum type used in the file, including footer for format version >= 6.
ChecksumType checksum_type() const {
return static_cast<ChecksumType>(checksum_type_);
}
@@ -198,6 +245,7 @@ class Footer {
uint64_t table_magic_number_ = kNullTableMagicNumber;
uint32_t format_version_ = kInvalidFormatVersion;
+ uint32_t base_context_checksum_ = 0;
BlockHandle metaindex_handle_;
BlockHandle index_handle_;
int checksum_type_ = kInvalidChecksumType;
@@ -219,11 +267,16 @@ class FooterBuilder {
// * footer_offset is the file offset where the footer will be written
// (for future use).
// * checksum_type is for formats using block checksums.
- // * index_handle is optional for some kinds of SST files.
- void Build(uint64_t table_magic_number, uint32_t format_version,
- uint64_t footer_offset, ChecksumType checksum_type,
- const BlockHandle& metaindex_handle,
- const BlockHandle& index_handle = BlockHandle::NullBlockHandle());
+ // * index_handle is optional for some SST kinds and (for caller convenience)
+ // ignored when format_version >= 6. (Must be added to metaindex in that
+ // case.)
+ // * unique_id must be specified if format_version >= 6 and SST uses block
+ // checksums with context. Otherwise, auto-generated if format_version >= 6.
+ Status Build(uint64_t table_magic_number, uint32_t format_version,
+ uint64_t footer_offset, ChecksumType checksum_type,
+ const BlockHandle& metaindex_handle,
+ const BlockHandle& index_handle = BlockHandle::NullBlockHandle(),
+ uint32_t base_context_checksum = 0);
// After Builder, get a Slice for the serialized Footer, backed by this
// FooterBuilder.
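For readers unfamiliar with the "checksum with context" scheme that the format_version=6 footer changes above introduce, the following standalone sketch (not part of the patch) shows how a modifier in the shape of ChecksumModifierForContext() composes with an ordinary block checksum: the writer adds the modifier for (base_context_checksum, offset) to the raw checksum before storing it, and a reader that recomputes the checksum with a different base or at a different offset no longer matches. The Lower32of64/Upper32of64 helpers are re-declared locally and all constants are made-up illustration values, so this compiles on its own; it is not the RocksDB implementation.

#include <cstdint>
#include <iostream>

// Local stand-ins for RocksDB's 64-bit split helpers.
inline uint32_t Lower32of64(uint64_t v) { return static_cast<uint32_t>(v); }
inline uint32_t Upper32of64(uint64_t v) { return static_cast<uint32_t>(v >> 32); }

// Same shape as the helper added in table/format.h above.
inline uint32_t ChecksumModifierForContext(uint32_t base_context_checksum,
                                           uint64_t offset) {
  // All-or-nothing mask: 0 when the feature is disabled (base == 0).
  uint32_t all_or_nothing = uint32_t{0} - (base_context_checksum != 0);
  uint32_t modifier =
      base_context_checksum ^ (Lower32of64(offset) + Upper32of64(offset));
  return modifier & all_or_nothing;
}

int main() {
  // Hypothetical per-file base checksum and a raw (context-free) block checksum.
  const uint32_t base = 0x9E3779B9u;
  const uint32_t raw_checksum = 0x12345678u;
  const uint64_t offset = 4096;

  // Writer side: store the checksum with context added.
  uint32_t stored = raw_checksum + ChecksumModifierForContext(base, offset);

  // Reader at the same file and offset: matches.
  bool ok = stored == raw_checksum + ChecksumModifierForContext(base, offset);
  // Reader that got the block from the wrong offset: mismatch is detected.
  bool misplaced_ok =
      stored == raw_checksum + ChecksumModifierForContext(base, offset + 4096);

  std::cout << ok << " " << misplaced_ok << "\n";  // prints "1 0"
  return 0;
}

The add-then-compare form mirrors how the footer checksum itself is handled in FooterBuilder::Build() and Footer::DecodeFrom() above, where the modifier is computed from the footer's own file offset.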
diff --git a/table/get_context.cc b/table/get_context.cc index 2b5a7ae65966..660726cd392a 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -12,7 +12,7 @@ #include "db/wide/wide_column_serialization.h" #include "monitoring/file_read_sample.h" #include "monitoring/perf_context_imp.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/merge_operator.h" #include "rocksdb/statistics.h" #include "rocksdb/system_clock.h" @@ -22,7 +22,6 @@ namespace ROCKSDB_NAMESPACE { namespace { void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { -#ifndef ROCKSDB_LITE if (replay_log) { if (replay_log->empty()) { // Optimization: in the common case of only one operation in the @@ -32,11 +31,6 @@ void appendToReplayLog(std::string* replay_log, ValueType type, Slice value) { replay_log->push_back(type); PutLengthPrefixedSlice(replay_log, value); } -#else - (void)replay_log; - (void)type; - (void)value; -#endif // ROCKSDB_LITE } } // namespace @@ -306,6 +300,9 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, if (kNotFound == state_) { state_ = kFound; if (do_merge_) { + if (type == kTypeBlobIndex && ucmp_->timestamp_size() != 0) { + ukey_with_ts_found_.PinSelf(parsed_key.user_key); + } if (LIKELY(pinnable_val_ != nullptr)) { Slice value_to_use = value; @@ -345,7 +342,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, // merge_context_->operand_list if (type == kTypeBlobIndex) { PinnableSlice pin_val; - if (GetBlobValue(value, &pin_val) == false) { + if (GetBlobValue(parsed_key.user_key, value, &pin_val) == false) { return false; } Slice blob_value(pin_val); @@ -371,13 +368,13 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, assert(merge_operator_ != nullptr); if (type == kTypeBlobIndex) { PinnableSlice pin_val; - if (GetBlobValue(value, &pin_val) == false) { + if (GetBlobValue(parsed_key.user_key, value, &pin_val) == false) { return false; } Slice blob_value(pin_val); state_ = kFound; if (do_merge_) { - Merge(&blob_value); + MergeWithPlainBaseValue(blob_value); } else { // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of @@ -388,7 +385,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kFound; if (do_merge_) { - MergeWithEntity(value); + MergeWithWideColumnBaseValue(value); } else { // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of @@ -410,7 +407,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kFound; if (do_merge_) { - Merge(&value); + MergeWithPlainBaseValue(value); } else { // It means this function is called as part of DB GetMergeOperands // API and the current value should be part of @@ -433,7 +430,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, } else if (kMerge == state_) { state_ = kFound; if (do_merge_) { - Merge(nullptr); + MergeWithNoBaseValue(); } // If do_merge_ = false then the current value shouldn't be part of // merge_context_->operand_list @@ -445,11 +442,13 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, state_ = kMerge; // value_pinner is not set from plain_table_reader.cc for example. 
push_operand(value, value_pinner); + PERF_COUNTER_ADD(internal_merge_point_lookup_count, 1); + if (do_merge_ && merge_operator_ != nullptr && merge_operator_->ShouldMerge( merge_context_->GetOperandsDirectionBackward())) { state_ = kFound; - Merge(nullptr); + MergeWithNoBaseValue(); return false; } return true; @@ -464,100 +463,76 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, return false; } -void GetContext::Merge(const Slice* value) { - assert(do_merge_); - assert(!pinnable_val_ || !columns_); - - std::string result; - // `op_failure_scope` (an output parameter) is not provided (set to nullptr) - // since a failure must be propagated regardless of its value. - const Status s = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, value, merge_context_->GetOperands(), &result, - logger_, statistics_, clock_, /* result_operand */ nullptr, - /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); - if (!s.ok()) { - state_ = kCorrupt; +void GetContext::PostprocessMerge(const Status& merge_status) { + if (!merge_status.ok()) { + if (merge_status.subcode() == Status::SubCode::kMergeOperatorFailed) { + state_ = kMergeOperatorFailed; + } else { + state_ = kCorrupt; + } return; } if (LIKELY(pinnable_val_ != nullptr)) { - *(pinnable_val_->GetSelf()) = std::move(result); pinnable_val_->PinSelf(); - return; } - - assert(columns_); - columns_->SetPlainValue(result); } -void GetContext::MergeWithEntity(Slice entity) { +void GetContext::MergeWithNoBaseValue() { assert(do_merge_); + assert(pinnable_val_ || columns_); assert(!pinnable_val_ || !columns_); - if (LIKELY(pinnable_val_ != nullptr)) { - Slice value_of_default; - - { - const Status s = WideColumnSerialization::GetValueOfDefaultColumn( - entity, value_of_default); - if (!s.ok()) { - state_ = kCorrupt; - return; - } - } - - { - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. - const Status s = MergeHelper::TimedFullMerge( - merge_operator_, user_key_, &value_of_default, - merge_context_->GetOperands(), pinnable_val_->GetSelf(), logger_, - statistics_, clock_, /* result_operand */ nullptr, - /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); - if (!s.ok()) { - state_ = kCorrupt; - return; - } - } + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, MergeHelper::kNoBaseValue, + merge_context_->GetOperands(), logger_, statistics_, clock_, + /* update_num_ops_stats */ true, + pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, + /* op_failure_scope */ nullptr); + PostprocessMerge(s); +} - pinnable_val_->PinSelf(); - return; - } +void GetContext::MergeWithPlainBaseValue(const Slice& value) { + assert(do_merge_); + assert(pinnable_val_ || columns_); + assert(!pinnable_val_ || !columns_); - std::string result; + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, MergeHelper::kPlainBaseValue, value, + merge_context_->GetOperands(), logger_, statistics_, clock_, + /* update_num_ops_stats */ true, + pinnable_val_ ? 
pinnable_val_->GetSelf() : nullptr, columns_, + /* op_failure_scope */ nullptr); + PostprocessMerge(s); +} - { - // `op_failure_scope` (an output parameter) is not provided (set to nullptr) - // since a failure must be propagated regardless of its value. - const Status s = MergeHelper::TimedFullMergeWithEntity( - merge_operator_, user_key_, entity, merge_context_->GetOperands(), - &result, logger_, statistics_, clock_, /* update_num_ops_stats */ true, - /* op_failure_scope */ nullptr); - if (!s.ok()) { - state_ = kCorrupt; - return; - } - } +void GetContext::MergeWithWideColumnBaseValue(const Slice& entity) { + assert(do_merge_); + assert(pinnable_val_ || columns_); + assert(!pinnable_val_ || !columns_); - { - assert(columns_); - const Status s = columns_->SetWideColumnValue(result); - if (!s.ok()) { - state_ = kCorrupt; - return; - } - } + // `op_failure_scope` (an output parameter) is not provided (set to nullptr) + // since a failure must be propagated regardless of its value. + const Status s = MergeHelper::TimedFullMerge( + merge_operator_, user_key_, MergeHelper::kWideBaseValue, entity, + merge_context_->GetOperands(), logger_, statistics_, clock_, + /* update_num_ops_stats */ true, + pinnable_val_ ? pinnable_val_->GetSelf() : nullptr, columns_, + /* op_failure_scope */ nullptr); + PostprocessMerge(s); } -bool GetContext::GetBlobValue(const Slice& blob_index, +bool GetContext::GetBlobValue(const Slice& user_key, const Slice& blob_index, PinnableSlice* blob_value) { constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; constexpr uint64_t* bytes_read = nullptr; Status status = blob_fetcher_->FetchBlob( - user_key_, blob_index, prefetch_buffer, blob_value, bytes_read); + user_key, blob_index, prefetch_buffer, blob_value, bytes_read); if (!status.ok()) { if (status.IsIncomplete()) { // FIXME: this code is not covered by unit tests @@ -583,8 +558,8 @@ void GetContext::push_operand(const Slice& value, Cleanable* value_pinner) { } void replayGetContextLog(const Slice& replay_log, const Slice& user_key, - GetContext* get_context, Cleanable* value_pinner) { -#ifndef ROCKSDB_LITE + GetContext* get_context, Cleanable* value_pinner, + SequenceNumber seq_no) { Slice s = replay_log; while (s.size()) { auto type = static_cast(*s.data()); @@ -595,19 +570,10 @@ void replayGetContextLog(const Slice& replay_log, const Slice& user_key, (void)ret; bool dont_care __attribute__((__unused__)); - // Since SequenceNumber is not stored and unknown, we will use - // kMaxSequenceNumber. 
- get_context->SaveValue( - ParsedInternalKey(user_key, kMaxSequenceNumber, type), value, - &dont_care, value_pinner); - } -#else // ROCKSDB_LITE - (void)replay_log; - (void)user_key; - (void)get_context; - (void)value_pinner; - assert(false); -#endif // ROCKSDB_LITE + + ParsedInternalKey ikey = ParsedInternalKey(user_key, seq_no, type); + get_context->SaveValue(ikey, value, &dont_care, value_pinner); + } } } // namespace ROCKSDB_NAMESPACE diff --git a/table/get_context.h b/table/get_context.h index dcc7ab8d60a9..b43ff6e1600f 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -75,6 +75,7 @@ class GetContext { kCorrupt, kMerge, // saver contains the current merge result (the operands) kUnexpectedBlobIndex, + kMergeOperatorFailed, }; GetContextStats get_context_stats_; @@ -177,14 +178,31 @@ class GetContext { bool has_callback() const { return callback_ != nullptr; } + const Slice& ukey_to_get_blob_value() const { + if (!ukey_with_ts_found_.empty()) { + return ukey_with_ts_found_; + } else { + return user_key_; + } + } + uint64_t get_tracing_get_id() const { return tracing_get_id_; } void push_operand(const Slice& value, Cleanable* value_pinner); private: - void Merge(const Slice* value); - void MergeWithEntity(Slice entity); - bool GetBlobValue(const Slice& blob_index, PinnableSlice* blob_value); + // Helper method that postprocesses the results of merge operations, e.g. it + // sets the state correctly upon merge errors. + void PostprocessMerge(const Status& merge_status); + + // The following methods perform the actual merge operation for the + // no base value/plain base value/wide-column base value cases. + void MergeWithNoBaseValue(); + void MergeWithPlainBaseValue(const Slice& value); + void MergeWithWideColumnBaseValue(const Slice& entity); + + bool GetBlobValue(const Slice& user_key, const Slice& blob_index, + PinnableSlice* blob_value); const Comparator* ucmp_; const MergeOperator* merge_operator_; @@ -194,6 +212,10 @@ class GetContext { GetState state_; Slice user_key_; + // When a blob index is found with the user key containing timestamp, + // this copies the corresponding user key on record in the sst file + // and is later used for blob verification. + PinnableSlice ukey_with_ts_found_; PinnableSlice* pinnable_val_; PinnableWideColumns* columns_; std::string* timestamp_; @@ -226,6 +248,7 @@ class GetContext { // must have been set by calling GetContext::SetReplayLog(). void replayGetContextLog(const Slice& replay_log, const Slice& user_key, GetContext* get_context, - Cleanable* value_pinner = nullptr); + Cleanable* value_pinner = nullptr, + SequenceNumber seq_no = kMaxSequenceNumber); } // namespace ROCKSDB_NAMESPACE diff --git a/table/internal_iterator.h b/table/internal_iterator.h index 8015ed635112..060306003ce4 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -43,6 +43,17 @@ class InternalIteratorBase : public Cleanable { virtual ~InternalIteratorBase() {} + // This iterator will only process range tombstones with sequence + // number <= `read_seqno`. + // Noop for most child classes. + // For range tombstone iterators (TruncatedRangeDelIterator, + // FragmentedRangeTombstoneIterator), will only return range tombstones with + // sequence number <= `read_seqno`. For LevelIterator, it may open new table + // files and create new range tombstone iterators during scanning. It will use + // `read_seqno` as the sequence number for creating new range tombstone + // iterators. 
+ virtual void SetRangeDelReadSeqno(SequenceNumber /* read_seqno */) {}
+
// An iterator is either positioned at a key/value pair, or
// not valid. This method returns true iff the iterator is valid.
// Always returns false if !status().ok().
diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h
index 17abef4ac796..a9de3dff35c1 100644
--- a/table/iterator_wrapper.h
+++ b/table/iterator_wrapper.h
@@ -30,6 +30,11 @@ class IteratorWrapperBase {
}
~IteratorWrapperBase() {}
InternalIteratorBase<TValue>* iter() const { return iter_; }
+ void SetRangeDelReadSeqno(SequenceNumber read_seqno) {
+ if (iter_) {
+ iter_->SetRangeDelReadSeqno(read_seqno);
+ }
+ }
// Set the underlying Iterator to _iter and return
// previous underlying Iterator.
@@ -47,6 +52,17 @@ class IteratorWrapperBase {
void DeleteIter(bool is_arena_mode) {
if (iter_) {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (!status_checked_after_invalid_) {
+ // If this assertion fails, it is likely that you did not check
+ // iterator status after Valid() returns false.
+ fprintf(stderr,
+ "Failed to check status after Valid() returned false from this "
+ "iterator.\n");
+ port::PrintStack();
+ std::abort();
+ }
+#endif
if (!is_arena_mode) {
delete iter_;
} else {
@@ -56,7 +72,12 @@ class IteratorWrapperBase {
}
// Iterator interface methods
- bool Valid() const { return valid_; }
+ bool Valid() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ status_checked_after_invalid_ = valid_;
+#endif
+ return valid_;
+ }
Slice key() const {
assert(Valid());
return result_.key;
@@ -67,6 +88,9 @@ class IteratorWrapperBase {
}
// Methods below require iter() != nullptr
Status status() const {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ status_checked_after_invalid_ = true;
+#endif
assert(iter_);
return iter_->status();
}
@@ -178,6 +202,10 @@ class IteratorWrapperBase {
InternalIteratorBase<TValue>* iter_;
IterateResult result_;
bool valid_;
+
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ mutable bool status_checked_after_invalid_ = true;
+#endif
};
using IteratorWrapper = IteratorWrapperBase<Slice>;
diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc
index 309ae69c5e84..247564fe7b07 100644
--- a/table/merging_iterator.cc
+++ b/table/merging_iterator.cc
@@ -10,121 +10,45 @@
#include "table/merging_iterator.h"
#include "db/arena_wrapped_db_iter.h"
-#include "db/dbformat.h"
-#include "db/pinned_iterators_manager.h"
-#include "memory/arena.h"
-#include "monitoring/perf_context_imp.h"
-#include "rocksdb/comparator.h"
-#include "rocksdb/iterator.h"
-#include "rocksdb/options.h"
-#include "table/internal_iterator.h"
-#include "table/iter_heap.h"
-#include "table/iterator_wrapper.h"
-#include "test_util/sync_point.h"
-#include "util/autovector.h"
-#include "util/heap.h"
-#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {
-// For merging iterator to process range tombstones, we treat the start and end
-// keys of a range tombstone as point keys and put them into the minHeap/maxHeap
-// used in merging iterator. Take minHeap for example, we are able to keep track
-// of currently "active" range tombstones (the ones whose start keys are popped
-// but end keys are still in the heap) in `active_`. This `active_` set of range
-// tombstones is then used to quickly determine whether the point key at heap
-// top is deleted (by heap property, the point key at heap top must be within
-// internal key range of active range tombstones).
+// MergingIterator uses a min/max heap to combine data from point iterators.
+// Range tombstones can be added and keys covered by range tombstones will be +// skipped. // -// The HeapItem struct represents 3 types of elements in the minHeap/maxHeap: -// point key and the start and end keys of a range tombstone. -struct HeapItem { - HeapItem() = default; - - enum Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END }; - IteratorWrapper iter; - size_t level = 0; - ParsedInternalKey parsed_ikey; - // Will be overwritten before use, initialize here so compiler does not - // complain. - Type type = ITERATOR; - - explicit HeapItem(size_t _level, InternalIteratorBase* _iter) - : level(_level), type(Type::ITERATOR) { - iter.Set(_iter); - } - - void SetTombstoneKey(ParsedInternalKey&& pik) { - // op_type is already initialized in MergingIterator::Finish(). - parsed_ikey.user_key = pik.user_key; - parsed_ikey.sequence = pik.sequence; - } - - Slice key() const { - assert(type == ITERATOR); - return iter.key(); - } - - bool IsDeleteRangeSentinelKey() const { - if (type == Type::ITERATOR) { - return iter.IsDeleteRangeSentinelKey(); - } - return false; - } -}; - -class MinHeapItemComparator { - public: - MinHeapItemComparator(const InternalKeyComparator* comparator) - : comparator_(comparator) {} - bool operator()(HeapItem* a, HeapItem* b) const { - if (LIKELY(a->type == HeapItem::ITERATOR)) { - if (LIKELY(b->type == HeapItem::ITERATOR)) { - return comparator_->Compare(a->key(), b->key()) > 0; - } else { - return comparator_->Compare(a->key(), b->parsed_ikey) > 0; - } - } else { - if (LIKELY(b->type == HeapItem::ITERATOR)) { - return comparator_->Compare(a->parsed_ikey, b->key()) > 0; - } else { - return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) > 0; - } - } - } - - private: - const InternalKeyComparator* comparator_; -}; - -class MaxHeapItemComparator { - public: - MaxHeapItemComparator(const InternalKeyComparator* comparator) - : comparator_(comparator) {} - bool operator()(HeapItem* a, HeapItem* b) const { - if (LIKELY(a->type == HeapItem::ITERATOR)) { - if (LIKELY(b->type == HeapItem::ITERATOR)) { - return comparator_->Compare(a->key(), b->key()) < 0; - } else { - return comparator_->Compare(a->key(), b->parsed_ikey) < 0; - } - } else { - if (LIKELY(b->type == HeapItem::ITERATOR)) { - return comparator_->Compare(a->parsed_ikey, b->key()) < 0; - } else { - return comparator_->Compare(a->parsed_ikey, b->parsed_ikey) < 0; - } - } - } - - private: - const InternalKeyComparator* comparator_; -}; -// Without anonymous namespace here, we fail the warning -Wmissing-prototypes -namespace { -using MergerMinIterHeap = BinaryHeap; -using MergerMaxIterHeap = BinaryHeap; -} // namespace - +// The following are implementation details and can be ignored by user. +// For merging iterator to process range tombstones, it treats the start and end +// keys of a range tombstone as two keys and put them into minHeap_ or maxHeap_ +// together with regular point keys. Each range tombstone is active only within +// its internal key range [start_key, end_key). An `active_` set is used to +// track levels that have an active range tombstone. Take forward scanning +// for example. Level j is in active_ if its current range tombstone has its +// start_key popped from minHeap_ and its end_key in minHeap_. If the top of +// minHeap_ is a point key from level L, we can determine if the point key is +// covered by any range tombstone by checking if there is an l <= L in active_. +// The case of l == L also involves checking range tombstone's sequence number. 
+// +// The following (non-exhaustive) list of invariants are maintained by +// MergingIterator during forward scanning. After each InternalIterator API, +// i.e., Seek*() and Next(), and FindNextVisibleKey(), if minHeap_ is not empty: +// (1) minHeap_.top().type == ITERATOR +// (2) minHeap_.top()->key() is not covered by any range tombstone. +// +// After each call to SeekImpl() in addition to the functions mentioned above: +// (3) For all level i and j <= i, range_tombstone_iters_[j].prev.end_key() < +// children_[i].iter.key(). That is, range_tombstone_iters_[j] is at or before +// the first range tombstone from level j with end_key() > +// children_[i].iter.key(). +// (4) For all level i and j <= i, if j in active_, then +// range_tombstone_iters_[j]->start_key() < children_[i].iter.key(). +// - When range_tombstone_iters_[j] is !Valid(), we consider its `prev` to be +// the last range tombstone from that range tombstone iterator. +// - When referring to range tombstone start/end keys, assume it is the value of +// HeapItem::tombstone_pik. This value has op_type = kMaxValid, which makes +// range tombstone keys have distinct values from point keys. +// +// Applicable class variables have their own (forward scanning) invariants +// listed in the comments above their definition. class MergingIterator : public InternalIterator { public: MergingIterator(const InternalKeyComparator* comparator, @@ -136,7 +60,7 @@ class MergingIterator : public InternalIterator { direction_(kForward), comparator_(comparator), current_(nullptr), - minHeap_(comparator_), + minHeap_(MinHeapItemComparator(comparator_)), pinned_iters_mgr_(nullptr), iterate_upper_bound_(iterate_upper_bound) { children_.resize(n); @@ -162,30 +86,26 @@ class MergingIterator : public InternalIterator { current_ = nullptr; } - // Merging iterator can optionally process range tombstones: if a key is - // covered by a range tombstone, the merging iterator will not output it but - // skip it. - // - // Add the next range tombstone iterator to this merging iterator. - // There must be either no range tombstone iterator, or same number of - // range tombstone iterators as point iterators after all range tombstone - // iters are added. The i-th added range tombstone iterator and the i-th point - // iterator must point to the same sorted run. - // Merging iterator takes ownership of the range tombstone iterator and - // is responsible for freeing it. Note that during Iterator::Refresh() - // and when a level iterator moves to a different SST file, the range - // tombstone iterator could be updated. In that case, the merging iterator - // is only responsible to freeing the new range tombstone iterator - // that it has pointers to in range_tombstone_iters_. + // There must be either no range tombstone iterator or the same number of + // range tombstone iterators as point iterators after all iters are added. + // The i-th added range tombstone iterator and the i-th point iterator + // must point to the same LSM level. + // Merging iterator takes ownership of `iter` and is responsible for freeing + // it. One exception to this is when a LevelIterator moves to a different SST + // file or when Iterator::Refresh() is called, the range tombstone iterator + // could be updated. In that case, this merging iterator is only responsible + // for freeing the new range tombstone iterator that it has pointers to in + // range_tombstone_iters_. 
void AddRangeTombstoneIterator(TruncatedRangeDelIterator* iter) { range_tombstone_iters_.emplace_back(iter); } // Called by MergingIteratorBuilder when all point iterators and range // tombstone iterators are added. Initializes HeapItems for range tombstone - // iterators so that no further allocation is needed for HeapItem. + // iterators. void Finish() { if (!range_tombstone_iters_.empty()) { + assert(range_tombstone_iters_.size() == children_.size()); pinned_heap_item_.resize(range_tombstone_iters_.size()); for (size_t i = 0; i < range_tombstone_iters_.size(); ++i) { pinned_heap_item_[i].level = i; @@ -199,7 +119,7 @@ class MergingIterator : public InternalIterator { // TruncatedRangeDelIterator since untruncated tombstone end points // always have kMaxSequenceNumber and kTypeRangeDeletion (see // TruncatedRangeDelIterator::start_key()/end_key()). - pinned_heap_item_[i].parsed_ikey.type = kTypeMaxValid; + pinned_heap_item_[i].tombstone_pik.type = kTypeMaxValid; } } } @@ -215,18 +135,36 @@ class MergingIterator : public InternalIterator { status_.PermitUncheckedError(); } + void SetRangeDelReadSeqno(SequenceNumber read_seqno) override { + for (auto& child : children_) { + // This should only be needed for LevelIterator (iterators from L1+). + child.iter.SetRangeDelReadSeqno(read_seqno); + } + for (auto& child : range_tombstone_iters_) { + if (child) { + child->SetRangeDelReadSeqno(read_seqno); + } + } + } + bool Valid() const override { return current_ != nullptr && status_.ok(); } Status status() const override { return status_; } // Add range_tombstone_iters_[level] into min heap. // Updates active_ if the end key of a range tombstone is inserted. + // pinned_heap_items_[level].type is updated based on `start_key`. + // + // If range_tombstone_iters_[level] is after iterate_upper_bound_, + // it is removed from the heap. // @param start_key specifies which end point of the range tombstone to add. void InsertRangeTombstoneToMinHeap(size_t level, bool start_key = true, bool replace_top = false) { assert(!range_tombstone_iters_.empty() && range_tombstone_iters_[level]->Valid()); + // Maintains Invariant(phi) if (start_key) { + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START; ParsedInternalKey pik = range_tombstone_iters_[level]->start_key(); // iterate_upper_bound does not have timestamp if (iterate_upper_bound_ && @@ -241,15 +179,16 @@ class MergingIterator : public InternalIterator { return; } pinned_heap_item_[level].SetTombstoneKey(std::move(pik)); - pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START; + // Checks Invariant(active_) assert(active_.count(level) == 0); } else { // allow end key to go over upper bound (if present) since start key is // before upper bound and the range tombstone could still cover a // range before upper bound. 
+ // Maintains Invariant(active_) pinned_heap_item_[level].SetTombstoneKey( range_tombstone_iters_[level]->end_key()); - pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END; + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END; active_.insert(level); } if (replace_top) { @@ -269,12 +208,12 @@ class MergingIterator : public InternalIterator { if (end_key) { pinned_heap_item_[level].SetTombstoneKey( range_tombstone_iters_[level]->end_key()); - pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_END; + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_END; assert(active_.count(level) == 0); } else { pinned_heap_item_[level].SetTombstoneKey( range_tombstone_iters_[level]->start_key()); - pinned_heap_item_[level].type = HeapItem::DELETE_RANGE_START; + pinned_heap_item_[level].type = HeapItem::Type::DELETE_RANGE_START; active_.insert(level); } if (replace_top) { @@ -290,9 +229,12 @@ class MergingIterator : public InternalIterator { // so `active_` is updated accordingly. void PopDeleteRangeStart() { while (!minHeap_.empty() && - minHeap_.top()->type == HeapItem::DELETE_RANGE_START) { + minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_START) { TEST_SYNC_POINT_CALLBACK("MergeIterator::PopDeleteRangeStart", nullptr); - // insert end key of this range tombstone and updates active_ + // Invariant(rti) holds since + // range_tombstone_iters_[minHeap_.top()->level] is still valid, and + // parameter `replace_top` is set to true here to ensure only one such + // HeapItem is in minHeap_. InsertRangeTombstoneToMinHeap( minHeap_.top()->level, false /* start_key */, true /* replace_top */); } @@ -304,7 +246,7 @@ class MergingIterator : public InternalIterator { // so `active_` is updated accordingly. void PopDeleteRangeEnd() { while (!maxHeap_->empty() && - maxHeap_->top()->type == HeapItem::DELETE_RANGE_END) { + maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_END) { // insert start key of this range tombstone and updates active_ InsertRangeTombstoneToMaxHeap(maxHeap_->top()->level, false /* end_key */, true /* replace_top */); @@ -359,44 +301,26 @@ class MergingIterator : public InternalIterator { // Position this merging iterator at the first key >= target (internal key). // If range tombstones are present, keys covered by range tombstones are // skipped, and this merging iter points to the first non-range-deleted key >= - // target after Seek(). If !Valid() and status().ok() then end of the iterator - // is reached. - // - // Internally, this involves positioning all child iterators at the first key - // >= target. If range tombstones are present, we apply a similar - // optimization, cascading seek, as in Pebble - // (https://github.com/cockroachdb/pebble). Specifically, if there is a range - // tombstone [start, end) that covers the target user key at level L, then - // this range tombstone must cover the range [target key, end) in all levels > - // L. So for all levels > L, we can pretend the target key is `end`. This - // optimization is applied at each level and hence the name "cascading seek". - // After a round of (cascading) seeks, the top of the heap is checked to see - // if it is covered by a range tombstone (see FindNextVisibleKey() for more - // detail), and advanced if so. The process is repeated until a - // non-range-deleted key is at the top of the heap, or heap becomes empty. + // target after Seek(). If !Valid() and status().ok() then this iterator + // reaches the end. 
// - // As mentioned in comments above HeapItem, to make the checking of whether - // top of the heap is covered by some range tombstone efficient, we treat each - // range deletion [start, end) as two point keys and insert them into the same - // min/maxHeap_ where point iterators are. The set `active_` tracks the levels - // that have active range tombstones. If level L is in `active_`, and the - // point key at top of the heap is from level >= L, then the point key is - // within the internal key range of the range tombstone that - // range_tombstone_iters_[L] currently points to. For correctness reasoning, - // one invariant that Seek() (and every other public APIs Seek*(), - // Next/Prev()) guarantees is as follows. After Seek(), suppose `k` is the - // current key of level L's point iterator. Then for each range tombstone - // iterator at level <= L, it is at or before the first range tombstone with - // end key > `k`. This ensures that when level L's point iterator reaches top - // of the heap, `active_` is calculated correctly (it contains the covering - // range tombstone's level if there is one), since no range tombstone iterator - // was skipped beyond that point iterator's current key during Seek(). - // Next()/Prev() maintains a stronger version of this invariant where all - // range tombstone iterators from level <= L are *at* the first range - // tombstone with end key > `k`. + // If range tombstones are present, cascading seeks may be called (an + // optimization adapted from Pebble https://github.com/cockroachdb/pebble). + // Roughly, if there is a range tombstone [start, end) that covers the + // target user key at level L, then this range tombstone must cover the range + // [target key, end) in all levels > L. So for all levels > L, we can pretend + // the target key is `end`. This optimization is applied at each level and + // hence the name "cascading seek". void Seek(const Slice& target) override { - assert(range_tombstone_iters_.empty() || - range_tombstone_iters_.size() == children_.size()); + // Define LevelNextVisible(i, k) to be the first key >= k in level i that is + // not covered by any range tombstone. + // After SeekImpl(target, 0), invariants (3) and (4) hold. + // For all level i, target <= children_[i].iter.key() <= LevelNextVisible(i, + // target). By the contract of FindNextVisibleKey(), Invariants (1)-(4) + // holds after this call, and minHeap_.top().iter points to the + // first key >= target among children_ that is not covered by any range + // tombstone. + status_ = Status::OK(); SeekImpl(target); FindNextVisibleKey(); @@ -410,6 +334,7 @@ class MergingIterator : public InternalIterator { void SeekForPrev(const Slice& target) override { assert(range_tombstone_iters_.empty() || range_tombstone_iters_.size() == children_.size()); + status_ = Status::OK(); SeekForPrevImpl(target); FindPrevVisibleKey(); @@ -424,7 +349,7 @@ class MergingIterator : public InternalIterator { assert(Valid()); // Ensure that all children are positioned after key(). // If we are moving in the forward direction, it is already - // true for all of the non-current children since current_ is + // true for all the non-current children since current_ is // the smallest child and key() == current_->key(). 
if (direction_ != kForward) {
// The loop advanced all non-current children to be > key() so current_
@@ -448,6 +373,12 @@ class MergingIterator : public InternalIterator {
considerStatus(current_->status());
minHeap_.pop();
}
+ // Invariants (3) and (4) hold after advancing current_.
+ // Let k be the smallest key among children_[i].iter.key().
+ // k <= children_[i].iter.key() <= LevelNextVisible(i, k) holds for all
+ // level i. After FindNextVisibleKey(), Invariants (1)-(4) hold and
+ // minHeap_.top()->key() is the first key >= k from any children_ that is
+ // not covered by any range tombstone.
FindNextVisibleKey();
current_ = CurrentForward();
}
@@ -467,7 +398,7 @@ class MergingIterator : public InternalIterator {
assert(Valid());
// Ensure that all children are positioned before key().
// If we are moving in the reverse direction, it is already
- // true for all of the non-current children since current_ is
+ // true for all the non-current children since current_ is
// the largest child and key() == current_->key().
if (direction_ != kReverse) {
// Otherwise, retreat the non-current children. We retreat current_
@@ -518,7 +449,6 @@ class MergingIterator : public InternalIterator {
// Here we simply relay MayBeOutOfLowerBound/MayBeOutOfUpperBound result
// from current child iterator. Potentially as long as one of child iterator
// report out of bound is not possible, we know current key is within bound.
-
bool MayBeOutOfLowerBound() override {
assert(Valid());
return current_->MayBeOutOfLowerBound();
@@ -549,20 +479,108 @@ class MergingIterator : public InternalIterator {
}
private:
+ // Represents an element in the min/max heap. Each HeapItem corresponds to a
+ // point iterator or a range tombstone iterator, differentiated by
+ // HeapItem::type.
+ struct HeapItem {
+ HeapItem() = default;
+
+ // corresponding point iterator
+ IteratorWrapper iter;
+ size_t level = 0;
+ // corresponding range tombstone iterator's start or end key value
+ // depending on value of `type`.
+ ParsedInternalKey tombstone_pik;
+ // Will be overwritten before use, initialize here so compiler does not
+ // complain.
+ enum class Type { ITERATOR, DELETE_RANGE_START, DELETE_RANGE_END };
+ Type type = Type::ITERATOR;
+
+ explicit HeapItem(size_t _level, InternalIteratorBase<Slice>* _iter)
+ : level(_level), type(Type::ITERATOR) {
+ iter.Set(_iter);
+ }
+
+ void SetTombstoneKey(ParsedInternalKey&& pik) {
+ // op_type is already initialized in MergingIterator::Finish().
+ tombstone_pik.user_key = pik.user_key;
+ tombstone_pik.sequence = pik.sequence;
+ }
+ };
+
+ class MinHeapItemComparator {
+ public:
+ explicit MinHeapItemComparator(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(HeapItem* a, HeapItem* b) const {
+ if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->iter.key(), b->iter.key()) > 0;
+ } else {
+ return comparator_->Compare(a->iter.key(), b->tombstone_pik) > 0;
+ }
+ } else {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->tombstone_pik, b->iter.key()) > 0;
+ } else {
+ return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) > 0;
+ }
+ }
+ }
+
+ private:
+ const InternalKeyComparator* comparator_;
+ };
+
+ class MaxHeapItemComparator {
+ public:
+ explicit MaxHeapItemComparator(const InternalKeyComparator* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(HeapItem* a, HeapItem* b) const {
+ if (LIKELY(a->type == HeapItem::Type::ITERATOR)) {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->iter.key(), b->iter.key()) < 0;
+ } else {
+ return comparator_->Compare(a->iter.key(), b->tombstone_pik) < 0;
+ }
+ } else {
+ if (LIKELY(b->type == HeapItem::Type::ITERATOR)) {
+ return comparator_->Compare(a->tombstone_pik, b->iter.key()) < 0;
+ } else {
+ return comparator_->Compare(a->tombstone_pik, b->tombstone_pik) < 0;
+ }
+ }
+ }
+
+ private:
+ const InternalKeyComparator* comparator_;
+ };
+
+ using MergerMinIterHeap = BinaryHeap<HeapItem*, MinHeapItemComparator>;
+ using MergerMaxIterHeap = BinaryHeap<HeapItem*, MaxHeapItemComparator>;
+
+ friend class MergeIteratorBuilder;
// Clears heaps for both directions, used when changing direction or seeking
void ClearHeaps(bool clear_active = true);
// Ensures that maxHeap_ is initialized when starting to go in the reverse
// direction
void InitMaxHeap();
-
- // Advance this merging iterator until the current key (top of min heap) is
- // not covered by any range tombstone or that there is no more keys (heap is
- // empty). After this call, if Valid(), current_ points to the next key that
- // is not covered by any range tombstone.
+ // Advance this merging iterator until the current key (minHeap_.top()) is
+ // from a point iterator and is not covered by any range tombstone,
+ // or there are no more keys (heap is empty). SeekImpl() may be called
+ // to seek to the end of a range tombstone as an optimization.
void FindNextVisibleKey();
void FindPrevVisibleKey();
+ // Advance this merging iterator to the first key >= `target` for all
+ // components from levels >= starting_level. All iterators before
+ // starting_level are untouched.
+ //
+ // @param range_tombstone_reseek Whether target is some range tombstone
+ // end, i.e., whether this SeekImpl() call is a part of a "cascading seek".
+ // This is used only for recording relevant perf_context.
void SeekImpl(const Slice& target, size_t starting_level = 0,
bool range_tombstone_reseek = false);
@@ -577,40 +595,59 @@ class MergingIterator : public InternalIterator {
enum Direction : uint8_t { kForward, kReverse };
Direction direction_;
const InternalKeyComparator* comparator_;
- // We could also use an autovector with a larger reserved size.
// HeapItem for all child point iterators.
+ // Invariant(children_): children_[i] is in minHeap_ iff
+ // children_[i].iter.Valid(), and at most one children_[i] is in minHeap_.
+ // TODO: We could use an autovector with a larger reserved size.
std::vector<HeapItem> children_;
- // HeapItem for range tombstone start and end keys. Each range tombstone
- // iterator will have at most one side (start key or end key) in a heap
- // at the same time, so this vector will be of size children_.size();
- // pinned_heap_item_[i] corresponds to the start key and end key HeapItem
- // for range_tombstone_iters_[i].
+ // HeapItem for range tombstone start and end keys.
+ // pinned_heap_item_[i] corresponds to range_tombstone_iters_[i].
+ // Invariant(phi): If range_tombstone_iters_[i]->Valid(),
+ // pinned_heap_item_[i].tombstone_pik is equal to
+ // range_tombstone_iters_[i]->start_key() when
+ // pinned_heap_item_[i].type is DELETE_RANGE_START and
+ // range_tombstone_iters_[i]->end_key() when
+ // pinned_heap_item_[i].type is DELETE_RANGE_END (ignoring op_type which is
+ // kMaxValid for all pinned_heap_item_.tombstone_pik).
+ // pinned_heap_item_[i].type is either DELETE_RANGE_START or DELETE_RANGE_END.
std::vector<HeapItem> pinned_heap_item_;
// range_tombstone_iters_[i] contains range tombstones in the sorted run that
// corresponds to children_[i]. range_tombstone_iters_.empty() means not
// handling range tombstones in merging iterator. range_tombstone_iters_[i] ==
// nullptr means the sorted run of children_[i] does not have range
// tombstones.
+ // Invariant(rti): pinned_heap_item_[i] is in minHeap_ iff
+ // range_tombstone_iters_[i]->Valid() and at most one pinned_heap_item_[i] is
+ // in minHeap_.
std::vector<TruncatedRangeDelIterator*> range_tombstone_iters_;
// Levels (indices into range_tombstone_iters_/children_ ) that currently have
- // "active" range tombstones. See comments above Seek() for meaning of
- // "active".
+ // "active" range tombstones. See comments above MergingIterator for meaning
+ // of "active".
+ // Invariant(active_): i is in active_ iff range_tombstone_iters_[i]->Valid()
+ // and pinned_heap_item_[i].type == DELETE_RANGE_END.
std::set<size_t> active_;
bool SkipNextDeleted();
+
bool SkipPrevDeleted();
- // Cached pointer to child iterator with the current key, or nullptr if no
- // child iterators are valid. This is the top of minHeap_ or maxHeap_
- // depending on the direction.
+ // Invariant: at the end of each InternalIterator API,
+ // current_ points to minHeap_.top().iter (maxHeap_ if backward scanning)
+ // or nullptr if no child iterator is valid.
+ // This follows from the fact that current_ = CurrentForward()/CurrentReverse()
+ // is called at the end of each InternalIterator API.
IteratorWrapper* current_;
// If any of the children have non-ok status, this is one of them.
Status status_;
+ // Invariant: min heap property is maintained (parent is always <= child).
+ // This holds by using only BinaryHeap APIs to modify heap. One
+ // exception is to modify heap top item directly (by caller iter->Next()), and
+ // it should be followed by a call to replace_top() or pop().
MergerMinIterHeap minHeap_;
// Max heap is used for reverse iteration, which is way less common than
- // forward. Lazily initialize it to save memory.
+ // forward. Lazily initialize it to save memory.
std::unique_ptr<MergerMaxIterHeap> maxHeap_;
PinnedIteratorsManager* pinned_iters_mgr_;
@@ -634,25 +671,93 @@ class MergingIterator : public InternalIterator {
IteratorWrapper* CurrentForward() const {
assert(direction_ == kForward);
- assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::ITERATOR);
+ assert(minHeap_.empty() ||
+ minHeap_.top()->type == HeapItem::Type::ITERATOR);
return !minHeap_.empty() ?
&minHeap_.top()->iter : nullptr; } IteratorWrapper* CurrentReverse() const { assert(direction_ == kReverse); assert(maxHeap_); - assert(maxHeap_->empty() || maxHeap_->top()->type == HeapItem::ITERATOR); + assert(maxHeap_->empty() || + maxHeap_->top()->type == HeapItem::Type::ITERATOR); return !maxHeap_->empty() ? &maxHeap_->top()->iter : nullptr; } }; -// Seek to fist key >= target key (internal key) for children_[starting_level:]. -// Cascading seek optimizations are applied if range tombstones are present (see -// comment above Seek() for more). +// Pre-condition: +// - Invariants (3) and (4) hold for i < starting_level +// - For i < starting_level, range_tombstone_iters_[i].prev.end_key() < +// `target`. +// - For i < starting_level, if i in active_, then +// range_tombstone_iters_[i]->start_key() < `target`. +// +// Post-condition: +// - Invariants (3) and (4) hold for all level i. +// - (*) target <= children_[i].iter.key() <= LevelNextVisible(i, target) +// for i >= starting_level +// - (**) target < pinned_heap_item_[i].tombstone_pik if +// range_tombstone_iters_[i].Valid() for i >= starting_level +// +// Proof sketch: +// Invariant (3) holds for all level i. +// For j <= i < starting_level, it follows from Pre-condition that (3) holds +// and that SeekImpl(-, starting_level) does not update children_[i] or +// range_tombstone_iters_[j]. +// For j < starting_level and i >= starting_level, it follows from +// - Pre-condition that range_tombstone_iters_[j].prev.end_key() < `target` +// - range_tombstone_iters_[j] is not updated in SeekImpl(), and +// - children_[i].iter.Seek(current_search_key) is called with +// current_search_key >= target (shown below). +// When current_search_key is updated, it is updated to some +// range_tombstone_iter->end_key() after +// range_tombstone_iter->SeekInternalKey(current_search_key) was called. So +// current_search_key increases if updated and >= target. +// For starting_level <= j <= i: +// children_[i].iter.Seek(k1) and range_tombstone_iters_[j]->SeekInternalKey(k2) +// are called in SeekImpl(). Seek(k1) positions children_[i] at the first key >= +// k1 from level i. SeekInternalKey(k2) positions range_tombstone_iters_[j] at +// the first range tombstone from level j with end_key() > k2. It suffices to +// show that k1 >= k2. Since k1 and k2 are values of current_search_key where +// k1 = k2 or k1 is value of a later current_search_key than k2, so k1 >= k2. +// +// Invariant (4) holds for all level >= 0. +// By Pre-condition Invariant (4) holds for i < starting_level. +// Since children_[i], range_tombstone_iters_[i] and contents of active_ for +// i < starting_level do not change (4) holds for j <= i < starting_level. +// By Pre-condition: for all j < starting_level, if j in active_, then +// range_tombstone_iters_[j]->start_key() < target. For i >= starting_level, +// children_[i].iter.Seek(k) is called for k >= target. So +// children_[i].iter.key() >= target > range_tombstone_iters_[j]->start_key() +// for j < starting_level and i >= starting_level. So invariant (4) holds for +// j < starting_level and i >= starting_level. +// For starting_level <= j <= i, j is added to active_ only if +// - range_tombstone_iters_[j]->SeekInternalKey(k1) was called +// - range_tombstone_iters_[j]->start_key() <= k1 +// Since children_[i].iter.Seek(k2) is called for some k2 >= k1 and for all +// starting_level <= j <= i, (4) also holds for all starting_level <= j <= i. 
+// +// Post-condition (*): target <= children_[i].iter.key() <= LevelNextVisible(i, +// target) for i >= starting_level. +// target <= children_[i].iter.key() follows from that Seek() is called on some +// current_search_key >= target for children_[i].iter. If current_search_key +// is updated from k1 to k2 when level = i, we show that the range [k1, k2) is +// not visible for children_[j] for any j > i. When current_search_key is +// updated from k1 to k2, +// - range_tombstone_iters_[i]->SeekInternalKey(k1) was called +// - range_tombstone_iters_[i]->Valid() +// - range_tombstone_iters_[i]->start_key().user_key <= k1.user_key +// - k2 = range_tombstone_iters_[i]->end_key() +// We assume that range_tombstone_iters_[i]->start_key() has a higher sequence +// number compared to any key from levels > i that has the same user key. So no +// point key from levels > i in range [k1, k2) is visible. So +// children_[i].iter.key() <= LevelNextVisible(i, target). // -// @param range_tombstone_reseek Whether target is some range tombstone -// end, i.e., whether this SeekImpl() call is a part of a "cascading seek". This -// is used only for recoding relevant perf_context. +// Post-condition (**) target < pinned_heap_item_[i].tombstone_pik for i >= +// starting_level if range_tombstone_iters_[i].Valid(). This follows from that +// SeekInternalKey() being called for each range_tombstone_iters_ with some key +// >= `target` and that we pick start/end key that is > `target` to insert to +// minHeap_. void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, bool range_tombstone_reseek) { // active range tombstones before `starting_level` remain active @@ -665,6 +770,7 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, // TODO: perhaps we could save some upheap cost by add all child iters first // and then do a single heapify. + // Invariant(children_) for level < starting_level for (size_t level = 0; level < starting_level; ++level) { PERF_TIMER_GUARD(seek_min_heap_time); AddToMinHeapOrCheckStatus(&children_[level]); @@ -677,15 +783,20 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, // - If `level` is in active_, then range_tombstone_iters_[level]->Valid() // and pinned_heap_item_[level] is of type RANGE_DELETION_END. for (size_t level = 0; level < starting_level; ++level) { + // Restores Invariants(rti), (phi) and (active_) for level < + // starting_level if (range_tombstone_iters_[level] && range_tombstone_iters_[level]->Valid()) { // use an iterator on active_ if performance becomes an issue here if (active_.count(level) > 0) { - assert(pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_END); + assert(pinned_heap_item_[level].type == + HeapItem::Type::DELETE_RANGE_END); // if it was active, then start key must be within upper_bound, // so we can add to minHeap_ directly. minHeap_.push(&pinned_heap_item_[level]); } else { + assert(pinned_heap_item_[level].type == + HeapItem::Type::DELETE_RANGE_START); // this takes care of checking iterate_upper_bound, but with an extra // key comparison if range_tombstone_iters_[level] was already out of // bound. 
Consider using a new HeapItem type or some flag to remember @@ -701,7 +812,6 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, active_.erase(active_.lower_bound(starting_level), active_.end()); } - status_ = Status::OK(); IterKey current_search_key; current_search_key.SetInternalKey(target, false /* copy */); // Seek target might change to some range tombstone end key, so @@ -728,45 +838,37 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, } auto range_tombstone_iter = range_tombstone_iters_[level]; if (range_tombstone_iter) { - range_tombstone_iter->Seek(current_search_key.GetUserKey()); + range_tombstone_iter->SeekInternalKey( + current_search_key.GetInternalKey()); + // Invariants (rti) and (phi) if (range_tombstone_iter->Valid()) { - // insert the range tombstone end that is closer to and >= - // current_search_key. Strictly speaking, since the Seek() call above - // is on user key, it is possible that range_tombstone_iter->end_key() - // < current_search_key. This can happen when range_tombstone_iter is - // truncated and range_tombstone_iter.largest_ has the same user key - // as current_search_key.GetUserKey() but with a larger sequence - // number than current_search_key. Correctness is not affected as this - // tombstone end key will be popped during FindNextVisibleKey(). + // If range tombstone starts after `current_search_key`, + // we should insert start key to heap as the range tombstone is not + // active yet. InsertRangeTombstoneToMinHeap( level, comparator_->Compare(range_tombstone_iter->start_key(), pik) > 0 /* start_key */); - // current_search_key < end_key guaranteed by the Seek() and Valid() - // calls above. Only interested in user key coverage since older - // sorted runs must have smaller sequence numbers than this range - // tombstone. + // current_search_key < end_key guaranteed by the SeekInternalKey() + // and Valid() calls above. Here we only need to compare user_key + // since if target.user_key == + // range_tombstone_iter->start_key().user_key and target < + // range_tombstone_iter->start_key(), no older level would have any + // key in range [target, range_tombstone_iter->start_key()], so no + // keys in range [target, range_tombstone_iter->end_key()) from older + // level would be visible. So it is safe to seek to + // range_tombstone_iter->end_key(). // // TODO: range_tombstone_iter->Seek() finds the max covering // sequence number, can make it cheaper by not looking for max. if (comparator_->user_comparator()->Compare( range_tombstone_iter->start_key().user_key, current_search_key.GetUserKey()) <= 0) { - // Since range_tombstone_iter->Valid(), seqno should be valid, so - // there is no need to check it. range_tombstone_reseek = true; - // Current target user key is covered by this range tombstone. - // All older sorted runs will seek to range tombstone end key. // Note that for prefix seek case, it is possible that the prefix // is not the same as the original target, it should not affect // correctness. Besides, in most cases, range tombstone start and // end key should have the same prefix? - // If range_tombstone_iter->end_key() is truncated to its largest_ - // boundary, the timestamp in user_key will not be max timestamp, - // but the timestamp of `range_tombstone_iter.largest_`. This should - // be fine here as current_search_key is used to Seek into lower - // levels. 
- current_search_key.SetInternalKey( - range_tombstone_iter->end_key().user_key, kMaxSequenceNumber); + current_search_key.SetInternalKey(range_tombstone_iter->end_key()); } } } @@ -818,6 +920,8 @@ void MergingIterator::SeekImpl(const Slice& target, size_t starting_level, // and `active_` is updated accordingly. // See FindNextVisibleKey() for more detail on internal implementation // of advancing child iters. +// When false is returned, if minHeap is not empty, then minHeap_.top().type +// == ITERATOR // // REQUIRES: // - min heap is currently not empty, and iter is in kForward direction. @@ -828,15 +932,19 @@ bool MergingIterator::SkipNextDeleted() { // - file boundary sentinel keys // - range deletion end key auto current = minHeap_.top(); - if (current->type == HeapItem::DELETE_RANGE_END) { + if (current->type == HeapItem::Type::DELETE_RANGE_END) { + // Invariant(active_): range_tombstone_iters_[current->level] is about to + // become !Valid() or that its start key is going to be added to minHeap_. active_.erase(current->level); assert(range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()); range_tombstone_iters_[current->level]->Next(); + // Maintain Invariants (rti) and (phi) if (range_tombstone_iters_[current->level]->Valid()) { InsertRangeTombstoneToMinHeap(current->level, true /* start_key */, true /* replace_top */); } else { + // TruncatedRangeDelIterator does not have status minHeap_.pop(); } return true /* current key deleted */; @@ -847,41 +955,65 @@ bool MergingIterator::SkipNextDeleted() { // SetTombstoneKey()). assert(ExtractValueType(current->iter.key()) != kTypeRangeDeletion || active_.count(current->level) == 0); - // When entering a new file, old range tombstone iter is freed, - // but the last key from that range tombstone iter may still be in the heap. - // We need to ensure the data underlying its corresponding key Slice is - // still alive. We do so by popping the range tombstone key from heap before - // calling iter->Next(). Technically, this change is not needed: if there is - // a range tombstone end key that is after file boundary sentinel key in - // minHeap_, the range tombstone end key must have been truncated at file - // boundary. The underlying data of the range tombstone end key Slice is the - // SST file's largest internal key stored as file metadata in Version. - // However, since there are too many implicit assumptions made, it is safer - // to just ensure range tombstone iter is still alive. + // When entering a new file, range tombstone iter from the old file is + // freed, but the last key from that range tombstone iter may still be in + // the heap. We need to ensure the data underlying its corresponding key + // Slice is still alive. We do so by popping the range tombstone key from + // heap before calling iter->Next(). Technically, this change is not needed: + // if there is a range tombstone end key that is after file boundary + // sentinel key in minHeap_, the range tombstone end key must have been + // truncated at file boundary. The underlying data of the range tombstone + // end key Slice is the SST file's largest internal key stored as file + // metadata in Version. However, since there are too many implicit + // assumptions made, it is safer to just ensure range tombstone iter is + // still alive. minHeap_.pop(); // Remove last SST file's range tombstone end key if there is one. 
// This means file boundary is before range tombstone end key, // which could happen when a range tombstone and a user key // straddle two SST files. Note that in TruncatedRangeDelIterator // constructor, parsed_largest.sequence is decremented 1 in this case. - if (!minHeap_.empty() && minHeap_.top()->level == current->level && - minHeap_.top()->type == HeapItem::DELETE_RANGE_END) { - minHeap_.pop(); - active_.erase(current->level); + // Maintains Invariant(rti) that at most one + // pinned_heap_item_[current->level] is in minHeap_. + if (range_tombstone_iters_[current->level] && + range_tombstone_iters_[current->level]->Valid()) { + if (!minHeap_.empty() && minHeap_.top()->level == current->level) { + assert(minHeap_.top()->type == HeapItem::Type::DELETE_RANGE_END); + minHeap_.pop(); + // Invariant(active_): we are about to enter a new SST file with new + // range_tombstone_iters[current->level]. Either it is !Valid() or its + // start key is going to be added to minHeap_. + active_.erase(current->level); + } else { + // range tombstone is still valid, but it is not on heap. + // This should only happen if the range tombstone is over iterator + // upper bound. + assert(iterate_upper_bound_ && + comparator_->user_comparator()->CompareWithoutTimestamp( + range_tombstone_iters_[current->level]->start_key().user_key, + true /* a_has_ts */, *iterate_upper_bound_, + false /* b_has_ts */) >= 0); + } } // LevelIterator enters a new SST file current->iter.Next(); + // Invariant(children_): current is popped from heap and added back only if + // it is valid if (current->iter.Valid()) { assert(current->iter.status().ok()); minHeap_.push(current); + } else { + // TODO(cbi): check status and early return if non-ok. + considerStatus(current->iter.status()); } + // Invariants (rti) and (phi) if (range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()) { InsertRangeTombstoneToMinHeap(current->level); } return true /* current key deleted */; } - assert(current->type == HeapItem::ITERATOR); + assert(current->type == HeapItem::Type::ITERATOR); // Point key case: check active_ for range tombstone coverage. 
ParsedInternalKey pik; ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); @@ -908,9 +1040,11 @@ bool MergingIterator::SkipNextDeleted() { if (pik.sequence < range_tombstone_iters_[current->level]->seq()) { // covered by range tombstone current->iter.Next(); + // Invariant (children_) if (current->iter.Valid()) { minHeap_.replace_top(current); } else { + considerStatus(current->iter.status()); minHeap_.pop(); } return true /* current key deleted */; @@ -927,7 +1061,7 @@ bool MergingIterator::SkipNextDeleted() { } // we can reach here only if active_ is empty assert(active_.empty()); - assert(minHeap_.top()->type == HeapItem::ITERATOR); + assert(minHeap_.top()->type == HeapItem::Type::ITERATOR); return false /* current key not deleted */; } @@ -951,7 +1085,8 @@ void MergingIterator::SeekForPrevImpl(const Slice& target, if (range_tombstone_iters_[level] && range_tombstone_iters_[level]->Valid()) { assert(static_cast(active_.count(level)) == - (pinned_heap_item_[level].type == HeapItem::DELETE_RANGE_START)); + (pinned_heap_item_[level].type == + HeapItem::Type::DELETE_RANGE_START)); maxHeap_->push(&pinned_heap_item_[level]); } else { assert(!active_.count(level)); @@ -961,7 +1096,6 @@ void MergingIterator::SeekForPrevImpl(const Slice& target, active_.erase(active_.lower_bound(starting_level), active_.end()); } - status_ = Status::OK(); IterKey current_search_key; current_search_key.SetInternalKey(target, false /* copy */); // Seek target might change to some range tombstone end key, so @@ -1056,7 +1190,7 @@ bool MergingIterator::SkipPrevDeleted() { // - file boundary sentinel keys // - range deletion start key auto current = maxHeap_->top(); - if (current->type == HeapItem::DELETE_RANGE_START) { + if (current->type == HeapItem::Type::DELETE_RANGE_START) { active_.erase(current->level); assert(range_tombstone_iters_[current->level] && range_tombstone_iters_[current->level]->Valid()); @@ -1074,7 +1208,7 @@ bool MergingIterator::SkipPrevDeleted() { maxHeap_->pop(); // Remove last SST file's range tombstone key if there is one. if (!maxHeap_->empty() && maxHeap_->top()->level == current->level && - maxHeap_->top()->type == HeapItem::DELETE_RANGE_START) { + maxHeap_->top()->type == HeapItem::Type::DELETE_RANGE_START) { maxHeap_->pop(); active_.erase(current->level); } @@ -1082,6 +1216,8 @@ bool MergingIterator::SkipPrevDeleted() { if (current->iter.Valid()) { assert(current->iter.status().ok()); maxHeap_->push(current); + } else { + considerStatus(current->iter.status()); } if (range_tombstone_iters_[current->level] && @@ -1090,7 +1226,7 @@ bool MergingIterator::SkipPrevDeleted() { } return true /* current key deleted */; } - assert(current->type == HeapItem::ITERATOR); + assert(current->type == HeapItem::Type::ITERATOR); // Point key case: check active_ for range tombstone coverage. 
ParsedInternalKey pik; ParseInternalKey(current->iter.key(), &pik, false).PermitUncheckedError(); @@ -1124,6 +1260,7 @@ bool MergingIterator::SkipPrevDeleted() { if (current->iter.Valid()) { maxHeap_->replace_top(current); } else { + considerStatus(current->iter.status()); maxHeap_->pop(); } return true /* current key deleted */; @@ -1136,11 +1273,12 @@ bool MergingIterator::SkipPrevDeleted() { } assert(active_.empty()); - assert(maxHeap_->top()->type == HeapItem::ITERATOR); + assert(maxHeap_->top()->type == HeapItem::Type::ITERATOR); return false /* current key not deleted */; } void MergingIterator::AddToMinHeapOrCheckStatus(HeapItem* child) { + // Invariant(children_) if (child->iter.Valid()) { assert(child->iter.status().ok()); minHeap_.push(child); @@ -1164,6 +1302,7 @@ void MergingIterator::AddToMaxHeapOrCheckStatus(HeapItem* child) { // Advance all range tombstones iters, including the one corresponding to // current_, to the first tombstone with end_key > current_.key(). // TODO: potentially do cascading seek here too +// TODO: show that invariants hold void MergingIterator::SwitchToForward() { ClearHeaps(); Slice target = key(); @@ -1177,7 +1316,7 @@ void MergingIterator::SwitchToForward() { if (child.iter.status() == Status::TryAgain()) { continue; } - if (child.iter.Valid() && comparator_->Equal(target, child.key())) { + if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { assert(child.iter.status().ok()); child.iter.Next(); } @@ -1188,7 +1327,7 @@ void MergingIterator::SwitchToForward() { for (auto& child : children_) { if (child.iter.status() == Status::TryAgain()) { child.iter.Seek(target); - if (child.iter.Valid() && comparator_->Equal(target, child.key())) { + if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { assert(child.iter.status().ok()); child.iter.Next(); } @@ -1239,7 +1378,7 @@ void MergingIterator::SwitchToBackward() { if (&child.iter != current_) { child.iter.SeekForPrev(target); TEST_SYNC_POINT_CALLBACK("MergeIterator::Prev:BeforePrev", &child); - if (child.iter.Valid() && comparator_->Equal(target, child.key())) { + if (child.iter.Valid() && comparator_->Equal(target, child.iter.key())) { assert(child.iter.status().ok()); child.iter.Prev(); } @@ -1297,32 +1436,201 @@ void MergingIterator::ClearHeaps(bool clear_active) { void MergingIterator::InitMaxHeap() { if (!maxHeap_) { - maxHeap_ = std::make_unique(comparator_); + maxHeap_ = + std::make_unique(MaxHeapItemComparator(comparator_)); } } -// Repeatedly check and remove heap top key if it is not a point key -// that is not covered by range tombstones. SeekImpl() is called to seek to end -// of a range tombstone if the heap top is a point key covered by some range -// tombstone from a newer sorted run. If the covering tombstone is from current -// key's level, then the current child iterator is simply advanced to its next -// key without reseeking. +// Assume there is a next key that is not covered by range tombstone. +// Pre-condition: +// - Invariants (3) and (4) +// - There is some k where k <= children_[i].iter.key() <= LevelNextVisible(i, +// k) for all levels i (LevelNextVisible() defined in Seek()). +// +// Define NextVisible(k) to be the first key >= k from among children_ that +// is not covered by any range tombstone. 
+// Post-condition: +// - Invariants (1)-(4) hold +// - (*): minHeap_->top()->key() == NextVisible(k) +// +// Loop invariants: +// - Invariants (3) and (4) +// - (*): k <= children_[i].iter.key() <= LevelNextVisible(i, k) +// +// Progress: minHeap_.top()->key() is non-decreasing and strictly increases in +// a finite number of iterations. +// TODO: it is possible to call SeekImpl(k2) after SeekImpl(k1) with +// k2 < k1 in the same FindNextVisibleKey(). For example, l1 has a range +// tombstone [2,3) and l2 has a range tombstone [1, 4). Point key 1 from l5 +// triggers SeekImpl(4 /* target */, 5). Then point key 2 from l3 triggers +// SeekImpl(3 /* target */, 3). +// Ideally we should only move iterators forward in SeekImpl(), and the +// progress condition can be made simpler: iterator only moves forward. +// +// Proof sketch: +// Post-condition: +// Invariant (1) holds when this method returns: +// Ignoring the empty minHeap_ case, there are two cases: +// Case 1: active_ is empty and !minHeap_.top()->iter.IsDeleteRangeSentinelKey() +// By invariants (rti) and (active_), active_ being empty means if a +// pinned_heap_item_[i] is in minHeap_, it has type DELETE_RANGE_START. Note +// that PopDeleteRangeStart() was called right before the while loop condition, +// so minHeap_.top() is not of type DELETE_RANGE_START. So minHeap_.top() must +// be of type ITERATOR. +// Case 2: SkipNextDeleted() returns false. The method returns false only when +// minHeap_.top().type == ITERATOR. +// +// Invariant (2) holds when this method returns: +// From Invariant (1), minHeap_.top().type == ITERATOR. Suppose it is +// children_[i] for some i. Suppose that children_[i].iter.key() is covered by +// some range tombstone. This means there is a j <= i and a range tombstone from +// level j with start_key() < children_[i].iter.key() < end_key(). +// - If range_tombstone_iters_[j]->Valid(), by Invariants (rti) and (phi), +// pinned_heap_item_[j] is in minHeap_, and pinned_heap_item_[j].tombstone_pik +// is either start or end key of this range tombstone. If +// pinned_heap_item_[j].tombstone_pik < children_[i].iter.key(), it would be at +// top of minHeap_ which would contradict Invariant (1). So +// pinned_heap_item_[j].tombstone_pik > children_[i].iter.key(). +// By Invariant (3), range_tombstone_iters_[j].prev.end_key() < +// children_[i].iter.key(). We assume that in each level, range tombstones +// cover non-overlapping ranges. So range_tombstone_iters_[j] is at +// the range tombstone with start_key() < children_[i].iter.key() < end_key() +// and has its end_key() in minHeap_. By Invariants (phi) and (active_), +// j is in active_. From while loop condition, SkipNextDeleted() must have +// returned false for this method to return. +// - If j < i, then SeekImpl(range_tombstone_iters_[j']->end_key(), i) +// was called for some j' < i and j' in active_. Note that since j' is in +// active_, pinned_heap_item_[j'] is in minHeap_ and has tombstone_pik = +// range_tombstone_iters_[j']->end_key(). So +// range_tombstone_iters_[j']->end_key() must be larger than +// children_[i].iter.key() to not be at top of minHeap_. This means after +// SeekImpl(), children_[i] would be at a key > children_[i].iter.key() +// -- contradiction. +// - If j == i, children_[i]->Next() would have been called and children_[i] +// would be at a key > children_[i].iter.key() -- contradiction. +// - If !range_tombstone_iters_[j]->Valid(). 
Then range_tombstone_iters_[j] +// points to an SST file with all range tombstones from that file exhausted. +// The file must come before the file containing the first +// range tombstone with start_key() < children_[i].iter.key() < end_key(). +// Assuming files from the same level have non-overlapping ranges, the current file's +// meta.largest is less than children_[i].iter.key(). So the file boundary key, +// which has value meta.largest, must have been popped from minHeap_ before +// children_[i].iter.key(). So range_tombstone_iters_[j] would not point to +// this SST file -- contradiction. +// So it is impossible for children_[i].iter.key() to be covered by a range +// tombstone. +// +// Post-condition (*) holds when the function returns: +// From loop invariant (*) that k <= children_[i].iter.key() <= +// LevelNextVisible(i, k) and Invariant (2) above, when the function returns, +// minHeap_.top()->key() is the smallest LevelNextVisible(i, k) among all levels +// i. This is equal to NextVisible(k). +// +// Invariant (3) holds after each iteration: +// PopDeleteRangeStart() does not change range tombstone position. +// In SkipNextDeleted(): +// - If DELETE_RANGE_END is popped from minHeap_, it means the range +// tombstone's end key is < all other point keys, so it is safe to advance to +// next range tombstone. +// - If file boundary is popped (current->iter.IsDeleteRangeSentinelKey()), +// we assume that file's last range tombstone's +// end_key <= file boundary key < all other point keys. So it is safe to +// move to the first range tombstone in the next SST file. +// - If children_[i]->Next() is called, then it is fine as it is advancing a +// point iterator. +// - If SeekImpl(target, l) is called, then (3) follows from SeekImpl()'s +// post-condition if its pre-condition holds. The first pre-condition follows +// from the loop invariant that Invariant (3) holds for all levels i. +// Now we show that the second pre-condition holds. Since Invariant (3) holds for +// all i, we have for all j <= l, range_tombstone_iters_[j].prev.end_key() +// < children_[l].iter.key(). `target` is the value of +// range_tombstone_iters_[j'].end_key() for some j' < l and j' in active_. +// By Invariant (active_) and (rti), pinned_heap_item_[j'] is in minHeap_ and +// pinned_heap_item_[j'].tombstone_pik = range_tombstone_iters_[j'].end_key(). +// This end_key must be larger than children_[l].key() since it was not at top +// of minHeap_. So for all levels j <= l, +// range_tombstone_iters_[j].prev.end_key() < children_[l].iter.key() < target. +// +// Invariant (4) holds after each iteration: +// A level i is inserted into active_ during calls to PopDeleteRangeStart(). +// In that case, range_tombstone_iters_[i].start_key() < all point keys +// by heap property and the assumption that point keys and range tombstone keys +// are distinct. +// If SeekImpl(target, l) is called, then there is a range_tombstone_iters_[j] +// where target = range_tombstone_iters_[j]->end_key() and children_[l]->key() +// < target. By the loop invariants, (3) and (4) hold for all levels. +// Since target > children_[l]->key(), it also holds that for j < l, +// range_tombstone_iters_[j].prev.end_key() < target and that if j in active_, +// range_tombstone_iters_[j]->start_key() < target. So all pre-conditions of +// SeekImpl(target, l) hold, and (4) follows from its post-condition. +// All other places in this function either advance point iterators +// or remove some level from active_, so (4) still holds.
+// +// Loop Invariant (*): for all levels i, k <= children_[i] <= LevelNextVisible(i, +// k). +// k <= children_[i] follows from the loop `progress` condition. +// Consider when children_[i] is changed for any i. It is through +// children_[i].iter.Next() or SeekImpl() in SkipNextDeleted(). +// If children_[i].iter.Next() is called, there is a range tombstone from level +// i where tombstone seqno > children_[i].iter.key()'s seqno and i in active_. +// By Invariant (4), tombstone's start_key < children_[i].iter.key(). By +// invariants (active_), (phi), and (rti), tombstone's end_key is in minHeap_ +// and children_[i].iter.key() < end_key. So children_[i].iter.key() is +// not visible, and it is safe to call Next(). +// If SeekImpl(target, l) is called, by its contract, when SeekImpl() returns, +// target <= children_[i]->key() <= LevelNextVisible(i, target) for i >= l, +// and `target` is range_tombstone_iters_[j]->end_key() for some j < i and j is in active_. +// By Invariant (4), range_tombstone_iters_[j]->start_key() < +// children_[i].iter.key() for all i >= l. So for each level i >= l, the range +// [children_[i].iter.key(), target) is not visible. So after SeekImpl(), +// children_[i].iter.key() <= LevelNextVisible(i, target) <= +// LevelNextVisible(i, k). +// +// `Progress` holds for each iteration: +// Very sloppy intuition: +// - in PopDeleteRangeStart(): the value of a pinned_heap_item_.tombstone_pik_ +// is updated from the start key to the end key of the same range tombstone. +// We assume that start key <= end key for the same range tombstone. +// - in SkipNextDeleted(): +// - If the top of heap is DELETE_RANGE_END, the range tombstone is advanced +// and the relevant pinned_heap_item_.tombstone_pik is increased or popped +// from minHeap_. +// - If the top of heap is a file boundary key, then both point iter and +// range tombstone iter are advanced to the next file. +// - If the top of heap is ITERATOR and current->iter.Next() is called, it +// moves to a larger point key. +// - If the top of heap is ITERATOR and SeekImpl(k, l) is called, then all +// iterators from levels >= l are advanced to some key >= k by its contract. +// And top of minHeap_ before SeekImpl(k, l) was less than k. +// There are special cases where different heap items have the same key +// (e.g. when two range tombstone end keys share the same value). In +// these cases, iterators are being advanced, so the minimum key should increase +// in a finite number of steps. inline void MergingIterator::FindNextVisibleKey() { - // When active_ is empty, we know heap top cannot be a range tombstone end - // key. It cannot be a range tombstone start key per PopDeleteRangeStart(). PopDeleteRangeStart(); - while (!minHeap_.empty() && - (!active_.empty() || minHeap_.top()->IsDeleteRangeSentinelKey()) && - SkipNextDeleted()) { + // PopDeleteRangeStart() implies heap top is not DELETE_RANGE_START + // active_ being empty implies no DELETE_RANGE_END in heap. + // So minHeap_->top() must be of type ITERATOR.
+ while ( + !minHeap_.empty() && + (!active_.empty() || minHeap_.top()->iter.IsDeleteRangeSentinelKey()) && + SkipNextDeleted()) { PopDeleteRangeStart(); } + // Checks Invariant (1) + assert(minHeap_.empty() || minHeap_.top()->type == HeapItem::Type::ITERATOR); } inline void MergingIterator::FindPrevVisibleKey() { PopDeleteRangeEnd(); - while (!maxHeap_->empty() && - (!active_.empty() || maxHeap_->top()->IsDeleteRangeSentinelKey()) && - SkipPrevDeleted()) { + // PopDeleteRangeEnd() implies heap top is not DELETE_RANGE_END + // active_ being empty implies no DELETE_RANGE_START in heap. + // So maxHeap_->top() must be of type ITERATOR. + while ( + !maxHeap_->empty() && + (!active_.empty() || maxHeap_->top()->iter.IsDeleteRangeSentinelKey()) && + SkipPrevDeleted()) { PopDeleteRangeEnd(); } } diff --git a/table/merging_iterator.h b/table/merging_iterator.h index 16fc0877e522..562a4e57f506 100644 --- a/table/merging_iterator.h +++ b/table/merging_iterator.h @@ -12,6 +12,7 @@ #include "db/range_del_aggregator.h" #include "rocksdb/slice.h" #include "rocksdb/types.h" +#include "table/iterator_wrapper.h" namespace ROCKSDB_NAMESPACE { @@ -23,7 +24,7 @@ template class InternalIteratorBase; using InternalIterator = InternalIteratorBase; -// Return an iterator that provided the union of the data in +// Return an iterator that provides the union of the data in // children[0,n-1]. Takes ownership of the child iterators and // will delete them when the result iterator is deleted. // @@ -35,11 +36,15 @@ extern InternalIterator* NewMergingIterator( const InternalKeyComparator* comparator, InternalIterator** children, int n, Arena* arena = nullptr, bool prefix_seek_mode = false); +// The iterator returned by NewMergingIterator() and +// MergeIteratorBuilder::Finish(). MergingIterator handles the merging of data +// from different point and/or range tombstone iterators. class MergingIterator; -// A builder class to build a merging iterator by adding iterators one by one. -// User should call only one of AddIterator() or AddPointAndTombstoneIterator() -// exclusively for the same builder. +// A builder class for an iterator that provides the union of the data +// of its input iterators. Two APIs are provided to add input iterators. User should +// only call one of them exclusively, depending on whether range tombstones should be +// processed. class MergeIteratorBuilder { public: // comparator: the comparator used in merging comparator @@ -49,7 +54,7 @@ class MergeIteratorBuilder { const Slice* iterate_upper_bound = nullptr); ~MergeIteratorBuilder(); - // Add iter to the merging iterator. + // Add point key iterator `iter` to the merging iterator. void AddIterator(InternalIterator* iter); // Add a point key iterator and a range tombstone iterator.
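A minimal usage sketch of the two builder APIs described above, for orientation only; it is not part of this patch, and the exact constructor arguments, the tombstone-iterator parameter of AddPointAndTombstoneIterator(), and the surrounding variables icmp, arena, point_iters, tombstone_iters and process_range_tombstones are assumptions:

// Illustrative sketch: build one merging iterator over several sorted runs.
// Use AddIterator() when range tombstones are not processed, and
// AddPointAndTombstoneIterator() when they are; pass nullptr for runs that
// have no range tombstones (mirroring range_tombstone_iters_[i] == nullptr).
MergeIteratorBuilder builder(&icmp, &arena, false /* prefix_seek_mode */,
                             nullptr /* iterate_upper_bound */);
if (!process_range_tombstones) {
  for (InternalIterator* point_iter : point_iters) {
    builder.AddIterator(point_iter);
  }
} else {
  for (size_t i = 0; i < point_iters.size(); ++i) {
    builder.AddPointAndTombstoneIterator(
        point_iters[i], tombstone_iters[i] /* may be nullptr */);
  }
}
InternalIterator* merged = builder.Finish();  // the MergingIterator

Either path yields a single InternalIterator over the union of the children; the range-tombstone-aware path additionally skips covered point keys as described by the invariants above.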
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6530f6a80c49..2cbaacec08f3 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -27,6 +27,8 @@ namespace ROCKSDB_NAMESPACE { const std::string kPropertiesBlockName = "rocksdb.properties"; +// NB: only used with format_version >= 6 +const std::string kIndexBlockName = "rocksdb.index"; // Old property block name for backward compatibility const std::string kPropertiesBlockOldName = "rocksdb.stats"; const std::string kCompressionDictBlockName = "rocksdb.compression_dict"; @@ -115,6 +117,11 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, props.fast_compression_estimated_data_size); } + Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset); + if (props.user_defined_timestamps_persisted == 0) { + Add(TablePropertiesNames::kUserDefinedTimestampsPersisted, + props.user_defined_timestamps_persisted); + } if (!props.db_id.empty()) { Add(TablePropertiesNames::kDbId, props.db_id); } @@ -206,21 +213,25 @@ void NotifyCollectTableCollectorsOnBlockAdd( bool NotifyCollectTableCollectorsOnFinish( const std::vector>& collectors, - Logger* info_log, PropertyBlockBuilder* builder) { + Logger* info_log, PropertyBlockBuilder* builder, + UserCollectedProperties& user_collected_properties, + UserCollectedProperties& readable_properties) { bool all_succeeded = true; for (auto& collector : collectors) { - UserCollectedProperties user_collected_properties; Status s = collector->Finish(&user_collected_properties); - - all_succeeded = all_succeeded && s.ok(); - if (!s.ok()) { + if (s.ok()) { + for (const auto& prop : collector->GetReadableProperties()) { + readable_properties.insert(prop); + } + builder->Add(user_collected_properties); + } else { LogPropertiesCollectionError(info_log, "Finish" /* method */, collector->Name()); - } else { - builder->Add(user_collected_properties); + if (all_succeeded) { + all_succeeded = false; + } } } - return all_succeeded; } @@ -307,6 +318,10 @@ Status ReadTablePropertiesHelper( &new_table_properties->slow_compression_estimated_data_size}, {TablePropertiesNames::kFastCompressionEstimatedDataSize, &new_table_properties->fast_compression_estimated_data_size}, + {TablePropertiesNames::kTailStartOffset, + &new_table_properties->tail_start_offset}, + {TablePropertiesNames::kUserDefinedTimestampsPersisted, + &new_table_properties->user_defined_timestamps_persisted}, }; std::string last_key; @@ -386,8 +401,8 @@ Status ReadTablePropertiesHelper( // Modified version of BlockFetcher checksum verification // (See write_global_seqno comment above) if (s.ok() && footer.GetBlockTrailerSize() > 0) { - s = VerifyBlockChecksum(footer.checksum_type(), properties_block.data(), - block_size, file->file_name(), handle.offset()); + s = VerifyBlockChecksum(footer, properties_block.data(), block_size, + file->file_name(), handle.offset()); if (s.IsCorruption()) { if (new_table_properties->external_sst_file_global_seqno_offset != 0) { std::string tmp_buf(properties_block.data(), @@ -396,8 +411,8 @@ Status ReadTablePropertiesHelper( new_table_properties->external_sst_file_global_seqno_offset - handle.offset(); EncodeFixed64(&tmp_buf[static_cast(global_seqno_offset)], 0); - s = VerifyBlockChecksum(footer.checksum_type(), tmp_buf.data(), - block_size, file->file_name(), handle.offset()); + s = VerifyBlockChecksum(footer, tmp_buf.data(), block_size, + file->file_name(), handle.offset()); } } } @@ -412,20 +427,22 @@ Status 
ReadTablePropertiesHelper( Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, std::unique_ptr* properties, MemoryAllocator* memory_allocator, FilePrefetchBuffer* prefetch_buffer) { BlockHandle block_handle; Footer footer; - Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, - kPropertiesBlockName, &block_handle, - memory_allocator, prefetch_buffer, &footer); + Status s = + FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + read_options, kPropertiesBlockName, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!s.ok()) { return s; } if (!block_handle.IsNull()) { - s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file, + s = ReadTablePropertiesHelper(read_options, block_handle, file, prefetch_buffer, footer, ioptions, properties, memory_allocator); } else { @@ -473,14 +490,20 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, BlockContents* metaindex_contents, MemoryAllocator* memory_allocator, FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) { Footer footer; IOOptions opts; - auto s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, - file_size, &footer, table_magic_number); + Status s; + s = file->PrepareIOOptions(read_options, opts); + if (!s.ok()) { + return s; + } + s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, file_size, + &footer, table_magic_number); if (!s.ok()) { return s; } @@ -489,7 +512,7 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, } auto metaindex_handle = footer.metaindex_handle(); - return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + return BlockFetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), @@ -497,18 +520,16 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, .ReadBlockContents(); } -Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - MemoryAllocator* memory_allocator, - FilePrefetchBuffer* prefetch_buffer, - Footer* footer_out) { +Status FindMetaBlockInFile( + RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, + BlockHandle* block_handle, MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) { BlockContents metaindex_contents; auto s = ReadMetaIndexBlockInFile( - file, file_size, table_magic_number, ioptions, &metaindex_contents, - memory_allocator, prefetch_buffer, footer_out); + file, file_size, table_magic_number, ioptions, read_options, + &metaindex_contents, memory_allocator, prefetch_buffer, footer_out); if (!s.ok()) { return s; } @@ -526,6 +547,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockType 
block_type, BlockContents* contents, MemoryAllocator* memory_allocator) { @@ -535,15 +557,16 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockHandle block_handle; Footer footer; - Status status = FindMetaBlockInFile( - file, file_size, table_magic_number, ioptions, meta_block_name, - &block_handle, memory_allocator, prefetch_buffer, &footer); + Status status = + FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + read_options, meta_block_name, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!status.ok()) { return status; } - return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), - block_handle, contents, ioptions, false /* decompress */, + return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle, + contents, ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), PersistentCacheOptions::kEmpty, memory_allocator) diff --git a/table/meta_blocks.h b/table/meta_blocks.h index b867dd01d487..0a404dc9cf5f 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -32,6 +32,7 @@ struct TableProperties; // Meta block names for metaindex extern const std::string kPropertiesBlockName; +extern const std::string kIndexBlockName; extern const std::string kPropertiesBlockOldName; extern const std::string kCompressionDictBlockName; extern const std::string kRangeDelBlockName; @@ -97,9 +98,13 @@ void NotifyCollectTableCollectorsOnBlockAdd( // NotifyCollectTableCollectorsOnFinish() triggers the `Finish` event for all // property collectors. The collected properties will be added to `builder`. +// It will also populate `user_collected_properties` and `readable_properties` +// with the collected properties. bool NotifyCollectTableCollectorsOnFinish( const std::vector>& collectors, - Logger* info_log, PropertyBlockBuilder* builder); + Logger* info_log, PropertyBlockBuilder* builder, + UserCollectedProperties& user_collected_properties, + UserCollectedProperties& readable_properties); // Read table properties from a file using known BlockHandle. // @returns a status to indicate if the operation succeeded. 
On success, @@ -119,6 +124,7 @@ Status ReadTablePropertiesHelper( Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, std::unique_ptr* properties, MemoryAllocator* memory_allocator = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); @@ -139,6 +145,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockHandle* block_handle, MemoryAllocator* memory_allocator = nullptr, @@ -149,6 +156,7 @@ Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, BlockContents* block_contents, MemoryAllocator* memory_allocator = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr, @@ -161,6 +169,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/mock_table.cc b/table/mock_table.cc index 130889eaa80b..1823758e4468 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -41,12 +41,14 @@ class MockTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/, + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, TableReaderCaller /*caller*/) override { return 0; } - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + uint64_t ApproximateSize(const ReadOptions& /*read_options*/, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) override { return 0; } @@ -228,7 +230,13 @@ Status MockTableReader::Get(const ReadOptions&, const Slice& key, std::shared_ptr MockTableReader::GetTableProperties() const { - return std::shared_ptr(new TableProperties()); + TableProperties* tp = new TableProperties(); + tp->num_entries = table_.size(); + tp->num_range_deletions = 0; + tp->raw_key_size = 1; + tp->raw_value_size = 1; + + return std::shared_ptr(tp); } MockTableFactory::MockTableFactory() @@ -297,8 +305,7 @@ Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file, uint32_t* id) const { char buf[4]; Slice result; - Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr); assert(result.size() == 4); *id = DecodeFixed32(buf); return s; diff --git a/table/multiget_context.h b/table/multiget_context.h index 76027a9520ff..54af2262cb45 100644 --- a/table/multiget_context.h +++ b/table/multiget_context.h @@ -12,6 +12,7 @@ #include "db/lookup_key.h" #include "db/merge_context.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/types.h" #include "util/async_file_reader.h" @@ -21,6 +22,7 @@ namespace ROCKSDB_NAMESPACE { class GetContext; +class PinnableWideColumns; struct KeyContext { const 
Slice* key; @@ -36,11 +38,13 @@ struct KeyContext { bool is_blob_index; void* cb_arg; PinnableSlice* value; + PinnableWideColumns* columns; std::string* timestamp; GetContext* get_context; KeyContext(ColumnFamilyHandle* col_family, const Slice& user_key, - PinnableSlice* val, std::string* ts, Status* stat) + PinnableSlice* val, PinnableWideColumns* cols, std::string* ts, + Status* stat) : key(&user_key), lkey(nullptr), column_family(col_family), @@ -50,10 +54,9 @@ struct KeyContext { is_blob_index(false), cb_arg(nullptr), value(val), + columns(cols), timestamp(ts), get_context(nullptr) {} - - KeyContext() = default; }; // The MultiGetContext class is a container for the sorted list of keys that diff --git a/table/persistent_cache_helper.h b/table/persistent_cache_helper.h index ece339aeec11..cce4092cde43 100644 --- a/table/persistent_cache_helper.h +++ b/table/persistent_cache_helper.h @@ -6,7 +6,7 @@ #include -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "table/format.h" #include "table/persistent_cache_options.h" diff --git a/table/persistent_cache_options.h b/table/persistent_cache_options.h index b543ab3a3160..46f268797959 100644 --- a/table/persistent_cache_options.h +++ b/table/persistent_cache_options.h @@ -7,7 +7,7 @@ #include #include "cache/cache_key.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/persistent_cache.h" namespace ROCKSDB_NAMESPACE { diff --git a/table/plain/plain_table_builder.cc b/table/plain/plain_table_builder.cc index 04723955cf8a..24dd0f97ae44 100644 --- a/table/plain/plain_table_builder.cc +++ b/table/plain/plain_table_builder.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #include "table/plain/plain_table_builder.h" #include @@ -266,19 +265,24 @@ Status PlainTableBuilder::Finish() { PropertyBlockBuilder property_block_builder; // -- Add basic properties property_block_builder.AddTableProperty(properties_); - + // -- Add existing user collected properties property_block_builder.Add(properties_.user_collected_properties); - - // -- Add user collected properties + // -- Add more user collected properties + UserCollectedProperties more_user_collected_properties; NotifyCollectTableCollectorsOnFinish( - table_properties_collectors_, ioptions_.logger, &property_block_builder); + table_properties_collectors_, ioptions_.logger, &property_block_builder, + more_user_collected_properties, properties_.readable_properties); + properties_.user_collected_properties.insert( + more_user_collected_properties.begin(), + more_user_collected_properties.end()); // -- Write property block BlockHandle property_block_handle; - IOStatus s = WriteBlock(property_block_builder.Finish(), file_, &offset_, + io_status_ = WriteBlock(property_block_builder.Finish(), file_, &offset_, &property_block_handle); - if (!s.ok()) { - return static_cast(s); + if (!io_status_.ok()) { + status_ = io_status_; + return status_; } meta_index_builer.Add(kPropertiesBlockName, property_block_handle); @@ -294,8 +298,12 @@ Status PlainTableBuilder::Finish() { // Write Footer // no need to write out new footer if we're using default checksum FooterBuilder footer; - footer.Build(kPlainTableMagicNumber, /* format_version */ 0, offset_, - kNoChecksum, metaindex_block_handle); + Status s = footer.Build(kPlainTableMagicNumber, /* format_version */ 0, + offset_, kNoChecksum, metaindex_block_handle); + if (!s.ok()) { + status_ = s; + return status_; + } io_status_ = file_->Append(footer.GetSlice()); if (io_status_.ok()) { offset_ += footer.GetSlice().size(); @@ -334,4 +342,3 @@ void PlainTableBuilder::SetSeqnoTimeTableProperties(const std::string& string, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_builder.h b/table/plain/plain_table_builder.h index 445491c2ab4e..fb7ea63be507 100644 --- a/table/plain/plain_table_builder.h +++ b/table/plain/plain_table_builder.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -150,5 +149,3 @@ class PlainTableBuilder : public TableBuilder { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_factory.cc b/table/plain/plain_table_factory.cc index dfe5241a53e2..80aa9cb8e8a4 100644 --- a/table/plain/plain_table_factory.cc +++ b/table/plain/plain_table_factory.cc @@ -20,7 +20,6 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE static std::unordered_map plain_table_type_info = { {"user_key_len", {offsetof(struct PlainTableOptions, user_key_len), OptionType::kUInt32T, @@ -123,17 +122,6 @@ std::string PlainTableFactory::GetPrintableOptions() const { return ret; } -Status GetPlainTableOptionsFromString(const PlainTableOptions& table_options, - const std::string& opts_str, - PlainTableOptions* new_table_options) { - ConfigOptions config_options; - config_options.input_strings_escaped = false; - config_options.ignore_unknown_options = false; - config_options.invoke_prepare_options = false; - return GetPlainTableOptionsFromString(config_options, table_options, opts_str, - new_table_options); -} - Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, const PlainTableOptions& table_options, const
std::string& opts_str, @@ -153,9 +141,7 @@ Status GetPlainTableOptionsFromString(const ConfigOptions& config_options, return Status::InvalidArgument(s.getState()); } } -#endif // ROCKSDB_LITE -#ifndef ROCKSDB_LITE static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, const std::string& /*arg*/) { // The MemTableRepFactory built-in classes will be either a class @@ -232,7 +218,6 @@ static int RegisterBuiltinMemTableRepFactory(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE Status GetMemTableRepFactoryFromString( const std::string& opts_str, std::unique_ptr* result) { @@ -245,12 +230,10 @@ Status GetMemTableRepFactoryFromString( Status MemTableRepFactory::CreateFromString( const ConfigOptions& config_options, const std::string& value, std::unique_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinMemTableRepFactory(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE std::string id; std::unordered_map opt_map; Status status = Customizable::GetOptionsMap(config_options, result->get(), @@ -264,31 +247,8 @@ Status MemTableRepFactory::CreateFromString( } else if (id.empty()) { // We have no Id but have options. Not good return Status::NotSupported("Cannot reset object ", id); } else { -#ifndef ROCKSDB_LITE status = NewUniqueObject(config_options, id, opt_map, result); -#else - // To make it possible to configure the memtables in LITE mode, the ID - // is of the form :, where name is the name of the class and - // is the length of the object (e.g. skip_list:10). - std::vector opts_list = StringSplit(id, ':'); - if (opts_list.empty() || opts_list.size() > 2 || !opt_map.empty()) { - status = Status::InvalidArgument("Can't parse memtable_factory option ", - value); - } else if (opts_list[0] == SkipListFactory::kNickName() || - opts_list[0] == SkipListFactory::kClassName()) { - // Expecting format - // skip_list: - if (opts_list.size() == 2) { - size_t lookahead = ParseSizeT(opts_list[1]); - result->reset(new SkipListFactory(lookahead)); - } else { - result->reset(new SkipListFactory()); - } - } else if (!config_options.ignore_unsupported_options) { - status = Status::NotSupported("Cannot load object in LITE mode ", id); - } -#endif // ROCKSDB_LITE } return status; } @@ -304,19 +264,6 @@ Status MemTableRepFactory::CreateFromString( return s; } -#ifndef ROCKSDB_LITE -Status GetPlainTableOptionsFromMap( - const PlainTableOptions& table_options, - const std::unordered_map& opts_map, - PlainTableOptions* new_table_options, bool input_strings_escaped, - bool ignore_unknown_options) { - ConfigOptions config_options; - config_options.input_strings_escaped = input_strings_escaped; - config_options.ignore_unknown_options = ignore_unknown_options; - return GetPlainTableOptionsFromMap(config_options, table_options, opts_map, - new_table_options); -} - Status GetPlainTableOptionsFromMap( const ConfigOptions& config_options, const PlainTableOptions& table_options, const std::unordered_map& opts_map, @@ -346,5 +293,4 @@ const std::string PlainTablePropertyNames::kBloomVersion = const std::string PlainTablePropertyNames::kNumBloomBlocks = "rocksdb.plain.table.bloom.numblocks"; -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/table/plain/plain_table_factory.h b/table/plain/plain_table_factory.h index ce60b9d1990e..a47418af69fc 100644 --- a/table/plain/plain_table_factory.h +++ b/table/plain/plain_table_factory.h @@ -5,7 +5,6 @@ 
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -179,4 +178,3 @@ class PlainTableFactory : public TableFactory { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_index.cc b/table/plain/plain_table_index.cc index b7e07cfb2244..c85176d6ca78 100644 --- a/table/plain/plain_table_index.cc +++ b/table/plain/plain_table_index.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "table/plain/plain_table_index.h" #include @@ -210,4 +209,3 @@ const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = "PlainTableIndexBlock"; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_index.h b/table/plain/plain_table_index.h index 9f5f0eeff1e2..0adb6417d95e 100644 --- a/table/plain/plain_table_index.h +++ b/table/plain/plain_table_index.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -245,4 +244,3 @@ class PlainTableIndexBuilder { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_key_coding.cc b/table/plain/plain_table_key_coding.cc index 800d8d76fbc2..0ac42319103e 100644 --- a/table/plain/plain_table_key_coding.cc +++ b/table/plain/plain_table_key_coding.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "table/plain/plain_table_key_coding.h" #include @@ -216,8 +215,7 @@ bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len, // TODO: rate limit plain table reads. Status s = file_info_->file->Read(IOOptions(), file_offset, size_to_read, - &read_result, new_buffer->buf.get(), nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + &read_result, new_buffer->buf.get(), nullptr); if (!s.ok()) { status_ = s; return false; @@ -506,4 +504,3 @@ Status PlainTableKeyDecoder::NextKeyNoValue(uint32_t start_offset, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LIT diff --git a/table/plain/plain_table_key_coding.h b/table/plain/plain_table_key_coding.h index 9cda7df32367..fdef22482513 100644 --- a/table/plain/plain_table_key_coding.h +++ b/table/plain/plain_table_key_coding.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -198,4 +197,3 @@ class PlainTableKeyDecoder { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 6ce3d0ab9943..a74da1f89521 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -3,7 +3,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #include "table/plain/plain_table_reader.h" @@ -127,8 +126,10 @@ Status PlainTableReader::Open( } std::unique_ptr props; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props); + ioptions, read_options, &props); if (!s.ok()) { return s; } @@ -283,9 +284,9 @@ void PlainTableReader::FillBloom(const std::vector& prefix_hashes) { Status PlainTableReader::MmapDataIfNeeded() { if (file_info_.is_mmap_mode) { // Get mmapped memory. 
- return file_info_.file->Read( - IOOptions(), 0, static_cast(file_size_), &file_info_.file_data, - nullptr, nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + return file_info_.file->Read(IOOptions(), 0, + static_cast(file_size_), + &file_info_.file_data, nullptr, nullptr); } return Status::OK(); } @@ -298,10 +299,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, assert(props != nullptr); BlockContents index_block_contents; - Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, - file_size_, kPlainTableMagicNumber, ioptions_, - PlainTableIndexBuilder::kPlainTableIndexBlock, - BlockType::kIndex, &index_block_contents); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + Status s = + ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, read_options, + PlainTableIndexBuilder::kPlainTableIndexBlock, + BlockType::kIndex, &index_block_contents); bool index_in_file = s.ok(); @@ -311,8 +316,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, BlockType::kFilter, - &bloom_block_contents); + read_options, BloomBlockBuilder::kBloomBlock, + BlockType::kFilter, &bloom_block_contents); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -615,12 +620,14 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) { +uint64_t PlainTableReader::ApproximateOffsetOf( + const ReadOptions& /*read_options*/, const Slice& /*key*/, + TableReaderCaller /*caller*/) { return 0; } -uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, +uint64_t PlainTableReader::ApproximateSize(const ReadOptions& /* read_options*/, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) { return 0; @@ -762,4 +769,3 @@ Slice PlainTableIterator::value() const { Status PlainTableIterator::status() const { return status_; } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 62bda693aebb..0f5f7f3ce0ef 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -93,11 +92,12 @@ class PlainTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key, + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) override; - uint64_t ApproximateSize(const Slice& start, const Slice& end, - TableReaderCaller caller) override; + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, TableReaderCaller caller) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; @@ -241,4 +241,3 @@ class PlainTableReader : public TableReader { void operator=(const TableReader&) = delete; }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index 3357099e8297..150776de1b2e 100644 --- a/table/sst_file_dumper.cc +++ 
b/table/sst_file_dumper.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "table/sst_file_dumper.h" @@ -17,6 +16,8 @@ #include "db/blob/blob_index.h" #include "db/memtable.h" +#include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "db/write_batch_internal.h" #include "options/cf_options.h" #include "port/port.h" @@ -37,6 +38,7 @@ #include "table/table_reader.h" #include "util/compression.h" #include "util/random.h" +#include "util/udt_util.h" namespace ROCKSDB_NAMESPACE { @@ -110,8 +112,7 @@ Status SstFileDumper::GetTableReader(const std::string& file_path) { uint64_t prefetch_off = file_size - prefetch_size; IOOptions opts; s = prefetch_buffer.Prefetch(opts, file_.get(), prefetch_off, - static_cast(prefetch_size), - Env::IO_TOTAL /* rate_limiter_priority */); + static_cast(prefetch_size)); s = ReadFooterFromFile(opts, file_.get(), *fs, &prefetch_buffer, file_size, &footer); @@ -166,10 +167,19 @@ Status SstFileDumper::NewTableReader( const ImmutableOptions& /*ioptions*/, const EnvOptions& /*soptions*/, const InternalKeyComparator& /*internal_comparator*/, uint64_t file_size, std::unique_ptr* /*table_reader*/) { - auto t_opt = - TableReaderOptions(ioptions_, moptions_.prefix_extractor, soptions_, - internal_comparator_, false /* skip_filters */, - false /* imortal */, true /* force_direct_prefetch */); + // TODO(yuzhangyu): full support in sst_dump for SST files generated when + // `user_defined_timestamps_persisted` is false. + auto t_opt = TableReaderOptions( + ioptions_, moptions_.prefix_extractor, soptions_, internal_comparator_, + 0 /* block_protection_bytes_per_key */, false /* skip_filters */, + false /* immortal */, true /* force_direct_prefetch */, -1 /* level */, + nullptr /* block_cache_tracer */, 0 /* max_file_size_for_l0_meta_pin */, + "" /* cur_db_session_id */, 0 /* cur_file_num */, {} /* unique_id */, + 0 /* largest_seqno */, 0 /* tail_size */, + table_properties_ == nullptr + ? true + : static_cast( + table_properties_->user_defined_timestamps_persisted)); // Allow open file with global sequence number for backward compatibility. t_opt.largest_seqno = kMaxSequenceNumber; @@ -188,6 +198,7 @@ Status SstFileDumper::NewTableReader( } Status SstFileDumper::VerifyChecksum() { + assert(read_options_.verify_checksums); // We could pass specific readahead setting into read options if needed. return table_reader_->VerifyChecksum(read_options_, TableReaderCaller::kSSTDumpTool); @@ -316,7 +327,8 @@ Status SstFileDumper::ShowCompressionSize( const uint64_t compressed_blocks = opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_COMPRESSED); const uint64_t not_compressed_blocks = - opts.statistics->getAndResetTickerCount(NUMBER_BLOCK_NOT_COMPRESSED); + opts.statistics->getAndResetTickerCount( + NUMBER_BLOCK_COMPRESSION_REJECTED); // When the option enable_index_compression is true, // NUMBER_BLOCK_COMPRESSED is incremented for index block(s). 
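The switch above to `NUMBER_BLOCK_COMPRESSION_REJECTED` (from `NUMBER_BLOCK_NOT_COMPRESSED`) is also visible through user-facing statistics. A minimal sketch of reading the ticker, assuming statistics were attached before opening the DB and some data was flushed:

```cpp
#include "rocksdb/statistics.h"

std::shared_ptr<rocksdb::Statistics> stats = rocksdb::CreateDBStatistics();
rocksdb::Options options;
options.statistics = stats;
// ... open a DB with `options`, write some keys, and flush ...
uint64_t compressed = stats->getTickerCount(rocksdb::NUMBER_BLOCK_COMPRESSED);
uint64_t rejected =
    stats->getTickerCount(rocksdb::NUMBER_BLOCK_COMPRESSION_REJECTED);
```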
if ((compressed_blocks + not_compressed_blocks) > num_data_blocks) { @@ -356,8 +368,11 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, RandomAccessFileReader* file, uint64_t file_size, FilePrefetchBuffer* prefetch_buffer) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s = ROCKSDB_NAMESPACE::ReadTableProperties( - file, file_size, table_magic_number, ioptions_, &table_properties_, + file, file_size, table_magic_number, ioptions_, read_options, + &table_properties_, /* memory_allocator= */ nullptr, prefetch_buffer); if (!s.ok()) { if (!silent_) { @@ -444,10 +459,20 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, read_options_, moptions_.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kSSTDumpTool); + + const Comparator* ucmp = internal_comparator_.user_comparator(); + size_t ts_sz = ucmp->timestamp_size(); + + Slice from_slice = from_key; + Slice to_slice = to_key; + std::string from_key_buf, to_key_buf; + auto [from, to] = MaybeAddTimestampsToRange( + has_from ? &from_slice : nullptr, has_to ? &to_slice : nullptr, ts_sz, + &from_key_buf, &to_key_buf); uint64_t i = 0; - if (has_from) { + if (from.has_value()) { InternalKey ikey; - ikey.SetMinPossibleForUserKey(from_key); + ikey.SetMinPossibleForUserKey(from.value()); iter->Seek(ikey.Encode()); } else { iter->SeekToFirst(); @@ -471,15 +496,29 @@ Status SstFileDumper::ReadSequential(bool print_kv, uint64_t read_num, } // If end marker was specified, we stop before it - if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + if (to.has_value() && ucmp->Compare(ikey.user_key, to.value()) >= 0) { break; } if (print_kv) { if (!decode_blob_index_ || ikey.type != kTypeBlobIndex) { - fprintf(stdout, "%s => %s\n", - ikey.DebugString(true, output_hex_).c_str(), - value.ToString(output_hex_).c_str()); + if (ikey.type == kTypeWideColumnEntity) { + std::ostringstream oss; + const Status s = WideColumnsHelper::DumpSliceAsWideColumns( + iter->value(), oss, output_hex_); + if (!s.ok()) { + fprintf(stderr, "%s => error deserializing wide columns\n", + ikey.DebugString(true, output_hex_).c_str()); + continue; + } + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + oss.str().c_str()); + } else { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(true, output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } } else { BlobIndex blob_index; @@ -515,5 +554,3 @@ Status SstFileDumper::ReadTableProperties( return init_result_; } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/table/sst_file_dumper.h b/table/sst_file_dumper.h index 7be8763909ae..1e78959d145e 100644 --- a/table/sst_file_dumper.h +++ b/table/sst_file_dumper.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -98,4 +97,3 @@ class SstFileDumper { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 48f1be0be8ff..533b7cd6ac7a 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
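Before the `sst_file_reader.cc` hunk continues, a user-level sketch of the API it touches: the asserts added below require that the `ReadOptions` passed to `NewIterator()` and `VerifyChecksum()` leave `io_activity` at its default (`Env::IOActivity::kUnknown`), and the test change below also checks `iter->status()` after iteration. The file path here is hypothetical.

```cpp
#include <memory>

#include "rocksdb/sst_file_reader.h"

rocksdb::Options options;
rocksdb::SstFileReader sst_reader(options);
rocksdb::Status s = sst_reader.Open("/tmp/example.sst");  // hypothetical path
if (s.ok()) {
  rocksdb::ReadOptions ro;  // io_activity stays at kUnknown
  s = sst_reader.VerifyChecksum(ro);
}
if (s.ok()) {
  std::unique_ptr<rocksdb::Iterator> it(
      sst_reader.NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // consume it->key() / it->value()
  }
  s = it->status();  // surface any iteration error, as the test now does
}
```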
-#ifndef ROCKSDB_LITE #include "rocksdb/sst_file_reader.h" @@ -57,7 +56,8 @@ Status SstFileReader::Open(const std::string& file_path) { } if (s.ok()) { TableReaderOptions t_opt(r->ioptions, r->moptions.prefix_extractor, - r->soptions, r->ioptions.internal_comparator); + r->soptions, r->ioptions.internal_comparator, + r->moptions.block_protection_bytes_per_key); // Allow open file with global sequence number for backward compatibility. t_opt.largest_seqno = kMaxSequenceNumber; s = r->options.table_factory->NewTableReader(t_opt, std::move(file_reader), @@ -67,6 +67,7 @@ Status SstFileReader::Open(const std::string& file_path) { } Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { + assert(roptions.io_activity == Env::IOActivity::kUnknown); auto r = rep_.get(); auto sequence = roptions.snapshot != nullptr ? roptions.snapshot->GetSequenceNumber() @@ -92,10 +93,9 @@ std::shared_ptr SstFileReader::GetTableProperties() } Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + assert(read_options.io_activity == Env::IOActivity::kUnknown); return rep_->table_reader->VerifyChecksum(read_options, TableReaderCaller::kSSTFileReader); } } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/table/sst_file_reader_test.cc b/table/sst_file_reader_test.cc index 4837d223bab8..36a7975cfcdf 100644 --- a/table/sst_file_reader_test.cc +++ b/table/sst_file_reader_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/sst_file_reader.h" @@ -305,6 +304,7 @@ class SstFileReaderTimestampTest : public testing::Test { } ASSERT_FALSE(iter->Valid()); + ASSERT_OK(iter->status()); } protected: @@ -422,13 +422,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as SstFileReader is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 273c2fc4a7a4..3364e1e016e7 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -9,6 +9,8 @@ #include "db/db_impl/db_impl.h" #include "db/dbformat.h" +#include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "file/writable_file_writer.h" #include "rocksdb/file_system.h" #include "rocksdb/table.h" @@ -23,7 +25,6 @@ const std::string ExternalSstFilePropertyNames::kVersion = const std::string ExternalSstFilePropertyNames::kGlobalSeqno = "rocksdb.external_sst_file.global_seqno"; -#ifndef ROCKSDB_LITE const size_t kFadviseTrigger = 1024 * 1024; // 1MB @@ -86,7 +87,8 @@ struct SstFileWriter::Rep { assert(value_type == kTypeValue || value_type == kTypeMerge || value_type == kTypeDeletion || - value_type == kTypeDeletionWithTimestamp); + value_type == kTypeDeletionWithTimestamp || + value_type == kTypeWideColumnEntity); constexpr SequenceNumber sequence_number = 0; @@ -135,10 +137,36 @@ struct SstFileWriter::Rep { return AddImpl(user_key_with_ts, value, value_type); } + Status AddEntity(const Slice& user_key, const WideColumns& columns) { + WideColumns sorted_columns(columns); + WideColumnsHelper::SortColumns(sorted_columns); + + std::string entity; + const Status s = WideColumnSerialization::Serialize(sorted_columns, entity); + if (!s.ok()) { + return s; + } + if (entity.size() > size_t{std::numeric_limits::max()}) { + return Status::InvalidArgument("wide column 
entity is too large"); + } + return Add(user_key, entity, kTypeWideColumnEntity); + } + Status DeleteRangeImpl(const Slice& begin_key, const Slice& end_key) { if (!builder) { return Status::InvalidArgument("File is not opened"); } + int cmp = internal_comparator.user_comparator()->CompareWithoutTimestamp( + begin_key, end_key); + if (cmp > 0) { + // It's an empty range where endpoints appear mistaken. Don't bother + // applying it to the DB, and return an error to the user. + return Status::InvalidArgument("end key comes before start key"); + } else if (cmp == 0) { + // It's an empty range. Don't bother applying it to the DB. + return Status::OK(); + } + RangeTombstone tombstone(begin_key, end_key, 0 /* Sequence Number */); if (file_info.num_range_del_entries == 0) { file_info.smallest_range_del_key.assign(tombstone.start_key_.data(), @@ -367,6 +395,11 @@ Status SstFileWriter::Put(const Slice& user_key, const Slice& timestamp, return rep_->Add(user_key, timestamp, value, ValueType::kTypeValue); } +Status SstFileWriter::PutEntity(const Slice& user_key, + const WideColumns& columns) { + return rep_->AddEntity(user_key, columns); +} + Status SstFileWriter::Merge(const Slice& user_key, const Slice& value) { return rep_->Add(user_key, value, ValueType::kTypeMerge); } @@ -430,6 +463,5 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { } uint64_t SstFileWriter::FileSize() { return rep_->file_info.file_size; } -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_builder.h b/table/table_builder.h index 1790f33b1b3c..d6f0e1a03c93 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -37,12 +37,13 @@ struct TableReaderOptions { const std::shared_ptr& _prefix_extractor, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, - bool _skip_filters = false, bool _immortal = false, - bool _force_direct_prefetch = false, int _level = -1, - BlockCacheTracer* const _block_cache_tracer = nullptr, + uint8_t _block_protection_bytes_per_key, bool _skip_filters = false, + bool _immortal = false, bool _force_direct_prefetch = false, + int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr, size_t _max_file_size_for_l0_meta_pin = 0, const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0, - UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0) + UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0, + uint64_t _tail_size = 0, bool _user_defined_timestamps_persisted = true) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -56,7 +57,10 @@ struct TableReaderOptions { max_file_size_for_l0_meta_pin(_max_file_size_for_l0_meta_pin), cur_db_session_id(_cur_db_session_id), cur_file_num(_cur_file_num), - unique_id(_unique_id) {} + unique_id(_unique_id), + block_protection_bytes_per_key(_block_protection_bytes_per_key), + tail_size(_tail_size), + user_defined_timestamps_persisted(_user_defined_timestamps_persisted) {} const ImmutableOptions& ioptions; const std::shared_ptr& prefix_extractor; @@ -86,6 +90,13 @@ struct TableReaderOptions { // Known unique_id or {}, kNullUniqueId64x2 means unknown UniqueId64x2 unique_id; + + uint8_t block_protection_bytes_per_key; + + uint64_t tail_size; + + // Whether the key in the table contains user-defined timestamps. + bool user_defined_timestamps_persisted; }; struct TableBuilderOptions { @@ -197,6 +208,8 @@ class TableBuilder { // is enabled. 
virtual uint64_t EstimatedFileSize() const { return FileSize(); } + virtual uint64_t GetTailSize() const { return 0; } + // If the user defined table properties collector suggest the file to // be further compacted. virtual bool NeedCompact() const { return false; } diff --git a/table/table_factory.cc b/table/table_factory.cc index fc5c5ccde477..29b6c89f81c5 100644 --- a/table/table_factory.cc +++ b/table/table_factory.cc @@ -16,7 +16,6 @@ namespace ROCKSDB_NAMESPACE { static void RegisterTableFactories(const std::string& /*arg*/) { -#ifndef ROCKSDB_LITE static std::once_flag loaded; std::call_once(loaded, []() { auto library = ObjectLibrary::Default(); @@ -42,24 +41,12 @@ static void RegisterTableFactories(const std::string& /*arg*/) { return guard->get(); }); }); -#endif // ROCKSDB_LITE -} - -static bool LoadFactory(const std::string& name, - std::shared_ptr* factory) { - if (name == TableFactory::kBlockBasedTableName()) { - factory->reset(new BlockBasedTableFactory()); - return true; - } else { - return false; - } } Status TableFactory::CreateFromString(const ConfigOptions& config_options, const std::string& value, std::shared_ptr* factory) { RegisterTableFactories(""); - return LoadSharedObject(config_options, value, LoadFactory, - factory); + return LoadSharedObject(config_options, value, factory); } } // namespace ROCKSDB_NAMESPACE diff --git a/table/table_properties.cc b/table/table_properties.cc index b382281f8573..17a13543de85 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -109,6 +109,10 @@ std::string TableProperties::ToString(const std::string& prop_delim, AppendProperty(result, "comparator name", comparator_name.empty() ? std::string("N/A") : comparator_name, prop_delim, kv_delim); + AppendProperty(result, "user defined timestamps persisted", + user_defined_timestamps_persisted ? std::string("true") + : std::string("false"), + prop_delim, kv_delim); AppendProperty( result, "merge operator name", @@ -303,6 +307,10 @@ const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = "rocksdb.sample_for_compression.fast.data.size"; const std::string TablePropertiesNames::kSequenceNumberTimeMapping = "rocksdb.seqno.time.map"; +const std::string TablePropertiesNames::kTailStartOffset = + "rocksdb.tail.start.offset"; +const std::string TablePropertiesNames::kUserDefinedTimestampsPersisted = + "rocksdb.user.defined.timestamps.persisted"; #ifndef NDEBUG // WARNING: TEST_SetRandomTableProperties assumes the following layout of diff --git a/table/table_reader.h b/table/table_reader.h index 391072eec1bc..87610f4fed1d 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -60,11 +60,17 @@ class TableReader { size_t compaction_readahead_size = 0, bool allow_unprepared_value = false) = 0; + // read_options.snapshot needs to outlive this call. virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( const ReadOptions& /*read_options*/) { return nullptr; } + virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator( + SequenceNumber /* read_seqno */, const Slice* /* timestamp */) { + return nullptr; + } + // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were // present in the file). The returned value is in terms of file @@ -76,7 +82,8 @@ class TableReader { // function and letting ApproximateSize take optional start and end, so // that absolute start and end can be specified and optimized without // key / index work. 
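The comment block above leads into the signature change for the size-estimation entry points: both now take a `ReadOptions` first (see the next hunk, and the matching `PlainTableReader` overrides earlier). A caller-side sketch of the internal interface, with `reader`, `ikey1`, and `ikey2` assumed to be a `TableReader*` and two encoded internal keys:

```cpp
// Per the TODOs in the source, a real Env::IOActivity is not plumbed yet, so
// call sites currently pass a default-constructed ReadOptions.
const rocksdb::ReadOptions read_options;
uint64_t offset = reader->ApproximateOffsetOf(
    read_options, ikey1, rocksdb::TableReaderCaller::kUncategorized);
uint64_t size = reader->ApproximateSize(
    read_options, ikey1, ikey2, rocksdb::TableReaderCaller::kUncategorized);
```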
- virtual uint64_t ApproximateOffsetOf(const Slice& key, + virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) = 0; // Given start and end keys, return the approximate data size in the file @@ -84,7 +91,8 @@ class TableReader { // includes effects like compression of the underlying data and applicable // portions of metadata including filters and indexes. Nullptr for start or // end (or both) indicates absolute start or end of the table. - virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + virtual uint64_t ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, TableReaderCaller caller) = 0; struct Anchor { @@ -160,7 +168,8 @@ class TableReader { // Prefetch data corresponding to a give range of keys // Typically this functionality is required for table implementations that // persists the data on a non volatile storage medium like disk/SSD - virtual Status Prefetch(const Slice* begin = nullptr, + virtual Status Prefetch(const ReadOptions& /* read_options */, + const Slice* begin = nullptr, const Slice* end = nullptr) { (void)begin; (void)end; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index b13caf68d521..60c84d7bf099 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -144,7 +144,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, new RandomAccessFileReader(std::move(raf), file_name)); s = opts.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, env_options, - ikc), + ikc, 0 /* block_protection_bytes_per_key */), std::move(file_reader), file_size, &table_reader); if (!s.ok()) { fprintf(stderr, "Open Table Error: %s\n", s.ToString().c_str()); @@ -297,18 +297,12 @@ int main(int argc, char** argv) { options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression; if (FLAGS_table_factory == "cuckoo_hash") { -#ifndef ROCKSDB_LITE options.allow_mmap_reads = FLAGS_mmap_read; env_options.use_mmap_reads = FLAGS_mmap_read; ROCKSDB_NAMESPACE::CuckooTableOptions table_options; table_options.hash_table_ratio = 0.75; tf.reset(ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options)); -#else - fprintf(stderr, "Plain table is not supported in lite mode\n"); - exit(1); -#endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "plain_table") { -#ifndef ROCKSDB_LITE options.allow_mmap_reads = FLAGS_mmap_read; env_options.use_mmap_reads = FLAGS_mmap_read; @@ -320,10 +314,6 @@ int main(int argc, char** argv) { tf.reset(new ROCKSDB_NAMESPACE::PlainTableFactory(plain_table_options)); options.prefix_extractor.reset( ROCKSDB_NAMESPACE::NewFixedPrefixTransform(FLAGS_prefix_len)); -#else - fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); - exit(1); -#endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "block_based") { tf.reset(new ROCKSDB_NAMESPACE::BlockBasedTableFactory()); } else { diff --git a/table/table_test.cc b/table/table_test.cc index d5fff82da4fb..e6f95243e1ee 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -27,7 +27,7 @@ #include "db/memtable.h" #include "db/write_batch_internal.h" #include "memtable/stl_wrappers.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" @@ -55,7 +55,7 @@ #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include 
"table/block_based/filter_policy_internal.h" -#include "table/block_based/flush_block_policy.h" +#include "table/block_based/flush_block_policy_impl.h" #include "table/block_fetcher.h" #include "table/format.h" #include "table/get_context.h" @@ -425,14 +425,15 @@ class TableConstructor : public Constructor { } uint64_t ApproximateOffsetOf(const Slice& key) const { + const ReadOptions read_options; if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); return table_reader_->ApproximateOffsetOf( - skey, TableReaderCaller::kUncategorized); + read_options, skey, TableReaderCaller::kUncategorized); } return table_reader_->ApproximateOffsetOf( - key, TableReaderCaller::kUncategorized); + read_options, key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableOptions& ioptions, @@ -443,7 +444,9 @@ class TableConstructor : public Constructor { file_reader_.reset(new RandomAccessFileReader(std::move(source), "test")); return ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, soptions, - *last_internal_comparator_, /*skip_filters*/ false, + *last_internal_comparator_, + 0 /* block_protection_bytes_per_key */, + /*skip_filters*/ false, /*immortal*/ false, false, level_, &block_cache_tracer_, moptions.write_buffer_size, "", file_num_, kNullUniqueId64x2, largest_seqno_), @@ -620,11 +623,9 @@ class DBConstructor : public Constructor { enum TestType { BLOCK_BASED_TABLE_TEST, -#ifndef ROCKSDB_LITE PLAIN_TABLE_SEMI_FIXED_PREFIX, PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER, -#endif // !ROCKSDB_LITE BLOCK_TEST, MEMTABLE_TEST, DB_TEST @@ -654,11 +655,9 @@ std::ostream& operator<<(std::ostream& os, const TestArgs& args) { static std::vector GenerateArgList() { std::vector test_args; std::vector test_types = {BLOCK_BASED_TABLE_TEST, -#ifndef ROCKSDB_LITE PLAIN_TABLE_SEMI_FIXED_PREFIX, PLAIN_TABLE_FULL_STR_PREFIX, PLAIN_TABLE_TOTAL_ORDER, -#endif // !ROCKSDB_LITE BLOCK_TEST, MEMTABLE_TEST, DB_TEST}; @@ -697,7 +696,6 @@ static std::vector GenerateArgList() { for (auto test_type : test_types) { for (auto reverse_compare : reverse_compare_types) { -#ifndef ROCKSDB_LITE if (test_type == PLAIN_TABLE_SEMI_FIXED_PREFIX || test_type == PLAIN_TABLE_FULL_STR_PREFIX || test_type == PLAIN_TABLE_TOTAL_ORDER) { @@ -715,7 +713,6 @@ static std::vector GenerateArgList() { test_args.push_back(one_arg); continue; } -#endif // !ROCKSDB_LITE for (auto restart_interval : restart_intervals) { for (auto compression_type : compression_types) { @@ -804,8 +801,7 @@ class HarnessTest : public testing::Test { internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; -// Plain table is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE + case PLAIN_TABLE_SEMI_FIXED_PREFIX: support_prev_ = false; only_support_prefix_seek_ = true; @@ -845,7 +841,6 @@ class HarnessTest : public testing::Test { internal_comparator_.reset( new InternalKeyComparator(options_.comparator)); break; -#endif // !ROCKSDB_LITE case BLOCK_TEST: table_options_.block_size = 256; options_.table_factory.reset( @@ -1155,19 +1150,16 @@ class BlockBasedTableTest std::move(trace_writer)); ASSERT_NE(block_cache_trace_writer, nullptr); // Always return Status::OK(). 
- assert(c->block_cache_tracer_ - .StartTrace(trace_opt, std::move(block_cache_trace_writer)) - .ok()); + ASSERT_OK(c->block_cache_tracer_.StartTrace( + trace_opt, std::move(block_cache_trace_writer))); { - std::string user_key = "k01"; - InternalKey internal_key(user_key, 0, kTypeValue); + InternalKey internal_key(auto_add_key1, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c->Add(encoded_key, kDummyValue); } { - std::string user_key = "k02"; - InternalKey internal_key(user_key, 0, kTypeValue); + InternalKey internal_key(auto_add_key2, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c->Add(encoded_key, kDummyValue); } @@ -1188,6 +1180,7 @@ class BlockBasedTableTest EXPECT_OK(reader.ReadHeader(&header)); uint32_t index = 0; while (s.ok()) { + SCOPED_TRACE("expected_records[" + std::to_string(index) + "]"); BlockCacheTraceRecord access; s = reader.ReadAccess(&access); if (!s.ok()) { @@ -1200,22 +1193,33 @@ class BlockBasedTableTest EXPECT_EQ(access.caller, expected_records[index].caller); EXPECT_EQ(access.no_insert, expected_records[index].no_insert); EXPECT_EQ(access.is_cache_hit, expected_records[index].is_cache_hit); - // Get - if (access.caller == TableReaderCaller::kUserGet) { + EXPECT_EQ(access.get_id, expected_records[index].get_id); + // The well-populated cases + if (access.caller == TableReaderCaller::kUserGet || + (access.caller == TableReaderCaller::kUserMultiGet && + access.block_type == TraceType::kBlockTraceDataBlock)) { EXPECT_EQ(access.referenced_key, expected_records[index].referenced_key); - EXPECT_EQ(access.get_id, expected_records[index].get_id); EXPECT_EQ(access.get_from_user_specified_snapshot, expected_records[index].get_from_user_specified_snapshot); if (access.block_type == TraceType::kBlockTraceDataBlock) { EXPECT_GT(access.referenced_data_size, 0); EXPECT_GT(access.num_keys_in_block, 0); + if (access.caller == TableReaderCaller::kUserMultiGet) { + // Test num_keys_in_block estimate, assuming default restart + // interval of 16 and just one interval. + // Rounding depends on get_id. 
+ if (access.get_id & 1) { + EXPECT_EQ(access.num_keys_in_block, 9); + } else { + EXPECT_EQ(access.num_keys_in_block, 8); + } + } EXPECT_EQ(access.referenced_key_exist_in_block, expected_records[index].referenced_key_exist_in_block); } } else { EXPECT_EQ(access.referenced_key, ""); - EXPECT_EQ(access.get_id, 0); EXPECT_FALSE(access.get_from_user_specified_snapshot); EXPECT_EQ(access.referenced_data_size, 0); EXPECT_EQ(access.num_keys_in_block, 0); @@ -1231,6 +1235,8 @@ class BlockBasedTableTest protected: uint64_t IndexUncompressedHelper(bool indexCompress); + const std::string auto_add_key1 = "aak01"; + const std::string auto_add_key2 = "aak02"; private: uint32_t format_; @@ -1324,7 +1330,7 @@ class FileChecksumTestHelper { uint64_t offset = 0; Status s; s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), - nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + nullptr); if (!s.ok()) { return s; } @@ -1332,8 +1338,7 @@ class FileChecksumTestHelper { file_checksum_generator->Update(scratch.get(), result.size()); offset += static_cast(result.size()); s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(), - nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + nullptr); if (!s.ok()) { return s; } @@ -1714,7 +1719,8 @@ TEST_P(BlockBasedTableTest, BasicBlockBasedTableProperties) { MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); - ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_NOT_COMPRESSED), 0); + ASSERT_EQ( + options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSION_REJECTED), 0); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ(kvmap.size(), props.num_entries); @@ -1976,7 +1982,8 @@ void PrefetchRange(TableConstructor* c, Options* opt, end.reset(new Slice(key_end)); } } - s = table_reader->Prefetch(begin.get(), end.get()); + const ReadOptions read_options; + s = table_reader->Prefetch(read_options, begin.get(), end.get()); ASSERT_TRUE(s.code() == expected_status.code()); @@ -2261,6 +2268,12 @@ TEST_P(BlockBasedTableTest, BadChecksumType) { "Corruption: Corrupt or unsupported checksum type: 123 in test"); } +class BuiltinChecksumTest : public testing::Test, + public testing::WithParamInterface {}; + +INSTANTIATE_TEST_CASE_P(SupportedChecksums, BuiltinChecksumTest, + testing::ValuesIn(GetSupportedChecksums())); + namespace { std::string ChecksumAsString(const std::string& data, ChecksumType checksum_type) { @@ -2286,7 +2299,11 @@ std::string ChecksumAsString(std::string* data, char new_last_byte, // Make sure that checksum values don't change in later versions, even if // consistent within current version. -TEST_P(BlockBasedTableTest, ChecksumSchemas) { +TEST_P(BuiltinChecksumTest, ChecksumSchemas) { + // Trailing 'x' chars will be replaced by compression type. Specifically, + // the first byte of a block trailer is compression type, which is part of + // the checksum input. This test does not deal with storing or parsing + // checksums from the trailer (next 4 bytes of trailer). 
std::string b0 = "x"; std::string b1 = "This is a short block!x"; std::string b2; @@ -2294,7 +2311,6 @@ TEST_P(BlockBasedTableTest, ChecksumSchemas) { b2.append("This is a long block!"); } b2.append("x"); - // Trailing 'x' will be replaced by compression type std::string empty; @@ -2302,74 +2318,108 @@ TEST_P(BlockBasedTableTest, ChecksumSchemas) { char ct2 = kSnappyCompression; char ct3 = kZSTD; - // Note: first byte of trailer is compression type, last 4 are checksum - - for (ChecksumType t : GetSupportedChecksums()) { - switch (t) { - case kNoChecksum: - EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000"); - break; - case kCRC32c: - EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2"); - EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549"); - EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843"); - EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711"); - EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355"); - EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57"); - EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D"); - EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB"); - EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1"); - EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63"); - break; - case kxxHash: - EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02"); - EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF"); - EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238"); - EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00"); - EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0"); - EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652"); - EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50"); - EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA"); - EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99"); - EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338"); - break; - case kxxHash64: - EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851"); - EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB"); - EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B"); - EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8"); - EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF"); - EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE"); - EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4"); - EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE"); - EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423"); - EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1"); - break; - case kXXH3: - EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); - EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338"); - EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353"); - EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8"); - EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6"); - EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D"); - EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616"); - EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E"); - EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845"); - EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE"); - break; - default: - // Force this test to be updated on new ChecksumTypes - assert(false); - break; + 
ChecksumType t = GetParam(); + switch (t) { + case kNoChecksum: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00000000"); + break; + case kCRC32c: + EXPECT_EQ(ChecksumAsString(empty, t), "D8EA82A2"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "D28F2549"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "052B2843"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "46F8F711"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "583F0355"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "2F9B0A57"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "ECE7DA1D"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "943EF0AB"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "43A2EDB1"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "00E53D63"); + break; + case kxxHash: + EXPECT_EQ(ChecksumAsString(empty, t), "055DCC02"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "3EB065CF"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "31F79238"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "320D2E00"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "4A2E5FB0"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "0BD9F652"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "B4107E50"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "20F4D4BA"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "8F1A1F99"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "A191A338"); + break; + case kxxHash64: + EXPECT_EQ(ChecksumAsString(empty, t), "99E9D851"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "682705DB"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "30E7211B"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "B7BB58E8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B74655EF"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "B6C8BBBE"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "AED9E3B4"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "0D4999FE"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "F5932423"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "6B31BAB1"); + break; + case kXXH3: + EXPECT_EQ(ChecksumAsString(empty, t), "00000000"); + EXPECT_EQ(ChecksumAsString(&b0, ct1, t), "C294D338"); + EXPECT_EQ(ChecksumAsString(&b0, ct2, t), "1B174353"); + EXPECT_EQ(ChecksumAsString(&b0, ct3, t), "2D0E20C8"); + EXPECT_EQ(ChecksumAsString(&b1, ct1, t), "B37FB5E6"); + EXPECT_EQ(ChecksumAsString(&b1, ct2, t), "6AFC258D"); + EXPECT_EQ(ChecksumAsString(&b1, ct3, t), "5CE54616"); + EXPECT_EQ(ChecksumAsString(&b2, ct1, t), "FA2D482E"); + EXPECT_EQ(ChecksumAsString(&b2, ct2, t), "23AED845"); + EXPECT_EQ(ChecksumAsString(&b2, ct3, t), "15B7BBDE"); + break; + default: + // Force this test to be updated on new ChecksumTypes + assert(false); + break; + } +} + +TEST_P(BuiltinChecksumTest, ChecksumZeroInputs) { + // Verify that no reasonably sized "all zeros" inputs produce "all zeros" + // output. Otherwise, "wiped" data could appear to be well-formed. + // Assuming essentially random assignment of output values, the likelihood + // of encountering checksum == 0 for an input not specifically crafted is + // 1 in 4 billion. 
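The body of `ChecksumZeroInputs` continues in the next hunk; as a side note, the helper it drives can also be invoked directly:

```cpp
// ComputeBuiltinChecksum() (the internal helper used by the new test) hashes
// a raw buffer with the given ChecksumType. The test asserts the result is
// non-zero for all-zero inputs of any reasonable length, so wiped data does
// not masquerade as a validly checksummed block (with the documented
// kXXH3 / len == 0 exception).
const std::string zeros(4096, '\0');
uint32_t v =
    rocksdb::ComputeBuiltinChecksum(rocksdb::kXXH3, zeros.data(), zeros.size());
// v is expected to be non-zero per the property the new test verifies.
```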
+ if (GetParam() == kNoChecksum) { + return; + } + // "Thorough" case is too slow for continouous testing + bool thorough = getenv("ROCKSDB_THOROUGH_CHECKSUM_TEST") != nullptr; + // Verified through 10M + size_t kMaxZerosLen = thorough ? 10000000 : 20000; + std::string zeros(kMaxZerosLen, '\0'); + + for (size_t len = 0; len < kMaxZerosLen; ++len) { + if (thorough && (len & 0xffffU) == 0) { + fprintf(stderr, "t=%u len=%u\n", (unsigned)GetParam(), (unsigned)len); + } + uint32_t v = ComputeBuiltinChecksum(GetParam(), zeros.data(), len); + if (v == 0U) { + // One exception case: + if (GetParam() == kXXH3 && len == 0) { + // This is not a big deal because assuming the block length is known + // from the block handle, which comes from a checksum-verified block, + // there is nothing to corrupt in a zero-length block. And when there + // is a block trailer with compression byte (as in block-based table), + // zero length checksummed data never arises. + continue; + } + // Only compute this on failure + SCOPED_TRACE("len=" + std::to_string(len)); + ASSERT_NE(v, 0U); } } } @@ -3033,15 +3083,14 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); - std::string user_key = "k01"; - InternalKey internal_key(user_key, 0, kTypeValue); + InternalKey internal_key(auto_add_key1, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); for (uint32_t i = 1; i <= 2; i++) { PinnableSlice value; - GetContext get_context(options.comparator, nullptr, nullptr, nullptr, - GetContext::kNotFound, user_key, &value, nullptr, - nullptr, nullptr, true, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, /*tracing_get_id=*/i); + GetContext get_context( + options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, + auto_add_key1, &value, nullptr, nullptr, nullptr, true, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, /*tracing_get_id=*/i); get_perf_context()->Reset(); ASSERT_OK(c.GetTableReader()->Get(ReadOptions(), encoded_key, &get_context, moptions.prefix_extractor.get())); @@ -3091,6 +3140,189 @@ TEST_P(BlockBasedTableTest, TracingGetTest) { c.ResetTableReader(); } +struct HitMissCountingCache : public CacheWrapper { + using CacheWrapper::CacheWrapper; + const char* Name() const override { return "HitMissCountingCache"; } + + uint64_t hit_count_ = 0; + uint64_t miss_count_ = 0; + + void Reset() { + hit_count_ = 0; + miss_count_ = 0; + } + + Handle* Lookup(const Slice& key, const CacheItemHelper* helper, + CreateContext* create_context, + Priority priority = Priority::LOW, + Statistics* stats = nullptr) override { + // ASSUMES no blocking async lookups + Handle* h = target_->Lookup(key, helper, create_context, priority, stats); + if (h) { + hit_count_++; + } else { + miss_count_++; + } + return h; + } + + void StartAsyncLookup(AsyncLookupHandle& async_handle) override { + target_->StartAsyncLookup(async_handle); + // If not pending, caller might not call WaitAll, so have to account here. + if (!async_handle.IsPending()) { + if (async_handle.Result()) { + hit_count_++; + } else { + miss_count_++; + } + } + } + + void WaitAll(AsyncLookupHandle* async_handles, size_t count) override { + // If !pending, then we already accounted for it in StartAsyncLookup. + // Assume the pending status does not change asynchronously (since + // StartAsyncLookup) and remember which still need accounting. 
+ std::vector needs_accounting; + for (size_t i = 0; i < count; ++i) { + if (async_handles[i].IsPending()) { + needs_accounting.push_back(async_handles + i); + } + } + target_->WaitAll(async_handles, count); + for (auto ah : needs_accounting) { + if (ah->Result()) { + hit_count_++; + } else { + miss_count_++; + } + } + } + + void VerifyExpectedHitMissCounts( + const std::vector& expected_records) { + uint64_t expected_hits = 0; + uint64_t expected_misses = 0; + for (const auto& r : expected_records) { + if (r.is_cache_hit) { + expected_hits++; + } else { + expected_misses++; + } + } + EXPECT_EQ(expected_hits, hit_count_); + EXPECT_EQ(expected_misses, miss_count_); + Reset(); + } +}; + +TEST_P(BlockBasedTableTest, TracingMultiGetTest) { + TableConstructor c(BytewiseComparator()); + Options options; + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + options.create_if_missing = true; + auto cache = + std::make_shared(NewLRUCache(1024 * 1024, 0)); + table_options.block_cache = cache; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + // Put auto_add_key1 and auto_add_key2 in the same data block + table_options.block_size = kDummyValue.size() * 2 + 100; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + SetupTracingTest(&c); + std::vector keys; + stl_wrappers::KVMap kvmap; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + std::vector expected_records; + + for (bool first_pass : {true, false}) { + uint64_t get_id_offset = first_pass ? 2 : 5; + ReadOptions ro; + std::array ukeys{{auto_add_key1, auto_add_key2}}; + std::array values; + std::vector get_contexts; + get_contexts.emplace_back( + options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, + ukeys[0], &values[0], nullptr, nullptr, nullptr, true, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, get_id_offset); + get_contexts.emplace_back( + options.comparator, nullptr, nullptr, nullptr, GetContext::kNotFound, + ukeys[1], &values[1], nullptr, nullptr, nullptr, true, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, get_id_offset + 1); + std::array encoded_keys; + encoded_keys[0] = InternalKey(ukeys[0], 0, kTypeValue).Encode().ToString(); + encoded_keys[1] = InternalKey(ukeys[1], 0, kTypeValue).Encode().ToString(); + std::array statuses; + autovector key_context; + key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[0], + &values[0], + /*PinnableWideColumns omitted*/ nullptr, + /*timestamp omitted*/ nullptr, &statuses[0]); + key_context[0].ukey_without_ts = ukeys[0]; + key_context[0].ikey = encoded_keys[0]; + key_context[0].get_context = &get_contexts[0]; + key_context.emplace_back(/*ColumnFamilyHandle omitted*/ nullptr, ukeys[1], + &values[1], + /*PinnableWideColumns omitted*/ nullptr, + /*timestamp omitted*/ nullptr, &statuses[1]); + key_context[1].ukey_without_ts = ukeys[1]; + key_context[1].ikey = encoded_keys[1]; + key_context[1].get_context = &get_contexts[1]; + autovector sorted_keys; + sorted_keys.push_back(&key_context[0]); + sorted_keys.push_back(&key_context[1]); + MultiGetContext m_context( + &sorted_keys, 0, sorted_keys.size(), /*SequenceNumber*/ 42, ro, + options.env->GetFileSystem().get(), options.statistics.get()); + MultiGetRange range = m_context.GetMultiGetRange(); + + get_perf_context()->Reset(); + 
c.GetTableReader()->MultiGet(ro, &range, /*prefix_extractor*/ nullptr); + + // Verify read op result + for (uint32_t i = 0; i <= 1; i++) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(get_contexts[i].State(), GetContext::kFound); + ASSERT_EQ(values[i].ToString(), kDummyValue); + } + + // Verify traces. + BlockCacheTraceRecord record; + if (first_pass) { + // The first two records should be prefetching index and filter blocks. + record.get_id = 0; + record.block_type = TraceType::kBlockTraceIndexBlock; + record.caller = TableReaderCaller::kPrefetch; + record.is_cache_hit = false; + record.no_insert = false; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceFilterBlock; + expected_records.push_back(record); + } + // Then we should have three records for one index, one filter, and one data + // block access. (The two keys share a data block.) + record.get_id = get_id_offset; + record.block_type = TraceType::kBlockTraceFilterBlock; + record.caller = TableReaderCaller::kUserMultiGet; + record.get_from_user_specified_snapshot = false; + record.referenced_key = encoded_keys[0]; + record.referenced_key_exist_in_block = true; + record.is_cache_hit = true; + expected_records.push_back(record); + record.block_type = TraceType::kBlockTraceIndexBlock; + expected_records.push_back(record); + record.is_cache_hit = !first_pass; + record.block_type = TraceType::kBlockTraceDataBlock; + expected_records.push_back(record); + } + VerifyBlockAccessTrace(&c, expected_records); + cache->VerifyExpectedHitMissCounts(expected_records); + c.ResetTableReader(); +} + TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { TableConstructor c(BytewiseComparator()); Options options; @@ -3107,12 +3339,12 @@ TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); + const ReadOptions read_options; for (uint32_t i = 1; i <= 2; i++) { - std::string user_key = "k01"; - InternalKey internal_key(user_key, 0, kTypeValue); + InternalKey internal_key(auto_add_key1, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.GetTableReader()->ApproximateOffsetOf( - encoded_key, TableReaderCaller::kUserApproximateSize); + read_options, encoded_key, TableReaderCaller::kUserApproximateSize); } // Verify traces. 
std::vector expected_records; @@ -3811,8 +4043,6 @@ TEST_P(BlockBasedTableTest, Crc32cFileChecksum) { ASSERT_STREQ(checksum.c_str(), "\345\245\277\110"); } -// Plain table is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE TEST_F(PlainTableTest, BasicPlainTableProperties) { PlainTableOptions plain_table_options; plain_table_options.user_key_len = 8; @@ -3854,8 +4084,10 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; + const ReadOptions read_options; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), - kPlainTableMagicNumber, ioptions, &props); + kPlainTableMagicNumber, ioptions, read_options, + &props); ASSERT_OK(s); ASSERT_EQ(0ul, props->index_size); @@ -3942,7 +4174,6 @@ TEST_F(PlainTableTest, Crc32cFileChecksum) { EXPECT_STREQ(f.GetFileChecksum().c_str(), checksum.c_str()); } -#endif // !ROCKSDB_LITE TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); @@ -3983,6 +4214,7 @@ TEST_F(GeneralTableTest, ApproximateOffsetOfPlain) { } static void DoCompressionTest(CompressionType comp) { + SCOPED_TRACE("CompressionType = " + CompressionTypeToString(comp)); Random rnd(301); TableConstructor c(BytewiseComparator(), true /* convert_to_internal_key_ */); std::string tmp; @@ -4004,8 +4236,8 @@ static void DoCompressionTest(CompressionType comp) { ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k02"), 0, 0)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3525)); - ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3525)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k03"), 2000, 3550)); + ASSERT_TRUE(Between(c.ApproximateOffsetOf("k04"), 2000, 3550)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("xyz"), 4000, 7075)); c.ResetTableReader(); } @@ -4115,7 +4347,6 @@ TEST_P(ParameterizedHarnessTest, RandomizedHarnessTest) { } } -#ifndef ROCKSDB_LITE TEST_F(DBHarnessTest, RandomizedLongDB) { Random rnd(test::RandomSeed()); int num_entries = 100000; @@ -4136,7 +4367,6 @@ TEST_F(DBHarnessTest, RandomizedLongDB) { } ASSERT_GT(files, 0); } -#endif // ROCKSDB_LITE #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) class MemTableTest : public testing::Test { @@ -4241,11 +4471,12 @@ TEST(TableTest, FooterTests) { BlockHandle index(data_size + 5, index_size); BlockHandle meta_index(data_size + index_size + 2 * 5, metaindex_size); uint64_t footer_offset = data_size + metaindex_size + index_size + 3 * 5; + uint32_t base_context_checksum = 123456789; { // legacy block based FooterBuilder footer; - footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0, - footer_offset, kCRC32c, meta_index, index); + ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, /* format_version */ 0, + footer_offset, kCRC32c, meta_index, index)); Footer decoded_footer; ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); @@ -4255,6 +4486,7 @@ TEST(TableTest, FooterTests) { ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); ASSERT_EQ(decoded_footer.format_version(), 0U); + ASSERT_EQ(decoded_footer.base_context_checksum(), 0U); ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); // Ensure serialized with legacy magic 
ASSERT_EQ( @@ -4264,9 +4496,11 @@ TEST(TableTest, FooterTests) { // block based, various checksums, various versions for (auto t : GetSupportedChecksums()) { for (uint32_t fv = 1; IsSupportedFormatVersion(fv); ++fv) { + uint32_t maybe_bcc = + FormatVersionUsesContextChecksum(fv) ? base_context_checksum : 0U; FooterBuilder footer; - footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t, - meta_index, index); + ASSERT_OK(footer.Build(kBlockBasedTableMagicNumber, fv, footer_offset, t, + meta_index, index, maybe_bcc)); Footer decoded_footer; ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), @@ -4275,19 +4509,44 @@ TEST(TableTest, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); - ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); - ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + if (FormatVersionUsesIndexHandleInFooter(fv)) { + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + } ASSERT_EQ(decoded_footer.format_version(), fv); ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 5U); + + if (FormatVersionUsesContextChecksum(fv)) { + ASSERT_EQ(decoded_footer.base_context_checksum(), + base_context_checksum); + + // Bad offset should fail footer checksum + decoded_footer = Footer(); + ASSERT_NOK( + decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset - 1)); + } else { + ASSERT_EQ(decoded_footer.base_context_checksum(), 0U); + } + + // Too big metaindex size should also fail encoding only in new footer + uint64_t big_metaindex_size = 0x100000007U; + uint64_t big_footer_offset = + data_size + big_metaindex_size + index_size + 3 * 5; + BlockHandle big_metaindex = + BlockHandle(data_size + index_size + 2 * 5, big_metaindex_size); + ASSERT_NE(footer + .Build(kBlockBasedTableMagicNumber, fv, big_footer_offset, + t, big_metaindex, index, maybe_bcc) + .ok(), + FormatVersionUsesContextChecksum(fv)); } } -// Plain table is not supported in ROCKSDB_LITE -#ifndef ROCKSDB_LITE + { // legacy plain table FooterBuilder footer; - footer.Build(kPlainTableMagicNumber, /* format_version */ 0, footer_offset, - kNoChecksum, meta_index); + ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 0, + footer_offset, kNoChecksum, meta_index)); Footer decoded_footer; ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); @@ -4306,8 +4565,8 @@ TEST(TableTest, FooterTests) { { // xxhash plain table (not currently used) FooterBuilder footer; - footer.Build(kPlainTableMagicNumber, /* format_version */ 1, footer_offset, - kxxHash, meta_index); + ASSERT_OK(footer.Build(kPlainTableMagicNumber, /* format_version */ 1, + footer_offset, kxxHash, meta_index)); Footer decoded_footer; ASSERT_OK(decoded_footer.DecodeFrom(footer.GetSlice(), footer_offset)); ASSERT_EQ(decoded_footer.table_magic_number(), kPlainTableMagicNumber); @@ -4319,7 +4578,6 @@ TEST(TableTest, FooterTests) { ASSERT_EQ(decoded_footer.format_version(), 1U); ASSERT_EQ(decoded_footer.GetBlockTrailerSize(), 0U); } -#endif // !ROCKSDB_LITE } class IndexBlockRestartIntervalTest @@ -4535,9 +4793,10 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new RandomAccessFileReader(std::move(source), "")); std::unique_ptr props; + const ReadOptions read_options; 
ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + read_options, &props)); UserCollectedProperties user_props = props->user_collected_properties; version = DecodeFixed32( @@ -4567,7 +4826,7 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { options.table_factory->NewTableReader( TableReaderOptions(ioptions, moptions.prefix_extractor, EnvOptions(), - ikc), + ikc, 0 /* block_protection_bytes_per_key */), std::move(file_reader), ss_rw.contents().size(), &table_reader); return table_reader->NewIterator( @@ -4712,9 +4971,10 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + read_options, &props)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); @@ -4735,7 +4995,8 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { ASSERT_OK(ioptions.table_factory->NewTableReader( TableReaderOptions(ioptions2, moptions2.prefix_extractor, EnvOptions(), - GetPlainInternalComparator(options2.comparator)), + GetPlainInternalComparator(options2.comparator), + 0 /* block_protection_bytes_per_key */), std::move(file_reader), sink->contents().size(), &table_reader)); ReadOptions read_options; @@ -4851,6 +5112,59 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) { } } +TEST_P(BlockBasedTableTest, CompressionRatioThreshold) { + for (CompressionType type : GetSupportedCompressions()) { + if (type == kNoCompression) { + continue; + } + if (type == kBZip2Compression) { + // Weird behavior in this test + continue; + } + SCOPED_TRACE("Compression type: " + std::to_string(type)); + + Options options; + options.compression = type; + + BlockBasedTableOptions table_options = GetBlockBasedTableOptions(); + int len = 10000; + Random rnd(301); + std::vector keys; + stl_wrappers::KVMap kvmap; + + // Test the max_compressed_bytes_per_kb option + for (int threshold : {0, 1, 100, 400, 600, 900, 1024}) { + SCOPED_TRACE("threshold=" + std::to_string(threshold)); + options.compression_opts.max_compressed_bytes_per_kb = threshold; + ImmutableOptions ioptions(options); + MutableCFOptions moptions(options); + + for (double compressible_to : {0.25, 0.75}) { + SCOPED_TRACE("compressible_to=" + std::to_string(compressible_to)); + TableConstructor c(BytewiseComparator(), + true /* convert_to_internal_key_ */); + std::string buf; + c.Add("x", test::CompressibleString(&rnd, compressible_to, len, &buf)); + + // write an SST file + c.Finish(options, ioptions, moptions, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + + size_t table_file_size = c.TEST_GetSink()->contents().size(); + size_t approx_sst_overhead = 1000; + if (compressible_to < threshold / 1024.0) { + // Should be compressed (substantial variance depending on algorithm) + EXPECT_NEAR2(len * compressible_to + approx_sst_overhead, + table_file_size, len / 8); + } else { + // Should not be compressed + EXPECT_NEAR2(len + approx_sst_overhead, table_file_size, len / 10); + } + } + } + } +} + TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { // The properties meta-block should come at the end since we always need to // read it when opening a file, unlike index/filter/other meta-blocks, which @@ -4926,9 +5240,13 @@ 
TEST_P(BlockBasedTableTest, PropertiesMetaBlockLast) { } } ASSERT_EQ(kPropertiesBlockName, key_at_max_offset); - // index handle is stored in footer rather than metaindex block, so need - // separate logic to verify it comes before properties block. - ASSERT_GT(max_offset, footer.index_handle().offset()); + if (FormatVersionUsesIndexHandleInFooter(footer.format_version())) { + // If index handle is stored in footer rather than metaindex block, + // need separate logic to verify it comes before properties block. + ASSERT_GT(max_offset, footer.index_handle().offset()); + } else { + ASSERT_TRUE(footer.index_handle().IsNull()); + } c.ResetTableReader(); } @@ -5077,16 +5395,13 @@ TEST_F(BBTTailPrefetchTest, FilePrefetchBufferMinOffset) { IOOptions opts; buffer.TryReadFromCache(opts, nullptr /* reader */, 500 /* offset */, 10 /* n */, nullptr /* result */, - nullptr /* status */, - Env::IO_TOTAL /* rate_limiter_priority */); + nullptr /* status */); buffer.TryReadFromCache(opts, nullptr /* reader */, 480 /* offset */, 10 /* n */, nullptr /* result */, - nullptr /* status */, - Env::IO_TOTAL /* rate_limiter_priority */); + nullptr /* status */); buffer.TryReadFromCache(opts, nullptr /* reader */, 490 /* offset */, 10 /* n */, nullptr /* result */, - nullptr /* status */, - Env::IO_TOTAL /* rate_limiter_priority */); + nullptr /* status */); ASSERT_EQ(480, buffer.min_offset_read()); } diff --git a/test_util/mock_time_env.h b/test_util/mock_time_env.h index 7834368e03ae..19bb9e76de98 100644 --- a/test_util/mock_time_env.h +++ b/test_util/mock_time_env.h @@ -8,7 +8,11 @@ #include #include +#include "port/port.h" #include "rocksdb/system_clock.h" +#include "test_util/mock_time_env.h" +#include "test_util/sync_point.h" +#include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -65,6 +69,33 @@ class MockSystemClock : public SystemClockWrapper { current_time_us_.fetch_add(micros); } + virtual bool TimedWait(port::CondVar* cv, + std::chrono::microseconds deadline) override { + uint64_t now_micros = NowMicros(); + uint64_t deadline_micros = static_cast(deadline.count()); + uint64_t delay_micros; + if (deadline_micros > now_micros) { + delay_micros = deadline_micros - now_micros; + } else { + delay_micros = 0; + } + // To prevent slowdown, this `TimedWait()` is completely synthetic. First, + // it yields to coerce other threads to run while the lock is released. + // Second, it randomly selects between mocking an immediate wakeup and a + // timeout. + cv->GetMutex()->Unlock(); + std::this_thread::yield(); + bool mock_timeout = Random::GetTLSInstance()->OneIn(2); + if (mock_timeout) { + TEST_SYNC_POINT("MockSystemClock::TimedWait:UnlockedPreSleep"); + current_time_us_.fetch_add(delay_micros); + TEST_SYNC_POINT("MockSystemClock::TimedWait:UnlockedPostSleep1"); + TEST_SYNC_POINT("MockSystemClock::TimedWait:UnlockedPostSleep2"); + } + cv->GetMutex()->Lock(); + return mock_timeout; + } + // TODO: this is a workaround for the different behavior on different platform // for timedwait timeout. Ideally timedwait API should be moved to env. // details: PR #7101. diff --git a/test_util/secondary_cache_test_util.cc b/test_util/secondary_cache_test_util.cc new file mode 100644 index 000000000000..6f0bd3849487 --- /dev/null +++ b/test_util/secondary_cache_test_util.cc @@ -0,0 +1,97 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
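Looking back at the `CompressionRatioThreshold` test added a few hunks above: the knob it exercises, `CompressionOptions::max_compressed_bytes_per_kb`, is user-facing. A block is stored compressed only if its compressed size stays under the threshold per 1024 input bytes, so 1024 means "keep any savings at all" and 0 effectively rejects all compression. A hedged configuration sketch:

```cpp
rocksdb::Options options;
options.compression = rocksdb::kZSTD;
// Keep a compressed block only if it shrinks to under ~58.6% of its
// original size (600 / 1024); larger outputs are stored uncompressed and
// counted under NUMBER_BLOCK_COMPRESSION_REJECTED.
options.compression_opts.max_compressed_bytes_per_kb = 600;
```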
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "test_util/secondary_cache_test_util.h" + +#include + +namespace ROCKSDB_NAMESPACE { + +namespace secondary_cache_test_util { + +namespace { +using TestItem = WithCacheType::TestItem; + +size_t SizeCallback(Cache::ObjectPtr obj) { + return static_cast(obj)->Size(); +} + +Status SaveToCallback(Cache::ObjectPtr from_obj, size_t from_offset, + size_t length, char* out) { + auto item = static_cast(from_obj); + const char* buf = item->Buf(); + EXPECT_EQ(length, item->Size()); + EXPECT_EQ(from_offset, 0); + memcpy(out, buf, length); + return Status::OK(); +} + +void DeletionCallback(Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) { + delete static_cast(obj); +} + +Status SaveToCallbackFail(Cache::ObjectPtr /*obj*/, size_t /*offset*/, + size_t /*size*/, char* /*out*/) { + return Status::NotSupported(); +} + +Status CreateCallback(const Slice& data, CompressionType /*type*/, + CacheTier /*source*/, Cache::CreateContext* context, + MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj, + size_t* out_charge) { + auto t = static_cast(context); + if (t->fail_create_) { + return Status::NotSupported(); + } + *out_obj = new TestItem(data.data(), data.size()); + *out_charge = data.size(); + return Status::OK(); +} + +// If helpers without_secondary are provided, returns helpers with secondary +// support. If not provided, returns helpers without secondary support. +auto GenerateHelpersByRole( + const std::array* + without_secondary, + bool fail) { + std::array a; + for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) { + if (without_secondary) { + a[i] = + Cache::CacheItemHelper{static_cast(i), + &DeletionCallback, + &SizeCallback, + fail ? &SaveToCallbackFail : &SaveToCallback, + &CreateCallback, + &(*without_secondary)[i]}; + } else { + a[i] = Cache::CacheItemHelper{static_cast(i), + &DeletionCallback}; + } + } + return a; +} +} // namespace + +const Cache::CacheItemHelper* WithCacheType::GetHelper( + CacheEntryRole r, bool secondary_compatible, bool fail) { + static const std::array + without_secondary = GenerateHelpersByRole(nullptr, false); + static const std::array + with_secondary = GenerateHelpersByRole(&without_secondary, false); + static const std::array + with_secondary_fail = GenerateHelpersByRole(&without_secondary, true); + return &(fail ? with_secondary_fail + : secondary_compatible ? with_secondary + : without_secondary)[static_cast(r)]; +} + +const Cache::CacheItemHelper* WithCacheType::GetHelperFail(CacheEntryRole r) { + return GetHelper(r, true, true); +} + +} // namespace secondary_cache_test_util + +} // namespace ROCKSDB_NAMESPACE diff --git a/test_util/secondary_cache_test_util.h b/test_util/secondary_cache_test_util.h new file mode 100644 index 000000000000..5e2262a9c128 --- /dev/null +++ b/test_util/secondary_cache_test_util.h @@ -0,0 +1,131 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
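Editor's note: the callbacks in secondary_cache_test_util.cc implement the secondary-cache save/create contract: SizeCallback reports the serialized size, SaveToCallback copies the object's bytes into a caller-provided buffer, and CreateCallback rebuilds an object from the serialized data. A minimal standalone sketch of that round trip with simplified stand-in types (not the real Cache::ObjectPtr signatures):

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

struct Item {
  std::string payload;
};

size_t SizeOf(const Item& item) { return item.payload.size(); }

// Serialize `item` into `out`; as asserted above, the cache passes
// from_offset == 0 and length == SizeOf(item) for whole-object saves.
void SaveTo(const Item& item, size_t from_offset, size_t length, char* out) {
  assert(from_offset == 0);
  assert(length == SizeOf(item));
  std::memcpy(out, item.payload.data(), length);
}

// Rebuild an item from the serialized bytes, as CreateCallback does.
Item CreateFrom(const char* data, size_t size) {
  return Item{std::string(data, size)};
}

int main() {
  Item original{"hello secondary cache"};
  std::vector<char> buf(SizeOf(original));
  SaveTo(original, 0, buf.size(), buf.data());
  Item copy = CreateFrom(buf.data(), buf.size());
  assert(copy.payload == original.payload);
  return 0;
}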
+ +#pragma once + +#include + +#include + +#include "rocksdb/advanced_cache.h" + +namespace ROCKSDB_NAMESPACE { +namespace secondary_cache_test_util { + +struct TestCreateContext : public Cache::CreateContext { + void SetFailCreate(bool fail) { fail_create_ = fail; } + + bool fail_create_ = false; +}; + +class WithCacheType : public TestCreateContext { + public: + WithCacheType() {} + virtual ~WithCacheType() {} + + class TestItem { + public: + TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) { + memcpy(buf_.get(), buf, size); + } + ~TestItem() = default; + + char* Buf() { return buf_.get(); } + [[nodiscard]] size_t Size() const { return size_; } + std::string ToString() { return std::string(Buf(), Size()); } + + private: + std::unique_ptr buf_; + size_t size_; + }; + + static constexpr auto kLRU = "lru"; + static constexpr auto kFixedHyperClock = "fixed_hyper_clock"; + static constexpr auto kAutoHyperClock = "auto_hyper_clock"; + + // For options other than capacity + size_t estimated_value_size_ = 1; + + virtual const std::string& Type() const = 0; + + static bool IsHyperClock(const std::string& type) { + return type == kFixedHyperClock || type == kAutoHyperClock; + } + + bool IsHyperClock() const { return IsHyperClock(Type()); } + + std::shared_ptr NewCache( + size_t capacity, + std::function modify_opts_fn = {}) { + const auto& type = Type(); + if (type == kLRU) { + LRUCacheOptions lru_opts; + lru_opts.capacity = capacity; + lru_opts.hash_seed = 0; // deterministic tests + if (modify_opts_fn) { + modify_opts_fn(lru_opts); + } + return lru_opts.MakeSharedCache(); + } + if (IsHyperClock(type)) { + HyperClockCacheOptions hc_opts{ + capacity, type == kFixedHyperClock ? estimated_value_size_ : 0}; + hc_opts.min_avg_entry_charge = + std::max(size_t{1}, estimated_value_size_ / 2); + hc_opts.hash_seed = 0; // deterministic tests + if (modify_opts_fn) { + modify_opts_fn(hc_opts); + } + return hc_opts.MakeSharedCache(); + } + assert(false); + return nullptr; + } + + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) { + return NewCache(capacity, [=](ShardedCacheOptions& opts) { + opts.num_shard_bits = num_shard_bits; + opts.strict_capacity_limit = strict_capacity_limit; + opts.metadata_charge_policy = charge_policy; + }); + } + + std::shared_ptr NewCache( + size_t capacity, int num_shard_bits, bool strict_capacity_limit, + std::shared_ptr secondary_cache) { + return NewCache(capacity, [=](ShardedCacheOptions& opts) { + opts.num_shard_bits = num_shard_bits; + opts.strict_capacity_limit = strict_capacity_limit; + opts.metadata_charge_policy = kDontChargeCacheMetadata; + opts.secondary_cache = secondary_cache; + }); + } + + static const Cache::CacheItemHelper* GetHelper( + CacheEntryRole r = CacheEntryRole::kDataBlock, + bool secondary_compatible = true, bool fail = false); + + static const Cache::CacheItemHelper* GetHelperFail( + CacheEntryRole r = CacheEntryRole::kDataBlock); +}; + +class WithCacheTypeParam : public WithCacheType, + public testing::WithParamInterface { + const std::string& Type() const override { return GetParam(); } +}; + +constexpr auto kLRU = WithCacheType::kLRU; +constexpr auto kFixedHyperClock = WithCacheType::kFixedHyperClock; +constexpr auto kAutoHyperClock = WithCacheType::kAutoHyperClock; + +inline auto GetTestingCacheTypes() { + return testing::Values(std::string(kLRU), std::string(kFixedHyperClock), + std::string(kAutoHyperClock)); 
+} + +} // namespace secondary_cache_test_util +} // namespace ROCKSDB_NAMESPACE diff --git a/test_util/sync_point.h b/test_util/sync_point.h index 65f1239ec44e..6022073e573a 100644 --- a/test_util/sync_point.h +++ b/test_util/sync_point.h @@ -85,7 +85,9 @@ class SyncPoint { }; // call once at the beginning of a test to setup the dependency between - // sync points + // sync points. Specifically, execution will not be allowed to proceed past + // each successor until execution has reached the corresponding predecessor, + // in any thread. void LoadDependency(const std::vector& dependencies); // call once at the beginning of a test to setup the dependency between diff --git a/test_util/testharness.h b/test_util/testharness.h index 69018629a57d..d8b6c9679c18 100644 --- a/test_util/testharness.h +++ b/test_util/testharness.h @@ -51,6 +51,11 @@ GTEST_SUCCESS_("BYPASSED: " m); \ } while (false) /* user ; */ +// Avoid "loss of precision" warnings when passing in 64-bit integers +#define EXPECT_NEAR2(val1, val2, abs_error) \ + EXPECT_NEAR(static_cast(val1), static_cast(val2), \ + static_cast(abs_error)) + #include #include "port/stack_trace.h" diff --git a/test_util/testutil.cc b/test_util/testutil.cc index 5e1b909f9731..1e771f4fd166 100644 --- a/test_util/testutil.cc +++ b/test_util/testutil.cc @@ -39,7 +39,10 @@ namespace test { const uint32_t kDefaultFormatVersion = BlockBasedTableOptions().format_version; const std::set kFooterFormatVersionsToTest{ + // Non-legacy, before big footer changes 5U, + // After big footer changes + 6U, // In case any interesting future changes kDefaultFormatVersion, kLatestFormatVersion, @@ -72,11 +75,27 @@ std::string RandomKey(Random* rnd, int len, RandomKeyType type) { return result; } +const std::vector& GetUDTTestModes() { + static std::vector udt_test_modes = { + UserDefinedTimestampTestMode::kStripUserDefinedTimestamp, + UserDefinedTimestampTestMode::kNormal, + UserDefinedTimestampTestMode::kNone}; + return udt_test_modes; +} + +bool IsUDTEnabled(const UserDefinedTimestampTestMode& test_mode) { + return test_mode != UserDefinedTimestampTestMode::kNone; +} + +bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode) { + return test_mode != UserDefinedTimestampTestMode::kStripUserDefinedTimestamp; +} + extern Slice CompressibleString(Random* rnd, double compressed_fraction, int len, std::string* dst) { int raw = static_cast(len * compressed_fraction); if (raw < 1) raw = 1; - std::string raw_data = rnd->RandomString(raw); + std::string raw_data = rnd->RandomBinaryString(raw); // Duplicate the random data until we have filled "len" bytes dst->clear(); @@ -134,6 +153,16 @@ const Comparator* BytewiseComparatorWithU64TsWrapper() { return user_comparator; } +const Comparator* ReverseBytewiseComparatorWithU64TsWrapper() { + ConfigOptions config_options; + const Comparator* user_comparator = nullptr; + Status s = Comparator::CreateFromString( + config_options, "rocksdb.ReverseBytewiseComparator.u64ts", + &user_comparator); + s.PermitUncheckedError(); + return user_comparator; +} + void CorruptKeyType(InternalKey* ikey) { std::string keystr = ikey->Encode().ToString(); keystr[keystr.size() - 8] = kTypeLogData; @@ -242,7 +271,6 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { } TableFactory* RandomTableFactory(Random* rnd, int pre_defined) { -#ifndef ROCKSDB_LITE int random_num = pre_defined >= 0 ? 
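Editor's note: the new secondary_cache_test_util.h header centralizes cache construction for parameterized cache tests. A hedged sketch of how a test might be wired to run against all three cache types; the fixture name and test body are illustrative, while WithCacheTypeParam, NewCache() and GetTestingCacheTypes() come from the header above:

#include <memory>

#include "rocksdb/cache.h"
#include "test_util/secondary_cache_test_util.h"
#include "test_util/testharness.h"

namespace ROCKSDB_NAMESPACE {
namespace {

class ExampleCacheTest
    : public testing::Test,
      public secondary_cache_test_util::WithCacheTypeParam {};

TEST_P(ExampleCacheTest, BuildsEachCacheType) {
  // Runs once per cache type: "lru", "fixed_hyper_clock", "auto_hyper_clock".
  std::shared_ptr<Cache> cache = NewCache(1 << 20 /* 1MB */);
  ASSERT_NE(cache, nullptr);
}

INSTANTIATE_TEST_CASE_P(ExampleCacheTest, ExampleCacheTest,
                        secondary_cache_test_util::GetTestingCacheTypes());

}  // namespace
}  // namespace ROCKSDB_NAMESPACE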
pre_defined : rnd->Uniform(4); switch (random_num) { case 0: @@ -252,11 +280,6 @@ TableFactory* RandomTableFactory(Random* rnd, int pre_defined) { default: return NewBlockBasedTableFactory(); } -#else - (void)rnd; - (void)pre_defined; - return NewBlockBasedTableFactory(); -#endif // !ROCKSDB_LITE } MergeOperator* RandomMergeOperator(Random* rnd) { @@ -501,13 +524,11 @@ Status CorruptFile(Env* env, const std::string& fname, int offset, s = WriteStringToFile(env, contents, fname); } if (s.ok() && verify_checksum) { -#ifndef ROCKSDB_LITE Options options; options.env = env; EnvOptions env_options; Status v = VerifySstFileChecksum(options, env_options, fname); assert(!v.ok()); -#endif } return s; } @@ -620,7 +641,6 @@ class SpecialMemTableRep : public MemTableRep { }; class SpecialSkipListFactory : public MemTableRepFactory { public: -#ifndef ROCKSDB_LITE static bool Register(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( ObjectLibrary::PatternEntry(SpecialSkipListFactory::kClassName(), true) @@ -638,8 +658,7 @@ class SpecialSkipListFactory : public MemTableRepFactory { }); return true; } -#endif // ROCKSDB_LITE - // After number of inserts exceeds `num_entries_flush` in a mem table, trigger + // After number of inserts >= `num_entries_flush` in a mem table, trigger // flush. explicit SpecialSkipListFactory(int num_entries_flush) : num_entries_flush_(num_entries_flush) {} @@ -678,7 +697,6 @@ MemTableRepFactory* NewSpecialSkipListFactory(int num_entries_per_flush) { return new SpecialSkipListFactory(num_entries_per_flush); } -#ifndef ROCKSDB_LITE // This method loads existing test classes into the ObjectRegistry int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) { size_t num_types; @@ -721,17 +739,12 @@ int RegisterTestObjects(ObjectLibrary& library, const std::string& arg) { return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE void RegisterTestLibrary(const std::string& arg) { static bool registered = false; if (!registered) { registered = true; -#ifndef ROCKSDB_LITE ObjectRegistry::Default()->AddLibrary("test", RegisterTestObjects, arg); -#else - (void)arg; -#endif // ROCKSDB_LITE } } } // namespace test diff --git a/test_util/testutil.h b/test_util/testutil.h index c2289dd819f3..eca1ff794e96 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -52,6 +52,22 @@ enum RandomKeyType : char { RANDOM, LARGEST, SMALLEST, MIDDLE }; extern std::string RandomKey(Random* rnd, int len, RandomKeyType type = RandomKeyType::RANDOM); +enum class UserDefinedTimestampTestMode { + // Test does not enable user-defined timestamp feature. + kNone, + // Test enables user-defined timestamp feature. Write/read with min timestamps + kNormal, + // Test enables user-defined timestamp feature. Write/read with min timestamps + // Set `persist_user_defined_timestamps` to false. + kStripUserDefinedTimestamp, +}; + +extern const std::vector& GetUDTTestModes(); + +extern bool IsUDTEnabled(const UserDefinedTimestampTestMode& test_mode); + +extern bool ShouldPersistUDT(const UserDefinedTimestampTestMode& test_mode); + // Store in *dst a string of length "len" that will compress to // "N*compressed_fraction" bytes and return a Slice that references // the generated data. 
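Editor's note: the new user-defined-timestamp (UDT) helpers above let a test iterate all three modes and derive the relevant settings from each. A hedged sketch of the intended consumption pattern; the option wiring is illustrative, with persist_user_defined_timestamps being the option the strip mode disables per the comment in testutil.h:

#include "rocksdb/options.h"
#include "test_util/testutil.h"

namespace ROCKSDB_NAMESPACE {

// Builds Options for one UDT test mode: disabled, enabled, or enabled with
// timestamps stripped before persistence.
Options OptionsForUDTMode(const test::UserDefinedTimestampTestMode& mode) {
  Options options;
  if (test::IsUDTEnabled(mode)) {
    options.comparator = test::BytewiseComparatorWithU64TsWrapper();
    options.persist_user_defined_timestamps = test::ShouldPersistUDT(mode);
  }
  return options;
}

void RunForAllUDTModes() {
  for (const auto& mode : test::GetUDTTestModes()) {
    Options options = OptionsForUDTMode(mode);
    (void)options;  // ... open a DB and run mode-specific assertions ...
  }
}

}  // namespace ROCKSDB_NAMESPACE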
@@ -116,6 +132,9 @@ extern const Comparator* Uint64Comparator(); // A wrapper api for getting the ComparatorWithU64Ts extern const Comparator* BytewiseComparatorWithU64TsWrapper(); +// A wrapper api for getting the ComparatorWithU64Ts +extern const Comparator* ReverseBytewiseComparatorWithU64TsWrapper(); + class StringSink : public FSWritableFile { public: std::string contents_; @@ -851,10 +870,8 @@ void DeleteDir(Env* env, const std::string& dirname); Status CreateEnvFromSystem(const ConfigOptions& options, Env** result, std::shared_ptr* guard); -#ifndef ROCKSDB_LITE // Registers the testutil classes with the ObjectLibrary int RegisterTestObjects(ObjectLibrary& library, const std::string& /*arg*/); -#endif // ROCKSDB_LITE // Register the testutil classes with the default ObjectRegistry/Library void RegisterTestLibrary(const std::string& arg = ""); diff --git a/test_util/transaction_test_util.cc b/test_util/transaction_test_util.cc index 99286d836173..11fca6d57571 100644 --- a/test_util/transaction_test_util.cc +++ b/test_util/transaction_test_util.cc @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "test_util/transaction_test_util.h" @@ -399,4 +398,3 @@ Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/test_util/transaction_test_util.h b/test_util/transaction_test_util.h index 7a38ab62681b..284b3925031a 100644 --- a/test_util/transaction_test_util.h +++ b/test_util/transaction_test_util.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "port/port.h" #include "rocksdb/options.h" @@ -146,4 +145,3 @@ class RandomTransactionInserter { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc index 98615ea3b9fc..b19c9f2a8115 100644 --- a/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc +++ b/third-party/gtest-1.8.1/fused-src/gtest/gtest-all.cc @@ -8676,7 +8676,7 @@ static void StackLowerThanAddress(const void* ptr, bool* result) { // Make sure AddressSanitizer does not tamper with the stack here. GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ static bool StackGrowsDown() { - int dummy = 1; + int dummy = 0; bool result; StackLowerThanAddress(&dummy, &result); return result; diff --git a/tools/benchmark.sh b/tools/benchmark.sh index b41d25c78744..73d9e9613562 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -6,6 +6,7 @@ EXIT_INVALID_ARGS=1 EXIT_NOT_COMPACTION_TEST=2 EXIT_UNKNOWN_JOB=3 +EXIT_INVALID_PATH=4 # Size Constants K=1024 @@ -114,6 +115,11 @@ fi job_id=${JOB_ID} +if [ ! -x ./db_bench ]; then + echo "./db_bench not found. Please make sure it exists in the current directory." + exit $EXIT_INVALID_PATH +fi + # Make it easier to run only the compaction test. Getting valid data requires # a number of iterations and having an ability to run the test separately from # rest of the benchmarks helps. diff --git a/tools/blob_dump.cc b/tools/blob_dump.cc index 1f75eb20d8a0..23b5f8f7903a 100644 --- a/tools/blob_dump.cc +++ b/tools/blob_dump.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #include #include @@ -102,10 +101,3 @@ int main(int argc, char** argv) { } return 0; } -#else -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return -1; -} -#endif // ROCKSDB_LITE diff --git a/tools/block_cache_analyzer/block_cache_pysim.py b/tools/block_cache_analyzer/block_cache_pysim.py index 67307df53299..3962f37ebecf 100644 --- a/tools/block_cache_analyzer/block_cache_pysim.py +++ b/tools/block_cache_analyzer/block_cache_pysim.py @@ -492,7 +492,7 @@ def write_policy_ratio_timeline( file.write(row + "\n") -class Policy(object): +class Policy: """ A policy maintains a set of evicted keys. It returns a reward of one to itself if it has not evicted a missing key. Otherwise, it gives itself 0 @@ -654,7 +654,7 @@ def policy_name(self): return "cc" -class Cache(object): +class Cache: """ This is the base class for the implementations of alternative cache replacement policies. @@ -1310,7 +1310,7 @@ def _should_admit(self, trace_record, key, hash, value_size): return True -class Deque(object): +class Deque: """A Deque class facilitates the implementation of LRU and ARC.""" def __init__(self): diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc index f0bb6975badf..f2d4f05bea7c 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifdef GFLAGS #include "tools/block_cache_analyzer/block_cache_trace_analyzer.h" @@ -2313,4 +2312,3 @@ int block_cache_trace_analyzer_tool(int argc, char** argv) { } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS -#endif // ROCKSDB_LITE diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc index 60834480538a..174565641f77 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -790,11 +789,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } #endif // GFLAG -#else -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "block_cache_trace_analyzer_test is not supported in ROCKSDB_LITE\n"); - return 0; -} -#endif // ROCKSDB_LITE diff --git a/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc b/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc index 44fec559892f..6e0be54d6e09 100644 --- a/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc +++ b/tools/block_cache_analyzer/block_cache_trace_analyzer_tool.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -16,10 +15,3 @@ int main(int argc, char** argv) { return ROCKSDB_NAMESPACE::block_cache_trace_analyzer_tool(argc, argv); } #endif // GFLAGS -#else // ROCKSDB_LITE -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/tools/check_format_compatible.sh b/tools/check_format_compatible.sh index e7efd4f31912..93b51a9b9e95 100755 --- a/tools/check_format_compatible.sh +++ b/tools/check_format_compatible.sh @@ -125,7 +125,7 @@ EOF # To check for DB forward compatibility with loading options (old version # reading data from new), as well as backward compatibility -declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb") +declare -a db_forward_with_options_refs=("6.27.fb" "6.28.fb" "6.29.fb" "7.0.fb" "7.1.fb" "7.2.fb" "7.3.fb" "7.4.fb" "7.5.fb" "7.6.fb" "7.7.fb" "7.8.fb" "7.9.fb" "7.10.fb" "8.0.fb" "8.1.fb" "8.2.fb" "8.3.fb" "8.4.fb" "8.5.fb" "8.6.fb" "8.7.fb" "8.8.fb" "8.9.fb") # To check for DB forward compatibility without loading options (in addition # to the "with loading options" set), as well as backward compatibility declare -a db_forward_no_options_refs=() # N/A at the moment diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index dfe0f18cb9a3..340a8a3a1e2e 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -18,7 +18,6 @@ #include #include #include - #ifdef __APPLE__ #include #include @@ -34,16 +33,16 @@ #include #include #include +#include #include #include #include -#include "cloud/aws/aws_file_system.h" #include "db/db_impl/db_impl.h" #include "db/malloc_stats.h" #include "db/version_set.h" #include "monitoring/histogram.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "options/cf_options.h" #include "port/port.h" #include "port/stack_trace.h" @@ -67,9 +66,7 @@ #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/options_type.h" #include "rocksdb/utilities/options_util.h" -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/replayer.h" -#endif // ROCKSDB_LITE #include "rocksdb/utilities/sim_cache.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" @@ -107,12 +104,6 @@ using GFLAGS_NAMESPACE::RegisterFlagValidator; using GFLAGS_NAMESPACE::SetUsageMessage; using GFLAGS_NAMESPACE::SetVersionString; -#ifdef ROCKSDB_LITE -#define IF_ROCKSDB_LITE(Then, Else) Then -#else -#define IF_ROCKSDB_LITE(Then, Else) Else -#endif - DEFINE_string( benchmarks, "fillseq," @@ -132,11 +123,9 @@ DEFINE_string( "compact," "compactall," "flush," -IF_ROCKSDB_LITE("", "compact0," "compact1," "waitforcompaction," -) "multireadrandom," "mixgraph," "readseq," @@ -233,11 +222,9 @@ IF_ROCKSDB_LITE("", "Meta operations:\n" "\tcompact -- Compact the entire DB; If multiple, randomly choose one\n" "\tcompactall -- Compact the entire DB\n" -IF_ROCKSDB_LITE("", "\tcompact0 -- compact L0 into L1\n" "\tcompact1 -- compact L1 into L2\n" "\twaitforcompaction - pause until compaction is (probably) done\n" -) "\tflush - flush the memtable\n" "\tstats -- Print DB stats\n" "\tresetstats -- Reset DB stats\n" @@ -245,9 +232,7 @@ IF_ROCKSDB_LITE("", "\tmemstats -- Print memtable stats\n" "\tsstables -- Print sstable info\n" "\theapprofile -- Dump a heap profile (if supported by this port)\n" -IF_ROCKSDB_LITE("", "\treplay -- replay 
the trace file specified with trace_file\n" -) "\tgetmergeoperands -- Insert lots of merge records which are a list of " "sorted ints for a key and then compare performance of lookup for another " "key by doing a Get followed by binary searching in the large sorted list " @@ -303,7 +288,7 @@ DEFINE_int32(bloom_locality, 0, "Control bloom filter probes locality"); DEFINE_int64(seed, 0, "Seed base for random number generators. " "When 0 it is derived from the current time."); -static int64_t seed_base; +static std::optional seed_base; DEFINE_int32(threads, 1, "Number of concurrent threads to run."); @@ -564,7 +549,7 @@ DEFINE_bool(universal_allow_trivial_move, false, DEFINE_bool(universal_incremental, false, "Enable incremental compactions in universal compaction."); -DEFINE_int64(cache_size, 8 << 20, // 8MB +DEFINE_int64(cache_size, 32 << 20, // 32MB "Number of bytes to use as a cache of uncompressed data"); DEFINE_int32(cache_numshardbits, -1, @@ -585,7 +570,7 @@ DEFINE_string(cache_type, "lru_cache", "Type of block cache."); DEFINE_bool(use_compressed_secondary_cache, false, "Use the CompressedSecondaryCache as the secondary cache."); -DEFINE_int64(compressed_secondary_cache_size, 8 << 20, // 8MB +DEFINE_int64(compressed_secondary_cache_size, 32 << 20, // 32MB "Number of bytes to use as a cache of data"); DEFINE_int32(compressed_secondary_cache_numshardbits, 6, @@ -617,6 +602,12 @@ DEFINE_uint32( "compress_format_version == 2 -- decompressed size is included" " in the block header in varint32 format."); +DEFINE_bool(use_tiered_volatile_cache, false, + "If use_compressed_secondary_cache is true and " + "use_tiered_volatile_cache is true, then allocate a tiered cache " + "that distributes cache reservations proportionally over both " + "the caches."); + DEFINE_int64(simcache_size, -1, "Number of bytes to use as a simcache of " "uncompressed data. Nagative value disables simcache."); @@ -727,7 +718,9 @@ DEFINE_int32(file_opening_threads, "If open_files is set to -1, this option set the number of " "threads that will be used to open files during DB::Open()"); -DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size"); +DEFINE_uint64(compaction_readahead_size, + ROCKSDB_NAMESPACE::Options().compaction_readahead_size, + "Compaction readahead size"); DEFINE_int32(log_readahead_size, 0, "WAL and manifest readahead size"); @@ -947,7 +940,6 @@ DEFINE_int64(max_num_range_tombstones, 0, DEFINE_bool(expand_range_tombstones, false, "Expand range tombstone into sequential regular tombstones."); -#ifndef ROCKSDB_LITE // Transactions Options DEFINE_bool(optimistic_transaction_db, false, "Open a OptimisticTransactionDB instance. " @@ -1051,7 +1043,6 @@ DEFINE_string( static enum ROCKSDB_NAMESPACE::CompressionType FLAGS_blob_db_compression_type_e = ROCKSDB_NAMESPACE::kSnappyCompression; -#endif // ROCKSDB_LITE // Integrated BlobDB options DEFINE_bool( @@ -1123,7 +1114,6 @@ DEFINE_int32(prepopulate_blob_cache, 0, "[Integrated BlobDB] Pre-populate hot/warm blobs in blob cache. 0 " "to disable and 1 to insert during flush."); -#ifndef ROCKSDB_LITE // Secondary DB instance Options DEFINE_bool(use_secondary_db, false, @@ -1138,7 +1128,6 @@ DEFINE_int32(secondary_update_interval, 5, "Secondary instance attempts to catch up with the primary every " "secondary_update_interval seconds."); -#endif // ROCKSDB_LITE DEFINE_bool(report_bg_io_stats, false, "Measure times spents on I/Os while in compactions. 
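Editor's note: turning seed_base into an std::optional above makes a read before initialization an obvious bug instead of a silent zero seed, since every consumer must dereference the optional. A small standalone sketch of the pattern, with illustrative names and defaults:

#include <cstdint>
#include <cstdio>
#include <ctime>
#include <optional>

static std::optional<int64_t> seed_base;

// Mirrors db_bench's flow: derive the seed once from the flag or the clock.
void InitSeed(int64_t seed_flag) {
  seed_base = (seed_flag == 0) ? static_cast<int64_t>(std::time(nullptr))
                               : seed_flag;
}

int64_t SeedForThread(int thread_index) {
  // *seed_base is only meaningful after InitSeed(); an uninitialized read is
  // now a detectable misuse rather than an implicit 0.
  return *seed_base + thread_index;
}

int main(int argc, char**) {
  InitSeed(argc > 1 ? 42 : 0);
  std::printf("%lld\n", static_cast<long long>(SeedForThread(3)));
  return 0;
}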
"); @@ -1146,7 +1135,6 @@ DEFINE_bool(report_bg_io_stats, false, DEFINE_bool(use_stderr_info_logger, false, "Write info logs to stderr instead of to LOG file. "); -#ifndef ROCKSDB_LITE DEFINE_string(trace_file, "", "Trace workload to a file. "); @@ -1155,8 +1143,6 @@ DEFINE_double(trace_replay_fast_forward, 1.0, DEFINE_int32(block_cache_trace_sampling_frequency, 1, "Block cache trace sampling frequency, termed s. It uses spatial " "downsampling and samples accesses to one out of s blocks."); -DEFINE_bool(block_cache_trace_filter_referenced_key, false, - "If true, block cache trace will not include referenced_key"); DEFINE_int64( block_cache_trace_max_trace_file_size_in_bytes, uint64_t{64} * 1024 * 1024 * 1024, @@ -1170,7 +1156,6 @@ DEFINE_int32(trace_replay_threads, 1, DEFINE_bool(io_uring_enabled, true, "If true, enable the use of IO uring if the platform supports it"); extern "C" bool RocksDbIOUringEnable() { return FLAGS_io_uring_enabled; } -#endif // ROCKSDB_LITE DEFINE_bool(adaptive_readahead, false, "carry forward internal auto readahead size from one file to next " @@ -1260,6 +1245,10 @@ DEFINE_uint64( "num_file_reads_for_auto_readahead indicates after how many sequential " "reads into that file internal auto prefetching should be start."); +DEFINE_bool( + auto_readahead_size, false, + "When set true, RocksDB does auto tuning of readahead size during Scans"); + static enum ROCKSDB_NAMESPACE::CompressionType StringToCompressionType( const char* ctype) { assert(ctype); @@ -1347,19 +1336,12 @@ static bool ValidateTableCacheNumshardbits(const char* flagname, } DEFINE_int32(table_cache_numshardbits, 4, ""); -#ifndef ROCKSDB_LITE DEFINE_string(env_uri, "", "URI for registry Env lookup. Mutually exclusive with --fs_uri"); DEFINE_string(fs_uri, "", "URI for registry Filesystem lookup. Mutually exclusive" " with --env_uri." " Creates a default environment with the specified filesystem."); -DEFINE_string(aws_access_id, "", "Access id for AWS"); -DEFINE_string(aws_secret_key, "", "Secret key for AWS"); -DEFINE_string(aws_region, "", "AWS region"); -DEFINE_bool(keep_local_sst_files, true, - "Keep all files in local storage as well as cloud storage"); -#endif // ROCKSDB_LITE DEFINE_string(simulate_hybrid_fs_file, "", "File for Store Metadata for Simulate hybrid FS. Empty means " "disable the feature. 
Now, if it is set, last_level_temperature " @@ -1556,12 +1538,10 @@ DEFINE_uint64(max_compaction_bytes, ROCKSDB_NAMESPACE::Options().max_compaction_bytes, "Max bytes allowed in one compaction"); -#ifndef ROCKSDB_LITE DEFINE_bool(readonly, false, "Run read only benchmarks."); DEFINE_bool(print_malloc_stats, false, "Print malloc stats to stdout after benchmarks finish."); -#endif // ROCKSDB_LITE DEFINE_bool(disable_auto_compactions, false, "Do not auto trigger compactions"); @@ -1691,77 +1671,6 @@ DEFINE_int64(multiread_stride, 0, "Stride length for the keys in a MultiGet batch"); DEFINE_bool(multiread_batched, false, "Use the new MultiGet API"); -enum RepFactory { - kSkipList, - kPrefixHash, - kVectorRep, - kHashLinkedList, -}; - -// create Factory for creating S3 Envs -#ifndef ROCKSDB_LITE -#ifdef USE_AWS -ROCKSDB_NAMESPACE::Env* CreateAwsEnv( - const std::string& dbpath, - std::unique_ptr* result) { - fprintf(stderr, "Creating AwsEnv for path %s\n", dbpath.c_str()); - std::shared_ptr info_log; - info_log.reset(new ROCKSDB_NAMESPACE::StderrLogger( - ROCKSDB_NAMESPACE::InfoLogLevel::WARN_LEVEL)); - ROCKSDB_NAMESPACE::CloudFileSystemOptions coptions; - std::string region; - if (FLAGS_aws_access_id.size() != 0) { - coptions.credentials.InitializeSimple(FLAGS_aws_access_id, - FLAGS_aws_secret_key); - region = FLAGS_aws_region; - } - assert(coptions.credentials.HasValid().ok()); - - coptions.keep_local_sst_files = FLAGS_keep_local_sst_files; - if (FLAGS_db.empty()) { - coptions.TEST_Initialize("dbbench.", "db-bench", region); - } else { - coptions.TEST_Initialize("dbbench.", FLAGS_db, region); - } - ROCKSDB_NAMESPACE::CloudFileSystem* s; - auto st = ROCKSDB_NAMESPACE::AwsFileSystem::NewAwsFileSystem( - ROCKSDB_NAMESPACE::FileSystem::Default(), coptions, std::move(info_log), - &s); - assert(st.ok()); - ((ROCKSDB_NAMESPACE::CloudFileSystemImpl*)s)->TEST_DisableCloudManifest(); - *result = rocksdb::NewCompositeEnv(std::shared_ptr(s)); - return result->get(); -} - -static const auto& s3_reg __attribute__((__unused__)) = - ROCKSDB_NAMESPACE::ObjectLibrary::Default() - -> AddFactory( - ROCKSDB_NAMESPACE::ObjectLibrary::PatternEntry("s3").AddSeparator("://", false), - [](const std::string& uri, - std::unique_ptr* guard, std::string*) { - CreateAwsEnv(uri, guard); - return guard->get(); - }); -#endif /* USE_AWS */ -#endif // ROCKSDB_LITE - -static enum RepFactory StringToRepFactory(const char* ctype) { - assert(ctype); - - if (!strcasecmp(ctype, "skip_list")) - return kSkipList; - else if (!strcasecmp(ctype, "prefix_hash")) - return kPrefixHash; - else if (!strcasecmp(ctype, "vector")) - return kVectorRep; - else if (!strcasecmp(ctype, "hash_linkedlist")) - return kHashLinkedList; - - fprintf(stdout, "Cannot parse memreptable %s\n", ctype); - return kSkipList; -} - -static enum RepFactory FLAGS_rep_factory; DEFINE_string(memtablerep, "skip_list", ""); DEFINE_int64(hash_bucket_count, 1024 * 1024, "hash bucket count"); DEFINE_bool(use_plain_table, false, @@ -1788,11 +1697,9 @@ DEFINE_bool(read_with_latest_user_timestamp, true, "If true, always use the current latest timestamp for read. 
If " "false, choose a random timestamp from the past."); -#ifndef ROCKSDB_LITE DEFINE_string(secondary_cache_uri, "", "Full URI for creating a custom secondary cache object"); static class std::shared_ptr secondary_cache; -#endif // ROCKSDB_LITE static const bool FLAGS_prefix_size_dummy __attribute__((__unused__)) = RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); @@ -1831,6 +1738,10 @@ DEFINE_uint32( "This options determines the size of such checksums. " "Supported values: 0, 1, 2, 4, 8."); +DEFINE_uint32(block_protection_bytes_per_key, 0, + "Enable block per key-value checksum protection. " + "Supported values: 0, 1, 2, 4, 8."); + DEFINE_bool(build_info, false, "Print the build info via GetRocksBuildInfoAsString"); @@ -1845,7 +1756,6 @@ static Status CreateMemTableRepFactory( Status s; if (!strcasecmp(FLAGS_memtablerep.c_str(), SkipListFactory::kNickName())) { factory->reset(new SkipListFactory(FLAGS_skip_list_lookahead)); -#ifndef ROCKSDB_LITE } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "prefix_hash")) { factory->reset(NewHashSkipListRepFactory(FLAGS_hash_bucket_count)); } else if (!strcasecmp(FLAGS_memtablerep.c_str(), @@ -1853,7 +1763,6 @@ static Status CreateMemTableRepFactory( factory->reset(new VectorRepFactory()); } else if (!strcasecmp(FLAGS_memtablerep.c_str(), "hash_linkedlist")) { factory->reset(NewHashLinkListRepFactory(FLAGS_hash_bucket_count)); -#endif // ROCKSDB_LITE } else { std::unique_ptr unique; s = MemTableRepFactory::CreateFromString(config_options, FLAGS_memtablerep, @@ -2016,9 +1925,7 @@ static void AppendWithSpace(std::string* str, Slice msg) { struct DBWithColumnFamilies { std::vector cfh; DB* db; -#ifndef ROCKSDB_LITE OptimisticTransactionDB* opt_txn_db; -#endif // ROCKSDB_LITE std::atomic num_created; // Need to be updated after all the // new entries in cfh are set. size_t num_hot; // Number of column families to be queried at each moment. 
@@ -2030,10 +1937,8 @@ struct DBWithColumnFamilies { DBWithColumnFamilies() : db(nullptr) -#ifndef ROCKSDB_LITE , opt_txn_db(nullptr) -#endif // ROCKSDB_LITE { cfh.clear(); num_created = 0; @@ -2043,9 +1948,7 @@ struct DBWithColumnFamilies { DBWithColumnFamilies(const DBWithColumnFamilies& other) : cfh(other.cfh), db(other.db), -#ifndef ROCKSDB_LITE opt_txn_db(other.opt_txn_db), -#endif // ROCKSDB_LITE num_created(other.num_created.load()), num_hot(other.num_hot), cfh_idx_to_prob(other.cfh_idx_to_prob) { @@ -2055,7 +1958,6 @@ struct DBWithColumnFamilies { std::for_each(cfh.begin(), cfh.end(), [](ColumnFamilyHandle* cfhi) { delete cfhi; }); cfh.clear(); -#ifndef ROCKSDB_LITE if (opt_txn_db) { delete opt_txn_db; opt_txn_db = nullptr; @@ -2063,10 +1965,6 @@ struct DBWithColumnFamilies { delete db; db = nullptr; } -#else - delete db; - db = nullptr; -#endif // ROCKSDB_LITE } ColumnFamilyHandle* GetCfh(int64_t rand_num) { @@ -2721,7 +2619,7 @@ struct ThreadState { SharedState* shared; explicit ThreadState(int index, int my_seed) - : tid(index), rand(seed_base + my_seed) {} + : tid(index), rand(*seed_base + my_seed) {} }; class Duration { @@ -2783,10 +2681,8 @@ class Benchmark { ReadOptions read_options_; WriteOptions write_options_; Options open_options_; // keep options around to properly destroy db later -#ifndef ROCKSDB_LITE TraceOptions trace_options_; TraceOptions block_cache_trace_options_; -#endif int64_t reads_; int64_t deletes_; double read_random_exp_range_; @@ -2800,7 +2696,6 @@ class Benchmark { class ErrorHandlerListener : public EventListener { public: -#ifndef ROCKSDB_LITE ErrorHandlerListener() : mutex_(), cv_(&mutex_), @@ -2845,10 +2740,6 @@ class Benchmark { InstrumentedCondVar cv_; bool no_auto_recovery_; bool recovery_complete_; -#else // ROCKSDB_LITE - bool WaitForRecovery(uint64_t /*abs_time_us*/) { return true; } - void EnableAutoRecovery(bool /*enable*/) {} -#endif // ROCKSDB_LITE }; std::shared_ptr listener_; @@ -2949,7 +2840,7 @@ class Benchmark { std::string input_str(len, 'y'); std::string compressed; CompressionOptions opts; - CompressionContext context(FLAGS_compression_type_e); + CompressionContext context(FLAGS_compression_type_e, opts); CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), FLAGS_compression_type_e, FLAGS_sample_for_compression); @@ -3124,26 +3015,67 @@ class Benchmark { return allocator; } + static int32_t GetCacheHashSeed() { + // For a fixed Cache seed, need a non-negative int32 + return static_cast(*seed_base) & 0x7fffffff; + } + static std::shared_ptr NewCache(int64_t capacity) { + CompressedSecondaryCacheOptions secondary_cache_opts; + bool use_tiered_cache = false; if (capacity <= 0) { return nullptr; } + if (FLAGS_use_compressed_secondary_cache) { + secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size; + secondary_cache_opts.num_shard_bits = + FLAGS_compressed_secondary_cache_numshardbits; + secondary_cache_opts.high_pri_pool_ratio = + FLAGS_compressed_secondary_cache_high_pri_pool_ratio; + secondary_cache_opts.low_pri_pool_ratio = + FLAGS_compressed_secondary_cache_low_pri_pool_ratio; + secondary_cache_opts.compression_type = + FLAGS_compressed_secondary_cache_compression_type_e; + secondary_cache_opts.compress_format_version = + FLAGS_compressed_secondary_cache_compress_format_version; + if (FLAGS_use_tiered_volatile_cache) { + use_tiered_cache = true; + } + } if (FLAGS_cache_type == "clock_cache") { fprintf(stderr, "Old clock cache implementation has been removed.\n"); exit(1); - } else if 
(FLAGS_cache_type == "hyper_clock_cache") { - return HyperClockCacheOptions(static_cast(capacity), - FLAGS_block_size /*estimated_entry_charge*/, - FLAGS_cache_numshardbits) - .MakeSharedCache(); + } else if (EndsWith(FLAGS_cache_type, "hyper_clock_cache")) { + size_t estimated_entry_charge; + if (FLAGS_cache_type == "fixed_hyper_clock_cache" || + FLAGS_cache_type == "hyper_clock_cache") { + estimated_entry_charge = FLAGS_block_size; + } else if (FLAGS_cache_type == "auto_hyper_clock_cache") { + estimated_entry_charge = 0; + } else { + fprintf(stderr, "Cache type not supported."); + exit(1); + } + HyperClockCacheOptions opts(FLAGS_cache_size, estimated_entry_charge, + FLAGS_cache_numshardbits); + opts.hash_seed = GetCacheHashSeed(); + if (use_tiered_cache) { + TieredCacheOptions tiered_opts; + opts.capacity += secondary_cache_opts.capacity; + tiered_opts.cache_type = PrimaryCacheType::kCacheTypeHCC; + tiered_opts.cache_opts = &opts; + tiered_opts.comp_cache_opts = secondary_cache_opts; + return NewTieredCache(tiered_opts); + } else { + return opts.MakeSharedCache(); + } } else if (FLAGS_cache_type == "lru_cache") { LRUCacheOptions opts( static_cast(capacity), FLAGS_cache_numshardbits, false /*strict_capacity_limit*/, FLAGS_cache_high_pri_pool_ratio, GetCacheAllocator(), kDefaultToAdaptiveMutex, kDefaultCacheMetadataChargePolicy, FLAGS_cache_low_pri_pool_ratio); - -#ifndef ROCKSDB_LITE + opts.hash_seed = GetCacheHashSeed(); if (!FLAGS_secondary_cache_uri.empty()) { Status s = SecondaryCache::CreateFromString( ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache); @@ -3155,27 +3087,21 @@ class Benchmark { exit(1); } opts.secondary_cache = secondary_cache; - } -#endif // ROCKSDB_LITE - - if (FLAGS_use_compressed_secondary_cache) { - CompressedSecondaryCacheOptions secondary_cache_opts; - secondary_cache_opts.capacity = FLAGS_compressed_secondary_cache_size; - secondary_cache_opts.num_shard_bits = - FLAGS_compressed_secondary_cache_numshardbits; - secondary_cache_opts.high_pri_pool_ratio = - FLAGS_compressed_secondary_cache_high_pri_pool_ratio; - secondary_cache_opts.low_pri_pool_ratio = - FLAGS_compressed_secondary_cache_low_pri_pool_ratio; - secondary_cache_opts.compression_type = - FLAGS_compressed_secondary_cache_compression_type_e; - secondary_cache_opts.compress_format_version = - FLAGS_compressed_secondary_cache_compress_format_version; + } else if (FLAGS_use_compressed_secondary_cache && !use_tiered_cache) { opts.secondary_cache = NewCompressedSecondaryCache(secondary_cache_opts); } - return NewLRUCache(opts); + if (use_tiered_cache) { + TieredCacheOptions tiered_opts; + opts.capacity += secondary_cache_opts.capacity; + tiered_opts.cache_type = PrimaryCacheType::kCacheTypeLRU; + tiered_opts.cache_opts = &opts; + tiered_opts.comp_cache_opts = secondary_cache_opts; + return NewTieredCache(tiered_opts); + } else { + return opts.MakeSharedCache(); + } } else { fprintf(stderr, "Cache type not supported."); exit(1); @@ -3205,11 +3131,7 @@ class Benchmark { : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads)), merge_keys_(FLAGS_merge_keys < 0 ? 
FLAGS_num : FLAGS_merge_keys), report_file_operations_(FLAGS_report_file_operations), -#ifndef ROCKSDB_LITE use_blob_db_(FLAGS_use_blob_db), // Stacked BlobDB -#else - use_blob_db_(false), // Stacked BlobDB -#endif // !ROCKSDB_LITE read_operands_(false) { // use simcache instead of cache if (FLAGS_simcache_size >= 0) { @@ -3245,12 +3167,10 @@ class Benchmark { if (!FLAGS_wal_dir.empty()) { options.wal_dir = FLAGS_wal_dir; } -#ifndef ROCKSDB_LITE if (use_blob_db_) { // Stacked BlobDB blob_db::DestroyBlobDB(FLAGS_db, options, blob_db::BlobDBOptions()); } -#endif // !ROCKSDB_LITE DestroyDB(FLAGS_db, options); if (!FLAGS_wal_dir.empty()) { FLAGS_env->DeleteDir(FLAGS_wal_dir); @@ -3454,6 +3374,7 @@ class Benchmark { read_options_.adaptive_readahead = FLAGS_adaptive_readahead; read_options_.async_io = FLAGS_async_io; read_options_.optimize_multiget_for_io = FLAGS_optimize_multiget_for_io; + read_options_.auto_readahead_size = FLAGS_auto_readahead_size; void (Benchmark::*method)(ThreadState*) = nullptr; void (Benchmark::*post_process_method)() = nullptr; @@ -3647,14 +3568,12 @@ class Benchmark { method = &Benchmark::Compact; } else if (name == "compactall") { CompactAll(); -#ifndef ROCKSDB_LITE } else if (name == "compact0") { CompactLevel(0); } else if (name == "compact1") { CompactLevel(1); } else if (name == "waitforcompaction") { WaitForCompaction(); -#endif } else if (name == "flush") { Flush(); } else if (name == "crc32c") { @@ -3671,11 +3590,9 @@ class Benchmark { method = &Benchmark::Compress; } else if (name == "uncompress") { method = &Benchmark::Uncompress; -#ifndef ROCKSDB_LITE } else if (name == "randomtransaction") { method = &Benchmark::RandomTransaction; post_process_method = &Benchmark::RandomTransactionVerify; -#endif // ROCKSDB_LITE } else if (name == "randomreplacekeys") { fresh_db = true; method = &Benchmark::RandomReplaceKeys; @@ -3691,6 +3608,8 @@ class Benchmark { } else if (name == "block_cache_entry_stats") { // DB::Properties::kBlockCacheEntryStats PrintStats("rocksdb.block-cache-entry-stats"); + } else if (name == "cache_report_problems") { + CacheReportProblems(); } else if (name == "stats") { PrintStats("rocksdb.stats"); } else if (name == "resetstats") { @@ -3711,7 +3630,6 @@ class Benchmark { PrintStats("rocksdb.sstables"); } else if (name == "stats_history") { PrintStatsHistory(); -#ifndef ROCKSDB_LITE } else if (name == "replay") { if (num_threads > 1) { fprintf(stderr, "Multi-threaded replay is not yet supported\n"); @@ -3722,24 +3640,19 @@ class Benchmark { ErrorExit(); } method = &Benchmark::Replay; -#endif // ROCKSDB_LITE } else if (name == "getmergeoperands") { method = &Benchmark::GetMergeOperands; -#ifndef ROCKSDB_LITE } else if (name == "verifychecksum") { method = &Benchmark::VerifyChecksum; } else if (name == "verifyfilechecksums") { method = &Benchmark::VerifyFileChecksums; -#endif // ROCKSDB_LITE } else if (name == "readrandomoperands") { read_operands_ = true; method = &Benchmark::ReadRandom; -#ifndef ROCKSDB_LITE } else if (name == "backup") { method = &Benchmark::Backup; } else if (name == "restore") { method = &Benchmark::Restore; -#endif } else if (!name.empty()) { // No error message for empty name fprintf(stderr, "unknown benchmark '%s'\n", name.c_str()); ErrorExit(); @@ -3771,7 +3684,6 @@ class Benchmark { if (method != nullptr) { fprintf(stdout, "DB path: [%s]\n", FLAGS_db.c_str()); -#ifndef ROCKSDB_LITE if (name == "backup") { std::cout << "Backup path: [" << FLAGS_backup_dir << "]" << std::endl; } else if (name == "restore") { @@ -3820,9 
+3732,6 @@ class Benchmark { FLAGS_block_cache_trace_max_trace_file_size_in_bytes; block_cache_trace_options_.sampling_frequency = FLAGS_block_cache_trace_sampling_frequency; - if (FLAGS_block_cache_trace_filter_referenced_key) { - block_cache_trace_options_.filter |= kTraceFilterReferencedKey; - } std::unique_ptr block_cache_trace_writer; Status s = NewFileTraceWriter(FLAGS_env, EnvOptions(), FLAGS_block_cache_trace_file, @@ -3845,7 +3754,6 @@ class Benchmark { fprintf(stdout, "Tracing block cache accesses to: [%s]\n", FLAGS_block_cache_trace_file.c_str()); } -#endif // ROCKSDB_LITE if (num_warmup > 0) { printf("Warming up benchmark by running %d times\n", num_warmup); @@ -3884,7 +3792,6 @@ class Benchmark { secondary_update_thread_.reset(); } -#ifndef ROCKSDB_LITE if (name != "replay" && FLAGS_trace_file != "") { Status s = db_.db->EndTrace(); if (!s.ok()) { @@ -3900,7 +3807,6 @@ class Benchmark { s.ToString().c_str()); } } -#endif // ROCKSDB_LITE if (FLAGS_statistics) { fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); @@ -3911,21 +3817,17 @@ class Benchmark { static_cast_with_check(cache_.get())->ToString().c_str()); } -#ifndef ROCKSDB_LITE if (FLAGS_use_secondary_db) { fprintf(stdout, "Secondary instance updated %" PRIu64 " times.\n", secondary_db_updates_); } -#endif // ROCKSDB_LITE } private: std::shared_ptr timestamp_emulator_; std::unique_ptr secondary_update_thread_; std::atomic secondary_update_stopped_{0}; -#ifndef ROCKSDB_LITE uint64_t secondary_db_updates_ = 0; -#endif // ROCKSDB_LITE struct ThreadArg { Benchmark* bm; SharedState* shared; @@ -4109,7 +4011,8 @@ class Benchmark { bool ok = true; std::string compressed; CompressionOptions opts; - CompressionContext context(FLAGS_compression_type_e); + opts.level = FLAGS_compression_level; + CompressionContext context(FLAGS_compression_type_e, opts); CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), FLAGS_compression_type_e, FLAGS_sample_for_compression); @@ -4138,8 +4041,10 @@ class Benchmark { Slice input = gen.Generate(FLAGS_block_size); std::string compressed; - CompressionContext compression_ctx(FLAGS_compression_type_e); CompressionOptions compression_opts; + compression_opts.level = FLAGS_compression_level; + CompressionContext compression_ctx(FLAGS_compression_type_e, + compression_opts); CompressionInfo compression_info( compression_opts, compression_ctx, CompressionDict::GetEmptyDict(), FLAGS_compression_type_e, FLAGS_sample_for_compression); @@ -4173,12 +4078,15 @@ class Benchmark { // Returns true if the options is initialized from the specified // options file. 
bool InitializeOptionsFromFile(Options* opts) { -#ifndef ROCKSDB_LITE printf("Initializing RocksDB Options from the specified file\n"); DBOptions db_opts; std::vector cf_descs; if (FLAGS_options_file != "") { - auto s = LoadOptionsFromFile(FLAGS_options_file, FLAGS_env, &db_opts, + ConfigOptions config_opts; + config_opts.ignore_unknown_options = false; + config_opts.input_strings_escaped = true; + config_opts.env = FLAGS_env; + auto s = LoadOptionsFromFile(config_opts, FLAGS_options_file, &db_opts, &cf_descs); db_opts.env = FLAGS_env; if (s.ok()) { @@ -4189,9 +4097,6 @@ class Benchmark { FLAGS_options_file.c_str(), s.ToString().c_str()); exit(1); } -#else - (void)opts; -#endif return false; } @@ -4253,13 +4158,11 @@ class Benchmark { FLAGS_use_direct_io_for_flush_and_compaction; options.manual_wal_flush = FLAGS_manual_wal_flush; options.wal_compression = FLAGS_wal_compression_e; -#ifndef ROCKSDB_LITE options.ttl = FLAGS_fifo_compaction_ttl; options.compaction_options_fifo = CompactionOptionsFIFO( FLAGS_fifo_compaction_max_table_files_size_mb * 1024 * 1024, FLAGS_fifo_compaction_allow_compaction); options.compaction_options_fifo.age_for_warm = FLAGS_fifo_age_for_warm; -#endif // ROCKSDB_LITE options.prefix_extractor = prefix_extractor_; if (FLAGS_use_uint64_comparator) { options.comparator = test::Uint64Comparator(); @@ -4269,7 +4172,7 @@ class Benchmark { } } if (FLAGS_use_stderr_info_logger) { - options.info_log.reset(new StderrLogger()); + options.info_log = std::make_shared(); } options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 2048 : 0; options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio; @@ -4309,7 +4212,6 @@ class Benchmark { exit(1); } if (FLAGS_use_plain_table) { -#ifndef ROCKSDB_LITE if (!options.memtable_factory->IsInstanceOf("prefix_hash") && !options.memtable_factory->IsInstanceOf("hash_linkedlist")) { fprintf(stderr, "Warning: plain table is used with %s\n", @@ -4327,12 +4229,7 @@ class Benchmark { plain_table_options.hash_table_ratio = 0.75; options.table_factory = std::shared_ptr( NewPlainTableFactory(plain_table_options)); -#else - fprintf(stderr, "Plain table is not supported in lite mode\n"); - exit(1); -#endif // ROCKSDB_LITE } else if (FLAGS_use_cuckoo_table) { -#ifndef ROCKSDB_LITE if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) { fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); exit(1); @@ -4348,10 +4245,6 @@ class Benchmark { table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; options.table_factory = std::shared_ptr(NewCuckooTableFactory(table_options)); -#else - fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); - exit(1); -#endif // ROCKSDB_LITE } else { BlockBasedTableOptions block_based_options; block_based_options.checksum = @@ -4453,7 +4346,6 @@ class Benchmark { {/*.charged = */ FLAGS_charge_blob_cache ? 
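Editor's note: InitializeOptionsFromFile() above now uses the ConfigOptions-based overload of LoadOptionsFromFile() instead of the deprecated (path, Env*) signature. A hedged sketch of the same call in isolation (the wrapper function is illustrative):

#include <string>
#include <vector>

#include "rocksdb/utilities/options_util.h"

namespace ROCKSDB_NAMESPACE {

Status LoadBenchOptions(const std::string& options_file, Env* env,
                        DBOptions* db_opts,
                        std::vector<ColumnFamilyDescriptor>* cf_descs) {
  ConfigOptions config_opts;
  config_opts.ignore_unknown_options = false;  // fail fast on unknown entries
  config_opts.input_strings_escaped = true;    // OPTIONS files store escaped strings
  config_opts.env = env;
  return LoadOptionsFromFile(config_opts, options_file, db_opts, cf_descs);
}

}  // namespace ROCKSDB_NAMESPACE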
CacheEntryRoleOptions::Decision::kEnabled : CacheEntryRoleOptions::Decision::kDisabled}}); - block_based_options.block_cache_compressed = compressed_cache_; block_based_options.block_size = FLAGS_block_size; block_based_options.block_restart_interval = FLAGS_block_restart_interval; block_based_options.index_block_restart_interval = @@ -4496,7 +4388,6 @@ class Benchmark { block_based_options.data_block_hash_table_util_ratio = FLAGS_data_block_hash_table_util_ratio; if (FLAGS_read_cache_path != "") { -#ifndef ROCKSDB_LITE Status rc_status; // Read cache need to be provided with a the Logger, we will put all @@ -4528,11 +4419,6 @@ class Benchmark { rc_status.ToString().c_str()); exit(1); } -#else - fprintf(stderr, "Read cache is not supported in LITE\n"); - exit(1); - -#endif } if (FLAGS_use_blob_cache) { @@ -4728,7 +4614,6 @@ class Benchmark { FLAGS_blob_compaction_readahead_size; options.blob_file_starting_level = FLAGS_blob_file_starting_level; -#ifndef ROCKSDB_LITE if (FLAGS_readonly && FLAGS_transaction_db) { fprintf(stderr, "Cannot use readonly flag with transaction_db\n"); exit(1); @@ -4738,9 +4623,10 @@ class Benchmark { fprintf(stderr, "Cannot use use_secondary_db flag with transaction_db\n"); exit(1); } -#endif // ROCKSDB_LITE options.memtable_protection_bytes_per_key = FLAGS_memtable_protection_bytes_per_key; + options.block_protection_bytes_per_key = + FLAGS_block_protection_bytes_per_key; } void InitializeOptionsGeneral(Options* opts) { @@ -4766,7 +4652,7 @@ class Benchmark { if (FLAGS_cache_size > 0) { // This violates this function's rules on when to set options. But we // have to do it because the case of unconfigured block cache in OPTIONS - // file is indistinguishable (it is sanitized to 8MB by this point, not + // file is indistinguishable (it is sanitized to 32MB by this point, not // nullptr), and our regression tests assume this will be the shared // block cache, even with OPTIONS file provided. table_options->block_cache = cache_; @@ -4919,7 +4805,6 @@ class Benchmark { exit(1); } } -#ifndef ROCKSDB_LITE if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, column_families, &db->cfh, &db->db); @@ -4945,14 +4830,10 @@ class Benchmark { } else { s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); } -#else - s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); -#endif // ROCKSDB_LITE db->cfh.resize(FLAGS_num_column_families); db->num_created = num_hot; db->num_hot = num_hot; db->cfh_idx_to_prob = std::move(cfh_idx_to_prob); -#ifndef ROCKSDB_LITE } else if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, &db->db); } else if (FLAGS_optimistic_transaction_db) { @@ -5018,7 +4899,6 @@ class Benchmark { }, FLAGS_secondary_update_interval, db)); } -#endif // ROCKSDB_LITE } else { s = DB::Open(options, db_name, &db->db); } @@ -5067,7 +4947,7 @@ class Benchmark { values_[i] = i; } RandomShuffle(values_.begin(), values_.end(), - static_cast(seed_base)); + static_cast(*seed_base)); } } @@ -5179,7 +5059,7 @@ class Benchmark { // Default_random_engine provides slightly // improved throughput over mt19937. std::default_random_engine overwrite_gen{ - static_cast(seed_base)}; + static_cast(*seed_base)}; std::bernoulli_distribution overwrite_decider(p); // Inserted key window is filled with the last N @@ -5189,7 +5069,7 @@ class Benchmark { // - random access is O(1) // - insertion/removal at beginning/end is also O(1). 
std::deque inserted_key_window; - Random64 reservoir_id_gen(seed_base); + Random64 reservoir_id_gen(*seed_base); // --- Variables used in disposable/persistent keys simulation: // The following variables are used when @@ -5226,7 +5106,7 @@ class Benchmark { ErrorExit(); } } - Random rnd_disposable_entry(static_cast(seed_base)); + Random rnd_disposable_entry(static_cast(*seed_base)); std::string random_value; // Queue that stores scheduled timestamp of disposable entries deletes, // along with starting index of disposable entry keys to delete. @@ -5402,7 +5282,6 @@ class Benchmark { val = gen.Generate(); } if (use_blob_db_) { -#ifndef ROCKSDB_LITE // Stacked BlobDB blob_db::BlobDB* blobdb = static_cast(db_with_cfh->db); @@ -5412,7 +5291,6 @@ class Benchmark { } else { s = blobdb->Put(write_options_, key, val); } -#endif // ROCKSDB_LITE } else if (FLAGS_num_column_families <= 1) { batch.Put(key, val); } else { @@ -5457,11 +5335,9 @@ class Benchmark { GenerateKeyFromInt(begin_num + offset, FLAGS_num, &expanded_keys[offset]); if (use_blob_db_) { -#ifndef ROCKSDB_LITE // Stacked BlobDB s = db_with_cfh->db->Delete(write_options_, expanded_keys[offset]); -#endif // ROCKSDB_LITE } else if (FLAGS_num_column_families <= 1) { batch.Delete(expanded_keys[offset]); } else { @@ -5474,12 +5350,10 @@ class Benchmark { GenerateKeyFromInt(begin_num + range_tombstone_width_, FLAGS_num, &end_key); if (use_blob_db_) { -#ifndef ROCKSDB_LITE // Stacked BlobDB s = db_with_cfh->db->DeleteRange( write_options_, db_with_cfh->db->DefaultColumnFamily(), begin_key, end_key); -#endif // ROCKSDB_LITE } else if (FLAGS_num_column_families <= 1) { batch.DeleteRange(begin_key, end_key); } else { @@ -5565,7 +5439,6 @@ class Benchmark { Status DoDeterministicCompact(ThreadState* thread, CompactionStyle compaction_style, WriteMode write_mode) { -#ifndef ROCKSDB_LITE ColumnFamilyMetaData meta; std::vector db_list; if (db_.db != nullptr) { @@ -5866,14 +5739,6 @@ class Benchmark { std::to_string(options_list[i].level0_stop_writes_trigger)}}); } return Status::OK(); -#else - (void)thread; - (void)compaction_style; - (void)write_mode; - fprintf(stderr, "Rocksdb Lite doesn't support filldeterministic\n"); - return Status::NotSupported( - "Rocksdb Lite doesn't support filldeterministic"); -#endif // ROCKSDB_LITE } void ReadSequential(ThreadState* thread) { @@ -5898,6 +5763,7 @@ class Benchmark { options.adaptive_readahead = FLAGS_adaptive_readahead; options.async_io = FLAGS_async_io; + options.auto_readahead_size = FLAGS_auto_readahead_size; Iterator* iter = db->NewIterator(options); int64_t i = 0; @@ -7175,6 +7041,8 @@ class Benchmark { thread->stats.FinishedOps(&db_, db_.db, 1, kSeek); } + (void)num_seek_to_first; + (void)num_next; delete iter; } @@ -7883,7 +7751,6 @@ class Benchmark { } } -#ifndef ROCKSDB_LITE void VerifyChecksum(ThreadState* thread) { DB* db = SelectDB(thread); ReadOptions ro; @@ -7892,6 +7759,7 @@ class Benchmark { ro.rate_limiter_priority = FLAGS_rate_limit_user_ops ? Env::IO_USER : Env::IO_TOTAL; ro.readahead_size = FLAGS_readahead_size; + ro.auto_readahead_size = FLAGS_auto_readahead_size; Status s = db->VerifyChecksum(ro); if (!s.ok()) { fprintf(stderr, "VerifyChecksum() failed: %s\n", s.ToString().c_str()); @@ -7907,6 +7775,7 @@ class Benchmark { ro.rate_limiter_priority = FLAGS_rate_limit_user_ops ? 
Env::IO_USER : Env::IO_TOTAL; ro.readahead_size = FLAGS_readahead_size; + ro.auto_readahead_size = FLAGS_auto_readahead_size; Status s = db->VerifyFileChecksums(ro); if (!s.ok()) { fprintf(stderr, "VerifyFileChecksums() failed: %s\n", @@ -8006,7 +7875,6 @@ class Benchmark { fprintf(stdout, "RandomTransactionVerify FAILED!!\n"); } } -#endif // ROCKSDB_LITE // Writes and deletes random keys without overwriting keys. // @@ -8257,60 +8125,25 @@ class Benchmark { } } -#ifndef ROCKSDB_LITE void WaitForCompactionHelper(DBWithColumnFamilies& db) { - // This is an imperfect way of waiting for compaction. The loop and sleep - // is done because a thread that finishes a compaction job should get a - // chance to pickup a new compaction job. - - std::vector keys = {DB::Properties::kMemTableFlushPending, - DB::Properties::kNumRunningFlushes, - DB::Properties::kCompactionPending, - DB::Properties::kNumRunningCompactions}; - fprintf(stdout, "waitforcompaction(%s): started\n", db.db->GetName().c_str()); - while (true) { - bool retry = false; - - for (const auto& k : keys) { - uint64_t v; - if (!db.db->GetIntProperty(k, &v)) { - fprintf(stderr, "waitforcompaction(%s): GetIntProperty(%s) failed\n", - db.db->GetName().c_str(), k.c_str()); - exit(1); - } else if (v > 0) { - fprintf(stdout, - "waitforcompaction(%s): active(%s). Sleep 10 seconds\n", - db.db->GetName().c_str(), k.c_str()); - FLAGS_env->SleepForMicroseconds(10 * 1000000); - retry = true; - break; - } - } + Status s = db.db->WaitForCompact(WaitForCompactOptions()); - if (!retry) { - fprintf(stdout, "waitforcompaction(%s): finished\n", - db.db->GetName().c_str()); - return; - } - } + fprintf(stdout, "waitforcompaction(%s): finished with status (%s)\n", + db.db->GetName().c_str(), s.ToString().c_str()); } void WaitForCompaction() { // Give background threads a chance to wake FLAGS_env->SleepForMicroseconds(5 * 1000000); - // I am skeptical that this check race free. I hope that checking twice - // reduces the chance. 
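Editor's note: the hunk above replaces the property-polling loop with the DB::WaitForCompact() API. A minimal sketch of the call; the option fields shown are from my reading of WaitForCompactOptions and the defaults are fine for the benchmark's purpose.

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Sketch: block until outstanding background flushes and compactions finish.
rocksdb::Status WaitForQuiescence(rocksdb::DB* db) {
  rocksdb::WaitForCompactOptions wait_opts;
  wait_opts.flush = false;           // optionally flush memtables before waiting
  wait_opts.abort_on_pause = false;  // keep waiting even if compactions are paused
  return db->WaitForCompact(wait_opts);
}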
if (db_.db != nullptr) { WaitForCompactionHelper(db_); - WaitForCompactionHelper(db_); } else { for (auto& db_with_cfh : multi_dbs_) { WaitForCompactionHelper(db_with_cfh); - WaitForCompactionHelper(db_with_cfh); } } } @@ -8394,7 +8227,6 @@ class Benchmark { while (!CompactLevelHelper(db_with_cfh, from_level)) WaitForCompaction(); } } -#endif void Flush() { FlushOptions flush_opt; @@ -8474,6 +8306,11 @@ class Benchmark { } } + void CacheReportProblems() { + auto debug_logger = std::make_shared(DEBUG_LEVEL); + cache_->ReportProblems(debug_logger); + } + void PrintStats(const char* key) { if (db_.db != nullptr) { PrintStats(db_.db, key, false); @@ -8518,7 +8355,6 @@ class Benchmark { } } -#ifndef ROCKSDB_LITE void Replay(ThreadState* thread) { if (db_.db != nullptr) { @@ -8608,7 +8444,6 @@ class Benchmark { delete backup_engine; } -#endif // ROCKSDB_LITE }; int db_bench_tool(int argc, char** argv) { @@ -8624,7 +8459,6 @@ int db_bench_tool(int argc, char** argv) { ParseCommandLineFlags(&argc, &argv, true); FLAGS_compaction_style_e = (ROCKSDB_NAMESPACE::CompactionStyle)FLAGS_compaction_style; -#ifndef ROCKSDB_LITE if (FLAGS_statistics && !FLAGS_statistics_string.empty()) { fprintf(stderr, "Cannot provide both --statistics and --statistics_string.\n"); @@ -8640,7 +8474,6 @@ int db_bench_tool(int argc, char** argv) { exit(1); } } -#endif // ROCKSDB_LITE if (FLAGS_statistics) { dbstats = ROCKSDB_NAMESPACE::CreateDBStatistics(); } @@ -8670,7 +8503,6 @@ int db_bench_tool(int argc, char** argv) { FLAGS_compressed_secondary_cache_compression_type_e = StringToCompressionType( FLAGS_compressed_secondary_cache_compression_type.c_str()); -#ifndef ROCKSDB_LITE // Stacked BlobDB FLAGS_blob_db_compression_type_e = StringToCompressionType(FLAGS_blob_db_compression_type.c_str()); @@ -8702,7 +8534,6 @@ int db_bench_tool(int argc, char** argv) { // Let -readonly imply -use_existing_db FLAGS_use_existing_db |= FLAGS_readonly; -#endif // ROCKSDB_LITE if (FLAGS_build_info) { std::string build_info; @@ -8715,7 +8546,7 @@ int db_bench_tool(int argc, char** argv) { uint64_t now = FLAGS_env->GetSystemClock()->NowMicros(); seed_base = static_cast(now); fprintf(stdout, "Set seed to %" PRIu64 " because --seed was 0\n", - seed_base); + *seed_base); } else { seed_base = FLAGS_seed; } @@ -8783,13 +8614,11 @@ int db_bench_tool(int argc, char** argv) { ROCKSDB_NAMESPACE::Benchmark benchmark; benchmark.Run(); -#ifndef ROCKSDB_LITE if (FLAGS_print_malloc_stats) { std::string stats_string; ROCKSDB_NAMESPACE::DumpMallocStats(&stats_string); fprintf(stdout, "Malloc stats:\n%s\n", stats_string.c_str()); } -#endif // ROCKSDB_LITE return 0; } diff --git a/tools/db_bench_tool_test.cc b/tools/db_bench_tool_test.cc index a406ff66c592..a30c650654f3 100644 --- a/tools/db_bench_tool_test.cc +++ b/tools/db_bench_tool_test.cc @@ -87,9 +87,13 @@ class DBBenchTest : public testing::Test { void VerifyOptions(const Options& opt) { DBOptions loaded_db_opts; + ConfigOptions config_opts; + config_opts.ignore_unknown_options = false; + config_opts.input_strings_escaped = true; + config_opts.env = Env::Default(); std::vector cf_descs; - ASSERT_OK(LoadLatestOptions(db_path_, Env::Default(), &loaded_db_opts, - &cf_descs)); + ASSERT_OK( + LoadLatestOptions(config_opts, db_path_, &loaded_db_opts, &cf_descs)); ConfigOptions exact; exact.input_strings_escaped = false; @@ -302,9 +306,13 @@ TEST_F(DBBenchTest, OptionsFileFromFile) { ASSERT_OK(writable->Close()); DBOptions db_opt; + ConfigOptions config_opt; + config_opt.ignore_unknown_options = false; + 
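Editor's note: the test now calls the ConfigOptions overload of LoadLatestOptions instead of the Env-based one. A minimal sketch of that overload (paths and the helper name are placeholders):

#include <string>
#include <vector>
#include "rocksdb/convenience.h"
#include "rocksdb/utilities/options_util.h"

// Sketch: load the most recent OPTIONS file with explicit ConfigOptions.
rocksdb::Status LoadOpts(const std::string& db_path,
                         rocksdb::DBOptions* db_opts,
                         std::vector<rocksdb::ColumnFamilyDescriptor>* cf_descs) {
  rocksdb::ConfigOptions config_opts;
  config_opts.ignore_unknown_options = false;  // fail on unrecognized options
  config_opts.input_strings_escaped = true;    // OPTIONS files store escaped strings
  config_opts.env = rocksdb::Env::Default();
  return rocksdb::LoadLatestOptions(config_opts, db_path, db_opts, cf_descs);
}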
config_opt.input_strings_escaped = true; + config_opt.env = Env::Default(); std::vector cf_descs; - ASSERT_OK(LoadOptionsFromFile(kOptionsFileName, Env::Default(), &db_opt, - &cf_descs)); + ASSERT_OK( + LoadOptionsFromFile(config_opt, kOptionsFileName, &db_opt, &cf_descs)); Options opt(db_opt, cf_descs[0].options); opt.create_if_missing = true; diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 83678289463b..01c3ae329e09 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -37,12 +37,13 @@ "backup_one_in": 100000, "batch_protection_bytes_per_key": lambda: random.choice([0, 8]), "memtable_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), + "block_protection_bytes_per_key": lambda: random.choice([0, 1, 2, 4, 8]), "block_size": 16384, "bloom_bits": lambda: random.choice( [random.randint(0, 19), random.lognormvariate(2.3, 1.3)] ), "cache_index_and_filter_blocks": lambda: random.randint(0, 1), - "cache_size": 8388608, + "cache_size": lambda: random.choice([8388608, 33554432]), "charge_compression_dictionary_building_buffer": lambda: random.choice([0, 1]), "charge_filter_construction": lambda: random.choice([0, 1]), "charge_table_reader": lambda: random.choice([0, 1]), @@ -64,6 +65,7 @@ "compression_parallel_threads": 1, "compression_max_dict_buffer_bytes": lambda: (1 << random.randint(0, 40)) - 1, "compression_use_zstd_dict_trainer": lambda: random.randint(0, 1), + "compression_checksum": lambda: random.randint(0, 1), "clear_column_family_one_in": 0, "compact_files_one_in": 1000000, "compact_range_one_in": 1000000, @@ -88,6 +90,7 @@ "index_type": lambda: random.choice([0, 0, 0, 2, 2, 3]), "ingest_external_file_one_in": 1000000, "iterpercent": 10, + "lock_wal_one_in": 1000000, "mark_for_compaction_one_file_in": lambda: 10 * random.randint(0, 1), "max_background_compactions": 20, "max_bytes_for_level_base": 10485760, @@ -121,20 +124,28 @@ "use_direct_reads": lambda: random.randint(0, 1), "use_direct_io_for_flush_and_compaction": lambda: random.randint(0, 1), "mock_direct_io": False, - "cache_type": lambda: random.choice(["lru_cache", "hyper_clock_cache"]), + "cache_type": lambda: random.choice( + ["lru_cache", "fixed_hyper_clock_cache", "auto_hyper_clock_cache", + "auto_hyper_clock_cache", "tiered_lru_cache", + "tiered_fixed_hyper_clock_cache", "tiered_auto_hyper_clock_cache", + "tiered_auto_hyper_clock_cache"] + ), "use_full_merge_v1": lambda: random.randint(0, 1), "use_merge": lambda: random.randint(0, 1), # use_put_entity_one_in has to be the same across invocations for verification to work, hence no lambda "use_put_entity_one_in": random.choice([0] * 7 + [1, 5, 10]), # 999 -> use Bloom API - "ribbon_starting_level": lambda: random.choice([random.randint(-1, 10), 999]), + "bloom_before_level": lambda: random.choice([random.randint(-1, 2), random.randint(-1, 10), 0x7fffffff - 1, 0x7fffffff]), "value_size_mult": 32, + "verification_only": 0, "verify_checksum": 1, "write_buffer_size": 4 * 1024 * 1024, "writepercent": 35, - "format_version": lambda: random.choice([2, 3, 4, 5, 5]), + "format_version": lambda: random.choice([2, 3, 4, 5, 6, 6]), "index_block_restart_interval": lambda: random.choice(range(1, 16)), "use_multiget": lambda: random.randint(0, 1), + "use_get_entity": lambda: random.choice([0] * 7 + [1]), + "use_multi_get_entity": lambda: random.choice([0] * 7 + [1]), "periodic_compaction_seconds": lambda: random.choice([0, 0, 1, 2, 10, 100, 1000]), # 0 = never (used by some), 10 = often (for threading bugs), 600 = default "stats_dump_period_sec": 
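Editor's note: the expanded cache_type list above distinguishes "fixed" and "auto" HyperClockCache variants. As a rough sketch, not taken from the patch, the two correspond to whether HyperClockCacheOptions::estimated_entry_charge is supplied or left at 0 for self-tuning sizing; the capacity and charge values are illustrative.

#include <memory>
#include "rocksdb/cache.h"

// Sketch: build the two HyperClockCache flavors the crash test can now pick.
std::shared_ptr<rocksdb::Cache> MakeHyperClockCache(bool automatic) {
  rocksdb::HyperClockCacheOptions opts(/*_capacity=*/64 << 20,
                                       /*_estimated_entry_charge=*/0);
  if (!automatic) {
    // "fixed" variant: supply an estimate of the average block charge.
    opts.estimated_entry_charge = 8 * 1024;
  }
  // Leaving estimated_entry_charge at 0 selects the self-tuning ("auto") table.
  return opts.MakeSharedCache();
}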
lambda: random.choice([0, 10, 600]), @@ -149,12 +160,12 @@ "sync": lambda: random.choice([1 if t == 0 else 0 for t in range(0, 20)]), "bytes_per_sync": lambda: random.choice([0, 262144]), "wal_bytes_per_sync": lambda: random.choice([0, 524288]), - # Disable compaction_readahead_size because the test is not passing. - # "compaction_readahead_size" : lambda : random.choice( - # [0, 0, 1024 * 1024]), + "compaction_readahead_size": lambda: random.choice( + [0, 0, 1024 * 1024]), "db_write_buffer_size": lambda: random.choice( [0, 0, 0, 1024 * 1024, 8 * 1024 * 1024, 128 * 1024 * 1024] ), + "use_write_buffer_manager": lambda: random.randint(0, 1), "avoid_unnecessary_blocking_io": random.randint(0, 1), "write_dbid_to_manifest": random.randint(0, 1), "avoid_flush_during_recovery": lambda: random.choice( @@ -163,13 +174,15 @@ "max_write_batch_group_size_bytes": lambda: random.choice( [16, 64, 1024 * 1024, 16 * 1024 * 1024] ), - "level_compaction_dynamic_level_bytes": True, + "level_compaction_dynamic_level_bytes": lambda: random.randint(0, 1), "verify_checksum_one_in": 1000000, + "verify_file_checksums_one_in": 1000000, "verify_db_one_in": 100000, "continuous_verification_interval": 0, "max_key_len": 3, "key_len_percent_dist": "1,30,69", "read_fault_one_in": lambda: random.choice([0, 32, 1000]), + "write_fault_one_in": lambda: random.choice([0, 128, 1000]), "open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]), "open_write_fault_one_in": lambda: random.choice([0, 0, 16]), "open_read_fault_one_in": lambda: random.choice([0, 0, 32]), @@ -181,6 +194,7 @@ ), "user_timestamp_size": 0, "secondary_cache_fault_one_in": lambda: random.choice([0, 0, 32]), + "compressed_secondary_cache_size": lambda: random.choice([8388608, 16777216]), "prepopulate_block_cache": lambda: random.choice([0, 1]), "memtable_prefix_bloom_size_ratio": lambda: random.choice([0.001, 0.01, 0.1, 0.5]), "memtable_whole_key_filtering": lambda: random.randint(0, 1), @@ -192,20 +206,31 @@ "secondary_cache_uri": lambda: random.choice( [ "", - "compressed_secondary_cache://capacity=8388608", + "", + "", "compressed_secondary_cache://capacity=8388608;enable_custom_split_merge=true", ] ), "allow_data_in_errors": True, + "enable_thread_tracking": lambda: random.choice([0, 1]), "readahead_size": lambda: random.choice([0, 16384, 524288]), "initial_auto_readahead_size": lambda: random.choice([0, 16384, 524288]), "max_auto_readahead_size": lambda: random.choice([0, 16384, 524288]), "num_file_reads_for_auto_readahead": lambda: random.choice([0, 1, 2]), "min_write_buffer_number_to_merge": lambda: random.choice([1, 2]), "preserve_internal_time_seconds": lambda: random.choice([0, 60, 3600, 36000]), + "memtable_max_range_deletions": lambda: random.choice([0] * 6 + [100, 1000]), + # 0 (disable) is the default and more commonly used value. 
+ "bottommost_file_compaction_delay": lambda: random.choice( + [0, 0, 0, 600, 3600, 86400] + ), + "auto_readahead_size" : lambda: random.choice([0, 1]), + "verify_iterator_with_expected_state_one_in": 5, } _TEST_DIR_ENV_VAR = "TEST_TMPDIR" +# If TEST_TMPDIR_EXPECTED is not specified, default value will be TEST_TMPDIR +_TEST_EXPECTED_DIR_ENV_VAR = "TEST_TMPDIR_EXPECTED" _DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL" stress_cmd = "./db_stress" @@ -228,7 +253,10 @@ def get_dbname(test_name): print("Running DB cleanup command - %s\n" % cleanup_cmd) # Ignore failure os.system(cleanup_cmd) - os.mkdir(dbname) + try: + os.mkdir(dbname) + except OSError: + pass return dbname @@ -240,12 +268,18 @@ def setup_expected_values_dir(): if expected_values_dir is not None: return expected_values_dir expected_dir_prefix = "rocksdb_crashtest_expected_" - test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) - if test_tmpdir is None or test_tmpdir == "": + test_exp_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR) + + # set the value to _TEST_DIR_ENV_VAR if _TEST_EXPECTED_DIR_ENV_VAR is not + # specified. + if test_exp_tmpdir is None or test_exp_tmpdir == "": + test_exp_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) + + if test_exp_tmpdir is None or test_exp_tmpdir == "": expected_values_dir = tempfile.mkdtemp(prefix=expected_dir_prefix) else: # if tmpdir is specified, store the expected_values_dir under that dir - expected_values_dir = test_tmpdir + "/rocksdb_crashtest_expected" + expected_values_dir = test_exp_tmpdir + "/rocksdb_crashtest_expected" if os.path.exists(expected_values_dir): shutil.rmtree(expected_values_dir) os.mkdir(expected_values_dir) @@ -260,16 +294,22 @@ def setup_multiops_txn_key_spaces_file(): if multiops_txn_key_spaces_file is not None: return multiops_txn_key_spaces_file key_spaces_file_prefix = "rocksdb_crashtest_multiops_txn_key_spaces" - test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) - if test_tmpdir is None or test_tmpdir == "": + test_exp_tmpdir = os.environ.get(_TEST_EXPECTED_DIR_ENV_VAR) + + # set the value to _TEST_DIR_ENV_VAR if _TEST_EXPECTED_DIR_ENV_VAR is not + # specified. + if test_exp_tmpdir is None or test_exp_tmpdir == "": + test_exp_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) + + if test_exp_tmpdir is None or test_exp_tmpdir == "": multiops_txn_key_spaces_file = tempfile.mkstemp(prefix=key_spaces_file_prefix)[ 1 ] else: - if not os.path.exists(test_tmpdir): - os.mkdir(test_tmpdir) + if not os.path.exists(test_exp_tmpdir): + os.mkdir(test_exp_tmpdir) multiops_txn_key_spaces_file = tempfile.mkstemp( - prefix=key_spaces_file_prefix, dir=test_tmpdir + prefix=key_spaces_file_prefix, dir=test_exp_tmpdir )[1] return multiops_txn_key_spaces_file @@ -296,10 +336,14 @@ def is_direct_io_supported(dbname): } whitebox_default_params = { - # TODO: enable this once we figure out how to adjust kill odds for WAL- - # disabled runs, and either (1) separate full `db_stress` runs out of - # whitebox crash or (2) support verification at end of `db_stress` runs - # that ran with WAL disabled. + # TODO: enable this at random once we figure out two things. First, we need + # to ensure the kill odds in WAL-disabled runs result in regular crashing + # before the fifteen minute timeout. When WAL is disabled there are very few + # calls to write functions since writes to SST files are buffered and other + # writes (e.g., MANIFEST) are infrequent. Crashing in reasonable time might + # currently assume killpoints in write functions are reached frequently. 
+ # + # Second, we need to make sure disabling WAL works with `-reopen > 0`. "disable_wal": 0, "duration": 10000, "log2_keys_per_lock": 10, @@ -320,9 +364,8 @@ def is_direct_io_supported(dbname): "target_file_size_multiplier": 1, "test_batches_snapshots": 0, "write_buffer_size": 32 * 1024 * 1024, - "level_compaction_dynamic_level_bytes": False, + "level_compaction_dynamic_level_bytes": lambda: random.randint(0, 1), "paranoid_file_checks": lambda: random.choice([0, 1, 1, 1]), - "verify_iterator_with_expected_state_one_in": 5, # this locks a range of keys } blackbox_simple_default_params = { @@ -345,10 +388,14 @@ def is_direct_io_supported(dbname): "enable_compaction_filter": 0, # `CfConsistencyStressTest::TestIngestExternalFile()` is not implemented. "ingest_external_file_one_in": 0, + # `CfConsistencyStressTest::TestIterateAgainstExpected()` is not implemented. + "verify_iterator_with_expected_state_one_in": 0, } +# For pessimistic transaction db txn_params = { "use_txn": 1, + "use_optimistic_txn": 0, # Avoid lambda to set it once for the entire test "txn_write_policy": random.randint(0, 2), "unordered_write": random.randint(0, 1), @@ -361,7 +408,18 @@ def is_direct_io_supported(dbname): "enable_pipelined_write": 0, "create_timestamped_snapshot_one_in": random.choice([0, 20]), # PutEntity in transactions is not yet implemented - "use_put_entity_one_in" : 0, + "use_put_entity_one_in": 0, +} + +# For optimistic transaction db +optimistic_txn_params = { + "use_txn": 1, + "use_optimistic_txn": 1, + "occ_validation_policy": random.randint(0, 1), + "share_occ_lock_buckets": random.randint(0, 1), + "occ_lock_bucket_count": lambda: random.choice([10, 100, 500]), + # PutEntity in transactions is not yet implemented + "use_put_entity_one_in": 0, } best_efforts_recovery_params = { @@ -369,6 +427,8 @@ def is_direct_io_supported(dbname): "atomic_flush": 0, "disable_wal": 1, "column_families": 1, + "skip_verifydb": 1, + "verify_db_one_in": 0 } blob_params = { @@ -401,11 +461,9 @@ def is_direct_io_supported(dbname): "use_merge": 0, "use_full_merge_v1": 0, "use_txn": 0, - "enable_blob_files": 0, - "use_blob_db": 0, "ingest_external_file_one_in": 0, # PutEntity with timestamps is not yet implemented - "use_put_entity_one_in" : 0, + "use_put_entity_one_in": 0, } tiered_params = { @@ -460,8 +518,15 @@ def is_direct_io_supported(dbname): "enable_compaction_filter": 0, "create_timestamped_snapshot_one_in": 50, "sync_fault_injection": 0, + # This test has aggressive flush frequency and small write buffer size. + # Disabling write fault to avoid writes being stopped. + "write_fault_one_in": 0, # PutEntity in transactions is not yet implemented - "use_put_entity_one_in" : 0, + "use_put_entity_one_in": 0, + "use_get_entity": 0, + "use_multi_get_entity": 0, + # `MultiOpsTxnsStressTest::TestIterateAgainstExpected()` is not implemented. + "verify_iterator_with_expected_state_one_in": 0, } multiops_wc_txn_params = { @@ -527,12 +592,16 @@ def finalize_and_sanitize(src_params): # Multi-key operations are not currently compatible with transactions or # timestamp. 
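Editor's note: the new optimistic_txn_params above exercise OCC validation policy and lock-bucket settings. A rough sketch of the corresponding C++ setup, assuming the OptimisticTransactionDB::Open overload that accepts OptimisticTransactionDBOptions; the field names and values are from my understanding of that API, not from the patch.

#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"

// Sketch: open an OptimisticTransactionDB with parallel OCC validation,
// roughly what use_optimistic_txn=1 with occ_validation_policy=1 exercises.
rocksdb::Status OpenOptimisticTxnDb(
    const std::string& path, std::vector<rocksdb::ColumnFamilyHandle*>* handles,
    rocksdb::OptimisticTransactionDB** txn_db) {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::OptimisticTransactionDBOptions occ_opts;
  occ_opts.validate_policy =
      rocksdb::OccValidationPolicy::kValidateParallel;  // vs. kValidateSerial
  occ_opts.occ_lock_buckets = 500;  // analogous to occ_lock_bucket_count above
  std::vector<rocksdb::ColumnFamilyDescriptor> cfs;
  cfs.emplace_back(rocksdb::kDefaultColumnFamilyName,
                   rocksdb::ColumnFamilyOptions(options));
  return rocksdb::OptimisticTransactionDB::Open(
      rocksdb::DBOptions(options), occ_opts, path, cfs, handles, txn_db);
}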
- if (dest_params.get("test_batches_snapshots") == 1 or - dest_params.get("use_txn") == 1 or - dest_params.get("user_timestamp_size") > 0): + if ( + dest_params.get("test_batches_snapshots") == 1 + or dest_params.get("use_txn") == 1 + or dest_params.get("user_timestamp_size") > 0 + ): dest_params["ingest_external_file_one_in"] = 0 - if (dest_params.get("test_batches_snapshots") == 1 or - dest_params.get("use_txn") == 1): + if ( + dest_params.get("test_batches_snapshots") == 1 + or dest_params.get("use_txn") == 1 + ): dest_params["delpercent"] += dest_params["delrangepercent"] dest_params["delrangepercent"] = 0 if ( @@ -603,9 +672,8 @@ def finalize_and_sanitize(src_params): dest_params["enable_compaction_filter"] = 0 dest_params["sync"] = 0 dest_params["write_fault_one_in"] = 0 - if dest_params["secondary_cache_uri"] != "": - # Currently the only cache type compatible with a secondary cache is LRUCache - dest_params["cache_type"] = "lru_cache" + dest_params["skip_verifydb"] = 1 + dest_params["verify_db_one_in"] = 0 # Remove the following once write-prepared/write-unprepared with/without # unordered write supports timestamped snapshots if dest_params.get("create_timestamped_snapshot_one_in", 0) > 0: @@ -613,15 +681,37 @@ def finalize_and_sanitize(src_params): dest_params["unordered_write"] = 0 # For TransactionDB, correctness testing with unsync data loss is currently # compatible with only write committed policy - if (dest_params.get("use_txn") == 1 and dest_params.get("txn_write_policy") != 0): + if dest_params.get("use_txn") == 1 and dest_params.get("txn_write_policy") != 0: dest_params["sync_fault_injection"] = 0 dest_params["manual_wal_flush_one_in"] = 0 - # PutEntity is currently not supported by SstFileWriter or in conjunction with Merge + # Wide column stress tests require FullMergeV3 if dest_params["use_put_entity_one_in"] != 0: - dest_params["ingest_external_file_one_in"] = 0 - dest_params["use_merge"] = 0 dest_params["use_full_merge_v1"] = 0 - + if dest_params["file_checksum_impl"] == "none": + dest_params["verify_file_checksums_one_in"] = 0 + if dest_params["write_fault_one_in"] > 0: + # background work may be disabled while DB is resuming after some error + dest_params["max_write_buffer_number"] = max(dest_params["max_write_buffer_number"], 10) + if dest_params["secondary_cache_uri"].find("compressed_secondary_cache") >= 0: + dest_params["compressed_secondary_cache_size"] = 0 + dest_params["compressed_secondary_cache_ratio"] = 0.0 + if dest_params["cache_type"].find("tiered_") >= 0: + if dest_params["compressed_secondary_cache_size"] > 0: + dest_params["compressed_secondary_cache_ratio"] = \ + float(dest_params["compressed_secondary_cache_size"]/ \ + (dest_params["cache_size"] + dest_params["compressed_secondary_cache_size"])) + dest_params["compressed_secondary_cache_size"] = 0 + else: + dest_params["compressed_secondary_cache_ratio"] = 0.0 + dest_params["cache_type"] = dest_params["cache_type"].replace("tiered_", "") + else: + if dest_params["secondary_cache_uri"]: + dest_params["compressed_secondary_cache_size"] = 0 + dest_params["compressed_secondary_cache_ratio"] = 0.0 + if dest_params["use_write_buffer_manager"]: + if (dest_params["cache_size"] <= 0 + or dest_params["db_write_buffer_size"] <= 0): + dest_params["use_write_buffer_manager"] = 0 return dest_params @@ -643,6 +733,8 @@ def gen_cmd_params(args): params.update(cf_consistency_params) if args.txn: params.update(txn_params) + if args.optimistic_txn: + params.update(optimistic_txn_params) if 
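Editor's note: for readability, the ratio the sanitizer derives for tiered_ cache types above is just the compressed secondary capacity's share of the combined budget. A trivial restatement of that arithmetic; the function name is only for illustration.

#include <cstdint>

// Sketch of the ratio the crash test computes before zeroing the separate
// compressed_secondary_cache_size: secondary capacity / total capacity.
double CompressedSecondaryRatio(uint64_t cache_size_bytes,
                                uint64_t compressed_secondary_bytes) {
  return static_cast<double>(compressed_secondary_bytes) /
         static_cast<double>(cache_size_bytes + compressed_secondary_bytes);
}
// e.g. an 8 MiB secondary with a 32 MiB primary cache gives 8 / 40 = 0.2.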
args.test_best_efforts_recovery: params.update(best_efforts_recovery_params) if args.enable_ts: @@ -656,12 +748,11 @@ def gen_cmd_params(args): if args.test_tiered_storage: params.update(tiered_params) - # Best-effort recovery, user defined timestamp, tiered storage are currently - # incompatible with BlobDB. Test BE recovery if specified on the command - # line; otherwise, apply BlobDB related overrides with a 10% chance. + # Best-effort recovery, tiered storage are currently incompatible with BlobDB. + # Test BE recovery if specified on the command line; otherwise, apply BlobDB + # related overrides with a 10% chance. if ( not args.test_best_efforts_recovery - and not args.enable_ts and not args.test_tiered_storage and random.choice([0] * 9 + [1]) == 1 ): @@ -689,6 +780,7 @@ def gen_cmd(params, unknown_params): "random_kill_odd", "cf_consistency", "txn", + "optimistic_txn", "test_best_efforts_recovery", "enable_ts", "test_multiops_txn", @@ -696,6 +788,7 @@ def gen_cmd(params, unknown_params): "stress_cmd", "test_tiered_storage", "cleanup_cmd", + "skip_tmpdir_check", } and v is not None ] @@ -704,7 +797,7 @@ def gen_cmd(params, unknown_params): return cmd -def execute_cmd(cmd, timeout): +def execute_cmd(cmd, timeout=None): child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE) print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd))) @@ -758,10 +851,51 @@ def blackbox_crash_main(args, unknown_args): print("stderr has error message:") print("***" + line + "***") + stderrdata = errs.lower() + errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times") + print("#times error occurred in output is " + str(errorcount) + "\n") + + if errorcount > 0: + print("TEST FAILED. Output has 'error'!!!\n") + sys.exit(2) + if stderrdata.find("fail") >= 0: + print("TEST FAILED. Output has 'fail'!!!\n") + sys.exit(2) + time.sleep(1) # time to stabilize before the next run time.sleep(1) # time to stabilize before the next run + # We should run the test one more time with VerifyOnly setup and no-timeout + # Only do this if the tests are not failed for total-duration + print("Running final time for verification") + cmd_params.update({"verification_only": 1}) + cmd_params.update({"skip_verifydb": 0}) + + cmd = gen_cmd( + dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args + ) + hit_timeout, retcode, outs, errs = execute_cmd(cmd) + + # Print stats of the final run + print("stdout:", outs) + + for line in errs.split("\n"): + if line != "" and not line.startswith("WARNING"): + print("stderr has error message:") + print("***" + line + "***") + + stderrdata = errs.lower() + errorcount = stderrdata.count("error") - stderrdata.count("got errors 0 times") + print("#times error occurred in output is " + str(errorcount) + "\n") + + if errorcount > 0: + print("TEST FAILED. Output has 'error'!!!\n") + sys.exit(2) + if stderrdata.find("fail") >= 0: + print("TEST FAILED. 
Output has 'fail'!!!\n") + sys.exit(2) + # we need to clean up after ourselves -- only do this on test success shutil.rmtree(dbname, True) @@ -861,9 +995,17 @@ def whitebox_crash_main(args, unknown_args): "ops_per_thread": cmd_params["ops_per_thread"], } - cur_compaction_style = additional_opts.get("compaction_style", cmd_params.get("compaction_style", 0)) - if prev_compaction_style != -1 and prev_compaction_style != cur_compaction_style: - print("`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style." + "\n") + cur_compaction_style = additional_opts.get( + "compaction_style", cmd_params.get("compaction_style", 0) + ) + if ( + prev_compaction_style != -1 + and prev_compaction_style != cur_compaction_style + ): + print( + "`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style." + + "\n" + ) additional_opts["destroy_db_initially"] = 1 prev_compaction_style = cur_compaction_style @@ -937,8 +1079,11 @@ def whitebox_crash_main(args, unknown_args): if ret != 0: print("TEST FAILED. DB cleanup returned error %d\n" % ret) sys.exit(1) - os.mkdir(dbname) - if (expected_values_dir is not None): + try: + os.mkdir(dbname) + except OSError: + pass + if expected_values_dir is not None: shutil.rmtree(expected_values_dir, True) os.mkdir(expected_values_dir) @@ -959,6 +1104,7 @@ def main(): parser.add_argument("--simple", action="store_true") parser.add_argument("--cf_consistency", action="store_true") parser.add_argument("--txn", action="store_true") + parser.add_argument("--optimistic_txn", action="store_true") parser.add_argument("--test_best_efforts_recovery", action="store_true") parser.add_argument("--enable_ts", action="store_true") parser.add_argument("--test_multiops_txn", action="store_true") @@ -966,6 +1112,7 @@ def main(): parser.add_argument("--stress_cmd") parser.add_argument("--test_tiered_storage", action="store_true") parser.add_argument("--cleanup_cmd") + parser.add_argument("--skip_tmpdir_check", action="store_true") all_params = dict( list(default_params.items()) @@ -983,6 +1130,7 @@ def main(): + list(cf_consistency_params.items()) + list(tiered_params.items()) + list(txn_params.items()) + + list(optimistic_txn_params.items()) ) for k, v in all_params.items(): @@ -991,12 +1139,18 @@ def main(): args, unknown_args = parser.parse_known_args() test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR) - if test_tmpdir is not None and not os.path.isdir(test_tmpdir): - print( - "%s env var is set to a non-existent directory: %s" - % (_TEST_DIR_ENV_VAR, test_tmpdir) - ) - sys.exit(1) + if test_tmpdir is not None and not args.skip_tmpdir_check: + isdir = False + try: + isdir = os.path.isdir(test_tmpdir) + if not isdir: + print( + "ERROR: %s env var is set to a non-existent directory: %s. Update it to correct directory path." + % (_TEST_DIR_ENV_VAR, test_tmpdir) + ) + sys.exit(1) + except OSError: + pass if args.stress_cmd: stress_cmd = args.stress_cmd diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index ba680f4f20c0..c424743d738d 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -131,10 +130,3 @@ int main(int argc, const char** argv) { #endif // GFLAGS -#else // ROCKSDB_LITE -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 8cc67f5d5a6c..f40be5ae2f2a 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -198,7 +198,6 @@ class SanityTestZSTDCompression : public SanityTest { Options options_; }; -#ifndef ROCKSDB_LITE class SanityTestPlainTableFactory : public SanityTest { public: explicit SanityTestPlainTableFactory(const std::string& path) @@ -214,7 +213,6 @@ class SanityTestPlainTableFactory : public SanityTest { private: Options options_; }; -#endif // ROCKSDB_LITE class SanityTestBloomFilter : public SanityTest { public: @@ -244,9 +242,7 @@ bool RunSanityTests(const std::string& command, const std::string& path) { new SanityTestLZ4Compression(path), new SanityTestLZ4HCCompression(path), new SanityTestZSTDCompression(path), -#ifndef ROCKSDB_LITE new SanityTestPlainTableFactory(path), -#endif // ROCKSDB_LITE new SanityTestBloomFilter(path)}; if (command == "create") { diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc index 427a54d99eb7..92edd5512be6 100644 --- a/tools/dump/db_dump_tool.cc +++ b/tools/dump/db_dump_tool.cc @@ -3,7 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE + +#include "rocksdb/db_dump_tool.h" #include "rocksdb/db_dump_tool.h" @@ -257,4 +258,3 @@ bool DbUndumpTool::Run(const UndumpOptions& undump_options, return true; } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/dump/rocksdb_dump.cc b/tools/dump/rocksdb_dump.cc index 358457e92325..c2042bf7c0f1 100644 --- a/tools/dump/rocksdb_dump.cc +++ b/tools/dump/rocksdb_dump.cc @@ -3,15 +3,12 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#if !(defined GFLAGS) || defined(ROCKSDB_LITE) +#if !(defined GFLAGS) #include int main() { #ifndef GFLAGS fprintf(stderr, "Please install gflags to run rocksdb tools\n"); -#endif -#ifdef ROCKSDB_LITE - fprintf(stderr, "DbDumpTool is not supported in ROCKSDB_LITE\n"); #endif return 1; } @@ -60,4 +57,4 @@ int main(int argc, char** argv) { } return 0; } -#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE) +#endif // !(defined GFLAGS) diff --git a/tools/dump/rocksdb_undump.cc b/tools/dump/rocksdb_undump.cc index 2ff128548d2c..e437b3fe8a43 100644 --- a/tools/dump/rocksdb_undump.cc +++ b/tools/dump/rocksdb_undump.cc @@ -3,15 +3,12 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#if !(defined GFLAGS) || defined(ROCKSDB_LITE) +#if !(defined GFLAGS) #include int main() { #ifndef GFLAGS fprintf(stderr, "Please install gflags to run rocksdb tools\n"); -#endif -#ifdef ROCKSDB_LITE - fprintf(stderr, "DbUndumpTool is not supported in ROCKSDB_LITE\n"); #endif return 1; } @@ -59,4 +56,4 @@ int main(int argc, char **argv) { } return 0; } -#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE) +#endif // !(defined GFLAGS) diff --git a/tools/io_tracer_parser.cc b/tools/io_tracer_parser.cc index 41ef45d978c5..287d60c850ad 100644 --- a/tools/io_tracer_parser.cc +++ b/tools/io_tracer_parser.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -16,10 +15,3 @@ int main(int argc, char** argv) { return ROCKSDB_NAMESPACE::io_tracer_parser(argc, argv); } #endif // GFLAGS -#else // ROCKSDB_LITE -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/tools/io_tracer_parser_test.cc b/tools/io_tracer_parser_test.cc index 41be5fa96cd2..8e1fb72df394 100644 --- a/tools/io_tracer_parser_test.cc +++ b/tools/io_tracer_parser_test.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -181,10 +180,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } #endif // GFLAGS -#else -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "io_tracer_parser_test is not supported in ROCKSDB_LITE\n"); - return 0; -} -#endif // ROCKSDB_LITE diff --git a/tools/io_tracer_parser_tool.cc b/tools/io_tracer_parser_tool.cc index 01b920f3b842..cdaeab044d16 100644 --- a/tools/io_tracer_parser_tool.cc +++ b/tools/io_tracer_parser_tool.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifdef GFLAGS #include "tools/io_tracer_parser_tool.h" @@ -141,4 +140,3 @@ int io_tracer_parser(int argc, char** argv) { } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS -#endif // ROCKSDB_LITE diff --git a/tools/io_tracer_parser_tool.h b/tools/io_tracer_parser_tool.h index 6c22c8f89c45..c79d4c510cfc 100644 --- a/tools/io_tracer_parser_tool.h +++ b/tools/io_tracer_parser_tool.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #pragma once #include @@ -37,4 +36,3 @@ class IOTraceRecordParser { int io_tracer_parser(int argc, char** argv); } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/ldb.cc b/tools/ldb.cc index 482383be85f9..52533e6b0f6e 100644 --- a/tools/ldb.cc +++ b/tools/ldb.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#ifndef ROCKSDB_LITE #include "rocksdb/ldb_tool.h" @@ -12,10 +11,3 @@ int main(int argc, char** argv) { tool.Run(argc, argv); return 0; } -#else -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 323d5f8b40ad..578d3af2b2c2 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -4,10 +4,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" -#include +#include #include #include #include @@ -23,6 +22,8 @@ #include "db/dbformat.h" #include "db/log_reader.h" #include "db/version_util.h" +#include "db/wide/wide_column_serialization.h" +#include "db/wide/wide_columns_helper.h" #include "db/write_batch_internal.h" #include "file/filename.h" #include "rocksdb/cache.h" @@ -208,9 +209,15 @@ LDBCommand* LDBCommand::SelectCommand(const ParsedParams& parsed_params) { if (parsed_params.cmd == GetCommand::Name()) { return new GetCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == GetEntityCommand::Name()) { + return new GetEntityCommand(parsed_params.cmd_params, + parsed_params.option_map, parsed_params.flags); } else if (parsed_params.cmd == PutCommand::Name()) { return new PutCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); + } else if (parsed_params.cmd == PutEntityCommand::Name()) { + return new PutEntityCommand(parsed_params.cmd_params, + parsed_params.option_map, parsed_params.flags); } else if (parsed_params.cmd == BatchPutCommand::Name()) { return new BatchPutCommand(parsed_params.cmd_params, parsed_params.option_map, parsed_params.flags); @@ -930,7 +937,15 @@ void LDBCommand::PrepareOptions() { &column_families_); if (!s.ok() && !s.IsNotFound()) { // Option file exists but load option file error. - std::string msg = s.ToString(); + std::string current_version = std::to_string(ROCKSDB_MAJOR) + "." + + std::to_string(ROCKSDB_MINOR) + "." + + std::to_string(ROCKSDB_PATCH); + std::string msg = + s.ToString() + "\nThis tool was built with version " + + current_version + + ". If your db is in a different version, please try again " + "with option --" + + LDBCommand::ARG_IGNORE_UNKNOWN_OPTIONS + "."; exec_state_ = LDBCommandExecuteResult::Failed(msg); db_ = nullptr; return; @@ -1086,6 +1101,28 @@ std::string LDBCommand::PrintKeyValue(const std::string& key, return PrintKeyValue(key, value, is_hex, is_hex); } +std::string LDBCommand::PrintKeyValueOrWideColumns( + const Slice& key, const Slice& value, const WideColumns& wide_columns, + bool is_key_hex, bool is_value_hex) { + if (wide_columns.empty() || + WideColumnsHelper::HasDefaultColumnOnly(wide_columns)) { + return PrintKeyValue(key.ToString(), value.ToString(), is_key_hex, + is_value_hex); + } + /* + // Sample plaintext output (first column is kDefaultWideColumnName) + key_1 ==> :foo attr_name1:bar attr_name2:baz + + // Sample hex output (first column is kDefaultWideColumnName) + 0x6669727374 ==> :0x68656C6C6F 0x617474725F6E616D6531:0x666F6F + */ + std::ostringstream oss; + WideColumnsHelper::DumpWideColumns(wide_columns, oss, is_value_hex); + return PrintKeyValue(key.ToString(), oss.str().c_str(), is_key_hex, + false); // is_value_hex_ is already honored in oss. + // avoid double-hexing it. 
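Editor's note: a small sketch, not part of the patch, of the plaintext rendering the new wide-column helper produces, iterating a WideColumns container and emitting name:value pairs; the default column has an empty name, so its value prints as ":value".

#include <iostream>
#include "rocksdb/wide_columns.h"

// Sketch: print wide columns the way ldb dump/scan now renders them,
// e.g.  key_1 ==> :foo attr_name1:bar
void PrintColumns(const rocksdb::Slice& key,
                  const rocksdb::WideColumns& columns) {
  std::cout << key.ToString() << " ==> ";
  for (const rocksdb::WideColumn& col : columns) {
    // kDefaultWideColumnName is empty, so the anonymous value shows as ":value".
    std::cout << col.name().ToString() << ":" << col.value().ToString() << " ";
  }
  std::cout << "\n";
}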
+} + std::string LDBCommand::HelpRangeCmdArgs() { std::ostringstream str_stream; str_stream << " "; @@ -1316,7 +1353,8 @@ void DBLoaderCommand::DoCommand() { namespace { void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, - bool json) { + bool json, + const std::vector& cf_descs) { EnvOptions sopt; std::string dbname("dummy"); std::shared_ptr tc(NewLRUCache(options.max_open_files - 10, @@ -1331,8 +1369,10 @@ void DumpManifestFile(Options options, std::string file, bool verbose, bool hex, ImmutableDBOptions immutable_db_options(options); VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ ""); - Status s = versions.DumpManifest(options, file, verbose, hex, json); + /*db_id=*/"", /*db_session_id=*/"", + options.daily_offpeak_time_utc, + /*error_handler=*/nullptr); + Status s = versions.DumpManifest(options, file, verbose, hex, json, cf_descs); if (!s.ok()) { fprintf(stderr, "Error in processing file %s %s\n", file.c_str(), s.ToString().c_str()); @@ -1377,6 +1417,7 @@ ManifestDumpCommand::ManifestDumpCommand( } void ManifestDumpCommand::DoCommand() { + PrepareOptions(); std::string manifestfile; if (!path_.empty()) { @@ -1444,7 +1485,8 @@ void ManifestDumpCommand::DoCommand() { fprintf(stdout, "Processing Manifest file %s\n", manifestfile.c_str()); } - DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_); + DumpManifestFile(options_, manifestfile, verbose_, is_key_hex_, json_, + column_families_); if (verbose_) { fprintf(stdout, "Processing Manifest file %s done\n", manifestfile.c_str()); @@ -1472,7 +1514,9 @@ Status GetLiveFilesChecksumInfoFromVersionSet(Options options, ImmutableDBOptions immutable_db_options(options); VersionSet versions(dbname, &immutable_db_options, sopt, tc.get(), &wb, &wc, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ ""); + /*db_id=*/"", /*db_session_id=*/"", + options.daily_offpeak_time_utc, + /*error_handler=*/nullptr); std::vector cf_name_list; s = versions.ListColumnFamilies(&cf_name_list, db_path, immutable_db_options.fs.get()); @@ -1518,6 +1562,7 @@ FileChecksumDumpCommand::FileChecksumDumpCommand( } void FileChecksumDumpCommand::DoCommand() { + PrepareOptions(); // print out the checksum information in the following format: // sst file number, checksum function name, checksum value // sst file number, checksum function name, checksum value @@ -1622,6 +1667,7 @@ ListColumnFamiliesCommand::ListColumnFamiliesCommand( : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {} void ListColumnFamiliesCommand::DoCommand() { + PrepareOptions(); std::vector column_families; Status s = DB::ListColumnFamilies(options_, db_path_, &column_families); if (!s.ok()) { @@ -1885,8 +1931,20 @@ void InternalDumpCommand::DoCommand() { std::string key = ikey.DebugString(is_key_hex_); Slice value(key_version.value); if (!decode_blob_index_ || value_type != kTypeBlobIndex) { - fprintf(stdout, "%s => %s\n", key.c_str(), - value.ToString(is_value_hex_).c_str()); + if (value_type == kTypeWideColumnEntity) { + std::ostringstream oss; + const Status s = WideColumnsHelper::DumpSliceAsWideColumns( + value, oss, is_value_hex_); + if (!s.ok()) { + fprintf(stderr, "%s => error deserializing wide columns\n", + key.c_str()); + } else { + fprintf(stdout, "%s => %s\n", key.c_str(), oss.str().c_str()); + } + } else { + fprintf(stdout, "%s => %s\n", key.c_str(), + 
value.ToString(is_value_hex_).c_str()); + } } else { BlobIndex blob_index; @@ -2052,7 +2110,7 @@ void DBDumperCommand::DoCommand() { break; case kDescriptorFile: DumpManifestFile(options_, path_, /* verbose_ */ false, is_key_hex_, - /* json_ */ false); + /* json_ */ false, column_families_); break; case kBlobFile: DumpBlobFile(path_, is_key_hex_, is_value_hex_, @@ -2186,9 +2244,14 @@ void DBDumperCommand::DoDumpCommand() { if (is_db_ttl_ && timestamp_) { fprintf(stdout, "%s ", TimeToHumanString(rawtime).c_str()); } + // (TODO) TTL Iterator does not support wide columns yet. std::string str = - PrintKeyValue(iter->key().ToString(), iter->value().ToString(), - is_key_hex_, is_value_hex_); + is_db_ttl_ + ? PrintKeyValue(iter->key().ToString(), iter->value().ToString(), + is_key_hex_, is_value_hex_) + : PrintKeyValueOrWideColumns(iter->key(), iter->value(), + iter->columns(), is_key_hex_, + is_value_hex_); fprintf(stdout, "%s\n", str.c_str()); } } @@ -2274,7 +2337,9 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int* levels) { WriteBufferManager wb(opt.db_write_buffer_size); VersionSet versions(db_path_, &db_options, soptions, tc.get(), &wb, &wc, /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr, - /*db_id*/ "", /*db_session_id*/ ""); + /*db_id=*/"", /*db_session_id=*/"", + opt.daily_offpeak_time_utc, + /*error_handler=*/nullptr); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); @@ -2530,6 +2595,16 @@ class InMemoryHandler : public WriteBatch::Handler { return Status::OK(); } + Status PutEntityCF(uint32_t cf, const Slice& key, + const Slice& value) override { + row_ << "PUT_ENTITY(" << cf << ") : "; + std::string k = LDBCommand::StringToHex(key.ToString()); + if (print_values_) { + return WideColumnsHelper::DumpSliceAsWideColumns(value, row_, true); + } + return Status::OK(); + } + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { row_ << "MERGE(" << cf << ") : "; commonPutMerge(key, value); @@ -2735,6 +2810,7 @@ void WALDumperCommand::Help(std::string& ret) { } void WALDumperCommand::DoCommand() { + PrepareOptions(); DumpWalFile(options_, wal_file_, print_header_, print_values_, is_write_committed_, &exec_state_); } @@ -2786,6 +2862,55 @@ void GetCommand::DoCommand() { // ---------------------------------------------------------------------------- +GetEntityCommand::GetEntityCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) + : LDBCommand( + options, flags, true, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX})) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + " must be specified for the get_entity command"); + } else { + key_ = params.at(0); + } + + if (is_key_hex_) { + key_ = HexToString(key_); + } +} + +void GetEntityCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(GetEntityCommand::Name()); + ret.append(" "); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void GetEntityCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + PinnableWideColumns pinnable_wide_columns; + Status st = db_->GetEntity(ReadOptions(), GetCfHandle(), key_, + &pinnable_wide_columns); + if (st.ok()) { + std::ostringstream oss; + WideColumnsHelper::DumpWideColumns(pinnable_wide_columns.columns(), oss, + is_value_hex_); + fprintf(stdout, "%s\n", oss.str().c_str()); + } else { + std::stringstream oss; + oss << "GetEntity failed: " << 
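Editor's note: the new get_entity command is a thin wrapper over DB::GetEntity. A minimal sketch of the underlying API call; the helper name and output formatting are placeholders.

#include <sstream>
#include <string>
#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

// Sketch: read an entity and render its columns, as the ldb command does.
rocksdb::Status ReadEntity(rocksdb::DB* db, const rocksdb::Slice& key,
                           std::string* out) {
  rocksdb::PinnableWideColumns columns;
  rocksdb::Status s = db->GetEntity(rocksdb::ReadOptions(),
                                    db->DefaultColumnFamily(), key, &columns);
  if (!s.ok()) {
    return s;
  }
  std::ostringstream oss;
  for (const rocksdb::WideColumn& col : columns.columns()) {
    oss << col.name().ToString() << ":" << col.value().ToString() << " ";
  }
  *out = oss.str();
  return s;
}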
st.ToString(); + exec_state_ = LDBCommandExecuteResult::Failed(oss.str()); + } +} + +// ---------------------------------------------------------------------------- + ApproxSizeCommand::ApproxSizeCommand( const std::vector& /*params*/, const std::map& options, @@ -3028,30 +3153,22 @@ void ScanCommand::DoCommand() { } } - Slice key_slice = it->key(); - - std::string formatted_key; - if (is_key_hex_) { - formatted_key = "0x" + key_slice.ToString(true /* hex */); - key_slice = formatted_key; - } else if (ldb_options_.key_formatter) { - formatted_key = ldb_options_.key_formatter->Format(key_slice); - key_slice = formatted_key; - } - if (no_value_) { - fprintf(stdout, "%.*s\n", static_cast(key_slice.size()), - key_slice.data()); - } else { - Slice val_slice = it->value(); - std::string formatted_value; - if (is_value_hex_) { - formatted_value = "0x" + val_slice.ToString(true /* hex */); - val_slice = formatted_value; + std::string key_str = it->key().ToString(); + if (is_key_hex_) { + key_str = StringToHex(key_str); + } else if (ldb_options_.key_formatter) { + key_str = ldb_options_.key_formatter->Format(key_str); } - fprintf(stdout, "%.*s : %.*s\n", static_cast(key_slice.size()), - key_slice.data(), static_cast(val_slice.size()), - val_slice.data()); + fprintf(stdout, "%s\n", key_str.c_str()); + } else { + std::string str = is_db_ttl_ ? PrintKeyValue(it->key().ToString(), + it->value().ToString(), + is_key_hex_, is_value_hex_) + : PrintKeyValueOrWideColumns( + it->key(), it->value(), it->columns(), + is_key_hex_, is_value_hex_); + fprintf(stdout, "%s\n", str.c_str()); } num_keys_scanned++; @@ -3230,6 +3347,81 @@ void PutCommand::OverrideBaseOptions() { // ---------------------------------------------------------------------------- +PutEntityCommand::PutEntityCommand( + const std::vector& params, + const std::map& options, + const std::vector& flags) + : LDBCommand(options, flags, false, + BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, + ARG_VALUE_HEX, ARG_CREATE_IF_MISSING})) { + if (params.size() < 2) { + exec_state_ = LDBCommandExecuteResult::Failed( + " and at least one column : must be " + "specified for the put_entity command"); + } else { + auto iter = params.begin(); + key_ = *iter; + if (is_key_hex_) { + key_ = HexToString(key_); + } + for (++iter; iter != params.end(); ++iter) { + auto split = StringSplit(*iter, ':'); + if (split.size() != 2) { + exec_state_ = LDBCommandExecuteResult::Failed( + "wide column format needs to be : (did " + "you mean put ?)"); + return; + } + std::string name(split[0]); + std::string value(split[1]); + if (is_value_hex_) { + name = HexToString(name); + value = HexToString(value); + } + column_names_.push_back(name); + column_values_.push_back(value); + } + } + create_if_missing_ = IsFlagPresent(flags_, ARG_CREATE_IF_MISSING); +} + +void PutEntityCommand::Help(std::string& ret) { + ret.append(" "); + ret.append(PutCommand::Name()); + ret.append( + " : : " + "<...>"); + ret.append(" [--" + ARG_CREATE_IF_MISSING + "]"); + ret.append(" [--" + ARG_TTL + "]"); + ret.append("\n"); +} + +void PutEntityCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + assert(column_names_.size() == column_values_.size()); + WideColumns columns; + for (size_t i = 0; i < column_names_.size(); i++) { + WideColumn column(column_names_[i], column_values_[i]); + columns.emplace_back(column); + } + Status st = db_->PutEntity(WriteOptions(), GetCfHandle(), key_, columns); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = 
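Editor's note: likewise, put_entity maps its name:value arguments onto DB::PutEntity. A minimal sketch of that call; the column names and values are illustrative only.

#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

// Sketch: write a wide-column entity, as the new ldb put_entity command does.
rocksdb::Status WriteEntity(rocksdb::DB* db, const rocksdb::Slice& key) {
  rocksdb::WideColumns columns;
  columns.emplace_back(rocksdb::kDefaultWideColumnName, "anonymous-value");
  columns.emplace_back("attr_name1", "bar");
  columns.emplace_back("attr_name2", "baz");
  return db->PutEntity(rocksdb::WriteOptions(), db->DefaultColumnFamily(), key,
                       columns);
}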
LDBCommandExecuteResult::Failed(st.ToString()); + } +} + +void PutEntityCommand::OverrideBaseOptions() { + LDBCommand::OverrideBaseOptions(); + options_.create_if_missing = create_if_missing_; +} + +// ---------------------------------------------------------------------------- + const char* DBQuerierCommand::HELP_CMD = "help"; const char* DBQuerierCommand::GET_CMD = "get"; const char* DBQuerierCommand::PUT_CMD = "put"; @@ -3712,7 +3904,8 @@ void DBFileDumperCommand::DoCommand() { manifest_filepath = NormalizePath(manifest_filepath); std::cout << manifest_filepath << std::endl; - DumpManifestFile(options_, manifest_filepath, false, false, false); + DumpManifestFile(options_, manifest_filepath, false, false, false, + column_families_); std::cout << std::endl; std::vector column_families; @@ -4174,6 +4367,8 @@ UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand( } void UnsafeRemoveSstFileCommand::DoCommand() { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; PrepareOptions(); OfflineManifestWriter w(options_, db_path_); @@ -4198,7 +4393,7 @@ void UnsafeRemoveSstFileCommand::DoCommand() { s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); } } @@ -4318,4 +4513,3 @@ void CloudManifestDumpCommand::DoCommand() { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/ldb_cmd_impl.h b/tools/ldb_cmd_impl.h index ce8d07f28017..d56ccf662e14 100644 --- a/tools/ldb_cmd_impl.h +++ b/tools/ldb_cmd_impl.h @@ -403,6 +403,22 @@ class GetCommand : public LDBCommand { std::string key_; }; +class GetEntityCommand : public LDBCommand { + public: + static std::string Name() { return "get_entity"; } + + GetEntityCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + private: + std::string key_; +}; + class ApproxSizeCommand : public LDBCommand { public: static std::string Name() { return "approxsize"; } @@ -530,6 +546,26 @@ class PutCommand : public LDBCommand { std::string value_; }; +class PutEntityCommand : public LDBCommand { + public: + static std::string Name() { return "put_entity"; } + + PutEntityCommand(const std::vector& params, + const std::map& options, + const std::vector& flags); + + void DoCommand() override; + + static void Help(std::string& ret); + + void OverrideBaseOptions() override; + + private: + std::string key_; + std::vector column_names_; + std::vector column_values_; +}; + /** * Command that starts up a REPL shell that allows * get/put/delete. diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index 5d83a6cd97c5..465d1eb31713 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/ldb_cmd.h" #include @@ -208,7 +207,8 @@ class FileChecksumTestHelper { WriteBufferManager wb(options_.db_write_buffer_size); ImmutableDBOptions immutable_db_options(options_); VersionSet versions(dbname_, &immutable_db_options, sopt, tc.get(), &wb, - &wc, nullptr, nullptr, "", ""); + &wc, nullptr, nullptr, "", "", + options_.daily_offpeak_time_utc, nullptr); std::vector cf_name_list; Status s; s = versions.ListColumnFamilies(&cf_name_list, dbname_, @@ -269,7 +269,7 @@ class FileChecksumTestHelper { break; } } - EXPECT_OK(db_->EnableFileDeletions()); + EXPECT_OK(db_->EnableFileDeletions(/*force=*/false)); return cs; } }; @@ -1215,12 +1215,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/tools/ldb_test.py b/tools/ldb_test.py index e243d69c0563..cde0414713d1 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -121,32 +121,55 @@ def testSimpleStringPutGet(self): self.assertRunOK("get x2", "y2") self.assertRunFAIL("get x3") - self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2") + self.assertRunFAIL("put_entity x4") + self.assertRunFAIL("put_entity x4 cv1") + self.assertRunOK("put_entity x4 :cv1", "OK") + self.assertRunOK("get_entity x4", ":cv1") + + self.assertRunOK("put_entity x5 cn1:cv1 cn2:cv2", "OK") + self.assertRunOK("get_entity x5", "cn1:cv1 cn2:cv2") + + self.assertRunOK( + "scan --from=x1 --to=z", + "x1 ==> y1\nx2 ==> y2\nx4 ==> cv1\nx5 ==> cn1:cv1 cn2:cv2", + ) self.assertRunOK("put x3 y3", "OK") - self.assertRunOK("scan --from=x1 --to=z", "x1 : y1\nx2 : y2\nx3 : y3") - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") - self.assertRunOK("scan --from=x", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK( + "scan --from=x1 --to=z", + "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> cv1\nx5 ==> cn1:cv1 cn2:cv2", + ) + self.assertRunOK( + "scan", + "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> cv1\nx5 ==> cn1:cv1 cn2:cv2", + ) + self.assertRunOK( + "scan --from=x", + "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> cv1\nx5 ==> cn1:cv1 cn2:cv2", + ) - self.assertRunOK("scan --to=x2", "x1 : y1") - self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 : y1") - self.assertRunOK("scan --from=x1 --to=z --max_keys=2", "x1 : y1\nx2 : y2") + self.assertRunOK("scan --to=x2", "x1 ==> y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=1", "x1 ==> y1") + self.assertRunOK("scan --from=x1 --to=z --max_keys=2", "x1 ==> y1\nx2 ==> y2") + + self.assertRunOK("delete x4", "OK") + self.assertRunOK("delete x5", "OK") self.assertRunOK( - "scan --from=x1 --to=z --max_keys=3", "x1 : y1\nx2 : y2\nx3 : y3" + "scan --from=x1 --to=z --max_keys=3", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3" ) self.assertRunOK( - "scan --from=x1 --to=z --max_keys=4", "x1 : y1\nx2 : y2\nx3 : y3" + "scan --from=x1 --to=z --max_keys=4", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3" ) - self.assertRunOK("scan --from=x1 --to=x2", "x1 : y1") - self.assertRunOK("scan --from=x2 --to=x4", "x2 : y2\nx3 : y3") + self.assertRunOK("scan --from=x1 --to=x2", "x1 ==> y1") + self.assertRunOK("scan --from=x2 --to=x4", "x2 ==> y2\nx3 ==> y3") self.assertRunFAIL("scan --from=x4 --to=z") # No results => FAIL self.assertRunFAIL("scan --from=x1 --to=z --max_keys=foo") - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 
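Editor's note: the expected scan output in the tests below switches to the dump-style "key ==> value" form because the scan path now consults Iterator::columns(). A small sketch of reading both plain values and wide-column entities from an iterator; it assumes a db handle and mixed contents, and the formatting mirrors the new ldb output.

#include <iostream>
#include <memory>
#include "rocksdb/db.h"
#include "rocksdb/wide_columns.h"

// Sketch: a scan that prints plain values and wide-column entities alike.
void ScanAll(rocksdb::DB* db) {
  std::unique_ptr<rocksdb::Iterator> it(
      db->NewIterator(rocksdb::ReadOptions()));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    std::cout << it->key().ToString() << " ==> ";
    const rocksdb::WideColumns& cols = it->columns();
    if (cols.size() == 1 &&
        cols.front().name() == rocksdb::kDefaultWideColumnName) {
      std::cout << it->value().ToString() << "\n";  // plain key-value entry
    } else {
      for (const rocksdb::WideColumn& col : cols) {
        std::cout << col.name().ToString() << ":" << col.value().ToString()
                  << " ";
      }
      std::cout << "\n";
    }
  }
}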
==> y3") self.assertRunOK("delete x1", "OK") - self.assertRunOK("scan", "x2 : y2\nx3 : y3") + self.assertRunOK("scan", "x2 ==> y2\nx3 ==> y3") self.assertRunOK("delete NonExistentKey", "OK") # It is weird that GET and SCAN raise exception for @@ -171,9 +194,9 @@ def ingestExternSst(self, params, inputSst): def testStringBatchPut(self): print("Running testStringBatchPut...") self.assertRunOK("batchput x1 y1 --create_if_missing", "OK") - self.assertRunOK("scan", "x1 : y1") + self.assertRunOK("scan", "x1 ==> y1") self.assertRunOK('batchput x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK") - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 abc ==> y4 xyz") self.assertRunFAIL("batchput") self.assertRunFAIL("batchput k1") self.assertRunFAIL("batchput k1 v1 k2") @@ -183,11 +206,11 @@ def testBlobBatchPut(self): dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) self.assertRunOK("batchput x1 y1 --create_if_missing --enable_blob_files", "OK") - self.assertRunOK("scan", "x1 : y1") + self.assertRunOK("scan", "x1 ==> y1") self.assertRunOK( 'batchput --enable_blob_files x2 y2 x3 y3 "x4 abc" "y4 xyz"', "OK" ) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 abc : y4 xyz") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 abc ==> y4 xyz") blob_files = self.getBlobFiles(dbPath) self.assertTrue(len(blob_files) >= 1) @@ -278,12 +301,12 @@ def testInvalidCmdLines(self): def testHexPutGet(self): print("Running testHexPutGet...") self.assertRunOK("put a1 b1 --create_if_missing", "OK") - self.assertRunOK("scan", "a1 : b1") - self.assertRunOK("scan --hex", "0x6131 : 0x6231") + self.assertRunOK("scan", "a1 ==> b1") + self.assertRunOK("scan --hex", "0x6131 ==> 0x6231") self.assertRunFAIL("put --hex 6132 6232") self.assertRunOK("put --hex 0x6132 0x6232", "OK") - self.assertRunOK("scan --hex", "0x6131 : 0x6231\n0x6132 : 0x6232") - self.assertRunOK("scan", "a1 : b1\na2 : b2") + self.assertRunOK("scan --hex", "0x6131 ==> 0x6231\n0x6132 ==> 0x6232") + self.assertRunOK("scan", "a1 ==> b1\na2 ==> b2") self.assertRunOK("get a1", "b1") self.assertRunOK("get --hex 0x6131", "0x6231") self.assertRunOK("get a2", "b2") @@ -292,27 +315,28 @@ def testHexPutGet(self): self.assertRunOK("get --key_hex --value_hex 0x6132", "0x6232") self.assertRunOK("get --value_hex a2", "0x6232") self.assertRunOK( - "scan --key_hex --value_hex", "0x6131 : 0x6231\n0x6132 : 0x6232" + "scan --key_hex --value_hex", "0x6131 ==> 0x6231\n0x6132 ==> 0x6232" ) self.assertRunOK( - "scan --hex --from=0x6131 --to=0x6133", "0x6131 : 0x6231\n0x6132 : 0x6232" + "scan --hex --from=0x6131 --to=0x6133", + "0x6131 ==> 0x6231\n0x6132 ==> 0x6232", ) - self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", "0x6131 : 0x6231") - self.assertRunOK("scan --key_hex", "0x6131 : b1\n0x6132 : b2") - self.assertRunOK("scan --value_hex", "a1 : 0x6231\na2 : 0x6232") + self.assertRunOK("scan --hex --from=0x6131 --to=0x6132", "0x6131 ==> 0x6231") + self.assertRunOK("scan --key_hex", "0x6131 ==> b1\n0x6132 ==> b2") + self.assertRunOK("scan --value_hex", "a1 ==> 0x6231\na2 ==> 0x6232") self.assertRunOK("batchput --hex 0x6133 0x6233 0x6134 0x6234", "OK") - self.assertRunOK("scan", "a1 : b1\na2 : b2\na3 : b3\na4 : b4") + self.assertRunOK("scan", "a1 ==> b1\na2 ==> b2\na3 ==> b3\na4 ==> b4") self.assertRunOK("delete --hex 0x6133", "OK") - self.assertRunOK("scan", "a1 : b1\na2 : b2\na4 : b4") + self.assertRunOK("scan", "a1 ==> b1\na2 ==> b2\na4 ==> b4") self.assertRunOK("checkconsistency", 
"OK") def testTtlPutGet(self): print("Running testTtlPutGet...") self.assertRunOK("put a1 b1 --ttl --create_if_missing", "OK") - self.assertRunOK("scan --hex", "0x6131 : 0x6231", True) + self.assertRunOK("scan --hex", "0x6131 ==> 0x6231", True) self.assertRunOK("dump --ttl ", "a1 ==> b1", True) self.assertRunOK("dump --hex --ttl ", "0x6131 ==> 0x6231\nKeys in range: 1") - self.assertRunOK("scan --hex --ttl", "0x6131 : 0x6231") + self.assertRunOK("scan --hex --ttl", "0x6131 ==> 0x6231") self.assertRunOK("get --value_hex a1", "0x6231", True) self.assertRunOK("get --ttl a1", "b1") self.assertRunOK("put a3 b3 --create_if_missing", "OK") @@ -334,7 +358,7 @@ def testInvalidCmdLines(self): # noqa: F811 T25377293 Grandfathered in def testDumpLoad(self): print("Running testDumpLoad...") self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", "OK") - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) # Dump and load without any additional params specified @@ -345,7 +369,7 @@ def testDumpLoad(self): self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) ) self.assertRunOKFull( - "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" ) # Dump and load in hex @@ -358,7 +382,7 @@ def testDumpLoad(self): ) ) self.assertRunOKFull( - "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" ) # Dump only a portion of the key range @@ -370,7 +394,7 @@ def testDumpLoad(self): self.assertTrue( self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) ) - self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2") + self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2") # Dump upto max_keys rows dumpFilePath = os.path.join(self.TMP_DIR, "dump4") @@ -379,13 +403,15 @@ def testDumpLoad(self): self.assertTrue( self.loadDb("--db=%s --create_if_missing" % loadedDbPath, dumpFilePath) ) - self.assertRunOKFull("scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3") + self.assertRunOKFull( + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3" + ) # Load into an existing db, create_if_missing is not specified self.assertTrue(self.dumpDb("--db=%s" % origDbPath, dumpFilePath)) self.assertTrue(self.loadDb("--db=%s" % loadedDbPath, dumpFilePath)) self.assertRunOKFull( - "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" ) # Dump and load with WAL disabled @@ -398,7 +424,7 @@ def testDumpLoad(self): ) ) self.assertRunOKFull( - "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" ) # Dump and load with lots of extra params specified @@ -423,7 +449,7 @@ def testDumpLoad(self): ) ) self.assertRunOKFull( - "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" ) # Dump with count_only @@ -435,7 +461,7 @@ def testDumpLoad(self): ) # DB should have atleast one value for scan to work self.assertRunOKFull("put --db=%s k1 v1" % loadedDbPath, "OK") - self.assertRunOKFull("scan --db=%s" % loadedDbPath, "k1 : v1") + self.assertRunOKFull("scan --db=%s" % 
loadedDbPath, "k1 ==> v1") # Dump command fails because of typo in params dumpFilePath = os.path.join(self.TMP_DIR, "dump8") @@ -458,7 +484,7 @@ def testDumpLoad(self): ) ) self.assertRunOKFull( - "scan --db=%s" % loadedDbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4" + "scan --db=%s" % loadedDbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" ) blob_files = self.getBlobFiles(loadedDbPath) self.assertTrue(len(blob_files) >= 1) @@ -498,26 +524,26 @@ def testMiscAdminTask(self): # These tests need to be improved; for example with asserts about # whether compaction or level reduction actually took place. self.assertRunOK("batchput --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4", "OK") - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") origDbPath = os.path.join(self.TMP_DIR, self.DB_NAME) self.assertTrue(0 == run_err_null("./ldb compact --db=%s" % origDbPath)) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") self.assertTrue( 0 == run_err_null("./ldb reduce_levels --db=%s --new_levels=2" % origDbPath) ) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") self.assertTrue( 0 == run_err_null("./ldb reduce_levels --db=%s --new_levels=3" % origDbPath) ) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") self.assertTrue( 0 == run_err_null("./ldb compact --db=%s --from=x1 --to=x3" % origDbPath) ) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") self.assertTrue( 0 @@ -525,7 +551,7 @@ def testMiscAdminTask(self): "./ldb compact --db=%s --hex --from=0x6131 --to=0x6134" % origDbPath ) ) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") # TODO(dilip): Not sure what should be passed to WAL.Currently corrupted. self.assertTrue( @@ -535,7 +561,7 @@ def testMiscAdminTask(self): % (origDbPath, os.path.join(origDbPath, "LOG")) ) ) - self.assertRunOK("scan", "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK("scan", "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4") def testCheckConsistency(self): print("Running testCheckConsistency...") @@ -923,7 +949,9 @@ def testIngestExternalSst(self): "batchput --db=%s --create_if_missing x1 y1 x2 y2 x3 y3 x4 y4" % dbPath, "OK", ) - self.assertRunOK("scan --db=%s" % dbPath, "x1 : y1\nx2 : y2\nx3 : y3\nx4 : y4") + self.assertRunOK( + "scan --db=%s" % dbPath, "x1 ==> y1\nx2 ==> y2\nx3 ==> y3\nx4 ==> y4" + ) dumpFilePath = os.path.join(self.TMP_DIR, "dump1") with open(dumpFilePath, "w") as f: f.write("x1 ==> y10\nx2 ==> y20\nx3 ==> y30\nx4 ==> y40") @@ -947,7 +975,7 @@ def testIngestExternalSst(self): ) ) self.assertRunOKFull( - "scan --db=%s" % dbPath, "x1 : y10\nx2 : y20\nx3 : y30\nx4 : y40" + "scan --db=%s" % dbPath, "x1 ==> y10\nx2 ==> y20\nx3 ==> y30\nx4 ==> y40" ) diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index ab5173838833..0d6bc3c5b890 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
// -#ifndef ROCKSDB_LITE #include "rocksdb/ldb_tool.h" #include "rocksdb/utilities/ldb_cmd.h" @@ -94,6 +93,7 @@ void LDBCommandRunner::PrintHelp(const LDBOptions& ldb_options, BatchPutCommand::Help(ret); ScanCommand::Help(ret); DeleteCommand::Help(ret); + SingleDeleteCommand::Help(ret); DeleteRangeCommand::Help(ret); DBQuerierCommand::Help(ret); ApproxSizeCommand::Help(ret); @@ -181,5 +181,3 @@ void LDBTool::Run(int argc, char** argv, Options options, exit(error_code); } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index c8604bf439ba..229911eaa161 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "db/db_impl/db_impl.h" #include "db/version_set.h" @@ -82,6 +81,7 @@ class ReduceLevelTest : public testing::Test { Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) { ROCKSDB_NAMESPACE::Options opt; + opt.level_compaction_dynamic_level_bytes = false; opt.num_levels = num_levels; opt.create_if_missing = create_if_missing; ROCKSDB_NAMESPACE::Status st = @@ -211,12 +211,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as LDBCommand is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/tools/regression_test.sh b/tools/regression_test.sh index 2743c5aee232..26380f61439c 100755 --- a/tools/regression_test.sh +++ b/tools/regression_test.sh @@ -113,7 +113,8 @@ DATA_FORMAT+="%9.0f,%10.0f,%10.0f,%10.0f,%10.0f,%10.0f,%5.0f," DATA_FORMAT+="%5.0f,%5.0f,%5.0f" # time DATA_FORMAT+="\n" -MAIN_PATTERN="$1""[[:blank:]]+:.*[[:blank:]]+([0-9\.]+)[[:blank:]]+ops/sec" +# In case of async_io, $1 is benchmark_asyncio +MAIN_PATTERN="${1%%_*}""[[:blank:]]+:.*[[:blank:]]+([0-9\.]+)[[:blank:]]+ops/sec" PERC_PATTERN="Percentiles: P50: ([0-9\.]+) P75: ([0-9\.]+) " PERC_PATTERN+="P99: ([0-9\.]+) P99.9: ([0-9\.]+) P99.99: ([0-9\.]+)" #============================================================================== @@ -137,6 +138,9 @@ function main { fi if [ $TEST_MODE -ge 1 ]; then build_checkpoint + # run_db_bench benchmark_name NUM_OPS NUM_THREADS USED_EXISTING_DB UPDATE_REPORT ASYNC_IO + run_db_bench "seekrandom_asyncio" $NUM_OPS $NUM_THREADS 1 1 true + run_db_bench "multireadrandom_asyncio" $NUM_OPS $NUM_THREADS 1 1 true run_db_bench "readrandom" run_db_bench "readwhilewriting" run_db_bench "deleterandom" @@ -199,10 +203,11 @@ function init_arguments { } # $1 --- benchmark name -# $2 --- number of operations. Default: $NUM_KEYS +# $2 --- number of operations. Default: $NUM_OPS # $3 --- number of threads. Default $NUM_THREADS # $4 --- use_existing_db. Default: 1 # $5 --- update_report. Default: 1 +# $6 --- async_io. Default: False function run_db_bench { # Make sure no other db_bench is running. (Make sure command succeeds if pidof # command exists but finds nothing.) @@ -234,6 +239,16 @@ function run_db_bench { threads=${3:-$NUM_THREADS} USE_EXISTING_DB=${4:-1} UPDATE_REPORT=${5:-1} + async_io=${6:-false} + seek_nexts=$SEEK_NEXTS + + if [ "$async_io" == "true" ]; then + if ! 
[ -z "$SEEK_NEXTS_ASYNC_IO" ]; then + seek_nexts=$SEEK_NEXTS_ASYNC_IO + fi + fi + + echo "" echo "=======================================================================" echo "Benchmark $1" @@ -242,9 +257,13 @@ function run_db_bench { db_bench_error=0 options_file_arg=$(setup_options_file) echo "$options_file_arg" + + # In case of async_io, benchmark is benchmark_asyncio + db_bench_type=${1%%_*} + # use `which time` to avoid using bash's internal time command db_bench_cmd="\$(which time) -p $DB_BENCH_DIR/db_bench \ - --benchmarks=$1 --db=$DB_PATH --wal_dir=$WAL_PATH \ + --benchmarks=$db_bench_type --db=$DB_PATH --wal_dir=$WAL_PATH \ --use_existing_db=$USE_EXISTING_DB \ --perf_level=$PERF_LEVEL \ --disable_auto_compactions \ @@ -260,7 +279,7 @@ function run_db_bench { $options_file_arg \ --compression_ratio=$COMPRESSION_RATIO \ --histogram=$HISTOGRAM \ - --seek_nexts=$SEEK_NEXTS \ + --seek_nexts=$seek_nexts \ --stats_per_interval=$STATS_PER_INTERVAL \ --stats_interval_seconds=$STATS_INTERVAL_SECONDS \ --max_background_flushes=$MAX_BACKGROUND_FLUSHES \ @@ -271,7 +290,15 @@ function run_db_bench { --seed=$SEED \ --multiread_batched=true \ --batch_size=$MULTIREAD_BATCH_SIZE \ - --multiread_stride=$MULTIREAD_STRIDE 2>&1" + --multiread_stride=$MULTIREAD_STRIDE \ + --async_io=$async_io" + + if [ "$async_io" == "true" ]; then + db_bench_cmd="$db_bench_cmd $(set_async_io_parameters) " + fi + + db_bench_cmd=" $db_bench_cmd 2>&1" + if ! [ -z "$REMOTE_USER_AT_HOST" ]; then echo "Running benchmark remotely on $REMOTE_USER_AT_HOST" db_bench_cmd="$SSH $REMOTE_USER_AT_HOST '$db_bench_cmd'" @@ -286,6 +313,24 @@ function run_db_bench { fi } +function set_async_io_parameters { + options=" --duration=500" + # Below parameters are used in case of async_io only. + # 1. If you want to run below parameters for all benchmarks, it should be + # specify in OPTIONS_FILE instead of exporting them. + # 2. Below exported var takes precedence over OPTIONS_FILE. + if ! [ -z "$MAX_READAHEAD_SIZE" ]; then + options="$options --max_auto_readahead_size=$MAX_READAHEAD_SIZE " + fi + if ! [ -z "$INITIAL_READAHEAD_SIZE" ]; then + options="$options --initial_auto_readahead_size=$INITIAL_READAHEAD_SIZE " + fi + if ! [ -z "$NUM_READS_FOR_READAHEAD_SIZE" ]; then + options="$options --num_file_reads_for_auto_readahead=$NUM_READS_FOR_READAHEAD_SIZE " + fi + echo $options +} + function build_checkpoint { cmd_prefix="" if ! [ -z "$REMOTE_USER_AT_HOST" ]; then @@ -317,7 +362,10 @@ function multiply { # $1 --- name of the benchmark # $2 --- the filename of the output log of db_bench function update_report { - main_result=`cat $2 | grep $1` + # In case of async_io, benchmark is benchmark_asyncio + db_bench_type=${1%%_*} + + main_result=`cat $2 | grep $db_bench_type` exit_on_error $? perc_statement=`cat $2 | grep Percentile` exit_on_error $? @@ -402,7 +450,7 @@ function run_local { } function setup_options_file { - if ! [ -z "$OPTIONS_FILE" ]; then + if ! [ -z "$OPTIONS_FILE" ]; then if ! [ -z "$REMOTE_USER_AT_HOST" ]; then options_file="$DB_BENCH_DIR/OPTIONS_FILE" run_local "$SCP $OPTIONS_FILE $REMOTE_USER_AT_HOST:$options_file" @@ -438,8 +486,9 @@ function setup_test_directory { run_remote "ls -l $DB_BENCH_DIR" if ! 
[ -z "$REMOTE_USER_AT_HOST" ]; then - run_local "$SCP ./db_bench $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/db_bench" - run_local "$SCP ./ldb $REMOTE_USER_AT_HOST:$DB_BENCH_DIR/ldb" + shopt -s nullglob # allow missing librocksdb*.so* for static lib build + run_local "tar cz db_bench ldb librocksdb*.so* | $SSH $REMOTE_USER_AT_HOST 'cd $DB_BENCH_DIR/ && tar xzv'" + shopt -u nullglob fi run_local "mkdir -p $RESULT_PATH" diff --git a/tools/run_blob_bench.sh b/tools/run_blob_bench.sh index 3755a9e56bba..aeb2894f4079 100755 --- a/tools/run_blob_bench.sh +++ b/tools/run_blob_bench.sh @@ -19,6 +19,7 @@ # Exit Codes EXIT_INVALID_ARGS=1 +EXIT_INVALID_PATH=2 # Size constants K=1024 @@ -74,6 +75,11 @@ if [ $# -ge 1 ]; then fi fi +if [ ! -f tools/benchmark.sh ]; then + echo "tools/benchmark.sh not found" + exit $EXIT_INVALID_PATH +fi + # shellcheck disable=SC2153 if [ -z "$DB_DIR" ]; then echo "DB_DIR is not defined" diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc index a474417c78a5..2b9aa0950fe5 100644 --- a/tools/simulated_hybrid_file_system.cc +++ b/tools/simulated_hybrid_file_system.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #include "util/stop_watch.h" -#ifndef ROCKSDB_LITE #include #include @@ -242,4 +241,3 @@ IOStatus SimulatedWritableFile::Sync(const IOOptions& options, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/simulated_hybrid_file_system.h b/tools/simulated_hybrid_file_system.h index 251d89df79de..44b37eadeb12 100644 --- a/tools/simulated_hybrid_file_system.h +++ b/tools/simulated_hybrid_file_system.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -123,4 +122,3 @@ class SimulatedWritableFile : public FSWritableFileWrapper { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index becf67316660..d68ab90329bd 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "rocksdb/sst_dump_tool.h" @@ -11,10 +10,3 @@ int main(int argc, char** argv) { ROCKSDB_NAMESPACE::SSTDumpTool tool; return tool.Run(argc, argv); } -#else -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index aa1ff810fd23..f0b71bf8ea03 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -6,10 +6,10 @@ // Copyright (c) 2012 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE #include +#include "db/wide/wide_column_serialization.h" #include "file/random_access_file_reader.h" #include "port/stack_trace.h" #include "rocksdb/convenience.h" @@ -25,10 +25,11 @@ namespace ROCKSDB_NAMESPACE { const uint32_t kOptLength = 1024; namespace { -static std::string MakeKey(int i) { +static std::string MakeKey(int i, + ValueType value_type = ValueType::kTypeValue) { char buf[100]; snprintf(buf, sizeof(buf), "k_%04d", i); - InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + InternalKey key(std::string(buf), 0, value_type); return key.Encode().ToString(); } @@ -45,6 +46,16 @@ static std::string MakeValue(int i) { return key.Encode().ToString(); } +static std::string MakeWideColumn(int i) { + std::string val = MakeValue(i); + std::string val1 = "attr_1_val_" + val; + std::string val2 = "attr_2_val_" + val; + WideColumns columns{{"attr_1", val1}, {"attr_2", val2}}; + std::string entity; + EXPECT_OK(WideColumnSerialization::Serialize(columns, entity)); + return entity; +} + void cleanup(const Options& opts, const std::string& file_name) { Env* env = opts.env; ASSERT_OK(env->DeleteFile(file_name)); @@ -95,7 +106,8 @@ class SSTDumpToolTest : public testing::Test { snprintf(usage[2], kOptLength, "--file=%s", file_path.c_str()); } - void createSST(const Options& opts, const std::string& file_name) { + void createSST(const Options& opts, const std::string& file_name, + uint32_t wide_column_one_in = 0) { Env* test_env = opts.env; FileOptions file_options(opts); ReadOptions read_options; @@ -124,7 +136,12 @@ class SSTDumpToolTest : public testing::Test { const char* comparator_name = ikc.user_comparator()->Name(); if (strcmp(comparator_name, ReverseBytewiseComparator()->Name()) == 0) { for (int32_t i = num_keys; i >= 0; i--) { - tb->Add(MakeKey(i), MakeValue(i)); + if (wide_column_one_in == 0 || i % wide_column_one_in != 0) { + tb->Add(MakeKey(i), MakeValue(i)); + } else { + tb->Add(MakeKey(i, ValueType::kTypeWideColumnEntity), + MakeWideColumn(i)); + } } } else if (strcmp(comparator_name, test::BytewiseComparatorWithU64TsWrapper()->Name()) == @@ -134,7 +151,12 @@ class SSTDumpToolTest : public testing::Test { } } else { for (uint32_t i = 0; i < num_keys; i++) { - tb->Add(MakeKey(i), MakeValue(i)); + if (wide_column_one_in == 0 || i % wide_column_one_in != 0) { + tb->Add(MakeKey(i), MakeValue(i)); + } else { + tb->Add(MakeKey(i, ValueType::kTypeWideColumnEntity), + MakeWideColumn(i)); + } } } ASSERT_OK(tb->Finish()); @@ -165,7 +187,7 @@ TEST_F(SSTDumpToolTest, EmptyFilter) { Options opts; opts.env = env(); std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); - createSST(opts, file_path); + createSST(opts, file_path, 10); char* usage[3]; PopulateCommandArgs(file_path, "--command=raw", usage); @@ -213,7 +235,7 @@ TEST_F(SSTDumpToolTest, SstDumpComparatorWithU64Ts) { opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); std::string file_path = MakeFilePath("rocksdb_sst_comparator_with_u64_ts.sst"); - createSST(opts, file_path); + createSST(opts, file_path, 10); char* usage[3]; PopulateCommandArgs(file_path, "--command=raw", usage); @@ -235,7 +257,7 @@ TEST_F(SSTDumpToolTest, FilterBlock) { ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, true)); opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); - createSST(opts, file_path); + createSST(opts, file_path, 10); char* usage[3]; PopulateCommandArgs(file_path, "--command=raw", usage); @@ -301,7 +323,7 @@ 
TEST_F(SSTDumpToolTest, CompressedSizes) { ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10, false)); opts.table_factory.reset(new BlockBasedTableFactory(table_opts)); std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); - createSST(opts, file_path); + createSST(opts, file_path, 10); char* usage[3]; PopulateCommandArgs(file_path, "--command=recompress", usage); @@ -427,7 +449,7 @@ TEST_F(SSTDumpToolTest, RawOutput) { Options opts; opts.env = env(); std::string file_path = MakeFilePath("rocksdb_sst_test.sst"); - createSST(opts, file_path); + createSST(opts, file_path, 10); char* usage[3]; PopulateCommandArgs(file_path, "--command=raw", usage); @@ -469,13 +491,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as SSTDumpTool is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE return RUN_ALL_TESTS(); diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 0a2c282808ac..1b269043ab2c 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "rocksdb/sst_dump_tool.h" @@ -420,6 +419,10 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { filename = std::string(dir_or_file) + "/" + filename; } + if (command == "verify") { + verify_checksum = true; + } + ROCKSDB_NAMESPACE::SstFileDumper dumper( options, filename, Temperature::kUnknown, readahead_size, verify_checksum, output_hex, decode_blob_index); @@ -581,4 +584,3 @@ int SSTDumpTool::Run(int argc, char const* const* argv, Options options) { } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/trace_analyzer.cc b/tools/trace_analyzer.cc index 958078d1c1be..831d351793e5 100644 --- a/tools/trace_analyzer.cc +++ b/tools/trace_analyzer.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -16,10 +15,3 @@ int main(int argc, char** argv) { return ROCKSDB_NAMESPACE::trace_analyzer_tool(argc, argv); } #endif -#else -#include -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Not supported in lite mode.\n"); - return 1; -} -#endif // ROCKSDB_LITE diff --git a/tools/trace_analyzer_test.cc b/tools/trace_analyzer_test.cc index d7f9e4da81fb..e7d090eb2910 100644 --- a/tools/trace_analyzer_test.cc +++ b/tools/trace_analyzer_test.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -32,6 +31,8 @@ int main() { #include "test_util/testutil.h" #include "tools/trace_analyzer_tool.h" #include "trace_replay/trace_replay.h" +#include "utilities/fault_injection_env.h" +#include "utilities/trace/file_trace_reader_writer.h" namespace ROCKSDB_NAMESPACE { @@ -336,7 +337,7 @@ TEST_F(TraceAnalyzerTest, Put) { CheckFileContent(k_whole_prefix, file_path, true); // Check the overall qps - std::vector all_qps = {"0 1 0 0 0 0 0 0 0 1"}; + std::vector all_qps = {"0 1 0 0 0 0 0 0 0 0 1"}; file_path = output_path + "/test-qps_stats.txt"; CheckFileContent(all_qps, file_path, true); @@ -786,6 +787,48 @@ TEST_F(TraceAnalyzerTest, Iterator) { */ } +TEST_F(TraceAnalyzerTest, ExistsPreviousTraceWriteError) { + DB* db_ = nullptr; + Options options; + options.create_if_missing = true; + + std::unique_ptr fault_env( + new FaultInjectionTestEnv(env_)); + const std::string trace_path = + test_path_ + "/previous_trace_write_error_trace"; + std::unique_ptr trace_writer; + ASSERT_OK(NewFileTraceWriter(fault_env.get(), env_options_, trace_path, + &trace_writer)); + + ASSERT_OK(DB::Open(options, dbname_, &db_)); + ASSERT_OK(db_->StartTrace(TraceOptions(), std::move(trace_writer))); + + // Inject write error on the first trace write. + // This trace write is made big enough to actually write to FS for error + // injection. + const std::string kBigKey(1000000, 'k'); + const std::string kBigValue(1000000, 'v'); + fault_env->SetFilesystemActive(false, Status::IOError("Injected")); + + ASSERT_OK(db_->Put(WriteOptions(), kBigKey, kBigValue)); + + fault_env->SetFilesystemActive(true); + + // Without proper handling of the previous trace write error, + // this trace write will continue and crash the db (in DEBUG_LEVEL > 0) + // due to writing to the trace file that has seen error. + ASSERT_OK(db_->Put(WriteOptions(), kBigKey, kBigValue)); + + // Verify `EndTrace()` returns the previous write trace error if any + Status s = db_->EndTrace(); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_TRUE(s.ToString().find("Tracing has seen error") != std::string::npos); + ASSERT_TRUE(s.ToString().find("Injected") != std::string::npos); + + delete db_; + ASSERT_OK(DestroyDB(dbname_, options)); +} + // Test analyzing of multiget TEST_F(TraceAnalyzerTest, MultiGet) { std::string trace_path = test_path_ + "/trace"; @@ -879,12 +922,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } #endif // GFLAG -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "Trace_analyzer test is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE return RUN_ALL_TESTS(); diff --git a/tools/trace_analyzer_tool.cc b/tools/trace_analyzer_tool.cc index 5a6d6786415c..00a4da046163 100644 --- a/tools/trace_analyzer_tool.cc +++ b/tools/trace_analyzer_tool.cc @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
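The `ExistsPreviousTraceWriteError` test above relies on the "remember the first write failure" behavior that `Tracer` gains later in this diff (a `trace_write_status_` member that short-circuits subsequent writes). The following is a generic, self-contained sketch of that sticky-error pattern, not the actual `Tracer` code:

```cpp
#include <iostream>
#include <string>

// Minimal stand-in for a status type, just to illustrate the idea.
struct Status {
  bool ok_ = true;
  std::string msg_;
  static Status OK() { return {}; }
  static Status IOError(const std::string& m) { return {false, m}; }
  bool ok() const { return ok_; }
  const std::string& msg() const { return msg_; }
};

class StickyWriter {
 public:
  // Pretend to write; fail when asked to, and remember the first failure.
  Status Write(const std::string& data, bool inject_error) {
    if (!write_status_.ok()) {
      // A previous write already failed: refuse to keep appending to a
      // stream known to be broken, and surface the original error.
      return Status::IOError("writer has seen error: " + write_status_.msg());
    }
    if (inject_error) {
      write_status_ = Status::IOError("injected failure");
      return write_status_;
    }
    // ... real I/O would happen here ...
    (void)data;
    return Status::OK();
  }

 private:
  Status write_status_ = Status::OK();
};

int main() {
  StickyWriter w;
  std::cout << w.Write("first", /*inject_error=*/true).msg() << "\n";
  // Later writes are rejected instead of silently continuing on a bad file.
  std::cout << w.Write("second", /*inject_error=*/false).msg() << "\n";
  return 0;
}
```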
// -#ifndef ROCKSDB_LITE #ifdef GFLAGS #ifdef NUMA @@ -1582,6 +1581,12 @@ Status TraceAnalyzer::PutCF(uint32_t column_family_id, const Slice& key, column_family_id, key, value.size()); } +Status TraceAnalyzer::PutEntityCF(uint32_t column_family_id, const Slice& key, + const Slice& value) { + return OutputAnalysisResult(TraceOperationType::kPutEntity, write_batch_ts_, + column_family_id, key, value.size()); +} + // Handle the Delete request in the write batch of the trace Status TraceAnalyzer::DeleteCF(uint32_t column_family_id, const Slice& key) { return OutputAnalysisResult(TraceOperationType::kDelete, write_batch_ts_, @@ -1922,4 +1927,3 @@ int trace_analyzer_tool(int argc, char** argv) { } // namespace ROCKSDB_NAMESPACE #endif // Endif of Gflag -#endif // RocksDB LITE diff --git a/tools/trace_analyzer_tool.h b/tools/trace_analyzer_tool.h index 4b885b18cc10..2c3042bdc583 100644 --- a/tools/trace_analyzer_tool.h +++ b/tools/trace_analyzer_tool.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -35,7 +34,8 @@ enum TraceOperationType : int { kIteratorSeek = 6, kIteratorSeekForPrev = 7, kMultiGet = 8, - kTaTypeNum = 9 + kPutEntity = 9, + kTaTypeNum = 10 }; struct TraceUnit { @@ -201,6 +201,10 @@ class TraceAnalyzer : private TraceRecord::Handler, Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) override; + using WriteBatch::Handler::PutEntityCF; + Status PutEntityCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override; + using WriteBatch::Handler::DeleteCF; Status DeleteCF(uint32_t column_family_id, const Slice& key) override; @@ -323,4 +327,3 @@ int trace_analyzer_tool(int argc, char** argv); } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/tools/write_stress.cc b/tools/write_stress.cc index ba5bd3f4f00c..5cfec3e8e5bd 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -243,9 +243,6 @@ class WriteStress { } threads_.clear(); -// Skip checking for leaked files in ROCKSDB_LITE since we don't have access to -// function GetLiveFilesMetaData -#ifndef ROCKSDB_LITE // let's see if we leaked some files db_->PauseBackgroundWork(); std::vector metadata; @@ -281,7 +278,6 @@ class WriteStress { } } db_->ContinueBackgroundWork(); -#endif // !ROCKSDB_LITE return 0; } diff --git a/trace_replay/trace_replay.cc b/trace_replay/trace_replay.cc index c681e374c43e..126a8e248de9 100644 --- a/trace_replay/trace_replay.cc +++ b/trace_replay/trace_replay.cc @@ -345,7 +345,8 @@ Tracer::Tracer(SystemClock* clock, const TraceOptions& trace_options, : clock_(clock), trace_options_(trace_options), trace_writer_(std::move(trace_writer)), - trace_request_count_(0) { + trace_request_count_(0), + trace_write_status_(Status::OK()) { // TODO: What if this fails? 
WriteHeader().PermitUncheckedError(); } @@ -612,9 +613,18 @@ Status Tracer::WriteFooter() { } Status Tracer::WriteTrace(const Trace& trace) { + if (!trace_write_status_.ok()) { + return Status::Incomplete("Tracing has seen error: %s", + trace_write_status_.ToString()); + } + assert(trace_write_status_.ok()); std::string encoded_trace; TracerHelper::EncodeTrace(trace, &encoded_trace); - return trace_writer_->Write(Slice(encoded_trace)); + Status s = trace_writer_->Write(Slice(encoded_trace)); + if (!s.ok()) { + trace_write_status_ = s; + } + return s; } Status Tracer::Close() { return WriteFooter(); } diff --git a/trace_replay/trace_replay.h b/trace_replay/trace_replay.h index 9aba5ceb7248..55908dcb7ed5 100644 --- a/trace_replay/trace_replay.h +++ b/trace_replay/trace_replay.h @@ -178,6 +178,7 @@ class Tracer { TraceOptions trace_options_; std::unique_ptr trace_writer_; uint64_t trace_request_count_; + Status trace_write_status_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/unreleased_history/README.txt b/unreleased_history/README.txt new file mode 100644 index 000000000000..1d641285d5e8 --- /dev/null +++ b/unreleased_history/README.txt @@ -0,0 +1,73 @@ +Adding release notes +-------------------- + +When adding release notes for the next release, add a file to one of these +directories: + +unreleased_history/new_features +unreleased_history/behavior_changes +unreleased_history/public_api_changes +unreleased_history/bug_fixes + +with a unique name that makes sense for your change, preferably using the .md +extension for syntax highlighting. + +There is a script to help, as in + +$ unreleased_history/add.sh unreleased_history/bug_fixes/crash_in_feature.md + +or simply + +$ unreleased_history/add.sh + +will take you through some prompts. + +The file should usually contain one line of markdown, and "* " is not +required, as it will automatically be inserted later if not included at the +start of the first line in the file. Extra newlines or missing trailing +newlines will also be corrected. + +The only times release notes should be added directly to HISTORY are if +* A release is being amended or corrected after it is already "cut" but not +tagged, which should be rare. +* A single commit contains a noteworthy change and a patch release version bump + + +Ordering of entries +------------------- + +Within each group, entries will be included using ls sort order, so important +entries could start their file name with a small three digit number like +100pretty_important.md. + +The ordering of groups such as new_features vs. public_api_changes is +hard-coded in unreleased_history/release.sh + + +Updating HISTORY.md with release notes +-------------------------------------- + +The script unreleased_history/release.sh does this. Run the script before +updating version.h to the next develpment release, so that the script will pick +up the version being released. You might want to start with + +$ DRY_RUN=1 unreleased_history/release.sh | less + +to check for problems and preview the output. Then run + +$ unreleased_history/release.sh + +which will git rm some files and modify HISTORY.md. You still need to commit the +changes, or revert with the command reported in the output. + + +Why not update HISTORY.md directly? +----------------------------------- + +First, it was common to hit unnecessary merge conflicts when adding entries to +HISTORY.md, which slowed development. 
Second, when a PR was opened before a +release cut and landed after the release cut, it was easy to add the HISTORY +entry to the wrong version's history. This new setup completely fixes both of +those issues, with perhaps slighly more initial work to create each entry. +There is also now an extra step in using `git blame` to map a release note +to its source code implementation, but that is a relatively rare operation. diff --git a/unreleased_history/add.sh b/unreleased_history/add.sh new file mode 100755 index 000000000000..b7822c8efa1e --- /dev/null +++ b/unreleased_history/add.sh @@ -0,0 +1,27 @@ +#! /usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. + +set -e +set -o pipefail + +if [ "$1" ]; then + # Target file specified on command line + TARGET="$1" +else + # Interactively choose a group and file name + DIRS="`find unreleased_history/ -mindepth 1 -maxdepth 1 -type d`" + echo "Choose a group for new release note:" + echo "$DIRS" | grep -nEo '[^/]+$' + echo -n "Enter a number: " + while [ ! "$DIRNUM" ]; do read -r DIRNUM; done + DIR="$(echo "$DIRS" | head -n "$DIRNUM" | tail -1)" + echo "Choose a file name for new release note (e.g. improved_whatever.md):" + while [ ! "$FNAME" ]; do read -re FNAME; done + # Replace spaces with underscores + TARGET="$(echo "$DIR/$FNAME" | tr ' ' '_')" +fi + +# Edit/create the file +${EDITOR:-nano} "$TARGET" +# Add to version control (easy to forget!) +git add "$TARGET" diff --git a/unreleased_history/behavior_changes/.gitkeep b/unreleased_history/behavior_changes/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/unreleased_history/bug_fixes/.gitkeep b/unreleased_history/bug_fixes/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/unreleased_history/new_features/.gitkeep b/unreleased_history/new_features/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/unreleased_history/performance_improvements/.gitkeep b/unreleased_history/performance_improvements/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/unreleased_history/public_api_changes/.gitkeep b/unreleased_history/public_api_changes/.gitkeep new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/unreleased_history/release.sh b/unreleased_history/release.sh new file mode 100755 index 000000000000..1f50f51b024c --- /dev/null +++ b/unreleased_history/release.sh @@ -0,0 +1,104 @@ +#! /usr/bin/env bash +# Copyright (c) Meta Platforms, Inc. and affiliates. + +set -e + +if [ ! -d unreleased_history ]; then + echo "Can't find unreleased_history/ directory" + exit 1 +fi + +GIT_PATHS="unreleased_history/ HISTORY.md" +if [ ! "$DRY_RUN" ]; then + # Check for uncommitted changes + UNCOMMITTED="$(git diff -- $GIT_PATHS)" + if [ "$UNCOMMITTED" ]; then + echo 'Uncommitted changes to files to be modified. Please commit first to' + echo 'ensure a clean revert path. You can always `git commit -a --amend`' + echo 'to add more changes to your commit.' + exit 2 + fi +fi + +# Add first part of existing HISTORY file to new version +awk '{ print } /NOTE/ { exit(0) }' < HISTORY.md > HISTORY.new + +# And a blank line separator +echo >> HISTORY.new + +# Add new version header +awk '/#define ROCKSDB_MAJOR/ { major = $3 } + /#define ROCKSDB_MINOR/ { minor = $3 } + /#define ROCKSDB_PATCH/ { patch = $3 } + END { printf "## " major "." minor "." 
patch }' < include/rocksdb/version.h >> HISTORY.new +echo " (`git log -n1 --date=format:"%m/%d/%Y" --format="%ad"`)" >> HISTORY.new + +function process_file () { + # use awk to correct + # * extra or missing newlines + # * leading or trailing whitespace + # * missing '* ' on first line + awk '/./ { gsub(/^[ \t]+/, ""); gsub(/[ \t]+$/, ""); + if (notfirstline || $1 == "*") print; + else print "* " $0; + notfirstline=1; }' < $1 >> HISTORY.new + echo git rm $1 + if [ ! "$DRY_RUN" ]; then + git rm $1 + fi +} + +PROCESSED_DIRECTORIES="" + +function process_dir () { + PROCESSED_DIRECTORIES="$PROCESSED_DIRECTORIES $1" + # ls will sort the files, including the permanent header file + FILES="$(ls unreleased_history/$1/)" + if [ "$FILES" ]; then + echo "### $2" >> HISTORY.new + for FILE in $FILES; do + process_file "unreleased_history/$1/$FILE" + done + echo >> HISTORY.new + echo "Saved entries from $1" + else + echo "Nothing new in $1" + fi +} + +# Process dirs and files +process_dir new_features "New Features" +process_dir public_api_changes "Public API Changes" +process_dir behavior_changes "Behavior Changes" +process_dir bug_fixes "Bug Fixes" +process_dir performance_improvements "Performance Improvements" + +# Check for unexpected files or dirs at top level. process_dir/process_file +# will deal with contents of these directories +EXPECTED_REGEX="[^/]*[.]sh|README[.]txt|$(echo $PROCESSED_DIRECTORIES | tr ' ' '|')" +platform=`uname` +if [ $platform = 'Darwin' ]; then + UNEXPECTED="$(find -E unreleased_history -mindepth 1 -maxdepth 1 -not -regex "[^/]*/($EXPECTED_REGEX)")" +else + UNEXPECTED="$(find unreleased_history/ -mindepth 1 -maxdepth 1 -regextype egrep -not -regex "[^/]*/($EXPECTED_REGEX)")" +fi +if [ "$UNEXPECTED" ]; then + echo "Unexpected files I don't know how to process:" + echo "$UNEXPECTED" + rm HISTORY.new + exit 3 +fi + +# Add rest of existing HISTORY file to new version (collapsing newlines) +awk '/./ { if (note) pr=1 } + /NOTE/ { note=1 } + { if (pr) print }' < HISTORY.md >> HISTORY.new + +if [ "$DRY_RUN" ]; then + echo '===========================================' + diff -U3 HISTORY.md HISTORY.new || true + rm HISTORY.new +else + mv HISTORY.new HISTORY.md + echo "Done. Revert command: git checkout HEAD -- $GIT_PATHS" +fi diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h index 95ee5dfe82e8..acab56c215e8 100644 --- a/util/aligned_buffer.h +++ b/util/aligned_buffer.h @@ -9,6 +9,7 @@ #pragma once #include +#include #include "port/port.h" diff --git a/util/async_file_reader.cc b/util/async_file_reader.cc index 080c1ae96689..9ce13b99fe59 100644 --- a/util/async_file_reader.cc +++ b/util/async_file_reader.cc @@ -26,6 +26,11 @@ bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) { FSReadRequest* read_req = static_cast(cb_arg); read_req->status = req.status; read_req->result = req.result; + if (req.fs_scratch != nullptr) { + // TODO akanksha: Revisit to remove the const in the callback. + FSReadRequest& req_tmp = const_cast(req); + read_req->fs_scratch = std::move(req_tmp.fs_scratch); + } }, &awaiter->read_reqs_[i], &awaiter->io_handle_[i], &awaiter->del_fn_[i], /*aligned_buf=*/nullptr); diff --git a/util/atomic.h b/util/atomic.h new file mode 100644 index 000000000000..afb3dc540050 --- /dev/null +++ b/util/atomic.h @@ -0,0 +1,111 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// Background: +// std::atomic is somewhat easy to misuse: +// * Implicit conversion to T using std::memory_order_seq_cst, along with +// memory order parameter defaults, make it easy to accidentally mix sequential +// consistency ordering with acquire/release memory ordering. See +// "The single total order might not be consistent with happens-before" at +// https://en.cppreference.com/w/cpp/atomic/memory_order +// * It's easy to use nonsensical (UB) combinations like store with +// std::memory_order_acquire. +// For such reasons, we provide wrappers below to make safe usage easier. + +// Wrapper around std::atomic to avoid certain bugs (see Background above). +// +// This relaxed-only wrapper is intended for atomics that do not need +// ordering constraints with other data reads/writes aside from those +// necessary for computing data values or given by other happens-before +// relationships. For example, a cross-thread counter that never returns +// the same result can be a RelaxedAtomic. +template +class RelaxedAtomic { + public: + explicit RelaxedAtomic(T initial = {}) : v_(initial) {} + void StoreRelaxed(T desired) { v_.store(desired, std::memory_order_relaxed); } + T LoadRelaxed() const { return v_.load(std::memory_order_relaxed); } + bool CasWeakRelaxed(T& expected, T desired) { + return v_.compare_exchange_weak(expected, desired, + std::memory_order_relaxed); + } + bool CasStrongRelaxed(T& expected, T desired) { + return v_.compare_exchange_strong(expected, desired, + std::memory_order_relaxed); + } + T ExchangeRelaxed(T desired) { + return v_.exchange(desired, std::memory_order_relaxed); + } + T FetchAddRelaxed(T operand) { + return v_.fetch_add(operand, std::memory_order_relaxed); + } + T FetchSubRelaxed(T operand) { + return v_.fetch_sub(operand, std::memory_order_relaxed); + } + T FetchAndRelaxed(T operand) { + return v_.fetch_and(operand, std::memory_order_relaxed); + } + T FetchOrRelaxed(T operand) { + return v_.fetch_or(operand, std::memory_order_relaxed); + } + T FetchXorRelaxed(T operand) { + return v_.fetch_xor(operand, std::memory_order_relaxed); + } + + protected: + std::atomic v_; +}; + +// Wrapper around std::atomic to avoid certain bugs (see Background above). +// +// Except for some unusual cases requiring sequential consistency, this is +// a general-purpose atomic. Relaxed operations can be mixed in as appropriate. 
+template +class AcqRelAtomic : public RelaxedAtomic { + public: + explicit AcqRelAtomic(T initial = {}) : RelaxedAtomic(initial) {} + void Store(T desired) { + RelaxedAtomic::v_.store(desired, std::memory_order_release); + } + T Load() const { + return RelaxedAtomic::v_.load(std::memory_order_acquire); + } + bool CasWeak(T& expected, T desired) { + return RelaxedAtomic::v_.compare_exchange_weak( + expected, desired, std::memory_order_acq_rel); + } + bool CasStrong(T& expected, T desired) { + return RelaxedAtomic::v_.compare_exchange_strong( + expected, desired, std::memory_order_acq_rel); + } + T Exchange(T desired) { + return RelaxedAtomic::v_.exchange(desired, std::memory_order_acq_rel); + } + T FetchAdd(T operand) { + return RelaxedAtomic::v_.fetch_add(operand, std::memory_order_acq_rel); + } + T FetchSub(T operand) { + return RelaxedAtomic::v_.fetch_sub(operand, std::memory_order_acq_rel); + } + T FetchAnd(T operand) { + return RelaxedAtomic::v_.fetch_and(operand, std::memory_order_acq_rel); + } + T FetchOr(T operand) { + return RelaxedAtomic::v_.fetch_or(operand, std::memory_order_acq_rel); + } + T FetchXor(T operand) { + return RelaxedAtomic::v_.fetch_xor(operand, std::memory_order_acq_rel); + } +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/util/autovector.h b/util/autovector.h index f758473b79bd..79ee5de5725c 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -16,18 +16,6 @@ namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_LITE -template -class autovector : public std::vector { - using std::vector::vector; - - public: - autovector() { - // Make sure the initial vector has space for kSize elements - std::vector::reserve(kSize); - } -}; -#else // A vector that leverages pre-allocated stack-based array to achieve better // performance for array with small amount of items. 
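The `RelaxedAtomic<T>` / `AcqRelAtomic<T>` wrappers added in `util/atomic.h` above make the intended memory ordering explicit at each call site. A small usage sketch follows; it assumes the new header is available on the include path (i.e. building inside the RocksDB tree), and the thread bodies are simplified:

```cpp
#include <cassert>
#include <cstdint>
#include <thread>

#include "util/atomic.h"

using ROCKSDB_NAMESPACE::AcqRelAtomic;
using ROCKSDB_NAMESPACE::RelaxedAtomic;

// A statistics counter: no ordering needed beyond the count itself.
RelaxedAtomic<uint64_t> ops_done{0};

// A publish flag: release on store, acquire on load, so the plain write to
// `payload` made before Store(true) is visible once Load() returns true.
int payload = 0;
AcqRelAtomic<bool> ready{false};

int main() {
  std::thread writer([] {
    ops_done.FetchAddRelaxed(1);
    payload = 42;       // plain write...
    ready.Store(true);  // ...published by the release store
  });

  std::thread reader([] {
    while (!ready.Load()) {  // acquire load pairs with the release store
    }
    assert(payload == 42);  // guaranteed visible after the acquire
  });

  writer.join();
  reader.join();
  assert(ops_done.LoadRelaxed() == 1);
  return 0;
}
```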
// @@ -402,5 +390,4 @@ autovector& autovector::operator=( return *this; } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 8c7c39ce6445..b75a0fa2a2f0 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -26,12 +26,7 @@ const unsigned long kSize = 8; namespace { template void AssertAutoVectorOnlyInStack(autovector* vec, bool result) { -#ifndef ROCKSDB_LITE ASSERT_EQ(vec->only_in_stack(), result); -#else - (void)vec; - (void)result; -#endif // !ROCKSDB_LITE } } // namespace @@ -114,9 +109,7 @@ void AssertEqual(const autovector& a, const autovector& b) { ASSERT_EQ(a.size(), b.size()); ASSERT_EQ(a.empty(), b.empty()); -#ifndef ROCKSDB_LITE ASSERT_EQ(a.only_in_stack(), b.only_in_stack()); -#endif // !ROCKSDB_LITE for (size_t i = 0; i < a.size(); ++i) { ASSERT_EQ(a[i], b[i]); } diff --git a/util/bloom_impl.h b/util/bloom_impl.h index fadd012d305f..c9bbb125b8f1 100644 --- a/util/bloom_impl.h +++ b/util/bloom_impl.h @@ -17,7 +17,7 @@ #include "rocksdb/slice.h" #include "util/hash.h" -#ifdef HAVE_AVX2 +#ifdef __AVX2__ #include #endif @@ -199,7 +199,7 @@ class FastLocalBloomImpl { static inline void AddHash(uint32_t h1, uint32_t h2, uint32_t len_bytes, int num_probes, char *data) { - uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6; AddHashPrepared(h2, num_probes, data + bytes_to_cache_line); } @@ -216,7 +216,7 @@ class FastLocalBloomImpl { static inline void PrepareHash(uint32_t h1, uint32_t len_bytes, const char *data, uint32_t /*out*/ *byte_offset) { - uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6; PREFETCH(data + bytes_to_cache_line, 0 /* rw */, 1 /* locality */); PREFETCH(data + bytes_to_cache_line + 63, 0 /* rw */, 1 /* locality */); *byte_offset = bytes_to_cache_line; @@ -224,14 +224,14 @@ class FastLocalBloomImpl { static inline bool HashMayMatch(uint32_t h1, uint32_t h2, uint32_t len_bytes, int num_probes, const char *data) { - uint32_t bytes_to_cache_line = FastRange32(len_bytes >> 6, h1) << 6; + uint32_t bytes_to_cache_line = FastRange32(h1, len_bytes >> 6) << 6; return HashMayMatchPrepared(h2, num_probes, data + bytes_to_cache_line); } static inline bool HashMayMatchPrepared(uint32_t h2, int num_probes, const char *data_at_cache_line) { uint32_t h = h2; -#ifdef HAVE_AVX2 +#ifdef __AVX2__ int rem_probes = num_probes; // NOTE: For better performance for num_probes in {1, 2, 9, 10, 17, 18, diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 06dd1de06c6e..b0a5cae5661d 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -23,6 +23,7 @@ int main() { #include "cache/cache_reservation_manager.h" #include "memory/arena.h" #include "port/jemalloc_helper.h" +#include "rocksdb/convenience.h" #include "rocksdb/filter_policy.h" #include "table/block_based/filter_policy_internal.h" #include "test_util/testharness.h" @@ -1109,12 +1110,16 @@ static void SetTestingLevel(int levelish, FilterBuildingContext* ctx) { TEST(RibbonTest, RibbonTestLevelThreshold) { BlockBasedTableOptions opts; FilterBuildingContext ctx(opts); + + std::shared_ptr reused{NewRibbonFilterPolicy(10)}; + // A few settings for (CompactionStyle cs : {kCompactionStyleLevel, kCompactionStyleUniversal, kCompactionStyleFIFO, kCompactionStyleNone}) { ctx.compaction_style = cs; - for (int bloom_before_level : {-1, 0, 1, 10}) { - std::vector > policies; + for 
(int bloom_before_level : {-1, 0, 1, 10, INT_MAX - 1, INT_MAX}) { + SCOPED_TRACE("bloom_before_level=" + std::to_string(bloom_before_level)); + std::vector > policies; policies.emplace_back(NewRibbonFilterPolicy(10, bloom_before_level)); if (bloom_before_level == 0) { @@ -1122,16 +1127,22 @@ TEST(RibbonTest, RibbonTestLevelThreshold) { policies.emplace_back(NewRibbonFilterPolicy(10)); } - for (std::unique_ptr& policy : policies) { - // Claim to be generating filter for this level - SetTestingLevel(bloom_before_level, &ctx); + ASSERT_OK(reused->ConfigureOption({}, "bloom_before_level", + std::to_string(bloom_before_level))); - std::unique_ptr builder{ - policy->GetBuilderWithContext(ctx)}; + policies.push_back(reused); - // Must be Ribbon (more space efficient than 10 bits per key) - ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + for (auto& policy : policies) { + std::unique_ptr builder; + if (bloom_before_level < INT_MAX) { + // Claim to be generating filter for this level + SetTestingLevel(bloom_before_level, &ctx); + + builder.reset(policy->GetBuilderWithContext(ctx)); + // Must be Ribbon (more space efficient than 10 bits per key) + ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + } if (bloom_before_level >= 0) { // Claim to be generating filter for previous level SetTestingLevel(bloom_before_level - 1, &ctx); @@ -1142,6 +1153,10 @@ TEST(RibbonTest, RibbonTestLevelThreshold) { // Level is considered. // Must be Bloom (~ 10 bits per key) ASSERT_GT(GetEffectiveBitsPerKey(builder.get()), 9); + } else if (bloom_before_level == INT_MAX) { + // Force bloom option + // Must be Bloom (~ 10 bits per key) + ASSERT_GT(GetEffectiveBitsPerKey(builder.get()), 9); } else { // Level is ignored under non-traditional compaction styles. // Must be Ribbon (more space efficient than 10 bits per key) @@ -1155,8 +1170,14 @@ TEST(RibbonTest, RibbonTestLevelThreshold) { builder.reset(policy->GetBuilderWithContext(ctx)); - // Must be Ribbon (more space efficient than 10 bits per key) - ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + if (bloom_before_level < INT_MAX) { + // Must be Ribbon (more space efficient than 10 bits per key) + ASSERT_LT(GetEffectiveBitsPerKey(builder.get()), 8); + } else { + // Force bloom option + // Must be Bloom (~ 10 bits per key) + ASSERT_GT(GetEffectiveBitsPerKey(builder.get()), 9); + } } } } diff --git a/util/build_version.cc.in b/util/build_version.cc.in index c1706dc1fd22..56bc878562a0 100644 --- a/util/build_version.cc.in +++ b/util/build_version.cc.in @@ -21,7 +21,6 @@ static const std::string rocksdb_build_date = "rocksdb_build_date:@GIT_DATE@"; static const std::string rocksdb_build_date = "rocksdb_build_date:@BUILD_DATE@"; #endif -#ifndef ROCKSDB_LITE extern "C" { @ROCKSDB_PLUGIN_EXTERNS@ } // extern "C" @@ -29,7 +28,6 @@ extern "C" { std::unordered_map ROCKSDB_NAMESPACE::ObjectRegistry::builtins_ = { @ROCKSDB_PLUGIN_BUILTINS@ }; -#endif //ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { static void AddProperty(std::unordered_map *props, const std::string& name) { diff --git a/util/cast_util.h b/util/cast_util.h index c91b6ff1ee4a..e010274a75b7 100644 --- a/util/cast_util.h +++ b/util/cast_util.h @@ -5,6 +5,8 @@ #pragma once +#include +#include #include #include "rocksdb/rocksdb_namespace.h" @@ -23,6 +25,19 @@ inline DestClass* static_cast_with_check(SrcClass* x) { return ret; } +template +inline std::shared_ptr static_cast_with_check( + std::shared_ptr&& x) { +#if defined(ROCKSDB_USE_RTTI) && !defined(NDEBUG) + auto orig_raw = x.get(); +#endif + auto ret = 
std::static_pointer_cast(std::move(x)); +#if defined(ROCKSDB_USE_RTTI) && !defined(NDEBUG) + assert(ret.get() == dynamic_cast(orig_raw)); +#endif + return ret; +} + // A wrapper around static_cast for lossless conversion between integral // types, including enum types. For example, this can be used for converting // between signed/unsigned or enum type and underlying type without fear of @@ -39,4 +54,13 @@ inline To lossless_cast(From x) { return static_cast(x); } +// For disambiguating a potentially heterogeneous aggregate as a homogeneous +// initializer list. E.g. might be able to write List({x, y}) in some cases +// instead of std::vector({x, y}). +template +inline const std::initializer_list& List( + const std::initializer_list& list) { + return list; +} + } // namespace ROCKSDB_NAMESPACE diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc index cfab2a4fefe3..cdb591f23ca1 100644 --- a/util/compaction_job_stats_impl.cc +++ b/util/compaction_job_stats_impl.cc @@ -7,12 +7,12 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE void CompactionJobStats::Reset() { elapsed_micros = 0; cpu_micros = 0; + has_num_input_records = true; num_input_records = 0; num_blobs_read = 0; num_input_files = 0; @@ -56,6 +56,7 @@ void CompactionJobStats::Add(const CompactionJobStats& stats) { elapsed_micros += stats.elapsed_micros; cpu_micros += stats.cpu_micros; + has_num_input_records &= stats.has_num_input_records; num_input_records += stats.num_input_records; num_blobs_read += stats.num_blobs_read; num_input_files += stats.num_input_files; @@ -89,12 +90,5 @@ void CompactionJobStats::Add(const CompactionJobStats& stats) { num_single_del_mismatch += stats.num_single_del_mismatch; } -#else - -void CompactionJobStats::Reset() {} - -void CompactionJobStats::Add(const CompactionJobStats& /*stats*/) {} - -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/util/comparator.cc b/util/comparator.cc index f85ed69ee6c9..98ecef9d26b5 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -23,6 +23,7 @@ #include "rocksdb/slice.h" #include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" +#include "util/coding.h" namespace ROCKSDB_NAMESPACE { @@ -230,7 +231,6 @@ class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { } }; -// EXPERIMENTAL // Comparator with 64-bit integer timestamp. // We did not performance test this yet. template @@ -250,6 +250,12 @@ class ComparatorWithU64TsImpl : public Comparator { const char* Name() const override { return kClassName(); } + // The comparator that compares the user key without timestamp part is treated + // as the root comparator. 
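The two `util/cast_util.h` additions above, the `shared_ptr` overload of `static_cast_with_check` and the `List()` helper, are small utilities. A usage sketch with made-up types, assuming the header from this change:

```cpp
#include <cassert>
#include <initializer_list>
#include <memory>

#include "util/cast_util.h"

using ROCKSDB_NAMESPACE::List;
using ROCKSDB_NAMESPACE::static_cast_with_check;

struct Base {
  virtual ~Base() = default;
};
struct Derived : public Base {
  int value = 7;
};

template <typename T>
double Sum(const std::initializer_list<T>& xs) {
  double total = 0;
  for (const T& x : xs) total += x;
  return total;
}

int main() {
  // Downcast a shared_ptr without paying for dynamic_pointer_cast in release
  // builds; RTTI debug builds assert that the cast is actually valid.
  std::shared_ptr<Base> base = std::make_shared<Derived>();
  std::shared_ptr<Derived> derived =
      static_cast_with_check<Derived>(std::move(base));
  assert(derived->value == 7);

  // Sum({1, 2.5}) would fail to deduce T (mixed int/double literals);
  // List<double> pins the element type so the braced list is homogeneous.
  assert(Sum(List<double>({1, 2.5})) == 3.5);
  return 0;
}
```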
+ const Comparator* GetRootComparator() const override { + return &cmp_without_ts_; + } + void FindShortSuccessor(std::string*) const override {} void FindShortestSeparator(std::string*, const Slice&) const override {} int Compare(const Slice& a, const Slice& b) const override { @@ -316,37 +322,71 @@ const Comparator* BytewiseComparatorWithU64Ts() { return &comp_with_u64_ts; } -#ifndef ROCKSDB_LITE +const Comparator* ReverseBytewiseComparatorWithU64Ts() { + STATIC_AVOID_DESTRUCTION( + ComparatorWithU64TsImpl, comp_with_u64_ts); + return &comp_with_u64_ts; +} + +Status DecodeU64Ts(const Slice& ts, uint64_t* int_ts) { + if (ts.size() != sizeof(uint64_t)) { + return Status::InvalidArgument("U64Ts timestamp size mismatch."); + } + *int_ts = DecodeFixed64(ts.data()); + return Status::OK(); +} + +Slice EncodeU64Ts(uint64_t ts, std::string* ts_buf) { + char buf[sizeof(ts)]; + EncodeFixed64(buf, ts); + ts_buf->assign(buf, sizeof(buf)); + return Slice(*ts_buf); +} + +Slice MaxU64Ts() { + static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff"; + return Slice(kTsMax, sizeof(uint64_t)); +} + +Slice MinU64Ts() { + static constexpr char kTsMin[] = "\x00\x00\x00\x00\x00\x00\x00\x00"; + return Slice(kTsMin, sizeof(uint64_t)); +} + static int RegisterBuiltinComparators(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( BytewiseComparatorImpl::kClassName(), [](const std::string& /*uri*/, - std::unique_ptr* /*guard */, - std::string* /* errmsg */) { return BytewiseComparator(); }); + std::unique_ptr* /*guard*/, + std::string* /*errmsg*/) { return BytewiseComparator(); }); library.AddFactory( ReverseBytewiseComparatorImpl::kClassName(), [](const std::string& /*uri*/, - std::unique_ptr* /*guard */, - std::string* /* errmsg */) { return ReverseBytewiseComparator(); }); + std::unique_ptr* /*guard*/, + std::string* /*errmsg*/) { return ReverseBytewiseComparator(); }); library.AddFactory( ComparatorWithU64TsImpl::kClassName(), [](const std::string& /*uri*/, - std::unique_ptr* /*guard */, - std::string* /* errmsg */) { return BytewiseComparatorWithU64Ts(); }); - return 3; + std::unique_ptr* /*guard*/, + std::string* /*errmsg*/) { return BytewiseComparatorWithU64Ts(); }); + library.AddFactory( + ComparatorWithU64TsImpl::kClassName(), + [](const std::string& /*uri*/, + std::unique_ptr* /*guard*/, + std::string* /*errmsg*/) { + return ReverseBytewiseComparatorWithU64Ts(); + }); + return 4; } -#endif // ROCKSDB_LITE Status Comparator::CreateFromString(const ConfigOptions& config_options, const std::string& value, const Comparator** result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinComparators(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE std::string id; std::unordered_map opt_map; Status status = Customizable::GetOptionsMap(config_options, *result, value, @@ -361,6 +401,9 @@ Status Comparator::CreateFromString(const ConfigOptions& config_options, } else if (id == ComparatorWithU64TsImpl::kClassName()) { *result = BytewiseComparatorWithU64Ts(); + } else if (id == ComparatorWithU64TsImpl< + ReverseBytewiseComparatorImpl>::kClassName()) { + *result = ReverseBytewiseComparatorWithU64Ts(); } else if (value.empty()) { // No Id and no options. Clear the object *result = nullptr; @@ -368,11 +411,7 @@ Status Comparator::CreateFromString(const ConfigOptions& config_options, } else if (id.empty()) { // We have no Id but have options. 
Not good return Status::NotSupported("Cannot reset object ", id); } else { -#ifndef ROCKSDB_LITE status = config_options.registry->NewStaticObject(id, result); -#else - status = Status::NotSupported("Cannot load object in LITE mode ", id); -#endif // ROCKSDB_LITE if (!status.ok()) { if (config_options.ignore_unsupported_options && status.IsNotSupported()) { diff --git a/util/compression.cc b/util/compression.cc index 712d333ee633..2a0bc38d4f3d 100644 --- a/util/compression.cc +++ b/util/compression.cc @@ -48,7 +48,7 @@ int ZSTDStreamingCompress::Compress(const char* input, size_t input_size, if (input_size == 0) { return 0; } -#ifndef ZSTD_STREAMING +#ifndef ZSTD_ADVANCED (void)input; (void)input_size; (void)output; @@ -77,7 +77,7 @@ int ZSTDStreamingCompress::Compress(const char* input, size_t input_size, } void ZSTDStreamingCompress::Reset() { -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED ZSTD_CCtx_reset(cctx_, ZSTD_ResetDirective::ZSTD_reset_session_only); input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; #endif @@ -91,7 +91,7 @@ int ZSTDStreamingUncompress::Uncompress(const char* input, size_t input_size, if (input_size == 0) { return 0; } -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED if (input) { // New input input_buffer_ = {input, input_size, /*pos=*/0}; @@ -113,7 +113,7 @@ int ZSTDStreamingUncompress::Uncompress(const char* input, size_t input_size, } void ZSTDStreamingUncompress::Reset() { -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED ZSTD_DCtx_reset(dctx_, ZSTD_ResetDirective::ZSTD_reset_session_only); input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; #endif diff --git a/util/compression.h b/util/compression.h index 31ff5a7554a3..5620969d76d5 100644 --- a/util/compression.h +++ b/util/compression.h @@ -20,7 +20,7 @@ #endif // ROCKSDB_MALLOC_USABLE_SIZE #include -#include "memory/memory_allocator.h" +#include "memory/memory_allocator_impl.h" #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based/block_type.h" @@ -53,8 +53,11 @@ #include #endif // ZSTD_VERSION_NUMBER >= 10103 // v1.4.0+ +// ZSTD_Compress2(), ZSTD_compressStream2() and frame parameters all belong to +// advanced APIs and require v1.4.0+. +// https://github.com/facebook/zstd/blob/eb9f881eb810f2242f1ef36b3f3e7014eecb8fa6/lib/zstd.h#L297C40-L297C45 #if ZSTD_VERSION_NUMBER >= 10400 -#define ZSTD_STREAMING +#define ZSTD_ADVANCED #endif // ZSTD_VERSION_NUMBER >= 10400 namespace ROCKSDB_NAMESPACE { // Need this for the context allocation override @@ -180,6 +183,9 @@ struct CompressionDict { if (level == CompressionOptions::kDefaultCompressionLevel) { // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see // https://github.com/facebook/zstd/issues/1148 + // TODO(cbi): ZSTD_CLEVEL_DEFAULT is exposed after + // https://github.com/facebook/zstd/pull/1174. Use ZSTD_CLEVEL_DEFAULT + // instead of hardcoding 3. 
level = 3; } // Should be safe (but slower) if below call fails as we'll use the @@ -363,14 +369,43 @@ class CompressionContext { private: #if defined(ZSTD) && (ZSTD_VERSION_NUMBER >= 500) ZSTD_CCtx* zstd_ctx_ = nullptr; - void CreateNativeContext(CompressionType type) { - if (type == kZSTD || type == kZSTDNotFinalCompression) { + + ZSTD_CCtx* CreateZSTDContext() { #ifdef ROCKSDB_ZSTD_CUSTOM_MEM - zstd_ctx_ = - ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides()); + return ZSTD_createCCtx_advanced(port::GetJeZstdAllocationOverrides()); #else // ROCKSDB_ZSTD_CUSTOM_MEM - zstd_ctx_ = ZSTD_createCCtx(); + return ZSTD_createCCtx(); #endif // ROCKSDB_ZSTD_CUSTOM_MEM + } + + void CreateNativeContext(CompressionType type, int level, bool checksum) { + if (type == kZSTD || type == kZSTDNotFinalCompression) { + zstd_ctx_ = CreateZSTDContext(); +#ifdef ZSTD_ADVANCED + if (level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } + size_t err = + ZSTD_CCtx_setParameter(zstd_ctx_, ZSTD_c_compressionLevel, level); + if (ZSTD_isError(err)) { + assert(false); + ZSTD_freeCCtx(zstd_ctx_); + zstd_ctx_ = CreateZSTDContext(); + } + if (checksum) { + err = ZSTD_CCtx_setParameter(zstd_ctx_, ZSTD_c_checksumFlag, 1); + if (ZSTD_isError(err)) { + assert(false); + ZSTD_freeCCtx(zstd_ctx_); + zstd_ctx_ = CreateZSTDContext(); + } + } +#else + (void)level; + (void)checksum; +#endif } } void DestroyNativeContext() { @@ -388,12 +423,14 @@ class CompressionContext { #else // ZSTD && (ZSTD_VERSION_NUMBER >= 500) private: - void CreateNativeContext(CompressionType /* type */) {} + void CreateNativeContext(CompressionType /* type */, int /* level */, + bool /* checksum */) {} void DestroyNativeContext() {} #endif // ZSTD && (ZSTD_VERSION_NUMBER >= 500) public: - explicit CompressionContext(CompressionType type) { - CreateNativeContext(type); + explicit CompressionContext(CompressionType type, + const CompressionOptions& options) { + CreateNativeContext(type, options.level, options.checksum); } ~CompressionContext() { DestroyNativeContext(); } CompressionContext(const CompressionContext&) = delete; @@ -525,7 +562,7 @@ inline bool ZSTDNotFinal_Supported() { } inline bool ZSTD_Streaming_Supported() { -#if defined(ZSTD) && defined(ZSTD_STREAMING) +#if defined(ZSTD_ADVANCED) return true; #else return false; @@ -1116,9 +1153,15 @@ inline bool LZ4_Compress(const CompressionInfo& info, static_cast(compression_dict.size())); } #if LZ4_VERSION_NUMBER >= 10700 // r129+ - outlen = - LZ4_compress_fast_continue(stream, input, &(*output)[output_header_len], - static_cast(length), compress_bound, 1); + int acceleration; + if (info.options().level < 0) { + acceleration = -info.options().level; + } else { + acceleration = 1; + } + outlen = LZ4_compress_fast_continue( + stream, input, &(*output)[output_header_len], static_cast(length), + compress_bound, acceleration); #else // up to r128 outlen = LZ4_compress_limitedOutput_continue( stream, input, &(*output)[output_header_len], static_cast(length), @@ -1343,30 +1386,44 @@ inline bool ZSTD_Compress(const CompressionInfo& info, const char* input, size_t compressBound = ZSTD_compressBound(length); output->resize(static_cast(output_header_len + compressBound)); size_t outlen = 0; - int level; - if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { - // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see - // 
https://github.com/facebook/zstd/issues/1148 - level = 3; - } else { - level = info.options().level; - } #if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ ZSTD_CCtx* context = info.context().ZSTDPreallocCtx(); assert(context != nullptr); +#ifdef ZSTD_ADVANCED + if (info.dict().GetDigestedZstdCDict() != nullptr) { + ZSTD_CCtx_refCDict(context, info.dict().GetDigestedZstdCDict()); + } else { + ZSTD_CCtx_loadDictionary(context, info.dict().GetRawDict().data(), + info.dict().GetRawDict().size()); + } + + // Compression level is set in `context` during CreateNativeContext() + outlen = ZSTD_compress2(context, &(*output)[output_header_len], compressBound, + input, length); +#else // ZSTD_ADVANCED #if ZSTD_VERSION_NUMBER >= 700 // v0.7.0+ if (info.dict().GetDigestedZstdCDict() != nullptr) { outlen = ZSTD_compress_usingCDict(context, &(*output)[output_header_len], compressBound, input, length, info.dict().GetDigestedZstdCDict()); } -#endif // ZSTD_VERSION_NUMBER >= 700 +#endif // ZSTD_VERSION_NUMBER >= 700 + // TODO (cbi): error handling for compression. if (outlen == 0) { + int level; + if (info.options().level == CompressionOptions::kDefaultCompressionLevel) { + // 3 is the value of ZSTD_CLEVEL_DEFAULT (not exposed publicly), see + // https://github.com/facebook/zstd/issues/1148 + level = 3; + } else { + level = info.options().level; + } outlen = ZSTD_compress_usingDict(context, &(*output)[output_header_len], compressBound, input, length, info.dict().GetRawDict().data(), info.dict().GetRawDict().size(), level); } +#endif // ZSTD_ADVANCED #else // up to v0.4.x outlen = ZSTD_compress(&(*output)[output_header_len], compressBound, input, length, level); @@ -1387,17 +1444,28 @@ inline bool ZSTD_Compress(const CompressionInfo& info, const char* input, // @param compression_dict Data for presetting the compression library's // dictionary. +// @param error_message If not null, will be set if decompression fails. +// +// Returns nullptr if decompression fails.
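For illustration only (not part of the patch), a sketch of how a caller might consume the new error_message out-parameter documented above; the variables info, data, n, and format_version are assumed to be in scope:

size_t uncompressed_size = 0;
const char* error_msg = nullptr;  // left null unless decompression fails
CacheAllocationPtr contents =
    UncompressData(info, data, n, &uncompressed_size, format_version,
                   /*allocator=*/nullptr, &error_msg);
if (!contents) {
  return Status::Corruption("Could not decompress block",
                            error_msg ? error_msg : "(no error message)");
}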
inline CacheAllocationPtr ZSTD_Uncompress( const UncompressionInfo& info, const char* input_data, size_t input_length, - size_t* uncompressed_size, MemoryAllocator* allocator = nullptr) { + size_t* uncompressed_size, MemoryAllocator* allocator = nullptr, + const char** error_message = nullptr) { #ifdef ZSTD + static const char* const kErrorDecodeOutputSize = + "Cannot decode output size."; + static const char* const kErrorOutputLenMismatch = + "Decompressed size does not match header."; uint32_t output_len = 0; if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, &output_len)) { + if (error_message) { + *error_message = kErrorDecodeOutputSize; + } return nullptr; } - auto output = AllocateBlock(output_len, allocator); + CacheAllocationPtr output = AllocateBlock(output_len, allocator); size_t actual_output_length = 0; #if ZSTD_VERSION_NUMBER >= 500 // v0.5.0+ ZSTD_DCtx* context = info.context().GetZSTDContext(); @@ -1407,19 +1475,31 @@ inline CacheAllocationPtr ZSTD_Uncompress( actual_output_length = ZSTD_decompress_usingDDict( context, output.get(), output_len, input_data, input_length, info.dict().GetDigestedZstdDDict()); - } + } else { #endif // ROCKSDB_ZSTD_DDICT - if (actual_output_length == 0) { actual_output_length = ZSTD_decompress_usingDict( context, output.get(), output_len, input_data, input_length, info.dict().GetRawDict().data(), info.dict().GetRawDict().size()); +#ifdef ROCKSDB_ZSTD_DDICT } +#endif // ROCKSDB_ZSTD_DDICT #else // up to v0.4.x (void)info; actual_output_length = ZSTD_decompress(output.get(), output_len, input_data, input_length); #endif // ZSTD_VERSION_NUMBER >= 500 - assert(actual_output_length == output_len); + if (ZSTD_isError(actual_output_length)) { + if (error_message) { + *error_message = ZSTD_getErrorName(actual_output_length); + } + return nullptr; + } else if (actual_output_length != output_len) { + if (error_message) { + *error_message = kErrorOutputLenMismatch; + } + return nullptr; + } + *uncompressed_size = actual_output_length; return output; #else // ZSTD @@ -1428,6 +1508,7 @@ inline CacheAllocationPtr ZSTD_Uncompress( (void)input_length; (void)uncompressed_size; (void)allocator; + (void)error_message; return nullptr; #endif } @@ -1530,6 +1611,7 @@ inline std::string ZSTD_FinalizeDictionary( return dict_data; } #else // up to v1.4.4 + assert(false); (void)samples; (void)sample_lens; (void)max_dict_bytes; @@ -1589,7 +1671,8 @@ inline bool CompressData(const Slice& raw, inline CacheAllocationPtr UncompressData( const UncompressionInfo& uncompression_info, const char* data, size_t n, size_t* uncompressed_size, uint32_t compress_format_version, - MemoryAllocator* allocator = nullptr) { + MemoryAllocator* allocator = nullptr, + const char** error_message = nullptr) { switch (uncompression_info.type()) { case kSnappyCompression: return Snappy_Uncompress(data, n, uncompressed_size, allocator); @@ -1609,8 +1692,9 @@ inline CacheAllocationPtr UncompressData( return CacheAllocationPtr(XPRESS_Uncompress(data, n, uncompressed_size)); case kZSTD: case kZSTDNotFinalCompression: + // TODO(cbi): error message handling for other compression algorithms. 
return ZSTD_Uncompress(uncompression_info, data, n, uncompressed_size, - allocator); + allocator, error_message); default: return CacheAllocationPtr(); } @@ -1743,7 +1827,7 @@ class ZSTDStreamingCompress final : public StreamingCompress { size_t max_output_len) : StreamingCompress(kZSTD, opts, compress_format_version, max_output_len) { -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED cctx_ = ZSTD_createCCtx(); // Each compressed frame will have a checksum ZSTD_CCtx_setParameter(cctx_, ZSTD_c_checksumFlag, 1); @@ -1752,14 +1836,14 @@ class ZSTDStreamingCompress final : public StreamingCompress { #endif } ~ZSTDStreamingCompress() override { -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED ZSTD_freeCCtx(cctx_); #endif } int Compress(const char* input, size_t input_size, char* output, size_t* output_pos) override; void Reset() override; -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED ZSTD_CCtx* cctx_; ZSTD_inBuffer input_buffer_; #endif @@ -1770,14 +1854,14 @@ class ZSTDStreamingUncompress final : public StreamingUncompress { explicit ZSTDStreamingUncompress(uint32_t compress_format_version, size_t max_output_len) : StreamingUncompress(kZSTD, compress_format_version, max_output_len) { -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED dctx_ = ZSTD_createDCtx(); assert(dctx_ != nullptr); input_buffer_ = {/*src=*/nullptr, /*size=*/0, /*pos=*/0}; #endif } ~ZSTDStreamingUncompress() override { -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED ZSTD_freeDCtx(dctx_); #endif } @@ -1786,7 +1870,7 @@ class ZSTDStreamingUncompress final : public StreamingUncompress { void Reset() override; private: -#ifdef ZSTD_STREAMING +#ifdef ZSTD_ADVANCED ZSTD_DCtx* dctx_; ZSTD_inBuffer input_buffer_; #endif diff --git a/util/core_local.h b/util/core_local.h index b444a1152256..9c5b3f2815ea 100644 --- a/util/core_local.h +++ b/util/core_local.h @@ -5,6 +5,7 @@ #pragma once +#include #include #include #include @@ -12,6 +13,7 @@ #include "port/likely.h" #include "port/port.h" +#include "util/math.h" #include "util/random.h" namespace ROCKSDB_NAMESPACE { @@ -69,7 +71,7 @@ std::pair CoreLocalArray::AccessElementAndIndex() const { // cpu id unavailable, just pick randomly core_idx = Random::GetTLSInstance()->Uniform(1 << size_shift_); } else { - core_idx = static_cast(cpuid & ((1 << size_shift_) - 1)); + core_idx = static_cast(BottomNBits(cpuid, size_shift_)); } return {AccessAtCore(core_idx), core_idx}; } diff --git a/util/crc32c.cc b/util/crc32c.cc index d71c71c2e425..9e97045f4473 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -15,10 +15,6 @@ #include #include -#ifdef HAVE_SSE42 -#include -#include -#endif #include "port/lang.h" #include "util/coding.h" @@ -50,6 +46,13 @@ #endif +ASSERT_FEATURE_COMPAT_HEADER(); + +#ifdef __SSE4_2__ +#include +#include +#endif + #if defined(HAVE_ARM64_CRC) bool pmull_runtime_flag = false; #endif @@ -107,6 +110,7 @@ static const uint32_t table0_[256] = { 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351}; +#ifndef __SSE4_2__ static const uint32_t table1_[256] = { 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, @@ -244,14 +248,10 @@ static const uint32_t table3_[256] = { static inline uint32_t LE_LOAD32(const uint8_t* p) { return DecodeFixed32(reinterpret_cast(p)); } +#endif // !__SSE4_2__ -#if defined(HAVE_SSE42) && (defined(__LP64__) || defined(_WIN64)) -static inline 
uint64_t LE_LOAD64(const uint8_t* p) { - return DecodeFixed64(reinterpret_cast(p)); -} -#endif - -static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) { +static inline void DefaultCRC32(uint64_t* l, uint8_t const** p) { +#ifndef __SSE4_2__ uint32_t c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ @@ -261,16 +261,8 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const** p) { *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ table1_[(c >> 16) & 0xff] ^ table0_[c >> 24]; -} - -#if (!(defined(HAVE_POWER8) && defined(HAS_ALTIVEC))) && \ - (!defined(HAVE_ARM64_CRC)) || \ - defined(NO_THREEWAY_CRC32C) -static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) { -#ifndef HAVE_SSE42 - Slow_CRC32(l, p); #elif defined(__LP64__) || defined(_WIN64) - *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); + *l = _mm_crc32_u64(*l, DecodeFixed64(reinterpret_cast(*p))); *p += 8; #else *l = _mm_crc32_u32(static_cast(*l), LE_LOAD32(*p)); @@ -279,7 +271,6 @@ static inline void Fast_CRC32(uint64_t* l, uint8_t const** p) { *p += 4; #endif } -#endif template uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { @@ -324,48 +315,6 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { return static_cast(l ^ 0xffffffffu); } -// Detect if ARM64 CRC or not. -#ifndef HAVE_ARM64_CRC -// Detect if SS42 or not. -#ifndef HAVE_POWER8 - -static bool isSSE42() { -#ifndef HAVE_SSE42 - return false; -#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) - uint32_t c_; - __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx"); - return c_ & (1U << 20); // copied from CpuId.h in Folly. Test SSE42 -#elif defined(_WIN64) - int info[4]; - __cpuidex(info, 0x00000001, 0); - return (info[2] & ((int)1 << 20)) != 0; -#else - return false; -#endif -} - -static bool isPCLMULQDQ() { -#ifndef HAVE_SSE42 - // in build_detect_platform we set this macro when both SSE42 and PCLMULQDQ - // are supported by compiler - return false; -#elif defined(__GNUC__) && defined(__x86_64__) && !defined(IOS_CROSS_COMPILE) - uint32_t c_; - __asm__("cpuid" : "=c"(c_) : "a"(1) : "ebx", "edx"); - return c_ & (1U << 1); // PCLMULQDQ is in bit 1 (not bit 0) -#elif defined(_WIN64) - int info[4]; - __cpuidex(info, 0x00000001, 0); - return (info[2] & ((int)1 << 1)) != 0; -#else - return false; -#endif -} - -#endif // HAVE_POWER8 -#endif // HAVE_ARM64_CRC - using Function = uint32_t (*)(uint32_t, const char*, size_t); #if defined(HAVE_POWER8) && defined(HAS_ALTIVEC) @@ -436,7 +385,9 @@ std::string IsFastCrc32Supported() { arch = "Arm64"; } #else - has_fast_crc = isSSE42(); +#ifdef __SSE4_2__ + has_fast_crc = true; +#endif // __SSE4_2__ arch = "x86"; #endif if (has_fast_crc) { @@ -477,7 +428,7 @@ std::string IsFastCrc32Supported() { * * */ -#if defined HAVE_SSE42 && defined HAVE_PCLMUL +#if defined(__SSE4_2__) && defined(__PCLMUL__) #define CRCtriplet(crc, buf, offset) \ crc##0 = _mm_crc32_u64(crc##0, *(buf##0 + offset)); \ @@ -1152,34 +1103,30 @@ uint32_t crc32c_3way(uint32_t crc, const char* buf, size_t len) { } } -#endif //HAVE_SSE42 && HAVE_PCLMUL +#endif //__SSE4_2__ && __PCLMUL__ static inline Function Choose_Extend() { #ifdef HAVE_POWER8 - return isAltiVec() ? ExtendPPCImpl : ExtendImpl; + return isAltiVec() ? 
ExtendPPCImpl : ExtendImpl; #elif defined(HAVE_ARM64_CRC) if(crc32c_runtime_check()) { pmull_runtime_flag = crc32c_pmull_runtime_check(); return ExtendARMImpl; } else { - return ExtendImpl; + return ExtendImpl; } -#else - if (isSSE42()) { - if (isPCLMULQDQ()) { -#if (defined HAVE_SSE42 && defined HAVE_PCLMUL) && !defined NO_THREEWAY_CRC32C - return crc32c_3way; -#else - return ExtendImpl; // Fast_CRC32 will check HAVE_SSE42 itself +#elif defined(__SSE4_2__) && defined(__PCLMUL__) && !defined NO_THREEWAY_CRC32C + // NOTE: runtime detection no longer supported on x86 +#ifdef _MSC_VER +#pragma warning(disable: 4551) #endif - } - else { // no runtime PCLMULQDQ support but has SSE42 support - return ExtendImpl; - } - } // end of isSSE42() - else { - return ExtendImpl; - } + (void)ExtendImpl; // suppress unused warning +#ifdef _MSC_VER +#pragma warning(default: 4551) +#endif + return crc32c_3way; +#else + return ExtendImpl; #endif } diff --git a/util/crc32c_arm64.cc b/util/crc32c_arm64.cc index 4885f4fe101c..98d1c307db57 100644 --- a/util/crc32c_arm64.cc +++ b/util/crc32c_arm64.cc @@ -23,10 +23,10 @@ #include #endif #if defined(__OpenBSD__) -#include -#include -#include #include +#include +#include +#include #endif #ifdef HAVE_ARM64_CRYPTO @@ -67,13 +67,12 @@ uint32_t crc32c_runtime_check(void) { return r == 1; #elif defined(__OpenBSD__) int r = 0; - const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + const int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0}; uint64_t isar0; size_t len = sizeof(isar0); if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { - if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) - r = 1; + if (ID_AA64ISAR0_CRC32(isar0) >= ID_AA64ISAR0_CRC32_BASE) r = 1; } return r; #else @@ -94,13 +93,12 @@ bool crc32c_pmull_runtime_check(void) { return true; #elif defined(__OpenBSD__) bool r = false; - const int isar0_mib[] = { CTL_MACHDEP, CPU_ID_AA64ISAR0 }; + const int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0}; uint64_t isar0; size_t len = sizeof(isar0); if (sysctl(isar0_mib, 2, &isar0, &len, NULL, 0) != -1) { - if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL) - r = true; + if (ID_AA64ISAR0_AES(isar0) >= ID_AA64ISAR0_AES_PMULL) r = true; } return r; #else diff --git a/util/data_structure.cc b/util/data_structure.cc new file mode 100644 index 000000000000..d647df5d5b25 --- /dev/null +++ b/util/data_structure.cc @@ -0,0 +1,18 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ +#include "rocksdb/data_structure.h" + +#include "util/math.h" + +namespace ROCKSDB_NAMESPACE { +namespace detail { + +int CountTrailingZeroBitsForSmallEnumSet(uint64_t v) { + return CountTrailingZeroBits(v); +} + +} // namespace detail +} // namespace ROCKSDB_NAMESPACE diff --git a/util/distributed_mutex.h b/util/distributed_mutex.h index 9675a1e2deab..e3450d753e2d 100644 --- a/util/distributed_mutex.h +++ b/util/distributed_mutex.h @@ -28,7 +28,7 @@ class DMutex : public folly::DistributedMutex { explicit DMutex(bool IGNORED_adaptive = false) { (void)IGNORED_adaptive; } // currently no-op - void AssertHeld() {} + void AssertHeld() const {} }; using DMutexLock = std::lock_guard; @@ -36,6 +36,8 @@ using DMutexLock = std::lock_guard; #else +#include + #include "port/port.h" namespace ROCKSDB_NAMESPACE { diff --git a/util/duplicate_detector.h b/util/duplicate_detector.h index d778622db819..aa42e950e498 100644 --- a/util/duplicate_detector.h +++ b/util/duplicate_detector.h @@ -56,12 +56,10 @@ class DuplicateDetector { ". WAL must must have been emptied before dropping the column " "family", cf); -#ifndef ROCKSDB_LITE throw std::runtime_error( "Recovering an entry from a dropped column family. " "WAL must must have been flushed before dropping the column " "family"); -#endif return; } auto cmp = h->GetComparator(); diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 40cd29404456..0ff1053ca6a8 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -126,7 +126,7 @@ inline void DynamicBloom::MayContain(int num_keys, Slice* keys, std::array byte_offsets; for (int i = 0; i < num_keys; ++i) { hashes[i] = BloomHash(keys[i]); - size_t a = FastRange32(kLen, hashes[i]); + size_t a = FastRange32(hashes[i], kLen); PREFETCH(data_ + a, 0, 3); byte_offsets[i] = a; } @@ -142,7 +142,7 @@ inline void DynamicBloom::MayContain(int num_keys, Slice* keys, #pragma warning(disable : 4189) #endif inline void DynamicBloom::Prefetch(uint32_t h32) { - size_t a = FastRange32(kLen, h32); + size_t a = FastRange32(h32, kLen); PREFETCH(data_ + a, 0, 3); } #if defined(_MSC_VER) @@ -171,7 +171,7 @@ inline void DynamicBloom::Prefetch(uint32_t h32) { // because of false positives.) 
inline bool DynamicBloom::MayContainHash(uint32_t h32) const { - size_t a = FastRange32(kLen, h32); + size_t a = FastRange32(h32, kLen); PREFETCH(data_ + a, 0, 3); return DoubleProbe(h32, a); } @@ -195,7 +195,7 @@ inline bool DynamicBloom::DoubleProbe(uint32_t h32, size_t byte_offset) const { template inline void DynamicBloom::AddHash(uint32_t h32, const OrFunc& or_func) { - size_t a = FastRange32(kLen, h32); + size_t a = FastRange32(h32, kLen); PREFETCH(data_ + a, 0, 3); // Expand/remix with 64-bit golden ratio uint64_t h = 0x9e3779b97f4a7c13ULL * h32; diff --git a/util/file_checksum_helper.cc b/util/file_checksum_helper.cc index a7392035245a..b8c4099b805f 100644 --- a/util/file_checksum_helper.cc +++ b/util/file_checksum_helper.cc @@ -98,6 +98,8 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, return Status::InvalidArgument("checksum_list is nullptr"); } assert(checksum_list); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; checksum_list->reset(); Status s; @@ -125,7 +127,8 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, reporter.status_ptr = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - FileChecksumRetriever retriever(manifest_file_size, *checksum_list); + FileChecksumRetriever retriever(read_options, manifest_file_size, + *checksum_list); retriever.Iterate(reader, &s); assert(!retriever.status().ok() || manifest_file_size == std::numeric_limits::max() || @@ -134,7 +137,6 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, return retriever.status(); } -#ifndef ROCKSDB_LITE namespace { static int RegisterFileChecksumGenFactories(ObjectLibrary& library, const std::string& /*arg*/) { @@ -149,23 +151,19 @@ static int RegisterFileChecksumGenFactories(ObjectLibrary& library, return 1; } } // namespace -#endif // !ROCKSDB_LITE Status FileChecksumGenFactory::CreateFromString( const ConfigOptions& options, const std::string& value, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterFileChecksumGenFactories(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE if (value == FileChecksumGenCrc32cFactory::kClassName()) { *result = GetFileChecksumGenCrc32cFactory(); return Status::OK(); } else { - Status s = LoadSharedObject(options, value, nullptr, - result); + Status s = LoadSharedObject(options, value, result); return s; } } diff --git a/util/file_checksum_helper.h b/util/file_checksum_helper.h index d622e9bba05c..52469cf9f987 100644 --- a/util/file_checksum_helper.h +++ b/util/file_checksum_helper.h @@ -8,6 +8,7 @@ #include #include "port/port.h" +#include "rocksdb/env.h" #include "rocksdb/file_checksum.h" #include "rocksdb/status.h" #include "util/coding.h" diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index e778efc3c549..68776612b905 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -206,11 +206,7 @@ TEST_F(WritableFileWriterTest, IncrementalBuffer) { (attempt < kNumAttempts / 2) ? 
512 * 1024 : 700 * 1024; std::string actual; std::unique_ptr wf(new FakeWF(&actual, -#ifndef ROCKSDB_LITE attempt % 2 == 1, -#else - false, -#endif no_flush)); std::unique_ptr writer(new WritableFileWriter( std::move(wf), "" /* don't care */, env_options)); @@ -421,7 +417,6 @@ TEST_F(DBWritableFileWriterTest, AppendWithChecksumRateLimiter) { Destroy(options); } -#ifndef ROCKSDB_LITE TEST_F(WritableFileWriterTest, AppendStatusReturn) { class FakeWF : public FSWritableFile { public: @@ -477,7 +472,6 @@ TEST_F(WritableFileWriterTest, AppendStatusReturn) { fwf->SetIOError(true); ASSERT_NOK(writer->Append(std::string(2 * kMb, 'b'))); } -#endif class ReadaheadRandomAccessFileTest : public testing::Test, @@ -792,7 +786,6 @@ TEST(LineFileReaderTest, LineFileReaderTest) { } } -#ifndef ROCKSDB_LITE class IOErrorEventListener : public EventListener { public: IOErrorEventListener() { notify_error_.store(0); } @@ -908,7 +901,6 @@ TEST_F(DBWritableFileWriterTest, IOErrorNotification) { ASSERT_EQ(listener->NotifyErrorCount(), 2); fwf->CheckCounters(1, 1); } -#endif // ROCKSDB_LITE class WritableFileWriterIOPriorityTest : public testing::Test { protected: diff --git a/util/filelock_test.cc b/util/filelock_test.cc index 69947a732e1e..82021aec99a6 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -11,6 +11,9 @@ #include #include #endif +#ifdef __OpenBSD__ +#include +#endif #include #include "test_util/testharness.h" diff --git a/util/filter_bench.cc b/util/filter_bench.cc index 93186cd08419..4e6dabf57684 100644 --- a/util/filter_bench.cc +++ b/util/filter_bench.cc @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#if !defined(GFLAGS) || defined(ROCKSDB_LITE) +#if !defined(GFLAGS) #include int main() { - fprintf(stderr, "filter_bench requires gflags and !ROCKSDB_LITE\n"); + fprintf(stderr, "filter_bench requires gflags\n"); return 1; } #else @@ -728,7 +728,7 @@ double FilterBench::RandomQueryTest(uint32_t inside_threshold, bool dry_run, batch_slices[i], /*no_io=*/false, /*const_ikey_ptr=*/nullptr, /*get_context=*/nullptr, - /*lookup_context=*/nullptr, Env::IO_TOTAL); + /*lookup_context=*/nullptr, ROCKSDB_NAMESPACE::ReadOptions()); } } else { if (dry_run) { @@ -837,4 +837,4 @@ int main(int argc, char **argv) { return 0; } -#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE) +#endif // !defined(GFLAGS) diff --git a/util/gflags_compat.h b/util/gflags_compat.h index b6f88a5bcf11..8f4a30b0d661 100644 --- a/util/gflags_compat.h +++ b/util/gflags_compat.h @@ -15,16 +15,15 @@ #endif #ifndef DEFINE_uint32 -// DEFINE_uint32 does not appear in older versions of gflags. This should be -// a sane definition for those versions. +// DEFINE_uint32 / DECLARE_uint32 do not appear in older versions of gflags. +// These should be sane definitions for those versions. 
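A hedged sketch of what the simplified shim defined just below permits at call sites on older gflags (the flag name is invented for illustration): reads and writes go through a plain uint32_t reference instead of a std::reference_wrapper.

DEFINE_uint32(example_threads, 8, "hypothetical flag, for illustration only");

void UseFlag() {
  uint32_t n = FLAGS_example_threads;  // reads as an ordinary uint32_t
  FLAGS_example_threads = n * 2;       // writes through the same reference
}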
#include -#define DEFINE_uint32(name, val, txt) \ - namespace gflags_compat { \ - DEFINE_int32(name, val, txt); \ - } \ - std::reference_wrapper FLAGS_##name = \ - std::ref(*reinterpret_cast(&gflags_compat::FLAGS_##name)); +#define DEFINE_uint32(name, val, txt) \ + namespace gflags_compat { \ + DEFINE_int32(name, val, txt); \ + } \ + uint32_t &FLAGS_##name = \ + *reinterpret_cast(&gflags_compat::FLAGS_##name); -#define DECLARE_uint32(name) \ - extern std::reference_wrapper FLAGS_##name; +#define DECLARE_uint32(name) extern uint32_t &FLAGS_##name; #endif // !DEFINE_uint32 diff --git a/util/hash.h b/util/hash.h index eafa47f34692..7a24659ad1c9 100644 --- a/util/hash.h +++ b/util/hash.h @@ -128,10 +128,14 @@ inline uint32_t Upper32of64(uint64_t v) { } inline uint32_t Lower32of64(uint64_t v) { return static_cast(v); } -// std::hash compatible interface. -// TODO: consider rename to SliceHasher32 -struct SliceHasher { +// std::hash-like interface. +struct SliceHasher32 { uint32_t operator()(const Slice& s) const { return GetSliceHash(s); } }; +struct SliceNPHasher64 { + uint64_t operator()(const Slice& s, uint64_t seed = 0) const { + return GetSliceNPHash64(s, seed); + } +}; } // namespace ROCKSDB_NAMESPACE diff --git a/util/hash_test.cc b/util/hash_test.cc index 72112b044813..ccc283a24376 100644 --- a/util/hash_test.cc +++ b/util/hash_test.cc @@ -565,6 +565,8 @@ size_t FastRange64(uint64_t hash, size_t range) { // Tests for math.h / math128.h (not worth a separate test binary) using ROCKSDB_NAMESPACE::BitParity; using ROCKSDB_NAMESPACE::BitsSetToOne; +using ROCKSDB_NAMESPACE::BitwiseAnd; +using ROCKSDB_NAMESPACE::BottomNBits; using ROCKSDB_NAMESPACE::ConstexprFloorLog2; using ROCKSDB_NAMESPACE::CountTrailingZeroBits; using ROCKSDB_NAMESPACE::DecodeFixed128; @@ -580,6 +582,19 @@ using ROCKSDB_NAMESPACE::Upper64of128; int blah(int x) { return DownwardInvolution(x); } +template +static void test_BitwiseAnd(T1 v1, T2 v2) { + auto a = BitwiseAnd(v1, v2); + // Essentially repeating the implementation :-/ + if constexpr (sizeof(T1) < sizeof(T2)) { + static_assert(std::is_same_v); + EXPECT_EQ(a, static_cast(v1 & v2)); + } else { + static_assert(std::is_same_v); + EXPECT_EQ(a, static_cast(v1 & v2)); + } +} + template static void test_BitOps() { // This complex code is to generalize to 128-bit values. 
Otherwise @@ -598,6 +613,22 @@ static void test_BitOps() { // If we could directly use arithmetic: // T vm1 = static_cast(v - 1); + // BottomNBits + { + // An essentially full length value + T x = everyOtherBit; + if (i > 2) { + // Make it slightly irregular + x = x ^ (T{1} << (i / 2)); + } + auto a = BottomNBits(x, i); + auto b = BottomNBits(~x, i); + EXPECT_EQ(x | a, x); + EXPECT_EQ(a | b, vm1); + EXPECT_EQ(a & b, T{0}); + EXPECT_EQ(BottomNBits(x ^ a, i), T{0}); + } + // FloorLog2 if (v > 0) { EXPECT_EQ(FloorLog2(v), i); @@ -707,9 +738,22 @@ static void test_BitOps() { } } + // BitwiseAnd + { + test_BitwiseAnd(vm1, static_cast(0x99)); + test_BitwiseAnd(v, static_cast(0x99)); + test_BitwiseAnd(char{0x66}, vm1); + test_BitwiseAnd(char{0x66}, v); + test_BitwiseAnd(v, int16_t{0x6699}); + test_BitwiseAnd(v, uint16_t{0x9966}); + test_BitwiseAnd(int64_t{0x1234234534564567}, v); + test_BitwiseAnd(uint64_t{0x9876876576545432}, v); + } + vm1 = (vm1 << 1) | 1; } + // ConstexprFloorLog2 EXPECT_EQ(ConstexprFloorLog2(T{1}), 0); EXPECT_EQ(ConstexprFloorLog2(T{2}), 1); EXPECT_EQ(ConstexprFloorLog2(T{3}), 1); diff --git a/util/math.h b/util/math.h index da31b43ecdb2..e1948e0a313e 100644 --- a/util/math.h +++ b/util/math.h @@ -9,19 +9,47 @@ #ifdef _MSC_VER #include #endif +#ifdef __BMI2__ +#include +#endif #include #include +#include "port/lang.h" #include "rocksdb/rocksdb_namespace.h" +ASSERT_FEATURE_COMPAT_HEADER(); + namespace ROCKSDB_NAMESPACE { +// Fast implementation of extracting the bottom n bits of an integer. +// To ensure fast implementation, undefined if n bits is full width or more. +template +inline T BottomNBits(T v, int nbits) { + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + assert(nbits >= 0); + assert(nbits < int{8 * sizeof(T)}); +#ifdef __BMI2__ + if constexpr (sizeof(T) <= 4) { + return static_cast(_bzhi_u32(static_cast(v), nbits)); + } + if constexpr (sizeof(T) <= 8) { + return static_cast(_bzhi_u64(static_cast(v), nbits)); + } +#endif + // Newer compilers compile this down to bzhi on x86, but some older + // ones don't, thus the need for the intrinsic above. + return static_cast(v & ((T{1} << nbits) - 1)); +} + // Fast implementation of floor(log2(v)). Undefined for 0 or negative // numbers (in case of signed type). template inline int FloorLog2(T v) { - static_assert(std::is_integral::value, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); assert(v > 0); #ifdef _MSC_VER static_assert(sizeof(T) <= sizeof(uint64_t), "type too big"); @@ -60,6 +88,8 @@ inline int FloorLog2(T v) { // Constexpr version of FloorLog2 template constexpr int ConstexprFloorLog2(T v) { + // NOTE: not checking is_integral so that this works with Unsigned128 + static_assert(!std::is_reference_v, "use std::remove_reference_t"); int rv = 0; while (v > T{1}) { ++rv; @@ -71,7 +101,8 @@ constexpr int ConstexprFloorLog2(T v) { // Number of low-order zero bits before the first 1 bit. Undefined for 0. 
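A small illustration (not part of the patch) of what the BottomNBits() helper introduced above is expected to return on either code path, the BMI2 bzhi intrinsic or the shift-and-mask fallback:

uint32_t x = 0xABCD1234u;
assert(BottomNBits(x, 8) == 0x34u);         // keep the low 8 bits
assert(BottomNBits(x, 12) == 0x234u);       // keep the low 12 bits
assert(BottomNBits(uint64_t{x}, 0) == 0u);  // nbits == 0 keeps nothing
// nbits equal to the full width of the type is undefined, so callers guard it.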
template inline int CountTrailingZeroBits(T v) { - static_assert(std::is_integral::value, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); assert(v != 0); #ifdef _MSC_VER static_assert(sizeof(T) <= sizeof(uint64_t), "type too big"); @@ -112,6 +143,9 @@ namespace detail { template int BitsSetToOneFallback(T v) { + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + const int kBits = static_cast(sizeof(T)) * 8; static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits"); // we static_cast these bit patterns in order to truncate them to the correct @@ -137,7 +171,9 @@ int BitsSetToOneFallback(T v) { // Number of bits set to 1. Also known as "population count". template inline int BitsSetToOne(T v) { - static_assert(std::is_integral::value, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + #ifdef _MSC_VER static_assert(sizeof(T) <= sizeof(uint64_t), "type too big"); if (sizeof(T) < sizeof(uint32_t)) { @@ -145,27 +181,29 @@ inline int BitsSetToOne(T v) { constexpr auto mm = 8 * sizeof(uint32_t) - 1; // The bit mask is to neutralize sign extension on small signed types constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1; -#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) +#if __POPCNT__ return static_cast(__popcnt(static_cast(v) & m)); #else return static_cast(detail::BitsSetToOneFallback(v) & m); -#endif +#endif // __POPCNT__ } else if (sizeof(T) == sizeof(uint32_t)) { -#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86)) +#if __POPCNT__ return static_cast(__popcnt(static_cast(v))); #else return detail::BitsSetToOneFallback(static_cast(v)); -#endif +#endif // __POPCNT__ } else { -#if defined(HAVE_SSE42) && defined(_M_X64) +#if __POPCNT__ +#ifdef _M_X64 return static_cast(__popcnt64(static_cast(v))); -#elif defined(HAVE_SSE42) && defined(_M_IX86) +#else return static_cast( __popcnt(static_cast(static_cast(v) >> 32) + __popcnt(static_cast(v)))); +#endif // _M_X64 #else return detail::BitsSetToOneFallback(static_cast(v)); -#endif +#endif // __POPCNT__ } #else static_assert(sizeof(T) <= sizeof(unsigned long long), "type too big"); @@ -187,7 +225,9 @@ inline int BitsSetToOne(T v) { template inline int BitParity(T v) { - static_assert(std::is_integral::value, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + #ifdef _MSC_VER // bit parity == oddness of popcount return BitsSetToOne(v) & 1; @@ -209,7 +249,8 @@ inline int BitParity(T v) { // encode/decode big endian. 
template inline T EndianSwapValue(T v) { - static_assert(std::is_integral::value, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); #ifdef _MSC_VER if (sizeof(T) == 2) { @@ -239,6 +280,9 @@ inline T EndianSwapValue(T v) { // Reverses the order of bits in an integral value template inline T ReverseBits(T v) { + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + T r = EndianSwapValue(v); const T kHighestByte = T{1} << ((sizeof(T) - 1) * 8); const T kEveryByte = kHighestByte | (kHighestByte / 255); @@ -272,7 +316,8 @@ inline T ReverseBits(T v) { // is that all square sub-matrices that include the top row are invertible. template inline T DownwardInvolution(T v) { - static_assert(std::is_integral::value, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); static_assert(sizeof(T) <= 8, "only supported up to 64 bits"); uint64_t r = static_cast(v); @@ -291,4 +336,16 @@ inline T DownwardInvolution(T v) { return static_cast(r); } +// Bitwise-And with typing that allows you to avoid writing an explicit cast +// to the smaller type, or the type of the right parameter if same size. +template +inline std::conditional_t BitwiseAnd(A a, B b) { + static_assert(std::is_integral_v, "non-integral type"); + static_assert(std::is_integral_v, "non-integral type"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + static_assert(!std::is_reference_v, "use std::remove_reference_t"); + using Smaller = std::conditional_t; + return static_cast(a & b); +} + } // namespace ROCKSDB_NAMESPACE diff --git a/util/math128.h b/util/math128.h index ae490051a78b..5f96dbc66daf 100644 --- a/util/math128.h +++ b/util/math128.h @@ -41,13 +41,13 @@ struct Unsigned128 { hi = upper; } - explicit operator uint64_t() { return lo; } - - explicit operator uint32_t() { return static_cast(lo); } - - explicit operator uint16_t() { return static_cast(lo); } - - explicit operator uint8_t() { return static_cast(lo); } + // Convert to any integer 64 bits or less. 
+ template && + sizeof(T) <= sizeof(uint64_t)> > + explicit operator T() { + return static_cast(lo); + } }; inline Unsigned128 operator<<(const Unsigned128& lhs, unsigned shift) { @@ -190,6 +190,16 @@ inline Unsigned128 Multiply64to128(uint64_t a, uint64_t b) { #endif } +template <> +inline Unsigned128 BottomNBits(Unsigned128 v, int nbits) { + if (nbits < 64) { + return BottomNBits(Lower64of128(v), nbits); + } else { + return (Unsigned128{BottomNBits(Upper64of128(v), nbits - 64)} << 64) | + Lower64of128(v); + } +} + template <> inline int FloorLog2(Unsigned128 v) { if (Upper64of128(v) == 0) { @@ -236,6 +246,18 @@ inline Unsigned128 DownwardInvolution(Unsigned128 v) { DownwardInvolution(Upper64of128(v) ^ Lower64of128(v)); } +template +inline std::remove_reference_t BitwiseAnd(A a, Unsigned128 b) { + static_assert(sizeof(A) <= sizeof(Unsigned128)); + return static_cast(a & b); +} + +template +inline std::remove_reference_t BitwiseAnd(Unsigned128 a, B b) { + static_assert(sizeof(B) <= sizeof(Unsigned128)); + return static_cast(a & b); +} + template struct IsUnsignedUpTo128 : std::integral_constant::value || diff --git a/util/mutexlock.h b/util/mutexlock.h index 94066b29ea44..aecd4f21cb4f 100644 --- a/util/mutexlock.h +++ b/util/mutexlock.h @@ -11,10 +11,14 @@ #include #include +#include +#include #include #include #include "port/port.h" +#include "util/fastrange.h" +#include "util/hash.h" namespace ROCKSDB_NAMESPACE { @@ -128,10 +132,25 @@ class SpinMutex { std::atomic locked_; }; -// We want to prevent false sharing +// For preventing false sharing, especially for mutexes. +// NOTE: if a mutex is less than half the size of a cache line, it would +// make more sense for Striped structure below to pack more than one mutex +// into each cache line, as this would only reduce contention for the same +// amount of space and cache sharing. However, a mutex is often 40 bytes out +// of a 64 byte cache line. template -struct ALIGN_AS(CACHE_LINE_SIZE) LockData { - T lock_; +struct ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedWrapper { + T obj_; +}; +template +struct Unwrap { + using type = T; + static type &Go(T &t) { return t; } +}; +template +struct Unwrap> { + using type = T; + static type &Go(CacheAlignedWrapper &t) { return t.obj_; } }; // @@ -143,38 +162,28 @@ struct ALIGN_AS(CACHE_LINE_SIZE) LockData { // single lock and allowing independent operations to lock different stripes and // proceed concurrently, instead of creating contention for a single lock. 
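For illustration, a sketch of how the reworked Striped below might be used, assuming template parameters roughly of the form Striped&lt;T, Key, Hash = SliceNPHasher64&gt; (the stripe count, key name, and function are hypothetical):

// Assumes "util/mutexlock.h" and "port/port.h" are included.
constexpr size_t kStripeCount = 64;  // hypothetical
Striped<CacheAlignedWrapper<port::Mutex>, Slice> striped_mutexes(kStripeCount);

void WithStripeLocked(const Slice& user_key) {
  // Get() hashes the key, maps it into [0, kStripeCount) with
  // FastRangeGeneric, and unwraps the CacheAlignedWrapper so a plain
  // port::Mutex& comes back.
  port::Mutex& stripe_mutex = striped_mutexes.Get(user_key);
  MutexLock lock(&stripe_mutex);
  // ... operate on state guarded by this stripe ...
}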
// -template +template class Striped { public: - Striped(size_t stripes, std::function hash) - : stripes_(stripes), hash_(hash) { - locks_ = reinterpret_cast *>( - port::cacheline_aligned_alloc(sizeof(LockData) * stripes)); - for (size_t i = 0; i < stripes; i++) { - new (&locks_[i]) LockData(); - } - } + explicit Striped(size_t stripe_count) + : stripe_count_(stripe_count), data_(new T[stripe_count]) {} - virtual ~Striped() { - if (locks_ != nullptr) { - assert(stripes_ > 0); - for (size_t i = 0; i < stripes_; i++) { - locks_[i].~LockData(); - } - port::cacheline_aligned_free(locks_); - } + using Unwrapped = typename Unwrap::type; + Unwrapped &Get(const Key &key, uint64_t seed = 0) { + size_t index = FastRangeGeneric(hash_(key, seed), stripe_count_); + return Unwrap::Go(data_[index]); } - T *get(const P &key) { - uint64_t h = hash_(key); - size_t index = h % stripes_; - return &reinterpret_cast *>(&locks_[index])->lock_; + size_t ApproximateMemoryUsage() const { + // NOTE: could use malloc_usable_size() here, but that could count unmapped + // pages and could mess up unit test OccLockBucketsTest::CacheAligned + return sizeof(*this) + stripe_count_ * sizeof(T); } private: - size_t stripes_; - LockData *locks_; - std::function hash_; + size_t stripe_count_; + std::unique_ptr data_; + Hash hash_; }; } // namespace ROCKSDB_NAMESPACE diff --git a/util/overload.h b/util/overload.h new file mode 100644 index 000000000000..27da816483b2 --- /dev/null +++ b/util/overload.h @@ -0,0 +1,23 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once + +#include "rocksdb/rocksdb_namespace.h" + +namespace ROCKSDB_NAMESPACE { + +// A helper template that can combine multiple functors into a single one to be +// used with std::visit for example. It also works with lambdas, since it +// comes with an explicit deduction guide. +template +struct overload : Ts... { + using Ts::operator()...; +}; + +template +overload(Ts...) -> overload; + +} // namespace ROCKSDB_NAMESPACE diff --git a/util/random.cc b/util/random.cc index c94c28dfb2b5..7ac6ee19a1b7 100644 --- a/util/random.cc +++ b/util/random.cc @@ -6,6 +6,7 @@ #include "util/random.h" +#include #include #include diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 6bbcabfaeef5..e92b3bf7634b 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -7,15 +7,14 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
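The overload helper added in util/overload.h above is the usual C++17 visitation idiom; a minimal sketch of the intended use with std::visit (the variant type and lambdas are invented for the example):

#include <string>
#include <variant>
#include "util/overload.h"

void Describe(const std::variant<int, std::string>& v) {
  std::visit(ROCKSDB_NAMESPACE::overload{
                 [](int i) { /* integer alternative */ (void)i; },
                 [](const std::string& s) { /* string alternative */ (void)s; }},
             v);
}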
-#include "util/rate_limiter.h" - #include -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/port.h" #include "rocksdb/system_clock.h" #include "test_util/sync_point.h" #include "util/aligned_buffer.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { size_t RateLimiter::RequestToken(size_t bytes, size_t alignment, @@ -38,11 +37,10 @@ size_t RateLimiter::RequestToken(size_t bytes, size_t alignment, // Pending request struct GenericRateLimiter::Req { explicit Req(int64_t _bytes, port::Mutex* _mu) - : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {} + : request_bytes(_bytes), bytes(_bytes), cv(_mu) {} int64_t request_bytes; int64_t bytes; port::CondVar cv; - bool granted; }; GenericRateLimiter::GenericRateLimiter( @@ -109,6 +107,23 @@ void GenericRateLimiter::SetBytesPerSecondLocked(int64_t bytes_per_second) { std::memory_order_relaxed); } +Status GenericRateLimiter::SetSingleBurstBytes(int64_t single_burst_bytes) { + if (single_burst_bytes <= 0) { + return Status::InvalidArgument( + "`single_burst_bytes` must be greater than 0"); + } + + MutexLock g(&request_mutex_); + SetSingleBurstBytesLocked(single_burst_bytes); + return Status::OK(); +} + +void GenericRateLimiter::SetSingleBurstBytesLocked(int64_t single_burst_bytes) { + refill_bytes_per_period_.store(single_burst_bytes, std::memory_order_relaxed); + refill_period_us_.store(CalculateRefillPeriodUsLocked(single_burst_bytes), + std::memory_order_relaxed); +} + void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, Statistics* stats) { assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed)); @@ -122,7 +137,8 @@ void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, static const int kRefillsPerTune = 100; std::chrono::microseconds now(NowMicrosMonotonicLocked()); if (now - tuned_time_ >= - kRefillsPerTune * std::chrono::microseconds(refill_period_us_)) { + kRefillsPerTune * std::chrono::microseconds(refill_period_us_.load( + std::memory_order_relaxed))) { Status s = TuneLocked(); s.PermitUncheckedError(); //**TODO: What to do on error? } @@ -137,12 +153,14 @@ void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, ++total_requests_[pri]; - if (available_bytes_ >= bytes) { - // Refill thread assigns quota and notifies requests waiting on - // the queue under mutex. So if we get here, that means nobody - // is waiting? - available_bytes_ -= bytes; - total_bytes_through_[pri] += bytes; + if (available_bytes_ > 0) { + int64_t bytes_through = std::min(available_bytes_, bytes); + total_bytes_through_[pri] += bytes_through; + available_bytes_ -= bytes_through; + bytes -= bytes_through; + } + + if (bytes == 0) { return; } @@ -170,7 +188,7 @@ void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS); ++num_drains_; wait_until_refill_pending_ = true; - r.cv.TimedWait(wait_until); + clock_->TimedWait(&r.cv, std::chrono::microseconds(wait_until)); TEST_SYNC_POINT_CALLBACK("GenericRateLimiter::Request:PostTimedWait", &time_until_refill_us); wait_until_refill_pending_ = false; @@ -179,16 +197,16 @@ void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, // Whichever thread reaches here first performs duty (2) as described // above. 
RefillBytesAndGrantRequestsLocked(); - if (r.granted) { - // If there is any remaining requests, make sure there exists at least - // one candidate is awake for future duties by signaling a front request - // of a queue. - for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) { - std::deque queue = queue_[i]; - if (!queue.empty()) { - queue.front()->cv.Signal(); - break; - } + } + if (r.request_bytes == 0) { + // If there is any remaining requests, make sure there exists at least + // one candidate is awake for future duties by signaling a front request + // of a queue. + for (int i = Env::IO_TOTAL - 1; i >= Env::IO_LOW; --i) { + auto& queue = queue_[i]; + if (!queue.empty()) { + queue.front()->cv.Signal(); + break; } } } @@ -202,13 +220,13 @@ void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri, ++num_found; } } - if (r.granted) { + if (r.request_bytes == 0) { assert(num_found == 0); } else { assert(num_found == 1); } #endif // NDEBUG - } while (!stop_ && !r.granted); + } while (!stop_ && r.request_bytes > 0); if (stop_) { // It is now in the clean-up of ~GenericRateLimiter(). @@ -261,13 +279,13 @@ GenericRateLimiter::GeneratePriorityIterationOrderLocked() { void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { TEST_SYNC_POINT_CALLBACK( "GenericRateLimiter::RefillBytesAndGrantRequestsLocked", &request_mutex_); - next_refill_us_ = NowMicrosMonotonicLocked() + refill_period_us_; + next_refill_us_ = NowMicrosMonotonicLocked() + + refill_period_us_.load(std::memory_order_relaxed); // Carry over the left over quota from the last period auto refill_bytes_per_period = refill_bytes_per_period_.load(std::memory_order_relaxed); - if (available_bytes_ < refill_bytes_per_period) { - available_bytes_ += refill_bytes_per_period; - } + assert(available_bytes_ == 0); + available_bytes_ = refill_bytes_per_period; std::vector pri_iteration_order = GeneratePriorityIterationOrderLocked(); @@ -292,7 +310,6 @@ void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { total_bytes_through_[current_pri] += next_req->bytes; queue->pop_front(); - next_req->granted = true; // Quota granted, signal the thread to exit next_req->cv.Signal(); } @@ -301,13 +318,28 @@ void GenericRateLimiter::RefillBytesAndGrantRequestsLocked() { int64_t GenericRateLimiter::CalculateRefillBytesPerPeriodLocked( int64_t rate_bytes_per_sec) { + int64_t refill_period_us = refill_period_us_.load(std::memory_order_relaxed); if (std::numeric_limits::max() / rate_bytes_per_sec < - refill_period_us_) { + refill_period_us) { + // Avoid unexpected result in the overflow case. The result now is still + // inaccurate but is a number that is large enough. + return std::numeric_limits::max() / kMicrosecondsPerSecond; + } else { + return rate_bytes_per_sec * refill_period_us / kMicrosecondsPerSecond; + } +} + +int64_t GenericRateLimiter::CalculateRefillPeriodUsLocked( + int64_t single_burst_bytes) { + int64_t rate_bytes_per_sec = + rate_bytes_per_sec_.load(std::memory_order_relaxed); + if (std::numeric_limits::max() / single_burst_bytes < + kMicrosecondsPerSecond) { // Avoid unexpected result in the overflow case. The result now is still // inaccurate but is a number that is large enough. 
- return std::numeric_limits::max() / 1000000; + return std::numeric_limits::max() / rate_bytes_per_sec; } else { - return rate_bytes_per_sec * refill_period_us_ / 1000000; + return single_burst_bytes * kMicrosecondsPerSecond / rate_bytes_per_sec; } } @@ -322,10 +354,11 @@ Status GenericRateLimiter::TuneLocked() { std::chrono::microseconds prev_tuned_time = tuned_time_; tuned_time_ = std::chrono::microseconds(NowMicrosMonotonicLocked()); + int64_t refill_period_us = refill_period_us_.load(std::memory_order_relaxed); int64_t elapsed_intervals = (tuned_time_ - prev_tuned_time + - std::chrono::microseconds(refill_period_us_) - + std::chrono::microseconds(refill_period_us) - std::chrono::microseconds(1)) / - std::chrono::microseconds(refill_period_us_); + std::chrono::microseconds(refill_period_us); // We tune every kRefillsPerTune intervals, so the overflow and division-by- // zero conditions should never happen. assert(num_drains_ <= std::numeric_limits::max() / 100); diff --git a/util/rate_limiter.h b/util/rate_limiter_impl.h similarity index 93% rename from util/rate_limiter.h rename to util/rate_limiter_impl.h index 4c078f5a0eef..c6786b048579 100644 --- a/util/rate_limiter.h +++ b/util/rate_limiter_impl.h @@ -36,6 +36,8 @@ class GenericRateLimiter : public RateLimiter { // This API allows user to dynamically change rate limiter's bytes per second. virtual void SetBytesPerSecond(int64_t bytes_per_second) override; + virtual Status SetSingleBurstBytes(int64_t single_burst_bytes) override; + // Request for token to write bytes. If this request can not be satisfied, // the call is blocked. Caller is responsible to make sure // bytes <= GetSingleBurstBytes() and bytes >= 0. Negative bytes @@ -102,11 +104,14 @@ class GenericRateLimiter : public RateLimiter { } private: + static constexpr int kMicrosecondsPerSecond = 1000000; void RefillBytesAndGrantRequestsLocked(); std::vector GeneratePriorityIterationOrderLocked(); int64_t CalculateRefillBytesPerPeriodLocked(int64_t rate_bytes_per_sec); + int64_t CalculateRefillPeriodUsLocked(int64_t single_burst_bytes); Status TuneLocked(); void SetBytesPerSecondLocked(int64_t bytes_per_second); + void SetSingleBurstBytesLocked(int64_t single_burst_bytes); uint64_t NowMicrosMonotonicLocked() { return clock_->NowNanos() / std::milli::den; @@ -115,7 +120,7 @@ class GenericRateLimiter : public RateLimiter { // This mutex guard all internal states mutable port::Mutex request_mutex_; - const int64_t refill_period_us_; + std::atomic refill_period_us_; std::atomic rate_bytes_per_sec_; std::atomic refill_bytes_per_period_; diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index cda134867591..16e7623ac8d9 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
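As a worked example (not part of the patch) of the relationship the new SetSingleBurstBytes()/CalculateRefillPeriodUsLocked() pair maintains, using the same numbers as the RuntimeSingleBurstBytesChange test below: the configured rate stays fixed while the refill period stretches or shrinks with the burst size.

constexpr int64_t kRateBytesPerSec = 400;   // hypothetical rate
constexpr int64_t kSingleBurstBytes = 100;  // hypothetical burst size
// 100 bytes per burst at 400 bytes/sec => one refill every
// 100 * 1,000,000 / 400 = 250,000 microseconds.
constexpr int64_t kRefillPeriodUs =
    kSingleBurstBytes * 1000000 / kRateBytesPerSec;
static_assert(kRefillPeriodUs == 250000, "quarter-second refill period");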
-#include "util/rate_limiter.h" - #include #include #include @@ -17,9 +15,11 @@ #include "db/db_test_util.h" #include "port/port.h" #include "rocksdb/system_clock.h" +#include "test_util/mock_time_env.h" #include "test_util/sync_point.h" #include "test_util/testharness.h" #include "util/random.h" +#include "util/rate_limiter_impl.h" namespace ROCKSDB_NAMESPACE { @@ -416,35 +416,70 @@ TEST_F(RateLimiterTest, LimitChangeTest) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { +TEST_F(RateLimiterTest, AvailableByteSizeExhaustTest) { + SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true); const std::chrono::seconds kTimePerRefill(1); - const int kRefillsPerTune = 100; // needs to match util/rate_limiter.cc - SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true); + // This test makes sure available_bytes_ get exhausted first before queuing + // any remaining bytes when requested_bytes > available_bytes + const int64_t available_bytes_per_period = 500; - auto stats = CreateDBStatistics(); - std::unique_ptr rate_limiter(new GenericRateLimiter( - 1000 /* rate_bytes_per_sec */, + std::shared_ptr limiter = std::make_shared( + available_bytes_per_period, std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, RateLimiter::Mode::kWritesOnly, special_env.GetSystemClock(), - true /* auto_tuned */)); + false /* auto_tuned */); + + // Step 1. Request 100 and wait for the refill + // so that the remaining available bytes are 400 + limiter->Request(100, Env::IO_USER, nullptr /* stats */, + RateLimiter::OpType::kWrite); + special_env.SleepForMicroseconds( + static_cast(std::chrono::microseconds(kTimePerRefill).count())); - // Rate limiter uses `CondVar::TimedWait()`, which does not have access to the - // `Env` to advance its time according to the fake wait duration. The - // workaround is to install a callback that advance the `Env`'s mock time. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "GenericRateLimiter::Request:PostTimedWait", [&](void* arg) { - int64_t time_waited_us = *static_cast(arg); - special_env.SleepForMicroseconds(static_cast(time_waited_us)); + "GenericRateLimiter::Request:PostEnqueueRequest", [&](void* arg) { + port::Mutex* request_mutex = (port::Mutex*)arg; + request_mutex->Unlock(); + // Step 3. Check GetTotalBytesThrough = available_bytes_per_period + // to make sure that the first request (100) and the part of the second + // request (400) made through when the remaining of the second request + // got queued + ASSERT_EQ(available_bytes_per_period, + limiter->GetTotalBytesThrough(Env::IO_USER)); + request_mutex->Lock(); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + // Step 2. 
Request 500, which is greater than the remaining available bytes + // (400) + limiter->Request(500, Env::IO_USER, nullptr /* stats */, + RateLimiter::OpType::kWrite); + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( + "GenericRateLimiter::Request:PostEnqueueRequest"); +} + +TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { + const std::chrono::seconds kTimePerRefill(1); + const int kRefillsPerTune = 100; // needs to match util/rate_limiter.cc + + auto mock_clock = + std::make_shared(Env::Default()->GetSystemClock()); + + auto stats = CreateDBStatistics(); + std::unique_ptr rate_limiter(new GenericRateLimiter( + 1000 /* rate_bytes_per_sec */, + std::chrono::microseconds(kTimePerRefill).count(), 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, mock_clock, true /* auto_tuned */)); + // verify rate limit increases after a sequence of periods where rate limiter // is always drained int64_t orig_bytes_per_sec = rate_limiter->GetSingleBurstBytes(); rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(), RateLimiter::OpType::kWrite); - while (std::chrono::microseconds(special_env.NowMicros()) <= + while (std::chrono::microseconds(mock_clock->NowMicros()) <= kRefillsPerTune * kTimePerRefill) { rate_limiter->Request(orig_bytes_per_sec, Env::IO_HIGH, stats.get(), RateLimiter::OpType::kWrite); @@ -452,13 +487,9 @@ TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { int64_t new_bytes_per_sec = rate_limiter->GetSingleBurstBytes(); ASSERT_GT(new_bytes_per_sec, orig_bytes_per_sec); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( - "GenericRateLimiter::Request:PostTimedWait"); - // decreases after a sequence of periods where rate limiter is not drained orig_bytes_per_sec = new_bytes_per_sec; - special_env.SleepForMicroseconds(static_cast( + mock_clock->SleepForMicroseconds(static_cast( kRefillsPerTune * std::chrono::microseconds(kTimePerRefill).count())); // make a request so tuner can be triggered rate_limiter->Request(1 /* bytes */, Env::IO_HIGH, stats.get(), @@ -467,6 +498,90 @@ TEST_F(RateLimiterTest, AutoTuneIncreaseWhenFull) { ASSERT_LT(new_bytes_per_sec, orig_bytes_per_sec); } +TEST_F(RateLimiterTest, WaitHangingBug) { + // At t=0: Threads 0 and 1 request `kBytesPerRefill` bytes at low-pri. One + // will be granted immediately and the other will enter `TimedWait()`. + // + // At t=`kMicrosPerRefill`: Thread 2 requests `kBytesPerRefill` bytes at + // low-pri. Thread 2's request enters the queue. To expose the bug scenario, + // `SyncPoint`s ensure this happens while the lock is temporarily released in + // `TimedWait()`. Before the bug fix, Thread 2's request would then hang in + // `Wait()` interminably. 
+ const int kBytesPerSecond = 100; + const int kMicrosPerSecond = 1000 * 1000; + const int kMicrosPerRefill = kMicrosPerSecond; + const int kBytesPerRefill = + kBytesPerSecond * kMicrosPerRefill / kMicrosPerSecond; + + auto mock_clock = + std::make_shared(Env::Default()->GetSystemClock()); + std::unique_ptr limiter(new GenericRateLimiter( + kBytesPerSecond, kMicrosPerRefill, 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, mock_clock, false /* auto_tuned */)); + std::array request_threads; + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( + {{"RateLimiterTest::WaitHangingBug:InitialRequestsReady", + "MockSystemClock::TimedWait:UnlockedPreSleep"}, + {"MockSystemClock::TimedWait:UnlockedPostSleep1", + "RateLimiterTest::WaitHangingBug:TestThreadRequestBegin"}, + {"RateLimiterTest::WaitHangingBug:TestThreadRequestEnd", + "MockSystemClock::TimedWait:UnlockedPostSleep2"}}); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + for (int i = 0; i < 2; i++) { + request_threads[i] = std::thread([&]() { + limiter->Request(kBytesPerRefill /* bytes */, Env::IOPriority::IO_LOW, + nullptr /* stats */, RateLimiter::OpType::kWrite); + }); + } + while (limiter->GetTotalRequests() < 2) { + } + TEST_SYNC_POINT("RateLimiterTest::WaitHangingBug:InitialRequestsReady"); + + TEST_SYNC_POINT("RateLimiterTest::WaitHangingBug:TestThreadRequestBegin"); + request_threads[2] = std::thread([&]() { + limiter->Request(kBytesPerRefill /* bytes */, Env::IOPriority::IO_LOW, + nullptr /* stats */, RateLimiter::OpType::kWrite); + }); + while (limiter->GetTotalRequests() < 3) { + } + TEST_SYNC_POINT("RateLimiterTest::WaitHangingBug:TestThreadRequestEnd"); + + for (int i = 0; i < 3; i++) { + request_threads[i].join(); + } +} + +TEST_F(RateLimiterTest, RuntimeSingleBurstBytesChange) { + constexpr int kMicrosecondsPerSecond = 1000000; + + const int64_t kRateBytesPerSec = 400; + + const int64_t kOldSingleBurstBytes = 100; + const int64_t kOldRefillPeriodUs = + kOldSingleBurstBytes * kMicrosecondsPerSecond / kRateBytesPerSec; + const int64_t kNewSingleBurstBytes = kOldSingleBurstBytes * 2; + + SpecialEnv special_env(Env::Default(), /*time_elapse_only_sleep*/ true); + std::unique_ptr limiter(new GenericRateLimiter( + kRateBytesPerSec, kOldRefillPeriodUs, 10 /* fairness */, + RateLimiter::Mode::kWritesOnly, special_env.GetSystemClock(), + false /* auto_tuned */)); + + ASSERT_EQ(kOldSingleBurstBytes, limiter->GetSingleBurstBytes()); + + ASSERT_TRUE(limiter->SetSingleBurstBytes(0).IsInvalidArgument()); + ASSERT_OK(limiter->SetSingleBurstBytes(kNewSingleBurstBytes)); + ASSERT_EQ(kNewSingleBurstBytes, limiter->GetSingleBurstBytes()); + + // If the updated single burst bytes is not reflected in the bytes + // granting process, this request will hang forever. 
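+  // (The request below is for the new burst size, 200 bytes; a limiter still
+  // enforcing the old 100-byte burst could never grant it in full.)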
+ limiter->Request(limiter->GetSingleBurstBytes() /* bytes */, + Env::IOPriority::IO_USER, nullptr /* stats */, + RateLimiter::OpType::kWrite); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/util/ribbon_test.cc b/util/ribbon_test.cc index 6519df3d5fb2..cd1c437c33b3 100644 --- a/util/ribbon_test.cc +++ b/util/ribbon_test.cc @@ -104,11 +104,11 @@ struct StandardKeyGen { return str_; } - bool operator==(const StandardKeyGen& other) { + bool operator==(const StandardKeyGen& other) const { // Same prefix is assumed return id_ == other.id_; } - bool operator!=(const StandardKeyGen& other) { + bool operator!=(const StandardKeyGen& other) const { // Same prefix is assumed return id_ != other.id_; } @@ -144,8 +144,8 @@ struct SmallKeyGen { return str_; } - bool operator==(const SmallKeyGen& other) { return id_ == other.id_; } - bool operator!=(const SmallKeyGen& other) { return id_ != other.id_; } + bool operator==(const SmallKeyGen& other) const { return id_ == other.id_; } + bool operator!=(const SmallKeyGen& other) const { return id_ != other.id_; } uint64_t id_; std::string str_; @@ -1069,11 +1069,11 @@ struct PhsfInputGen { const std::pair* operator->() { return &**this; } - bool operator==(const PhsfInputGen& other) { + bool operator==(const PhsfInputGen& other) const { // Same prefix is assumed return id_ == other.id_; } - bool operator!=(const PhsfInputGen& other) { + bool operator!=(const PhsfInputGen& other) const { // Same prefix is assumed return id_ != other.id_; } diff --git a/util/single_thread_executor.h b/util/single_thread_executor.h index c69f2a292132..652be5052643 100644 --- a/util/single_thread_executor.h +++ b/util/single_thread_executor.h @@ -8,6 +8,7 @@ #if USE_COROUTINES #include +#include #include "folly/CPortability.h" #include "folly/CppAttributes.h" diff --git a/util/slice.cc b/util/slice.cc index 1fa21afcb2d2..22dd7ee6bb15 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -156,7 +156,6 @@ const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { const SliceTransform* NewNoopTransform() { return new NoopTransform; } -#ifndef ROCKSDB_LITE static int RegisterBuiltinSliceTransform(ObjectLibrary& library, const std::string& /*arg*/) { // For the builtin transforms, the format is typically @@ -212,17 +211,14 @@ static int RegisterBuiltinSliceTransform(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE Status SliceTransform::CreateFromString( const ConfigOptions& config_options, const std::string& value, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinSliceTransform(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE std::string id; std::unordered_map opt_map; Status status = Customizable::GetOptionsMap(config_options, result->get(), @@ -232,39 +228,7 @@ Status SliceTransform::CreateFromString( } else if (id.empty() && opt_map.empty()) { result->reset(); } else { -#ifndef ROCKSDB_LITE status = config_options.registry->NewSharedObject(id, result); -#else - auto Matches = [](const std::string& input, size_t size, - const char* pattern, char sep) { - auto plen = strlen(pattern); - return (size > plen + 2 && input[plen] == sep && - StartsWith(input, pattern)); - }; - - auto size = id.size(); - if (id == NoopTransform::kClassName()) { - result->reset(NewNoopTransform()); - } else if (Matches(id, size, FixedPrefixTransform::kNickName(), ':')) { - auto fixed = 
strlen(FixedPrefixTransform::kNickName()); - auto len = ParseSizeT(id.substr(fixed + 1)); - result->reset(NewFixedPrefixTransform(len)); - } else if (Matches(id, size, CappedPrefixTransform::kNickName(), ':')) { - auto capped = strlen(CappedPrefixTransform::kNickName()); - auto len = ParseSizeT(id.substr(capped + 1)); - result->reset(NewCappedPrefixTransform(len)); - } else if (Matches(id, size, CappedPrefixTransform::kClassName(), '.')) { - auto capped = strlen(CappedPrefixTransform::kClassName()); - auto len = ParseSizeT(id.substr(capped + 1)); - result->reset(NewCappedPrefixTransform(len)); - } else if (Matches(id, size, FixedPrefixTransform::kClassName(), '.')) { - auto fixed = strlen(FixedPrefixTransform::kClassName()); - auto len = ParseSizeT(id.substr(fixed + 1)); - result->reset(NewFixedPrefixTransform(len)); - } else { - status = Status::NotSupported("Cannot load object in LITE mode ", id); - } -#endif // ROCKSDB_LITE if (config_options.ignore_unsupported_options && status.IsNotSupported()) { return Status::OK(); } else if (status.ok()) { @@ -277,13 +241,11 @@ Status SliceTransform::CreateFromString( } std::string SliceTransform::AsString() const { -#ifndef ROCKSDB_LITE if (HasRegisteredOptions()) { ConfigOptions opts; opts.delimiter = ";"; return ToString(opts); } -#endif // ROCKSDB_LITE return GetId(); } diff --git a/util/slice_test.cc b/util/slice_test.cc index e1c35d567f36..e82547494b06 100644 --- a/util/slice_test.cc +++ b/util/slice_test.cc @@ -173,13 +173,104 @@ class SmallEnumSetTest : public testing::Test { ~SmallEnumSetTest() {} }; -TEST_F(SmallEnumSetTest, SmallSetTest) { - FileTypeSet fs; +TEST_F(SmallEnumSetTest, SmallEnumSetTest1) { + FileTypeSet fs; // based on a legacy enum type + ASSERT_TRUE(fs.empty()); ASSERT_TRUE(fs.Add(FileType::kIdentityFile)); + ASSERT_FALSE(fs.empty()); ASSERT_FALSE(fs.Add(FileType::kIdentityFile)); ASSERT_TRUE(fs.Add(FileType::kInfoLogFile)); ASSERT_TRUE(fs.Contains(FileType::kIdentityFile)); ASSERT_FALSE(fs.Contains(FileType::kDBLockFile)); + ASSERT_FALSE(fs.empty()); + ASSERT_FALSE(fs.Remove(FileType::kDBLockFile)); + ASSERT_TRUE(fs.Remove(FileType::kIdentityFile)); + ASSERT_FALSE(fs.empty()); + ASSERT_TRUE(fs.Remove(FileType::kInfoLogFile)); + ASSERT_TRUE(fs.empty()); +} + +namespace { +enum class MyEnumClass { A, B, C }; +} // namespace + +using MyEnumClassSet = SmallEnumSet; + +TEST_F(SmallEnumSetTest, SmallEnumSetTest2) { + MyEnumClassSet s; // based on an enum class type + ASSERT_TRUE(s.Add(MyEnumClass::A)); + ASSERT_TRUE(s.Contains(MyEnumClass::A)); + ASSERT_FALSE(s.Contains(MyEnumClass::B)); + ASSERT_TRUE(s.With(MyEnumClass::B).Contains(MyEnumClass::B)); + ASSERT_TRUE(s.With(MyEnumClass::A).Contains(MyEnumClass::A)); + ASSERT_FALSE(s.Contains(MyEnumClass::B)); + ASSERT_FALSE(s.Without(MyEnumClass::A).Contains(MyEnumClass::A)); + ASSERT_FALSE( + s.With(MyEnumClass::B).Without(MyEnumClass::B).Contains(MyEnumClass::B)); + ASSERT_TRUE( + s.Without(MyEnumClass::B).With(MyEnumClass::B).Contains(MyEnumClass::B)); + ASSERT_TRUE(s.Contains(MyEnumClass::A)); + + const MyEnumClassSet cs = s; + ASSERT_TRUE(cs.Contains(MyEnumClass::A)); + ASSERT_EQ(cs, MyEnumClassSet{MyEnumClass::A}); + ASSERT_EQ(cs.Without(MyEnumClass::A), MyEnumClassSet{}); + ASSERT_EQ(cs, MyEnumClassSet::All().Without(MyEnumClass::B, MyEnumClass::C)); + ASSERT_EQ(cs.With(MyEnumClass::B, MyEnumClass::C), MyEnumClassSet::All()); + ASSERT_EQ( + MyEnumClassSet::All(), + MyEnumClassSet{}.With(MyEnumClass::A, MyEnumClass::B, MyEnumClass::C)); + ASSERT_NE(cs, 
MyEnumClassSet{MyEnumClass::B}); + ASSERT_NE(cs, MyEnumClassSet::All()); + + int count = 0; + for (MyEnumClass e : cs) { + ASSERT_EQ(e, MyEnumClass::A); + ++count; + } + ASSERT_EQ(count, 1); + + count = 0; + for (MyEnumClass e : MyEnumClassSet::All().Without(MyEnumClass::B)) { + ASSERT_NE(e, MyEnumClass::B); + ++count; + } + ASSERT_EQ(count, 2); + + for (MyEnumClass e : MyEnumClassSet{}) { + (void)e; + assert(false); + } +} + +// ***************************************************************** // +// Unit test for Status +TEST(StatusTest, Update) { + const Status ok = Status::OK(); + const Status inc = Status::Incomplete("blah"); + const Status notf = Status::NotFound("meow"); + + Status s = ok; + ASSERT_TRUE(s.UpdateIfOk(Status::Corruption("bad")).IsCorruption()); + ASSERT_TRUE(s.IsCorruption()); + + s = ok; + ASSERT_TRUE(s.UpdateIfOk(Status::OK()).ok()); + ASSERT_TRUE(s.UpdateIfOk(ok).ok()); + ASSERT_TRUE(s.ok()); + + ASSERT_TRUE(s.UpdateIfOk(inc).IsIncomplete()); + ASSERT_TRUE(s.IsIncomplete()); + + ASSERT_TRUE(s.UpdateIfOk(notf).IsIncomplete()); + ASSERT_TRUE(s.UpdateIfOk(ok).IsIncomplete()); + ASSERT_TRUE(s.IsIncomplete()); + + // Keeps left-most non-OK status + s = ok; + ASSERT_TRUE( + s.UpdateIfOk(Status()).UpdateIfOk(notf).UpdateIfOk(inc).IsNotFound()); + ASSERT_TRUE(s.IsNotFound()); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc index 64ac8bb1f322..18b0ea51f327 100644 --- a/util/slice_transform_test.cc +++ b/util/slice_transform_test.cc @@ -91,8 +91,8 @@ class SliceTransformDBTest : public testing::Test { }; namespace { -uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { - return options.statistics->getTickerCount(ticker_type); +uint64_t PopTicker(const Options& options, Tickers ticker_type) { + return options.statistics->getAndResetTickerCount(ticker_type); } } // namespace @@ -121,28 +121,33 @@ TEST_F(SliceTransformDBTest, CapPrefix) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->value().ToString(), "bar"); - ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 1U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTERED), 0U); iter->Seek("foo2"); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTERED), 1U); iter->Seek("barbarbar"); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->value().ToString(), "foo"); - ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 1U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTERED), 0U); iter->Seek("barfoofoo"); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTERED), 1U); iter->Seek("foobarbar"); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); - ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); + EXPECT_EQ(PopTicker(last_options_, NON_LAST_LEVEL_SEEK_FILTER_MATCH), 0U); + EXPECT_EQ(PopTicker(last_options_, 
NON_LAST_LEVEL_SEEK_FILTERED), 1U); } } // namespace ROCKSDB_NAMESPACE diff --git a/util/status.cc b/util/status.cc index 1156b10ef498..160755d54d73 100644 --- a/util/status.cc +++ b/util/status.cc @@ -41,9 +41,11 @@ static const char* msgs[static_cast(Status::kMaxSubCode)] = { "Insufficient capacity for merge operands", // kManualCompactionPaused "Manual compaction paused", - " (overwritten)", // kOverwritten, subcode of OK - "Txn not prepared", // kTxnNotPrepared - "IO fenced off", // kIOFenced + " (overwritten)", // kOverwritten, subcode of OK + "Txn not prepared", // kTxnNotPrepared + "IO fenced off", // kIOFenced + "Merge operator failed", // kMergeOperatorFailed + "Number of operands merged exceeded threshold", // kMergeOperandThresholdExceeded }; Status::Status(Code _code, SubCode _subcode, const Slice& msg, diff --git a/util/stop_watch.h b/util/stop_watch.h index e26380d97cb0..28781304577d 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -4,28 +4,38 @@ // (found in the LICENSE.Apache file in the root directory). // #pragma once -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/system_clock.h" namespace ROCKSDB_NAMESPACE { // Auto-scoped. -// Records the measure time into the corresponding histogram if statistics -// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr -// and overwrite is true, it will be added to *elapsed if overwrite is false. +// When statistics is not nullptr, records the measured time into any enabled +// histograms supplied to the constructor. A histogram argument may be omitted +// by setting it to Histograms::HISTOGRAM_ENUM_MAX. It is also saved into +// *elapsed if the pointer is not nullptr and overwrite is true, it will be +// added to *elapsed if overwrite is false. class StopWatch { public: StopWatch(SystemClock* clock, Statistics* statistics, - const uint32_t hist_type, uint64_t* elapsed = nullptr, - bool overwrite = true, bool delay_enabled = false) + const uint32_t hist_type_1, + const uint32_t hist_type_2 = Histograms::HISTOGRAM_ENUM_MAX, + uint64_t* elapsed = nullptr, bool overwrite = true, + bool delay_enabled = false) : clock_(clock), statistics_(statistics), - hist_type_(hist_type), + hist_type_1_(statistics && statistics->HistEnabledForType(hist_type_1) + ? hist_type_1 + : Histograms::HISTOGRAM_ENUM_MAX), + hist_type_2_(statistics && statistics->HistEnabledForType(hist_type_2) + ? hist_type_2 + : Histograms::HISTOGRAM_ENUM_MAX), elapsed_(elapsed), overwrite_(overwrite), stats_enabled_(statistics && - statistics->get_stats_level() >= + statistics->get_stats_level() > StatsLevel::kExceptTimers && - statistics->HistEnabledForType(hist_type)), + (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX || + hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX)), delay_enabled_(delay_enabled), total_delay_(0), delay_start_time_(0), @@ -44,10 +54,15 @@ class StopWatch { *elapsed_ -= total_delay_; } if (stats_enabled_) { - statistics_->reportTimeToHistogram( - hist_type_, (elapsed_ != nullptr) - ? *elapsed_ - : (clock_->NowMicros() - start_time_)); + const auto time = (elapsed_ != nullptr) + ? 
*elapsed_ + : (clock_->NowMicros() - start_time_); + if (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_1_, time); + } + if (hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_2_, time); + } } } @@ -75,7 +90,8 @@ class StopWatch { private: SystemClock* clock_; Statistics* statistics_; - const uint32_t hist_type_; + const uint32_t hist_type_1_; + const uint32_t hist_type_2_; uint64_t* elapsed_; bool overwrite_; bool stats_enabled_; @@ -110,6 +126,8 @@ class StopWatchNano { return (clock_ != nullptr) ? ElapsedNanos(reset) : 0U; } + bool IsStarted() { return start_ != 0; } + private: SystemClock* clock_; uint64_t start_; diff --git a/util/string_util.cc b/util/string_util.cc index 324482a4cd7d..57207889f1a3 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -286,7 +286,6 @@ bool StartsWith(const std::string& string, const std::string& pattern) { return string.compare(0, pattern.size(), pattern) == 0; } -#ifndef ROCKSDB_LITE bool ParseBoolean(const std::string& type, const std::string& value) { if (value == "true" || value == "1") { @@ -325,7 +324,6 @@ int32_t ParseInt32(const std::string& value) { } } -#endif uint64_t ParseUint64(const std::string& value) { size_t endchar; @@ -439,6 +437,45 @@ bool SerializeIntVector(const std::vector& vec, std::string* value) { return true; } +int ParseTimeStringToSeconds(const std::string& value) { + int hours, minutes; + char colon; + + std::istringstream stream(value); + stream >> hours >> colon >> minutes; + + if (stream.fail() || !stream.eof() || colon != ':') { + return -1; + } + + if (hours < 0 || hours > 23 || minutes < 0 || minutes > 59) { + return -1; + } + return hours * 3600 + minutes * 60; +} + +bool TryParseTimeRangeString(const std::string& value, int& start_time, + int& end_time) { + if (value.empty()) { + start_time = 0; + end_time = 0; + return true; + } + auto split = StringSplit(value, '-'); + if (split.size() != 2) { + return false; + } + start_time = ParseTimeStringToSeconds(split[0]); + if (start_time < 0) { + return false; + } + end_time = ParseTimeStringToSeconds(split[1]); + if (end_time < 0) { + return false; + } + return true; +} + // Copied from folly/string.cpp: // https://github.com/facebook/folly/blob/0deef031cb8aab76dc7e736f8b7c22d701d5f36b/folly/String.cpp#L457 // There are two variants of `strerror_r` function, one returns diff --git a/util/string_util.h b/util/string_util.h index 11178fd1d7b0..999081ebba96 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -144,7 +144,6 @@ bool EndsWith(const std::string& string, const std::string& pattern); // Returns true if "string" starts with "pattern" bool StartsWith(const std::string& string, const std::string& pattern); -#ifndef ROCKSDB_LITE bool ParseBoolean(const std::string& type, const std::string& value); uint8_t ParseUint8(const std::string& value); @@ -152,7 +151,6 @@ uint8_t ParseUint8(const std::string& value); uint32_t ParseUint32(const std::string& value); int32_t ParseInt32(const std::string& value); -#endif uint64_t ParseUint64(const std::string& value); @@ -168,6 +166,16 @@ std::vector ParseVectorInt(const std::string& value); bool SerializeIntVector(const std::vector& vec, std::string* value); +// Expects HH:mm format for the input value +// Returns -1 if invalid input. 
Otherwise returns seconds since midnight +int ParseTimeStringToSeconds(const std::string& value); + +// Expects HH:mm-HH:mm format for the input value +// Returns false, if invalid format. +// Otherwise, returns true and start_time and end_time are set +bool TryParseTimeRangeString(const std::string& value, int& start_time, + int& end_time); + extern const std::string kNullptrString; // errnoStr() function returns a string that describes the error code passed in diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index af4e62355956..b5b3378fab94 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -42,6 +42,8 @@ class SimulatedBackgroundTask { std::unique_lock l(mutex_); running_count_++; bg_cv_.notify_all(); + assert(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetEnableTracking(true); Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( operation_type_); diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 25ef5c0eeccf..3d12fe83a7e2 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -559,18 +559,14 @@ TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) { // Triggers the initialization of singletons. Env::Default(); -#ifndef ROCKSDB_LITE try { -#endif // ROCKSDB_LITE ROCKSDB_NAMESPACE::port::Thread th(&AccessThreadLocal, nullptr); th.detach(); TEST_SYNC_POINT("MainThreadDiesFirst:End"); -#ifndef ROCKSDB_LITE } catch (const std::system_error& ex) { std::cerr << "Start thread: " << ex.code() << std::endl; FAIL(); } -#endif // ROCKSDB_LITE } } // namespace ROCKSDB_NAMESPACE diff --git a/util/thread_operation.h b/util/thread_operation.h index c24fccd5c410..4c01782caf6e 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -38,7 +38,17 @@ struct OperationInfo { static OperationInfo global_operation_table[] = { {ThreadStatus::OP_UNKNOWN, ""}, {ThreadStatus::OP_COMPACTION, "Compaction"}, - {ThreadStatus::OP_FLUSH, "Flush"}}; + {ThreadStatus::OP_FLUSH, "Flush"}, + {ThreadStatus::OP_DBOPEN, "DBOpen"}, + {ThreadStatus::OP_GET, "Get"}, + {ThreadStatus::OP_MULTIGET, "MultiGet"}, + {ThreadStatus::OP_DBITERATOR, "DBIterator"}, + {ThreadStatus::OP_VERIFY_DB_CHECKSUM, "VerifyDBChecksum"}, + {ThreadStatus::OP_VERIFY_FILE_CHECKSUMS, "VerifyFileChecksums"}, + {ThreadStatus::OP_GETENTITY, "GetEntity"}, + {ThreadStatus::OP_MULTIGETENTITY, "MultiGetEntity"}, + +}; struct OperationStageInfo { const ThreadStatus::OperationStage stage; diff --git a/util/udt_util.cc b/util/udt_util.cc new file mode 100644 index 000000000000..40cf1e4964b3 --- /dev/null +++ b/util/udt_util.cc @@ -0,0 +1,385 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/udt_util.h" + +#include "db/dbformat.h" +#include "rocksdb/types.h" +#include "util/coding.h" +#include "util/write_batch_util.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +enum class RecoveryType { + kNoop, + kUnrecoverable, + kStripTimestamp, + kPadTimestamp, +}; + +RecoveryType GetRecoveryType(const size_t running_ts_sz, + const std::optional& recorded_ts_sz) { + if (running_ts_sz == 0) { + if (!recorded_ts_sz.has_value()) { + // A column family id not recorded is equivalent to that column family has + // zero timestamp size. 
+ return RecoveryType::kNoop; + } + return RecoveryType::kStripTimestamp; + } + + assert(running_ts_sz != 0); + + if (!recorded_ts_sz.has_value()) { + return RecoveryType::kPadTimestamp; + } + + if (running_ts_sz != *recorded_ts_sz) { + return RecoveryType::kUnrecoverable; + } + + return RecoveryType::kNoop; +} + +bool AllRunningColumnFamiliesConsistent( + const UnorderedMap& running_ts_sz, + const UnorderedMap& record_ts_sz) { + for (const auto& [cf_id, ts_sz] : running_ts_sz) { + auto record_it = record_ts_sz.find(cf_id); + RecoveryType recovery_type = + GetRecoveryType(ts_sz, record_it != record_ts_sz.end() + ? std::optional(record_it->second) + : std::nullopt); + if (recovery_type != RecoveryType::kNoop) { + return false; + } + } + return true; +} + +Status CheckWriteBatchTimestampSizeConsistency( + const WriteBatch* batch, + const UnorderedMap& running_ts_sz, + const UnorderedMap& record_ts_sz, + TimestampSizeConsistencyMode check_mode, bool* ts_need_recovery) { + std::vector column_family_ids; + Status status = + CollectColumnFamilyIdsFromWriteBatch(*batch, &column_family_ids); + if (!status.ok()) { + return status; + } + for (const auto& cf_id : column_family_ids) { + auto running_iter = running_ts_sz.find(cf_id); + if (running_iter == running_ts_sz.end()) { + // Ignore dropped column family referred to in a WriteBatch regardless of + // its consistency. + continue; + } + auto record_iter = record_ts_sz.find(cf_id); + RecoveryType recovery_type = GetRecoveryType( + running_iter->second, record_iter != record_ts_sz.end() + ? std::optional(record_iter->second) + : std::nullopt); + if (recovery_type != RecoveryType::kNoop) { + if (check_mode == TimestampSizeConsistencyMode::kVerifyConsistency) { + return Status::InvalidArgument( + "WriteBatch contains timestamp size inconsistency."); + } + + if (recovery_type == RecoveryType::kUnrecoverable) { + return Status::InvalidArgument( + "WriteBatch contains unrecoverable timestamp size inconsistency."); + } + + // If any column family needs reconciliation, it will mark the whole + // WriteBatch to need recovery and rebuilt. 
+ *ts_need_recovery = true; + } + } + return Status::OK(); +} + +enum class ToggleUDT { + kUnchanged, + kEnableUDT, + kDisableUDT, + kInvalidChange, +}; + +ToggleUDT CompareComparator(const Comparator* new_comparator, + const std::string& old_comparator_name) { + static const char* kUDTSuffix = ".u64ts"; + static const Slice kSuffixSlice = kUDTSuffix; + static const size_t kSuffixSize = 6; + size_t ts_sz = new_comparator->timestamp_size(); + (void)ts_sz; + Slice new_ucmp_name(new_comparator->Name()); + Slice old_ucmp_name(old_comparator_name); + if (new_ucmp_name.compare(old_ucmp_name) == 0) { + return ToggleUDT::kUnchanged; + } + if (new_ucmp_name.size() == old_ucmp_name.size() + kSuffixSize && + new_ucmp_name.starts_with(old_ucmp_name) && + new_ucmp_name.ends_with(kSuffixSlice)) { + assert(ts_sz == 8); + return ToggleUDT::kEnableUDT; + } + if (old_ucmp_name.size() == new_ucmp_name.size() + kSuffixSize && + old_ucmp_name.starts_with(new_ucmp_name) && + old_ucmp_name.ends_with(kSuffixSlice)) { + assert(ts_sz == 0); + return ToggleUDT::kDisableUDT; + } + return ToggleUDT::kInvalidChange; +} +} // namespace + +TimestampRecoveryHandler::TimestampRecoveryHandler( + const UnorderedMap& running_ts_sz, + const UnorderedMap& record_ts_sz) + : running_ts_sz_(running_ts_sz), + record_ts_sz_(record_ts_sz), + new_batch_(new WriteBatch()), + handler_valid_(true), + new_batch_diff_from_orig_batch_(false) {} + +Status TimestampRecoveryHandler::PutCF(uint32_t cf, const Slice& key, + const Slice& value) { + std::string new_key_buf; + Slice new_key; + Status status = + ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key); + if (!status.ok()) { + return status; + } + return WriteBatchInternal::Put(new_batch_.get(), cf, new_key, value); +} + +Status TimestampRecoveryHandler::DeleteCF(uint32_t cf, const Slice& key) { + std::string new_key_buf; + Slice new_key; + Status status = + ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key); + if (!status.ok()) { + return status; + } + return WriteBatchInternal::Delete(new_batch_.get(), cf, new_key); +} + +Status TimestampRecoveryHandler::SingleDeleteCF(uint32_t cf, const Slice& key) { + std::string new_key_buf; + Slice new_key; + Status status = + ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key); + if (!status.ok()) { + return status; + } + return WriteBatchInternal::SingleDelete(new_batch_.get(), cf, new_key); +} + +Status TimestampRecoveryHandler::DeleteRangeCF(uint32_t cf, + const Slice& begin_key, + const Slice& end_key) { + std::string new_begin_key_buf; + Slice new_begin_key; + std::string new_end_key_buf; + Slice new_end_key; + Status status = ReconcileTimestampDiscrepancy( + cf, begin_key, &new_begin_key_buf, &new_begin_key); + if (!status.ok()) { + return status; + } + status = ReconcileTimestampDiscrepancy(cf, end_key, &new_end_key_buf, + &new_end_key); + if (!status.ok()) { + return status; + } + return WriteBatchInternal::DeleteRange(new_batch_.get(), cf, new_begin_key, + new_end_key); +} + +Status TimestampRecoveryHandler::MergeCF(uint32_t cf, const Slice& key, + const Slice& value) { + std::string new_key_buf; + Slice new_key; + Status status = + ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key); + if (!status.ok()) { + return status; + } + return WriteBatchInternal::Merge(new_batch_.get(), cf, new_key, value); +} + +Status TimestampRecoveryHandler::PutBlobIndexCF(uint32_t cf, const Slice& key, + const Slice& value) { + std::string new_key_buf; + Slice new_key; + Status status = + 
ReconcileTimestampDiscrepancy(cf, key, &new_key_buf, &new_key); + if (!status.ok()) { + return status; + } + return WriteBatchInternal::PutBlobIndex(new_batch_.get(), cf, new_key, value); +} + +Status TimestampRecoveryHandler::ReconcileTimestampDiscrepancy( + uint32_t cf, const Slice& key, std::string* new_key_buf, Slice* new_key) { + assert(handler_valid_); + auto running_iter = running_ts_sz_.find(cf); + if (running_iter == running_ts_sz_.end()) { + // The column family referred to by the WriteBatch is no longer running. + // Copy over the entry as is to the new WriteBatch. + *new_key = key; + return Status::OK(); + } + size_t running_ts_sz = running_iter->second; + auto record_iter = record_ts_sz_.find(cf); + std::optional record_ts_sz = + record_iter != record_ts_sz_.end() + ? std::optional(record_iter->second) + : std::nullopt; + RecoveryType recovery_type = GetRecoveryType(running_ts_sz, record_ts_sz); + + switch (recovery_type) { + case RecoveryType::kNoop: + *new_key = key; + break; + case RecoveryType::kStripTimestamp: + assert(record_ts_sz.has_value()); + *new_key = StripTimestampFromUserKey(key, *record_ts_sz); + new_batch_diff_from_orig_batch_ = true; + break; + case RecoveryType::kPadTimestamp: + AppendKeyWithMinTimestamp(new_key_buf, key, running_ts_sz); + *new_key = *new_key_buf; + new_batch_diff_from_orig_batch_ = true; + break; + case RecoveryType::kUnrecoverable: + return Status::InvalidArgument( + "Unrecoverable timestamp size inconsistency encountered by " + "TimestampRecoveryHandler."); + default: + assert(false); + } + return Status::OK(); +} + +Status HandleWriteBatchTimestampSizeDifference( + const WriteBatch* batch, + const UnorderedMap& running_ts_sz, + const UnorderedMap& record_ts_sz, + TimestampSizeConsistencyMode check_mode, + std::unique_ptr* new_batch) { + // Quick path to bypass checking the WriteBatch. 
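+  // (No per-entry inspection of the batch is needed when every running column
+  // family already agrees with the recorded timestamp sizes.)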
+ if (AllRunningColumnFamiliesConsistent(running_ts_sz, record_ts_sz)) { + return Status::OK(); + } + bool need_recovery = false; + Status status = CheckWriteBatchTimestampSizeConsistency( + batch, running_ts_sz, record_ts_sz, check_mode, &need_recovery); + if (!status.ok()) { + return status; + } else if (need_recovery) { + assert(new_batch); + SequenceNumber sequence = WriteBatchInternal::Sequence(batch); + TimestampRecoveryHandler recovery_handler(running_ts_sz, record_ts_sz); + status = batch->Iterate(&recovery_handler); + if (!status.ok()) { + return status; + } else { + *new_batch = recovery_handler.TransferNewBatch(); + WriteBatchInternal::SetSequence(new_batch->get(), sequence); + } + } + return Status::OK(); +} + +Status ValidateUserDefinedTimestampsOptions( + const Comparator* new_comparator, const std::string& old_comparator_name, + bool new_persist_udt, bool old_persist_udt, + bool* mark_sst_files_has_no_udt) { + size_t ts_sz = new_comparator->timestamp_size(); + ToggleUDT res = CompareComparator(new_comparator, old_comparator_name); + switch (res) { + case ToggleUDT::kUnchanged: + if (old_persist_udt == new_persist_udt) { + return Status::OK(); + } + if (ts_sz == 0) { + return Status::OK(); + } + return Status::InvalidArgument( + "Cannot toggle the persist_user_defined_timestamps flag for a column " + "family with user-defined timestamps feature enabled."); + case ToggleUDT::kEnableUDT: + if (!new_persist_udt) { + *mark_sst_files_has_no_udt = true; + return Status::OK(); + } + return Status::InvalidArgument( + "Cannot open a column family and enable user-defined timestamps " + "feature without setting persist_user_defined_timestamps flag to " + "false."); + case ToggleUDT::kDisableUDT: + if (!old_persist_udt) { + return Status::OK(); + } + return Status::InvalidArgument( + "Cannot open a column family and disable user-defined timestamps " + "feature if its existing persist_user_defined_timestamps flag is not " + "false."); + case ToggleUDT::kInvalidChange: + return Status::InvalidArgument( + new_comparator->Name(), + "does not match existing comparator " + old_comparator_name); + default: + break; + } + return Status::InvalidArgument( + "Unsupported user defined timestamps settings change."); +} + +void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts, + std::string* full_history_ts_low) { + uint64_t cutoff_udt_ts = 0; + [[maybe_unused]] bool format_res = GetFixed64(cutoff_ts, &cutoff_udt_ts); + assert(format_res); + PutFixed64(full_history_ts_low, cutoff_udt_ts + 1); +} + +std::tuple, std::optional> +MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz, + std::string* start_with_ts, std::string* end_with_ts, + bool exclusive_end) { + std::optional ret_start, ret_end; + if (start) { + if (ts_sz == 0) { + ret_start = *start; + } else { + // Maximum timestamp means including all keys with any timestamp for start + AppendKeyWithMaxTimestamp(start_with_ts, *start, ts_sz); + ret_start = Slice(*start_with_ts); + } + } + if (end) { + if (ts_sz == 0) { + ret_end = *end; + } else { + if (exclusive_end) { + // Append a maximum timestamp as the range limit is exclusive: + // [start, end) + AppendKeyWithMaxTimestamp(end_with_ts, *end, ts_sz); + } else { + // Append a minimum timestamp to end so the range limit is inclusive: + // [start, end] + AppendKeyWithMinTimestamp(end_with_ts, *end, ts_sz); + } + ret_end = Slice(*end_with_ts); + } + } + return std::make_tuple(ret_start, ret_end); +} +} // namespace ROCKSDB_NAMESPACE diff --git a/util/udt_util.h 
b/util/udt_util.h new file mode 100644 index 000000000000..b524fceab0d9 --- /dev/null +++ b/util/udt_util.h @@ -0,0 +1,268 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include +#include +#include +#include +#include + +#include "db/write_batch_internal.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" +#include "util/hash_containers.h" + +namespace ROCKSDB_NAMESPACE { + +// Dummy record in WAL logs signaling user-defined timestamp sizes for +// subsequent records. +class UserDefinedTimestampSizeRecord { + public: + UserDefinedTimestampSizeRecord() {} + explicit UserDefinedTimestampSizeRecord( + std::vector>&& cf_to_ts_sz) + : cf_to_ts_sz_(std::move(cf_to_ts_sz)) {} + + const std::vector>& GetUserDefinedTimestampSize() + const { + return cf_to_ts_sz_; + } + + inline void EncodeTo(std::string* dst) const { + assert(dst != nullptr); + for (const auto& [cf_id, ts_sz] : cf_to_ts_sz_) { + assert(ts_sz != 0); + PutFixed32(dst, cf_id); + PutFixed16(dst, static_cast(ts_sz)); + } + } + + inline Status DecodeFrom(Slice* src) { + const size_t total_size = src->size(); + if ((total_size % kSizePerColumnFamily) != 0) { + std::ostringstream oss; + oss << "User-defined timestamp size record length: " << total_size + << " is not a multiple of " << kSizePerColumnFamily << std::endl; + return Status::Corruption(oss.str()); + } + int num_of_entries = static_cast(total_size / kSizePerColumnFamily); + for (int i = 0; i < num_of_entries; i++) { + uint32_t cf_id = 0; + uint16_t ts_sz = 0; + if (!GetFixed32(src, &cf_id) || !GetFixed16(src, &ts_sz)) { + return Status::Corruption( + "Error decoding user-defined timestamp size record entry"); + } + cf_to_ts_sz_.emplace_back(cf_id, static_cast(ts_sz)); + } + return Status::OK(); + } + + inline std::string DebugString() const { + std::ostringstream oss; + + for (const auto& [cf_id, ts_sz] : cf_to_ts_sz_) { + oss << "Column family: " << cf_id + << ", user-defined timestamp size: " << ts_sz << std::endl; + } + return oss.str(); + } + + private: + // 4 bytes for column family id, 2 bytes for user-defined timestamp size. + static constexpr size_t kSizePerColumnFamily = 4 + 2; + + std::vector> cf_to_ts_sz_; +}; + +// This handler is used to recover a WriteBatch read from WAL logs during +// recovery. It does a best-effort recovery if the column families contained in +// the WriteBatch have inconsistency between the recorded timestamp size and the +// running timestamp size. And creates a new WriteBatch that are consistent with +// the running timestamp size with entries from the original WriteBatch. +// +// Note that for a WriteBatch with no inconsistency, a new WriteBatch is created +// nonetheless, and it should be exactly the same as the original WriteBatch. +// +// To access the new WriteBatch, invoke `TransferNewBatch` after calling +// `Iterate`. The handler becomes invalid afterwards. +// +// For the user key in each entry, the best effort recovery means: +// 1) If recorded timestamp size is 0, running timestamp size is > 0, a min +// timestamp of length running timestamp size is padded to the user key. +// 2) If recorded timestamp size is > 0, running timestamp size is 0, the last +// bytes of length recorded timestamp size is stripped from user key. 
+// 3) If recorded timestamp size is the same as running timestamp size, no-op. +// 4) If recorded timestamp size and running timestamp size are both non-zero +// but not equal, return Status::InvalidArgument. +class TimestampRecoveryHandler : public WriteBatch::Handler { + public: + TimestampRecoveryHandler(const UnorderedMap& running_ts_sz, + const UnorderedMap& record_ts_sz); + + ~TimestampRecoveryHandler() override {} + + // No copy or move. + TimestampRecoveryHandler(const TimestampRecoveryHandler&) = delete; + TimestampRecoveryHandler(TimestampRecoveryHandler&&) = delete; + TimestampRecoveryHandler& operator=(const TimestampRecoveryHandler&) = delete; + TimestampRecoveryHandler& operator=(TimestampRecoveryHandler&&) = delete; + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override; + + Status DeleteCF(uint32_t cf, const Slice& key) override; + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override; + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice& end_key) override; + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override; + + Status PutBlobIndexCF(uint32_t cf, const Slice& key, + const Slice& value) override; + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); } + + std::unique_ptr&& TransferNewBatch() { + assert(new_batch_diff_from_orig_batch_); + handler_valid_ = false; + return std::move(new_batch_); + } + + private: + Status ReconcileTimestampDiscrepancy(uint32_t cf, const Slice& key, + std::string* new_key_buf, + Slice* new_key); + + // Mapping from column family id to user-defined timestamp size for all + // running column families including the ones with zero timestamp size. + const UnorderedMap& running_ts_sz_; + + // Mapping from column family id to user-defined timestamp size as recorded + // in the WAL. This only contains non-zero user-defined timestamp size. + const UnorderedMap& record_ts_sz_; + + std::unique_ptr new_batch_; + // Handler is valid upon creation and becomes invalid after its `new_batch_` + // is transferred. + bool handler_valid_; + + // False upon creation, and become true if at least one user key from the + // original batch is updated when creating the new batch. + bool new_batch_diff_from_orig_batch_; +}; + +// Mode for checking and handling timestamp size inconsistency encountered in a +// WriteBatch read from WAL log. +enum class TimestampSizeConsistencyMode { + // Verified that the recorded user-defined timestamp size is consistent with + // the running one for all the column families involved in a WriteBatch. + // Column families referred to in the WriteBatch but are dropped are ignored. + kVerifyConsistency, + // Verified that if any inconsistency exists in a WriteBatch, it's all + // tolerable by a best-effort reconciliation. And optionally creates a new + // WriteBatch from the original WriteBatch that is consistent with the running + // timestamp size. Column families referred to in the WriteBatch but are + // dropped are ignored. If a new WriteBatch is created, such entries are + // copied over as is. 
+ kReconcileInconsistency, +}; + +// Handles the inconsistency between recorded timestamp sizes and running +// timestamp sizes for a WriteBatch. A non-OK `status` indicates there are +// intolerable inconsistency with the specified `check_mode`. +// +// If `check_mode` is `kVerifyConsistency`, intolerable inconsistency means any +// running column family has an inconsistent user-defined timestamp size. +// +// If `check_mode` is `kReconcileInconsistency`, intolerable inconsistency means +// any running column family has an inconsistent user-defined timestamp size +// that cannot be reconciled with a best-effort recovery. Check +// `TimestampRecoveryHandler` for what a best-effort recovery is capable of. In +// this mode, output argument `new_batch` should be set, a new WriteBatch is +// created on the heap and transferred to `new_batch` if there is tolerable +// inconsistency. +// +// An invariant that WAL logging ensures is that all timestamp size info +// is logged prior to a WriteBatch that needed this info. And zero timestamp +// size is skipped. So `record_ts_sz` only contains column family with non-zero +// timestamp size and a column family id absent from `record_ts_sz` will be +// interpreted as that column family has zero timestamp size. On the other hand, +// `running_ts_sz` should contain the timestamp size for all running column +// families including the ones with zero timestamp size. +Status HandleWriteBatchTimestampSizeDifference( + const WriteBatch* batch, + const UnorderedMap& running_ts_sz, + const UnorderedMap& record_ts_sz, + TimestampSizeConsistencyMode check_mode, + std::unique_ptr* new_batch = nullptr); + +// This util function is used when opening an existing column family and +// processing its VersionEdit. It does a sanity check for the column family's +// old user comparator and the persist_user_defined_timestamps flag as recorded +// in the VersionEdit, against its new settings from the column family's +// ImmutableCFOptions. +// +// Valid settings change include: +// 1) no user comparator change and no effective persist_user_defined_timestamp +// flag change. +// 2) switch user comparator to enable user-defined timestamps feature provided +// the immediately effective persist_user_defined_timestamps flag is false. +// 3) switch user comparator to disable user-defined timestamps feature provided +// that the before-change persist_user_defined_timestamps is already false. +// +// Switch user comparator to disable/enable UDT is only sanity checked by a user +// comparator name comparison. The full check includes enforcing the new user +// comparator ranks user keys exactly the same as the old user comparator and +// only add / remove the user-defined timestamp comparison. We don't have ways +// to strictly enforce this so currently only the RocksDB builtin comparator +// wrapper `ComparatorWithU64TsImpl` is supported to enable / disable +// user-defined timestamps. It formats user-defined timestamps as uint64_t. +// +// When the settings indicate a legit change to enable user-defined timestamps +// feature on a column family, `mark_sst_files_has_no_udt` will be set to true +// to indicate marking all existing SST files has no user-defined timestamps +// when re-writing the manifest. 
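+//
+// For example (as exercised in udt_util_test.cc below): switching a column
+// family from BytewiseComparator to its ".u64ts" wrapper with
+// new_persist_udt=false is accepted and sets *mark_sst_files_has_no_udt to
+// true, while attempting the same switch with new_persist_udt=true is
+// rejected with Status::InvalidArgument.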
+Status ValidateUserDefinedTimestampsOptions( + const Comparator* new_comparator, const std::string& old_comparator_name, + bool new_persist_udt, bool old_persist_udt, + bool* mark_sst_files_has_no_udt); + +// Given a cutoff user-defined timestamp formatted as uint64_t, get the +// effective `full_history_ts_low` timestamp, which is the next immediately +// bigger timestamp. Used by the UDT in memtable only feature when flushing +// memtables and remove timestamps. This process collapses history and increase +// the effective `full_history_ts_low`. +void GetFullHistoryTsLowFromU64CutoffTs(Slice* cutoff_ts, + std::string* full_history_ts_low); + +// `start` is the inclusive lower user key bound without user-defined timestamp. +// `end` is the upper user key bound without user-defined timestamp. +// By default, `end` is treated as being exclusive. If `exclusive_end` is set to +// false, it's treated as an inclusive upper bound. +// If any of these two bounds is nullptr, an empty std::optional is +// returned for that bound. +std::tuple, std::optional> +MaybeAddTimestampsToRange(const Slice* start, const Slice* end, size_t ts_sz, + std::string* start_with_ts, std::string* end_with_ts, + bool exclusive_end = true); +} // namespace ROCKSDB_NAMESPACE diff --git a/util/udt_util_test.cc b/util/udt_util_test.cc new file mode 100644 index 000000000000..44ee567f7445 --- /dev/null +++ b/util/udt_util_test.cc @@ -0,0 +1,461 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/udt_util.h" + +#include + +#include "db/dbformat.h" +#include "test_util/testharness.h" +#include "test_util/testutil.h" + +namespace ROCKSDB_NAMESPACE { +namespace { +static const std::string kTestKeyWithoutTs = "key"; +static const std::string kValuePlaceHolder = "value"; +} // namespace + +class HandleTimestampSizeDifferenceTest : public testing::Test { + public: + HandleTimestampSizeDifferenceTest() {} + + // Test handler used to collect the column family id and user keys contained + // in a WriteBatch for test verification. And verifies the value part stays + // the same if it's available. 
+ class KeyCollector : public WriteBatch::Handler { + public: + explicit KeyCollector() {} + + ~KeyCollector() override {} + + Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override { + if (value.compare(kValuePlaceHolder) != 0) { + return Status::InvalidArgument(); + } + return AddKey(cf, key); + } + + Status DeleteCF(uint32_t cf, const Slice& key) override { + return AddKey(cf, key); + } + + Status SingleDeleteCF(uint32_t cf, const Slice& key) override { + return AddKey(cf, key); + } + + Status DeleteRangeCF(uint32_t cf, const Slice& begin_key, + const Slice& end_key) override { + Status status = AddKey(cf, begin_key); + if (!status.ok()) { + return status; + } + return AddKey(cf, end_key); + } + + Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override { + if (value.compare(kValuePlaceHolder) != 0) { + return Status::InvalidArgument(); + } + return AddKey(cf, key); + } + + Status PutBlobIndexCF(uint32_t cf, const Slice& key, + const Slice& value) override { + if (value.compare(kValuePlaceHolder) != 0) { + return Status::InvalidArgument(); + } + return AddKey(cf, key); + } + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkNoop(bool) override { return Status::OK(); } + + const std::vector>& GetKeys() const { + return keys_; + } + + private: + Status AddKey(uint32_t cf, const Slice& key) { + keys_.push_back(std::make_pair(cf, key)); + return Status::OK(); + } + std::vector> keys_; + }; + + void CreateKey(std::string* key_buf, size_t ts_sz) { + if (ts_sz > 0) { + AppendKeyWithMinTimestamp(key_buf, kTestKeyWithoutTs, ts_sz); + } else { + key_buf->assign(kTestKeyWithoutTs); + } + } + + void CreateWriteBatch(const UnorderedMap& ts_sz_for_batch, + WriteBatch* batch) { + for (const auto& [cf_id, ts_sz] : ts_sz_for_batch) { + std::string key; + CreateKey(&key, ts_sz); + ASSERT_OK(WriteBatchInternal::Put(batch, cf_id, key, kValuePlaceHolder)); + ASSERT_OK(WriteBatchInternal::Delete(batch, cf_id, key)); + ASSERT_OK(WriteBatchInternal::SingleDelete(batch, cf_id, key)); + ASSERT_OK(WriteBatchInternal::DeleteRange(batch, cf_id, key, key)); + ASSERT_OK( + WriteBatchInternal::Merge(batch, cf_id, key, kValuePlaceHolder)); + ASSERT_OK(WriteBatchInternal::PutBlobIndex(batch, cf_id, key, + kValuePlaceHolder)); + } + } + + void CheckSequenceEqual(const WriteBatch& orig_batch, + const WriteBatch& new_batch) { + ASSERT_EQ(WriteBatchInternal::Sequence(&orig_batch), + WriteBatchInternal::Sequence(&new_batch)); + } + void CheckCountEqual(const WriteBatch& orig_batch, + const WriteBatch& new_batch) { + ASSERT_EQ(WriteBatchInternal::Count(&orig_batch), + WriteBatchInternal::Count(&new_batch)); + } + + void VerifyKeys( + const std::vector>& keys_with_ts, + const std::vector>& keys_without_ts, + size_t ts_sz, std::optional dropped_cf) { + ASSERT_EQ(keys_with_ts.size(), keys_without_ts.size()); + const std::string kTsMin(ts_sz, static_cast(0)); + for (size_t i = 0; i < keys_with_ts.size(); i++) { + // TimestampRecoveryHandler ignores dropped column family and copy it over + // as is. Check the keys stay the same. 
+ if (dropped_cf.has_value() && + keys_with_ts[i].first == dropped_cf.value()) { + ASSERT_EQ(keys_with_ts[i].first, keys_without_ts[i].first); + ASSERT_EQ(keys_with_ts[i].second, keys_without_ts[i].second); + continue; + } + const Slice& key_with_ts = keys_with_ts[i].second; + const Slice& key_without_ts = keys_without_ts[i].second; + ASSERT_TRUE(key_with_ts.starts_with(key_without_ts)); + ASSERT_EQ(key_with_ts.size() - key_without_ts.size(), ts_sz); + ASSERT_TRUE(key_with_ts.ends_with(kTsMin)); + } + } + + void CheckContentsWithTimestampStripping(const WriteBatch& orig_batch, + const WriteBatch& new_batch, + size_t ts_sz, + std::optional dropped_cf) { + CheckSequenceEqual(orig_batch, new_batch); + CheckCountEqual(orig_batch, new_batch); + KeyCollector collector_for_orig_batch; + ASSERT_OK(orig_batch.Iterate(&collector_for_orig_batch)); + KeyCollector collector_for_new_batch; + ASSERT_OK(new_batch.Iterate(&collector_for_new_batch)); + VerifyKeys(collector_for_orig_batch.GetKeys(), + collector_for_new_batch.GetKeys(), ts_sz, dropped_cf); + } + + void CheckContentsWithTimestampPadding(const WriteBatch& orig_batch, + const WriteBatch& new_batch, + size_t ts_sz) { + CheckSequenceEqual(orig_batch, new_batch); + CheckCountEqual(orig_batch, new_batch); + KeyCollector collector_for_orig_batch; + ASSERT_OK(orig_batch.Iterate(&collector_for_orig_batch)); + KeyCollector collector_for_new_batch; + ASSERT_OK(new_batch.Iterate(&collector_for_new_batch)); + VerifyKeys(collector_for_new_batch.GetKeys(), + collector_for_orig_batch.GetKeys(), ts_sz, + std::nullopt /* dropped_cf */); + } +}; + +TEST_F(HandleTimestampSizeDifferenceTest, AllColumnFamiliesConsistent) { + UnorderedMap running_ts_sz = {{1, sizeof(uint64_t)}, + {2, 0}}; + UnorderedMap record_ts_sz = {{1, sizeof(uint64_t)}}; + WriteBatch batch; + CreateWriteBatch(running_ts_sz, &batch); + + // All `check_mode` pass with OK status and `batch` not checked or updated. + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency)); + std::unique_ptr new_batch(nullptr); + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch)); + ASSERT_TRUE(new_batch.get() == nullptr); +} + +TEST_F(HandleTimestampSizeDifferenceTest, + AllInconsistentColumnFamiliesDropped) { + UnorderedMap running_ts_sz = {{2, 0}}; + UnorderedMap record_ts_sz = {{1, sizeof(uint64_t)}, + {3, sizeof(char)}}; + WriteBatch batch; + CreateWriteBatch(record_ts_sz, &batch); + + // All `check_mode` pass with OK status and `batch` not checked or updated. + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency)); + std::unique_ptr new_batch(nullptr); + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch)); + ASSERT_TRUE(new_batch.get() == nullptr); +} + +TEST_F(HandleTimestampSizeDifferenceTest, InvolvedColumnFamiliesConsistent) { + UnorderedMap running_ts_sz = {{1, sizeof(uint64_t)}, + {2, sizeof(char)}}; + UnorderedMap record_ts_sz = {{1, sizeof(uint64_t)}}; + WriteBatch batch; + CreateWriteBatch(record_ts_sz, &batch); + + // All `check_mode` pass with OK status and `batch` not updated. 
+ ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency)); + std::unique_ptr new_batch(nullptr); + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch)); + ASSERT_TRUE(new_batch.get() == nullptr); +} + +TEST_F(HandleTimestampSizeDifferenceTest, + InconsistentColumnFamilyNeedsTimestampStripping) { + UnorderedMap running_ts_sz = {{1, 0}, {2, sizeof(char)}}; + UnorderedMap record_ts_sz = {{1, sizeof(uint64_t)}}; + WriteBatch batch; + CreateWriteBatch(record_ts_sz, &batch); + + // kVerifyConsistency doesn't tolerate inconsistency for running column + // families. + ASSERT_TRUE(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency) + .IsInvalidArgument()); + + std::unique_ptr new_batch(nullptr); + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch)); + ASSERT_TRUE(new_batch.get() != nullptr); + CheckContentsWithTimestampStripping(batch, *new_batch, sizeof(uint64_t), + std::nullopt /* dropped_cf */); +} + +TEST_F(HandleTimestampSizeDifferenceTest, + InconsistentColumnFamilyNeedsTimestampPadding) { + UnorderedMap running_ts_sz = {{1, sizeof(uint64_t)}}; + // Make `record_ts_sz` not contain zero timestamp size entries to follow the + // behavior of actual WAL log timestamp size record. + UnorderedMap record_ts_sz; + WriteBatch batch; + CreateWriteBatch({{1, 0}}, &batch); + + // kVerifyConsistency doesn't tolerate inconsistency for running column + // families. + ASSERT_TRUE(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency) + .IsInvalidArgument()); + + std::unique_ptr new_batch(nullptr); + ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch)); + ASSERT_TRUE(new_batch.get() != nullptr); + CheckContentsWithTimestampPadding(batch, *new_batch, sizeof(uint64_t)); +} + +TEST_F(HandleTimestampSizeDifferenceTest, + InconsistencyReconcileCopyOverDroppedColumnFamily) { + UnorderedMap running_ts_sz = {{1, 0}}; + UnorderedMap record_ts_sz = {{1, sizeof(uint64_t)}, + {2, sizeof(char)}}; + WriteBatch batch; + CreateWriteBatch(record_ts_sz, &batch); + std::unique_ptr new_batch(nullptr); + + // kReconcileInconsistency tolerate inconsistency for dropped column family + // and all related entries copied over to the new WriteBatch. 
+ ASSERT_OK(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency, &new_batch)); + + ASSERT_TRUE(new_batch.get() != nullptr); + CheckContentsWithTimestampStripping(batch, *new_batch, sizeof(uint64_t), + std::optional(2)); +} + +TEST_F(HandleTimestampSizeDifferenceTest, UnrecoverableInconsistency) { + UnorderedMap running_ts_sz = {{1, sizeof(char)}}; + UnorderedMap record_ts_sz = {{1, sizeof(uint64_t)}}; + WriteBatch batch; + CreateWriteBatch(record_ts_sz, &batch); + + ASSERT_TRUE(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kVerifyConsistency) + .IsInvalidArgument()); + + ASSERT_TRUE(HandleWriteBatchTimestampSizeDifference( + &batch, running_ts_sz, record_ts_sz, + TimestampSizeConsistencyMode::kReconcileInconsistency) + .IsInvalidArgument()); +} + +TEST(ValidateUserDefinedTimestampsOptionsTest, EnableUserDefinedTimestamps) { + bool mark_sst_files = false; + const Comparator* new_comparator = test::BytewiseComparatorWithU64TsWrapper(); + const Comparator* old_comparator = BytewiseComparator(); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + false /*new_persist_udt*/, true /*old_persist_udt*/, &mark_sst_files)); + ASSERT_TRUE(mark_sst_files); + + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + false /*new_persist_udt*/, false /*old_persist_udt*/, &mark_sst_files)); + ASSERT_TRUE(mark_sst_files); +} + +TEST(ValidateUserDefinedTimestampsOptionsTest, + EnableUserDefinedTimestampsNewPersistUDTFlagIncorrect) { + bool mark_sst_files = false; + const Comparator* new_comparator = test::BytewiseComparatorWithU64TsWrapper(); + const Comparator* old_comparator = BytewiseComparator(); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + true /*new_persist_udt*/, true /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + true /*new_persist_udt*/, false /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); +} + +TEST(ValidateUserDefinedTimestampsOptionsTest, DisableUserDefinedTimestamps) { + bool mark_sst_files = false; + const Comparator* new_comparator = ReverseBytewiseComparator(); + const Comparator* old_comparator = + test::ReverseBytewiseComparatorWithU64TsWrapper(); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + false /*new_persist_udt*/, false /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + true /*new_persist_udt*/, false /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); +} + +TEST(ValidateUserDefinedTimestampsOptionsTest, + DisableUserDefinedTimestampsOldPersistUDTFlagIncorrect) { + bool mark_sst_files = false; + const Comparator* new_comparator = BytewiseComparator(); + const Comparator* old_comparator = test::BytewiseComparatorWithU64TsWrapper(); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + false /*new_persist_udt*/, true /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + new_comparator, 
std::string(old_comparator->Name()), + true /*new_persist_udt*/, true /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); +} + +TEST(ValidateUserDefinedTimestampsOptionsTest, UserComparatorUnchanged) { + bool mark_sst_files = false; + const Comparator* ucmp_without_ts = BytewiseComparator(); + const Comparator* ucmp_with_ts = test::BytewiseComparatorWithU64TsWrapper(); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + ucmp_without_ts, std::string(ucmp_without_ts->Name()), + false /*new_persist_udt*/, false /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + ucmp_without_ts, std::string(ucmp_without_ts->Name()), + true /*new_persist_udt*/, true /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + ucmp_without_ts, std::string(ucmp_without_ts->Name()), + true /*new_persist_udt*/, false /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + ucmp_without_ts, std::string(ucmp_without_ts->Name()), + false /*new_persist_udt*/, true /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + ucmp_with_ts, std::string(ucmp_with_ts->Name()), true /*new_persist_udt*/, + true /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + ASSERT_OK(ValidateUserDefinedTimestampsOptions( + ucmp_with_ts, std::string(ucmp_with_ts->Name()), + false /*new_persist_udt*/, false /*old_persist_udt*/, &mark_sst_files)); + ASSERT_FALSE(mark_sst_files); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + ucmp_with_ts, std::string(ucmp_with_ts->Name()), + true /*new_persist_udt*/, false /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + ucmp_with_ts, std::string(ucmp_with_ts->Name()), + false /*new_persist_udt*/, true /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); +} + +TEST(ValidateUserDefinedTimestampsOptionsTest, InvalidUserComparatorChange) { + bool mark_sst_files = false; + const Comparator* new_comparator = BytewiseComparator(); + const Comparator* old_comparator = ReverseBytewiseComparator(); + ASSERT_TRUE(ValidateUserDefinedTimestampsOptions( + new_comparator, std::string(old_comparator->Name()), + false /*new_persist_udt*/, true /*old_persist_udt*/, + &mark_sst_files) + .IsInvalidArgument()); +} + +TEST(GetFullHistoryTsLowFromU64CutoffTsTest, Success) { + std::string cutoff_ts; + uint64_t cutoff_ts_int = 3; + PutFixed64(&cutoff_ts, 3); + Slice cutoff_ts_slice = cutoff_ts; + std::string actual_full_history_ts_low; + GetFullHistoryTsLowFromU64CutoffTs(&cutoff_ts_slice, + &actual_full_history_ts_low); + + std::string expected_ts_low; + PutFixed64(&expected_ts_low, cutoff_ts_int + 1); + ASSERT_EQ(expected_ts_low, actual_full_history_ts_low); +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/util/write_batch_util.cc b/util/write_batch_util.cc new file mode 100644 index 000000000000..fa6a3b09bec5 --- /dev/null +++ b/util/write_batch_util.cc @@ -0,0 +1,25 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. 
+// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "util/write_batch_util.h" + +namespace ROCKSDB_NAMESPACE { + +Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) { + assert(column_family_ids != nullptr); + column_family_ids->clear(); + ColumnFamilyCollector handler; + Status s = batch.Iterate(&handler); + if (s.ok()) { + for (const auto& cf : handler.column_families()) { + column_family_ids->push_back(cf); + } + } + return s; +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/util/write_batch_util.h b/util/write_batch_util.h new file mode 100644 index 000000000000..70bbad9fc78f --- /dev/null +++ b/util/write_batch_util.h @@ -0,0 +1,80 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#pragma once +#include <unordered_set> +#include <vector> + +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" + +namespace ROCKSDB_NAMESPACE { +// ColumnFamilyCollector is a write batch handler which does nothing +// except recording unique column family IDs +class ColumnFamilyCollector : public WriteBatch::Handler { + std::unordered_set<uint32_t> column_family_ids_; + + Status AddColumnFamilyId(uint32_t column_family_id) { + column_family_ids_.insert(column_family_id); + return Status::OK(); + } + + public: + explicit ColumnFamilyCollector() {} + + ~ColumnFamilyCollector() override {} + + Status PutCF(uint32_t column_family_id, const Slice&, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status DeleteRangeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MergeCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status PutBlobIndexCF(uint32_t column_family_id, const Slice&, + const Slice&) override { + return AddColumnFamilyId(column_family_id); + } + + Status MarkBeginPrepare(bool) override { return Status::OK(); } + + Status MarkEndPrepare(const Slice&) override { return Status::OK(); } + + Status MarkRollback(const Slice&) override { return Status::OK(); } + + Status MarkCommit(const Slice&) override { return Status::OK(); } + + Status MarkCommitWithTimestamp(const Slice&, const Slice&) override { + return Status::OK(); + } + + Status MarkNoop(bool) override { return Status::OK(); } + + const std::unordered_set<uint32_t>& column_families() const { + return column_family_ids_; + } +}; + +Status CollectColumnFamilyIdsFromWriteBatch( + const WriteBatch& batch, std::vector<uint32_t>* column_family_ids); + +} // namespace ROCKSDB_NAMESPACE diff --git a/util/xxhash.h b/util/xxhash.h index ad49bab816d4..2b9c228835f2 100644 --- a/util/xxhash.h +++ b/util/xxhash.h @@ -11,9 +11,6 @@ #ifndef XXH_NAMESPACE #define XXH_NAMESPACE ROCKSDB_ #endif // !defined(XXH_NAMESPACE) - -// for FALLTHROUGH_INTENDED, inserted as appropriate -#include "port/lang.h" /* END RocksDB customizations */ //
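For orientation, here is a minimal caller of the new `CollectColumnFamilyIdsFromWriteBatch` helper added in util/write_batch_util above. This is a sketch only: the `DumpColumnFamilyIds` wrapper is illustrative and not part of the patch; it assumes the collected IDs are plain `uint32_t` values, as recorded by `ColumnFamilyCollector`.

#include <cstdio>
#include <vector>

#include "rocksdb/write_batch.h"
#include "util/write_batch_util.h"

// Illustrative helper (not in the patch): prints every column family ID that
// a WriteBatch touches, using the new collector-based utility.
ROCKSDB_NAMESPACE::Status DumpColumnFamilyIds(
    const ROCKSDB_NAMESPACE::WriteBatch& batch) {
  std::vector<uint32_t> cf_ids;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::CollectColumnFamilyIdsFromWriteBatch(batch, &cf_ids);
  if (s.ok()) {
    for (uint32_t id : cf_ids) {
      std::fprintf(stdout, "batch touches column family %u\n", id);
    }
  }
  return s;
}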
clang-format off diff --git a/util/xxph3.h b/util/xxph3.h index 968000c3a87f..2933b74dbf0f 100644 --- a/util/xxph3.h +++ b/util/xxph3.h @@ -386,10 +386,6 @@ typedef struct { #define XXPH_STATIC_LINKING_ONLY #endif -/* BEGIN RocksDB customizations */ -#include "port/lang.h" /* for FALLTHROUGH_INTENDED, inserted as appropriate */ -/* END RocksDB customizations */ - /* ************************************* * Compiler Specific Options ***************************************/ diff --git a/utilities/agg_merge/agg_merge.cc b/utilities/agg_merge/agg_merge.cc index a7eab1f1227c..8e5c536f5594 100644 --- a/utilities/agg_merge/agg_merge.cc +++ b/utilities/agg_merge/agg_merge.cc @@ -3,7 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#include "utilities/agg_merge/agg_merge.h" +#include "rocksdb/utilities/agg_merge.h" #include @@ -17,9 +17,9 @@ #include "port/likely.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" -#include "rocksdb/utilities/agg_merge.h" #include "rocksdb/utilities/options_type.h" #include "util/coding.h" +#include "utilities/agg_merge/agg_merge_impl.h" #include "utilities/merge_operators.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/agg_merge/agg_merge.h b/utilities/agg_merge/agg_merge_impl.h similarity index 100% rename from utilities/agg_merge/agg_merge.h rename to utilities/agg_merge/agg_merge_impl.h diff --git a/utilities/agg_merge/agg_merge_test.cc b/utilities/agg_merge/agg_merge_test.cc index a65441cd0a5e..67abcf142a27 100644 --- a/utilities/agg_merge/agg_merge_test.cc +++ b/utilities/agg_merge/agg_merge_test.cc @@ -12,7 +12,7 @@ #include "db/db_test_util.h" #include "rocksdb/options.h" #include "test_util/testharness.h" -#include "utilities/agg_merge/agg_merge.h" +#include "utilities/agg_merge/agg_merge_impl.h" #include "utilities/agg_merge/test_agg_merge.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/agg_merge/test_agg_merge.cc b/utilities/agg_merge/test_agg_merge.cc index 06e5b5697a31..63b89cccd69b 100644 --- a/utilities/agg_merge/test_agg_merge.cc +++ b/utilities/agg_merge/test_agg_merge.cc @@ -11,7 +11,7 @@ #include #include "util/coding.h" -#include "utilities/agg_merge/agg_merge.h" +#include "utilities/agg_merge/agg_merge_impl.h" namespace ROCKSDB_NAMESPACE { diff --git a/utilities/backup/backup_engine.cc b/utilities/backup/backup_engine.cc index d8f7d928a086..31a7337315f8 100644 --- a/utilities/backup/backup_engine.cc +++ b/utilities/backup/backup_engine.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#ifndef ROCKSDB_LITE #include #include @@ -50,7 +49,7 @@ #include "util/coding.h" #include "util/crc32c.h" #include "util/math.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" #include "util/string_util.h" #include "utilities/backup/backup_engine_impl.h" #include "utilities/checkpoint/checkpoint_impl.h" @@ -1584,7 +1583,7 @@ IOStatus BackupEngineImpl::CreateNewBackupWithMetadata( // we copied all the files, enable file deletions if (disabled.ok()) { // If we successfully disabled file deletions - db->EnableFileDeletions(false).PermitUncheckedError(); + db->EnableFileDeletions(/*force=*/false).PermitUncheckedError(); } auto backup_time = backup_env_->NowMicros() - start_backup; @@ -3354,4 +3353,3 @@ void TEST_SetDefaultRateLimitersClock( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/backup/backup_engine_impl.h b/utilities/backup/backup_engine_impl.h index 398f47f27251..a764b34d9b1c 100644 --- a/utilities/backup/backup_engine_impl.h +++ b/utilities/backup/backup_engine_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/backup_engine.h" @@ -33,4 +32,3 @@ void TEST_SetDefaultRateLimitersClock( const std::shared_ptr& backup_rate_limiter_clock = nullptr, const std::shared_ptr& restore_rate_limiter_clock = nullptr); } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/backup/backup_engine_test.cc b/utilities/backup/backup_engine_test.cc index d780a1b2b261..5ed6ae89513b 100644 --- a/utilities/backup/backup_engine_test.cc +++ b/utilities/backup/backup_engine_test.cc @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#if !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#if !defined(OS_WIN) #include "rocksdb/utilities/backup_engine.h" @@ -46,7 +46,7 @@ #include "util/cast_util.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/rate_limiter.h" +#include "util/rate_limiter_impl.h" #include "util/stderr_logger.h" #include "util/string_util.h" #include "utilities/backup/backup_engine_impl.h" @@ -589,7 +589,7 @@ void AssertExists(DB* db, int from, int to) { for (int i = from; i < to; ++i) { std::string key = "testkey" + std::to_string(i); std::string value; - Status s = db->Get(ReadOptions(), Slice(key), &value); + ASSERT_OK(db->Get(ReadOptions(), Slice(key), &value)); ASSERT_EQ(value, "testvalue" + std::to_string(i)); } } @@ -4308,13 +4308,13 @@ TEST_F(BackupEngineTest, ExcludeFiles) { for (auto be_pair : {std::make_pair(backup_engine_.get(), alt_backup_engine), std::make_pair(alt_backup_engine, backup_engine_.get())}) { - DestroyDB(dbname_, options_); + ASSERT_OK(DestroyDB(dbname_, options_)); RestoreOptions ro; // Fails without alternate dir ASSERT_TRUE(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro) .IsInvalidArgument()); - DestroyDB(dbname_, options_); + ASSERT_OK(DestroyDB(dbname_, options_)); // Works with alternate dir ro.alternate_dirs.push_front(be_pair.second); ASSERT_OK(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro)); @@ -4332,7 +4332,7 @@ TEST_F(BackupEngineTest, ExcludeFiles) { for (auto be_pair : {std::make_pair(backup_engine_.get(), alt_backup_engine), std::make_pair(alt_backup_engine, backup_engine_.get())}) { - DestroyDB(dbname_, options_); + ASSERT_OK(DestroyDB(dbname_, options_)); RestoreOptions ro; ro.alternate_dirs.push_front(be_pair.second); ASSERT_OK(be_pair.first->RestoreDBFromLatestBackup(dbname_, dbname_, ro)); @@ -4407,8 +4407,8 @@ int main(int argc, char** argv) { #include int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as BackupEngine is not supported in ROCKSDB_LITE\n"); + fprintf(stderr, "SKIPPED as BackupEngine is not supported in Windows\n"); return 0; } -#endif // !defined(ROCKSDB_LITE) && !defined(OS_WIN) +#endif // !defined(OS_WIN) diff --git a/utilities/blob_db/blob_compaction_filter.cc b/utilities/blob_db/blob_compaction_filter.cc index 86907e979b93..ddaa98c7d32a 100644 --- a/utilities/blob_db/blob_compaction_filter.cc +++ b/utilities/blob_db/blob_compaction_filter.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_compaction_filter.h" @@ -487,4 +486,3 @@ BlobIndexCompactionFilterFactoryGC::CreateCompactionFilter( } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_compaction_filter.h b/utilities/blob_db/blob_compaction_filter.h index 1493cfc1a539..cb83d0d034f5 100644 --- a/utilities/blob_db/blob_compaction_filter.h +++ b/utilities/blob_db/blob_compaction_filter.h @@ -3,12 +3,11 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
#pragma once -#ifndef ROCKSDB_LITE #include #include "db/blob/blob_index.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/compaction_filter.h" #include "utilities/blob_db/blob_db_gc_stats.h" #include "utilities/blob_db/blob_db_impl.h" @@ -201,4 +200,3 @@ class BlobIndexCompactionFilterFactoryGC } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc index cbd02e68e932..b6fe0390364d 100644 --- a/utilities/blob_db/blob_db.cc +++ b/utilities/blob_db/blob_db.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_db.h" @@ -111,4 +110,3 @@ void BlobDBOptions::Dump(Logger* log) const { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index e9d92486f9b8..e2f0b7bdbdd8 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -263,4 +262,3 @@ Status DestroyBlobDB(const std::string& dbname, const Options& options, } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_gc_stats.h b/utilities/blob_db/blob_db_gc_stats.h index fea6b003289a..12c11af85492 100644 --- a/utilities/blob_db/blob_db_gc_stats.h +++ b/utilities/blob_db/blob_db_gc_stats.h @@ -9,7 +9,6 @@ #include "rocksdb/rocksdb_namespace.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { @@ -53,4 +52,3 @@ class BlobDBGarbageCollectionStats { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 87e294c5c098..2fa7ae898f56 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -3,7 +3,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_db_impl.h" @@ -23,7 +22,7 @@ #include "file/writable_file_writer.h" #include "logging/logging.h" #include "monitoring/instrumented_mutex.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -270,7 +269,13 @@ Status BlobDBImpl::Open(std::vector* handles) { // Add trash files in blob dir to file delete scheduler. 
SstFileManagerImpl* sfm = static_cast( db_impl_->immutable_db_options().sst_file_manager.get()); - DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_); + s = DeleteScheduler::CleanupDirectory(env_, sfm, blob_dir_); + if (!s.ok()) { + ROCKS_LOG_ERROR(db_options_.info_log, + "Failed to clean up directory %s, status: %s", + blob_dir_.c_str(), s.ToString().c_str()); + return s; + } UpdateLiveSSTSize(); @@ -1143,7 +1148,7 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw, StopWatch compression_sw(clock_, statistics_, BLOB_DB_COMPRESSION_MICROS); CompressionType type = bdb_options_.compression; CompressionOptions opts; - CompressionContext context(type); + CompressionContext context(type, opts); CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(), type, 0 /* sample_for_compression */); CompressBlock(raw, info, &type, kBlockBasedTableVersionFormat, false, @@ -1385,28 +1390,46 @@ Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, return s; } -std::vector BlobDBImpl::MultiGet(const ReadOptions& read_options, +std::vector BlobDBImpl::MultiGet(const ReadOptions& _read_options, const std::vector& keys, std::vector* values) { StopWatch multiget_sw(clock_, statistics_, BLOB_DB_MULTIGET_MICROS); RecordTick(statistics_, BLOB_DB_NUM_MULTIGET); // Get a snapshot to avoid blob file get deleted between we // fetch and index entry and reading from the file. - ReadOptions ro(read_options); - bool snapshot_created = SetSnapshotIfNeeded(&ro); - std::vector statuses; - statuses.reserve(keys.size()); + std::size_t num_keys = keys.size(); + statuses.reserve(num_keys); + + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + statuses.push_back(s); + } + return statuses; + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + bool snapshot_created = SetSnapshotIfNeeded(&read_options); + values->clear(); values->reserve(keys.size()); PinnableSlice value; for (size_t i = 0; i < keys.size(); i++) { - statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value)); + statuses.push_back( + GetImpl(read_options, DefaultColumnFamily(), keys[i], &value)); values->push_back(value.ToString()); value.Reset(); } if (snapshot_created) { - db_->ReleaseSnapshot(ro.snapshot); + db_->ReleaseSnapshot(read_options.snapshot); } return statuses; } @@ -1545,12 +1568,12 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, if (reader->use_direct_io()) { s = reader->Read(IOOptions(), record_offset, static_cast(record_size), &blob_record, nullptr, - &aligned_buf, Env::IO_TOTAL /* rate_limiter_priority */); + &aligned_buf); } else { buf.reserve(static_cast(record_size)); s = reader->Read(IOOptions(), record_offset, static_cast(record_size), &blob_record, &buf[0], - nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + nullptr); } RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); } @@ -1610,16 +1633,36 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number, return Status::OK(); } -Status BlobDBImpl::Get(const ReadOptions& read_options, +Status BlobDBImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const 
Slice& key, PinnableSlice* value) { - return Get(read_options, column_family, key, value, - static_cast(nullptr) /*expiration*/); + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + return GetImpl(read_options, column_family, key, value); } -Status BlobDBImpl::Get(const ReadOptions& read_options, +Status BlobDBImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value, uint64_t* expiration) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + StopWatch get_sw(clock_, statistics_, BLOB_DB_GET_MICROS); RecordTick(statistics_, BLOB_DB_NUM_GET); return GetImpl(read_options, column_family, key, value, expiration); @@ -1878,7 +1921,7 @@ std::pair BlobDBImpl::EvictExpiredFiles(bool aborted) { } if (!blob_file->Immutable()) { - CloseBlobFile(blob_file); + CloseBlobFile(blob_file).PermitUncheckedError(); } assert(blob_file->Immutable()); @@ -2036,7 +2079,17 @@ void BlobDBImpl::CopyBlobFiles( } } -Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) { +Iterator* BlobDBImpl::NewIterator(const ReadOptions& _read_options) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } auto* cfd = static_cast_with_check(DefaultColumnFamily()) ->cfd(); @@ -2048,8 +2101,9 @@ Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) { own_snapshot = new ManagedSnapshot(db_); snapshot = own_snapshot->snapshot(); } + SuperVersion* sv = cfd->GetReferencedSuperVersion(db_impl_); auto* iter = db_impl_->NewIteratorImpl( - read_options, cfd, snapshot->GetSequenceNumber(), + read_options, cfd, sv, snapshot->GetSequenceNumber(), nullptr /*read_callback*/, true /*expose_blob_index*/); return new BlobDBIterator(own_snapshot, iter, this, clock_, statistics_); } @@ -2174,4 +2228,3 @@ void BlobDBImpl::TEST_ProcessCompactionJobInfo(const CompactionJobInfo& info) { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h index 0b4dbf5e53b4..d491108d3e68 100644 --- a/utilities/blob_db/blob_db_impl.h +++ b/utilities/blob_db/blob_db_impl.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -104,12 +103,13 @@ class BlobDBImpl : public BlobDB { const Slice& value) override; using BlobDB::Get; - Status Get(const ReadOptions& read_options, 
ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; - Status Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value, - uint64_t* expiration) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value, uint64_t* expiration) override; using BlobDB::NewIterator; virtual Iterator* NewIterator(const ReadOptions& read_options) override; @@ -124,7 +124,7 @@ class BlobDBImpl : public BlobDB { using BlobDB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& read_options, const std::vector& keys, + const ReadOptions& _read_options, const std::vector& keys, std::vector* values) override; using BlobDB::Write; @@ -490,7 +490,7 @@ class BlobDBImpl : public BlobDB { // Each call of DisableFileDeletions will increase disable_file_deletion_ // by 1. EnableFileDeletions will either decrease the count by 1 or reset - // it to zeor, depending on the force flag. + // it to zero, depending on the force flag. // // REQUIRES: access with delete_file_mutex_ held. int disable_file_deletions_ = 0; @@ -500,4 +500,3 @@ class BlobDBImpl : public BlobDB { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl_filesnapshot.cc b/utilities/blob_db/blob_db_impl_filesnapshot.cc index 87e3f33cc65f..da2d02d07a40 100644 --- a/utilities/blob_db/blob_db_impl_filesnapshot.cc +++ b/utilities/blob_db/blob_db_impl_filesnapshot.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "file/filename.h" #include "logging/logging.h" @@ -110,4 +109,3 @@ void BlobDBImpl::GetLiveFilesMetaData(std::vector* metadata) { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_iterator.h b/utilities/blob_db/blob_db_iterator.h index fd2b2f8f5273..4898ddfd7691 100644 --- a/utilities/blob_db/blob_db_iterator.h +++ b/utilities/blob_db/blob_db_iterator.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
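The BlobDB `Get`, `MultiGet`, and `NewIterator` changes above all apply the same `ReadOptions::io_activity` policy: reject an options struct whose activity is already set to something other than the expected value, otherwise copy the options, stamp the expected activity, and delegate to the internal `*Impl` method. A standalone sketch of that policy follows; `PrepareReadOptionsForActivity` is an illustrative name, not a function in the patch.

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"

// Sketch of the validate-then-stamp pattern used by the BlobDBImpl read paths
// above. `expected` would be Env::IOActivity::kGet, kMultiGet or kDBIterator.
ROCKSDB_NAMESPACE::Status PrepareReadOptionsForActivity(
    const ROCKSDB_NAMESPACE::ReadOptions& in,
    ROCKSDB_NAMESPACE::Env::IOActivity expected,
    ROCKSDB_NAMESPACE::ReadOptions* out) {
  if (in.io_activity != ROCKSDB_NAMESPACE::Env::IOActivity::kUnknown &&
      in.io_activity != expected) {
    return ROCKSDB_NAMESPACE::Status::InvalidArgument(
        "ReadOptions::io_activity is set to a conflicting value");
  }
  *out = in;  // copy, then stamp the expected activity if still unknown
  if (out->io_activity == ROCKSDB_NAMESPACE::Env::IOActivity::kUnknown) {
    out->io_activity = expected;
  }
  return ROCKSDB_NAMESPACE::Status::OK();
}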
#pragma once -#ifndef ROCKSDB_LITE #include "db/arena_wrapped_db_iter.h" #include "rocksdb/iterator.h" @@ -147,4 +146,3 @@ class BlobDBIterator : public Iterator { }; } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_listener.h b/utilities/blob_db/blob_db_listener.h index d17d2985356f..c95740c50e72 100644 --- a/utilities/blob_db/blob_db_listener.h +++ b/utilities/blob_db/blob_db_listener.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -23,7 +22,7 @@ class BlobDBListener : public EventListener { void OnFlushBegin(DB* /*db*/, const FlushJobInfo& /*info*/) override { assert(blob_db_impl_ != nullptr); - blob_db_impl_->SyncBlobFiles(); + blob_db_impl_->SyncBlobFiles().PermitUncheckedError(); } void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& /*info*/) override { @@ -68,4 +67,3 @@ class BlobDBListenerGC : public BlobDBListener { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 1744bda1ff42..57c0411caaeb 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_db.h" @@ -1180,6 +1179,12 @@ TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { options.statistics = statistics; Open(bdb_options, options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted", + "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + ASSERT_EQ(0, blob_db_impl()->TEST_live_sst_size()); std::string small_value(50, 'v'); std::map data; @@ -1189,10 +1194,15 @@ TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { ASSERT_OK(Put("key" + std::to_string(i), small_value, &data)); } ASSERT_OK(blob_db_->Flush(FlushOptions())); + uint64_t live_sst_size = 0; ASSERT_TRUE(blob_db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &live_sst_size)); ASSERT_TRUE(live_sst_size > 0); + + TEST_SYNC_POINT( + "BlobDBTest.FIFOEviction_NoEnoughBlobFilesToEvict:AfterFlush"); + ASSERT_EQ(live_sst_size, blob_db_impl()->TEST_live_sst_size()); bdb_options.max_db_size = live_sst_size + 2000; @@ -1216,6 +1226,8 @@ TEST_F(BlobDBTest, FIFOEviction_NoEnoughBlobFilesToEvict) { ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); // Verify large_key2 still exists. VerifyDB(data); + + SyncPoint::GetInstance()->DisableProcessing(); } // Test flush or compaction will trigger FIFO eviction since they update @@ -1234,6 +1246,12 @@ TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) { options.compression = kNoCompression; Open(bdb_options, options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted", + "BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + std::string value(800, 'v'); ASSERT_OK(PutWithTTL("large_key", value, 60)); ASSERT_EQ(1, blob_db_impl()->TEST_GetBlobFiles().size()); @@ -1247,11 +1265,15 @@ TEST_F(BlobDBTest, FIFOEviction_TriggerOnSSTSizeChange) { } ASSERT_OK(blob_db_->Flush(FlushOptions())); + TEST_SYNC_POINT("BlobDBTest.FIFOEviction_TriggerOnSSTSizeChange:AfterFlush"); + // Verify large_key is deleted by FIFO eviction. 
blob_db_impl()->TEST_DeleteObsoleteFiles(); ASSERT_EQ(0, blob_db_impl()->TEST_GetBlobFiles().size()); ASSERT_EQ(1, statistics->getTickerCount(BLOB_DB_FIFO_NUM_FILES_EVICTED)); VerifyDB(data); + + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(BlobDBTest, InlineSmallValues) { @@ -1630,6 +1652,12 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) { options.disable_auto_compactions = true; Open(bdb_options, options); + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted", + "BlobDBTest.FilterForFIFOEviction:AfterFlush"}}); + + SyncPoint::GetInstance()->EnableProcessing(); + std::map data; std::map data_after_compact; // Insert some small values that will be inlined. @@ -1644,6 +1672,9 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) { } uint64_t num_keys_to_evict = data.size() - data_after_compact.size(); ASSERT_OK(blob_db_->Flush(FlushOptions())); + + TEST_SYNC_POINT("BlobDBTest.FilterForFIFOEviction:AfterFlush"); + uint64_t live_sst_size = blob_db_impl()->TEST_live_sst_size(); ASSERT_GT(live_sst_size, 0); VerifyDB(data); @@ -1695,6 +1726,8 @@ TEST_F(BlobDBTest, FilterForFIFOEviction) { data_after_compact["large_key2"] = large_value; data_after_compact["large_key3"] = large_value; VerifyDB(data_after_compact); + + SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(BlobDBTest, GarbageCollection) { @@ -1996,7 +2029,7 @@ TEST_F(BlobDBTest, DisableFileDeletions) { ASSERT_EQ(1, blob_db_impl()->TEST_GetObsoleteFiles().size()); VerifyDB(data); // Call EnableFileDeletions a second time. - ASSERT_OK(blob_db_->EnableFileDeletions(false)); + ASSERT_OK(blob_db_->EnableFileDeletions(/*force=*/false)); blob_db_impl()->TEST_DeleteObsoleteFiles(); } // Regardless of value of `force`, file should be deleted by now. @@ -2387,13 +2420,3 @@ int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as BlobDB is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/utilities/blob_db/blob_dump_tool.cc b/utilities/blob_db/blob_dump_tool.cc index 1e063299015a..0c2fef5e1563 100644 --- a/utilities/blob_db/blob_dump_tool.cc +++ b/utilities/blob_db/blob_dump_tool.cc @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_dump_tool.h" @@ -103,8 +102,8 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) { } buffer_.reset(new char[buffer_size_]); } - Status s = reader_->Read(IOOptions(), offset, size, result, buffer_.get(), - nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + Status s = + reader_->Read(IOOptions(), offset, size, result, buffer_.get(), nullptr); if (!s.ok()) { return s; } @@ -278,5 +277,3 @@ std::string BlobDumpTool::GetString(std::pair p) { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_dump_tool.h b/utilities/blob_db/blob_dump_tool.h index bece564e1cc9..12cd1bf42210 100644 --- a/utilities/blob_db/blob_dump_tool.h +++ b/utilities/blob_db/blob_dump_tool.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
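The FIFO-eviction tests above use a sync-point dependency so their assertions only run after the `OnFlushCompleted` listener has refreshed the tracked SST size. A condensed, illustrative test body showing the same handshake (the test name and sync-point label are made up; the fixture, `blob_db_`, and `blob_db_impl()` come from blob_db_test.cc, and the DB is assumed to be opened via the fixture's `Open()` helper):

TEST_F(BlobDBTest, IllustrativeFlushHandshake) {
  SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
        "BlobDBTest.IllustrativeFlushHandshake:AfterFlush"}});
  SyncPoint::GetInstance()->EnableProcessing();

  ASSERT_OK(blob_db_->Put(WriteOptions(), "key", "value"));
  ASSERT_OK(blob_db_->Flush(FlushOptions()));

  // Blocks until the flush-completed listener has run, so the SST size
  // observed below is the post-flush value rather than a racy snapshot.
  TEST_SYNC_POINT("BlobDBTest.IllustrativeFlushHandshake:AfterFlush");
  ASSERT_GT(blob_db_impl()->TEST_live_sst_size(), 0);

  SyncPoint::GetInstance()->DisableProcessing();
}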
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -55,4 +54,3 @@ class BlobDumpTool { } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc index c68e557c6762..5b31d5697320 100644 --- a/utilities/blob_db/blob_file.cc +++ b/utilities/blob_db/blob_file.cc @@ -3,7 +3,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/blob_db/blob_file.h" #include @@ -115,13 +114,11 @@ Status BlobFile::ReadFooter(BlobLogFooter* bf) { // TODO: rate limit reading footers from blob files. if (ra_file_reader_->use_direct_io()) { s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize, - &result, nullptr, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + &result, nullptr, &aligned_buf); } else { buf.reserve(BlobLogFooter::kSize + 10); s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize, - &result, &buf[0], nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + &result, &buf[0], nullptr); } if (!s.ok()) return s; if (result.size() != BlobLogFooter::kSize) { @@ -239,13 +236,11 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, // TODO: rate limit reading headers from blob files. if (file_reader->use_direct_io()) { s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, - nullptr, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + nullptr, &aligned_buf); } else { header_buf.reserve(BlobLogHeader::kSize); s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice, - &header_buf[0], nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + &header_buf[0], nullptr); } if (!s.ok()) { ROCKS_LOG_ERROR( @@ -282,13 +277,12 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, if (file_reader->use_direct_io()) { s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, BlobLogFooter::kSize, &footer_slice, nullptr, - &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + &aligned_buf); } else { footer_buf.reserve(BlobLogFooter::kSize); s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize, BlobLogFooter::kSize, &footer_slice, &footer_buf[0], - nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + nullptr); } if (!s.ok()) { ROCKS_LOG_ERROR( @@ -315,4 +309,3 @@ Status BlobFile::ReadMetadata(const std::shared_ptr& fs, } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_file.h b/utilities/blob_db/blob_file.h index 6f3f2bea7c3a..8651c6b67282 100644 --- a/utilities/blob_db/blob_file.h +++ b/utilities/blob_db/blob_file.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -243,4 +242,3 @@ class BlobFile { }; } // namespace blob_db } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/cache_dump_load.cc b/utilities/cache_dump_load.cc index 9a7c7679875e..bbe02a934012 100644 --- a/utilities/cache_dump_load.cc +++ b/utilities/cache_dump_load.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #include "rocksdb/utilities/cache_dump_load.h" @@ -66,4 +65,3 @@ Status NewDefaultCacheDumpedLoader( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/cache_dump_load_impl.cc b/utilities/cache_dump_load_impl.cc index b5e21291b34f..52f2a4df7d97 100644 --- a/utilities/cache_dump_load_impl.cc +++ b/utilities/cache_dump_load_impl.cc @@ -5,7 +5,6 @@ #include "cache/cache_key.h" #include "table/block_based/block_based_table_reader.h" -#ifndef ROCKSDB_LITE #include "cache/cache_entry_roles.h" #include "file/writable_file_writer.h" @@ -368,4 +367,3 @@ IOStatus CacheDumpedLoaderImpl::ReadCacheBlock(std::string* data, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/cache_dump_load_impl.h b/utilities/cache_dump_load_impl.h index ad637b00dd79..59cabbf3b68d 100644 --- a/utilities/cache_dump_load_impl.h +++ b/utilities/cache_dump_load_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include @@ -250,8 +249,7 @@ class FromFileCacheDumpReader : public CacheDumpReader { while (to_read > 0) { io_s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, - buffer_, nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + buffer_, nullptr); if (!io_s.ok()) { return io_s; } @@ -355,4 +353,3 @@ class CacheDumperHelper { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/cassandra/cassandra_compaction_filter.cc b/utilities/cassandra/cassandra_compaction_filter.cc index 4e48d63aabeb..b7da2ba0cbc6 100644 --- a/utilities/cassandra/cassandra_compaction_filter.cc +++ b/utilities/cassandra/cassandra_compaction_filter.cc @@ -17,7 +17,6 @@ namespace ROCKSDB_NAMESPACE { namespace cassandra { static std::unordered_map cassandra_filter_type_info = { -#ifndef ROCKSDB_LITE {"purge_ttl_on_expiration", {offsetof(struct CassandraOptions, purge_ttl_on_expiration), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -26,7 +25,6 @@ static std::unordered_map {offsetof(struct CassandraOptions, gc_grace_period_in_seconds), OptionType::kUInt32T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; CassandraCompactionFilter::CassandraCompactionFilter( @@ -77,7 +75,6 @@ CassandraCompactionFilterFactory::CreateCompactionFilter( return result; } -#ifndef ROCKSDB_LITE int RegisterCassandraObjects(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -105,6 +102,5 @@ int RegisterCassandraObjects(ObjectLibrary& library, size_t num_types; return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE } // namespace cassandra } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index c5be836e8f55..e3266a0dc11a 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -343,7 +343,6 @@ TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) { ASSERT_FALSE(std::get<0>(store.Get("k1"))); } -#ifndef ROCKSDB_LITE TEST_F(CassandraFunctionalTest, LoadMergeOperator) { ConfigOptions config_options; std::shared_ptr mo; @@ -434,7 +433,6 @@ TEST_F(CassandraFunctionalTest, LoadCompactionFilterFactory) { ASSERT_EQ(opts->gc_grace_period_in_seconds, 42); ASSERT_TRUE(opts->purge_ttl_on_expiration); } -#endif // ROCKSDB_LITE } // namespace cassandra } // namespace ROCKSDB_NAMESPACE diff --git 
a/utilities/cassandra/cassandra_options.h b/utilities/cassandra/cassandra_options.h index efa73a308360..e0527aada663 100644 --- a/utilities/cassandra/cassandra_options.h +++ b/utilities/cassandra/cassandra_options.h @@ -34,10 +34,8 @@ struct CassandraOptions { // data back. bool purge_ttl_on_expiration; }; -#ifndef ROCKSDB_LITE extern "C" { int RegisterCassandraObjects(ObjectLibrary& library, const std::string& arg); } // extern "C" -#endif // ROCKSDB_LITE } // namespace cassandra } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/cassandra/merge_operator.cc b/utilities/cassandra/merge_operator.cc index bde5dcbaddb2..366d8fa4435f 100644 --- a/utilities/cassandra/merge_operator.cc +++ b/utilities/cassandra/merge_operator.cc @@ -19,7 +19,6 @@ namespace ROCKSDB_NAMESPACE { namespace cassandra { static std::unordered_map merge_operator_options_info = { -#ifndef ROCKSDB_LITE {"gc_grace_period_in_seconds", {offsetof(struct CassandraOptions, gc_grace_period_in_seconds), OptionType::kUInt32T, OptionVerificationType::kNormal, @@ -27,7 +26,6 @@ static std::unordered_map {"operands_limit", {offsetof(struct CassandraOptions, operands_limit), OptionType::kSizeT, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; CassandraValueMergeOperator::CassandraValueMergeOperator( diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index cdea325cd71c..e1f094513092 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -#ifndef ROCKSDB_LITE #include "utilities/checkpoint/checkpoint_impl.h" @@ -149,7 +148,7 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir, // we copied all the files, enable file deletions if (disabled_file_deletions) { - Status ss = db_->EnableFileDeletions(false); + Status ss = db_->EnableFileDeletions(/*force=*/false); assert(ss.ok()); ss.PermitUncheckedError(); } @@ -338,7 +337,7 @@ Status CheckpointImpl::ExportColumnFamily( nullptr, Temperature::kUnknown); } /*copy_file_cb*/); - const auto enable_status = db_->EnableFileDeletions(false /*force*/); + const auto enable_status = db_->EnableFileDeletions(/*force=*/false); if (s.ok()) { s = enable_status; } @@ -373,17 +372,19 @@ Status CheckpointImpl::ExportColumnFamily( for (const auto& file_metadata : level_metadata.files) { LiveFileMetaData live_file_metadata; live_file_metadata.size = file_metadata.size; - live_file_metadata.name = std::move(file_metadata.name); + live_file_metadata.name = file_metadata.name; live_file_metadata.file_number = file_metadata.file_number; live_file_metadata.db_path = export_dir; live_file_metadata.smallest_seqno = file_metadata.smallest_seqno; live_file_metadata.largest_seqno = file_metadata.largest_seqno; - live_file_metadata.smallestkey = std::move(file_metadata.smallestkey); - live_file_metadata.largestkey = std::move(file_metadata.largestkey); + live_file_metadata.smallestkey = file_metadata.smallestkey; + live_file_metadata.largestkey = file_metadata.largestkey; live_file_metadata.oldest_blob_file_number = file_metadata.oldest_blob_file_number; live_file_metadata.epoch_number = file_metadata.epoch_number; live_file_metadata.level = level_metadata.level; + live_file_metadata.smallest = file_metadata.smallest; + live_file_metadata.largest = file_metadata.largest; result_metadata->files.push_back(live_file_metadata); } *metadata = 
result_metadata; @@ -467,4 +468,3 @@ Status CheckpointImpl::ExportFilesInMetaData( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/checkpoint/checkpoint_impl.h b/utilities/checkpoint/checkpoint_impl.h index 2947330ccef8..3cb9a6477f0d 100644 --- a/utilities/checkpoint/checkpoint_impl.h +++ b/utilities/checkpoint/checkpoint_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include @@ -63,4 +62,3 @@ class CheckpointImpl : public Checkpoint { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 3da753d5f3ba..a9cea1c058fd 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. // Syncpoint prevents us building and running tests in release -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/checkpoint.h" #ifndef OS_WIN @@ -461,8 +460,8 @@ TEST_F(CheckpointTest, CheckpointCF) { Options options = CurrentOptions(); CreateAndReopenWithCF({"one", "two", "three", "four", "five"}, options); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"CheckpointTest::CheckpointCF:2", "DBImpl::GetLiveFiles:2"}, - {"DBImpl::GetLiveFiles:1", "CheckpointTest::CheckpointCF:1"}}); + {{"CheckpointTest::CheckpointCF:2", "DBImpl::FlushAllColumnFamilies:2"}, + {"DBImpl::FlushAllColumnFamilies:1", "CheckpointTest::CheckpointCF:1"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); @@ -855,6 +854,30 @@ TEST_F(CheckpointTest, CheckpointReadOnlyDB) { delete snapshot_db; } +TEST_F(CheckpointTest, CheckpointWithLockWAL) { + Options options = CurrentOptions(); + ASSERT_OK(Put("foo", "foo_value")); + + ASSERT_OK(db_->LockWAL()); + + Checkpoint* checkpoint = nullptr; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name_)); + delete checkpoint; + checkpoint = nullptr; + + ASSERT_OK(db_->UnlockWAL()); + Close(); + + DB* snapshot_db = nullptr; + ASSERT_OK(DB::Open(options, snapshot_name_, &snapshot_db)); + ReadOptions read_opts; + std::string get_result; + ASSERT_OK(snapshot_db->Get(read_opts, "foo", &get_result)); + ASSERT_EQ("foo_value", get_result); + delete snapshot_db; +} + TEST_F(CheckpointTest, CheckpointReadOnlyDBWithMultipleColumnFamilies) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "eevee"}, options); @@ -902,7 +925,7 @@ TEST_F(CheckpointTest, CheckpointWithDbPath) { options.db_paths.emplace_back(dbname_ + "_2", 0); Reopen(options); ASSERT_OK(Put("key1", "val1")); - Flush(); + ASSERT_OK(Flush()); Checkpoint* checkpoint; ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); // Currently not supported @@ -945,7 +968,7 @@ TEST_F(CheckpointTest, PutRaceWithCheckpointTrackedWalSync) { // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the // DB WAL. - fault_env->DropUnsyncedFileData(); + ASSERT_OK(fault_env->DropUnsyncedFileData()); // Before the bug fix, reopening the DB would fail because the MANIFEST's // AddWal entry indicated the WAL should be synced through "key2" -> "val2". 
@@ -962,13 +985,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Checkpoint is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/utilities/compaction_filters.cc b/utilities/compaction_filters.cc index 8763901c3dc1..999f723651f0 100644 --- a/utilities/compaction_filters.cc +++ b/utilities/compaction_filters.cc @@ -13,7 +13,6 @@ #include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE static int RegisterBuiltinCompactionFilters(ObjectLibrary& library, const std::string& /*arg*/) { library.AddFactory( @@ -25,19 +24,16 @@ static int RegisterBuiltinCompactionFilters(ObjectLibrary& library, }); return 1; } -#endif // ROCKSDB_LITE Status CompactionFilter::CreateFromString(const ConfigOptions& config_options, const std::string& value, const CompactionFilter** result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinCompactionFilters(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE CompactionFilter* filter = const_cast(*result); - Status status = LoadStaticObject(config_options, value, - nullptr, &filter); + Status status = + LoadStaticObject(config_options, value, &filter); if (status.ok()) { *result = const_cast(filter); } @@ -49,8 +45,8 @@ Status CompactionFilterFactory::CreateFromString( std::shared_ptr* result) { // Currently there are no builtin CompactionFilterFactories. // If any are introduced, they need to be registered here. - Status status = LoadSharedObject( - config_options, value, nullptr, result); + Status status = + LoadSharedObject(config_options, value, result); return status; } } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc index b788dbf9b065..0a0c585f8895 100644 --- a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +++ b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc @@ -3,7 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE + +#include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" #include "utilities/compaction_filters/remove_emptyvalue_compactionfilter.h" @@ -23,4 +24,3 @@ bool RemoveEmptyValueCompactionFilter::Filter(int /*level*/, } } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h index 864ad15ffafb..2a56bd6c2ecd 100644 --- a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +++ b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #pragma once @@ -25,4 +24,3 @@ class RemoveEmptyValueCompactionFilter : public CompactionFilter { }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/debug.cc b/utilities/debug.cc index f2c3bb513769..911bc510a6ac 100644 --- a/utilities/debug.cc +++ b/utilities/debug.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/debug.h" @@ -117,4 +116,3 @@ Status GetAllKeyVersions(DB* db, ColumnFamilyHandle* cfh, Slice begin_key, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/env_mirror.cc b/utilities/env_mirror.cc index 3ea323b42975..0802d7c708fa 100644 --- a/utilities/env_mirror.cc +++ b/utilities/env_mirror.cc @@ -7,7 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/env_mirror.h" @@ -272,4 +271,3 @@ Status EnvMirror::ReuseWritableFile(const std::string& fname, } } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/env_mirror_test.cc b/utilities/env_mirror_test.cc index c372de1da5ee..ad4cc936686c 100644 --- a/utilities/env_mirror_test.cc +++ b/utilities/env_mirror_test.cc @@ -4,7 +4,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/env_mirror.h" @@ -215,12 +214,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int argc, char** argv) { - fprintf(stderr, "SKIPPED as EnvMirror is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/utilities/env_timed.cc b/utilities/env_timed.cc index 1eb7231463fa..01fdfccaf322 100644 --- a/utilities/env_timed.cc +++ b/utilities/env_timed.cc @@ -12,7 +12,6 @@ namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE TimedFileSystem::TimedFileSystem(const std::shared_ptr& base) : FileSystemWrapper(base) {} IOStatus TimedFileSystem::NewSequentialFile( @@ -178,10 +177,5 @@ Env* NewTimedEnv(Env* base_env) { return new CompositeEnvWrapper(base_env, timed_fs); } -#else // ROCKSDB_LITE - -Env* NewTimedEnv(Env* /*base_env*/) { return nullptr; } - -#endif // !ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/env_timed.h b/utilities/env_timed.h index 2d34fd59012c..6c6ac89fb431 100644 --- a/utilities/env_timed.h +++ b/utilities/env_timed.h @@ -7,7 +7,6 @@ #pragma once #include "rocksdb/file_system.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE class TimedFileSystem : public FileSystemWrapper { public: explicit TimedFileSystem(const std::shared_ptr& base); @@ -93,5 +92,4 @@ class TimedFileSystem : public FileSystemWrapper { IODebugContext* dbg) override; }; -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/env_timed_test.cc b/utilities/env_timed_test.cc index 6e392579d2bf..3099fb74c7be 100644 --- a/utilities/env_timed_test.cc +++ b/utilities/env_timed_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
-#ifndef ROCKSDB_LITE #include "rocksdb/env.h" #include "rocksdb/perf_context.h" @@ -33,12 +32,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else // ROCKSDB_LITE -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as TimedEnv is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/fault_injection_env.h b/utilities/fault_injection_env.h index 549bfe7168ac..6c1623a8d304 100644 --- a/utilities/fault_injection_env.h +++ b/utilities/fault_injection_env.h @@ -96,6 +96,7 @@ class TestWritableFile : public WritableFile { virtual bool use_direct_io() const override { return target_->use_direct_io(); }; + uint64_t GetFileSize() final { return target_->GetFileSize(); } private: FileState state_; diff --git a/utilities/fault_injection_fs.cc b/utilities/fault_injection_fs.cc index 5261d79ea1c0..53bbaeb07937 100644 --- a/utilities/fault_injection_fs.cc +++ b/utilities/fault_injection_fs.cc @@ -242,6 +242,7 @@ IOStatus TestFSWritableFile::PositionedAppend( IOStatus TestFSWritableFile::Close(const IOOptions& options, IODebugContext* dbg) { MutexLock l(&mutex_); + fs_->WritableFileClosed(state_); if (!fs_->IsFilesystemActive()) { return fs_->GetError(); } @@ -263,7 +264,6 @@ IOStatus TestFSWritableFile::Close(const IOOptions& options, io_s = target_->Close(options, dbg); } if (io_s.ok()) { - fs_->WritableFileClosed(state_); IOStatus in_s = fs_->InjectMetadataWriteError(); if (!in_s.ok()) { return in_s; @@ -408,7 +408,7 @@ IOStatus TestFSRandomAccessFile::Read(uint64_t offset, size_t n, scratch, /*need_count_increase=*/true, /*fault_injected=*/nullptr); } if (s.ok() && fs_->ShouldInjectRandomReadError()) { - return IOStatus::IOError("Injected read error"); + return IOStatus::IOError("injected read error"); } return s; } @@ -430,7 +430,7 @@ IOStatus TestFSRandomAccessFile::ReadAsync( } if (ret.ok()) { if (fs_->ShouldInjectRandomReadError()) { - ret = IOStatus::IOError("Injected read error"); + ret = IOStatus::IOError("injected read error"); } else { s = target_->ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, nullptr); } @@ -470,7 +470,7 @@ IOStatus TestFSRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, /*fault_injected=*/nullptr); } if (s.ok() && fs_->ShouldInjectRandomReadError()) { - return IOStatus::IOError("Injected read error"); + return IOStatus::IOError("injected read error"); } return s; } @@ -487,7 +487,7 @@ IOStatus TestFSSequentialFile::Read(size_t n, const IOOptions& options, IODebugContext* dbg) { IOStatus s = target()->Read(n, options, result, scratch, dbg); if (s.ok() && fs_->ShouldInjectRandomReadError()) { - return IOStatus::IOError("Injected seq read error"); + return IOStatus::IOError("injected seq read error"); } return s; } @@ -499,7 +499,7 @@ IOStatus TestFSSequentialFile::PositionedRead(uint64_t offset, size_t n, IOStatus s = target()->PositionedRead(offset, n, options, result, scratch, dbg); if (s.ok() && fs_->ShouldInjectRandomReadError()) { - return IOStatus::IOError("Injected seq positioned read error"); + return IOStatus::IOError("injected seq positioned read error"); } return s; } @@ -678,7 +678,7 @@ IOStatus FaultInjectionTestFS::NewRandomAccessFile( return GetError(); } if (ShouldInjectRandomReadError()) { - return IOStatus::IOError("Injected error when open random access file"); + return IOStatus::IOError("injected error when open random access file"); } IOStatus io_s = InjectThreadSpecificReadError(ErrorOperation::kOpen, nullptr, false, nullptr, @@ -701,7 
+701,7 @@ IOStatus FaultInjectionTestFS::NewSequentialFile( } if (ShouldInjectRandomReadError()) { - return IOStatus::IOError("Injected read error when creating seq file"); + return IOStatus::IOError("injected read error when creating seq file"); } IOStatus io_s = target()->NewSequentialFile(fname, file_opts, result, dbg); if (io_s.ok()) { @@ -956,6 +956,7 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( return IOStatus::OK(); } + IOStatus ret; if (ctx->rand.OneIn(ctx->one_in)) { if (ctx->count == 0) { ctx->message = ""; @@ -970,15 +971,15 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( if (op != ErrorOperation::kMultiReadSingleReq) { // Likely non-per read status code for MultiRead - ctx->message += "error; "; + ctx->message += "injected read error; "; ret_fault_injected = true; - return IOStatus::IOError(); + ret = IOStatus::IOError(ctx->message); } else if (Random::GetTLSInstance()->OneIn(8)) { assert(result); // For a small chance, set the failure to status but turn the // result to be empty, which is supposed to be caught for a check. *result = Slice(); - ctx->message += "inject empty result; "; + ctx->message += "injected empty result; "; ret_fault_injected = true; } else if (!direct_io && Random::GetTLSInstance()->OneIn(7) && scratch != nullptr && result->data() == scratch) { @@ -995,15 +996,18 @@ IOStatus FaultInjectionTestFS::InjectThreadSpecificReadError( // It would work for CRC. Not 100% sure for xxhash and will adjust // if it is not the case. const_cast(result->data())[result->size() - 1]++; - ctx->message += "corrupt last byte; "; + ctx->message += "injected corrupt last byte; "; ret_fault_injected = true; } else { - ctx->message += "error result multiget single; "; + ctx->message += "injected error result multiget single; "; ret_fault_injected = true; - return IOStatus::IOError(); + ret = IOStatus::IOError(ctx->message); } } - return IOStatus::OK(); + if (ctx->retryable) { + ret.SetRetryable(true); + } + return ret; } bool FaultInjectionTestFS::TryParseFileName(const std::string& file_name, @@ -1052,7 +1056,7 @@ IOStatus FaultInjectionTestFS::InjectMetadataWriteError() { } } TEST_SYNC_POINT("FaultInjectionTestFS::InjectMetadataWriteError:Injected"); - return IOStatus::IOError(); + return IOStatus::IOError("injected metadata write error"); } void FaultInjectionTestFS::PrintFaultBacktrace() { diff --git a/utilities/fault_injection_fs.h b/utilities/fault_injection_fs.h index cab0051bd144..afd770dde07f 100644 --- a/utilities/fault_injection_fs.h +++ b/utilities/fault_injection_fs.h @@ -323,8 +323,8 @@ class FaultInjectionTestFS : public FileSystemWrapper { if (!TryParseFileName(file_name, &file_number, &file_type)) { return false; } - return skip_direct_writable_types_.find(file_type) != - skip_direct_writable_types_.end(); + return direct_writable_types_.find(file_type) != + direct_writable_types_.end(); } void SetFilesystemActiveNoLock( bool active, IOStatus error = IOStatus::Corruption("Not active")) { @@ -402,7 +402,8 @@ class FaultInjectionTestFS : public FileSystemWrapper { // seed is the seed for the random number generator, and one_in determines // the probability of injecting error (i.e an error is injected with // 1/one_in probability) - void SetThreadLocalReadErrorContext(uint32_t seed, int one_in) { + void SetThreadLocalReadErrorContext(uint32_t seed, int one_in, + bool retryable) { struct ErrorContext* ctx = static_cast(thread_local_error_->Get()); if (ctx == nullptr) { @@ -411,6 +412,7 @@ class FaultInjectionTestFS : public 
FileSystemWrapper { } ctx->one_in = one_in; ctx->count = 0; + ctx->retryable = retryable; } static void DeleteThreadLocalErrorContext(void* p) { @@ -437,9 +439,9 @@ class FaultInjectionTestFS : public FileSystemWrapper { write_error_allowed_types_ = types; } - void SetSkipDirectWritableTypes(const std::set& types) { + void SetDirectWritableTypes(const std::set& types) { MutexLock l(&mutex_); - skip_direct_writable_types_ = types; + direct_writable_types_ = types; } void SetRandomMetadataWriteError(int one_in) { @@ -556,12 +558,14 @@ class FaultInjectionTestFS : public FileSystemWrapper { std::string message; int frames; ErrorType type; + bool retryable; explicit ErrorContext(uint32_t seed) : rand(seed), enable_error_injection(false), callstack(nullptr), - frames(0) {} + frames(0), + retryable(false) {} ~ErrorContext() { if (callstack) { free(callstack); @@ -579,7 +583,7 @@ class FaultInjectionTestFS : public FileSystemWrapper { bool inject_for_all_file_types_; std::vector write_error_allowed_types_; // File types where direct writable is skipped. - std::set skip_direct_writable_types_; + std::set direct_writable_types_; bool ingest_data_corruption_before_write_; ChecksumType checksum_handoff_func_tpye_; bool fail_get_file_unique_id_; diff --git a/utilities/fault_injection_secondary_cache.cc b/utilities/fault_injection_secondary_cache.cc index d24e92f06ff6..c2ea12535bc8 100644 --- a/utilities/fault_injection_secondary_cache.cc +++ b/utilities/fault_injection_secondary_cache.cc @@ -78,13 +78,13 @@ FaultInjectionSecondaryCache::GetErrorContext() { Status FaultInjectionSecondaryCache::Insert( const Slice& key, Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper) { + const Cache::CacheItemHelper* helper, bool force_insert) { ErrorContext* ctx = GetErrorContext(); if (ctx->rand.OneIn(prob_)) { return Status::IOError(); } - return base_->Insert(key, value, helper); + return base_->Insert(key, value, helper, force_insert); } std::unique_ptr @@ -92,18 +92,18 @@ FaultInjectionSecondaryCache::Lookup(const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, bool wait, bool advise_erase, - bool& is_in_sec_cache) { + bool& kept_in_sec_cache) { ErrorContext* ctx = GetErrorContext(); if (base_is_compressed_sec_cache_) { if (ctx->rand.OneIn(prob_)) { return nullptr; } else { return base_->Lookup(key, helper, create_context, wait, advise_erase, - is_in_sec_cache); + kept_in_sec_cache); } } else { std::unique_ptr hdl = base_->Lookup( - key, helper, create_context, wait, advise_erase, is_in_sec_cache); + key, helper, create_context, wait, advise_erase, kept_in_sec_cache); if (wait && ctx->rand.OneIn(prob_)) { hdl.reset(); } diff --git a/utilities/fault_injection_secondary_cache.h b/utilities/fault_injection_secondary_cache.h index 47585e30e0ac..dd73ac15630c 100644 --- a/utilities/fault_injection_secondary_cache.h +++ b/utilities/fault_injection_secondary_cache.h @@ -32,12 +32,18 @@ class FaultInjectionSecondaryCache : public SecondaryCache { const char* Name() const override { return "FaultInjectionSecondaryCache"; } Status Insert(const Slice& key, Cache::ObjectPtr value, - const Cache::CacheItemHelper* helper) override; + const Cache::CacheItemHelper* helper, + bool force_insert) override; + + Status InsertSaved(const Slice& /*key*/, const Slice& /*saved*/, + CompressionType /*type*/, CacheTier /*source*/) override { + return Status::OK(); + } std::unique_ptr Lookup( const Slice& key, const Cache::CacheItemHelper* helper, Cache::CreateContext* create_context, 
bool wait, bool advise_erase, - bool& is_in_sec_cache) override; + bool& kept_in_sec_cache) override; bool SupportForceErase() const override { return base_->SupportForceErase(); } diff --git a/utilities/leveldb_options/leveldb_options.cc b/utilities/leveldb_options/leveldb_options.cc index 125c3d9565c7..f81e59b83ad7 100644 --- a/utilities/leveldb_options/leveldb_options.cc +++ b/utilities/leveldb_options/leveldb_options.cc @@ -9,7 +9,7 @@ #include "rocksdb/utilities/leveldb_options.h" -#include "rocksdb/cache.h" +#include "rocksdb/advanced_cache.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 0b043af0eaea..8255a6cad797 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "db/db_impl/db_impl.h" #include "rocksdb/cache.h" @@ -41,7 +40,6 @@ class MemoryTest : public testing::Test { const auto bbto = factory->GetOptions(); if (bbto != nullptr) { cache_set->insert(bbto->block_cache.get()); - cache_set->insert(bbto->block_cache_compressed.get()); } } @@ -269,11 +267,3 @@ int main(int argc, char** argv) { #endif } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - printf("Skipped in RocksDBLite as utilities are not supported.\n"); - return 0; -} -#endif // !ROCKSDB_LITE diff --git a/utilities/memory/memory_util.cc b/utilities/memory/memory_util.cc index 13c81aec4a9f..62042192f910 100644 --- a/utilities/memory/memory_util.cc +++ b/utilities/memory/memory_util.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/memory_util.h" @@ -49,4 +48,3 @@ Status MemoryUtil::GetApproximateMemoryUsageByType( return Status::OK(); } } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/merge_operators.cc b/utilities/merge_operators.cc index c97e9ce2514d..020064d0924a 100644 --- a/utilities/merge_operators.cc +++ b/utilities/merge_operators.cc @@ -12,46 +12,14 @@ #include "rocksdb/utilities/customizable_util.h" #include "rocksdb/utilities/object_registry.h" #include "utilities/merge_operators/bytesxor.h" +#include "utilities/merge_operators/max_operator.h" +#include "utilities/merge_operators/put_operator.h" #include "utilities/merge_operators/sortlist.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/merge_operators/string_append/stringappend2.h" +#include "utilities/merge_operators/uint64add.h" namespace ROCKSDB_NAMESPACE { -static bool LoadMergeOperator(const std::string& id, - std::shared_ptr* result) { - bool success = true; - // TODO: Hook the "name" up to the actual Name() of the MergeOperators? - // Requires these classes be moved into a header file... 
- if (id == "put" || id == "PutOperator") { - *result = MergeOperators::CreatePutOperator(); - } else if (id == "put_v1") { - *result = MergeOperators::CreateDeprecatedPutOperator(); - } else if (id == "uint64add" || id == "UInt64AddOperator") { - *result = MergeOperators::CreateUInt64AddOperator(); - } else if (id == "max" || id == "MaxOperator") { - *result = MergeOperators::CreateMaxOperator(); -#ifdef ROCKSDB_LITE - // The remainder of the classes are handled by the ObjectRegistry in - // non-LITE mode - } else if (id == StringAppendOperator::kNickName() || - id == StringAppendOperator::kClassName()) { - *result = MergeOperators::CreateStringAppendOperator(); - } else if (id == StringAppendTESTOperator::kNickName() || - id == StringAppendTESTOperator::kClassName()) { - *result = MergeOperators::CreateStringAppendTESTOperator(); - } else if (id == BytesXOROperator::kNickName() || - id == BytesXOROperator::kClassName()) { - *result = MergeOperators::CreateBytesXOROperator(); - } else if (id == SortList::kNickName() || id == SortList::kClassName()) { - *result = MergeOperators::CreateSortOperator(); -#endif // ROCKSDB_LITE - } else { - success = false; - } - return success; -} - -#ifndef ROCKSDB_LITE static int RegisterBuiltinMergeOperators(ObjectLibrary& library, const std::string& /*arg*/) { size_t num_types; @@ -87,22 +55,49 @@ static int RegisterBuiltinMergeOperators(ObjectLibrary& library, guard->reset(new BytesXOROperator()); return guard->get(); }); + library.AddFactory( + ObjectLibrary::PatternEntry(UInt64AddOperator::kClassName()) + .AnotherName(UInt64AddOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new UInt64AddOperator()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(MaxOperator::kClassName()) + .AnotherName(MaxOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new MaxOperator()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(PutOperatorV2::kClassName()) + .AnotherName(PutOperatorV2::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new PutOperatorV2()); + return guard->get(); + }); + library.AddFactory( + ObjectLibrary::PatternEntry(PutOperator::kNickName()), + [](const std::string& /*uri*/, std::unique_ptr* guard, + std::string* /*errmsg*/) { + guard->reset(new PutOperator()); + return guard->get(); + }); return static_cast(library.GetFactoryCount(&num_types)); } -#endif // ROCKSDB_LITE Status MergeOperator::CreateFromString(const ConfigOptions& config_options, const std::string& value, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterBuiltinMergeOperators(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE - return LoadSharedObject(config_options, value, - LoadMergeOperator, result); + return LoadSharedObject(config_options, value, result); } std::shared_ptr MergeOperators::CreateFromStringId( diff --git a/utilities/merge_operators/max.cc b/utilities/merge_operators/max.cc index de4abfa6fa7d..ff854cca347c 100644 --- a/utilities/merge_operators/max.cc +++ b/utilities/merge_operators/max.cc @@ -8,71 +8,55 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "utilities/merge_operators.h" +#include "utilities/merge_operators/max_operator.h" -using ROCKSDB_NAMESPACE::Logger; -using 
ROCKSDB_NAMESPACE::MergeOperator; -using ROCKSDB_NAMESPACE::Slice; - -namespace { // anonymous namespace - -// Merge operator that picks the maximum operand, Comparison is based on -// Slice::compare -class MaxOperator : public MergeOperator { - public: - bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { - Slice& max = merge_out->existing_operand; - if (merge_in.existing_value) { - max = Slice(merge_in.existing_value->data(), - merge_in.existing_value->size()); - } else if (max.data() == nullptr) { - max = Slice(); - } - - for (const auto& op : merge_in.operand_list) { - if (max.compare(op) < 0) { - max = op; - } - } +namespace ROCKSDB_NAMESPACE { - return true; +bool MaxOperator::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + Slice& max = merge_out->existing_operand; + if (merge_in.existing_value) { + max = + Slice(merge_in.existing_value->data(), merge_in.existing_value->size()); + } else if (max.data() == nullptr) { + max = Slice(); } - bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, - const Slice& right_operand, std::string* new_value, - Logger* /*logger*/) const override { - if (left_operand.compare(right_operand) >= 0) { - new_value->assign(left_operand.data(), left_operand.size()); - } else { - new_value->assign(right_operand.data(), right_operand.size()); + for (const auto& op : merge_in.operand_list) { + if (max.compare(op) < 0) { + max = op; } - return true; } - bool PartialMergeMulti(const Slice& /*key*/, - const std::deque& operand_list, - std::string* new_value, - Logger* /*logger*/) const override { - Slice max; - for (const auto& operand : operand_list) { - if (max.compare(operand) < 0) { - max = operand; - } - } + return true; +} - new_value->assign(max.data(), max.size()); - return true; +bool MaxOperator::PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, + std::string* new_value, + Logger* /*logger*/) const { + if (left_operand.compare(right_operand) >= 0) { + new_value->assign(left_operand.data(), left_operand.size()); + } else { + new_value->assign(right_operand.data(), right_operand.size()); } + return true; +} - static const char* kClassName() { return "MaxOperator"; } - static const char* kNickName() { return "max"; } - const char* Name() const override { return kClassName(); } - const char* NickName() const override { return kNickName(); } -}; - -} // end of anonymous namespace +bool MaxOperator::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + Slice max; + for (const auto& operand : operand_list) { + if (max.compare(operand) < 0) { + max = operand; + } + } -namespace ROCKSDB_NAMESPACE { + new_value->assign(max.data(), max.size()); + return true; +} std::shared_ptr MergeOperators::CreateMaxOperator() { return std::make_shared(); diff --git a/utilities/merge_operators/max_operator.h b/utilities/merge_operators/max_operator.h new file mode 100644 index 000000000000..4c8e98db4b2e --- /dev/null +++ b/utilities/merge_operators/max_operator.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +// Merge operator that picks the maximum operand, Comparison is based on +// Slice::compare + +#pragma once + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class Slice; + +class MaxOperator : public MergeOperator { + public: + static const char* kClassName() { return "MaxOperator"; } + static const char* kNickName() { return "max"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc index ccf9ff21f190..79468c6b7879 100644 --- a/utilities/merge_operators/put.cc +++ b/utilities/merge_operators/put.cc @@ -8,12 +8,9 @@ #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" #include "utilities/merge_operators.h" +#include "utilities/merge_operators/put_operator.h" -namespace { // anonymous namespace - -using ROCKSDB_NAMESPACE::Logger; -using ROCKSDB_NAMESPACE::MergeOperator; -using ROCKSDB_NAMESPACE::Slice; +namespace ROCKSDB_NAMESPACE { // A merge operator that mimics Put semantics // Since this merge-operator will not be used in production, @@ -23,64 +20,49 @@ using ROCKSDB_NAMESPACE::Slice; // which would be simpler in this case). // // From the client-perspective, semantics are the same. -class PutOperator : public MergeOperator { - public: - bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, - const std::deque& operand_sequence, - std::string* new_value, Logger* /*logger*/) const override { - // Put basically only looks at the current/latest value - assert(!operand_sequence.empty()); - assert(new_value != nullptr); - new_value->assign(operand_sequence.back()); - return true; - } - - bool PartialMerge(const Slice& /*key*/, const Slice& /*left_operand*/, - const Slice& right_operand, std::string* new_value, - Logger* /*logger*/) const override { - new_value->assign(right_operand.data(), right_operand.size()); - return true; - } - - using MergeOperator::PartialMergeMulti; - bool PartialMergeMulti(const Slice& /*key*/, - const std::deque& operand_list, - std::string* new_value, - Logger* /*logger*/) const override { - new_value->assign(operand_list.back().data(), operand_list.back().size()); - return true; - } - - static const char* kClassName() { return "PutOperator"; } - static const char* kNickName() { return "put_v1"; } - const char* Name() const override { return kClassName(); } - const char* NickName() const override { return kNickName(); } -}; - -class PutOperatorV2 : public PutOperator { - bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, - const std::deque& /*operand_sequence*/, - std::string* /*new_value*/, - Logger* /*logger*/) const override { - assert(false); - return false; - } +bool PutOperator::FullMerge(const Slice& /*key*/, + const Slice* /*existing_value*/, + const std::deque& operand_sequence, + std::string* new_value, Logger* /*logger*/) const { + // Put basically only looks at the current/latest value + assert(!operand_sequence.empty()); + assert(new_value != nullptr); + 
new_value->assign(operand_sequence.back()); + return true; +} - bool FullMergeV2(const MergeOperationInput& merge_in, - MergeOperationOutput* merge_out) const override { - // Put basically only looks at the current/latest value - assert(!merge_in.operand_list.empty()); - merge_out->existing_operand = merge_in.operand_list.back(); - return true; - } +bool PutOperator::PartialMerge(const Slice& /*key*/, + const Slice& /*left_operand*/, + const Slice& right_operand, + std::string* new_value, + Logger* /*logger*/) const { + new_value->assign(right_operand.data(), right_operand.size()); + return true; +} - static const char* kNickName() { return "put"; } - const char* NickName() const override { return kNickName(); } -}; +bool PutOperator::PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const { + new_value->assign(operand_list.back().data(), operand_list.back().size()); + return true; +} -} // end of anonymous namespace +bool PutOperatorV2::FullMerge( + const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_sequence*/, + std::string* /*new_value*/, Logger* /*logger*/) const { + assert(false); + return false; +} -namespace ROCKSDB_NAMESPACE { +bool PutOperatorV2::FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const { + // Put basically only looks at the current/latest value + assert(!merge_in.operand_list.empty()); + merge_out->existing_operand = merge_in.operand_list.back(); + return true; +} std::shared_ptr MergeOperators::CreateDeprecatedPutOperator() { return std::make_shared(); diff --git a/utilities/merge_operators/put_operator.h b/utilities/merge_operators/put_operator.h new file mode 100644 index 000000000000..7529da78e36c --- /dev/null +++ b/utilities/merge_operators/put_operator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A merge operator that mimics Put semantics +// Since this merge-operator will not be used in production, +// it is implemented as a non-associative merge operator to illustrate the +// new interface and for testing purposes. (That is, we inherit from +// the MergeOperator class rather than the AssociativeMergeOperator +// which would be simpler in this case). +// +// From the client-perspective, semantics are the same. 
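(Editorial aside, not part of the new put_operator.h listing, which continues below.) Because the built-in operators are now registered with the ObjectRegistry under their class names and nicknames (see the merge_operators.cc hunk above), callers can create them by string id instead of the removed LoadMergeOperator switch. A rough sketch, assuming ConfigOptions lives in rocksdb/convenience.h and MergeOperator::CreateFromString keeps the signature shown in the diff:

```cpp
#include <cassert>
#include <memory>
#include <string>

#include "rocksdb/convenience.h"     // ConfigOptions (assumed location)
#include "rocksdb/merge_operator.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  ConfigOptions config_options;
  std::shared_ptr<MergeOperator> op;

  // "put" maps to PutOperatorV2; "put_v1" maps to the deprecated PutOperator.
  Status s = MergeOperator::CreateFromString(config_options, "put", &op);
  assert(s.ok() && op != nullptr);
  assert(std::string(op->NickName()) == "put");

  // Class names and nicknames both resolve through the registry.
  s = MergeOperator::CreateFromString(config_options, "UInt64AddOperator", &op);
  assert(s.ok() && std::string(op->Name()) == "UInt64AddOperator");

  s = MergeOperator::CreateFromString(config_options, "max", &op);
  assert(s.ok() && std::string(op->Name()) == "MaxOperator");
  return 0;
}
```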
+ +#pragma once + +#include "rocksdb/merge_operator.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class Slice; + +class PutOperator : public MergeOperator { + public: + static const char* kClassName() { return "PutOperator"; } + static const char* kNickName() { return "put_v1"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& operand_sequence, + std::string* new_value, Logger* /*logger*/) const override; + bool PartialMerge(const Slice& /*key*/, const Slice& left_operand, + const Slice& right_operand, std::string* new_value, + Logger* /*logger*/) const override; + using MergeOperator::PartialMergeMulti; + bool PartialMergeMulti(const Slice& /*key*/, + const std::deque& operand_list, + std::string* new_value, + Logger* /*logger*/) const override; +}; + +class PutOperatorV2 : public PutOperator { + public: + static const char* kNickName() { return "put"; } + const char* NickName() const override { return kNickName(); } + + bool FullMerge(const Slice& /*key*/, const Slice* /*existing_value*/, + const std::deque& /*operand_sequence*/, + std::string* /*new_value*/, Logger* /*logger*/) const override; + + bool FullMergeV2(const MergeOperationInput& merge_in, + MergeOperationOutput* merge_out) const override; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/utilities/merge_operators/string_append/stringappend.cc b/utilities/merge_operators/string_append/stringappend.cc index 5092cabcb45d..748e5c89f6ca 100644 --- a/utilities/merge_operators/string_append/stringappend.cc +++ b/utilities/merge_operators/string_append/stringappend.cc @@ -1,8 +1,9 @@ -/** - * A MergeOperator for rocksdb that implements string append. - * @author Deon Nicholas (dnicholas@fb.com) - * Copyright 2013 Facebook - */ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A MergeOperator for rocksdb that implements string append. #include "stringappend.h" @@ -19,11 +20,9 @@ namespace ROCKSDB_NAMESPACE { namespace { static std::unordered_map stringappend_merge_type_info = { -#ifndef ROCKSDB_LITE {"delimiter", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; } // namespace // Constructor: also specify the delimiter character. diff --git a/utilities/merge_operators/string_append/stringappend.h b/utilities/merge_operators/string_append/stringappend.h index 153532382c41..4a7b2b9e58ba 100644 --- a/utilities/merge_operators/string_append/stringappend.h +++ b/utilities/merge_operators/string_append/stringappend.h @@ -1,8 +1,9 @@ -/** - * A MergeOperator for rocksdb that implements string append. - * @author Deon Nicholas (dnicholas@fb.com) - * Copyright 2013 Facebook - */ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// A MergeOperator for rocksdb that implements string append. 
#pragma once #include "rocksdb/merge_operator.h" diff --git a/utilities/merge_operators/string_append/stringappend2.cc b/utilities/merge_operators/string_append/stringappend2.cc index 36cb9ee34ea4..bd0716cc3cad 100644 --- a/utilities/merge_operators/string_append/stringappend2.cc +++ b/utilities/merge_operators/string_append/stringappend2.cc @@ -1,7 +1,7 @@ -/** - * @author Deon Nicholas (dnicholas@fb.com) - * Copyright 2013 Facebook - */ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). #include "stringappend2.h" @@ -19,11 +19,9 @@ namespace ROCKSDB_NAMESPACE { namespace { static std::unordered_map stringappend2_merge_type_info = { -#ifndef ROCKSDB_LITE {"delimiter", {0, OptionType::kString, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, -#endif // ROCKSDB_LITE }; } // namespace diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 22b6144af651..acc71c8e49c1 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -49,7 +49,6 @@ std::shared_ptr OpenNormalDb(const std::string& delim) { return std::shared_ptr(db); } -#ifndef ROCKSDB_LITE // TtlDb is not supported in Lite // Open a TtlDB with a non-associative StringAppendTESTOperator std::shared_ptr OpenTtlDb(const std::string& delim) { DBWithTTL* db; @@ -65,7 +64,6 @@ std::shared_ptr OpenTtlDb(const std::string& delim) { EXPECT_OK(DBWithTTL::Open(options, kDbName, &db, 123456)); return std::shared_ptr(db); } -#endif // !ROCKSDB_LITE } // namespace /// StringLists represents a set of string-lists, each with a key-index. @@ -130,14 +128,12 @@ class StringAppendOperatorTest : public testing::Test, } void SetUp() override { -#ifndef ROCKSDB_LITE // TtlDb is not supported in Lite bool if_use_ttl = GetParam(); if (if_use_ttl) { fprintf(stderr, "Running tests with ttl db and generic operator.\n"); StringAppendOperatorTest::SetOpenDbFunction(&OpenTtlDb); return; } -#endif // !ROCKSDB_LITE fprintf(stderr, "Running tests with regular db and operator.\n"); StringAppendOperatorTest::SetOpenDbFunction(&OpenNormalDb); } @@ -197,6 +193,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) { ASSERT_EQ(res, "a1,a2,a3"); } } + ASSERT_OK(it->status()); // Should release the snapshot and be aware of the new stuff now it.reset(db_->NewIterator(ReadOptions())); @@ -221,6 +218,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) { ASSERT_EQ(res, "a1,a2,a3,a4"); } } + ASSERT_OK(it->status()); slists.Append("k3", "g1"); @@ -246,6 +244,7 @@ TEST_P(StringAppendOperatorTest, IteratorTest) { ASSERT_EQ(res, "g1"); } } + ASSERT_OK(it->status()); } TEST_P(StringAppendOperatorTest, SimpleTest) { diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index 5be2f56411a1..72957761b60a 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -3,6 +3,8 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "utilities/merge_operators/uint64add.h" + #include #include "logging/logging.h" @@ -12,61 +14,40 @@ #include "util/coding.h" #include "utilities/merge_operators.h" -namespace { // anonymous namespace - -using ROCKSDB_NAMESPACE::AssociativeMergeOperator; -using ROCKSDB_NAMESPACE::InfoLogLevel; -using ROCKSDB_NAMESPACE::Logger; -using ROCKSDB_NAMESPACE::Slice; - -// A 'model' merge operator with uint64 addition semantics -// Implemented as an AssociativeMergeOperator for simplicity and example. -class UInt64AddOperator : public AssociativeMergeOperator { - public: - bool Merge(const Slice& /*key*/, const Slice* existing_value, - const Slice& value, std::string* new_value, - Logger* logger) const override { - uint64_t orig_value = 0; - if (existing_value) { - orig_value = DecodeInteger(*existing_value, logger); - } - uint64_t operand = DecodeInteger(value, logger); - - assert(new_value); - new_value->clear(); - ROCKSDB_NAMESPACE::PutFixed64(new_value, orig_value + operand); +namespace ROCKSDB_NAMESPACE { // anonymous namespace - return true; // Return true always since corruption will be treated as 0 +bool UInt64AddOperator::Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const { + uint64_t orig_value = 0; + if (existing_value) { + orig_value = DecodeInteger(*existing_value, logger); } + uint64_t operand = DecodeInteger(value, logger); - static const char* kClassName() { return "UInt64AddOperator"; } - static const char* kNickName() { return "uint64add"; } - const char* Name() const override { return kClassName(); } - const char* NickName() const override { return kNickName(); } + assert(new_value); + new_value->clear(); + PutFixed64(new_value, orig_value + operand); - private: - // Takes the string and decodes it into a uint64_t - // On error, prints a message and returns 0 - uint64_t DecodeInteger(const Slice& value, Logger* logger) const { - uint64_t result = 0; - - if (value.size() == sizeof(uint64_t)) { - result = ROCKSDB_NAMESPACE::DecodeFixed64(value.data()); - } else if (logger != nullptr) { - // If value is corrupted, treat it as 0 - ROCKS_LOG_ERROR(logger, - "uint64 value corruption, size: %" ROCKSDB_PRIszt - " > %" ROCKSDB_PRIszt, - value.size(), sizeof(uint64_t)); - } + return true; // Return true always since corruption will be treated as 0 +} - return result; +uint64_t UInt64AddOperator::DecodeInteger(const Slice& value, + Logger* logger) const { + uint64_t result = 0; + + if (value.size() == sizeof(uint64_t)) { + result = DecodeFixed64(value.data()); + } else if (logger != nullptr) { + // If value is corrupted, treat it as 0 + ROCKS_LOG_ERROR(logger, + "uint64 value corruption, size: %" ROCKSDB_PRIszt + " > %" ROCKSDB_PRIszt, + value.size(), sizeof(uint64_t)); } -}; -} // anonymous namespace - -namespace ROCKSDB_NAMESPACE { + return result; +} std::shared_ptr MergeOperators::CreateUInt64AddOperator() { return std::make_shared(); diff --git a/utilities/merge_operators/uint64add.h b/utilities/merge_operators/uint64add.h new file mode 100644 index 000000000000..7e232677a3fe --- /dev/null +++ b/utilities/merge_operators/uint64add.h @@ -0,0 +1,35 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+// +// A 'model' merge operator with uint64 addition semantics +// Implemented as an AssociativeMergeOperator for simplicity and example. + +#pragma once + +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +namespace ROCKSDB_NAMESPACE { +class Logger; +class Slice; + +class UInt64AddOperator : public AssociativeMergeOperator { + public: + static const char* kClassName() { return "UInt64AddOperator"; } + static const char* kNickName() { return "uint64add"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kNickName(); } + + bool Merge(const Slice& /*key*/, const Slice* existing_value, + const Slice& value, std::string* new_value, + Logger* logger) const override; + + private: + // Takes the string and decodes it into a uint64_t + // On error, prints a message and returns 0 + uint64_t DecodeInteger(const Slice& value, Logger* logger) const; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/utilities/object_registry.cc b/utilities/object_registry.cc index 18834783d843..786f2ee2e43f 100644 --- a/utilities/object_registry.cc +++ b/utilities/object_registry.cc @@ -14,7 +14,6 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE namespace { bool MatchesInteger(const std::string &target, size_t start, size_t pos) { // If it is numeric, everything up to the match must be a number @@ -379,5 +378,4 @@ int ObjectRegistry::RegisterPlugin(const std::string &name, } } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/object_registry_test.cc b/utilities/object_registry_test.cc index 90cd155ee082..4042bc9b90e2 100644 --- a/utilities/object_registry_test.cc +++ b/utilities/object_registry_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
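(Editorial aside, not part of the patch.) The UInt64AddOperator factored out above behaves like a fixed-width counter: each operand is an 8-byte little-endian integer and Merge adds them, treating corrupt values as 0. A minimal sketch under that assumption; it uses the in-tree header utilities/merge_operators.h for the factory and encodes the integer by hand rather than relying on the internal coding helpers:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>

#include "rocksdb/db.h"
#include "utilities/merge_operators.h"  // in-tree header, not a public include

using namespace ROCKSDB_NAMESPACE;

// 8-byte little-endian encoding, matching what PutFixed64 produces.
static std::string EncodeU64(uint64_t v) {
  std::string s(8, '\0');
  for (int i = 0; i < 8; i++) {
    s[i] = static_cast<char>((v >> (8 * i)) & 0xff);
  }
  return s;
}

int main() {
  Options options;
  options.create_if_missing = true;
  options.merge_operator = MergeOperators::CreateUInt64AddOperator();

  DB* db = nullptr;
  assert(DB::Open(options, "/tmp/uint64add_demo", &db).ok());

  // Two Merge calls accumulate into a single counter value.
  assert(db->Merge(WriteOptions(), "hits", EncodeU64(2)).ok());
  assert(db->Merge(WriteOptions(), "hits", EncodeU64(3)).ok());

  std::string raw;
  assert(db->Get(ReadOptions(), "hits", &raw).ok());
  uint64_t total = 0;
  std::memcpy(&total, raw.data(), sizeof(total));  // little-endian hosts
  assert(total == 5);

  delete db;
  return 0;
}
```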
-#ifndef ROCKSDB_LITE #include "rocksdb/utilities/object_registry.h" @@ -861,12 +860,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else // ROCKSDB_LITE -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as ObjRegistry is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/option_change_migration/option_change_migration.cc b/utilities/option_change_migration/option_change_migration.cc index e93d2152dc1f..ea3cf6857127 100644 --- a/utilities/option_change_migration/option_change_migration.cc +++ b/utilities/option_change_migration/option_change_migration.cc @@ -5,7 +5,6 @@ #include "rocksdb/utilities/option_change_migration.h" -#ifndef ROCKSDB_LITE #include "rocksdb/db.h" namespace ROCKSDB_NAMESPACE { @@ -175,12 +174,3 @@ Status OptionChangeMigration(std::string dbname, const Options& old_opts, } } } // namespace ROCKSDB_NAMESPACE -#else -namespace ROCKSDB_NAMESPACE { -Status OptionChangeMigration(std::string /*dbname*/, - const Options& /*old_opts*/, - const Options& /*new_opts*/) { - return Status::NotSupported(); -} -} // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/option_change_migration/option_change_migration_test.cc b/utilities/option_change_migration/option_change_migration_test.cc index 71af45db1c5d..1cb42a0cacff 100644 --- a/utilities/option_change_migration/option_change_migration_test.cc +++ b/utilities/option_change_migration/option_change_migration_test.cc @@ -49,7 +49,6 @@ class DBOptionChangeMigrationTests uint64_t fifo_max_table_files_size_; }; -#ifndef ROCKSDB_LITE TEST_P(DBOptionChangeMigrationTests, Migrate1) { Options old_options = CurrentOptions(); old_options.compaction_style = @@ -88,6 +87,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate1) { for (; it->Valid(); it->Next()) { keys.insert(it->key().ToString()); } + ASSERT_OK(it->status()); } Close(); @@ -125,6 +125,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate1) { it->Next(); } ASSERT_TRUE(!it->Valid()); + ASSERT_OK(it->status()); } } @@ -166,6 +167,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate2) { for (; it->Valid(); it->Next()) { keys.insert(it->key().ToString()); } + ASSERT_OK(it->status()); } Close(); @@ -203,6 +205,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate2) { it->Next(); } ASSERT_TRUE(!it->Valid()); + ASSERT_OK(it->status()); } } @@ -230,7 +233,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate3) { for (int i = 0; i < 50; i++) { ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (num == 9) { // Issue a full compaction to generate some zero-out files @@ -250,6 +253,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate3) { for (; it->Valid(); it->Next()) { keys.insert(it->key().ToString()); } + ASSERT_OK(it->status()); } Close(); @@ -287,6 +291,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate3) { it->Next(); } ASSERT_TRUE(!it->Valid()); + ASSERT_OK(it->status()); } } @@ -314,7 +319,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate4) { for (int i = 0; i < 50; i++) { ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900))); } - Flush(); + ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (num == 9) { // Issue a full compaction to generate some zero-out files @@ -334,6 +339,7 @@ TEST_P(DBOptionChangeMigrationTests, Migrate4) { for (; it->Valid(); it->Next()) { keys.insert(it->key().ToString()); } + ASSERT_OK(it->status()); } Close(); @@ -371,6 +377,7 @@ 
TEST_P(DBOptionChangeMigrationTests, Migrate4) { it->Next(); } ASSERT_TRUE(!it->Valid()); + ASSERT_OK(it->status()); } } @@ -497,7 +504,7 @@ TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) { ASSERT_OK(Put(Key(num * 100 + i), rnd.RandomString(900))); } } - Flush(); + ASSERT_OK(Flush()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr)); @@ -510,6 +517,7 @@ TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) { for (; it->Valid(); it->Next()) { keys.insert(it->key().ToString()); } + ASSERT_OK(it->status()); } Close(); @@ -540,7 +548,6 @@ TEST_F(DBOptionChangeMigrationTest, CompactedSrcToUniversal) { } } -#endif // ROCKSDB_LITE } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/utilities/options/options_util.cc b/utilities/options/options_util.cc index 00c4b981a653..394bf431f467 100644 --- a/utilities/options/options_util.cc +++ b/utilities/options/options_util.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/options_util.h" @@ -14,20 +13,6 @@ #include "table/block_based/block_based_table_factory.h" namespace ROCKSDB_NAMESPACE { -Status LoadOptionsFromFile(const std::string& file_name, Env* env, - DBOptions* db_options, - std::vector* cf_descs, - bool ignore_unknown_options, - std::shared_ptr* cache) { - ConfigOptions config_options; - config_options.ignore_unknown_options = ignore_unknown_options; - config_options.input_strings_escaped = true; - config_options.env = env; - - return LoadOptionsFromFile(config_options, file_name, db_options, cf_descs, - cache); -} - Status LoadOptionsFromFile(const ConfigOptions& config_options, const std::string& file_name, DBOptions* db_options, std::vector* cf_descs, @@ -90,19 +75,6 @@ Status GetLatestOptionsFileName(const std::string& dbpath, Env* env, return Status::OK(); } -Status LoadLatestOptions(const std::string& dbpath, Env* env, - DBOptions* db_options, - std::vector* cf_descs, - bool ignore_unknown_options, - std::shared_ptr* cache) { - ConfigOptions config_options; - config_options.ignore_unknown_options = ignore_unknown_options; - config_options.input_strings_escaped = true; - config_options.env = env; - - return LoadLatestOptions(config_options, dbpath, db_options, cf_descs, cache); -} - Status LoadLatestOptions(const ConfigOptions& config_options, const std::string& dbpath, DBOptions* db_options, std::vector* cf_descs, @@ -117,19 +89,6 @@ Status LoadLatestOptions(const ConfigOptions& config_options, db_options, cf_descs, cache); } -Status CheckOptionsCompatibility( - const std::string& dbpath, Env* env, const DBOptions& db_options, - const std::vector& cf_descs, - bool ignore_unknown_options) { - ConfigOptions config_options(db_options); - config_options.sanity_level = ConfigOptions::kSanityLevelLooselyCompatible; - config_options.ignore_unknown_options = ignore_unknown_options; - config_options.input_strings_escaped = true; - config_options.env = env; - return CheckOptionsCompatibility(config_options, dbpath, db_options, - cf_descs); -} - Status CheckOptionsCompatibility( const ConfigOptions& config_options, const std::string& dbpath, const DBOptions& db_options, @@ -156,4 +115,3 @@ Status CheckOptionsCompatibility( } } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/options/options_util_test.cc 
b/utilities/options/options_util_test.cc index 1c3b41ff29d8..fd9affb0d917 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/options_util.h" @@ -63,7 +62,11 @@ TEST_F(OptionsUtilTest, SaveAndLoad) { DBOptions loaded_db_opt; std::vector loaded_cf_descs; - ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt, + ConfigOptions config_options; + config_options.ignore_unknown_options = false; + config_options.input_strings_escaped = true; + config_options.env = env_.get(); + ASSERT_OK(LoadOptionsFromFile(config_options, kFileName, &loaded_db_opt, &loaded_cf_descs)); ConfigOptions exact; exact.sanity_level = ConfigOptions::kSanityLevelExactMatch; @@ -142,19 +145,6 @@ TEST_F(OptionsUtilTest, SaveAndLoadWithCacheCheck) { ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get()); } } - - // Test the old interface - ASSERT_OK(LoadOptionsFromFile(kFileName, env_.get(), &loaded_db_opt, - &loaded_cf_descs, false, &cache)); - for (size_t i = 0; i < loaded_cf_descs.size(); i++) { - auto* loaded_bbt_opt = - loaded_cf_descs[i] - .options.table_factory->GetOptions(); - // Expect the same cache will be loaded - if (loaded_bbt_opt != nullptr) { - ASSERT_EQ(loaded_bbt_opt->block_cache.get(), cache.get()); - } - } ASSERT_OK(DestroyDB(dbname_, Options(loaded_db_opt, cf_opts[0]))); } @@ -360,6 +350,21 @@ TEST_F(OptionsUtilTest, SanityCheck) { ASSERT_OK( CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); } + + // persist_user_defined_timestamps + { + bool prev_persist_user_defined_timestamps = + cf_descs[2].options.persist_user_defined_timestamps; + cf_descs[2].options.persist_user_defined_timestamps = false; + ASSERT_NOK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + + cf_descs[2].options.persist_user_defined_timestamps = + prev_persist_user_defined_timestamps; + ASSERT_OK( + CheckOptionsCompatibility(config_options, dbname_, db_opt, cf_descs)); + } + ASSERT_OK(DestroyDB(dbname_, Options(db_opt, cf_descs[0].options))); } @@ -386,11 +391,8 @@ TEST_F(OptionsUtilTest, LatestOptionsNotFound) { ASSERT_TRUE(s.IsNotFound()); ASSERT_TRUE(s.IsPathNotFound()); - s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs); - ASSERT_TRUE(s.IsNotFound()); - ASSERT_TRUE(s.IsPathNotFound()); - s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs); + ASSERT_TRUE(s.IsNotFound()); ASSERT_TRUE(s.IsPathNotFound()); s = GetLatestOptionsFileName(dbname_, options.env, &options_file_name); @@ -404,7 +406,7 @@ TEST_F(OptionsUtilTest, LatestOptionsNotFound) { ASSERT_TRUE(s.IsNotFound()); ASSERT_TRUE(s.IsPathNotFound()); - s = LoadLatestOptions(dbname_, options.env, &options, &cf_descs); + s = LoadLatestOptions(config_opts, dbname_, &options, &cf_descs); ASSERT_TRUE(s.IsNotFound()); ASSERT_TRUE(s.IsPathNotFound()); @@ -641,6 +643,9 @@ TEST_F(OptionsUtilTest, RenameDatabaseDirectory) { DBOptions db_opts; std::vector cf_descs; std::vector handles; + ConfigOptions ignore_opts; + ignore_opts.ignore_unknown_options = false; + ignore_opts.env = options.env; options.create_if_missing = true; @@ -651,7 +656,7 @@ TEST_F(OptionsUtilTest, RenameDatabaseDirectory) { auto new_dbname = dbname_ + "_2"; ASSERT_OK(options.env->RenameFile(dbname_, new_dbname)); - ASSERT_OK(LoadLatestOptions(new_dbname, options.env, &db_opts, &cf_descs)); + 
ASSERT_OK(LoadLatestOptions(ignore_opts, new_dbname, &db_opts, &cf_descs)); ASSERT_EQ(cf_descs.size(), 1U); db_opts.create_if_missing = false; @@ -675,20 +680,23 @@ TEST_F(OptionsUtilTest, WalDirSettings) { DBOptions db_opts; std::vector cf_descs; std::vector handles; + ConfigOptions ignore_opts; + ignore_opts.ignore_unknown_options = false; + ignore_opts.env = options.env; options.create_if_missing = true; // Open a DB with no wal dir set. The wal_dir should stay empty ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, ""); // Open a DB with wal_dir == dbname. The wal_dir should be set to empty options.wal_dir = dbname_; ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, ""); // Open a DB with no wal_dir but a db_path==dbname_. The wal_dir should be @@ -697,7 +705,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) { options.db_paths.emplace_back(dbname_, std::numeric_limits::max()); ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, ""); // Open a DB with no wal_dir==dbname_ and db_path==dbname_. The wal_dir @@ -706,7 +714,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) { options.db_paths.emplace_back(dbname_, std::numeric_limits::max()); ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, ""); ASSERT_OK(DestroyDB(dbname_, options)); @@ -717,7 +725,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) { std::numeric_limits::max()); ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, dbname_); ASSERT_OK(DestroyDB(dbname_, options)); @@ -726,7 +734,7 @@ TEST_F(OptionsUtilTest, WalDirSettings) { options.db_paths.clear(); ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, dbname_ + "/wal"); ASSERT_OK(DestroyDB(dbname_, options)); } @@ -737,6 +745,9 @@ TEST_F(OptionsUtilTest, WalDirInOptins) { DBOptions db_opts; std::vector cf_descs; std::vector handles; + ConfigOptions ignore_opts; + ignore_opts.ignore_unknown_options = false; + ignore_opts.env = options.env; // Store an options file with wal_dir=dbname_ and make sure it still loads // when the input wal_dir is empty @@ -750,12 +761,12 @@ TEST_F(OptionsUtilTest, WalDirInOptins) { ASSERT_OK(PersistRocksDBOptions(options, {"default"}, {options}, dbname_ + "/" + options_file, options.env->GetFileSystem().get())); - ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, dbname_); options.wal_dir = ""; ASSERT_OK(DB::Open(options, dbname_, &db)); delete db; - 
ASSERT_OK(LoadLatestOptions(dbname_, options.env, &db_opts, &cf_descs)); + ASSERT_OK(LoadLatestOptions(ignore_opts, dbname_, &db_opts, &cf_descs)); ASSERT_EQ(db_opts.wal_dir, ""); } } // namespace ROCKSDB_NAMESPACE @@ -769,11 +780,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - printf("Skipped in RocksDBLite as utilities are not supported.\n"); - return 0; -} -#endif // !ROCKSDB_LITE diff --git a/utilities/persistent_cache/block_cache_tier.cc b/utilities/persistent_cache/block_cache_tier.cc index 8ad9bb1b1662..3118fc2df68b 100644 --- a/utilities/persistent_cache/block_cache_tier.cc +++ b/utilities/persistent_cache/block_cache_tier.cc @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/persistent_cache/block_cache_tier.h" @@ -419,4 +418,3 @@ Status NewPersistentCache(Env* const env, const std::string& path, } // namespace ROCKSDB_NAMESPACE -#endif // ifndef ROCKSDB_LITE diff --git a/utilities/persistent_cache/block_cache_tier.h b/utilities/persistent_cache/block_cache_tier.h index 1aac287cc0ed..caabbef94ebe 100644 --- a/utilities/persistent_cache/block_cache_tier.h +++ b/utilities/persistent_cache/block_cache_tier.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #ifndef OS_WIN #include @@ -153,4 +152,3 @@ class BlockCacheTier : public PersistentCacheTier { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/block_cache_tier_file.cc b/utilities/persistent_cache/block_cache_tier_file.cc index f4f8517abe3a..ff01c1abcf6a 100644 --- a/utilities/persistent_cache/block_cache_tier_file.cc +++ b/utilities/persistent_cache/block_cache_tier_file.cc @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/persistent_cache/block_cache_tier_file.h" @@ -237,7 +236,7 @@ bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val, Slice result; Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch, - nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + nullptr); if (!s.ok()) { Error(log_, "Error reading from file %s. %s", Path().c_str(), s.ToString().c_str()); @@ -606,5 +605,3 @@ void ThreadedWriter::DispatchIO(const IO& io) { } } // namespace ROCKSDB_NAMESPACE - -#endif diff --git a/utilities/persistent_cache/block_cache_tier_file.h b/utilities/persistent_cache/block_cache_tier_file.h index 1d265ab74fb5..127f5a8676af 100644 --- a/utilities/persistent_cache/block_cache_tier_file.h +++ b/utilities/persistent_cache/block_cache_tier_file.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -290,4 +289,3 @@ class ThreadedWriter : public Writer { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/block_cache_tier_metadata.cc b/utilities/persistent_cache/block_cache_tier_metadata.cc index d73b5d0b48ab..182df6ca65af 100644 --- a/utilities/persistent_cache/block_cache_tier_metadata.cc +++ b/utilities/persistent_cache/block_cache_tier_metadata.cc @@ -2,7 +2,6 @@ // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/persistent_cache/block_cache_tier_metadata.h" @@ -83,4 +82,3 @@ void BlockCacheTierMetadata::RemoveAllKeys(BlockCacheFile* f) { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/block_cache_tier_metadata.h b/utilities/persistent_cache/block_cache_tier_metadata.h index 2fcd501056d3..15eb0be3d966 100644 --- a/utilities/persistent_cache/block_cache_tier_metadata.h +++ b/utilities/persistent_cache/block_cache_tier_metadata.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -121,4 +120,3 @@ class BlockCacheTierMetadata { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/hash_table.h b/utilities/persistent_cache/hash_table.h index b00b294ce9d6..13b2ec7d66fc 100644 --- a/utilities/persistent_cache/hash_table.h +++ b/utilities/persistent_cache/hash_table.h @@ -5,7 +5,6 @@ // #pragma once -#ifndef ROCKSDB_LITE #include @@ -236,4 +235,3 @@ class HashTable { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/hash_table_bench.cc b/utilities/persistent_cache/hash_table_bench.cc index 74d7e2edfbff..bf4406bb3552 100644 --- a/utilities/persistent_cache/hash_table_bench.cc +++ b/utilities/persistent_cache/hash_table_bench.cc @@ -4,7 +4,7 @@ // (found in the LICENSE.Apache file in the root directory). 
// -#if !defined(OS_WIN) && !defined(ROCKSDB_LITE) +#if !defined(OS_WIN) #ifndef GFLAGS #include diff --git a/utilities/persistent_cache/hash_table_evictable.h b/utilities/persistent_cache/hash_table_evictable.h index e10939b2f7db..daf4bec1aefe 100644 --- a/utilities/persistent_cache/hash_table_evictable.h +++ b/utilities/persistent_cache/hash_table_evictable.h @@ -5,7 +5,6 @@ // #pragma once -#ifndef ROCKSDB_LITE #include @@ -165,4 +164,3 @@ class EvictableHashTable : private HashTable { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index 2f6387f5fb33..faae2cf2142e 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -17,7 +17,6 @@ #include "util/random.h" #include "utilities/persistent_cache/hash_table_evictable.h" -#ifndef ROCKSDB_LITE namespace ROCKSDB_NAMESPACE { @@ -154,7 +153,6 @@ TEST_F(EvictableHashTableTest, TestEvict) { } } // namespace ROCKSDB_NAMESPACE -#endif int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); diff --git a/utilities/persistent_cache/lrulist.h b/utilities/persistent_cache/lrulist.h index a608890fc3e1..9a896ed59948 100644 --- a/utilities/persistent_cache/lrulist.h +++ b/utilities/persistent_cache/lrulist.h @@ -5,7 +5,6 @@ // #pragma once -#ifndef ROCKSDB_LITE #include @@ -171,4 +170,3 @@ class LRUList { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/persistent_cache_bench.cc b/utilities/persistent_cache/persistent_cache_bench.cc index 9d6e15d6b685..f2993ee12705 100644 --- a/utilities/persistent_cache/persistent_cache_bench.cc +++ b/utilities/persistent_cache/persistent_cache_bench.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #ifndef GFLAGS #include @@ -354,6 +353,3 @@ int main(int argc, char** argv) { return 0; } #endif // #ifndef GFLAGS -#else -int main(int, char**) { return 0; } -#endif diff --git a/utilities/persistent_cache/persistent_cache_test.cc b/utilities/persistent_cache/persistent_cache_test.cc index d1b18b68aa23..dfbc9b931602 100644 --- a/utilities/persistent_cache/persistent_cache_test.cc +++ b/utilities/persistent_cache/persistent_cache_test.cc @@ -6,7 +6,6 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#if !defined ROCKSDB_LITE #include "utilities/persistent_cache/persistent_cache_test.h" @@ -296,7 +295,7 @@ PersistentCacheDBTest::PersistentCacheDBTest() // test template void PersistentCacheDBTest::RunTest( const std::function(bool)>& new_pcache, - const size_t max_keys = 100 * 1024, const size_t max_usecase = 5) { + const size_t max_keys = 100 * 1024, const size_t max_usecase = 3) { // number of insertion interations int num_iter = static_cast(max_keys * kStressFactor); @@ -320,43 +319,21 @@ void PersistentCacheDBTest::RunTest( pcache = new_pcache(/*is_compressed=*/true); table_options.persistent_cache = pcache; table_options.block_cache = NewLRUCache(size_max); - table_options.block_cache_compressed = nullptr; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; case 1: - // page cache, block cache, compressed cache - pcache = new_pcache(/*is_compressed=*/true); - table_options.persistent_cache = pcache; - table_options.block_cache = NewLRUCache(size_max); - table_options.block_cache_compressed = NewLRUCache(size_max); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - break; - case 2: - // page cache, block cache, compressed cache + KNoCompression - // both block cache and compressed cache, but DB is not compressed - // also, make block cache sizes bigger, to trigger block cache hits - pcache = new_pcache(/*is_compressed=*/true); - table_options.persistent_cache = pcache; - table_options.block_cache = NewLRUCache(size_max); - table_options.block_cache_compressed = NewLRUCache(size_max); - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.compression = kNoCompression; - break; - case 3: // page cache, no block cache, no compressed cache pcache = new_pcache(/*is_compressed=*/false); table_options.persistent_cache = pcache; table_options.block_cache = nullptr; - table_options.block_cache_compressed = nullptr; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; - case 4: + case 2: // page cache, no block cache, no compressed cache // Page cache caches compressed blocks pcache = new_pcache(/*is_compressed=*/true); table_options.persistent_cache = pcache; table_options.block_cache = nullptr; - table_options.block_cache_compressed = nullptr; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); break; default: @@ -372,10 +349,6 @@ void PersistentCacheDBTest::RunTest( Verify(num_iter, values); auto block_miss = TestGetTickerCount(options, BLOCK_CACHE_MISS); - auto compressed_block_hit = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); - auto compressed_block_miss = - TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); auto page_hit = TestGetTickerCount(options, PERSISTENT_CACHE_HIT); auto page_miss = TestGetTickerCount(options, PERSISTENT_CACHE_MISS); @@ -386,31 +359,12 @@ void PersistentCacheDBTest::RunTest( ASSERT_GT(page_miss, 0); ASSERT_GT(page_hit, 0); ASSERT_GT(block_miss, 0); - ASSERT_EQ(compressed_block_miss, 0); - ASSERT_EQ(compressed_block_hit, 0); break; case 1: - // page cache, block cache, compressed cache - ASSERT_GT(page_miss, 0); - ASSERT_GT(block_miss, 0); - ASSERT_GT(compressed_block_miss, 0); - break; case 2: - // page cache, block cache, compressed cache + KNoCompression - ASSERT_GT(page_miss, 0); - ASSERT_GT(page_hit, 0); - ASSERT_GT(block_miss, 0); - ASSERT_GT(compressed_block_miss, 0); - // remember kNoCompression - ASSERT_EQ(compressed_block_hit, 0); - break; - case 3: - case 4: // page cache, no block cache, no compressed cache 
ASSERT_GT(page_miss, 0); ASSERT_GT(page_hit, 0); - ASSERT_EQ(compressed_block_hit, 0); - ASSERT_EQ(compressed_block_miss, 0); break; default: FAIL(); @@ -457,6 +411,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -#else // !defined ROCKSDB_LITE -int main() { return 0; } -#endif // !defined ROCKSDB_LITE diff --git a/utilities/persistent_cache/persistent_cache_test.h b/utilities/persistent_cache/persistent_cache_test.h index f13155ed6ea2..aab61bb773b4 100644 --- a/utilities/persistent_cache/persistent_cache_test.h +++ b/utilities/persistent_cache/persistent_cache_test.h @@ -8,7 +8,6 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -283,4 +282,3 @@ class PersistentCacheDBTest : public DBTestBase { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/persistent_cache_tier.cc b/utilities/persistent_cache/persistent_cache_tier.cc index 54cbce8f707f..773aafbf2608 100644 --- a/utilities/persistent_cache/persistent_cache_tier.cc +++ b/utilities/persistent_cache/persistent_cache_tier.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // -#ifndef ROCKSDB_LITE #include "utilities/persistent_cache/persistent_cache_tier.h" @@ -164,4 +163,3 @@ bool PersistentTieredCache::IsCompressed() { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/persistent_cache_tier.h b/utilities/persistent_cache/persistent_cache_tier.h index 65aadcd3f4f5..44d2fbba31ae 100644 --- a/utilities/persistent_cache/persistent_cache_tier.h +++ b/utilities/persistent_cache/persistent_cache_tier.h @@ -5,7 +5,6 @@ // #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -339,4 +338,3 @@ class PersistentTieredCache : public PersistentCacheTier { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/volatile_tier_impl.cc b/utilities/persistent_cache/volatile_tier_impl.cc index 45d2830aa808..6264f5ef3aab 100644 --- a/utilities/persistent_cache/volatile_tier_impl.cc +++ b/utilities/persistent_cache/volatile_tier_impl.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
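Note on the persistent_cache_test.cc changes above: with block_cache_compressed gone, the three surviving use cases are persistent cache plus an uncompressed block cache, persistent cache alone, and persistent cache alone caching compressed blocks. A minimal configuration sketch for the first shape, assuming the public NewPersistentCache() factory from rocksdb/persistent_cache.h; the path and sizes are illustrative, not taken from the tests:

    #include <cstdint>
    #include <memory>
    #include <string>

    #include "rocksdb/cache.h"
    #include "rocksdb/db.h"
    #include "rocksdb/env.h"
    #include "rocksdb/persistent_cache.h"
    #include "rocksdb/table.h"

    // Sketch: an in-memory block cache plus a persistent (SSD/NVM) cache tier.
    rocksdb::Status OpenWithPersistentCache(const std::string& db_path,
                                            const std::string& pcache_path,
                                            rocksdb::DB** db) {
      std::shared_ptr<rocksdb::PersistentCache> pcache;
      rocksdb::Status s = rocksdb::NewPersistentCache(
          rocksdb::Env::Default(), pcache_path, uint64_t{1} << 30 /* 1 GiB */,
          nullptr /* info log */, true /* optimized_for_nvm */, &pcache);
      if (!s.ok()) {
        return s;
      }
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache = rocksdb::NewLRUCache(256 << 20);  // 256 MiB
      table_options.persistent_cache = pcache;
      rocksdb::Options options;
      options.create_if_missing = true;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return rocksdb::DB::Open(options, db_path, db);
    }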
// -#ifndef ROCKSDB_LITE #include "utilities/persistent_cache/volatile_tier_impl.h" @@ -137,4 +136,3 @@ bool VolatileCacheTier::Evict() { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/persistent_cache/volatile_tier_impl.h b/utilities/persistent_cache/volatile_tier_impl.h index 09265e457fd1..f5d30644381a 100644 --- a/utilities/persistent_cache/volatile_tier_impl.h +++ b/utilities/persistent_cache/volatile_tier_impl.h @@ -5,7 +5,6 @@ // #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -138,4 +137,3 @@ class VolatileCacheTier : public PersistentCacheTier { } // namespace ROCKSDB_NAMESPACE -#endif diff --git a/utilities/simulator_cache/sim_cache.cc b/utilities/simulator_cache/sim_cache.cc index 0f0c098710b4..ff9d52dca9be 100644 --- a/utilities/simulator_cache/sim_cache.cc +++ b/utilities/simulator_cache/sim_cache.cc @@ -9,7 +9,7 @@ #include #include "file/writable_file_writer.h" -#include "monitoring/statistics.h" +#include "monitoring/statistics_impl.h" #include "port/port.h" #include "rocksdb/env.h" #include "rocksdb/file_system.h" @@ -151,23 +151,26 @@ class SimCacheImpl : public SimCache { // capacity for real cache (ShardedLRUCache) // test_capacity for key only cache SimCacheImpl(std::shared_ptr sim_cache, std::shared_ptr cache) - : cache_(cache), + : SimCache(cache), key_only_cache_(sim_cache), miss_times_(0), hit_times_(0), stats_(nullptr) {} ~SimCacheImpl() override {} - void SetCapacity(size_t capacity) override { cache_->SetCapacity(capacity); } + + const char* Name() const override { return "SimCache"; } + + void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); } void SetStrictCapacityLimit(bool strict_capacity_limit) override { - cache_->SetStrictCapacityLimit(strict_capacity_limit); + target_->SetStrictCapacityLimit(strict_capacity_limit); } - using Cache::Insert; Status Insert(const Slice& key, Cache::ObjectPtr value, const CacheItemHelper* helper, size_t charge, Handle** handle, - Priority priority) override { + Priority priority, const Slice& compressed = {}, + CompressionType type = kNoCompression) override { // The handle and value passed in are for real cache, so we pass nullptr // to key_only_cache_ for both instead. Also, the deleter function pointer // will be called by user to perform some external operation which should @@ -176,73 +179,82 @@ class SimCacheImpl : public SimCache { Handle* h = key_only_cache_->Lookup(key); if (h == nullptr) { // TODO: Check for error here? 
- auto s = key_only_cache_->Insert(key, nullptr, &kNoopCacheItemHelper, - charge, nullptr, priority); + auto s = + key_only_cache_->Insert(key, nullptr, &kNoopCacheItemHelper, charge, + nullptr, priority, compressed, type); s.PermitUncheckedError(); } else { key_only_cache_->Release(h); } cache_activity_logger_.ReportAdd(key, charge); - if (!cache_) { + if (!target_) { return Status::OK(); } - return cache_->Insert(key, value, helper, charge, handle, priority); + return target_->Insert(key, value, helper, charge, handle, priority, + compressed, type); } Handle* Lookup(const Slice& key, const CacheItemHelper* helper, CreateContext* create_context, - Priority priority = Priority::LOW, bool wait = true, + Priority priority = Priority::LOW, Statistics* stats = nullptr) override { HandleLookup(key, stats); - if (!cache_) { + if (!target_) { return nullptr; } - return cache_->Lookup(key, helper, create_context, priority, wait, stats); + return target_->Lookup(key, helper, create_context, priority, stats); + } + + void StartAsyncLookup(AsyncLookupHandle& async_handle) override { + HandleLookup(async_handle.key, async_handle.stats); + if (target_) { + target_->StartAsyncLookup(async_handle); + } } - bool Ref(Handle* handle) override { return cache_->Ref(handle); } + bool Ref(Handle* handle) override { return target_->Ref(handle); } using Cache::Release; bool Release(Handle* handle, bool erase_if_last_ref = false) override { - return cache_->Release(handle, erase_if_last_ref); + return target_->Release(handle, erase_if_last_ref); } void Erase(const Slice& key) override { - cache_->Erase(key); + target_->Erase(key); key_only_cache_->Erase(key); } Cache::ObjectPtr Value(Handle* handle) override { - return cache_->Value(handle); + return target_->Value(handle); } - uint64_t NewId() override { return cache_->NewId(); } + uint64_t NewId() override { return target_->NewId(); } - size_t GetCapacity() const override { return cache_->GetCapacity(); } + size_t GetCapacity() const override { return target_->GetCapacity(); } bool HasStrictCapacityLimit() const override { - return cache_->HasStrictCapacityLimit(); + return target_->HasStrictCapacityLimit(); } - size_t GetUsage() const override { return cache_->GetUsage(); } + size_t GetUsage() const override { return target_->GetUsage(); } size_t GetUsage(Handle* handle) const override { - return cache_->GetUsage(handle); + return target_->GetUsage(handle); } size_t GetCharge(Handle* handle) const override { - return cache_->GetCharge(handle); + return target_->GetCharge(handle); } const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override { - return cache_->GetCacheItemHelper(handle); + return target_->GetCacheItemHelper(handle); } - size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); } + size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); } void DisownData() override { - cache_->DisownData(); + target_->DisownData(); key_only_cache_->DisownData(); } @@ -250,11 +262,11 @@ class SimCacheImpl : public SimCache { const std::function& callback, const ApplyToAllEntriesOptions& opts) override { - cache_->ApplyToAllEntries(callback, opts); + target_->ApplyToAllEntries(callback, opts); } void EraseUnRefEntries() override { - cache_->EraseUnRefEntries(); + target_->EraseUnRefEntries(); key_only_cache_->EraseUnRefEntries(); } @@ -295,7 +307,7 @@ class SimCacheImpl : public SimCache { std::string GetPrintableOptions() const override { std::ostringstream oss; oss << " cache_options:" << std::endl; - oss << 
cache_->GetPrintableOptions(); + oss << target_->GetPrintableOptions(); oss << " sim_cache_options:" << std::endl; oss << key_only_cache_->GetPrintableOptions(); return oss.str(); @@ -314,7 +326,6 @@ class SimCacheImpl : public SimCache { } private: - std::shared_ptr cache_; std::shared_ptr key_only_cache_; std::atomic miss_times_; std::atomic hit_times_; diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/utilities/table_properties_collectors/compact_on_deletion_collector.cc index 16f33934dcae..a3b5189da03c 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.cc @@ -15,7 +15,6 @@ #include "util/string_util.h" namespace ROCKSDB_NAMESPACE { -#ifndef ROCKSDB_LITE CompactOnDeletionCollector::CompactOnDeletionCollector( size_t sliding_window_size, size_t deletion_trigger, double deletion_ratio) @@ -100,7 +99,6 @@ Status CompactOnDeletionCollector::Finish( } static std::unordered_map on_deletion_collector_type_info = { -#ifndef ROCKSDB_LITE {"window_size", {0, OptionType::kUnknown, OptionVerificationType::kNormal, OptionTypeFlags::kCompareNever | OptionTypeFlags::kMutable, @@ -156,7 +154,6 @@ static std::unordered_map }, nullptr}}, -#endif // ROCKSDB_LITE }; CompactOnDeletionCollectorFactory::CompactOnDeletionCollectorFactory( @@ -208,20 +205,17 @@ static int RegisterTablePropertiesCollectorFactories( return 1; } } // namespace -#endif // !ROCKSDB_LITE Status TablePropertiesCollectorFactory::CreateFromString( const ConfigOptions& options, const std::string& value, std::shared_ptr* result) { -#ifndef ROCKSDB_LITE static std::once_flag once; std::call_once(once, [&]() { RegisterTablePropertiesCollectorFactories(*(ObjectLibrary::Default().get()), ""); }); -#endif // ROCKSDB_LITE return LoadSharedObject(options, value, - nullptr, result); + result); } } // namespace ROCKSDB_NAMESPACE diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h index 2f7dc4f1b63c..c267463a02be 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.h +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/table_properties_collectors.h" namespace ROCKSDB_NAMESPACE { @@ -67,4 +66,3 @@ class CompactOnDeletionCollector : public TablePropertiesCollector { bool finished_; }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index 88aeb8d5c92f..5de18e262433 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -9,7 +9,6 @@ #include -#ifndef ROCKSDB_LITE #include #include #include @@ -237,9 +236,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -#else -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as RocksDBLite does not include utilities.\n"); - return 0; -} -#endif // !ROCKSDB_LITE diff --git a/utilities/trace/file_trace_reader_writer.cc b/utilities/trace/file_trace_reader_writer.cc index 5886d3539fc2..f2ca741442b7 100644 --- a/utilities/trace/file_trace_reader_writer.cc +++ 
b/utilities/trace/file_trace_reader_writer.cc @@ -42,8 +42,7 @@ Status FileTraceReader::Reset() { Status FileTraceReader::Read(std::string* data) { assert(file_reader_ != nullptr); Status s = file_reader_->Read(IOOptions(), offset_, kTraceMetadataSize, - &result_, buffer_, nullptr, - Env::IO_TOTAL /* rate_limiter_priority */); + &result_, buffer_, nullptr); if (!s.ok()) { return s; } @@ -68,7 +67,7 @@ Status FileTraceReader::Read(std::string* data) { bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read; while (to_read > 0) { s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, buffer_, - nullptr, Env::IO_TOTAL /* rate_limiter_priority */); + nullptr); if (!s.ok()) { return s; } diff --git a/utilities/trace/replayer_impl.cc b/utilities/trace/replayer_impl.cc index 31023f1a26d2..32a5ad7f051c 100644 --- a/utilities/trace/replayer_impl.cc +++ b/utilities/trace/replayer_impl.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/trace/replayer_impl.h" @@ -313,4 +312,3 @@ void ReplayerImpl::BackgroundWork(void* arg) { } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/trace/replayer_impl.h b/utilities/trace/replayer_impl.h index 367b0b51ed75..f484ab237f02 100644 --- a/utilities/trace/replayer_impl.h +++ b/utilities/trace/replayer_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -83,4 +82,3 @@ struct ReplayerWorkerArg { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/lock_manager.cc b/utilities/transactions/lock/lock_manager.cc index df16b32ad9a4..f0bef953b60d 100644 --- a/utilities/transactions/lock/lock_manager.cc +++ b/utilities/transactions/lock/lock_manager.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/lock/lock_manager.h" @@ -26,4 +25,3 @@ std::shared_ptr NewLockManager(PessimisticTransactionDB* db, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/lock_manager.h b/utilities/transactions/lock/lock_manager.h index a5ce1948c1d7..d90ed9b00772 100644 --- a/utilities/transactions/lock/lock_manager.h +++ b/utilities/transactions/lock/lock_manager.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/types.h" #include "rocksdb/utilities/transaction.h" @@ -79,4 +78,3 @@ std::shared_ptr NewLockManager(PessimisticTransactionDB* db, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/lock_tracker.h b/utilities/transactions/lock/lock_tracker.h index 5fa228a82900..fe09ab22c595 100644 --- a/utilities/transactions/lock/lock_tracker.h +++ b/utilities/transactions/lock/lock_tracker.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
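Note on the compact_on_deletion_collector changes above: the collector and its factory are now unconditionally compiled and registered with the ObjectLibrary, so TablePropertiesCollectorFactory::CreateFromString() can resolve it by name, and it can also be attached directly. A short usage sketch; the thresholds are illustrative:

    #include "rocksdb/options.h"
    #include "rocksdb/utilities/table_properties_collectors.h"

    rocksdb::Options MakeDeletionCompactionOptions() {
      rocksdb::Options options;
      // Mark an SST file for compaction when, within any sliding window of
      // 128K entries, at least 16K are deletions (values are illustrative).
      options.table_properties_collector_factories.emplace_back(
          rocksdb::NewCompactOnDeletionCollectorFactory(
              128 * 1024 /* sliding_window_size */,
              16 * 1024 /* deletion_trigger */,
              0.0 /* deletion_ratio; <= 0 disables the ratio trigger */));
      return options;
    }

A file whose window crosses the trigger is flagged as needing compaction, so deletion-heavy SST files are compacted away sooner.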
#pragma once -#ifndef ROCKSDB_LITE #include @@ -206,4 +205,3 @@ class LockTrackerFactory { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/point/point_lock_manager.cc b/utilities/transactions/lock/point/point_lock_manager.cc index b362a164dddd..b73b3fe7618c 100644 --- a/utilities/transactions/lock/point/point_lock_manager.cc +++ b/utilities/transactions/lock/point/point_lock_manager.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/lock/point/point_lock_manager.h" @@ -718,4 +717,3 @@ void PointLockManager::UnLock(PessimisticTransaction* /* txn */, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/point/point_lock_manager.h b/utilities/transactions/lock/point/point_lock_manager.h index eeb34f3bec4e..99183ca1cd2f 100644 --- a/utilities/transactions/lock/point/point_lock_manager.h +++ b/utilities/transactions/lock/point/point_lock_manager.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -221,4 +220,3 @@ class PointLockManager : public LockManager { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/point/point_lock_manager_test.cc b/utilities/transactions/lock/point/point_lock_manager_test.cc index 525fdea71388..28ce5275ddbe 100644 --- a/utilities/transactions/lock/point/point_lock_manager_test.cc +++ b/utilities/transactions/lock/point/point_lock_manager_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/lock/point/point_lock_manager_test.h" @@ -129,14 +128,14 @@ TEST_F(PointLockManagerTest, DeadlockDepthExceeded) { port::Thread t1 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { ASSERT_OK(locker_->TryLock(txn2, 1, "k2", env_, true)); // block because txn1 is holding a lock on k1. - locker_->TryLock(txn2, 1, "k1", env_, true); + ASSERT_OK(locker_->TryLock(txn2, 1, "k1", env_, true)); }); ASSERT_OK(locker_->TryLock(txn3, 1, "k3", env_, true)); port::Thread t2 = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { // block because txn3 is holding a lock on k1. - locker_->TryLock(txn4, 1, "k3", env_, true); + ASSERT_OK(locker_->TryLock(txn4, 1, "k3", env_, true)); }); auto s = locker_->TryLock(txn3, 1, "k2", env_, true); @@ -169,13 +168,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED because Transactions are not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/point/point_lock_manager_test.h b/utilities/transactions/lock/point/point_lock_manager_test.h index ca9f46bf9da7..51d9076b272e 100644 --- a/utilities/transactions/lock/point/point_lock_manager_test.h +++ b/utilities/transactions/lock/point/point_lock_manager_test.h @@ -244,7 +244,7 @@ TEST_P(AnyLockManagerTest, Deadlock) { // txn1 tries to lock k2, will block forever. port::Thread t = BlockUntilWaitingTxn(wait_sync_point_name_, [&]() { // block because txn2 is holding a lock on k2. 
- locker_->TryLock(txn1, 1, "k2", env_, true); + ASSERT_OK(locker_->TryLock(txn1, 1, "k2", env_, true)); }); auto s = locker_->TryLock(txn2, 1, "k1", env_, true); diff --git a/utilities/transactions/lock/point/point_lock_tracker.cc b/utilities/transactions/lock/point/point_lock_tracker.cc index 6204a8f02184..ce86c648688a 100644 --- a/utilities/transactions/lock/point/point_lock_tracker.cc +++ b/utilities/transactions/lock/point/point_lock_tracker.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/lock/point/point_lock_tracker.h" @@ -254,4 +253,3 @@ void PointLockTracker::Clear() { tracked_keys_.clear(); } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/point/point_lock_tracker.h b/utilities/transactions/lock/point/point_lock_tracker.h index daf6f9aa276b..57e1b8437abe 100644 --- a/utilities/transactions/lock/point/point_lock_tracker.h +++ b/utilities/transactions/lock/point/point_lock_tracker.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -96,4 +95,3 @@ class PointLockTrackerFactory : public LockTrackerFactory { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_lock_manager.h b/utilities/transactions/lock/range/range_lock_manager.h index 01899542e5cc..22d653ff675d 100644 --- a/utilities/transactions/lock/range/range_lock_manager.h +++ b/utilities/transactions/lock/range/range_lock_manager.h @@ -8,7 +8,6 @@ // Generic definitions for a Range-based Lock Manager // #pragma once -#ifndef ROCKSDB_LITE #include "utilities/transactions/lock/lock_manager.h" @@ -33,4 +32,3 @@ class RangeLockManagerBase : public LockManager { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_locking_test.cc b/utilities/transactions/lock/range/range_locking_test.cc index bce66c1f360d..9c044910dfb0 100644 --- a/utilities/transactions/lock/range/range_locking_test.cc +++ b/utilities/transactions/lock/range/range_locking_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifndef OS_WIN #include @@ -447,13 +446,3 @@ int main(int /*argc*/, char** /*argv*/) { #endif // OS_WIN -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "skipped as transactions are not supported in rocksdb_lite\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/db.h b/utilities/transactions/lock/range/range_tree/lib/db.h index 5aa826c8e086..99cfa1f54109 100644 --- a/utilities/transactions/lock/range/range_tree/lib/db.h +++ b/utilities/transactions/lock/range/range_tree/lib/db.h @@ -1,3 +1,8 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+ #ifndef _DB_H #define _DB_H diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc index 5110cd48253b..181a1416de14 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -136,4 +135,3 @@ void concurrent_tree::locked_keyrange::remove_all(void) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc index e50ace5a9d3b..e61dfdfac7b6 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -219,4 +218,3 @@ void keyrange::replace_right_key(const DBT *key) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc index 3d217be70ec1..f79c91acff02 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -524,4 +523,3 @@ void lock_request::set_retry_test_callback(void (*f)(void)) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc index 3d6a590c7922..26b6d9f6901b 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=2:softtabstop=2: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -1020,4 +1019,3 @@ DICTIONARY_ID locktree::get_dict_id() const { return m_dict_id; } } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc index 4186182beaec..bec464e48e42 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -524,4 +523,3 @@ void locktree_manager::kill_waiter(void *extra) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git 
a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc index 1e1d23ef8af8..eb56b20adfbe 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -262,4 +261,3 @@ void range_buffer::append_point(const DBT *key, bool is_exclusive) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc index 8997f634b06a..4e469e380259 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -517,4 +516,3 @@ treenode *treenode::child_ptr::get_locked(void) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc index 4caf1e26fd8b..d7f989043cda 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -117,4 +116,3 @@ TXNID txnid_set::get(uint32_t i) const { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc index 24536c88ecf1..df0073e0d298 100644 --- a/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc +++ b/utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -210,4 +209,3 @@ void wfg::node::free(wfg::node *n) { } /* namespace toku */ #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h index af47800fb712..c50a3a07fc61 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_assert_subst.h @@ -1,3 +1,7 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
// // A replacement for toku_assert.h // diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h b/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h index eb8291c1d297..ad1d7bf54906 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/toku_external_pthread.h @@ -1,3 +1,7 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). /* A wrapper around ROCKSDB_NAMESPACE::TransactionDBMutexFactory-provided condition and mutex that provides toku_pthread_*-like interface. The functions diff --git a/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h b/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h index 803914862f9f..f4013bb36c58 100644 --- a/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h +++ b/utilities/transactions/lock/range/range_tree/lib/portability/txn_subst.h @@ -1,3 +1,7 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). // // A substitute for ft/txn/txn.h // diff --git a/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc b/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc index 50dc879ceae1..6dc86cc999a0 100644 --- a/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc +++ b/utilities/transactions/lock/range/range_tree/lib/standalone_port.cc @@ -1,4 +1,8 @@ -#ifndef ROCKSDB_LITE +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + #ifndef OS_WIN /* This is a dump ground to make Lock Tree work without the rest of TokuDB. 
@@ -129,4 +133,3 @@ int toku_builtin_compare_fun(const DBT *a, const DBT *b) { return toku_keycompare(a->data, a->size, b->data, b->size); } #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc b/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc index 63cc3a267a1f..4ea6249564c3 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc +++ b/utilities/transactions/lock/range/range_tree/lib/util/dbt.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -150,4 +149,3 @@ bool toku_dbt_equals(const DBT *a, const DBT *b) { } } #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc b/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc index 0e7a9880be52..087cb238d39a 100644 --- a/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc +++ b/utilities/transactions/lock/range/range_tree/lib/util/memarena.cc @@ -1,6 +1,5 @@ /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ifndef ROCKSDB_LITE #ifndef OS_WIN #ident "$Id$" /*====== @@ -198,4 +197,3 @@ bool memarena::chunk_iterator::more() const { return _chunk_idx < _ma->_n_other_chunks; } #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc index 531165deaa03..65ca91b0babc 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #ifndef OS_WIN #include "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h" @@ -500,4 +499,3 @@ LockManager::RangeLockStatus RangeTreeLockManager::GetRangeLockStatus() { } // namespace ROCKSDB_NAMESPACE #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h index e4236d600a00..2652add15916 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_manager.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #ifndef OS_WIN // For DeadlockInfoBuffer: @@ -134,4 +133,3 @@ void wait_callback_for_locktree(void* cdata, toku::lock_wait_infos* infos); } // namespace ROCKSDB_NAMESPACE #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc b/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc index be1e1478bc3b..5bfb86337671 100644 --- a/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc +++ b/utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
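Note on the range_tree files above (the TokuDB-derived locktree, still excluded on Windows by the OS_WIN guards): range locking is selected at runtime through TransactionDBOptions rather than a compile-time switch. A plug-in sketch, assuming the NewRangeLockManager() factory declared in rocksdb/utilities/transaction_db.h:

    #include <string>

    #include "rocksdb/utilities/transaction_db.h"

    rocksdb::Status OpenRangeLockingTxnDB(const std::string& path,
                                          rocksdb::TransactionDB** txn_db) {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::TransactionDBOptions txn_db_options;
      // Use the locktree-based range lock manager instead of point locks.
      txn_db_options.lock_mgr_handle =
          rocksdb::NewRangeLockManager(nullptr /* default mutex factory */);
      return rocksdb::TransactionDB::Open(options, txn_db_options, path, txn_db);
    }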
-#ifndef ROCKSDB_LITE #ifndef OS_WIN #include "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.h" @@ -153,4 +152,3 @@ void RangeLockList::ReplaceLocks(const toku::locktree *lt, } // namespace ROCKSDB_NAMESPACE #endif // OS_WIN -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/optimistic_transaction.cc b/utilities/transactions/optimistic_transaction.cc index 0ee0f28b6744..e8506f281614 100644 --- a/utilities/transactions/optimistic_transaction.cc +++ b/utilities/transactions/optimistic_transaction.cc @@ -3,10 +3,10 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/optimistic_transaction.h" +#include #include #include "db/column_family.h" @@ -16,6 +16,7 @@ #include "rocksdb/status.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "util/cast_util.h" +#include "util/defer.h" #include "util/string_util.h" #include "utilities/transactions/lock/point/point_lock_tracker.h" #include "utilities/transactions/optimistic_transaction.h" @@ -97,28 +98,42 @@ Status OptimisticTransaction::CommitWithParallelValidate() { assert(txn_db_impl); DBImpl* db_impl = static_cast_with_check(db_->GetRootDB()); assert(db_impl); - const size_t space = txn_db_impl->GetLockBucketsSize(); - std::set lk_idxes; - std::vector> lks; + std::set lk_ptrs; std::unique_ptr cf_it( tracked_locks_->GetColumnFamilyIterator()); assert(cf_it != nullptr); while (cf_it->HasNext()) { ColumnFamilyId cf = cf_it->Next(); + + // To avoid the same key(s) contending across CFs or DBs, seed the + // hash independently. + uint64_t seed = reinterpret_cast(db_impl) + + uint64_t{0xb83c07fbc6ced699} /*random prime*/ * cf; + std::unique_ptr key_it( tracked_locks_->GetKeyIterator(cf)); assert(key_it != nullptr); while (key_it->HasNext()) { - const std::string& key = key_it->Next(); - lk_idxes.insert(FastRange64(GetSliceNPHash64(key), space)); + auto lock_bucket_ptr = &txn_db_impl->GetLockBucket(key_it->Next(), seed); + TEST_SYNC_POINT_CALLBACK( + "OptimisticTransaction::CommitWithParallelValidate::lock_bucket_ptr", + lock_bucket_ptr); + lk_ptrs.insert(lock_bucket_ptr); } } // NOTE: in a single txn, all bucket-locks are taken in ascending order. // In this way, txns from different threads all obey this rule so that // deadlock can be avoided. - for (auto v : lk_idxes) { - lks.emplace_back(txn_db_impl->LockBucket(v)); + for (auto v : lk_ptrs) { + // WART: if an exception is thrown during a Lock(), previously locked will + // not be Unlock()ed. But a vector of MutexLock is likely inefficient. 
+ v->Lock(); } + Defer unlocks([&]() { + for (auto v : lk_ptrs) { + v->Unlock(); + } + }); Status s = TransactionUtil::CheckKeysForConflicts(db_impl, *tracked_locks_, true /* cache_only */); @@ -192,5 +207,3 @@ Status OptimisticTransaction::SetName(const TransactionName& /* unused */) { } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/optimistic_transaction.h b/utilities/transactions/optimistic_transaction.h index de23233d573c..a03fa16970fe 100644 --- a/utilities/transactions/optimistic_transaction.h +++ b/utilities/transactions/optimistic_transaction.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -98,4 +97,3 @@ class OptimisticTransactionCallback : public WriteCallback { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index bffb3d5ed8dc..30efa86aafb8 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/optimistic_transaction_db_impl.h" @@ -18,6 +17,15 @@ namespace ROCKSDB_NAMESPACE { +std::shared_ptr MakeSharedOccLockBuckets(size_t bucket_count, + bool cache_aligned) { + if (cache_aligned) { + return std::make_shared>(bucket_count); + } else { + return std::make_shared>(bucket_count); + } +} + Transaction* OptimisticTransactionDBImpl::BeginTransaction( const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options, Transaction* old_txn) { @@ -29,12 +37,6 @@ Transaction* OptimisticTransactionDBImpl::BeginTransaction( } } -std::unique_lock OptimisticTransactionDBImpl::LockBucket( - size_t idx) { - assert(idx < bucketed_locks_.size()); - return std::unique_lock(*bucketed_locks_[idx]); -} - Status OptimisticTransactionDB::Open(const Options& options, const std::string& dbname, OptimisticTransactionDB** dbptr) { @@ -108,4 +110,3 @@ void OptimisticTransactionDBImpl::ReinitializeTransaction( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index 88e86ea4a67e..7bc718e9bdcf 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -4,18 +4,43 @@ // (found in the LICENSE.Apache file in the root directory). 
#pragma once -#ifndef ROCKSDB_LITE #include -#include +#include +#include #include #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "util/cast_util.h" +#include "util/mutexlock.h" namespace ROCKSDB_NAMESPACE { +class OccLockBucketsImplBase : public OccLockBuckets { + public: + virtual port::Mutex& GetLockBucket(const Slice& key, uint64_t seed) = 0; +}; + +template +class OccLockBucketsImpl : public OccLockBucketsImplBase { + public: + explicit OccLockBucketsImpl(size_t bucket_count) : locks_(bucket_count) {} + port::Mutex& GetLockBucket(const Slice& key, uint64_t seed) override { + return locks_.Get(key, seed); + } + size_t ApproximateMemoryUsage() const override { + return locks_.ApproximateMemoryUsage(); + } + + private: + // TODO: investigate optionally using folly::MicroLock to majorly save space + using M = std::conditional_t, + port::Mutex>; + Striped locks_; +}; + class OptimisticTransactionDBImpl : public OptimisticTransactionDB { public: explicit OptimisticTransactionDBImpl( @@ -25,12 +50,13 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { db_owner_(take_ownership), validate_policy_(occ_options.validate_policy) { if (validate_policy_ == OccValidationPolicy::kValidateParallel) { - uint32_t bucket_size = std::max(16u, occ_options.occ_lock_buckets); - bucketed_locks_.reserve(bucket_size); - for (size_t i = 0; i < bucket_size; ++i) { - bucketed_locks_.emplace_back( - std::unique_ptr(new std::mutex)); + auto bucketed_locks = occ_options.shared_lock_buckets; + if (!bucketed_locks) { + uint32_t bucket_count = std::max(16u, occ_options.occ_lock_buckets); + bucketed_locks = MakeSharedOccLockBuckets(bucket_count); } + bucketed_locks_ = static_cast_with_check( + std::move(bucketed_locks)); } } @@ -63,16 +89,14 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { return OptimisticTransactionDB::Write(write_opts, batch); } - size_t GetLockBucketsSize() const { return bucketed_locks_.size(); } - OccValidationPolicy GetValidatePolicy() const { return validate_policy_; } - std::unique_lock LockBucket(size_t idx); + port::Mutex& GetLockBucket(const Slice& key, uint64_t seed) { + return bucketed_locks_->GetLockBucket(key, seed); + } private: - // NOTE: used in validation phase. Each key is hashed into some - // bucket. We then take the lock in the hash value order to avoid deadlock. - std::vector> bucketed_locks_; + std::shared_ptr bucketed_locks_; bool db_owner_; @@ -85,4 +109,3 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index aa8192c325be..733494180459 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -3,9 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
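The optimistic_transaction_db_impl.h changes above replace the per-DB vector of std::mutex with striped OccLockBuckets that hash each key with a per-CF/per-DB seed and can be shared across DB instances. A usage sketch of the new options surface, assuming OccLockBuckets, shared_lock_buckets, and MakeSharedOccLockBuckets() are exposed in rocksdb/utilities/optimistic_transaction_db.h as this change suggests; the bucket count is illustrative:

    #include <memory>
    #include <string>
    #include <vector>

    #include "rocksdb/utilities/optimistic_transaction_db.h"

    // Open an optimistic transaction DB whose commit-validation lock buckets
    // are supplied by the caller, so several DBs can share one allocation.
    rocksdb::Status OpenWithSharedBuckets(
        const std::string& dbname,
        const std::shared_ptr<rocksdb::OccLockBuckets>& shared_buckets,
        std::unique_ptr<rocksdb::OptimisticTransactionDB>* out) {
      rocksdb::Options options;
      options.create_if_missing = true;

      rocksdb::OptimisticTransactionDBOptions occ_opts;
      occ_opts.validate_policy = rocksdb::OccValidationPolicy::kValidateParallel;
      occ_opts.shared_lock_buckets = shared_buckets;

      std::vector<rocksdb::ColumnFamilyDescriptor> cfs = {
          rocksdb::ColumnFamilyDescriptor(rocksdb::kDefaultColumnFamilyName,
                                          rocksdb::ColumnFamilyOptions(options))};
      std::vector<rocksdb::ColumnFamilyHandle*> handles;
      rocksdb::OptimisticTransactionDB* raw = nullptr;
      rocksdb::Status s = rocksdb::OptimisticTransactionDB::Open(
          rocksdb::DBOptions(options), occ_opts, dbname, cfs, &handles, &raw);
      if (s.ok()) {
        delete handles[0];  // as in the test above, the default CF stays usable
        out->reset(raw);
      }
      return s;
    }

    // e.g. auto buckets = rocksdb::MakeSharedOccLockBuckets(500 /* illustrative */);
    // then pass the same `buckets` to several OpenWithSharedBuckets() calls.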
-#ifndef ROCKSDB_LITE - +#include #include +#include #include #include @@ -28,49 +28,54 @@ class OptimisticTransactionTest : public testing::Test, public testing::WithParamInterface { public: - OptimisticTransactionDB* txn_db; + std::unique_ptr txn_db; std::string dbname; Options options; + OptimisticTransactionDBOptions occ_opts; OptimisticTransactionTest() { options.create_if_missing = true; options.max_write_buffer_number = 2; options.max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize; options.merge_operator.reset(new TestPutOperator()); + occ_opts.validate_policy = GetParam(); dbname = test::PerThreadDBPath("optimistic_transaction_testdb"); EXPECT_OK(DestroyDB(dbname, options)); Open(); } ~OptimisticTransactionTest() override { - delete txn_db; + EXPECT_OK(txn_db->Close()); + txn_db.reset(); EXPECT_OK(DestroyDB(dbname, options)); } void Reopen() { - delete txn_db; - txn_db = nullptr; + txn_db.reset(); Open(); } - private: - void Open() { + static void OpenImpl(const Options& options, + const OptimisticTransactionDBOptions& occ_opts, + const std::string& dbname, + std::unique_ptr* txn_db) { ColumnFamilyOptions cf_options(options); - OptimisticTransactionDBOptions occ_opts; - occ_opts.validate_policy = GetParam(); std::vector column_families; std::vector handles; column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); - Status s = - OptimisticTransactionDB::Open(DBOptions(options), occ_opts, dbname, - column_families, &handles, &txn_db); - + OptimisticTransactionDB* raw_txn_db = nullptr; + Status s = OptimisticTransactionDB::Open( + options, occ_opts, dbname, column_families, &handles, &raw_txn_db); ASSERT_OK(s); - ASSERT_NE(txn_db, nullptr); + ASSERT_NE(raw_txn_db, nullptr); + txn_db->reset(raw_txn_db); ASSERT_EQ(handles.size(), 1); delete handles[0]; } + + private: + void Open() { OpenImpl(options, occ_opts, dbname, &txn_db); } }; TEST_P(OptimisticTransactionTest, SuccessTest) { @@ -317,17 +322,11 @@ TEST_P(OptimisticTransactionTest, FlushTest) { delete txn; } -TEST_P(OptimisticTransactionTest, FlushTest2) { - WriteOptions write_options; - ReadOptions read_options, snapshot_read_options; +namespace { +void FlushTest2PopulateTxn(Transaction* txn) { + ReadOptions snapshot_read_options; std::string value; - ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); - ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); - - Transaction* txn = txn_db->BeginTransaction(write_options); - ASSERT_NE(txn, nullptr); - snapshot_read_options.snapshot = txn->GetSnapshot(); ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); @@ -337,6 +336,21 @@ TEST_P(OptimisticTransactionTest, FlushTest2) { ASSERT_OK(txn->GetForUpdate(snapshot_read_options, "foo", &value)); ASSERT_EQ(value, "bar2"); +} +} // namespace + +TEST_P(OptimisticTransactionTest, FlushTest2) { + WriteOptions write_options; + ReadOptions read_options; + std::string value; + + ASSERT_OK(txn_db->Put(write_options, Slice("foo"), Slice("bar"))); + ASSERT_OK(txn_db->Put(write_options, Slice("foo2"), Slice("bar"))); + + Transaction* txn = txn_db->BeginTransaction(write_options); + ASSERT_NE(txn, nullptr); + + FlushTest2PopulateTxn(txn); // Put a random key so we have a MemTable to flush ASSERT_OK(txn_db->Put(write_options, "dummy", "dummy")); @@ -362,9 +376,23 @@ TEST_P(OptimisticTransactionTest, FlushTest2) { // txn should not commit since MemTableList History is not large enough ASSERT_TRUE(s.IsTryAgain()); + // simply trying Commit again doesn't help + s = 
txn->Commit(); + ASSERT_TRUE(s.IsTryAgain()); + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); ASSERT_EQ(value, "bar"); + // But rolling back and redoing does + ASSERT_OK(txn->Rollback()); + + FlushTest2PopulateTxn(txn); + + ASSERT_OK(txn->Commit()); + + ASSERT_OK(txn_db->Get(read_options, "foo", &value)); + ASSERT_EQ(value, "bar2"); + delete txn; } @@ -617,8 +645,11 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { delete cfa; delete cfb; - delete txn_db; - txn_db = nullptr; + txn_db.reset(); + + OptimisticTransactionDBOptions my_occ_opts = occ_opts; + const size_t bucket_count = 500; + my_occ_opts.shared_lock_buckets = MakeSharedOccLockBuckets(bucket_count); // open DB with three column families std::vector column_families; @@ -631,10 +662,11 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { column_families.push_back( ColumnFamilyDescriptor("CFB", ColumnFamilyOptions())); std::vector handles; - ASSERT_OK(OptimisticTransactionDB::Open(options, dbname, column_families, - &handles, &txn_db)); - assert(txn_db != nullptr); - ASSERT_NE(txn_db, nullptr); + OptimisticTransactionDB* raw_txn_db = nullptr; + ASSERT_OK(OptimisticTransactionDB::Open( + options, my_occ_opts, dbname, column_families, &handles, &raw_txn_db)); + ASSERT_NE(raw_txn_db, nullptr); + txn_db.reset(raw_txn_db); Transaction* txn = txn_db->BeginTransaction(write_options); ASSERT_NE(txn, nullptr); @@ -672,6 +704,7 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { s = txn_db->Get(read_options, "AAA", &value); ASSERT_TRUE(s.IsNotFound()); s = txn_db->Get(read_options, handles[2], "AAAZZZ", &value); + ASSERT_OK(s); ASSERT_EQ(value, "barbar"); Slice key_slices[3] = {Slice("AAA"), Slice("ZZ"), Slice("Z")}; @@ -695,6 +728,7 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { delete txn; delete txn2; + // ** MultiGet ** txn = txn_db->BeginTransaction(write_options, txn_options); snapshot_read_options.snapshot = txn->GetSnapshot(); @@ -746,11 +780,162 @@ TEST_P(OptimisticTransactionTest, ColumnFamiliesTest) { s = txn2->Commit(); ASSERT_TRUE(s.IsBusy()); + delete txn; + delete txn2; + + // ** Test independence and/or sharing of lock buckets across CFs and DBs ** + if (my_occ_opts.validate_policy == OccValidationPolicy::kValidateParallel) { + struct SeenStat { + uint64_t rolling_hash = 0; + uintptr_t min = 0; + uintptr_t max = 0; + }; + SeenStat cur_seen; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "OptimisticTransaction::CommitWithParallelValidate::lock_bucket_ptr", + [&](void* arg) { + // Hash the pointer + cur_seen.rolling_hash = Hash64(reinterpret_cast(&arg), + sizeof(arg), cur_seen.rolling_hash); + uintptr_t val = reinterpret_cast(arg); + if (cur_seen.min == 0 || val < cur_seen.min) { + cur_seen.min = val; + } + if (cur_seen.max == 0 || val > cur_seen.max) { + cur_seen.max = val; + } + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + // Another db sharing lock buckets + auto shared_dbname = + test::PerThreadDBPath("optimistic_transaction_testdb_shared"); + std::unique_ptr shared_txn_db = nullptr; + OpenImpl(options, my_occ_opts, shared_dbname, &shared_txn_db); + + // Another db not sharing lock buckets + auto nonshared_dbname = + test::PerThreadDBPath("optimistic_transaction_testdb_nonshared"); + std::unique_ptr nonshared_txn_db = nullptr; + my_occ_opts.occ_lock_buckets = bucket_count; + my_occ_opts.shared_lock_buckets = nullptr; + OpenImpl(options, my_occ_opts, nonshared_dbname, &nonshared_txn_db); + + // Plenty of keys to avoid randomly hitting the 
same hash sequence + std::array keys; + for (size_t i = 0; i < keys.size(); ++i) { + keys[i] = std::to_string(i); + } + + // Get a baseline pattern of bucket accesses + cur_seen = {}; + txn = txn_db->BeginTransaction(write_options, txn_options); + for (const auto& key : keys) { + ASSERT_OK(txn->Put(handles[0], key, "blah")); + } + ASSERT_OK(txn->Commit()); + // Sufficiently large hash coverage of the space + const uintptr_t min_span_bytes = sizeof(port::Mutex) * bucket_count / 2; + ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes); + // Save + SeenStat base_seen = cur_seen; + + // Verify it is repeatable + cur_seen = {}; + txn = txn_db->BeginTransaction(write_options, txn_options, txn); + for (const auto& key : keys) { + ASSERT_OK(txn->Put(handles[0], key, "moo")); + } + ASSERT_OK(txn->Commit()); + ASSERT_EQ(cur_seen.rolling_hash, base_seen.rolling_hash); + ASSERT_EQ(cur_seen.min, base_seen.min); + ASSERT_EQ(cur_seen.max, base_seen.max); + + // Try another CF + cur_seen = {}; + txn = txn_db->BeginTransaction(write_options, txn_options, txn); + for (const auto& key : keys) { + ASSERT_OK(txn->Put(handles[1], key, "blah")); + } + ASSERT_OK(txn->Commit()); + // Different access pattern (different hash seed) + ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash); + // Same pointer space + ASSERT_LT(cur_seen.min, base_seen.max); + ASSERT_GT(cur_seen.max, base_seen.min); + // Sufficiently large hash coverage of the space + ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes); + // Save + SeenStat cf1_seen = cur_seen; + + // And another CF + cur_seen = {}; + txn = txn_db->BeginTransaction(write_options, txn_options, txn); + for (const auto& key : keys) { + ASSERT_OK(txn->Put(handles[2], key, "blah")); + } + ASSERT_OK(txn->Commit()); + // Different access pattern (different hash seed) + ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash); + ASSERT_NE(cur_seen.rolling_hash, cf1_seen.rolling_hash); + // Same pointer space + ASSERT_LT(cur_seen.min, base_seen.max); + ASSERT_GT(cur_seen.max, base_seen.min); + // Sufficiently large hash coverage of the space + ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes); + + // And DB with shared lock buckets + cur_seen = {}; + delete txn; + txn = shared_txn_db->BeginTransaction(write_options, txn_options); + for (const auto& key : keys) { + ASSERT_OK(txn->Put(key, "blah")); + } + ASSERT_OK(txn->Commit()); + // Different access pattern (different hash seed) + ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash); + ASSERT_NE(cur_seen.rolling_hash, cf1_seen.rolling_hash); + // Same pointer space + ASSERT_LT(cur_seen.min, base_seen.max); + ASSERT_GT(cur_seen.max, base_seen.min); + // Sufficiently large hash coverage of the space + ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes); + + // And DB with distinct lock buckets + cur_seen = {}; + delete txn; + txn = nonshared_txn_db->BeginTransaction(write_options, txn_options); + for (const auto& key : keys) { + ASSERT_OK(txn->Put(key, "blah")); + } + ASSERT_OK(txn->Commit()); + // Different access pattern (different hash seed) + ASSERT_NE(cur_seen.rolling_hash, base_seen.rolling_hash); + ASSERT_NE(cur_seen.rolling_hash, cf1_seen.rolling_hash); + // Different pointer space + ASSERT_TRUE(cur_seen.min > base_seen.max || cur_seen.max < base_seen.min); + // Sufficiently large hash coverage of the space + ASSERT_GT(cur_seen.max - cur_seen.min, min_span_bytes); + + delete txn; + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + + // ** Test dropping column family before 
committing, or even creating txn ** + txn = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn->Delete(handles[1], "AAA")); + s = txn_db->DropColumnFamily(handles[1]); ASSERT_OK(s); s = txn_db->DropColumnFamily(handles[2]); ASSERT_OK(s); + ASSERT_NOK(txn->Commit()); + + txn2 = txn_db->BeginTransaction(write_options, txn_options); + ASSERT_OK(txn2->Delete(handles[2], "AAA")); + ASSERT_NOK(txn2->Commit()); + delete txn; delete txn2; @@ -1238,7 +1423,7 @@ TEST_P(OptimisticTransactionTest, UndoGetForUpdateTest) { txn1->UndoGetForUpdate("A"); Transaction* txn2 = txn_db->BeginTransaction(write_options); - txn2->Put("A", "x"); + ASSERT_OK(txn2->Put("A", "x")); ASSERT_OK(txn2->Commit()); delete txn2; @@ -1403,7 +1588,7 @@ TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) { std::function call_inserter = [&] { ASSERT_OK(OptimisticTransactionStressTestInserter( - txn_db, num_transactions_per_thread, num_sets, num_keys_per_set)); + txn_db.get(), num_transactions_per_thread, num_sets, num_keys_per_set)); }; // Create N threads that use RandomTransactionInserter to write @@ -1418,7 +1603,7 @@ TEST_P(OptimisticTransactionTest, OptimisticTransactionStressTest) { } // Verify that data is consistent - Status s = RandomTransactionInserter::Verify(txn_db, num_sets); + Status s = RandomTransactionInserter::Verify(txn_db.get(), num_sets); ASSERT_OK(s); } @@ -1450,6 +1635,47 @@ TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) { delete transaction; } +#ifdef __SANITIZE_THREAD__ +// Skip OptimisticTransactionTest.SequenceNumberAfterRecoverLargeTest under TSAN +// to avoid false positive because of TSAN lock limit of 64. +#else +TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverLargeTest) { + WriteOptions write_options; + OptimisticTransactionOptions transaction_options; + + Transaction* transaction( + txn_db->BeginTransaction(write_options, transaction_options)); + + std::string value(1024 * 1024, 'X'); + const size_t n_zero = 2; + std::string s_i; + Status s; + for (int i = 1; i <= 64; i++) { + s_i = std::to_string(i); + auto key = std::string(n_zero - std::min(n_zero, s_i.length()), '0') + s_i; + s = transaction->Put(key, value); + ASSERT_OK(s); + } + + s = transaction->Commit(); + ASSERT_OK(s); + delete transaction; + + Reopen(); + transaction = txn_db->BeginTransaction(write_options, transaction_options); + s = transaction->Put("bar", "val"); + ASSERT_OK(s); + s = transaction->Commit(); + if (!s.ok()) { + std::cerr << "Failed to commit records. 
Error: " << s.ToString() + << std::endl; + } + ASSERT_OK(s); + + delete transaction; +} +#endif // __SANITIZE_THREAD__ + TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) { std::unique_ptr txn(txn_db->BeginTransaction(WriteOptions())); ASSERT_OK(txn->Put("a", "v")); @@ -1470,6 +1696,19 @@ INSTANTIATE_TEST_CASE_P( testing::Values(OccValidationPolicy::kValidateSerial, OccValidationPolicy::kValidateParallel)); +TEST(OccLockBucketsTest, CacheAligned) { + // Typical x86_64 is 40 byte mutex, 64 byte cache line + if (sizeof(port::Mutex) >= sizeof(CacheAlignedWrapper)) { + ROCKSDB_GTEST_BYPASS("Test requires mutex smaller than cache line"); + return; + } + auto buckets_unaligned = MakeSharedOccLockBuckets(100, false); + auto buckets_aligned = MakeSharedOccLockBuckets(100, true); + // Save at least one byte per bucket + ASSERT_LE(buckets_unaligned->ApproximateMemoryUsage() + 100, + buckets_aligned->ApproximateMemoryUsage()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { @@ -1477,15 +1716,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf( - stderr, - "SKIPPED as optimistic_transaction is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index cb8fd3bb61cf..1e870190e39a 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/pessimistic_transaction.h" @@ -167,6 +166,11 @@ template inline Status WriteCommittedTxn::GetForUpdateImpl( const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, TValue* value, bool exclusive, const bool do_validate) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } column_family = column_family ? column_family : db_impl_->DefaultColumnFamily(); assert(column_family); @@ -179,8 +183,8 @@ inline Status WriteCommittedTxn::GetForUpdateImpl( value, exclusive, do_validate); } } else { - Status s = db_impl_->FailIfTsMismatchCf( - column_family, *(read_options.timestamp), /*ts_for_read=*/true); + Status s = + db_impl_->FailIfTsMismatchCf(column_family, *(read_options.timestamp)); if (!s.ok()) { return s; } @@ -884,14 +888,8 @@ Status PessimisticTransaction::LockBatch(WriteBatch* batch, Handler() {} void RecordKey(uint32_t column_family_id, const Slice& key) { - std::string key_str = key.ToString(); - auto& cfh_keys = keys_[column_family_id]; - auto iter = cfh_keys.find(key_str); - if (iter == cfh_keys.end()) { - // key not yet seen, store it. - cfh_keys.insert({std::move(key_str)}); - } + cfh_keys.insert(key.ToString()); } Status PutCF(uint32_t column_family_id, const Slice& key, @@ -1170,6 +1168,16 @@ Status PessimisticTransaction::SetName(const TransactionName& name) { return s; } -} // namespace ROCKSDB_NAMESPACE +Status PessimisticTransaction::CollapseKey(const ReadOptions& options, + const Slice& key, + ColumnFamilyHandle* column_family) { + auto* cfh = column_family ? 
column_family : db_impl_->DefaultColumnFamily(); + std::string value; + const auto status = GetForUpdate(options, cfh, key, &value, true, true); + if (!status.ok()) { + return status; + } + return Put(column_family, key, value); +} -#endif // ROCKSDB_LITE +} // namespace ROCKSDB_NAMESPACE diff --git a/utilities/transactions/pessimistic_transaction.h b/utilities/transactions/pessimistic_transaction.h index d43d1d3ac505..bb12266ec4bf 100644 --- a/utilities/transactions/pessimistic_transaction.h +++ b/utilities/transactions/pessimistic_transaction.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -120,6 +119,10 @@ class PessimisticTransaction : public TransactionBaseImpl { const Endpoint& start_key, const Endpoint& end_key) override; + virtual Status CollapseKey( + const ReadOptions& options, const Slice& key, + ColumnFamilyHandle* column_family = nullptr) override; + protected: // Refer to // TransactionOptions::use_only_the_last_commit_time_batch_for_recovery @@ -309,5 +312,3 @@ class WriteCommittedTxn : public PessimisticTransaction { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/pessimistic_transaction_db.cc b/utilities/transactions/pessimistic_transaction_db.cc index 950ef80422d8..8009bef197b0 100644 --- a/utilities/transactions/pessimistic_transaction_db.cc +++ b/utilities/transactions/pessimistic_transaction_db.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/pessimistic_transaction_db.h" @@ -779,4 +778,3 @@ Status SnapshotCreationCallback::operator()(SequenceNumber seq, } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/pessimistic_transaction_db.h b/utilities/transactions/pessimistic_transaction_db.h index 25cd11054090..b662048bd49e 100644 --- a/utilities/transactions/pessimistic_transaction_db.h +++ b/utilities/transactions/pessimistic_transaction_db.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -315,4 +314,3 @@ class SnapshotCreationCallback : public PostMemTableCallback { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/snapshot_checker.cc b/utilities/transactions/snapshot_checker.cc index 76d16681a17f..da363a12fbf2 100644 --- a/utilities/transactions/snapshot_checker.cc +++ b/utilities/transactions/snapshot_checker.cc @@ -5,27 +5,12 @@ #include "db/snapshot_checker.h" -#ifdef ROCKSDB_LITE -#include -#endif // ROCKSDB_LITE #include "port/lang.h" #include "utilities/transactions/write_prepared_txn_db.h" namespace ROCKSDB_NAMESPACE { -#ifdef ROCKSDB_LITE -WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( - WritePreparedTxnDB* /*txn_db*/) {} - -SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( - SequenceNumber /*sequence*/, SequenceNumber /*snapshot_sequence*/) const { - // Should never be called in LITE mode. 
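Illustrative sketch (not part of the patch): the new `Transaction::CollapseKey` above is implemented as a `GetForUpdate` of the fully merged value followed by a `Put` of that value, so later reads see a single operand instead of a merge chain. The sketch below exercises it the same way the `TransactionDBTest.CollapseKey` test later in this diff does; the DB path is made up, and the string-append operator comes from the internal `utilities/merge_operators.h` helper used by the tests.

// Hedged sketch: collapsing a merge chain with Transaction::CollapseKey.
// The DB path is illustrative; the merge-operator factory is a test helper.
#include <cassert>
#include <memory>
#include <vector>

#include "rocksdb/utilities/transaction_db.h"
#include "utilities/merge_operators.h"  // internal test helper for stringappend

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  options.merge_operator = MergeOperators::CreateStringAppendOperator();
  TransactionDB* db = nullptr;
  assert(TransactionDB::Open(options, TransactionDBOptions(),
                             "/tmp/collapse_key_demo", &db)
             .ok());

  // Build up a merge chain: one Put followed by two Merges.
  assert(db->Put(WriteOptions(), "hello", "world").ok());
  assert(db->Merge(WriteOptions(), "hello", "world").ok());
  assert(db->Merge(WriteOptions(), "hello", "world").ok());

  {
    // Collapse inside a transaction: GetForUpdate + Put of the merged value.
    std::unique_ptr<Transaction> txn(db->BeginTransaction(WriteOptions()));
    assert(txn->CollapseKey(ReadOptions(), "hello").ok());
    assert(txn->Commit().ok());
  }

  {
    // After collapsing, the key is backed by a single operand (the Put).
    std::vector<PinnableSlice> operands(3);
    GetMergeOperandsOptions opts;
    opts.expected_max_number_of_operands = 3;
    int num_operands = 0;
    assert(db->GetMergeOperands(ReadOptions(), db->DefaultColumnFamily(),
                                "hello", operands.data(), &opts, &num_operands)
               .ok());
    assert(num_operands == 1);
  }

  delete db;
  return 0;
}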
- assert(false); - return SnapshotCheckerResult::kInSnapshot; -} - -#else WritePreparedSnapshotChecker::WritePreparedSnapshotChecker( WritePreparedTxnDB* txn_db) @@ -44,7 +29,6 @@ SnapshotCheckerResult WritePreparedSnapshotChecker::CheckInSnapshot( : SnapshotCheckerResult::kNotInSnapshot; } -#endif // ROCKSDB_LITE DisableGCSnapshotChecker* DisableGCSnapshotChecker::Instance() { STATIC_AVOID_DESTRUCTION(DisableGCSnapshotChecker, instance); diff --git a/utilities/transactions/timestamped_snapshot_test.cc b/utilities/transactions/timestamped_snapshot_test.cc index e9b474415d03..9681b0157ad3 100644 --- a/utilities/transactions/timestamped_snapshot_test.cc +++ b/utilities/transactions/timestamped_snapshot_test.cc @@ -3,14 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifdef ROCKSDB_LITE -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Transactions are not supported in LITE mode\n"); - return 0; -} -#else // ROCKSDB_LITE #include #include "util/cast_util.h" @@ -463,4 +455,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } -#endif // !ROCKSDB_LITE diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 83fd94ac8557..b232736cfb76 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/transaction_base.h" @@ -111,6 +110,8 @@ void TransactionBaseImpl::Reinitialize(DB* db, start_time_ = dbimpl_->GetSystemClock()->NowMicros(); indexing_enabled_ = true; cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily()); + WriteBatchInternal::SetDefaultColumnFamilyTimestampSize( + write_batch_.GetWriteBatch(), cmp_->timestamp_size()); WriteBatchInternal::UpdateProtectionInfo( write_batch_.GetWriteBatch(), write_options_.protection_bytes_per_key) .PermitUncheckedError(); @@ -233,22 +234,56 @@ Status TransactionBaseImpl::PopSavePoint() { return write_batch_.PopSavePoint(); } -Status TransactionBaseImpl::Get(const ReadOptions& read_options, +Status TransactionBaseImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + auto s = GetImpl(read_options, column_family, key, value); + return s; +} + +Status TransactionBaseImpl::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) { assert(value != nullptr); PinnableSlice pinnable_val(value); assert(!pinnable_val.IsPinned()); - auto s = Get(read_options, column_family, key, &pinnable_val); + auto s = GetImpl(read_options, column_family, key, &pinnable_val); if (s.ok() && pinnable_val.IsPinned()) { value->assign(pinnable_val.data(), pinnable_val.size()); } // else value is already assigned return s; } -Status TransactionBaseImpl::Get(const 
ReadOptions& read_options, +Status TransactionBaseImpl::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + return GetImpl(read_options, column_family, key, pinnable_val); +} + +Status TransactionBaseImpl::GetImpl(const ReadOptions& read_options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val) { return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, pinnable_val); } @@ -263,6 +298,11 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, "If do_validate is false then GetForUpdate with snapshot is not " "defined."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } Status s = TryLock(column_family, key, true /* read_only */, exclusive, do_validate); @@ -270,7 +310,7 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, assert(value != nullptr); PinnableSlice pinnable_val(value); assert(!pinnable_val.IsPinned()); - s = Get(read_options, column_family, key, &pinnable_val); + s = GetImpl(read_options, column_family, key, &pinnable_val); if (s.ok() && pinnable_val.IsPinned()) { value->assign(pinnable_val.data(), pinnable_val.size()); } // else value is already assigned @@ -289,35 +329,72 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, "If do_validate is false then GetForUpdate with snapshot is not " "defined."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } Status s = TryLock(column_family, key, true /* read_only */, exclusive, do_validate); if (s.ok() && pinnable_val != nullptr) { - s = Get(read_options, column_family, key, pinnable_val); + s = GetImpl(read_options, column_family, key, pinnable_val); } return s; } std::vector TransactionBaseImpl::MultiGet( - const ReadOptions& read_options, + const ReadOptions& _read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { size_t num_keys = keys.size(); - values->resize(num_keys); - std::vector stat_list(num_keys); + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = s; + } + return stat_list; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + + values->resize(num_keys); for (size_t i = 0; i < num_keys; ++i) { - stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + stat_list[i] = + GetImpl(read_options, column_family[i], keys[i], 
&(*values)[i]); } return stat_list; } -void TransactionBaseImpl::MultiGet(const ReadOptions& read_options, +void TransactionBaseImpl::MultiGet(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, num_keys, keys, values, statuses, sorted_input); @@ -327,8 +404,14 @@ std::vector TransactionBaseImpl::MultiGetForUpdate( const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { - // Regardless of whether the MultiGet succeeded, track these keys. size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } + // Regardless of whether the MultiGet succeeded, track these keys. values->resize(num_keys); // Lock all keys @@ -344,7 +427,8 @@ std::vector TransactionBaseImpl::MultiGetForUpdate( // TODO(agiardullo): optimize multiget? std::vector stat_list(num_keys); for (size_t i = 0; i < num_keys; ++i) { - stat_list[i] = Get(read_options, column_family[i], keys[i], &(*values)[i]); + stat_list[i] = + GetImpl(read_options, column_family[i], keys[i], &(*values)[i]); } return stat_list; @@ -727,5 +811,3 @@ WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() { return &commit_time_batch_; } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 1bcb20ca90bd..be363b473a5f 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -54,11 +53,13 @@ class TransactionBaseImpl : public Transaction { Status PopSavePoint() override; using Transaction::Get; - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, std::string* value) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override; - Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family, - const Slice& key, PinnableSlice* value) override; + Status Get(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; Status Get(const ReadOptions& options, const Slice& key, std::string* value) override { @@ -85,7 +86,7 @@ class TransactionBaseImpl : public Transaction { using Transaction::MultiGet; std::vector MultiGet( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) override; @@ -99,9 +100,10 @@ class TransactionBaseImpl : 
public Transaction { keys, values); } - void MultiGet(const ReadOptions& options, ColumnFamilyHandle* column_family, - const size_t num_keys, const Slice* keys, PinnableSlice* values, - Status* statuses, const bool sorted_input = false) override; + void MultiGet(const ReadOptions& _read_options, + ColumnFamilyHandle* column_family, const size_t num_keys, + const Slice* keys, PinnableSlice* values, Status* statuses, + const bool sorted_input = false) override; using Transaction::MultiGetForUpdate; std::vector MultiGetForUpdate( @@ -261,6 +263,13 @@ class TransactionBaseImpl : public Transaction { LockTracker& GetTrackedLocks() { return *tracked_locks_; } protected: + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) override; + + virtual Status GetImpl(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + PinnableSlice* value) override; + // Add a key to the list of tracked keys. // // seqno is the earliest seqno this key was involved with this transaction. @@ -380,5 +389,3 @@ class TransactionBaseImpl : public Transaction { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_db_mutex_impl.cc b/utilities/transactions/transaction_db_mutex_impl.cc index 345c4be902fd..52c299b376a9 100644 --- a/utilities/transactions/transaction_db_mutex_impl.cc +++ b/utilities/transactions/transaction_db_mutex_impl.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/transaction_db_mutex_impl.h" @@ -132,4 +131,3 @@ Status TransactionDBCondVarImpl::WaitFor( } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_db_mutex_impl.h b/utilities/transactions/transaction_db_mutex_impl.h index fbee92832089..f509f4017a5c 100644 --- a/utilities/transactions/transaction_db_mutex_impl.h +++ b/utilities/transactions/transaction_db_mutex_impl.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include "rocksdb/utilities/transaction_db_mutex.h" @@ -23,4 +22,3 @@ class TransactionDBMutexFactoryImpl : public TransactionDBMutexFactory { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index d74a4b8b116f..d12626ca8c5c 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
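The Get/MultiGet changes above all follow one pattern: the public entry point validates `ReadOptions::io_activity`, copies the options, stamps the expected activity if it is still `kUnknown`, and delegates to a private `GetImpl`, while internal callers (such as the GetForUpdate path) call `GetImpl` directly so the check runs once per user-visible operation. Below is a condensed, self-contained sketch of that pattern; the class and member names are hypothetical, not RocksDB's.

// Hedged sketch of the validate-then-delegate pattern behind the
// Get()/GetImpl() split above. Class and member names are hypothetical.
#include <cassert>
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

using namespace ROCKSDB_NAMESPACE;

class TxnReadPath {
 public:
  // Public API: reject foreign io_activity values, stamp our own, delegate.
  Status Get(const ReadOptions& _read_options, const Slice& key,
             std::string* value) {
    if (_read_options.io_activity != Env::IOActivity::kUnknown &&
        _read_options.io_activity != Env::IOActivity::kGet) {
      return Status::InvalidArgument(
          "ReadOptions::io_activity must be kUnknown or kGet");
    }
    ReadOptions read_options(_read_options);
    if (read_options.io_activity == Env::IOActivity::kUnknown) {
      read_options.io_activity = Env::IOActivity::kGet;
    }
    return GetImpl(read_options, key, value);
  }

 private:
  // Internal callers invoke GetImpl directly, bypassing the check above.
  Status GetImpl(const ReadOptions& /*read_options*/, const Slice& key,
                 std::string* value) {
    *value = "stub value for " + key.ToString();  // stand-in for the real read
    return Status::OK();
  }
};

int main() {
  TxnReadPath path;
  std::string v;
  ReadOptions ro;  // io_activity defaults to kUnknown
  assert(path.Get(ro, "k", &v).ok());
  ro.io_activity = Env::IOActivity::kCompaction;  // foreign activity
  assert(path.Get(ro, "k", &v).IsInvalidArgument());
  return 0;
}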
-#ifndef ROCKSDB_LITE #include "utilities/transactions/transaction_test.h" @@ -26,7 +25,6 @@ #include "test_util/transaction_test_util.h" #include "util/random.h" #include "util/string_util.h" -#include "utilities/fault_injection_env.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" @@ -80,6 +78,112 @@ INSTANTIATE_TEST_CASE_P( std::make_tuple(false, true, WRITE_PREPARED, kUnorderedWrite, true))); #endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) +TEST_P(TransactionTest, TestUpperBoundUponDeletion) { + // Reproduction from the original bug report, 11606 + // This test does writes without snapshot validation, and then tries to create + // iterator later, which is unsupported in write unprepared. + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + + WriteOptions write_options; + ReadOptions read_options; + Status s; + + Transaction* txn = db->BeginTransaction(write_options); + ASSERT_TRUE(txn); + + // Write some keys in a txn + s = txn->Put("2", "2"); + ASSERT_OK(s); + + s = txn->Put("1", "1"); + ASSERT_OK(s); + + s = txn->Delete("2"); + ASSERT_OK(s); + + read_options.iterate_upper_bound = new Slice("2", 1); + Iterator* iter = txn->GetIterator(read_options); + ASSERT_OK(iter->status()); + iter->SeekToFirst(); + while (iter->Valid()) { + ASSERT_EQ("1", iter->key().ToString()); + iter->Next(); + } + delete iter; + delete txn; + delete read_options.iterate_upper_bound; +} + +TEST_P(TransactionTest, TestTxnRespectBoundsInReadOption) { + if (txn_db_options.write_policy == WRITE_UNPREPARED) { + return; + } + + WriteOptions write_options; + + { + std::unique_ptr txn(db->BeginTransaction(write_options)); + // writes that should be observed by base_iterator_ in BaseDeltaIterator + ASSERT_OK(txn->Put("a", "aa")); + ASSERT_OK(txn->Put("c", "cc")); + ASSERT_OK(txn->Put("e", "ee")); + ASSERT_OK(txn->Put("f", "ff")); + ASSERT_TRUE(txn->Commit().ok()); + } + + std::unique_ptr txn2(db->BeginTransaction(write_options)); + // writes that should be observed by delta_iterator_ in BaseDeltaIterator + ASSERT_OK(txn2->Put("b", "bb")); + ASSERT_OK(txn2->Put("c", "cc")); + ASSERT_OK(txn2->Put("f", "ff")); + + // delta_iterator_: b c f + // base_iterator_: a c e f + // + // given range [c, f) + // assert only {c, e} can be seen + + ReadOptions ro; + ro.iterate_lower_bound = new Slice("c"); + ro.iterate_upper_bound = new Slice("f"); + std::unique_ptr iter(txn2->GetIterator(ro)); + + iter->Seek(Slice("b")); + ASSERT_EQ("c", iter->key()); // lower bound capping + iter->Seek(Slice("f")); + ASSERT_FALSE(iter->Valid()); // out of bound + + iter->SeekForPrev(Slice("f")); + ASSERT_EQ("e", iter->key()); // upper bound capping + iter->SeekForPrev(Slice("b")); + ASSERT_FALSE(iter->Valid()); // out of bound + + // move to the lower bound + iter->SeekToFirst(); + ASSERT_EQ("c", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + + // move to the upper bound + iter->SeekToLast(); + ASSERT_EQ("e", iter->key()); + iter->Next(); + ASSERT_FALSE(iter->Valid()); + + // reversely walk to the beginning + iter->SeekToLast(); + ASSERT_EQ("e", iter->key()); + iter->Prev(); + ASSERT_EQ("c", iter->key()); + iter->Prev(); + ASSERT_FALSE(iter->Valid()); + + delete ro.iterate_lower_bound; + delete ro.iterate_upper_bound; +} + TEST_P(TransactionTest, DoubleEmptyWrite) { WriteOptions write_options; write_options.sync = true; @@ -1384,7 +1488,7 @@ TEST_P(TransactionTest, 
PersistentTwoPhaseTransactionTest) { ASSERT_OK(db_impl->TEST_FlushMemTable(true)); // regular db read - db->Get(read_options, "foo2", &value); + ASSERT_OK(db->Get(read_options, "foo2", &value)); ASSERT_EQ(value, "bar2"); // nothing has been prepped yet @@ -1432,7 +1536,7 @@ TEST_P(TransactionTest, PersistentTwoPhaseTransactionTest) { ASSERT_OK(s); // value is now available - db->Get(read_options, "foo", &value); + ASSERT_OK(db->Get(read_options, "foo", &value)); ASSERT_EQ(value, "bar"); // we already committed @@ -1601,12 +1705,12 @@ TEST_P(TransactionStressTest, TwoPhaseLongPrepareTest) { if (i % 29 == 0) { // crash - env->SetFilesystemActive(false); + fault_fs->SetFilesystemActive(false); reinterpret_cast(db)->TEST_Crash(); - ReOpenNoDelete(); + ASSERT_OK(ReOpenNoDelete()); } else if (i % 37 == 0) { // close - ReOpenNoDelete(); + ASSERT_OK(ReOpenNoDelete()); } } @@ -1669,8 +1773,8 @@ TEST_P(TransactionTest, TwoPhaseSequenceTest) { delete txn; // kill and reopen - env->SetFilesystemActive(false); - ReOpenNoDelete(); + fault_fs->SetFilesystemActive(false); + ASSERT_OK(ReOpenNoDelete()); assert(db != nullptr); // value is now available @@ -1706,9 +1810,9 @@ TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) { delete txn; // kill and reopen - env->SetFilesystemActive(false); + fault_fs->SetFilesystemActive(false); reinterpret_cast(db)->TEST_Crash(); - ReOpenNoDelete(); + ASSERT_OK(ReOpenNoDelete()); // commit old txn assert(db != nullptr); // Make clang analyze happy. @@ -1739,7 +1843,7 @@ TEST_P(TransactionTest, TwoPhaseDoubleRecoveryTest) { delete txn; // kill and reopen - env->SetFilesystemActive(false); + fault_fs->SetFilesystemActive(false); ASSERT_OK(ReOpenNoDelete()); assert(db != nullptr); @@ -2091,7 +2195,7 @@ TEST_P(TransactionTest, TwoPhaseOutOfOrderDelete) { ASSERT_OK(db->FlushWAL(false)); // kill and reopen - env->SetFilesystemActive(false); + fault_fs->SetFilesystemActive(false); reinterpret_cast(db)->TEST_Crash(); ASSERT_OK(ReOpenNoDelete()); assert(db != nullptr); @@ -2188,9 +2292,9 @@ TEST_P(TransactionTest, WriteConflictTest) { s = txn->Commit(); ASSERT_OK(s); - db->Get(read_options, "foo", &value); + ASSERT_OK(db->Get(read_options, "foo", &value)); ASSERT_EQ(value, "A2"); - db->Get(read_options, "foo2", &value); + ASSERT_OK(db->Get(read_options, "foo2", &value)); ASSERT_EQ(value, "B2"); delete txn; @@ -2232,13 +2336,13 @@ TEST_P(TransactionTest, WriteConflictTest2) { ASSERT_OK(s); // Txn should commit, but only write foo2 and foo3 // Verify that transaction wrote foo2 and foo3 but not foo - db->Get(read_options, "foo", &value); + ASSERT_OK(db->Get(read_options, "foo", &value)); ASSERT_EQ(value, "barz"); - db->Get(read_options, "foo2", &value); + ASSERT_OK(db->Get(read_options, "foo2", &value)); ASSERT_EQ(value, "X"); - db->Get(read_options, "foo3", &value); + ASSERT_OK(db->Get(read_options, "foo3", &value)); ASSERT_EQ(value, "Y"); delete txn; @@ -2330,13 +2434,13 @@ TEST_P(TransactionTest, FlushTest) { // force a memtable flush FlushOptions flush_ops; - db->Flush(flush_ops); + ASSERT_OK(db->Flush(flush_ops)); s = txn->Commit(); // txn should commit since the flushed table is still in MemtableList History ASSERT_OK(s); - db->Get(read_options, "foo", &value); + ASSERT_OK(db->Get(read_options, "foo", &value)); ASSERT_EQ(value, "bar2"); delete txn; @@ -2494,6 +2598,24 @@ TEST_P(TransactionTest, FlushTest2) { } } +TEST_P(TransactionTest, WaitForCompactAbortOnPause) { + Status s = ReOpen(); + ASSERT_OK(s); + assert(db != nullptr); + + DBImpl* db_impl = 
static_cast_with_check(db->GetRootDB()); + + // Pause the background jobs. + ASSERT_OK(db_impl->PauseBackgroundWork()); + + WaitForCompactOptions waitForCompactOptions = WaitForCompactOptions(); + waitForCompactOptions.abort_on_pause = true; + s = db->WaitForCompact(waitForCompactOptions); + ASSERT_NOK(s); + ASSERT_FALSE(s.IsNotSupported()); + ASSERT_TRUE(s.IsAborted()); +} + TEST_P(TransactionTest, NoSnapshotTest) { WriteOptions write_options; ReadOptions read_options; @@ -4974,7 +5096,7 @@ TEST_P(TransactionTest, DeleteRangeSupportTest) { } break; case WRITE_PREPARED: - // Intentional fall-through + FALLTHROUGH_INTENDED; case WRITE_UNPREPARED: if (skip_concurrency_control && skip_duplicate_key_check) { ASSERT_OK(s); @@ -6007,7 +6129,7 @@ TEST_P(TransactionTest, DuplicateKeys) { cf_options.max_successive_merges = 2; cf_options.merge_operator = MergeOperators::CreateStringAppendOperator(); ASSERT_OK(ReOpen()); - db->CreateColumnFamily(cf_options, cf_name, &cf_handle); + ASSERT_OK(db->CreateColumnFamily(cf_options, cf_name, &cf_handle)); WriteOptions write_options; // Ensure one value for the key ASSERT_OK(db->Put(write_options, cf_handle, Slice("key"), Slice("value"))); @@ -6350,11 +6472,11 @@ TEST_P(TransactionTest, DoubleCrashInRecovery) { // Corrupt the last log file in the middle, so that it is not corrupted // in the tail. std::string file_content; - ASSERT_OK(ReadFileToString(env, fname, &file_content)); + ASSERT_OK(ReadFileToString(env.get(), fname, &file_content)); file_content[400] = 'h'; file_content[401] = 'a'; ASSERT_OK(env->DeleteFile(fname)); - ASSERT_OK(WriteStringToFile(env, file_content, fname, true)); + ASSERT_OK(WriteStringToFile(env.get(), file_content, fname, true)); // Recover from corruption std::vector handles; @@ -6641,6 +6763,157 @@ TEST_P(TransactionTest, StallTwoWriteQueues) { ASSERT_TRUE(t2_completed); } +// Make sure UnlockWAL does not return until the stall it controls is cleared. +TEST_P(TransactionTest, UnlockWALStallCleared) { + auto dbimpl = static_cast_with_check(db->GetRootDB()); + for (bool external_stall : {false, true}) { + WriteOptions wopts; + wopts.sync = true; + wopts.disableWAL = false; + + ASSERT_OK(db->Put(wopts, "k1", "val1")); + + // Stall writes + ASSERT_OK(db->LockWAL()); + + std::unique_ptr token; + if (external_stall) { + // Also make sure UnlockWAL can return despite another stall being in + // effect. + token = dbimpl->TEST_write_controler().GetStopToken(); + } + + SyncPoint::GetInstance()->DisableProcessing(); + std::vector sync_deps; + sync_deps.push_back( + {"DBImpl::DelayWrite:Wait", + "TransactionTest::UnlockWALStallCleared:BeforeUnlockWAL1"}); + if (options.two_write_queues && + txn_db_options.write_policy == WRITE_COMMITTED) { + sync_deps.push_back( + {"DBImpl::DelayWrite:NonmemWait", + "TransactionTest::UnlockWALStallCleared:BeforeUnlockWAL2"}); + } + SyncPoint::GetInstance()->LoadDependency(sync_deps); + SyncPoint::GetInstance()->SetCallBack( + "DBImpl::DelayWrite:AfterWait", [](void* arg) { + auto& mu = *static_cast(arg); + mu.AssertHeld(); + // Pretend we are slow waking up from bg_cv_, to give a chance for the + // bug to occur if it can. Randomly prefer one queue over the other. 
+ mu.Unlock(); + if (Random::GetTLSInstance()->OneIn(2)) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } else { + std::this_thread::yield(); + } + mu.Lock(); + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Create blocking writes (for both queues) in background and use + // sync point dependency to get the stall into the write queue(s) + std::atomic t1_completed{false}; + port::Thread t1{[&]() { + ASSERT_OK(db->Put(wopts, "k2", "val2")); + t1_completed = true; + }}; + + std::atomic t2_completed{false}; + port::Thread t2{[&]() { + std::unique_ptr txn0{db->BeginTransaction(wopts, {})}; + ASSERT_OK(txn0->SetName("x1")); + ASSERT_OK(txn0->Put("k3", "val3")); + ASSERT_OK(txn0->Prepare()); // nonmem + ASSERT_OK(txn0->Commit()); + }}; + + // Be sure the test is set up appropriately + TEST_SYNC_POINT("TransactionTest::UnlockWALStallCleared:BeforeUnlockWAL1"); + TEST_SYNC_POINT("TransactionTest::UnlockWALStallCleared:BeforeUnlockWAL2"); + ASSERT_FALSE(t1_completed.load()); + ASSERT_FALSE(t2_completed.load()); + + // Clear the stall + ASSERT_OK(db->UnlockWAL()); + + WriteOptions wopts2 = wopts; + if (external_stall) { + // We did not deadlock in UnlockWAL, so now async clear the external + // stall and then do a blocking write. + // DB mutex acquire+release is needed to ensure we don't reset token and + // signal while DelayWrite() is between IsStopped() and + // BeginWriteStall(). + token.reset(); + dbimpl->TEST_LockMutex(); + dbimpl->TEST_UnlockMutex(); + dbimpl->TEST_SignalAllBgCv(); + } else { + // To verify the LockWAL stall is guaranteed cleared, do a non-blocking + // write that is attempting to catch a bug by attempting to come before + // the thread that did BeginWriteStall() can do EndWriteStall() + wopts2.no_slowdown = true; + } + std::unique_ptr txn0{db->BeginTransaction(wopts2, {})}; + ASSERT_OK(txn0->SetName("x2")); + ASSERT_OK(txn0->Put("k1", "val4")); + ASSERT_OK(txn0->Prepare()); // nonmem + ASSERT_OK(txn0->Commit()); + + t1.join(); + t2.join(); + } +} + +TEST_F(TransactionDBTest, CollapseKey) { + ASSERT_OK(ReOpen()); + ASSERT_OK(db->Put({}, "hello", "world")); + ASSERT_OK(db->Flush({})); + ASSERT_OK(db->Merge({}, "hello", "world")); + ASSERT_OK(db->Flush({})); + ASSERT_OK(db->Merge({}, "hello", "world")); + ASSERT_OK(db->Flush({})); + + std::string value; + ASSERT_OK(db->Get({}, "hello", &value)); + ASSERT_EQ("world,world,world", value); + + // get merge op info + std::vector operands(3); + GetMergeOperandsOptions mergeOperandOptions; + mergeOperandOptions.expected_max_number_of_operands = 3; + int numOperands; + ASSERT_OK(db->GetMergeOperands({}, db->DefaultColumnFamily(), "hello", + operands.data(), &mergeOperandOptions, + &numOperands)); + ASSERT_EQ(3, numOperands); + + // collapse key + { + std::unique_ptr txn0{ + db->BeginTransaction(WriteOptions{}, TransactionOptions{})}; + ASSERT_OK(txn0->CollapseKey(ReadOptions{}, "hello")); + ASSERT_OK(txn0->Commit()); + } + + // merge operands should be 1 + ASSERT_OK(db->GetMergeOperands({}, db->DefaultColumnFamily(), "hello", + operands.data(), &mergeOperandOptions, + &numOperands)); + ASSERT_EQ(1, numOperands); + + // get again after collapse + ASSERT_OK(db->Get({}, "hello", &value)); + ASSERT_EQ("world,world,world", value); + + // collapse of non-existent key + { + std::unique_ptr txn1{ + db->BeginTransaction(WriteOptions{}, TransactionOptions{})}; + ASSERT_TRUE(txn1->CollapseKey(ReadOptions{}, "dummy").IsNotFound()); + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { @@ -6648,14 
+6921,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_test.h b/utilities/transactions/transaction_test.h index 0b86453a409e..60c5c8a4ba07 100644 --- a/utilities/transactions/transaction_test.h +++ b/utilities/transactions/transaction_test.h @@ -25,7 +25,7 @@ #include "test_util/transaction_test_util.h" #include "util/random.h" #include "util/string_util.h" -#include "utilities/fault_injection_env.h" +#include "utilities/fault_injection_fs.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" #include "utilities/transactions/pessimistic_transaction_db.h" @@ -42,7 +42,8 @@ class TransactionTestBase : public ::testing::Test { public: TransactionDB* db; SpecialEnv special_env; - FaultInjectionTestEnv* env; + std::shared_ptr fault_fs; + std::unique_ptr env; std::string dbname; Options options; @@ -63,8 +64,9 @@ class TransactionTestBase : public ::testing::Test { options.level0_file_num_compaction_trigger = 2; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); special_env.skip_fsync_ = true; - env = new FaultInjectionTestEnv(&special_env); - options.env = env; + fault_fs.reset(new FaultInjectionTestFS(FileSystem::Default())); + env.reset(new CompositeEnvWrapper(&special_env, fault_fs)); + options.env = env.get(); options.two_write_queues = two_write_queue; dbname = test::PerThreadDBPath("transaction_testdb"); @@ -101,15 +103,14 @@ class TransactionTestBase : public ::testing::Test { } else { fprintf(stdout, "db is still in %s\n", dbname.c_str()); } - delete env; } Status ReOpenNoDelete() { delete db; db = nullptr; - env->AssertNoOpenFile(); - env->DropUnsyncedFileData(); - env->ResetState(); + fault_fs->AssertNoOpenFile(); + EXPECT_OK(fault_fs->DropUnsyncedFileData()); + fault_fs->ResetState(); Status s; if (use_stackable_db_ == false) { s = TransactionDB::Open(options, txn_db_options, dbname, &db); @@ -128,9 +129,9 @@ class TransactionTestBase : public ::testing::Test { handles->clear(); delete db; db = nullptr; - env->AssertNoOpenFile(); - env->DropUnsyncedFileData(); - env->ResetState(); + fault_fs->AssertNoOpenFile(); + EXPECT_OK(fault_fs->DropUnsyncedFileData()); + fault_fs->ResetState(); Status s; if (use_stackable_db_ == false) { s = TransactionDB::Open(options, txn_db_options, dbname, cfs, handles, @@ -145,7 +146,7 @@ class TransactionTestBase : public ::testing::Test { Status ReOpen() { delete db; db = nullptr; - DestroyDB(dbname, options); + EXPECT_OK(DestroyDB(dbname, options)); Status s; if (use_stackable_db_ == false) { s = TransactionDB::Open(options, txn_db_options, dbname, &db); @@ -487,6 +488,12 @@ class TransactionTest std::get<2>(GetParam()), std::get<3>(GetParam())){}; }; +class TransactionDBTest : public TransactionTestBase { + public: + TransactionDBTest() + : TransactionTestBase(false, false, WRITE_COMMITTED, kOrderedWrite) {} +}; + class TransactionStressTest : public TransactionTest {}; class MySQLStyleTransactionTest diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc index 360edc8ec1a7..d5a68807e77a 100644 --- a/utilities/transactions/transaction_util.cc +++ b/utilities/transactions/transaction_util.cc @@ -3,7 +3,6 @@ // COPYING file in the root 
directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/transaction_util.h" @@ -203,4 +202,3 @@ Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl, } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h index a349ba87a65c..725e1c927905 100644 --- a/utilities/transactions/transaction_util.h +++ b/utilities/transactions/transaction_util.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -82,4 +81,3 @@ class TransactionUtil { } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_committed_transaction_ts_test.cc b/utilities/transactions/write_committed_transaction_ts_test.cc index 94b8201f7ae0..595e7ad1ae66 100644 --- a/utilities/transactions/write_committed_transaction_ts_test.cc +++ b/utilities/transactions/write_committed_transaction_ts_test.cc @@ -7,7 +7,6 @@ #include "rocksdb/options.h" #include "rocksdb/utilities/transaction_db.h" #include "utilities/merge_operators.h" -#ifndef ROCKSDB_LITE #include "test_util/testutil.h" #include "utilities/transactions/transaction_test.h" @@ -99,6 +98,38 @@ TEST_P(WriteCommittedTxnWithTsTest, SanityChecks) { txn1.reset(); } +void CheckKeyValueTsWithIterator( + Iterator* iter, + std::vector> entries) { + size_t num_entries = entries.size(); + // test forward iteration + for (size_t i = 0; i < num_entries; i++) { + auto [key, value, timestamp] = entries[i]; + if (i == 0) { + iter->Seek(key); + } else { + iter->Next(); + } + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), key); + ASSERT_EQ(iter->value(), value); + ASSERT_EQ(iter->timestamp(), timestamp); + } + // test backward iteration + for (size_t i = 0; i < num_entries; i++) { + auto [key, value, timestamp] = entries[num_entries - 1 - i]; + if (i == 0) { + iter->SeekForPrev(key); + } else { + iter->Prev(); + } + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key(), key); + ASSERT_EQ(iter->value(), value); + ASSERT_EQ(iter->timestamp(), timestamp); + } +} + TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) { options.merge_operator = MergeOperators::CreateUInt64AddOperator(); ASSERT_OK(ReOpenNoDelete()); @@ -129,17 +160,57 @@ TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) { std::unique_ptr txn1( NewTxn(WriteOptions(), TransactionOptions())); assert(txn1); + + std::string write_ts; + uint64_t write_ts_int = 23; + PutFixed64(&write_ts, write_ts_int); + ReadOptions read_opts; + std::string read_ts; + PutFixed64(&read_ts, write_ts_int + 1); + Slice read_ts_slice = read_ts; + read_opts.timestamp = &read_ts_slice; + + ASSERT_OK(txn1->Put(handles_[1], "bar", "value0")); ASSERT_OK(txn1->Put(handles_[1], "foo", "value1")); + // (key, value, ts) pairs to check. + std::vector> + entries_to_check; + entries_to_check.emplace_back("bar", "value0", ""); + entries_to_check.emplace_back("foo", "value1", ""); + { std::string buf; PutFixed64(&buf, 23); ASSERT_OK(txn1->Put("id", buf)); ASSERT_OK(txn1->Merge("id", buf)); } + + // Check (key, value, ts) with overwrites in txn before `SetCommitTimestamp`. 
+ if (std::get<2>(GetParam())) { // enable_indexing = true + std::unique_ptr iter(txn1->GetIterator(read_opts, handles_[1])); + CheckKeyValueTsWithIterator(iter.get(), entries_to_check); + } + ASSERT_OK(txn1->SetName("txn1")); ASSERT_OK(txn1->Prepare()); - ASSERT_OK(txn1->SetCommitTimestamp(/*ts=*/23)); + ASSERT_OK(txn1->SetCommitTimestamp(write_ts_int)); + + // Check (key, value, ts) with overwrites in txn after `SetCommitTimestamp`. + if (std::get<2>(GetParam())) { // enable_indexing = true + std::unique_ptr iter(txn1->GetIterator(read_opts, handles_[1])); + CheckKeyValueTsWithIterator(iter.get(), entries_to_check); + } + ASSERT_OK(txn1->Commit()); + entries_to_check.clear(); + entries_to_check.emplace_back("bar", "value0", write_ts); + entries_to_check.emplace_back("foo", "value1", write_ts); + + // Check (key, value, ts) pairs with overwrites in txn after `Commit`. + { + std::unique_ptr iter(txn1->GetIterator(read_opts, handles_[1])); + CheckKeyValueTsWithIterator(iter.get(), entries_to_check); + } txn1.reset(); { @@ -160,6 +231,14 @@ TEST_P(WriteCommittedTxnWithTsTest, ReOpenWithTimestamp) { assert(result); ASSERT_EQ(46, ival); } + + // Check (key, value, ts) pairs without overwrites in txn. + { + std::unique_ptr txn2( + NewTxn(WriteOptions(), TransactionOptions())); + std::unique_ptr iter(txn2->GetIterator(read_opts, handles_[1])); + CheckKeyValueTsWithIterator(iter.get(), entries_to_check); + } } TEST_P(WriteCommittedTxnWithTsTest, RecoverFromWal) { @@ -565,6 +644,13 @@ TEST_P(WriteCommittedTxnWithTsTest, CheckKeysForConflicts) { ASSERT_TRUE(txn1->GetForUpdate(ReadOptions(), "foo", &dontcare).IsBusy()); ASSERT_TRUE(called); + Transaction* reused_txn = + db->BeginTransaction(WriteOptions(), TransactionOptions(), txn1.get()); + ASSERT_EQ(reused_txn, txn1.get()); + ASSERT_OK(reused_txn->Put("foo", "v1")); + ASSERT_OK(reused_txn->SetCommitTimestamp(40)); + ASSERT_OK(reused_txn->Commit()); + SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); } @@ -577,12 +663,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as Transactions not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 6cbb26e9dab7..d6f1ace73731 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
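The `reused_txn` check added above exercises an easy-to-miss contract: passing an old `Transaction*` as the third argument of `BeginTransaction` reinitializes and returns that same object instead of allocating a new one (the timestamped test additionally sets a commit timestamp before committing the reused handle). A minimal sketch with an illustrative DB path:

// Hedged sketch: reusing a Transaction object across BeginTransaction calls.
#include <cassert>

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  TransactionDB* db = nullptr;
  assert(TransactionDB::Open(options, TransactionDBOptions(),
                             "/tmp/txn_reuse_demo", &db)
             .ok());

  Transaction* txn = db->BeginTransaction(WriteOptions());
  assert(txn->Put("k1", "v1").ok());
  assert(txn->Commit().ok());

  // Reuse: the returned pointer is the same object, reinitialized.
  Transaction* reused =
      db->BeginTransaction(WriteOptions(), TransactionOptions(), txn);
  assert(reused == txn);
  assert(reused->Put("k2", "v2").ok());
  assert(reused->Commit().ok());

  delete txn;  // a single delete for the single underlying object
  delete db;
  return 0;
}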
-#ifndef ROCKSDB_LITE #include #include @@ -1346,7 +1345,7 @@ TEST_P(WritePreparedTransactionTest, NewSnapshotLargerThanMax) { // Check that the new max has not advanced the last seq ASSERT_LT(wp_db->max_evicted_seq_.load(), last_seq); for (auto txn : txns) { - txn->Rollback(); + ASSERT_OK(txn->Rollback()); delete txn; } } @@ -4047,14 +4046,3 @@ int main(int argc, char** argv) { } return RUN_ALL_TESTS(); } - -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index 16b5cc1cbce5..58126a475081 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/write_prepared_txn.h" @@ -40,19 +39,37 @@ void WritePreparedTxn::Initialize(const TransactionOptions& txn_options) { prepare_batch_cnt_ = 0; } -void WritePreparedTxn::MultiGet(const ReadOptions& options, +void WritePreparedTxn::MultiGet(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + SequenceNumber min_uncommitted, snap_seq; - const SnapshotBackup backed_by_snapshot = - wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs( + read_options.snapshot, &min_uncommitted, &snap_seq); WritePreparedTxnReadCallback callback(wpt_db_, snap_seq, min_uncommitted, backed_by_snapshot); - write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, - keys, values, statuses, sorted_input, - &callback); + write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, + num_keys, keys, values, statuses, + sorted_input, &callback); if (UNLIKELY(!callback.valid() || !wpt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { wpt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); @@ -62,9 +79,27 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, } } -Status WritePreparedTxn::Get(const ReadOptions& options, +Status WritePreparedTxn::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + + return 
GetImpl(read_options, column_family, key, pinnable_val); +} + +Status WritePreparedTxn::GetImpl(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, + PinnableSlice* pinnable_val) { SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -88,11 +123,7 @@ Status WritePreparedTxn::Get(const ReadOptions& options, } Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options) { - // Make sure to get iterator from WritePrepareTxnDB, not the root db. - Iterator* db_iter = wpt_db_->NewIterator(options); - assert(db_iter); - - return write_batch_.NewIteratorWithBase(db_iter); + return GetIterator(options, wpt_db_->DefaultColumnFamily()); } Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options, @@ -101,7 +132,7 @@ Iterator* WritePreparedTxn::GetIterator(const ReadOptions& options, Iterator* db_iter = wpt_db_->NewIterator(options, column_family); assert(db_iter); - return write_batch_.NewIteratorWithBase(column_family, db_iter); + return write_batch_.NewIteratorWithBase(column_family, db_iter, &options); } Status WritePreparedTxn::PrepareInternal() { @@ -508,5 +539,3 @@ Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) { } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_prepared_txn.h b/utilities/transactions/write_prepared_txn.h index 30d9bdb99bf3..9a0fb81d19c9 100644 --- a/utilities/transactions/write_prepared_txn.h +++ b/utilities/transactions/write_prepared_txn.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include @@ -52,12 +51,12 @@ class WritePreparedTxn : public PessimisticTransaction { // seq in the WAL that is also published, LastPublishedSequence, as opposed to // the last seq in the memtable. using Transaction::Get; - virtual Status Get(const ReadOptions& options, + virtual Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; using Transaction::MultiGet; - virtual void MultiGet(const ReadOptions& options, + virtual void MultiGet(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, @@ -87,6 +86,10 @@ class WritePreparedTxn : public PessimisticTransaction { friend class WriteUnpreparedTxnDB; friend class WriteUnpreparedTxn; + using Transaction::GetImpl; + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + Status PrepareInternal() override; Status CommitWithoutPrepareInternal() override; @@ -115,5 +118,3 @@ class WritePreparedTxn : public PessimisticTransaction { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 595c3df8f524..91a81d158930 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
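Passing `&options` into `NewIteratorWithBase` above is what lets the base+delta iterator honor `iterate_lower_bound`/`iterate_upper_bound` from the transaction's `ReadOptions`, the behavior pinned down by `TestTxnRespectBoundsInReadOption` earlier in this diff. A caller-side sketch follows; the DB path is illustrative, and note the bound Slices must outlive the iterator.

// Hedged sketch: iterator bounds applied to a transaction iterator, which
// merges committed data (base) with the txn's own pending writes (delta).
#include <cassert>
#include <memory>

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  TransactionDB* db = nullptr;
  assert(TransactionDB::Open(options, TransactionDBOptions(),
                             "/tmp/txn_bounds_demo", &db)
             .ok());
  assert(db->Put(WriteOptions(), "a", "base").ok());
  assert(db->Put(WriteOptions(), "e", "base").ok());

  {
    std::unique_ptr<Transaction> txn(db->BeginTransaction(WriteOptions()));
    assert(txn->Put("c", "pending").ok());

    // Bound Slices must stay alive for the lifetime of the iterator.
    Slice lower("c");
    Slice upper("e");
    ReadOptions ro;
    ro.iterate_lower_bound = &lower;
    ro.iterate_upper_bound = &upper;  // exclusive

    std::unique_ptr<Iterator> it(txn->GetIterator(ro));
    it->SeekToFirst();
    assert(it->Valid() && it->key().ToString() == "c");  // txn's own write
    it->Next();
    assert(!it->Valid());  // "a" is below the lower bound, "e" is excluded
    assert(it->status().ok());

    it.reset();
    assert(txn->Rollback().ok());
  }

  delete db;
  return 0;
}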
-#ifndef ROCKSDB_LITE #include "utilities/transactions/write_prepared_txn_db.h" @@ -248,9 +247,26 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, return s; } -Status WritePreparedTxnDB::Get(const ReadOptions& options, +Status WritePreparedTxnDB::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + + return GetImpl(read_options, column_family, key, value); +} + +Status WritePreparedTxnDB::GetImpl(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) { SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -310,16 +326,35 @@ void WritePreparedTxnDB::UpdateCFComparatorMap(ColumnFamilyHandle* h) { } std::vector WritePreparedTxnDB::MultiGet( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { assert(values); size_t num_keys = keys.size(); + std::vector stat_list(num_keys); + + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + stat_list[i] = s; + } + return stat_list; + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } + values->resize(num_keys); - std::vector stat_list(num_keys); for (size_t i = 0; i < num_keys; ++i) { - stat_list[i] = this->Get(options, column_family[i], keys[i], &(*values)[i]); + stat_list[i] = + this->GetImpl(read_options, column_family[i], keys[i], &(*values)[i]); } return stat_list; } @@ -342,17 +377,27 @@ static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { } } // anonymous namespace -Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options, +Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; std::shared_ptr own_snapshot = nullptr; SequenceNumber snapshot_seq = kMaxSequenceNumber; SequenceNumber min_uncommitted = 0; - if (options.snapshot != nullptr) { - snapshot_seq = options.snapshot->GetSequenceNumber(); + if (read_options.snapshot != nullptr) { + 
snapshot_seq = read_options.snapshot->GetSequenceNumber(); min_uncommitted = - static_cast_with_check(options.snapshot) + static_cast_with_check(read_options.snapshot) ->min_uncommitted_; } else { auto* snapshot = GetSnapshot(); @@ -368,26 +413,38 @@ Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options, static_cast_with_check(column_family)->cfd(); auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); - auto* db_iter = - db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback, - expose_blob_index, allow_refresh); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_); + auto* db_iter = db_impl_->NewIteratorImpl(read_options, cfd, super_version, + snapshot_seq, &state->callback, + expose_blob_index, allow_refresh); db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); return db_iter; } Status WritePreparedTxnDB::NewIterators( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_families, std::vector* iterators) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`"); + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; std::shared_ptr own_snapshot = nullptr; SequenceNumber snapshot_seq = kMaxSequenceNumber; SequenceNumber min_uncommitted = 0; - if (options.snapshot != nullptr) { - snapshot_seq = options.snapshot->GetSequenceNumber(); + if (read_options.snapshot != nullptr) { + snapshot_seq = read_options.snapshot->GetSequenceNumber(); min_uncommitted = - static_cast_with_check(options.snapshot) + static_cast_with_check(read_options.snapshot) ->min_uncommitted_; } else { auto* snapshot = GetSnapshot(); @@ -405,9 +462,10 @@ Status WritePreparedTxnDB::NewIterators( static_cast_with_check(column_family)->cfd(); auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted); - auto* db_iter = - db_impl_->NewIteratorImpl(options, cfd, snapshot_seq, &state->callback, - expose_blob_index, allow_refresh); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_); + auto* db_iter = db_impl_->NewIteratorImpl(read_options, cfd, super_version, + snapshot_seq, &state->callback, + expose_blob_index, allow_refresh); db_iter->RegisterCleanup(CleanupWritePreparedTxnDBIterator, state, nullptr); iterators->push_back(db_iter); } @@ -1027,4 +1085,3 @@ void SubBatchCounter::AddKey(const uint32_t cf, const Slice& key) { } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_prepared_txn_db.h b/utilities/transactions/write_prepared_txn_db.h index a3cd9b055487..1d33db550547 100644 --- a/utilities/transactions/write_prepared_txn_db.h +++ b/utilities/transactions/write_prepared_txn_db.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). 
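`NewIterator`/`NewIterators` gain the same `io_activity` gate as Get/MultiGet above, but since `NewIterator` cannot return a `Status`, a mismatched activity is surfaced through an error iterator. A small sketch of what a caller observes against a write-prepared TransactionDB (path illustrative):

// Hedged sketch: a pre-set, mismatched ReadOptions::io_activity is rejected;
// NewIterator reports it through the returned iterator's status().
#include <cassert>
#include <memory>

#include "rocksdb/utilities/transaction_db.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  options.create_if_missing = true;
  TransactionDBOptions txn_db_options;
  txn_db_options.write_policy = TxnDBWritePolicy::WRITE_PREPARED;
  TransactionDB* db = nullptr;
  assert(TransactionDB::Open(options, txn_db_options, "/tmp/io_activity_demo",
                             &db)
             .ok());

  {
    ReadOptions ro;
    ro.io_activity = Env::IOActivity::kGet;  // wrong activity for an iterator
    std::unique_ptr<Iterator> it(db->NewIterator(ro));
    assert(it->status().IsInvalidArgument());
  }
  {
    // Leaving io_activity at its default (kUnknown) lets the DB stamp it.
    std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
    assert(it->status().ok());
  }

  delete db;
  return 0;
}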
#pragma once -#ifndef ROCKSDB_LITE #include #include @@ -84,24 +83,24 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { size_t batch_cnt, WritePreparedTxn* txn); using DB::Get; - virtual Status Get(const ReadOptions& options, + virtual Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; using DB::MultiGet; virtual std::vector MultiGet( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) override; using DB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& options, + virtual Iterator* NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) override; using DB::NewIterators; virtual Status NewIterators( - const ReadOptions& options, + const ReadOptions& _read_options, const std::vector& column_families, std::vector* iterators) override; @@ -521,6 +520,21 @@ class WritePreparedTxnDB : public PessimisticTransactionDB { RecordTick(db_impl_->immutable_db_options_.statistics.get(), ticker_type); } + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, std::string* value) { + assert(value != nullptr); + PinnableSlice pinnable_val(value); + assert(!pinnable_val.IsPinned()); + auto s = GetImpl(options, column_family, key, &pinnable_val); + if (s.ok() && pinnable_val.IsPinned()) { + value->assign(pinnable_val.data(), pinnable_val.size()); + } // else value is already assigned + return s; + } + + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value); + // A heap with the amortized O(1) complexity for erase. It uses one extra heap // to keep track of erased entries that are not yet on top of the main heap. class PreparedHeap { @@ -1122,4 +1136,3 @@ bool WritePreparedTxnDB::ValidateSnapshot( } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_transaction_test.cc b/utilities/transactions/write_unprepared_transaction_test.cc index 6c8c62e0e044..d1307d7602d0 100644 --- a/utilities/transactions/write_unprepared_transaction_test.cc +++ b/utilities/transactions/write_unprepared_transaction_test.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
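// --------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): every public read entry point
// in the hunks above follows the same pattern -- reject a ReadOptions whose
// io_activity is already set to a different operation, default kUnknown to
// the operation being performed, and forward the adjusted copy to a private
// *Impl method. A minimal standalone version of that pattern is sketched
// below; the helper name ValidateAndSetGetIOActivity is made up here.
#include <rocksdb/env.h>
#include <rocksdb/options.h>
#include <rocksdb/status.h>

static rocksdb::Status ValidateAndSetGetIOActivity(
    const rocksdb::ReadOptions& in, rocksdb::ReadOptions* out) {
  using IOActivity = rocksdb::Env::IOActivity;
  if (in.io_activity != IOActivity::kUnknown &&
      in.io_activity != IOActivity::kGet) {
    // A caller-supplied activity that does not match this API is an error.
    return rocksdb::Status::InvalidArgument(
        "ReadOptions::io_activity must be kUnknown or kGet for Get()");
  }
  *out = in;  // work on a copy so the caller's ReadOptions stays untouched
  if (out->io_activity == IOActivity::kUnknown) {
    out->io_activity = IOActivity::kGet;  // attribute the IO to Get()
  }
  return rocksdb::Status::OK();
}
// --------------------------------------------------------------------------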
-#ifndef ROCKSDB_LITE #include "utilities/transactions/transaction_test.h" #include "utilities/transactions/write_unprepared_txn.h" @@ -36,26 +35,32 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(std::make_tuple(false, false, WRITE_UNPREPARED), std::make_tuple(false, true, WRITE_UNPREPARED))); -enum StressAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT }; -class WriteUnpreparedStressTest : public WriteUnpreparedTransactionTestBase, - virtual public ::testing::WithParamInterface< - std::tuple> { +enum SnapshotAction { NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT }; +enum VerificationOperation { VERIFY_GET, VERIFY_NEXT, VERIFY_PREV }; +class WriteUnpreparedSnapshotTest + : public WriteUnpreparedTransactionTestBase, + virtual public ::testing::WithParamInterface< + std::tuple> { public: - WriteUnpreparedStressTest() + WriteUnpreparedSnapshotTest() : WriteUnpreparedTransactionTestBase(false, std::get<0>(GetParam()), WRITE_UNPREPARED), - action_(std::get<1>(GetParam())) {} - StressAction action_; + action_(std::get<1>(GetParam())), + verify_op_(std::get<2>(GetParam())) {} + SnapshotAction action_; + VerificationOperation verify_op_; }; +// Test parameters: +// Param 0): use stackable db, parameterization hard coded to be overwritten to +// false. Param 1): test mode for snapshot action Param 2): test mode for +// verification operation INSTANTIATE_TEST_CASE_P( - WriteUnpreparedStressTest, WriteUnpreparedStressTest, - ::testing::Values(std::make_tuple(false, NO_SNAPSHOT), - std::make_tuple(false, RO_SNAPSHOT), - std::make_tuple(false, REFRESH_SNAPSHOT), - std::make_tuple(true, NO_SNAPSHOT), - std::make_tuple(true, RO_SNAPSHOT), - std::make_tuple(true, REFRESH_SNAPSHOT))); + WriteUnpreparedSnapshotTest, WriteUnpreparedSnapshotTest, + ::testing::Combine( + ::testing::Bool(), + ::testing::Values(NO_SNAPSHOT, RO_SNAPSHOT, REFRESH_SNAPSHOT), + ::testing::Values(VERIFY_GET, VERIFY_NEXT, VERIFY_PREV))); TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { // The following tests checks whether reading your own write for @@ -136,42 +141,33 @@ TEST_P(WriteUnpreparedTransactionTest, ReadYourOwnWrite) { } } -#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) -TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) { - // This is a stress test where different threads are writing random keys, and - // then before committing or aborting the transaction, it validates to see - // that it can read the keys it wrote, and the keys it did not write respect - // the snapshot. To avoid row lock contention (and simply stressing the - // locking system), each thread is mostly only writing to its own set of keys. +TEST_P(WriteUnpreparedSnapshotTest, ReadYourOwnWrite) { + // This test validates a transaction can read its writes and the correctness + // of its read with regard to a mocked snapshot functionality. const uint32_t kNumIter = 1000; - const uint32_t kNumThreads = 10; const uint32_t kNumKeys = 5; // Test with // 1. no snapshots set // 2. snapshot set on ReadOptions // 3. snapshot set, and refreshing after every write. 
- StressAction a = action_; + SnapshotAction snapshot_action = action_; WriteOptions write_options; txn_db_options.transaction_lock_timeout = -1; options.disable_auto_compactions = true; ASSERT_OK(ReOpen()); std::vector keys; - for (uint32_t k = 0; k < kNumKeys * kNumThreads; k++) { + for (uint32_t k = 0; k < kNumKeys; k++) { keys.push_back("k" + std::to_string(k)); } - RandomShuffle(keys.begin(), keys.end()); // This counter will act as a "sequence number" to help us validate // visibility logic with snapshots. If we had direct access to the seqno of // snapshots and key/values, then we should directly compare those instead. std::atomic counter(0); - std::function stress_thread = [&](int id) { - size_t tid = std::hash()(std::this_thread::get_id()); - Random64 rnd(static_cast(tid)); - + std::function check_correctness_wrt_snapshot = [&]() { Transaction* txn; TransactionOptions txn_options; // batch_size of 1 causes writes to DB for every marker. @@ -179,114 +175,82 @@ TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) { ReadOptions read_options; for (uint32_t i = 0; i < kNumIter; i++) { - std::set owned_keys(keys.begin() + id * kNumKeys, - keys.begin() + (id + 1) * kNumKeys); - // Add unowned keys to make the workload more interesting, but this - // increases row lock contention, so just do it sometimes. - if (rnd.OneIn(2)) { - owned_keys.insert(keys[rnd.Uniform(kNumKeys * kNumThreads)]); - } - txn = db->BeginTransaction(write_options, txn_options); - ASSERT_OK(txn->SetName(std::to_string(id))); txn->SetSnapshot(); - if (a >= RO_SNAPSHOT) { + if (snapshot_action >= RO_SNAPSHOT) { read_options.snapshot = txn->GetSnapshot(); ASSERT_TRUE(read_options.snapshot != nullptr); } - uint64_t buf[2]; - buf[0] = id; + uint64_t buf[1]; // When scanning through the database, make sure that all unprepared - // keys have value >= snapshot and all other keys have value < snapshot. + // keys have value >= snapshot. int64_t snapshot_num = counter.fetch_add(1); Status s; - for (const auto& key : owned_keys) { - buf[1] = counter.fetch_add(1); + for (const auto& key : keys) { + buf[0] = counter.fetch_add(1); s = txn->Put(key, Slice((const char*)buf, sizeof(buf))); if (!s.ok()) { break; } - if (a == REFRESH_SNAPSHOT) { + if (snapshot_action == REFRESH_SNAPSHOT) { txn->SetSnapshot(); read_options.snapshot = txn->GetSnapshot(); snapshot_num = counter.fetch_add(1); } } - // Failure is possible due to snapshot validation. In this case, - // rollback and move onto next iteration. - if (!s.ok()) { - ASSERT_TRUE(s.IsBusy()); - ASSERT_OK(txn->Rollback()); - delete txn; - continue; - } + ASSERT_OK(s); - auto verify_key = [&owned_keys, &a, &id, &snapshot_num]( - const std::string& key, const std::string& value) { - if (owned_keys.count(key) > 0) { - ASSERT_EQ(value.size(), 16); - - // Since this key is part of owned_keys, then this key must be - // unprepared by this transaction identified by 'id' - ASSERT_EQ(((int64_t*)value.c_str())[0], id); - if (a == REFRESH_SNAPSHOT) { - // If refresh snapshot is true, then the snapshot is refreshed - // after every Put(), meaning that the current snapshot in - // snapshot_num must be greater than the "seqno" of any keys - // written by the current transaction. 
- ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); - } else { - // If refresh snapshot is not on, then the snapshot was taken at - // the beginning of the transaction, meaning all writes must come - // after snapshot_num - ASSERT_GT(((int64_t*)value.c_str())[1], snapshot_num); - } - } else if (a >= RO_SNAPSHOT) { - // If this is not an unprepared key, just assert that the key - // "seqno" is smaller than the snapshot seqno. - ASSERT_EQ(value.size(), 16); - ASSERT_LT(((int64_t*)value.c_str())[1], snapshot_num); + auto verify_key = [&snapshot_action, + &snapshot_num](const std::string& value) { + ASSERT_EQ(value.size(), 8); + + if (snapshot_action == REFRESH_SNAPSHOT) { + // If refresh snapshot is true, then the snapshot is refreshed + // after every Put(), meaning that the current snapshot in + // snapshot_num must be greater than the "seqno" of any keys + // written by the current transaction. + ASSERT_LT(((int64_t*)value.c_str())[0], snapshot_num); + } else { + // If refresh snapshot is not on, then the snapshot was taken at + // the beginning of the transaction, meaning all writes must come + // after snapshot_num + ASSERT_GT(((int64_t*)value.c_str())[0], snapshot_num); } }; - // Validate Get()/Next()/Prev(). Do only one of them to save time, and - // reduce lock contention. - switch (rnd.Uniform(3)) { - case 0: // Validate Get() + // Validate one of Get()/Next()/Prev() depending on the verification + // operation to use. + switch (verify_op_) { + case VERIFY_GET: // Validate Get() { for (const auto& key : keys) { std::string value; - s = txn->Get(read_options, Slice(key), &value); - if (!s.ok()) { - ASSERT_TRUE(s.IsNotFound()); - ASSERT_EQ(owned_keys.count(key), 0); - } else { - verify_key(key, value); - } + ASSERT_OK(txn->Get(read_options, Slice(key), &value)); + verify_key(value); } break; } - case 1: // Validate Next() + case VERIFY_NEXT: // Validate Next() { Iterator* iter = txn->GetIterator(read_options); ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - verify_key(iter->key().ToString(), iter->value().ToString()); + verify_key(iter->value().ToString()); } ASSERT_OK(iter->status()); delete iter; break; } - case 2: // Validate Prev() + case VERIFY_PREV: // Validate Prev() { Iterator* iter = txn->GetIterator(read_options); ASSERT_OK(iter->status()); for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { - verify_key(iter->key().ToString(), iter->value().ToString()); + verify_key(iter->value().ToString()); } ASSERT_OK(iter->status()); delete iter; @@ -296,25 +260,13 @@ TEST_P(WriteUnpreparedStressTest, ReadYourOwnWriteStress) { FAIL(); } - if (rnd.OneIn(2)) { - ASSERT_OK(txn->Commit()); - } else { - ASSERT_OK(txn->Rollback()); - } + ASSERT_OK(txn->Commit()); delete txn; } }; - std::vector threads; - for (uint32_t i = 0; i < kNumThreads; i++) { - threads.emplace_back(stress_thread, i); - } - - for (auto& t : threads) { - t.join(); - } + check_correctness_wrt_snapshot(); } -#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN) // This tests how write unprepared behaves during recovery when the DB crashes // after a transaction has either been unprepared or prepared, and tests if @@ -778,13 +730,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, - "SKIPPED as Transactions are not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_txn.cc 
b/utilities/transactions/write_unprepared_txn.cc index 6e04d33442cd..c30cf9e1f049 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/write_unprepared_txn.h" @@ -944,19 +943,36 @@ Status WriteUnpreparedTxn::PopSavePoint() { return Status::NotFound(); } -void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, +void WriteUnpreparedTxn::MultiGet(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kMultiGet) { + Status s = Status::InvalidArgument( + "Can only call MultiGet with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kMultiGet`"); + + for (size_t i = 0; i < num_keys; ++i) { + if (statuses[i].ok()) { + statuses[i] = s; + } + } + return; + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kMultiGet; + } SequenceNumber min_uncommitted, snap_seq; - const SnapshotBackup backed_by_snapshot = - wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); + const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs( + read_options.snapshot, &min_uncommitted, &snap_seq); WriteUnpreparedTxnReadCallback callback(wupt_db_, snap_seq, min_uncommitted, unprep_seqs_, backed_by_snapshot); - write_batch_.MultiGetFromBatchAndDB(db_, options, column_family, num_keys, - keys, values, statuses, sorted_input, - &callback); + write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, + num_keys, keys, values, statuses, + sorted_input, &callback); if (UNLIKELY(!callback.valid() || !wupt_db_->ValidateSnapshot(snap_seq, backed_by_snapshot))) { wupt_db_->WPRecordTick(TXN_GET_TRY_AGAIN); @@ -966,9 +982,26 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, } } -Status WriteUnpreparedTxn::Get(const ReadOptions& options, +Status WriteUnpreparedTxn::Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kGet) { + return Status::InvalidArgument( + "Can only call Get with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kGet`"); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kGet; + } + + return GetImpl(read_options, column_family, key, value); +} + +Status WriteUnpreparedTxn::GetImpl(const ReadOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) { SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -1004,7 +1037,8 @@ Iterator* WriteUnpreparedTxn::GetIterator(const ReadOptions& options, Iterator* db_iter = wupt_db_->NewIterator(options, column_family, this); assert(db_iter); - auto iter = write_batch_.NewIteratorWithBase(column_family, db_iter); + auto iter = + 
write_batch_.NewIteratorWithBase(column_family, db_iter, &options); active_iterators_.push_back(iter); iter->RegisterCleanup(CleanupWriteUnpreparedWBWIIterator, this, iter); return iter; @@ -1049,5 +1083,3 @@ WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() { } } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_txn.h b/utilities/transactions/write_unprepared_txn.h index 5a3227f4ee65..fe47c8cd8a34 100644 --- a/utilities/transactions/write_unprepared_txn.h +++ b/utilities/transactions/write_unprepared_txn.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include @@ -185,12 +184,12 @@ class WriteUnpreparedTxn : public WritePreparedTxn { // Get and GetIterator needs to be overridden so that a ReadCallback to // handle read-your-own-write is used. using Transaction::Get; - virtual Status Get(const ReadOptions& options, + virtual Status Get(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) override; using Transaction::MultiGet; - virtual void MultiGet(const ReadOptions& options, + virtual void MultiGet(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, @@ -212,6 +211,10 @@ class WriteUnpreparedTxn : public WritePreparedTxn { friend class WriteUnpreparedTxnDB; const std::map& GetUnpreparedSequenceNumbers(); + using Transaction::GetImpl; + Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family, + const Slice& key, PinnableSlice* value) override; + Status WriteRollbackKeys(const LockTracker& tracked_keys, WriteBatchWithIndex* rollback_batch, ReadCallback* callback, const ReadOptions& roptions); @@ -337,5 +340,3 @@ class WriteUnpreparedTxn : public WritePreparedTxn { }; } // namespace ROCKSDB_NAMESPACE - -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 2ed2d5c59bb5..1d75dd44901a 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -3,7 +3,6 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE #include "utilities/transactions/write_unprepared_txn_db.h" @@ -386,9 +385,20 @@ static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) { } } // anonymous namespace -Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, +Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, WriteUnpreparedTxn* txn) { + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } // TODO(lth): Refactor so that this logic is shared with WritePrepared. 
constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; @@ -427,11 +437,11 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, // max_visible_seq, and then return the last visible value, so that this // restriction can be lifted. const Snapshot* snapshot = nullptr; - if (options.snapshot == nullptr) { + if (read_options.snapshot == nullptr) { snapshot = GetSnapshot(); own_snapshot = std::make_shared(db_impl_, snapshot); } else { - snapshot = options.snapshot; + snapshot = read_options.snapshot; } snapshot_seq = snapshot->GetSequenceNumber(); @@ -462,12 +472,12 @@ Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, static_cast_with_check(column_family)->cfd(); auto* state = new IteratorState(this, snapshot_seq, own_snapshot, min_uncommitted, txn); + SuperVersion* super_version = cfd->GetReferencedSuperVersion(db_impl_); auto* db_iter = db_impl_->NewIteratorImpl( - options, cfd, state->MaxVisibleSeq(), &state->callback, expose_blob_index, - allow_refresh); + read_options, cfd, super_version, state->MaxVisibleSeq(), + &state->callback, expose_blob_index, allow_refresh); db_iter->RegisterCleanup(CleanupWriteUnpreparedTxnDBIterator, state, nullptr); return db_iter; } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/transactions/write_unprepared_txn_db.h b/utilities/transactions/write_unprepared_txn_db.h index c40e96d4966b..409d73a0a88d 100644 --- a/utilities/transactions/write_unprepared_txn_db.h +++ b/utilities/transactions/write_unprepared_txn_db.h @@ -4,7 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE #include "utilities/transactions/write_prepared_txn_db.h" #include "utilities/transactions/write_unprepared_txn.h" @@ -28,7 +27,7 @@ class WriteUnpreparedTxnDB : public WritePreparedTxnDB { struct IteratorState; using WritePreparedTxnDB::NewIterator; - Iterator* NewIterator(const ReadOptions& options, + Iterator* NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family, WriteUnpreparedTxn* txn); @@ -105,4 +104,3 @@ class WriteUnpreparedCommitEntryPreReleaseCallback : public PreReleaseCallback { }; } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 3bfc66649a76..e4bff782658d 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -2,7 +2,6 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
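// --------------------------------------------------------------------------
// Note on the transaction Get/MultiGet changes above (illustrative, not part
// of the patch): the per-key work now goes through GetImpl() instead of the
// public Get(). The ReadOptions copy has already been validated and stamped
// with kMultiGet once at the entry point, and the public Get() would reject
// exactly that stamp, since it only accepts kUnknown or kGet. The resulting
// call shape is:
//
//   MultiGet(opts)   // validate once, default kUnknown -> kMultiGet
//     -> GetImpl(checked_opts)   // per key, no further io_activity checks
//   Get(opts)        // validate once, default kUnknown -> kGet
//     -> GetImpl(checked_opts)
// --------------------------------------------------------------------------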
-#ifndef ROCKSDB_LITE #include "utilities/ttl/db_ttl_impl.h" @@ -20,9 +19,9 @@ namespace ROCKSDB_NAMESPACE { static std::unordered_map ttl_merge_op_type_info = - {{"user_operator", - OptionTypeInfo::AsCustomSharedPtr( - 0, OptionVerificationType::kByName, OptionTypeFlags::kNone)}}; + {{"user_operator", OptionTypeInfo::AsCustomSharedPtr( + 0, OptionVerificationType::kByNameAllowNull, + OptionTypeFlags::kNone)}}; TtlMergeOperator::TtlMergeOperator( const std::shared_ptr& merge_op, SystemClock* clock) @@ -452,7 +451,11 @@ bool DBWithTTLImpl::IsStale(const Slice& value, int32_t ttl, if (!clock->GetCurrentTime(&curtime).ok()) { return false; // Treat the data as fresh if could not get current time } - int32_t timestamp_value = + /* int32_t may overflow when timestamp_value + ttl + * for example ttl = 86400 * 365 * 15 + * convert timestamp_value to int64_t + */ + int64_t timestamp_value = DecodeFixed32(value.data() + value.size() - kTSLength); return (timestamp_value + ttl) < curtime; } @@ -591,9 +594,19 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) { } } -Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts, +Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) { - return new TtlIterator(db_->NewIterator(opts, column_family)); + if (_read_options.io_activity != Env::IOActivity::kUnknown && + _read_options.io_activity != Env::IOActivity::kDBIterator) { + return NewErrorIterator(Status::InvalidArgument( + "Can only call NewIterator with `ReadOptions::io_activity` is " + "`Env::IOActivity::kUnknown` or `Env::IOActivity::kDBIterator`")); + } + ReadOptions read_options(_read_options); + if (read_options.io_activity == Env::IOActivity::kUnknown) { + read_options.io_activity = Env::IOActivity::kDBIterator; + } + return new TtlIterator(db_->NewIterator(read_options, column_family)); } void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) { @@ -607,4 +620,3 @@ void DBWithTTLImpl::SetTtl(ColumnFamilyHandle* h, int32_t ttl) { } } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index dd67a6ddc364..b125d79b0676 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -5,7 +5,6 @@ #pragma once -#ifndef ROCKSDB_LITE #include #include #include @@ -79,7 +78,7 @@ class DBWithTTLImpl : public DBWithTTL { virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override; using StackableDB::NewIterator; - virtual Iterator* NewIterator(const ReadOptions& opts, + virtual Iterator* NewIterator(const ReadOptions& _read_options, ColumnFamilyHandle* column_family) override; virtual DB* GetBaseDB() override { return db_; } @@ -242,4 +241,3 @@ int RegisterTtlObjects(ObjectLibrary& library, const std::string& /*arg*/); } // extern "C" } // namespace ROCKSDB_NAMESPACE -#endif // ROCKSDB_LITE diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index a42e0acb4ea1..da1d2d0da968 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -3,7 +3,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
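// --------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): why IsStale() above now
// decodes the stored timestamp into an int64_t. With a TTL of roughly 15
// years the old 32-bit sum exceeded INT32_MAX and wrapped negative, so
// (timestamp_value + ttl) < curtime became true for perfectly fresh data.
// The standalone program below reproduces the arithmetic with hypothetical
// values; the wraparound is emulated through uint32_t to keep it
// well-defined.
#include <cstdint>
#include <cstdio>

int main() {
  const int32_t timestamp_value = 1700000000;  // write time, seconds
  const int32_t ttl = 86400 * 365 * 15;        // 473040000 s, ~15 years
  const int64_t curtime = 1700000100;          // "now", shortly after

  // Old behaviour: the 32-bit sum wraps around to a negative value.
  const int32_t wrapped = static_cast<int32_t>(
      static_cast<uint32_t>(timestamp_value) + static_cast<uint32_t>(ttl));
  // New behaviour: widen before adding, no wraparound.
  const int64_t widened = static_cast<int64_t>(timestamp_value) + ttl;

  std::printf("32-bit sum: %d  stale=%d (wrong)\n", wrapped,
              wrapped < curtime);
  std::printf("64-bit sum: %lld  stale=%d (correct)\n",
              static_cast<long long>(widened), widened < curtime);
  return 0;
}
// --------------------------------------------------------------------------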
-#ifndef ROCKSDB_LITE #include #include @@ -404,8 +403,10 @@ class TtlTest : public testing::Test { DBWithTTL* db_ttl_; std::unique_ptr env_; - private: + protected: Options options_; + + private: KVMap kvmap_; KVMap::iterator kv_it_; const std::string kNewValue_ = "new_value"; @@ -612,6 +613,17 @@ TEST_F(TtlTest, CompactionFilter) { CloseTtl(); } +TEST_F(TtlTest, UnregisteredMergeOperator) { + class UnregisteredMergeOperator : public MergeOperator { + public: + const char* Name() const override { return "UnregisteredMergeOperator"; } + }; + options_.fail_if_options_file_error = true; + options_.merge_operator = std::make_shared(); + OpenTtl(); + CloseTtl(); +} + // Insert some key-values which KeyMayExist should be able to get and check that // values returned are fine TEST_F(TtlTest, KeyMayExist) { @@ -636,13 +648,24 @@ TEST_F(TtlTest, MultiGetTest) { CloseTtl(); } +TEST_F(TtlTest, TtlFiftenYears) { + MakeKVMap(kSampleSize_); + // 15 year will lead int32_t overflow from now + const int kFifteenYearSeconds = 86400 * 365 * 15; + OpenTtl(kFifteenYearSeconds); + PutValues(0, kSampleSize_, true); + // trigger the compaction + SleepCompactCheck(1, 0, kSampleSize_); + CloseTtl(); +} + TEST_F(TtlTest, ColumnFamiliesTest) { DB* db; Options options; options.create_if_missing = true; options.env = env_.get(); - DB::Open(options, dbname_, &db); + ASSERT_OK(DB::Open(options, dbname_, &db)); ColumnFamilyHandle* handle; ASSERT_OK(db->CreateColumnFamily(ColumnFamilyOptions(options), "ttl_column_family", &handle)); @@ -891,6 +914,14 @@ TEST_F(TtlOptionsTest, LoadTtlMergeOperator) { std::shared_ptr copy; ASSERT_OK(MergeOperator::CreateFromString(config_options_, opts_str, ©)); ASSERT_TRUE(mo->AreEquivalent(config_options_, copy.get(), &mismatch)); + + // An unregistered user_operator will be null, which is not supported by the + // `TtlMergeOperator` implementation. + ASSERT_OK(MergeOperator::CreateFromString( + config_options_, "id=TtlMergeOperator; user_operator=unknown", &mo)); + ASSERT_NE(mo.get(), nullptr); + ASSERT_STREQ(mo->Name(), TtlMergeOperator::kClassName()); + ASSERT_NOK(mo->ValidateOptions(DBOptions(), ColumnFamilyOptions())); } } // namespace ROCKSDB_NAMESPACE @@ -901,12 +932,3 @@ int main(int argc, char** argv) { return RUN_ALL_TESTS(); } -#else -#include - -int main(int /*argc*/, char** /*argv*/) { - fprintf(stderr, "SKIPPED as DBWithTTL is not supported in ROCKSDB_LITE\n"); - return 0; -} - -#endif // !ROCKSDB_LITE diff --git a/utilities/wal_filter.cc b/utilities/wal_filter.cc index 98bba361003a..9fa36bf27221 100644 --- a/utilities/wal_filter.cc +++ b/utilities/wal_filter.cc @@ -15,8 +15,7 @@ namespace ROCKSDB_NAMESPACE { Status WalFilter::CreateFromString(const ConfigOptions& config_options, const std::string& value, WalFilter** filter) { - Status s = - LoadStaticObject(config_options, value, nullptr, filter); + Status s = LoadStaticObject(config_options, value, filter); return s; } diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 408243b3fff4..bbfc60f9b486 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -3,10 +3,9 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
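// --------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): the TtlFiftenYears test
// above exercises the overflow fix through the public API; a standalone
// equivalent looks roughly like the program below. The path /tmp/ttl_db and
// the 15-year TTL are arbitrary example values, and the DBWithTTL::Open
// overload used here is assumed from rocksdb/utilities/db_ttl.h.
#include <rocksdb/utilities/db_ttl.h>

#include <cstdint>
#include <string>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  const int32_t kFifteenYearsSeconds = 86400 * 365 * 15;
  rocksdb::DBWithTTL* db = nullptr;
  rocksdb::Status s = rocksdb::DBWithTTL::Open(options, "/tmp/ttl_db", &db,
                                               kFifteenYearsSeconds);
  if (!s.ok()) {
    return 1;
  }

  // Values written now expire ~15 years out; before the int64_t fix in
  // IsStale() they could be dropped by the next compaction as "stale".
  s = db->Put(rocksdb::WriteOptions(), "key", "value");

  std::string value;
  if (s.ok()) {
    s = db->Get(rocksdb::ReadOptions(), "key", &value);
  }

  delete db;
  return (s.ok() && value == "value") ? 0 : 1;
}
// --------------------------------------------------------------------------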
-#ifndef ROCKSDB_LITE - #include "rocksdb/utilities/write_batch_with_index.h" +#include #include #include "db/column_family.h" @@ -300,12 +299,20 @@ WBWIIterator* WriteBatchWithIndex::NewIterator( Iterator* WriteBatchWithIndex::NewIteratorWithBase( ColumnFamilyHandle* column_family, Iterator* base_iterator, const ReadOptions* read_options) { - auto wbwiii = - new WBWIIteratorImpl(GetColumnFamilyID(column_family), &(rep->skip_list), - &rep->write_batch, &rep->comparator); + WBWIIteratorImpl* wbwiii; + if (read_options != nullptr) { + wbwiii = new WBWIIteratorImpl( + GetColumnFamilyID(column_family), &(rep->skip_list), &rep->write_batch, + &rep->comparator, read_options->iterate_lower_bound, + read_options->iterate_upper_bound); + } else { + wbwiii = new WBWIIteratorImpl(GetColumnFamilyID(column_family), + &(rep->skip_list), &rep->write_batch, + &rep->comparator); + } + return new BaseDeltaIterator(column_family, base_iterator, wbwiii, - GetColumnFamilyUserComparator(column_family), - read_options); + GetColumnFamilyUserComparator(column_family)); } Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) { @@ -428,11 +435,12 @@ Status WriteBatchWithIndex::PutLogData(const Slice& blob) { void WriteBatchWithIndex::Clear() { rep->Clear(); } Status WriteBatchWithIndex::GetFromBatch(ColumnFamilyHandle* column_family, - const DBOptions& options, + const DBOptions& /* options */, const Slice& key, std::string* value) { + MergeContext merge_context; Status s; - WriteBatchWithIndexInternal wbwii(&options, column_family); - auto result = wbwii.GetFromBatch(this, key, value, &s); + auto result = WriteBatchWithIndexInternal::GetFromBatch( + this, column_family, key, &merge_context, value, &s); switch (result) { case WBWIIteratorImpl::kFound: @@ -504,20 +512,27 @@ Status WriteBatchWithIndex::GetFromBatchAndDB(DB* db, Status WriteBatchWithIndex::GetFromBatchAndDB( DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, ReadCallback* callback) { + assert(db); + assert(pinnable_val); + + if (!column_family) { + column_family = db->DefaultColumnFamily(); + } + const Comparator* const ucmp = rep->comparator.GetComparator(column_family); size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0; if (ts_sz > 0 && !read_options.timestamp) { return Status::InvalidArgument("Must specify timestamp"); } - Status s; - WriteBatchWithIndexInternal wbwii(db, column_family); - // Since the lifetime of the WriteBatch is the same as that of the transaction // we cannot pin it as otherwise the returned value will not be available // after the transaction finishes. - std::string& batch_value = *pinnable_val->GetSelf(); - auto result = wbwii.GetFromBatch(this, key, &batch_value, &s); + MergeContext merge_context; + Status s; + + auto result = WriteBatchWithIndexInternal::GetFromBatch( + this, column_family, key, &merge_context, pinnable_val->GetSelf(), &s); if (result == WBWIIteratorImpl::kFound) { pinnable_val->PinSelf(); @@ -532,7 +547,8 @@ Status WriteBatchWithIndex::GetFromBatchAndDB( // Did not find key in batch OR could not resolve Merges. Try DB. 
if (!callback) { - s = db->Get(read_options, column_family, key, pinnable_val); + s = static_cast_with_check(db->GetRootDB()) + ->GetImpl(read_options, column_family, key, pinnable_val); } else { DBImpl::GetImplOptions get_impl_options; get_impl_options.column_family = column_family; @@ -546,10 +562,14 @@ Status WriteBatchWithIndex::GetFromBatchAndDB( if (result == WBWIIteratorImpl::kMergeInProgress) { // Merge result from DB with merges in Batch std::string merge_result; + if (s.ok()) { - s = wbwii.MergeKey(key, pinnable_val, &merge_result); - } else { // Key not present in db (s.IsNotFound()) - s = wbwii.MergeKey(key, nullptr, &merge_result); + s = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( + column_family, key, *pinnable_val, merge_context, &merge_result); + } else { + assert(s.IsNotFound()); + s = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( + column_family, key, merge_context, &merge_result); } if (s.ok()) { pinnable_val->Reset(); @@ -574,6 +594,15 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB( DB* db, const ReadOptions& read_options, ColumnFamilyHandle* column_family, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, bool sorted_input, ReadCallback* callback) { + assert(db); + assert(keys); + assert(values); + assert(statuses); + + if (!column_family) { + column_family = db->DefaultColumnFamily(); + } + const Comparator* const ucmp = rep->comparator.GetComparator(column_family); size_t ts_sz = ucmp ? ucmp->timestamp_size() : 0; if (ts_sz > 0 && !read_options.timestamp) { @@ -583,8 +612,6 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB( return; } - WriteBatchWithIndexInternal wbwii(db, column_family); - autovector key_context; autovector sorted_keys; // To hold merges from the write batch @@ -600,8 +627,8 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB( Status* s = &statuses[i]; PinnableSlice* pinnable_val = &values[i]; pinnable_val->Reset(); - auto result = - wbwii.GetFromBatch(this, keys[i], &merge_context, &batch_value, s); + auto result = WriteBatchWithIndexInternal::GetFromBatch( + this, column_family, keys[i], &merge_context, &batch_value, s); if (result == WBWIIteratorImpl::kFound) { *pinnable_val->GetSelf() = std::move(batch_value); @@ -618,7 +645,8 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB( assert(result == WBWIIteratorImpl::kMergeInProgress || result == WBWIIteratorImpl::kNotFound); key_context.emplace_back(column_family, keys[i], &values[i], - /*timestamp*/ nullptr, &statuses[i]); + /* columns */ nullptr, /* timestamp */ nullptr, + &statuses[i]); merges.emplace_back(result, std::move(merge_context)); } @@ -640,14 +668,17 @@ void WriteBatchWithIndex::MultiGetFromBatchAndDB( std::pair& merge_result = merges[index]; if (merge_result.first == WBWIIteratorImpl::kMergeInProgress) { - std::string merged_value; // Merge result from DB with merges in Batch + std::string merged_value; + if (key.s->ok()) { - *key.s = wbwii.MergeKey(*key.key, iter->value, merge_result.second, - &merged_value); - } else { // Key not present in db (s.IsNotFound()) - *key.s = wbwii.MergeKey(*key.key, nullptr, merge_result.second, - &merged_value); + *key.s = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( + column_family, *key.key, *key.value, merge_result.second, + &merged_value); + } else { + assert(key.s->IsNotFound()); + *key.s = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( + column_family, *key.key, merge_result.second, &merged_value); } if (key.s->ok()) { key.value->Reset(); @@ -692,4 +723,3 @@ const 
Comparator* WriteBatchWithIndexInternal::GetUserComparator( } } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/utilities/write_batch_with_index/write_batch_with_index_internal.cc index 5ae4df7dd05e..bedd5934d5bd 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -3,14 +3,12 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). -#ifndef ROCKSDB_LITE - #include "utilities/write_batch_with_index/write_batch_with_index_internal.h" #include "db/column_family.h" #include "db/db_impl/db_impl.h" -#include "db/merge_context.h" #include "db/merge_helper.h" +#include "options/cf_options.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/utilities/write_batch_with_index.h" @@ -22,19 +20,18 @@ namespace ROCKSDB_NAMESPACE { BaseDeltaIterator::BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator, WBWIIteratorImpl* delta_iterator, - const Comparator* comparator, - const ReadOptions* read_options) + const Comparator* comparator) : forward_(true), current_at_base_(true), equal_keys_(false), status_(Status::OK()), + column_family_(column_family), base_iterator_(base_iterator), delta_iterator_(delta_iterator), - comparator_(comparator), - iterate_upper_bound_(read_options ? read_options->iterate_upper_bound - : nullptr) { + comparator_(comparator) { + assert(base_iterator_); + assert(delta_iterator_); assert(comparator_); - wbwii_.reset(new WriteBatchWithIndexInternal(column_family)); } bool BaseDeltaIterator::Valid() const { @@ -149,33 +146,8 @@ Slice BaseDeltaIterator::key() const { : delta_iterator_->Entry().key; } -Slice BaseDeltaIterator::value() const { - if (current_at_base_) { - return base_iterator_->value(); - } else { - WriteEntry delta_entry = delta_iterator_->Entry(); - if (wbwii_->GetNumOperands() == 0) { - return delta_entry.value; - } else if (delta_entry.type == kDeleteRecord || - delta_entry.type == kSingleDeleteRecord) { - status_ = - wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf()); - } else if (delta_entry.type == kPutRecord) { - status_ = wbwii_->MergeKey(delta_entry.key, &delta_entry.value, - merge_result_.GetSelf()); - } else if (delta_entry.type == kMergeRecord) { - if (equal_keys_) { - Slice base_value = base_iterator_->value(); - status_ = wbwii_->MergeKey(delta_entry.key, &base_value, - merge_result_.GetSelf()); - } else { - status_ = - wbwii_->MergeKey(delta_entry.key, nullptr, merge_result_.GetSelf()); - } - } - merge_result_.PinSelf(); - return merge_result_; - } +Slice BaseDeltaIterator::timestamp() const { + return current_at_base_ ? 
base_iterator_->timestamp() : Slice(); } Status BaseDeltaIterator::status() const { @@ -274,17 +246,70 @@ void BaseDeltaIterator::AdvanceBase() { bool BaseDeltaIterator::BaseValid() const { return base_iterator_->Valid(); } bool BaseDeltaIterator::DeltaValid() const { return delta_iterator_->Valid(); } + +void BaseDeltaIterator::ResetValue() { value_.clear(); } + +void BaseDeltaIterator::SetValueFromBase() { + assert(current_at_base_); + assert(BaseValid()); + assert(value_.empty()); + + value_ = base_iterator_->value(); +} + +void BaseDeltaIterator::SetValueFromDelta() { + assert(!current_at_base_); + assert(DeltaValid()); + assert(value_.empty()); + + WriteEntry delta_entry = delta_iterator_->Entry(); + + if (merge_context_.GetNumOperands() == 0) { + value_ = delta_entry.value; + + return; + } + + if (delta_entry.type == kDeleteRecord || + delta_entry.type == kSingleDeleteRecord) { + status_ = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( + column_family_, delta_entry.key, merge_context_, &merge_result_); + } else if (delta_entry.type == kPutRecord) { + status_ = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( + column_family_, delta_entry.key, delta_entry.value, merge_context_, + &merge_result_); + } else if (delta_entry.type == kMergeRecord) { + if (equal_keys_) { + status_ = WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( + column_family_, delta_entry.key, base_iterator_->value(), + merge_context_, &merge_result_); + } else { + status_ = WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( + column_family_, delta_entry.key, merge_context_, &merge_result_); + } + } else { + status_ = Status::NotSupported("Unsupported entry type for merge"); + } + + if (!status_.ok()) { + return; + } + + value_ = merge_result_; +} + void BaseDeltaIterator::UpdateCurrent() { // Suppress false positive clang analyzer warnings. #ifndef __clang_analyzer__ status_ = Status::OK(); + ResetValue(); + while (true) { auto delta_result = WBWIIteratorImpl::kNotFound; WriteEntry delta_entry; if (DeltaValid()) { assert(delta_iterator_->status().ok()); - delta_result = - delta_iterator_->FindLatestUpdate(wbwii_->GetMergeContext()); + delta_result = delta_iterator_->FindLatestUpdate(&merge_context_); delta_entry = delta_iterator_->Entry(); } else if (!delta_iterator_->status().ok()) { // Expose the error status and stop. @@ -304,24 +329,18 @@ void BaseDeltaIterator::UpdateCurrent() { // Finished return; } - if (iterate_upper_bound_) { - if (comparator_->CompareWithoutTimestamp( - delta_entry.key, /*a_has_ts=*/false, *iterate_upper_bound_, - /*b_has_ts=*/false) >= 0) { - // out of upper bound -> finished. - return; - } - } if (delta_result == WBWIIteratorImpl::kDeleted && - wbwii_->GetNumOperands() == 0) { + merge_context_.GetNumOperands() == 0) { AdvanceDelta(); } else { current_at_base_ = false; + SetValueFromDelta(); return; } } else if (!DeltaValid()) { // Delta has finished. current_at_base_ = true; + SetValueFromBase(); return; } else { int compare = @@ -333,8 +352,9 @@ void BaseDeltaIterator::UpdateCurrent() { equal_keys_ = true; } if (delta_result != WBWIIteratorImpl::kDeleted || - wbwii_->GetNumOperands() > 0) { + merge_context_.GetNumOperands() > 0) { current_at_base_ = false; + SetValueFromDelta(); return; } // Delta is less advanced and is delete. 
@@ -344,6 +364,7 @@ void BaseDeltaIterator::UpdateCurrent() { } } else { current_at_base_ = true; + SetValueFromBase(); return; } } @@ -454,10 +475,10 @@ WBWIIteratorImpl::Result WBWIIteratorImpl::FindLatestUpdate( } Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, - WriteType* type, Slice* Key, + WriteType* type, Slice* key, Slice* value, Slice* blob, Slice* xid) const { - if (type == nullptr || Key == nullptr || value == nullptr || + if (type == nullptr || key == nullptr || value == nullptr || blob == nullptr || xid == nullptr) { return Status::InvalidArgument("Output parameters cannot be null"); } @@ -473,7 +494,7 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, Slice input = Slice(rep_.data() + data_offset, rep_.size() - data_offset); char tag; uint32_t column_family; - Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, Key, value, + Status s = ReadRecordFromWriteBatch(&input, &tag, &column_family, key, value, blob, xid); if (!s.ok()) { return s; @@ -631,81 +652,66 @@ bool WBWIIteratorImpl::MatchesKey(uint32_t cf_id, const Slice& key) { } } -WriteBatchWithIndexInternal::WriteBatchWithIndexInternal( - ColumnFamilyHandle* column_family) - : db_(nullptr), db_options_(nullptr), column_family_(column_family) {} - -WriteBatchWithIndexInternal::WriteBatchWithIndexInternal( - DB* db, ColumnFamilyHandle* column_family) - : db_(db), db_options_(nullptr), column_family_(column_family) { - if (db_ != nullptr && column_family_ == nullptr) { - column_family_ = db_->DefaultColumnFamily(); - } -} - -WriteBatchWithIndexInternal::WriteBatchWithIndexInternal( - const DBOptions* db_options, ColumnFamilyHandle* column_family) - : db_(nullptr), db_options_(db_options), column_family_(column_family) {} - -Status WriteBatchWithIndexInternal::MergeKey(const Slice& key, - const Slice* value, - const MergeContext& context, - std::string* result) const { - if (column_family_ != nullptr) { - auto cfh = static_cast_with_check(column_family_); - const auto merge_operator = cfh->cfd()->ioptions()->merge_operator.get(); - if (merge_operator == nullptr) { - return Status::InvalidArgument( - "Merge_operator must be set for column_family"); - } else if (db_ != nullptr) { - const ImmutableDBOptions& immutable_db_options = - static_cast_with_check(db_->GetRootDB()) - ->immutable_db_options(); - Statistics* statistics = immutable_db_options.statistics.get(); - Logger* logger = immutable_db_options.info_log.get(); - SystemClock* clock = immutable_db_options.clock; - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. - return MergeHelper::TimedFullMerge( - merge_operator, key, value, context.GetOperands(), result, logger, - statistics, clock, /* result_operand */ nullptr, - /* update_num_ops_stats */ false, - /* op_failure_scope */ nullptr); - } else if (db_options_ != nullptr) { - Statistics* statistics = db_options_->statistics.get(); - Env* env = db_options_->env; - Logger* logger = db_options_->info_log.get(); - SystemClock* clock = env->GetSystemClock().get(); - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. 
- return MergeHelper::TimedFullMerge( - merge_operator, key, value, context.GetOperands(), result, logger, - statistics, clock, /* result_operand */ nullptr, - /* update_num_ops_stats */ false, - /* op_failure_scope */ nullptr); - } else { - const auto cf_opts = cfh->cfd()->ioptions(); - // `op_failure_scope` (an output parameter) is not provided (set to - // nullptr) since a failure must be propagated regardless of its value. - return MergeHelper::TimedFullMerge( - merge_operator, key, value, context.GetOperands(), result, - cf_opts->logger, cf_opts->stats, cf_opts->clock, - /* result_operand */ nullptr, /* update_num_ops_stats */ false, - /* op_failure_scope */ nullptr); - } - } else { - return Status::InvalidArgument("Must provide a column_family"); +Status WriteBatchWithIndexInternal::MergeKeyWithNoBaseValue( + ColumnFamilyHandle* column_family, const Slice& key, + const MergeContext& context, std::string* result) { + // TODO: support wide columns in WBWI + + if (!column_family) { + return Status::InvalidArgument("Must provide a column family"); + } + + const auto& ioptions = GetImmutableOptions(column_family); + + const auto* merge_operator = ioptions.merge_operator.get(); + if (!merge_operator) { + return Status::InvalidArgument( + "Merge operator must be set for column family"); + } + + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + return MergeHelper::TimedFullMerge( + merge_operator, key, MergeHelper::kNoBaseValue, context.GetOperands(), + ioptions.logger, ioptions.stats, ioptions.clock, + /* update_num_ops_stats */ false, result, + /* columns */ nullptr, /* op_failure_scope */ nullptr); +} + +Status WriteBatchWithIndexInternal::MergeKeyWithPlainBaseValue( + ColumnFamilyHandle* column_family, const Slice& key, const Slice& value, + const MergeContext& context, std::string* result) { + // TODO: support wide columns in WBWI + + if (!column_family) { + return Status::InvalidArgument("Must provide a column family"); + } + + const auto& ioptions = GetImmutableOptions(column_family); + + const auto* merge_operator = ioptions.merge_operator.get(); + if (!merge_operator) { + return Status::InvalidArgument( + "Merge operator must be set for column family"); } + + // `op_failure_scope` (an output parameter) is not provided (set to + // nullptr) since a failure must be propagated regardless of its value. + return MergeHelper::TimedFullMerge( + merge_operator, key, MergeHelper::kPlainBaseValue, value, + context.GetOperands(), ioptions.logger, ioptions.stats, ioptions.clock, + /* update_num_ops_stats */ false, result, + /* columns */ nullptr, /* op_failure_scope */ nullptr); } WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( - WriteBatchWithIndex* batch, const Slice& key, MergeContext* context, - std::string* value, Status* s) { + WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, + const Slice& key, MergeContext* context, std::string* value, Status* s) { *s = Status::OK(); std::unique_ptr iter( static_cast_with_check( - batch->NewIterator(column_family_))); + batch->NewIterator(column_family))); // Search the iterator for this key, and updates/merges to it. 
iter->Seek(key); @@ -719,7 +725,8 @@ WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( } else if (result == WBWIIteratorImpl::Result::kFound) { // PUT Slice entry_value = iter->Entry().value; if (context->GetNumOperands() > 0) { - *s = MergeKey(key, &entry_value, *context, value); + *s = MergeKeyWithPlainBaseValue(column_family, key, entry_value, *context, + value); if (!s->ok()) { result = WBWIIteratorImpl::Result::kError; } @@ -728,7 +735,7 @@ WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( } } else if (result == WBWIIteratorImpl::kDeleted) { if (context->GetNumOperands() > 0) { - *s = MergeKey(key, nullptr, *context, value); + *s = MergeKeyWithNoBaseValue(column_family, key, *context, value); if (s->ok()) { result = WBWIIteratorImpl::Result::kFound; } else { @@ -740,5 +747,3 @@ WBWIIteratorImpl::Result WriteBatchWithIndexInternal::GetFromBatch( } } // namespace ROCKSDB_NAMESPACE - -#endif // !ROCKSDB_LITE diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h index edabc95bcdab..c4135ad32642 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.h +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h @@ -4,8 +4,6 @@ // (found in the LICENSE.Apache file in the root directory). #pragma once -#ifndef ROCKSDB_LITE - #include #include #include @@ -22,10 +20,9 @@ namespace ROCKSDB_NAMESPACE { -class MergeContext; class WBWIIteratorImpl; -class WriteBatchWithIndexInternal; struct Options; +struct ImmutableOptions; // when direction == forward // * current_at_base_ <=> base_iterator > delta_iterator @@ -37,8 +34,7 @@ class BaseDeltaIterator : public Iterator { public: BaseDeltaIterator(ColumnFamilyHandle* column_family, Iterator* base_iterator, WBWIIteratorImpl* delta_iterator, - const Comparator* comparator, - const ReadOptions* read_options = nullptr); + const Comparator* comparator); ~BaseDeltaIterator() override {} @@ -50,7 +46,8 @@ class BaseDeltaIterator : public Iterator { void Next() override; void Prev() override; Slice key() const override; - Slice value() const override; + Slice value() const override { return value_; } + Slice timestamp() const override; Status status() const override; void Invalidate(Status s); @@ -61,18 +58,22 @@ class BaseDeltaIterator : public Iterator { void AdvanceBase(); bool BaseValid() const; bool DeltaValid() const; + void ResetValue(); + void SetValueFromBase(); + void SetValueFromDelta(); void UpdateCurrent(); - std::unique_ptr wbwii_; bool forward_; bool current_at_base_; bool equal_keys_; - mutable Status status_; + Status status_; + ColumnFamilyHandle* column_family_; std::unique_ptr base_iterator_; std::unique_ptr delta_iterator_; const Comparator* comparator_; // not owned - const Slice* iterate_upper_bound_; - mutable PinnableSlice merge_result_; + MergeContext merge_context_; + std::string merge_result_; + Slice value_; }; // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. 
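// --------------------------------------------------------------------------
// Note on the BaseDeltaIterator changes above (illustrative, not part of the
// patch): value() previously merged operands lazily on every call, which is
// why status_ and merge_result_ had to be mutable. UpdateCurrent() now
// materializes value_ eagerly through SetValueFromBase()/SetValueFromDelta()
// as soon as it settles on the base or delta side, so value() becomes a
// trivial accessor and a merge failure is visible via status() right after
// positioning rather than on the first value() call.
// --------------------------------------------------------------------------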
@@ -197,59 +198,107 @@ class WBWIIteratorImpl : public WBWIIterator { WBWIIteratorImpl(uint32_t column_family_id, WriteBatchEntrySkipList* skip_list, const ReadableWriteBatch* write_batch, - WriteBatchEntryComparator* comparator) + WriteBatchEntryComparator* comparator, + const Slice* iterate_lower_bound = nullptr, + const Slice* iterate_upper_bound = nullptr) : column_family_id_(column_family_id), skip_list_iter_(skip_list), write_batch_(write_batch), - comparator_(comparator) {} + comparator_(comparator), + iterate_lower_bound_(iterate_lower_bound), + iterate_upper_bound_(iterate_upper_bound) {} ~WBWIIteratorImpl() override {} bool Valid() const override { - if (!skip_list_iter_.Valid()) { - return false; - } - const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); - return (iter_entry != nullptr && - iter_entry->column_family == column_family_id_); + return !out_of_bound_ && ValidRegardlessOfBoundLimit(); } void SeekToFirst() override { - WriteBatchIndexEntry search_entry( - nullptr /* search_key */, column_family_id_, - true /* is_forward_direction */, true /* is_seek_to_first */); - skip_list_iter_.Seek(&search_entry); + if (iterate_lower_bound_ != nullptr) { + WriteBatchIndexEntry search_entry( + iterate_lower_bound_ /* search_key */, column_family_id_, + true /* is_forward_direction */, false /* is_seek_to_first */); + skip_list_iter_.Seek(&search_entry); + } else { + WriteBatchIndexEntry search_entry( + nullptr /* search_key */, column_family_id_, + true /* is_forward_direction */, true /* is_seek_to_first */); + skip_list_iter_.Seek(&search_entry); + } + + if (ValidRegardlessOfBoundLimit()) { + out_of_bound_ = TestOutOfBound(); + } } void SeekToLast() override { - WriteBatchIndexEntry search_entry( - nullptr /* search_key */, column_family_id_ + 1, - true /* is_forward_direction */, true /* is_seek_to_first */); + WriteBatchIndexEntry search_entry = + (iterate_upper_bound_ != nullptr) + ? 
WriteBatchIndexEntry( + iterate_upper_bound_ /* search_key */, column_family_id_, + true /* is_forward_direction */, false /* is_seek_to_first */) + : WriteBatchIndexEntry( + nullptr /* search_key */, column_family_id_ + 1, + true /* is_forward_direction */, true /* is_seek_to_first */); + skip_list_iter_.Seek(&search_entry); if (!skip_list_iter_.Valid()) { skip_list_iter_.SeekToLast(); } else { skip_list_iter_.Prev(); } + + if (ValidRegardlessOfBoundLimit()) { + out_of_bound_ = TestOutOfBound(); + } } void Seek(const Slice& key) override { + if (BeforeLowerBound(&key)) { // cap to prevent out of bound + SeekToFirst(); + return; + } + WriteBatchIndexEntry search_entry(&key, column_family_id_, true /* is_forward_direction */, false /* is_seek_to_first */); skip_list_iter_.Seek(&search_entry); + + if (ValidRegardlessOfBoundLimit()) { + out_of_bound_ = TestOutOfBound(); + } } void SeekForPrev(const Slice& key) override { + if (AtOrAfterUpperBound(&key)) { // cap to prevent out of bound + SeekToLast(); + return; + } + WriteBatchIndexEntry search_entry(&key, column_family_id_, false /* is_forward_direction */, false /* is_seek_to_first */); skip_list_iter_.SeekForPrev(&search_entry); + + if (ValidRegardlessOfBoundLimit()) { + out_of_bound_ = TestOutOfBound(); + } } - void Next() override { skip_list_iter_.Next(); } + void Next() override { + skip_list_iter_.Next(); + if (ValidRegardlessOfBoundLimit()) { + out_of_bound_ = TestOutOfBound(); + } + } - void Prev() override { skip_list_iter_.Prev(); } + void Prev() override { + skip_list_iter_.Prev(); + if (ValidRegardlessOfBoundLimit()) { + out_of_bound_ = TestOutOfBound(); + } + } WriteEntry Entry() const override; @@ -290,6 +339,45 @@ class WBWIIteratorImpl : public WBWIIterator { WriteBatchEntrySkipList::Iterator skip_list_iter_; const ReadableWriteBatch* write_batch_; WriteBatchEntryComparator* comparator_; + const Slice* iterate_lower_bound_; + const Slice* iterate_upper_bound_; + bool out_of_bound_ = false; + + bool TestOutOfBound() const { + const Slice& curKey = Entry().key; + return AtOrAfterUpperBound(&curKey) || BeforeLowerBound(&curKey); + } + + bool ValidRegardlessOfBoundLimit() const { + if (!skip_list_iter_.Valid()) { + return false; + } + const WriteBatchIndexEntry* iter_entry = skip_list_iter_.key(); + return iter_entry != nullptr && + iter_entry->column_family == column_family_id_; + } + + bool AtOrAfterUpperBound(const Slice* k) const { + if (iterate_upper_bound_ == nullptr) { + return false; + } + + return comparator_->GetComparator(column_family_id_) + ->CompareWithoutTimestamp(*k, /*a_has_ts=*/false, + *iterate_upper_bound_, + /*b_has_ts=*/false) >= 0; + } + + bool BeforeLowerBound(const Slice* k) const { + if (iterate_lower_bound_ == nullptr) { + return false; + } + + return comparator_->GetComparator(column_family_id_) + ->CompareWithoutTimestamp(*k, /*a_has_ts=*/false, + *iterate_lower_bound_, + /*b_has_ts=*/false) < 0; + } }; class WriteBatchWithIndexInternal { @@ -297,14 +385,15 @@ class WriteBatchWithIndexInternal { static const Comparator* GetUserComparator(const WriteBatchWithIndex& wbwi, uint32_t cf_id); - // For GetFromBatchAndDB or similar - explicit WriteBatchWithIndexInternal(DB* db, - ColumnFamilyHandle* column_family); - // For GetFromBatchAndDB or similar - explicit WriteBatchWithIndexInternal(ColumnFamilyHandle* column_family); - // For GetFromBatch or similar - explicit WriteBatchWithIndexInternal(const DBOptions* db_options, - ColumnFamilyHandle* column_family); + static Status 
MergeKeyWithNoBaseValue(ColumnFamilyHandle* column_family, + const Slice& key, + const MergeContext& context, + std::string* result); + + static Status MergeKeyWithPlainBaseValue(ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + const MergeContext& context, + std::string* result); // If batch contains a value for key, store it in *value and return kFound. // If batch contains a deletion for key, return Deleted. @@ -314,31 +403,10 @@ class WriteBatchWithIndexInternal { // and return kMergeInProgress // If batch does not contain this key, return kNotFound // Else, return kError on error with error Status stored in *s. - WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, - const Slice& key, std::string* value, - Status* s) { - return GetFromBatch(batch, key, &merge_context_, value, s); - } - WBWIIteratorImpl::Result GetFromBatch(WriteBatchWithIndex* batch, - const Slice& key, - MergeContext* merge_context, - std::string* value, Status* s); - Status MergeKey(const Slice& key, const Slice* value, - std::string* result) const { - return MergeKey(key, value, merge_context_, result); - } - Status MergeKey(const Slice& key, const Slice* value, - const MergeContext& context, std::string* result) const; - size_t GetNumOperands() const { return merge_context_.GetNumOperands(); } - MergeContext* GetMergeContext() { return &merge_context_; } - Slice GetOperand(int index) const { return merge_context_.GetOperand(index); } - - private: - DB* db_; - const DBOptions* db_options_; - ColumnFamilyHandle* column_family_; - MergeContext merge_context_; + static WBWIIteratorImpl::Result GetFromBatch( + WriteBatchWithIndex* batch, ColumnFamilyHandle* column_family, + const Slice& key, MergeContext* merge_context, std::string* value, + Status* s); }; } // namespace ROCKSDB_NAMESPACE -#endif // !ROCKSDB_LITE diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index 350dcc881e08..95333d8f4701 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -7,8 +7,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
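// --------------------------------------------------------------------------
// Illustrative usage sketch (not part of the patch): with the iterator
// changes above, iterate_lower_bound / iterate_upper_bound handed to
// NewIteratorWithBase() are now enforced on the write-batch (delta) side of
// the merged iterator as well, as the tests below verify. A condensed
// standalone version of that scenario follows; the path /tmp/wbwi_db and the
// keys are arbitrary example values.
#include <rocksdb/db.h>
#include <rocksdb/utilities/write_batch_with_index.h>

#include <memory>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/wbwi_db", &db);
  if (!s.ok()) {
    return 1;
  }

  // Keys live only in the batch; the base DB stays empty.
  rocksdb::WriteBatchWithIndex batch;
  s = batch.Put("a", "aa");
  if (s.ok()) s = batch.Put("b", "bb");
  if (s.ok()) s = batch.Put("c", "cc");
  if (!s.ok()) {
    delete db;
    return 1;
  }

  rocksdb::Slice lower("b");
  rocksdb::Slice upper("c");
  rocksdb::ReadOptions ro;
  ro.iterate_lower_bound = &lower;  // inclusive
  ro.iterate_upper_bound = &upper;  // exclusive

  // NewIteratorWithBase takes ownership of the base iterator.
  std::unique_ptr<rocksdb::Iterator> it(batch.NewIteratorWithBase(
      db->DefaultColumnFamily(), db->NewIterator(ro), &ro));

  // Only "b" is expected to be visible: "a" falls below the lower bound,
  // "c" sits at the (exclusive) upper bound, and both bounds now apply to
  // the batch entries too.
  int visible = 0;
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    ++visible;
  }

  it.reset();
  delete db;
  return (visible == 1) ? 0 : 1;
}
// --------------------------------------------------------------------------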
-#ifndef ROCKSDB_LITE
-
 #include "rocksdb/utilities/write_batch_with_index.h"

 #include <map>
@@ -262,14 +260,14 @@ class WBWIBaseTest : public testing::Test {
     std::string result;
     for (size_t i = 0; i < key.size(); i++) {
       if (key[i] == 'd') {
-        batch_->Delete(cf, key);
+        EXPECT_OK(batch_->Delete(cf, key));
         result = "";
       } else if (key[i] == 'p') {
         result = key + std::to_string(i);
-        batch_->Put(cf, key, result);
+        EXPECT_OK(batch_->Put(cf, key, result));
       } else if (key[i] == 'm') {
         std::string value = key + std::to_string(i);
-        batch_->Merge(cf, key, value);
+        EXPECT_OK(batch_->Merge(cf, key, value));
         if (result.empty()) {
           result = value;
         } else {
@@ -1244,7 +1242,7 @@ TEST_F(WBWIOverwriteTest, TestGetFromBatchMerge2) {
   s = batch_->GetFromBatch(column_family, options_, "X", &value);
   ASSERT_TRUE(s.IsNotFound());

-  batch_->Merge(column_family, "X", "ddd");
+  ASSERT_OK(batch_->Merge(column_family, "X", "ddd"));

   ASSERT_OK(batch_->GetFromBatch(column_family, options_, "X", &value));
   ASSERT_EQ("ddd", value);
 }
@@ -1642,6 +1640,104 @@ TEST_P(WriteBatchWithIndexTest, TestNewIteratorWithBaseFromWbwi) {
   ASSERT_OK(iter->status());
 }

+TEST_P(WriteBatchWithIndexTest, TestBoundsCheckingInDeltaIterator) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+
+  KVMap empty_map;
+
+  // writes that should be observed by BaseDeltaIterator::delta_iterator_
+  ASSERT_OK(batch_->Put("a", "aa"));
+  ASSERT_OK(batch_->Put("b", "bb"));
+  ASSERT_OK(batch_->Put("c", "cc"));
+
+  ReadOptions ro;
+
+  auto check_only_b_is_visible = [&]() {
+    std::unique_ptr<Iterator> iter(batch_->NewIteratorWithBase(
+        db_->DefaultColumnFamily(), new KVIter(&empty_map), &ro));
+
+    // move to the lower bound
+    iter->SeekToFirst();
+    ASSERT_EQ("b", iter->key());
+    iter->Prev();
+    ASSERT_FALSE(iter->Valid());
+
+    // move to the upper bound
+    iter->SeekToLast();
+    ASSERT_EQ("b", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+
+    // test bounds checking in Seek and SeekForPrev
+    iter->Seek(Slice("a"));
+    ASSERT_EQ("b", iter->key());
+    iter->Seek(Slice("b"));
+    ASSERT_EQ("b", iter->key());
+    iter->Seek(Slice("c"));
+    ASSERT_FALSE(iter->Valid());
+
+    iter->SeekForPrev(Slice("c"));
+    ASSERT_EQ("b", iter->key());
+    iter->SeekForPrev(Slice("b"));
+    ASSERT_EQ("b", iter->key());
+    iter->SeekForPrev(Slice("a"));
+    ASSERT_FALSE(iter->Valid());
+
+    iter->SeekForPrev(
+        Slice("a.1"));  // a non-existent key that is smaller than "b"
+    ASSERT_FALSE(iter->Valid());
+
+    iter->Seek(Slice("b.1"));  // a non-existent key that is greater than "b"
+    ASSERT_FALSE(iter->Valid());
+
+    delete ro.iterate_lower_bound;
+    delete ro.iterate_upper_bound;
+  };
+
+  ro.iterate_lower_bound = new Slice("b");
+  ro.iterate_upper_bound = new Slice("c");
+  check_only_b_is_visible();
+
+  ro.iterate_lower_bound = new Slice("a.1");
+  ro.iterate_upper_bound = new Slice("c");
+  check_only_b_is_visible();
+
+  ro.iterate_lower_bound = new Slice("b");
+  ro.iterate_upper_bound = new Slice("b.2");
+  check_only_b_is_visible();
+}
+
+TEST_P(WriteBatchWithIndexTest,
+       TestBoundsCheckingInSeekToFirstAndLastOfDeltaIterator) {
+  Status s = OpenDB();
+  ASSERT_OK(s);
+  KVMap empty_map;
+  // writes that should be observed by BaseDeltaIterator::delta_iterator_
+  ASSERT_OK(batch_->Put("c", "cc"));
+
+  ReadOptions ro;
+  auto check_nothing_visible = [&]() {
+    std::unique_ptr<Iterator> iter(batch_->NewIteratorWithBase(
+        db_->DefaultColumnFamily(), new KVIter(&empty_map), &ro));
+    iter->SeekToFirst();
+    ASSERT_FALSE(iter->Valid());
+    iter->SeekToLast();
+    ASSERT_FALSE(iter->Valid());
+
+    delete ro.iterate_lower_bound;
+    delete ro.iterate_upper_bound;
+  };
+
+  ro.iterate_lower_bound = new Slice("b");
+  ro.iterate_upper_bound = new Slice("c");
+  check_nothing_visible();
+
+  ro.iterate_lower_bound = new Slice("d");
+  ro.iterate_upper_bound = new Slice("e");
+  check_nothing_visible();
+}
+
 TEST_P(WriteBatchWithIndexTest, SavePointTest) {
   ColumnFamilyHandleImplDummy cf1(1, BytewiseComparator());
   KVMap empty_map;
@@ -2101,8 +2197,8 @@ TEST_P(WriteBatchWithIndexTest, GetFromBatchAfterMerge) {
   ASSERT_OK(OpenDB());

   ASSERT_OK(db_->Put(write_opts_, "o", "aa"));
-  batch_->Merge("o", "bb");  // Merging bb under key "o"
-  batch_->Merge("m", "cc");  // Merging bc under key "m"
+  ASSERT_OK(batch_->Merge("o", "bb"));  // Merging bb under key "o"
+  ASSERT_OK(batch_->Merge("m", "cc"));  // Merging bc under key "m"

   s = batch_->GetFromBatch(options_, "m", &value);
   ASSERT_EQ(s.code(), Status::Code::kMergeInProgress);
   s = batch_->GetFromBatch(options_, "o", &value);
@@ -2249,6 +2345,8 @@ TEST_F(WBWIOverwriteTest, TestBadMergeOperator) {
 }

 TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
+  ASSERT_OK(OpenDB());
+
   ColumnFamilyHandleImplDummy cf2(2,
                                   test::BytewiseComparatorWithU64TsWrapper());

@@ -2264,10 +2362,9 @@ TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
                   .IsInvalidArgument());
   {
     std::string value;
-    ASSERT_TRUE(batch_
-                    ->GetFromBatchAndDB(
-                        /*db=*/nullptr, ReadOptions(), &cf2, "key", &value)
-                    .IsInvalidArgument());
+    ASSERT_TRUE(
+        batch_->GetFromBatchAndDB(db_, ReadOptions(), &cf2, "key", &value)
+            .IsInvalidArgument());
   }
   {
     constexpr size_t num_keys = 2;
@@ -2276,8 +2373,8 @@ TEST_P(WriteBatchWithIndexTest, ColumnFamilyWithTimestamp) {
         {PinnableSlice(), PinnableSlice()}};
     std::array<Status, num_keys> statuses{{Status(), Status()}};
     constexpr bool sorted_input = false;
-    batch_->MultiGetFromBatchAndDB(/*db=*/nullptr, ReadOptions(), &cf2,
-                                   num_keys, keys.data(), pinnable_vals.data(),
+    batch_->MultiGetFromBatchAndDB(db_, ReadOptions(), &cf2, num_keys,
+                                   keys.data(), pinnable_vals.data(),
                                    statuses.data(), sorted_input);
     for (const auto& s : statuses) {
       ASSERT_TRUE(s.IsInvalidArgument());
@@ -2407,13 +2504,3 @@ int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
-
-#else
-#include <stdio.h>
-
-int main() {
-  fprintf(stderr, "SKIPPED\n");
-  return 0;
-}
-
-#endif  // !ROCKSDB_LITE
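Editor's note (not part of the patch): the new tests above drive the bounds support through KVIter; the stand-alone sketch below shows how an application might pass iterate_lower_bound/iterate_upper_bound to WriteBatchWithIndex::NewIteratorWithBase against a real DB. The file name, DB path, and main() wrapper are illustrative assumptions only.

  // wbwi_bounds_demo.cc -- illustrative sketch, not part of the patch.
  #include <cassert>
  #include <memory>

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"
  #include "rocksdb/slice.h"
  #include "rocksdb/utilities/write_batch_with_index.h"

  using namespace ROCKSDB_NAMESPACE;

  int main() {
    DB* db = nullptr;
    Options options;
    options.create_if_missing = true;
    // Hypothetical path, chosen for the example only.
    Status s = DB::Open(options, "/tmp/wbwi_bounds_demo", &db);
    assert(s.ok());

    WriteBatchWithIndex batch;
    s = batch.Put("a", "1");
    assert(s.ok());
    s = batch.Put("b", "2");
    assert(s.ok());
    s = batch.Put("c", "3");
    assert(s.ok());

    // Restrict iteration to ["b", "c"); the Slices must outlive the iterator.
    Slice lower("b");
    Slice upper("c");
    ReadOptions ro;
    ro.iterate_lower_bound = &lower;
    ro.iterate_upper_bound = &upper;

    // The base (DB) iterator already honors ReadOptions bounds; with this
    // patch the delta (write-batch) iterator does too, so only "b" is seen.
    std::unique_ptr<Iterator> iter(batch.NewIteratorWithBase(
        db->DefaultColumnFamily(), db->NewIterator(ro), &ro));
    for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
      assert(iter->key() == Slice("b"));
    }
    assert(iter->status().ok());

    iter.reset();
    delete db;
    return 0;
  }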