diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 000000000..31e0e42d6
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,8 @@
+# Dependabot configuration: check the GitHub Actions used by this repository for updates once a month.
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "monthly"
+    open-pull-requests-limit: 5  # cap on simultaneously open Dependabot pull requests
diff --git a/.github/workflows/alpine.yml b/.github/workflows/alpine.yml
index 95491609c..d01355c78 100644
--- a/.github/workflows/alpine.yml
+++ b/.github/workflows/alpine.yml
@@ -2,11 +2,13 @@ name: Alpine Linux
 'on':
   - push
   - pull_request
+permissions:
+  contents: read
 jobs:
   ubuntu-build:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: start docker
         run: |
           docker run -w /src -dit --name alpine -v $PWD:/src alpine:latest
@@ -33,4 +35,4 @@ jobs:
           ./alpine.sh cmake --build build_for_alpine_debug
       - name: testdebug
         run: |
-          ./alpine.sh bash -c "cd build_for_alpine_debug && ctest"
\ No newline at end of file
+          ./alpine.sh bash -c "cd build_for_alpine_debug && ctest"
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
new file mode 100644
index 000000000..9da72b1b9
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,26 @@
+name: CIFuzz
+on: [pull_request]
+permissions:
+  contents: read
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@d318097b285bc695f785b98d40c2d058c0f438b5 # master
+      with:
+        oss-fuzz-project-name: 'croaring'
+        dry-run: false
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@d318097b285bc695f785b98d40c2d058c0f438b5 # master
+      with:
+        oss-fuzz-project-name: 'croaring'
+        fuzz-seconds: 300
+        dry-run: false
+    - name: Upload Crash
+      uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 000000000..082400aa8
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,44 @@
+name: "CodeQL"
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+  schedule:
+    - cron: "39 2 * * 6"
+
+permissions:
+  contents: read
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: [ cpp, python ]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@f3feb00acb00f31a6f60280e6ace9ca31d91c76a # v2.3.2
+        with:
+          languages: ${{ matrix.language }}
+
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@f3feb00acb00f31a6f60280e6ace9ca31d91c76a # v2.3.2
+        if: ${{ matrix.language == 'cpp' || matrix.language == 'python' }}  # always true for the current matrix (cpp, python)
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@f3feb00acb00f31a6f60280e6ace9ca31d91c76a # v2.3.2
+        with:
+          category: "/language:${{ matrix.language }}"
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
new file mode 100644
index 000000000..f88d499ea
--- /dev/null
+++ b/.github/workflows/documentation.yml
@@ -0,0 +1,36 @@
+name: Doxygen GitHub Pages
+
+on:
+  push:
+    branches:
+      - master
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}  # NOTE(review): no step in this job declares `id: deployment`, so this likely resolves to an empty URL - confirm
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pages: write
+      id-token: write
+    steps:
+      - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+      - name: Install Doxygen
+        run: sudo apt-get install doxygen graphviz -y
+      - run: mkdir docs
+      - name: Install theme
+        run: ./tools/prepare_doxygen.sh
+      - name: Generate Doxygen Documentation
+        run: doxygen ./doxygen
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@373f7f263a76c20808c831209c920827a82a2847 # v3.9.3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: docs/html
diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml
index d926d172f..c97c5160f 100644
--- a/.github/workflows/macos-ci.yml
+++ b/.github/workflows/macos-ci.yml
@@ -4,13 +4,15 @@ name: Macos-CI
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
     name: macos-llvm
     runs-on: macos-latest
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Build and Test
         run: |
           mkdir build
@@ -20,6 +22,15 @@ jobs:
           ctest . --output-on-failure
           cmake --install . 
           cd ../tests/installation/find && mkdir build && cd build && cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../build/destination .. &&  cmake --build .
+      - name: Build and Test (shared)
+        run: |
+          cmake -DBUILD_SHARED_LIBS=ON -B buildshared -DCMAKE_INSTALL_PREFIX:PATH=destinationshared
+          cmake --build buildshared
+          cmake --install buildshared
+          cd tests/installation/find
+          cmake -DCMAKE_INSTALL_PREFIX:PATH=../../../destinationshared -B buildshared
+          cmake --build buildshared
+          ./buildshared/repro
       - name: Build and Test Debug
         run: |
           mkdir builddebug
diff --git a/.github/workflows/s390x.yml b/.github/workflows/s390x.yml
new file mode 100644
index 000000000..c0045a235
--- /dev/null
+++ b/.github/workflows/s390x.yml
@@ -0,0 +1,33 @@
+name: Ubuntu s390x (GCC 11)
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2
+      - uses: uraimo/run-on-arch-action@a8003307a739516fdd80ee6d3da8924db811b8da # v2.5.0
+        name: Test
+        id: runcmd
+        with:
+          arch: s390x
+          githubToken: ${{ github.token }}
+          distro: ubuntu_latest
+          install: |
+            apt-get update -q -y
+            apt-get install -y cmake make g++
+          run: |
+            cmake -DCMAKE_BUILD_TYPE=Release -B build
+            cmake --build build -j=2
+            ctest --output-on-failure --test-dir build
+
diff --git a/.github/workflows/ubuntu-ci.yml b/.github/workflows/ubuntu-ci.yml
index 82956cca6..169124f04 100644
--- a/.github/workflows/ubuntu-ci.yml
+++ b/.github/workflows/ubuntu-ci.yml
@@ -4,6 +4,8 @@ name: Ubuntu-CI
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
@@ -15,7 +17,7 @@ jobs:
       CXX: g++
 
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Build and Test
         run: |
           mkdir build
diff --git a/.github/workflows/ubuntu-debug-sani-ci.yml b/.github/workflows/ubuntu-debug-sani-ci.yml
new file mode 100644
index 000000000..cbda35a0c
--- /dev/null
+++ b/.github/workflows/ubuntu-debug-sani-ci.yml
@@ -0,0 +1,27 @@
+name: Ubuntu-Debug-Sanitized-CI
+
+'on':
+  - push
+  - pull_request
+
+permissions:
+  contents: read
+
+jobs:
+  ci:
+    name: ubuntu-gcc
+    runs-on: ubuntu-latest
+
+    env:
+      CC: gcc
+      CXX: g++
+
+    steps: 
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
+      - name: Build and Test
+        run: |
+          mkdir build
+          cd build
+          cmake  -DCMAKE_BUILD_TYPE=Debug -DROARING_SANITIZE=ON ..
+          cmake --build . 
+          ctest . --output-on-failure
diff --git a/.github/workflows/ubuntu-gcc10-ci.yml b/.github/workflows/ubuntu-gcc10-ci.yml
index d617f4b43..f46bfb94f 100644
--- a/.github/workflows/ubuntu-gcc10-ci.yml
+++ b/.github/workflows/ubuntu-gcc10-ci.yml
@@ -4,6 +4,8 @@ name: Ubuntu-GCC10-CI
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
@@ -13,7 +15,7 @@ jobs:
       CC: gcc-10
       CXX: g++-10
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - run:   |
          sudo apt update
          sudo apt install gcc-10 g++-10
diff --git a/.github/workflows/ubuntu-legacy-ci.yml b/.github/workflows/ubuntu-legacy-ci.yml
index fd13acabb..108a49d68 100644
--- a/.github/workflows/ubuntu-legacy-ci.yml
+++ b/.github/workflows/ubuntu-legacy-ci.yml
@@ -4,6 +4,8 @@ name: Ubuntu-CI
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
@@ -11,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Build and Test
         run: |
           mkdir build
diff --git a/.github/workflows/ubuntu-noexcept-ci.yml b/.github/workflows/ubuntu-noexcept-ci.yml
index 1b5382000..889d41ed1 100644
--- a/.github/workflows/ubuntu-noexcept-ci.yml
+++ b/.github/workflows/ubuntu-noexcept-ci.yml
@@ -4,6 +4,8 @@ name: Ubuntu-CI
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
@@ -15,7 +17,7 @@ jobs:
       CXX: g++
 
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Build and Test
         run: |
           mkdir build
diff --git a/.github/workflows/ubuntu-oldclang-18-ci.yml b/.github/workflows/ubuntu-oldclang-18-ci.yml
deleted file mode 100644
index 8f6e50dde..000000000
--- a/.github/workflows/ubuntu-oldclang-18-ci.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Ubuntu-CI (old llvm)
-
-'on':
-  - push
-  - pull_request
-
-
-jobs:
-  ci:
-    name: ubuntu-clangold-gcc
-    runs-on: ubuntu-18.04
-
-    env:
-      CC: clang-7
-      CXX: clang++-7
-
-    steps: 
-      - uses: actions/checkout@v2
-      - name: install clang 7
-        run: |
-          sudo apt update
-          sudo apt install clang-7
-      - name: Build and Test
-        run: |
-          mkdir build
-          cd build
-          cmake  ..
-          cmake --build . 
-          ctest . --output-on-failure
diff --git a/.github/workflows/ubuntu-sani-ci.yml b/.github/workflows/ubuntu-sani-ci.yml
index c955b7e85..c0f8608e3 100644
--- a/.github/workflows/ubuntu-sani-ci.yml
+++ b/.github/workflows/ubuntu-sani-ci.yml
@@ -4,6 +4,8 @@ name: Ubuntu-Sanitized-CI
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
@@ -15,7 +17,7 @@ jobs:
       CXX: g++
 
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Build and Test
         run: |
           mkdir build
diff --git a/.github/workflows/ubuntu-18-ci.yml b/.github/workflows/ubuntu-sani-thread-ci.yml
similarity index 55%
rename from .github/workflows/ubuntu-18-ci.yml
rename to .github/workflows/ubuntu-sani-thread-ci.yml
index 554951fe2..067644aea 100644
--- a/.github/workflows/ubuntu-18-ci.yml
+++ b/.github/workflows/ubuntu-sani-thread-ci.yml
@@ -1,25 +1,27 @@
-name: Ubuntu-18-CI
+name: Ubuntu-Thread-Sanitized-CI
 
 'on':
   - push
   - pull_request
 
+permissions:
+  contents: read
 
 jobs:
   ci:
     name: ubuntu-gcc
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-latest
 
     env:
       CC: gcc
       CXX: g++
 
     steps: 
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Build and Test
         run: |
           mkdir build
           cd build
-          cmake  ..
+          cmake  -DROARING_SANITIZE_THREADS=ON ..
           cmake --build . 
           ctest . --output-on-failure
diff --git a/.github/workflows/vs16-arm-ci.yml b/.github/workflows/vs16-arm-ci.yml
index 79017bd76..f68e049af 100644
--- a/.github/workflows/vs16-arm-ci.yml
+++ b/.github/workflows/vs16-arm-ci.yml
@@ -2,20 +2,27 @@ name: VS16-ARM-CI
 
 on: [push, pull_request]
 
+permissions:
+  contents: read
+
 jobs:
   ci:
     name: windows-vs16
-    runs-on: windows-latest
+    runs-on: windows-2019
     strategy:
       fail-fast: false
       matrix:
         include:
-          - {arch: ARM}
-          - {arch: ARM64}
+          - {gen: Visual Studio 16 2019, arch: ARM}
+          - {gen: Visual Studio 16 2019, arch: ARM64}
     steps:
       - name: checkout
-        uses: actions/checkout@v2
-      - name: Use cmake
+        uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
+      - name: Configure
         run: |
-          cmake -A ${{ matrix.arch }} -DCMAKE_CROSSCOMPILING=1 -B build  &&
-          cmake --build build --verbose
+          mkdir build
+          cd build && cmake -G "${{matrix.gen}}" -A ${{matrix.arch}}  ..
+      - name: Build
+        run: cmake --build build --config Release
+      - name: Build Debug
+        run: cmake --build build --config Debug
diff --git a/.github/workflows/vs16-ci.yml b/.github/workflows/vs16-ci.yml
index de40efc81..5eeb8b6df 100644
--- a/.github/workflows/vs16-ci.yml
+++ b/.github/workflows/vs16-ci.yml
@@ -2,24 +2,35 @@ name: VS16-CI
 
 on: [push, pull_request]
 
+permissions:
+  contents: read
+
 jobs:
   ci:
     name: windows-vs16
-    runs-on: windows-latest
+    runs-on: windows-2019
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - {gen: Visual Studio 16 2019, arch: Win32}
+          - {gen: Visual Studio 16 2019, arch: x64}
     steps:
-    - uses: actions/checkout@v2
-    - name: 'Run CMake with VS16'
-      uses: lukka/run-cmake@v2
-      with:
-        cmakeListsOrSettingsJson: CMakeListsTxtAdvanced
-        cmakeListsTxtPath: '${{ github.workspace }}/CMakeLists.txt'
-        buildDirectory: "${{ github.workspace }}/../../_temp/windows"
-        cmakeBuildType: Release
-        buildWithCMake: true
-        cmakeGenerator: VS16Win64
-        cmakeAppendedArgs: -DROARING_BUILD_STATIC=ON
-        buildWithCMakeArgs: --config Release
-
-    - name: 'Run CTest'
-      run: ctest  --verbose
-      working-directory: "${{ github.workspace }}/../../_temp/windows"
+      - name: checkout
+        uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
+      - name: Configure
+        run: |
+          mkdir build
+          cd build && cmake -G "${{matrix.gen}}" -A ${{matrix.arch}}  ..
+      - name: Build
+        run: cmake --build build --config Release
+      - name: Run basic tests
+        run: |
+          cd build
+          ctest -C Release --output-on-failure
+      - name: Build Debug
+        run: cmake --build build --config Debug
+      - name: Run basic tests in Debug
+        run: |
+          cd build
+          ctest -C Debug --output-on-failure
diff --git a/.github/workflows/vs17-arm-ci.yml b/.github/workflows/vs17-arm-ci.yml
index bb0532003..95ff1015c 100644
--- a/.github/workflows/vs17-arm-ci.yml
+++ b/.github/workflows/vs17-arm-ci.yml
@@ -2,6 +2,9 @@ name: VS17-ARM-CI
 
 on: [push, pull_request]
 
+permissions:
+  contents: read
+
 jobs:
   ci:
     name: windows-vs17
@@ -14,8 +17,8 @@ jobs:
           - {arch: ARM64}
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Use cmake
         run: |
           cmake -A ${{ matrix.arch }} -DCMAKE_CROSSCOMPILING=1 -B build  &&
-          cmake --build build --verbose
\ No newline at end of file
+          cmake --build build --verbose
diff --git a/.github/workflows/vs17-ci.yml b/.github/workflows/vs17-ci.yml
index cc5b8451b..eaabb130f 100644
--- a/.github/workflows/vs17-ci.yml
+++ b/.github/workflows/vs17-ci.yml
@@ -2,6 +2,9 @@ name: VS17-CI
 
 on: [push, pull_request]
 
+permissions:
+  contents: read
+
 jobs:
   ci:
     name: windows-vs17
@@ -12,9 +15,9 @@ jobs:
         include:
           - {gen: Visual Studio 17 2022, arch: Win32}
           - {gen: Visual Studio 17 2022, arch: x64}
-   steps:
+    steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Configure
         run: |
           mkdir build
@@ -24,4 +27,10 @@ jobs:
       - name: Run basic tests
         run: |
           cd build
-          ctest -C Release --output-on-failure 
\ No newline at end of file
+          ctest -C Release --output-on-failure
+      - name: Build Debug
+        run: cmake --build build --config Debug
+      - name: Run basic tests in Debug
+        run: |
+          cd build
+          ctest -C Debug --output-on-failure
diff --git a/.github/workflows/vs17-clang-ci.yml b/.github/workflows/vs17-clang-ci.yml
index 0f258a4dc..871de7eb7 100644
--- a/.github/workflows/vs17-clang-ci.yml
+++ b/.github/workflows/vs17-clang-ci.yml
@@ -2,6 +2,9 @@ name: VS17-CLANG-CI
 
 on: [push, pull_request]
 
+permissions:
+  contents: read
+
 jobs:
   ci:
     name: windows-vs17
@@ -14,7 +17,7 @@ jobs:
           - {gen: Visual Studio 17 2022, arch: x64}
     steps:
       - name: checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0
       - name: Configure
         run: |
           mkdir build
@@ -24,4 +27,10 @@ jobs:
       - name: Run basic tests
         run: |
           cd build
-          ctest -C Release --output-on-failure 
\ No newline at end of file
+          ctest -C Release --output-on-failure
+      - name: Build Debug
+        run: cmake --build build --config Debug --parallel
+      - name: Run basic tests in Debug
+        run: |
+          cd build
+          ctest -C Debug --output-on-failure
diff --git a/.gitignore b/.gitignore
index 16a15f00d..656ac49c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 # Downloaded dependencies
 tests/vendor/cmocka
+dependencies
 
 # Object files
 *.o
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3d386aeab..a3d505cbb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@ project(RoaringBitmap
 )
 include(GNUInstallDirs)
 
-set(CMAKE_MACOSX_RPATH OFF)
 if (NOT CMAKE_BUILD_TYPE)
                 message(STATUS "No build type selected, default to Release")
                 set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
@@ -16,11 +15,11 @@ if(CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSIO
     message(FATAL_ERROR "${PROJECT_NAME} requires at least apple-clang version 11 to support runtime dispatching.")
 endif()
 set(ROARING_LIB_NAME roaring)
-set(PROJECT_VERSION_MAJOR 0)
-set(PROJECT_VERSION_MINOR 6)
-set(PROJECT_VERSION_PATCH 0)
-set(ROARING_LIB_VERSION "0.6.0" CACHE STRING "Roaring library version")
-set(ROARING_LIB_SOVERSION "4" CACHE STRING "Roaring library soversion")
+set(PROJECT_VERSION_MAJOR 2)
+set(PROJECT_VERSION_MINOR 0)
+set(PROJECT_VERSION_PATCH 1)
+set(ROARING_LIB_VERSION "2.0.1" CACHE STRING "Roaring library version")
+set(ROARING_LIB_SOVERSION "13" CACHE STRING "Roaring library soversion")
 
 option(ROARING_EXCEPTIONS "Enable exception-throwing interface" ON)
 if(NOT ROARING_EXCEPTIONS)
@@ -31,7 +30,7 @@ endif()
 option(ROARING_DISABLE_X64 "Forcefully disable x64 optimizations even if hardware supports it (this disables AVX)" OFF)
 option(ROARING_DISABLE_AVX "Forcefully disable AVX even if hardware supports it " OFF)
 option(ROARING_DISABLE_NEON "Forcefully disable NEON even if hardware supports it" OFF)
-option(ROARING_DISABLE_NATIVE "Forcefully disable -march optimizations (obsolete)" OFF)
+option(ROARING_DISABLE_AVX512 "Forcefully disable AVX512 even if compiler supports it" OFF)
 
 option(ROARING_BUILD_STATIC "Build a static library" ON)
 if(BUILD_SHARED_LIBS)
@@ -44,6 +43,8 @@ option(ROARING_BUILD_LTO "Build library with Link Time Optimization" OFF)
 option(ROARING_BUILD_C_AS_CPP "Build library C files using C++ compilation" OFF)
 option(ROARING_BUILD_C_TESTS_AS_CPP "Build test C files using C++ compilation" OFF)
 option(ROARING_SANITIZE "Sanitize addresses" OFF)
+option(ROARING_SANITIZE_THREADS "Sanitize threads" OFF)
+
 option(ENABLE_ROARING_TESTS "If OFF, disable unit tests altogether" ON)
 
 set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/tools/cmake")
@@ -62,7 +63,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/roaring.pc" DESTINATION ${CMAKE_INSTA
 
 add_library(roaring-headers INTERFACE)
 target_include_directories(roaring-headers INTERFACE
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/${ROARING_LIB_NAME}>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include/roaring>
   $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCDIR}>)
 add_library(roaring-headers-cpp INTERFACE)
 target_include_directories(roaring-headers-cpp INTERFACE
@@ -73,11 +74,11 @@ target_include_directories(roaring-headers-cpp INTERFACE
 ### Some users want the C++ header files to be installed as well.
 ### C++ header files get installed to /usr/local/include/roaring typically
 SET(CPP_ROARING_HEADERS cpp/roaring64map.hh  cpp/roaring.hh) # needs to be updated if we add more files
-install(FILES ${CPP_ROARING_HEADERS} DESTINATION include/${ROARING_LIB_NAME})
-install(DIRECTORY include/${ROARING_LIB_NAME} DESTINATION include)
+install(FILES ${CPP_ROARING_HEADERS} DESTINATION include/roaring)
+install(DIRECTORY include/roaring DESTINATION include)
 
 install(TARGETS roaring-headers roaring-headers-cpp
-   EXPORT ${ROARING_LIB_NAME}-config
+   EXPORT roaring-config
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
@@ -98,9 +99,17 @@ configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/tests/config.h.in"
 
 add_subdirectory(src)
 if(ENABLE_ROARING_TESTS)
-  add_subdirectory(benchmarks)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 8) # we only include the benchmarks on 64-bit systems.
+    add_subdirectory(benchmarks)
+  endif()
   add_subdirectory(tests)
 endif()
+option(ENABLE_ROARING_MICROBENCHMARKS "Enable microbenchmarks" OFF)
+if(ENABLE_ROARING_MICROBENCHMARKS)
+    add_subdirectory(microbenchmarks)
+else()
+     MESSAGE( STATUS "You may enable microbenchmarks by setting ENABLE_ROARING_MICROBENCHMARKS to ON " )
+endif()
 # Being terse is good, but knowing how the build is configured is important
 # and should not be hard to figure out.
 MESSAGE( STATUS "CMAKE_SYSTEM_PROCESSOR: " ${CMAKE_SYSTEM_PROCESSOR})
diff --git a/LICENSE b/LICENSE
index 8f567d348..8b0ad80d7 100644
--- a/LICENSE
+++ b/LICENSE
@@ -232,4 +232,5 @@ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
 SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
\ No newline at end of file
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 4589a8cbd..4f88b185e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,11 @@
-# CRoaring [![Build status](https://ci.appveyor.com/api/projects/status/gr4ibsflqs9by1bc/branch/master?svg=true)](https://ci.appveyor.com/project/lemire/croaring/branch/master) [![Build Status](https://cloud.drone.io/api/badges/RoaringBitmap/CRoaring/status.svg)](https://cloud.drone.io/RoaringBitmap/CRoaring)
+# CRoaring 
+
+[![Ubuntu-CI](https://github.com/RoaringBitmap/CRoaring/actions/workflows/ubuntu-noexcept-ci.yml/badge.svg)](https://github.com/RoaringBitmap/CRoaring/actions/workflows/ubuntu-noexcept-ci.yml) [![VS17-CI](https://github.com/RoaringBitmap/CRoaring/actions/workflows/vs17-ci.yml/badge.svg)](https://github.com/RoaringBitmap/CRoaring/actions/workflows/vs17-ci.yml)
+[![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/croaring.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:croaring)
+
+[![Doxygen Documentation](https://img.shields.io/badge/docs-doxygen-green.svg)](http://roaringbitmap.github.io/CRoaring/)
+
+
 
 Portable Roaring bitmaps in C (and C++) with full support for your favorite compiler (GNU GCC, LLVM's clang, Visual Studio). Included in the [Awesome C](https://github.com/kozross/awesome-c) list of open source C software.
 
@@ -9,7 +16,7 @@ Bitsets, also called bitmaps, are commonly used as fast data structures. Unfortu
 
 Roaring bitmaps are compressed bitmaps which tend to outperform conventional compressed bitmaps such as WAH, EWAH or Concise.
 They are used by several major systems such as [Apache Lucene][lucene] and derivative systems such as [Solr][solr] and
-[Elasticsearch][elasticsearch], [Metamarkets' Druid][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas],  [Apache Spark][spark], [OpenSearchServer][opensearchserver], [Cloud Torrent][cloudtorrent], [Whoosh][whoosh], [InfluxDB](https://www.influxdata.com), [Pilosa][pilosa], [Bleve](http://www.blevesearch.com), [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin]. The CRoaring library is used in several systems such as [Apache Doris](http://doris.incubator.apache.org). The YouTube SQL Engine, [Google Procella](https://research.google/pubs/pub48388/), uses Roaring bitmaps for indexing.
+[Elasticsearch][elasticsearch], [Metamarkets' Druid][druid], [LinkedIn Pinot][pinot], [Netflix Atlas][atlas],  [Apache Spark][spark], [OpenSearchServer][opensearchserver], [Cloud Torrent][cloudtorrent], [Whoosh][whoosh], [InfluxDB](https://www.influxdata.com), [Pilosa][pilosa], [Bleve](http://www.blevesearch.com), [Microsoft Visual Studio Team Services (VSTS)][vsts], and eBay's [Apache Kylin][kylin]. The CRoaring library is used in several systems such as [Apache Doris](http://doris.incubator.apache.org), [StarRocks](https://github.com/StarRocks/starrocks). The YouTube SQL Engine, [Google Procella](https://research.google/pubs/pub48388/), uses Roaring bitmaps for indexing.
 
 We published a peer-reviewed article on the design and evaluation of this library:
 
@@ -50,10 +57,15 @@ of the latest hardware. Roaring bitmaps are already available on a variety of pl
 
 - Linux, macOS, FreeBSD, Windows (MSYS2 and Microsoft Visual studio).
 - We test the library with ARM, x64/x86 and POWER processors. We only support little endian systems (big endian systems are vanishingly rare).
-- Recent C compiler supporting the C11 standard (GCC 7 or better, LLVM 7.0 or better, Xcode 11 or better), there is also an optional C++ class that requires a C++ compiler supporting the C++11 standard.
+- Recent C compiler supporting the C11 standard (GCC 7 or better, LLVM 7.0 or better, Xcode 11 or better, Microsoft Visual Studio 2022 or better, Intel oneAPI Compiler 2023.2 or better), there is also an optional C++ class that requires a C++ compiler supporting the C++11 standard.
 - CMake (to contribute to the project, users can rely on amalgamation/unity builds if they do not wish to use CMake).
 - Under x64 systems, the library provides runtime dispatch so that optimized functions are called based on the detected CPU features. It works with GCC, clang (version 9 and up) and Visual Studio (2017 and up). Other systems (e.g., ARM) do not need runtime dispatch.
 
+Hardly anyone has access to an actual big-endian system. Nevertheless,
+we support big-endian systems such as IBM s390x through emulators---except for
+IO serialization which is only supported on little-endian systems (see [issue 423](https://github.com/RoaringBitmap/CRoaring/issues/423)).
+
+
 # Using as a CMake dependency
 
 If you like CMake, you can just a few lines in you `CMakeLists.txt` file to grab a `CRoaring` release. [See our demonstration for further details](https://github.com/RoaringBitmap/croaring_cmake_demo_single_file).
@@ -168,6 +180,43 @@ The C interface is found in the file ``include/roaring/roaring.h``. We have C++
 
 Some users have to deal with large volumes of data. It  may be important for these users to be aware of the `addMany` (C++) `roaring_bitmap_or_many` (C) functions as it is much faster and economical to add values in batches when possible. Furthermore, calling periodically the `runOptimize` (C++) or `roaring_bitmap_run_optimize` (C) functions may help.
 
+
+# Running microbenchmarks
+
+We have microbenchmarks built with the Google Benchmark library.
+Under Linux or macOS, you may run them as follows:
+
+```
+cmake -B build -D ENABLE_ROARING_MICROBENCHMARKS=ON
+cmake --build build
+./build/microbenchmarks/bench
+```
+
+By default, the benchmark tool picks one data set (e.g., `CRoaring/benchmarks/realdata/census1881`).
+We have several data sets and you may pick others:
+
+```
+./build/microbenchmarks/bench benchmarks/realdata/wikileaks-noquotes 
+```
+
+You may disable some functionality for the purpose of benchmarking. For example, assuming you
+have an x64 processor, you could benchmark the code without AVX-512 even if both your processor 
+and compiler support it:
+
+```
+cmake -B buildnoavx512 -D ROARING_DISABLE_AVX512=ON -D ENABLE_ROARING_MICROBENCHMARKS=ON
+cmake --build buildnoavx512
+./buildnoavx512/microbenchmarks/bench
+```
+
+You can benchmark without AVX or AVX-512 as well:
+
+```
+cmake -B buildnoavx -D ROARING_DISABLE_AVX=ON -D ENABLE_ROARING_MICROBENCHMARKS=ON
+cmake --build buildnoavx
+./buildnoavx/microbenchmarks/bench
+```
+
 # Custom memory allocators
 For general users, CRoaring would apply default allocator without extra codes. But global memory hook is also provided for those who want a custom memory allocator. Here is an example:
 ```C
@@ -276,7 +325,15 @@ int main() {
     uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
     char *serializedbytes = malloc(expectedsize);
     roaring_bitmap_portable_serialize(r1, serializedbytes);
-    roaring_bitmap_t *t = roaring_bitmap_portable_deserialize(serializedbytes);
+    // Note: it is expected that the input follows the specification
+    // https://github.com/RoaringBitmap/RoaringFormatSpec
+    // otherwise the result may be unusable.
+    roaring_bitmap_t *t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize);
+    if(t == NULL) { return EXIT_FAILURE; }
+    const char *reason = NULL;
+    if (!roaring_bitmap_internal_validate(t, &reason)) {
+        return EXIT_FAILURE;
+    }
     assert(roaring_bitmap_equals(r1, t));  // what we recover is equal
     roaring_bitmap_free(t);
     // we can also check whether there is a bitmap at a memory location without
@@ -287,6 +344,21 @@ int main() {
            expectedsize);  // sizeofbitmap would be zero if no bitmap were found
     // we can also read the bitmap "safely" by specifying a byte size limit:
     t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize);
+    if(t == NULL) {
+        printf("Problem during deserialization.\n");
+        // We could clear any memory and close any file here.
+        return EXIT_FAILURE;
+    }
+    // We can validate the bitmap we recovered to make sure it is proper.
+    const char *reason_failure = NULL;
+    if (!roaring_bitmap_internal_validate(t, &reason_failure)) {
+        printf("safely deserialized invalid bitmap: %s\n", reason_failure);
+        // We could clear any memory and close any file here.
+        return EXIT_FAILURE;
+    }
+    // It is still necessary for the content of serializedbytes to follow
+    // the standard: https://github.com/RoaringBitmap/RoaringFormatSpec
+    // This is guaranteed when calling 'roaring_bitmap_portable_deserialize'.
     assert(roaring_bitmap_equals(r1, t));  // what we recover is equal
     roaring_bitmap_free(t);
 
@@ -330,6 +402,77 @@ int main() {
 }
 ```
 
+# Conventional bitsets (C)
+
+We support conventional bitsets (uncompressed) as part of the library.
+
+Simple example:
+
+```C
+bitset_t * b = bitset_create();
+bitset_set(b,10);
+bitset_get(b,10);// returns true
+bitset_free(b); // frees memory
+```
+
+More advanced example:
+
+```C
+    bitset_t *b = bitset_create();
+    for (int k = 0; k < 1000; ++k) {
+        bitset_set(b, 3 * k);
+    }
+    // We have bitset_count(b) == 1000.
+    // We have bitset_get(b, 3) is true
+    // You can iterate through the values:
+    size_t k = 0;
+    for (size_t i = 0; bitset_next_set_bit(b, &i); i++) {
+        // You will have i == k
+        k += 3;
+    }
+    // We support a wide range of operations on two bitsets such as
+    // bitset_inplace_symmetric_difference(b1,b2);
+    // bitset_inplace_symmetric_difference(b1,b2);
+    // bitset_inplace_difference(b1,b2);// should make no difference
+    // bitset_inplace_union(b1,b2);
+    // bitset_inplace_intersection(b1,b2);
+    // bitsets_disjoint
+    // bitsets_intersect
+```
+
+In some instances, you may want to convert a Roaring bitmap into a conventional (uncompressed) bitset.
+Indeed, bitsets have advantages such as higher query performances in some cases. The following code
+illustrates how you may do so:
+
+```C
+    roaring_bitmap_t *r1 = roaring_bitmap_create();
+    for (uint32_t i = 100; i < 100000; i+= 1 + (i%5)) {
+     roaring_bitmap_add(r1, i);
+    }
+    for (uint32_t i = 100000; i < 500000; i+= 100) {
+     roaring_bitmap_add(r1, i);
+    }
+    roaring_bitmap_add_range(r1, 500000, 600000);
+    bitset_t * bitset = bitset_create();
+    bool success = roaring_bitmap_to_bitset(r1, bitset);
+    assert(success); // could fail due to memory allocation.
+    assert(bitset_count(bitset) == roaring_bitmap_get_cardinality(r1));
+    // You can then query the bitset:
+    for (uint32_t i = 100; i < 100000; i+= 1 + (i%5)) {
+        assert(bitset_get(bitset,i));
+    }
+    for (uint32_t i = 100000; i < 500000; i+= 100) {
+        assert(bitset_get(bitset,i));
+    }
+    // you must free the memory:
+    bitset_free(bitset);
+    roaring_bitmap_free(r1);
+```
+
+You should be aware that a conventional bitset (`bitset_t *`) may use much more
+memory than a Roaring bitmap in some cases. You should run benchmarks to determine
+whether the conversion to a bitset has performance benefits in your case.
+
 # Example (C++)
 
 
@@ -372,6 +515,11 @@ int main() {
     r2.printf();
     printf("\n");
 
+    // create a new bitmap with initializer list
+    Roaring r2i = Roaring::bitmapOfList({1, 2, 3, 5, 6});
+
+    assert(r2i == r2);
+
     // we can also create a bitmap from a pointer to 32-bit integers
     const uint32_t values[] = {2, 3, 4};
     Roaring r3(3, values);
@@ -406,7 +554,10 @@ int main() {
     uint32_t expectedsize = r1.getSizeInBytes();
     char *serializedbytes = new char[expectedsize];
     r1.write(serializedbytes);
-    Roaring t = Roaring::read(serializedbytes);
+    // readSafe will not overflow, but the resulting bitmap
+    // is only valid and usable if the input follows the
+    // Roaring specification: https://github.com/RoaringBitmap/RoaringFormatSpec/
+    Roaring t = Roaring::readSafe(serializedbytes, expectedsize);
     assert(r1 == t);
     delete[] serializedbytes;
 
@@ -477,14 +628,6 @@ ctest
 ```
 
 
-To run real-data benchmark
-
-```
-./real_bitmaps_benchmark ../benchmarks/realdata/census1881
-```
-where you must adjust the path "../benchmarks/realdata/census1881" so that it points to one of the directories in the benchmarks/realdata directory.
-
-
 To check that your code abides by the style convention (make sure that ``clang-format`` is installed):
 
 ```
@@ -517,7 +660,7 @@ To build with at least Visual Studio 2017 directly in the IDE:
 - For testing, in the Standard toolbar, drop the ``Select Startup Item...`` menu and choose one of the tests. Run the test by pressing the button to the left of the dropdown.
 
 
-We have optimizations specific to AVX2 in the code, and they are turned dynamically based on the detected hardware at runtime.
+We have optimizations specific to AVX2 and AVX-512 in the code, and they are turned on dynamically based on the detected hardware at runtime.
 
 
 ## Usage (Using `conan`)
@@ -560,14 +703,25 @@ These commands will also print out instructions on how to use the library from M
 
 If you find the version of `roaring` shipped with `vcpkg` is out-of-date, feel free to report it to `vcpkg` community either by submiting an issue or by creating a PR.
 
-# AVX2-related throttling
+# SIMD-related throttling
 
-Our AVX2 code does not use floating-point numbers or multiplications, so it is not subject to turbo frequency throttling on many-core Intel processors.
+Our AVX2 code does not use floating-point numbers or multiplications, so it is not subject to turbo frequency throttling on many-core Intel processors. 
+
+Our AVX-512 code is only enabled on recent hardware (Intel Ice Lake or better and AMD Zen 4) where SIMD-specific frequency throttling is not observed.
 
 # Thread safety
 
 Like, for example, STL containers or Java's default data structures, the CRoaring library has no built-in thread support. Thus whenever you modify a bitmap in one thread, it is unsafe to query it in others. It is safe however to query bitmaps (without modifying them) from several distinct threads,  as long as you do not use the copy-on-write attribute. For example, you can safely copy a bitmap and use both copies in concurrently. One should probably avoid the use of the copy-on-write attribute in a threaded environment.
 
+Some of our users rely on "copy-on-write" (disabled by default). A bitmap with the copy-on-write flag
+set to true might generate shared containers. A shared container is just a reference to a single
+container with reference counting (we keep track of the number of shallow copies). If you copy shared
+containers over several threads, this might be unsafe due to the need to update the counter concurrently.
+Thus for shared containers, we use reference counting with an atomic counter. If the library is compiled
+as a C library (the default), we use C11 atomics. Unfortunately, Visual Studio does not support C11
+atomics at this time (though this is subject to change). To compensate, we
+use Windows-specific code in such instances (`_InterlockedDecrement` and `_InterlockedIncrement`).
+
 
 # How to best aggregate bitmaps?
 
@@ -577,8 +731,8 @@ different strategies.
 You can use `roaring_bitmap_or_many(bitmapcount, bitmaps)` or `roaring_bitmap_or_many_heap(bitmapcount, bitmaps)` or you may
 even roll your own aggregation:
 
-```
-roaring_bitmap_t *answer  = roaring_bitmap_copy(bitmaps[0]);
+```C
+roaring_bitmap_t *answer = roaring_bitmap_copy(bitmaps[0]);
 for (size_t i = 1; i < bitmapcount; i++) {
   roaring_bitmap_or_inplace(answer, bitmaps[i]);
 }
@@ -599,8 +753,9 @@ later `roaring_bitmap_or_inplace` will be very fast.
 
 You should benchmark these alternatives on your own data to decide what is best.
 
-# Python Wrapper
+# Wrappers
 
+## Python
 Tom Cornebize wrote a Python wrapper available at https://github.com/Ezibenroc/PyRoaringBitMap
 Installing it is as easy as typing...
 
@@ -608,7 +763,7 @@ Installing it is as easy as typing...
 pip install pyroaring
 ```
 
-# JavaScript Wrapper
+## JavaScript
 
 Salvatore Previti  wrote a Node/JavaScript wrapper available at https://github.com/SalvatorePreviti/roaring-node
 Installing it is as easy as typing...
@@ -617,32 +772,35 @@ Installing it is as easy as typing...
 npm install roaring
 ```
 
-# Swift Wrapper
+## Swift
 
 Jérémie Piotte wrote a [Swift wrapper](https://github.com/RoaringBitmap/SwiftRoaring).
 
 
-# C# Wrapper
+## C#
 
 Brandon Smith wrote a C# wrapper available at https://github.com/RogueException/CRoaring.Net (works for Windows and Linux under x64 processors)
 
 
-# Go (golang) Wrapper
+## Go (golang)
 
 There is a Go (golang) wrapper available at https://github.com/RoaringBitmap/gocroaring
 
-# Rust Wrapper
+## Rust
 
 Saulius Grigaliunas wrote a Rust wrapper available at https://github.com/saulius/croaring-rs
 
-# D Wrapper
+## D
 
 Yuce Tekol wrote a D wrapper available at https://github.com/yuce/droaring
 
-# Redis Module
+## Redis
 
 Antonio Guilherme Ferreira Viggiano wrote a Redis Module available at https://github.com/aviggiano/redis-roaring
 
+## Zig
+
+Justin Whear wrote a Zig wrapper available at https://github.com/jwhear/roaring-zig
 
 
 # Mailing list/discussion group
@@ -651,10 +809,9 @@ https://groups.google.com/forum/#!forum/roaring-bitmaps
 
 # References about Roaring
 
-- Daniel Lemire, Owen Kaser, Nathan Kurz, Luca Deri, Chris O'Hara, François Saint-Jacques, Gregory Ssi-Yan-Kai, Roaring Bitmaps: Implementation of an Optimized Software Library, Software: Practice and Experience (to appear) [arXiv:1709.07821](https://arxiv.org/abs/1709.07821)
+- Daniel Lemire, Owen Kaser, Nathan Kurz, Luca Deri, Chris O'Hara, François Saint-Jacques, Gregory Ssi-Yan-Kai, Roaring Bitmaps: Implementation of an Optimized Software Library, Software: Practice and Experience Volume 48, Issue 4 April 2018 Pages 867-895 [arXiv:1709.07821](https://arxiv.org/abs/1709.07821)
 -  Samy Chambi, Daniel Lemire, Owen Kaser, Robert Godin,
 Better bitmap performance with Roaring bitmaps,
-Software: Practice and Experience Volume 46, Issue 5, pages 709–719, May 2016
-http://arxiv.org/abs/1402.6407 This paper used data from http://lemire.me/data/realroaring2014.html
-- Daniel Lemire, Gregory Ssi-Yan-Kai, Owen Kaser, Consistently faster and smaller compressed bitmaps with Roaring, Software: Practice and Experience (accepted in 2016, to appear) http://arxiv.org/abs/1603.06549
+Software: Practice and Experience Volume 46, Issue 5, pages 709–719, May 2016  [arXiv:1402.6407](http://arxiv.org/abs/1402.6407)
+- Daniel Lemire, Gregory Ssi-Yan-Kai, Owen Kaser, Consistently faster and smaller compressed bitmaps with Roaring, Software: Practice and Experience Volume 46, Issue 11, pages 1547-1569, November 2016 [arXiv:1603.06549](http://arxiv.org/abs/1603.06549)
 - Samy Chambi, Daniel Lemire, Robert Godin, Kamel Boukhalfa, Charles Allen, Fangjin Yang, Optimizing Druid with Roaring bitmaps, IDEAS 2016, 2016. http://r-libre.teluq.ca/950/
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..1d9c45c86
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,9 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+Please use the following contact information for reporting a vulnerability:
+
+- [Daniel Lemire](https://www.teluq.ca/siteweb/univ/en/dlemire.html) - daniel@lemire.me
+
+
diff --git a/amalgamation.sh b/amalgamation.sh
index 1281384ef..545f1f7ed 100755
--- a/amalgamation.sh
+++ b/amalgamation.sh
@@ -5,7 +5,11 @@
 ########################################################################
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 
-timestamp=$(date)  # capture to label files with their generation time
+case $SCRIPTPATH in
+    (*\ *) echo "Path ($SCRIPTPATH) cannot contain whitespace"; exit 1 ;;
+esac
+
+timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")  # capture to label files with their generation time
 
 function newline {
     echo ""
@@ -31,6 +35,8 @@ DEMOCPP="amalgamation_demo.cpp"
 ALL_PUBLIC_H="
 $SCRIPTPATH/include/roaring/roaring_version.h
 $SCRIPTPATH/include/roaring/roaring_types.h
+$SCRIPTPATH/include/roaring/portability.h
+$SCRIPTPATH/include/roaring/bitset/bitset.h
 $SCRIPTPATH/include/roaring/roaring.h
 $SCRIPTPATH/include/roaring/memory.h
 "
@@ -43,12 +49,11 @@ $SCRIPTPATH/cpp/roaring64map.hh
 "
 
 # internal .h files => These are used in the implementation but aren't part of
-# the API.  They're all embedded at the head of the amalgamated C file, and
+# the API.  They are all embedded at the head of the amalgamated C file, and
 # need to be in this order.
 #
 ALL_PRIVATE_H="
 $SCRIPTPATH/include/roaring/isadetection.h
-$SCRIPTPATH/include/roaring/portability.h
 $SCRIPTPATH/include/roaring/containers/perfparameters.h
 $SCRIPTPATH/include/roaring/containers/container_defs.h
 $SCRIPTPATH/include/roaring/array_util.h
@@ -67,7 +72,6 @@ $SCRIPTPATH/include/roaring/containers/mixed_union.h
 $SCRIPTPATH/include/roaring/containers/mixed_xor.h
 $SCRIPTPATH/include/roaring/containers/containers.h
 $SCRIPTPATH/include/roaring/roaring_array.h
-$SCRIPTPATH/include/roaring/misc/configreport.h
 "
 
 # .c implementation files
@@ -76,13 +80,13 @@ $SCRIPTPATH/include/roaring/misc/configreport.h
 # has the definitions available from all the header files.  Since the order of
 # the top level declarations doesn't matter after that point, the file list is
 # generated automatically from git-tracked C files in the /src/ directory.
+# Sort them so every run uses the same order.
 #
-ALL_PRIVATE_C=$( ( \
+ALL_PRIVATE_C=$( ( ( \
     [ -d $SCRIPTPATH/.git ] \
         && ( type git >/dev/null 2>&1 ) \
-        && ( git ls-files $SCRIPTPATH/src/*.c $SCRIPTPATH/src/**/*c ) \
-    ) || ( find $SCRIPTPATH/src -name '*.c' ) )
-
+        && ( git -C $SCRIPTPATH ls-files 'src/*.c' ) \
+    ) || ( find $SCRIPTPATH/src -name '*.c' ) ) | sort )
 # Verify up-front that all the files exist
 #
 for i in ${ALL_PUBLIC_H} ${ALL_PUBLIC_HH} ${ALL_PRIVATE_H} ${ALL_PRIVATE_C}; do
@@ -166,13 +170,21 @@ echo "Creating ${DEMOC}..."
 
     cat <<< '
 #include <stdio.h>
+#include <stdlib.h>
 #include "roaring.c"
 int main() {
   roaring_bitmap_t *r1 = roaring_bitmap_create();
   for (uint32_t i = 100; i < 1000; i++) roaring_bitmap_add(r1, i);
   printf("cardinality = %d\n", (int) roaring_bitmap_get_cardinality(r1));
   roaring_bitmap_free(r1);
-  return 0;
+
+  bitset_t *b = bitset_create();
+  for (int k = 0; k < 1000; ++k) {
+        bitset_set(b, 3 * k);
+  }
+  printf("%zu \n", bitset_count(b));
+  bitset_free(b);
+  return EXIT_SUCCESS;
 }
 '
 } > "${DEMOC}"
@@ -242,10 +254,10 @@ CPPBIN=${DEMOCPP%%.*}
 echo "The interface is found in the file 'include/roaring/roaring.h'."
 newline
 echo "For C, try:"
-echo "cc -march=native -O3 -std=c11  -o ${CBIN} ${DEMOC}  && ./${CBIN} "
+echo "cc -O3 -std=c11  -o ${CBIN} ${DEMOC}  && ./${CBIN} "
 newline
 echo "For C++, try:"
-echo "c++ -march=native -O3 -std=c++11 -o ${CPPBIN} ${DEMOCPP}  && ./${CPPBIN} "
+echo "c++ -O3 -std=c++11 -o ${CPPBIN} ${DEMOCPP}  && ./${CPPBIN} "
 
 lowercase(){
     echo "$1" | tr 'A-Z' 'a-z'
@@ -257,8 +269,8 @@ newline
 echo "You can build a shared library with the following command:"
 
 if [ $OS == "darwin" ]; then
-  echo "cc -march=native -O3 -std=c11 -shared -o libroaring.dylib -fPIC roaring.c"
+  echo "cc  -O3 -std=c11 -shared -o libroaring.dylib -fPIC roaring.c"
 else
-  echo "cc -march=native -O3 -std=c11 -shared -o libroaring.so -fPIC roaring.c"
+  echo "cc -O3 -std=c11 -shared -o libroaring.so -fPIC roaring.c"
 fi
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 0d5fabcaa..71be77ee7 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -12,6 +12,8 @@ if(NOT WIN32)
     add_c_benchmark(intersect_range_benchmark)
     target_link_libraries(add_benchmark m)
     add_c_benchmark(frozen_benchmark)
+    add_c_benchmark(containsmulti_benchmark)
+    add_cpp_benchmark(fastunion_benchmark)
 endif()
 add_c_benchmark(bitset_container_benchmark)
 add_c_benchmark(array_container_benchmark)
diff --git a/benchmarks/add_benchmark.c b/benchmarks/add_benchmark.c
index cc143e3dd..574b285c8 100644
--- a/benchmarks/add_benchmark.c
+++ b/benchmarks/add_benchmark.c
@@ -96,15 +96,33 @@ void run_test(uint32_t spanlen, uint32_t intvlen, double density, order_t order)
     printf("          %6.1f\n", array_min(results, num_passes));
 
     printf("  roaring_bitmap_add_many():");
+    for (int p = 0; p < num_passes; p++) {
+        roaring_bitmap_t *r = roaring_bitmap_create();
+        uint32_t values[intvlen * count];
+        for (int64_t i = 0; i < count; i++) {
+            for (uint32_t j = 0; j < intvlen; j++) {
+                values[i * intvlen + j] = offsets[i] + j;
+            }
+        }
+        RDTSC_START(cycles_start);
+        for (int64_t i = 0; i < count; i++) {
+            roaring_bitmap_add_many(r, intvlen, values + (i * intvlen));
+        }
+        RDTSC_FINAL(cycles_final);
+        results[p] = (cycles_final - cycles_start) * 1.0 / count / intvlen;
+        roaring_bitmap_free(r);
+    }
+    printf("     %6.1f\n", array_min(results, num_passes));
+
+    printf("  roaring_bitmap_add_bulk():");
     for (int p = 0; p < num_passes; p++) {
         roaring_bitmap_t *r = roaring_bitmap_create();
         RDTSC_START(cycles_start);
-        uint32_t values[intvlen];
+        roaring_bulk_context_t context = {0};
         for (int64_t i = 0; i < count; i++) {
             for (uint32_t j = 0; j < intvlen; j++) {
-                values[j] = offsets[i] + j;
+                roaring_bitmap_add_bulk(r, &context, offsets[i] + j);
             }
-            roaring_bitmap_add_many(r, intvlen, values);
         }
         RDTSC_FINAL(cycles_final);
         results[p] = (cycles_final - cycles_start) * 1.0 / count / intvlen;
diff --git a/benchmarks/array_container_benchmark.c b/benchmarks/array_container_benchmark.c
index a76844ac9..fc6a2f196 100644
--- a/benchmarks/array_container_benchmark.c
+++ b/benchmarks/array_container_benchmark.c
@@ -23,7 +23,7 @@ void array_cache_flush(array_container_t* B) { (void)B; }
 // tries to put the array in cache
 void array_cache_prefetch(array_container_t* B) {
 #if !CROARING_REGULAR_VISUAL_STUDIO
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
     const int32_t CACHELINESIZE =
         computecacheline();  // 64 bytes per cache line
 #else
@@ -137,8 +137,8 @@ int main() {
     printf("intersection cardinality = %d \n", answer);
     BEST_TIME(intersection_test(B1, B2, BO), answer, repeat, answer);
     printf("==intersection and union test 2 \n");
-    array_container_clear(B1);
-    array_container_clear(B2);
+    B1->cardinality = 0;
+    B2->cardinality = 0;
     for (int x = 0; x < 1 << 16; x += 16) {
         array_container_add(B1, (uint16_t)x);
     }
diff --git a/benchmarks/benchmark.h b/benchmarks/benchmark.h
index fee613fd9..e3a6ad166 100644
--- a/benchmarks/benchmark.h
+++ b/benchmarks/benchmark.h
@@ -37,69 +37,39 @@
         (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                      \
     } while (0)
 
-#elif defined(__linux__) && defined(__GLIBC__)
-
-#include <time.h>
-#ifdef CLOCK_THREAD_CPUTIME_ID
-#define RDTSC_START(cycles) \
-  do { \
-    struct timespec ts; \
-    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); \
-    cycles = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; \
-  } while (0)
-
-#define RDTSC_FINAL(cycles) \
-  do { \
-    struct timespec ts; \
-    clock_gettime(CLOCK_REALTIME, &ts); \
-    cycles = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; \
-  } while (0)
-
-#elif defined(CLOCK_REALTIME)  // #ifdef CLOCK_THREAD_CPUTIME_ID
-#define RDTSC_START(cycles) \
-  do { \
-    struct timespec ts; \
-    clock_gettime(CLOCK_REALTIME, &ts); \
-    cycles = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; \
-  } while (0)
-
-#define RDTSC_FINAL(cycles) \
-  do { \
-    struct timespec ts; \
-    clock_gettime(CLOCK_REALTIME, &ts); \
-    cycles = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; \
-  } while (0)
-
-#else
-#define RDTSC_START(cycles) \
-  do { \
-    cycles = clock(); \
-  } while(0)
+#else  // defined(CROARING_IS_X64) && defined(ROARING_INLINE_ASM)
+
+#if defined(CLOCK_THREAD_CPUTIME_ID)
+#define RDTSC_CLOCK_ID CLOCK_THREAD_CPUTIME_ID
+#elif defined(CLOCK_MONOTONIC)
+#define RDTSC_CLOCK_ID CLOCK_MONOTONIC
+#elif defined(CLOCK_REALTIME)
+#define RDTSC_CLOCK_ID CLOCK_REALTIME
+#endif
 
-#define RDTSC_FINAL(cycles) \
-  do { \
-    cycles = clock(); \
-  } while(0)
+#if defined(RDTSC_CLOCK_ID)
+#define RDTSC_START(cycles)                                     \
+    do {                                                        \
+        struct timespec ts;                                     \
+        clock_gettime(RDTSC_CLOCK_ID, &ts);                     \
+        cycles = ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; \
+    } while (0)
 
-#endif // #ifdef CLOCK_THREAD_CPUTIME_ID
+#define RDTSC_FINAL(cycles) RDTSC_START(cycles)
 
-#else
+#else  // defined(RDTSC_CLOCK_ID)
 
 /**
-* Other architectures do not support rdtsc ?
+* Fall back to the `clock` function
 */
-#include <time.h>
-
 #define RDTSC_START(cycles) \
     do {                    \
         cycles = clock();   \
     } while (0)
 
-#define RDTSC_FINAL(cycles) \
-    do {                    \
-        cycles = clock();   \
-    } while (0)
+#define RDTSC_FINAL(cycles) RDTSC_START(cycles)
 
+#endif
 #endif
 
 /*
diff --git a/benchmarks/bitset_container_benchmark.c b/benchmarks/bitset_container_benchmark.c
index 3d418ee2c..932398a97 100644
--- a/benchmarks/bitset_container_benchmark.c
+++ b/benchmarks/bitset_container_benchmark.c
@@ -29,7 +29,7 @@ void bitset_cache_flush(bitset_container_t* B) { (void)B; }
 // tries to put array of words in cache
 void bitset_cache_prefetch(bitset_container_t* B) {
 #if !CROARING_REGULAR_VISUAL_STUDIO
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
     const int32_t CACHELINESIZE =
         computecacheline();  // 64 bytes per cache line
 #else
@@ -69,7 +69,7 @@ int set_test(bitset_container_t* B) {
 int unset_test(bitset_container_t* B) {
     int x;
     for (x = 0; x < 1 << 16; x += 3) {
-        bitset_container_unset(B, (uint16_t)x);
+        bitset_container_remove(B, (uint16_t)x);
     }
     return 0;
 }
diff --git a/benchmarks/containsmulti_benchmark.c b/benchmarks/containsmulti_benchmark.c
new file mode 100644
index 000000000..e92d82e5a
--- /dev/null
+++ b/benchmarks/containsmulti_benchmark.c
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#include <roaring/roaring.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "benchmark.h"
+#include "random.h"
+#include "numbersfromtextfiles.h"
+
+void contains_multi_via_contains(roaring_bitmap_t* bm, const uint32_t* values, bool* results, const size_t count) {
+    for (size_t i = 0; i < count; ++i) {
+        results[i] = roaring_bitmap_contains(bm, values[i]);
+    }
+}
+
+void contains_multi_bulk(roaring_bitmap_t* bm, const uint32_t* values, bool* results, const size_t count) {
+    roaring_bulk_context_t context = {0};
+    for (size_t i = 0; i < count; ++i) {
+        results[i] = roaring_bitmap_contains_bulk(bm, &context, values[i]);
+    }
+}
+
+int compare_uint32(const void* a, const void* b) {
+    uint32_t arg1 = *(const uint32_t*)a;
+    uint32_t arg2 = *(const uint32_t*)b;
+    if (arg1 < arg2) return -1;
+    if (arg1 > arg2) return 1;
+    return 0;
+}
+
+int main(int argc, char* argv[]) {
+    (void)&read_all_integer_files; // suppress unused warning
+    // Loads every input file into one bitmap, then times membership queries per file.
+    if (argc < 2) {
+        printf("Usage: %s <comma_separated_integers_file> ...\n", argv[0]);
+        printf("Example: %s ~/CRoaring/benchmarks/realdata/weather_sept_85/*\n", argv[0]);
+        return 1;
+    }
+
+    size_t fields = argc-1;
+    uint32_t* values[argc];
+    size_t count[argc];
+
+    roaring_bitmap_t* bm = roaring_bitmap_create();
+    for (int i = 1; i < argc; i++) {
+        size_t t_count = 0;
+        uint32_t* t_values = read_integer_file(argv[i], &t_count);
+        if (t_count == 0) {
+            printf("No integers found in %s\n", argv[i]);
+            return 1;
+        }
+        roaring_bitmap_add_many(bm, t_count, t_values);
+
+        shuffle_uint32(t_values, t_count);
+
+        values[i-1] = t_values;
+        count[i-1] = t_count;
+    }
+    //roaring_bitmap_run_optimize(bm);
+
+    printf("Data:\n");
+    printf("  cardinality: %"PRIu64"\n", roaring_bitmap_get_cardinality(bm));
+    printf("  buckets: %d\n", (int)bm->high_low_container.size);
+    printf("  range: %"PRIu32"-%"PRIu32"\n", roaring_bitmap_minimum(bm) >> 16, roaring_bitmap_maximum(bm) >> 16);
+    // Pass p reads values[p]/count[p]: never run more passes than files were loaded.
+    const int num_passes = fields < 10 ? (int)fields : 10;
+    printf("Cycles/element: %d\n", num_passes);
+    uint64_t cycles_start, cycles_final;
+
+    printf("                          roaring_bitmap_contains:");
+    for (int p = 0; p < num_passes; p++) {
+        bool result[count[p]];
+        RDTSC_START(cycles_start);
+        contains_multi_via_contains(bm, values[p], result, count[p]);
+        RDTSC_FINAL(cycles_final);
+        printf(" %10f", (cycles_final - cycles_start) * 1.0 / count[p]);
+    }
+    printf("\n");
+
+    printf("                     roaring_bitmap_contains_bulk:");
+    for (int p = 0; p < num_passes; p++) {
+        bool result[count[p]];
+        RDTSC_START(cycles_start);
+        contains_multi_bulk(bm, values[p], result, count[p]);
+        RDTSC_FINAL(cycles_final);
+        printf(" %10f", (cycles_final - cycles_start) * 1.0 / count[p]);
+    }
+    printf("\n");
+
+    // sort input array
+    for (size_t i = 0; i < fields; ++i) {
+        qsort(values[i], count[i], sizeof(uint32_t), compare_uint32);
+    }
+
+    printf("        roaring_bitmap_contains with sorted input:");
+    for (int p = 0; p < num_passes; p++) {
+        bool result[count[p]];
+        RDTSC_START(cycles_start);
+        contains_multi_via_contains(bm, values[p], result, count[p]);
+        RDTSC_FINAL(cycles_final);
+        printf(" %10f", (cycles_final - cycles_start) * 1.0 / count[p]);
+    }
+    printf("\n");
+
+    printf("   roaring_bitmap_contains_bulk with sorted input:");
+    for (int p = 0; p < num_passes; p++) {
+        bool result[count[p]];
+        RDTSC_START(cycles_start);
+        contains_multi_bulk(bm, values[p], result, count[p]);
+        RDTSC_FINAL(cycles_final);
+        printf(" %10f", (cycles_final - cycles_start) * 1.0 / count[p]);
+    }
+    printf("\n");
+
+    roaring_bitmap_free(bm);
+    for (size_t i = 0; i < fields; ++i) {
+        free(values[i]);
+    }
+    return 0;
+}
diff --git a/benchmarks/fastunion_benchmark.cpp b/benchmarks/fastunion_benchmark.cpp
new file mode 100644
index 000000000..ad693f6b8
--- /dev/null
+++ b/benchmarks/fastunion_benchmark.cpp
@@ -0,0 +1,101 @@
+#include <stdio.h>
+#include <iostream>
+#include <roaring/roaring.h>
+#include "roaring64map.hh"
+#include "benchmark.h"
+
+using roaring::Roaring64Map;
+
+namespace {
+const uint32_t num_iterations = 10;
+
+const uint32_t num_bitmaps = 100;
+const uint32_t num_outer_slots = 1000;
+const uint32_t num_inner_values = 2000;
+
+/**
+ * Creates the input maps for the test. This method creates 'num_bitmaps' maps,
+ * each of which contains 'num_outer_slots' 32-bit Roarings, each of which
+ * contains 'num_inner_values' bits. The inner bits are separated by
+ * 'num_bitmaps' and their starting offset is offset by 1 from one bitmap to the
+ * next. The intent is that in the result of the union, all the bits in a given
+ * 32 bit Roaring slot will end up densely packed together, which seemed like an
+ * interesting thing to do.
+ */
+std::vector<Roaring64Map> makeMaps() {
+    std::vector<Roaring64Map> result;
+    for (uint32_t bm_index = 0; bm_index != num_bitmaps; ++bm_index) {
+        Roaring64Map roaring;
+
+        for (uint32_t slot = 0; slot != num_outer_slots; ++slot) {
+            auto value = (uint64_t(slot) << 32) + bm_index + 0x98765432;
+            for (uint32_t inner_index = 0; inner_index != num_inner_values;
+                 ++inner_index) {
+                roaring.add(value);
+                value += num_bitmaps;
+            }
+        }
+        result.push_back(std::move(roaring));
+    }
+    return result;
+}
+
+Roaring64Map legacy_fastunion(size_t n, const Roaring64Map **inputs) {
+    Roaring64Map ans;
+    // not particularly fast
+    for (size_t lcv = 0; lcv < n; ++lcv) {
+        ans |= *(inputs[lcv]);
+    }
+    return ans;
+}
+
+void benchmarkLegacyFastUnion() {
+    std::cout << "*** Legacy fastunion ***\n";
+    auto maps = makeMaps();
+
+    // Need pointers to the above
+    std::vector<const Roaring64Map*> result_ptrs;
+    for (auto &map : maps) {
+        result_ptrs.push_back(&map);
+    }
+
+    for (uint32_t iter = 0; iter < num_iterations; ++iter) {
+        uint64_t cycles_start, cycles_final;
+        RDTSC_START(cycles_start);
+        auto result = legacy_fastunion(result_ptrs.size(), result_ptrs.data());
+        RDTSC_FINAL(cycles_final);
+
+        auto num_cycles = cycles_final - cycles_start;
+        uint64_t cycles_per_map = num_cycles / maps.size();
+        std::cout << "Iteration " << iter << ": " << cycles_per_map << " per map\n";
+    }
+}
+
+void benchmarkNewFastUnion() {
+    std::cout << "*** New fastunion() ***\n";
+    auto maps = makeMaps();
+
+    // Need pointers to the above
+    std::vector<const Roaring64Map*> result_ptrs;
+    for (auto &map : maps) {
+        result_ptrs.push_back(&map);
+    }
+
+    for (uint32_t iter = 0; iter < num_iterations; ++iter) {
+        uint64_t cycles_start, cycles_final;
+        RDTSC_START(cycles_start);
+        auto result =
+            Roaring64Map::fastunion(result_ptrs.size(), result_ptrs.data());
+        RDTSC_FINAL(cycles_final);
+
+        auto num_cycles = cycles_final - cycles_start;
+        uint64_t cycles_per_map = num_cycles / maps.size();
+        std::cout << "Iteration " << iter << ": " << cycles_per_map << " per map\n";
+    }
+}
+}  // namespace
+
+int main() {
+    benchmarkLegacyFastUnion();
+    benchmarkNewFastUnion();
+}
diff --git a/benchmarks/real_bitmaps_benchmark.c b/benchmarks/real_bitmaps_benchmark.c
index 87840a926..7d8c4847e 100644
--- a/benchmarks/real_bitmaps_benchmark.c
+++ b/benchmarks/real_bitmaps_benchmark.c
@@ -174,6 +174,46 @@ int main(int argc, char **argv) {
            " cycles\n",
            count, total_count, cycles_final - cycles_start);
 
+    uint64_t portable_cycle_count = 0, portable_frozen_cycle_count = 0,
+      frozen_cycle_count = 0;
+    for(int i = 0; i < (int)count; i++) {
+        int size = roaring_bitmap_portable_size_in_bytes(bitmaps[i]);
+        char *buf = malloc(size);
+        roaring_bitmap_portable_serialize(bitmaps[i], buf);
+
+        int frozen_size = roaring_bitmap_frozen_size_in_bytes(bitmaps[i]);
+        char *frozen_buf = roaring_aligned_malloc(32, frozen_size);
+        roaring_bitmap_frozen_serialize(bitmaps[i], frozen_buf);
+
+        RDTSC_START(cycles_start);
+        roaring_bitmap_t *r1 = roaring_bitmap_portable_deserialize(buf);
+        RDTSC_FINAL(cycles_final);
+        portable_cycle_count += cycles_final - cycles_start;
+
+        RDTSC_START(cycles_start);
+        roaring_bitmap_t *r2 = roaring_bitmap_portable_deserialize_frozen(buf);
+        RDTSC_FINAL(cycles_final);
+        portable_frozen_cycle_count += cycles_final - cycles_start;
+
+        RDTSC_START(cycles_start);
+        const roaring_bitmap_t *r3 = roaring_bitmap_frozen_view(frozen_buf, frozen_size);
+        RDTSC_FINAL(cycles_final);
+        frozen_cycle_count += cycles_final - cycles_start;
+
+        roaring_bitmap_free(r1);
+        roaring_bitmap_free(r2);
+        roaring_bitmap_free(r3);
+        free(buf);
+        roaring_aligned_free(frozen_buf);
+    }
+
+    printf("Deserializing %zu bitmaps took %" PRIu64 " cycles for portable format\n",
+           count, portable_cycle_count);
+    printf("Deserializing %zu bitmaps took %" PRIu64 " cycles for portable frozen format\n",
+           count, portable_frozen_cycle_count);
+    printf("Deserializing %zu bitmaps took %" PRIu64 " cycles for frozen format\n",
+           count, frozen_cycle_count);
+
     for (int i = 0; i < (int)count; ++i) {
         free(numbers[i]);
         numbers[i] = NULL;  // paranoid
diff --git a/benchmarks/run_container_benchmark.c b/benchmarks/run_container_benchmark.c
index b04170257..e2fdc495a 100644
--- a/benchmarks/run_container_benchmark.c
+++ b/benchmarks/run_container_benchmark.c
@@ -23,7 +23,7 @@ void run_cache_flush(run_container_t* B) { (void)B; }
 // tries to put array in cache
 void run_cache_prefetch(run_container_t* B) {
 #if !CROARING_REGULAR_VISUAL_STUDIO
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
     const int32_t CACHELINESIZE =
         computecacheline();  // 64 bytes per cache line
 #else
@@ -141,8 +141,8 @@ int main() {
     printf("intersection cardinality = %d \n", answer);
     BEST_TIME(intersection_test(B1, B2, BO), answer, repeat, answer);
     printf("==intersection and union test 2 \n");
-    run_container_clear(B1);
-    run_container_clear(B2);
+    B1->n_runs = 0;
+    B2->n_runs = 0;
     for (int x = 0; x < (1 << 16); x += 64) {
         int length = x % 11;
         for (int y = 0; y < length; ++y)
diff --git a/cmake/import.cmake b/cmake/import.cmake
new file mode 100644
index 000000000..a9b6ffe5d
--- /dev/null
+++ b/cmake/import.cmake
@@ -0,0 +1,52 @@
+set(dep_root "${PROJECT_SOURCE_DIR}/dependencies/.cache")  # default cache, inside the source tree
+if(DEFINED ENV{roaring_DEPENDENCY_CACHE_DIR})  # the environment variable overrides the default
+  set(dep_root "$ENV{roaring_DEPENDENCY_CACHE_DIR}")
+endif()
+
+function(import_dependency NAME GITHUB_REPO COMMIT)
+  message(STATUS "Importing ${NAME} (${GITHUB_REPO}@${COMMIT})")
+  set(target "${dep_root}/${NAME}")
+
+  # If the folder exists in the cache, then we assume that everything is as
+  # should be and do nothing
+  if(EXISTS "${target}")
+    set("${NAME}_SOURCE_DIR" "${target}" PARENT_SCOPE)
+    return()
+  endif()
+
+  set(zip_url "https://github.com/${GITHUB_REPO}/archive/${COMMIT}.zip")
+  set(archive "${dep_root}/archive.zip")
+  set(dest "${dep_root}/_extract")
+
+  file(DOWNLOAD "${zip_url}" "${archive}")
+  file(MAKE_DIRECTORY "${dest}")
+  execute_process(
+          WORKING_DIRECTORY "${dest}"
+          COMMAND "${CMAKE_COMMAND}" -E tar xf "${archive}")
+  file(REMOVE "${archive}")
+
+  # GitHub archives only ever have one folder component at the root, so this
+  # will always match that single folder
+  file(GLOB dir LIST_DIRECTORIES YES "${dest}/*")
+
+  file(RENAME "${dir}" "${target}")
+
+  set("${NAME}_SOURCE_DIR" "${target}" PARENT_SCOPE)
+endfunction()
+
+# Delegates to the dependency
+macro(add_dependency NAME)
+  if(NOT DEFINED "${NAME}_SOURCE_DIR")
+    message(FATAL_ERROR "Missing ${NAME}_SOURCE_DIR variable")
+  endif()
+
+  add_subdirectory("${${NAME}_SOURCE_DIR}" "${PROJECT_BINARY_DIR}/_deps/${NAME}" EXCLUDE_FROM_ALL)
+endmacro()
+
+function(set_off NAME)
+  set("${NAME}" OFF CACHE INTERNAL "")
+endfunction()
+
+function(set_on NAME)
+  set("${NAME}" ON CACHE INTERNAL "")
+endfunction()
\ No newline at end of file
diff --git a/cpp/roaring.hh b/cpp/roaring.hh
index 8ee9e7b30..f14150baf 100644
--- a/cpp/roaring.hh
+++ b/cpp/roaring.hh
@@ -7,6 +7,7 @@ A C++ header for Roaring Bitmaps.
 #include <cstdarg>
 
 #include <algorithm>
+#include <initializer_list>
 #include <new>
 #include <stdexcept>
 #include <string>
@@ -41,6 +42,28 @@ namespace roaring {
 
 class RoaringSetBitForwardIterator;
 
+/**
+ * A bit of context usable with `*Bulk()` functions.
+ *
+ * A context may only be used with a single bitmap, and any modification to a bitmap
+ * (other than modifications performed with `Bulk()` functions with the context
+ * passed) will invalidate any contexts associated with that bitmap.
+ */
+class BulkContext {
+   public:
+    friend class Roaring;
+    using roaring_bitmap_bulk_context_t = api::roaring_bulk_context_t;
+    BulkContext() : context_{nullptr, 0, 0, 0} {}
+
+    BulkContext(const BulkContext&) = delete;
+    BulkContext& operator=(const BulkContext&) = delete;
+    BulkContext(BulkContext&&) noexcept = default;
+    BulkContext& operator=(BulkContext&&) noexcept = default;
+
+   private:
+    roaring_bitmap_bulk_context_t context_;
+};
+
 class Roaring {
     typedef api::roaring_bitmap_t roaring_bitmap_t;  // class-local name alias
 
@@ -55,14 +78,22 @@ public:
     }
 
     /**
-     * Construct a bitmap from a list of integer values.
+     * Construct a bitmap from a list of 32-bit integer values.
      */
     Roaring(size_t n, const uint32_t *data) : Roaring() {
         api::roaring_bitmap_add_many(&roaring, n, data);
     }
 
     /**
-     * Copy constructor
+     * Construct a bitmap from an initializer list.
+     */
+    Roaring(std::initializer_list<uint32_t> l) : Roaring() {
+        addMany(l.size(), l.begin());
+    }
+
+    /**
+     * Copy constructor.
+     * It may throw std::runtime_error if there is insufficient memory.
      */
     Roaring(const Roaring &r) : Roaring() {
         if (!api::roaring_bitmap_overwrite(&roaring, &r.roaring)) {
@@ -74,8 +105,8 @@ public:
     }
 
     /**
-     * Move constructor. The moved object remains valid, i.e.
-     * all methods can still be called on it.
+     * Move constructor. The moved-from object remains valid but empty, i.e.
+     * it behaves as though it was just freshly constructed.
      */
     Roaring(Roaring &&r) noexcept : roaring(r.roaring) {
         //
@@ -99,7 +130,7 @@ public:
     }
 
     /**
-     * Construct a bitmap from a list of integer values.
+     * Construct a bitmap from a list of uint32_t values.
      */
     static Roaring bitmapOf(size_t n, ...) {
         Roaring ans;
@@ -112,69 +143,124 @@ public:
         return ans;
     }
 
+    /**
+     * Construct a bitmap from a list of uint32_t values.
+     * E.g., bitmapOfList({1,2,3}).
+     */
+    static Roaring bitmapOfList(std::initializer_list<uint32_t> l) {
+        Roaring ans;
+        ans.addMany(l.size(), l.begin());
+        return ans;
+    }
+
     /**
      * Add value x
      */
-    void add(uint32_t x) { api::roaring_bitmap_add(&roaring, x); }
+    void add(uint32_t x) noexcept { api::roaring_bitmap_add(&roaring, x); }
 
     /**
      * Add value x
      * Returns true if a new value was added, false if the value was already
      * existing.
      */
-    bool addChecked(uint32_t x) {
+    bool addChecked(uint32_t x) noexcept {
         return api::roaring_bitmap_add_checked(&roaring, x);
     }
 
     /**
-     * Add all values from x (included) to y (excluded)
+     * Add all values in range [min, max)
      */
-    void addRange(const uint64_t x, const uint64_t y)  {
-        return api::roaring_bitmap_add_range(&roaring, x, y);
+    void addRange(const uint64_t min, const uint64_t max) noexcept {
+        return api::roaring_bitmap_add_range(&roaring, min, max);
+    }
+
+    /**
+     * Add all values in range [min, max]
+     */
+    void addRangeClosed(const uint32_t min, const uint32_t max) noexcept {
+        return api::roaring_bitmap_add_range_closed(&roaring, min, max);
     }
 
     /**
      * Add value n_args from pointer vals
      */
-    void addMany(size_t n_args, const uint32_t *vals) {
+    void addMany(size_t n_args, const uint32_t *vals) noexcept {
         api::roaring_bitmap_add_many(&roaring, n_args, vals);
     }
 
+    /**
+     * Add value x, using context from a previous insert for speed
+     * optimization.
+     *
+     * `context` will be used to store information between calls to make bulk
+     * operations faster. `context` should be default-initialized before the
+     * first call to this function.
+     */
+    void addBulk(BulkContext &context, uint32_t x) noexcept {
+        api::roaring_bitmap_add_bulk(&roaring, &context.context_, x);
+    }
+
+    /**
+     * Check if item x is present, using context from a previous insert or search
+     * for speed optimization.
+     *
+     * `context` will be used to store information between calls to make bulk
+     * operations faster. `context` should be default-initialized before the
+     * first call to this function.
+     */
+    bool containsBulk(BulkContext& context, uint32_t x) const noexcept {
+        return api::roaring_bitmap_contains_bulk(&roaring, &context.context_, x);
+    }
+
     /**
      * Remove value x
      */
-    void remove(uint32_t x) { api::roaring_bitmap_remove(&roaring, x); }
+    void remove(uint32_t x) noexcept { api::roaring_bitmap_remove(&roaring, x); }
 
     /**
      * Remove value x
      * Returns true if a new value was removed, false if the value was not
      * existing.
      */
-    bool removeChecked(uint32_t x) {
+    bool removeChecked(uint32_t x) noexcept {
         return api::roaring_bitmap_remove_checked(&roaring, x);
     }
 
+    /**
+     * Remove all values in range [min, max)
+     */
+    void removeRange(uint64_t min, uint64_t max) noexcept {
+        return api::roaring_bitmap_remove_range(&roaring, min, max);
+    }
+
+    /**
+     * Remove all values in range [min, max]
+     */
+    void removeRangeClosed(uint32_t min, uint32_t max) noexcept {
+        return api::roaring_bitmap_remove_range_closed(&roaring, min, max);
+    }
+
     /**
      * Return the largest value (if not empty)
      */
-    uint32_t maximum() const { return api::roaring_bitmap_maximum(&roaring); }
+    uint32_t maximum() const noexcept { return api::roaring_bitmap_maximum(&roaring); }
 
     /**
      * Return the smallest value (if not empty)
      */
-    uint32_t minimum() const { return api::roaring_bitmap_minimum(&roaring); }
+    uint32_t minimum() const noexcept { return api::roaring_bitmap_minimum(&roaring); }
 
     /**
      * Check if value x is present
      */
-    bool contains(uint32_t x) const {
+    bool contains(uint32_t x) const noexcept {
         return api::roaring_bitmap_contains(&roaring, x);
     }
 
     /**
      * Check if all values from x (included) to y (excluded) are present
      */
-    bool containsRange(const uint64_t x, const uint64_t y) const {
+    bool containsRange(const uint64_t x, const uint64_t y) const noexcept {
         return api::roaring_bitmap_contains_range(&roaring, x, y);
     }
 
@@ -202,6 +288,7 @@ public:
     /**
      * Copies the content of the provided bitmap, and
      * discard the current content.
+     * It may throw std::runtime_error if there is insufficient memory.
      */
     Roaring &operator=(const Roaring &r) {
         if (!api::roaring_bitmap_overwrite(&roaring, &r.roaring)) {
@@ -228,12 +315,24 @@ public:
         return *this;
     }
 
+    /**
+     * Assignment from an initializer list.
+     */
+    Roaring &operator=(std::initializer_list<uint32_t> l) {
+        // Delegate to move assignment operator
+        *this = Roaring(l);
+        return *this;
+    }
+
     /**
      * Compute the intersection between the current bitmap and the provided
      * bitmap, writing the result in the current bitmap. The provided bitmap
      * is not modified.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
      */
-    Roaring &operator&=(const Roaring &r) {
+    Roaring &operator&=(const Roaring &r) noexcept {
         api::roaring_bitmap_and_inplace(&roaring, &r.roaring);
         return *this;
     }
@@ -243,7 +342,7 @@ public:
      * bitmap, writing the result in the current bitmap. The provided bitmap
      * is not modified.
      */
-    Roaring &operator-=(const Roaring &r) {
+    Roaring &operator-=(const Roaring &r) noexcept {
         api::roaring_bitmap_andnot_inplace(&roaring, &r.roaring);
         return *this;
     }
@@ -255,7 +354,7 @@ public:
      *
      * See also the fastunion function to aggregate many bitmaps more quickly.
      */
-    Roaring &operator|=(const Roaring &r) {
+    Roaring &operator|=(const Roaring &r) noexcept {
         api::roaring_bitmap_or_inplace(&roaring, &r.roaring);
         return *this;
     }
@@ -265,7 +364,7 @@ public:
      * bitmap, writing the result in the current bitmap. The provided bitmap
      * is not modified.
      */
-    Roaring &operator^=(const Roaring &r) {
+    Roaring &operator^=(const Roaring &r) noexcept {
         api::roaring_bitmap_xor_inplace(&roaring, &r.roaring);
         return *this;
     }
@@ -273,31 +372,31 @@ public:
     /**
      * Exchange the content of this bitmap with another.
      */
-    void swap(Roaring &r) { std::swap(r.roaring, roaring); }
+    void swap(Roaring &r) noexcept { std::swap(r.roaring, roaring); }
 
     /**
      * Get the cardinality of the bitmap (number of elements).
      */
-    uint64_t cardinality() const {
+    uint64_t cardinality() const noexcept {
         return api::roaring_bitmap_get_cardinality(&roaring);
     }
 
     /**
      * Returns true if the bitmap is empty (cardinality is zero).
      */
-    bool isEmpty() const { return api::roaring_bitmap_is_empty(&roaring); }
+    bool isEmpty() const noexcept { return api::roaring_bitmap_is_empty(&roaring); }
 
     /**
      * Returns true if the bitmap is subset of the other.
      */
-    bool isSubset(const Roaring &r) const {
+    bool isSubset(const Roaring &r) const noexcept {
         return api::roaring_bitmap_is_subset(&roaring, &r.roaring);
     }
 
     /**
      * Returns true if the bitmap is strict subset of the other.
      */
-    bool isStrictSubset(const Roaring &r) const {
+    bool isStrictSubset(const Roaring &r) const noexcept {
         return api::roaring_bitmap_is_strict_subset(&roaring, &r.roaring);
     }
 
@@ -306,37 +405,45 @@ public:
      * responsible to ensure that there is enough memory allocated
      * (e.g., ans = new uint32[mybitmap.cardinality()];)
      */
-    void toUint32Array(uint32_t *ans) const {
+    void toUint32Array(uint32_t *ans) const noexcept {
         api::roaring_bitmap_to_uint32_array(&roaring, ans);
     }
     /**
      * To int array with pagination
      */
-    void rangeUint32Array(uint32_t *ans, size_t offset, size_t limit) const {
+    void rangeUint32Array(uint32_t *ans, size_t offset, size_t limit) const noexcept {
         api::roaring_bitmap_range_uint32_array(&roaring, offset, limit, ans);
     }
 
     /**
      * Return true if the two bitmaps contain the same elements.
      */
-    bool operator==(const Roaring &r) const {
+    bool operator==(const Roaring &r) const noexcept {
         return api::roaring_bitmap_equals(&roaring, &r.roaring);
     }
 
     /**
-     * Compute the negation of the roaring bitmap within a specified interval.
-     * interval: [range_start, range_end).
-     * Areas outside the range are passed through unchanged.
+     * Compute the negation of the roaring bitmap within the half-open interval
+     * [range_start, range_end). Areas outside the interval are unchanged.
      */
-    void flip(uint64_t range_start, uint64_t range_end) {
+    void flip(uint64_t range_start, uint64_t range_end) noexcept {
         api::roaring_bitmap_flip_inplace(&roaring, range_start, range_end);
     }
 
+    /**
+     * Compute the negation of the roaring bitmap within the closed interval
+     * [range_start, range_end]. Areas outside the interval are unchanged.
+     */
+    void flipClosed(uint32_t range_start, uint32_t range_end) noexcept {
+        api::roaring_bitmap_flip_inplace(
+            &roaring, range_start, uint64_t(range_end) + 1);
+    }
+
     /**
      * Remove run-length encoding even when it is more space efficient.
      * Return whether a change was applied.
      */
-    bool removeRunCompression() {
+    bool removeRunCompression() noexcept {
         return api::roaring_bitmap_remove_run_compression(&roaring);
     }
 
@@ -346,13 +453,13 @@ public:
      * Returns true if the result has at least one run container.  Additional
      * savings might be possible by calling shrinkToFit().
      */
-    bool runOptimize() { return api::roaring_bitmap_run_optimize(&roaring); }
+    bool runOptimize() noexcept { return api::roaring_bitmap_run_optimize(&roaring); }
 
     /**
      * If needed, reallocate memory to shrink the memory usage. Returns
      * the number of bytes saved.
      */
-    size_t shrinkToFit() { return api::roaring_bitmap_shrink_to_fit(&roaring); }
+    size_t shrinkToFit() noexcept { return api::roaring_bitmap_shrink_to_fit(&roaring); }
 
     /**
      * Iterate over the bitmap elements. The function iterator is called once
@@ -375,21 +482,21 @@ public:
      * this function returns true and sets element to the element of given rank.
      * Otherwise, it returns false.
      */
-    bool select(uint32_t rnk, uint32_t *element) const {
+    bool select(uint32_t rnk, uint32_t *element) const noexcept {
         return api::roaring_bitmap_select(&roaring, rnk, element);
     }
 
     /**
      * Computes the size of the intersection between two bitmaps.
      */
-    uint64_t and_cardinality(const Roaring &r) const {
+    uint64_t and_cardinality(const Roaring &r) const noexcept {
         return api::roaring_bitmap_and_cardinality(&roaring, &r.roaring);
     }
 
     /**
      * Check whether the two bitmaps intersect.
      */
-    bool intersect(const Roaring &r) const {
+    bool intersect(const Roaring &r) const noexcept {
         return api::roaring_bitmap_intersect(&roaring, &r.roaring);
     }
 
@@ -400,21 +507,21 @@ public:
      *
      * The Jaccard index is undefined if both bitmaps are empty.
      */
-    double jaccard_index(const Roaring &r) const {
+    double jaccard_index(const Roaring &r) const noexcept {
         return api::roaring_bitmap_jaccard_index(&roaring, &r.roaring);
     }
 
     /**
      * Computes the size of the union between two bitmaps.
      */
-    uint64_t or_cardinality(const Roaring &r) const {
+    uint64_t or_cardinality(const Roaring &r) const noexcept {
         return api::roaring_bitmap_or_cardinality(&roaring, &r.roaring);
     }
 
     /**
      * Computes the size of the difference (andnot) between two bitmaps.
      */
-    uint64_t andnot_cardinality(const Roaring &r) const {
+    uint64_t andnot_cardinality(const Roaring &r) const noexcept {
         return api::roaring_bitmap_andnot_cardinality(&roaring, &r.roaring);
     }
 
@@ -422,7 +529,7 @@ public:
      * Computes the size of the symmetric difference (andnot) between two
      * bitmaps.
      */
-    uint64_t xor_cardinality(const Roaring &r) const {
+    uint64_t xor_cardinality(const Roaring &r) const noexcept {
         return api::roaring_bitmap_xor_cardinality(&roaring, &r.roaring);
     }
 
@@ -434,10 +541,21 @@ public:
      * 1 when ranking the smallest value, but the select function returns the
      * smallest value when using index 0.
      */
-    uint64_t rank(uint32_t x) const {
+    uint64_t rank(uint32_t x) const noexcept {
         return api::roaring_bitmap_rank(&roaring, x);
     }
 
+    /**
+     * Returns the index of x in the set; indexes start from 0.
+     * If the set doesn't contain x, this function will return -1.
+     * The difference with rank function is that this function will return -1
+     * when x isn't in the set, but the rank function will return a
+     * non-negative number.
+     */
+    int64_t getIndex(uint32_t x) const noexcept {
+        return api::roaring_bitmap_get_index(&roaring, x);
+    }
+
     /**
      * Write a bitmap to a char buffer. This is meant to be compatible with
      * the Java and Go versions. Returns how many bytes were written which
@@ -477,11 +595,12 @@ public:
      *      }  // namespace serialization
      *      }  // namespace boost
      */
-    size_t write(char *buf, bool portable = true) const {
-        if (portable)
+    size_t write(char *buf, bool portable = true) const noexcept {
+        if (portable) {
             return api::roaring_bitmap_portable_serialize(&roaring, buf);
-        else
+        } else {
             return api::roaring_bitmap_serialize(&roaring, buf);
+        }
     }
 
     /**
@@ -494,6 +613,11 @@ public:
      *
      * This function is unsafe in the sense that if you provide bad data,
      * many, many bytes could be read. See also readSafe.
+     *
+     * The function may throw std::runtime_error if a bitmap could not be read. Note that even
+     * if it does not throw, the bitmap could still be unusable if the loaded
+     * data does not match the portable Roaring specification: you should
+     * ensure that the data you load come from a serialized bitmap.
      */
     static Roaring read(const char *buf, bool portable = true) {
         roaring_bitmap_t * r = portable
@@ -508,7 +632,23 @@ public:
     /**
      * Read a bitmap from a serialized version, reading no more than maxbytes
      * bytes.  This is meant to be compatible with the Java and Go versions.
+     * The function itself is safe in the sense that it will not cause buffer overflows.
+     * However, for correct operations, it is assumed that the bitmap read was once
+     * serialized from a valid bitmap. If you provided an incorrect input (garbage), then the
+     * bitmap read may not be in a valid state and following operations may not lead
+     * to sensible results. It is your responsibility to ensure that the input bytes
+     * follow the format specification if you want a usable bitmap:
+     * https://github.com/RoaringBitmap/RoaringFormatSpec
+     * In particular, the serialized array containers need to be in sorted order, and the
+     * run containers should be in sorted non-overlapping order. This is guaranteed to
+     * happen when serializing an existing bitmap, but not for random inputs.
+     * Note that this function assumes that your bitmap was serialized in *portable* mode
+     * (which is the default with the 'write' method).
      *
+     * The function may throw std::runtime_error if a bitmap could not be read. Note that even
+     * if it does not throw, the bitmap could still be unusable if the loaded
+     * data does not match the portable Roaring specification: you should
+     * ensure that the data you load come from a serialized bitmap.
      */
     static Roaring readSafe(const char *buf, size_t maxbytes) {
         roaring_bitmap_t * r =
@@ -527,13 +667,18 @@ public:
      * can save space compared to the portable format (e.g., for very
      * sparse bitmaps).
      */
-    size_t getSizeInBytes(bool portable = true) const {
-        if (portable)
+    size_t getSizeInBytes(bool portable = true) const noexcept {
+        if (portable) {
             return api::roaring_bitmap_portable_size_in_bytes(&roaring);
-        else
+        } else {
             return api::roaring_bitmap_size_in_bytes(&roaring);
+        }
     }
 
+    /**
+     * For advanced users.
+     * This function may throw std::runtime_error.
+     */
     static const Roaring frozenView(const char *buf, size_t length) {
         const roaring_bitmap_t *s =
             api::roaring_bitmap_frozen_view(buf, length);
@@ -545,17 +690,29 @@ public:
         return r;
     }
 
-    void writeFrozen(char *buf) const {
+    /**
+     * For advanced users.
+     */
+    void writeFrozen(char *buf) const noexcept {
         roaring_bitmap_frozen_serialize(&roaring, buf);
     }
 
-    size_t getFrozenSizeInBytes() const {
+    /**
+     * For advanced users.
+     */
+    size_t getFrozenSizeInBytes() const noexcept {
         return roaring_bitmap_frozen_size_in_bytes(&roaring);
     }
 
     /**
      * Computes the intersection between two bitmaps and returns new bitmap.
      * The current bitmap and the provided bitmap are unchanged.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+     * Consider also using the operator &= to avoid needlessly creating
+     * many temporary bitmaps.
+     * This function may throw std::runtime_error.
      */
     Roaring operator&(const Roaring &o) const {
         roaring_bitmap_t *r = api::roaring_bitmap_and(&roaring, &o.roaring);
@@ -568,6 +725,7 @@ public:
     /**
      * Computes the difference between two bitmaps and returns new bitmap.
      * The current bitmap and the provided bitmap are unchanged.
+     * This function may throw std::runtime_error.
      */
     Roaring operator-(const Roaring &o) const {
         roaring_bitmap_t *r = api::roaring_bitmap_andnot(&roaring, &o.roaring);
@@ -580,6 +738,7 @@ public:
     /**
      * Computes the union between two bitmaps and returns new bitmap.
      * The current bitmap and the provided bitmap are unchanged.
+     * This function may throw std::runtime_error.
      */
     Roaring operator|(const Roaring &o) const {
         roaring_bitmap_t *r = api::roaring_bitmap_or(&roaring, &o.roaring);
@@ -592,6 +751,7 @@ public:
     /**
      * Computes the symmetric union between two bitmaps and returns new bitmap.
      * The current bitmap and the provided bitmap are unchanged.
+     * This function may throw std::runtime_error.
      */
     Roaring operator^(const Roaring &o) const {
         roaring_bitmap_t *r = api::roaring_bitmap_xor(&roaring, &o.roaring);
@@ -604,19 +764,19 @@ public:
     /**
      * Whether or not we apply copy and write.
      */
-    void setCopyOnWrite(bool val) {
+    void setCopyOnWrite(bool val) noexcept {
         api::roaring_bitmap_set_copy_on_write(&roaring, val);
     }
 
     /**
      * Print the content of the bitmap
      */
-    void printf() const { api::roaring_bitmap_printf(&roaring); }
+    void printf() const noexcept { api::roaring_bitmap_printf(&roaring); }
 
     /**
      * Print the content of the bitmap into a string
      */
-    std::string toString() const {
+    std::string toString() const noexcept {
         struct iter_data {
             std::string str{}; // The empty constructor silences warnings from pedantic static analyzers.
             char first_char = '{';
@@ -641,13 +801,14 @@ public:
     /**
      * Whether or not copy and write is active.
      */
-    bool getCopyOnWrite() const {
+    bool getCopyOnWrite() const noexcept {
         return api::roaring_bitmap_get_copy_on_write(&roaring);
     }
 
     /**
      * Computes the logical or (union) between "n" bitmaps (referenced by a
      * pointer).
+     * This function may throw std::runtime_error.
      */
     static Roaring fastunion(size_t n, const Roaring **inputs) {
         const roaring_bitmap_t **x =
diff --git a/cpp/roaring64map.hh b/cpp/roaring64map.hh
index 6ec9ccdff..6caea14a7 100644
--- a/cpp/roaring64map.hh
+++ b/cpp/roaring64map.hh
@@ -1,27 +1,36 @@
-/*
-A C++ header for 64-bit Roaring Bitmaps, implemented by way of a map of many
-32-bit Roaring Bitmaps.
+/**
+ * A C++ header for 64-bit Roaring Bitmaps, 
+ * implemented by way of a map of many
+ * 32-bit Roaring Bitmaps.
+ * 
+ * Reference (format specification) :
+ * https://github.com/RoaringBitmap/RoaringFormatSpec#extention-for-64-bit-implementations
 */
 #ifndef INCLUDE_ROARING_64_MAP_HH_
 #define INCLUDE_ROARING_64_MAP_HH_
 
 #include <algorithm>
+#include <cinttypes> // PRIu64 macro
 #include <cstdarg>  // for va_list handling in bitmapOf()
 #include <cstdio>  // for std::printf() in the printf() method
 #include <cstring>  // for std::memcpy()
+#include <functional>
+#include <initializer_list>
 #include <limits>
 #include <map>
 #include <new>
 #include <numeric>
+#include <queue>
 #include <stdexcept>
 #include <string>
 #include <utility>
 
 #include "roaring.hh"
-using roaring::Roaring;
 
 namespace roaring {
 
+using roaring::Roaring;
+
 class Roaring64MapSetBitForwardIterator;
 class Roaring64MapSetBitBiDirectionalIterator;
 
@@ -44,6 +53,13 @@ public:
      */
     Roaring64Map(size_t n, const uint64_t *data) { addMany(n, data); }
 
+    /**
+     * Construct a bitmap from an initializer list.
+     */
+    Roaring64Map(std::initializer_list<uint64_t> l) {
+        addMany(l.size(), l.begin());
+    }
+
     /**
      * Construct a 64-bit map from a 32-bit one
      */
@@ -75,10 +91,19 @@ public:
     /**
      * Move assignment operator.
      */
-     Roaring64Map &operator=(Roaring64Map &&r) noexcept = default;
+    Roaring64Map &operator=(Roaring64Map &&r) noexcept = default;
 
     /**
-     * Construct a bitmap from a list of integer values.
+     * Assignment from an initializer list.
+     */
+    Roaring64Map &operator=(std::initializer_list<uint64_t> l) {
+        // Delegate to move assignment operator
+        *this = Roaring64Map(l);
+        return *this;
+    }
+
+    /**
+     * Construct a bitmap from a list of uint64_t values.
      */
     static Roaring64Map bitmapOf(size_t n...) {
         Roaring64Map ans;
@@ -92,74 +117,328 @@ public:
     }
 
     /**
-     * Add value x
+     * Construct a bitmap from a list of uint64_t values.
+     * E.g., bitmapOfList({1,2,3}).
+     */
+    static Roaring64Map bitmapOfList(std::initializer_list<uint64_t> l) {
+        Roaring64Map ans;
+        ans.addMany(l.size(), l.begin());
+        return ans;
+    }
+
+    /**
+     * Adds value x.
      */
     void add(uint32_t x) {
-        roarings[0].add(x);
-        roarings[0].setCopyOnWrite(copyOnWrite);
+        lookupOrCreateInner(0).add(x);
     }
+
+    /**
+     * Adds value x.
+     */
     void add(uint64_t x) {
-        roarings[highBytes(x)].add(lowBytes(x));
-        roarings[highBytes(x)].setCopyOnWrite(copyOnWrite);
+        lookupOrCreateInner(highBytes(x)).add(lowBytes(x));
     }
 
     /**
-     * Add value x
-     * Returns true if a new value was added, false if the value was already existing.
+     * Adds value x.
+     * Returns true if a new value was added, false if the value was already
+     * present.
      */
     bool addChecked(uint32_t x) {
-        bool result = roarings[0].addChecked(x);
-        roarings[0].setCopyOnWrite(copyOnWrite);
-        return result;
+        return lookupOrCreateInner(0).addChecked(x);
     }
+
+    /**
+     * Adds value x.
+     * Returns true if a new value was added, false if the value was already
+     * present.
+     */
     bool addChecked(uint64_t x) {
-        bool result = roarings[highBytes(x)].addChecked(lowBytes(x));
-        roarings[highBytes(x)].setCopyOnWrite(copyOnWrite);
-        return result;
+        return lookupOrCreateInner(highBytes(x)).addChecked(lowBytes(x));
     }
 
     /**
-     * Add value n_args from pointer vals
+     * Adds all values in the half-open interval [min, max).
+     */
+    void addRange(uint64_t min, uint64_t max) {
+        if (min >= max) {
+            return;
+        }
+        addRangeClosed(min, max - 1);
+    }
+
+    /**
+     * Adds all values in the closed interval [min, max].
+     */
+    void addRangeClosed(uint32_t min, uint32_t max) {
+        lookupOrCreateInner(0).addRangeClosed(min, max);
+    }
+
+    /**
+     * Adds all values in the closed interval [min, max].
+     */
+    void addRangeClosed(uint64_t min, uint64_t max) {
+        if (min > max) {
+            return;
+        }
+        uint32_t start_high = highBytes(min);
+        uint32_t start_low = lowBytes(min);
+        uint32_t end_high = highBytes(max);
+        uint32_t end_low = lowBytes(max);
+
+        // We put std::numeric_limits<>::max in parentheses to avoid a
+        // clash with the Windows.h header under Windows.
+        const uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
+
+        // Fill in any nonexistent slots with empty Roarings. This simplifies
+        // the logic below, allowing it to simply iterate over the map between
+        // 'start_high' and 'end_high' in a linear fashion.
+        auto current_iter = ensureRangePopulated(start_high, end_high);
+
+        // If start and end land on the same inner bitmap, then we can do the
+        // whole operation in one call.
+        if (start_high == end_high) {
+            auto &bitmap = current_iter->second;
+            bitmap.addRangeClosed(start_low, end_low);
+            return;
+        }
+
+        // Because start and end don't land on the same inner bitmap,
+        // we need to do this in multiple steps:
+        // 1. Partially fill the first bitmap with values from the closed
+        //    interval [start_low, uint32_max]
+        // 2. Fill intermediate bitmaps completely: [0, uint32_max]
+        // 3. Partially fill the last bitmap with values from the closed
+        //    interval [0, end_low]
+        auto num_intermediate_bitmaps = end_high - start_high - 1;
+
+        // Step 1: Partially fill the first bitmap.
+        {
+            auto &bitmap = current_iter->second;
+            bitmap.addRangeClosed(start_low, uint32_max);
+            ++current_iter;
+        }
+
+        // Step 2. Fill intermediate bitmaps completely.
+        if (num_intermediate_bitmaps != 0) {
+            auto &first_intermediate = current_iter->second;
+            first_intermediate.addRangeClosed(0, uint32_max);
+            ++current_iter;
+
+            // Now make (num_intermediate_bitmaps - 1) copies of this.
+            for (uint32_t i = 1; i != num_intermediate_bitmaps; ++i) {
+                auto &next_intermediate = current_iter->second;
+                next_intermediate = first_intermediate;
+                ++current_iter;
+            }
+        }
+
+        // Step 3: Partially fill the last bitmap.
+        auto &bitmap = current_iter->second;
+        bitmap.addRangeClosed(0, end_low);
+    }
+
+    /**
+     * Adds 'n_args' values from the contiguous memory range starting at 'vals'.
      */
     void addMany(size_t n_args, const uint32_t *vals) {
-        Roaring &roaring = roarings[0];
-        roaring.addMany(n_args, vals);
-        roaring.setCopyOnWrite(copyOnWrite);
+        lookupOrCreateInner(0).addMany(n_args, vals);
     }
 
+    /**
+     * Adds 'n_args' values from the contiguous memory range starting at 'vals'.
+     */
     void addMany(size_t n_args, const uint64_t *vals) {
+        // Potentially reduce outer map lookups by optimistically
+        // assuming that adjacent values will belong to the same inner bitmap.
+        Roaring *last_inner_bitmap = nullptr;
+        uint32_t last_value_high = 0;
         for (size_t lcv = 0; lcv < n_args; lcv++) {
-            roarings[highBytes(vals[lcv])].add(lowBytes(vals[lcv]));
-            roarings[highBytes(vals[lcv])].setCopyOnWrite(copyOnWrite);
+            auto value = vals[lcv];
+            auto value_high = highBytes(value);
+            auto value_low = lowBytes(value);
+            if (last_inner_bitmap == nullptr || value_high != last_value_high) {
+                last_inner_bitmap = &lookupOrCreateInner(value_high);
+                last_value_high = value_high;
+            }
+            last_inner_bitmap->add(value_low);
         }
     }
 
     /**
-     * Remove value x
+     * Removes value x.
+     */
+    void remove(uint32_t x) {
+        auto iter = roarings.begin();
+        // Since x is a uint32_t, highBytes(x) == 0. The inner bitmap we are
+        // looking for, if it exists, will be at the first slot of 'roarings'.
+        if (iter == roarings.end() || iter->first != 0) {
+            return;
+        }
+        auto &bitmap = iter->second;
+        bitmap.remove(x);
+        eraseIfEmpty(iter);
+    }
+
+    /**
+     * Removes value x.
      */
-    void remove(uint32_t x) { roarings[0].remove(x); }
     void remove(uint64_t x) {
-        auto roaring_iter = roarings.find(highBytes(x));
-        if (roaring_iter != roarings.cend())
-            roaring_iter->second.remove(lowBytes(x));
+        auto iter = roarings.find(highBytes(x));
+        if (iter == roarings.end()) {
+            return;
+        }
+        auto &bitmap = iter->second;
+        bitmap.remove(lowBytes(x));
+        eraseIfEmpty(iter);
     }
 
     /**
-     * Remove value x
-     * Returns true if a new value was removed, false if the value was not existing.
+     * Removes value x.
+     * Returns true if the value was removed, false if the value was not
+     * present.
      */
     bool removeChecked(uint32_t x) {
-        return roarings[0].removeChecked(x);
+        auto iter = roarings.begin();
+        // Since x is a uint32_t, highBytes(x) == 0. The inner bitmap we are
+        // looking for, if it exists, will be at the first slot of 'roarings'.
+        if (iter == roarings.end() || iter->first != 0) {
+            return false;
+        }
+        auto &bitmap = iter->second;
+        if (!bitmap.removeChecked(x)) {
+            return false;
+        }
+        eraseIfEmpty(iter);
+        return true;
     }
+
+    /**
+     * Removes value x.
+     * Returns true if the value was removed, false if the value was not
+     * present.
+     */
     bool removeChecked(uint64_t x) {
-        auto roaring_iter = roarings.find(highBytes(x));
-        if (roaring_iter != roarings.cend())
-            return roaring_iter->second.removeChecked(lowBytes(x));
-        return false;
+        auto iter = roarings.find(highBytes(x));
+        if (iter == roarings.end()) {
+            return false;
+        }
+        auto &bitmap = iter->second;
+        if (!bitmap.removeChecked(lowBytes(x))) {
+            return false;
+        }
+        eraseIfEmpty(iter);
+        return true;
+    }
+
+    /**
+     * Removes all values in the half-open interval [min, max).
+     */
+    void removeRange(uint64_t min, uint64_t max) {
+        if (min >= max) {
+            return;
+        }
+        return removeRangeClosed(min, max - 1);
     }
 
     /**
-     * Clear the bitmap
+     * Removes all values in the closed interval [min, max].
+     */
+    void removeRangeClosed(uint32_t min, uint32_t max) {
+        auto iter = roarings.begin();
+        // Since min and max are uint32_t, highBytes(min or max) == 0. The inner
+        // bitmap we are looking for, if it exists, will be at the first slot of
+        // 'roarings'.
+        if (iter == roarings.end() || iter->first != 0) {
+            return;
+        }
+        auto &bitmap = iter->second;
+        bitmap.removeRangeClosed(min, max);
+        eraseIfEmpty(iter);
+    }
+
+    /**
+     * Removes all values in the closed interval [min, max].
+     */
+    void removeRangeClosed(uint64_t min, uint64_t max) {
+        if (min > max) {
+            return;
+        }
+        uint32_t start_high = highBytes(min);
+        uint32_t start_low = lowBytes(min);
+        uint32_t end_high = highBytes(max);
+        uint32_t end_low = lowBytes(max);
+
+        // We put std::numeric_limits<>::max in parentheses to avoid a
+        // clash with the Windows.h header under Windows.
+        const uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
+
+        // If the outer map is empty, end_high is less than the first key,
+        // or start_high is greater than the last key, then exit now because
+        // there is no work to do.
+        if (roarings.empty() || end_high < roarings.cbegin()->first ||
+            start_high > (roarings.crbegin())->first) {
+            return;
+        }
+
+        // If we get here, start_iter points to the first entry in the outer map
+        // with key >= start_high. Such an entry is known to exist (i.e. the
+        // iterator will not be equal to end()) because start_high <= the last
+        // key in the map (thanks to the above if statement).
+        auto start_iter = roarings.lower_bound(start_high);
+        // end_iter points to the first entry in the outer map with
+        // key >= end_high, if such a key exists. Otherwise, it equals end().
+        auto end_iter = roarings.lower_bound(end_high);
+
+        // Note that the 'lower_bound' method will find the start and end slots,
+        // if they exist; otherwise it will find the next-higher slots.
+        // In the case where 'start' landed on an existing slot, we need to do a
+        // partial erase of that slot, and likewise for 'end'. But all the slots
+        // in between can be fully erased. More precisely:
+        //
+        // 1. If the start point falls on an existing entry, there are two
+        //    subcases:
+        //    a. if the end point falls on that same entry, remove the closed
+        //       interval [start_low, end_low] from that entry and we are done.
+        //    b. Otherwise, remove the closed interval [start_low, uint32_max]
+        //       from that entry, advance start_iter, and fall through to step 2.
+        // 2. Completely erase all slots in the half-open interval
+        //    [start_iter, end_iter)
+        // 3. If the end point falls on an existing entry, remove the closed
+        //    interval [0, end_low] from it.
+
+        // Step 1. If the start point falls on an existing entry...
+        if (start_iter->first == start_high) {
+            auto &start_inner = start_iter->second;
+            // 1a. if the end point falls on that same entry...
+            if (start_iter == end_iter) {
+                start_inner.removeRangeClosed(start_low, end_low);
+                eraseIfEmpty(start_iter);
+                return;
+            }
+
+            // 1b. Otherwise, remove the closed range [start_low, uint32_max]...
+            start_inner.removeRangeClosed(start_low, uint32_max);
+            // Advance start_iter, but keep the old value so we can check the
+            // bitmap we just modified for emptiness and erase it if necessary.
+            auto temp = start_iter++;
+            eraseIfEmpty(temp);
+        }
+
+        // 2. Completely erase all slots in the half-open interval...
+        roarings.erase(start_iter, end_iter);
+
+        // 3. If the end point falls on an existing entry...
+        if (end_iter != roarings.end() && end_iter->first == end_high) {
+            auto &end_inner = end_iter->second;
+            end_inner.removeRangeClosed(0, end_low);
+            eraseIfEmpty(end_iter);
+        }
+    }
+
+    /**
+     * Clears the bitmap.
      */
     void clear() {
         roarings.clear();
@@ -210,16 +489,61 @@ public:
     }
 
     /**
-     * Compute the intersection between the current bitmap and the provided
-     * bitmap, writing the result in the current bitmap. The provided bitmap
-     * is not modified.
+     * Compute the intersection of the current bitmap and the provided bitmap,
+     * writing the result in the current bitmap. The provided bitmap is not
+     * modified.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
      */
-    Roaring64Map &operator&=(const Roaring64Map &r) {
-        for (auto &map_entry : roarings) {
-            if (r.roarings.count(map_entry.first) == 1)
-                map_entry.second &= r.roarings.at(map_entry.first);
-            else
-                map_entry.second = Roaring();
+    Roaring64Map &operator&=(const Roaring64Map &other) {
+        if (this == &other) {
+            // ANDing *this with itself is a no-op.
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self & other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  empty           None
+        // present  absent   empty           Erase self
+        // present  present  empty or not    Intersect self with other, but
+        //                                   erase self if result is empty.
+        //
+        // Because there is only work to do when a key is present in 'self', the
+        // main for loop iterates over entries in 'self'.
+
+        decltype(roarings.begin()) self_next;
+        for (auto self_iter = roarings.begin(); self_iter != roarings.end();
+             self_iter = self_next) {
+            // Do the 'next' operation now, so we don't have to worry about
+            // invalidation of self_iter down below with the 'erase' operation.
+            self_next = std::next(self_iter);
+
+            auto self_key = self_iter->first;
+            auto &self_bitmap = self_iter->second;
+
+            auto other_iter = other.roarings.find(self_key);
+            if (other_iter == other.roarings.end()) {
+                // 'other' doesn't have self_key. In the logic table above,
+                // this reflects the case (self.present & other.absent).
+                // So, erase self.
+                roarings.erase(self_iter);
+                continue;
+            }
+
+            // Both sides have self_key. In the logic table above, this reflects
+            // the case (self.present & other.present). So, intersect self with
+            // other.
+            const auto &other_bitmap = other_iter->second;
+            self_bitmap &= other_bitmap;
+            if (self_bitmap.isEmpty()) {
+                // ...but if intersection is empty, remove it altogether.
+                roarings.erase(self_iter);
+            }
         }
         return *this;
     }
@@ -229,44 +553,177 @@ public:
      * bitmap, writing the result in the current bitmap. The provided bitmap
      * is not modified.
      */
-    Roaring64Map &operator-=(const Roaring64Map &r) {
-        for (auto &map_entry : roarings) {
-            if (r.roarings.count(map_entry.first) == 1)
-                map_entry.second -= r.roarings.at(map_entry.first);
+    Roaring64Map &operator-=(const Roaring64Map &other) {
+        if (this == &other) {
+            // Subtracting *this from itself results in the empty map.
+            roarings.clear();
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self - other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  empty           None
+        // present  absent   unchanged       None
+        // present  present  empty or not    Subtract other from self, but
+        //                                   erase self if result is empty
+        //
+        // Because there is only work to do when a key is present in both 'self'
+        // and 'other', the main while loop ping-pongs back and forth until it
+        // finds the next key that is the same on both sides.
+
+        auto self_iter = roarings.begin();
+        auto other_iter = other.roarings.cbegin();
+
+        while (self_iter != roarings.end() &&
+               other_iter != other.roarings.cend()) {
+            auto self_key = self_iter->first;
+            auto other_key = other_iter->first;
+            if (self_key < other_key) {
+                // Because self_key is < other_key, advance self_iter to the
+                // first point where self_key >= other_key (or end).
+                self_iter = roarings.lower_bound(other_key);
+                continue;
+            }
+
+            if (self_key > other_key) {
+                // Because self_key is > other_key, advance other_iter to the
+                // first point where other_key >= self_key (or end).
+                other_iter = other.roarings.lower_bound(self_key);
+                continue;
+            }
+
+            // Both sides have self_key. In the logic table above, this reflects
+            // the case (self.present & other.present). So subtract other from
+            // self.
+            auto &self_bitmap = self_iter->second;
+            const auto &other_bitmap = other_iter->second;
+            self_bitmap -= other_bitmap;
+
+            if (self_bitmap.isEmpty()) {
+                // ...but if the difference is empty, remove it altogether.
+                self_iter = roarings.erase(self_iter);
+            } else {
+                ++self_iter;
+            }
+            ++other_iter;
         }
         return *this;
     }
 
     /**
-     * Compute the union between the current bitmap and the provided bitmap,
+     * Compute the union of the current bitmap and the provided bitmap,
      * writing the result in the current bitmap. The provided bitmap is not
      * modified.
      *
      * See also the fastunion function to aggregate many bitmaps more quickly.
      */
-    Roaring64Map &operator|=(const Roaring64Map &r) {
-        for (const auto &map_entry : r.roarings) {
-            if (roarings.count(map_entry.first) == 0) {
-                roarings[map_entry.first] = map_entry.second;
-                roarings[map_entry.first].setCopyOnWrite(copyOnWrite);
-            } else
-                roarings[map_entry.first] |= map_entry.second;
+    Roaring64Map &operator|=(const Roaring64Map &other) {
+        if (this == &other) {
+            // ORing *this with itself is a no-op.
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self | other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  not empty       Copy other to self and set flags
+        // present  absent   unchanged       None
+        // present  present  not empty       self |= other
+        //
+        // Because there is only work to do when a key is present in 'other',
+        // the main for loop iterates over entries in 'other'.
+
+        for (const auto &other_entry : other.roarings) {
+            const auto &other_bitmap = other_entry.second;
+
+            // Try to insert other_bitmap into self at other_key. We take
+            // advantage of the fact that std::map::insert will not overwrite an
+            // existing entry.
+            auto insert_result = roarings.insert(other_entry);
+            auto self_iter = insert_result.first;
+            auto insert_happened = insert_result.second;
+            auto &self_bitmap = self_iter->second;
+
+            if (insert_happened) {
+                // Key was not present in self, so insert was performed above.
+                // In the logic table above, this reflects the case
+                // (self.absent | other.present). Because the copy has already
+                // happened, thanks to the 'insert' operation above, we just
+                // need to set the copyOnWrite flag.
+                self_bitmap.setCopyOnWrite(copyOnWrite);
+                continue;
+            }
+
+            // Both sides have self_key, and the insert was not performed. In
+            // the logic table above, this reflects the case
+            // (self.present & other.present). So OR other into self.
+            self_bitmap |= other_bitmap;
         }
         return *this;
     }
 
     /**
-     * Compute the symmetric union between the current bitmap and the provided
-     * bitmap, writing the result in the current bitmap. The provided bitmap
-     * is not modified.
+     * Compute the XOR of the current bitmap and the provided bitmap, writing
+     * the result in the current bitmap. The provided bitmap is not modified.
      */
-    Roaring64Map &operator^=(const Roaring64Map &r) {
-        for (const auto &map_entry : r.roarings) {
-            if (roarings.count(map_entry.first) == 0) {
-                roarings[map_entry.first] = map_entry.second;
-                roarings[map_entry.first].setCopyOnWrite(copyOnWrite);
-            } else
-                roarings[map_entry.first] ^= map_entry.second;
+    Roaring64Map &operator^=(const Roaring64Map &other) {
+        if (this == &other) {
+            // XORing *this with itself results in the empty map.
+            roarings.clear();
+            return *this;
+        }
+
+        // Logic table summarizing what to do when a given outer key is
+        // present vs. absent from self and other.
+        //
+        // self     other    (self ^ other)  work to do
+        // --------------------------------------------
+        // absent   absent   empty           None
+        // absent   present  non-empty       Copy other to self and set flags
+        // present  absent   unchanged       None
+        // present  present  empty or not    XOR other into self, but erase self
+        //                                   if result is empty.
+        //
+        // Because there is only work to do when a key is present in 'other',
+        // the main for loop iterates over entries in 'other'.
+
+        for (const auto &other_entry : other.roarings) {
+            const auto &other_bitmap = other_entry.second;
+
+            // Try to insert other_bitmap into self at other_key. We take
+            // advantage of the fact that std::map::insert will not overwrite an
+            // existing entry.
+            auto insert_result = roarings.insert(other_entry);
+            auto self_iter = insert_result.first;
+            auto insert_happened = insert_result.second;
+            auto &self_bitmap = self_iter->second;
+
+            if (insert_happened) {
+                // Key was not present in self, so insert was performed above.
+                // In the logic table above, this reflects the case
+                // (self.absent ^ other.present). Because the copy has already
+                // happened, thanks to the 'insert' operation above, we just
+                // need to set the copyOnWrite flag.
+                self_bitmap.setCopyOnWrite(copyOnWrite);
+                continue;
+            }
+
+            // Both sides have self_key, and the insert was not performed. In
+            // the logic table above, this reflects the case
+            // (self.present ^ other.present). So XOR other into self.
+            self_bitmap ^= other_bitmap;
+
+            if (self_bitmap.isEmpty()) {
+                // ...but if the XOR result is empty, remove it altogether.
+                roarings.erase(self_iter);
+            }
         }
         return *this;
     }
@@ -338,6 +795,9 @@ public:
      */
     bool isSubset(const Roaring64Map &r) const {
         for (const auto &map_entry : roarings) {
+            if (map_entry.second.isEmpty()) {
+                continue;
+            }
             auto roaring_iter = r.roarings.find(map_entry.first);
             if (roaring_iter == r.roarings.cend())
                 return false;
@@ -420,36 +880,98 @@ public:
     }
 
     /**
-     * Compute the negation of the roaring bitmap within a specified interval.
-     * areas outside the range are passed through unchanged.
+     * Computes the negation of the roaring bitmap within the half-open interval
+     * [min, max). Areas outside the interval are unchanged.
+     */
+    void flip(uint64_t min, uint64_t max) {
+        if (min >= max) {
+            return;
+        }
+        flipClosed(min, max - 1);
+    }
+
+    /**
+     * Computes the negation of the roaring bitmap within the closed interval
+     * [min, max]. Areas outside the interval are unchanged.
      */
-    void flip(uint64_t range_start, uint64_t range_end) {
-        uint32_t start_high = highBytes(range_start);
-        uint32_t start_low = lowBytes(range_start);
-        uint32_t end_high = highBytes(range_end);
-        uint32_t end_low = lowBytes(range_end);
+    void flipClosed(uint32_t min, uint32_t max) {
+        auto iter = roarings.begin();
+        // Since min and max are uint32_t, highBytes(min or max) == 0. The inner
+        // bitmap we are looking for, if it exists, will be at the first slot of
+        // 'roarings'. If it does not exist, we have to create it.
+        if (iter == roarings.end() || iter->first != 0) {
+            iter = roarings.emplace_hint(iter, std::piecewise_construct,
+                                         std::forward_as_tuple(0),
+                                         std::forward_as_tuple());
+            auto &bitmap = iter->second;
+            bitmap.setCopyOnWrite(copyOnWrite);
+        }
+        auto &bitmap = iter->second;
+        bitmap.flipClosed(min, max);
+        eraseIfEmpty(iter);
+    }
 
+    /**
+     * Computes the negation of the roaring bitmap within the closed interval
+     * [min, max]. Areas outside the interval are unchanged.
+     */
+    void flipClosed(uint64_t min, uint64_t max) {
+        if (min > max) {
+          return;
+        }
+        uint32_t start_high = highBytes(min);
+        uint32_t start_low = lowBytes(min);
+        uint32_t end_high = highBytes(max);
+        uint32_t end_low = lowBytes(max);
+
+        // We put std::numeric_limits<>::max in parentheses to avoid a
+        // clash with the Windows.h header under Windows.
+        const uint32_t uint32_max = (std::numeric_limits<uint32_t>::max)();
+
+        // Fill in any nonexistent slots with empty Roarings. This simplifies
+        // the logic below, allowing it to simply iterate over the map between
+        // 'start_high' and 'end_high' in a linear fashion.
+        auto current_iter = ensureRangePopulated(start_high, end_high);
+
+        // If start and end land on the same inner bitmap, then we can do the
+        // whole operation in one call.
         if (start_high == end_high) {
-            roarings[start_high].flip(start_low, end_low);
+            auto &bitmap = current_iter->second;
+            bitmap.flipClosed(start_low, end_low);
+            eraseIfEmpty(current_iter);
             return;
         }
-        // we put std::numeric_limits<>::max/min in parentheses
-        // to avoid a clash with the Windows.h header under Windows
-        // flip operates on the range [lower_bound, upper_bound)
-        const uint64_t max_upper_bound =
-            static_cast<uint64_t>((std::numeric_limits<uint32_t>::max)()) + 1;
-        roarings[start_high].flip(start_low, max_upper_bound);
-        roarings[start_high++].setCopyOnWrite(copyOnWrite);
 
-        for (; start_high <= highBytes(range_end) - 1; ++start_high) {
-            roarings[start_high].flip((std::numeric_limits<uint32_t>::min)(),
-                                      max_upper_bound);
-            roarings[start_high].setCopyOnWrite(copyOnWrite);
+        // Because start and end don't land on the same inner bitmap,
+        // we need to do this in multiple steps:
+        // 1. Partially flip the first bitmap in the closed interval
+        //    [start_low, uint32_max]
+        // 2. Flip intermediate bitmaps completely: [0, uint32_max]
+        // 3. Partially flip the last bitmap in the closed interval
+        //    [0, end_low]
+
+        auto num_intermediate_bitmaps = end_high - start_high - 1;
+
+        // 1. Partially flip the first bitmap.
+        {
+            auto &bitmap = current_iter->second;
+            bitmap.flipClosed(start_low, uint32_max);
+            auto temp = current_iter++;
+            eraseIfEmpty(temp);
+        }
+
+        // 2. Flip intermediate bitmaps completely.
+        for (uint32_t i = 0; i != num_intermediate_bitmaps; ++i) {
+            auto &bitmap = current_iter->second;
+            bitmap.flipClosed(0, uint32_max);
+            auto temp = current_iter++;
+            eraseIfEmpty(temp);
         }
 
-        roarings[start_high].flip((std::numeric_limits<uint32_t>::min)(),
-                                  end_low);
-        roarings[start_high].setCopyOnWrite(copyOnWrite);
+        // 3. Partially flip the last bitmap.
+        auto &bitmap = current_iter->second;
+        bitmap.flipClosed(0, end_low);
+        eraseIfEmpty(current_iter);
     }
 
     /**
@@ -521,20 +1043,29 @@ public:
     }
 
     /**
-     * If the size of the roaring bitmap is strictly greater than rank, then
-     * this function returns true and set element to the element of given
-     * rank.  Otherwise, it returns false.
+     * Selects the value at index 'rank' in the bitmap, where the smallest value
+     * is at index 0. If 'rank' < cardinality(), returns true with *element set
+     * to the element of the specified rank. Otherwise, returns false and the
+     * contents of *element are unspecified.
      */
-    bool select(uint64_t rnk, uint64_t *element) const {
+    bool select(uint64_t rank, uint64_t *element) const {
         for (const auto &map_entry : roarings) {
-            uint64_t sub_cardinality = (uint64_t)map_entry.second.cardinality();
-            if (rnk < sub_cardinality) {
-                *element = ((uint64_t)map_entry.first) << 32;
-                // assuming little endian
-                return map_entry.second.select((uint32_t)rnk,
-                                               ((uint32_t *)element));
+            auto key = map_entry.first;
+            const auto &bitmap = map_entry.second;
+
+            uint64_t sub_cardinality = bitmap.cardinality();
+            if (rank < sub_cardinality) {
+                uint32_t low_bytes;
+                // Casting rank to uint32_t is safe because
+                // rank < sub_cardinality and sub_cardinality <= 2^32.
+                if (!bitmap.select((uint32_t)rank, &low_bytes)) {
+                    ROARING_TERMINATE("Logic error: bitmap.select() "
+                        "returned false despite rank < cardinality()");
+                }
+                *element = uniteBytes(key, low_bytes);
+                return true;
             }
-            rnk -= sub_cardinality;
+            rank -= sub_cardinality;
         }
         return false;
     }
@@ -544,21 +1075,40 @@ public:
      */
     uint64_t rank(uint64_t x) const {
         uint64_t result = 0;
+        // Find the first bitmap >= x's bucket. If that is the bucket x would be in, find x's rank in that bucket.
+        // Either way, we're left with a range of all buckets strictly smaller than x's bucket; add all their
+        // cardinalities together.
+        auto end = roarings.lower_bound(highBytes(x));
+        if (end != roarings.cend() && end->first == highBytes(x)) {
+            result += end->second.rank(lowBytes(x));
+        }
+        for (auto iter = roarings.cbegin(); iter != end; ++iter) {
+            result += iter->second.cardinality();
+        }
+        return result;
+    }
+
+    /**
+     * Returns the index of x in the set; indices start from 0.
+     * If the set doesn't contain x, this function will return -1.
+     * The difference with rank function is that this function will return -1
+     * when x isn't in the set, but the rank function will return a
+     * non-negative number.
+     */
+    int64_t getIndex(uint64_t x) const {
+        int64_t index = 0;
         auto roaring_destination = roarings.find(highBytes(x));
         if (roaring_destination != roarings.cend()) {
             for (auto roaring_iter = roarings.cbegin();
                  roaring_iter != roaring_destination; ++roaring_iter) {
-                result += roaring_iter->second.cardinality();
+                index += roaring_iter->second.cardinality();
             }
-            result += roaring_destination->second.rank(lowBytes(x));
-            return result;
+            auto low_idx = roaring_destination->second.getIndex(lowBytes(x));
+            if (low_idx < 0) return -1;
+            index += low_idx;
+            return index;
         }
-        roaring_destination = roarings.lower_bound(highBytes(x));
-        for (auto roaring_iter = roarings.cbegin();
-             roaring_iter != roaring_destination; ++roaring_iter) {
-            result += roaring_iter->second.cardinality();
-        }
-        return result;
+        return -1;
     }
 
     /**
@@ -632,19 +1182,17 @@ public:
      * space compared to the portable format (e.g., for very sparse bitmaps).
      */
     static Roaring64Map readSafe(const char *buf, size_t maxbytes) {
+        if (maxbytes < sizeof(uint64_t)) {
+            ROARING_TERMINATE("ran out of bytes");
+        }
         Roaring64Map result;
-        // get map size
         uint64_t map_size;
         std::memcpy(&map_size, buf, sizeof(uint64_t));
         buf += sizeof(uint64_t);
+        maxbytes -= sizeof(uint64_t);
         for (uint64_t lcv = 0; lcv < map_size; lcv++) {
-            // get map key
             if(maxbytes < sizeof(uint32_t)) {
-#if ROARING_EXCEPTIONS
-                throw std::runtime_error("ran out of bytes");
-#else
                 ROARING_TERMINATE("ran out of bytes");
-#endif
             }
             uint32_t key;
             std::memcpy(&key, buf, sizeof(uint32_t));
@@ -778,6 +1326,11 @@ public:
     /**
      * Computes the intersection between two bitmaps and returns new bitmap.
      * The current bitmap and the provided bitmap are unchanged.
+     *
+     * Performance hint: if you are computing the intersection between several
+     * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+     * Consider also using the operator &= to avoid needlessly creating
+     * many temporary bitmaps.
      */
     Roaring64Map operator&(const Roaring64Map &o) const {
         return Roaring64Map(*this) &= o;
@@ -820,90 +1373,27 @@ public:
     }
 
     /**
-     * Print the content of the bitmap
+     * Print the contents of the bitmap to stdout.
+     * Note: this method adds a final newline, but toString() does not.
      */
     void printf() const {
-        if (!isEmpty()) {
-            auto map_iter = roarings.cbegin();
-            while (map_iter->second.isEmpty()) ++map_iter;
-            struct iter_data {
-                uint32_t high_bits{};
-                char first_char{'{'};
-            } outer_iter_data;
-            outer_iter_data.high_bits = roarings.begin()->first;
-            map_iter->second.iterate(
-                [](uint32_t low_bits, void *inner_iter_data) -> bool {
-                    std::printf("%c%llu",
-                                ((iter_data *)inner_iter_data)->first_char,
-                                (long long unsigned)uniteBytes(
-                                    ((iter_data *)inner_iter_data)->high_bits,
-                                    low_bits));
-                    ((iter_data *)inner_iter_data)->first_char = ',';
-                    return true;
-                },
-                (void *)&outer_iter_data);
-            std::for_each(
-                ++map_iter, roarings.cend(),
-                [](const std::pair<const uint32_t, Roaring> &map_entry) {
-                    map_entry.second.iterate(
-                        [](uint32_t low_bits, void *high_bits) -> bool {
-                            std::printf(",%llu",
-                                        (long long unsigned)uniteBytes(
-                                            *(uint32_t *)high_bits, low_bits));
-                            return true;
-                        },
-                        (void *)&map_entry.first);
-                });
-        } else
-            std::printf("{");
-        std::printf("}\n");
-    }
-
-    /**
-     * Print the content of the bitmap into a string
+        auto sink = [](const std::string &s) {
+            fputs(s.c_str(), stdout);
+        };
+        printToSink(sink);
+        sink("\n");
+    }
+
+    /**
+     * Print the contents of the bitmap into a string.
      */
     std::string toString() const {
-        struct iter_data {
-            std::string str{}; // The empty constructor silences warnings from pedantic static analyzers.
-            uint32_t high_bits{0};
-            char first_char{'{'};
-        } outer_iter_data;
-        if (!isEmpty()) {
-            auto map_iter = roarings.cbegin();
-            while (map_iter->second.isEmpty()) ++map_iter;
-            outer_iter_data.high_bits = roarings.begin()->first;
-            map_iter->second.iterate(
-                [](uint32_t low_bits, void *inner_iter_data) -> bool {
-                    ((iter_data *)inner_iter_data)->str +=
-                        ((iter_data *)inner_iter_data)->first_char;
-                    ((iter_data *)inner_iter_data)->str += std::to_string(
-                        uniteBytes(((iter_data *)inner_iter_data)->high_bits,
-                                   low_bits));
-                    ((iter_data *)inner_iter_data)->first_char = ',';
-                    return true;
-                },
-                (void *)&outer_iter_data);
-            std::for_each(
-                ++map_iter, roarings.cend(),
-                [&outer_iter_data](
-                    const std::pair<const uint32_t, Roaring> &map_entry) {
-                    outer_iter_data.high_bits = map_entry.first;
-                    map_entry.second.iterate(
-                        [](uint32_t low_bits, void *inner_iter_data) -> bool {
-                            ((iter_data *)inner_iter_data)->str +=
-                                ((iter_data *)inner_iter_data)->first_char;
-                            ((iter_data *)inner_iter_data)->str +=
-                                std::to_string(uniteBytes(
-                                    ((iter_data *)inner_iter_data)->high_bits,
-                                    low_bits));
-                            return true;
-                        },
-                        (void *)&outer_iter_data);
-                });
-        } else
-            outer_iter_data.str = '{';
-        outer_iter_data.str += '}';
-        return outer_iter_data.str;
+        std::string result;
+        auto sink = [&result](const std::string &s) {
+            result += s;
+        };
+        printToSink(sink);
+        return result;
     }
 
     /**
@@ -916,12 +1406,124 @@ public:
      * pointer).
      */
     static Roaring64Map fastunion(size_t n, const Roaring64Map **inputs) {
-        Roaring64Map ans;
-        // not particularly fast
-        for (size_t lcv = 0; lcv < n; ++lcv) {
-            ans |= *(inputs[lcv]);
+        // The strategy here is to basically do a "group by" operation.
+        // We group the input roarings by key, do a 32-bit
+        // roaring_bitmap_or_many on each group, and collect the results.
+        // We accomplish the "group by" operation using a priority queue, which
+        // tracks the next key for each of our input maps. At each step, our
+        // algorithm takes the next subset of maps that share the same next key,
+        // runs roaring_bitmap_or_many on those bitmaps, and then advances the
+        // current_iter on all the affected entries and then repeats.
+
+        // There is an entry in our priority queue for each of the 'n' inputs.
+        // For a given Roaring64Map, we look at its underlying 'roarings'
+        // std::map, and take its begin() and end(). This forms our half-open
+        // interval [current_iter, end_iter), which we keep in the priority
+        // queue as a pq_entry. These entries are updated (removed and then
+        // reinserted with the pq_entry.iterator field advanced by one step) as
+        // our algorithm progresses. But when a given interval becomes empty
+        // (i.e. pq_entry.iterator == pq_entry.end) it is not returned to the
+        // priority queue.
+        struct pq_entry {
+            roarings_t::const_iterator iterator;
+            roarings_t::const_iterator end;
+        };
+
+        // Custom comparator for the priority queue.
+        auto pq_comp = [](const pq_entry &lhs, const pq_entry &rhs) {
+            auto left_key = lhs.iterator->first;
+            auto right_key = rhs.iterator->first;
+
+            // We compare in the opposite direction than normal because priority
+            // queues normally order from largest to smallest, but we want
+            // smallest to largest.
+            return left_key > right_key;
+        };
+
+        // Create and populate the priority queue.
+        std::priority_queue<pq_entry, std::vector<pq_entry>, decltype(pq_comp)> pq(pq_comp);
+        for (size_t i = 0; i < n; ++i) {
+            const auto &roarings = inputs[i]->roarings;
+            if (roarings.begin() != roarings.end()) {
+                pq.push({roarings.begin(), roarings.end()});
+            }
         }
-        return ans;
+
+        // A reusable vector that holds the pointers to the inner bitmaps that
+        // we pass to the underlying 32-bit fastunion operation.
+        std::vector<const roaring_bitmap_t*> group_bitmaps;
+
+        // Summary of the algorithm:
+        // 1. While the priority queue is not empty:
+        //    A. Get its lowest key. Call this group_key
+        //    B. While the lowest entry in the priority queue has a key equal to
+        //       group_key:
+        //       1. Remove this entry (the pair {current_iter, end_iter}) from
+        //          the priority queue.
+        //       2. Add the bitmap pointed to by current_iter to a list of
+        //          32-bit bitmaps to process.
+        //       3. Advance current_iter. Now it will point to a bitmap entry
+        //          with some key greater than group_key (or it will point to
+        //          end()).
+        //       4. If current_iter != end_iter, reinsert the pair into the
+        //          priority queue.
+        //    C. Invoke the 32-bit roaring_bitmap_or_many() and add to result
+        Roaring64Map result;
+        while (!pq.empty()) {
+            // Find the next key (the lowest key) in the priority queue.
+            auto group_key = pq.top().iterator->first;
+
+            // The purpose of the inner loop is to gather all the inner bitmaps
+            // that share "group_key" into "group_bitmaps" so that they can be
+            // fed to roaring_bitmap_or_many(). While we are doing this, we
+            // advance those iterators to their next value and reinsert them
+            // into the priority queue (unless they reach their end).
+            group_bitmaps.clear();
+            while (!pq.empty()) {
+                auto candidate_current_iter = pq.top().iterator;
+                auto candidate_end_iter = pq.top().end;
+
+                auto candidate_key = candidate_current_iter->first;
+                const auto &candidate_bitmap = candidate_current_iter->second;
+
+                // This element will either be in the group (having
+                // key == group_key) or it will not be in the group (having
+                // key > group_key). (Note it cannot have key < group_key
+                // because of the ordered nature of the priority queue itself
+                // and the ordered nature of all the underlying roaring maps).
+                if (candidate_key != group_key) {
+                    // This entry, and (thanks to the nature of the priority
+                    // queue) all other entries as well, are all greater than
+                    // group_key, so we're done collecting elements for the
+                    // current group. Because of the way this loop was written,
+                    // the group will always contain at least one element.
+                    break;
+                }
+
+                group_bitmaps.push_back(&candidate_bitmap.roaring);
+                // Remove this entry from the priority queue. Note this
+                // invalidates pq.top() so make sure you don't have any dangling
+                // references to it.
+                pq.pop();
+
+                // Advance 'candidate_current_iter' and insert a new entry
+                // {candidate_current_iter, candidate_end_iter} into the
+                // priority queue (unless it has reached its end).
+                ++candidate_current_iter;
+                if (candidate_current_iter != candidate_end_iter) {
+                    pq.push({candidate_current_iter, candidate_end_iter});
+                }
+            }
+
+            // Use the fast inner union to combine these.
+            auto *inner_result = roaring_bitmap_or_many(group_bitmaps.size(),
+                group_bitmaps.data());
+            // Insert the 32-bit result at end of the 'roarings' map of the
+            // result we are building.
+            result.roarings.insert(result.roarings.end(),
+                std::make_pair(group_key, Roaring(inner_result)));
+        }
+        return result;
     }
 
     friend class Roaring64MapSetBitForwardIterator;
@@ -947,12 +1549,13 @@ public:
     const_iterator end() const;
 
 private:
-    std::map<uint32_t, Roaring> roarings{}; // The empty constructor silences warnings from pedantic static analyzers.
+    typedef std::map<uint32_t, Roaring> roarings_t;
+    roarings_t roarings{}; // The empty constructor silences warnings from pedantic static analyzers.
     bool copyOnWrite{false};
-    static uint32_t highBytes(const uint64_t in) { return uint32_t(in >> 32); }
-    static uint32_t lowBytes(const uint64_t in) { return uint32_t(in); }
-    static uint64_t uniteBytes(const uint32_t highBytes,
-                               const uint32_t lowBytes) {
+    static constexpr uint32_t highBytes(const uint64_t in) { return uint32_t(in >> 32); }
+    static constexpr uint32_t lowBytes(const uint64_t in) { return uint32_t(in); }
+    static constexpr uint64_t uniteBytes(const uint32_t highBytes,
+                                         const uint32_t lowBytes) {
         return (uint64_t(highBytes) << 32) | uint64_t(lowBytes);
     }
     // this is needed to tolerate gcc's C++11 libstdc++ lacking emplace
@@ -972,6 +1575,102 @@ private:
         roarings.emplace(key, std::move(value));
 #endif
     }
+
+    /*
+     * Look up 'key' in the 'roarings' map. If it does not exist, create it.
+     * Also, set its copyOnWrite flag to 'copyOnWrite'. Then return a reference
+     * to the (already existing or newly created) inner bitmap.
+     */
+    Roaring &lookupOrCreateInner(uint32_t key) {
+        auto &bitmap = roarings[key];
+        bitmap.setCopyOnWrite(copyOnWrite);
+        return bitmap;
+    }
+
+    /**
+     * Prints the contents of the bitmap to a caller-provided sink function.
+     */
+    void printToSink(const std::function<void(const std::string &)> &sink) const {
+        sink("{");
+
+        // Storage for snprintf. Big enough to store the decimal representation
+        // of the largest uint64_t value and trailing \0.
+        char buffer[32];
+        const char *separator = "";
+        // Reusable, and therefore avoids many repeated heap allocations.
+        std::string callback_string;
+        for (const auto &entry : roarings) {
+            auto high_bits = entry.first;
+            const auto &bitmap = entry.second;
+            for (const auto low_bits : bitmap) {
+                auto value = uniteBytes(high_bits, low_bits);
+                snprintf(buffer, sizeof(buffer), "%" PRIu64, value);
+                callback_string = separator;
+                callback_string.append(buffer);
+                sink(callback_string);
+                separator = ",";
+            }
+        }
+        sink("}");
+    }
+
+    /**
+     * Ensures that every key in the closed interval [start_high, end_high]
+     * refers to a Roaring bitmap rather than being an empty slot. Inserts empty
+     * Roaring bitmaps if necessary. The interval must be valid and non-empty.
+     * Returns an iterator to the bitmap at start_high.
+     */
+    roarings_t::iterator ensureRangePopulated(uint32_t start_high,
+                                              uint32_t end_high) {
+        if (start_high > end_high) {
+            ROARING_TERMINATE("Logic error: start_high > end_high");
+        }
+        // next_populated_iter points to the first entry in the outer map with
+        // key >= start_high, or end().
+        auto next_populated_iter = roarings.lower_bound(start_high);
+
+        // Use uint64_t to avoid an infinite loop when end_high == uint32_max.
+        roarings_t::iterator start_iter{};  // Definitely assigned in loop.
+        for (uint64_t slot = start_high; slot <= end_high; ++slot) {
+            roarings_t::iterator slot_iter;
+            if (next_populated_iter != roarings.end() &&
+                next_populated_iter->first == slot) {
+                // 'slot' index has caught up to next_populated_iter.
+                // Note it here and advance next_populated_iter.
+                slot_iter = next_populated_iter++;
+            } else {
+                // 'slot' index has not yet caught up to next_populated_iter.
+                // Make a fresh entry {key = 'slot', value = Roaring()}, insert
+                // it just prior to next_populated_iter, and set its copy
+                // on write flag. We take pains to use emplace_hint and
+                // piecewise_construct to minimize effort.
+                slot_iter = roarings.emplace_hint(
+                    next_populated_iter, std::piecewise_construct,
+                    std::forward_as_tuple(uint32_t(slot)),
+                    std::forward_as_tuple());
+                auto &bitmap = slot_iter->second;
+                bitmap.setCopyOnWrite(copyOnWrite);
+            }
+
+            // Make a note of the iterator of the starting slot. It will be
+            // needed for the return value.
+            if (slot == start_high) {
+                start_iter = slot_iter;
+            }
+        }
+        return start_iter;
+    }
+
+    /**
+     * Erases the entry pointed to by 'iter' from the 'roarings' map if its
+     * bitmap is empty. Warning: in that case 'iter' is invalidated.
+     */
+    void eraseIfEmpty(roarings_t::iterator iter) {
+        const auto &bitmap = iter->second;
+        if (bitmap.isEmpty()) {
+            roarings.erase(iter);
+        }
+    }
 };
 
 /**
@@ -981,7 +1680,7 @@ class Roaring64MapSetBitForwardIterator {
 public:
     typedef std::forward_iterator_tag iterator_category;
     typedef uint64_t *pointer;
-    typedef uint64_t &reference_type;
+    typedef uint64_t &reference;
     typedef uint64_t value_type;
     typedef int64_t difference_type;
     typedef Roaring64MapSetBitForwardIterator type_of_iterator;
diff --git a/doxygen b/doxygen
new file mode 100644
index 000000000..571e70754
--- /dev/null
+++ b/doxygen
@@ -0,0 +1,2741 @@
+# Doxyfile 1.9.6
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+#
+# Note:
+#
+# Use doxygen to compare the used configuration file with the template
+# configuration file:
+# doxygen -x [configFile]
+# Use doxygen to compare the used configuration file with the template
+# configuration file without replacing the environment variables or CMake type
+# replacement variables:
+# doxygen -x_noenv [configFile]
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "CRoaring"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         = "2.0.1"
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          = "Roaring bitmaps in C (and C++)"
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = "docs"
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096
+# sub-directories (in 2 levels) under the output directory of each output format
+# and will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise cause
+# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to
+# control the number of sub-directories.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = YES
+
+# Controls the number of sub-directories that will be created when
+# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every
+# level increment doubles the number of directories, resulting in 4096
+# directories at level 8 which is the default and also the maximum value. The
+# sub-directories are organized in 2 levels, the first level always has a fixed
+# number of 16 directories.
+# Minimum value: 0, maximum value: 8, default value: 8.
+# This tag requires that the tag CREATE_SUBDIRS is set to YES.
+
+CREATE_SUBDIRS_LEVEL   = 8
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian,
+# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English
+# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek,
+# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with
+# English messages), Korean, Korean-en (Korean with English messages), Latvian,
+# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese,
+# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish,
+# Swedish, Turkish, Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful if your file system doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO,
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks are shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 2
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:^^"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note: see also the list of default file extension mappings.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = YES
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = YES
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespaces
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = YES
+
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# will also hide undocumented C++ concepts if enabled. This option has no effect
+# if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# Possible values are: SYSTEM, NO and YES.
+# The default value is: SYSTEM.
+
+CASE_SENSE_NAMES       = SYSTEM
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE        = YES
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = NO
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = NO
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = NO
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= NO
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = YES
+
+# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about
+# undocumented enumeration values. If set to NO, doxygen will accept
+# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: NO.
+
+WARN_IF_UNDOC_ENUM_VAL = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# See also: WARN_LINE_FORMAT
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# In the $text part of the WARN_FORMAT command it is possible that a reference
+# to a more specific place is given. To make it easier to jump to this place
+# (outside of doxygen) the user can define a custom "cut" / "paste" string.
+# Example:
+# WARN_LINE_FORMAT = "'vi $file +$line'"
+# See also: WARN_FORMAT
+# The default value is: at line $line of file $file.
+
+WARN_LINE_FORMAT       = "at line $line of file $file"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When - is specified
+# as the file, the warning and error messages are written to standard output
+# (stdout).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = README.md cpp/roaring.hh cpp/roaring64map.hh
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# See also: INPUT_FILE_ENCODING
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. The INPUT_FILE_ENCODING tag can be used to specify
+# character encoding on a per file pattern basis. Doxygen will compare the file
+# name with each pattern and apply the encoding (instead of the default
+# INPUT_ENCODING) if there is a match. The character encodings are a list of the
+# form: pattern=encoding (like *.php=ISO-8859-1). See INPUT_ENCODING for further
+# information on supported encodings.
+
+INPUT_FILE_ENCODING    =
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested: *.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.l \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f18 \
+                         *.f \
+                         *.for \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf \
+                         *.ice
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = benchmarks, tests, Testing, tools, build, docs
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = "*/test/*"
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against symbol names, not file paths; to
+# exclude files or directories by path use EXCLUDE or EXCLUDE_PATTERNS instead.
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that doxygen will use the data processed and written to standard output
+# for further processing, therefore nothing else, like debug statements or used
+# commands (so in case of a Windows batch file always use @echo OFF), should be
+# written to standard output.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE = README.md
+
+# The Fortran standard specifies that for fixed formatted Fortran code all
+# characters from position 72 are to be considered as comment. A common
+# extension is to allow longer lines before the automatic comment starts. The
+# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can
+# be processed before the automatic comment starts.
+# Minimum value: 7, maximum value: 10000, default value: 72.
+
+FORTRAN_COMMENT_AFTER  = 72
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = NO
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes)
+# that should be ignored while generating the index headers. The IGNORE_PREFIX
+# tag works for classes, function and member names. The entity will be placed in
+# the alphabetical list under the first letter of the entity name that remains
+# after removing the prefix.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g. the
+# setting GENERATE_TREEVIEW). It is highly recommended to start with a default
+# header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# Note: Since the styling of scrollbars can currently not be overruled in
+# Webkit/Chromium, the styling will be left out of the default doxygen.css if
+# one or more extra stylesheets have been specified. So if scrollbar
+# customization is desired it has to be added explicitly. For an example see the
+# documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = theme/doxygen-awesome.css \
+                         theme/doxygen-awesome-sidebar-only.css \
+                         theme/doxygen-awesome-sidebar-only-darkmode-toggle.css
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       = theme/doxygen-awesome-darkmode-toggle.js \
+                         theme/doxygen-awesome-interactive-toc.js \
+                         theme/doxygen-awesome-fragment-copy-button.js \
+                         theme/doxygen-awesome-paragraph-link.js
+
+# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output
+# should be rendered with a dark or light theme.
+# Possible values are: LIGHT always generate light mode output, DARK always
+# generate dark mode output, AUTO_LIGHT automatically set the mode according to
+# the user preference, use light mode if no preference is set (the default),
+# AUTO_DARK automatically set the mode according to the user preference, use
+# dark mode if no preference is set and TOGGLE allows the user to switch between
+# light and dark mode via a button.
+# The default value is: AUTO_LIGHT.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE        = LIGHT
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 209
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 255
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 113
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See
+# https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL         =
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# among other things the download links, offline (the HTML Help Workshop had
+# already been in maintenance mode for many years). You can download the HTML
+# Help Workshop installation executable from the web archives (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/download/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated; together with the HTML files they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR           = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email
+# addresses.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+OBFUSCATE_EMAILS       = YES
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION        = MathJax_2
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility. This is the name for MathJax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for MathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for MathJax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        =
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see
+# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions):
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using JavaScript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id to
+# a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when USE_PDFLATEX is not enabled the default is latex; when
+# USE_PDFLATEX is enabled the default is pdflatex, and if latex is chosen in
+# the latter case it is overwritten by pdflatex. For specific output languages
+# the default may have been set differently; this depends on the
+# implementation of the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         =
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify:
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
+#
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. It defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of
+# RECURSIVE has no effect here.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external classes will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of
+# subgraphs. When you want a differently looking font in the dot files that
+# doxygen generates you can specify fontname, fontcolor and fontsize attributes.
+# For details please see <a href=https://graphviz.org/doc/info/attrs.html>Node,
+# Edge and Graph Attributes specification</a> You need to make sure dot is able
+# to find the font, which can be done by putting it in a standard location or by
+# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font. Default graphviz fontsize is 14.
+# The default value is: fontname=Helvetica,fontsize=10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_COMMON_ATTR        = "fontname=Helvetica,fontsize=10"
+
+# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can
+# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. <a
+# href=https://graphviz.org/doc/info/arrows.html>Complete documentation about
+# arrows shapes.</a>
+# The default value is: labelfontname=Helvetica,labelfontsize=10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_EDGE_ATTR          = "labelfontname=Helvetica,labelfontsize=10"
+
+# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes
+# around nodes set 'shape=plain' or 'shape=plaintext' <a
+# href=https://www.graphviz.org/doc/info/shapes.html>Shapes specification</a>
+# The default value is: shape=box,height=0.2,width=0.4.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NODE_ATTR          = "shape=box,height=0.2,width=0.4"
+
+# You can set the path where dot can find font specified with fontname in
+# DOT_COMMON_ATTR and other dot attributes.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies. See also the chapter Grouping
+# in the manual.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = YES
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = YES
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will generate a
+# graphical hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH    = 1
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
+# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
+# The default value is: YES.
+
+DOT_CLEANUP            = YES
diff --git a/fuzz/build.sh b/fuzz/build.sh
new file mode 100755
index 000000000..21644a6e5
--- /dev/null
+++ b/fuzz/build.sh
@@ -0,0 +1,38 @@
+#!/bin/bash -eu
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+# Build libroaring with its tests disabled; -p keeps a re-run from aborting.
+mkdir -p build-dir && cd build-dir
+cmake -DENABLE_ROARING_TESTS=OFF ..
+make -j"$(nproc)"
+# Compile the C fuzz target (flag variables stay unquoted: they hold word lists).
+$CC $CFLAGS  \
+     -I"$SRC/croaring/include" \
+     -c "$SRC/croaring_fuzzer.c" -o fuzzer.o
+# Link it against the fuzzing engine and the static library built above.
+$CXX $CXXFLAGS $LIB_FUZZING_ENGINE fuzzer.o   \
+     -o "$OUT/croaring_fuzzer" "$SRC/croaring/build-dir/src/libroaring.a"
+# Compile the C++ fuzz target; it also needs the repository root for includes.
+$CXX $CFLAGS $CXXFLAGS  \
+     -I"$SRC/croaring/include" \
+     -I"$SRC/croaring" \
+     -c "$SRC/croaring_fuzzer_cc.cc" -o fuzzer_cc.o
+# Link it against the fuzzing engine and the static library built above.
+$CXX $CXXFLAGS $LIB_FUZZING_ENGINE fuzzer_cc.o   \
+     -o "$OUT/croaring_fuzzer_cc" "$SRC/croaring/build-dir/src/libroaring.a"
+# Seed corpus for croaring_fuzzer: the *bin files from the test data directory.
+zip "$OUT/croaring_fuzzer_seed_corpus.zip" "$SRC"/croaring/tests/testdata/*bin
+cp "$SRC"/croaring/tests/testdata/*bin "$OUT/"
diff --git a/fuzz/croaring_fuzzer.c b/fuzz/croaring_fuzzer.c
new file mode 100644
index 000000000..1e62749a9
--- /dev/null
+++ b/fuzz/croaring_fuzzer.c
@@ -0,0 +1,50 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "roaring/roaring.h"
+
+int LLVMFuzzerTestOneInput(const char *data, size_t size) {
+    // Fuzz entry point: the safe deserializer must cope with arbitrary bytes;
+    // it is allowed to reject them by returning NULL.
+    roaring_bitmap_t *parsed =
+        roaring_bitmap_portable_deserialize_safe(data, size);
+    if (parsed == NULL) {
+        return 0;
+    }
+    // A non-NULL result may still violate the format specification, so only
+    // bitmaps that pass internal validation are exercised further.
+    const char *why_invalid = NULL;
+    if (roaring_bitmap_internal_validate(parsed, &why_invalid)) {
+        // Add every value in [100, 1000), counting those not already present.
+        uint32_t expected = roaring_bitmap_get_cardinality(parsed);
+        for (uint32_t value = 100; value < 1000; value++) {
+            if (!roaring_bitmap_contains(parsed, value)) {
+                expected++;
+                roaring_bitmap_add(parsed, value);
+            }
+        }
+        if (expected != (uint32_t)roaring_bitmap_get_cardinality(parsed)) {
+            printf("bug\n");
+            exit(1);
+        }
+    }
+    roaring_bitmap_free(parsed);
+    return 0;
+}
diff --git a/fuzz/croaring_fuzzer.options b/fuzz/croaring_fuzzer.options
new file mode 100644
index 000000000..7ca5e76f5
--- /dev/null
+++ b/fuzz/croaring_fuzzer.options
@@ -0,0 +1,2 @@
+[libfuzzer]
+close_fd_mask = 2
diff --git a/fuzz/croaring_fuzzer_cc.cc b/fuzz/croaring_fuzzer_cc.cc
new file mode 100644
index 000000000..fe2efda1c
--- /dev/null
+++ b/fuzz/croaring_fuzzer_cc.cc
@@ -0,0 +1,160 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cpp/roaring.hh"
+#include "fuzzer/FuzzedDataProvider.h"
+#include <vector>
+
+// Returns `length` integers, each drawn from fdp within the given bounds.
+std::vector<uint32_t> ConsumeVecInRange(FuzzedDataProvider &fdp, size_t length,
+                                        uint32_t min_value,
+                                        uint32_t max_value) {
+  std::vector<uint32_t> values(length);
+  for (uint32_t &slot : values) {
+    slot = fdp.ConsumeIntegralInRange<uint32_t>(min_value, max_value);
+  }
+  return values;
+}
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+  /**
+   * Fuzz entry point: exercises the roaring::Roaring C++ API on fuzz input.
+   *
+   * A bitmap may contain up to 2**32 elements. Later this function will
+   * output the content to an array where each element uses 32 bits of storage.
+   * Unbounded, that would use 16 GB, so it would be bound to run out of memory.
+   *
+   * Even without that array, a bitmap may still use over 512 MB in the normal
+   * course of operation, since it can represent any set of integers in
+   * [0,2**32). This function may hold several bitmaps in memory at once, so it
+   * could require gigabytes of memory without any bug being present.
+   *
+   * For sanity, we limit the range to 10,000,000, which will use 38 MB or so.
+   * With such a limited range, if we run out of memory, then we can be almost
+   * certain that it has to do with a genuine bug.
+   */
+
+  uint32_t range_start = 0;
+  uint32_t range_end = 10'000'000;
+
+  /**
+   * We are not solely dependent on the range [range_start, range_end) because
+   * ConsumeVecInRange below produces integers in a small range starting at 0.
+   */
+
+  FuzzedDataProvider fdp(data, size);
+  /**
+   * Build bitmap a from 500 fuzz-chosen values, each bounded by 0 and 1000
+   * (repeats are possible). NOTE(review): this comment used to say "the next
+   * line was ConsumeVecInRange(fdp, 500, 0, 1000)", which is the current call.
+   */
+  std::vector<uint32_t> bitmap_data_a = ConsumeVecInRange(fdp, 500, 0, 1000);
+  roaring::Roaring a(bitmap_data_a.size(), bitmap_data_a.data());
+  a.runOptimize();
+  a.shrinkToFit();
+
+  std::vector<uint32_t> bitmap_data_b = ConsumeVecInRange(fdp, 500, 0, 1000);
+  roaring::Roaring b(bitmap_data_b.size(), bitmap_data_b.data());
+  b.runOptimize();
+  b.add(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.addChecked(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.addRange(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end), fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  // Add the first half of bitmap_data_a to b.
+  b.addMany(bitmap_data_a.size() / 2, bitmap_data_a.data());
+  b.remove(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.removeChecked(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.removeRange(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end),
+                fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.removeRangeClosed(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end),
+                      fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.maximum();
+  b.minimum();
+  b.contains(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.containsRange(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end),
+                  fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+
+  uint32_t element = 0;
+  a.select(fdp.ConsumeIntegralInRange<uint32_t>(0, 1000), &element);
+  a.intersect(b);
+  a.jaccard_index(b);
+  a.or_cardinality(b);
+  a.andnot_cardinality(b);
+  a.xor_cardinality(b);
+  a.rank(fdp.ConsumeIntegralInRange<uint32_t>(0, 5000));
+  a.getSizeInBytes();
+
+  roaring::Roaring c = a & b;
+  roaring::Roaring d = a - b;
+  roaring::Roaring e = a | b;
+  roaring::Roaring f = a ^ b;
+  a |= e;
+  a &= b;
+  a -= c;
+  a ^= f;
+
+  volatile bool is_equal = (a == b);
+
+  std::vector<uint32_t> b_as_array = {0};
+  b_as_array.resize(b.cardinality());
+  b.isEmpty();
+  b.toUint32Array(b_as_array.data());
+
+  a.isSubset(b);
+  a.isStrictSubset(b);
+  b.flip(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end), fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.flipClosed(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end),
+               fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+  b.removeRunCompression();
+
+  // Copy and move constructors
+  roaring::Roaring copied = b;
+  roaring::Roaring moved = std::move(b);
+
+  // Assignment operators (copy, then move)
+  b = copied;
+  b = std::move(moved);
+
+  // Safe read from fuzz-provided bytes (100 requested); exceptions are ignored.
+  std::vector<char> read_buffer = fdp.ConsumeBytes<char>(100);
+  try {
+    roaring::Roaring read_safely =
+        roaring::Roaring::readSafe(read_buffer.data(), read_buffer.size());
+    // The above is guaranteed to be safe. However, read_safely is maybe
+    // in an improper state and it cannot be used safely (including for
+    // reserialization).
+  } catch(...) {}
+
+  // The bitmap b can be serialized and re-read.
+  std::size_t expected_size_in_bytes = b.getSizeInBytes();
+  std::vector<char> buffer(expected_size_in_bytes);
+  std::size_t size_in_bytes = b.write(buffer.data());
+  assert(expected_size_in_bytes == size_in_bytes);
+  roaring::Roaring bread = roaring::Roaring::readSafe(buffer.data(), size_in_bytes);
+  assert(bread == b);
+
+  f.toString();
+
+  volatile int unused = 0;
+
+  for (roaring::Roaring::const_iterator i = a.begin(); i != a.end(); i++) {
+    unused++;
+  }
+
+  roaring::Roaring::const_iterator b_iter = b.begin();
+  b_iter.equalorlarger(fdp.ConsumeIntegralInRange<uint32_t>(range_start, range_end));
+
+  return 0;
+}
diff --git a/include/roaring/array_util.h b/include/roaring/array_util.h
index e0be2f70e..d9baa2b3b 100644
--- a/include/roaring/array_util.h
+++ b/include/roaring/array_util.h
@@ -6,6 +6,12 @@
 
 #include <roaring/portability.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace internal {
 #endif
@@ -89,7 +95,7 @@ static inline int32_t advanceUntil(const uint16_t *array, int32_t pos,
 }
 
 /**
- * Returns number of elements which are less then $ikey.
+ * Returns number of elements which are less than ikey.
  * Array elements must be unique and sorted.
  */
 static inline int32_t count_less(const uint16_t *array, int32_t lenarray,
@@ -100,7 +106,7 @@ static inline int32_t count_less(const uint16_t *array, int32_t lenarray,
 }
 
 /**
- * Returns number of elements which are greater then $ikey.
+ * Returns number of elements which are greater than ikey.
  * Array elements must be unique and sorted.
  */
 static inline int32_t count_greater(const uint16_t *array, int32_t lenarray,
@@ -125,6 +131,19 @@ int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
                            const uint16_t *__restrict__ B, size_t s_b,
                            uint16_t *C);
 
+int32_t intersect_vector16_inplace(uint16_t *__restrict__ A, size_t s_a,
+                           const uint16_t *__restrict__ B, size_t s_b);
+
+/**
+ * Take an array container and write it out to a 32-bit array, using base
+ * as the offset.
+ */
+int array_container_to_uint32_array_vector16(void *vout, const uint16_t* array, size_t cardinality,
+                                    uint32_t base);
+#if CROARING_COMPILER_SUPPORTS_AVX512
+int avx512_array_container_to_uint32_array(void *vout, const uint16_t* array, size_t cardinality,
+                                    uint32_t base);
+#endif
 /**
  * Compute the cardinality of the intersection using SSE4 instructions
  */
diff --git a/include/roaring/bitset/bitset.h b/include/roaring/bitset/bitset.h
new file mode 100644
index 000000000..011702bc8
--- /dev/null
+++ b/include/roaring/bitset/bitset.h
@@ -0,0 +1,283 @@
+#ifndef CBITSET_BITSET_H
+#define CBITSET_BITSET_H
+
+// For compatibility with MSVC with the use of `restrict`
+#if (__STDC_VERSION__ >= 199901L) || \
+    (defined(__GNUC__) && defined(__STDC_VERSION__))
+#define CBITSET_RESTRICT restrict
+#else
+#define CBITSET_RESTRICT
+#endif  // (__STDC_VERSION__ >= 199901L) || (defined(__GNUC__) &&
+        // defined(__STDC_VERSION__ ))
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <roaring/portability.h>
+
+#ifdef __cplusplus
+extern "C" { namespace roaring { namespace api {
+#endif
+
+struct bitset_s {
+    uint64_t *CBITSET_RESTRICT array; /* backing buffer of 64-bit words */
+    /* For simplicity and performance, sizes are always whole 64-bit words:
+     * arraysize and capacity both count words, never individual bits. */
+    size_t arraysize; /* words in use; bit i lives in array[i / 64] */
+    size_t capacity; /* words allocated */
+};
+
+typedef struct bitset_s bitset_t;
+
+/* Create a new bitset. Return NULL in case of failure. */
+bitset_t *bitset_create(void);
+
+/* Create a new bitset able to contain size bits. Return NULL in case of
+ * failure. */
+bitset_t *bitset_create_with_capacity(size_t size);
+
+/* Free memory. */
+void bitset_free(bitset_t *bitset);
+
+/* Set all bits to zero. */
+void bitset_clear(bitset_t *bitset);
+
+/* Set all bits to one. */
+void bitset_fill(bitset_t *bitset);
+
+/* Create a copy */
+bitset_t *bitset_copy(const bitset_t *bitset);
+
+/* For advanced users: Resize the bitset so that it can support newarraysize * 64 bits.
+ * Return true in case of success, false for failure. Pad
+ * with zeroes new buffer areas if requested. */
+bool bitset_resize(bitset_t *bitset, size_t newarraysize, bool padwithzeroes);
+
+/* Size of the backing buffer in bytes: sizeof(uint64_t) per word in use. */
+inline size_t bitset_size_in_bytes(const bitset_t *bitset) {
+    return sizeof(uint64_t) * bitset->arraysize;
+}
+
+/* Number of addressable bits: 64 for every word in use. */
+inline size_t bitset_size_in_bits(const bitset_t *bitset) {
+    return 64 * bitset->arraysize;
+}
+
+/* Returns how many 64-bit words the backing buffer uses (the arraysize field). */
+inline size_t bitset_size_in_words(const bitset_t *bitset) {
+    return bitset->arraysize;
+}
+
+/* For advanced users: Grow the bitset so that it can support newarraysize * 64 bits with padding.
+ * Return true in case of success, false for failure. */
+bool bitset_grow(bitset_t *bitset, size_t newarraysize);
+
+/* attempts to recover unused memory, return false in case of reallocation
+ * failure */
+bool bitset_trim(bitset_t *bitset);
+
+/* shifts all bits by 's' positions so that the bitset representing values
+ * 1,2,10 would represent values 1+s, 2+s, 10+s */
+void bitset_shift_left(bitset_t *bitset, size_t s);
+
+/* shifts all bits by 's' positions so that the bitset representing values
+ * 1,2,10 would represent values 1-s, 2-s, 10-s, negative values are deleted */
+void bitset_shift_right(bitset_t *bitset, size_t s);
+
+/* Turn on bit i. When i lies beyond the current words, the bitset is grown
+ * first; if growing fails the call returns without setting anything. */
+inline void bitset_set(bitset_t *bitset, size_t i) {
+    const size_t word = i >> 6; /* index of the 64-bit word holding bit i */
+    const uint64_t bit = ((uint64_t)1) << (i & 63);
+    /* bitset_grow is only attempted when the word is out of range. */
+    if (word >= bitset->arraysize && !bitset_grow(bitset, word + 1)) {
+        return;
+    }
+    bitset->array[word] |= bit;
+}
+
+/* Force bit i to `flag`. When i lies beyond the current words the bitset is
+ * grown first (even when flag is false); if growing fails the call returns
+ * without changing anything. */
+inline void bitset_set_to_value(bitset_t *bitset, size_t i, bool flag) {
+    const size_t word = i >> 6;
+    const unsigned offset = (unsigned)(i & 63);
+    if (word >= bitset->arraysize && !bitset_grow(bitset, word + 1)) {
+        return;
+    }
+    /* No branch on flag: clear the target bit, then OR in flag (0 or 1)
+     * shifted to the same position. */
+    const uint64_t cleared =
+        bitset->array[word] & ~(((uint64_t)1) << offset);
+    const uint64_t value = ((uint64_t)flag) << offset;
+    bitset->array[word] = cleared | value;
+}
+
+/* Report whether bit i is set; bits beyond the stored words read as false. */
+inline bool bitset_get(const bitset_t *bitset, size_t i) {
+    const size_t word = i >> 6;
+    if (word < bitset->arraysize) {
+        return ((bitset->array[word] >> (i & 63)) & 1) != 0;
+    }
+    return false;
+}
+
+/* Count number of bits set.  */
+size_t bitset_count(const bitset_t *bitset);
+
+/* Find the index of the first bit set. Or zero if the bitset is empty.  */
+size_t bitset_minimum(const bitset_t *bitset);
+
+/* Find the index of the last bit set. Or zero if the bitset is empty.  */
+size_t bitset_maximum(const bitset_t *bitset);
+
+/* compute the union in-place (to b1), returns true if successful, to generate a
+ * new bitset first call bitset_copy */
+bool bitset_inplace_union(bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2);
+
+/* report the size of the union (without materializing it) */
+size_t bitset_union_count(const bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the intersection in-place (to b1), to generate a new bitset first
+ * call bitset_copy */
+void bitset_inplace_intersection(bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2);
+
+/* report the size of the intersection (without materializing it) */
+size_t bitset_intersection_count(const bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2);
+
+/* returns true if the bitsets contain no common elements */
+bool bitsets_disjoint(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2);
+
+/* returns true if the bitsets contain any common elements */
+bool bitsets_intersect(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2);
+
+/* returns true if b1 contains all of the set bits of b2 */
+bool bitset_contains_all(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the difference in-place (to b1), to generate a new bitset first call
+ * bitset_copy */
+void bitset_inplace_difference(bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the size of the difference */
+size_t bitset_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the symmetric difference in-place (to b1), return true if successful,
+ * to generate a new bitset first call bitset_copy */
+bool bitset_inplace_symmetric_difference(bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2);
+
+/* compute the size of the symmetric difference  */
+size_t bitset_symmetric_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2);
+
+/* Advance *i to the first set bit at position >= *i. Returns true when such
+ * a bit exists; otherwise returns false and leaves *i untouched. Usage:
+ *   for (size_t i = 0; bitset_next_set_bit(b, &i); i++) {
+ *     //.....
+ *   }
+ */
+inline bool bitset_next_set_bit(const bitset_t *bitset, size_t *i) {
+    size_t word = *i / 64;
+    if (word >= bitset->arraysize) {
+        return false;
+    }
+    /* Drop the bits below *i, so a trailing-zero count on what remains is
+     * the distance from *i to the next set bit of this word. */
+    const uint64_t head = bitset->array[word] >> (*i & 63);
+    if (head != 0) {
+        *i += roaring_trailing_zeroes(head);
+        return true;
+    }
+    /* Nothing left in the first word: look for the next nonzero word. */
+    for (++word; word < bitset->arraysize; ++word) {
+        const uint64_t w = bitset->array[word];
+        if (w != 0) {
+            *i = word * 64 + roaring_trailing_zeroes(w);
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Write up to `capacity` set-bit positions, starting at *startfrom (inclusive),
+ * into `buffer` and return how many were written. When that count is nonzero,
+ * *startfrom becomes the last position written, so increment it between calls:
+ *   size_t buffer[256];
+ *   size_t howmany = 0;
+ *   for (size_t startfrom = 0;
+ *        (howmany = bitset_next_set_bits(b, buffer, 256, &startfrom)) > 0;
+ *        startfrom++) {
+ *     //.....
+ *   }
+ */
+inline size_t bitset_next_set_bits(const bitset_t *bitset, size_t *buffer,
+                                   size_t capacity, size_t *startfrom) {
+    if (capacity == 0) return 0;  // sanity check
+    size_t x = *startfrom / 64;
+    if (x >= bitset->arraysize) {
+        return 0;  // nothing more to iterate over
+    }
+    // Clear (do not shift out) the bits below *startfrom: positions are
+    // reported as r + base, so every bit must keep its index in the word.
+    uint64_t w = bitset->array[x];
+    w &= ~((((uint64_t)1) << (*startfrom & 63)) - 1);
+    size_t howmany = 0;
+    size_t base = x << 6;
+    while (howmany < capacity) {
+        while (w != 0) {
+            uint64_t t = w & (~w + 1);  // lowest set bit of w
+            int r = roaring_trailing_zeroes(w);
+            buffer[howmany++] = r + base;
+            if (howmany == capacity) goto end;
+            w ^= t;  // drop the bit that was just reported
+        }
+        x += 1;
+        if (x == bitset->arraysize) break;
+        base += 64;
+        w = bitset->array[x];
+    }
+end:
+    if (howmany > 0) *startfrom = buffer[howmany - 1];
+    return howmany;
+}
+
+typedef bool (*bitset_iterator)(size_t value, void *param);
+
+// Call iterator(position, ptr) for every set bit in increasing order. Stops
+// and returns false as soon as the callback returns false; true otherwise.
+inline bool bitset_for_each(const bitset_t *b, bitset_iterator iterator,
+                            void *ptr) {
+    for (size_t k = 0; k < b->arraysize; ++k) {
+        const size_t base = k * 64;  // position of bit 0 of word k
+        // w &= w - 1 clears the lowest set bit once it has been reported
+        for (uint64_t w = b->array[k]; w != 0; w &= w - 1) {
+            const int r = roaring_trailing_zeroes(w);
+            if (!iterator(r + base, ptr)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+/* Write the set bit positions to stdout as "{p0, p1, }" (no newline). */
+inline void bitset_print(const bitset_t *b) {
+    size_t pos = 0;
+    printf("{");
+    for (; bitset_next_set_bit(b, &pos); pos++) printf("%zu, ", pos);
+    printf("}");
+}
+
+#ifdef __cplusplus
+} } } // extern "C" { namespace roaring { namespace api {
+#endif
+
+#endif
diff --git a/include/roaring/bitset_util.h b/include/roaring/bitset_util.h
index 0eea94edd..32bc6798a 100644
--- a/include/roaring/bitset_util.h
+++ b/include/roaring/bitset_util.h
@@ -6,6 +6,12 @@
 #include <roaring/portability.h>
 #include <roaring/utilasm.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace internal {
 #endif
@@ -40,16 +46,16 @@ static inline int bitset_lenrange_cardinality(const uint64_t *words,
     uint32_t firstword = start / 64;
     uint32_t endword = (start + lenminusone) / 64;
     if (firstword == endword) {
-        return hamming(words[firstword] &
+        return roaring_hamming(words[firstword] &
                        ((~UINT64_C(0)) >> ((63 - lenminusone) % 64))
                            << (start % 64));
     }
-    int answer = hamming(words[firstword] & ((~UINT64_C(0)) << (start % 64)));
+    int answer = roaring_hamming(words[firstword] & ((~UINT64_C(0)) << (start % 64)));
     for (uint32_t i = firstword + 1; i < endword; i++) {
-        answer += hamming(words[i]);
+        answer += roaring_hamming(words[i]);
     }
     answer +=
-        hamming(words[endword] &
+        roaring_hamming(words[endword] &
                 (~UINT64_C(0)) >> (((~start + 1) - lenminusone - 1) % 64));
     return answer;
 }
@@ -154,6 +160,9 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
                                    uint32_t *out, size_t outcapacity,
                                    uint32_t base);
 
+size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length, 
+                                   uint32_t *out, size_t outcapacity, 
+                                   uint32_t base);
 /*
  * Given a bitset containing "length" 64-bit words, write out the position
  * of all the set bits to "out", values start at "base".
@@ -186,6 +195,10 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
                                          uint16_t *out, size_t outcapacity,
                                          uint16_t base);
 
+size_t bitset_extract_setbits_avx512_uint16(const uint64_t *words, size_t length,
+                                         uint16_t *out, size_t outcapacity, 
+                                         uint16_t base);
+
 /*
  * Given a bitset containing "length" 64-bit words, write out the position
  * of all the set bits to "out",  values start at "base"
@@ -249,7 +262,7 @@ uint64_t bitset_flip_list_withcard(uint64_t *words, uint64_t card,
 
 void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length);
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 /***
  * BEGIN Harley-Seal popcount functions.
  */
@@ -287,7 +300,7 @@ static inline __m256i popcount256(__m256i v) {
     const __m256i popcnt2 = _mm256_shuffle_epi8(lookupneg, hi);
     return _mm256_sad_epu8(popcnt1, popcnt2);
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 /**
@@ -299,7 +312,7 @@ static inline void CSA(__m256i *h, __m256i *l, __m256i a, __m256i b,
     *h = _mm256_or_si256(_mm256_and_si256(a, b), _mm256_and_si256(u, c));
     *l = _mm256_xor_si256(u, c);
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 /**
@@ -363,7 +376,7 @@ inline static uint64_t avx2_harley_seal_popcount256(const __m256i *data,
            (uint64_t)(_mm256_extract_epi64(total, 2)) +
            (uint64_t)(_mm256_extract_epi64(total, 3));
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 #define AVXPOPCNTFNC(opname, avx_intrinsic)                                    \
     static inline uint64_t avx2_harley_seal_popcount256_##opname(              \
@@ -547,28 +560,147 @@ CROARING_UNTARGET_REGION
 
 CROARING_TARGET_AVX2
 AVXPOPCNTFNC(or, _mm256_or_si256)
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 AVXPOPCNTFNC(union, _mm256_or_si256)
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 AVXPOPCNTFNC(and, _mm256_and_si256)
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 AVXPOPCNTFNC(intersection, _mm256_and_si256)
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 AVXPOPCNTFNC (xor, _mm256_xor_si256)
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 AVXPOPCNTFNC(andnot, _mm256_andnot_si256)
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
+
+/* Load the i-th 512-bit vector at ptr and add its per-lane popcounts to accu. */
+#define VPOPCNT_AND_ADD(ptr, i, accu)                                     \
+    const __m512i v##i = _mm512_loadu_si512((const __m512i *)(ptr) + i);  \
+    const __m512i p##i = _mm512_popcnt_epi64(v##i);                       \
+    accu = _mm512_add_epi64(accu, p##i);
+
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+static inline uint64_t sum_epu64_256(const __m256i v) { /* sum of 4 lanes */
 
+    return (uint64_t)(_mm256_extract_epi64(v, 0))
+         + (uint64_t)(_mm256_extract_epi64(v, 1))
+         + (uint64_t)(_mm256_extract_epi64(v, 2))
+         + (uint64_t)(_mm256_extract_epi64(v, 3));
+}
+
+/* Horizontal sum of the eight 64-bit lanes of v. */
+static inline uint64_t simd_sum_epu64(const __m512i v) {
+    /* Split into 256-bit halves; sum_epu64_256 adds the four lanes of each. */
+    const __m256i lower_half = _mm512_extracti64x4_epi64(v, 0);
+    const __m256i upper_half = _mm512_extracti64x4_epi64(v, 1);
+    const uint64_t lower_sum = sum_epu64_256(lower_half);
+    return lower_sum + sum_epu64_256(upper_half);
+}
+/* Count the set bits in `size` consecutive 512-bit vectors starting at data. */
+static inline uint64_t avx512_vpopcount(const __m512i* data, const uint64_t size)
+{
+    /* Per-lane running totals; the eight lanes are only added up at the end. */
+    __m512i total = _mm512_setzero_si512();
+    const uint64_t unrolled_end = size - size % 4;
+    uint64_t k = 0;
+    /* Main loop, unrolled four vectors at a time. */
+    while (k < unrolled_end) {
+        VPOPCNT_AND_ADD(data + k, 0, total);
+        VPOPCNT_AND_ADD(data + k, 1, total);
+        VPOPCNT_AND_ADD(data + k, 2, total);
+        VPOPCNT_AND_ADD(data + k, 3, total);
+        k += 4;
+    }
+    /* Tail: the at most three vectors the unrolled loop did not cover. */
+    for (; k < size; k++) {
+        const __m512i chunk = _mm512_loadu_si512(data + k);
+        total = _mm512_add_epi64(total, _mm512_popcnt_epi64(chunk));
+    }
+    return simd_sum_epu64(total);
+}
+CROARING_UNTARGET_AVX512
+#endif
+/* Define two helpers over `size` 512-bit vectors: ..popcount512_<op> returns
+ * popcount(f(data1, data2)); ..popcount512andstore_<op> also stores f to out. */
+#define AVXPOPCNTFNC512(opname, avx_intrinsic)                                 \
+    static inline uint64_t avx512_harley_seal_popcount512_##opname(            \
+        const __m512i *data1, const __m512i *data2, const uint64_t size) {     \
+        __m512i total = _mm512_setzero_si512();                                \
+        const uint64_t limit = size - size % 4;                                \
+        uint64_t i = 0;                                                        \
+        for (; i < limit; i += 4) {                                            \
+            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i),          \
+                                       _mm512_loadu_si512(data2 + i));         \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1));          \
+            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1),      \
+                                       _mm512_loadu_si512(data2 + i + 1));     \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2));          \
+            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2),      \
+                                       _mm512_loadu_si512(data2 + i + 2));     \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3));          \
+            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3),      \
+                                       _mm512_loadu_si512(data2 + i + 3));     \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4));          \
+        }                                                                      \
+        for (; i < size; i++) {                                                \
+            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i),           \
+                                      _mm512_loadu_si512(data2 + i));          \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a));           \
+        }                                                                      \
+        return simd_sum_epu64(total);                                          \
+    }                                                                          \
+    static inline uint64_t avx512_harley_seal_popcount512andstore_##opname(    \
+        const __m512i *__restrict__ data1, const __m512i *__restrict__ data2,  \
+        __m512i *__restrict__ out, const uint64_t size) {                      \
+        __m512i total = _mm512_setzero_si512();                                \
+        const uint64_t limit = size - size % 4;                                \
+        uint64_t i = 0;                                                        \
+        for (; i < limit; i += 4) {                                            \
+            __m512i a1 = avx_intrinsic(_mm512_loadu_si512(data1 + i),          \
+                                       _mm512_loadu_si512(data2 + i));         \
+            _mm512_storeu_si512(out + i, a1);                                  \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a1));          \
+            __m512i a2 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 1),      \
+                                       _mm512_loadu_si512(data2 + i + 1));     \
+            _mm512_storeu_si512(out + i + 1, a2);                              \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a2));          \
+            __m512i a3 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 2),      \
+                                       _mm512_loadu_si512(data2 + i + 2));     \
+            _mm512_storeu_si512(out + i + 2, a3);                              \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a3));          \
+            __m512i a4 = avx_intrinsic(_mm512_loadu_si512(data1 + i + 3),      \
+                                       _mm512_loadu_si512(data2 + i + 3));     \
+            _mm512_storeu_si512(out + i + 3, a4);                              \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a4));          \
+        }                                                                      \
+        for (; i < size; i++) {                                                \
+            __m512i a = avx_intrinsic(_mm512_loadu_si512(data1 + i),           \
+                                      _mm512_loadu_si512(data2 + i));          \
+            _mm512_storeu_si512(out + i, a);                                   \
+            total = _mm512_add_epi64(total, _mm512_popcnt_epi64(a));           \
+        }                                                                      \
+        return simd_sum_epu64(total);                                          \
+    }
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+AVXPOPCNTFNC512(or, _mm512_or_si512)
+AVXPOPCNTFNC512(union, _mm512_or_si512)
+AVXPOPCNTFNC512(and, _mm512_and_si512)
+AVXPOPCNTFNC512(intersection, _mm512_and_si512)
+AVXPOPCNTFNC512(xor, _mm512_xor_si512)
+AVXPOPCNTFNC512(andnot, _mm512_andnot_si512)
+CROARING_UNTARGET_AVX512
+#endif
 /***
  * END Harley-Seal popcount functions.
  */
diff --git a/include/roaring/containers/array.h b/include/roaring/containers/array.h
index 47bd93185..3070d6e33 100644
--- a/include/roaring/containers/array.h
+++ b/include/roaring/containers/array.h
@@ -68,6 +68,7 @@ void array_container_free(array_container_t *array);
 array_container_t *array_container_clone(const array_container_t *src);
 
 /* Get the cardinality of `array'. */
+ALLOW_UNALIGNED
 static inline int array_container_cardinality(const array_container_t *array) {
     return array->cardinality;
 }
@@ -86,10 +87,6 @@ void array_container_copy(const array_container_t *src, array_container_t *dst);
 void array_container_add_from_range(array_container_t *arr, uint32_t min,
                                     uint32_t max, uint16_t step);
 
-/* Set the cardinality to zero (does not release memory). */
-static inline void array_container_clear(array_container_t *array) {
-    array->cardinality = 0;
-}
 
 static inline bool array_container_empty(const array_container_t *array) {
     return array->cardinality == 0;
@@ -161,6 +158,8 @@ void array_container_printf(const array_container_t *v);
 void array_container_printf_as_uint32_array(const array_container_t *v,
                                             uint32_t base);
 
+bool array_container_validate(const array_container_t *v, const char **reason);
+
 /**
  * Return the serialized size in bytes of a container having cardinality "card".
  */
@@ -218,6 +217,7 @@ static inline int32_t array_container_size_in_bytes(
 /**
  * Return true if the two arrays have the same content.
  */
+ALLOW_UNALIGNED
 static inline bool array_container_equals(
     const array_container_t *container1,
     const array_container_t *container2) {
@@ -369,18 +369,24 @@ void array_container_offset(const array_container_t *c,
 //* Check whether a range of values from range_start (included) to range_end (excluded) is present. */
 static inline bool array_container_contains_range(const array_container_t *arr,
                                                     uint32_t range_start, uint32_t range_end) {
-
+    const int32_t range_count = range_end - range_start;
     const uint16_t rs_included = range_start;
     const uint16_t re_included = range_end - 1;
 
-    const uint16_t *carr = (const uint16_t *) arr->array;
-
-    const int32_t start = advanceUntil(carr, -1, arr->cardinality, rs_included);
-    const int32_t end = advanceUntil(carr, start - 1, arr->cardinality, re_included);
+    // Empty range is always included
+    if (range_count <= 0) {
+        return true;
+    }
+    if (range_count > arr->cardinality) {
+        return false;
+    }
 
-    return (start < arr->cardinality) && (end < arr->cardinality)
-            && (((uint16_t)(end - start)) == re_included - rs_included)
-            && (carr[start] == rs_included) && (carr[end] == re_included);
+    const int32_t start = binarySearch(arr->array, arr->cardinality, rs_included);
+    // If this sorted array contains all items in the range:
+    // * the start item must be found
+    // * the last item in range range_count must exist, and be the expected end value
+    return (start >= 0) && (arr->cardinality >= start + range_count) &&
+           (arr->array[start + range_count - 1] == re_included);
 }
 
 /* Returns the smallest value (assumes not empty) */
@@ -406,7 +412,18 @@ inline int array_container_rank(const array_container_t *arr, uint16_t x) {
     }
 }
 
-/* Returns the index of the first value equal or smaller than x, or -1 */
+/* Returns the index of x in the array, or -1 if x is not present. */
+inline int array_container_get_index(const array_container_t *arr, uint16_t x) {
+    /* Any negative result from binarySearch is treated as "not found" and
+     * collapsed to -1; a non-negative result is the index itself. */
+    const int32_t pos = binarySearch(arr->array, arr->cardinality, x);
+    if (pos < 0) {
+        return -1;
+    }
+    return pos;
+}
+
+/* Returns the index of the first value equal or larger than x, or -1 */
 inline int array_container_index_equalorlarger(const array_container_t *arr, uint16_t x) {
     const int32_t idx = binarySearch(arr->array, arr->cardinality, x);
     const bool is_present = idx >= 0;
@@ -442,14 +459,15 @@ static inline void array_container_add_range_nvals(array_container_t *array,
 }
 
 /**
- * Adds all values in range [min,max].
+ * Adds all values in range [min,max]. This function is currently unused
+ * and is kept only as documentation.
  */
-static inline void array_container_add_range(array_container_t *array,
+/*static inline void array_container_add_range(array_container_t *array,
                                              uint32_t min, uint32_t max) {
     int32_t nvals_greater = count_greater(array->array, array->cardinality, max);
     int32_t nvals_less = count_less(array->array, array->cardinality - nvals_greater, min);
     array_container_add_range_nvals(array, min, max, nvals_less, nvals_greater);
-}
+}*/
 
 /*
  * Removes all elements array[pos] .. array[pos+count-1]
diff --git a/include/roaring/containers/bitset.h b/include/roaring/containers/bitset.h
index 2c9e53061..a27e715ae 100644
--- a/include/roaring/containers/bitset.h
+++ b/include/roaring/containers/bitset.h
@@ -77,8 +77,8 @@ static inline void bitset_container_set(bitset_container_t *bitset,
     bitset->words[offset] = load;
 }
 
-/* Unset the ith bit.  */
-static inline void bitset_container_unset(bitset_container_t *bitset,
+/* Unset the ith bit. Currently unused. Could be used for optimization. */
+/*static inline void bitset_container_unset(bitset_container_t *bitset,
                                           uint16_t pos) {
     uint64_t shift = 6;
     uint64_t offset;
@@ -87,7 +87,7 @@ static inline void bitset_container_unset(bitset_container_t *bitset,
     uint64_t load = bitset->words[offset];
     ASM_CLEAR_BIT_DEC_WAS_SET(load, p, bitset->cardinality);
     bitset->words[offset] = load;
-}
+}*/
 
 /* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower
  * than bitset_container_set.  */
@@ -142,15 +142,15 @@ static inline void bitset_container_set(bitset_container_t *bitset,
     bitset->words[pos >> 6] = new_word;
 }
 
-/* Unset the ith bit.  */
-static inline void bitset_container_unset(bitset_container_t *bitset,
+/* Unset the ith bit. Currently unused.  */
+/*static inline void bitset_container_unset(bitset_container_t *bitset,
                                           uint16_t pos) {
     const uint64_t old_word = bitset->words[pos >> 6];
     const int index = pos & 63;
     const uint64_t new_word = old_word & (~(UINT64_C(1) << index));
     bitset->cardinality -= (uint32_t)((old_word ^ new_word) >> index);
     bitset->words[pos >> 6] = new_word;
-}
+}*/
 
 /* Add `pos' to `bitset'. Returns true if `pos' was not present. Might be slower
  * than bitset_container_set.  */
@@ -232,6 +232,7 @@ static inline bool bitset_container_contains_range(const bitset_container_t *bit
 }
 
 /* Get the number of bits set */
+ALLOW_UNALIGNED
 static inline int bitset_container_cardinality(
     const bitset_container_t *bitset) {
     return bitset->cardinality;
@@ -254,19 +255,7 @@ void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
  * bitset->cardinality =  bitset_container_compute_cardinality(bitset).*/
 int bitset_container_compute_cardinality(const bitset_container_t *bitset);
 
-/* Get whether there is at least one bit set  (see bitset_container_empty for the reverse),
-   when the cardinality is unknown, it is computed and stored in the struct */
-static inline bool bitset_container_nonzero_cardinality(
-    bitset_container_t *bitset) {
-    // account for laziness
-    if (bitset->cardinality == BITSET_UNKNOWN_CARDINALITY) {
-        // could bail early instead with a nonzero result
-        bitset->cardinality = bitset_container_compute_cardinality(bitset);
-    }
-    return bitset->cardinality > 0;
-}
-
-/* Check whether this bitset is empty (see bitset_container_nonzero_cardinality for the reverse),
+/* Check whether this bitset is empty,
  *  it never modifies the bitset struct. */
 static inline bool bitset_container_empty(
     const bitset_container_t *bitset) {
@@ -315,6 +304,12 @@ int bitset_container_union(const bitset_container_t *src_1,
 int bitset_container_union_justcard(const bitset_container_t *src_1,
                                     const bitset_container_t *src_2);
 
+/* Computes the union of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_union_nocard(const bitset_container_t *src_1,
+				  const bitset_container_t *src_2,
+				  bitset_container_t *dst);
+
 /* Computes the union of bitsets `src_1' and `src_2' into `dst', but does not
  * update the cardinality. Provided to optimize chained operations. */
 int bitset_container_or_nocard(const bitset_container_t *src_1,
@@ -343,6 +338,12 @@ int bitset_container_intersection(const bitset_container_t *src_1,
 int bitset_container_intersection_justcard(const bitset_container_t *src_1,
                                            const bitset_container_t *src_2);
 
+/* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does
+ * not update the cardinality. Provided to optimize chained operations. */
+int bitset_container_intersection_nocard(const bitset_container_t *src_1,
+					 const bitset_container_t *src_2,
+					 bitset_container_t *dst);
+
 /* Computes the intersection of bitsets `src_1' and `src_2' into `dst', but does
  * not update the cardinality. Provided to optimize chained operations. */
 int bitset_container_and_nocard(const bitset_container_t *src_1,
@@ -412,6 +413,8 @@ void bitset_container_printf(const bitset_container_t *v);
 void bitset_container_printf_as_uint32_array(const bitset_container_t *v,
                                              uint32_t base);
 
+bool bitset_container_validate(const bitset_container_t *v, const char **reason);
+
 /**
  * Return the serialized size in bytes of a container.
  */
@@ -492,6 +495,9 @@ uint16_t bitset_container_maximum(const bitset_container_t *container);
 /* Returns the number of values equal or smaller than x */
 int bitset_container_rank(const bitset_container_t *container, uint16_t x);
 
+/* Returns the index of x, or -1 if x is not present. */
+int bitset_container_get_index(const bitset_container_t *container, uint16_t x);
+
 /* Returns the index of the first value equal or larger than x, or -1 */
 int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x);
 
diff --git a/include/roaring/containers/containers.h b/include/roaring/containers/containers.h
index ce8f86283..d011cc02e 100644
--- a/include/roaring/containers/containers.h
+++ b/include/roaring/containers/containers.h
@@ -55,11 +55,10 @@ extern "C" { namespace roaring { namespace internal {
  * A shared container is a wrapper around a container
  * with reference counting.
  */
-
 STRUCT_CONTAINER(shared_container_s) {
     container_t *container;
     uint8_t typecode;
-    uint32_t counter;  // to be managed atomically
+    croaring_refcount_t counter;  // to be managed atomically
 };
 
 typedef struct shared_container_s shared_container_t;
@@ -172,9 +171,10 @@ static inline bitset_container_t *container_to_bitset(
             return result;
         case SHARED_CONTAINER_TYPE:
             assert(false);
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -182,7 +182,7 @@ static inline bitset_container_t *container_to_bitset(
  * Get the container name from the typecode
  * (unused at time of writing)
  */
-static inline const char *get_container_name(uint8_t typecode) {
+/*static inline const char *get_container_name(uint8_t typecode) {
     switch (typecode) {
         case BITSET_CONTAINER_TYPE:
             return container_names[0];
@@ -194,10 +194,10 @@ static inline const char *get_container_name(uint8_t typecode) {
             return container_names[3];
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return "unknown";
     }
-}
+}*/
 
 static inline const char *get_full_container_name(
     const container_t *c, uint8_t typecode
@@ -219,16 +219,16 @@ static inline const char *get_full_container_name(
                     return shared_container_names[2];
                 default:
                     assert(false);
-                    __builtin_unreachable();
+                    roaring_unreachable;
                     return "unknown";
             }
             break;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return "unknown";
     }
-    __builtin_unreachable();
+    roaring_unreachable;
     return NULL;
 }
 
@@ -248,7 +248,7 @@ static inline int container_get_cardinality(
             return run_container_cardinality(const_CAST_run(c));
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -270,7 +270,7 @@ static inline bool container_is_full(const container_t *c, uint8_t typecode) {
             return run_container_is_full(const_CAST_run(c));
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -287,7 +287,7 @@ static inline int container_shrink_to_fit(
             return run_container_shrink_to_fit(CAST_run(c));
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -371,7 +371,7 @@ static inline container_t *container_repair_after_lazy(
             assert(false);
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -397,7 +397,7 @@ static inline int32_t container_write(
             return run_container_write(const_CAST_run(c), buf);
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -419,7 +419,7 @@ static inline int32_t container_size_in_bytes(
             return run_container_size_in_bytes(const_CAST_run(c));
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -435,6 +435,9 @@ void container_printf(const container_t *container, uint8_t typecode);
 void container_printf_as_uint32_array(const container_t *container,
                                       uint8_t typecode, uint32_t base);
 
+bool container_internal_validate(const container_t *container,
+                                 uint8_t typecode, const char **reason);
+
 /**
  * Checks whether a container is not empty, requires a  typecode
  */
@@ -452,7 +455,7 @@ static inline bool container_nonzero_cardinality(
             return run_container_nonzero_cardinality(const_CAST_run(c));
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -484,7 +487,7 @@ static inline int container_to_uint32_array(
                             output, const_CAST_run(c), base);
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return 0;  // unreached
 }
 
@@ -524,7 +527,7 @@ static inline container_t *container_add(
             return c;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -564,7 +567,7 @@ static inline container_t *container_remove(
             return c;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -587,7 +590,7 @@ static inline bool container_contains(
             return run_container_contains(const_CAST_run(c), val);
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return false;
     }
 }
@@ -614,7 +617,7 @@ static inline bool container_contains_range(
                                                     range_start, range_end);
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return false;
     }
 }
@@ -670,7 +673,7 @@ static inline bool container_equals(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return false;
     }
 }
@@ -723,7 +726,7 @@ static inline bool container_is_subset(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return false;
     }
 }
@@ -818,7 +821,7 @@ static inline container_t *container_and(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -871,7 +874,7 @@ static inline int container_and_cardinality(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return 0;
     }
 }
@@ -924,7 +927,7 @@ static inline bool container_intersect(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return 0;
     }
 }
@@ -1022,7 +1025,7 @@ static inline container_t *container_iand(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -1134,7 +1137,7 @@ static inline container_t *container_or(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;  // unreached
     }
 }
@@ -1179,7 +1182,7 @@ static inline container_t *container_lazy_or(
                                 CAST_run(result));
             *result_type = RUN_CONTAINER_TYPE;
             // we are being lazy
-            result = convert_run_to_efficient_container(
+            result = convert_run_to_efficient_container_and_free(
                 CAST_run(result), result_type);
             return result;
 
@@ -1249,7 +1252,7 @@ static inline container_t *container_lazy_or(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;  // unreached
     }
 }
@@ -1361,7 +1364,7 @@ static inline container_t *container_ior(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -1492,7 +1495,7 @@ static inline container_t *container_lazy_ior(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -1579,7 +1582,7 @@ static inline container_t* container_xor(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;  // unreached
     }
 }
@@ -1616,7 +1619,7 @@ static inline void container_add_offset(const container_t *c, uint8_t type,
         break;
     default:
         assert(false);
-        __builtin_unreachable();
+        roaring_unreachable;
         break;
     }
 }
@@ -1715,7 +1718,7 @@ static inline container_t *container_lazy_xor(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;  // unreached
     }
 }
@@ -1799,7 +1802,7 @@ static inline container_t *container_ixor(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -1946,7 +1949,7 @@ static inline container_t *container_andnot(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;  // unreached
     }
 }
@@ -2032,7 +2035,7 @@ static inline container_t *container_iandnot(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -2060,10 +2063,10 @@ static inline bool container_iterate(
                                          base, iterator, ptr);
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return false;
 }
 
@@ -2086,10 +2089,10 @@ static inline bool container_iterate64(
                                            iterator, high_bits, ptr);
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return false;
 }
 
@@ -2119,10 +2122,10 @@ static inline container_t *container_not(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return NULL;
 }
 
@@ -2155,10 +2158,10 @@ static inline container_t *container_not_range(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return NULL;
 }
 
@@ -2190,10 +2193,10 @@ static inline container_t *container_inot(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return NULL;
 }
 
@@ -2226,10 +2229,10 @@ static inline container_t *container_inot_range(
 
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return NULL;
 }
 
@@ -2259,10 +2262,10 @@ static inline bool container_select(
                                         start_rank, rank, element);
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return false;
 }
 
@@ -2279,10 +2282,10 @@ static inline uint16_t container_maximum(
             return run_container_maximum(const_CAST_run(c));
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return false;
 }
 
@@ -2299,10 +2302,10 @@ static inline uint16_t container_minimum(
             return run_container_minimum(const_CAST_run(c));
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return false;
 }
 
@@ -2321,10 +2324,30 @@ static inline int container_rank(
             return run_container_rank(const_CAST_run(c), x);
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
+    }
+    assert(false);
+    roaring_unreachable;
+    return false;
+}
+
+// Returns the index of x, or -1 if x does not exist
+static inline int container_get_index(const container_t *c, uint8_t type,
+                                    uint16_t x) {
+    c = container_unwrap_shared(c, &type);
+    switch (type) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_get_index(const_CAST_bitset(c), x);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_get_index(const_CAST_array(c), x);
+        case RUN_CONTAINER_TYPE:
+            return run_container_get_index(const_CAST_run(c), x);
+        default:
+            assert(false);
+            roaring_unreachable;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
     return false;
 }
 
@@ -2402,7 +2425,7 @@ static inline container_t *container_add_range(
             }
         }
         default:
-            __builtin_unreachable();
+            roaring_unreachable;
     }
 }
 
@@ -2430,7 +2453,7 @@ static inline container_t *container_remove_range(
 
             if (result_cardinality == 0) {
                 return NULL;
-            } else if (result_cardinality < DEFAULT_MAX_SIZE) {
+            } else if (result_cardinality <= DEFAULT_MAX_SIZE) {
                 *result_type = ARRAY_CONTAINER_TYPE;
                 bitset_reset_range(bitset->words, min, max+1);
                 bitset->cardinality = result_cardinality;
@@ -2469,18 +2492,10 @@ static inline container_t *container_remove_range(
             }
 
             run_container_remove_range(run, min, max);
-
-            if (run_container_serialized_size_in_bytes(run->n_runs) <=
-                    bitset_container_serialized_size_in_bytes()) {
-                *result_type = RUN_CONTAINER_TYPE;
-                return run;
-            } else {
-                *result_type = BITSET_CONTAINER_TYPE;
-                return bitset_container_from_run(run);
-            }
+            return convert_run_to_efficient_container(run, result_type);
         }
         default:
-            __builtin_unreachable();
+            roaring_unreachable;
      }
 }
 
diff --git a/include/roaring/containers/run.h b/include/roaring/containers/run.h
index 793fc01d8..f24a579a3 100644
--- a/include/roaring/containers/run.h
+++ b/include/roaring/containers/run.h
@@ -305,11 +305,6 @@ static inline bool run_container_empty(
 /* Copy one container into another. We assume that they are distinct. */
 void run_container_copy(const run_container_t *src, run_container_t *dst);
 
-/* Set the cardinality to zero (does not release memory). */
-static inline void run_container_clear(run_container_t *run) {
-    run->n_runs = 0;
-}
-
 /**
  * Append run described by vl to the run container, possibly merging.
  * It is assumed that the run would be inserted at the end of the container, no
@@ -440,6 +435,8 @@ void run_container_printf(const run_container_t *v);
 void run_container_printf_as_uint32_array(const run_container_t *v,
                                           uint32_t base);
 
+bool run_container_validate(const run_container_t *run, const char **reason);
+
 /**
  * Return the serialized size in bytes of a container having "num_runs" runs.
  */
@@ -486,6 +483,7 @@ static inline int32_t run_container_size_in_bytes(
 /**
  * Return true if the two containers have the same content.
  */
+ALLOW_UNALIGNED
 static inline bool run_container_equals(const run_container_t *container1,
                           const run_container_t *container2) {
     if (container1->n_runs != container2->n_runs) {
@@ -563,6 +561,9 @@ inline uint16_t run_container_maximum(const run_container_t *run) {
 /* Returns the number of values equal or smaller than x */
 int run_container_rank(const run_container_t *arr, uint16_t x);
 
+/* Returns the index of x, or -1 if x does not exist */
+int run_container_get_index(const run_container_t *arr, uint16_t x);
+
 /* Returns the index of the first run containing a value at least as large as x, or -1 */
 inline int run_container_index_equalorlarger(const run_container_t *arr, uint16_t x) {
     int32_t index = interleavedBinarySearch(arr->runs, arr->n_runs, x);
@@ -610,14 +611,15 @@ static inline void run_container_add_range_nruns(run_container_t* run,
 }
 
 /**
- * Add all values in range [min, max]
+ * Add all values in range [min, max]. This function is currently unused
+ * and left as documentation.
  */
-static inline void run_container_add_range(run_container_t* run,
+/*static inline void run_container_add_range(run_container_t* run,
                                            uint32_t min, uint32_t max) {
     int32_t nruns_greater = rle16_count_greater(run->runs, run->n_runs, max);
     int32_t nruns_less = rle16_count_less(run->runs, run->n_runs - nruns_greater, min);
     run_container_add_range_nruns(run, min, max, nruns_less, nruns_greater);
-}
+}*/
 
 /**
  * Shifts last $count elements either left (distance < 0) or right (distance > 0)
diff --git a/include/roaring/isadetection.h b/include/roaring/isadetection.h
index cfea20070..446b32dae 100644
--- a/include/roaring/isadetection.h
+++ b/include/roaring/isadetection.h
@@ -1,227 +1,42 @@
-/* From
-https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
-Highly modified.
-
-Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
-Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
-Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
-Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
-Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
-Copyright (c) 2011-2013 NYU                      (Clement Farabet)
-Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
-Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
-(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
-Samy Bengio, Johnny Mariethoz)
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the distribution.
-
-3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
-America and IDIAP Research Institute nor the names of its contributors may be
-   used to endorse or promote products derived from this software without
-   specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-*/
-
 #ifndef ROARING_ISADETECTION_H
 #define ROARING_ISADETECTION_H
+#if defined(__x86_64__) || defined(_M_AMD64) // x64
 
-#include <stdint.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#if CROARING_REGULAR_VISUAL_STUDIO
-#include <intrin.h>
-#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
-#include <cpuid.h>
-#endif // CROARING_REGULAR_VISUAL_STUDIO
-
-
-enum croaring_instruction_set {
-  CROARING_DEFAULT = 0x0,
-  CROARING_NEON = 0x1,
-  CROARING_AVX2 = 0x4,
-  CROARING_SSE42 = 0x8,
-  CROARING_PCLMULQDQ = 0x10,
-  CROARING_BMI1 = 0x20,
-  CROARING_BMI2 = 0x40,
-  CROARING_ALTIVEC = 0x80,
-  CROARING_UNINITIALIZED = 0x8000
-};
-
-#if defined(__PPC64__)
-
-static inline uint32_t dynamic_croaring_detect_supported_architectures() {
-  return CROARING_ALTIVEC;
-}
-
-#elif defined(__arm__) || defined(__aarch64__) // incl. armel, armhf, arm64
-
-#if defined(__ARM_NEON)
-
-static inline uint32_t dynamic_croaring_detect_supported_architectures() {
-  return CROARING_NEON;
-}
-
-#else // ARM without NEON
 
-static inline uint32_t dynamic_croaring_detect_supported_architectures() {
-  return CROARING_DEFAULT;
-}
 
-#endif
 
-#elif defined(__x86_64__) || defined(_M_AMD64) // x64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#ifdef __has_include
+// We want to make sure that the AVX-512 functions are only built on compilers
+// fully supporting AVX-512.
+#if __has_include(<avx512vbmi2intrin.h>)
+#define CROARING_COMPILER_SUPPORTS_AVX512 1
+#endif // #if __has_include(<avx512vbmi2intrin.h>)
+#endif // #ifdef __has_include
 
+// Visual Studio 2019 and up support AVX-512
+#ifdef _MSC_VER
+#if _MSC_VER >= 1920
+#define CROARING_COMPILER_SUPPORTS_AVX512 1
+#endif // #if _MSC_VER >= 1920
+#endif // #ifdef _MSC_VER
 
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#define CROARING_COMPILER_SUPPORTS_AVX512 0
+#endif // #ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#endif // #ifndef CROARING_COMPILER_SUPPORTS_AVX512
 
 
-static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
-                         uint32_t *edx) {
-
-#if CROARING_REGULAR_VISUAL_STUDIO
-  int cpu_info[4];
-  __cpuid(cpu_info, *eax);
-  *eax = cpu_info[0];
-  *ebx = cpu_info[1];
-  *ecx = cpu_info[2];
-  *edx = cpu_info[3];
-#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
-  uint32_t level = *eax;
-  __get_cpuid(level, eax, ebx, ecx, edx);
-#else
-  uint32_t a = *eax, b, c = *ecx, d;
-  __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
-  *eax = a;
-  *ebx = b;
-  *ecx = c;
-  *edx = d;
+#ifdef __cplusplus
+extern "C" { namespace roaring { namespace internal {
 #endif
-}
-
-static inline uint32_t dynamic_croaring_detect_supported_architectures() {
-  uint32_t eax, ebx, ecx, edx;
-  uint32_t host_isa = 0x0;
-  // Can be found on Intel ISA Reference for CPUID
-  static uint32_t cpuid_avx2_bit = 1 << 5;      ///< @private Bit 5 of EBX for EAX=0x7
-  static uint32_t cpuid_bmi1_bit = 1 << 3;      ///< @private bit 3 of EBX for EAX=0x7
-  static uint32_t cpuid_bmi2_bit = 1 << 8;      ///< @private bit 8 of EBX for EAX=0x7
-  static uint32_t cpuid_sse42_bit = 1 << 20;    ///< @private bit 20 of ECX for EAX=0x1
-  static uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit  1 of ECX for EAX=0x1
-  // ECX for EAX=0x7
-  eax = 0x7;
-  ecx = 0x0;
-  cpuid(&eax, &ebx, &ecx, &edx);
-  if (ebx & cpuid_avx2_bit) {
-    host_isa |= CROARING_AVX2;
-  }
-  if (ebx & cpuid_bmi1_bit) {
-    host_isa |= CROARING_BMI1;
-  }
-
-  if (ebx & cpuid_bmi2_bit) {
-    host_isa |= CROARING_BMI2;
-  }
-
-  // EBX for EAX=0x1
-  eax = 0x1;
-  cpuid(&eax, &ebx, &ecx, &edx);
-
-  if (ecx & cpuid_sse42_bit) {
-    host_isa |= CROARING_SSE42;
-  }
-
-  if (ecx & cpuid_pclmulqdq_bit) {
-    host_isa |= CROARING_PCLMULQDQ;
-  }
-
-  return host_isa;
-}
-#else // fallback
-
-
-static inline uint32_t dynamic_croaring_detect_supported_architectures() {
-  return CROARING_DEFAULT;
-}
-
-
-#endif // end SIMD extension detection code
-
-
-#if defined(__x86_64__) || defined(_M_AMD64) // x64
-
-#if defined(__cplusplus)
-static inline uint32_t croaring_detect_supported_architectures() {
-    // thread-safe as per the C++11 standard.
-    static uint32_t buffer = dynamic_croaring_detect_supported_architectures();
-    return buffer;
-}
-#elif CROARING_VISUAL_STUDIO
-// Visual Studio does not support C11 atomics.
-static inline uint32_t croaring_detect_supported_architectures() {
-    static int buffer = CROARING_UNINITIALIZED;
-    if (buffer == CROARING_UNINITIALIZED) {
-      buffer = dynamic_croaring_detect_supported_architectures();
-    }
-    return buffer;
-}
-#else // CROARING_VISUAL_STUDIO
-#include <stdatomic.h>
-static inline uint32_t croaring_detect_supported_architectures() {
-    // we use an atomic for thread safety
-    static _Atomic uint32_t buffer = CROARING_UNINITIALIZED;
-    if (buffer == CROARING_UNINITIALIZED) {
-      // atomicity is sufficient
-      buffer = dynamic_croaring_detect_supported_architectures();
-    }
-    return buffer;
-}
-#endif // CROARING_REGULAR_VISUAL_STUDIO
-
-#ifdef ROARING_DISABLE_AVX
-static inline bool croaring_avx2() {
-  return false;
-}
-#elif defined(__AVX2__)
-static inline bool croaring_avx2() {
-  return true;
-}
-#else
-static inline bool croaring_avx2() {
-  return  (croaring_detect_supported_architectures() & CROARING_AVX2) == CROARING_AVX2;
-}
+enum {
+  ROARING_SUPPORTS_AVX2 = 1,
+  ROARING_SUPPORTS_AVX512 = 2,
+};
+int croaring_hardware_support(void);
+#ifdef __cplusplus
+} } }  // extern "C" { namespace roaring { namespace internal {
 #endif
-
-
-#else // defined(__x86_64__) || defined(_M_AMD64) // x64
-
-static inline bool croaring_avx2() {
-  return false;
-}
-
-static inline uint32_t croaring_detect_supported_architectures() {
-    // no runtime dispatch
-    return dynamic_croaring_detect_supported_architectures();
-}
-#endif // defined(__x86_64__) || defined(_M_AMD64) // x64
-
+#endif // x64
 #endif // ROARING_ISADETECTION_H
diff --git a/include/roaring/misc/configreport.h b/include/roaring/misc/configreport.h
index 7e3c3c1d6..1ff937722 100644
--- a/include/roaring/misc/configreport.h
+++ b/include/roaring/misc/configreport.h
@@ -1,8 +1,9 @@
 /*
  * configreport.h
- *
+ * If this gets compiled into a different execution unit than the CRoaring library,
+ * the expressions `croaring_hardware_support() & ROARING_SUPPORTS_AVX512` and
+ * `croaring_hardware_support() & ROARING_SUPPORTS_AVX2` *may* trigger an additional call to dynamic_croaring_detect_supported_architectures().
  */
-
 #ifndef INCLUDE_MISC_CONFIGREPORT_H_
 #define INCLUDE_MISC_CONFIGREPORT_H_
 
@@ -11,12 +12,11 @@
 #include <stdio.h>
 
 #include <roaring/portability.h>
-
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace misc {
 #endif
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 // useful for basic info (0)
 static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
                                 unsigned int *ecx, unsigned int *edx) {
@@ -110,44 +110,73 @@ static inline const char *guessprocessor() {
         case 0x016C:
             codename = "Pineview";
             break;
+        case 0x706e:
+        case 0x606a:
+            codename = "Icelake";
+            break;
+        case 0x706a:
+        case 0x506c:
+            codename = "Goldmont";
+            break;
+       case 0x806c:
+       case 0x806d:
+            codename = "TigerLake";
+            break;
+        case 0x806e:
+        case 0x906e:
+            codename = "Kabylake";
+            break;
+        case 0xa065:
+        case 0xa066:
+            codename = "Cometlake";
+            break;
+        case 0xa067:
+            codename = "Rocketlake";
+            break;
+        case 0x9067:
+        case 0x906a:
+            codename = "Alderlake";
+            break;
+        case 0xb067:
+            codename = "Raptorlake";
+            break;
+        case 0x30f1:
+        case 0x60f0:
+        case 0x70f1:
+        case 0x60f8:
+        case 0x90f0:
+            codename = "Zen2";
+            break;
+        case 0x20f10:
+        case 0x50f00:
+            codename = "Zen3";
+            break;
+        case 0x40f40:
+            codename = "Zen3+";
+            break;
+        case 0x60f10:
+            codename = "Zen4";
+            break;
         default:
-            codename = "UNKNOWN";
+            codename = "unknown";
             break;
     }
     return codename;
 }
 
 static inline void tellmeall() {
+#if CROARING_IS_BIG_ENDIAN  // report byte order first; mirrors the non-x64 tellmeall()
+    printf("big-endian system detected\n");
+#endif
     printf("x64 processor:  %s\t", guessprocessor());
-
 #ifdef __VERSION__
     printf(" compiler version: %s\t", __VERSION__);
 #endif
-    uint32_t config =  croaring_detect_supported_architectures();
-    if((config & CROARING_NEON) == CROARING_NEON) {
-        printf(" NEON detected\t");
-    }
+
  #ifdef __AVX2__
     printf(" Building for AVX2\t");
  #endif
-    if(croaring_avx2()) {
-        printf( "AVX2 usable\t");
-    }
-    if((config & CROARING_AVX2) == CROARING_AVX2) {
-        printf( "AVX2 detected\t");
-       if(!croaring_avx2()) {
-         printf( "AVX2 not used\t");
-       }
-     }
-    if((config & CROARING_SSE42) == CROARING_SSE42) {
-        printf(" SSE4.2 detected\t");
-    }
-    if((config & CROARING_BMI1) == CROARING_BMI1) {
-        printf(" BMI1 detected\t");
-    }
-    if((config & CROARING_BMI2) == CROARING_BMI2) {
-        printf(" BMI2 detected\t");
-    }
+
     printf("\n");
     if ((sizeof(int) != 4) || (sizeof(long) != 8)) {
         printf("number of bytes: int = %lu long = %lu \n",
@@ -170,6 +199,9 @@ static inline void tellmeall() {
 #else
 
 static inline void tellmeall() {
+#if CROARING_IS_BIG_ENDIAN
+    printf("big-endian system detected\n");
+#endif
     printf("Non-X64  processor\n");
 #ifdef __arm__
     printf("ARM processor detected\n");
@@ -177,14 +209,6 @@ static inline void tellmeall() {
 #ifdef __VERSION__
     printf(" compiler version: %s\t", __VERSION__);
 #endif
-    uint32_t config =  croaring_detect_supported_architectures();
-    if((config & CROARING_NEON) == CROARING_NEON) {
-        printf(" NEON detected\t");
-    }
-    if((config & CROARING_ALTIVEC) == CROARING_ALTIVEC) {
-        printf("Altivec detected\n");
-    }
-
     if ((sizeof(int) != 4) || (sizeof(long) != 8)) {
         printf("number of bytes: int = %lu long = %lu \n",
                (long unsigned int)sizeof(size_t),
diff --git a/include/roaring/portability.h b/include/roaring/portability.h
index a72dcf6bc..fba2a4b25 100644
--- a/include/roaring/portability.h
+++ b/include/roaring/portability.h
@@ -38,19 +38,35 @@
 #define CROARING_REGULAR_VISUAL_STUDIO 1
 #endif // __clang__
 #endif // _MSC_VER
+#ifndef CROARING_VISUAL_STUDIO
+#define CROARING_VISUAL_STUDIO 0
+#endif
+#ifndef CROARING_CLANG_VISUAL_STUDIO
+#define CROARING_CLANG_VISUAL_STUDIO 0
+#endif
+#ifndef CROARING_REGULAR_VISUAL_STUDIO
+#define CROARING_REGULAR_VISUAL_STUDIO 0
+#endif
+
+#if defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE < 200809L)
+#undef _POSIX_C_SOURCE
+#endif
 
-#if !(defined(_POSIX_C_SOURCE)) || (_POSIX_C_SOURCE < 200809L)
+#ifndef _POSIX_C_SOURCE
 #define _POSIX_C_SOURCE 200809L
 #endif // !(defined(_POSIX_C_SOURCE)) || (_POSIX_C_SOURCE < 200809L)
 #if !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700)
 #define _XOPEN_SOURCE 700
 #endif // !(defined(_XOPEN_SOURCE)) || (_XOPEN_SOURCE < 700)
 
-#include "isadetection.h"
+#ifdef __illumos__
+#define __EXTENSIONS__
+#endif
+
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>  // will provide posix_memalign with _POSIX_C_SOURCE as defined above
-#if !(defined(__APPLE__)) && !(defined(__FreeBSD__))
+#ifdef __GLIBC__
 #include <malloc.h>  // this should never be needed but there are some reports that it is needed.
 #endif
 
@@ -58,17 +74,14 @@
 extern "C" {  // portability definitions are in global scope, not a namespace
 #endif
 
-#if CROARING_REGULAR_VISUAL_STUDIO && !defined(_WIN64) && !defined(CROARING_ACK_32BIT)
-#pragma message( \
-    "You appear to be attempting a 32-bit build under Visual Studio. We recommend a 64-bit build instead.")
-#endif
-
 #if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8
 #error This code assumes  64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported.
 #endif
 
 #if CROARING_REGULAR_VISUAL_STUDIO
+#ifndef __restrict__
 #define __restrict__ __restrict
+#endif // __restrict__
 #endif // CROARING_REGULAR_VISUAL_STUDIO
 
 
@@ -88,7 +101,7 @@ extern "C" {  // portability definitions are in global scope, not a namespace
 #undef CROARING_IS_X64
 #endif
 
-#ifdef CROARING_DISABLE_X64
+#ifdef ROARING_DISABLE_X64
 #undef CROARING_IS_X64
 #endif
 // we include the intrinsic header
@@ -98,7 +111,7 @@ extern "C" {  // portability definitions are in global scope, not a namespace
 
 
 
-#ifdef CROARING_CLANG_VISUAL_STUDIO
+#if CROARING_CLANG_VISUAL_STUDIO
 
 /**
  * You are not supposed, normally, to include these
@@ -123,6 +136,17 @@ extern "C" {  // portability definitions are in global scope, not a namespace
 #include <avxintrin.h>
 #include <avx2intrin.h>
 #include <wmmintrin.h>
+#if _MSC_VER >= 1920
+// Important: we need the AVX-512 headers:
+#include <avx512fintrin.h>
+#include <avx512dqintrin.h>
+#include <avx512cdintrin.h>
+#include <avx512bwintrin.h>
+#include <avx512vlintrin.h>
+#include <avx512vbmiintrin.h>
+#include <avx512vbmi2intrin.h>
+#include <avx512vpopcntdqintrin.h>
+#endif // _MSC_VER >= 1920
 // unfortunately, we may not get _blsr_u64, but, thankfully, clang
 // has it as a macro.
 #ifndef _blsr_u64
@@ -135,10 +159,10 @@ extern "C" {  // portability definitions are in global scope, not a namespace
 #endif // CROARING_REGULAR_VISUAL_STUDIO
 #endif // defined(__x86_64__) || defined(_M_X64)
 
-#if !defined(USENEON) && !defined(DISABLENEON) && defined(__ARM_NEON)
-#  define USENEON
+#if !defined(CROARING_USENEON) && !defined(DISABLENEON) && defined(__ARM_NEON)
+#  define CROARING_USENEON
 #endif
-#if defined(USENEON)
+#if defined(CROARING_USENEON)
 #  include <arm_neon.h>
 #endif
 
@@ -154,12 +178,13 @@ extern "C" {  // portability definitions are in global scope, not a namespace
 
 #ifndef __clang__  // if one compiles with MSVC *with* clang, then these
                    // intrinsics are defined!!!
+#define CROARING_INTRINSICS 1
 // sadly there is no way to check whether we are missing these intrinsics
 // specifically.
 
-/* wrappers for Visual Studio built-ins that look like gcc built-ins */
+/* wrappers for Visual Studio built-ins that look like gcc built-ins __builtin_ctzll */
 /* result might be undefined when input_num is zero */
-inline int __builtin_ctzll(unsigned long long input_num) {
+inline int roaring_trailing_zeroes(unsigned long long input_num) {
     unsigned long index;
 #ifdef _WIN64  // highly recommended!!!
     _BitScanForward64(&index, input_num);
@@ -170,12 +195,13 @@ inline int __builtin_ctzll(unsigned long long input_num) {
         _BitScanForward(&index, (uint32_t)(input_num >> 32));
         index += 32;
     }
-#endif
+#endif // _WIN64
     return index;
 }
 
+/* wrappers for Visual Studio built-ins that look like gcc built-ins __builtin_clzll */
 /* result might be undefined when input_num is zero */
-inline int __builtin_clzll(unsigned long long input_num) {
+inline int roaring_leading_zeroes(unsigned long long input_num) {
     unsigned long index;
 #ifdef _WIN64  // highly recommended!!!
     _BitScanReverse64(&index, input_num);
@@ -186,28 +212,21 @@ inline int __builtin_clzll(unsigned long long input_num) {
     } else {
         _BitScanReverse(&index, (uint32_t)(input_num));
     }
-#endif
+#endif // _WIN64
     return 63 - index;
 }
 
-
-/* software implementation avoids POPCNT */
-/*static inline int __builtin_popcountll(unsigned long long input_num) {
-  const uint64_t m1 = 0x5555555555555555; //binary: 0101...
-  const uint64_t m2 = 0x3333333333333333; //binary: 00110011..
-  const uint64_t m4 = 0x0f0f0f0f0f0f0f0f; //binary:  4 zeros,  4 ones ...
-  const uint64_t h01 = 0x0101010101010101; //the sum of 256 to the power of 0,1,2,3...
-
-  input_num -= (input_num >> 1) & m1;
-  input_num = (input_num & m2) + ((input_num >> 2) & m2);
-  input_num = (input_num + (input_num >> 4)) & m4;
-  return (input_num * h01) >> 56;
-}*/
-
 /* Use #define so this is effective even under /Ob0 (no inline) */
-#define __builtin_unreachable() __assume(0)
-#endif
+#define roaring_unreachable __assume(0)
+#endif // __clang__
 
+#endif // CROARING_REGULAR_VISUAL_STUDIO
+
+#ifndef CROARING_INTRINSICS
+#define CROARING_INTRINSICS 1
+#define roaring_unreachable __builtin_unreachable()
+static inline int roaring_trailing_zeroes(unsigned long long input_num) { return __builtin_ctzll(input_num); }
+static inline int roaring_leading_zeroes(unsigned long long input_num) { return __builtin_clzll(input_num); }
 #endif
 
 #if CROARING_REGULAR_VISUAL_STUDIO
@@ -227,7 +246,11 @@ inline int __builtin_clzll(unsigned long long input_num) {
 
 #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
 
-static inline int hammingbackup(uint64_t x) {
+#ifdef CROARING_USENEON
+// we can always compute the popcount fast.
+#elif (defined(_M_ARM) || defined(_M_ARM64)) && ((defined(_WIN64) || defined(_WIN32)) && defined(CROARING_REGULAR_VISUAL_STUDIO) && CROARING_REGULAR_VISUAL_STUDIO)
+// we will need this function:
+static inline int roaring_hamming_backup(uint64_t x) {
   uint64_t c1 = UINT64_C(0x5555555555555555);
   uint64_t c2 = UINT64_C(0x3333333333333333);
   uint64_t c4 = UINT64_C(0x0F0F0F0F0F0F0F0F);
@@ -236,18 +259,22 @@ static inline int hammingbackup(uint64_t x) {
   x *= UINT64_C(0x0101010101010101);
   return x >> 56;
 }
+#endif
+
 
-static inline int hamming(uint64_t x) {
+static inline int roaring_hamming(uint64_t x) {
 #if defined(_WIN64) && defined(CROARING_REGULAR_VISUAL_STUDIO) && CROARING_REGULAR_VISUAL_STUDIO
-#ifdef _M_ARM64
-  return hammingbackup(x);
+#ifdef CROARING_USENEON
+   return vaddv_u8(vcnt_u8(vcreate_u8(x))); // NEON: per-byte popcount of x, summed across the 8 lanes
+#elif defined(_M_ARM64)
+  return roaring_hamming_backup(x);
   // (int) _CountOneBits64(x); is unavailable
 #else  // _M_ARM64
   return (int) __popcnt64(x);
 #endif // _M_ARM64
 #elif defined(_WIN32) && defined(CROARING_REGULAR_VISUAL_STUDIO) && CROARING_REGULAR_VISUAL_STUDIO
 #ifdef _M_ARM
-  return hammingbackup(x);
+  return roaring_hamming_backup(x);
   // _CountOneBits is unavailable
 #else // _M_ARM
     return (int) __popcnt(( unsigned int)x) + (int)  __popcnt(( unsigned int)(x>>32));
@@ -291,7 +318,7 @@ static inline int hamming(uint64_t x) {
 //
 
 // We are going to use runtime dispatch.
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 #ifdef __clang__
 // clang does not have GCC push pop
 // warning: clang attribute push can't be used within a namespace in clang up
@@ -316,15 +343,194 @@ static inline int hamming(uint64_t x) {
 #define CROARING_UNTARGET_REGION
 #endif
 
-#define CROARING_TARGET_AVX2 CROARING_TARGET_REGION("avx2,bmi,pclmul,lzcnt")
+
+#define CROARING_TARGET_AVX2 CROARING_TARGET_REGION("avx2,bmi,pclmul,lzcnt,popcnt")
+#define CROARING_TARGET_AVX512 CROARING_TARGET_REGION("avx2,bmi,bmi2,pclmul,lzcnt,popcnt,avx512f,avx512dq,avx512bw,avx512vbmi2,avx512bitalg,avx512vpopcntdq")
+#define CROARING_UNTARGET_AVX2 CROARING_UNTARGET_REGION
+#define CROARING_UNTARGET_AVX512 CROARING_UNTARGET_REGION
 
 #ifdef __AVX2__
 // No need for runtime dispatching.
 // It is unnecessary and harmful to old clang to tag regions.
 #undef CROARING_TARGET_AVX2
 #define CROARING_TARGET_AVX2
-#undef CROARING_UNTARGET_REGION
-#define CROARING_UNTARGET_REGION
+#undef CROARING_UNTARGET_AVX2
+#define CROARING_UNTARGET_AVX2
+#endif
+
+#if defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__)
+// No need for runtime dispatching.
+// It is unnecessary and harmful to old clang to tag regions.
+#undef CROARING_TARGET_AVX512
+#define CROARING_TARGET_AVX512
+#undef CROARING_UNTARGET_AVX512
+#define CROARING_UNTARGET_AVX512
+#endif
+
+// Allow unaligned memory access
+#if defined(__GNUC__) || defined(__clang__)
+#define ALLOW_UNALIGNED __attribute__((no_sanitize("alignment")))
+#else
+#define ALLOW_UNALIGNED
+#endif
+
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
+ #define CROARING_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#elif defined(_WIN32)
+ #define CROARING_IS_BIG_ENDIAN 0
+ #else
+ #if defined(__APPLE__) || defined(__FreeBSD__) // defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__
+ #include <machine/endian.h>
+ #elif defined(sun) || defined(__sun) // defined(__APPLE__) || defined(__FreeBSD__)
+ #include <sys/byteorder.h>
+ #else  // defined(__APPLE__) || defined(__FreeBSD__)
+
+ #ifdef __has_include
+ #if __has_include(<endian.h>)
+ #include <endian.h>
+ #endif //__has_include(<endian.h>)
+ #endif //__has_include
+
+ #endif // defined(__APPLE__) || defined(__FreeBSD__)
+
+
+ #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__)
+ // Byte order macros are unavailable even after the includes above: assume little-endian.
+ #define CROARING_IS_BIG_ENDIAN 0
+ #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ #define CROARING_IS_BIG_ENDIAN 0
+ #else // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ #define CROARING_IS_BIG_ENDIAN 1
+ #endif // __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+
 #endif
 
+// Defines for the possible CROARING atomic implementations
+#define CROARING_ATOMIC_IMPL_NONE          1
+#define CROARING_ATOMIC_IMPL_CPP           2
+#define CROARING_ATOMIC_IMPL_C             3
+#define CROARING_ATOMIC_IMPL_C_WINDOWS     4
+
+// If the user has forced a specific implementation, use that; otherwise,
+// figure out the best implementation we can use.
+#if !defined(CROARING_ATOMIC_IMPL)
+  #if defined(__cplusplus) && __cplusplus >= 201103L
+    #ifdef __has_include
+      #if __has_include(<atomic>)
+        #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP
+      #endif //__has_include(<atomic>)
+    #else
+      // We lack __has_include to check:
+      #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_CPP
+    #endif //__has_include
+  #elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_ATOMICS__)
+    #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C
+  #elif CROARING_REGULAR_VISUAL_STUDIO
+    // https://www.technetworkhub.com/c11-atomics-in-visual-studio-2022-version-17/
+    #define CROARING_ATOMIC_IMPL CROARING_ATOMIC_IMPL_C_WINDOWS
+  #endif
+#endif // !defined(CROARING_ATOMIC_IMPL)
+
+#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C
+#include <stdatomic.h>
+typedef _Atomic(uint32_t) croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    // Increasing the reference counter can always be done with
+    // memory_order_relaxed: New references to an object can only be formed from
+    // an existing reference, and passing an existing reference from one thread to
+    // another must already provide any required synchronization.
+    atomic_fetch_add_explicit(val, 1, memory_order_relaxed);
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    // It is important to enforce any possible access to the object in one thread
+    // (through an existing reference) to happen before deleting the object in a
+    // different thread. This is achieved by a "release" operation after dropping
+    // a reference (any access to the object through this reference must obviously
+    // have happened before), and an "acquire" operation before deleting the object.
+    bool is_zero = atomic_fetch_sub_explicit(val, 1, memory_order_release) == 1;
+    if (is_zero) {
+        atomic_thread_fence(memory_order_acquire);
+    }
+    return is_zero;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    return atomic_load_explicit(val, memory_order_relaxed);
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_CPP
+#include <atomic>
+typedef std::atomic<uint32_t> croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    val->fetch_add(1, std::memory_order_relaxed);
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    // See above comments on the c11 atomic implementation for memory ordering
+    bool is_zero = val->fetch_sub(1, std::memory_order_release) == 1;
+    if (is_zero) {
+        std::atomic_thread_fence(std::memory_order_acquire);
+    }
+    return is_zero;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    return val->load(std::memory_order_relaxed);
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C_WINDOWS
+#include <intrin.h>
+#pragma intrinsic(_InterlockedIncrement)
+#pragma intrinsic(_InterlockedDecrement)
+
+// _InterlockedIncrement and _InterlockedDecrement take a (signed) long, and
+// overflow is defined to wrap, so we can pretend it is a uint32_t for our case
+typedef volatile long croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    _InterlockedIncrement(val);
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    return _InterlockedDecrement(val) == 0;
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    // Per https://learn.microsoft.com/en-us/windows/win32/sync/interlocked-variable-access
+    // > Simple reads and writes to properly-aligned 32-bit variables are atomic
+    // > operations. In other words, you will not end up with only one portion
+    // > of the variable updated; all bits are updated in an atomic fashion.
+    return *val;
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_NONE
+#include <assert.h>
+typedef uint32_t croaring_refcount_t;
+
+static inline void croaring_refcount_inc(croaring_refcount_t *val) {
+    *val += 1;
+}
+
+static inline bool croaring_refcount_dec(croaring_refcount_t *val) {
+    assert(*val > 0);  // decrementing a zero count would wrap around
+    *val -= 1;
+    return *val == 0;  // test the counter itself, not the pointer: true once the last reference is gone
+}
+
+static inline uint32_t croaring_refcount_get(const croaring_refcount_t *val) {
+    return *val;
+}
+#else
+#error "Unknown atomic implementation"
+#endif
+
+
+// We need portability.h to be included first,
+// but we also always want isadetection.h to be
+// included (right after).
+// See https://github.com/RoaringBitmap/CRoaring/issues/394
+// There is no scenario where we want portability.h to
+// be included, but not isadetection.h: the latter is a
+// strict requirement.
+#include <roaring/isadetection.h> // include it last!
 #endif /* INCLUDE_PORTABILITY_H_ */
diff --git a/include/roaring/roaring.h b/include/roaring/roaring.h
index e82d05b1b..b2476e7db 100644
--- a/include/roaring/roaring.h
+++ b/include/roaring/roaring.h
@@ -12,6 +12,7 @@
 #include <roaring/memory.h>
 #include <roaring/roaring_types.h>
 #include <roaring/roaring_version.h>
+#include <roaring/bitset/bitset.h>
 
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace api {
@@ -34,7 +35,7 @@ roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap);
  * Returns NULL if the allocation fails.
  * Client is responsible for calling `roaring_bitmap_free()`.
  */
-static inline roaring_bitmap_t *roaring_bitmap_create(void)
+inline roaring_bitmap_t *roaring_bitmap_create(void)
   { return roaring_bitmap_create_with_capacity(0); }
 
 /**
@@ -49,7 +50,7 @@ bool roaring_bitmap_init_with_capacity(roaring_bitmap_t *r, uint32_t cap);
  * The bitmap will be in a "clear" state, with no auxiliary allocations.
  * Since this performs no allocations, the function will not fail.
  */
-static inline void roaring_bitmap_init_cleared(roaring_bitmap_t *r)
+inline void roaring_bitmap_init_cleared(roaring_bitmap_t *r)
   { roaring_bitmap_init_with_capacity(r, 0); }
 
 /**
@@ -73,11 +74,10 @@ roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals);
  * do so for all of your bitmaps, since interactions between bitmaps with and
  * without COW is unsafe.
  */
-static inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t* r) {
+inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t* r) {
     return r->high_low_container.flags & ROARING_FLAG_COW;
 }
-static inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t* r,
-                                                    bool cow) {
+inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t* r, bool cow) {
     if (cow) {
         r->high_low_container.flags |= ROARING_FLAG_COW;
     } else {
@@ -110,6 +110,9 @@ roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r);
  *
  * It might be preferable and simpler to call roaring_bitmap_copy except
  * that roaring_bitmap_overwrite can save on memory allocations.
+ *
+ * Returns true if successful, or false if there was an error. On failure,
+ * the dest bitmap is left in a valid, empty state (even if it was not empty before).
  */
 bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
                               const roaring_bitmap_t *src);
@@ -122,6 +125,11 @@ void roaring_bitmap_printf(const roaring_bitmap_t *r);
 /**
  * Computes the intersection between two bitmaps and returns new bitmap. The
  * caller is responsible for memory management.
+ *
+ * Performance hint: if you are computing the intersection between several
+ * bitmaps, two-by-two, it is best to start with the smallest bitmap.
+ * You may also rely on roaring_bitmap_and_inplace to avoid creating
+ * many temporary bitmaps.
  */
 roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *r1,
                                      const roaring_bitmap_t *r2);
@@ -173,7 +181,10 @@ uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *r1,
 
 /**
  * Inplace version of `roaring_bitmap_and()`, modifies r1
- * r1 == r2 is allowed
+ * r1 == r2 is allowed.
+ *
+ * Performance hint: if you are computing the intersection between several
+ * bitmaps, two-by-two, it is best to start with the smallest bitmap.
  */
 void roaring_bitmap_and_inplace(roaring_bitmap_t *r1,
                                 const roaring_bitmap_t *r2);
@@ -257,9 +268,48 @@ void roaring_bitmap_andnot_inplace(roaring_bitmap_t *r1,
  */
 void roaring_bitmap_free(const roaring_bitmap_t *r);
 
+/**
+ * A bit of context usable with `roaring_bitmap_*_bulk()` functions
+ *
+ * Should be initialized with `{0}` (or `memset()` to all zeros).
+ * Callers should treat it as an opaque type.
+ *
+ * A context may only be used with a single bitmap
+ * (unless re-initialized to zero), and any modification to a bitmap
+ * (other than modifications performed with `_bulk()` functions with the context
+ * passed) will invalidate any contexts associated with that bitmap.
+ */
+typedef struct roaring_bulk_context_s {
+    ROARING_CONTAINER_T *container;
+    int idx;
+    uint16_t key;
+    uint8_t typecode;
+} roaring_bulk_context_t;
+
+/**
+ * Add an item, using context from a previous insert for speed optimization.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than with `_bulk` suffixed functions)
+ * will invalidate the stored context; calling this function with a non-zero
+ * context after any such modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same "key" (high 16 bits of the value) consecutively.
+ */
+void roaring_bitmap_add_bulk(roaring_bitmap_t *r,
+                             roaring_bulk_context_t *context, uint32_t val);
+
 /**
  * Add value n_args from pointer vals, faster than repeatedly calling
  * `roaring_bitmap_add()`
+ *
+ * In order to exploit this optimization, the caller should attempt to keep
+ * values with the same "key" (high 16 bits of the value) as consecutive
+ * elements in `vals`
  */
 void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
                              const uint32_t *vals);
@@ -284,9 +334,9 @@ void roaring_bitmap_add_range_closed(roaring_bitmap_t *r,
 /**
  * Add all values in range [min, max)
  */
-static inline void roaring_bitmap_add_range(roaring_bitmap_t *r,
-                                            uint64_t min, uint64_t max) {
-    if(max == min) return;
+inline void roaring_bitmap_add_range(roaring_bitmap_t *r, uint64_t min, uint64_t max) {
+    if(max <= min || (min >> 32) != 0) return;  // empty range, or no 32-bit value is >= min
+    if((max >> 32) != 0) max = (uint64_t)1 << 32;  // clamp so (uint32_t)(max - 1) cannot wrap
     roaring_bitmap_add_range_closed(r, (uint32_t)min, (uint32_t)(max - 1));
 }
 
@@ -304,9 +354,9 @@ void roaring_bitmap_remove_range_closed(roaring_bitmap_t *r,
 /**
  * Remove all values in range [min, max)
  */
-static inline void roaring_bitmap_remove_range(roaring_bitmap_t *r,
-                                               uint64_t min, uint64_t max) {
-    if(max == min) return;
+inline void roaring_bitmap_remove_range(roaring_bitmap_t *r, uint64_t min, uint64_t max) {
+    if(max <= min || (min >> 32) != 0) return;  // empty range, or no 32-bit value is >= min
+    if((max >> 32) != 0) max = (uint64_t)1 << 32;  // clamp so (uint32_t)(max - 1) cannot wrap
     roaring_bitmap_remove_range_closed(r, (uint32_t)min, (uint32_t)(max - 1));
 }
 
@@ -335,6 +385,25 @@ bool roaring_bitmap_contains_range(const roaring_bitmap_t *r,
                                    uint64_t range_start,
                                    uint64_t range_end);
 
+/**
+ * Check if an item is present, using context from a previous insert or search
+ * for speed optimization.
+ *
+ * `context` will be used to store information between calls to make bulk
+ * operations faster. `*context` should be zero-initialized before the first
+ * call to this function.
+ *
+ * Modifying the bitmap in any way (other than with `_bulk` suffixed functions)
+ * will invalidate the stored context; calling this function with a non-zero
+ * context after any such modification invokes undefined behavior.
+ *
+ * In order to exploit this optimization, the caller should call this function
+ * with values with the same "key" (high 16 bits of the value) consecutively.
+ */
+bool roaring_bitmap_contains_bulk(const roaring_bitmap_t *r,
+                                  roaring_bulk_context_t *context,
+                                  uint32_t val);
+
 /**
  * Get the cardinality of the bitmap (number of elements).
  */
@@ -369,6 +438,23 @@ void roaring_bitmap_clear(roaring_bitmap_t *r);
  */
 void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *r, uint32_t *ans);
 
+/**
+ * Store the bitmap to a bitset. This can be useful for people
+ * who need the performance and simplicity of a standard bitset.
+ * We assume that the input bitset is originally empty (does not
+ * have any set bit).
+ *
+ *   bitset_t * out = bitset_create();
+ *   // if the bitset has content in it, call "bitset_clear(out)"
+ *   bool success = roaring_bitmap_to_bitset(mybitmap, out); 
+ *   // on failure, success will be false.
+ *   // You can then query the bitset:
+ *   bool is_present = bitset_get(out,  10011 );
+ *   // you must free the memory:
+ *   bitset_free(out);
+ *
+ */
+bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t * bitset);
 
 /**
  * Convert the bitmap to a sorted array from `offset` by `limit`, output in `ans`.
@@ -413,6 +499,9 @@ size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r);
  * more space efficient than the portable form, e.g. when the data is sparse.
  *
  * Returns how many bytes written, should be `roaring_bitmap_size_in_bytes(r)`.
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf);
 
@@ -420,10 +509,27 @@ size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf);
  * Use with `roaring_bitmap_serialize()`.
  *
  * (See `roaring_bitmap_portable_deserialize()` if you want a format that's
- * compatible with Java and Go implementations)
+ * compatible with Java and Go implementations).
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf);
 
+/**
+ * Use with `roaring_bitmap_serialize()`.
+ *
+ * (See `roaring_bitmap_portable_deserialize_safe()` if you want a format that's
+ * compatible with Java and Go implementations).
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
+ * 
+ * The difference with `roaring_bitmap_deserialize()` is that this function checks that the input buffer
+ * is a valid bitmap.  If the buffer is too small, NULL is returned.
+ */
+roaring_bitmap_t *roaring_bitmap_deserialize_safe(const void *buf, size_t maxbytes);
+
 /**
  * How many bytes are required to serialize this bitmap (NOT compatible
  * with Java and Go versions)
@@ -440,6 +546,9 @@ size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *r);
  *
  * This is meant to be compatible with the Java and Go versions:
  * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf);
 
@@ -449,10 +558,43 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf);
  *
  * This is meant to be compatible with the Java and Go versions:
  * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * The function itself is safe in the sense that it will not cause buffer overflows.
+ * However, for correct operations, it is assumed that the bitmap read was once
+ * serialized from a valid bitmap (i.e., it follows the format specification).
+ * If you provided an incorrect input (garbage), then the bitmap read may not be in
+ * a valid state and following operations may not lead to sensible results.
+ * In particular, the serialized array containers need to be in sorted order, and the
+ * run containers should be in sorted non-overlapping order. This is guaranteed to
+ * happen when serializing an existing bitmap, but not for random inputs.
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf,
                                                            size_t maxbytes);
 
+/**
+ * Read bitmap from a serialized buffer.
+ * In case of failure, NULL is returned.
+ *
+ * Bitmap returned by this function can be used in all readonly contexts.
+ * Bitmap must be freed as usual, by calling roaring_bitmap_free().
+ * Underlying buffer must not be freed or modified while it backs any bitmaps.
+ *
+ * The function is unsafe in the following ways:
+ * 1) It may execute unaligned memory accesses.
+ * 2) A buffer overflow may occur if buf does not point to a valid serialized
+ *    bitmap.
+ *
+ * This is meant to be compatible with the Java and Go versions:
+ * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
+ */
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf);
+
 /**
  * Check how many bytes would be read (up to maxbytes) at this pointer if there
  * is a bitmap, returns zero if there is no valid bitmap.
@@ -480,6 +622,9 @@ size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r);
  *
  * This is meant to be compatible with the Java and Go versions:
  * https://github.com/RoaringBitmap/RoaringFormatSpec
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, char *buf);
 
@@ -510,6 +655,9 @@ size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r);
 /**
  * Serializes bitmap using frozen format.
  * Buffer size must be at least roaring_bitmap_frozen_size_in_bytes().
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf);
 
@@ -523,6 +671,9 @@ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf);
  * Bitmap returned by this function can be used in all readonly contexts.
  * Bitmap must be freed as usual, by calling roaring_bitmap_free().
  * Underlying buffer must not be freed or modified while it backs any bitmaps.
+ *
+ * This function is endian-sensitive. If you have a big-endian system (e.g., a mainframe IBM s390x),
+ * the data format is going to be big-endian and not compatible with little-endian systems.
  */
 const roaring_bitmap_t *roaring_bitmap_frozen_view(const char *buf,
                                                    size_t length);
@@ -664,6 +815,15 @@ bool roaring_bitmap_select(const roaring_bitmap_t *r, uint32_t rank,
  */
 uint64_t roaring_bitmap_rank(const roaring_bitmap_t *r, uint32_t x);
 
+/**
+ * Returns the index of x in the given roaring bitmap.
+ * If the roaring bitmap doesn't contain x, this function will return -1.
+ * The difference with the rank function is that this function will return -1
+ * when x is not an element of the roaring bitmap, whereas the rank function
+ * will still return a non-negative number.
+ */
+int64_t roaring_bitmap_get_index(const roaring_bitmap_t *r, uint32_t x);
+
 /**
  * Returns the smallest value in the set, or UINT32_MAX if the set is empty.
  */
@@ -683,6 +843,16 @@ uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *r);
 void roaring_bitmap_statistics(const roaring_bitmap_t *r,
                                roaring_statistics_t *stat);
 
+/**
+ * Perform internal consistency checks. Returns true if the bitmap is consistent.
+ *
+ * Note that some operations intentionally leave bitmaps in an inconsistent state temporarily,
+ * for example, `roaring_bitmap_lazy_*` functions, until `roaring_bitmap_repair_after_lazy` is called.
+ *
+ * If reason is non-null, it will be set to a string describing the first inconsistency found if any.
+ */
+bool roaring_bitmap_internal_validate(const roaring_bitmap_t *r, const char **reason);
+
 /*********************
 * What follows is code use to iterate through values in a roaring bitmap
 
@@ -814,4 +984,3 @@ uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it,
         using namespace ::roaring::api;
     #endif
 #endif
-
diff --git a/include/roaring/roaring_array.h b/include/roaring/roaring_array.h
index fd201662b..ac4941bcd 100644
--- a/include/roaring/roaring_array.h
+++ b/include/roaring/roaring_array.h
@@ -93,7 +93,9 @@ inline container_t *ra_get_container_at_index(
 /**
  * Retrieves the key at index i
  */
-uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i);
+inline uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) {
+    return ra->keys[i];
+}
 
 /**
  * Add a new key-value pair at index i
@@ -166,6 +168,8 @@ inline void ra_set_container_at_index(
     ra->typecodes[i] = typecode;
 }
 
+container_t *ra_get_container(roaring_array_t *ra, uint16_t x, uint8_t *typecode);
+
 /**
  * If needed, increase the capacity of the array so that it can fit k values
  * (at
diff --git a/include/roaring/roaring_version.h b/include/roaring/roaring_version.h
index 8b37799c6..fbc83b2ec 100644
--- a/include/roaring/roaring_version.h
+++ b/include/roaring/roaring_version.h
@@ -1,10 +1,10 @@
 // /include/roaring/roaring_version.h automatically generated by release.py, do not change by hand 
 #ifndef ROARING_INCLUDE_ROARING_VERSION 
 #define ROARING_INCLUDE_ROARING_VERSION 
-#define ROARING_VERSION "0.6.0"
+#define ROARING_VERSION "2.0.1"
 enum { 
-    ROARING_VERSION_MAJOR = 0,
-    ROARING_VERSION_MINOR = 6,
-    ROARING_VERSION_REVISION = 0
+    ROARING_VERSION_MAJOR = 2,
+    ROARING_VERSION_MINOR = 0,
+    ROARING_VERSION_REVISION = 1
 }; 
 #endif // ROARING_INCLUDE_ROARING_VERSION 
diff --git a/microbenchmarks/CMakeLists.txt b/microbenchmarks/CMakeLists.txt
new file mode 100644
index 000000000..628515bbc
--- /dev/null
+++ b/microbenchmarks/CMakeLists.txt
@@ -0,0 +1,16 @@
+
+set (BENCHMARK_DATA_DIR "${PROJECT_SOURCE_DIR}/benchmarks/realdata/")
+
+include(${PROJECT_SOURCE_DIR}/tools/cmake/Import.cmake)
+
+set_off(BENCHMARK_ENABLE_TESTING)
+set_off(BENCHMARK_ENABLE_INSTALL)
+set_off(BENCHMARK_ENABLE_WERROR)
+set(BENCHMARK_ENABLE_WERROR OFF)
+import_dependency(google_benchmarks google/benchmark 3441176)
+add_dependency(google_benchmarks)
+
+add_executable(bench bench.cpp)
+target_link_libraries(bench PRIVATE roaring)
+target_link_libraries(bench PRIVATE benchmark::benchmark)
+target_compile_definitions(bench PRIVATE BENCHMARK_DATA_DIR="${BENCHMARK_DATA_DIR}")
diff --git a/microbenchmarks/bench.cpp b/microbenchmarks/bench.cpp
new file mode 100644
index 000000000..3f013b8cd
--- /dev/null
+++ b/microbenchmarks/bench.cpp
@@ -0,0 +1,206 @@
+#include "bench.h"
+
+
+struct successive_intersection {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i + 1 < count; ++i) {
+            roaring_bitmap_t *tempand =
+                roaring_bitmap_and(bitmaps[i], bitmaps[i + 1]);
+            marker += roaring_bitmap_get_cardinality(tempand);
+            roaring_bitmap_free(tempand);
+        }
+        return marker;
+    }
+};
+auto SuccessiveIntersection = BasicBench<successive_intersection>;
+BENCHMARK(SuccessiveIntersection);
+
+
+struct successive_intersection_cardinality {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i + 1 < count; ++i) {
+            marker += roaring_bitmap_and_cardinality(bitmaps[i], bitmaps[i + 1]);
+        }
+        return marker;
+    }
+};
+auto SuccessiveIntersectionCardinality = BasicBench<successive_intersection_cardinality>;
+BENCHMARK(SuccessiveIntersectionCardinality);
+
+
+struct successive_union_cardinality {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i + 1 < count; ++i) {
+            marker += roaring_bitmap_or_cardinality(bitmaps[i], bitmaps[i + 1]);
+        }
+        return marker;
+    }
+};
+auto SuccessiveUnionCardinality = BasicBench<successive_union_cardinality>;
+BENCHMARK(SuccessiveUnionCardinality);
+
+struct successive_difference_cardinality {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i + 1 < count; ++i) {
+            marker += roaring_bitmap_andnot_cardinality(bitmaps[i], bitmaps[i + 1]);
+        }
+        return marker;
+    }
+};
+auto SuccessiveDifferenceCardinality = BasicBench<successive_difference_cardinality>;
+BENCHMARK(SuccessiveDifferenceCardinality);
+
+struct successive_union {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i + 1 < count; ++i) {
+            roaring_bitmap_t *tempand =
+                roaring_bitmap_or(bitmaps[i], bitmaps[i + 1]);
+            marker += roaring_bitmap_get_cardinality(tempand);
+            roaring_bitmap_free(tempand);
+        }
+        return marker;
+    }
+};
+auto SuccessiveUnion = BasicBench<successive_union>;
+BENCHMARK(SuccessiveUnion);
+
+struct many_union {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        roaring_bitmap_t *totalorbitmap =
+            roaring_bitmap_or_many(count, (const roaring_bitmap_t **)bitmaps);
+        marker = roaring_bitmap_get_cardinality(totalorbitmap);
+        roaring_bitmap_free(totalorbitmap);
+        return marker;
+    }
+};
+auto TotalUnion = BasicBench<many_union>;
+BENCHMARK(TotalUnion);
+
+struct many_union_heap {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        roaring_bitmap_t *totalorbitmap = roaring_bitmap_or_many_heap(
+            count, (const roaring_bitmap_t **)bitmaps);
+        marker = roaring_bitmap_get_cardinality(totalorbitmap);
+        roaring_bitmap_free(totalorbitmap);
+        return marker;
+    }
+};
+auto TotalUnionHeap = BasicBench<many_union_heap>;
+BENCHMARK(TotalUnionHeap);
+
+struct random_access {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i < count; ++i) {
+            marker += roaring_bitmap_contains(bitmaps[i], maxvalue / 4);
+            marker += roaring_bitmap_contains(bitmaps[i], maxvalue / 2);
+            marker += roaring_bitmap_contains(bitmaps[i], 3 * maxvalue / 4);
+        }
+        return marker;
+    }
+};
+auto RandomAccess = BasicBench<random_access>;
+BENCHMARK(RandomAccess);
+
+struct to_array {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i < count; ++i) {
+            roaring_bitmap_to_uint32_array(bitmaps[i], array_buffer);
+            marker += array_buffer[0];
+        }
+        return marker;
+    }
+};
+auto ToArray = BasicBench<to_array>;
+BENCHMARK(ToArray);
+
+struct iterate_all {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i < count; ++i) {
+            roaring_bitmap_t *r = bitmaps[i];
+            roaring_uint32_iterator_t j;
+            roaring_init_iterator(r, &j);
+            while (j.has_value) {
+                marker++;
+                roaring_advance_uint32_iterator(&j);
+            }
+        }
+        return marker;
+    }
+};
+auto IterateAll = BasicBench<iterate_all>;
+BENCHMARK(IterateAll);
+
+
+struct compute_cardinality {
+    static uint64_t run() {
+        uint64_t marker = 0;
+        for (size_t i = 0; i < count; ++i) {
+            marker += roaring_bitmap_get_cardinality(bitmaps[i]);
+        }
+        return marker;
+    }
+};
+
+auto ComputeCardinality = BasicBench<compute_cardinality>;
+BENCHMARK(ComputeCardinality);
+
+// Loads the data set (argv[1], or the default census1881 directory when no
+// argument is given or the first one is an option) and runs the benchmarks.
+int main(int argc, char **argv) {
+    const char *dir_name;
+    if ((argc == 1) || (argc > 1 && argv[1][0] == '-')) {
+        benchmark::AddCustomContext(
+            "benchmarking other files",
+            "You may pass is a data directory as a parameter.");
+        dir_name = BENCHMARK_DATA_DIR "census1881";
+    } else {
+        dir_name = argv[1];
+    }
+    int number_loaded = load(dir_name);
+#if (__APPLE__ && __aarch64__) || defined(__linux__)
+    if (!collector.has_events()) {
+        benchmark::AddCustomContext("performance counters",
+                                    "No privileged access (sudo may help).");
+    }
+#else
+    if (!collector.has_events()) {
+        benchmark::AddCustomContext("performance counters",
+                                    "Unsupported system.");
+    }
+#endif
+
+#if CROARING_IS_X64
+    benchmark::AddCustomContext("x64", "detected");
+    int support = roaring::internal::croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    benchmark::AddCustomContext("AVX-512", "supported by compiler");
+    benchmark::AddCustomContext("AVX-512 hardware", ( support & roaring::internal::ROARING_SUPPORTS_AVX512 ) ? "yes" : "no");
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+    benchmark::AddCustomContext("AVX-2 hardware", ( support & roaring::internal::ROARING_SUPPORTS_AVX2 ) ? "yes" : "no");
+#endif // CROARING_IS_X64
+    benchmark::AddCustomContext("data source", dir_name);
+    benchmark::AddCustomContext("number of bitmaps", std::to_string(count));
+    benchmark::AddCustomContext(
+        "In RAM volume in MiB (estimated)",
+        std::to_string(bitmap_examples_bytes / (1024 * 1024.0)));
+    if (number_loaded == -1) {
+        return EXIT_FAILURE;
+    }
+    benchmark::Initialize(&argc, argv);
+    benchmark::RunSpecifiedBenchmarks();
+    benchmark::Shutdown();
+    for (size_t i = 0; i < count; ++i) roaring_bitmap_free(bitmaps[i]);
+    free(bitmaps);  // the pointer array itself, not only its elements
+    free(array_buffer);
+    return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/microbenchmarks/bench.h b/microbenchmarks/bench.h
new file mode 100644
index 000000000..7a8d0662c
--- /dev/null
+++ b/microbenchmarks/bench.h
@@ -0,0 +1,247 @@
+#ifndef CROARING_MICROBENCHMARKS_BENCH_H
+#define CROARING_MICROBENCHMARKS_BENCH_H
+// clang-format off
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <sstream>
+
+#if (!defined(_WIN32) && !defined(_WIN64) && !(__MINGW32__) && !(__MINGW64__))
+#include <dirent.h>
+#else
+#include "toni_ronnko_dirent.h"
+#endif
+
+
+
+#include <benchmark/benchmark.h>
+#include <roaring/roaring.h>
+
+#include "performancecounters/event_counter.h"
+// clang-format on
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+event_collector collector;
+size_t N = 1000;
+size_t bitmap_examples_bytes = 0;
+size_t count = 0;
+roaring_bitmap_t **bitmaps = NULL;
+uint32_t * array_buffer;
+uint32_t maxvalue = 0;
+uint32_t maxcard = 0;
+
+/**
+ * Read the content of a file to a NUL-terminated char array. Caller is
+ * responsible for memory de-allocation.
+ * Returns NULL on error: the file cannot be opened or measured, memory
+ * cannot be allocated, or fread does not deliver the whole content.
+ * The stream is closed on every path.
+ * (If the individual files are small, this function is a good idea.)
+ */
+static char *read_file(const char *filename) {
+    FILE *fp = fopen(filename, "r");
+    if (!fp) {
+        printf("Could not open file %s\n", filename);
+        return NULL;
+    }
+    // ftell reports failure as -1; reject it before converting to size_t.
+    long length = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1L;
+    if (length < 0) {
+        fclose(fp);
+        return NULL;
+    }
+    rewind(fp);
+    size_t size = (size_t)length;
+    char *answer = (char *)malloc(size + 1);
+    if (answer && fread(answer, size, 1, fp) != 1) {
+        free(answer);
+        answer = NULL;
+    }
+    fclose(fp);
+    if (answer) answer[size] = '\0';
+    return answer;
+}
+
+/**
+ * Given a file made of comma-separated integers, read it all and generate
+ * an array of integers; *howmany receives the number of integers parsed.
+ * Returns NULL if the file cannot be read or memory cannot be allocated.
+ * The caller is responsible for memory de-allocation.
+ */
+static uint32_t *read_integer_file(const char *filename, size_t *howmany) {
+    char *buffer = read_file(filename);
+    if (buffer == NULL) return NULL;
+    size_t howmanyints = 1;
+    size_t i1 = 0;
+    for (; buffer[i1] != '\0'; i1++) {
+        if (buffer[i1] == ',') ++howmanyints;
+    }
+    uint32_t *answer = (uint32_t *)malloc(howmanyints * sizeof(uint32_t));
+    if (answer == NULL) {
+        free(buffer);  // the file content must not leak on this path
+        return NULL;
+    }
+    size_t pos = 0;
+    for (size_t i = 0; (i < i1) && (buffer[i] != '\0'); i++) {
+        while ((buffer[i] < '0') || (buffer[i] > '9')) {
+            i++;
+            if (buffer[i] == '\0') goto END;
+        }
+        uint32_t currentint = (uint32_t)(buffer[i++] - '0');
+        for (; (buffer[i] >= '0') && (buffer[i] <= '9'); i++)
+            currentint = currentint * 10 + (uint32_t)(buffer[i] - '0');
+        answer[pos++] = currentint;
+    }
+END:
+    if (pos != howmanyints) {
+        printf("unexpected number of integers! %d %d \n", (int)pos,
+               (int)howmanyints);
+    }
+    *howmany = pos;
+    free(buffer);
+    return answer;
+}
+
+/**
+ * Does the file filename ends with the given extension.
+ */
+static bool has_extension(const char *filename, const char *extension) {
+    const char *ext = strrchr(filename, '.');
+    return (ext && !strcmp(ext, extension));
+}
+
+/**
+ * read all (count) integer files in a directory. Caller is responsible
+ * for memory de-allocation. In case of error, a NULL is returned.
+ * On success *tcount is the number of files with the given extension and
+ * (*howmany)[k] is the number of integers in answer[k]; when a file cannot
+ * be loaded, answer[k] is NULL and (*howmany)[k] is 0.
+ */
+static uint32_t **read_all_integer_files(const char *dirname,
+                                         const char *extension,
+                                         size_t **howmany, size_t *tcount) {
+    struct dirent **entry_list;
+    int c = scandir(dirname, &entry_list, 0, alphasort);
+    if (c < 0) return NULL;
+    size_t truec = 0;
+    for (int i = 0; i < c; i++) {
+        if (has_extension(entry_list[i]->d_name, extension)) ++truec;
+    }
+    *tcount = truec;
+    *howmany = (size_t *)malloc(sizeof(size_t) * (*tcount));
+    uint32_t **answer = (uint32_t **)malloc(sizeof(uint32_t *) * (*tcount));
+    size_t dirlen = strlen(dirname);
+    char *modifdirname = (char *)dirname;
+    if (modifdirname[dirlen - 1] != '/') {
+        modifdirname = (char *)malloc(dirlen + 2);
+        strcpy(modifdirname, dirname);
+        modifdirname[dirlen] = '/';
+        modifdirname[dirlen + 1] = '\0';
+        dirlen++;
+    }
+    for (size_t i = 0, pos = 0; i < (size_t)c; i++) {
+        if (!has_extension(entry_list[i]->d_name, extension)) continue;
+        size_t filelen = strlen(entry_list[i]->d_name);
+        char *fullpath = (char *)malloc(dirlen + filelen + 1);
+        strcpy(fullpath, modifdirname);
+        strcpy(fullpath + dirlen, entry_list[i]->d_name);
+        (*howmany)[pos] = 0;  // stays 0 when the file cannot be loaded
+        answer[pos] = read_integer_file(fullpath, &((*howmany)[pos]));
+        pos++;
+        free(fullpath);
+    }
+    if (modifdirname != dirname) free(modifdirname);
+    for (int i = 0; i < c; ++i) free(entry_list[i]);
+    free(entry_list);
+    return answer;
+}
+/**
+ * Once you have collected all the integers, build the bitmaps (NULL is
+ * returned if numbers is NULL). Also sets the globals maxvalue (last value of
+ * each list, assumed sorted), maxcard, bitmap_examples_bytes, array_buffer.
+ */
+static roaring_bitmap_t **create_all_bitmaps(size_t *howmany,
+                                             uint32_t **numbers, size_t tcount,
+                                             bool runoptimize,
+                                             bool copy_on_write) {
+    if (numbers == NULL) return NULL;  // checked before any dereference
+    for (size_t i = 0; i < tcount; i++) {
+        if (howmany[i] > 0 && maxvalue < numbers[i][howmany[i] - 1]) {
+            maxvalue = numbers[i][howmany[i] - 1];
+        }
+        if(maxcard < howmany[i]) { maxcard = howmany[i]; }
+    }
+    roaring_bitmap_t **answer =
+        (roaring_bitmap_t **)malloc(sizeof(roaring_bitmap_t *) * tcount);
+    bitmap_examples_bytes = 0;
+    for (size_t i = 0; i < tcount; i++) {
+        answer[i] = roaring_bitmap_of_ptr(howmany[i], numbers[i]);
+        if (runoptimize) roaring_bitmap_run_optimize(answer[i]);
+        roaring_bitmap_shrink_to_fit(answer[i]);
+        bitmap_examples_bytes += roaring_bitmap_size_in_bytes(answer[i]);
+        roaring_bitmap_set_copy_on_write(answer[i], copy_on_write);
+    }
+    array_buffer = (uint32_t*) malloc(maxcard * sizeof(uint32_t));
+    return answer;
+}
+
+template <class func>
+static void BasicBench(benchmark::State &state) {
+    // volatile to prevent optimizations.
+    volatile uint64_t marker = 0;
+    for (auto _ : state) {
+        marker = func::run();
+    }
+    if (collector.has_events()) {
+        event_aggregate aggregate{};
+        for (size_t i = 0; i < N; i++) {
+            std::atomic_thread_fence(std::memory_order_acquire);
+            collector.start();
+            marker = func::run();
+            std::atomic_thread_fence(std::memory_order_release);
+            event_count allocate_count = collector.end();
+            aggregate << allocate_count;
+        }
+        state.counters["cycles"] = aggregate.best.cycles();
+
+        state.counters["instructions"] =  aggregate.best.instructions();
+        state.counters["GHz"] =
+            aggregate.best.cycles() / aggregate.best.elapsed_ns();
+    }
+    (void)marker;
+}
+
+
+// Loads every ".txt" file of `dirname` into the global `bitmaps` array and
+// returns how many bitmaps were built, or -1 on failure.
+int load(const char *dirname) {
+    const char *extension = ".txt";
+    bool copy_on_write = false;
+    bool runoptimize = true;
+    size_t *howmany;
+    uint32_t **numbers =
+        read_all_integer_files(dirname, extension, &howmany, &count);
+    if (numbers == NULL) {
+        printf(
+            "I could not find or load any data file with extension %s in "
+            "directory %s.\n",
+            extension, dirname);
+        return -1;
+    }
+    bitmaps =
+        create_all_bitmaps(howmany, numbers, count, runoptimize, copy_on_write);
+    // The raw integers are not used past this point: free lists and list array.
+    for (size_t i = 0; i < count; ++i) free(numbers[i]);
+    free(numbers);
+    free(howmany);
+    if (bitmaps == NULL) return -1;
+    return (int)count;
+}
+
+#endif
\ No newline at end of file
diff --git a/microbenchmarks/performancecounters/apple_arm_events.h b/microbenchmarks/performancecounters/apple_arm_events.h
new file mode 100644
index 000000000..c9eeb8aea
--- /dev/null
+++ b/microbenchmarks/performancecounters/apple_arm_events.h
@@ -0,0 +1,1011 @@
+
+// Original design from:
+// =============================================================================
+// XNU kperf/kpc
+// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges
+//
+// References:
+//
+// XNU source (since xnu 2422.1.72):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h
+// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c
+//
+// Lightweight PET (Profile Every Thread, since xnu 3789.1.32):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c
+//
+// System Private frameworks (since macOS 10.11, iOS 8.0):
+// /System/Library/PrivateFrameworks/kperf.framework
+// /System/Library/PrivateFrameworks/kperfdata.framework
+//
+// Xcode framework (since Xcode 7.0):
+// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework
+//
+// CPU database (plist files)
+// macOS (since macOS 10.11):
+//     /usr/share/kpep/<name>.plist
+// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0):
+//     /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+//     /DeviceSupport/<version>/DeveloperDiskImage.dmg/usr/share/kpep/<name>.plist
+//
+//
+// Created by YaoYuan <ibireme@gmail.com> on 2021.
+// Released into the public domain (unlicense.org).
+// =============================================================================
+
+#ifndef M1CYCLES_H
+#define M1CYCLES_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dlfcn.h>           // for dlopen() and dlsym()
+#include <mach/mach_time.h>  // for mach_absolute_time()
+#include <sys/kdebug.h>      // for kdebug trace decode
+#include <sys/sysctl.h>      // for sysctl()
+#include <unistd.h>          // for usleep()
+
+struct performance_counters {
+  double cycles;
+  double branches;
+  double missed_branches;
+  double instructions;
+  performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i)
+      : cycles(c), branches(b), missed_branches(m), instructions(i) {}
+  performance_counters(double c, double b, double m, double i)
+      : cycles(c), branches(b), missed_branches(m), instructions(i) {}
+  performance_counters(double init)
+      : cycles(init),
+        branches(init),
+        missed_branches(init),
+        instructions(init) {}
+
+  inline performance_counters &operator-=(const performance_counters &other) {
+    cycles -= other.cycles;
+    branches -= other.branches;
+    missed_branches -= other.missed_branches;
+    instructions -= other.instructions;
+    return *this;
+  }
+  inline performance_counters &min(const performance_counters &other) {
+    cycles = other.cycles < cycles ? other.cycles : cycles;
+    branches = other.branches < branches ? other.branches : branches;
+    missed_branches = other.missed_branches < missed_branches
+                          ? other.missed_branches
+                          : missed_branches;
+    instructions =
+        other.instructions < instructions ? other.instructions : instructions;
+    return *this;
+  }
+  inline performance_counters &operator+=(const performance_counters &other) {
+    cycles += other.cycles;
+    branches += other.branches;
+    missed_branches += other.missed_branches;
+    instructions += other.instructions;
+    return *this;
+  }
+
+  inline performance_counters &operator/=(double numerator) {
+    cycles /= numerator;
+    branches /= numerator;
+    missed_branches /= numerator;
+    instructions /= numerator;
+    return *this;
+  }
+};
+
+inline performance_counters operator-(const performance_counters &a,
+                                      const performance_counters &b) {
+  return performance_counters(a.cycles - b.cycles, a.branches - b.branches,
+                              a.missed_branches - b.missed_branches,
+                              a.instructions - b.instructions);
+}
+
+typedef float f32;
+typedef double f64;
+typedef int8_t i8;
+typedef uint8_t u8;
+typedef int16_t i16;
+typedef uint16_t u16;
+typedef int32_t i32;
+typedef uint32_t u32;
+typedef int64_t i64;
+typedef uint64_t u64;
+typedef size_t usize;
+
+// -----------------------------------------------------------------------------
+// <kperf.framework> header (reverse engineered)
+// This framework wraps some sysctl calls to communicate with the kpc in kernel.
+// Most functions require root privileges, or the process must be "blessed".
+// -----------------------------------------------------------------------------
+
+// Cross-platform class constants.
+#define KPC_CLASS_FIXED (0)
+#define KPC_CLASS_CONFIGURABLE (1)
+#define KPC_CLASS_POWER (2)
+#define KPC_CLASS_RAWPMU (3)
+
+// Cross-platform class mask constants.
+#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)                // 1
+#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE)  // 2
+#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)                // 4
+#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)              // 8
+
+// PMU version constants.
+#define KPC_PMU_ERROR (0)      // Error
+#define KPC_PMU_INTEL_V3 (1)   // Intel
+#define KPC_PMU_ARM_APPLE (2)  // ARM64
+#define KPC_PMU_INTEL_V2 (3)   // Old Intel
+#define KPC_PMU_ARM_V2 (4)     // Old ARM
+
+// The maximum number of counters we could read from every class in one go.
+// ARMV7: FIXED: 1, CONFIGURABLE: 4
+// ARM32: FIXED: 2, CONFIGURABLE: 6
+// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8)
+// x86: 32
+#define KPC_MAX_COUNTERS 32
+
+// Bits for defining what to do on an action.
+// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h
+#define KPERF_SAMPLER_TH_INFO (1U << 0)
+#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1)
+#define KPERF_SAMPLER_KSTACK (1U << 2)
+#define KPERF_SAMPLER_USTACK (1U << 3)
+#define KPERF_SAMPLER_PMC_THREAD (1U << 4)
+#define KPERF_SAMPLER_PMC_CPU (1U << 5)
+#define KPERF_SAMPLER_PMC_CONFIG (1U << 6)
+#define KPERF_SAMPLER_MEMINFO (1U << 7)
+#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8)
+#define KPERF_SAMPLER_TH_DISPATCH (1U << 9)
+#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10)
+#define KPERF_SAMPLER_SYS_MEM (1U << 11)
+#define KPERF_SAMPLER_TH_INSCYC (1U << 12)
+#define KPERF_SAMPLER_TK_INFO (1U << 13)
+
+// Maximum number of kperf action ids.
+#define KPERF_ACTION_MAX (32)
+
+// Maximum number of kperf timer ids.
+#define KPERF_TIMER_MAX (8)
+
+// x86/arm config registers are 64-bit
+typedef u64 kpc_config_t;
+
+/// Print current CPU identification string to the buffer (same as snprintf),
+/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC
+/// database in /usr/share/kpep.
+/// @return string's length, or negative value if error occurs.
+/// @note This method does not require root privileges.
+/// @details sysctl get(hw.cputype), get(hw.cpusubtype),
+///                 get(hw.cpufamily), get(machdep.cpu.model)
+static int (*kpc_cpu_string)(char *buf, usize buf_size);
+
+/// Get the version of KPC that's being run.
+/// @return See `PMU version constants` above.
+/// @details sysctl get(kpc.pmu_version)
+static u32 (*kpc_pmu_version)(void);
+
+/// Get running PMC classes.
+/// @return See `class mask constants` above,
+///         0 if error occurs or no class is set.
+/// @details sysctl get(kpc.counting)
+static u32 (*kpc_get_counting)(void);
+
+/// Set PMC classes to enable counting.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.counting)
+static int (*kpc_set_counting)(u32 classes);
+
+/// Get running PMC classes for current thread.
+/// @return See `class mask constants` above,
+///         0 if error occurs or no class is set.
+/// @details sysctl get(kpc.thread_counting)
+static u32 (*kpc_get_thread_counting)(void);
+
+/// Set PMC classes to enable counting for current thread.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.thread_counting)
+static int (*kpc_set_thread_counting)(u32 classes);
+
+/// Get how many config registers there are for a given mask.
+/// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`,
+///                        and 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @return 0 if error occurs or no class is set.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.config_count)
+static u32 (*kpc_get_config_count)(u32 classes);
+
+/// Get config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer to receive values, should not be smaller than
+///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), get(kpc.config)
+static int (*kpc_get_config)(u32 classes, kpc_config_t *config);
+
+/// Set config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer, should not be smaller than
+///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), set(kpc.config)
+static int (*kpc_set_config)(u32 classes, kpc_config_t *config);
+
+/// Get how many counters there are for a given mask.
+/// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`,
+///                        and 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.counter_count)
+static u32 (*kpc_get_counter_count)(u32 classes);
+
+/// Get counter accumulations.
+/// If `all_cpus` is true, the buffer count should not be smaller than
+/// (cpu_count * counter_count). Otherwise, the buffer count should not be
+/// smaller than (counter_count).
+/// @see kpc_get_counter_count(), kpc_cpu_count().
+/// @param all_cpus true for all CPUs, false for current cpu.
+/// @param classes See `class mask constants` above.
+/// @param curcpu A pointer to receive current cpu id, can be NULL.
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters)
+static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu,
+                                   u64 *buf);
+
+/// Get counter accumulations for current thread.
+/// @param tid Thread id, should be 0.
+/// @param buf_count The number of buf's elements (not bytes),
+///                  should not be smaller than kpc_get_counter_count().
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(kpc.thread_counters)
+static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf);
+
+/// Acquire/release the counters used by the Power Manager.
+/// @param val 1:acquire, 0:release
+/// @return 0 for success.
+/// @details sysctl set(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_set)(int val);
+
+/// Get the state of all_ctrs.
+/// @return 0 for success.
+/// @details sysctl get(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_get)(int *val_out);
+
+/// Set number of actions, should be `KPERF_ACTION_MAX`.
+/// @details sysctl set(kperf.action.count)
+static int (*kperf_action_count_set)(u32 count);
+
+/// Get number of actions.
+/// @details sysctl get(kperf.action.count)
+static int (*kperf_action_count_get)(u32 *count);
+
+/// Set what to sample when a trigger fires an action, e.g.
+/// `KPERF_SAMPLER_PMC_CPU`.
+/// @details sysctl set(kperf.action.samplers)
+static int (*kperf_action_samplers_set)(u32 actionid, u32 sample);
+
+/// Get what to sample when a trigger fires an action.
+/// @details sysctl get(kperf.action.samplers)
+static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample);
+
+/// Apply a task filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_task)
+static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port);
+
+/// Apply a pid filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_pid)
+static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid);
+
+/// Set number of time triggers, should be `KPERF_TIMER_MAX`.
+/// @details sysctl set(kperf.timer.count)
+static int (*kperf_timer_count_set)(u32 count);
+
+/// Get number of time triggers.
+/// @details sysctl get(kperf.timer.count)
+static int (*kperf_timer_count_get)(u32 *count);
+
+/// Set timer number and period.
+/// @details sysctl set(kperf.timer.period)
+static int (*kperf_timer_period_set)(u32 actionid, u64 tick);
+
+/// Get timer number and period.
+/// @details sysctl get(kperf.timer.period)
+static int (*kperf_timer_period_get)(u32 actionid, u64 *tick);
+
+/// Set timer number and actionid.
+/// @details sysctl set(kperf.timer.action)
+static int (*kperf_timer_action_set)(u32 actionid, u32 timerid);
+
+/// Get timer number and actionid.
+/// @details sysctl get(kperf.timer.action)
+static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid);
+
+/// Set which timer ID does PET (Profile Every Thread).
+/// @details sysctl set(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_set)(u32 timerid);
+
+/// Get which timer ID does PET (Profile Every Thread).
+/// @details sysctl get(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_get)(u32 *timerid);
+
+/// Enable or disable sampling.
+/// @details sysctl set(kperf.sampling)
+static int (*kperf_sample_set)(u32 enabled);
+
+/// Get whether sampling is currently enabled.
+/// @details sysctl get(kperf.sampling)
+static int (*kperf_sample_get)(u32 *enabled);
+
+/// Reset kperf: stop sampling, kdebug, timers and actions.
+/// @return 0 for success.
+static int (*kperf_reset)(void);
+
+/// Nanoseconds to CPU ticks.
+static u64 (*kperf_ns_to_ticks)(u64 ns);
+
+/// CPU ticks to nanoseconds.
+static u64 (*kperf_ticks_to_ns)(u64 ticks);
+
+/// CPU ticks frequency (mach_absolute_time).
+static u64 (*kperf_tick_frequency)(void);
+
+// -----------------------------------------------------------------------------
+// <kperfdata.framework> header (reverse engineered)
+// This framework provides some functions to access the local CPU database.
+// These functions do not require root privileges.
+// -----------------------------------------------------------------------------
+
+// KPEP CPU archtecture constants.
+#define KPEP_ARCH_I386 0
+#define KPEP_ARCH_X86_64 1
+#define KPEP_ARCH_ARM 2
+#define KPEP_ARCH_ARM64 3
+
+/// KPEP event (size: 48/28 bytes on 64/32 bit OS)
+typedef struct kpep_event {
+  const char *name;  ///< Unique name of an event, such as "INST_RETIRED.ANY".
+  const char *description;  ///< Description for this event.
+  const char *errata;       ///< Errata, currently NULL.
+  const char *alias;        ///< Alias name, such as "Instructions", "Cycles".
+  const char *fallback;     ///< Fallback event name for fixed counter.
+  u32 mask;     ///< NOTE(review): meaning not documented in this header.
+  u8 number;    ///< NOTE(review): meaning not documented in this header.
+  u8 umask;     ///< NOTE(review): meaning not documented in this header.
+  u8 reserved;
+  u8 is_fixed;  ///< Presumably non-zero for fixed-counter events -- confirm.
+} kpep_event;
+
+/// KPEP database (size: 144/80 bytes on 64/32 bit OS)
+typedef struct kpep_db {
+  const char *name;            ///< Database name, such as "haswell".
+  const char *cpu_id;          ///< Plist name, such as "cpu_7_8_10b282dc".
+  const char *marketing_name;  ///< Marketing name, such as "Intel Haswell".
+  void *plist_data;            ///< Plist data (CFDataRef), currently NULL.
+  void *event_map;  ///< All events (CFDict<CFSTR(event_name), kpep_event *>).
+  kpep_event
+      *event_arr;  ///< Event struct buffer (sizeof(kpep_event) * event_count).
+  kpep_event **fixed_event_arr;  ///< Fixed counter events (sizeof(kpep_event *)
+                                 ///< * fixed_counter_count)
+  void *alias_map;  ///< All aliases (CFDict<CFSTR(event_name), kpep_event *>).
+  usize reserved_1;
+  usize reserved_2;
+  usize reserved_3;
+  usize event_count;  ///< All events count.
+  usize alias_count;  ///< Presumably the number of entries in alias_map.
+  usize fixed_counter_count;  ///< Number of entries in fixed_event_arr.
+  usize config_counter_count;
+  usize power_counter_count;
+  u32 archtecture;  ///< One of the KPEP_ARCH_* constants defined above.
+  u32 fixed_counter_bits;
+  u32 config_counter_bits;
+  u32 power_counter_bits;
+} kpep_db;
+
+/// KPEP config (size: 80/44 bytes on 64/32 bit OS)
+typedef struct kpep_config {
+  kpep_db *db;  ///< Presumably the database passed to kpep_config_create().
+  kpep_event **ev_arr;  ///< (sizeof(kpep_event *) * counter_count), init NULL
+  usize *ev_map;        ///< (sizeof(usize *) * counter_count), init 0
+  usize *ev_idx;        ///< (sizeof(usize *) * counter_count), init -1
+  u32 *flags;           ///< (sizeof(u32 *) * counter_count), init 0
+  u64 *kpc_periods;     ///< (sizeof(u64 *) * counter_count), init 0
+  usize event_count;    ///< See kpep_config_events_count().
+  usize counter_count;  ///< Length used to size the arrays above.
+  u32 classes;  ///< See `class mask constants` above.
+  u32 config_counter;
+  u32 power_counter;
+  u32 reserved;
+} kpep_config;
+
+/// Error code for kpep_config_xxx() and kpep_db_xxx() functions.
+typedef enum {
+  KPEP_CONFIG_ERROR_NONE = 0,  ///< Success.
+  KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
+  KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
+  KPEP_CONFIG_ERROR_IO = 3,
+  KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
+  KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
+  KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
+  KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
+  KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
+  KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
+  KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
+  KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
+  KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
+  KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
+  KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
+  KPEP_CONFIG_ERROR_ERRNO = 15,
+  KPEP_CONFIG_ERROR_MAX  ///< Count of codes; sizes kpep_config_error_names.
+} kpep_config_error_code;
+
+/// Error descriptions, indexed by kpep_config_error_code value.
+static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
+    "none",
+    "invalid argument",
+    "out of memory",
+    "I/O",
+    "buffer too small",
+    "current system unknown",
+    "database path invalid",
+    "database not found",
+    "database architecture unsupported",
+    "database version unsupported",
+    "database corrupt",
+    "event not found",
+    "conflicting events",
+    "all counters must be forced",
+    "event unavailable",
+    "check errno"};
+
+/// Map a kpep_config_error_code value to its human-readable description.
+static const char *kpep_config_error_desc(int code) {
+  if (code < 0 || code >= KPEP_CONFIG_ERROR_MAX) {
+    return "unknown error";  // outside the table: no description available
+  }
+  return kpep_config_error_names[code];
+}
+
+/// Create a config.
+/// @param db A kpep db, see kpep_db_create()
+/// @param cfg_ptr A pointer to receive the new config.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);
+
+/// Free the config.
+static void (*kpep_config_free)(kpep_config *cfg);
+
+/// Add an event to config.
+/// @param cfg The config.
+/// @param ev_ptr An event pointer.
+/// @param flag 0: all, 1: user space only
+/// @param err Error bitmap pointer, can be NULL.
+///            If return value is `CONFLICTING_EVENTS`, this bitmap contains
+///            the conflicted event indices, e.g. "1 << 2" means index 2.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
+                                    u32 flag, u32 *err);
+
+/// Remove event at index.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);
+
+/// Force all counters.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_force_counters)(kpep_config *cfg);
+
+/// Get events count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get all event pointers.
+/// @param buf A buffer to receive event pointers.
+/// @param buf_size The buffer's size in bytes; must not be smaller than
+///                 kpep_config_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
+                                 usize buf_size);
+
+/// Get kpc register configs.
+/// @param buf A buffer to receive kpc register configs.
+/// @param buf_size The buffer's size in bytes; must not be smaller than
+///                 kpep_config_kpc_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
+                              usize buf_size);
+
+/// Get kpc register config count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get kpc classes.
+/// @param classes_ptr Receives the classes; see `class mask constants` above.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);
+
+/// Get the index mapping from event to counter.
+/// @param buf A buffer to receive indexes.
+/// @param buf_size The buffer's size in bytes; must not be smaller than
+///                 kpep_config_events_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);
+
+/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
+/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
+///             Pass NULL for current CPU.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);
+
+/// Free the kpep database.
+static void (*kpep_db_free)(kpep_db *db);
+
+/// Get the database's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_name)(kpep_db *db, const char **name);
+
+/// Get the event alias count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);
+
+/// Get all aliases.
+/// @param buf A buffer to receive all alias strings.
+/// @param buf_size The buffer's size in bytes;
+///        must not be smaller than kpep_db_aliases_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);
+
+/// Get counters count for given classes.
+/// @param classes 1: Fixed, 2: Configurable.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);
+
+/// Get the total event count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events_count)(kpep_db *db, usize *count);
+
+/// Get all events.
+/// @param buf A buffer to receive all event pointers.
+/// @param buf_size The buffer's size in bytes;
+///        must not be smaller than kpep_db_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);
+
+/// Get one event by name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);
+
+/// Get event's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);
+
+/// Get event's alias.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);
+
+/// Get event's description.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr);
+
+// -----------------------------------------------------------------------------
+// load kperf/kperfdata dynamic library
+// -----------------------------------------------------------------------------
+
+/// One dynamically resolved symbol: its exported name and the address of the
+/// function-pointer variable that receives the resolved address.
+typedef struct {
+  const char *name;  ///< Symbol name passed to dlsym().
+  void **impl;       ///< Location of the function pointer to fill in.
+} lib_symbol;
+
+/// Number of elements in a fixed-size array (not valid for pointers).
+#define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
+#define lib_symbol_def(name) {#name, (void **)&name}
+
+static const lib_symbol lib_symbols_kperf[] = {  // resolved via lib_handle_kperf
+    lib_symbol_def(kpc_pmu_version),
+    lib_symbol_def(kpc_cpu_string),
+    lib_symbol_def(kpc_set_counting),
+    lib_symbol_def(kpc_get_counting),
+    lib_symbol_def(kpc_set_thread_counting),
+    lib_symbol_def(kpc_get_thread_counting),
+    lib_symbol_def(kpc_get_config_count),
+    lib_symbol_def(kpc_get_counter_count),
+    lib_symbol_def(kpc_set_config),
+    lib_symbol_def(kpc_get_config),
+    lib_symbol_def(kpc_get_cpu_counters),
+    lib_symbol_def(kpc_get_thread_counters),
+    lib_symbol_def(kpc_force_all_ctrs_set),
+    lib_symbol_def(kpc_force_all_ctrs_get),
+    lib_symbol_def(kperf_action_count_set),
+    lib_symbol_def(kperf_action_count_get),
+    lib_symbol_def(kperf_action_samplers_set),
+    lib_symbol_def(kperf_action_samplers_get),
+    lib_symbol_def(kperf_action_filter_set_by_task),
+    lib_symbol_def(kperf_action_filter_set_by_pid),
+    lib_symbol_def(kperf_timer_count_set),
+    lib_symbol_def(kperf_timer_count_get),
+    lib_symbol_def(kperf_timer_period_set),
+    lib_symbol_def(kperf_timer_period_get),
+    lib_symbol_def(kperf_timer_action_set),
+    lib_symbol_def(kperf_timer_action_get),
+    lib_symbol_def(kperf_sample_set),
+    lib_symbol_def(kperf_sample_get),
+    lib_symbol_def(kperf_reset),
+    lib_symbol_def(kperf_timer_pet_set),
+    lib_symbol_def(kperf_timer_pet_get),
+    lib_symbol_def(kperf_ns_to_ticks),
+    lib_symbol_def(kperf_ticks_to_ns),
+    lib_symbol_def(kperf_tick_frequency),
+};  // lib_init() fails if any one of these cannot be resolved
+
+static const lib_symbol lib_symbols_kperfdata[] = {  // via lib_handle_kperfdata
+    lib_symbol_def(kpep_config_create),
+    lib_symbol_def(kpep_config_free),
+    lib_symbol_def(kpep_config_add_event),
+    lib_symbol_def(kpep_config_remove_event),
+    lib_symbol_def(kpep_config_force_counters),
+    lib_symbol_def(kpep_config_events_count),
+    lib_symbol_def(kpep_config_events),
+    lib_symbol_def(kpep_config_kpc),
+    lib_symbol_def(kpep_config_kpc_count),
+    lib_symbol_def(kpep_config_kpc_classes),
+    lib_symbol_def(kpep_config_kpc_map),
+    lib_symbol_def(kpep_db_create),
+    lib_symbol_def(kpep_db_free),
+    lib_symbol_def(kpep_db_name),
+    lib_symbol_def(kpep_db_aliases_count),
+    lib_symbol_def(kpep_db_aliases),
+    lib_symbol_def(kpep_db_counters_count),
+    lib_symbol_def(kpep_db_events_count),
+    lib_symbol_def(kpep_db_events),
+    lib_symbol_def(kpep_db_event),
+    lib_symbol_def(kpep_event_name),
+    lib_symbol_def(kpep_event_alias),
+    lib_symbol_def(kpep_event_description),
+};  // lib_init() fails if any one of these cannot be resolved
+
+#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf"
+#define lib_path_kperfdata \
+  "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata"
+
+static bool lib_inited = false;
+static bool lib_has_err = false;
+static char lib_err_msg[256];
+
+static void *lib_handle_kperf = NULL;
+static void *lib_handle_kperfdata = NULL;
+
+/// Unload both frameworks and return the loader state to "never initialized".
+static void lib_deinit(void) {
+  // Close whichever handles were opened, then forget them.
+  if (lib_handle_kperf != NULL) dlclose(lib_handle_kperf);
+  if (lib_handle_kperfdata != NULL) dlclose(lib_handle_kperfdata);
+  lib_handle_kperf = lib_handle_kperfdata = NULL;
+  // Null every resolved function pointer so no stale address survives.
+  for (usize k = 0; k != lib_nelems(lib_symbols_kperf); ++k) {
+    *lib_symbols_kperf[k].impl = NULL;
+  }
+  for (usize k = 0; k != lib_nelems(lib_symbols_kperfdata); ++k) {
+    *lib_symbols_kperfdata[k].impl = NULL;
+  }
+  lib_inited = false;
+  lib_has_err = false;
+}
+
+static bool lib_init(void) {  // loads both frameworks once; true on success
+#define return_err()    \
+  do {                  \
+    lib_deinit();       \
+    lib_inited = true;  \
+    lib_has_err = true; \
+    return false;       \
+  } while (false)
+  // return_err() above: unload everything, cache the failure, return false.
+  if (lib_inited) return !lib_has_err;  // later calls reuse the first outcome
+
+  // Open both frameworks; on failure lib_err_msg receives the dlerror() text.
+  lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY);
+  if (!lib_handle_kperf) {
+    snprintf(lib_err_msg, sizeof(lib_err_msg),
+             "Failed to load kperf.framework, message: %s.", dlerror());
+    return_err();
+  }
+  lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY);
+  if (!lib_handle_kperfdata) {
+    snprintf(lib_err_msg, sizeof(lib_err_msg),
+             "Failed to load kperfdata.framework, message: %s.", dlerror());
+    return_err();
+  }
+
+  // Resolve every table entry into its function pointer; one miss fails all.
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperf[i];
+    *symbol->impl = dlsym(lib_handle_kperf, symbol->name);
+    if (!*symbol->impl) {
+      snprintf(lib_err_msg, sizeof(lib_err_msg),
+               "Failed to load kperf function: %s.", symbol->name);
+      return_err();
+    }
+  }
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperfdata[i];
+    *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name);
+    if (!*symbol->impl) {
+      snprintf(lib_err_msg, sizeof(lib_err_msg),
+               "Failed to load kperfdata function: %s.", symbol->name);
+      return_err();
+    }
+  }
+
+  lib_inited = true;  // cache the success so the work above runs only once
+  lib_has_err = false;
+  return true;
+
+#undef return_err
+}
+
+// -----------------------------------------------------------------------------
+// kdebug private structs
+// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
+// -----------------------------------------------------------------------------
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__arm64__)
+typedef uint64_t kd_buf_argtype;
+#else
+typedef uintptr_t kd_buf_argtype;
+#endif
+
+typedef struct {
+  uint64_t timestamp;
+  kd_buf_argtype arg1;
+  kd_buf_argtype arg2;
+  kd_buf_argtype arg3;
+  kd_buf_argtype arg4;
+  kd_buf_argtype arg5; /* the thread ID */
+  uint32_t debugid;    /* see <sys/kdebug.h> */
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__LP64__) || defined(__arm64__)
+  uint32_t cpuid; /* cpu index, from 0 */
+  kd_buf_argtype unused;
+#endif
+} kd_buf;
+
+/* bits for the type field of kd_regtype */
+#define KDBG_CLASSTYPE 0x10000
+#define KDBG_SUBCLSTYPE 0x20000
+#define KDBG_RANGETYPE 0x40000
+#define KDBG_TYPENONE 0x80000
+#define KDBG_CKTYPES 0xF0000
+
+/* only trace at most 4 types of events, at the code granularity */
+#define KDBG_VALCHECK 0x00200000U
+
+typedef struct {
+  unsigned int type; /* built from the KDBG_*TYPE bits defined above */
+  unsigned int value1;
+  unsigned int value2;
+  unsigned int value3;
+  unsigned int value4;
+} kd_regtype;
+
+typedef struct {
+  /* number of events that can fit in the buffers */
+  int nkdbufs;
+  /* set if trace is disabled */
+  int nolog;
+  /* kd_ctrl_page.flags */
+  unsigned int flags;
+  /* number of threads in thread map */
+  int nkdthreads;
+  /* the owning pid */
+  int bufid;
+} kbufinfo_t;
+
+
+// -----------------------------------------------------------------------------
+// Demo
+// -----------------------------------------------------------------------------
+
+#define EVENT_NAME_MAX 8  ///< Capacity of event_alias::names.
+typedef struct {
+  const char *alias;                  ///< Label used when printing.
+  const char *names[EVENT_NAME_MAX];  ///< Candidate names in the pmc db.
+} event_alias;
+
+/// Event names from /usr/share/kpep/<name>.plist, tried in listed order.
+static const event_alias profile_events[] = {
+    {"cycles",
+     {
+         "FIXED_CYCLES",             // Apple A7-A15
+         "CPU_CLK_UNHALTED.THREAD",  // Intel Core 1st-10th
+         "CPU_CLK_UNHALTED.CORE",    // Intel Yonah, Merom
+     }},
+    {"instructions",
+     {
+         "FIXED_INSTRUCTIONS",  // Apple A7-A15
+         "INST_RETIRED.ANY"     // Intel Yonah, Merom, Core 1st-10th
+     }},
+    {"branches",
+     {
+         "INST_BRANCH",                   // Apple A7-A15
+         "BR_INST_RETIRED.ALL_BRANCHES",  // Intel Core 1st-10th
+         "INST_RETIRED.ANY",  // Intel Yonah, Merom; NOTE(review): same name as the "instructions" entry -- confirm
+     }},
+    {"branch-misses",
+     {
+         "BRANCH_MISPRED_NONSPEC",  // Apple A7-A15, since iOS 15, macOS 12
+         "BRANCH_MISPREDICT",       // Apple A7-A14
+         "BR_MISP_RETIRED.ALL_BRANCHES",  // Intel Core 2nd-10th
+         "BR_INST_RETIRED.MISPRED",       // Intel Yonah, Merom
+     }},
+};
+
+/// Return the first event named in `alias` that `db` knows, or NULL if none.
+static kpep_event *get_event(kpep_db *db, const event_alias *alias) {
+  usize slot = 0;
+  // A NULL name marks the end of the candidate list.
+  while (slot < EVENT_NAME_MAX && alias->names[slot] != NULL) {
+    kpep_event *match = NULL;
+    if (kpep_db_event(db, alias->names[slot], &match) == 0) return match;
+    slot++;
+  }
+  return NULL;
+}
+
+struct AppleEvents {  // sets up and reads kpc counters for profile_events
+  kpc_config_t regs[KPC_MAX_COUNTERS] = {0};  // filled by kpep_config_kpc()
+  usize counter_map[KPC_MAX_COUNTERS] = {0};  // filled by kpep_config_kpc_map()
+  u64 counters_0[KPC_MAX_COUNTERS] = {0};  // latest read done by get_counters()
+  u64 counters_1[KPC_MAX_COUNTERS] = {0};  // not referenced by any method here
+  static constexpr usize ev_count =
+      sizeof(profile_events) / sizeof(profile_events[0]);
+  bool init = false;    // true once setup_performance_counters() has been tried
+  bool worked = false;  // cached outcome of that attempt
+
+  /// One-time setup of the counters; later calls return the cached outcome.
+  /// Each failure prints a message and returns false.
+  inline bool setup_performance_counters() {
+    if (init) {
+      return worked;
+    }
+    init = true;
+
+    // load dylib
+    if (!lib_init()) {
+      printf("Error: %s\n", lib_err_msg);
+      return (worked = false);
+    }
+
+    // check permission: any non-zero return is reported as missing privileges
+    int force_ctrs = 0;
+    if (kpc_force_all_ctrs_get(&force_ctrs)) {
+      printf("Permission denied, xnu/kpc requires root privileges.\n");
+      return (worked = false);
+    }
+    int ret;
+    // load pmc db (NULL name selects the current CPU)
+    kpep_db *db = NULL;
+    if ((ret = kpep_db_create(NULL, &db))) {
+      printf("Error: cannot load pmc database: %d.\n", ret);
+      return (worked = false);
+    }
+    // printf("loaded db: %s (%s)\n", db->name, db->marketing_name);
+    // printf("number of fixed counters: %zu\n", db->fixed_counter_count);
+    // printf("number of configurable counters: %zu\n",
+    // db->config_counter_count);
+
+    // create a config
+    // NOTE(review): no path in this function calls kpep_db_free() or
+    // kpep_config_free(), so db and cfg appear to be leaked -- confirm intent.
+    kpep_config *cfg = NULL;
+    if ((ret = kpep_config_create(db, &cfg))) {
+      printf("Failed to create kpep config: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+    if ((ret = kpep_config_force_counters(cfg))) {
+      printf("Failed to force counters: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+
+    // get events: resolve every profile_events entry, failing if one is missing
+    kpep_event *ev_arr[ev_count] = {0};
+    for (usize i = 0; i < ev_count; i++) {
+      const event_alias *alias = profile_events + i;
+      ev_arr[i] = get_event(db, alias);
+      if (!ev_arr[i]) {
+        printf("Cannot find event: %s.\n", alias->alias);
+        return (worked = false);
+      }
+    }
+
+    // add event to config (flag 0, no conflict bitmap requested)
+    for (usize i = 0; i < ev_count; i++) {
+      kpep_event *ev = ev_arr[i];
+      if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) {
+        printf("Failed to add event: %d (%s).\n", ret,
+               kpep_config_error_desc(ret));
+        return (worked = false);
+      }
+    }
+
+    // prepare buffer and config: copy classes, register count, map and regs
+    u32 classes = 0;
+    usize reg_count = 0;
+    if ((ret = kpep_config_kpc_classes(cfg, &classes))) {
+      printf("Failed get kpc classes: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+    if ((ret = kpep_config_kpc_count(cfg, &reg_count))) {
+      printf("Failed get kpc count: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+    if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) {
+      printf("Failed get kpc map: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+    if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) {
+      printf("Failed get kpc registers: %d (%s).\n", ret,
+             kpep_config_error_desc(ret));
+      return (worked = false);
+    }
+
+    // set config to kernel
+    if ((ret = kpc_force_all_ctrs_set(1))) {
+      printf("Failed force all ctrs: %d.\n", ret);
+      return (worked = false);
+    }
+    if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) {
+      if ((ret = kpc_set_config(classes, regs))) {
+        printf("Failed set kpc config: %d.\n", ret);
+        return (worked = false);
+      }
+    }
+
+    // start counting
+    if ((ret = kpc_set_counting(classes))) {
+      printf("Failed set counting: %d.\n", ret);
+      return (worked = false);
+    }
+    if ((ret = kpc_set_thread_counting(classes))) {
+      printf("Failed set thread counting: %d.\n", ret);
+      return (worked = false);
+    }
+
+    return (worked = true);
+  }
+
+  inline performance_counters get_counters() {  // reads the counters once
+    static bool warned = false;  // print the failure message only once
+    int ret;
+    // read the thread counters into counters_0
+    if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) {
+      if (!warned) {
+        printf("Failed get thread counters before: %d.\n", ret);
+        warned = true;
+      }
+      return 1;  // converted to performance_counters (declared outside this section)
+    }
+    return performance_counters{  // NOTE(review): profile_events order is cycles, instructions, branches, branch-misses; slots 0,3,2,1 are passed -- check against performance_counters' field order
+        counters_0[counter_map[0]], counters_0[counter_map[3]],
+        counters_0[counter_map[2]], counters_0[counter_map[1]]};
+  }
+};
+
+#endif
diff --git a/microbenchmarks/performancecounters/event_counter.h b/microbenchmarks/performancecounters/event_counter.h
new file mode 100644
index 000000000..63e605690
--- /dev/null
+++ b/microbenchmarks/performancecounters/event_counter.h
@@ -0,0 +1,150 @@
+#ifndef __EVENT_COUNTER_H
+#define __EVENT_COUNTER_H
+
+#include <cctype>
+#ifndef _MSC_VER
+#include <dirent.h>
+#endif
+#include <cinttypes>
+
+#include <cstring>
+
+#include <chrono>
+#include <vector>
+
+#include "linux-perf-events.h"
+#ifdef __linux__
+#include <libgen.h>
+#endif
+
+#if __APPLE__ && __aarch64__
+#include "apple_arm_events.h"
+#endif
+
+struct event_count {
+  // Length of the measured interval, in seconds.
+  std::chrono::duration<double> elapsed;
+  // Raw counter values; slots 0 and 1 hold cycles and instructions.
+  std::vector<unsigned long long> event_counts;
+
+  // Index of each named counter inside event_counts.
+  enum event_counter_types {
+    CPU_CYCLES,
+    INSTRUCTIONS,
+  };
+
+  // A zeroed measurement with five counter slots.
+  event_count() : elapsed(0), event_counts(5, 0ULL) {}
+  event_count(const std::chrono::duration<double> _elapsed,
+              const std::vector<unsigned long long> _event_counts)
+      : elapsed(_elapsed), event_counts(_event_counts) {}
+  // Copying duplicates both the elapsed time and the counter vector.
+  event_count(const event_count& other) = default;
+  event_count& operator=(const event_count& other) = default;
+
+  double elapsed_sec() const { return elapsed.count(); }
+  double elapsed_ns() const {
+    using nanoseconds = std::chrono::duration<double, std::nano>;
+    return std::chrono::duration_cast<nanoseconds>(elapsed).count();
+  }
+  double cycles() const {
+    return static_cast<double>(event_counts[CPU_CYCLES]);
+  }
+  double instructions() const {
+    return static_cast<double>(event_counts[INSTRUCTIONS]);
+  }
+
+  // Slot-wise sum of two measurements; exactly five slots are combined.
+  event_count operator+(const event_count& other) const {
+    std::vector<unsigned long long> sums(5);
+    for (size_t slot = 0; slot < sums.size(); ++slot) {
+      sums[slot] = event_counts[slot] + other.event_counts[slot];
+    }
+    return event_count(elapsed + other.elapsed, sums);
+  }
+
+  void operator+=(const event_count& other) {
+    const event_count combined = *this + other;
+    *this = combined;
+  }
+};
+
+// Running summary (sum, fastest, slowest) over a series of measurements.
+struct event_aggregate {
+  bool has_events = false;
+  int iterations = 0;
+  event_count total{};
+  event_count best{};   // sample with the smallest elapsed time so far
+  event_count worst{};  // sample with the largest elapsed time so far
+
+  event_aggregate() = default;
+
+  // Fold one more measurement into the summary.
+  void operator<<(const event_count& other) {
+    const bool first = (iterations == 0);
+    if (first || other.elapsed < best.elapsed) best = other;
+    if (first || other.elapsed > worst.elapsed) worst = other;
+    total += other;
+    ++iterations;
+  }
+
+  // Per-iteration averages of the accumulated totals.
+  double elapsed_sec() const { return total.elapsed_sec() / iterations; }
+  double elapsed_ns() const { return total.elapsed_ns() / iterations; }
+  double cycles() const { return total.cycles() / iterations; }
+  double instructions() const { return total.instructions() / iterations; }
+};
+
+// Measures one start()/end() interval: elapsed wall-clock time plus, where
+// the platform supports it, hardware performance counters.
+struct event_collector {
+  event_count count{};  // filled in and returned by end()
+  std::chrono::time_point<std::chrono::steady_clock> start_clock{};
+
+#if defined(__linux__)
+  LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
+  event_collector()
+      : linux_events(std::vector<int>{PERF_COUNT_HW_CPU_CYCLES,
+                                      PERF_COUNT_HW_INSTRUCTIONS}) {}
+  bool has_events() { return linux_events.is_working(); }
+#elif __APPLE__ && __aarch64__
+  AppleEvents apple_events;
+  performance_counters diff;  // start() snapshot, then the end() - start() delta
+  event_collector() : diff(0) { apple_events.setup_performance_counters(); }
+  bool has_events() { return apple_events.setup_performance_counters(); }
+#else
+  event_collector() {}
+  bool has_events() { return false; }  // timing only on other platforms
+#endif
+
+  // Begin a measurement; guards match the member declarations (__linux__).
+  inline void start() {
+#if defined(__linux__)
+    linux_events.start();
+#elif __APPLE__ && __aarch64__
+    if (has_events()) diff = apple_events.get_counters();
+#endif
+    start_clock = std::chrono::steady_clock::now();
+  }
+  // Finish the measurement and return the filled-in `count`.
+  inline event_count& end() {
+    const auto end_clock = std::chrono::steady_clock::now();
+#if defined(__linux__)
+    linux_events.end(count.event_counts);
+#elif __APPLE__ && __aarch64__
+    if (has_events()) {
+      performance_counters end = apple_events.get_counters();
+      diff = end - diff;
+    }
+    count.event_counts[0] = diff.cycles;
+    count.event_counts[1] = diff.instructions;
+    count.event_counts[2] = diff.missed_branches;
+    count.event_counts[3] = 0;
+    count.event_counts[4] = diff.branches;
+#endif
+    count.elapsed = end_clock - start_clock;
+    return count;
+  }
+};
+
+#endif
diff --git a/microbenchmarks/performancecounters/ibireme.h b/microbenchmarks/performancecounters/ibireme.h
new file mode 100644
index 000000000..363d5d03b
--- /dev/null
+++ b/microbenchmarks/performancecounters/ibireme.h
@@ -0,0 +1,917 @@
+// =============================================================================
+// XNU kperf/kpc
+// Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges
+//
+// References:
+//
+// XNU source (since xnu 2422.1.72):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h
+// https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c
+//
+// Lightweight PET (Profile Every Thread, since xnu 3789.1.32):
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c
+// https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c
+//
+// System Private frameworks (since macOS 10.11, iOS 8.0):
+// /System/Library/PrivateFrameworks/kperf.framework
+// /System/Library/PrivateFrameworks/kperfdata.framework
+//
+// Xcode framework (since Xcode 7.0):
+// /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework
+//
+// CPU database (plist files)
+// macOS (since macOS 10.11):
+//     /usr/share/kpep/<name>.plist
+// iOS (copied from Xcode, since iOS 10.0, Xcode 8.0):
+//     /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
+//     /DeviceSupport/<version>/DeveloperDiskImage.dmg/usr/share/kpep/<name>.plist
+//
+//
+// Created by YaoYuan <ibireme@gmail.com> on 2021.
+// Released into the public domain (unlicense.org).
+// =============================================================================
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <dlfcn.h>          // for dlopen() and dlsym()
+#include <mach/mach_time.h> // for mach_absolute_time()
+#include <sys/kdebug.h>     // for kdebug trace decode
+#include <sys/sysctl.h>     // for sysctl()
+#include <unistd.h>         // for usleep()
+
+typedef float f32;
+typedef double f64;
+typedef int8_t i8;
+typedef uint8_t u8;
+typedef int16_t i16;
+typedef uint16_t u16;
+typedef int32_t i32;
+typedef uint32_t u32;
+typedef int64_t i64;
+typedef uint64_t u64;
+typedef size_t usize;
+
+// -----------------------------------------------------------------------------
+// <kperf.framework> header (reverse engineered)
+// This framework wraps some sysctl calls to communicate with the kpc in kernel.
+// Most functions require root privileges, or the process must be "blessed".
+// -----------------------------------------------------------------------------
+
+// Cross-platform class constants.
+#define KPC_CLASS_FIXED (0)
+#define KPC_CLASS_CONFIGURABLE (1)
+#define KPC_CLASS_POWER (2)
+#define KPC_CLASS_RAWPMU (3)
+
+// Cross-platform class mask constants.
+#define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)               // 1
+#define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2
+#define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)               // 4
+#define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)             // 8
+
+// PMU version constants.
+#define KPC_PMU_ERROR (0)     // Error
+#define KPC_PMU_INTEL_V3 (1)  // Intel
+#define KPC_PMU_ARM_APPLE (2) // ARM64
+#define KPC_PMU_INTEL_V2 (3)  // Old Intel
+#define KPC_PMU_ARM_V2 (4)    // Old ARM
+
+// The maximum number of counters we could read from every class in one go.
+// ARMV7: FIXED: 1, CONFIGURABLE: 4
+// ARM32: FIXED: 2, CONFIGURABLE: 6
+// ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8)
+// x86: 32
+#define KPC_MAX_COUNTERS 32
+
+// Bits for defining what to do on an action.
+// Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h
+#define KPERF_SAMPLER_TH_INFO (1U << 0)
+#define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1)
+#define KPERF_SAMPLER_KSTACK (1U << 2)
+#define KPERF_SAMPLER_USTACK (1U << 3)
+#define KPERF_SAMPLER_PMC_THREAD (1U << 4)
+#define KPERF_SAMPLER_PMC_CPU (1U << 5)
+#define KPERF_SAMPLER_PMC_CONFIG (1U << 6)
+#define KPERF_SAMPLER_MEMINFO (1U << 7)
+#define KPERF_SAMPLER_TH_SCHEDULING (1U << 8)
+#define KPERF_SAMPLER_TH_DISPATCH (1U << 9)
+#define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10)
+#define KPERF_SAMPLER_SYS_MEM (1U << 11)
+#define KPERF_SAMPLER_TH_INSCYC (1U << 12)
+#define KPERF_SAMPLER_TK_INFO (1U << 13)
+
+// Maximum number of kperf action ids.
+#define KPERF_ACTION_MAX (32)
+
+// Maximum number of kperf timer ids.
+#define KPERF_TIMER_MAX (8)
+
+// x86/arm config registers are 64-bit
+typedef u64 kpc_config_t;
+
+/// Print current CPU identification string to the buffer (same as snprintf),
+/// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC
+/// database in /usr/share/kpep.
+/// @return string's length, or negative value if error occurs.
+/// @note This method does not require root privileges.
+/// @details sysctl get(hw.cputype), get(hw.cpusubtype),
+///                 get(hw.cpufamily), get(machdep.cpu.model)
+static int (*kpc_cpu_string)(char *buf, usize buf_size);
+
+/// Get the version of KPC that's being run.
+/// @return See `PMU version constants` above.
+/// @details sysctl get(kpc.pmu_version)
+static u32 (*kpc_pmu_version)(void);
+
+/// Get running PMC classes.
+/// @return See `class mask constants` above,
+///         0 if error occurs or no class is set.
+/// @details sysctl get(kpc.counting)
+static u32 (*kpc_get_counting)(void);
+
+/// Set PMC classes to enable counting.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.counting)
+static int (*kpc_set_counting)(u32 classes);
+
+/// Get running PMC classes for current thread.
+/// @return See `class mask constants` above,
+///         0 if error occurs or no class is set.
+/// @details sysctl get(kpc.thread_counting)
+static u32 (*kpc_get_thread_counting)(void);
+
+/// Set PMC classes to enable counting for current thread.
+/// @param classes See `class mask constants` above, set 0 to shutdown counting.
+/// @return 0 for success.
+/// @details sysctl set(kpc.thread_counting)
+static int (*kpc_set_thread_counting)(u32 classes);
+
+/// Get how many config registers there are for a given mask.
+/// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`,
+///                    and return 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @return 0 if error occurs or no class is set.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.config_count)
+static u32 (*kpc_get_config_count)(u32 classes);
+
+/// Get config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer to receive values, should not be smaller than
+///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), get(kpc.config)
+static int (*kpc_get_config)(u32 classes, kpc_config_t *config);
+
+/// Set config registers.
+/// @param classes see `class mask constants` above.
+/// @param config Config buffer, should not be smaller than
+///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
+/// @return 0 for success.
+/// @details sysctl get(kpc.config_count), set(kpc.config)
+static int (*kpc_set_config)(u32 classes, kpc_config_t *config);
+
+/// Get how many counters there are for a given mask.
+/// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`,
+///                    and return 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
+/// @param classes See `class mask constants` above.
+/// @note This method does not require root privileges.
+/// @details sysctl get(kpc.counter_count)
+static u32 (*kpc_get_counter_count)(u32 classes);
+
+/// Get counter accumulations.
+/// If `all_cpus` is true, the buffer count should not be smaller than
+/// (cpu_count * counter_count). Otherwise, it should not be smaller than
+/// (counter_count).
+/// @see kpc_get_counter_count(), kpc_cpu_count().
+/// @param all_cpus true for all CPUs, false for current cpu.
+/// @param classes See `class mask constants` above.
+/// @param curcpu A pointer to receive current cpu id, can be NULL.
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters)
+static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu,
+                                   u64 *buf);
+
+/// Get counter accumulations for current thread.
+/// @param tid Thread id, should be 0.
+/// @param buf_count The number of buf's elements (not bytes),
+///                  should not be smaller than kpc_get_counter_count().
+/// @param buf Buffer to receive counter's value.
+/// @return 0 for success.
+/// @details sysctl get(kpc.thread_counters)
+static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf);
+
+/// Acquire/release the counters used by the Power Manager.
+/// @param val 1:acquire, 0:release
+/// @return 0 for success.
+/// @details sysctl set(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_set)(int val);
+
+/// Get the state of all_ctrs.
+/// @return 0 for success.
+/// @details sysctl get(kpc.force_all_ctrs)
+static int (*kpc_force_all_ctrs_get)(int *val_out);
+
+/// Set number of actions, should be `KPERF_ACTION_MAX`.
+/// @details sysctl set(kperf.action.count)
+static int (*kperf_action_count_set)(u32 count);
+
+/// Get number of actions.
+/// @details sysctl get(kperf.action.count)
+static int (*kperf_action_count_get)(u32 *count);
+
+/// Set what to sample when a trigger fires an action, e.g.
+/// `KPERF_SAMPLER_PMC_CPU`.
+/// @details sysctl set(kperf.action.samplers)
+static int (*kperf_action_samplers_set)(u32 actionid, u32 sample);
+
+/// Get what to sample when a trigger fires an action.
+/// @details sysctl get(kperf.action.samplers)
+static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample);
+
+/// Apply a task filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_task)
+static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port);
+
+/// Apply a pid filter to the action, -1 to disable filter.
+/// @details sysctl set(kperf.action.filter_by_pid)
+static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid);
+
+/// Set number of time triggers, should be `KPERF_TIMER_MAX`.
+/// @details sysctl set(kperf.timer.count)
+static int (*kperf_timer_count_set)(u32 count);
+
+/// Get number of time triggers.
+/// @details sysctl get(kperf.timer.count)
+static int (*kperf_timer_count_get)(u32 *count);
+
+/// Set timer number and period.
+/// @details sysctl set(kperf.timer.period)
+static int (*kperf_timer_period_set)(u32 actionid, u64 tick);
+
+/// Get timer number and period.
+/// @details sysctl get(kperf.timer.period)
+static int (*kperf_timer_period_get)(u32 actionid, u64 *tick);
+
+/// Set timer number and actionid.
+/// @details sysctl set(kperf.timer.action)
+static int (*kperf_timer_action_set)(u32 actionid, u32 timerid);
+
+/// Get timer number and actionid.
+/// @details sysctl get(kperf.timer.action)
+static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid);
+
+/// Set which timer ID does PET (Profile Every Thread).
+/// @details sysctl set(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_set)(u32 timerid);
+
+/// Get which timer ID does PET (Profile Every Thread).
+/// @details sysctl get(kperf.timer.pet_timer)
+static int (*kperf_timer_pet_get)(u32 *timerid);
+
+/// Enable or disable sampling.
+/// @details sysctl set(kperf.sampling)
+static int (*kperf_sample_set)(u32 enabled);
+
+/// Get whether sampling is currently enabled.
+/// @details sysctl get(kperf.sampling)
+static int (*kperf_sample_get)(u32 *enabled);
+
+/// Reset kperf: stop sampling, kdebug, timers and actions.
+/// @return 0 for success.
+static int (*kperf_reset)(void);
+
+/// Nanoseconds to CPU ticks.
+static u64 (*kperf_ns_to_ticks)(u64 ns);
+
+/// CPU ticks to nanoseconds.
+static u64 (*kperf_ticks_to_ns)(u64 ticks);
+
+/// CPU ticks frequency (mach_absolute_time).
+static u64 (*kperf_tick_frequency)(void);
+
+/// Read the lightweight PET mode into `*enabled` (not in kperf.framework).
+static int kperf_lightweight_pet_get(u32 *enabled) {
+  usize len = sizeof(*enabled); // 4 bytes, the size of the u32 destination
+  if (enabled == NULL)
+    return -1;
+  return sysctlbyname("kperf.lightweight_pet", enabled, &len, NULL, 0);
+}
+
+/// Write `enabled` as the lightweight PET mode (not in kperf.framework).
+static int kperf_lightweight_pet_set(u32 enabled) {
+  return sysctlbyname("kperf.lightweight_pet", NULL, NULL, &enabled, sizeof(u32));
+}
+
+// -----------------------------------------------------------------------------
+// <kperfdata.framework> header (reverse engineered)
+// This framework provides some functions to access the local CPU database.
+// These functions do not require root privileges.
+// -----------------------------------------------------------------------------
+
+// KPEP CPU archtecture constants.
+#define KPEP_ARCH_I386 0
+#define KPEP_ARCH_X86_64 1
+#define KPEP_ARCH_ARM 2
+#define KPEP_ARCH_ARM64 3
+
+/// KPEP event (size: 48/28 bytes on 64/32 bit OS)
+typedef struct kpep_event {
+  const char *name; ///< Unique name of a event, such as "INST_RETIRED.ANY".
+  const char *description; ///< Description for this event.
+  const char *errata;      ///< Errata, currently NULL.
+  const char *alias;       ///< Alias name, such as "Instructions", "Cycles".
+  const char *fallback;    ///< Fallback event name for fixed counter.
+  u32 mask;
+  u8 number;
+  u8 umask;
+  u8 reserved;
+  u8 is_fixed;
+} kpep_event;
+
+/// KPEP database (size: 144/80 bytes on 64/32 bit OS)
+typedef struct kpep_db {
+  const char *name;           ///< Database name, such as "haswell".
+  const char *cpu_id;         ///< Plist name, such as "cpu_7_8_10b282dc".
+  const char *marketing_name; ///< Marketing name, such as "Intel Haswell".
+  void *plist_data;           ///< Plist data (CFDataRef), currently NULL.
+  void *event_map; ///< All events (CFDict<CFSTR(event_name), kpep_event *>).
+  kpep_event
+      *event_arr; ///< Event struct buffer (sizeof(kpep_event) * events_count).
+  kpep_event **fixed_event_arr; ///< Fixed counter events (sizeof(kpep_event *)
+                                ///< * fixed_counter_count)
+  void *alias_map; ///< All aliases (CFDict<CFSTR(event_name), kpep_event *>).
+  usize reserved_1;
+  usize reserved_2;
+  usize reserved_3;
+  usize event_count; ///< All events count.
+  usize alias_count;
+  usize fixed_counter_count;
+  usize config_counter_count;
+  usize power_counter_count;
+  u32 archtecture; ///< see `KPEP CPU archtecture constants` above.
+  u32 fixed_counter_bits;
+  u32 config_counter_bits;
+  u32 power_counter_bits;
+} kpep_db;
+
+/// KPEP config (size: 80/44 bytes on 64/32 bit OS)
+typedef struct kpep_config {
+  kpep_db *db;
+  kpep_event **ev_arr; ///< (sizeof(kpep_event *) * counter_count), init NULL
+  usize *ev_map;       ///< (sizeof(usize *) * counter_count), init 0
+  usize *ev_idx;       ///< (sizeof(usize *) * counter_count), init -1
+  u32 *flags;          ///< (sizeof(u32 *) * counter_count), init 0
+  u64 *kpc_periods;    ///< (sizeof(u64 *) * counter_count), init 0
+  usize event_count;   /// kpep_config_events_count()
+  usize counter_count;
+  u32 classes; ///< See `class mask constants` above.
+  u32 config_counter;
+  u32 power_counter;
+  u32 reserved;
+} kpep_config;
+
+/// Error code for kpep_config_xxx() and kpep_db_xxx() functions.
+typedef enum {
+  KPEP_CONFIG_ERROR_NONE = 0,
+  KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
+  KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
+  KPEP_CONFIG_ERROR_IO = 3,
+  KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
+  KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
+  KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
+  KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
+  KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
+  KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
+  KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
+  KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
+  KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
+  KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
+  KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
+  KPEP_CONFIG_ERROR_ERRNO = 15,
+  KPEP_CONFIG_ERROR_MAX
+} kpep_config_error_code;
+
+/// Error description for kpep_config_error_code.
+static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
+    "none",
+    "invalid argument",
+    "out of memory",
+    "I/O",
+    "buffer too small",
+    "current system unknown",
+    "database path invalid",
+    "database not found",
+    "database architecture unsupported",
+    "database version unsupported",
+    "database corrupt",
+    "event not found",
+    "conflicting events",
+    "all counters must be forced",
+    "event unavailable",
+    "check errno"};
+
+/// Error description.
+static const char *kpep_config_error_desc(int code) {
+  if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) {
+    return kpep_config_error_names[code];
+  }
+  return "unknown error";
+}
+
+/// Create a config.
+/// @param db A kpep db, see kpep_db_create()
+/// @param cfg_ptr A pointer to receive the new config.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);
+
+/// Free the config.
+static void (*kpep_config_free)(kpep_config *cfg);
+
+/// Add an event to config.
+/// @param cfg The config.
+/// @param ev_ptr An event pointer.
+/// @param flag 0: all, 1: user space only
+/// @param err Error bitmap pointer, can be NULL.
+///            If return value is `CONFLICTING_EVENTS`, this bitmap contains
+///            the conflicted event indices, e.g. "1 << 2" means index 2.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
+                                    u32 flag, u32 *err);
+
+/// Remove event at index.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);
+
+/// Force all counters.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_force_counters)(kpep_config *cfg);
+
+/// Get events count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get all event pointers.
+/// @param buf A buffer to receive event pointers.
+/// @param buf_size The buffer's size in bytes, should not be smaller than
+///                 kpep_config_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
+                                 usize buf_size);
+
+/// Get kpc register configs.
+/// @param buf A buffer to receive kpc register configs.
+/// @param buf_size The buffer's size in bytes, should not be smaller than
+///                 kpep_config_kpc_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
+                              usize buf_size);
+
+/// Get kpc register config count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);
+
+/// Get kpc classes.
+/// @param classes See `class mask constants` above.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);
+
+/// Get the index mapping from event to counter.
+/// @param buf A buffer to receive indexes.
+/// @param buf_size The buffer's size in bytes, should not be smaller than
+///                 kpep_config_events_count() * sizeof(kpc_config_t).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);
+
+/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
+/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
+///             Pass NULL for current CPU.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);
+
+/// Free the kpep database.
+static void (*kpep_db_free)(kpep_db *db);
+
+/// Get the database's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_name)(kpep_db *db, const char **name);
+
+/// Get the event alias count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);
+
+/// Get all aliases.
+/// @param buf A buffer to receive all alias strings.
+/// @param buf_size The buffer's size in bytes,
+///        should not be smaller than kpep_db_aliases_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);
+
+/// Get counters count for given classes.
+/// @param classes 1: Fixed, 2: Configurable.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);
+
+/// Get all event count.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events_count)(kpep_db *db, usize *count);
+
+/// Get all events.
+/// @param buf A buffer to receive all event pointers.
+/// @param buf_size The buffer's size in bytes,
+///        should not be smaller than kpep_db_events_count() * sizeof(void *).
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);
+
+/// Get one event by name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);
+
+/// Get event's name.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);
+
+/// Get event's alias.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);
+
+/// Get event's description.
+/// @return kpep_config_error_code, 0 for success.
+static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr);
+
+// -----------------------------------------------------------------------------
+// load kperf/kperfdata dynamic library
+// -----------------------------------------------------------------------------
+
+typedef struct {
+  const char *name;
+  void **impl;
+} lib_symbol;
+
+#define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
+#define lib_symbol_def(name)                                                   \
+  {                                                                            \
+#name, (void **)&name                                                      \
+  }
+
+static const lib_symbol lib_symbols_kperf[] = {
+    lib_symbol_def(kpc_pmu_version),
+    lib_symbol_def(kpc_cpu_string),
+    lib_symbol_def(kpc_set_counting),
+    lib_symbol_def(kpc_get_counting),
+    lib_symbol_def(kpc_set_thread_counting),
+    lib_symbol_def(kpc_get_thread_counting),
+    lib_symbol_def(kpc_get_config_count),
+    lib_symbol_def(kpc_get_counter_count),
+    lib_symbol_def(kpc_set_config),
+    lib_symbol_def(kpc_get_config),
+    lib_symbol_def(kpc_get_cpu_counters),
+    lib_symbol_def(kpc_get_thread_counters),
+    lib_symbol_def(kpc_force_all_ctrs_set),
+    lib_symbol_def(kpc_force_all_ctrs_get),
+    lib_symbol_def(kperf_action_count_set),
+    lib_symbol_def(kperf_action_count_get),
+    lib_symbol_def(kperf_action_samplers_set),
+    lib_symbol_def(kperf_action_samplers_get),
+    lib_symbol_def(kperf_action_filter_set_by_task),
+    lib_symbol_def(kperf_action_filter_set_by_pid),
+    lib_symbol_def(kperf_timer_count_set),
+    lib_symbol_def(kperf_timer_count_get),
+    lib_symbol_def(kperf_timer_period_set),
+    lib_symbol_def(kperf_timer_period_get),
+    lib_symbol_def(kperf_timer_action_set),
+    lib_symbol_def(kperf_timer_action_get),
+    lib_symbol_def(kperf_sample_set),
+    lib_symbol_def(kperf_sample_get),
+    lib_symbol_def(kperf_reset),
+    lib_symbol_def(kperf_timer_pet_set),
+    lib_symbol_def(kperf_timer_pet_get),
+    lib_symbol_def(kperf_ns_to_ticks),
+    lib_symbol_def(kperf_ticks_to_ns),
+    lib_symbol_def(kperf_tick_frequency),
+};
+
+static const lib_symbol lib_symbols_kperfdata[] = {
+    lib_symbol_def(kpep_config_create),
+    lib_symbol_def(kpep_config_free),
+    lib_symbol_def(kpep_config_add_event),
+    lib_symbol_def(kpep_config_remove_event),
+    lib_symbol_def(kpep_config_force_counters),
+    lib_symbol_def(kpep_config_events_count),
+    lib_symbol_def(kpep_config_events),
+    lib_symbol_def(kpep_config_kpc),
+    lib_symbol_def(kpep_config_kpc_count),
+    lib_symbol_def(kpep_config_kpc_classes),
+    lib_symbol_def(kpep_config_kpc_map),
+    lib_symbol_def(kpep_db_create),
+    lib_symbol_def(kpep_db_free),
+    lib_symbol_def(kpep_db_name),
+    lib_symbol_def(kpep_db_aliases_count),
+    lib_symbol_def(kpep_db_aliases),
+    lib_symbol_def(kpep_db_counters_count),
+    lib_symbol_def(kpep_db_events_count),
+    lib_symbol_def(kpep_db_events),
+    lib_symbol_def(kpep_db_event),
+    lib_symbol_def(kpep_event_name),
+    lib_symbol_def(kpep_event_alias),
+    lib_symbol_def(kpep_event_description),
+};
+
+#define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf"
+#define lib_path_kperfdata                                                     \
+  "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata"
+
+static bool lib_inited = false;
+static bool lib_has_err = false;
+static char lib_err_msg[256];
+
+static void *lib_handle_kperf = NULL;
+static void *lib_handle_kperfdata = NULL;
+
+static void lib_deinit(void) {
+  // Clear every resolved entry point so none outlives its library.
+  for (usize k = 0; k < lib_nelems(lib_symbols_kperf); k++)
+    *lib_symbols_kperf[k].impl = NULL;
+  for (usize k = 0; k < lib_nelems(lib_symbols_kperfdata); k++)
+    *lib_symbols_kperfdata[k].impl = NULL;
+  // Unload whichever frameworks were opened and forget their handles.
+  if (lib_handle_kperf != NULL) {
+    dlclose(lib_handle_kperf);
+    lib_handle_kperf = NULL;
+  }
+  if (lib_handle_kperfdata != NULL) {
+    dlclose(lib_handle_kperfdata);
+    lib_handle_kperfdata = NULL;
+  }
+  lib_inited = false; // back to the "not initialized, no error" state
+  lib_has_err = false;
+}
+
+static bool lib_init(void) { // dlopen kperf/kperfdata and dlsym every symbol
+#define return_err()                                                           \
+  do {                                                                         \
+    lib_deinit();                                                              \
+    lib_inited = true;                                                         \
+    lib_has_err = true;                                                        \
+    return false;                                                              \
+  } while (false)
+  // A failed attempt is cached too: return_err() marks the library inited.
+  if (lib_inited)
+    return !lib_has_err;
+
+  // Open both private frameworks; on failure lib_err_msg holds the reason.
+  lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY);
+  if (!lib_handle_kperf) {
+    snprintf(lib_err_msg, sizeof(lib_err_msg),
+             "Failed to load kperf.framework, message: %s.", dlerror());
+    return_err();
+  }
+  lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY);
+  if (!lib_handle_kperfdata) {
+    snprintf(lib_err_msg, sizeof(lib_err_msg),
+             "Failed to load kperfdata.framework, message: %s.", dlerror());
+    return_err();
+  }
+
+  // Resolve each table entry into its function pointer; one miss fails all.
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperf[i];
+    *symbol->impl = dlsym(lib_handle_kperf, symbol->name);
+    if (!*symbol->impl) {
+      snprintf(lib_err_msg, sizeof(lib_err_msg),
+               "Failed to load kperf function: %s.", symbol->name);
+      return_err();
+    }
+  }
+  for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) {
+    const lib_symbol *symbol = &lib_symbols_kperfdata[i];
+    *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name);
+    if (!*symbol->impl) {
+      snprintf(lib_err_msg, sizeof(lib_err_msg),
+               "Failed to load kperfdata function: %s.", symbol->name);
+      return_err();
+    }
+  }
+
+  lib_inited = true;
+  lib_has_err = false;
+  return true;
+
+#undef return_err
+}
+
+// -----------------------------------------------------------------------------
+// kdebug private structs
+// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
+// -----------------------------------------------------------------------------
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__arm64__)
+typedef uint64_t kd_buf_argtype;
+#else
+typedef uintptr_t kd_buf_argtype;
+#endif
+
+typedef struct {
+  uint64_t timestamp;
+  kd_buf_argtype arg1;
+  kd_buf_argtype arg2;
+  kd_buf_argtype arg3;
+  kd_buf_argtype arg4;
+  kd_buf_argtype arg5; /* the thread ID */
+  uint32_t debugid;    /* see <sys/kdebug.h> */
+
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__LP64__) || defined(__arm64__)
+  uint32_t cpuid; /* cpu index, from 0 */
+  kd_buf_argtype unused;
+#endif
+} kd_buf;
+
+/* bits for the type field of kd_regtype */
+#define KDBG_CLASSTYPE 0x10000
+#define KDBG_SUBCLSTYPE 0x20000
+#define KDBG_RANGETYPE 0x40000
+#define KDBG_TYPENONE 0x80000
+#define KDBG_CKTYPES 0xF0000
+
+/* only trace at most 4 types of events, at the code granularity */
+#define KDBG_VALCHECK 0x00200000U
+
+typedef struct {
+  unsigned int type;
+  unsigned int value1;
+  unsigned int value2;
+  unsigned int value3;
+  unsigned int value4;
+} kd_regtype;
+
+typedef struct {
+  /* number of events that can fit in the buffers */
+  int nkdbufs;
+  /* set if trace is disabled */
+  int nolog;
+  /* kd_ctrl_page.flags */
+  unsigned int flags;
+  /* number of threads in thread map */
+  int nkdthreads;
+  /* the owning pid */
+  int bufid;
+} kbufinfo_t;
+
+// -----------------------------------------------------------------------------
+// kdebug utils
+// -----------------------------------------------------------------------------
+
+/// Remove the trace buffers and reset ktrace/kdebug/kperf.
+/// @return 0 on success.
+static int kdebug_reset(void) {
+  int name[] = {CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE};
+  return sysctl(name, 3, NULL, NULL, NULL, 0);
+}
+
+/// Disable the trace buffers and initialize them again.
+/// @return 0 on success.
+static int kdebug_reinit(void) {
+  int name[] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETUP};
+  return sysctl(name, 3, NULL, NULL, NULL, 0);
+}
+
+/// Install the debug filter described by `kdr`; returns sysctl()'s result.
+static int kdebug_setreg(kd_regtype *kdr) {
+  usize kdr_len = sizeof(*kdr);
+  int name[] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETREG};
+  return sysctl(name, 3, kdr, &kdr_len, NULL, 0);
+}
+
+/// Set the maximum number of trace entries (kd_buf) to `nbufs`.
+/// Only allow allocation up to half the available memory (sane_size).
+/// @return 0 on success.
+static int kdebug_trace_setbuf(int nbufs) {
+  int name[] = {CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, nbufs};
+  return sysctl(name, 4, NULL, NULL, NULL, 0);
+}
+
+/// Turn kdebug tracing on (`enable` true) or off (`enable` false).
+/// Trace buffer must already be initialized.
+/// @return 0 on success.
+static int kdebug_trace_enable(bool enable) {
+  int name[] = {CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, enable ? 1 : 0};
+  return sysctl(name, 4, NULL, NULL, NULL, 0);
+}
+
+/// Fetch the kernel's trace buffer information into `info`.
+/// @return 0 on success, -1 when `info` is NULL.
+static int kdebug_get_bufinfo(kbufinfo_t *info) {
+  int name[] = {CTL_KERN, KERN_KDEBUG, KERN_KDGETBUF};
+  size_t info_len = sizeof(*info);
+  if (info == NULL)
+    return -1;
+  return sysctl(name, 3, info, &info_len, NULL, 0);
+}
+
+/// Retrieve trace buffers from kernel.
+/// @param buf Memory to receive buffer data, array of `kd_buf`.
+/// @param len Length of `buf` in bytes.
+/// @param count Number of trace entries (kd_buf) obtained, can be NULL.
+/// @return 0 on success, -1 on bad arguments, else sysctl()'s error return.
+static int kdebug_trace_read(void *buf, usize len, usize *count) {
+  if (count)
+    *count = 0;
+  if (!buf || !len)
+    return -1;
+
+  // Note: the input and output units are not the same.
+  // `len` goes in as bytes and comes back as a number of kd_buf entries.
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDREADTR};
+  int ret = sysctl(mib, 3, buf, &len, NULL, 0);
+  if (ret != 0)
+    return ret;
+  if (count) // optional out-parameter: tolerate NULL here as on entry
+    *count = len;
+  return 0;
+}
+
+/// Block until there are new buffers filled or `timeout_ms` have passed.
+/// @param timeout_ms timeout in milliseconds; 0 is rejected (returns -1).
+/// @param suc set true if new buffers filled, can be NULL.
+/// @return -1 if `timeout_ms` is 0, otherwise the sysctl() result.
+static int kdebug_wait(usize timeout_ms, bool *suc) {
+  if (timeout_ms == 0) // NOTE(review): presumably avoids a wait-forever call
+    return -1;
+  int mib[3] = {CTL_KERN, KERN_KDEBUG, KERN_KDBUFWAIT};
+  usize val = timeout_ms; // passed by pointer, so the call may overwrite it
+  int ret = sysctl(mib, 3, NULL, &val, NULL, 0);
+  if (suc)
+    *suc = !!val;
+  return ret;
+}
+
+// -----------------------------------------------------------------------------
+// Demo
+// -----------------------------------------------------------------------------
+
+#define EVENT_NAME_MAX 8
+typedef struct {
+  const char *alias;                 /// name for print
+  const char *names[EVENT_NAME_MAX]; /// name from pmc db
+} event_alias;
+
+/// Candidate event names per alias, from /usr/share/kpep/<name>.plist
+static const event_alias profile_events[] = {
+    {"cycles",
+     {
+         "FIXED_CYCLES",            // Apple A7-A15
+         "CPU_CLK_UNHALTED.THREAD", // Intel Core 1st-10th
+         "CPU_CLK_UNHALTED.CORE",   // Intel Yonah, Merom
+     }},
+    {"instructions",
+     {
+         "FIXED_INSTRUCTIONS", // Apple A7-A15
+         "INST_RETIRED.ANY"    // Intel Yonah, Merom, Core 1st-10th
+     }},
+    {"branches", // NOTE(review): last entry repeats an "instructions" name
+     {
+         "INST_BRANCH",                  // Apple A7-A15
+         "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1st-10th
+         "INST_RETIRED.ANY",             // Intel Yonah, Merom
+     }},
+    {"branch-misses",
+     {
+         "BRANCH_MISPRED_NONSPEC",       // Apple A7-A15, since iOS 15, macOS 12
+         "BRANCH_MISPREDICT",            // Apple A7-A14
+         "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2nd-10th
+         "BR_INST_RETIRED.MISPRED",      // Intel Yonah, Merom
+     }},
+};
+
+/// Look up the first of `alias`'s candidate names that exists in `db`.
+/// @return the matching event, or NULL when no candidate is found.
+static kpep_event *get_event(kpep_db *db, const event_alias *alias) {
+  // Candidates are tried in order; the list ends at the first NULL slot.
+  for (usize idx = 0; idx < EVENT_NAME_MAX && alias->names[idx]; idx++) {
+    kpep_event *found = NULL;
+    if (kpep_db_event(db, alias->names[idx], &found) != 0)
+      continue; // lookup failed, move on to the next candidate
+    return found;
+  }
+  return NULL; // every candidate lookup failed
+}
+
+static kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; // static: one copy per TU
+static usize counter_map[KPC_MAX_COUNTERS] = {0}; // (header has no guard)
+static u64 counters_0[KPC_MAX_COUNTERS] = {0};
+static u64 counters_1[KPC_MAX_COUNTERS] = {0};
+static const usize ev_count = lib_nelems(profile_events); // alias entries
diff --git a/microbenchmarks/performancecounters/linux-perf-events.h b/microbenchmarks/performancecounters/linux-perf-events.h
new file mode 100644
index 000000000..494aeb738
--- /dev/null
+++ b/microbenchmarks/performancecounters/linux-perf-events.h
@@ -0,0 +1,101 @@
+// https://github.com/WojciechMula/toys/blob/master/000helpers/linux-perf-events.h
+#pragma once
+#ifdef __linux__
+
+#include <asm/unistd.h>        // for __NR_perf_event_open
+#include <linux/perf_event.h>  // for perf event constants
+#include <sys/ioctl.h>         // for ioctl
+#include <unistd.h>            // for syscall
+
+#include <cerrno>   // for errno
+#include <cstring>  // for memset
+#include <stdexcept>
+
+#include <iostream>
+#include <vector>
+
+template <int TYPE = PERF_TYPE_HARDWARE>
+class LinuxEvents {
+  int fd;        // group-leader descriptor, -1 when the counters are unusable
+  bool working;  // cleared by report_error() on any failure
+  perf_event_attr attribs{};
+  size_t num_events{};
+  std::vector<uint64_t> temp_result_vec{};  // raw read(): nr, {value, id}...
+  std::vector<uint64_t> ids{};
+  std::vector<int> fds{};  // every descriptor opened; all closed on destroy
+
+ public:
+  // Opens one counter per config_vec entry as a single group for the current
+  // process on any CPU. If anything fails, is_working() returns false.
+  // NOTE: instances own file descriptors; copying one leads to double close.
+  explicit LinuxEvents(std::vector<int> config_vec) : fd(-1), working(true) {
+    memset(&attribs, 0, sizeof(attribs));
+    attribs.type = TYPE;
+    attribs.size = sizeof(attribs);
+    attribs.disabled = 1;
+    attribs.exclude_kernel = 1;
+    attribs.exclude_hv = 1;
+    attribs.sample_period = 0;
+    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
+    const int pid = 0;   // the current process
+    const int cpu = -1;  // all CPUs
+    const unsigned long flags = 0;
+    num_events = config_vec.size();
+    ids.resize(num_events);
+    temp_result_vec.resize(num_events * 2 + 1);
+    uint32_t i = 0;
+    for (auto config : config_vec) {
+      attribs.config = config;
+      // fd is -1 until the first open succeeds, so that event leads the group
+      const int opened = static_cast<int>(
+          syscall(__NR_perf_event_open, &attribs, pid, cpu, fd, flags));
+      if (opened == -1) {
+        report_error("perf_event_open");
+        break;  // never ioctl() an invalid descriptor
+      }
+      fds.push_back(opened);
+      if (fd == -1) fd = opened;
+      if (ioctl(opened, PERF_EVENT_IOC_ID, &ids[i++]) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_ID)");
+      }
+    }
+    if (fds.size() != num_events) fd = -1;  // incomplete or empty group
+  }
+
+  ~LinuxEvents() {
+    for (int opened : fds) close(opened);
+  }
+
+  inline void start() {
+    if (fd != -1) {
+      if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_RESET)");
+      }
+      if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
+      }
+    }
+  }
+
+  inline void end(std::vector<unsigned long long> &results) {
+    if (fd != -1) {
+      if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
+        report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
+      }
+      const size_t bytes = temp_result_vec.size() * sizeof(uint64_t);
+      if (read(fd, temp_result_vec.data(), bytes) == -1) {
+        report_error("read");
+      }
+    }
+    // slot 0 is the event count; the values sit in slots 1, 3, 5, ...
+    if (results.size() < num_events) results.resize(num_events);
+    for (size_t i = 0; i < num_events; i++) {
+      results[i] = temp_result_vec[2 * i + 1];
+    }
+  }
+
+  bool is_working() { return working; }
+ private:
+  void report_error(const std::string &) { working = false; }
+};
+#endif
diff --git a/microbenchmarks/toni_ronnko_dirent.h b/microbenchmarks/toni_ronnko_dirent.h
new file mode 100644
index 000000000..a9356644f
--- /dev/null
+++ b/microbenchmarks/toni_ronnko_dirent.h
@@ -0,0 +1,1075 @@
+/*
+ * Dirent interface for Microsoft Visual Studio
+ *
+ * Copyright (C) 1998-2019 Toni Ronkko
+ * This file is part of dirent.  Dirent may be freely distributed
+ * under the MIT license.  For all details and documentation, see
+ * https://github.com/tronkko/dirent
+ */
+#ifndef DIRENT_H
+#define DIRENT_H
+
+/* Hide warnings about unreferenced local functions */
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wunused-function"
+#elif defined(_MSC_VER)
+#pragma warning(disable : 4505)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+/*
+ * Include windows.h without Windows Sockets 1.1 to prevent conflicts with
+ * Windows Sockets 2.0.
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+
+#include <errno.h>
+#include <malloc.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <wchar.h>
+
+/* Indicates that d_type field is available in dirent structure */
+#define _DIRENT_HAVE_D_TYPE
+
+/* Indicates that d_namlen field is available in dirent structure */
+#define _DIRENT_HAVE_D_NAMLEN
+
+/* Entries missing from MSVC 6.0 */
+#if !defined(FILE_ATTRIBUTE_DEVICE)
+#define FILE_ATTRIBUTE_DEVICE 0x40
+#endif
+
+/* File type and permission flags for stat(), general mask */
+#if !defined(S_IFMT)
+#define S_IFMT _S_IFMT
+#endif
+
+/* Directory bit */
+#if !defined(S_IFDIR)
+#define S_IFDIR _S_IFDIR
+#endif
+
+/* Character device bit */
+#if !defined(S_IFCHR)
+#define S_IFCHR _S_IFCHR
+#endif
+
+/* Pipe bit */
+#if !defined(S_IFFIFO)
+#define S_IFFIFO _S_IFFIFO
+#endif
+
+/* Regular file bit */
+#if !defined(S_IFREG)
+#define S_IFREG _S_IFREG
+#endif
+
+/* Read permission */
+#if !defined(S_IREAD)
+#define S_IREAD _S_IREAD
+#endif
+
+/* Write permission */
+#if !defined(S_IWRITE)
+#define S_IWRITE _S_IWRITE
+#endif
+
+/* Execute permission */
+#if !defined(S_IEXEC)
+#define S_IEXEC _S_IEXEC
+#endif
+
+/* Pipe */
+#if !defined(S_IFIFO)
+#define S_IFIFO _S_IFIFO
+#endif
+
+/* Block device */
+#if !defined(S_IFBLK)
+#define S_IFBLK 0
+#endif
+
+/* Link */
+#if !defined(S_IFLNK)
+#define S_IFLNK 0
+#endif
+
+/* Socket */
+#if !defined(S_IFSOCK)
+#define S_IFSOCK 0
+#endif
+
+/* Read user permission */
+#if !defined(S_IRUSR)
+#define S_IRUSR S_IREAD
+#endif
+
+/* Write user permission */
+#if !defined(S_IWUSR)
+#define S_IWUSR S_IWRITE
+#endif
+
+/* Execute user permission */
+#if !defined(S_IXUSR)
+#define S_IXUSR 0
+#endif
+
+/* Read group permission */
+#if !defined(S_IRGRP)
+#define S_IRGRP 0
+#endif
+
+/* Write group permission */
+#if !defined(S_IWGRP)
+#define S_IWGRP 0
+#endif
+
+/* Execute group permission */
+#if !defined(S_IXGRP)
+#define S_IXGRP 0
+#endif
+
+/* Read others permission */
+#if !defined(S_IROTH)
+#define S_IROTH 0
+#endif
+
+/* Write others permission */
+#if !defined(S_IWOTH)
+#define S_IWOTH 0
+#endif
+
+/* Execute others permission */
+#if !defined(S_IXOTH)
+#define S_IXOTH 0
+#endif
+
+/* Maximum length of file name */
+#if !defined(PATH_MAX)
+#define PATH_MAX MAX_PATH
+#endif
+#if !defined(FILENAME_MAX)
+#define FILENAME_MAX MAX_PATH
+#endif
+#if !defined(NAME_MAX)
+#define NAME_MAX FILENAME_MAX
+#endif
+
+/* File type flags for d_type */
+#define DT_UNKNOWN 0
+#define DT_REG S_IFREG
+#define DT_DIR S_IFDIR
+#define DT_FIFO S_IFIFO
+#define DT_SOCK S_IFSOCK
+#define DT_CHR S_IFCHR
+#define DT_BLK S_IFBLK
+#define DT_LNK S_IFLNK
+
+/* Macros for converting between st_mode and d_type */
+#define IFTODT(mode) ((mode)&S_IFMT)
+#define DTTOIF(type) (type)
+
+/*
+ * File type macros.  Note that block devices, sockets and links cannot be
+ * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are
+ * only defined for compatibility.  These macros should always return false
+ * on Windows.
+ */
+#if !defined(S_ISFIFO)
+#define S_ISFIFO(mode) (((mode)&S_IFMT) == S_IFIFO)
+#endif
+#if !defined(S_ISDIR)
+#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
+#endif
+#if !defined(S_ISREG)
+#define S_ISREG(mode) (((mode)&S_IFMT) == S_IFREG)
+#endif
+#if !defined(S_ISLNK)
+#define S_ISLNK(mode) (((mode)&S_IFMT) == S_IFLNK)
+#endif
+#if !defined(S_ISSOCK)
+#define S_ISSOCK(mode) (((mode)&S_IFMT) == S_IFSOCK)
+#endif
+#if !defined(S_ISCHR)
+#define S_ISCHR(mode) (((mode)&S_IFMT) == S_IFCHR)
+#endif
+#if !defined(S_ISBLK)
+#define S_ISBLK(mode) (((mode)&S_IFMT) == S_IFBLK)
+#endif
+
+/* Return the exact length of the file name without zero terminator */
+#define _D_EXACT_NAMLEN(p) ((p)->d_namlen)
+
+/* Return the maximum size of a file name */
+#define _D_ALLOC_NAMLEN(p) ((PATH_MAX) + 1)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Wide-character version of the directory entry, filled by _wreaddir_r() */
+struct _wdirent {
+  /* Always zero */
+  long d_ino;
+
+  /* File position within stream; _wreaddir_r() always stores zero */
+  long d_off;
+
+  /* Structure size: sizeof(struct _wdirent) */
+  unsigned short d_reclen;
+
+  /* Length of name without \0 */
+  size_t d_namlen;
+
+  /* File type: DT_CHR, DT_DIR or DT_REG */
+  int d_type;
+
+  /* File name, zero-terminated, at most PATH_MAX characters */
+  wchar_t d_name[PATH_MAX + 1];
+};
+typedef struct _wdirent _wdirent;
+
+struct _WDIR {
+  /* Current directory entry; storage handed out by _wreaddir() */
+  struct _wdirent ent;
+
+  /* Private file data written by FindFirstFileExW() / FindNextFileW() */
+  WIN32_FIND_DATAW data;
+
+  /* True if data holds an entry that dirent_next() has not returned yet */
+  int cached;
+
+  /* Win32 search handle; INVALID_HANDLE_VALUE when no search is open */
+  HANDLE handle;
+
+  /* Directory name with the search pattern \* appended (heap allocated) */
+  wchar_t *patt;
+};
+typedef struct _WDIR _WDIR;
+
+/* Multi-byte character version of the directory entry, filled by readdir_r() */
+struct dirent {
+  /* Always zero */
+  long d_ino;
+
+  /* File position within stream: zero, or -1 for an unconvertible name */
+  long d_off;
+
+  /* Structure size: sizeof(struct dirent), or 0 for an unconvertible name */
+  unsigned short d_reclen;
+
+  /* Length of name without \0 */
+  size_t d_namlen;
+
+  /* File type: DT_CHR, DT_DIR, DT_REG, or DT_UNKNOWN for an unconvertible name */
+  int d_type;
+
+  /* File name as a zero-terminated multi-byte string */
+  char d_name[PATH_MAX + 1];
+};
+typedef struct dirent dirent;
+
+struct DIR {
+  struct dirent ent;   /* Entry storage handed out by readdir() */
+  struct _WDIR *wdirp; /* Underlying wide-character stream from _wopendir() */
+};
+typedef struct DIR DIR;
+
+/* Dirent functions */
+static DIR *opendir(const char *dirname);
+static _WDIR *_wopendir(const wchar_t *dirname);
+
+static struct dirent *readdir(DIR *dirp);
+static struct _wdirent *_wreaddir(_WDIR *dirp);
+
+static int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result);
+static int _wreaddir_r(_WDIR *dirp, struct _wdirent *entry,
+                       struct _wdirent **result);
+
+static int closedir(DIR *dirp);
+static int _wclosedir(_WDIR *dirp);
+
+static void rewinddir(DIR *dirp);
+static void _wrewinddir(_WDIR *dirp);
+
+static int scandir(const char *dirname, struct dirent ***namelist,
+                   int (*filter)(const struct dirent *),
+                   int (*compare)(const struct dirent **,
+                                  const struct dirent **));
+
+static int alphasort(const struct dirent **a, const struct dirent **b);
+
+static int versionsort(const struct dirent **a, const struct dirent **b);
+
+/* For compatibility with Symbian */
+#define wdirent _wdirent
+#define WDIR _WDIR
+#define wopendir _wopendir
+#define wreaddir _wreaddir
+#define wclosedir _wclosedir
+#define wrewinddir _wrewinddir
+
+/* Internal utility functions */
+static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp);
+static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp);
+
+static int dirent_mbstowcs_s(size_t *pReturnValue, wchar_t *wcstr,
+                             size_t sizeInWords, const char *mbstr,
+                             size_t count);
+
+static int dirent_wcstombs_s(size_t *pReturnValue, char *mbstr,
+                             size_t sizeInBytes, const wchar_t *wcstr,
+                             size_t count);
+
+static void dirent_set_errno(int error);
+
+/*
+ * Open directory stream DIRNAME for read and return a pointer to the
+ * internal working area that is used to retrieve individual directory
+ * entries.  Returns NULL on failure (errno is ENOENT for an empty name).
+ */
+static _WDIR *_wopendir(const wchar_t *dirname) {
+  _WDIR *dirp;
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+  /* Desktop: n receives the DWORD result of GetFullPathNameW() */
+  DWORD n;
+#else
+  /* WinRT: n receives the size_t result of wcslen() */
+  size_t n;
+#endif
+  wchar_t *p;
+
+  /* Must have directory name: NULL or "" is rejected with ENOENT */
+  if (dirname == NULL || dirname[0] == '\0') {
+    dirent_set_errno(ENOENT);
+    return NULL;
+  }
+
+  /* Allocate new _WDIR structure */
+  dirp = (_WDIR *)malloc(sizeof(struct _WDIR));
+  if (!dirp) {
+    return NULL;
+  }
+
+  /* Reset _WDIR structure so the exit_closedir path can release it safely */
+  dirp->handle = INVALID_HANDLE_VALUE;
+  dirp->patt = NULL;
+  dirp->cached = 0;
+
+  /*
+   * Compute the length of full path plus zero terminator
+   *
+   * Note that on WinRT there's no way to convert relative paths
+   * into absolute paths, so just assume it is an absolute path.
+   */
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+  /* Desktop */
+  n = GetFullPathNameW(dirname, 0, NULL, NULL);
+#else
+  /* WinRT */
+  n = wcslen(dirname);
+#endif
+
+  /* Allocate room for the directory name plus 16 spare bytes for the pattern */
+  dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16);
+  if (dirp->patt == NULL) {
+    goto exit_closedir;
+  }
+
+  /*
+   * Convert relative directory name to an absolute one.  This
+   * allows rewinddir() to function correctly even when current
+   * working directory is changed between opendir() and rewinddir().
+   *
+   * Note that on WinRT there's no way to convert relative paths
+   * into absolute paths, so just assume it is an absolute path.
+   */
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+  /* Desktop: from here on n is the value returned by this second call */
+  n = GetFullPathNameW(dirname, n, dirp->patt, NULL);
+  if (n <= 0) {
+    goto exit_closedir;
+  }
+#else
+  /* WinRT: copy the name unchanged */
+  wcsncpy_s(dirp->patt, n + 1, dirname, n);
+#endif
+
+  /* Append search pattern \* to the directory name */
+  p = dirp->patt + n;
+  switch (p[-1]) {
+  case '\\':
+  case '/':
+  case ':':
+      /* Directory ends in path separator, e.g. c:\temp\ */
+      /*NOP*/;
+    break;
+
+  default:
+    /* Directory name doesn't end in path separator: add one */
+    *p++ = '\\';
+  }
+  *p++ = '*';
+  *p = '\0';
+
+  /* Open directory stream and retrieve the first entry */
+  if (!dirent_first(dirp)) {
+    goto exit_closedir;
+  }
+
+  /* Success */
+  return dirp;
+
+  /* Failure: release the partially built stream */
+exit_closedir:
+  _wclosedir(dirp);
+  return NULL;
+}
+
+/*
+ * Fetch the next entry of a wide-character directory stream.
+ *
+ * The returned pointer refers to storage embedded in DIRP, so its contents
+ * may be overwritten by subsequent calls to _wreaddir().  NULL means that
+ * no further entry is available.
+ */
+static struct _wdirent *_wreaddir(_WDIR *dirp) {
+  struct _wdirent *current;
+
+  /*
+   * Delegate to the reentrant variant with the stream's own entry as the
+   * buffer.  Its status code is redundant here: the out-pointer is set to
+   * NULL whenever nothing was read.
+   */
+  (void)_wreaddir_r(dirp, &dirp->ent, &current);
+  return current;
+}
+
+/*
+ * Read the next directory entry into the caller-supplied ENTRY.
+ *
+ * Always returns zero.  RESULT receives ENTRY when an entry was read and
+ * NULL once the end of the directory stream has been reached.
+ *
+ * Only the name, its length and the file type carry information; d_ino
+ * and d_off are zeroed and d_reclen is the size of struct _wdirent.
+ */
+static int _wreaddir_r(_WDIR *dirp, struct _wdirent *entry,
+                       struct _wdirent **result) {
+  size_t len = 0;
+  DWORD attributes;
+  WIN32_FIND_DATAW *found;
+
+  /* Pull the next entry; NULL signals the end of the stream */
+  found = dirent_next(dirp);
+  if (found == NULL) {
+    *result = NULL;
+    return /*OK*/ 0;
+  }
+
+  /*
+   * Copy the file name as a wide-character string.  A name that does not
+   * fit in the destination buffer is truncated to PATH_MAX characters;
+   * the buffer is zero-terminated either way.
+   */
+  for (; len < PATH_MAX && found->cFileName[len] != 0; len++) {
+    entry->d_name[len] = found->cFileName[len];
+  }
+  entry->d_name[len] = 0;
+
+  /* Length of file name excluding zero terminator */
+  entry->d_namlen = len;
+
+  /*
+   * File type: the device attribute takes precedence over the directory
+   * attribute; everything else is reported as a regular file.
+   */
+  attributes = found->dwFileAttributes;
+  if ((attributes & FILE_ATTRIBUTE_DEVICE) != 0) {
+    entry->d_type = DT_CHR;
+  } else if ((attributes & FILE_ATTRIBUTE_DIRECTORY) != 0) {
+    entry->d_type = DT_DIR;
+  } else {
+    entry->d_type = DT_REG;
+  }
+
+  /* Fields that are not tracked by this implementation */
+  entry->d_ino = 0;
+  entry->d_off = 0;
+  entry->d_reclen = sizeof(struct _wdirent);
+
+  /* Hand the filled entry back to the caller */
+  *result = entry;
+  return /*OK*/ 0;
+}
+
+/*
+ * Close a directory stream opened by _wopendir().  This invalidates the
+ * _WDIR structure as well as any directory entry read previously by
+ * _wreaddir().
+ *
+ * Releases the search handle (if still open), the search pattern and the
+ * structure itself.  Returns 0 on success.  A NULL stream yields -1 with
+ * errno set to EBADF.
+ */
+static int _wclosedir(_WDIR *dirp) {
+  /* Invalid directory stream */
+  if (!dirp) {
+    dirent_set_errno(EBADF);
+    return /*failure*/ -1;
+  }
+
+  /* Release search handle, if a search is still open */
+  if (dirp->handle != INVALID_HANDLE_VALUE) {
+    FindClose(dirp->handle);
+  }
+
+  /* Release search pattern */
+  free(dirp->patt);
+
+  /* Release directory structure */
+  free(dirp);
+
+  return /*success*/ 0;
+}
+
+/*
+ * Rewind directory stream such that _wreaddir() returns the very first
+ * file name again.  A NULL stream is ignored.
+ */
+static void _wrewinddir(_WDIR *dirp) {
+  if (!dirp) {
+    return;
+  }
+  /* Release existing search handle */
+  if (dirp->handle != INVALID_HANDLE_VALUE) {
+    FindClose(dirp->handle);
+  }
+  /* Open new search handle */
+  dirent_first(dirp);
+}
+
+/*
+ * Get first directory entry (internal).
+ *
+ * Starts a new search with the pattern stored in DIRP.  Returns the first
+ * entry, which stays buffered for the next dirent_next() call, or NULL
+ * with errno set when the search cannot be started.  Any error other than
+ * access-denied or invalid-directory is reported as ENOENT.
+ */
+static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp) {
+  /* Open directory and retrieve the first entry */
+  dirp->handle = FindFirstFileExW(dirp->patt, FindExInfoStandard, &dirp->data,
+                                  FindExSearchNameMatch, NULL, 0);
+
+  /* Success: a directory entry is now waiting in memory */
+  if (dirp->handle != INVALID_HANDLE_VALUE) {
+    dirp->cached = 1;
+    return &dirp->data;
+  }
+
+  /* Failed to open directory: no directory entry in memory */
+  dirp->cached = 0;
+
+  /* Translate the Win32 error code into errno */
+  switch (GetLastError()) {
+  case ERROR_ACCESS_DENIED:
+    /* No read access to directory */
+    dirent_set_errno(EACCES);
+    break;
+
+  case ERROR_DIRECTORY:
+    /* Directory name is invalid */
+    dirent_set_errno(ENOTDIR);
+    break;
+
+  case ERROR_PATH_NOT_FOUND:
+  default:
+    /* Cannot find the file */
+    dirent_set_errno(ENOENT);
+  }
+  return NULL;
+}
+
+/*
+ * Get next directory entry (internal).
+ *
+ * Returns a pointer to the entry data held inside DIRP, or NULL when no
+ * further entry is available.  Three situations are distinguished:
+ *
+ *   - an entry buffered by dirent_first() is handed out first;
+ *   - otherwise the next entry is requested from the open search;
+ *   - without an open search handle there is nothing left to read.
+ *
+ * When the search is exhausted or fails, its handle is closed and marked
+ * invalid so that later calls return NULL immediately.
+ */
+static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp) {
+  /* A valid directory entry already in memory: consume it */
+  if (dirp->cached != 0) {
+    dirp->cached = 0;
+    return &dirp->data;
+  }
+
+  /* End of directory stream reached */
+  if (dirp->handle == INVALID_HANDLE_VALUE) {
+    return NULL;
+  }
+
+  /* Get the next directory entry from stream */
+  if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) {
+    /* Got a file */
+    return &dirp->data;
+  }
+
+  /* The very last entry has been processed or an error occurred */
+  FindClose(dirp->handle);
+  dirp->handle = INVALID_HANDLE_VALUE;
+  return NULL;
+}
+
+/*
+ * Open directory stream using plain old C-string.
+ *
+ * DIRNAME is converted to a wide-character string and handed to
+ * _wopendir().  Returns NULL when DIRNAME is NULL or empty (errno is set
+ * to ENOENT), when the DIR structure cannot be allocated, when the name
+ * cannot be converted to wide characters or when _wopendir() fails.
+ * On any failure after the allocation the DIR structure is freed again.
+ */
+static DIR *opendir(const char *dirname) {
+  struct DIR *stream;
+  wchar_t wide_name[PATH_MAX + 1];
+  size_t converted;
+
+  /* Must have directory name */
+  if (dirname == NULL || dirname[0] == '\0') {
+    dirent_set_errno(ENOENT);
+    return NULL;
+  }
+
+  /* Allocate memory for DIR structure */
+  stream = (DIR *)malloc(sizeof(struct DIR));
+  if (!stream) {
+    return NULL;
+  }
+
+  /*
+   * Convert directory name to wide-character string.  This fails if the
+   * string contains invalid multi-byte sequences or if the output buffer
+   * is too small to contain the resulting string.
+   */
+  if (dirent_mbstowcs_s(&converted, wide_name, PATH_MAX + 1, dirname,
+                        PATH_MAX + 1) != 0) {
+    /* Conversion failed: nothing but the structure to release */
+    free(stream);
+    return NULL;
+  }
+
+  /* Open directory stream using wide-character name */
+  stream->wdirp = _wopendir(wide_name);
+  if (!stream->wdirp) {
+    /* Wide-character stream could not be opened */
+    free(stream);
+    return NULL;
+  }
+
+  /* Success */
+  return stream;
+}
+
+/*
+ * Read next directory entry.
+ *
+ * Returns the entry stored inside DIRP, or NULL when none is available.
+ */
+static struct dirent *readdir(DIR *dirp) {
+  struct dirent *current;
+
+  /*
+   * The status code of readdir_r() can be ignored: the out-pointer is
+   * NULL whenever no entry was read.
+   */
+  (void)readdir_r(dirp, &dirp->ent, &current);
+  return current;
+}
+
+/*
+ * Read next directory entry into caller-allocated buffer.
+ *
+ * Always returns zero.  RESULT is set to ENTRY when an entry was read and
+ * to NULL when the end of the directory stream has been reached.
+ *
+ * A file name that cannot be converted to a multi-byte string, not even
+ * through its short 8+3 alias, is reported as the placeholder entry "?"
+ * with type DT_UNKNOWN, d_off set to -1 and d_reclen set to 0.
+ */
+static int readdir_r(DIR *dirp, struct dirent *entry, struct dirent **result) {
+  size_t converted;
+  int failed;
+  DWORD attributes;
+  WIN32_FIND_DATAW *found;
+
+  /* Read next directory entry; NULL means there are no more entries */
+  found = dirent_next(dirp->wdirp);
+  if (found == NULL) {
+    *result = NULL;
+    return /*OK*/ 0;
+  }
+
+  /* Attempt to convert file name to multi-byte string */
+  failed = dirent_wcstombs_s(&converted, entry->d_name, PATH_MAX + 1,
+                             found->cFileName, PATH_MAX + 1);
+
+  /*
+   * If the file name cannot be represented by a multi-byte string,
+   * then attempt to use old 8+3 file name.  This allows traditional
+   * Unix-code to access some file names despite unicode
+   * characters, although file names may seem unfamiliar to the user.
+   *
+   * Beware that the code below cannot come up with a short file
+   * name unless the file system provides one.  At least
+   * VirtualBox shared folders fail to do this.
+   */
+  if (failed && found->cAlternateFileName[0] != '\0') {
+    failed = dirent_wcstombs_s(&converted, entry->d_name, PATH_MAX + 1,
+                               found->cAlternateFileName, PATH_MAX + 1);
+  }
+
+  /*
+   * Cannot convert file name to multi-byte string so construct
+   * an erroneous directory entry and return that.  Note that
+   * we cannot return NULL as that would stop the processing
+   * of directory entries completely.
+   */
+  if (failed) {
+    entry->d_name[0] = '?';
+    entry->d_name[1] = '\0';
+    entry->d_namlen = 1;
+    entry->d_type = DT_UNKNOWN;
+    entry->d_ino = 0;
+    entry->d_off = -1;
+    entry->d_reclen = 0;
+    /* Return pointer to the placeholder entry */
+    *result = entry;
+    return /*OK*/ 0;
+  }
+
+  /* Length of file name excluding zero terminator */
+  entry->d_namlen = converted - 1;
+
+  /* File attributes */
+  attributes = found->dwFileAttributes;
+  if ((attributes & FILE_ATTRIBUTE_DEVICE) != 0) {
+    entry->d_type = DT_CHR;
+  } else if ((attributes & FILE_ATTRIBUTE_DIRECTORY) != 0) {
+    entry->d_type = DT_DIR;
+  } else {
+    entry->d_type = DT_REG;
+  }
+
+  /* Reset dummy fields */
+  entry->d_ino = 0;
+  entry->d_off = 0;
+  entry->d_reclen = sizeof(struct dirent);
+
+  /* Return pointer to directory entry */
+  *result = entry;
+  return /*OK*/ 0;
+}
+
+/*
+ * Close directory stream.
+ *
+ * Returns the result of closing the underlying wide-character stream.  A
+ * NULL stream yields -1 with errno set to EBADF.
+ */
+static int closedir(DIR *dirp) {
+  int status;
+  /* Invalid directory stream */
+  if (!dirp) {
+    dirent_set_errno(EBADF);
+    return /*failure*/ -1;
+  }
+
+  /* Close wide-character directory stream */
+  status = _wclosedir(dirp->wdirp);
+  dirp->wdirp = NULL;
+
+  /* Release multi-byte character version */
+  free(dirp);
+  return status;
+}
+
+/*
+ * Restart DIRP so that the next readdir() returns the first entry again.
+ */
+static void rewinddir(DIR *dirp) {
+  struct _WDIR *wide_stream = dirp->wdirp;
+  _wrewinddir(wide_stream); /* the wide stream holds the search state */
+}
+
+/*
+ * Scan directory for entries.
+ *
+ * Reads every entry of DIRNAME, keeps those accepted by FILTER and stores
+ * them as individually malloc()ed structures in a malloc()ed pointer table
+ * that is handed to the caller through NAMELIST.
+ *
+ * FILTER may be NULL, in which case every entry is kept.  COMPARE orders
+ * the table through qsort(); it may be NULL as well, in which case the
+ * entries are left in the order in which they were read.
+ *
+ * Returns the number of entries stored in the table.  Returns -1 if the
+ * directory cannot be opened or if memory runs out; in that case
+ * everything allocated so far is released and NAMELIST (when non-NULL)
+ * is set to NULL.
+ */
+static int scandir(const char *dirname, struct dirent ***namelist,
+                   int (*filter)(const struct dirent *),
+                   int (*compare)(const struct dirent **,
+                                  const struct dirent **)) {
+  struct dirent **files = NULL;
+  size_t size = 0;
+  size_t allocated = 0;
+  const size_t init_size = 1;
+  DIR *dir = NULL;
+  struct dirent *entry;
+  struct dirent *tmp = NULL;
+  size_t i;
+  int result = 0;
+
+  /* Open directory stream */
+  dir = opendir(dirname);
+  if (dir) {
+
+    /* Read directory entries to memory */
+    while (1) {
+
+      /* Enlarge pointer table to make room for another pointer */
+      if (size >= allocated) {
+        void *p;
+        size_t num_entries;
+
+        /* Start with init_size slots, then double on every growth */
+        num_entries = (size < init_size) ? init_size : size * 2;
+
+        /* Allocate first pointer table or enlarge existing table */
+        p = realloc(files, sizeof(void *) * num_entries);
+        if (p != NULL) {
+          /* Got the memory */
+          files = (dirent **)p;
+          allocated = num_entries;
+        } else {
+          /* Out of memory */
+          result = -1;
+          break;
+        }
+      }
+
+      /* Allocate room for temporary directory entry */
+      if (tmp == NULL) {
+        tmp = (struct dirent *)malloc(sizeof(struct dirent));
+        if (tmp == NULL) {
+          /* Cannot allocate temporary directory entry */
+          result = -1;
+          break;
+        }
+      }
+
+      /* Read directory entry to temporary area */
+      if (readdir_r(dir, tmp, &entry) == /*OK*/ 0) {
+
+        /* Did we get an entry? */
+        if (entry != NULL) {
+
+          /* No filter function means that everything is included */
+          if (!filter || filter(tmp)) {
+            /* Store the temporary entry to pointer table */
+            files[size++] = tmp;
+            tmp = NULL;
+
+            /* Keep up with the number of files */
+            result++;
+          }
+
+        } else {
+
+          /*
+           * End of directory stream reached => sort entries, unless
+           * the caller passed no comparison function, and exit.
+           */
+          if (compare) {
+            qsort(files, size, sizeof(void *),
+                  (int (*)(const void *, const void *))compare);
+          }
+          break;
+        }
+
+      } else {
+        /* Error reading directory entry */
+        result = /*Error*/ -1;
+        break;
+      }
+    }
+
+  } else {
+    /* Cannot open directory */
+    result = /*Error*/ -1;
+  }
+
+  /* Release temporary directory entry */
+  free(tmp);
+
+  /* Release allocated memory on error */
+  if (result < 0) {
+    for (i = 0; i < size; i++) {
+      free(files[i]);
+    }
+    free(files);
+    files = NULL;
+  }
+
+  /* Close directory stream */
+  if (dir) {
+    closedir(dir);
+  }
+
+  /* Pass pointer table to caller */
+  if (namelist) {
+    *namelist = files;
+  }
+  return result;
+}
+
+/* Order two entries by name using the current locale's collation (strcoll) */
+static int alphasort(const struct dirent **a, const struct dirent **b) {
+  return strcoll(a[0]->d_name, b[0]->d_name);
+}
+
+/* Sort versions; for now this orders by name exactly like alphasort() */
+static int versionsort(const struct dirent **a, const struct dirent **b) {
+  /* FIXME: implement strverscmp and use that */
+  return strcoll((*a)->d_name, (*b)->d_name);
+}
+
+/* Convert multi-byte string to wide character string; returns 0 on success */
+static int dirent_mbstowcs_s(size_t *pReturnValue, wchar_t *wcstr,
+                             size_t sizeInWords, const char *mbstr,
+                             size_t count) {
+  int error;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+
+  /* Microsoft Visual Studio 2005 or later: use the secure CRT routine */
+  error = mbstowcs_s(pReturnValue, wcstr, sizeInWords, mbstr, count);
+
+#else
+
+  /* Older Visual Studio or non-Microsoft compiler: emulate with mbstowcs() */
+  size_t n;
+
+  /* Convert to wide-character string (or count characters) */
+  n = mbstowcs(wcstr, mbstr, sizeInWords);
+  if (!wcstr || n < count) {
+
+    /* Zero-terminate output buffer, clamping n to the last valid slot */
+    if (wcstr && sizeInWords) {
+      if (n >= sizeInWords) {
+        n = sizeInWords - 1;
+      }
+      wcstr[n] = 0;
+    }
+
+    /* Length of resulting wide-character string WITH zero terminator */
+    if (pReturnValue) {
+      *pReturnValue = n + 1;
+    }
+
+    /* Success */
+    error = 0;
+
+  } else {
+
+    /* Could not convert string */
+    error = 1;
+  }
+
+#endif
+  return error;
+}
+
+/* Convert wide-character string to multi-byte string; returns 0 on success */
+static int dirent_wcstombs_s(size_t *pReturnValue, char *mbstr,
+                             size_t sizeInBytes, /* max size of mbstr */
+                             const wchar_t *wcstr, size_t count) {
+  int error;
+
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+
+  /* Microsoft Visual Studio 2005 or later: use the secure CRT routine */
+  error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count);
+
+#else
+
+  /* Older Visual Studio or non-Microsoft compiler: emulate with wcstombs() */
+  size_t n;
+
+  /* Convert to multi-byte string (or count the number of bytes needed) */
+  n = wcstombs(mbstr, wcstr, sizeInBytes);
+  if (!mbstr || n < count) {
+
+    /* Zero-terminate output buffer, clamping n to the last valid slot */
+    if (mbstr && sizeInBytes) {
+      if (n >= sizeInBytes) {
+        n = sizeInBytes - 1;
+      }
+      mbstr[n] = '\0';
+    }
+
+    /* Length of resulting multi-byte string WITH zero terminator */
+    if (pReturnValue) {
+      *pReturnValue = n + 1;
+    }
+
+    /* Success */
+    error = 0;
+
+  } else {
+
+    /* Cannot convert string */
+    error = 1;
+  }
+
+#endif
+  return error;
+}
+
+/* Set errno variable */
+static void dirent_set_errno(int error) {
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+
+  /* Microsoft Visual Studio 2005 and later */
+  _set_errno(error);
+
+#else
+
+  /* Non-Microsoft compiler or older Microsoft compiler */
+  errno = error;
+
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*DIRENT_H*/
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e68af7957..16007664d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -15,8 +15,10 @@ endif()
 
 MESSAGE( STATUS "ROARING_LIB_TYPE: " ${ROARING_LIB_TYPE})
 set(ROARING_SRC
+    isadetection.c
     array_util.c
     bitset_util.c
+    bitset.c
     containers/array.c
     containers/bitset.c
     containers/containers.c
@@ -38,34 +40,51 @@ if(ROARING_BUILD_C_AS_CPP)  # more checks and tools, e.g. <type_traits> analysis
   SET_SOURCE_FILES_PROPERTIES(${ROARING_SRC} PROPERTIES LANGUAGE CXX)
 endif()
 
-add_library(${ROARING_LIB_NAME} ${ROARING_LIB_TYPE} ${ROARING_SRC})
-target_include_directories(${ROARING_LIB_NAME}
+add_library(roaring ${ROARING_LIB_TYPE} ${ROARING_SRC})
+if(ROARING_DISABLE_AVX512)
+  target_compile_definitions(roaring PUBLIC CROARING_COMPILER_SUPPORTS_AVX512=0)
+endif(ROARING_DISABLE_AVX512)
+
+if(ROARING_DISABLE_AVX)
+  target_compile_definitions(roaring PUBLIC ROARING_DISABLE_AVX=1)
+endif(ROARING_DISABLE_AVX)
+
+if(ROARING_DISABLE_X64)
+  target_compile_definitions(roaring PUBLIC ROARING_DISABLE_X64=1)
+endif(ROARING_DISABLE_X64)
+
+if(ROARING_DISABLE_NEON)
+  target_compile_definitions(roaring PUBLIC DISABLENEON=1)
+endif(ROARING_DISABLE_NEON)
+
+
+target_include_directories(roaring
   PUBLIC
    $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
 )
-target_link_libraries(${ROARING_LIB_NAME} PUBLIC roaring-headers)
-target_link_libraries(${ROARING_LIB_NAME} PUBLIC roaring-headers-cpp)
+target_link_libraries(roaring PUBLIC roaring-headers)
+target_link_libraries(roaring PUBLIC roaring-headers-cpp)
 #
-#install(TARGETS ${ROARING_LIB_NAME} DESTINATION lib)
+#install(TARGETS roaring DESTINATION lib)
 #
-install(TARGETS ${ROARING_LIB_NAME} 
-   EXPORT ${ROARING_LIB_NAME}-config
+install(TARGETS roaring 
+   EXPORT roaring-config
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    INCLUDES DESTINATION ${CMAKE_INSTALL_INCDIR}
 )
-install(EXPORT ${ROARING_LIB_NAME}-config
-   FILE ${ROARING_LIB_NAME}-config.cmake
-   NAMESPACE ${ROARING_LIB_NAME}::
-   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${ROARING_LIB_NAME}
+install(EXPORT roaring-config
+   FILE roaring-config.cmake
+   NAMESPACE roaring::
+   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/roaring
  )
 
 if(NOT MSVC)
 ## We output the library at the root of the current directory where cmake is invoked
 ## This is handy but Visual Studio will happily ignore us
-set_target_properties(${ROARING_LIB_NAME} PROPERTIES
+set_target_properties(roaring PROPERTIES
   LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}
   VERSION ${ROARING_LIB_VERSION}
   SOVERSION ${ROARING_LIB_SOVERSION})
@@ -77,6 +96,6 @@ if(MSVC AND (ROARING_LIB_TYPE STREQUAL "SHARED"))
     MESSAGE( STATUS "To build  a Windows DLL using Visual Studio, you may need cmake 3.4 or better." )
   endif()
   MESSAGE( STATUS "Building a Windows DLL using Visual Studio, exporting all symbols automatically." )
- set_target_properties(${ROARING_LIB_NAME}
+ set_target_properties(roaring
     PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS 1)
 endif()
diff --git a/src/array_util.c b/src/array_util.c
index 48349105a..eb7dcbc49 100644
--- a/src/array_util.c
+++ b/src/array_util.c
@@ -9,14 +9,21 @@
 #include <roaring/portability.h>
 #include <roaring/utilasm.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
+using namespace ::roaring::internal;
 extern "C" { namespace roaring { namespace internal {
 #endif
 
 extern inline int32_t binarySearch(const uint16_t *array, int32_t lenarray,
                                    uint16_t ikey);
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 // used by intersect_vector16
 ALIGNED(0x1000)
 static const uint8_t shuffle_mask16[] = {
@@ -385,7 +392,7 @@ int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
                 v_b, vectorlength, v_a, vectorlength,
                 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
             const int r = _mm_extract_epi32(res_v, 0);
-            __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 + r);
+            __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
             __m128i p = _mm_shuffle_epi8(v_a, sm16);
             _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
             count += _mm_popcnt_u32(r);
@@ -409,7 +416,7 @@ int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
                     _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
                 const int r = _mm_extract_epi32(res_v, 0);
                 __m128i sm16 =
-                    _mm_load_si128((const __m128i *)shuffle_mask16 + r);
+                    _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
                 __m128i p = _mm_shuffle_epi8(v_a, sm16);
                 _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
                 count += _mm_popcnt_u32(r);
@@ -444,7 +451,120 @@ int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
     }
     return (int32_t)count;
 }
-CROARING_UNTARGET_REGION
+
+ALLOW_UNALIGNED
+int array_container_to_uint32_array_vector16(void *vout, const uint16_t* array, size_t cardinality,
+                                    uint32_t base) {
+    // writes base + array[k] for every k as 32-bit values; returns the count
+    const size_t lanes = sizeof(__m128i) / sizeof(uint16_t);
+    uint32_t *out = (uint32_t *)vout;
+    int written = 0;
+    size_t pos = 0;
+    for (; pos + lanes <= cardinality; pos += lanes) {
+        __m128i narrow = _mm_loadu_si128((const __m128i *)(array + pos));
+        __m256i wide = _mm256_add_epi32(_mm256_cvtepu16_epi32(narrow), _mm256_set1_epi32(base));
+        _mm256_storeu_si256((__m256i *)(out + written), wide);
+        written += sizeof(__m256i) / sizeof(uint32_t);
+    }
+    for (; pos < cardinality; ++pos, ++written) {
+        const uint32_t val = base + array[pos];
+        memcpy(out + written, &val, sizeof(uint32_t));  // scalar tail, unaligned-safe
+    }
+    return written;
+}
+
+int32_t intersect_vector16_inplace(uint16_t *__restrict__ A, size_t s_a,
+                           const uint16_t *__restrict__ B, size_t s_b) {
+    size_t count = 0;  // number of common values written back to A; returned at the end
+    size_t i_a = 0, i_b = 0;  // read positions in A and B
+    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
+    const size_t st_a = (s_a / vectorlength) * vectorlength;  // s_a rounded down to whole vectors
+    const size_t st_b = (s_b / vectorlength) * vectorlength;  // s_b rounded down to whole vectors
+    __m128i v_a, v_b;
+    if ((i_a < st_a) && (i_b < st_b)) {
+        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+        __m128i tmp[2] = {_mm_setzero_si128()};  // staging area: matches are written to A only when a block of A is done
+        size_t tmp_count = 0;  // number of staged matches
+        // NOTE(review): explicit-length compare while a block starts with 0, presumably because _mm_cmpistrm below treats 0 as a terminator
+        while ((A[i_a] == 0) || (B[i_b] == 0)) {
+            const __m128i res_v = _mm_cmpestrm(
+                v_b, vectorlength, v_a, vectorlength,
+                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+            const int r = _mm_extract_epi32(res_v, 0);  // bit k is set when lane k of v_a occurs in v_b
+            __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
+            __m128i p = _mm_shuffle_epi8(v_a, sm16);
+            _mm_storeu_si128((__m128i*)&((uint16_t*)tmp)[tmp_count], p);
+            tmp_count += _mm_popcnt_u32(r);
+            const uint16_t a_max = A[i_a + vectorlength - 1];
+            const uint16_t b_max = B[i_b + vectorlength - 1];
+            if (a_max <= b_max) {  // block of A is finished: flush the staged matches and advance A
+                _mm_storeu_si128((__m128i *)&A[count], tmp[0]);
+                _mm_storeu_si128(tmp, _mm_setzero_si128());
+                count += tmp_count;
+                tmp_count = 0;  // staging area is empty again
+                i_a += vectorlength;
+                if (i_a == st_a) break;
+                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+            }
+            if (b_max <= a_max) {  // block of B is finished: advance B
+                i_b += vectorlength;
+                if (i_b == st_b) break;
+                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+            }
+        }
+        if ((i_a < st_a) && (i_b < st_b)) {
+            while (true) {  // same steps as the loop above, with the implicit-length compare
+                const __m128i res_v = _mm_cmpistrm(
+                    v_b, v_a,
+                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
+                const int r = _mm_extract_epi32(res_v, 0);
+                __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 + r);
+                __m128i p = _mm_shuffle_epi8(v_a, sm16);
+                _mm_storeu_si128((__m128i*)&((uint16_t*)tmp)[tmp_count], p);
+                tmp_count += _mm_popcnt_u32(r);
+                const uint16_t a_max = A[i_a + vectorlength - 1];
+                const uint16_t b_max = B[i_b + vectorlength - 1];
+                if (a_max <= b_max) {  // block of A is finished: flush the staged matches and advance A
+                    _mm_storeu_si128((__m128i *)&A[count], tmp[0]);
+                    _mm_storeu_si128(tmp, _mm_setzero_si128());
+                    count += tmp_count;
+                    tmp_count = 0;  // staging area is empty again
+                    i_a += vectorlength;
+                    if (i_a == st_a) break;
+                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
+                }
+                if (b_max <= a_max) {  // block of B is finished: advance B
+                    i_b += vectorlength;
+                    if (i_b == st_b) break;
+                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
+                }
+            }
+        }
+        // tmp_count <= 8, so this does not affect efficiency so much
+        for (size_t i = 0; i < tmp_count; i++) {
+            A[count] = ((uint16_t*)tmp)[i];
+            count++;
+        }
+        i_a += tmp_count;  // We can at least jump past $tmp_count elements in A
+    }
+    // intersect the tail using scalar intersection
+    while (i_a < s_a && i_b < s_b) {
+        uint16_t a = A[i_a];
+        uint16_t b = B[i_b];
+        if (a < b) {
+            i_a++;
+        } else if (b < a) {
+            i_b++;
+        } else {
+            A[count] = a;  //==b;
+            count++;
+            i_a++;
+            i_b++;
+        }
+    }
+    return (int32_t)count;
+}
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
@@ -516,7 +636,7 @@ int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
     }
     return (int32_t)count;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
 /////////
@@ -586,7 +706,7 @@ int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
                 const int bitmask_belongs_to_difference =
                     _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
                 /*** next few lines are probably expensive *****/
-                __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
+                __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 +
                                               bitmask_belongs_to_difference);
                 __m128i p = _mm_shuffle_epi8(v_a, sm16);
                 _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
@@ -621,7 +741,7 @@ int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
                 _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
             const int bitmask_belongs_to_difference =
                 _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
-            __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
+            __m128i sm16 = _mm_loadu_si128((const __m128i *)shuffle_mask16 +
                                           bitmask_belongs_to_difference);
             __m128i p = _mm_shuffle_epi8(v_a, sm16);
             _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
@@ -660,7 +780,7 @@ int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
     }
     return count;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 #endif  // CROARING_IS_X64
 
 
@@ -1107,7 +1227,7 @@ int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
     return pos_out;
 }
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 
 /***
  * start of the SIMD 16-bit union code
@@ -1149,7 +1269,7 @@ static inline void sse_merge(const __m128i *vInput1,
     *vecMax = _mm_max_epu16(vecTmp, *vecMax);
     *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 2);
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 // used by store_unique, generated by simdunion.py
 static uint8_t uniqshuf[] = {
     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
@@ -1508,7 +1628,7 @@ static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) {
     _mm_storeu_si128((__m128i *)output, val);
     return numberofnewvalues;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 // working in-place, this function overwrites the repeated values
 // could be avoided?
@@ -1609,7 +1729,7 @@ uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
     }
     return len;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 /**
  * End of the SIMD 16-bit union code
@@ -1638,7 +1758,7 @@ static inline int store_unique_xor(__m128i old, __m128i newval,
     _mm_storeu_si128((__m128i *)output, val);
     return numberofnewvalues;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 // working in-place, this function overwrites the repeated values
 // could be avoided? Warning: assumes len > 0
@@ -1756,7 +1876,7 @@ uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
     }
     return len;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 /**
  * End of SIMD 16-bit XOR code
  */
@@ -1860,8 +1980,8 @@ size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
 
 size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
                     size_t size_2, uint16_t *buffer) {
-#ifdef CROARING_IS_X64
-    if( croaring_avx2() ) {
+#if CROARING_IS_X64
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
         // compute union with smallest array first
       if (size_1 < size_2) {
         return union_vector16(set_1, (uint32_t)size_1,
@@ -1891,7 +2011,67 @@ size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *s
     }
 #endif
 }
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+static inline bool _avx512_memequals(const void *s1, const void *s2, size_t n) {
+    // Returns true when the n bytes at s1 and at s2 are identical. The
+    // buffers are compared in progressively narrower strides: 64 bytes,
+    // then 32, then 8, then single bytes.
+
+    const uint8_t *lhs = (const uint8_t *)s1;
+    const uint8_t *rhs = (const uint8_t *)s2;
+
+    // each limit is lhs plus n rounded down to a multiple of the stride
+    const uint8_t *const stop64 = lhs + (n & ~(size_t)63);
+    const uint8_t *const stop32 = lhs + (n & ~(size_t)31);
+    const uint8_t *const stop8 = lhs + (n & ~(size_t)7);
+    const uint8_t *const stop = lhs + n;
+
+    // 64-byte stride: one mask bit per byte, all ones when the blocks match
+    for (; lhs < stop64; lhs += 64, rhs += 64) {
+        const __m512i a = _mm512_loadu_si512((const __m512i *)lhs);
+        const __m512i b = _mm512_loadu_si512((const __m512i *)rhs);
+        const uint64_t same = _mm512_cmpeq_epi8_mask(a, b);
+        if (same != UINT64_MAX) {
+            return false;
+        }
+    }
+
+    // 32-byte stride: runs at most once, fewer than 64 bytes are left
+    for (; lhs < stop32; lhs += 32, rhs += 32) {
+        const __m256i a = _mm256_loadu_si256((const __m256i *)lhs);
+        const __m256i b = _mm256_loadu_si256((const __m256i *)rhs);
+        const int same = _mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b));
+        if ((uint32_t)same != UINT32_MAX) {
+            return false;
+        }
+    }
+
+    // 8-byte stride: memcpy keeps the loads valid for unaligned pointers
+    for (; lhs < stop8; lhs += 8, rhs += 8) {
+        uint64_t a;
+        uint64_t b;
+        memcpy(&a, lhs, sizeof(uint64_t));
+        memcpy(&b, rhs, sizeof(uint64_t));
+        if (a != b) {
+            return false;
+        }
+    }
+
+    // byte-by-byte tail: fewer than 8 bytes are left
+    for (; lhs < stop; lhs++, rhs++) {
+        if (*lhs != *rhs) {
+            return false;
+        }
+    }
+
+    // no difference found
+    return true;
+}
+CROARING_UNTARGET_AVX512
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+
 CROARING_TARGET_AVX2
 static inline bool _avx2_memequals(const void *s1, const void *s2, size_t n) {
     const uint8_t *ptr1 = (const uint8_t *)s1;
@@ -1932,15 +2112,21 @@ static inline bool _avx2_memequals(const void *s1, const void *s2, size_t n) {
 
     return true;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 #endif
 
 bool memequals(const void *s1, const void *s2, size_t n) {
     if (n == 0) {
         return true;
     }
-#ifdef CROARING_IS_X64
-    if( croaring_avx2() ) {
+#if CROARING_IS_X64
+    int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if( support & ROARING_SUPPORTS_AVX512 ) {
+      return _avx512_memequals(s1, s2, n);
+    } else
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+    if( support & ROARING_SUPPORTS_AVX2 ) {
       return _avx2_memequals(s1, s2, n);
     } else {
       return memcmp(s1, s2, n) == 0;
@@ -1950,6 +2136,35 @@ bool memequals(const void *s1, const void *s2, size_t n) {
 #endif
 }
 
+
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+ALLOW_UNALIGNED
+int avx512_array_container_to_uint32_array(void *vout, const uint16_t* array, size_t cardinality,
+                                    uint32_t base) {
+    // writes base + array[k] for every k as 32-bit values; returns the count
+    const size_t lanes = sizeof(__m256i) / sizeof(uint16_t);
+    uint32_t *out = (uint32_t *)vout;
+    int written = 0;
+    size_t pos = 0;
+    for (; pos + lanes <= cardinality; pos += lanes) {
+        __m256i narrow = _mm256_loadu_si256((const __m256i *)(array + pos));
+        __m512i wide = _mm512_add_epi32(_mm512_cvtepu16_epi32(narrow), _mm512_set1_epi32(base));
+        _mm512_storeu_si512((__m512i *)(out + written), wide);
+        written += sizeof(__m512i) / sizeof(uint32_t);
+    }
+    for (; pos < cardinality; ++pos, ++written) {
+        const uint32_t val = base + array[pos];
+        memcpy(out + written, &val, sizeof(uint32_t));  // scalar tail, unaligned-safe
+    }
+    return written;
+}
+CROARING_UNTARGET_AVX512
+#endif // #if CROARING_COMPILER_SUPPORTS_AVX512
+#endif // #if CROARING_IS_X64
+
+
 #ifdef __cplusplus
 } } }  // extern "C" { namespace roaring { namespace internal {
 #endif
diff --git a/src/bitset.c b/src/bitset.c
new file mode 100644
index 000000000..03337951f
--- /dev/null
+++ b/src/bitset.c
@@ -0,0 +1,456 @@
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <roaring/bitset/bitset.h>
+#include <roaring/portability.h>
+#include <roaring/memory.h>
+
+#ifdef __cplusplus
+extern "C" { namespace roaring { namespace internal {
+#endif
+
+extern inline void bitset_print(const bitset_t *b);  // NOTE(review): C99 idiom, emits the external definition of an inline presumably defined in bitset.h; same for the lines below
+extern inline bool bitset_for_each(const bitset_t *b, bitset_iterator iterator,
+                                   void *ptr);
+extern inline size_t bitset_next_set_bits(const bitset_t *bitset, size_t *buffer,
+                                 size_t capacity, size_t *startfrom);
+extern inline void bitset_set_to_value(bitset_t *bitset, size_t i, bool flag);
+extern inline bool bitset_next_set_bit(const bitset_t *bitset, size_t *i);
+extern inline void bitset_set(bitset_t *bitset, size_t i);
+extern inline bool bitset_get(const bitset_t *bitset, size_t i);
+extern inline size_t bitset_size_in_words(const bitset_t *bitset);
+extern inline size_t bitset_size_in_bits(const bitset_t *bitset);
+extern inline size_t bitset_size_in_bytes(const bitset_t *bitset);
+
+
+/* Allocate an empty bitset: no word buffer, zero size and zero capacity.
+ * Returns NULL when the allocation fails. */
+bitset_t *bitset_create(void) {
+    bitset_t *fresh = (bitset_t *)roaring_malloc(sizeof(bitset_t));
+    if (fresh != NULL) {
+        /* Nothing is stored yet, so there is no buffer to point at. */
+        fresh->capacity = 0;
+        fresh->arraysize = 0;
+        fresh->array = NULL;
+    }
+    return fresh;
+}
+
+/* Create a new zeroed bitset able to contain size bits. Return NULL in case of
+ * failure. */
+bitset_t *bitset_create_with_capacity(size_t size) {
+    bitset_t *bitset = (bitset_t *)roaring_malloc(sizeof(bitset_t));
+    if (bitset == NULL) return NULL;
+    /* Round up to whole 64-bit words. Written without "size + 63" so that a
+     * size close to SIZE_MAX cannot wrap around to a tiny word count. */
+    bitset->arraysize = size / 64 + (size % 64 != 0 ? 1 : 0);
+    bitset->capacity = bitset->arraysize;
+    /* calloc(0, ...) may return NULL; an empty bitset simply has no buffer. */
+    bitset->array = bitset->arraysize == 0 ? NULL :
+        (uint64_t *)roaring_calloc(bitset->arraysize, sizeof(uint64_t));
+    if (bitset->array == NULL && bitset->arraysize != 0) {
+        roaring_free(bitset);
+        return NULL;
+    }
+    return bitset;
+}
+
+/* Create a deep copy of bitset. Return NULL in case of failure. */
+bitset_t *bitset_copy(const bitset_t *bitset) {
+    bitset_t *copy = (bitset_t *)roaring_malloc(sizeof(bitset_t));
+    if (copy == NULL) return NULL;
+    memcpy(copy, bitset, sizeof(bitset_t));
+    copy->capacity = copy->arraysize;  /* only the used words are copied */
+    copy->array = NULL;
+    /* Empty source: avoid malloc(0) (may return NULL) and memcpy from NULL. */
+    if (bitset->arraysize == 0) return copy;
+    copy->array = (uint64_t *)roaring_malloc(sizeof(uint64_t) * bitset->arraysize);
+    if (copy->array == NULL) {
+        roaring_free(copy);
+        return NULL;
+    }
+    memcpy(copy->array, bitset->array, sizeof(uint64_t) * bitset->arraysize);
+    return copy;
+}
+
+void bitset_clear(bitset_t *bitset) {  // set every bit to 0; the size is unchanged
+    memset(bitset->array, 0, bitset->arraysize * sizeof(bitset->array[0]));
+}
+
+void bitset_fill(bitset_t *bitset) {  // set every bit to 1; the size is unchanged
+    memset(bitset->array, 0xff, bitset->arraysize * sizeof(bitset->array[0]));
+}
+
+void bitset_shift_left(bitset_t *bitset, size_t s) {
+    // shift every bit s positions up, growing the array; no-op if the resize fails
+    size_t extra_words = s / 64;
+    int inword_shift = s % 64;
+    size_t as = bitset->arraysize;
+    if (inword_shift == 0) {
+        if (!bitset_resize(bitset, as + extra_words, false)) return;
+        for (size_t i = as + extra_words; i > extra_words; i--) {
+            bitset->array[i - 1] = bitset->array[i - 1 - extra_words];
+        }
+    } else {
+        if (!bitset_resize(bitset, as + extra_words + 1, true)) return;
+        if (as == 0) return;  // nothing to shift; the padding is already zero
+        bitset->array[as + extra_words] =
+            bitset->array[as - 1] >> (64 - inword_shift);
+        for (size_t i = as + extra_words; i >= extra_words + 2; i--) {
+            bitset->array[i - 1] =
+                (bitset->array[i - 1 - extra_words] << inword_shift) |
+                (bitset->array[i - 2 - extra_words] >> (64 - inword_shift));
+        }
+        bitset->array[extra_words] = bitset->array[0] << inword_shift;
+    }
+    // the vacated low words must read as zero
+    if (extra_words > 0) memset(bitset->array, 0, extra_words * sizeof(uint64_t));
+}
+
+void bitset_shift_right(bitset_t *bitset, size_t s) {  // shift towards bit 0; the array shrinks by s / 64 words
+    size_t extra_words = s / 64;
+    int inword_shift = s % 64;
+    size_t as = bitset->arraysize;
+    if (extra_words >= as) {
+        // every word is shifted out; guards the size_t arithmetic below against wrap-around
+        bitset_resize(bitset, 0, false);
+        return;
+    }
+    if (inword_shift == 0) {
+        memmove(bitset->array, bitset->array + extra_words,
+                sizeof(uint64_t) * (as - extra_words));
+    } else {
+        for (size_t i = 0; i + extra_words + 1 < as; i++) {
+            bitset->array[i] =
+                (bitset->array[i + extra_words] >> inword_shift) |
+                (bitset->array[i + extra_words + 1] << (64 - inword_shift));
+        }
+        bitset->array[as - extra_words - 1] = bitset->array[as - 1] >> inword_shift;
+    }
+    bitset_resize(bitset, as - extra_words, false);
+}
+
+/* Release the word buffer, then the bitset itself. */
+void bitset_free(bitset_t *bitset) {
+    if (!bitset) return;  /* NULL is accepted and ignored */
+    roaring_free(bitset->array);
+    roaring_free(bitset);
+}
+
+/* Set the size of the bitset to newarraysize 64-bit words, reallocating with
+ * a doubled capacity when needed. Words gained by growing are zeroed only if
+ * padwithzeroes is set. Returns false, leaving the bitset unchanged, when
+ * newarraysize exceeds SIZE_MAX/64 or the reallocation fails. */
+bool bitset_resize(bitset_t *bitset, size_t newarraysize, bool padwithzeroes) {
+    if (newarraysize > SIZE_MAX / 64) return false;
+    const size_t oldsize = bitset->arraysize;
+    if (newarraysize > bitset->capacity) {
+        size_t grown = bitset->capacity == 0 ? 1 : bitset->capacity;
+        while (grown < newarraysize) grown *= 2;
+        uint64_t *moved =
+            (uint64_t *)roaring_realloc(bitset->array, sizeof(uint64_t) * grown);
+        if (moved == NULL) return false;
+        bitset->array = moved;
+        bitset->capacity = grown;
+    }
+    if (padwithzeroes && newarraysize > oldsize) {
+        memset(bitset->array + oldsize, 0,
+               sizeof(uint64_t) * (newarraysize - oldsize));
+    }
+    bitset->arraysize = newarraysize;
+    return true;
+}
+
+size_t bitset_count(const bitset_t *bitset) {
+    /* Number of set bits in the whole bitset. The words are consumed eight
+     * at a time, then four at a time, then one by one. */
+    const uint64_t *w = bitset->array;
+    const size_t total = bitset->arraysize;
+    size_t ones = 0;
+    size_t pos = 0;
+    for (; pos + 7 < total; pos += 8) {
+        ones += roaring_hamming(w[pos]) + roaring_hamming(w[pos + 1]);
+        ones += roaring_hamming(w[pos + 2]) + roaring_hamming(w[pos + 3]);
+        ones += roaring_hamming(w[pos + 4]) + roaring_hamming(w[pos + 5]);
+        ones += roaring_hamming(w[pos + 6]) + roaring_hamming(w[pos + 7]);
+    }
+    /* at most one pass: fewer than eight words are left */
+    for (; pos + 3 < total; pos += 4) {
+        ones += roaring_hamming(w[pos]) + roaring_hamming(w[pos + 1]);
+        ones += roaring_hamming(w[pos + 2]) + roaring_hamming(w[pos + 3]);
+    }
+    /* last zero to three words */
+    for (; pos < total; pos++) {
+        ones += roaring_hamming(w[pos]);
+    }
+    return ones;
+}
+
+bool bitset_inplace_union(bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2) {
+    /* b1 |= b2. Returns false if b1 had to grow and the resize failed; the
+     * words the two bitsets share have already been merged by then. */
+    const size_t len1 = b1->arraysize;
+    const size_t len2 = b2->arraysize;
+    const size_t shared = len1 < len2 ? len1 : len2;
+    for (size_t i = 0; i != shared; i++) {
+        b1->array[i] = b1->array[i] | b2->array[i];
+    }
+    if (len2 <= len1) return true;
+    if (!bitset_resize(b1, len2, false)) return false;
+    memcpy(b1->array + len1, b2->array + len1, (len2 - len1) * sizeof(uint64_t));
+    return true;
+}
+
+size_t bitset_minimum(const bitset_t *bitset) {
+    /* Index of the lowest set bit; 0 is also returned when no bit is set. */
+    size_t idx = 0;
+    while (idx < bitset->arraysize && bitset->array[idx] == 0) {
+        idx++;
+    }
+    if (idx == bitset->arraysize) return 0;
+    return roaring_trailing_zeroes(bitset->array[idx]) + idx * 64;
+}
+
+bool bitset_grow(bitset_t *bitset, size_t newarraysize) {
+    /* Grow to newarraysize words, zeroing the added ones. Shrinking requests
+     * and sizes above SIZE_MAX/64 are rejected with false. */
+    const size_t oldsize = bitset->arraysize;
+    if (newarraysize < oldsize || newarraysize > SIZE_MAX / 64) return false;
+    if (newarraysize > bitset->capacity) {
+        size_t target = (UINT64_C(0xFFFFFFFFFFFFFFFF) >> roaring_leading_zeroes(newarraysize)) + 1;
+        while (target < newarraysize) target *= 2;
+        uint64_t *moved =
+            (uint64_t *)roaring_realloc(bitset->array, sizeof(uint64_t) * target);
+        if (moved == NULL) return false;
+        bitset->array = moved;
+        bitset->capacity = target;
+    }
+    memset(bitset->array + oldsize, 0, sizeof(uint64_t) * (newarraysize - oldsize));
+    bitset->arraysize = newarraysize;
+    return true;
+}
+
+size_t bitset_maximum(const bitset_t *bitset) {
+    /* Index of the highest set bit; 0 is also returned when no bit is set. */
+    size_t remaining = bitset->arraysize;
+    while (remaining > 0 && bitset->array[remaining - 1] == 0) {
+        remaining--;
+    }
+    if (remaining == 0) return 0;
+    return 63 - roaring_leading_zeroes(bitset->array[remaining - 1]) + (remaining - 1) * 64;
+}
+
+/* True when no bit is set in both b1 and b2. Only the words present in both
+ * bitsets can overlap, so the scan covers the shorter length and stops at
+ * the first word with a common bit. */
+bool bitsets_disjoint(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2) {
+    const size_t shared =
+        b2->arraysize < b1->arraysize ? b2->arraysize : b1->arraysize;
+    size_t i = 0;
+    while (i < shared && (b1->array[i] & b2->array[i]) == 0) {
+        i++;
+    }
+    return i == shared;
+}
+
+/* True when at least one bit is set in both b1 and b2, false when they have
+ * no bit in common.
+ *
+ * The scan covers the shorter length and stops at the first common bit. */
+bool bitsets_intersect(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2) {
+    const size_t shared =
+        b2->arraysize < b1->arraysize ? b2->arraysize : b1->arraysize;
+    size_t i = 0;
+    while (i < shared && (b1->array[i] & b2->array[i]) == 0) {
+        i++;
+    }
+    return i < shared;
+}
+
+/* True when any word of b at index starting_loc or beyond is non-zero. A
+ * starting_loc at or past the end of the array yields false. */
+static bool any_bits_set(const bitset_t *b, size_t starting_loc) {
+    size_t i = starting_loc;
+    while (i < b->arraysize && b->array[i] == 0) {
+        i++;
+    }
+    /* the scan stopped before the end only if it met a non-zero word */
+    return i < b->arraysize;
+}
+
+/* True when every bit set in b2 is also set in b1 (b2 is a subset of b1).
+ *
+ * Stops at the first word of b2 holding a bit that b1 lacks. */
+bool bitset_contains_all(const bitset_t *CBITSET_RESTRICT b1, const bitset_t *CBITSET_RESTRICT b2) {
+    const size_t len1 = b1->arraysize;
+    const size_t len2 = b2->arraysize;
+    const size_t shared = len2 < len1 ? len2 : len1;
+    for (size_t i = 0; i < shared; i++) {
+        /* bits of b2 that are absent from b1 */
+        if ((b2->array[i] & ~b1->array[i]) != 0) {
+            return false;
+        }
+    }
+    if (len2 <= len1) {
+        return true;
+    }
+    /* b2 is longer: any bit it has past the end of b1 is missing from b1 */
+    return !any_bits_set(b2, len1);
+}
+
+size_t bitset_union_count(const bitset_t *CBITSET_RESTRICT b1,
+                          const bitset_t *CBITSET_RESTRICT b2) {
+    /* Number of bits set in b1 | b2, computed without building the union.
+     * Both passes below consume four words per step, then the leftovers. */
+    const bitset_t *shorter = b1;
+    const bitset_t *longer = b2;
+    if (b2->arraysize <= b1->arraysize) {
+        shorter = b2;
+        longer = b1;
+    }
+    const size_t shared = shorter->arraysize;
+    const size_t full = longer->arraysize;
+    size_t ones = 0;
+    size_t i = 0;
+
+    /* words present in both bitsets: count the bits of the OR */
+    for (; i + 3 < shared; i += 4) {
+        ones += roaring_hamming(b1->array[i] | b2->array[i]);
+        ones += roaring_hamming(b1->array[i + 1] | b2->array[i + 1]);
+        ones += roaring_hamming(b1->array[i + 2] | b2->array[i + 2]);
+        ones += roaring_hamming(b1->array[i + 3] | b2->array[i + 3]);
+    }
+    for (; i < shared; ++i) {
+        ones += roaring_hamming(b1->array[i] | b2->array[i]);
+    }
+
+    /* i == shared at this point */
+    /* words only the longer bitset has: all of their set bits count */
+    for (; i + 3 < full; i += 4) {
+        ones += roaring_hamming(longer->array[i]);
+        ones += roaring_hamming(longer->array[i + 1]);
+        ones += roaring_hamming(longer->array[i + 2]);
+        ones += roaring_hamming(longer->array[i + 3]);
+    }
+    for (; i < full; ++i) {
+        ones += roaring_hamming(longer->array[i]);
+    }
+
+    return ones;
+}
+
+void bitset_inplace_intersection(bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2) {
+    /* b1 &= b2; b1 keeps its size, b2 counts as zero past its own end. */
+    const size_t shared =
+        b2->arraysize < b1->arraysize ? b2->arraysize : b1->arraysize;
+    size_t i;
+    for (i = 0; i < shared; i++) {
+        b1->array[i] = b1->array[i] & b2->array[i];
+    }
+    /* b2 has no words from here on, so nothing of b1 survives */
+    while (i < b1->arraysize) b1->array[i++] = 0;
+}
+
+size_t bitset_intersection_count(const bitset_t *CBITSET_RESTRICT b1,
+                                 const bitset_t *CBITSET_RESTRICT b2) {
+    /* Number of bits set in both b1 and b2; only shared words can match. */
+    size_t shared = b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t ones = 0;
+    while (shared-- > 0) {
+        ones += roaring_hamming(b1->array[shared] & b2->array[shared]);
+    }
+    return ones;
+}
+
+void bitset_inplace_difference(bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2) {
+    /* b1 &= ~b2: words of b1 beyond the end of b2 are left untouched. */
+    size_t shared = b2->arraysize;
+    if (b1->arraysize < shared) shared = b1->arraysize;
+    for (size_t i = 0; i < shared; i++) {
+        b1->array[i] = b1->array[i] & ~b2->array[i];
+    }
+}
+
+size_t bitset_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                               const bitset_t *CBITSET_RESTRICT b2) {
+    /* Number of bits set in b1 but not in b2, computed without modifying
+     * either bitset. */
+    const size_t len1 = b1->arraysize;
+    const size_t len2 = b2->arraysize;
+    size_t ones = 0;
+    for (size_t i = 0; i < len1; i++) {
+        /* past the end of b2 there is nothing to subtract */
+        const uint64_t removed = i < len2 ? b2->array[i] : 0;
+        ones += roaring_hamming(b1->array[i] & ~removed);
+    }
+    return ones;
+}
+
+bool bitset_inplace_symmetric_difference(bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2) {
+    /* b1 ^= b2. Returns false if b1 had to grow and the resize failed; the
+     * words the two bitsets share have already been XOR-ed by then. */
+    const size_t len1 = b1->arraysize;
+    const size_t len2 = b2->arraysize;
+    const size_t shared = len1 < len2 ? len1 : len2;
+    for (size_t i = 0; i != shared; i++) {
+        b1->array[i] = b1->array[i] ^ b2->array[i];
+    }
+    if (len2 <= len1) return true;
+    if (!bitset_resize(b1, len2, false)) return false;
+    /* x ^ 0 == x: the words only b2 has are copied over unchanged */
+    memcpy(b1->array + len1, b2->array + len1, (len2 - len1) * sizeof(uint64_t));
+    return true;
+}
+
+size_t bitset_symmetric_difference_count(const bitset_t *CBITSET_RESTRICT b1,
+                                         const bitset_t *CBITSET_RESTRICT b2) {
+    /* Number of bits set in exactly one of b1 and b2. */
+    const bitset_t *longer = b2->arraysize > b1->arraysize ? b2 : b1;
+    const size_t shared =
+        b1->arraysize < b2->arraysize ? b1->arraysize : b2->arraysize;
+    size_t ones = 0;
+    size_t i = 0;
+    /* shared words: count the bits that differ */
+    for (; i < shared; i++) {
+        ones += roaring_hamming(b1->array[i] ^ b2->array[i]);
+    }
+    /* words only the longer bitset has: every set bit differs from the
+     * implicit zeros of the shorter one */
+    for (; i < longer->arraysize; i++) {
+        ones += roaring_hamming(longer->array[i]);
+    }
+
+    return ones;
+}
+
+bool bitset_trim(bitset_t *bitset) {
+    // drop trailing all-zero words, then shrink the allocation to fit
+    size_t newsize = bitset->arraysize;
+    while (newsize > 0 && bitset->array[newsize - 1] == 0) newsize--;
+    if (bitset->capacity == newsize) return true;  // nothing to do
+    uint64_t *newarray = NULL;
+    if (newsize == 0) {
+        // realloc(ptr, 0) may free ptr yet return NULL, which would be reported
+        // as failure with a dangling array; release the buffer explicitly
+        roaring_free(bitset->array);
+    } else if ((newarray = (uint64_t *)roaring_realloc(
+                    bitset->array, sizeof(uint64_t) * newsize)) == NULL) {
+        return false;
+    }
+    bitset->array = newarray;
+    bitset->capacity = newsize;
+    bitset->arraysize = newsize;
+    return true;
+}
+
+
+#ifdef __cplusplus
+} } }  // extern "C" { namespace roaring { namespace internal {
+#endif
diff --git a/src/bitset_util.c b/src/bitset_util.c
index 10d3d6cb2..7096b27b1 100644
--- a/src/bitset_util.c
+++ b/src/bitset_util.c
@@ -6,11 +6,18 @@
 
 #include <roaring/bitset_util.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
-extern "C" { namespace roaring { namespace internal {
+using namespace ::roaring::internal;
+extern "C" { namespace roaring { namespace api {
 #endif
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 static uint8_t lengthTable[256] = {
     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
     2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
@@ -25,7 +32,7 @@ static uint8_t lengthTable[256] = {
     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
 #endif
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 ALIGNED(32)
 static uint32_t vecDecodeTable[256][8] = {
     {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
@@ -286,9 +293,9 @@ static uint32_t vecDecodeTable[256][8] = {
     {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
 };
 
-#endif  // #ifdef CROARING_IS_X64
+#endif  // #if CROARING_IS_X64
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 // same as vecDecodeTable but in 16 bits
 ALIGNED(32)
 static uint16_t vecDecodeTable_uint16[256][8] = {
@@ -552,7 +559,118 @@ static uint16_t vecDecodeTable_uint16[256][8] = {
 
 #endif
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+const uint8_t vbmi2_table[64] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63};
+/*
+ * Write the positions of the set bits of `words` (an array of `length` 64-bit
+ * words) to `vout` as 32-bit integers, each position offset by `base`.
+ * Never writes at or past vout + outcapacity. Returns the number of
+ * positions produced (out - initout).
+ */
+size_t bitset_extract_setbits_avx512(const uint64_t *words, size_t length, uint32_t *vout,
+                                   size_t outcapacity, uint32_t base) {
+    uint32_t *out = (uint32_t *)vout;
+    uint32_t *initout = out;
+    uint32_t *safeout = out + outcapacity;
+    __m512i base_v = _mm512_set1_epi32(base);    
+    // vbmi2_table holds the bytes 0..63: one candidate bit index per byte lane.
+    __m512i index_table = _mm512_loadu_si512(vbmi2_table);
+    size_t i = 0;
+
+    // Vectorized path: each iteration stores 64 lanes unconditionally, so it
+    // only runs while more than 64 output slots remain.
+    for (; (i < length) && ((out + 64) < safeout); i += 1)
+    {
+        uint64_t v = words[i];		
+        // Use v as a byte mask: the indexes of its set bits are packed into the
+        // low lanes of vec and the remaining lanes are zeroed.
+        __m512i vec = _mm512_maskz_compress_epi8(v, index_table);	
+        	    
+        uint8_t advance = roaring_hamming(v);
+        
+        // Offset shared by every bit of word i: base + 64 * i.
+        __m512i vbase = _mm512_add_epi32(base_v, _mm512_set1_epi32(i * 64));
+        // Widen the 64 byte indexes to 32 bits, 16 lanes at a time.
+        __m512i r1 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,0));
+        __m512i r2 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,1));
+        __m512i r3 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,2));
+        __m512i r4 = _mm512_cvtepi8_epi32(_mm512_extracti32x4_epi32(vec,3));
+        
+        r1 = _mm512_add_epi32(r1, vbase);
+        r2 = _mm512_add_epi32(r2, vbase);
+        r3 = _mm512_add_epi32(r3, vbase);
+        r4 = _mm512_add_epi32(r4, vbase);
+        _mm512_storeu_si512((__m512i *)out, r1);
+        _mm512_storeu_si512((__m512i *)(out + 16), r2);
+        _mm512_storeu_si512((__m512i *)(out + 32), r3);
+        _mm512_storeu_si512((__m512i *)(out + 48), r4);
+
+        // Only the first `advance` lanes are kept; the following stores (or the
+        // scalar tail below) overwrite the rest.
+        out += advance;
+        
+    }
+
+    // Account for the words already consumed by the vectorized loop.
+    base += i * 64;
+    
+    // Scalar tail: handles the remaining words while respecting the capacity.
+    for (; (i < length) && (out < safeout); ++i) {
+         uint64_t w = words[i];
+         while ((w != 0) && (out < safeout)) {
+             uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
+             int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
+             uint32_t val = r + base;
+             memcpy(out, &val,
+                    sizeof(uint32_t));  // should be compiled as a MOV on x64
+             out++;
+             w ^= t;  // clear the lowest set bit
+         }
+         base += 64;
+     }
+
+
+    return out - initout;
+
+}
+
+// Reference: https://lemire.me/blog/2022/05/10/faster-bitset-decoding-using-intel-avx-512/
+/*
+ * Write the positions of the set bits of `array` (`length` 64-bit words) to
+ * `vout` as 16-bit integers, each position offset by `base` (arithmetic is
+ * modulo 2^16). Never writes at or past vout + capacity. Returns the number
+ * of positions produced.
+ */
+size_t bitset_extract_setbits_avx512_uint16(const uint64_t *array, size_t length,
+                                     uint16_t *vout, size_t capacity, uint16_t base) {
+    uint16_t *out = vout;
+    uint16_t *initout = out;
+    uint16_t *safeout = vout + capacity;
+
+    __m512i base_v = _mm512_set1_epi16(base);
+    // vbmi2_table holds the bytes 0..63: one candidate bit index per byte lane.
+    __m512i index_table = _mm512_loadu_si512(vbmi2_table);
+    size_t i = 0;
+
+    // Vectorized path: each iteration stores 64 lanes unconditionally, so it
+    // only runs while more than 64 output slots remain.
+    for (; (i < length) && ((out + 64) < safeout); i++)
+    {
+        uint64_t v = array[i];
+        // Use v as a byte mask: the indexes of its set bits are packed into the
+        // low lanes of vec and the remaining lanes are zeroed.
+        __m512i vec = _mm512_maskz_compress_epi8(v, index_table);
+
+        uint8_t advance = roaring_hamming(v);
+
+        // Offset shared by every bit of word i: base + 64 * i.
+        __m512i vbase = _mm512_add_epi16(base_v, _mm512_set1_epi16(i * 64));
+        // Widen the 64 byte indexes to 16 bits, 32 lanes at a time.
+        __m512i r1 = _mm512_cvtepi8_epi16(_mm512_extracti32x8_epi32(vec,0));
+        __m512i r2 = _mm512_cvtepi8_epi16(_mm512_extracti32x8_epi32(vec,1));
+
+        r1 = _mm512_add_epi16(r1, vbase);
+        r2 = _mm512_add_epi16(r2, vbase);
+
+        _mm512_storeu_si512((__m512i *)out, r1);
+        _mm512_storeu_si512((__m512i *)(out + 32), r2);
+        // Only the first `advance` lanes are kept; later stores overwrite the rest.
+        out += advance;
+
+    }
+
+    // Account for the words already consumed by the vectorized loop.
+    base += i * 64;
+
+    // Scalar tail: handles the remaining words while respecting the capacity.
+    for (; (i < length) && (out < safeout); ++i) {
+         uint64_t w = array[i];
+         while ((w != 0) && (out < safeout)) {
+             uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
+             int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
+             // Build the value at the output width so that the copy does not
+             // depend on the byte order of a wider temporary.
+             uint16_t val = (uint16_t)(r + base);
+             memcpy(out, &val, sizeof(val));
+             out++;
+             w ^= t;  // clear the lowest set bit
+         }
+         base += 64;
+     }
+
+    return out - initout;
+}
+CROARING_UNTARGET_AVX512
+#endif
+
 CROARING_TARGET_AVX2
 size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
                                    uint32_t *out, size_t outcapacity,
@@ -573,9 +691,9 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
                 uint8_t byteB = (uint8_t)(w >> 8);
                 w >>= 16;
                 __m256i vecA =
-                    _mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
+                    _mm256_loadu_si256((const __m256i *)vecDecodeTable[byteA]);
                 __m256i vecB =
-                    _mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
+                    _mm256_loadu_si256((const __m256i *)vecDecodeTable[byteB]);
                 uint8_t advanceA = lengthTable[byteA];
                 uint8_t advanceB = lengthTable[byteB];
                 vecA = _mm256_add_epi32(baseVec, vecA);
@@ -594,7 +712,7 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
         uint64_t w = words[i];
         while ((w != 0) && (out < safeout)) {
             uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
-            int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
+            int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
             uint32_t val = r + base;
             memcpy(out, &val,
                    sizeof(uint32_t));  // should be compiled as a MOV on x64
@@ -605,7 +723,7 @@ size_t bitset_extract_setbits_avx2(const uint64_t *words, size_t length,
     }
     return out - initout;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 #endif  // CROARING_IS_X64
 
 size_t bitset_extract_setbits(const uint64_t *words, size_t length,
@@ -615,7 +733,7 @@ size_t bitset_extract_setbits(const uint64_t *words, size_t length,
         uint64_t w = words[i];
         while (w != 0) {
             uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
-            int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
+            int r = roaring_trailing_zeroes(w); // on x64, should compile to TZCNT
             uint32_t val = r + base;
             memcpy(out + outpos, &val,
                    sizeof(uint32_t));  // should be compiled as a MOV on x64
@@ -636,7 +754,7 @@ size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__
         uint64_t w = words1[i] & words2[i];
         while (w != 0) {
             uint64_t t = w & (~w + 1);
-            int r = __builtin_ctzll(w);
+            int r = roaring_trailing_zeroes(w);
             out[outpos++] = r + base;
             w ^= t;
         }
@@ -645,7 +763,7 @@ size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__
     return outpos;
 }
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 /*
  * Given a bitset containing "length" 64-bit words, write out the position
  * of all the set bits to "out" as 16-bit integers, values start at "base" (can
@@ -678,9 +796,9 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
                 uint8_t byteA = (uint8_t)w;
                 uint8_t byteB = (uint8_t)(w >> 8);
                 w >>= 16;
-                __m128i vecA = _mm_load_si128(
+                __m128i vecA = _mm_loadu_si128(
                     (const __m128i *)vecDecodeTable_uint16[byteA]);
-                __m128i vecB = _mm_load_si128(
+                __m128i vecB = _mm_loadu_si128(
                     (const __m128i *)vecDecodeTable_uint16[byteB]);
                 uint8_t advanceA = lengthTable[byteA];
                 uint8_t advanceB = lengthTable[byteB];
@@ -700,7 +818,7 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
         uint64_t w = words[i];
         while ((w != 0) && (out < safeout)) {
             uint64_t t = w & (~w + 1);
-            int r = __builtin_ctzll(w);
+            int r = roaring_trailing_zeroes(w);
             *out = r + base;
             out++;
             w ^= t;
@@ -709,7 +827,7 @@ size_t bitset_extract_setbits_sse_uint16(const uint64_t *words, size_t length,
     }
     return out - initout;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 #endif
 
 /*
@@ -728,7 +846,7 @@ size_t bitset_extract_setbits_uint16(const uint64_t *words, size_t length,
         uint64_t w = words[i];
         while (w != 0) {
             uint64_t t = w & (~w + 1);
-            int r = __builtin_ctzll(w);
+            int r = roaring_trailing_zeroes(w);
             out[outpos++] = r + base;
             w ^= t;
         }
@@ -895,7 +1013,7 @@ static inline void _scalar_bitset_set_list(uint64_t *words, const uint16_t *list
 
 uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
                            uint64_t length) {
-    if( croaring_avx2() ) {
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
         return _asm_bitset_clear_list(words, card, list, length);
     } else {
         return _scalar_bitset_clear_list(words, card, list, length);
@@ -904,7 +1022,7 @@ uint64_t bitset_clear_list(uint64_t *words, uint64_t card, const uint16_t *list,
 
 uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
                                   const uint16_t *list, uint64_t length) {
-    if( croaring_avx2() ) {
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
         return _asm_bitset_set_list_withcard(words, card, list, length);
     } else {
         return _scalar_bitset_set_list_withcard(words, card, list, length);
@@ -912,7 +1030,7 @@ uint64_t bitset_set_list_withcard(uint64_t *words, uint64_t card,
 }
 
 void bitset_set_list(uint64_t *words, const uint16_t *list, uint64_t length) {
-    if( croaring_avx2() ) {
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
         _asm_bitset_set_list(words, list, length);
     } else {
         _scalar_bitset_set_list(words, list, length);
@@ -1006,5 +1124,5 @@ void bitset_flip_list(uint64_t *words, const uint16_t *list, uint64_t length) {
 }
 
 #ifdef __cplusplus
-} } }  // extern "C" { namespace roaring { namespace internal {
+} } }  // extern "C" { namespace roaring { namespace api {
 #endif
diff --git a/src/containers/array.c b/src/containers/array.c
index dd9632062..0816460ba 100644
--- a/src/containers/array.c
+++ b/src/containers/array.c
@@ -9,6 +9,12 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace internal {
 #endif
@@ -19,11 +25,12 @@ extern inline int array_container_index_equalorlarger(const array_container_t *a
 
 extern inline int array_container_rank(const array_container_t *arr,
                                        uint16_t x);
+extern inline int array_container_get_index(const array_container_t *arr,
+                                          uint16_t x);
 extern inline bool array_container_contains(const array_container_t *arr,
                                             uint16_t pos);
 extern inline int array_container_cardinality(const array_container_t *array);
 extern inline bool array_container_nonzero_cardinality(const array_container_t *array);
-extern inline void array_container_clear(array_container_t *array);
 extern inline int32_t array_container_serialized_size_in_bytes(int32_t card);
 extern inline bool array_container_empty(const array_container_t *array);
 extern inline bool array_container_full(const array_container_t *array);
@@ -52,7 +59,7 @@ array_container_t *array_container_create_given_capacity(int32_t size) {
 }
 
 /* Create a new array. Return NULL in case of failure. */
-array_container_t *array_container_create() {
+array_container_t *array_container_create(void) {
     return array_container_create_given_capacity(ARRAY_DEFAULT_INIT_SIZE);
 }
 
@@ -165,11 +172,7 @@ void array_container_grow(array_container_t *container, int32_t min,
         container->array = (uint16_t *)roaring_malloc(new_capacity * sizeof(uint16_t));
     }
 
-    //  handle the case where realloc fails
-    if (container->array == NULL) {
-      fprintf(stderr, "could not allocate memory\n");
-    }
-    assert(container->array != NULL);
+    // if realloc fails, we have container->array == NULL.
 }
 
 /* Copy one container into another. We assume that they are distinct. */
@@ -217,8 +220,8 @@ void array_container_andnot(const array_container_t *array_1,
                             array_container_t *out) {
     if (out->capacity < array_1->cardinality)
         array_container_grow(out, array_1->cardinality, false);
-#ifdef CROARING_IS_X64
-    if(( croaring_avx2() ) && (out != array_1) && (out != array_2)) {
+#if CROARING_IS_X64
+    if(( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) && (out != array_1) && (out != array_2)) {
       out->cardinality =
           difference_vector16(array_1->array, array_1->cardinality,
                             array_2->array, array_2->cardinality, out->array);
@@ -248,8 +251,8 @@ void array_container_xor(const array_container_t *array_1,
         array_container_grow(out, max_cardinality, false);
     }
 
-#ifdef CROARING_IS_X64
-    if( croaring_avx2() ) {
+#if CROARING_IS_X64
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
       out->cardinality =
         xor_vector16(array_1->array, array_1->cardinality, array_2->array,
                      array_2->cardinality, out->array);
@@ -279,7 +282,7 @@ void array_container_intersection(const array_container_t *array1,
     int32_t card_1 = array1->cardinality, card_2 = array2->cardinality,
             min_card = minimum_int32(card_1, card_2);
     const int threshold = 64;  // subject to tuning
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
     if (out->capacity < min_card) {
       array_container_grow(out, min_card + sizeof(__m128i) / sizeof(uint16_t),
         false);
@@ -297,8 +300,8 @@ void array_container_intersection(const array_container_t *array1,
         out->cardinality = intersect_skewed_uint16(
             array2->array, card_2, array1->array, card_1, out->array);
     } else {
-#ifdef CROARING_IS_X64
-       if( croaring_avx2() ) {
+#if CROARING_IS_X64
+       if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
         out->cardinality = intersect_vector16(
             array1->array, card_1, array2->array, card_2, out->array);
        } else {
@@ -325,8 +328,8 @@ int array_container_intersection_cardinality(const array_container_t *array1,
         return intersect_skewed_uint16_cardinality(array2->array, card_2,
                                                    array1->array, card_1);
     } else {
-#ifdef CROARING_IS_X64
-    if( croaring_avx2() ) {
+#if CROARING_IS_X64
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
         return intersect_vector16_cardinality(array1->array, card_1,
                                               array2->array, card_2);
     } else {
@@ -362,7 +365,6 @@ bool array_container_intersect(const array_container_t *array1,
  * */
 void array_container_intersection_inplace(array_container_t *src_1,
                                           const array_container_t *src_2) {
-    // todo: can any of this be vectorized?
     int32_t card_1 = src_1->cardinality, card_2 = src_2->cardinality;
     const int threshold = 64;  // subject to tuning
     if (card_1 * threshold < card_2) {
@@ -372,16 +374,40 @@ void array_container_intersection_inplace(array_container_t *src_1,
         src_1->cardinality = intersect_skewed_uint16(
             src_2->array, card_2, src_1->array, card_1, src_1->array);
     } else {
+#if CROARING_IS_X64
+        if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
+            src_1->cardinality = intersect_vector16_inplace(
+                src_1->array, card_1, src_2->array, card_2);
+        } else {
+            src_1->cardinality = intersect_uint16(
+                src_1->array, card_1, src_2->array, card_2, src_1->array);
+        }
+#else
         src_1->cardinality = intersect_uint16(
-            src_1->array, card_1, src_2->array, card_2, src_1->array);
+                        src_1->array, card_1, src_2->array, card_2, src_1->array);
+#endif
     }
 }
 
+ALLOW_UNALIGNED
 int array_container_to_uint32_array(void *vout, const array_container_t *cont,
                                     uint32_t base) {
+
+#if CROARING_IS_X64
+    int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if (support & ROARING_SUPPORTS_AVX512) {
+        return avx512_array_container_to_uint32_array(vout, cont->array, cont->cardinality, base);
+    }
+#endif
+    if (support & ROARING_SUPPORTS_AVX2) {
+        return array_container_to_uint32_array_vector16(vout, cont->array, cont->cardinality, base);
+    }
+#endif // CROARING_IS_X64
     int outpos = 0;
     uint32_t *out = (uint32_t *)vout;
-    for (int i = 0; i < cont->cardinality; ++i) {
+    size_t i = 0;
+    for ( ; i < (size_t)cont->cardinality; ++i) {
         const uint32_t val = base + cont->array[i];
         memcpy(out + outpos, &val,
                sizeof(uint32_t));  // should be compiled as a MOV on x64
@@ -414,6 +440,46 @@ void array_container_printf_as_uint32_array(const array_container_t *v,
     }
 }
 
+/*
+ * Check the internal invariants of an array container.
+ * Returns true when the container is well formed. Otherwise returns false and
+ * stores a description of the first violated invariant in *reason; *reason is
+ * left untouched on success.
+ */
+bool array_container_validate(const array_container_t *v, const char **reason) {
+    const char *problem = NULL;
+
+    if (v->capacity < 0) {
+        problem = "negative capacity";
+    } else if (v->cardinality < 0) {
+        problem = "negative cardinality";
+    } else if (v->cardinality > v->capacity) {
+        problem = "cardinality exceeds capacity";
+    } else if (v->cardinality > DEFAULT_MAX_SIZE) {
+        problem = "cardinality exceeds DEFAULT_MAX_SIZE";
+    } else if (v->cardinality != 0) {
+        // Only a non-empty container needs a backing array, and its values
+        // must be sorted with no duplicates.
+        if (v->array == NULL) {
+            problem = "NULL array pointer";
+        } else {
+            for (int idx = 1; idx < v->cardinality; idx++) {
+                if (v->array[idx] <= v->array[idx - 1]) {
+                    problem = "array elements not strictly increasing";
+                    break;
+                }
+            }
+        }
+    }
+
+    if (problem != NULL) {
+        *reason = problem;
+        return false;
+    }
+    return true;
+}
+
 /* Compute the number of runs */
 int32_t array_container_number_of_runs(const array_container_t *ac) {
     // Can SIMD work here?
diff --git a/src/containers/bitset.c b/src/containers/bitset.c
index c03d32a9f..722eda1e3 100644
--- a/src/containers/bitset.c
+++ b/src/containers/bitset.c
@@ -16,14 +16,20 @@
 #include <roaring/memory.h>
 #include <roaring/utilasm.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace internal {
 #endif
 
 extern inline int bitset_container_cardinality(const bitset_container_t *bitset);
-extern inline bool bitset_container_nonzero_cardinality(bitset_container_t *bitset);
 extern inline void bitset_container_set(bitset_container_t *bitset, uint16_t pos);
-extern inline void bitset_container_unset(bitset_container_t *bitset, uint16_t pos);
+// unused at this time:
+//extern inline void bitset_container_unset(bitset_container_t *bitset, uint16_t pos);
 extern inline bool bitset_container_get(const bitset_container_t *bitset,
                                         uint16_t pos);
 extern inline int32_t bitset_container_serialized_size_in_bytes(void);
@@ -53,9 +59,21 @@ bitset_container_t *bitset_container_create(void) {
     if (!bitset) {
         return NULL;
     }
-    // sizeof(__m256i) == 32
+
+    size_t align_size = 32;
+#if CROARING_IS_X64
+    int support = croaring_hardware_support();
+    if ( support & ROARING_SUPPORTS_AVX512 ) {
+	    // sizeof(__m512i) == 64
+	    align_size = 64;
+    }
+    else {
+      // sizeof(__m256i) == 32
+	    align_size = 32;
+    }
+#endif
     bitset->words = (uint64_t *)roaring_aligned_malloc(
-        32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+        align_size, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
     if (!bitset->words) {
         roaring_free(bitset);
         return NULL;
@@ -117,9 +135,20 @@ bitset_container_t *bitset_container_clone(const bitset_container_t *src) {
     if (!bitset) {
         return NULL;
     }
-    // sizeof(__m256i) == 32
+
+    size_t align_size = 32;
+#if CROARING_IS_X64
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) {
+	    // sizeof(__m512i) == 64
+	    align_size = 64;
+    }
+    else {
+      // sizeof(__m256i) == 32
+	    align_size = 32;
+    }
+#endif
     bitset->words = (uint64_t *)roaring_aligned_malloc(
-        32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
+        align_size, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
     if (!bitset->words) {
         roaring_free(bitset);
         return NULL;
@@ -214,25 +243,36 @@ bool bitset_container_intersect(const bitset_container_t *src_1,
 }
 
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 #ifndef WORDS_IN_AVX2_REG
 #define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
 #endif
+#ifndef WORDS_IN_AVX512_REG
+#define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t)
+#endif
 /* Get the number of bits set (force computation) */
 static inline int _scalar_bitset_container_compute_cardinality(const bitset_container_t *bitset) {
   const uint64_t *words = bitset->words;
   int32_t sum = 0;
   for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
-          sum += hamming(words[i]);
-          sum += hamming(words[i + 1]);
-          sum += hamming(words[i + 2]);
-          sum += hamming(words[i + 3]);
+          sum += roaring_hamming(words[i]);
+          sum += roaring_hamming(words[i + 1]);
+          sum += roaring_hamming(words[i + 2]);
+          sum += roaring_hamming(words[i + 3]);
   }
   return sum;
 }
 /* Get the number of bits set (force computation) */
 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
-    if( croaring_avx2() ) {
+    int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if( support & ROARING_SUPPORTS_AVX512 ) {
+      return (int) avx512_vpopcount(
+        (const __m512i *)bitset->words,
+        BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG));
+    } else
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+    if( support & ROARING_SUPPORTS_AVX2 ) {
       return (int) avx2_harley_seal_popcount256(
         (const __m256i *)bitset->words,
         BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));
@@ -242,7 +282,7 @@ int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
     }
 }
 
-#elif defined(USENEON)
+#elif defined(CROARING_USENEON)
 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
     uint16x8_t n0 = vdupq_n_u16(0);
     uint16x8_t n1 = vdupq_n_u16(0);
@@ -273,19 +313,177 @@ int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
     const uint64_t *words = bitset->words;
     int32_t sum = 0;
     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
-        sum += hamming(words[i]);
-        sum += hamming(words[i + 1]);
-        sum += hamming(words[i + 2]);
-        sum += hamming(words[i + 3]);
+        sum += roaring_hamming(words[i]);
+        sum += roaring_hamming(words[i + 1]);
+        sum += roaring_hamming(words[i + 2]);
+        sum += roaring_hamming(words[i + 3]);
     }
     return sum;
 }
 
 #endif // CROARING_IS_X64
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
 
 #define BITSET_CONTAINER_FN_REPEAT 8
+#ifndef WORDS_IN_AVX512_REG
+#define WORDS_IN_AVX512_REG sizeof(__m512i) / sizeof(uint64_t)
+#endif // WORDS_IN_AVX512_REG
+
+/* Defines _avx512_bitset_container_<opname>_nocard(src_1, src_2, dst), which
+   computes a binary operation (e.g. union) on the words of src_1 and src_2
+   and writes the result to the words of dst.
+   - The operation is avx_intrinsic(A2, A1): the block loaded from src_2 is
+     the first operand, which matters for non-commutative intrinsics.
+   - Each loop iteration processes 8 blocks of 64 bytes (512 bytes) using
+     unaligned loads and stores.
+   - The cardinality is not computed: dst->cardinality is set to
+     BITSET_UNKNOWN_CARDINALITY and that value is returned.
+   The before, opsymbol, neon_intrinsic and after parameters are not used in
+   the macro body. */
+// clang-format off
+#define AVX512_BITSET_CONTAINER_FN1(before, opname, opsymbol, avx_intrinsic,   \
+                                neon_intrinsic, after)                         \
+  static inline int _avx512_bitset_container_##opname##_nocard(                \
+      const bitset_container_t *src_1, const bitset_container_t *src_2,        \
+      bitset_container_t *dst) {                                               \
+    const uint8_t * __restrict__ words_1 = (const uint8_t *)src_1->words;      \
+    const uint8_t * __restrict__ words_2 = (const uint8_t *)src_2->words;      \
+    /* not using the blocking optimization for some reason*/                   \
+    uint8_t *out = (uint8_t*)dst->words;                                       \
+    const int innerloop = 8;                                                   \
+    for (size_t i = 0;                                                         \
+        i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG);            \
+                                                         i+=innerloop) {       \
+        __m512i A1, A2, AO;                                                    \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1));                   \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2));                   \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)out, AO);                               \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 64));              \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 64));              \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+64), AO);                          \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 128));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 128));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+128), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 192));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 192));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+192), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 256));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 256));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+256), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 320));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 320));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+320), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 384));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 384));             \
+        AO = avx_intrinsic(A2, A1);                                            \
+        _mm512_storeu_si512((__m512i *)(out+384), AO);                         \
+        A1 = _mm512_loadu_si512((const __m512i *)(words_1 + 448));             \
+        A2 = _mm512_loadu_si512((const __m512i *)(words_2 + 448));             \
+        AO = avx_intrinsic(A2, A1);                                     \
+        _mm512_storeu_si512((__m512i *)(out+448), AO);                  \
+        out+=512;                                                       \
+        words_1 += 512;                                                 \
+        words_2 += 512;                                                 \
+    }                                                                   \
+    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                      \
+    return dst->cardinality;                                            \
+  }
+
+/* Defines _avx512_bitset_container_<opname>(src_1, src_2, dst). It forwards
+   the word arrays, viewed as __m512i blocks, to
+   avx512_harley_seal_popcount512andstore_<opname>(words_2, words_1, out, n)
+   with n = BITSET_CONTAINER_SIZE_IN_WORDS / WORDS_IN_AVX512_REG, stores the
+   returned value in dst->cardinality and returns it.
+   NOTE(review): the helper presumably writes the result of the operation to
+   out and returns its population count; confirm against its definition.
+   Only opname is used in the macro body. */
+#define AVX512_BITSET_CONTAINER_FN2(before, opname, opsymbol, avx_intrinsic,           \
+                                neon_intrinsic, after)                                 \
+  /* next, a version that updates cardinality*/                                        \
+  static inline int _avx512_bitset_container_##opname(const bitset_container_t *src_1, \
+                                      const bitset_container_t *src_2,                 \
+                                      bitset_container_t *dst) {                       \
+    const __m512i * __restrict__ words_1 = (const __m512i *) src_1->words;             \
+    const __m512i * __restrict__ words_2 = (const __m512i *) src_2->words;             \
+    __m512i *out = (__m512i *) dst->words;                                             \
+    dst->cardinality = (int32_t)avx512_harley_seal_popcount512andstore_##opname(words_2,\
+				words_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG));           \
+    return dst->cardinality;                                                            \
+  }
+
+/* Defines _avx512_bitset_container_<opname>_justcard(src_1, src_2). It
+   returns, cast to int, the value of
+   avx512_harley_seal_popcount512_<opname>(data2, data1, n) with
+   n = BITSET_CONTAINER_SIZE_IN_WORDS / WORDS_IN_AVX512_REG. The generated
+   function takes no destination container; both inputs are const.
+   Only opname is used in the macro body. */
+#define AVX512_BITSET_CONTAINER_FN3(before, opname, opsymbol, avx_intrinsic,            \
+                                neon_intrinsic, after)                                  \
+  /* next, a version that just computes the cardinality*/                               \
+  static inline int _avx512_bitset_container_##opname##_justcard(                       \
+      const bitset_container_t *src_1, const bitset_container_t *src_2) {               \
+    const __m512i * __restrict__ data1 = (const __m512i *) src_1->words;                \
+    const __m512i * __restrict__ data2 = (const __m512i *) src_2->words;                \
+    return (int)avx512_harley_seal_popcount512_##opname(data2,                          \
+				data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX512_REG));                 \
+  }
+
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, or,    |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, and,          &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, xor,    ^,  _mm512_xor_si512,    veorq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, or,    |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, and,          &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, xor,    ^,  _mm512_xor_si512,    veorq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "or" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, or,    |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, union, |, _mm512_or_si512, vorrq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+// we duplicate the function because other containers use the "intersection" term, makes API more consistent
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, and,          &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, intersection, &, _mm512_and_si512, vandq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, xor,    ^,  _mm512_xor_si512,    veorq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+CROARING_TARGET_AVX512
+AVX512_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX512, andnot, &~, _mm512_andnot_si512, vbicq_u64, CROARING_UNTARGET_AVX512)
+CROARING_UNTARGET_AVX512
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+
 #ifndef WORDS_IN_AVX2_REG
 #define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
 #endif // WORDS_IN_AVX2_REG
@@ -379,72 +577,72 @@ int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
 
 // we duplicate the function because other containers use the "or" term, makes API more consistent
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 // we duplicate the function because other containers use the "intersection" term, makes API more consistent
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN1(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 // we duplicate the function because other containers use the "or" term, makes API more consistent
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 // we duplicate the function because other containers use the "intersection" term, makes API more consistent
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN2(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 // we duplicate the function because other containers use the "or" term, makes API more consistent
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, or,    |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, union, |, _mm256_or_si256, vorrq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 // we duplicate the function because other containers use the "intersection" term, makes API more consistent
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, and,          &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, intersection, &, _mm256_and_si256, vandq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, xor,    ^,  _mm256_xor_si256,    veorq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 CROARING_TARGET_AVX2
-AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_REGION)
-CROARING_UNTARGET_REGION
+AVX_BITSET_CONTAINER_FN3(CROARING_TARGET_AVX2, andnot, &~, _mm256_andnot_si256, vbicq_u64, CROARING_UNTARGET_AVX2)
+CROARING_UNTARGET_AVX2
 
 
 #define SCALAR_BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic,            \
@@ -461,8 +659,8 @@ CROARING_UNTARGET_REGION
                      word_2 = (words_1[i + 1]) opsymbol(words_2[i + 1]);       \
       out[i] = word_1;                                                         \
       out[i + 1] = word_2;                                                     \
-      sum += hamming(word_1);                                                  \
-      sum += hamming(word_2);                                                  \
+      sum += roaring_hamming(word_1);                                                  \
+      sum += roaring_hamming(word_2);                                                  \
     }                                                                          \
     dst->cardinality = sum;                                                    \
     return dst->cardinality;                                                   \
@@ -487,8 +685,8 @@ CROARING_UNTARGET_REGION
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {           \
       const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]),                \
                      word_2 = (words_1[i + 1]) opsymbol(words_2[i + 1]);       \
-      sum += hamming(word_1);                                                  \
-      sum += hamming(word_2);                                                  \
+      sum += roaring_hamming(word_1);                                                  \
+      sum += roaring_hamming(word_2);                                                  \
     }                                                                          \
     return sum;                                                                \
   }
@@ -504,12 +702,16 @@ SCALAR_BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64)
 SCALAR_BITSET_CONTAINER_FN(xor,    ^,  _mm256_xor_si256,    veorq_u64)
 SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
 
-
+#if CROARING_COMPILER_SUPPORTS_AVX512
 #define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)   \
   int bitset_container_##opname(const bitset_container_t *src_1,               \
                                 const bitset_container_t *src_2,               \
                                 bitset_container_t *dst) {                     \
-    if ( croaring_avx2() ) {                                                       \
+    int support = croaring_hardware_support();                                 \
+    if ( support & ROARING_SUPPORTS_AVX512 ) {                                 \
+      return _avx512_bitset_container_##opname(src_1, src_2, dst);             \
+    }                                                                          \
+    else if ( support & ROARING_SUPPORTS_AVX2 ) {                              \
       return _avx2_bitset_container_##opname(src_1, src_2, dst);               \
     } else {                                                                   \
       return _scalar_bitset_container_##opname(src_1, src_2, dst);             \
@@ -518,7 +720,11 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
   int bitset_container_##opname##_nocard(const bitset_container_t *src_1,      \
                                          const bitset_container_t *src_2,      \
                                          bitset_container_t *dst) {            \
-    if ( croaring_avx2() ) {                                                       \
+    int support = croaring_hardware_support();                                 \
+    if ( support & ROARING_SUPPORTS_AVX512 ) {                                 \
+      return _avx512_bitset_container_##opname##_nocard(src_1, src_2, dst);    \
+    }                                                                          \
+    else if ( support & ROARING_SUPPORTS_AVX2 ) {                              \
       return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst);      \
     } else {                                                                   \
       return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst);    \
@@ -526,17 +732,51 @@ SCALAR_BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
   }                                                                            \
   int bitset_container_##opname##_justcard(const bitset_container_t *src_1,    \
                                            const bitset_container_t *src_2) {  \
-    if ((croaring_detect_supported_architectures() & CROARING_AVX2) ==         \
-        CROARING_AVX2) {                                                       \
+     int support = croaring_hardware_support();                                \
+    if ( support & ROARING_SUPPORTS_AVX512 ) {                                 \
+      return _avx512_bitset_container_##opname##_justcard(src_1, src_2);       \
+    }                                                                          \
+    else if ( support & ROARING_SUPPORTS_AVX2 ) {                              \
       return _avx2_bitset_container_##opname##_justcard(src_1, src_2);         \
     } else {                                                                   \
       return _scalar_bitset_container_##opname##_justcard(src_1, src_2);       \
     }                                                                          \
   }
 
+#else // CROARING_COMPILER_SUPPORTS_AVX512
 
 
-#elif defined(USENEON)
+#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)   \
+  int bitset_container_##opname(const bitset_container_t *src_1,               \
+                                const bitset_container_t *src_2,               \
+                                bitset_container_t *dst) {                     \
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {               \
+      return _avx2_bitset_container_##opname(src_1, src_2, dst);               \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname(src_1, src_2, dst);             \
+    }                                                                          \
+  }                                                                            \
+  int bitset_container_##opname##_nocard(const bitset_container_t *src_1,      \
+                                         const bitset_container_t *src_2,      \
+                                         bitset_container_t *dst) {            \
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {               \
+      return _avx2_bitset_container_##opname##_nocard(src_1, src_2, dst);      \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname##_nocard(src_1, src_2, dst);    \
+    }                                                                          \
+  }                                                                            \
+  int bitset_container_##opname##_justcard(const bitset_container_t *src_1,    \
+                                           const bitset_container_t *src_2) {  \
+    if ( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {               \
+      return _avx2_bitset_container_##opname##_justcard(src_1, src_2);         \
+    } else {                                                                   \
+      return _scalar_bitset_container_##opname##_justcard(src_1, src_2);       \
+    }                                                                          \
+  }
+
+#endif //  CROARING_COMPILER_SUPPORTS_AVX512
+
+#elif defined(CROARING_USENEON)
 
 #define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
 int bitset_container_##opname(const bitset_container_t *src_1,                \
@@ -639,8 +879,8 @@ int bitset_container_##opname(const bitset_container_t *src_1,            \
                        word_2 = (words_1[i + 1])opsymbol(words_2[i + 1]); \
         out[i] = word_1;                                                  \
         out[i + 1] = word_2;                                              \
-        sum += hamming(word_1);                                    \
-        sum += hamming(word_2);                                    \
+        sum += roaring_hamming(word_1);                                    \
+        sum += roaring_hamming(word_2);                                    \
     }                                                                     \
     dst->cardinality = sum;                                               \
     return dst->cardinality;                                              \
@@ -659,14 +899,14 @@ int bitset_container_##opname##_nocard(const bitset_container_t *src_1,   \
 }                                                                         \
 int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
                               const bitset_container_t *src_2) {          \
     const uint64_t * __restrict__ words_1 = src_1->words;                 \
     const uint64_t * __restrict__ words_2 = src_2->words;                 \
     int32_t sum = 0;                                                      \
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 2) {      \
         const uint64_t word_1 = (words_1[i])opsymbol(words_2[i]),         \
                        word_2 = (words_1[i + 1])opsymbol(words_2[i + 1]); \
-        sum += hamming(word_1);                                    \
-        sum += hamming(word_2);                                    \
+        sum += roaring_hamming(word_1);                                    \
+        sum += roaring_hamming(word_2);                                    \
     }                                                                     \
     return sum;                                                           \
 }
@@ -686,13 +926,21 @@ BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
 // clang-format On
 
 
+ALLOW_UNALIGNED
 int bitset_container_to_uint32_array(
     uint32_t *out,
     const bitset_container_t *bc,
     uint32_t base
 ){
-#ifdef CROARING_IS_X64
-    if(( croaring_avx2() ) &&  (bc->cardinality >= 8192))  // heuristic
+#if CROARING_IS_X64
+   int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+   if(( support & ROARING_SUPPORTS_AVX512 ) &&  (bc->cardinality >= 8192))  // heuristic
+		return (int) bitset_extract_setbits_avx512(bc->words,
+                BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base);
+   else
+#endif
+   if(( support & ROARING_SUPPORTS_AVX2 ) &&  (bc->cardinality >= 8192))  // heuristic
 		return (int) bitset_extract_setbits_avx2(bc->words,
                 BITSET_CONTAINER_SIZE_IN_WORDS, out, bc->cardinality, base);
 	else
@@ -715,7 +963,7 @@ void bitset_container_printf(const bitset_container_t * v) {
 		uint64_t w = v->words[i];
 		while (w != 0) {
 			uint64_t t = w & (~w + 1);
-			int r = __builtin_ctzll(w);
+			int r = roaring_trailing_zeroes(w);
 			if(iamfirst) {// predicted to be false
 				printf("%u",base + r);
 				iamfirst = false;
@@ -739,7 +987,7 @@ void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint3
 		uint64_t w = v->words[i];
 		while (w != 0) {
 			uint64_t t = w & (~w + 1);
-			int r = __builtin_ctzll(w);
+			int r = roaring_trailing_zeroes(w);
 			if(iamfirst) {// predicted to be false
 				printf("%u", r + base);
 				iamfirst = false;
@@ -752,6 +1000,26 @@ void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint3
 	}
 }
 
+/*
+ * Validate the container. Returns true if valid; otherwise returns false and sets *reason.
+ */
+bool bitset_container_validate(const bitset_container_t *v, const char **reason) {
+    if (v->words == NULL) {
+        *reason = "words is NULL";
+        return false;
+    }
+    if (v->cardinality != bitset_container_compute_cardinality(v)) {
+        *reason = "cardinality is incorrect";
+        return false;
+    }
+    // Attempt to forcibly load the first and last words, hopefully causing
+    // a segfault or an address sanitizer error if words is not allocated.
+    volatile uint64_t *words = v->words;
+    (void) words[0];
+    (void) words[BITSET_CONTAINER_SIZE_IN_WORDS - 1];
+    return true;
+}
+
 
 // TODO: use the fast lower bound, also
 int bitset_container_number_of_runs(bitset_container_t *bc) {
@@ -761,11 +1029,11 @@ int bitset_container_number_of_runs(bitset_container_t *bc) {
   for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS-1; ++i) {
     uint64_t word = next_word;
     next_word = bc->words[i+1];
-    num_runs += hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
+    num_runs += roaring_hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
   }
 
   uint64_t word = next_word;
-  num_runs += hamming((~word) & (word << 1));
+  num_runs += roaring_hamming((~word) & (word << 1));
   if((word & 0x8000000000000000ULL) != 0)
     num_runs++;
   return num_runs;
@@ -791,7 +1059,7 @@ bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roa
     uint64_t w = cont->words[i];
     while (w != 0) {
       uint64_t t = w & (~w + 1);
-      int r = __builtin_ctzll(w);
+      int r = roaring_trailing_zeroes(w);
       if(!iterator(r + base, ptr)) return false;
       w ^= t;
     }
@@ -805,7 +1073,7 @@ bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, r
     uint64_t w = cont->words[i];
     while (w != 0) {
       uint64_t t = w & (~w + 1);
-      int r = __builtin_ctzll(w);
+      int r = roaring_trailing_zeroes(w);
       if(!iterator(high_bits | (uint64_t)(r + base), ptr)) return false;
       w ^= t;
     }
@@ -814,14 +1082,33 @@ bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, r
   return true;
 }
 
-#ifdef CROARING_IS_X64
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+CROARING_TARGET_AVX512
+ALLOW_UNALIGNED
+static inline bool _avx512_bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
+  const __m512i *ptr1 = (const __m512i*)container1->words;
+  const __m512i *ptr2 = (const __m512i*)container2->words;
+  for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/64; i++) {
+      __m512i r1 = _mm512_loadu_si512(ptr1+i);
+      __m512i r2 = _mm512_loadu_si512(ptr2+i);
+      __mmask64 mask = _mm512_cmpeq_epi8_mask(r1, r2);
+      if ((uint64_t)mask != UINT64_MAX) {
+          return false;
+      }
+  }
+	return true;
+}
+CROARING_UNTARGET_AVX512
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
 CROARING_TARGET_AVX2
+ALLOW_UNALIGNED
 static inline bool _avx2_bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
     const __m256i *ptr1 = (const __m256i*)container1->words;
     const __m256i *ptr2 = (const __m256i*)container2->words;
     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/32; i++) {
-      __m256i r1 = _mm256_load_si256(ptr1+i);
-      __m256i r2 = _mm256_load_si256(ptr2+i);
+      __m256i r1 = _mm256_loadu_si256(ptr1+i);
+      __m256i r2 = _mm256_loadu_si256(ptr2+i);
       int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
       if ((uint32_t)mask != UINT32_MAX) {
           return false;
@@ -829,9 +1116,10 @@ static inline bool _avx2_bitset_container_equals(const bitset_container_t *conta
   }
 	return true;
 }
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 #endif // CROARING_IS_X64
 
+ALLOW_UNALIGNED
 bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
   if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
     if(container1->cardinality != container2->cardinality) {
@@ -841,8 +1129,15 @@ bool bitset_container_equals(const bitset_container_t *container1, const bitset_
       return true;
     }
   }
-#ifdef CROARING_IS_X64
-  if( croaring_avx2() ) {
+#if CROARING_IS_X64
+  int support = croaring_hardware_support();
+#if CROARING_COMPILER_SUPPORTS_AVX512
+  if( support & ROARING_SUPPORTS_AVX512 ) {
+    return _avx512_bitset_container_equals(container1, container2);
+  }
+  else
+#endif
+  if( support & ROARING_SUPPORTS_AVX2 ) {
     return _avx2_bitset_container_equals(container1, container2);
   }
 #endif
@@ -875,13 +1170,13 @@ bool bitset_container_select(const bitset_container_t *container, uint32_t *star
     const uint64_t *words = container->words;
     int32_t size;
     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) {
-        size = hamming(words[i]);
+        size = roaring_hamming(words[i]);
         if(rank <= *start_rank + size) {
             uint64_t w = container->words[i];
             uint16_t base = i*64;
             while (w != 0) {
                 uint64_t t = w & (~w + 1);
-                int r = __builtin_ctzll(w);
+                int r = roaring_trailing_zeroes(w);
                 if(*start_rank == rank) {
                     *element = r+base;
                     return true;
@@ -894,7 +1189,7 @@ bool bitset_container_select(const bitset_container_t *container, uint32_t *star
             *start_rank += size;
     }
     assert(false);
-    __builtin_unreachable();
+    roaring_unreachable;
 }
 
 
@@ -903,7 +1198,7 @@ uint16_t bitset_container_minimum(const bitset_container_t *container) {
   for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
     uint64_t w = container->words[i];
     if (w != 0) {
-      int r = __builtin_ctzll(w);
+      int r = roaring_trailing_zeroes(w);
       return r + i * 64;
     }
   }
@@ -915,7 +1210,7 @@ uint16_t bitset_container_maximum(const bitset_container_t *container) {
   for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i > 0; --i ) {
     uint64_t w = container->words[i];
     if (w != 0) {
-      int r = __builtin_clzll(w);
+      int r = roaring_leading_zeroes(w);
       return i * 64 + 63  - r;
     }
   }
@@ -928,15 +1223,34 @@ int bitset_container_rank(const bitset_container_t *container, uint16_t x) {
   int sum = 0;
   int i = 0;
   for (int end = x / 64; i < end; i++){
-    sum += hamming(container->words[i]);
+    sum += roaring_hamming(container->words[i]);
   }
   uint64_t lastword = container->words[i];
   uint64_t lastpos = UINT64_C(1) << (x % 64);
   uint64_t mask = lastpos + lastpos - 1; // smear right
-  sum += hamming(lastword & mask);
+  sum += roaring_hamming(lastword & mask);
   return sum;
 }
 
+/* Returns the index of x, or -1 if x does not exist in the container */
+int bitset_container_get_index(const bitset_container_t *container, uint16_t x) {
+  if (bitset_container_get(container, x)) {
+    // credit: aqrit
+    int sum = 0;
+    int i = 0;
+    for (int end = x / 64; i < end; i++){
+      sum += roaring_hamming(container->words[i]);
+    }
+    uint64_t lastword = container->words[i];
+    uint64_t lastpos = UINT64_C(1) << (x % 64);
+    uint64_t mask = lastpos + lastpos - 1; // smear right
+    sum += roaring_hamming(lastword & mask);
+    return sum - 1;
+  } else {
+    return -1;
+  }
+}
+
 /* Returns the index of the first value equal or larger than x, or -1 */
 int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) {
   uint32_t x32 = x;
@@ -949,7 +1263,7 @@ int bitset_container_index_equalorlarger(const bitset_container_t *container, ui
     if(k == BITSET_CONTAINER_SIZE_IN_WORDS) return -1;
     word = container->words[k];
   }
-  return k * 64 + __builtin_ctzll(word);
+  return k * 64 + roaring_trailing_zeroes(word);
 }
 
 #ifdef __cplusplus
diff --git a/src/containers/containers.c b/src/containers/containers.c
index a1fb598c6..78a72db58 100644
--- a/src/containers/containers.c
+++ b/src/containers/containers.c
@@ -51,7 +51,7 @@ void container_free(container_t *c, uint8_t type) {
             break;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
 }
 
@@ -68,7 +68,7 @@ void container_printf(const container_t *c, uint8_t type) {
             run_container_printf(const_CAST_run(c));
             return;
         default:
-            __builtin_unreachable();
+            roaring_unreachable;
     }
 }
 
@@ -91,7 +91,44 @@ void container_printf_as_uint32_array(
                 const_CAST_run(c), base);
             return;
         default:
-            __builtin_unreachable();
+            roaring_unreachable;
+    }
+}
+
+bool container_internal_validate(const container_t *container,
+                                 uint8_t typecode, const char **reason) {
+    if (container == NULL) {
+        *reason = "container is NULL";
+        return false;
+    }
+    // Not using container_unwrap_shared because it asserts if shared containers are nested
+    if (typecode == SHARED_CONTAINER_TYPE) {
+        const shared_container_t *shared_container = const_CAST_shared(container);
+        if (croaring_refcount_get(&shared_container->counter) == 0) {
+            *reason = "shared container has zero refcount";
+            return false;
+        }
+        if (shared_container->typecode == SHARED_CONTAINER_TYPE) {
+            *reason = "shared container is nested";
+            return false;
+        }
+        if (shared_container->container == NULL) {
+            *reason = "shared container has NULL container";
+            return false;
+        }
+        container = shared_container->container;
+        typecode = shared_container->typecode;
+    }
+    switch (typecode) {
+        case BITSET_CONTAINER_TYPE:
+            return bitset_container_validate(const_CAST_bitset(container), reason);
+        case ARRAY_CONTAINER_TYPE:
+            return array_container_validate(const_CAST_array(container), reason);
+        case RUN_CONTAINER_TYPE:
+            return run_container_validate(const_CAST_run(container), reason);
+        default:
+            *reason = "invalid typecode";
+            return false;
     }
 }
 
@@ -137,7 +174,7 @@ container_t *get_copy_of_container(
         shared_container_t *shared_container;
         if (*typecode == SHARED_CONTAINER_TYPE) {
             shared_container = CAST_shared(c);
-            shared_container->counter += 1;
+            croaring_refcount_inc(&shared_container->counter);
             return shared_container;
         }
         assert(*typecode != SHARED_CONTAINER_TYPE);
@@ -149,7 +186,10 @@ container_t *get_copy_of_container(
 
         shared_container->container = c;
         shared_container->typecode = *typecode;
-
+        // At this point, we are creating a new shared container,
+        // so there should be no other references, and setting
+        // the counter to 2 - even non-atomically - is safe as
+        // long as the value is set before the return statement.
         shared_container->counter = 2;
         *typecode = SHARED_CONTAINER_TYPE;
 
@@ -180,7 +220,7 @@ container_t *container_clone(const container_t *c, uint8_t typecode) {
             return NULL;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
             return NULL;
     }
 }
@@ -188,12 +228,10 @@ container_t *container_clone(const container_t *c, uint8_t typecode) {
 container_t *shared_container_extract_copy(
     shared_container_t *sc, uint8_t *typecode
 ){
-    assert(sc->counter > 0);
     assert(sc->typecode != SHARED_CONTAINER_TYPE);
-    sc->counter--;
     *typecode = sc->typecode;
     container_t *answer;
-    if (sc->counter == 0) {
+    if (croaring_refcount_dec(&sc->counter)) {
         answer = sc->container;
         sc->container = NULL;  // paranoid
         roaring_free(sc);
@@ -205,9 +243,7 @@ container_t *shared_container_extract_copy(
 }
 
 void shared_container_free(shared_container_t *container) {
-    assert(container->counter > 0);
-    container->counter--;
-    if (container->counter == 0) {
+    if (croaring_refcount_dec(&container->counter)) {
         assert(container->typecode != SHARED_CONTAINER_TYPE);
         container_free(container->container, container->typecode);
         container->container = NULL;  // paranoid
diff --git a/src/containers/convert.c b/src/containers/convert.c
index b60ac4773..743f62184 100644
--- a/src/containers/convert.c
+++ b/src/containers/convert.c
@@ -5,6 +5,12 @@
 #include <roaring/containers/convert.h>
 #include <roaring/containers/perfparameters.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace internal {
 #endif
@@ -48,11 +54,27 @@ array_container_t *array_container_from_bitset(const bitset_container_t *bits) {
     array_container_t *result =
         array_container_create_given_capacity(bits->cardinality);
     result->cardinality = bits->cardinality;
-    //  sse version ends up being slower here
-    // (bitset_extract_setbits_sse_uint16)
-    // because of the sparsity of the data
-    bitset_extract_setbits_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS,
+#if CROARING_IS_X64
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    if( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) {
+        bitset_extract_setbits_avx512_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS,
+                                  result->array, bits->cardinality , 0);
+    } else
+#endif
+    {
+        //  sse version ends up being slower here
+        // (bitset_extract_setbits_sse_uint16)
+        // because of the sparsity of the data
+        bitset_extract_setbits_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS,
+                                  result->array, 0);
+    }
+#else
+        // If the system is not x64, then we have no accelerated function.
+        bitset_extract_setbits_uint16(bits->words, BITSET_CONTAINER_SIZE_IN_WORDS,
                                   result->array, 0);
+#endif
+
+
     return result;
 }
 
@@ -101,10 +123,11 @@ container_t *convert_to_bitset_or_array_container(
         for (int rlepos = 0; rlepos < rc->n_runs; ++rlepos) {
             uint16_t run_start = rc->runs[rlepos].value;
             uint16_t run_end = run_start + rc->runs[rlepos].length;
-            for (uint16_t run_value = run_start; run_value <= run_end;
+            for (uint16_t run_value = run_start; run_value < run_end;
                  ++run_value) {
                 answer->array[answer->cardinality++] = run_value;
             }
+            answer->array[answer->cardinality++] = run_end;
         }
         assert(card == answer->cardinality);
         *resulttype = ARRAY_CONTAINER_TYPE;
@@ -262,7 +285,6 @@ container_t *convert_run_optimize(
 
         int long_ctr = 0;
         uint64_t cur_word = c_qua_bitset->words[0];
-        int run_count = 0;
         while (true) {
             while (cur_word == UINT64_C(0) &&
                    long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
@@ -274,7 +296,7 @@ container_t *convert_run_optimize(
                 return answer;
             }
 
-            int local_run_start = __builtin_ctzll(cur_word);
+            int local_run_start = roaring_trailing_zeroes(cur_word);
             int run_start = local_run_start + 64 * long_ctr;
             uint64_t cur_word_with_1s = cur_word | (cur_word - 1);
 
@@ -290,16 +312,15 @@ container_t *convert_run_optimize(
                 *typecode_after = RUN_CONTAINER_TYPE;
                 return answer;
             }
-            int local_run_end = __builtin_ctzll(~cur_word_with_1s);
+            int local_run_end = roaring_trailing_zeroes(~cur_word_with_1s);
             run_end = local_run_end + long_ctr * 64;
             add_run(answer, run_start, run_end - 1);
-            run_count++;
             cur_word = cur_word_with_1s & (cur_word_with_1s + 1);
         }
         return answer;
     } else {
         assert(false);
-        __builtin_unreachable();
+        roaring_unreachable;
         return NULL;
     }
 }
diff --git a/src/containers/mixed_equal.c b/src/containers/mixed_equal.c
index fdf4c2517..120d900b6 100644
--- a/src/containers/mixed_equal.c
+++ b/src/containers/mixed_equal.c
@@ -16,7 +16,7 @@ bool array_container_equal_bitset(const array_container_t* container1,
         uint64_t w = container2->words[i];
         while (w != 0) {
             uint64_t t = w & (~w + 1);
-            uint16_t r = i * 64 + __builtin_ctzll(w);
+            uint16_t r = i * 64 + roaring_trailing_zeroes(w);
             if (pos >= container1->cardinality) {
                 return false;
             }
diff --git a/src/containers/mixed_subset.c b/src/containers/mixed_subset.c
index af6f03e79..695d49731 100644
--- a/src/containers/mixed_subset.c
+++ b/src/containers/mixed_subset.c
@@ -32,7 +32,7 @@ bool run_container_is_subset_array(const run_container_t* container1,
                                  container2->cardinality, start);
         stop_pos = advanceUntil(container2->array, stop_pos,
                                 container2->cardinality, stop);
-        if (start_pos == container2->cardinality) {
+        if (stop_pos == container2->cardinality) {
             return false;
         } else if (stop_pos - start_pos != stop - start ||
                    container2->array[start_pos] != start ||
@@ -108,7 +108,7 @@ bool bitset_container_is_subset_run(const bitset_container_t* container1,
             uint32_t start = container2->runs[i_run].value;
             uint32_t stop = start + container2->runs[i_run].length;
             uint64_t t = w & (~w + 1);
-            uint16_t r = i_bitset * 64 + __builtin_ctzll(w);
+            uint16_t r = i_bitset * 64 + roaring_trailing_zeroes(w);
             if (r < start) {
                 return false;
             } else if (r > stop) {
diff --git a/src/containers/mixed_union.c b/src/containers/mixed_union.c
index 2af172375..0b63c0124 100644
--- a/src/containers/mixed_union.c
+++ b/src/containers/mixed_union.c
@@ -206,6 +206,10 @@ bool array_array_container_inplace_union(
           return false;  // not a bitset
         } else {
           memmove(src_1->array + src_2->cardinality, src_1->array, src_1->cardinality * sizeof(uint16_t));
+          // In theory, we could use fast_union_uint16, but it is unsafe. It fails
+          // with Intel compilers in particular.
+          // https://github.com/RoaringBitmap/CRoaring/pull/452
+          // See report https://github.com/RoaringBitmap/CRoaring/issues/476
           src_1->cardinality = (int32_t)union_uint16(src_1->array + src_2->cardinality, src_1->cardinality,
                                   src_2->array, src_2->cardinality, src_1->array);
           return false; // not a bitset
@@ -242,6 +246,17 @@ bool array_array_container_lazy_union(
     container_t **dst
 ){
     int totalCardinality = src_1->cardinality + src_2->cardinality;
+    //
+    // We assume that operations involving bitset containers will be faster than
+    // operations involving solely array containers, except maybe when array containers
+    // are small. Indeed, for example, it is cheap to compute the union between an array and
+    // a bitset container, generally more so than between a large array and another array.
+    // So it is advantageous to favour bitset containers during the computation.
+    // Of course, if we convert array containers eagerly to bitset containers, we may later
+    // need to revert the bitset containers to array containers to satisfy the Roaring format requirements,
+    // but such one-time conversions at the end may not be overly expensive. We arrived at this design
+    // based on extensive benchmarking.
+    //
     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
         *dst = array_container_create_given_capacity(totalCardinality);
         if (*dst != NULL) {
@@ -269,6 +284,17 @@ bool array_array_container_lazy_inplace_union(
 ){
     int totalCardinality = src_1->cardinality + src_2->cardinality;
     *dst = NULL;
+    //
+    // We assume that operations involving bitset containers will be faster than
+    // operations involving solely array containers, except maybe when array containers
+    // are small. Indeed, for example, it is cheap to compute the union between an array and
+    // a bitset container, generally more so than between a large array and another array.
+    // So it is advantageous to favour bitset containers during the computation.
+    // Of course, if we convert array containers eagerly to bitset containers, we may later
+    // need to revert the bitset containers to array containers to satisfy the Roaring format requirements,
+    // but such one-time conversions at the end may not be overly expensive. We arrived at this design
+    // based on extensive benchmarking.
+    //
     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
         if(src_1->capacity < totalCardinality) {
           *dst = array_container_create_given_capacity(2  * totalCardinality); // be purposefully generous
@@ -280,7 +306,37 @@ bool array_array_container_lazy_inplace_union(
           return false;  // not a bitset
         } else {
           memmove(src_1->array + src_2->cardinality, src_1->array, src_1->cardinality * sizeof(uint16_t));
-          src_1->cardinality = (int32_t)union_uint16(src_1->array + src_2->cardinality, src_1->cardinality,
+          /*
+            Next line is safe:
+
+            We just need to focus on the reading and writing performed on array1. In `union_vector16`, both vectorized and scalar code still obey the basic rule: read from two inputs, do the union, and then write the output.
+
+            Let's say the length(cardinality) of input2 is L2:
+            ```
+                |<-  L2  ->|
+            array1: [output--- |input 1---|---]
+            array2: [input 2---]
+            ```
+            Let's define 3 __m128i pointers, `pos1` starts from `input1`, `pos2` starts from `input2`, these 2 point at the next byte to read, `out` starts from `output`, pointing at the next byte to overwrite.
+            ```
+            array1: [output--- |input 1---|---]
+                        ^          ^
+                    out        pos1
+            array2: [input 2---]
+                        ^
+                        pos2
+            ```
+            The union output never contains more elements than the two inputs combined, so we have:
+            ```
+            out <= pos1 + pos2
+            ```
+            therefore:
+            ```
+            out <= pos1 + L2
+            ```
+            which means you will not overwrite data beyond pos1, so the data that has not been read yet is safe, and we do not care about the data that has already been read.
+          */
+          src_1->cardinality = (int32_t)fast_union_uint16(src_1->array + src_2->cardinality, src_1->cardinality,
                                   src_2->array, src_2->cardinality, src_1->array);
           return false; // not a bitset
         }
diff --git a/src/containers/mixed_xor.c b/src/containers/mixed_xor.c
index d9cacc7ab..0d77d61b2 100644
--- a/src/containers/mixed_xor.c
+++ b/src/containers/mixed_xor.c
@@ -230,7 +230,19 @@ bool array_array_container_lazy_xor(
     container_t **dst
 ){
     int totalCardinality = src_1->cardinality + src_2->cardinality;
-    // upper bound, but probably poor estimate for xor
+    //
+    // We assume that operations involving bitset containers will be faster than
+    // operations involving solely array containers, except maybe when array containers
+    // are small. Indeed, for example, it is cheap to compute the exclusive union between an array and
+    // a bitset container, generally more so than between a large array and another array.
+    // So it is advantageous to favour bitset containers during the computation.
+    // Of course, if we convert array containers eagerly to bitset containers, we may later
+    // need to revert the bitset containers to array containers to satisfy the Roaring format requirements,
+    // but such one-time conversions at the end may not be overly expensive. We arrived at this design
+    // based on extensive benchmarking on unions.
+    // For XOR/exclusive union, we simply followed the heuristic used by the unions (see  mixed_union.c).
+    // Further tuning is possible.
+    //
     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
         *dst = array_container_create_given_capacity(totalCardinality);
         if (*dst != NULL)
diff --git a/src/containers/run.c b/src/containers/run.c
index 6c14eef6a..ddba08a3c 100644
--- a/src/containers/run.c
+++ b/src/containers/run.c
@@ -5,6 +5,12 @@
 #include <roaring/portability.h>
 #include <roaring/memory.h>
 
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
 #ifdef __cplusplus
 extern "C" { namespace roaring { namespace internal {
 #endif
@@ -18,7 +24,6 @@ extern inline bool run_container_contains(const run_container_t *run,
 extern inline int run_container_index_equalorlarger(const run_container_t *arr, uint16_t x);
 extern inline bool run_container_is_full(const run_container_t *run);
 extern inline bool run_container_nonzero_cardinality(const run_container_t *rc);
-extern inline void run_container_clear(run_container_t *run);
 extern inline int32_t run_container_serialized_size_in_bytes(int32_t num_runs);
 extern inline run_container_t *run_container_create_range(uint32_t start,
                                                    uint32_t stop);
@@ -132,7 +137,7 @@ void run_container_offset(const run_container_t *c,
         lo_cap = c->n_runs;
         hi_cap = 0;
     } else {
-        split = c->runs[pivot].value <= top;
+        split = c->runs[pivot].value < top;
         lo_cap = pivot + (split ? 1 : 0);
         hi_cap = c->n_runs - pivot;
     }
@@ -203,11 +208,7 @@ void run_container_grow(run_container_t *run, int32_t min, bool copy) {
         }
         run->runs = (rle16_t *)roaring_malloc(run->capacity * sizeof(rle16_t));
     }
-    // handle the case where realloc fails
-    if (run->runs == NULL) {
-      fprintf(stderr, "could not allocate memory\n");
-    }
-    assert(run->runs != NULL);
+    // We may have run->runs == NULL.
 }
 
 /* copy one container into another */
@@ -626,6 +627,7 @@ void run_container_andnot(const run_container_t *src_1,
     }
 }
 
+ALLOW_UNALIGNED
 int run_container_to_uint32_array(void *vout, const run_container_t *cont,
                                   uint32_t base) {
     int outpos = 0;
@@ -674,8 +676,60 @@ void run_container_printf_as_uint32_array(const run_container_t *cont,
     }
 }
 
+/*
+ * Validate the container. Returns true if valid; on failure returns false and sets *reason to a static string naming the first failed check (*reason is not written on success).
+ */
+bool run_container_validate(const run_container_t *run, const char **reason) {
+    if (run->n_runs < 0) {
+        *reason = "negative run count";
+        return false;
+    }
+    if (run->capacity < 0) {
+        *reason = "negative run capacity";
+        return false;
+    }
+    if (run->capacity < run->n_runs) {
+        *reason = "capacity less than run count";
+        return false;
+    }
+
+    if (run->n_runs == 0) {  // no runs: nothing further to check (runs may be NULL here)
+        return true;
+    }
+    if (run->runs == NULL) {
+        *reason = "NULL runs";
+        return false;
+    }
+
+    // Use uint32_t to avoid overflow issues on ranges that contain UINT16_MAX.
+    uint32_t last_end = 0;  // exclusive end of the previous run; stays 0 until a run has been processed
+    for (int i = 0; i < run->n_runs; ++i) {
+        uint32_t start = run->runs[i].value;
+        uint32_t end = start + run->runs[i].length + 1;  // one past the last value covered by this run
+        if (end <= start) {
+            *reason = "run start + length overflow";
+            return false;
+        }
+        if (end > (1<<16)) {  // the run would extend past the largest 16-bit value
+            *reason = "run start + length too large";
+            return false;
+        }
+        if (start < last_end) {  // overlaps the previous run, or runs are out of order
+            *reason = "run start less than last end";
+            return false;
+        }
+        if (start == last_end && last_end != 0) {  // touches the previous run; last_end != 0 excludes the first iteration
+            *reason = "run start equal to last end, should have combined";
+            return false;
+        }
+        last_end = end;
+    }
+    return true;
+}
+
 int32_t run_container_write(const run_container_t *container, char *buf) {
-    memcpy(buf, &container->n_runs, sizeof(uint16_t));
+    uint16_t cast_16 = container->n_runs;
+    memcpy(buf, &cast_16, sizeof(uint16_t));
     memcpy(buf + sizeof(uint16_t), container->runs,
            container->n_runs * sizeof(rle16_t));
     return run_container_size_in_bytes(container);
@@ -684,7 +738,9 @@ int32_t run_container_write(const run_container_t *container, char *buf) {
 int32_t run_container_read(int32_t cardinality, run_container_t *container,
                            const char *buf) {
     (void)cardinality;
-    memcpy(&container->n_runs, buf, sizeof(uint16_t));
+    uint16_t cast_16;
+    memcpy(&cast_16, buf, sizeof(uint16_t));
+    container->n_runs = cast_16;
     if (container->n_runs > container->capacity)
         run_container_grow(container, container->n_runs, false);
     if(container->n_runs > 0) {
@@ -828,9 +884,73 @@ int run_container_rank(const run_container_t *container, uint16_t x) {
     return sum;
 }
 
-#ifdef CROARING_IS_X64
+int run_container_get_index(const run_container_t *container, uint16_t x) {  // 0-based position of x among the stored values, or -1 if x is absent
+    if (run_container_contains(container, x)) {
+        int sum = 0;  // number of stored values in the runs already skipped
+        uint32_t x32 = x;
+        for (int i = 0; i < container->n_runs; i++) {
+            uint32_t startpoint = container->runs[i].value;
+            uint32_t length = container->runs[i].length;
+            uint32_t endpoint = length + startpoint;  // last value of the run (inclusive)
+            if (x <= endpoint) {
+                if (x < startpoint) break;
+                return sum + (x32 - startpoint);  // offset inside this run plus everything before it
+            } else {
+                sum += length + 1;  // the whole run (length + 1 values) lies below x
+            }
+        }
+        return sum - 1;  // NOTE(review): looks unreachable when x is contained — confirm
+    } else {
+        return -1;
+    }
+}
+
+#if defined(CROARING_IS_X64) && CROARING_COMPILER_SUPPORTS_AVX512
+
+CROARING_TARGET_AVX512
+ALLOW_UNALIGNED
+/* Get the cardinality of `run'. Requires an actual computation (AVX-512 variant: accumulates one 512-bit vector of runs at a time, then finishes with a scalar tail). */
+static inline int _avx512_run_container_cardinality(const run_container_t *run) {
+    const int32_t n_runs = run->n_runs;
+    const rle16_t *runs = run->runs;
+
+    /* each run contributes length + 1 values; by initializing with n_runs, we account for every +1 up front. */
+    int sum = n_runs;
+    int32_t k = 0;
+    const int32_t step = sizeof(__m512i) / sizeof(rle16_t);  // number of runs held by one 512-bit vector
+    if (n_runs > step) {
+        __m512i total = _mm512_setzero_si512();  // per-lane 32-bit accumulators
+        for (; k + step <= n_runs; k += step) {
+            __m512i ymm1 = _mm512_loadu_si512((const __m512i *)(runs + k));
+            __m512i justlengths = _mm512_srli_epi32(ymm1, 16);  // keep the upper 16 bits of each 32-bit lane; presumably the length field of a {value, length} pair — confirm layout
+            total = _mm512_add_epi32(total, justlengths);
+        }
+
+        __m256i lo = _mm512_extracti32x8_epi32(total, 0);
+        __m256i hi = _mm512_extracti32x8_epi32(total, 1);
+
+        // a store might be faster than extract?
+        uint32_t buffer[sizeof(__m256i) / sizeof(rle16_t)];  // scratch space for one 256-bit half of the accumulators
+        _mm256_storeu_si256((__m256i *)buffer, lo);
+        sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
+               (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
+
+        _mm256_storeu_si256((__m256i *)buffer, hi);
+        sum += (buffer[0] + buffer[1]) + (buffer[2] + buffer[3]) +
+               (buffer[4] + buffer[5]) + (buffer[6] + buffer[7]);
+
+    }
+    for (; k < n_runs; ++k) {  // scalar tail: runs not consumed by the vector loop
+        sum += runs[k].length;
+    }
+
+    return sum;
+}
+
+CROARING_UNTARGET_AVX512
 
 CROARING_TARGET_AVX2
+ALLOW_UNALIGNED
 /* Get the cardinality of `run'. Requires an actual computation. */
 static inline int _avx2_run_container_cardinality(const run_container_t *run) {
     const int32_t n_runs = run->n_runs;
@@ -860,7 +980,7 @@ static inline int _avx2_run_container_cardinality(const run_container_t *run) {
     return sum;
 }
 
-CROARING_UNTARGET_REGION
+CROARING_UNTARGET_AVX2
 
 /* Get the cardinality of `run'. Requires an actual computation. */
 static inline int _scalar_run_container_cardinality(const run_container_t *run) {
@@ -877,7 +997,13 @@ static inline int _scalar_run_container_cardinality(const run_container_t *run)
 }
 
 int run_container_cardinality(const run_container_t *run) {
-  if( croaring_avx2() ) {
+#if CROARING_COMPILER_SUPPORTS_AVX512
+  if( croaring_hardware_support() & ROARING_SUPPORTS_AVX512 ) {
+    return _avx512_run_container_cardinality(run);
+  }
+  else
+#endif
+  if( croaring_hardware_support() & ROARING_SUPPORTS_AVX2 ) {
     return _avx2_run_container_cardinality(run);
   } else {
     return _scalar_run_container_cardinality(run);
diff --git a/src/isadetection.c b/src/isadetection.c
new file mode 100644
index 000000000..ce4b55326
--- /dev/null
+++ b/src/isadetection.c
@@ -0,0 +1,300 @@
+
+/* From
+https://github.com/endorno/pytorch/blob/master/torch/lib/TH/generic/simd/simd.h
+Highly modified.
+
+Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
+Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
+Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
+Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
+Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
+Copyright (c) 2011-2013 NYU                      (Clement Farabet)
+Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
+Iain Melvin, Jason Weston) Copyright (c) 2006      Idiap Research Institute
+(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
+Samy Bengio, Johnny Mariethoz)
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
+America and IDIAP Research Institute nor the names of its contributors may be
+   used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+// We need portability.h to be included first, see
+// https://github.com/RoaringBitmap/CRoaring/issues/394
+#include <roaring/portability.h>
+#if CROARING_REGULAR_VISUAL_STUDIO
+#include <intrin.h>
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+#include <cpuid.h>
+#endif // CROARING_REGULAR_VISUAL_STUDIO
+#include <roaring/isadetection.h>
+
+#if CROARING_IS_X64
+#ifndef CROARING_COMPILER_SUPPORTS_AVX512
+#error "CROARING_COMPILER_SUPPORTS_AVX512 needs to be defined."
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+#endif
+
+#ifdef __cplusplus
+extern "C" { namespace roaring { namespace internal {
+#endif
+enum croaring_instruction_set {  // bit flags, OR-ed together to describe the detected instruction sets
+  CROARING_DEFAULT = 0x0,
+  CROARING_NEON = 0x1,
+  CROARING_AVX2 = 0x4,
+  CROARING_SSE42 = 0x8,
+  CROARING_PCLMULQDQ = 0x10,
+  CROARING_BMI1 = 0x20,
+  CROARING_BMI2 = 0x40,
+  CROARING_ALTIVEC = 0x80,
+  CROARING_AVX512F = 0x100,
+  CROARING_AVX512DQ = 0x200,
+  CROARING_AVX512BW = 0x400,
+  CROARING_AVX512VBMI2 = 0x800,
+  CROARING_AVX512BITALG = 0x1000,
+  CROARING_AVX512VPOPCNTDQ = 0x2000,
+  CROARING_UNINITIALIZED = 0x8000  // sentinel used by the detection cache: detection has not run yet
+};
+
+#if CROARING_COMPILER_SUPPORTS_AVX512
+unsigned int CROARING_AVX512_REQUIRED = (CROARING_AVX512F | CROARING_AVX512DQ | CROARING_AVX512BW | CROARING_AVX512VBMI2 | CROARING_AVX512BITALG | CROARING_AVX512VPOPCNTDQ);  // every one of these flags must be detected before AVX-512 is reported
+#endif
+
+#if defined(__x86_64__) || defined(_M_AMD64) // x64
+
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx,
+                         uint32_t *edx) {  // in: *eax (and *ecx on the MSVC and inline-asm paths); out: all four registers
+#if CROARING_REGULAR_VISUAL_STUDIO
+  int cpu_info[4];
+  __cpuidex(cpu_info, *eax, *ecx);
+  *eax = cpu_info[0];
+  *ebx = cpu_info[1];
+  *ecx = cpu_info[2];
+  *edx = cpu_info[3];
+#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
+  uint32_t level = *eax;  // NOTE(review): *ecx (the subleaf) is not forwarded on this path — confirm __get_cpuid is adequate for EAX=0x7
+  __get_cpuid(level, eax, ebx, ecx, edx);
+#else
+  uint32_t a = *eax, b, c = *ecx, d;
+  __asm__("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));  // a and c are read-write operands, b and d are outputs only
+  *eax = a;
+  *ebx = b;
+  *ecx = c;
+  *edx = d;
+#endif
+}
+
+
+static inline uint64_t xgetbv(void) {  // reads extended control register 0 and returns it as a 64-bit value
+#if defined(_MSC_VER)
+  return _xgetbv(0);
+#else
+  uint32_t xcr0_lo, xcr0_hi;
+  __asm__("xgetbv\n\t" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));  // ECX = 0 selects the register; result is split across EAX (low) and EDX (high)
+  return xcr0_lo | ((uint64_t)xcr0_hi << 32);
+#endif
+}
+
+/**
+ * This is a relatively expensive function but it will get called at most
+ * *once* per compilation unit. Normally, the CRoaring library is built
+ * as one compilation unit. Returns a bitwise OR of croaring_instruction_set flags.
+ */
+static inline uint32_t dynamic_croaring_detect_supported_architectures(void) {
+  uint32_t eax, ebx, ecx, edx;
+  uint32_t host_isa = 0x0;
+  // Can be found on Intel ISA Reference for CPUID
+  static uint32_t cpuid_avx2_bit = 1 << 5;      ///< @private Bit 5 of EBX for EAX=0x7
+  static uint32_t cpuid_bmi1_bit = 1 << 3;      ///< @private bit 3 of EBX for EAX=0x7
+  static uint32_t cpuid_bmi2_bit = 1 << 8;      ///< @private bit 8 of EBX for EAX=0x7
+  static uint32_t cpuid_avx512f_bit = 1 << 16;  ///< @private bit 16 of EBX for EAX=0x7
+  static uint32_t cpuid_avx512dq_bit = 1 << 17; ///< @private bit 17 of EBX for EAX=0x7
+  static uint32_t cpuid_avx512bw_bit = 1 << 30; ///< @private bit 30 of EBX for EAX=0x7
+  static uint32_t cpuid_avx512vbmi2_bit = 1 << 6; ///< @private bit 6 of ECX for EAX=0x7
+  static uint32_t cpuid_avx512bitalg_bit = 1 << 12; ///< @private bit 12 of ECX for EAX=0x7
+  static uint32_t cpuid_avx512vpopcntdq_bit = 1 << 14; ///< @private bit 14 of ECX for EAX=0x7
+  static uint64_t cpuid_avx256_saved = 1 << 2; ///< @private bit 2 of XCR0 = AVX
+  static uint64_t cpuid_avx512_saved = 7 << 5; ///< @private bits 5,6,7 of XCR0 = opmask, ZMM_hi256, hi16_ZMM
+  static uint32_t cpuid_sse42_bit = 1 << 20;    ///< @private bit 20 of ECX for EAX=0x1
+  static uint32_t cpuid_osxsave = (1 << 26) | (1 << 27); ///< @private bits 26+27 of ECX for EAX=0x1
+  static uint32_t cpuid_pclmulqdq_bit = 1 << 1; ///< @private bit  1 of ECX for EAX=0x1
+
+
+  // ECX for EAX=0x1
+  eax = 0x1;
+  ecx = 0x0;
+  cpuid(&eax, &ebx, &ecx, &edx);
+
+  if (ecx & cpuid_sse42_bit) {
+    host_isa |= CROARING_SSE42;
+  } else {
+    return host_isa; // everything after is redundant
+  }
+
+  if (ecx & cpuid_pclmulqdq_bit) {
+    host_isa |= CROARING_PCLMULQDQ;
+  }
+
+  if ((ecx & cpuid_osxsave) != cpuid_osxsave) {  // both bits 26 and 27 must be set before xgetbv is used below
+    return host_isa;
+  }
+
+  // xgetbv for checking if the OS saves registers
+  uint64_t xcr0 = xgetbv();
+
+  if ((xcr0 & cpuid_avx256_saved) == 0) {  // AVX state bit missing: AVX2, BMI and AVX-512 are not reported
+    return host_isa;
+  }
+
+  // EBX and ECX for EAX=0x7
+  eax = 0x7;
+  ecx = 0x0;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ebx & cpuid_avx2_bit) {
+    host_isa |= CROARING_AVX2;
+  }
+  if (ebx & cpuid_bmi1_bit) {
+    host_isa |= CROARING_BMI1;
+  }
+
+  if (ebx & cpuid_bmi2_bit) {
+    host_isa |= CROARING_BMI2;
+  }
+
+  if (!((xcr0 & cpuid_avx512_saved) == cpuid_avx512_saved)) {  // all three AVX-512 state bits are required before any AVX-512 flag is reported
+     return host_isa;
+  }
+
+  if (ebx & cpuid_avx512f_bit) {
+    host_isa |= CROARING_AVX512F;
+  }
+  
+  if (ebx & cpuid_avx512bw_bit) {
+    host_isa |= CROARING_AVX512BW;
+  }
+  
+  if (ebx & cpuid_avx512dq_bit) {
+    host_isa |= CROARING_AVX512DQ;
+  }
+  
+  if (ecx & cpuid_avx512vbmi2_bit) {
+    host_isa |= CROARING_AVX512VBMI2;
+  }
+  
+  if (ecx & cpuid_avx512bitalg_bit) {
+    host_isa |= CROARING_AVX512BITALG;
+  }
+  
+  if (ecx & cpuid_avx512vpopcntdq_bit) {
+    host_isa |= CROARING_AVX512VPOPCNTDQ;
+  }
+
+  return host_isa;
+}
+
+#endif // end SIMD extension detection code
+
+
+#if defined(__x86_64__) || defined(_M_AMD64) // x64
+
+#if CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_CPP
+static inline uint32_t croaring_detect_supported_architectures(void) {  // cached wrapper around the dynamic detection
+    // thread-safe as per the C++11 standard.
+    static uint32_t buffer = dynamic_croaring_detect_supported_architectures();
+    return buffer;
+}
+#elif CROARING_ATOMIC_IMPL == CROARING_ATOMIC_IMPL_C
+static uint32_t croaring_detect_supported_architectures(void) {  // cached wrapper around the dynamic detection
+    // we use an atomic for thread safety
+    static _Atomic uint32_t buffer = CROARING_UNINITIALIZED;
+    if (buffer == CROARING_UNINITIALIZED) {
+      // atomicity is sufficient
+      buffer = dynamic_croaring_detect_supported_architectures();
+    }
+    return buffer;
+}
+#else
+// If we do not have atomics, we do the best we can.
+static inline uint32_t croaring_detect_supported_architectures(void) {
+    static uint32_t buffer = CROARING_UNINITIALIZED;  // plain (non-atomic) cache
+    if (buffer == CROARING_UNINITIALIZED) {
+      buffer = dynamic_croaring_detect_supported_architectures();
+    }
+    return buffer;
+}
+#endif // CROARING_ATOMIC_IMPL
+
+#ifdef ROARING_DISABLE_AVX
+
+int croaring_hardware_support(void) {  // returns a bitwise OR of ROARING_SUPPORTS_* flags
+    return 0;  // ROARING_DISABLE_AVX is defined: report no AVX2 and no AVX-512
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__) && defined(__AVX512BW__) && defined(__AVX512VBMI2__) && defined(__AVX512BITALG__) && defined(__AVX512VPOPCNTDQ__)
+int croaring_hardware_support(void) {
+    return  ROARING_SUPPORTS_AVX2 | ROARING_SUPPORTS_AVX512;  // all required AVX-512 macros are predefined: no runtime detection
+}
+#elif defined(__AVX2__)
+
+int croaring_hardware_support(void) {
+  static int support = 0xFFFFFFF;  // sentinel: not computed yet
+  if(support == 0xFFFFFFF) {
+    bool avx512_support = false;
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    avx512_support =  ( (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED)
+	                        == CROARING_AVX512_REQUIRED);
+#endif
+    support = ROARING_SUPPORTS_AVX2 | (avx512_support ? ROARING_SUPPORTS_AVX512 : 0);  // AVX2 is reported unconditionally in this __AVX2__ branch
+  }
+  return support;
+}
+#else
+
+int croaring_hardware_support(void) {
+  static int support = 0xFFFFFFF;  // sentinel: not computed yet
+  if(support == 0xFFFFFFF) {
+    bool has_avx2 = (croaring_detect_supported_architectures() & CROARING_AVX2) == CROARING_AVX2;
+    bool has_avx512 = false;
+#if CROARING_COMPILER_SUPPORTS_AVX512
+    has_avx512 = (croaring_detect_supported_architectures() & CROARING_AVX512_REQUIRED) == CROARING_AVX512_REQUIRED;
+#endif // CROARING_COMPILER_SUPPORTS_AVX512
+    support = (has_avx2 ? ROARING_SUPPORTS_AVX2 : 0) | (has_avx512 ? ROARING_SUPPORTS_AVX512 : 0);  // both flags come from runtime detection
+  }
+  return support;
+}
+#endif
+
+#endif // defined(__x86_64__) || defined(_M_AMD64) // x64
+#ifdef __cplusplus
+} } }  // extern "C" { namespace roaring { namespace internal {
+#endif
diff --git a/src/license-comment.h b/src/license-comment.h
index 43d200549..d50fb11d4 100644
--- a/src/license-comment.h
+++ b/src/license-comment.h
@@ -46,7 +46,7 @@
  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
  * IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE
+ * DEALINGS IN THE SOFTWARE.
  *
  * SPDX-License-Identifier: MIT
  */
diff --git a/src/roaring.c b/src/roaring.c
index 303f727c1..c863aa531 100644
--- a/src/roaring.c
+++ b/src/roaring.c
@@ -21,8 +21,12 @@ extern "C" { namespace roaring { namespace api {
 #define CROARING_SERIALIZATION_ARRAY_UINT32 1
 #define CROARING_SERIALIZATION_CONTAINER 2
 
+extern inline void roaring_bitmap_init_cleared(roaring_bitmap_t *r);
 extern inline bool roaring_bitmap_get_copy_on_write(const roaring_bitmap_t* r);
 extern inline void roaring_bitmap_set_copy_on_write(roaring_bitmap_t* r, bool cow);
+extern inline roaring_bitmap_t *roaring_bitmap_create(void);
+extern inline void roaring_bitmap_add_range(roaring_bitmap_t *r, uint64_t min, uint64_t max);
+extern inline void roaring_bitmap_remove_range(roaring_bitmap_t *r, uint64_t min, uint64_t max);
 
 static inline bool is_cow(const roaring_bitmap_t *r) {
     return r->high_low_container.flags & ROARING_FLAG_COW;
@@ -87,46 +91,91 @@ bool roaring_bitmap_init_with_capacity(roaring_bitmap_t *r, uint32_t cap) {
     return ra_init_with_capacity(&r->high_low_container, cap);
 }
 
+static inline void add_bulk_impl(roaring_bitmap_t *r,
+                                 roaring_bulk_context_t *context,
+                                 uint32_t val) {
+    uint16_t key = val >> 16;  // high 16 bits of val, compared against the cached key
+    if (context->container == NULL || context->key != key) {
+        uint8_t typecode;
+        int idx;
+        context->container = containerptr_roaring_bitmap_add(
+            r, val, &typecode, &idx);
+        context->typecode = typecode;  // remember the container just used so the
+        context->idx = idx;            // next value with the same key can take
+        context->key = key;            // the direct path below
+    } else {
+        // Same key as the cached one: reuse the cached container and insert the
+        // low 16 bits of val into it directly, without going through
+        // containerptr_roaring_bitmap_add again.
+        uint8_t new_typecode;
+        container_t *container2 = container_add(
+            context->container, val & 0xFFFF, context->typecode, &new_typecode);
+        if (container2 != context->container) {
+            // rare: container_add handed back a different container (type change); swap it in
+            container_free(context->container, context->typecode);
+            ra_set_container_at_index(&r->high_low_container, context->idx,
+                                      container2, new_typecode);
+            context->typecode = new_typecode;
+            context->container = container2;
+        }
+    }
+}
 
 void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
                              const uint32_t *vals) {
-    container_t *container = NULL;  // hold value of last container touched
-    uint8_t typecode = 0;    // typecode of last container touched
-    uint32_t prev = 0;       // previous valued inserted
-    size_t i = 0;            // index of value
-    int containerindex = 0;
-    if (n_args == 0) return;
     uint32_t val;
-    memcpy(&val, vals + i, sizeof(val));
-    container =
-        containerptr_roaring_bitmap_add(r, val, &typecode, &containerindex);
-    prev = val;
-    i++;
-    for (; i < n_args; i++) {
-        memcpy(&val, vals + i, sizeof(val));
-        if (((prev ^ val) >> 16) ==
-            0) {  // no need to seek the container, it is at hand
-            // because we already have the container at hand, we can do the
-            // insertion
-            // automatically, bypassing the roaring_bitmap_add call
-            uint8_t newtypecode = typecode;
-            container_t *container2 =
-                container_add(container, val & 0xFFFF, typecode, &newtypecode);
-            if (container2 != container) {  // rare instance when we need to
-                                            // change the container type
-                container_free(container, typecode);
-                ra_set_container_at_index(&r->high_low_container,
-                                          containerindex, container2,
-                                          newtypecode);
-                typecode = newtypecode;
-                container = container2;
-            }
-        } else {
-            container = containerptr_roaring_bitmap_add(r, val, &typecode,
-                                                        &containerindex);
+    const uint32_t *start = vals;
+    const uint32_t *end = vals + n_args;
+    const uint32_t *current_val = start;
+    // Nothing to insert; this also keeps the read of the first element in bounds.
+    if (n_args == 0) {
+        return;
+    }
+    // Seed the bulk context with the container that receives the first value.
+    uint8_t typecode;
+    int idx;
+    container_t *container;
+    memcpy(&val, current_val, sizeof(val));  // vals may be unaligned, as in the loop below
+    container = containerptr_roaring_bitmap_add(r, val, &typecode, &idx);
+    roaring_bulk_context_t context = {container, idx, (uint16_t)(val >> 16), typecode};
+    // The loop starts at the first value again; re-adding it is expected to be a no-op.
+    for (; current_val != end; current_val++) {
+        memcpy(&val, current_val, sizeof(val));
+        add_bulk_impl(r, &context, val);
+    }
+}
+
+void roaring_bitmap_add_bulk(roaring_bitmap_t *r,
+                             roaring_bulk_context_t *context, uint32_t val) {
+    add_bulk_impl(r, context, val);  // exported wrapper; the static inline helper does all the work
+}
+
+bool roaring_bitmap_contains_bulk(const roaring_bitmap_t *r,
+                                  roaring_bulk_context_t *context,
+                                  uint32_t val)
+{
+    uint16_t key = val >> 16;  // high 16 bits of val
+    if (context->container == NULL || context->key != key) {
+        int32_t start_idx = -1;  // default start position handed to ra_advance_until
+        if (context->container != NULL && context->key < key) {
+            start_idx = context->idx;  // target key is larger: resume from the cached index
+        }
+        int idx = ra_advance_until(&r->high_low_container, key, start_idx);
+        if (idx == ra_get_size(&r->high_low_container)) {
+            return false;  // ran past the last container; context is left unchanged
+        }
+        uint8_t typecode;
+        context->container = ra_get_container_at_index(&r->high_low_container, idx, &typecode);
+        context->typecode = typecode;
+        context->idx = idx;
+        context->key = ra_get_key_at_index(&r->high_low_container, idx);
+        // ra_advance_until finds the next key >= the target; if the key differs, we found a later container.
+        if (context->key != key) {
+            return false;
         }
-        prev = val;
     }
+    // context now refers to the container for key; test the low 16 bits of val in it
+    return container_contains(context->container, val & 0xFFFF, context->typecode);
 }
 
 roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals) {
@@ -139,11 +188,12 @@ roaring_bitmap_t *roaring_bitmap_of(size_t n_args, ...) {
     // todo: could be greatly optimized but we do not expect this call to ever
     // include long lists
     roaring_bitmap_t *answer = roaring_bitmap_create();
+    roaring_bulk_context_t context = {0};
     va_list ap;
     va_start(ap, n_args);
-    for (size_t i = 1; i <= n_args; i++) {
+    for (size_t i = 0; i < n_args; i++) {
         uint32_t val = va_arg(ap, uint32_t);
-        roaring_bitmap_add(answer, val);
+        roaring_bitmap_add_bulk(answer, &context, val);
     }
     va_end(ap);
     return answer;
@@ -275,9 +325,6 @@ void roaring_bitmap_remove_range_closed(roaring_bitmap_t *r, uint32_t min, uint3
     }
 }
 
-extern inline void roaring_bitmap_add_range(roaring_bitmap_t *r, uint64_t min, uint64_t max);
-extern inline void roaring_bitmap_remove_range(roaring_bitmap_t *r, uint64_t min, uint64_t max);
-
 void roaring_bitmap_printf(const roaring_bitmap_t *r) {
     const roaring_array_t *ra = &r->high_low_container;
 
@@ -302,9 +349,9 @@ void roaring_bitmap_printf_describe(const roaring_bitmap_t *r) {
                get_full_container_name(ra->containers[i], ra->typecodes[i]),
                container_get_cardinality(ra->containers[i], ra->typecodes[i]));
         if (ra->typecodes[i] == SHARED_CONTAINER_TYPE) {
-            printf(
-                "(shared count = %" PRIu32 " )",
-                    CAST_shared(ra->containers[i])->counter);
+            printf("(shared count = %" PRIu32 " )",
+                   croaring_refcount_get(
+                       &(CAST_shared(ra->containers[i])->counter)));
         }
 
         if (i + 1 < ra->size) {
@@ -373,11 +420,81 @@ void roaring_bitmap_statistics(const roaring_bitmap_t *r,
                 break;
             default:
                 assert(false);
-                __builtin_unreachable();
+                roaring_unreachable;
         }
     }
 }
 
+/*
+ * Checks that (returns false and sets *reason on the first failure; reason may be NULL):
+ * - Array containers are sorted and contain no duplicates
+ * - Range containers are sorted and contain no overlapping ranges
+ * - Roaring containers are sorted by key and there are no duplicate keys
+ * - The correct container type is used for each container (e.g. bitmaps aren't used for small containers)
+ */
+bool roaring_bitmap_internal_validate(const roaring_bitmap_t *r, const char **reason) {
+    const char *reason_local;
+    if (reason == NULL) {
+        // Always allow assigning through *reason
+        reason = &reason_local;
+    }
+    *reason = NULL;
+    const roaring_array_t *ra = &r->high_low_container;
+    if (ra->size < 0) {
+        *reason = "negative size";
+        return false;
+    }
+    if (ra->allocation_size < 0) {
+        *reason = "negative allocation size";
+        return false;
+    }
+    if (ra->size > ra->allocation_size) {
+        *reason = "more containers than allocated space";
+        return false;
+    }
+    if (ra->flags & ~(ROARING_FLAG_COW | ROARING_FLAG_FROZEN)) {  // any bit other than COW/FROZEN is rejected
+        *reason = "invalid flags";
+        return false;
+    }
+    if (ra->size == 0) {
+        return true;  // empty bitmap: the arrays below are not inspected
+    }
+
+    if (ra->keys == NULL) {
+        *reason = "keys is NULL";
+        return false;
+    }
+    if (ra->typecodes == NULL) {
+        *reason = "typecodes is NULL";
+        return false;
+    }
+    if (ra->containers == NULL) {
+        *reason = "containers is NULL";
+        return false;
+    }
+
+    uint32_t prev_key = ra->keys[0];
+    for (int32_t i = 1; i < ra->size; ++i) {
+        if (ra->keys[i] <= prev_key) {
+            *reason = "keys not strictly increasing";
+            return false;
+        }
+        prev_key = ra->keys[i];
+    }
+
+    for (int32_t i = 0; i < ra->size; ++i) {
+        if (!container_internal_validate(ra->containers[i], ra->typecodes[i], reason)) {
+            // reason should already be set by the container check; fall back if it was not
+            if (*reason == NULL) {
+                *reason = "container failed to validate but no reason given";
+            }
+            return false;
+        }
+    }
+
+    return true;
+}
+
 roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r) {
     roaring_bitmap_t *ans =
         (roaring_bitmap_t *)roaring_malloc(sizeof(roaring_bitmap_t));
@@ -408,6 +525,7 @@ bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
 }
 
 void roaring_bitmap_free(const roaring_bitmap_t *r) {
+    if(r == NULL) { return; }
     if (!is_frozen(r)) {
       ra_clear((roaring_array_t*)&r->high_low_container);
     }
@@ -1387,7 +1505,10 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, size
     }
     size_t bytesread;
     bool is_ok = ra_portable_deserialize(&ans->high_low_container, buf, maxbytes, &bytesread);
-    if(is_ok) assert(bytesread <= maxbytes);
+    if (!is_ok) {
+        roaring_free(ans);
+        return NULL;
+    }
     roaring_bitmap_set_copy_on_write(ans, false);
     if (!is_ok) {
         roaring_free(ans);
@@ -1413,20 +1534,76 @@ size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r,
 
 roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) {
     const char *bufaschar = (const char *)buf;
-    if (*(const unsigned char *)buf == CROARING_SERIALIZATION_ARRAY_UINT32) {
+    if (bufaschar[0] == CROARING_SERIALIZATION_ARRAY_UINT32) {
         /* This looks like a compressed set of uint32_t elements */
         uint32_t card;
+        // card is stored right after the 1-byte tag; no buffer length is known here, so nothing is bounds-checked
         memcpy(&card, bufaschar + 1, sizeof(uint32_t));
+
         const uint32_t *elems =
             (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t));
+        // Build the bitmap by adding the card values one by one through a bulk context.
+        roaring_bitmap_t *bitmap = roaring_bitmap_create();
+        if (bitmap == NULL) {
+            return NULL;
+        }
+        roaring_bulk_context_t context = {0};
+        for (uint32_t i = 0; i < card; i++) {
+            // elems may not be aligned, read with memcpy
+            uint32_t elem;
+            memcpy(&elem, elems + i, sizeof(elem));
+            roaring_bitmap_add_bulk(bitmap, &context, elem);
+        }
+        return bitmap;
 
-        return roaring_bitmap_of_ptr(card, elems);
     } else if (bufaschar[0] == CROARING_SERIALIZATION_CONTAINER) {
         return roaring_bitmap_portable_deserialize(bufaschar + 1);
     } else
         return (NULL);
 }
 
+/**
+ * Bounds-checked variant of roaring_bitmap_deserialize(): never reads more
+ * than maxbytes bytes from buf. Returns NULL if the buffer is too short, the
+ * leading tag byte is unknown, or the bitmap cannot be created.
+ */
+roaring_bitmap_t* roaring_bitmap_deserialize_safe(const void *buf, size_t maxbytes) {
+    if (maxbytes < 1) {
+        return NULL;
+    }
+    const char *bufaschar = (const char *)buf;
+    if (bufaschar[0] == CROARING_SERIALIZATION_ARRAY_UINT32) {
+        if (maxbytes < 1 + sizeof(uint32_t)) {
+            return NULL;
+        }
+        /* This looks like a compressed set of uint32_t elements */
+        uint32_t card;
+        memcpy(&card, bufaschar + 1, sizeof(uint32_t));
+        // Check the buffer is big enough to contain card uint32_t elements.
+        // Divide instead of multiplying: card * 4 can wrap a 32-bit size_t.
+        if ((maxbytes - 1 - sizeof(uint32_t)) / sizeof(uint32_t) < card) {
+            return NULL;
+        }
+        const uint32_t *elems =
+            (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t));
+        roaring_bitmap_t *bitmap = roaring_bitmap_create();
+        if (bitmap == NULL) {
+            return NULL;
+        }
+        roaring_bulk_context_t context = {0};
+        for (uint32_t i = 0; i < card; i++) {
+            // elems may not be aligned, read with memcpy
+            uint32_t elem;
+            memcpy(&elem, elems + i, sizeof(elem));
+            roaring_bitmap_add_bulk(bitmap, &context, elem);
+        }
+        return bitmap;
+    } else if (bufaschar[0] == CROARING_SERIALIZATION_CONTAINER) {
+        return roaring_bitmap_portable_deserialize_safe(bufaschar + 1, maxbytes - 1);
+    } else
+        return (NULL);
+}
+
 bool roaring_iterate(const roaring_bitmap_t *r, roaring_iterator iterator,
                      void *ptr) {
     const roaring_array_t *ra = &r->high_low_container;
@@ -1502,7 +1679,7 @@ static bool loadfirstvalue(roaring_uint32_iterator_t *newit) {
                 wordindex++;  // advance
             }
             // here "word" is non-zero
-            newit->in_container_index = wordindex * 64 + __builtin_ctzll(word);
+            newit->in_container_index = wordindex * 64 + roaring_trailing_zeroes(word);
             newit->current_value = newit->highbits | newit->in_container_index;
             break; }
 
@@ -1535,7 +1712,7 @@ static bool loadlastvalue(roaring_uint32_iterator_t* newit) {
             while ((word = bitset_container->words[wordindex]) == 0)
                 --wordindex;
 
-            int num_leading_zeros = __builtin_clzll(word);
+            int num_leading_zeros = roaring_leading_zeroes(word);
             newit->in_container_index = (wordindex * 64) + (63 - num_leading_zeros);
             newit->current_value = newit->highbits | newit->in_container_index;
             break;
@@ -1594,7 +1771,7 @@ static bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32
             break; }
 
         default:
-            __builtin_unreachable();
+            roaring_unreachable;
     }
 
     return true;
@@ -1678,7 +1855,7 @@ bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it) {
                 word = bc->words[wordindex];
             }
             if (word != 0) {
-                it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
+                it->in_container_index = wordindex * 64 + roaring_trailing_zeroes(word);
                 it->current_value = it->highbits | it->in_container_index;
                 return (it->has_value = true);
             }
@@ -1715,7 +1892,7 @@ bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it) {
         }
 
         default:
-            __builtin_unreachable();
+            roaring_unreachable;
     }
 
     // moving to next container
@@ -1747,7 +1924,7 @@ bool roaring_previous_uint32_iterator(roaring_uint32_iterator_t *it) {
             if (word == 0)
                 break;
 
-            int num_leading_zeros = __builtin_clzll(word);
+            int num_leading_zeros = roaring_leading_zeroes(word);
             it->in_container_index = (wordindex * 64) + (63 - num_leading_zeros);
             it->current_value = it->highbits | it->in_container_index;
             return (it->has_value = true);
@@ -1803,7 +1980,7 @@ uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* b
         word = bcont->words[wordindex] & (UINT64_MAX << (it->in_container_index % 64));
         do {
           while (word != 0 && ret < count) {
-            buf[0] = it->highbits | (wordindex * 64 + __builtin_ctzll(word));
+            buf[0] = it->highbits | (wordindex * 64 + roaring_trailing_zeroes(word));
             word = word & (word - 1);
             buf++;
             ret++;
@@ -1815,7 +1992,7 @@ uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* b
         } while (word != 0 && ret < count);
         it->has_value = (word != 0);
         if (it->has_value) {
-          it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
+          it->in_container_index = wordindex * 64 + roaring_trailing_zeroes(word);
           it->current_value = it->highbits | it->in_container_index;
         }
         break;
@@ -2620,6 +2797,34 @@ uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x) {
     return size;
 }
 
+/**
+ * roaring_bitmap_get_index returns the index of x in the bitmap, or -1 if x is not present.
+ */
+int64_t roaring_bitmap_get_index(const roaring_bitmap_t *bm, uint32_t x) {
+    int64_t index = 0;  // running total of the cardinalities of containers with a smaller key
+    const uint16_t xhigh = x >> 16;
+    int32_t high_idx = ra_get_index(&bm->high_low_container, xhigh);
+    if (high_idx < 0) return -1;  // negative lookup result: treated as "no container for xhigh"
+
+    for (int i = 0; i < bm->high_low_container.size; i++) {
+        uint32_t key = bm->high_low_container.keys[i];
+        if (xhigh > key) {
+            index +=
+                container_get_cardinality(bm->high_low_container.containers[i],
+                                          bm->high_low_container.typecodes[i]);
+        } else if (xhigh == key) {
+            int32_t low_idx = container_get_index(
+                bm->high_low_container.containers[high_idx],
+                bm->high_low_container.typecodes[high_idx], x & 0xFFFF);
+            if (low_idx < 0) return -1;  // low 16 bits of x not found in that container
+            return index + low_idx;
+        } else {
+            return -1;
+        }
+    }
+    return index;
+}
+
 /**
 * roaring_bitmap_smallest returns the smallest value in the set.
 * Returns UINT32_MAX if the set is empty.
@@ -2733,7 +2938,6 @@ uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1,
               length2 = x2->high_low_container.size;
     uint64_t answer = 0;
     int pos1 = 0, pos2 = 0;
-
     while (pos1 < length1 && pos2 < length2) {
         const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
         const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
@@ -2823,8 +3027,7 @@ bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_sta
     }
     int32_t is = ra_get_index(&r->high_low_container, hb_rs);
     int32_t ie = ra_get_index(&r->high_low_container, hb_re);
-    ie = (ie < 0 ? -ie - 1 : ie);
-    if ((is < 0) || ((ie - is) != span)) {
+    if ((ie < 0) || (is < 0) || ((ie - is) != span) || ie >= hlc_sz) {
        return false;
     }
     const uint32_t lb_rs = range_start & 0xFFFF;
@@ -2838,7 +3041,6 @@ bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_sta
     if (!container_contains_range(c, lb_rs, 1 << 16, type)) {
       return false;
     }
-    assert(ie < hlc_sz); // would indicate an algorithmic bug
     c = ra_get_container_at_index(&r->high_low_container, ie, &type);
     if (!container_contains_range(c, 0, lb_re, type)) {
         return false;
@@ -2911,7 +3113,7 @@ size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *rb) {
                 break;
             }
             default:
-                __builtin_unreachable();
+                roaring_unreachable;
         }
     }
     num_bytes += (2 + 2 + 1) * ra->size; // keys, counts, typecodes
@@ -2955,7 +3157,7 @@ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *rb, char *buf) {
                 break;
             }
             default:
-                __builtin_unreachable();
+                roaring_unreachable;
         }
     }
 
@@ -3001,7 +3203,7 @@ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *rb, char *buf) {
                 break;
             }
             default:
-                __builtin_unreachable();
+                roaring_unreachable;
         }
         memcpy(&count_zone[i], &count, 2);
     }
@@ -3139,6 +3341,204 @@ roaring_bitmap_frozen_view(const char *buf, size_t length) {
     return rb;
 }
 
+ALLOW_UNALIGNED
+roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf) {
+    char *start_of_buf = (char *) buf;  // offset_headers entries are applied relative to this
+    uint32_t cookie;
+    int32_t num_containers;
+    uint16_t *descriptive_headers;
+    uint32_t *offset_headers = NULL;
+    const char *run_flag_bitset = NULL;
+    bool hasrun = false;
+
+    // deserialize cookie; it selects one of the two header layouts handled below
+    memcpy(&cookie, buf, sizeof(uint32_t));
+    buf += sizeof(uint32_t);
+    if (cookie == SERIAL_COOKIE_NO_RUNCONTAINER) {
+        memcpy(&num_containers, buf, sizeof(int32_t));
+        buf += sizeof(int32_t);
+        descriptive_headers = (uint16_t *) buf;
+        buf += num_containers * 2 * sizeof(uint16_t);
+        offset_headers = (uint32_t *) buf;
+        buf += num_containers * sizeof(uint32_t);
+    } else if ((cookie & 0xFFFF) == SERIAL_COOKIE) {
+        num_containers = (cookie >> 16) + 1;  // container count is packed in the cookie's upper 16 bits, minus one
+        hasrun = true;
+        int32_t run_flag_bitset_size = (num_containers + 7) / 8;  // one flag bit per container, rounded up to bytes
+        run_flag_bitset = buf;
+        buf += run_flag_bitset_size;
+        descriptive_headers = (uint16_t *) buf;
+        buf += num_containers * 2 * sizeof(uint16_t);
+        if(num_containers >= NO_OFFSET_THRESHOLD) {
+            offset_headers = (uint32_t *) buf;
+            buf += num_containers * sizeof(uint32_t);
+        }
+    } else {
+        return NULL;
+    }
+
+    // calculate total size for allocation (first pass: count containers of each kind)
+    int32_t num_bitset_containers = 0;
+    int32_t num_run_containers = 0;
+    int32_t num_array_containers = 0;
+
+    for (int32_t i = 0; i < num_containers; i++) {
+        uint16_t tmp;
+        memcpy(&tmp, descriptive_headers + 2*i+1, sizeof(tmp));
+        uint32_t cardinality = tmp + 1;  // header stores cardinality minus one
+        bool isbitmap = (cardinality > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if(hasrun) {
+          if((run_flag_bitset[i / 8] & (1 << (i % 8))) != 0) {
+            isbitmap = false;
+            isrun = true;
+          }
+        }
+
+        if (isbitmap) {
+            num_bitset_containers++;
+        } else if (isrun) {
+            num_run_containers++;
+        } else {
+            num_array_containers++;
+        }
+    }
+
+    size_t alloc_size = 0;
+    alloc_size += sizeof(roaring_bitmap_t);
+    alloc_size += num_containers * sizeof(container_t*);
+    alloc_size += num_bitset_containers * sizeof(bitset_container_t);
+    alloc_size += num_run_containers * sizeof(run_container_t);
+    alloc_size += num_array_containers * sizeof(array_container_t);
+    alloc_size += num_containers * sizeof(uint16_t); // keys
+    alloc_size += num_containers * sizeof(uint8_t); // typecodes
+
+    // allocate bitmap and construct containers (everything comes out of this single allocation)
+    char *arena = (char *)roaring_malloc(alloc_size);
+    if (arena == NULL) {
+        return NULL;
+    }
+
+    roaring_bitmap_t *rb = (roaring_bitmap_t *)
+            arena_alloc(&arena, sizeof(roaring_bitmap_t));
+    rb->high_low_container.flags = ROARING_FLAG_FROZEN;
+    rb->high_low_container.allocation_size = num_containers;
+    rb->high_low_container.size = num_containers;
+    rb->high_low_container.containers =
+        (container_t **)arena_alloc(&arena,
+                                    sizeof(container_t*) * num_containers);
+    // NOTE(review): keys/typecodes are carved out before the container structs, which may leave those structs unaligned - confirm
+    uint16_t *keys = (uint16_t *)arena_alloc(&arena, num_containers * sizeof(uint16_t));
+    uint8_t *typecodes = (uint8_t *)arena_alloc(&arena, num_containers * sizeof(uint8_t));
+
+    rb->high_low_container.keys = keys;
+    rb->high_low_container.typecodes = typecodes;
+
+    for (int32_t i = 0; i < num_containers; i++) {
+        uint16_t tmp;
+        memcpy(&tmp, descriptive_headers + 2*i+1, sizeof(tmp));
+        int32_t cardinality = tmp + 1;
+        bool isbitmap = (cardinality > DEFAULT_MAX_SIZE);
+        bool isrun = false;
+        if(hasrun) {
+          if((run_flag_bitset[i / 8] & (1 << (i % 8))) != 0) {
+            isbitmap = false;
+            isrun = true;
+          }
+        }
+
+        keys[i] = descriptive_headers[2*i];
+
+        if (isbitmap) {
+            typecodes[i] = BITSET_CONTAINER_TYPE;
+            bitset_container_t *c = (bitset_container_t *)arena_alloc(&arena, sizeof(bitset_container_t));
+            c->cardinality = cardinality;
+            if(offset_headers != NULL) {
+                c->words = (uint64_t *) (start_of_buf + offset_headers[i]);  // points into buf, no copy
+            } else {
+                c->words = (uint64_t *) buf;
+                buf += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
+            }
+            rb->high_low_container.containers[i] = c;
+        } else if (isrun) {
+            typecodes[i] = RUN_CONTAINER_TYPE;
+            run_container_t *c = (run_container_t *)arena_alloc(&arena, sizeof(run_container_t));
+            c->capacity = cardinality;
+            uint16_t n_runs;
+            if(offset_headers != NULL) {
+                memcpy(&n_runs, start_of_buf + offset_headers[i], sizeof(uint16_t));
+                c->n_runs = n_runs;
+                c->runs = (rle16_t *) (start_of_buf + offset_headers[i] + sizeof(uint16_t));  // points into buf, no copy
+            } else {
+                memcpy(&n_runs, buf, sizeof(uint16_t));
+                c->n_runs = n_runs;
+                buf += sizeof(uint16_t);
+                c->runs = (rle16_t *) buf;
+                buf += c->n_runs * sizeof(rle16_t);
+            }
+            rb->high_low_container.containers[i] = c;
+        } else {
+            typecodes[i] = ARRAY_CONTAINER_TYPE;
+            array_container_t *c = (array_container_t *)arena_alloc(&arena, sizeof(array_container_t));
+            c->cardinality = cardinality;
+            c->capacity = cardinality;
+            if(offset_headers != NULL) {
+                c->array = (uint16_t *) (start_of_buf + offset_headers[i]);  // points into buf, no copy
+            } else {
+                c->array = (uint16_t *) buf;
+                buf += cardinality * sizeof(uint16_t);
+            }
+            rb->high_low_container.containers[i] = c;
+        }
+    }
+
+    return rb;
+}
+
+/**
+ * Converts r into the caller-provided bitset, resizing it so that the
+ * largest value of r fits. Returns false if the bitset cannot be resized.
+ */
+bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t * bitset) {
+    uint32_t max_value = roaring_bitmap_maximum(r);
+    // Bit max_value lives in word max_value / 64, so one more word than that
+    // is needed; (max_value + 63) / 64 is one short at multiples of 64.
+    size_t new_array_size = (size_t)((uint64_t)max_value / 64 + 1);
+    bool resize_ok = bitset_resize(bitset, new_array_size, true);
+    if(!resize_ok) { return false; }
+    const roaring_array_t *ra = &r->high_low_container;
+    for (int i = 0; i < ra->size; ++i) {
+        uint64_t* words = bitset->array + (ra->keys[i]<<10);  // 1024 words per container key
+        uint8_t type = ra->typecodes[i];
+        const container_t *c = ra->containers[i];
+        if(type == SHARED_CONTAINER_TYPE) {
+            c = container_unwrap_shared(c, &type);
+        }
+        switch (type) {
+          case BITSET_CONTAINER_TYPE: {
+            size_t max_word_index = new_array_size - (ra->keys[i]<<10);  // words left in the target
+            if(max_word_index > 1024) { max_word_index = 1024; }
+            const bitset_container_t *src = const_CAST_bitset(c);
+            memcpy(words, src->words, max_word_index * sizeof(uint64_t));
+          } break;
+          case ARRAY_CONTAINER_TYPE: {
+            const array_container_t *src = const_CAST_array(c);
+            bitset_set_list(words, src->array, src->cardinality);
+          } break;
+          case RUN_CONTAINER_TYPE: {
+            const run_container_t *src = const_CAST_run(c);
+            for (int32_t rlepos = 0; rlepos < src->n_runs; ++rlepos) {
+                rle16_t rle = src->runs[rlepos];
+                bitset_set_lenrange(words, rle.value, rle.length);
+            }
+          } break;
+          default:
+          roaring_unreachable;
+        }
+    }
+    return true;
+}
+
 #ifdef __cplusplus
 } } }  // extern "C" { namespace roaring {
 #endif
diff --git a/src/roaring_array.c b/src/roaring_array.c
index 2e1b2c671..7924aaeda 100644
--- a/src/roaring_array.c
+++ b/src/roaring_array.c
@@ -76,7 +76,10 @@ bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap) {
     if (!new_ra) return false;
     ra_init(new_ra);
 
-    if (cap > INT32_MAX) { return false; }
+    // Containers hold 64Ki elements, so 64Ki containers is enough to hold `0x10000 * 0x10000` (all 2^32) elements
+    if (cap > 0x10000) {
+        cap = 0x10000;
+    }
 
     if(cap > 0) {
       void *bigalloc = roaring_malloc(cap *
@@ -319,9 +322,8 @@ extern inline container_t *ra_get_container_at_index(
     const roaring_array_t *ra, uint16_t i,
     uint8_t *typecode);
 
-uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) {
-    return ra->keys[i];
-}
+extern inline uint16_t ra_get_key_at_index(const roaring_array_t *ra,
+                                           uint16_t i);
 
 extern inline int32_t ra_get_index(const roaring_array_t *ra, uint16_t x);
 
@@ -543,6 +545,7 @@ size_t ra_portable_size_in_bytes(const roaring_array_t *ra) {
     return count;
 }
 
+// This function is endian-sensitive.
 size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) {
     char *initbuf = buf;
     uint32_t startOffset = 0;
@@ -632,7 +635,7 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) {
         buf += sizeof(uint32_t);
     }
     if (size > (1<<16)) {
-       return 0; // logically impossible
+       return 0;
     }
     char *bitmapOfRunContainers = NULL;
     bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
@@ -691,14 +694,15 @@ size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) {
     return bytestotal;
 }
 
-
-// this function populates answer from the content of buf (reading up to maxbytes bytes).
+// This function populates answer from the content of buf (reading up to maxbytes bytes).
 // The function returns false if a properly serialized bitmap cannot be found.
-// if it returns true, readbytes is populated by how many bytes were read, we have that *readbytes <= maxbytes.
+// If it returns true, readbytes is populated by how many bytes were read, we have that *readbytes <= maxbytes.
+//
+// This function is endian-sensitive.
 bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const size_t maxbytes, size_t * readbytes) {
     *readbytes = sizeof(int32_t);// for cookie
     if(*readbytes > maxbytes) {
-      fprintf(stderr, "Ran out of bytes while reading first 4 bytes.\n");
+      // Ran out of bytes while reading first 4 bytes.
       return false;
     }
     uint32_t cookie;
@@ -706,8 +710,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
     buf += sizeof(uint32_t);
     if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
         cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
-        fprintf(stderr, "I failed to find one of the right cookies. Found %" PRIu32 "\n",
-                cookie);
+        // Failed to find one of the right cookies.
         return false;
     }
     int32_t size;
@@ -717,21 +720,19 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
     else {
         *readbytes += sizeof(int32_t);
         if(*readbytes > maxbytes) {
-          fprintf(stderr, "Ran out of bytes while reading second part of the cookie.\n");
+          // Ran out of bytes while reading second part of the cookie.
           return false;
         }
         memcpy(&size, buf, sizeof(int32_t));
         buf += sizeof(uint32_t);
     }
     if (size < 0) {
-       fprintf(stderr, "You cannot have a negative number of containers, the data must be corrupted: %" PRId32 "\n",
-                size);
-       return false; // logically impossible
+       // You cannot have a negative number of containers, the data must be corrupted.
+       return false;
     }
     if (size > (1<<16)) {
-       fprintf(stderr, "You cannot have so many containers, the data must be corrupted: %" PRId32 "\n",
-                size);
-       return false; // logically impossible
+       // You cannot have so many containers, the data must be corrupted.
+       return false;
     }
     const char *bitmapOfRunContainers = NULL;
     bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
@@ -739,7 +740,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
         int32_t s = (size + 7) / 8;
         *readbytes += s;
         if(*readbytes > maxbytes) {// data is corrupted?
-          fprintf(stderr, "Ran out of bytes while reading run bitmap.\n");
+          // Ran out of bytes while reading run bitmap.
           return false;
         }
         bitmapOfRunContainers = buf;
@@ -749,14 +750,14 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
 
     *readbytes += size * 2 * sizeof(uint16_t);
     if(*readbytes > maxbytes) {
-      fprintf(stderr, "Ran out of bytes while reading key-cardinality array.\n");
+      // Ran out of bytes while reading key-cardinality array.
       return false;
     }
     buf += size * 2 * sizeof(uint16_t);
 
     bool is_ok = ra_init_with_capacity(answer, size);
     if (!is_ok) {
-        fprintf(stderr, "Failed to allocate memory for roaring array. Bailing out.\n");
+        // Failed to allocate memory for roaring array. Bailing out.
         return false;
     }
 
@@ -768,7 +769,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
     if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
         *readbytes += size * 4;
         if(*readbytes > maxbytes) {// data is corrupted?
-          fprintf(stderr, "Ran out of bytes while reading offsets.\n");
+          // Ran out of bytes while reading offsets.
           ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
           return false;
         }
@@ -794,14 +795,14 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
             size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
             *readbytes += containersize;
             if(*readbytes > maxbytes) {
-              fprintf(stderr, "Running out of bytes while reading a bitset container.\n");
+              // Running out of bytes while reading a bitset container.
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
             // it is now safe to read
             bitset_container_t *c = bitset_container_create();
             if(c == NULL) {// memory allocation failure
-              fprintf(stderr, "Failed to allocate memory for a bitset container.\n");
+              // Failed to allocate memory for a bitset container.
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
@@ -813,7 +814,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
             // we check that the read is allowed
             *readbytes += sizeof(uint16_t);
             if(*readbytes > maxbytes) {
-              fprintf(stderr, "Running out of bytes while reading a run container (header).\n");
+              // Running out of bytes while reading a run container (header).
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
@@ -822,7 +823,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
             size_t containersize = n_runs * sizeof(rle16_t);
             *readbytes += containersize;
             if(*readbytes > maxbytes) {// data is corrupted?
-              fprintf(stderr, "Running out of bytes while reading a run container.\n");
+              // Running out of bytes while reading a run container.
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
@@ -830,7 +831,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
 
             run_container_t *c = run_container_create();
             if(c == NULL) {// memory allocation failure
-              fprintf(stderr, "Failed to allocate memory for a run container.\n");
+              // Failed to allocate memory for a run container.
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
@@ -843,7 +844,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
             size_t containersize = thiscard * sizeof(uint16_t);
             *readbytes += containersize;
             if(*readbytes > maxbytes) {// data is corrupted?
-              fprintf(stderr, "Running out of bytes while reading an array container.\n");
+              // Running out of bytes while reading an array container.
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
@@ -851,7 +852,7 @@ bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const siz
             array_container_t *c =
                 array_container_create_given_capacity(thiscard);
             if(c == NULL) {// memory allocation failure
-              fprintf(stderr, "Failed to allocate memory for an array container.\n");
+              // Failed to allocate memory for an array container.
               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
               return false;
             }
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 724c0047c..d7fd398ba 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -9,6 +9,7 @@ add_cpp_test(cpp_random_unit)
 add_cpp_test(cpp_example1)
 add_cpp_test(cpp_example2)
 add_c_test(c_example1)
+add_c_test(cbitset_unit)
 add_c_test(array_container_unit)
 add_c_test(bitset_container_unit)
 add_c_test(mixed_container_unit)
@@ -19,6 +20,24 @@ add_c_test(format_portability_unit)
 add_c_test(robust_deserialization_unit)
 add_c_test(container_comparison_unit)
 add_c_test(add_offset)
+find_package(Threads)  # optional: threads_unit is only built when this succeeds
+if(Threads_FOUND)
+  message(STATUS "Your system supports threads.")
+  add_executable(threads_unit threads_unit.cpp)
+  target_link_libraries(threads_unit PRIVATE roaring Threads::Threads)
+  if(ROARING_SANITIZE_THREADS)
+    # libtsan might be needed
+    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+      message(STATUS "Under Linux, you may need to install libtsan.")
+    endif()
+    target_compile_options(threads_unit PRIVATE -fsanitize=thread -fno-sanitize-recover=all)
+    target_link_options(threads_unit PRIVATE -fsanitize=thread -fno-sanitize-recover=all)
+    message(STATUS "Sanitizing threads.")
+  endif()
+  add_test(threads_unit threads_unit)
+else()
+  message(STATUS "Your system does not support threads.")
+endif()
 
 if (NOT WIN32)
 # We exclude POSIX tests from Microsoft Windows
@@ -39,8 +58,3 @@ endif()
 
 
 configure_file(${CMAKE_SOURCE_DIR}/tools/cmake/CTestCustom.cmake ${CMAKE_BINARY_DIR})
-
-set(BUILD_STATIC_LIB ON)
-import_dependency(vendor/cmocka https://cmocka.org/files/1.1/cmocka-1.1.5.tar.xz)
-add_subdirectory(vendor/cmocka)
-
diff --git a/tests/add_offset.c b/tests/add_offset.c
index 211b3ae50..edd515d97 100644
--- a/tests/add_offset.c
+++ b/tests/add_offset.c
@@ -41,7 +41,7 @@ static int setup_container_add_offset_test(void **state_) {
     switch (test.type) {
     case BITSET_CONTAINER_TYPE:
         bc = bitset_container_create();
-        assert(bc != NULL);
+        assert_true(bc != NULL);
         for (size_t i = 0; i < test.n_values; i++) {
             bitset_container_add(bc, test.values[i]);
         }
@@ -49,7 +49,7 @@ static int setup_container_add_offset_test(void **state_) {
         break;
     case ARRAY_CONTAINER_TYPE:
         ac = array_container_create();
-        assert(ac != NULL);
+        assert_true(ac != NULL);
         for (size_t i = 0; i < test.n_values; i++) {
             array_container_add(ac, test.values[i]);
         }
@@ -57,14 +57,14 @@ static int setup_container_add_offset_test(void **state_) {
         break;
     case RUN_CONTAINER_TYPE:
         rc = run_container_create();
-        assert(rc != NULL);
+        assert_true(rc != NULL);
         for (size_t i = 0; i < test.n_values; i++) {
             run_container_add(rc, test.values[i]);
         }
         state->in = rc;
         break;
     default:
-        assert(false); // To catch buggy tests.
+        assert_true(false); // To catch buggy tests.
     }
 
     return 0;
@@ -104,7 +104,7 @@ static void container_add_offset_test(void **state_) {
     uint8_t type = test.type;
     int card_lo = 0, card_hi = 0;
 
-    assert(test.n_values > 0);
+    assert_true(test.n_values > 0);
 
     container_add_offset(state->in, type, &state->lo, &state->hi, offset);
     container_add_offset(state->in, type, NULL, &state->hi_only, offset);
@@ -158,7 +158,7 @@ static int setup_roaring_add_offset_test(void **state_) {
     roaring_add_offset_test_case_t test = state->test_case;
 
     state->in = roaring_bitmap_of_ptr(test.n_values, test.values);
-    assert(state->in != NULL);
+    assert_true(state->in != NULL);
 
     return 0;
 }
@@ -314,7 +314,7 @@ int main() {
     dense_bitmap[i++] = 400000;
     dense_bitmap[i++] = 1400000;
 
-    assert(i == ARRAY_SIZE(dense_bitmap));
+    assert_true(i == ARRAY_SIZE(dense_bitmap));
 
     // NB: only add positive offsets, the test function takes care of also
     // running a negative test for that offset.
@@ -338,7 +338,7 @@ int main() {
         roaring_add_offset_test_state_t state = ROARING_ADD_OFFSET_TEST_CASE(dense_bitmap, offset);
         roaring_state[i++] = state;
     }
-    assert(i <= ARRAY_SIZE(roaring_state));
+    assert_true(i <= ARRAY_SIZE(roaring_state));
 
     i = j = 0;
     struct CMUnitTest tests[ARRAY_SIZE(container_state)+ARRAY_SIZE(roaring_state)];
diff --git a/tests/array_container_unit.c b/tests/array_container_unit.c
index 26f1b57a0..09bb6d03a 100644
--- a/tests/array_container_unit.c
+++ b/tests/array_container_unit.c
@@ -8,6 +8,8 @@
 #include <stdlib.h>
 
 #include <roaring/containers/array.h>
+#include <roaring/containers/mixed_equal.h>
+#include <roaring/containers/bitset.h>
 #include <roaring/misc/configreport.h>
 
 #ifdef __cplusplus  // stronger type checking errors if C built in C++ mode
@@ -192,8 +194,189 @@ DEFINE_TEST(capacity_test) {
     array_container_free(array);
 }
 
+
+/* This is a fixed-increment version of Java 8's SplittableRandom generator
+   See http://dx.doi.org/10.1145/2714064.2660195 and
+   http://docs.oracle.com/javase/8/docs/api/java/util/SplittableRandom.html */
+
+// state for splitmix64
+uint64_t splitmix64_x; /* The state can be seeded with any value. */
+
+// Stores seed in the file-level state splitmix64_x; call this before calling splitmix64().
+static inline void splitmix64_seed(uint64_t seed) { splitmix64_x = seed; }
+
+// floor( ( (1+sqrt(5))/2 ) * 2**64 MOD 2**64)
+#define GOLDEN_GAMMA UINT64_C(0x9E3779B97F4A7C15)
+
+// Advances *seed by GOLDEN_GAMMA and returns a 64-bit mix of the new state.
+// Compared by D. Lemire against the Java SplittableRandom source:
+// http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/8-b132/java/util/SplittableRandom.java#SplittableRandom.0gamma
+static inline uint64_t splitmix64_r(uint64_t *seed) {
+  uint64_t mixed = (*seed += GOLDEN_GAMMA);
+  // David Stafford's Mix13 for MurmurHash3's 64-bit finalizer
+  mixed = (mixed ^ (mixed >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
+  mixed = (mixed ^ (mixed >> 27)) * UINT64_C(0x94D049BB133111EB);
+  return mixed ^ (mixed >> 31);
+}
+
+static inline uint64_t splitmix64(void) {  // next value drawn from the file-level state splitmix64_x
+    return splitmix64_r(&splitmix64_x);
+}
+
+size_t populate(uint16_t* buffer, size_t maxsize) {  // fills buffer[0..length) with random values, returns length
+    size_t length = (maxsize == 0) ? 0 : splitmix64() % maxsize;  // length < maxsize; % 0 would be undefined
+    for(size_t i = 0; i < length; i++) {
+        buffer[i] = (uint16_t)splitmix64();
+    }
+    return length;
+}
+
+DEFINE_TEST(mini_fuzz_array_container_intersection_inplace) {
+    // Cross-checks array_container_intersection_inplace against bitset_container_and_nocard on random inputs.
+    splitmix64_seed(12345);
+    uint16_t* buffer1 = (uint16_t*) malloc(DEFAULT_MAX_SIZE * sizeof(uint16_t));
+    uint16_t* buffer2 = (uint16_t*) malloc(DEFAULT_MAX_SIZE * sizeof(uint16_t));
+    uint16_t* buffer3 = (uint16_t*) malloc(DEFAULT_MAX_SIZE * sizeof(uint16_t));
+    assert_true(buffer1 != NULL && buffer2 != NULL && buffer3 != NULL);  // populate() writes into them
+    for(size_t z = 0; z < 3000; z++) {
+        array_container_t* array1 = array_container_create();
+        array_container_t* array2 = array_container_create();
+        array_container_t* array3 = array_container_create();
+        bitset_container_t* bitset1 = bitset_container_create();
+        bitset_container_t* bitset2 = bitset_container_create();
+        bitset_container_t* bitset3 = bitset_container_create();
+        size_t l1 = populate(buffer1, DEFAULT_MAX_SIZE);
+        size_t l2 = populate(buffer2, DEFAULT_MAX_SIZE);
+        size_t l3 = populate(buffer3, DEFAULT_MAX_SIZE);
+
+        for (uint32_t i = 0; i < l1; i++) {
+            array_container_add(array1, buffer1[i]);
+            bitset_container_set(bitset1, buffer1[i]);
+        }
+        for (uint32_t i = 0; i < l2; i++) {
+            array_container_add(array2, buffer2[i]);
+            bitset_container_set(bitset2, buffer2[i]);
+        }
+        for (uint32_t i = 0; i < l3; i++) {
+            array_container_add(array3, buffer3[i]);
+            bitset_container_set(bitset3, buffer3[i]);
+        }
+        bitset1->cardinality = BITSET_UNKNOWN_CARDINALITY;  // NOTE(review): presumably not tracked through the *_nocard calls - confirm
+
+        array_container_intersection_inplace(array1, array2);
+        bitset_container_and_nocard(bitset1, bitset2, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+
+        array_container_intersection_inplace(array1, array3);
+        bitset_container_and_nocard(bitset1, bitset3, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+        // Add the first buffer's values again on top of the intersection, then repeat both intersections.
+        for (uint32_t i = 0; i < l1; i++) {
+            array_container_add(array1, buffer1[i]);
+            bitset_container_set(bitset1, buffer1[i]);
+        }
+        bitset1->cardinality = BITSET_UNKNOWN_CARDINALITY;
+        assert_true(array_container_equal_bitset(array1, bitset1));
+
+        array_container_intersection_inplace(array1, array2);
+        bitset_container_and_nocard(bitset1, bitset2, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+
+        array_container_intersection_inplace(array1, array3);
+        bitset_container_and_nocard(bitset1, bitset3, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+        array_container_free(array1);
+        array_container_free(array2);
+        array_container_free(array3);
+        bitset_container_free(bitset1);
+        bitset_container_free(bitset2);
+        bitset_container_free(bitset3);
+    }
+    free(buffer1);
+    free(buffer2);
+    free(buffer3);
+}
+
+
+
+DEFINE_TEST(mini_fuzz_recycle_array_container_intersection_inplace) {
+    // Same cross-check as the non-recycling variant, but one set of containers is reused across all rounds.
+    splitmix64_seed(12345);
+    uint16_t* buffer1 = (uint16_t*) malloc(DEFAULT_MAX_SIZE * sizeof(uint16_t));
+    uint16_t* buffer2 = (uint16_t*) malloc(DEFAULT_MAX_SIZE * sizeof(uint16_t));
+    uint16_t* buffer3 = (uint16_t*) malloc(DEFAULT_MAX_SIZE * sizeof(uint16_t));
+    assert_true(buffer1 != NULL && buffer2 != NULL && buffer3 != NULL);  // populate() writes into them
+    array_container_t* array1 = array_container_create();
+    array_container_t* array2 = array_container_create();
+    array_container_t* array3 = array_container_create();
+
+    bitset_container_t* bitset1 = bitset_container_create();
+    bitset_container_t* bitset2 = bitset_container_create();
+    bitset_container_t* bitset3 = bitset_container_create();
+    for(size_t z = 0; z < 3000; z++) {
+        bitset_container_clear(bitset1);
+        bitset_container_clear(bitset2);
+        bitset_container_clear(bitset3);
+        array1->cardinality = 0;  // reset in place instead of re-creating the containers
+        array2->cardinality = 0;
+        array3->cardinality = 0;
+        size_t l1 = populate(buffer1, DEFAULT_MAX_SIZE);
+        size_t l2 = populate(buffer2, DEFAULT_MAX_SIZE);
+        size_t l3 = populate(buffer3, DEFAULT_MAX_SIZE);
+
+        for (uint32_t i = 0; i < l1; i++) {
+            array_container_add(array1, buffer1[i]);
+            bitset_container_set(bitset1, buffer1[i]);
+        }
+        for (uint32_t i = 0; i < l2; i++) {
+            array_container_add(array2, buffer2[i]);
+            bitset_container_set(bitset2, buffer2[i]);
+        }
+        for (uint32_t i = 0; i < l3; i++) {
+            array_container_add(array3, buffer3[i]);
+            bitset_container_set(bitset3, buffer3[i]);
+        }
+        bitset1->cardinality = BITSET_UNKNOWN_CARDINALITY;  // NOTE(review): presumably not tracked through the *_nocard calls - confirm
+
+        array_container_intersection_inplace(array1, array2);
+        bitset_container_and_nocard(bitset1, bitset2, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+
+        array_container_intersection_inplace(array1, array3);
+        bitset_container_and_nocard(bitset1, bitset3, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+        // Add the first buffer's values again on top of the intersection, then repeat both intersections.
+        for (uint32_t i = 0; i < l1; i++) {
+            array_container_add(array1, buffer1[i]);
+            bitset_container_set(bitset1, buffer1[i]);
+        }
+        bitset1->cardinality = BITSET_UNKNOWN_CARDINALITY;
+        assert_true(array_container_equal_bitset(array1, bitset1));
+
+        array_container_intersection_inplace(array1, array2);
+        bitset_container_and_nocard(bitset1, bitset2, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+
+        array_container_intersection_inplace(array1, array3);
+        bitset_container_and_nocard(bitset1, bitset3, bitset1);
+        assert_true(array_container_equal_bitset(array1, bitset1));
+    }
+    array_container_free(array1);
+    array_container_free(array2);
+    array_container_free(array3);
+    bitset_container_free(bitset1);
+    bitset_container_free(bitset2);
+    bitset_container_free(bitset3);
+
+    free(buffer1);
+    free(buffer2);
+    free(buffer3);
+}
+
 int main() {
     const struct CMUnitTest tests[] = {
+        cmocka_unit_test(mini_fuzz_array_container_intersection_inplace),
+        cmocka_unit_test(mini_fuzz_recycle_array_container_intersection_inplace),
         cmocka_unit_test(printf_test), cmocka_unit_test(add_contains_test),
         cmocka_unit_test(and_or_test), cmocka_unit_test(to_uint32_array_test),
         cmocka_unit_test(select_test),
diff --git a/tests/bitset_container_unit.c b/tests/bitset_container_unit.c
index 7de6f6826..4e3f1bc97 100644
--- a/tests/bitset_container_unit.c
+++ b/tests/bitset_container_unit.c
@@ -17,6 +17,17 @@
 
 #include "test.h"
 
+DEFINE_TEST(hamming_test) {
+  assert_true(roaring_hamming(~UINT64_C(0)) == 64);  // every bit set
+  for (int bit = 0; bit < 64; bit++) {  // exactly one bit set
+    assert_true(roaring_hamming(UINT64_C(1) << bit) == 1);
+  }
+  for (int hi = 0; hi < 64; hi++) {  // two bits, which coincide when hi == lo
+    for (int lo = 0; lo < 64; lo++) {
+      assert_true(roaring_hamming((UINT64_C(1) << hi) | (UINT64_C(1) << lo)) == ((hi == lo) ? 1 : 2));
+    }
+  }
+}
 
 DEFINE_TEST(test_bitset_lenrange_cardinality) {
   uint64_t words[] = {~UINT64_C(0), ~UINT64_C(0), ~UINT64_C(0), ~UINT64_C(0), 0, 0, 0, 0};
@@ -79,7 +90,7 @@ DEFINE_TEST(set_get_test) {
                      (1 << 16) / 3 + 1);
 
     for (size_t x = 0; x < 1 << 16; x += 3) {
-        bitset_container_unset(B, x);
+        bitset_container_remove(B, x);
     }
 
     assert_int_equal(bitset_container_cardinality(B), 0);
@@ -93,34 +104,81 @@ DEFINE_TEST(and_or_test) {
     bitset_container_t* B2 = bitset_container_create();
     bitset_container_t* BI = bitset_container_create();
     bitset_container_t* BO = bitset_container_create();
+    assert_true(bitset_container_compute_cardinality(B1) == 0);
+    assert_true(bitset_container_compute_cardinality(B2) == 0);
+    assert_true(bitset_container_compute_cardinality(BI) == 0);
+    assert_true(bitset_container_compute_cardinality(BO) == 0);
 
     assert_non_null(B1);
     assert_non_null(B2);
     assert_non_null(BI);
     assert_non_null(BO);
 
-    for (size_t x = 0; x < (1 << 16); x += 3) {
+    size_t max_value = 60000;
+
+    int b1_count = 0;
+    int bi_count = 0;
+    for (size_t x = 0; x < max_value; x += 3) {
         bitset_container_set(B1, x);
         bitset_container_set(BI, x);
+        b1_count++;
+        bi_count++;
+    }
+    for (size_t x = 0; x < max_value; x += 3) {
+        assert_true(bitset_container_get(B1, x));
+        assert_true(bitset_container_get(BI, x));
     }
 
+    assert_true(bitset_container_compute_cardinality(B1) == b1_count);
+    assert_true(bitset_container_compute_cardinality(BI) == bi_count);
+
+    int b2_count = 0;
     // important: 62 is not divisible by 3
-    for (size_t x = 0; x < (1 << 16); x += 62) {
+    for (size_t x = 0; x < max_value; x += 62) {
+        bi_count += !bitset_container_get(BI, x);
+
         bitset_container_set(B2, x);
         bitset_container_set(BI, x);
+        b2_count++;
     }
 
-    for (size_t x = 0; x < (1 << 16); x += 62 * 3) {
+    assert_true(bitset_container_compute_cardinality(B2) == b2_count);
+    assert_true(bitset_container_compute_cardinality(BI) == bi_count);
+    int bo_count = 0;
+    for (size_t x = 0; x < max_value; x += 62 * 3) {
         bitset_container_set(BO, x);
+        bo_count++;
     }
 
+    assert_true(bitset_container_compute_cardinality(BO) == bo_count);
+    assert_true(bitset_container_compute_cardinality(BI) == bi_count);
     const int card_union = bitset_container_compute_cardinality(BI);
     const int card_inter = bitset_container_compute_cardinality(BO);
-
-    bitset_container_and_nocard(B1, B2, BI);
+    assert_true(bitset_container_compute_cardinality(BI) == card_union);
+    assert_true(bi_count == card_union);
+    assert_true(bitset_container_compute_cardinality(BO) == bo_count);
+    assert_true(bitset_container_compute_cardinality(BO) == bitset_container_compute_cardinality(BO));
+    assert_true(card_inter == bo_count);
+    bitset_container_printf(B1);  // does it crash?
+    bitset_container_printf(B2);  // does it crash?
+    bitset_container_printf(BI);  // does it crash?
+
+    bitset_container_andnot_nocard(B1, B2, BI);
+
+    bitset_container_printf(B1);  // does it crash?
+    bitset_container_printf(B2);  // does it crash?
+    bitset_container_printf(BI);  // does it crash?
+    int interc = 0;
+    for (size_t x = 0; x < max_value; x ++) {
+        bool in1 = bitset_container_get(B1, x);
+        bool in2 = bitset_container_get(B2, x);
+        bool ini = bitset_container_get(BI, x);
+        assert_true(ini == (in1 & !in2));
+        interc += ini;
+    }
+    assert_true(bitset_container_compute_cardinality(BI) == interc);
     assert_int_not_equal(bitset_container_compute_cardinality(BI), card_union);
     assert_int_not_equal(bitset_container_and(B1, B2, BI), card_union);
-
     bitset_container_or_nocard(B1, B2, BO);
     assert_int_not_equal(bitset_container_compute_cardinality(BO), card_inter);
     assert_int_not_equal(bitset_container_or(B1, B2, BO), card_inter);
@@ -154,7 +212,7 @@ DEFINE_TEST(xor_test) {
     }
 
     for (size_t x = 0; x < (1 << 16); x += 62 * 3) {
-        bitset_container_unset(BI, x);
+        bitset_container_remove(BI, x);
     }
 
     bitset_container_xor(B1, B2, TMP);
@@ -185,7 +243,7 @@ DEFINE_TEST(andnot_test) {
     // important: 62 is not divisible by 3
     for (size_t x = 0; x < (1 << 16); x += 62) {
         bitset_container_set(B2, x);
-        bitset_container_unset(BI, x);
+        bitset_container_remove(BI, x);
     }
 
     const int expected = bitset_container_compute_cardinality(BI);
@@ -255,6 +313,7 @@ DEFINE_TEST(select_test) {
 
 int main() {
     const struct CMUnitTest tests[] = {
+        cmocka_unit_test(hamming_test),
         cmocka_unit_test(test_bitset_lenrange_cardinality),
         cmocka_unit_test(printf_test), cmocka_unit_test(set_get_test),
         cmocka_unit_test(and_or_test), cmocka_unit_test(xor_test),
diff --git a/tests/c_example1.c b/tests/c_example1.c
index 833eeb9ce..dfc594061 100644
--- a/tests/c_example1.c
+++ b/tests/c_example1.c
@@ -1,7 +1,9 @@
 #include <roaring/roaring.h>
+#include <roaring/misc/configreport.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
+#include "test.h"
 
 bool roaring_iterator_sumall(uint32_t value, void *param) {
     *(uint32_t *)param += value;
@@ -9,12 +11,13 @@ bool roaring_iterator_sumall(uint32_t value, void *param) {
 }
 
 int main() {
+    tellmeall();
     // create a new empty bitmap
     roaring_bitmap_t *r1 = roaring_bitmap_create();
     // then we can add values
     for (uint32_t i = 100; i < 1000; i++) roaring_bitmap_add(r1, i);
     // check whether a value is contained
-    assert(roaring_bitmap_contains(r1, 500));
+    assert_true(roaring_bitmap_contains(r1, 500));
     // compute how many bits there are:
     uint32_t cardinality = roaring_bitmap_get_cardinality(r1);
     printf("Cardinality = %d \n", cardinality);
@@ -26,7 +29,6 @@ int main() {
     uint32_t expectedsizerun = roaring_bitmap_portable_size_in_bytes(r1);
     printf("size before run optimize %d bytes, and after %d bytes\n",
            expectedsizebasic, expectedsizerun);
-
     // create a new bitmap containing the values {1,2,3,5,6}
     roaring_bitmap_t *r2 = roaring_bitmap_of(5, 1, 2, 3, 5, 6);
     roaring_bitmap_printf(r2);  // print it
@@ -38,24 +40,24 @@ int main() {
     // we can also go in reverse and go from arrays to bitmaps
     uint64_t card1 = roaring_bitmap_get_cardinality(r1);
     uint32_t *arr1 = (uint32_t *)malloc(card1 * sizeof(uint32_t));
-    assert(arr1 != NULL);
+    assert_true(arr1 != NULL);
     roaring_bitmap_to_uint32_array(r1, arr1);
     roaring_bitmap_t *r1f = roaring_bitmap_of_ptr(card1, arr1);
     free(arr1);
-    assert(roaring_bitmap_equals(r1, r1f));  // what we recover is equal
+    assert_true(roaring_bitmap_equals(r1, r1f));  // what we recover is equal
     roaring_bitmap_free(r1f);
 
     // we can go from arrays to bitmaps from "offset" by "limit"
     size_t offset = 100;
     size_t limit = 1000;
     uint32_t *arr3 = (uint32_t *)malloc(limit * sizeof(uint32_t));
-    assert(arr3 != NULL);
+    assert_true(arr3 != NULL);
     roaring_bitmap_range_uint32_array(r1, offset, limit, arr3);
     free(arr3);
 
     // we can copy and compare bitmaps
     roaring_bitmap_t *z = roaring_bitmap_copy(r3);
-    assert(roaring_bitmap_equals(r3, z));  // what we recover is equal
+    assert_true(roaring_bitmap_equals(r3, z));  // what we recover is equal
     roaring_bitmap_free(z);
 
     // we can compute union two-by-two
@@ -65,12 +67,12 @@ int main() {
     // we can compute a big union
     const roaring_bitmap_t *allmybitmaps[] = {r1, r2, r3};
     roaring_bitmap_t *bigunion = roaring_bitmap_or_many(3, allmybitmaps);
-    assert(
+    assert_true(
         roaring_bitmap_equals(r1_2_3, bigunion));  // what we recover is equal
     // can also do the big union with a heap
     roaring_bitmap_t *bigunionheap =
         roaring_bitmap_or_many_heap(3, allmybitmaps);
-    assert(roaring_bitmap_equals(r1_2_3, bigunionheap));
+    assert_true(roaring_bitmap_equals(r1_2_3, bigunionheap));
 
     roaring_bitmap_free(r1_2_3);
     roaring_bitmap_free(bigunion);
@@ -84,19 +86,36 @@ int main() {
     uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
     char *serializedbytes = (char*)malloc(expectedsize);
     roaring_bitmap_portable_serialize(r1, serializedbytes);
-    roaring_bitmap_t *t = roaring_bitmap_portable_deserialize(serializedbytes);
-    assert(roaring_bitmap_equals(r1, t));  // what we recover is equal
+    roaring_bitmap_t *t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize);
+    if(t == NULL) { return EXIT_FAILURE; }
+    const char *reason = NULL;
+    if (!roaring_bitmap_internal_validate(t, &reason)) {
+        return EXIT_FAILURE;
+    }
+    assert_true(roaring_bitmap_equals(r1, t));  // what we recover is equal
     roaring_bitmap_free(t);
     // we can also check whether there is a bitmap at a memory location without
     // reading it
     size_t sizeofbitmap =
         roaring_bitmap_portable_deserialize_size(serializedbytes, expectedsize);
-    printf("sizeofbitmap = %zu \n", sizeofbitmap);
-    assert(sizeofbitmap ==
+    printf("\nsizeofbitmap = %zu \n", sizeofbitmap);
+    assert_true(sizeofbitmap ==
            expectedsize);  // sizeofbitmap would be zero if no bitmap were found
     // we can also read the bitmap "safely" by specifying a byte size limit:
     t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize);
-    assert(roaring_bitmap_equals(r1, t));  // what we recover is equal
+    if(t == NULL) {
+        printf("Problem during deserialization.\n");
+        // We could clear any memory and close any file here.
+        return EXIT_FAILURE;
+    }
+    // We can validate the bitmap we recovered to make sure it is proper.
+    const char *reason_failure = NULL;
+    if (!roaring_bitmap_internal_validate(t, &reason_failure)) {
+        printf("safely deserialized invalid bitmap: %s\n", reason_failure);
+        // We could clear any memory and close any file here.
+        return EXIT_FAILURE;
+    }
+    assert_true(roaring_bitmap_equals(r1, t));  // what we recover is equal
     roaring_bitmap_free(t);
 
     free(serializedbytes);
@@ -135,5 +154,6 @@ int main() {
     roaring_bitmap_free(r1);
     roaring_bitmap_free(r2);
     roaring_bitmap_free(r3);
+    printf("Success.\n");
     return EXIT_SUCCESS;
 }
\ No newline at end of file
diff --git a/tests/cbitset_unit.c b/tests/cbitset_unit.c
new file mode 100644
index 000000000..d1b3f8edd
--- /dev/null
+++ b/tests/cbitset_unit.c
@@ -0,0 +1,277 @@
+#include <assert.h>
+#include <roaring/bitset/bitset.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "test.h"
+
+int compute_cardinality(bitset_t *b) {  // counts hits of bitset_next_set_bit
+    size_t k = 0;
+    for (size_t i = 0; bitset_next_set_bit(b, &i); i++) {
+        k += 1;
+    }
+    return k;  // size_t count is narrowed to the int return type
+}
+
+void test_iterate() {  // sets every third bit, then walks the bits in order
+    bitset_t *b = bitset_create();
+    for (int k = 0; k < 1000; ++k) bitset_set(b, 3 * k);
+    assert_true(bitset_count(b) == 1000);
+    assert_true(compute_cardinality(b) == 1000);
+    size_t k = 0;  // index the next iteration step is expected to report
+    for (size_t i = 0; bitset_next_set_bit(b, &i); i++) {
+        assert_true(i == k);
+        k += 3;
+    }
+    assert_true(k == 3000);  // 1000 steps of 3: every multiple was visited
+    bitset_free(b);
+}
+
+bool increment(size_t value, void *param) {  // callback for bitset_for_each
+    size_t k;
+    memcpy(&k, param, sizeof(size_t));  // load the expected value from param
+    assert_true(value == k);
+    k += 3;
+    memcpy(param, &k, sizeof(size_t));  // store the next expected value
+    return true;
+}
+
+void test_iterate2() {  // same check as test_iterate, via bitset_for_each
+    bitset_t *b = bitset_create();
+    for (int k = 0; k < 1000; ++k) bitset_set(b, 3 * k);
+    assert_true(compute_cardinality(b) == 1000);
+    assert_true(bitset_count(b) == 1000);
+    size_t k = 0;  // state read and advanced by the increment() callback
+    bitset_for_each(b, increment, &k);
+    assert_true(k == 3000);  // callback advanced k by 3 a total of 1000 times
+    bitset_free(b);
+}
+
+void test_construct() {  // bitset_get must be true exactly on multiples of 3
+    bitset_t *b = bitset_create();
+    for (int k = 0; k < 1000; ++k) bitset_set(b, 3 * k);
+    assert_true(compute_cardinality(b) == 1000);
+    assert_true(bitset_count(b) == 1000);
+    for (int k = 0; k < 3 * 1000; ++k)
+        assert_true(bitset_get(b, k) == (k / 3 * 3 == k));  // k % 3 == 0
+    bitset_free(b);
+}
+
+void test_max_min() {  // min stays at the first bit, max follows the newest
+    bitset_t *b = bitset_create();
+    for (size_t k = 100; k < 1000; ++k) {
+        bitset_set(b, 3 * k);
+        assert_true(bitset_minimum(b) == 3 * 100);
+        assert_true(bitset_maximum(b) == 3 * k);
+    }
+    bitset_free(b);
+}
+
+void test_shift_left() {  // every set bit must move up by sh, none lost
+    for (size_t sh = 0; sh < 256; sh++) {
+        bitset_t *b = bitset_create();
+        int power = 3;  // stride between consecutive set bits
+        size_t s1 = 100;
+        size_t s2 = 5000;
+        for (size_t k = s1; k < s2; ++k) {
+            bitset_set(b, power * k);
+        }
+        int mycount = bitset_count(b);
+        assert_true(compute_cardinality(b) == mycount);
+        bitset_shift_left(b, sh);
+        assert_true(bitset_count(b) == (size_t)mycount);  // count unchanged
+        assert_true(compute_cardinality(b) == mycount);
+        for (size_t k = s1; k < s2; ++k) {
+            assert_true(bitset_get(b, power * k + sh));
+        }
+        bitset_free(b);
+    }
+}
+
+void test_set_to_val() {  // the last bitset_set_to_value call per bit wins
+    bitset_t *b = bitset_create();
+    bitset_set_to_value(b, 1, true);
+    bitset_set_to_value(b, 1, false);  // bit 1 ends cleared
+    bitset_set_to_value(b, 10, false);
+    bitset_set_to_value(b, 10, true);  // bit 10 ends set
+    assert_true(bitset_get(b, 10));
+    assert_true(!bitset_get(b, 1));
+    bitset_free(b);
+}
+
+void test_shift_right() {  // every set bit must move down by sh, none lost
+    for (size_t sh = 0; sh < 256; sh++) {
+        bitset_t *b = bitset_create();
+        int power = 3;  // stride between consecutive set bits
+        size_t s1 = 100 + sh;  // keeps power * k - sh non-negative below
+        size_t s2 = s1 + 5000;
+        for (size_t k = s1; k < s2; ++k) {
+            bitset_set(b, power * k);
+        }
+        size_t mycount = bitset_count(b);
+        bitset_shift_right(b, sh);
+        assert_true(bitset_count(b) == mycount);  // count unchanged
+        for (size_t k = s1; k < s2; ++k) {
+            assert_true(bitset_get(b, power * k - sh));
+        }
+        bitset_free(b);
+    }
+}
+
+void test_union_intersection() {  // exercises the in-place set operations
+    bitset_t *b1 = bitset_create();
+    bitset_t *b2 = bitset_create();
+
+    for (int k = 0; k < 1000; ++k) {
+        bitset_set(b1, 2 * k);  // b1 holds the even values
+        bitset_set(b2, 2 * k + 1);  // b2 holds the odd values
+    }
+    // applying the same xor twice must restore the original bitset
+    bitset_inplace_symmetric_difference(b1, b2);
+    assert_true(bitset_count(b1) == 2000);
+    bitset_inplace_symmetric_difference(b1, b2);
+    assert_true(bitset_count(b1) == 1000);
+    bitset_inplace_difference(b1, b2);  // should make no difference
+    assert_true(bitset_count(b1) == 1000);
+    bitset_inplace_union(b1, b2);
+    assert_true(bitset_count(b1) == 2000);
+    bitset_inplace_intersection(b1, b2);  // b1 now holds the same bits as b2
+    assert_true(bitset_count(b1) == 1000);
+    bitset_inplace_difference(b1, b2);
+    assert_true(bitset_count(b1) == 0);
+    bitset_inplace_union(b1, b2);  // b1 holds b2's bits again
+    bitset_inplace_difference(b2, b1);  // so this empties b2
+    assert_true(bitset_count(b2) == 0);
+    bitset_free(b1);
+    bitset_free(b2);
+}
+
+void test_counts() {  // checks the count-only intersection/union helpers
+    bitset_t *b1 = bitset_create();
+    bitset_t *b2 = bitset_create();
+
+    for (int k = 0; k < 1000; ++k) {
+        bitset_set(b1, 2 * k);  // multiples of 2 below 2000
+        bitset_set(b2, 3 * k);  // multiples of 3 below 3000
+    }
+    assert_true(bitset_intersection_count(b1, b2) == 334);  // multiples of 6
+    assert_true(bitset_union_count(b1, b2) == 1666);  // 1000 + 1000 - 334
+    bitset_free(b1);
+    bitset_free(b2);
+}
+
+/* Creates 2 bitsets, one containing even numbers the other odds.
+Checks bitsets_disjoint() returns that they are disjoint, then sets a common
+bit in both sets and checks that they are no longer disjoint. */
+void test_disjoint() {
+    bitset_t *evens = bitset_create();
+    bitset_t *odds = bitset_create();
+
+    for (int i = 0; i < 1000; i++) {
+        if (i % 2 == 0)
+            bitset_set(evens, i);
+        else
+            bitset_set(odds, i);
+    }
+
+    assert_true(bitsets_disjoint(evens, odds));
+
+    bitset_set(evens, 501);  // 501 is odd, so this creates the overlap
+    bitset_set(odds, 501);  // already set by the loop above
+
+    assert_true(!bitsets_disjoint(evens, odds));
+
+    bitset_free(evens);
+    bitset_free(odds);
+}
+
+/* Creates 2 bitsets, one containing even numbers the other odds.
+Checks that bitsets_intersect() returns that they do not intersect, then sets
+a common bit and checks that they now intersect. */
+void test_intersects() {
+    bitset_t *evens = bitset_create();
+    bitset_t *odds = bitset_create();
+
+    for (int i = 0; i < 1000; i++) {
+        if (i % 2 == 0)
+            bitset_set(evens, i);
+        else
+            bitset_set(odds, i);
+    }
+
+    assert_true(!bitsets_intersect(evens, odds));
+
+    bitset_set(evens, 1001);  // 1001 is beyond the 0..999 range filled above
+    bitset_set(odds, 1001);
+
+    assert_true(bitsets_intersect(evens, odds));
+
+    bitset_free(evens);
+    bitset_free(odds);
+}
+/* Create 2 bitsets with different capacity, where the bigger superset
+contains the subset bits plus one additional bit at subset_size + 1.
+Checks that the bitset_contains_all() returns true when checking if
+the superset contains all the subset bits, and false in the opposite case. */
+void test_contains_all_different_sizes() {
+    const size_t superset_size = 10;
+    const size_t subset_size = 5;
+
+    bitset_t *superset = bitset_create_with_capacity(superset_size);
+    bitset_t *subset = bitset_create_with_capacity(subset_size);
+
+    bitset_set(superset, 1);
+    bitset_set(superset, subset_size - 1);
+    bitset_set(superset, subset_size + 1);  // the bit only superset has
+
+    bitset_set(subset, 1);
+    bitset_set(subset, subset_size - 1);
+
+    assert_true(bitset_contains_all(superset, subset));
+    assert_true(!bitset_contains_all(subset, superset));
+
+    bitset_free(superset);
+    bitset_free(subset);
+}
+
+/* Creates 2 bitsets, one with all bits in [0, 1000) set, the other with only
+even bits set in the same range. Checks that the bitset_contains_all()
+returns true, then sets a single bit at 1001 in the prior subset and checks that
+bitset_contains_all() returns false. */
+void test_contains_all() {
+    bitset_t *superset = bitset_create();
+    bitset_t *subset = bitset_create();
+
+    for (int i = 0; i < 1000; i++) {
+        bitset_set(superset, i);
+        if (i % 2 == 0) bitset_set(subset, i);
+    }
+
+    assert_true(bitset_contains_all(superset, subset));
+    assert_true(!bitset_contains_all(subset, superset));
+
+    bitset_set(subset, 1001);  // a bit that superset does not have
+
+    assert_true(!bitset_contains_all(superset, subset));
+    assert_true(!bitset_contains_all(subset, superset));
+
+    bitset_free(superset);
+    bitset_free(subset);
+}
+
+int main() {  // runs each bitset test in turn, then prints a summary line
+    test_set_to_val();
+    test_construct();
+    test_union_intersection();
+    test_iterate();
+    test_iterate2();
+    test_max_min();
+    test_counts();
+    test_shift_right();
+    test_shift_left();
+    test_disjoint();
+    test_intersects();
+    test_contains_all();
+    test_contains_all_different_sizes();
+    printf("All asserts passed. Code is probably ok.\n");
+}
diff --git a/tests/container_comparison_unit.c b/tests/container_comparison_unit.c
index 4e8d05165..4f3240ea7 100644
--- a/tests/container_comparison_unit.c
+++ b/tests/container_comparison_unit.c
@@ -44,7 +44,7 @@ static inline void delegated_add(container_t *container, uint8_t typecode,
             break;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
 }
 
@@ -62,7 +62,7 @@ static inline container_t *container_create(uint8_t typecode) {
             break;
         default:
             assert(false);
-            __builtin_unreachable();
+            roaring_unreachable;
     }
     assert_non_null(result);
     return result;
diff --git a/tests/cpp_example2.cpp b/tests/cpp_example2.cpp
index 9005712d0..cab19249a 100644
--- a/tests/cpp_example2.cpp
+++ b/tests/cpp_example2.cpp
@@ -1,6 +1,7 @@
 #include <iostream>
 
 #include "roaring.hh"
+#include "test.h"
 
 using namespace roaring;
 
@@ -11,7 +12,7 @@ int main() {
     }
 
     // check whether a value is contained
-    assert(r1.contains(500));
+    assert_true(r1.contains(500));
 
     // compute how many bits there are:
     uint32_t cardinality = r1.cardinality();
@@ -47,11 +48,11 @@ int main() {
     delete[] arr1;
 
     // bitmaps shall be equal
-    assert(r1 == r1f);
+    assert_true(r1 == r1f);
 
     // we can copy and compare bitmaps
     Roaring z(r3);
-    assert(r3 == z);
+    assert_true(r3 == z);
 
     // we can compute union two-by-two
     Roaring r1_2_3 = r1 | r2;
@@ -60,17 +61,23 @@ int main() {
     // we can compute a big union
     const Roaring *allmybitmaps[] = {&r1, &r2, &r3};
     Roaring bigunion = Roaring::fastunion(3, allmybitmaps);
-    assert(r1_2_3 == bigunion);
+    assert_true(r1_2_3 == bigunion);
 
     // we can compute intersection two-by-two
     Roaring i1_2 = r1 & r2;
 
+#if CROARING_IS_BIG_ENDIAN
+    printf("We omit serialization tests because you have a big endian system.\n");
+#else
     // we can write a bitmap to a pointer and recover it later
     uint32_t expectedsize = r1.getSizeInBytes();
     char *serializedbytes = new char[expectedsize];
     r1.write(serializedbytes);
-    Roaring t = Roaring::read(serializedbytes);
-    assert(r1 == t);
+    // readSafe will not overflow, but the resulting bitmap
+    // is only valid and usable if the input follows the
+    // Roaring specification: https://github.com/RoaringBitmap/RoaringFormatSpec/
+    Roaring t = Roaring::readSafe(serializedbytes, expectedsize);
+    assert_true(r1 == t);
     delete[] serializedbytes;
 
     // we can iterate over all values using custom functions
@@ -88,11 +95,12 @@ int main() {
         ++counter;
     }
     // counter == t.cardinality()
-
+#endif
     // we can move iterators to skip values
     const uint32_t manyvalues[] = {2, 3, 4, 7, 8};
     Roaring rogue(5, manyvalues);
     Roaring::const_iterator j = rogue.begin();
     j.equalorlarger(4);  // *j == 4
+
     return EXIT_SUCCESS;
 }
diff --git a/tests/cpp_random_unit.cpp b/tests/cpp_random_unit.cpp
index 21749133e..fc49c3239 100644
--- a/tests/cpp_random_unit.cpp
+++ b/tests/cpp_random_unit.cpp
@@ -1,9 +1,10 @@
 //
 // cpp_random_unit.cpp
 //
-// The `roaring_checked.hh` variation of the C++ wrapper for roaring bitmaps
-// keeps a C++ `std::set` in sync with changes made using the object's methods.
-// That class has the same name (Roaring) and is in namespace `doublecheck`.
+// The `roaring_checked.hh` / `roaring64map_checked.hh` variations of the C++
+// wrapper for roaring bitmaps keep a C++ `std::set` in sync with changes made
+// using the object's methods. Those classes have the same name and are in
+// namespace `doublechecked`.
 //
 // This test generates bitsets with randomized content and runs through the
 // various operations with them.
@@ -19,22 +20,24 @@
 // https://www.llvm.org/docs/LibFuzzer.html
 //
 
-#include <type_traits>
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <iostream>
+#include <cassert>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 
+#include <iostream>
+#include <type_traits>
 #include <vector>
 
 #include "roaring_checked.hh"
 using doublechecked::Roaring;  // so `Roaring` means `doublecheck::Roaring`
+#include "roaring64map_checked.hh"
+using doublechecked::Roaring64Map;
 
 #include "test.h"
 
-
 // The tests can run as long as one wants.  Ideally, the sanitizer options
 // for `address` and `undefined behavior` should be enabled (see the CMake
 // option ROARING_SANITIZE).
@@ -46,18 +49,18 @@ const unsigned long NUM_STEPS = 1000;
 //
 const int NUM_ROARS = 30;
 
-// If we generated data fully at random in the uint32_t space, then sets would
-// be unlikely to intersect very often.  Use a rolling focal point to kind of
-// distribute the values near enough to each other to be likely to interfere.
-//
+// If we generated data fully at random in the uint32_t / uint64_t space, then
+// sets would be unlikely to intersect very often.  Use a rolling focal point to
+// kind of distribute the values near enough to each other to be likely to
+// interfere.
 uint32_t gravity;
-
+uint64_t gravity64;
 
 Roaring make_random_bitset() {
     Roaring r;
     int num_ops = rand() % 100;
     for (int i = 0; i < num_ops; ++i) {
-        switch (rand() % 4) {
+        switch (rand() % 5) {
           case 0:
             r.add(gravity);
             break;
@@ -68,11 +71,16 @@ Roaring make_random_bitset() {
             break; }
 
           case 2: {
+            uint32_t start = gravity + (rand() % 10) - 5;
+            r.removeRange(start, start + rand() % 5);
+            break; }
+
+          case 3: {
             uint32_t start = gravity + (rand() % 50) - 25;
-            r.flip(start, rand() % 50);
+            r.flip(start, start + rand() % 50);
             break; }
 
-          case 3: {  // tests remove(), select(), rank()
+          case 4: {  // tests remove(), select(), rank()
             uint32_t card = r.cardinality();
             if (card != 0) {
                 uint32_t rnk = rand() % card;
@@ -92,6 +100,53 @@ Roaring make_random_bitset() {
     return r;
 }
 
+Roaring64Map make_random_bitset64() {  // random edits applied near gravity64
+    Roaring64Map r;
+    int num_ops = rand() % 100;
+    for (int i = 0; i < num_ops; ++i) {
+        switch (rand() % 5) {
+            case 0:
+                r.add(gravity64);
+                break;
+
+            case 1: {
+                uint64_t start = gravity64 + (rand() % 50) - 25;
+                r.addRange(start, start + rand() % 100);
+                break;
+            }
+
+            case 2: {
+                uint64_t start = gravity64 + (rand() % 10) - 5;
+                r.removeRange(start, start + rand() % 5);
+                break;
+            }
+
+            case 3: {
+                uint64_t start = gravity64 + (rand() % 50) - 25;
+                r.flip(start, start + rand() % 50);
+                break;
+            }
+
+            case 4: {  // tests remove(), select(), rank()
+                uint64_t card = r.cardinality();
+                if (card != 0) {
+                    uint64_t rnk = rand() % card;
+                    uint64_t element = 0;
+                    assert_true(r.select(rnk, &element));
+                    assert_int_equal(rnk + 1, r.rank(element));
+                    r.remove(element);  // the selected value, not its rank
+                }
+                break;
+            }
+
+            default:
+                assert_true(false);
+        }
+        gravity64 += (rand() % 200) - 100;  // drift the focal point
+    }
+    assert_true(r.does_std_set_match_roaring());
+    return r;
+}
 
 DEFINE_TEST(sanity_check_doublechecking) {
     Roaring r;
@@ -115,6 +170,26 @@ DEFINE_TEST(sanity_check_doublechecking) {
     assert_true(r.does_std_set_match_roaring());
 }
 
+DEFINE_TEST(sanity_check_doublechecking_64) {  // mismatch must be detected
+    Roaring64Map r;
+    while (r.isEmpty()) r = make_random_bitset64();  // retry until non-empty
+
+    // Pick a random element out of the guaranteed non-empty bitset
+    //
+    uint64_t rnk = rand() % r.cardinality();
+    uint64_t element;
+    assert_true(r.select(rnk, &element));
+
+    // Deliberately get check (the std::set) out of sync to ensure match fails
+    //
+    r.check.erase(element);
+    assert_false(r.does_std_set_match_roaring());
+
+    // Put the std::set back in sync so the destructor doesn't assert
+    //
+    r.check.insert(element);
+    assert_true(r.does_std_set_match_roaring());
+}
 
 DEFINE_TEST(random_doublecheck_test) {
     //
@@ -228,7 +303,7 @@ DEFINE_TEST(random_doublecheck_test) {
                 gravity = element;
             }
             uint32_t start = gravity + (rand() % 50) - 25;
-            out.flip(start, rand() % 50);
+            out.flip(start, start + rand() % 50);
             break; }
 
           default:
@@ -281,13 +356,147 @@ DEFINE_TEST(random_doublecheck_test) {
     }
 }
 
+DEFINE_TEST(random_doublecheck_test_64) {  // random operation walk, 64-bit
+    //
+    // Make a group of bitsets to choose from when performing operations.
+    //
+    std::vector<Roaring64Map> roars;
+    for (int i = 0; i < NUM_ROARS; ++i)
+        roars.insert(roars.end(), make_random_bitset64());
+
+    for (unsigned long step = 0; step < NUM_STEPS; ++step) {
+        //
+        // Each step modifies the chosen `out` bitset...possibly just
+        // overwriting it completely.
+        //
+        Roaring64Map &out = roars[rand() % NUM_ROARS];
+
+        // The left and right bitsets may be used as inputs for operations.
+        // They can be a reference to the same object as out, or can be
+        // references to each other (which is good to test those conditions).
+        //
+        const Roaring64Map &left = roars[rand() % NUM_ROARS];
+        const Roaring64Map &right = roars[rand() % NUM_ROARS];
+
+#ifdef ROARING_CPP_RANDOM_PRINT_STATUS
+        printf("[%lu]: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", step,
+               left.cardinality(),
+               right.cardinality(),
+               out.cardinality());
+#endif
+
+        int op = rand() % 6;  // one value per case in the switch below
+
+        switch (op) {
+            case 0: {  // AND
+                out = left & right;
+                if (&out != &left) assert_true(out.isSubset(left));
+                if (&out != &right) assert_true(out.isSubset(right));
+                break;
+            }
+
+            case 1: {  // ANDNOT
+                out = left - right;
+                if (&out != &left) assert_true(out.isSubset(left));
+                break;
+            }
+
+            case 2: {  // OR
+                out = left | right;
+                if (&out != &left) assert_true(left.isSubset(out));
+                if (&out != &right) assert_true(right.isSubset(out));
+                break;
+            }
+
+            case 3: {  // XOR
+                out = left ^ right;
+                break;
+            }
+
+            case 4: {  // FASTUNION
+                const Roaring64Map *inputs[3] = {&out, &left, &right};
+                out = Roaring64Map::fastunion(
+                    3, inputs);  // result checked internally
+                break;
+            }
+
+            case 5: {  // FLIP
+                uint64_t card = out.cardinality();
+                if (card != 0) {  // pick gravity point inside set somewhere
+                    uint64_t rnk = rand() % card;
+                    uint64_t element = 0;
+                    assert_true(out.select(rnk, &element));
+                    assert_int_equal(rnk + 1, out.rank(element));
+                    gravity64 = element;
+                }
+                uint64_t start = gravity64 + (rand() % 50) - 25;
+                out.flip(start, start + rand() % 50);
+                break;
+            }
+
+            default:
+                assert_true(false);
+        }
+
+        // Periodically apply a post-processing step to the out bitset
+        //
+        int post = rand() % 15;  // only 0..2 act, i.e. 3 of every 15 values
+        switch (post) {
+            case 0:
+                out.removeRunCompression();
+                break;
+
+            case 1:
+                out.runOptimize();
+                break;
+
+            case 2:
+                out.shrinkToFit();
+                break;
+
+            default:
+                break;
+        }
+
+        // Explicitly ask if the `std::set` matches the roaring bitmap in out
+        //
+        assert_true(out.does_std_set_match_roaring());
+
+        // Do some arbitrary query operations.  No need to test the results, as
+        // the doublecheck code ensures the `std::set` matches internally.
+        //
+        out.isEmpty();
+        out.minimum();
+        out.maximum();
+        for (int i = -50; i < 50; ++i) {
+            out.contains(gravity64 + i);
+        }
+
+        // When doing random intersections, the tendency is that sets will
+        // lose all their data points over time.  So empty sets are usually
+        // re-seeded with more data, but a few get through to test empty cases.
+        //
+        if (out.isEmpty() && (rand() % 10 != 0)) out = make_random_bitset64();
+    }
+}
 
 int main() {
+    uint64_t seed = time(nullptr);
+    srand(seed);
+    printf("Seed:  %" PRIu64 "\n", seed);
+
     gravity = rand() % 10000;  // starting focal point
 
+    // Make the 64-bit gravity focus around the edge of a 32-bit value to better
+    // test edge cases.
+    gravity64 = (static_cast<uint64_t>(rand()) << 32) + rand() % 20000 - 10000;
+
     const struct CMUnitTest tests[] = {
         cmocka_unit_test(sanity_check_doublechecking),
-        cmocka_unit_test(random_doublecheck_test)};
+        cmocka_unit_test(sanity_check_doublechecking_64),
+        cmocka_unit_test(random_doublecheck_test),
+        cmocka_unit_test(random_doublecheck_test_64),
+    };
 
     return cmocka_run_group_tests(tests, NULL, NULL);
 }
diff --git a/tests/cpp_unit.cpp b/tests/cpp_unit.cpp
index 5c3de8fca..ecec3c83d 100644
--- a/tests/cpp_unit.cpp
+++ b/tests/cpp_unit.cpp
@@ -3,27 +3,45 @@
  */
 
 #include <assert.h>
+#include <iostream>
 #include <roaring/misc/configreport.h>
 #include <roaring/roaring.h>  // access to pure C exported API for testing
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
+#include <algorithm>
+#include <random>
+#include <vector>
+
 
+#include <fstream>
 #include <iostream>
 #include <type_traits>
+#include <limits>
 
+#include "config.h"
 #include "roaring.hh"
 using roaring::Roaring;  // the C++ wrapper class
 
 #include "roaring64map.hh"
 using roaring::Roaring64Map;  // C++ class extended for 64-bit numbers
 
+#include "roaring64map_checked.hh"
+
 #include "test.h"
 
 static_assert(std::is_nothrow_move_constructible<Roaring>::value,
               "Expected Roaring to be no except move constructable");
 
+
+namespace {
+// We put std::numeric_limits<>::max in parentheses to avoid a
+// clash with the Windows.h header under Windows.
+const auto uint32_max = (std::numeric_limits<uint32_t>::max)();
+const auto uint64_max = (std::numeric_limits<uint64_t>::max)();
+}  // namespace
+
 bool roaring_iterator_sumall(uint32_t value, void *param) {
     *(uint32_t *)param += value;
     return true;  // we always process all values
@@ -34,6 +52,13 @@ bool roaring_iterator_sumall64(uint64_t value, void *param) {
     return true;  // we always process all values
 }
 
+
+DEFINE_TEST(fuzz_001) {  // addRange with start > end must add nothing
+    roaring::Roaring b;
+    b.addRange(173, 0);
+    assert_true(b.cardinality() == 0);
+}
+
 DEFINE_TEST(serial_test) {
     uint32_t values[] = {5, 2, 3, 4, 1};
     Roaring r1(sizeof(values) / sizeof(uint32_t), values);
@@ -220,6 +245,67 @@ void test_roaring64_iterate_multi_roaring(void) {
     assert_true(iterate_count == 2);
 }
 
+namespace {  // helper comparing a bitmap with an expected list of values
+bool roaringEqual(const Roaring64Map &actual,
+                  std::initializer_list<uint64_t> expected) {
+    return expected.size() == actual.cardinality() &&
+           std::equal(expected.begin(), expected.end(), actual.begin());
+}
+}  // namespace
+
+DEFINE_TEST(test_roaring64_remove_32) {
+    Roaring64Map roaring;  // NOTE(review): unused; `r` is the bitmap tested
+
+    // A specific test to make sure we don't get slots confused.
+    // Specifically, we make Roaring64Map with only one slot (namely slot 5)
+    // with values {100, 200, 300} in its inner bitmap. Then we do a 32-bit
+    // remove of 100 from slot 0. A correct implementation of 'remove' would
+    // be a no-op.
+    const uint64_t b5 = uint64_t(5) << 32;  // base of values with high word 5
+    Roaring64Map r;
+    r.add(b5 + 100);
+    r.add(b5 + 200);
+    r.add(b5 + 300);
+    r.remove(uint32_t(100));
+
+    // No change
+    assert_true(roaringEqual(r, {b5 + 100, b5 + 200, b5 + 300}));
+}
+
+DEFINE_TEST(test_roaring64_add_and_remove) {  // mixes 32- and 64-bit calls
+    Roaring64Map r;
+
+    const uint64_t b5 = uint64_t(5) << 32;  // base of values with high word 5
+
+    // 32-bit adds
+    r.add(300u);
+    r.add(200u);
+    r.add(100u);
+    assert_true(roaringEqual(r, {100, 200, 300}));
+
+    // 64-bit adds
+    r.add(uint64_t(200));  // Duplicate
+    r.add(uint64_t(400));  // New
+    r.add(b5 + 400);  // All new
+    r.add(b5 + 300);
+    r.add(b5 + 200);
+    r.add(b5 + 100);
+    assert_true(roaringEqual(r,
+        {100, 200, 300, 400, b5 + 100, b5 + 200, b5 + 300, b5 + 400}));
+
+    // 32-bit removes
+    r.remove(200u);  // Exists.
+    r.remove(500u);  // Doesn't exist
+    assert_true(roaringEqual(r,
+        {100, 300, 400, b5 + 100, b5 + 200, b5 + 300, b5 + 400}));
+
+    // 64-bit removes
+    r.remove(b5 + 100);  // Exists.
+    r.remove(b5 + 500);  // Doesn't exist
+    assert_true(roaringEqual(r,
+        {100, 300, 400, b5 + 200, b5 + 300, b5 + 400}));
+}
+
 DEFINE_TEST(test_roaring64_iterate_multi_roaring) {
     test_roaring64_iterate_multi_roaring();
 }
@@ -254,6 +340,15 @@ void test_example_cpp(bool copy_on_write) {
 
     r2.printf();
     printf("\n");
+    // create a new bitmap with initializer list
+    Roaring r2i = Roaring::bitmapOfList({1, 2, 3, 5, 6});
+
+    assert_true(r2i == r2);
+
+    // create a new bitmap directly from initializer list
+    Roaring r2id = {1, 2, 3, 5, 6};
+
+    assert_true(r2id == r2);
 
     // test select
     uint32_t element;
@@ -348,7 +443,8 @@ void test_example_cpp(bool copy_on_write) {
         assert_true(a.contains(10));
         assert_true(a.contains(20));
 
-        // b should be destroyed without any errors
+        // Our move semantics allow moved-from objects to continue to be used
+        // normally (they are reset to empty Roarings).
         assert_true(b.cardinality() == 0);
     }
 
@@ -365,10 +461,27 @@ void test_example_cpp(bool copy_on_write) {
         assert_true(a.contains(10));
         assert_true(a.contains(20));
 
-        // b should be destroyed without any errors
+        // Our move semantics allow moved-from objects to continue to be used
+        // normally (they are reset to empty Roarings).
         assert_int_equal(0, b.cardinality());
     }
 
+    // test initializer lists
+    {
+        Roaring a;
+        a.add(10);
+        a.add(20);
+
+        // construction
+        Roaring b({10, 20});
+        assert_true(a == b);
+
+        a.add(30);
+        // assignment
+        b = {10, 20, 30};
+        assert_true(a == b);
+    }
+
     // test toString
     {
         Roaring a;
@@ -453,6 +566,16 @@ void test_example_cpp_64(bool copy_on_write) {
 
     r2.printf();
     printf("\n");
+    // create a new bitmap with initializer list
+    Roaring64Map r2i =
+        Roaring64Map::bitmapOfList({1, 2, 234294967296, 195839473298,
+                               14000000000000000100ull});
+    assert_true(r2i == r2);
+
+    // create a new bitmap directly from initializer list
+    Roaring64Map r2id = {1, 2, 234294967296, 195839473298,
+                         14000000000000000100ull};
+    assert_true(r2id == r2);
 
     // test select
     uint64_t element;
@@ -557,9 +680,11 @@ DEFINE_TEST(test_example_cpp_true) { test_example_cpp(true); }
 
 DEFINE_TEST(test_example_cpp_false) { test_example_cpp(false); }
 
+#if !CROARING_IS_BIG_ENDIAN
 DEFINE_TEST(test_example_cpp_64_true) { test_example_cpp_64(true); }
 
 DEFINE_TEST(test_example_cpp_64_false) { test_example_cpp_64(false); }
+#endif
 
 DEFINE_TEST(test_run_compression_cpp_64_true) {
     test_run_compression_cpp_64(true);
@@ -613,6 +738,534 @@ DEFINE_TEST(test_cpp_add_remove_checked_64) {
     assert_true(roaring.isEmpty());
 }
 
+DEFINE_TEST(test_cpp_add_range) {  // addRangeClosed vs. per-value add()
+    std::vector<std::pair<uint64_t, uint64_t>> ranges = {
+      {1, 5},
+      {1, 1},  // single-element closed range
+      {2, 1},  // min > max: the reference loop below adds nothing
+    };
+    for (const auto &range : ranges) {
+        uint64_t min = range.first;
+        uint64_t max = range.second;
+        Roaring r1;
+        r1.addRangeClosed(min, max);
+        Roaring r2;  // reference bitmap, filled one value at a time
+        for (uint64_t v = min; v <= max; ++v) {
+            r2.add(v);
+        }
+        assert_true(r1 == r2);
+    }
+}
+
+DEFINE_TEST(test_cpp_add_bulk) {  // addBulk must match add() after every step
+    std::vector<uint32_t> values = {9999, 123, 0xFFFFFFFF, 0xFFFFFFF7, 9999};
+    Roaring r1;
+    Roaring r2;
+    roaring::BulkContext bulk_context;  // one context reused by every addBulk
+    for (const auto value : values) {
+        r1.addBulk(bulk_context, value);
+        r2.add(value);
+        assert_true(r1 == r2);  // compared on each iteration, not only at end
+    }
+}
+
+DEFINE_TEST(test_cpp_contains_bulk) {  // containsBulk: present and absent values
+    std::vector<uint32_t> values_exists = {9999, 123, 0xFFFFFFFF, 0xFFFFFFF7};
+    std::vector<uint32_t> values_not_exists = {10, 12, 2000, 0xFFFFFFF, 0xFFFFFFF9, 2048};
+    Roaring r;
+    r.addMany(values_exists.size(), values_exists.data());
+    roaring::BulkContext bulk_context;  // shared by both lookup loops below
+    for (const auto value: values_exists) {
+        assert_true(r.containsBulk(bulk_context, value));
+    }
+    for (const auto value: values_not_exists) {
+        assert_false(r.containsBulk(bulk_context, value));
+    }
+}
+
+DEFINE_TEST(test_cpp_remove_range) {  // removeRangeClosed on the set {1, 2, 4}
+    {
+        // min < r1.minimum, max > r1.maximum
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(0, 5);
+        assert_true(r1.isEmpty());
+    }
+    {
+        // min < r1.minimum, max < r1.maximum, max does not exactly match an
+        // element
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(0, 3);
+        Roaring r2 = Roaring::bitmapOf(1, 4);  // only 4 is expected to remain
+        assert_true(r1 == r2);
+    }
+    {
+        // min < r1.minimum, max < r1.maximum, max exactly matches an element
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(0, 2);  // closed range: 2 itself is removed
+        Roaring r2 = Roaring::bitmapOf(1, 4);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > r1.minimum, max > r1.maximum, min does not exactly match an
+        // element
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(3, 5);
+        Roaring r2 = Roaring::bitmapOf(2, 1, 2);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > r1.minimum, max > r1.maximum, min exactly matches an element
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(2, 5);  // closed range: 2 itself is removed
+        Roaring r2 = Roaring::bitmapOf(1, 1);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > r1.minimum, max < r1.maximum, no elements between min and max
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(3, 3);  // 3 is not in r1, so nothing changes
+        Roaring r2 = Roaring::bitmapOf(3, 1, 2, 4);
+        assert_true(r1 == r2);
+    }
+    {
+        // max < r1.minimum
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(0, 0);
+        Roaring r2 = Roaring::bitmapOf(3, 1, 2, 4);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > r1.maximum
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(5, 6);
+        Roaring r2 = Roaring::bitmapOf(3, 1, 2, 4);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > max
+        Roaring r1 = Roaring::bitmapOf(3, 1, 2, 4);
+        r1.removeRangeClosed(2, 1);  // inverted bounds: expected to be a no-op
+        Roaring r2 = Roaring::bitmapOf(3, 1, 2, 4);
+        assert_true(r1 == r2);
+    }
+}
+
+DEFINE_TEST(test_cpp_add_range_closed_64) {  // 64-bit addRangeClosed vs. add()
+    {
+        // 32-bit integers
+        Roaring64Map r1;
+        r1.addRangeClosed(uint32_t(1), uint32_t(5));
+        Roaring64Map r2;
+        for (uint32_t v = 1; v <= 5; ++v) {
+            r2.add(v);
+        }
+        assert_true(r1 == r2);
+    }
+    auto b1 = uint64_t(1) << 32;  // 2^32: first value above the 32-bit range
+    std::vector<std::pair<uint64_t, uint64_t>> ranges = {
+        {b1, b1 + 10},
+        {b1 + 100, b1 + 100},  // one element
+        {b1 - 10, b1 + 10},  // straddles the 2^32 boundary
+        {b1 + 2, b1 - 2}};  // min > max: the reference loop adds nothing
+    for (const auto &range : ranges) {
+        uint64_t min = range.first;
+        uint64_t max = range.second;
+        Roaring64Map r1;
+        r1.addRangeClosed(min, max);
+        Roaring64Map r2;  // reference map, filled one value at a time
+        for (uint64_t v = min; v <= max; ++v) {
+            r2.add(v);
+        }
+        assert_true(r1 == r2);
+    }
+}
+DEFINE_TEST(test_bitmap_of_32) {  // four ways to build {1, 2, 4} must agree
+        Roaring r1 = Roaring::bitmapOfList({1, 2, 4});
+        r1.printf();
+        printf("\n");
+        Roaring r2 =
+            Roaring::bitmapOf(3, 1, 2, 4);
+        r2.printf();
+        printf("\n");
+        assert_true(r1 == r2);
+
+        Roaring r1d = {1, 2, 4};  // direct construction from a braced list
+        assert_true(r1 == r1d);
+
+        Roaring r3a = Roaring::bitmapOfList({7, 8, 9});
+        r3a = {1, 2, 4};  // overwrite with assignment operator
+        assert_true(r1 == r3a);  // the earlier {7, 8, 9} content must be gone
+}
+
+DEFINE_TEST(test_bitmap_of_64) {  // four ways to build {1, 2, 4} must agree
+        Roaring64Map r1 = Roaring64Map::bitmapOfList({1, 2, 4});
+        r1.printf();
+        Roaring64Map r2 =
+            Roaring64Map::bitmapOf(3, uint64_t(1), uint64_t(2), uint64_t(4));
+        r2.printf();
+        assert_true(r1 == r2);
+
+        Roaring64Map r1d = {1, 2, 4};  // direct construction from a braced list
+        assert_true(r1 == r1d);
+
+        Roaring64Map r3a = Roaring64Map::bitmapOfList({7, 8, 9});
+        r3a = {1, 2, 4};  // overwrite with assignment operator
+        assert_true(r1 == r3a);  // the earlier {7, 8, 9} content must be gone
+}
+
+DEFINE_TEST(test_cpp_add_range_open_64) {  // half-open addRange vs. add()
+    {
+        // 32-bit integers
+        Roaring64Map r1;
+        r1.addRange(uint32_t(1), uint32_t(5));
+        Roaring64Map r2;
+        for (uint32_t v = 1; v < 5; ++v) {
+            r2.add(v);
+        }
+        assert_true(r1 == r2);
+    }
+    auto b1 = uint64_t(1) << 32;  // 2^32: first value above the 32-bit range
+    std::vector<std::pair<uint64_t, uint64_t>> ranges = {
+        {b1, b1 + 10},
+        {b1 - 10, b1 + 10},  // straddles the 2^32 boundary
+        {b1 + 100, b1 + 100}, // empty
+        {b1 + 2, b1 - 2}};  // min > max: the reference loop adds nothing
+    for (const auto &range : ranges) {
+        uint64_t min = range.first;
+        uint64_t max = range.second;
+        Roaring64Map r1;
+        r1.addRange(min, max);
+        Roaring64Map r2;  // reference map; note the exclusive upper bound
+        for (uint64_t v = min; v < max; ++v) {
+            r2.add(v);
+        }
+        assert_true(r1 == r2);
+    }
+}
+
+DEFINE_TEST(test_cpp_add_range_closed_large_64) {  // checks cardinality only
+    uint32_t start_high = 300;
+    for (uint32_t end_high = start_high; end_high != 305; ++end_high) {
+        auto begin = (uint64_t(start_high) << 32) + 0x01234567;
+        auto end = (uint64_t(end_high) << 32) + 0x89abcdef;
+        Roaring64Map r1;
+        r1.addRangeClosed(begin, end);
+        auto size = end - begin + 1;  // closed interval: both ends are counted
+        assert_true(r1.cardinality() == size);
+    }
+}
+
+DEFINE_TEST(test_cpp_add_range_open_large_64) {  // checks cardinality only
+    uint32_t start_high = 300;
+    for (uint32_t end_high = start_high; end_high != 305; ++end_high) {
+        auto begin = (uint64_t(start_high) << 32) + 0x01234567;
+        auto end = (uint64_t(end_high) << 32) + 0x89abcdef;
+        Roaring64Map r1;
+        r1.addRange(begin, end);
+        auto size = end - begin;  // half-open interval: 'end' is not counted
+        assert_true(r1.cardinality() == size);
+    }
+}
+
+DEFINE_TEST(test_cpp_add_many) {  // addMany vs. per-value add(); 9999 repeats
+    std::vector<uint32_t> values = { 9999, 123, 0xFFFFFFFF, 0xFFFFFFF7, 9999};
+    Roaring r1;
+    r1.addMany(values.size(), values.data());
+    Roaring r2;  // reference bitmap, filled one value at a time
+    for (const auto value : values) {
+        r2.add(value);
+    }
+    assert_true(r1 == r2);
+}
+
+DEFINE_TEST(test_cpp_add_many_64) {  // addMany vs. add() for 32/64-bit inputs
+    {
+        // 32-bit integers
+        std::vector<uint32_t> values = { 9999, 123, 0xFFFFFFFF, 0xFFFFFFF7, 0, 9999};
+        Roaring64Map r1;
+        r1.addMany(values.size(), values.data());
+        Roaring64Map r2;
+        for (const auto value : values) {
+            r2.add(value);
+        }
+        assert_true(r1 == r2);
+    }
+
+    auto b1 = uint64_t(1) << 32;  // base with high 32 bits == 1
+    auto b555 = uint64_t(555) << 32;  // base with high 32 bits == 555
+
+    std::vector<uint64_t> values = {  // unsorted, and b555 + 9999 appears twice
+        b555 + 9999, b1 + 123, b1 + 0xFFFFFFFF, b555 + 0xFFFFFFF7, 0, b555 + 9999};
+    Roaring64Map r1;
+    r1.addMany(values.size(), values.data());
+    Roaring64Map r2;  // reference map, filled one value at a time
+    for (const auto value : values) {
+        r2.add(value);
+    }
+    assert_true(r1 == r2);
+}
+
+DEFINE_TEST(test_cpp_add_range_closed_combinatoric_64) {
+    // Given 'num_slots_to_test' outer slots, we repeatedly seed a Roaring64Map
+    // with all combinations of present and absent outer slots (basically the
+    // powerset of {0...num_slots_to_test - 1}), then we call 'addRangeClosed'
+    // and see if the cardinality is what we expect.
+    //
+    // For example (assuming num_slots_to_test = 5), the iterations of the outer
+    // loop represent these sets:
+    // 1. {}
+    // 2. {0}
+    // 3. {1}
+    // 4. {0, 1}
+    // 5. {2}
+    // 6. {0, 2}
+    // 7. {1, 2}
+    // 8. {0, 1, 2}
+    // 9. {3}
+    // and so forth...
+    //
+    // For example, in step 6 (representing set {0, 2}) we set a bit somewhere
+    // in slot 0 and we set another bit somewhere in slot 2. The purpose of this
+    // is to make sure 'addRangeClosed' does the right thing when it encounters
+    // an arbitrary mix of present and absent slots. Then we call
+    // 'addRangeClosed' over the whole range and confirm that the cardinality
+    // is what we expect.
+    const uint32_t num_slots_to_test = 5;
+    const uint32_t base_slot = 50;  // high 32 bits of the first slot under test
+
+    const uint32_t bitmask_limit = 1 << num_slots_to_test;  // 2^5 = 32 subsets
+
+    for (uint32_t bitmask = 0; bitmask < bitmask_limit; ++bitmask) {
+        Roaring64Map roaring;
+
+        // The 1-bits in 'bitmask' indicate which slots we want to seed
+        // with a value.
+        for (uint32_t bit_index = 0; bit_index < num_slots_to_test; ++bit_index) {
+            if ((bitmask & (1 << bit_index)) == 0) {
+                continue;
+            }
+            auto slot = base_slot + bit_index;
+            auto value = (uint64_t(slot) << 32) + bit_index;  // low bits = index
+            roaring.add(value);
+        }
+
+        auto first_bucket = uint64_t(base_slot) << 32;
+        auto last_bucket = uint64_t(base_slot + num_slots_to_test - 1) << 32;
+
+        roaring.addRangeClosed(first_bucket,
+                               last_bucket + uint32_max);
+
+        auto expected_cardinality = num_slots_to_test * (uint64_t(1) << 32);
+        assert_int_equal(expected_cardinality, roaring.cardinality());
+    }
+}
+
+DEFINE_TEST(test_cpp_remove_range_closed_64) {  // 64-bit removeRangeClosed
+    {
+        // 32-bit integers
+        Roaring64Map r1 =
+            Roaring64Map::bitmapOf(3, uint64_t(1), uint64_t(2), uint64_t(4));
+        r1.removeRangeClosed(uint32_t(2), uint32_t(3));
+        Roaring64Map r2 = Roaring64Map::bitmapOf(2, uint64_t(1), uint64_t(4));
+        assert_true(r1 == r2);
+    }
+    {
+        // min < r1.minimum, max > r1.maximum
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(0), uint64_t(5) << 32);
+        assert_true(r1.isEmpty());
+    }
+    {
+        // min < r1.minimum, max < r1.maximum, max does not exactly match an
+        // element
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(0), uint64_t(3) << 32);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(1, uint64_t(4) << 32);
+        assert_true(r1 == r2);
+    }
+    {
+        // min < r1.minimum, max < r1.maximum, max exactly matches the high bits
+        // of an element
+        Roaring64Map r1 =
+            Roaring64Map::bitmapOf(4, uint64_t(1) << 32, uint64_t(2) << 32,
+                                   (uint64_t(2) << 32) + 1, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(0), uint64_t(2) << 32);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(2, (uint64_t(2) << 32) + 1,
+                                                 uint64_t(4) << 32);
+        assert_true(r1 == r2);  // (2 << 32) + 1 lies just past max and survives
+    }
+    {
+        // min > r1.minimum, max > r1.maximum, min does not exactly match an
+        // element
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(3) << 32, uint64_t(5) << 32);
+        Roaring64Map r2 =
+            Roaring64Map::bitmapOf(2, uint64_t(1) << 32, uint64_t(2) << 32);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > r1.minimum, max > r1.maximum, min exactly matches the high bits
+        // of an element
+        Roaring64Map r1 =
+            Roaring64Map::bitmapOf(4, uint64_t(1) << 32, uint64_t(2) << 32,
+                                   (uint64_t(2) << 32) + 1, uint64_t(4) << 32);
+        r1.removeRangeClosed((uint64_t(2) << 32) + 1, uint64_t(5) << 32);
+        Roaring64Map r2 =
+            Roaring64Map::bitmapOf(2, uint64_t(1) << 32, uint64_t(2) << 32);
+        assert_true(r1 == r2);  // 2 << 32 lies just below min and survives
+    }
+    {
+        // min > r1.minimum, max < r1.maximum, no elements between min and max
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(3) << 32, (uint64_t(3) << 32) + 1);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        assert_true(r1 == r2);
+    }
+    {
+        // max < r1.minimum
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(1), uint64_t(2));
+        Roaring64Map r2 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > r1.maximum
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(5) << 32, uint64_t(6) << 32);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        assert_true(r1 == r2);
+    }
+    {
+        // min > max
+        Roaring64Map r1 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        r1.removeRangeClosed(uint64_t(2) << 32, uint64_t(1) << 32);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(
+            3, uint64_t(1) << 32, uint64_t(2) << 32, uint64_t(4) << 32);
+        assert_true(r1 == r2);  // inverted bounds: expected to be a no-op
+    }
+}
+
+DEFINE_TEST(test_cpp_remove_range_64) {
+    // Because removeRange delegates to removeRangeClosed, we do most of the
+    // unit testing in test_cpp_remove_range_closed_64(). We just do a couple of
+    // sanity checks here.
+    Roaring64Map r1;
+    auto b5 = uint64_t(5) << 32;
+
+    r1.add(0u);  // 32-bit add
+    r1.add(b5 + 1000);  // arbitrary 64 bit add
+    r1.add(b5 + 1001);  // arbitrary 64 bit add
+    r1.add(uint64_max - 1000);  // inside [1, uint64_max), so it gets removed
+    r1.add(uint64_max);  // highest possible bit
+
+    // Half-open [1, uint64_max): only 0 and uint64_max should remain
+    r1.removeRange(1, uint64_max);
+
+    Roaring64Map r2 = Roaring64Map::bitmapOf(2, uint64_t(0), uint64_max);
+    assert_true(r1 == r2);
+}
+
+std::pair<doublechecked::Roaring64Map, doublechecked::Roaring64Map>
+    make_two_big_roaring64_maps() {  // fixture shared by the set-operation tests
+    // Insert a large number of pseudorandom numbers into two sets.
+    const uint32_t randomSeed = 0xdeadbeef;  // fixed seed, not time-based
+    const size_t numValues = 1000000;  // 1 million
+
+    doublechecked::Roaring64Map roaring1;
+    doublechecked::Roaring64Map roaring2;
+
+    std::default_random_engine engine(randomSeed);
+    std::uniform_int_distribution<uint64_t> rng;  // default bounds: [0, max]
+
+    for (size_t i = 0; i < numValues; ++i) {
+        auto value = rng(engine);
+        auto choice = rng(engine) % 4;  // second draw selects one of four cases
+        switch (choice) {
+            case 0: {
+                // Value is added only to set 1.
+                roaring1.add(value);
+                break;
+            }
+
+            case 1: {
+                // Value is added only to set 2.
+                roaring2.add(value);
+                break;
+            }
+
+            case 2: {
+                // Value is added to both sets.
+                roaring1.add(value);
+                roaring2.add(value);
+                break;
+            }
+
+            case 3: {
+                // Value is added to set 1, and a slightly different value
+                // is added to set 2. This makes it likely that they are in
+                // the same "outer" bin, but at a different "inner" position.
+                roaring1.add(value);
+                roaring2.add(value + 1);
+                break;
+            }
+
+            default:  // unreachable: 'choice' is a value modulo 4
+                assert_true(false);
+        }
+    }
+    return std::make_pair(std::move(roaring1), std::move(roaring2));
+}
+
+DEFINE_TEST(test_cpp_union_64) {  // in-place |= on two large random maps
+    auto two_maps = make_two_big_roaring64_maps();
+
+    auto &lhs = two_maps.first;  // modified in place below
+    const auto &rhs = two_maps.second;
+
+    lhs |= rhs;
+    assert_true(lhs.does_std_set_match_roaring());  // NOTE(review): presumably a std::set cross-check - confirm
+}
+
+DEFINE_TEST(test_cpp_intersect_64) {  // in-place &= on two large random maps
+    auto two_maps = make_two_big_roaring64_maps();
+
+    auto &lhs = two_maps.first;  // modified in place below
+    const auto &rhs = two_maps.second;
+
+    lhs &= rhs;
+    assert_true(lhs.does_std_set_match_roaring());  // NOTE(review): presumably a std::set cross-check - confirm
+}
+
+DEFINE_TEST(test_cpp_difference_64) {  // in-place -= on two large random maps
+    auto two_maps = make_two_big_roaring64_maps();
+
+    auto &lhs = two_maps.first;  // modified in place below
+    const auto &rhs = two_maps.second;
+
+    lhs -= rhs;
+    assert_true(lhs.does_std_set_match_roaring());  // NOTE(review): presumably a std::set cross-check - confirm
+}
+
+DEFINE_TEST(test_cpp_xor_64) {  // in-place ^= on two large random maps
+    auto two_maps = make_two_big_roaring64_maps();
+
+    auto &lhs = two_maps.first;  // modified in place below
+    const auto &rhs = two_maps.second;
+
+    lhs ^= rhs;
+    assert_true(lhs.does_std_set_match_roaring());  // NOTE(review): presumably a std::set cross-check - confirm
+}
+
 DEFINE_TEST(test_cpp_clear_64) {
     Roaring64Map roaring;
 
@@ -679,7 +1332,7 @@ DEFINE_TEST(test_cpp_frozen) {
 
     Roaring r1;
     r1.add(0);
-    r1.add(UINT32_MAX);
+    r1.add(uint32_max);
     r1.add(1000);
     r1.add(2000);
     r1.add(100000);
@@ -699,6 +1352,22 @@ DEFINE_TEST(test_cpp_frozen) {
     const Roaring r2 = Roaring::frozenView(buf, num_bytes);
     assert_true(r1 == r2);
 
+    {
+        Roaring r;
+        r.addRange(0, 100000);
+        r.flip(90000, 91000);
+        r.runOptimize();
+
+        // allocate a buffer and serialize to it
+        size_t num_bytes1 = r.getFrozenSizeInBytes();
+        char *buf1 = (char *)roaring_aligned_malloc(32, num_bytes1);
+        r.writeFrozen(buf1);
+
+        // ensure the frozen bitmap is the same as the original
+        const Roaring rr = Roaring::frozenView(buf1, num_bytes1);
+        assert_true(r == rr);
+        roaring_aligned_free(buf1);
+    }
 #if ROARING_EXCEPTIONS
     // try viewing a misaligned/invalid buffer
     try {
@@ -744,7 +1413,7 @@ DEFINE_TEST(test_cpp_frozen_64) {
 
     Roaring64Map r1;
     r1.add((uint64_t)0);
-    r1.add((uint64_t)UINT32_MAX);
+    r1.add((uint64_t)uint32_max);
     r1.add((uint64_t)1000);
     r1.add((uint64_t)2000);
     r1.add((uint64_t)100000);
@@ -803,13 +1472,18 @@ DEFINE_TEST(test_cpp_frozen_64) {
 }
 
 DEFINE_TEST(test_cpp_flip) {
+    {
+        // flipping an empty map works as expected
+        Roaring r1;
+        r1.flip(2, 5);
+        Roaring r2 = Roaring::bitmapOf(3, 2, 3, 4);
+        assert_true(r1 == r2);
+    }
     {
         // nothing is affected outside of the given range
         Roaring r1 = Roaring::bitmapOf(3, 1, 3, 6);
         r1.flip(2, 5);
         Roaring r2 = Roaring::bitmapOf(4, 1, 2, 4, 6);
-        r1.printf();
-        r2.printf();
         assert_true(r1 == r2);
     }
     {
@@ -828,11 +1502,8 @@ DEFINE_TEST(test_cpp_flip) {
     }
     {
         // uint32 max can be flipped
-        Roaring r1 =
-            Roaring::bitmapOf(1, (std::numeric_limits<uint32_t>::max)());
-        r1.flip(
-            (std::numeric_limits<uint32_t>::max)(),
-            static_cast<uint64_t>((std::numeric_limits<uint32_t>::max)()) + 1);
+        Roaring r1 = Roaring::bitmapOf(1, uint32_max);
+        r1.flip(uint32_max, static_cast<uint64_t>(uint32_max) + 1);
         assert_true(r1.isEmpty());
     }
     {
@@ -844,32 +1515,221 @@ DEFINE_TEST(test_cpp_flip) {
     }
 }
 
+DEFINE_TEST(test_cpp_flip_closed) {  // flipClosed on 32-bit Roaring bitmaps
+    {
+        // flipping an empty map works as expected
+        Roaring r1;
+        r1.flipClosed(2, 5);
+        Roaring r2 = Roaring::bitmapOf(4, 2, 3, 4, 5);  // 5 is included
+        assert_true(r1 == r2);
+    }
+    {
+        // nothing is affected outside of the given range
+        Roaring r1 = Roaring::bitmapOf(3, 1, 3, 6);
+        r1.flipClosed(2, 4);  // 3 is cleared, 2 and 4 are set; 1 and 6 stay
+        Roaring r2 = Roaring::bitmapOf(4, 1, 2, 4, 6);
+        assert_true(r1 == r2);
+    }
+    {
+        // given range can go outside of existing range
+        Roaring r1 = Roaring::bitmapOf(2, 1, 3);
+        r1.flipClosed(0, 4);
+        Roaring r2 = Roaring::bitmapOf(3, 0, 2, 4);
+        assert_true(r1 == r2);
+    }
+    {
+        // range end is inclusive
+        Roaring r1 = Roaring::bitmapOf(2, 1, 3);
+        r1.flipClosed(1, 2);  // 1 is cleared and 2 is set; 3 is untouched
+        Roaring r2 = Roaring::bitmapOf(2, 2, 3);
+        assert_true(r1 == r2);
+    }
+    {
+        // uint32 max can be flipped
+        Roaring r1 = Roaring::bitmapOf(1, uint32_max);
+        r1.flipClosed(uint32_max, uint32_max);
+        assert_true(r1.isEmpty());
+    }
+    {
+        // empty range does nothing
+        Roaring r1 = Roaring::bitmapOf(2, 2, 3);
+        Roaring r2 = r1;  // copy taken before the flip, for comparison
+        r1.flipClosed(2, 1);  // min > max
+        assert_true(r1 == r2);
+    }
+}
+
+
 DEFINE_TEST(test_cpp_flip_64) {
+    {
+        // 32-bit test
+        {
+            // flipping an empty map works as expected
+            Roaring64Map r1;
+            r1.flip(2, 5);
+            auto r2 = Roaring64Map::bitmapOf(
+                3, uint64_t(2), uint64_t(3), uint64_t(4));
+            assert_true(r1 == r2);
+        }
+        {
+            // nothing is affected outside of the given range
+            auto r1 = Roaring64Map::bitmapOf(
+                3, uint64_t(1), uint64_t(3), uint64_t(6));
+            r1.flip(uint32_t(2), uint32_t(5));
+            Roaring64Map r2 = Roaring64Map::bitmapOf(
+                4, uint64_t(1), uint64_t(2), uint64_t(4), uint64_t(6));
+            assert_true(r1 == r2);
+        }
+        {
+            // given range can go outside of existing range
+            auto r1 = Roaring64Map::bitmapOf(2, uint64_t(1), uint64_t(3));
+            r1.flip(uint32_t(0), uint32_t(5));
+            auto r2 = Roaring64Map::bitmapOf(
+                3, uint64_t(0), uint64_t(2), uint64_t(4));
+            assert_true(r1 == r2);
+        }
+        {
+            // range end is exclusive
+            auto r1 = Roaring64Map::bitmapOf(2, uint64_t(1), uint64_t(3));
+            r1.flip(uint32_t(1), uint32_t(3));
+            auto r2 = Roaring64Map::bitmapOf(2, uint64_t(2), uint64_t(3));
+            assert_true(r1 == r2);
+        }
+        {
+            // uint32 max can be flipped
+            auto r1 = Roaring64Map::bitmapOf(1, uint64_t(uint32_max));
+            r1.flip(uint32_max, uint64_t(uint32_max) + 1);
+            assert_true(r1.isEmpty());
+        }
+        {
+            // empty range does nothing
+            auto r1 = Roaring64Map::bitmapOf(2, uint64_t(2), uint64_t(3));
+            auto r2 = r1;
+            r1.flip(uint32_t(2), uint32_t(2));
+            assert_true(r1 == r2);
+        }
+    }
+
+    const auto b1 = uint64_t(1) << 32;
+    const auto b2 = uint64_t(2) << 32;
+
     {
         // nothing is affected outside of the given range
-        Roaring64Map r1 = Roaring64Map::bitmapOf(3, (((uint64_t)1) << 32) - 3, ((uint64_t)1) << 32,
-                                                 (((uint64_t)1) << 32) + 3);
-        r1.flip((((uint64_t)1) << 32) - 2, (((uint64_t)1) << 32) + 2);
+        Roaring64Map r1 = Roaring64Map::bitmapOf(3, b1 - 3, b1, b1 + 3);
+        r1.flip(b1 - 2, b1 + 2);
         Roaring64Map r2 = Roaring64Map::bitmapOf(
-            5, (((uint64_t)1) << 32) - 3, (((uint64_t)1) << 32) - 2, (((uint64_t)1) << 32) - 1,
-            (((uint64_t)1) << 32) + 1, (((uint64_t)1) << 32) + 3);
+            5, b1 - 3, b1 - 2, b1 - 1, b1 + 1, b1 + 3);
         assert_true(r1 == r2);
     }
     {
         // given range can go outside of existing range
-        Roaring64Map r1 = Roaring64Map::bitmapOf(2, (((uint64_t)1) << 32) - 2, ((uint64_t)1) << 32);
-        r1.flip((((uint64_t)1) << 32) - 3, (((uint64_t)1) << 32) + 2);
+        Roaring64Map r1 = Roaring64Map::bitmapOf(2, b1 - 2, b1);
+        r1.flip(b1 - 3, b1 + 2);
         Roaring64Map r2 = Roaring64Map::bitmapOf(
-            3, (((uint64_t)1) << 32) - 3, (((uint64_t)1) << 32) - 1, (((uint64_t)1) << 32) + 1);
+            3, b1 - 3, b1 - 1, b1 + 1);
         assert_true(r1 == r2);
     }
     {
         // range end is exclusive
+        Roaring64Map r1 = Roaring64Map::bitmapOf(2, b2 - 1, b2 + 2);
+        r1.flip(b2 - 1, b2 + 2);
+        Roaring64Map r2;
+        for (uint64_t i = b2; i <= b2 + 2; ++i) {
+            r2.add(i);
+        }
+        assert_true(r1 == r2);
+    }
+    {
+        // uint32 max can be flipped
         Roaring64Map r1 =
-            Roaring64Map::bitmapOf(2, (((uint64_t)2) << 32) - 1, (((uint64_t)2) << 32) + 2);
-        r1.flip((((uint64_t)2) << 32) - 1, (((uint64_t)2) << 32) + 2);
+            Roaring64Map::bitmapOf(1, static_cast<uint64_t>(uint32_max));
+        r1.flip(uint32_max, static_cast<uint64_t>(uint32_max) + 1);
+        assert_true(r1.isEmpty());
+    }
+    {
+        // empty range does nothing
+        Roaring64Map r1 = Roaring64Map::bitmapOf(2, b1 - 1, b1);
+        Roaring64Map r2 = r1;
+        r1.flip(b1 - 1, b1 - 1);
+        assert_true(r1 == r2);
+    }
+}
+
+DEFINE_TEST(test_cpp_flip_closed_64) {
+    {
+        // 32-bit test
+        {
+            // flipping an empty map works as expected
+            Roaring64Map r1;
+            r1.flipClosed(uint32_t(2), uint32_t(5));
+            auto r2 = Roaring64Map::bitmapOf(
+                4, uint64_t(2), uint64_t(3), uint64_t(4), uint64_t(5));
+            assert_true(r1 == r2);
+        }
+        {
+            // nothing is affected outside of the given range
+            auto r1 = Roaring64Map::bitmapOf(
+                3, uint64_t(1), uint64_t(3), uint64_t(6));
+            r1.flipClosed(uint32_t(2), uint32_t(4));
+            Roaring64Map r2 = Roaring64Map::bitmapOf(
+                4, uint64_t(1), uint64_t(2), uint64_t(4), uint64_t(6));
+            assert_true(r1 == r2);
+        }
+        {
+            // given range can go outside of existing range
+            auto r1 = Roaring64Map::bitmapOf(2, uint64_t(1), uint64_t(3));
+            r1.flipClosed(uint32_t(0), uint32_t(4));
+            auto r2 = Roaring64Map::bitmapOf(
+                3, uint64_t(0), uint64_t(2), uint64_t(4));
+            assert_true(r1 == r2);
+        }
+        {
+            // range end is inclusive
+            auto r1 = Roaring64Map::bitmapOf(2, uint64_t(1), uint64_t(3));
+            r1.flipClosed(uint32_t(1), uint32_t(2));
+            auto r2 = Roaring64Map::bitmapOf(2, uint64_t(2), uint64_t(3));
+            assert_true(r1 == r2);
+        }
+        {
+            // uint32 max can be flipped
+            auto r1 = Roaring64Map::bitmapOf(1, uint64_t(uint32_max));
+            r1.flipClosed(uint32_max, uint32_max);
+            assert_true(r1.isEmpty());
+        }
+        {
+            // empty range does nothing
+            auto r1 = Roaring64Map::bitmapOf(2, uint64_t(2), uint64_t(3));
+            auto r2 = r1;
+            r1.flipClosed(uint32_t(2), uint32_t(1));
+            assert_true(r1 == r2);
+        }
+    }
+
+    const auto b1 = uint64_t(1) << 32;
+    const auto b2 = uint64_t(2) << 32;
+
+    {
+        // nothing is affected outside of the given range
+        Roaring64Map r1 = Roaring64Map::bitmapOf(3, b1 - 3, b1, b1 + 3);
+        r1.flipClosed(b1 - 2, b1 + 1);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(
+            5, b1 - 3, b1 - 2, b1 - 1, b1 + 1, b1 + 3);
+        assert_true(r1 == r2);
+    }
+    {
+        // given range can go outside of existing range
+        Roaring64Map r1 = Roaring64Map::bitmapOf(2, b1 - 2, b1);
+        r1.flipClosed(b1 - 3, b1 + 1);
+        Roaring64Map r2 = Roaring64Map::bitmapOf(
+            3, b1 - 3, b1 - 1, b1 + 1);
+        assert_true(r1 == r2);
+    }
+    {
+        // range end is inclusive
+        Roaring64Map r1 = Roaring64Map::bitmapOf(2, b2 - 1, b2 + 2);
+        r1.flipClosed(b2 - 1, b2 + 1);
         Roaring64Map r2;
-        for (uint64_t i = (((uint64_t)2) << 32); i <= (((uint64_t)2) << 32) + 2; ++i) {
+        for (uint64_t i = b2; i <= b2 + 2; ++i) {
             r2.add(i);
         }
         assert_true(r1 == r2);
@@ -877,49 +1737,315 @@ DEFINE_TEST(test_cpp_flip_64) {
     {
         // uint32 max can be flipped
         Roaring64Map r1 =
-            Roaring64Map::bitmapOf(1, static_cast<uint64_t>((std::numeric_limits<uint32_t>::max)()));
-        r1.flip(
-            (std::numeric_limits<uint32_t>::max)(),
-            static_cast<uint64_t>((std::numeric_limits<uint32_t>::max)()) + 1);
+            Roaring64Map::bitmapOf(1, static_cast<uint64_t>(uint32_max));
+        r1.flipClosed(uint32_max, uint32_max);
         assert_true(r1.isEmpty());
     }
     {
         // empty range does nothing
-        Roaring64Map r1 = Roaring64Map::bitmapOf(2, (((uint64_t)1) << 32) - 1, ((uint64_t)1) << 32);
+        Roaring64Map r1 = Roaring64Map::bitmapOf(2, b1 - 1, b1);
         Roaring64Map r2 = r1;
-        r1.flip((((uint64_t)1) << 32) - 1, (((uint64_t)1) << 32) - 1);
+        r1.flipClosed(b1 - 1, b1 - 2);
         assert_true(r1 == r2);
     }
 }
 
+DEFINE_TEST(test_combinatoric_flip_many_64) {
+    // Given 'num_slots_to_test' outer slots, we repeatedly seed a Roaring64Map
+    // with all combinations of present and absent outer slots (basically the
+    // powerset of {0...num_slots_to_test - 1}), then we call 'flipClosed'
+    // and see if the cardinality is what we expect.
+    //
+    // For example (assuming num_slots_to_test = 5), the iterations of the outer
+    // loop represent these sets:
+    // 1. {}
+    // 2. {0}
+    // 3. {1}
+    // 4. {0, 1}
+    // 5. {2}
+    // 6. {0, 2}
+    // 7. {1, 2}
+    // 8. {0, 1, 2}
+    // 9. {3}
+    // and so forth...
+    //
+    // For example, in step 6 (representing set {0, 2}) we set a bit somewhere
+    // in slot 0 and we set another bit somewhere in slot 2. The purpose of this
+    // is to make sure 'flipClosed' does the right thing when it encounters
+    // an arbitrary mix of present and absent slots. Then we call
+    // 'flipClosed' over the whole range and confirm that the cardinality
+    // is what we expect.
+    const uint32_t num_slots_to_test = 5;
+    const uint32_t base_slot = 50;
+
+    const uint32_t bitmask_limit = 1 << num_slots_to_test;
+
+    for (uint32_t bitmask = 0; bitmask < bitmask_limit; ++bitmask) {
+        Roaring64Map roaring;
+        uint32_t num_one_bits = 0;
+
+        // The 1-bits in 'bitmask' indicate which slots we want to seed
+        // with a value.
+        for (uint32_t bit_index = 0; bit_index < num_slots_to_test; ++bit_index) {
+            if ((bitmask & (1 << bit_index)) == 0) {
+                continue;
+            }
+            auto slot = base_slot + bit_index;
+            auto value = (uint64_t(slot) << 32) + 0x1234567 + bit_index;
+            roaring.add(value);
+            ++num_one_bits;
+        }
+
+        auto first_bucket = uint64_t(base_slot) << 32;
+        auto last_bucket = uint64_t(base_slot + num_slots_to_test - 1) << 32;
+
+        roaring.flipClosed(first_bucket, last_bucket + uint32_max);
+
+        // Slots not initialized with a bit will now have cardinality 2^32
+        // Slots initialized with a bit will have cardinality 2^32 - 1
+        auto expected_cardinality = num_slots_to_test * (uint64_t(1) << 32)
+          - num_one_bits;
+        assert_int_equal(expected_cardinality, roaring.cardinality());
+    }
+}
+
+DEFINE_TEST(test_cpp_is_subset_64) {
+  Roaring64Map r1 = Roaring64Map::bitmapOf(1, uint64_t(1));
+  Roaring64Map r2 = Roaring64Map::bitmapOf(1, uint64_t(1) << 32);
+  Roaring64Map r3 = r1 & r2;
+  assert_true(r3.isSubset(r1));
+  assert_true(r3.isSubset(r2));
+}
+
+DEFINE_TEST(test_cpp_fast_union_64) {
+    auto update = [](Roaring64Map *dest, uint32_t bitmask, uint32_t offset) {
+        for (uint32_t i = 0; i != 32; ++i) {
+            if ((bitmask & (1 << i)) != 0) {
+                dest->add(offset + i);
+            }
+        }
+    };
+
+    // Generate three Roaring64Maps that have a variety of combinations of
+    // present and absent slots and calculate their union with fastunion.
+    const uint32_t num_slots_to_test = 4;
+    const uint32_t bitmask_limit = 1 << num_slots_to_test;
+
+    for (size_t r0_bitmask = 0; r0_bitmask != bitmask_limit; ++r0_bitmask) {
+        for (size_t r1_bitmask = 0; r1_bitmask != bitmask_limit; ++r1_bitmask) {
+            for (size_t r2_bitmask = 0; r2_bitmask != bitmask_limit;
+                 ++r2_bitmask) {
+                Roaring64Map r0_map, r1_map, r2_map;
+                update(&r0_map, r0_bitmask, 0);
+                update(&r1_map, r1_bitmask, 0x1000);
+                update(&r2_map, r2_bitmask, 0x2000);
+
+                const Roaring64Map *maps[] = {
+                    &r0_map, &r1_map, &r2_map
+                };
+                auto actual = Roaring64Map::fastunion(3, maps);
+
+                Roaring64Map expected;
+                update(&expected, r0_bitmask, 0);
+                update(&expected, r1_bitmask, 0x1000);
+                update(&expected, r2_bitmask, 0x2000);
+
+                assert_true(expected == actual);
+            }
+        }
+    }
+}
+
+DEFINE_TEST(test_cpp_to_string) {
+    // test toString
+    const auto b5 = uint64_t(5) << 32;
+
+    {
+        // 32-bit test.
+        Roaring a;
+        assert_string_equal("{}", a.toString().c_str());
+
+        a.add(1);
+        assert_string_equal("{1}", a.toString().c_str());
+
+        a.add(2);
+        a.add(3);
+        a.add(uint32_max);
+        assert_string_equal("{1,2,3,4294967295}", a.toString().c_str());
+    }
+
+    {
+        // 64-bit test.
+        Roaring64Map r;
+        assert_string_equal("{}", r.toString().c_str());
+
+        r.add(b5 + 100);
+        assert_string_equal("{21474836580}", r.toString().c_str());
+
+        r.add(1u);
+        r.add(2u);
+        r.add(uint32_max);
+        r.add(uint64_max);
+        assert_string_equal("{1,2,4294967295,21474836580,18446744073709551615}",
+                            r.toString().c_str());
+    }
+}
+
+DEFINE_TEST(test_cpp_remove_run_compression) {
+  Roaring r;
+  uint32_t max = (std::numeric_limits<uint32_t>::max)();
+  for (uint32_t i = max - 10; i != 0; ++i) {
+    r.add(i);
+  }
+  r.runOptimize();
+  r.removeRunCompression();
+}
+
+// Returns true on success, false on exception.
+bool test64Deserialize(const std::string& filename) {
+#if CROARING_IS_BIG_ENDIAN
+    (void)filename;
+    printf("Big-endian IO unsupported.\n");
+#else // CROARING_IS_BIG_ENDIAN
+    std::ifstream in(TEST_DATA_DIR + filename, std::ios::binary);
+    std::vector<char> buf1(std::istreambuf_iterator<char>(in), {});
+    printf("Reading %lu bytes\n", (unsigned long)buf1.size());
+    Roaring64Map roaring;
+#if ROARING_EXCEPTIONS
+    try {
+        roaring = Roaring64Map::readSafe(buf1.data(), buf1.size());
+    } catch (...) {
+        return false;
+    }
+#else // ROARING_EXCEPTIONS
+    roaring = Roaring64Map::readSafe(buf1.data(), buf1.size());
+#endif // ROARING_EXCEPTIONS
+    std::vector<char> buf2(roaring.getSizeInBytes());
+    assert_true(buf1.size() == buf2.size());
+    assert_true(roaring.write(buf2.data()) == buf2.size());
+    for (size_t i = 0; i < buf1.size(); ++i) {
+        assert_true(buf1[i] == buf2[i]);
+    }
+#endif // CROARING_IS_BIG_ENDIAN
+    return true;
+}
+
+// The valid files were created with cpp_unit_util.cpp.
+DEFINE_TEST(test_cpp_deserialize_64_empty) {
+  assert_true(test64Deserialize("64mapempty.bin"));
+}
+
+DEFINE_TEST(test_cpp_deserialize_64_32bit_vals) {
+  assert_true(test64Deserialize("64map32bitvals.bin"));
+}
+
+DEFINE_TEST(test_cpp_deserialize_64_spread_vals) {
+  assert_true(test64Deserialize("64mapspreadvals.bin"));
+}
+
+DEFINE_TEST(test_cpp_deserialize_64_high_vals) {
+  assert_true(test64Deserialize("64maphighvals.bin"));
+}
+
+#if ROARING_EXCEPTIONS
+DEFINE_TEST(test_cpp_deserialize_64_empty_input) {
+  assert_false(test64Deserialize("64mapemptyinput.bin"));
+}
+
+DEFINE_TEST(test_cpp_deserialize_64_size_too_small) {
+  assert_false(test64Deserialize("64mapsizetoosmall.bin"));
+}
+
+DEFINE_TEST(test_cpp_deserialize_64_invalid_size) {
+  assert_false(test64Deserialize("64mapinvalidsize.bin"));
+}
+
+DEFINE_TEST(test_cpp_deserialize_64_key_too_small) {
+  assert_false(test64Deserialize("64mapkeytoosmall.bin"));
+}
+#endif
+
+DEFINE_TEST(test_cpp_contains_range_interleaved_containers) {
+    Roaring roaring;
+    // Range from last position in first container up to second position in 3rd container.
+    roaring.addRange(0xFFFF, 0x1FFFF + 2);
+    // Query from last position in 2nd container up to second position in 4th container.
+    // There is no 4th container in the bitmap.
+    roaring.containsRange(0x1FFFF, 0x2FFFF + 2);
+}
+
 int main() {
     roaring::misc::tellmeall();
     const struct CMUnitTest tests[] = {
+        cmocka_unit_test(fuzz_001),
+        cmocka_unit_test(test_bitmap_of_32),
+        cmocka_unit_test(test_bitmap_of_64),
         cmocka_unit_test(serial_test),
+#if !CROARING_IS_BIG_ENDIAN
         cmocka_unit_test(test_example_true),
         cmocka_unit_test(test_example_false),
         cmocka_unit_test(test_example_cpp_true),
         cmocka_unit_test(test_example_cpp_false),
         cmocka_unit_test(test_example_cpp_64_true),
         cmocka_unit_test(test_example_cpp_64_false),
+#endif
         cmocka_unit_test(test_cpp_add_remove_checked),
         cmocka_unit_test(test_cpp_add_remove_checked_64),
+        cmocka_unit_test(test_cpp_add_range),
+        cmocka_unit_test(test_cpp_remove_range),
+        cmocka_unit_test(test_cpp_add_range_closed_64),
+        cmocka_unit_test(test_cpp_add_range_open_64),
+        cmocka_unit_test(test_cpp_add_range_closed_large_64),
+        cmocka_unit_test(test_cpp_add_range_open_large_64),
+        cmocka_unit_test(test_cpp_add_many),
+        cmocka_unit_test(test_cpp_add_many_64),
+        cmocka_unit_test(test_cpp_add_range_closed_combinatoric_64),
+        cmocka_unit_test(test_cpp_add_bulk),
+        cmocka_unit_test(test_cpp_contains_bulk),
+        cmocka_unit_test(test_cpp_remove_range_closed_64),
+        cmocka_unit_test(test_cpp_remove_range_64),
         cmocka_unit_test(test_run_compression_cpp_64_true),
         cmocka_unit_test(test_run_compression_cpp_64_false),
         cmocka_unit_test(test_run_compression_cpp_true),
         cmocka_unit_test(test_run_compression_cpp_false),
+        cmocka_unit_test(test_cpp_union_64),
+        cmocka_unit_test(test_cpp_intersect_64),
+        cmocka_unit_test(test_cpp_difference_64),
+        cmocka_unit_test(test_cpp_xor_64),
         cmocka_unit_test(test_cpp_clear_64),
         cmocka_unit_test(test_cpp_move_64),
         cmocka_unit_test(test_roaring64_iterate_multi_roaring),
+        cmocka_unit_test(test_roaring64_remove_32),
+        cmocka_unit_test(test_roaring64_add_and_remove),
         cmocka_unit_test(test_cpp_bidirectional_iterator_64),
         cmocka_unit_test(test_cpp_frozen),
         cmocka_unit_test(test_cpp_frozen_64),
         cmocka_unit_test(test_cpp_flip),
+        cmocka_unit_test(test_cpp_flip_closed),
         cmocka_unit_test(test_cpp_flip_64),
+        cmocka_unit_test(test_cpp_flip_closed_64),
+        cmocka_unit_test(test_combinatoric_flip_many_64),
+#if !CROARING_IS_BIG_ENDIAN
+        cmocka_unit_test(test_cpp_deserialize_64_empty),
+        cmocka_unit_test(test_cpp_deserialize_64_32bit_vals),
+        cmocka_unit_test(test_cpp_deserialize_64_spread_vals),
+        cmocka_unit_test(test_cpp_deserialize_64_high_vals),
+#if ROARING_EXCEPTIONS
+        cmocka_unit_test(test_cpp_deserialize_64_empty_input),
+        cmocka_unit_test(test_cpp_deserialize_64_size_too_small),
+        cmocka_unit_test(test_cpp_deserialize_64_invalid_size),
+        cmocka_unit_test(test_cpp_deserialize_64_key_too_small),
+#endif
+#endif // !CROARING_IS_BIG_ENDIAN
         cmocka_unit_test(issue316),
         cmocka_unit_test(test_issue304),
         cmocka_unit_test(issue_336),
         cmocka_unit_test(issue_372),
+        cmocka_unit_test(test_cpp_is_subset_64),
+        cmocka_unit_test(test_cpp_fast_union_64),
+        cmocka_unit_test(test_cpp_to_string),
+        cmocka_unit_test(test_cpp_remove_run_compression),
+        cmocka_unit_test(test_cpp_contains_range_interleaved_containers),
     };
     return cmocka_run_group_tests(tests, NULL, NULL);
 }
diff --git a/tests/cpp_unit_util.cpp b/tests/cpp_unit_util.cpp
new file mode 100644
index 000000000..7a0b0b553
--- /dev/null
+++ b/tests/cpp_unit_util.cpp
@@ -0,0 +1,49 @@
+#include <fstream>
+#include <vector>
+
+#include "roaring.hh"
+#include "roaring64map.hh"
+
+using namespace roaring;
+
+void writeToFile(const Roaring64Map& roaring, const std::string& filename) {
+    std::vector<char> buf(roaring.getSizeInBytes());
+    roaring.write(buf.data());
+    std::ofstream out(filename, std::ios::binary);
+    out.write(buf.data(), buf.size());
+}
+
+// Utility to create files with valid serialized Roaring64Maps.
+int main() {
+    {
+        Roaring64Map roaring;
+        writeToFile(roaring, "64mapempty.bin");
+    }
+    {
+        Roaring64Map roaring;
+        for (uint32_t v = 0; v < 10; ++v) {
+          roaring.add(v);
+        }
+        writeToFile(roaring, "64map32bitvals.bin");
+    }
+    {
+        Roaring64Map roaring;
+        for (uint64_t high = 0; high < 10; ++high) {
+          for (uint64_t low = 0; low < 10; ++low) {
+            roaring.add((high << 32) + low);
+          }
+        }
+        writeToFile(roaring, "64mapspreadvals.bin");
+    }
+    {
+        Roaring64Map roaring;
+        uint64_t max32 = (std::numeric_limits<uint32_t>::max)();
+        for (uint64_t high = max32 - 10; high <= max32; ++high) {
+          for (uint64_t low = max32 - 10; low <= max32; ++low) {
+            roaring.add((high << 32) + low);
+          }
+        }
+        writeToFile(roaring, "64maphighvals.bin");
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/tests/format_portability_unit.c b/tests/format_portability_unit.c
index a08584688..a823a9b64 100644
--- a/tests/format_portability_unit.c
+++ b/tests/format_portability_unit.c
@@ -74,7 +74,9 @@ void test_deserialize(char* filename) {
     free(input_buffer);
     roaring_bitmap_free(bitmap);
 }
-
+#if CROARING_IS_BIG_ENDIAN
+// port the test below.
+#else
 DEFINE_TEST(test_deserialize_portable_norun) {
     char filename[1024];
 
@@ -92,14 +94,18 @@ DEFINE_TEST(test_deserialize_portable_wrun) {
 
     test_deserialize(filename);
 }
+#endif
 
 int main() {
     tellmeall();
-
+#if CROARING_IS_BIG_ENDIAN
+    printf("Big-endian IO unsupported.\n");
+    return EXIT_SUCCESS;
+#else
     const struct CMUnitTest tests[] = {
         cmocka_unit_test(test_deserialize_portable_norun),
         cmocka_unit_test(test_deserialize_portable_wrun),
     };
-
     return cmocka_run_group_tests(tests, NULL, NULL);
+#endif 
 }
diff --git a/tests/mixed_container_unit.c b/tests/mixed_container_unit.c
index c55a59dc3..cfcaed231 100644
--- a/tests/mixed_container_unit.c
+++ b/tests/mixed_container_unit.c
@@ -1482,7 +1482,6 @@ DEFINE_TEST(array_negation_range_test3) {
  * sparse */
 static int bitset_negation_range_tests(int sparsity, int r_start, int r_end,
                                        bool is_bitset, bool inplace) {
-    int ctr = 0;
     bitset_container_t* BI = bitset_container_create();
     container_t* BO;
     bool result_is_bitset;
@@ -1490,7 +1489,6 @@ static int bitset_negation_range_tests(int sparsity, int r_start, int r_end,
 
     for (int x = 0; x < (1 << 16); x++) {
         if (x % sparsity) bitset_container_add(BI, (uint16_t)x);
-        ++ctr;
     }
 
     for (int x = 0; x < (1 << 16); x++) {
@@ -1588,7 +1586,7 @@ static int run_negation_range_tests(int k, int h, int start_offset, int r_start,
     int result_size_should_be;
     bool result_should_be[1 << 16];
 
-    assert(h < k);  // bad test call otherwise..not failure of code under test
+    assert_true(h < k);  // bad test call otherwise..not failure of code under test
 
     int runlen = h;
     for (int x = 0; x < (1 << 16) - start_offset; x++) {
@@ -1669,7 +1667,7 @@ static int run_negation_range_tests_simpler(int k, int h, int start_offset,
     int result_size_should_be;
     bool result_should_be[1 << 16];
 
-    assert(h < k);
+    assert_true(h < k);
 
     int runlen = h;
     for (int x = 0; x < (1 << 16) - start_offset; x++) {
diff --git a/tests/realdata_unit.c b/tests/realdata_unit.c
index 5603f5206..e6e1c7388 100644
--- a/tests/realdata_unit.c
+++ b/tests/realdata_unit.c
@@ -16,6 +16,7 @@
 
 #include "../benchmarks/numbersfromtextfiles.h"
 #include "config.h"
+#include "test.h"
 
 /**
  * Once you have collected all the integers, build the bitmaps.
@@ -43,6 +44,10 @@ const char *datadir[] = {
     "weather_sept_85_srt", "wikileaks-noquotes", "wikileaks-noquotes_srt"};
 
 bool serialize_correctly(roaring_bitmap_t *r) {
+#if CROARING_IS_BIG_ENDIAN
+    (void)r;
+    return r;
+#else
     uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r);
     char *serialized = (char*)malloc(expectedsize);
     if (serialized == NULL) {
@@ -69,6 +74,7 @@ bool serialize_correctly(roaring_bitmap_t *r) {
     }
     roaring_bitmap_free(r2);
     return true;
+#endif
 }
 
 // arrays expected to both be sorted.
@@ -645,17 +651,17 @@ bool compare_wide_unions(roaring_bitmap_t **rnorun, roaring_bitmap_t **rruns,
         printf("[compare_wide_unions] Unions don't agree! (fast run-norun) \n");
         return false;
     }
-    assert(roaring_bitmap_equals(tempornorun, temporruns));
+    assert_true(roaring_bitmap_equals(tempornorun, temporruns));
 
     roaring_bitmap_t *tempornorunheap =
         roaring_bitmap_or_many_heap(count, (const roaring_bitmap_t **)rnorun);
     roaring_bitmap_t *temporrunsheap =
         roaring_bitmap_or_many_heap(count, (const roaring_bitmap_t **)rruns);
-    // assert(slow_bitmap_equals(tempornorun, tempornorunheap));
-    // assert(slow_bitmap_equals(temporruns,temporrunsheap));
+    // assert_true(slow_bitmap_equals(tempornorun, tempornorunheap));
+    // assert_true(slow_bitmap_equals(temporruns,temporrunsheap));
 
-    assert(roaring_bitmap_equals(tempornorun, tempornorunheap));
-    assert(roaring_bitmap_equals(temporruns, temporrunsheap));
+    assert_true(roaring_bitmap_equals(tempornorun, tempornorunheap));
+    assert_true(roaring_bitmap_equals(temporruns, temporrunsheap));
     roaring_bitmap_free(tempornorunheap);
     roaring_bitmap_free(temporrunsheap);
 
@@ -665,24 +671,24 @@ bool compare_wide_unions(roaring_bitmap_t **rnorun, roaring_bitmap_t **rruns,
         longtempornorun = rnorun[0];
         longtemporruns = rruns[0];
     } else {
-        assert(roaring_bitmap_equals(rnorun[0], rruns[0]));
-        assert(roaring_bitmap_equals(rnorun[1], rruns[1]));
+        assert_true(roaring_bitmap_equals(rnorun[0], rruns[0]));
+        assert_true(roaring_bitmap_equals(rnorun[1], rruns[1]));
         longtempornorun = roaring_bitmap_or(rnorun[0], rnorun[1]);
         longtemporruns = roaring_bitmap_or(rruns[0], rruns[1]);
-        assert(roaring_bitmap_equals(longtempornorun, longtemporruns));
+        assert_true(roaring_bitmap_equals(longtempornorun, longtemporruns));
         for (int i = 2; i < (int)count; ++i) {
-            assert(roaring_bitmap_equals(rnorun[i], rruns[i]));
-            assert(roaring_bitmap_equals(longtempornorun, longtemporruns));
+            assert_true(roaring_bitmap_equals(rnorun[i], rruns[i]));
+            assert_true(roaring_bitmap_equals(longtempornorun, longtemporruns));
 
             roaring_bitmap_t *t1 =
                 roaring_bitmap_or(rnorun[i], longtempornorun);
             roaring_bitmap_t *t2 = roaring_bitmap_or(rruns[i], longtemporruns);
-            assert(roaring_bitmap_equals(t1, t2));
+            assert_true(roaring_bitmap_equals(t1, t2));
             roaring_bitmap_free(longtempornorun);
             longtempornorun = t1;
             roaring_bitmap_free(longtemporruns);
             longtemporruns = t2;
-            assert(roaring_bitmap_equals(longtempornorun, longtemporruns));
+            assert_true(roaring_bitmap_equals(longtempornorun, longtemporruns));
         }
     }
     if (!slow_bitmap_equals(longtempornorun, tempornorun)) {
@@ -712,7 +718,7 @@ bool compare_wide_xors(roaring_bitmap_t **rnorun, roaring_bitmap_t **rruns,
         printf("[compare_wide_xors] Xors don't agree! (fast run-norun) \n");
         return false;
     }
-    assert(roaring_bitmap_equals(tempornorun, temporruns));
+    assert_true(roaring_bitmap_equals(tempornorun, temporruns));
 
     roaring_bitmap_t *longtempornorun;
     roaring_bitmap_t *longtemporruns;
@@ -720,24 +726,24 @@ bool compare_wide_xors(roaring_bitmap_t **rnorun, roaring_bitmap_t **rruns,
         longtempornorun = rnorun[0];
         longtemporruns = rruns[0];
     } else {
-        assert(roaring_bitmap_equals(rnorun[0], rruns[0]));
-        assert(roaring_bitmap_equals(rnorun[1], rruns[1]));
+        assert_true(roaring_bitmap_equals(rnorun[0], rruns[0]));
+        assert_true(roaring_bitmap_equals(rnorun[1], rruns[1]));
         longtempornorun = roaring_bitmap_xor(rnorun[0], rnorun[1]);
         longtemporruns = roaring_bitmap_xor(rruns[0], rruns[1]);
-        assert(roaring_bitmap_equals(longtempornorun, longtemporruns));
+        assert_true(roaring_bitmap_equals(longtempornorun, longtemporruns));
         for (int i = 2; i < (int)count; ++i) {
-            assert(roaring_bitmap_equals(rnorun[i], rruns[i]));
-            assert(roaring_bitmap_equals(longtempornorun, longtemporruns));
+            assert_true(roaring_bitmap_equals(rnorun[i], rruns[i]));
+            assert_true(roaring_bitmap_equals(longtempornorun, longtemporruns));
 
             roaring_bitmap_t *t1 =
                 roaring_bitmap_xor(rnorun[i], longtempornorun);
             roaring_bitmap_t *t2 = roaring_bitmap_xor(rruns[i], longtemporruns);
-            assert(roaring_bitmap_equals(t1, t2));
+            assert_true(roaring_bitmap_equals(t1, t2));
             roaring_bitmap_free(longtempornorun);
             longtempornorun = t1;
             roaring_bitmap_free(longtemporruns);
             longtemporruns = t2;
-            assert(roaring_bitmap_equals(longtempornorun, longtemporruns));
+            assert_true(roaring_bitmap_equals(longtempornorun, longtemporruns));
         }
     }
     if (!slow_bitmap_equals(longtempornorun, tempornorun)) {
diff --git a/tests/roaring64map_checked.hh b/tests/roaring64map_checked.hh
new file mode 100644
index 000000000..d195b205a
--- /dev/null
+++ b/tests/roaring64map_checked.hh
@@ -0,0 +1,535 @@
+//
+// roaring64map_checked.hh
+//
+// PURPOSE:
+//
+// This file implements a class which maintains a `class Roaring64Map` bitset in
+// sync with a C++ `std::set` of 64-bit integers.  It asserts if it ever
+// notices a difference between the result the roaring bitset gives and the
+// result that the set would give.
+//
+// The doublechecked class is a drop-in replacement for the plain C++ class.
+// Hence any codebase that uses that class could act as a test...if it wished.
+//
+// USAGE:
+//
+// The checked class has the same name (Roaring64Map) in `namespace doublechecked`.
+// So switching between versions could be done easily with a command-line
+// `-D` setting for a #define, e.g.:
+//
+//     #ifdef ROARING_DOUBLECHECK_CPP
+//         #include "roaring64map_checked.hh"
+//         using doublechecked::Roaring64Map;
+//     #else
+//         #include "roaring64map.hh"
+//     #endif
+
+#ifndef INCLUDE_ROARING_64_MAP_CHECKED_HH_
+#define INCLUDE_ROARING_64_MAP_CHECKED_HH_
+
+#include <stdarg.h>
+
+#include <algorithm>
+#include <new>
+#include <stdexcept>
+#include <string>
+
+#include <set>  // sorted set, typically a red-black tree implementation
+
+#include "test.h"
+
+
+#define ROARING_CPP_NAMESPACE unchecked  // can't be overridden if global
+#include "roaring64map.hh"  // contains Roaring64Map unchecked class
+
+namespace doublechecked {  // put the checked class in its own namespace
+
+class Roaring64Map {
+  public:  // members public to allow tests access to them
+    roaring::Roaring64Map plain;  // ordinary Roaring64Map bitset wrapper class
+    std::set<uint64_t> check;  // contents kept in sync with `plain`
+
+  public:
+    Roaring64Map() : plain() {
+    }
+
+    Roaring64Map(size_t n, const uint32_t *data) : plain (n, data) {
+        for (size_t i = 0; i < n; ++i)
+            check.insert(data[i]);
+    }
+
+    Roaring64Map(const Roaring64Map &r) {
+        plain = r.plain;
+        check = r.check;
+    }
+
+    Roaring64Map(Roaring64Map &&r) noexcept {
+        plain = std::move(r.plain);
+        check = std::move(r.check);
+    }
+
+    // This constructor is unique to doublecheck::Roaring64Map(), for making a
+    // doublechecked version from an unchecked version.  Note that this alone
+    // is somewhat toothless for checking...e.g. running an operation and then
+    // accepting that all the values in it were correct doesn't do much.  So
+    // the results of such constructions should be validated another way.
+    //
+    Roaring64Map(roaring::Roaring64Map &&other_plain) {
+        plain = std::move(other_plain);
+        for (auto value : plain)
+            check.insert(value);
+    }
+
+    // Note: This does not call `::Roaring64Map::bitmapOf()` because variadics can't
+    // forward their parameters.  Each variadic argument is read as a `uint64_t`,
+    // so callers must pass exactly `n` values of that type (e.g. `uint64_t(1)`).
+    static Roaring64Map bitmapOf(size_t n, ...) {
+        doublechecked::Roaring64Map ans;
+        va_list vl;
+        va_start(vl, n);
+        for (size_t i = 0; i < n; i++) {
+            ans.add(va_arg(vl, uint64_t));
+        }
+        va_end(vl);
+        return ans;
+    }
+
+    void add(uint32_t x) {
+        plain.add(x);
+        check.insert(x);
+    }
+    void add(uint64_t x) {
+        plain.add(x);
+        check.insert(x);
+    }
+
+    bool addChecked(uint32_t x) {
+        bool ans = plain.addChecked(x);
+        bool was_in_set = check.insert(x).second;  // insert -> pair<iter,bool>
+        assert_true(ans == was_in_set);
+        (void)was_in_set;  // unused besides assert
+        return ans;
+    }
+    bool addChecked(uint64_t x) {
+        bool ans = plain.addChecked(x);
+        bool was_in_set = check.insert(x).second;  // insert -> pair<iter,bool>
+        assert_true(ans == was_in_set);
+        (void)was_in_set;  // unused besides assert
+        return ans;
+    }
+
+    void addRange(const uint64_t min, const uint64_t max) {
+        plain.addRange(min, max);
+        for (uint64_t val = min; val < max; ++val) {
+            check.insert(val);
+        }
+    }
+
+    // Closed-range insert for 32-bit bounds; mirrors the range into `check`.
+    void addRangeClosed(uint32_t min, uint32_t max) {
+        plain.addRangeClosed(min, max);
+        if (min > max) return;  // empty range: nothing to mirror
+        for (uint32_t val = min; val != max; ++val) check.insert(val);
+        check.insert(max);  // added last so the counter never has to wrap
+    }
+    // Closed-range insert for 64-bit bounds; mirrors the range into `check`.
+    void addRangeClosed(uint64_t min, uint64_t max) {
+        plain.addRangeClosed(min, max);
+        if (min > max) return;  // empty range: nothing to mirror
+        for (uint64_t val = min; val != max; ++val) check.insert(val);
+        check.insert(max);  // added last so the counter never has to wrap
+    }
+
+    void addMany(size_t n_args, const uint32_t *vals) {
+        plain.addMany(n_args, vals);
+        for (size_t i = 0; i < n_args; ++i)
+            check.insert(vals[i]);
+    }
+    void addMany(size_t n_args, const uint64_t *vals) {
+        plain.addMany(n_args, vals);
+        for (size_t i = 0; i < n_args; ++i)
+            check.insert(vals[i]);
+    }
+
+    void remove(uint32_t x) {
+        plain.remove(x);
+        check.erase(x);
+    }
+    void remove(uint64_t x) {
+        plain.remove(x);
+        check.erase(x);
+    }
+
+    bool removeChecked(uint32_t x) {
+        bool ans = plain.removeChecked(x);
+        size_t num_removed = check.erase(x);
+        assert_true(ans == (num_removed == 1));
+        (void)num_removed;  // unused besides assert
+        return ans;
+    }
+    bool removeChecked(uint64_t x) {
+        bool ans = plain.removeChecked(x);
+        size_t num_removed = check.erase(x);
+        assert_true(ans == (num_removed == 1));
+        (void)num_removed;  // unused besides assert
+        return ans;
+    }
+
+    void removeRange(const uint64_t min, const uint64_t max) {
+        plain.removeRange(min, max);
+        if (min < max) {
+            // Points to the first entry with key >= min, or end
+            auto start = check.lower_bound(min);
+            // Points to the first entry with key >= max, or end.
+            auto end = check.lower_bound(max);
+            // Removes the half-open interval [start, end) (i.e. does not include max).
+            check.erase(start, end);
+        }
+    }
+
+    void removeRangeClosed(uint32_t min, uint32_t max) {
+        plain.removeRangeClosed(min, max);
+        if (min <= max) {
+            // Points to the first entry with key >= min, or end
+            auto start = check.lower_bound(min);
+            // Points to the first entry with key > max, or end.
+            auto end = check.upper_bound(max);
+            // Removes the half-open interval [start, end) (i.e. includes max).
+            check.erase(start, end);
+        }
+    }
+
+    void removeRangeClosed(uint64_t min, uint64_t max) {
+        plain.removeRangeClosed(min, max);
+        if (min <= max) {
+            // Points to the first entry with key >= min, or end
+            auto start = check.lower_bound(min);
+            // Points to the first entry with key > max, or end.
+            auto end = check.upper_bound(max);
+            // Removes the half-open interval [start, end) (i.e. includes max).
+            check.erase(start, end);
+        }
+    }
+
+    uint64_t maximum() const {
+        uint64_t ans = plain.maximum();
+        assert_true(check.empty() ? ans == 0 : ans == *check.rbegin());
+        return ans;
+    }
+
+    uint64_t minimum() const {
+        uint64_t ans = plain.minimum();
+        assert_true(check.empty()
+            ? ans == (std::numeric_limits<uint64_t>::max)()
+            : ans == *check.begin());
+        return ans;
+    }
+
+    bool contains(uint32_t x) const {
+        bool ans = plain.contains(x);
+        assert_true(ans == (check.find(x) != check.end()));
+        return ans;
+    }
+    bool contains(uint64_t x) const {
+        bool ans = plain.contains(x);
+        assert_true(ans == (check.find(x) != check.end()));
+        return ans;
+    }
+
+
+    // This method is exclusive to `doublechecked::Roaring64Map`
+    //
+    bool does_std_set_match_roaring() const {
+        auto it_check = check.begin();
+        auto it_check_end = check.end();
+        auto it_plain = plain.begin();
+        auto it_plain_end = plain.end();
+
+        for (; it_check != it_check_end; ++it_check, ++it_plain) {
+            if (it_plain == it_plain_end)
+                return false;
+            if (*it_check != *it_plain)
+                return false;
+        }
+        return it_plain == plain.end();  // should have visited all values
+    }
+
+    ~Roaring64Map() {
+        assert_true(does_std_set_match_roaring());  // always check on destructor
+    }
+
+    Roaring64Map &operator=(const Roaring64Map &r) {
+        plain = r.plain;
+        check = r.check;
+        return *this;
+    }
+
+    Roaring64Map &operator=(Roaring64Map &&r) noexcept {
+        plain = std::move(r.plain);
+        check = std::move(r.check);
+        return *this;
+    }
+
+    Roaring64Map &operator&=(const Roaring64Map &r) {
+        plain &= r.plain;
+
+        auto it = check.begin();
+        auto r_it = r.check.begin();
+        while (it != check.end() && r_it != r.check.end()) {
+            if (*it < *r_it) { it = check.erase(it); }
+            else if (*r_it < *it) { ++r_it; }
+            else { ++it; ++r_it; }  // overlapped
+        }
+        check.erase(it, check.end());  // erase rest of check not in r.check
+
+        return *this;
+    }
+
+    // In-place difference: removes every value of `r` from this bitmap and
+    // mirrors the removal in `check`.  Returns `*this`.
+    //
+    Roaring64Map &operator-=(const Roaring64Map &r) {
+        plain -= r.plain;
+
+        if (this == &r) {
+            // `x -= x`: erasing from `check` while range-iterating that same
+            // set would invalidate the loop's iterator.  The difference of a
+            // set with itself is empty, so just clear it.
+            check.clear();
+        } else {
+            for (auto value : r.check)
+                check.erase(value);  // Note std::remove() is not for ordered sets
+        }
+
+        return *this;
+    }
+
+    // In-place union: adds every value of `r` to this bitmap and mirrors the
+    // additions in `check`.  Returns `*this`.
+    //
+    Roaring64Map &operator|=(const Roaring64Map &r) {
+        plain |= r.plain;
+
+        // `x |= x` leaves the set unchanged; skip the range insert in that
+        // case because the range would consist of iterators into the very
+        // container being inserted into.
+        if (this != &r)
+            check.insert(r.check.begin(), r.check.end());  // won't add duplicates
+
+        return *this;
+    }
+
+    // In-place symmetric difference: values present in exactly one of
+    // `this` / `r` remain.  `check` is updated by a sorted merge that erases
+    // values found in both and inserts values found only in `r`.
+    // Returns `*this`.
+    //
+    Roaring64Map &operator^=(const Roaring64Map &r) {
+        plain ^= r.plain;
+
+        if (this == &r) {
+            // `x ^= x`: the merge below would erase the node that `r_it`
+            // points at and then increment it.  The result is the empty set.
+            check.clear();
+            return *this;
+        }
+
+        auto it = check.begin();
+        auto it_end = check.end();
+        auto r_it = r.check.begin();
+        auto r_it_end = r.check.end();
+        if (it == it_end) { check = r.check; }  // this empty
+        else if (r_it == r_it_end) { }  // r empty
+        else if (*it > *r.check.rbegin() || *r_it > *check.rbegin()) {
+            check.insert(r.check.begin(), r.check.end());  // obvious disjoint
+        } else while (r_it != r_it_end) {  // may overlap
+            if (it == it_end) { check.insert(*r_it); ++r_it; }
+            else if (*it == *r_it) {  // remove overlapping value
+                it = check.erase(it);  // returns *following* iterator
+                ++r_it;
+            }
+            else if (*it < *r_it) { ++it; }  // keep value from this
+            else { check.insert(*r_it); ++r_it; }  // add value from r
+        }
+
+        return *this;
+    }
+
+    // Exchanges the contents of this object with `r`: both the roaring
+    // bitmap and the shadow set are swapped.
+    //
+    void swap(Roaring64Map &r) {
+        std::swap(r.plain, plain);
+        std::swap(r.check, check);
+    }
+
+    // Number of values in the bitmap, asserted equal to the size of `check`.
+    //
+    uint64_t cardinality() const {
+        const uint64_t roaring_count = plain.cardinality();
+        assert_true(roaring_count == check.size());
+        return roaring_count;
+    }
+
+    // Whether the bitmap holds no values, asserted to agree with `check`.
+    //
+    bool isEmpty() const {
+        const bool roaring_empty = plain.isEmpty();
+        assert_true(roaring_empty == check.empty());
+        return roaring_empty;
+    }
+
+    // Is `this` a subset of `r`?  The roaring answer is asserted to match
+    // std::includes() applied to the two ordered shadow sets.
+    //
+    bool isSubset(const Roaring64Map &r) const {
+        const bool roaring_says = plain.isSubset(r.plain);
+
+        const bool set_says = std::includes(
+            r.check.begin(), r.check.end(),  // containing range
+            check.begin(), check.end()  // range to test for containment
+        );
+        assert_true(roaring_says == set_says);
+
+        return roaring_says;
+    }
+
+    // Is `this` a *strict* subset of `r`?  The shadow-set answer is: `r`
+    // includes every value of `this` and `r` has more values than `this`.
+    //
+    bool isStrictSubset(const Roaring64Map &r) const {
+        const bool roaring_says = plain.isStrictSubset(r.plain);
+
+        const bool set_says = std::includes(
+            r.check.begin(), r.check.end(),  // containing range
+            check.begin(), check.end()  // range to test for containment
+        ) && r.check.size() > check.size();
+        assert_true(roaring_says == set_says);
+
+        return roaring_says;
+    }
+
+    // Writes the bitmap's values into the caller-provided buffer `ans` and
+    // verifies them against `check`.
+    //
+    // NOTE(review): the verification relies on the library writing values in
+    // ascending order (the order in which `check` iterates) - confirm against
+    // the Roaring64Map::toUint64Array documentation.
+    //
+    void toUint64Array(uint64_t *ans) const {
+        plain.toUint64Array(ans);
+
+        // Only read back from `ans` once the sizes are known to agree, so a
+        // mismatch fails the assertion instead of reading unwritten slots.
+        assert_true(plain.cardinality() == check.size());
+
+        size_t i = 0;
+        for (auto value : check) {
+            assert_true(ans[i] == value);
+            ++i;
+        }
+    }
+
+    // Equality of two bitmaps; the roaring comparison is asserted to agree
+    // with comparing the two shadow sets.
+    //
+    bool operator==(const Roaring64Map &r) const {
+        const bool roaring_equal = (plain == r.plain);
+        const bool sets_equal = (check == r.check);
+        assert_true(roaring_equal == sets_equal);
+        return roaring_equal;
+    }
+
+    // Flips `plain` over the given range, then mirrors the flip in `check`
+    // by toggling membership of every `i` with range_start <= i < range_end.
+    // When range_start >= range_end the shadow set is left untouched.
+    //
+    // `hint` always refers to the first element of `check` that is >= i (or
+    // end()), so each step is either an insert just before `hint` or an
+    // erase of `hint` itself.
+    //
+    void flip(uint64_t range_start, uint64_t range_end) {
+        plain.flip(range_start, range_end);
+
+        if (range_start < range_end) {
+            auto hint = check.lower_bound(range_start);  // *hint stays as >= i
+            auto it_end = check.end();
+            for (uint64_t i = range_start; i < range_end; ++i) {
+                if (hint == it_end || *hint > i)  // i not present, so add
+                    check.insert(hint, i);  // leave hint past i
+                else  // *hint == i, must adjust hint and erase
+                    hint = check.erase(hint);  // returns *following* iterator
+            }
+        }
+    }
+
+    // Forwards to `plain.removeRunCompression()` and returns its result;
+    // `check` is neither consulted nor modified here.
+    //
+    bool removeRunCompression() {
+        return plain.removeRunCompression();
+    }
+
+    // Forwards to `plain.runOptimize()` and returns its result; `check` is
+    // neither consulted nor modified here.
+    //
+    bool runOptimize() {
+        return plain.runOptimize();
+    }
+
+    // Forwards to `plain.shrinkToFit()` and returns its result; `check` is
+    // neither consulted nor modified here.
+    //
+    size_t shrinkToFit() {
+        return plain.shrinkToFit();
+    }
+
+    // Runs the caller's callback over `plain` via `plain.iterate()`, passing
+    // `ptr` through.  The callback's observations are not themselves
+    // compared; instead the whole bitmap is re-verified against `check`
+    // afterwards.
+    //
+    void iterate(roaring::api::roaring_iterator64 iterator, void *ptr) const {
+        plain.iterate(iterator, ptr);
+        assert_true(does_std_set_match_roaring());  // checks equivalent iteration
+    }
+
+    // Looks up the value at position `rnk` via `plain.select()`, which
+    // reports success through its return value and the value through
+    // `*element`.  The shadow set is stepped forward up to `rnk` times; the
+    // roaring answer must be "found" exactly when the set has an element at
+    // that position, and then the two values must match.
+    //
+    bool select(uint64_t rnk, uint64_t *element) const {
+        const bool found = plain.select(rnk, element);
+
+        auto pos = check.begin();
+        auto stop = check.end();
+        uint64_t steps = 0;
+        while (pos != stop && steps < rnk) {
+            ++pos;
+            ++steps;
+        }
+        const bool set_has_rank = (pos != stop);
+        assert_true(found == set_has_rank && (!found || *pos == *element));
+
+        return found;
+    }
+
+    // Returns `plain.rank(x)`, asserted equal to the number of values in
+    // `check` that are <= x.
+    //
+    uint64_t rank(uint64_t x) const {
+        const uint64_t roaring_rank = plain.rank(x);
+
+        // `check` iterates in ascending order, so stop at the first value
+        // that exceeds x.
+        uint64_t at_or_below = 0;
+        for (auto value : check) {
+            if (value > x)
+                break;
+            ++at_or_below;
+        }
+        assert_true(roaring_rank == at_or_below);
+
+        return roaring_rank;
+    }
+
+    // Forwards to `plain.write(buf, portable)` and returns its result; no
+    // cross-check against `check` is performed.
+    //
+    size_t write(char *buf, bool portable = true) const {
+        return plain.write(buf, portable);
+    }
+
+    // Reads an unchecked bitmap from `buf` with `roaring::Roaring64Map::read`
+    // and wraps it in a checked object.
+    // NOTE(review): the wrapping constructor is outside this view;
+    // presumably it populates `check` from the bitmap - confirm.
+    //
+    static Roaring64Map read(const char *buf, bool portable = true) {
+        auto plain = roaring::Roaring64Map::read(buf, portable);
+        return Roaring64Map(std::move(plain));
+    }
+
+    // Reads an unchecked bitmap from `buf` with
+    // `roaring::Roaring64Map::readSafe`, passing `maxbytes` through, and
+    // wraps it in a checked object.
+    // NOTE(review): the wrapping constructor is outside this view;
+    // presumably it populates `check` from the bitmap - confirm.
+    //
+    static Roaring64Map readSafe(const char *buf, size_t maxbytes) {
+        auto plain = roaring::Roaring64Map::readSafe(buf, maxbytes);
+        return Roaring64Map(std::move(plain));
+    }
+
+    // Forwards to `plain.getSizeInBytes(portable)` and returns its result;
+    // no cross-check against `check` is performed.
+    //
+    size_t getSizeInBytes(bool portable = true) const {
+        return plain.getSizeInBytes(portable);
+    }
+
+    // Intersection returning a new bitmap.  The result is built from the
+    // library's binary operator and then compared with the result of the
+    // (set-checked) in-place operator applied to a copy of `*this`.
+    //
+    Roaring64Map operator&(const Roaring64Map &o) const {
+        Roaring64Map ans(plain & o.plain);
+
+        // validate against in-place version
+        Roaring64Map via_inplace(*this);
+        via_inplace &= o;
+        assert_true(ans == via_inplace);
+
+        return ans;
+    }
+
+    // Difference returning a new bitmap.  The result is built from the
+    // library's binary operator and then compared with the result of the
+    // (set-checked) in-place operator applied to a copy of `*this`.
+    //
+    Roaring64Map operator-(const Roaring64Map &o) const {
+        Roaring64Map ans(plain - o.plain);
+
+        // validate against in-place version
+        Roaring64Map via_inplace(*this);
+        via_inplace -= o;
+        assert_true(ans == via_inplace);
+
+        return ans;
+    }
+
+    // Union returning a new bitmap.  The result is built from the library's
+    // binary operator and then compared with the result of the (set-checked)
+    // in-place operator applied to a copy of `*this`.
+    //
+    Roaring64Map operator|(const Roaring64Map &o) const {
+        Roaring64Map ans(plain | o.plain);
+
+        // validate against in-place version
+        Roaring64Map via_inplace(*this);
+        via_inplace |= o;
+        assert_true(ans == via_inplace);
+
+        return ans;
+    }
+
+    // Symmetric difference returning a new bitmap.  The result is built from
+    // the library's binary operator and then compared with the result of the
+    // (set-checked) in-place operator applied to a copy of `*this`.
+    //
+    Roaring64Map operator^(const Roaring64Map &o) const {
+        Roaring64Map ans(plain ^ o.plain);
+
+        // validate against in-place version
+        Roaring64Map via_inplace(*this);
+        via_inplace ^= o;
+        assert_true(ans == via_inplace);
+
+        return ans;
+    }
+
+    // Forwards `val` to `plain.setCopyOnWrite()`; `check` is not involved.
+    //
+    void setCopyOnWrite(bool val) {
+        plain.setCopyOnWrite(val);
+    }
+
+    // Forwards to `plain.printf()`; `check` is not involved.
+    //
+    void printf() const {
+        plain.printf();
+    }
+
+    // Returns `plain.toString()`; no cross-check against `check` is done.
+    //
+    std::string toString() const {
+        return plain.toString();
+    }
+
+    // Returns `plain.getCopyOnWrite()`; `check` is not involved.
+    //
+    bool getCopyOnWrite() const {
+        return plain.getCopyOnWrite();
+    }
+
+    // Union of `n` checked bitmaps.
+    //
+    // Builds a temporary array of pointers to each input's unchecked bitmap,
+    // hands it to `roaring::Roaring64Map::fastunion`, and wraps the result.
+    // The result is then validated: for n == 0 it must be empty, otherwise
+    // it must equal the inputs folded together one by one with `|=`.
+    //
+    static Roaring64Map fastunion(size_t n, const Roaring64Map **inputs) {
+        auto plain_inputs = new const roaring::Roaring64Map*[n];
+        for (size_t i = 0; i < n; ++i)
+            plain_inputs[i] = &inputs[i]->plain;
+        Roaring64Map ans(roaring::Roaring64Map::fastunion(n, plain_inputs));
+        delete[] plain_inputs;  // pointer array only; the inputs are not owned
+
+        if (n == 0)
+            assert_true(ans.cardinality() == 0);
+        else {
+            Roaring64Map temp = *inputs[0];
+            for (size_t i = 1; i < n; ++i)
+                temp |= *inputs[i];
+            assert_true(temp == ans);
+        }
+
+        return ans;
+    }
+
+    typedef roaring::Roaring64MapSetBitForwardIterator const_iterator;
+
+    // Returns a forward iterator constructed over `plain`; iteration is
+    // served by the unchecked bitmap, not by `check`.
+    //
+    const_iterator begin() const {
+        return roaring::Roaring64MapSetBitForwardIterator(plain);
+    }
+
+    // Returns the past-the-end iterator for *this* object's `plain` bitmap.
+    //
+    // Returned by value: a function-local `static` iterator would be built
+    // once, from whichever object called end() first, and then be shared by
+    // every other instance (and outlive the bitmap it was built from).
+    //
+    const_iterator end() const {
+        return roaring::Roaring64MapSetBitForwardIterator(plain, true);
+    }
+};
+
+}  // end `namespace doublechecked`
+
+#endif  // INCLUDE_ROARING_64_MAP_CHECKED_HH_
diff --git a/tests/roaring_checked.hh b/tests/roaring_checked.hh
index 9c7da3099..2eb7a2e5c 100644
--- a/tests/roaring_checked.hh
+++ b/tests/roaring_checked.hh
@@ -42,7 +42,7 @@
 #include <string>
 
 #include <set>  // sorted set, typically a red-black tree implementation
-#include <assert.h>
+#include "test.h"
 
 #define ROARING_CPP_NAMESPACE unchecked  // can't be overridden if global
 #include "roaring.hh"  // contains Roaring unchecked class
@@ -112,20 +112,22 @@ class Roaring {
     bool addChecked(uint32_t x) {
         bool ans = plain.addChecked(x);
         bool was_in_set = check.insert(x).second;  // insert -> pair<iter,bool>
-        assert(ans == was_in_set);
+        assert_true(ans == was_in_set);
         (void)was_in_set;  // unused besides assert
         return ans;
     }
 
-    void addRange(const uint64_t x, const uint64_t y)  {
-        plain.addRange(x, y);
+    void addRange(const uint64_t x, const uint64_t y) {
         if (x != y) {  // repeat add_range_closed() cast and bounding logic
-            uint32_t min = static_cast<uint32_t>(x);
-            uint32_t max = static_cast<uint32_t>(y - 1);
-            if (min <= max) {
-                for (uint32_t val = max; val != min - 1; --val)
-                    check.insert(val);
-            }
+            addRangeClosed(x, y - 1);
+        }
+    }
+
+    void addRangeClosed(uint32_t min, uint32_t max) {
+        plain.addRangeClosed(min, max);
+        if (min <= max) {
+            for (uint32_t val = max; val != min - 1; --val)
+                check.insert(val);
         }
     }
 
@@ -143,26 +145,39 @@ class Roaring {
     bool removeChecked(uint32_t x) {
         bool ans = plain.removeChecked(x);
         size_t num_removed = check.erase(x);
-        assert(ans == (num_removed == 1));
+        assert_true(ans == (num_removed == 1));
         (void)num_removed;  // unused besides assert
         return ans;
     }
 
+    // Removes the half-open range [x, y) by delegating to
+    // removeRangeClosed(x, y - 1); does nothing when x == y.
+    // The 64-bit arguments are implicitly narrowed to the uint32_t
+    // parameters of removeRangeClosed().
+    //
+    void removeRange(const uint64_t x, const uint64_t y) {
+        if (x != y) {  // repeat remove_range_closed() cast and bounding logic
+            removeRangeClosed(x, y - 1);
+        }
+    }
+
+    // Removes the closed range [min, max] from `plain`, and mirrors it in
+    // `check` by erasing every element from the first one >= min up to (not
+    // including) the first one > max.  `check` is untouched when min > max.
+    //
+    void removeRangeClosed(uint32_t min, uint32_t max) {
+        plain.removeRangeClosed(min, max);
+        if (min <= max) {
+            check.erase(check.lower_bound(min), check.upper_bound(max));
+        }
+    }
+
     uint32_t maximum() const {
         uint32_t ans = plain.maximum();
-        assert(check.empty() ? ans == 0 : ans == *check.rbegin());
+        assert_true(check.empty() ? ans == 0 : ans == *check.rbegin());
         return ans;
     }
 
     uint32_t minimum() const {
         uint32_t ans = plain.minimum();
-        assert(check.empty() ? ans == UINT32_MAX : ans == *check.begin());
+        assert_true(check.empty() ? ans == UINT32_MAX : ans == *check.begin());
         return ans;
     }
 
     bool contains(uint32_t x) const {
         bool ans = plain.contains(x);
-        assert(ans == (check.find(x) != check.end()));
+        assert_true(ans == (check.find(x) != check.end()));
         return ans;
     }
 
@@ -171,14 +186,14 @@ class Roaring {
 
         auto it = check.find(x);
         if (x >= y)
-            assert(ans == true);  // roaring says true for this
+            assert_true(ans == true);  // roaring says true for this
         else if (it == check.end())
-            assert(ans == false);  // start of range not in set
+            assert_true(ans == false);  // start of range not in set
         else {
             uint64_t last = x;  // iterate up to y so long as values sequential
             while (++it != check.end() && last + 1 == *it && *it < y)
                 last = *it;
-            assert(ans == (last == y - 1));
+            assert_true(ans == (last == y - 1));
         }
 
         return ans;
@@ -202,7 +217,7 @@ class Roaring {
     }
 
     ~Roaring() {
-        assert(does_std_set_match_roaring());  // always check on destructor
+        assert_true(does_std_set_match_roaring());  // always check on destructor
     }
 
     Roaring &operator=(const Roaring &r) {
@@ -280,19 +295,19 @@ class Roaring {
 
     uint64_t cardinality() const {
         uint64_t ans = plain.cardinality();
-        assert(ans == check.size());
+        assert_true(ans == check.size());
         return ans;
     }
 
     bool isEmpty() const {
         bool ans = plain.isEmpty();
-        assert(ans == check.empty());
+        assert_true(ans == check.empty());
         return ans;
     }
 
     bool isSubset(const Roaring &r) const {  // is `this` subset of `r`?
         bool ans = plain.isSubset(r.plain);
-        assert(ans == std::includes(
+        assert_true(ans == std::includes(
             r.check.begin(), r.check.end(),  // containing range
             check.begin(), check.end()  // range to test for containment
         ));
@@ -301,7 +316,7 @@ class Roaring {
 
     bool isStrictSubset(const Roaring &r) const {  // is `this` subset of `r`?
         bool ans = plain.isStrictSubset(r.plain);
-        assert(ans == (std::includes(
+        assert_true(ans == (std::includes(
             r.check.begin(), r.check.end(),  // containing range
             check.begin(), check.end()  // range to test for containment
         ) && r.check.size() > check.size()));
@@ -320,7 +335,7 @@ class Roaring {
 
     bool operator==(const Roaring &r) const {
         bool ans = (plain == r.plain);
-        assert(ans == (check == r.check));
+        assert_true(ans == (check == r.check));
         return ans;
     }
 
@@ -355,7 +370,7 @@ class Roaring {
 
     void iterate(roaring::api::roaring_iterator iterator, void *ptr) const {
         plain.iterate(iterator, ptr);
-        assert(does_std_set_match_roaring());  // checks equivalent iteration
+        assert_true(does_std_set_match_roaring());  // checks equivalent iteration
     }
 
     bool select(uint32_t rnk, uint32_t *element) const {
@@ -365,7 +380,7 @@ class Roaring {
         auto it_end = check.end();
         for (uint32_t i = 0; it != it_end && i < rnk; ++i)
             ++it;
-        assert(ans == (it != it_end) && (ans ? *it == *element : true));
+        assert_true(ans == (it != it_end) && (ans ? *it == *element : true));
 
         return ans;
     }
@@ -378,9 +393,9 @@ class Roaring {
         auto r_it = r.check.begin();
         auto r_it_end = r.check.end();
         if (it == it_end || r_it == r_it_end) {
-            assert(ans == 0);  // if either is empty then no intersection
+            assert_true(ans == 0);  // if either is empty then no intersection
         } else if (*it > *r.check.rbegin() || *r_it > *check.rbegin()) {
-            assert(ans == 0);  // obvious disjoint
+            assert_true(ans == 0);  // obvious disjoint
         } else {  // may overlap
             uint64_t count = 0;
             while (it != it_end && r_it != r_it_end) {
@@ -388,7 +403,7 @@ class Roaring {
                 else if (*it < *r_it) { ++it; }
                 else { ++r_it; }
             }
-            assert(ans == count);
+            assert_true(ans == count);
         }
 
         return ans;
@@ -402,15 +417,15 @@ class Roaring {
         auto r_it = r.check.begin();
         auto r_it_end = r.check.end();
         if (it == it_end || r_it == r_it_end) {
-            assert(ans == false);  // if either are empty, no intersection
+            assert_true(ans == false);  // if either are empty, no intersection
         } else if (*it > *r.check.rbegin() || *r_it > *check.rbegin()) {
-            assert(ans == false);  // obvious disjoint
+            assert_true(ans == false);  // obvious disjoint
         } else while (it != it_end && r_it != r_it_end) {  // may overlap
-            if (*it == *r_it) { assert(ans == true); goto done; }  // overlap
+            if (*it == *r_it) { assert_true(ans == true); goto done; }  // overlap
             else if (*it < *r_it) { ++it; }
             else { ++r_it; }
         }
-        assert(ans == false);
+        assert_true(ans == false);
 
       done:  // (could use lambda vs goto, but debug step in lambdas is poor)
          return ans;
@@ -429,10 +444,10 @@ class Roaring {
         auto it_end = check.end();
         auto r_it = r.check.begin();
         auto r_it_end = r.check.end();
-        if (it == it_end) { assert(ans == r.check.size()); }  // this empty
-        else if (r_it == r_it_end) { assert(ans == check.size()); }  // r empty
+        if (it == it_end) { assert_true(ans == r.check.size()); }  // this empty
+        else if (r_it == r_it_end) { assert_true(ans == check.size()); }  // r empty
         else if (*it > *r.check.rbegin() || *r_it > *check.rbegin()) {
-            assert(ans == check.size() + r.check.size());  // obvious disjoint
+            assert_true(ans == check.size() + r.check.size());  // obvious disjoint
         } else {
             uint64_t count = 0;
             while (it != it_end || r_it != r_it_end) {
@@ -443,7 +458,7 @@ class Roaring {
                 else if (*it < *r_it) { ++it; }
                 else { ++r_it; }
             }
-            assert(ans == count);
+            assert_true(ans == count);
         }
 
         return ans;
@@ -456,10 +471,10 @@ class Roaring {
         auto it_end = check.end();
         auto r_it = r.check.begin();
         auto r_it_end = r.check.end();
-        if (it == it_end) { assert(ans == 0); }  // this empty
-        else if (r_it == r_it_end) { assert(ans == check.size()); }  // r empty
+        if (it == it_end) { assert_true(ans == 0); }  // this empty
+        else if (r_it == r_it_end) { assert_true(ans == check.size()); }  // r empty
         else if (*it > *r.check.rbegin() || *r_it > *check.rbegin()) {
-            assert(ans == check.size());  // disjoint so nothing removed
+            assert_true(ans == check.size());  // disjoint so nothing removed
         } else {  // may overlap
             uint64_t count = check.size();  // start with cardinality of this
             while (it != it_end && r_it != r_it_end) {
@@ -467,7 +482,7 @@ class Roaring {
                 else if (*it < *r_it) { ++it; }
                 else { ++r_it; }
             }
-            assert(ans == count);
+            assert_true(ans == count);
         }
 
         return ans;
@@ -480,10 +495,10 @@ class Roaring {
         auto it_end = check.end();
         auto r_it = r.check.begin();
         auto r_it_end = r.check.end();
-        if (it == it_end) { assert(ans == r.check.size()); }  // this empty
-        else if (r_it == r_it_end) { assert(ans == check.size()); }  // r empty
+        if (it == it_end) { assert_true(ans == r.check.size()); }  // this empty
+        else if (r_it == r_it_end) { assert_true(ans == check.size()); }  // r empty
         else if (*it > *r.check.rbegin() || *r_it > *check.rbegin()) {
-            assert(ans == check.size() + r.check.size());  // obvious disjoint
+            assert_true(ans == check.size() + r.check.size());  // obvious disjoint
         } else {  // may overlap
             uint64_t count = 0;
             while (it != it_end || r_it != r_it_end) {
@@ -493,7 +508,7 @@ class Roaring {
                 else if (*it < *r_it) { ++count; ++it; }
                 else { ++count; ++r_it; }
             }
-            assert(ans == count);
+            assert_true(ans == count);
         }
 
         return ans;
@@ -507,7 +522,7 @@ class Roaring {
         auto it_end = check.end();
         for (; it != it_end && *it <= x; ++it)
             ++count;
-        assert(ans == count);
+        assert_true(ans == count);
 
         return ans;
     }
@@ -534,7 +549,7 @@ class Roaring {
         Roaring ans(plain & o.plain);
 
         Roaring inplace(*this);
-        assert(ans == (inplace &= o));  // validate against in-place version
+        assert_true(ans == (inplace &= o));  // validate against in-place version
 
         return ans;
     }
@@ -543,7 +558,7 @@ class Roaring {
         Roaring ans(plain - o.plain);
 
         Roaring inplace(*this);
-        assert(ans == (inplace -= o));  // validate against in-place version
+        assert_true(ans == (inplace -= o));  // validate against in-place version
 
         return ans;
     }
@@ -552,7 +567,7 @@ class Roaring {
         Roaring ans(plain | o.plain);
 
         Roaring inplace(*this);
-        assert(ans == (inplace |= o));  // validate against in-place version
+        assert_true(ans == (inplace |= o));  // validate against in-place version
 
         return ans;
     }
@@ -561,7 +576,7 @@ class Roaring {
         Roaring ans(plain ^ o.plain);
 
         Roaring inplace(*this);
-        assert(ans == (inplace ^= o));  // validate against in-place version
+        assert_true(ans == (inplace ^= o));  // validate against in-place version
 
         return ans;
     }
@@ -590,12 +605,12 @@ class Roaring {
         delete[] plain_inputs;
 
         if (n == 0)
-            assert(ans.cardinality() == 0);
+            assert_true(ans.cardinality() == 0);
         else {
             Roaring temp = *inputs[0];
             for (size_t i = 1; i < n; ++i)
                 temp |= *inputs[i];
-            assert(temp == ans);
+            assert_true(temp == ans);
         }
 
         return ans;
diff --git a/tests/robust_deserialization_unit.c b/tests/robust_deserialization_unit.c
index ee6750296..24467a99b 100644
--- a/tests/robust_deserialization_unit.c
+++ b/tests/robust_deserialization_unit.c
@@ -165,7 +165,10 @@ DEFINE_TEST(test_robust_deserialize7) {
 
 int main() {
     tellmeall();
-
+#if CROARING_IS_BIG_ENDIAN
+    printf("Big-endian IO unsupported.\n");
+    return EXIT_SUCCESS;
+#else
     const struct CMUnitTest tests[] = {
         cmocka_unit_test(test_robust_deserialize1),
         cmocka_unit_test(test_robust_deserialize2),
@@ -177,4 +180,5 @@ int main() {
      };
 
     return cmocka_run_group_tests(tests, NULL, NULL);
+#endif
 }
diff --git a/tests/run_container_unit.c b/tests/run_container_unit.c
index dbf08f253..94adf6d88 100644
--- a/tests/run_container_unit.c
+++ b/tests/run_container_unit.c
@@ -171,11 +171,18 @@ DEFINE_TEST(select_test) {
     run_container_free(B);
 }
 
+// Test-local helper: computes the two run counts that
+// run_container_add_range_nruns() takes (via rle16_count_greater() for `max`
+// and rle16_count_less() for `min`, the latter restricted to the runs not
+// already counted as greater) and forwards everything to it.
+static inline void _run_container_add_range(run_container_t* run,
+                                           uint32_t min, uint32_t max) {
+    int32_t nruns_greater = rle16_count_greater(run->runs, run->n_runs, max);
+    int32_t nruns_less = rle16_count_less(run->runs, run->n_runs - nruns_greater, min);
+    run_container_add_range_nruns(run, min, max, nruns_less, nruns_greater);
+}
+
 DEFINE_TEST(remove_range_test) {
     run_container_t* run = run_container_create();
-    run_container_add_range(run, 100, 150);
-    run_container_add_range(run, 200, 250);
-    run_container_add_range(run, 300, 350);
+    _run_container_add_range(run, 100, 150);
+    _run_container_add_range(run, 200, 250);
+    _run_container_add_range(run, 300, 350);
 
     // act on left-most run
     run_container_remove_range(run, 100, 110);
diff --git a/tests/test.h b/tests/test.h
index f4d1fe4b0..a71d94c4e 100644
--- a/tests/test.h
+++ b/tests/test.h
@@ -27,6 +27,12 @@
 
 #define DESCRIBE_TEST fprintf(stderr, "--- %s\n", __func__)
 
+// Fails the current test, via fail_msg(), when
+// roaring_bitmap_internal_validate() rejects bitmap `b`; the reason string
+// reported by the validator is included in the failure message.
+// Wrapped in do { } while (0) so it can be used like a single statement.
+#define assert_bitmap_validate(b) do {                                       \
+        const char *internal_reason_buf = NULL;                              \
+        if (!roaring_bitmap_internal_validate((b), &internal_reason_buf)) {  \
+            fail_msg("internal validation failed: %s", internal_reason_buf); \
+        }                                                                    \
+    } while (0)
 
 // The "cmocka" test functions are supposed to look like:
 //
diff --git a/tests/testdata/64map32bitvals.bin b/tests/testdata/64map32bitvals.bin
new file mode 100644
index 000000000..475b89441
Binary files /dev/null and b/tests/testdata/64map32bitvals.bin differ
diff --git a/tests/testdata/64mapempty.bin b/tests/testdata/64mapempty.bin
new file mode 100644
index 000000000..1b1cb4d44
Binary files /dev/null and b/tests/testdata/64mapempty.bin differ
diff --git a/tests/testdata/64mapemptyinput.bin b/tests/testdata/64mapemptyinput.bin
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/testdata/64maphighvals.bin b/tests/testdata/64maphighvals.bin
new file mode 100644
index 000000000..54abac71f
Binary files /dev/null and b/tests/testdata/64maphighvals.bin differ
diff --git a/tests/testdata/64mapinvalidsize.bin b/tests/testdata/64mapinvalidsize.bin
new file mode 100644
index 000000000..48a2754f6
Binary files /dev/null and b/tests/testdata/64mapinvalidsize.bin differ
diff --git a/tests/testdata/64mapkeytoosmall.bin b/tests/testdata/64mapkeytoosmall.bin
new file mode 100644
index 000000000..3a768cf46
Binary files /dev/null and b/tests/testdata/64mapkeytoosmall.bin differ
diff --git a/tests/testdata/64mapsizetoosmall.bin b/tests/testdata/64mapsizetoosmall.bin
new file mode 100644
index 000000000..cd2112d98
Binary files /dev/null and b/tests/testdata/64mapsizetoosmall.bin differ
diff --git a/tests/testdata/64mapspreadvals.bin b/tests/testdata/64mapspreadvals.bin
new file mode 100644
index 000000000..83c72f6ba
Binary files /dev/null and b/tests/testdata/64mapspreadvals.bin differ
diff --git a/tests/testdata/README.md b/tests/testdata/README.md
index cecbb6981..62e730066 100644
--- a/tests/testdata/README.md
+++ b/tests/testdata/README.md
@@ -1,4 +1,4 @@
 # test data
 
 These bitmaps were generated from Java : 
-https://github.com/RoaringBitmap/RoaringBitmap/blob/master/examples/SerializeToDiskExample.java
+https://github.com/RoaringBitmap/RoaringBitmap/blob/master/examples/src/main/java/SerializeToDiskExample.java
diff --git a/tests/threads_unit.cpp b/tests/threads_unit.cpp
new file mode 100644
index 000000000..2d913889c
--- /dev/null
+++ b/tests/threads_unit.cpp
@@ -0,0 +1,64 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <thread>
+#include <roaring/roaring.h>
+#include <roaring/misc/configreport.h>
+
+// We are mostly running this test to check for data races using thread sanitizer.
+//
+// Thread body: 100 times over, copies the three bitmaps in `rarray`,
+// intersects the first copy with the second in place, subtracts the third
+// from it in place, and frees all three copies.
+void run(roaring_bitmap_t **rarray) {
+    for (size_t round = 0; round < 100; round++) {
+        roaring_bitmap_t *acc = roaring_bitmap_copy(rarray[0]);
+        roaring_bitmap_t *mask = roaring_bitmap_copy(rarray[1]);
+        roaring_bitmap_t *excluded = roaring_bitmap_copy(rarray[2]);
+
+        roaring_bitmap_and_inplace(acc, mask);
+        roaring_bitmap_andnot_inplace(acc, excluded);
+
+        roaring_bitmap_free(acc);
+        roaring_bitmap_free(mask);
+        roaring_bitmap_free(excluded);
+    }
+}
+
+// Builds three bitmaps with copy-on-write enabled, then lets two threads
+// run `run()` over the same three bitmaps concurrently.  Always returns
+// true; the function makes no assertions of its own.
+bool run_threads_unit_tests() {
+    roaring_bitmap_t *r1 = roaring_bitmap_create();
+
+    // every value in [65536, 65536 + 50000) except 65536 + 300
+    for (uint32_t i = 0; i < 50000; i++) {
+        if (i != 300) {
+            roaring_bitmap_add(r1, 65536 + i);
+        }
+    }
+    // multiples of 500 in [50000, 150000)
+    for (uint32_t i = 50000; i < 150000; i++) {
+        if ((i%500) == 0) {
+            roaring_bitmap_add(r1, i);
+        }
+    }
+    // even values in [150000, 200000)
+    for (uint32_t i = 150000; i < 200000; i++) {
+        if ((i%2) == 0) {
+            roaring_bitmap_add(r1, i);
+        }
+    }
+    
+    roaring_bitmap_set_copy_on_write(r1, true);
+    roaring_bitmap_run_optimize(r1);
+    roaring_bitmap_t *r2 = roaring_bitmap_of(5, 10010,10020,10030,10040,10050);
+    roaring_bitmap_set_copy_on_write(r2, true);
+    roaring_bitmap_t *r3 = roaring_bitmap_copy(r1);
+    roaring_bitmap_set_copy_on_write(r3, true);
+
+    // both arrays hold the same three pointers, so the threads share inputs
+    roaring_bitmap_t* rarray1[3] = {r1, r2, r3};
+    roaring_bitmap_t* rarray2[3] = {r1, r2, r3};
+    std::thread thread1(run,rarray1);
+    std::thread thread2(run,rarray2);
+    thread1.join();
+    thread2.join();
+    // free the shared inputs only after both threads have finished
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
+    roaring_bitmap_free(r3);
+    return true;
+}
+
+// Entry point: calls tellmeall(), runs the threaded scenario, and maps its
+// boolean result to EXIT_SUCCESS / EXIT_FAILURE.
+int main() {
+    roaring::misc::tellmeall();
+    bool is_ok = run_threads_unit_tests();
+    if(is_ok) { printf("code run completed.\n"); }
+    return is_ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/tests/toplevel_unit.c b/tests/toplevel_unit.c
index 3a4de4b3b..50f299967 100644
--- a/tests/toplevel_unit.c
+++ b/tests/toplevel_unit.c
@@ -1,27 +1,26 @@
 #include <assert.h>
+#include <roaring/misc/configreport.h>
+#include <roaring/roaring.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 
-#include <roaring/roaring.h>
-#include <roaring/misc/configreport.h>
-
 // include internal headers for invasive testing
 #include <roaring/containers/containers.h>
 #include <roaring/roaring_array.h>
 
 #ifdef __cplusplus  // stronger type checking errors if C built in C++ mode
-    using namespace roaring::internal;
+using namespace roaring::internal;
 #endif
 
 #include "test.h"
 
-
 static unsigned int seed = 123456789;
 static const int OUR_RAND_MAX = (1 << 30) - 1;
-inline static unsigned int our_rand() {  // we do not want to depend on a system-specific
-                                // random number generator
+inline static unsigned int
+our_rand() {  // we do not want to depend on a system-specific
+              // random number generator
     seed = (1103515245 * seed + 12345);
     return seed & OUR_RAND_MAX;
 }
@@ -31,7 +30,7 @@ static inline uint32_t minimum_uint32(uint32_t a, uint32_t b) {
 }
 
 // arrays expected to both be sorted.
-static int array_equals(uint32_t *a1, int32_t size1, uint32_t *a2,
+static int array_equals(const uint32_t *a1, int32_t size1, const uint32_t *a2,
                         int32_t size2) {
     if (size1 != size2) return 0;
     for (int i = 0; i < size1; ++i) {
@@ -46,14 +45,156 @@ bool roaring_iterator_sumall(uint32_t value, void *param) {
     *(uint32_t *)param += value;
     return true;  // continue till the end
 }
+DEFINE_TEST(issue457) {  // presumably a regression test for issue #457
+    roaring_bitmap_t *r1 = roaring_bitmap_from_range(65539, 65541, 1);
+    roaring_bitmap_printf_describe(r1);  // diagnostic output only
+    assert_true(roaring_bitmap_get_cardinality(r1) == 2);
+    roaring_bitmap_t *r2 = roaring_bitmap_add_offset(r1, -3);  // shift by -3
+    roaring_bitmap_printf_describe(r2);  // diagnostic output only
+    assert_true(roaring_bitmap_get_cardinality(r2) == 2);  // same count as r1
+    roaring_bitmap_printf(r2);
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
+}
 
+DEFINE_TEST(issue429) {
+    // Memory leak test: ORs a bitmap with its copy; results are not checked.
+    roaring_bitmap_t *b1 = roaring_bitmap_create();
+    roaring_bitmap_add_range(b1, 0, 100);
+    roaring_bitmap_remove_range(b1, 0, 99);  // presumably leaves only 99
+    roaring_bitmap_t *b2 = roaring_bitmap_copy(b1);
+    const roaring_bitmap_t *bitmaps[] = {b1, b2};
+    roaring_bitmap_t *result = roaring_bitmap_or_many_heap(2, bitmaps);
+    roaring_bitmap_free(result);  // inputs and result are all released here
+    roaring_bitmap_free(b2);
+    roaring_bitmap_free(b1);
+}
+
+DEFINE_TEST(issue431) {
+    // This is a memory access test, so we don't need to check the results.
+    roaring_bitmap_t *b1 = roaring_bitmap_create();
+    roaring_bitmap_add(b1, 100);
+    roaring_bitmap_flip_inplace(b1, 0, 100 + 1);  // presumably flips [0, 100]
+    roaring_bitmap_t *b2 = roaring_bitmap_create();
+    roaring_bitmap_add_range(b2, 50, 100 + 1);
+    roaring_bitmap_is_subset(b2, b1);  // return value deliberately ignored
+    roaring_bitmap_free(b2);
+    roaring_bitmap_free(b1);
+}
+
+DEFINE_TEST(issue433) {  // portable serialization round-trip must be lossless
+    roaring_bitmap_t *b1 = roaring_bitmap_create();
+    roaring_bitmap_add(b1, 262143);
+    roaring_bitmap_add_range_closed(b1, 258047, 262143);
+    roaring_bitmap_remove_range_closed(b1, 262143, 262143);  // drop last value
+    size_t len = roaring_bitmap_portable_size_in_bytes(b1);
+    char *data = (char *)roaring_malloc(len);  // cast needed when built as C++
+    roaring_bitmap_portable_serialize(b1, data);
+    roaring_bitmap_t *b2 = roaring_bitmap_portable_deserialize_safe(data, len);
+    assert_true(b2 != NULL && roaring_bitmap_equals(b1, b2));  // fail, not crash
+    roaring_bitmap_free(b2);
+    roaring_bitmap_free(b1);
+    roaring_free(data);
+}
+
+DEFINE_TEST(issue436) {  // portable serialization round-trip must be lossless
+    roaring_bitmap_t *b1 = roaring_bitmap_create();
+    roaring_bitmap_add_range_closed(b1, 19711, 262068);
+    for (int i = 0; i < 0x10000; i += 2) {  // every even value below 65536
+        roaring_bitmap_add(b1, i);
+    }
+    roaring_bitmap_printf_describe(b1);  // diagnostic output only
+    roaring_bitmap_remove_range_closed(b1, 6143, 65505);
+    size_t len = roaring_bitmap_portable_size_in_bytes(b1);
+    char *data = (char *)roaring_malloc(len);  // cast needed when built as C++
+    roaring_bitmap_portable_serialize(b1, data);
+    roaring_bitmap_t *b2 = roaring_bitmap_portable_deserialize_safe(data, len);
+    assert_true(b2 != NULL && roaring_bitmap_equals(b1, b2));  // fail, not crash
+    roaring_bitmap_free(b2);
+    roaring_bitmap_free(b1);
+    roaring_free(data);
+}
+
+DEFINE_TEST(issue440) {  // contains_range across a gap between two ranges
+    roaring_bitmap_t *b1 = roaring_bitmap_create();
+    roaring_bitmap_add_range_closed(b1, 0x20000, 0x2FFFF);
+    roaring_bitmap_add_range_closed(b1, 0, 0xFFFF);
+    uint32_t largest_item = 0x11000;  // lies in the gap 0x10000..0x1FFFF
+    assert_false(roaring_bitmap_contains_range(b1, 0, largest_item + 1));
+    assert_false(roaring_bitmap_contains(b1, largest_item));  // never added
+    roaring_bitmap_free(b1);
+}
 
 DEFINE_TEST(range_contains) {
     uint32_t end = 2073952257;
-    uint32_t start = end-2;
-    roaring_bitmap_t *bm = roaring_bitmap_from_range(start, end-1, 1);
-    roaring_bitmap_printf_describe(bm);printf("\n");
-    roaring_bitmap_contains_range(bm, start, end);
+    uint32_t start = end - 2;
+    roaring_bitmap_t *bm = roaring_bitmap_from_range(start, end - 1, 1);
+    roaring_bitmap_printf_describe(bm);
+    printf("\n");
+    assert_true(roaring_bitmap_contains_range(bm, start, end - 1));
+    assert_false(roaring_bitmap_contains_range(bm, start, end));
+    roaring_bitmap_free(bm);
+}
+
+DEFINE_TEST(contains_bulk) {  // checks contains_bulk against plain contains
+    roaring_bitmap_t *bm = roaring_bitmap_create();
+    roaring_bulk_context_t context = {0};  // reused by every probe below
+
+    // Ensure checking an empty bitmap is okay
+    assert_true(!roaring_bitmap_contains_bulk(bm, &context, 0));
+    assert_true(!roaring_bitmap_contains_bulk(bm, &context, 0xFFFFFFFF));
+
+    // create RLE container from [0, 1000]
+    roaring_bitmap_add_range_closed(bm, 0, 1000);
+
+    // add array container from 77000 (NOTE(review): 5000 values; verify type)
+    for (uint32_t i = 77000; i < 87000; i += 2) {
+        roaring_bitmap_add(bm, i);
+    }
+    // add bitset container from 132000 (NOTE(review): 4000 values; verify type)
+    for (uint32_t i = 132000; i < 140000; i += 2) {
+        roaring_bitmap_add(bm, i);
+    }
+
+    roaring_bitmap_add(bm, UINT32_MAX);
+
+    uint32_t values[] = {  // trailing 1/0 = value was added / was not added
+        1000,            // 1
+        1001,            // 0
+        77000,           // 1
+        77001,           // 0
+        77002,           // 1
+        1002,            // 0
+        132000,          // 1
+        132001,          // 0
+        132002,          // 1
+        77003,           // 0
+        UINT32_MAX,      // 1
+        UINT32_MAX - 1,  // 0
+    };
+    size_t test_count = sizeof(values) / sizeof(values[0]);
+
+    for (size_t i = 0; i < test_count; i++) {
+        roaring_bulk_context_t empty_context = {0};  // fresh context per probe
+        bool expected_contains = roaring_bitmap_contains(bm, values[i]);
+        assert_true(expected_contains == roaring_bitmap_contains_bulk(
+                                             bm, &empty_context, values[i]));
+        assert_true(expected_contains ==
+                    roaring_bitmap_contains_bulk(bm, &context, values[i]));
+
+        if (expected_contains) {  // on a hit, key == value's high 16 bits
+            assert_int_equal(context.key, values[i] >> 16);
+        }
+        if (context.container != NULL) {  // cross-check context vs. the bitmap
+            assert_in_range(context.idx, 0, bm->high_low_container.size - 1);
+            assert_ptr_equal(context.container,
+                             bm->high_low_container.containers[context.idx]);
+            assert_int_equal(context.key,
+                             bm->high_low_container.keys[context.idx]);
+            assert_int_equal(context.typecode,
+                             bm->high_low_container.typecodes[context.idx]);
+        }
+    }
     roaring_bitmap_free(bm);
 }
 
@@ -65,51 +206,48 @@ DEFINE_TEST(is_really_empty) {
 }
 
 DEFINE_TEST(inplaceorwide) {
-  uint64_t end = 4294901761;
-  roaring_bitmap_t *r1 = roaring_bitmap_from_range(0,1,1);
-  roaring_bitmap_t *r2 = roaring_bitmap_from_range(0,end,1);
-  roaring_bitmap_or_inplace(r1, r2);
-  assert_true(roaring_bitmap_get_cardinality(r1) == end);
-  roaring_bitmap_free(r1);
-  roaring_bitmap_free(r2);
+    uint64_t end = 4294901761;
+    roaring_bitmap_t *r1 = roaring_bitmap_from_range(0, 1, 1);
+    roaring_bitmap_t *r2 = roaring_bitmap_from_range(0, end, 1);
+    roaring_bitmap_or_inplace(r1, r2);
+    assert_true(roaring_bitmap_get_cardinality(r1) == end);
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
 }
 
 void can_copy_empty(bool copy_on_write) {
     roaring_bitmap_t *bm1 = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(bm1, copy_on_write);
     roaring_bitmap_t *bm2 = roaring_bitmap_copy(bm1);
-    assert(roaring_bitmap_get_cardinality(bm1) == 0);
-    assert(roaring_bitmap_get_cardinality(bm2) == 0);
-    assert(roaring_bitmap_is_empty(bm1));
-    assert(roaring_bitmap_is_empty(bm2));
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 0);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 0);
+    assert_true(roaring_bitmap_is_empty(bm1));
+    assert_true(roaring_bitmap_is_empty(bm2));
     roaring_bitmap_add(bm1, 3);
     roaring_bitmap_add(bm2, 5);
-    assert(roaring_bitmap_get_cardinality(bm1) == 1);
-    assert(roaring_bitmap_get_cardinality(bm2) == 1);
-    assert(roaring_bitmap_contains(bm1,3));
-    assert(roaring_bitmap_contains(bm2,5));
-    assert(!roaring_bitmap_contains(bm2,3));
-    assert(!roaring_bitmap_contains(bm1,5));
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 1);
+    assert_true(roaring_bitmap_contains(bm1, 3));
+    assert_true(roaring_bitmap_contains(bm2, 5));
+    assert_true(!roaring_bitmap_contains(bm2, 3));
+    assert_true(!roaring_bitmap_contains(bm1, 5));
     roaring_bitmap_free(bm1);
     roaring_bitmap_free(bm2);
 }
 
-
-
-
-
 bool check_serialization(roaring_bitmap_t *bitmap) {
-    const int32_t size = roaring_bitmap_portable_size_in_bytes(bitmap);
+    const size_t size = roaring_bitmap_portable_size_in_bytes(bitmap);
     char *data = (char *)malloc(size);
     roaring_bitmap_portable_serialize(bitmap, data);
-    roaring_bitmap_t *deserializedBitmap = roaring_bitmap_portable_deserialize(data);
+    roaring_bitmap_t *deserializedBitmap =
+        roaring_bitmap_portable_deserialize(data);
     bool ret = roaring_bitmap_equals(bitmap, deserializedBitmap);
     roaring_bitmap_free(deserializedBitmap);
     free(data);
     return ret;
 }
 
-
+#if !CROARING_IS_BIG_ENDIAN
 DEFINE_TEST(issue245) {
     roaring_bitmap_t *bitmap = roaring_bitmap_create();
     const uint32_t targetEntries = 2048;
@@ -138,14 +276,15 @@ DEFINE_TEST(issue245) {
     }
     roaring_bitmap_free(bitmap);
 }
+#endif
 
 DEFINE_TEST(issue208) {
     roaring_bitmap_t *r = roaring_bitmap_create();
-    for (uint32_t i = 1; i < 8194; i+=2) {
+    for (uint32_t i = 1; i < 8194; i += 2) {
         roaring_bitmap_add(r, i);
     }
     uint32_t rank = roaring_bitmap_rank(r, 63);
-    assert(rank == 32);
+    assert_true(rank == 32);
     roaring_bitmap_free(r);
 }
 
@@ -154,13 +293,13 @@ DEFINE_TEST(issue208b) {
     for (uint32_t i = 65536 - 64; i < 65536; i++) {
         roaring_bitmap_add(r, i);
     }
-    for (uint32_t i = 0; i < 8196; i+=2) {
+    for (uint32_t i = 0; i < 8196; i += 2) {
         roaring_bitmap_add(r, i);
     }
     for (uint32_t i = 65536 - 64; i < 65536; i++) {
         uint32_t expected = i - (65536 - 64) + 8196 / 2 + 1;
         uint32_t rank = roaring_bitmap_rank(r, i);
-        assert(rank == expected);
+        assert_true(rank == expected);
     }
     roaring_bitmap_free(r);
 }
@@ -180,62 +319,61 @@ DEFINE_TEST(issue288) {
     roaring_bitmap_free(r2);
 }
 
-DEFINE_TEST(can_copy_empty_true) {
-  can_copy_empty(true);
-}
+DEFINE_TEST(can_copy_empty_true) { can_copy_empty(true); }
 
-DEFINE_TEST(can_copy_empty_false) {
-  can_copy_empty(false);
-}
+DEFINE_TEST(can_copy_empty_false) { can_copy_empty(false); }
 
 void can_add_to_copies(bool copy_on_write) {
     roaring_bitmap_t *bm1 = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(bm1, copy_on_write);
     roaring_bitmap_add(bm1, 3);
     roaring_bitmap_t *bm2 = roaring_bitmap_copy(bm1);
-    assert(roaring_bitmap_get_cardinality(bm1) == 1);
-    assert(roaring_bitmap_get_cardinality(bm2) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 1);
     roaring_bitmap_add(bm2, 4);
     roaring_bitmap_add(bm1, 5);
-    assert(roaring_bitmap_get_cardinality(bm1) == 2);
-    assert(roaring_bitmap_get_cardinality(bm2) == 2);
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 2);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 2);
     roaring_bitmap_free(bm1);
     roaring_bitmap_free(bm2);
 }
 
-void convert_all_containers(roaring_bitmap_t* r, uint8_t dst_type) {
+void convert_all_containers(roaring_bitmap_t *r, uint8_t dst_type) {
     for (int32_t i = 0; i < r->high_low_container.size; i++) {
         // first step: convert src_type to ARRAY
         if (r->high_low_container.typecodes[i] == BITSET_CONTAINER_TYPE) {
-            array_container_t* dst_container = array_container_from_bitset(
-                    CAST_bitset(r->high_low_container.containers[i]));
-            bitset_container_free(CAST_bitset(r->high_low_container.containers[i]));
+            array_container_t *dst_container = array_container_from_bitset(
+                CAST_bitset(r->high_low_container.containers[i]));
+            bitset_container_free(
+                CAST_bitset(r->high_low_container.containers[i]));
             r->high_low_container.containers[i] = dst_container;
             r->high_low_container.typecodes[i] = ARRAY_CONTAINER_TYPE;
         } else if (r->high_low_container.typecodes[i] == RUN_CONTAINER_TYPE) {
-            array_container_t* dst_container = array_container_from_run(
-                    CAST_run(r->high_low_container.containers[i]));
+            array_container_t *dst_container = array_container_from_run(
+                CAST_run(r->high_low_container.containers[i]));
             run_container_free(CAST_run(r->high_low_container.containers[i]));
             r->high_low_container.containers[i] = dst_container;
             r->high_low_container.typecodes[i] = ARRAY_CONTAINER_TYPE;
         }
-        assert(r->high_low_container.typecodes[i] == ARRAY_CONTAINER_TYPE);
+        assert_true(r->high_low_container.typecodes[i] == ARRAY_CONTAINER_TYPE);
 
         // second step: convert ARRAY to dst_type
         if (dst_type == BITSET_CONTAINER_TYPE) {
-            bitset_container_t* dst_container = bitset_container_from_array(
-                    CAST_array(r->high_low_container.containers[i]));
-            array_container_free(CAST_array(r->high_low_container.containers[i]));
+            bitset_container_t *dst_container = bitset_container_from_array(
+                CAST_array(r->high_low_container.containers[i]));
+            array_container_free(
+                CAST_array(r->high_low_container.containers[i]));
             r->high_low_container.containers[i] = dst_container;
             r->high_low_container.typecodes[i] = BITSET_CONTAINER_TYPE;
         } else if (dst_type == RUN_CONTAINER_TYPE) {
-            run_container_t* dst_container = run_container_from_array(
-                    CAST_array(r->high_low_container.containers[i]));
-            array_container_free(CAST_array(r->high_low_container.containers[i]));
+            run_container_t *dst_container = run_container_from_array(
+                CAST_array(r->high_low_container.containers[i]));
+            array_container_free(
+                CAST_array(r->high_low_container.containers[i]));
             r->high_low_container.containers[i] = dst_container;
             r->high_low_container.typecodes[i] = RUN_CONTAINER_TYPE;
         }
-        assert(r->high_low_container.typecodes[i] == dst_type);
+        assert_true(r->high_low_container.typecodes[i] == dst_type);
     }
 }
 
@@ -248,15 +386,15 @@ struct sbs_s {
 
     // reference implementation
     uint64_t *words;
-    uint32_t size; // number of words
+    uint32_t size;  // number of words
 };
 typedef struct sbs_s sbs_t;
 
 sbs_t *sbs_create(void) {
-    sbs_t *sbs = (sbs_t*)malloc(sizeof(sbs_t));
+    sbs_t *sbs = (sbs_t *)malloc(sizeof(sbs_t));
     sbs->roaring = roaring_bitmap_create();
     sbs->size = 1;
-    sbs->words = (uint64_t*)malloc(sbs->size * sizeof(uint64_t));
+    sbs->words = (uint64_t *)malloc(sbs->size * sizeof(uint64_t));
     for (uint32_t i = 0; i < sbs->size; i++) {
         sbs->words[i] = 0;
     }
@@ -264,38 +402,39 @@ sbs_t *sbs_create(void) {
 }
 
 void sbs_free(sbs_t *sbs) {
-  roaring_bitmap_free(sbs->roaring);
-  free(sbs->words);
-  free(sbs);
+    roaring_bitmap_free(sbs->roaring);
+    free(sbs->words);
+    free(sbs);
 }
 
 void sbs_convert(sbs_t *sbs, uint8_t code) {
-  convert_all_containers(sbs->roaring, code);
+    convert_all_containers(sbs->roaring, code);
 }
 
 void sbs_ensure_room(sbs_t *sbs, uint32_t v) {
-  uint32_t i = v / 64;
-  if (i >= sbs->size) {
-    uint32_t new_size = (i+1) * 3 / 2;
-    sbs->words = (uint64_t*)realloc(sbs->words, new_size*sizeof(uint64_t));
-    for (uint32_t j = sbs->size; j < new_size; j++) {
-      sbs->words[j] = 0;
+    uint32_t i = v / 64;
+    if (i >= sbs->size) {
+        uint32_t new_size = (i + 1) * 3 / 2;
+        sbs->words =
+            (uint64_t *)realloc(sbs->words, new_size * sizeof(uint64_t));
+        for (uint32_t j = sbs->size; j < new_size; j++) {
+            sbs->words[j] = 0;
+        }
+        sbs->size = new_size;
     }
-    sbs->size = new_size;
-  }
 }
 
 void sbs_add_value(sbs_t *sbs, uint32_t v) {
     roaring_bitmap_add(sbs->roaring, v);
 
     sbs_ensure_room(sbs, v);
-    sbs->words[v/64] |= UINT64_C(1) << (v % 64);
+    sbs->words[v / 64] |= UINT64_C(1) << (v % 64);
 }
 
 void sbs_add_range(sbs_t *sbs, uint64_t min, uint64_t max) {
     sbs_ensure_room(sbs, max);
     for (uint64_t v = min; v <= max; v++) {
-        sbs->words[v/64] |= UINT64_C(1) << (v % 64);
+        sbs->words[v / 64] |= UINT64_C(1) << (v % 64);
     }
 
     roaring_bitmap_add_range(sbs->roaring, min, max + 1);
@@ -304,7 +443,7 @@ void sbs_add_range(sbs_t *sbs, uint64_t min, uint64_t max) {
 void sbs_remove_range(sbs_t *sbs, uint64_t min, uint64_t max) {
     sbs_ensure_room(sbs, max);
     for (uint64_t v = min; v <= max; v++) {
-        sbs->words[v/64] &= ~(UINT64_C(1) << (v % 64));
+        sbs->words[v / 64] &= ~(UINT64_C(1) << (v % 64));
     }
 
     roaring_bitmap_remove_range(sbs->roaring, min, max + 1);
@@ -314,7 +453,7 @@ void sbs_remove_many(sbs_t *sbs, size_t n_args, uint32_t *vals) {
     for (size_t i = 0; i < n_args; i++) {
         uint32_t v = vals[i];
         sbs_ensure_room(sbs, v);
-        sbs->words[v/64] &= ~(UINT64_C(1) << (v % 64));
+        sbs->words[v / 64] &= ~(UINT64_C(1) << (v % 64));
     }
     roaring_bitmap_remove_many(sbs->roaring, n_args, vals);
 }
@@ -322,13 +461,14 @@ void sbs_remove_many(sbs_t *sbs, size_t n_args, uint32_t *vals) {
 bool sbs_check_type(sbs_t *sbs, uint8_t type) {
     bool answer = true;
     for (int32_t i = 0; i < sbs->roaring->high_low_container.size; i++) {
-        answer = answer && (sbs->roaring->high_low_container.typecodes[i] == type);
+        answer =
+            answer && (sbs->roaring->high_low_container.typecodes[i] == type);
     }
     return answer;
 }
 
 bool sbs_is_empty(sbs_t *sbs) {
-  return sbs->roaring->high_low_container.size == 0;
+    return sbs->roaring->high_low_container.size == 0;
 }
 
 void sbs_compare(sbs_t *sbs) {
@@ -341,24 +481,24 @@ void sbs_compare(sbs_t *sbs) {
         }
     }
     uint32_t *expected_values =
-            (uint32_t*)malloc(expected_cardinality * sizeof(uint32_t));
+        (uint32_t *)malloc(expected_cardinality * sizeof(uint32_t));
     memset(expected_values, 0, expected_cardinality * sizeof(uint32_t));
     for (uint32_t i = 0, dst = 0; i < sbs->size; i++) {
         for (uint32_t j = 0; j < 64; j++) {
             if ((sbs->words[i] & (UINT64_C(1) << j)) != 0) {
-                expected_values[dst++] = i*64 + j;
+                expected_values[dst++] = i * 64 + j;
             }
         }
     }
 
     uint32_t actual_cardinality = roaring_bitmap_get_cardinality(sbs->roaring);
     uint32_t *actual_values =
-            (uint32_t*)malloc(actual_cardinality * sizeof(uint32_t));
+        (uint32_t *)malloc(actual_cardinality * sizeof(uint32_t));
     memset(actual_values, 0, actual_cardinality * sizeof(uint32_t));
     roaring_bitmap_to_uint32_array(sbs->roaring, actual_values);
 
-    bool ok = array_equals(actual_values, actual_cardinality,
-                           expected_values, expected_cardinality);
+    bool ok = array_equals(actual_values, actual_cardinality, expected_values,
+                           expected_cardinality);
     if (!ok) {
         printf("Expected: ");
         for (uint32_t i = 0; i < expected_cardinality; i++) {
@@ -396,6 +536,13 @@ DEFINE_TEST(test_stats) {
     roaring_bitmap_free(r1);
 }
 
+DEFINE_TEST(with_huge_capacity) {  // expects allocation capped at 1 << 16
+    roaring_bitmap_t *r = roaring_bitmap_create_with_capacity(UINT32_MAX);
+    assert_non_null(r);  // creation must succeed despite the huge request
+    assert_int_equal(r->high_low_container.allocation_size, (1 << 16));
+    roaring_bitmap_free(r);
+}
+
 // this should expose memory leaks
 // (https://github.com/RoaringBitmap/CRoaring/pull/70)
 void leaks_with_empty(bool copy_on_write) {
@@ -433,61 +580,59 @@ DEFINE_TEST(check_interval) {
 
     roaring_bitmap_printf(r);
 
-
-    roaring_bitmap_t *range = roaring_bitmap_from_range(10, 1000+1, 1);
+    roaring_bitmap_t *range = roaring_bitmap_from_range(10, 1000 + 1, 1);
     assert_non_null(range);
-    assert_true(roaring_bitmap_intersect(r,range));
+    assert_true(roaring_bitmap_intersect(r, range));
     roaring_bitmap_t *range2 = roaring_bitmap_from_range(10, 1000, 1);
     assert_non_null(range2);
-    assert_false(roaring_bitmap_intersect(r,range2));
+    assert_false(roaring_bitmap_intersect(r, range2));
 
-    assert_true(roaring_bitmap_intersect_with_range(r, 10, 1000+1));
+    assert_true(roaring_bitmap_intersect_with_range(r, 10, 1000 + 1));
     assert_false(roaring_bitmap_intersect_with_range(r, 10, 1000));
 
     roaring_bitmap_free(r);
     roaring_bitmap_free(range);
     roaring_bitmap_free(range2);
-
 }
 
 DEFINE_TEST(check_full_inplace_flip) {
-  roaring_bitmap_t *r1 = roaring_bitmap_create();
-  uint64_t bignumber = UINT64_C(0x100000000);
-  roaring_bitmap_flip_inplace(r1, 0, bignumber);
-  assert_true(roaring_bitmap_get_cardinality(r1) == bignumber);
-  roaring_bitmap_free(r1);
+    roaring_bitmap_t *r1 = roaring_bitmap_create();
+    uint64_t bignumber = UINT64_C(0x100000000);
+    roaring_bitmap_flip_inplace(r1, 0, bignumber);
+    assert_true(roaring_bitmap_get_cardinality(r1) == bignumber);
+    roaring_bitmap_free(r1);
 }
 
 DEFINE_TEST(check_iterate_to_end) {
-  uint64_t bignumber = UINT64_C(0x100000000);
-  for(uint64_t s = 0; s < 1024; s++) {
-    roaring_bitmap_t *r1 = roaring_bitmap_create();
-    roaring_bitmap_flip_inplace(r1, bignumber - s, bignumber);
-    roaring_uint32_iterator_t iterator;
-    roaring_init_iterator(r1, &iterator);
-    uint64_t count = 0;
-    while(iterator.has_value) {
-      assert(iterator.current_value + (s - count) == bignumber);
-      count++;
-      roaring_advance_uint32_iterator(&iterator);
-    }
-    assert_true(count == s);
-    assert_true(roaring_bitmap_get_cardinality(r1) == s);
-    roaring_bitmap_free(r1);
-  }
+    uint64_t bignumber = UINT64_C(0x100000000);
+    for (uint64_t s = 0; s < 1024; s++) {
+        roaring_bitmap_t *r1 = roaring_bitmap_create();
+        roaring_bitmap_flip_inplace(r1, bignumber - s, bignumber);
+        roaring_uint32_iterator_t iterator;
+        roaring_init_iterator(r1, &iterator);
+        uint64_t count = 0;
+        while (iterator.has_value) {
+            assert_true(iterator.current_value + (s - count) == bignumber);
+            count++;
+            roaring_advance_uint32_iterator(&iterator);
+        }
+        assert_true(count == s);
+        assert_true(roaring_bitmap_get_cardinality(r1) == s);
+        roaring_bitmap_free(r1);
+    }
 }
 
 DEFINE_TEST(check_iterate_to_beginning) {
     uint64_t bignumber = UINT64_C(0x100000000);
-    for(uint64_t s = 0; s < 1024; s++) {
+    for (uint64_t s = 0; s < 1024; s++) {
         roaring_bitmap_t *r1 = roaring_bitmap_create();
         roaring_bitmap_flip_inplace(r1, bignumber - s, bignumber);
         roaring_uint32_iterator_t iterator;
         roaring_init_iterator_last(r1, &iterator);
         uint64_t count = 0;
-        while(iterator.has_value) {
+        while (iterator.has_value) {
             count++;
-            assert(iterator.current_value + count == bignumber);
+            assert_true(iterator.current_value + count == bignumber);
             roaring_previous_uint32_iterator(&iterator);
         }
         assert_true(count == s);
@@ -497,70 +642,72 @@ DEFINE_TEST(check_iterate_to_beginning) {
 }
 
 DEFINE_TEST(check_range_contains_from_end) {
-  uint64_t bignumber = UINT64_C(0x100000000);
-  for(uint64_t s = 0; s <  1024 * 1024; s++) {
-    roaring_bitmap_t *r1 = roaring_bitmap_create();
-    roaring_bitmap_add_range(r1, bignumber - s, bignumber);
-    assert_true(roaring_bitmap_get_cardinality(r1) == s);
-    if(s>0) {
-      assert_true(roaring_bitmap_contains_range(r1, bignumber - s, bignumber - 1));
-    }
-    assert_true(roaring_bitmap_contains_range(r1, bignumber - s, bignumber));
-    assert_false(roaring_bitmap_contains_range(r1, bignumber - s - 1, bignumber));
-    assert_true(roaring_bitmap_get_cardinality(r1) == s);
-    roaring_bitmap_free(r1);
-  }
+    uint64_t bignumber = UINT64_C(0x100000000);
+    for (uint64_t s = 0; s < 1024 * 1024; s++) {
+        roaring_bitmap_t *r1 = roaring_bitmap_create();
+        roaring_bitmap_add_range(r1, bignumber - s, bignumber);
+        assert_true(roaring_bitmap_get_cardinality(r1) == s);
+        if (s > 0) {
+            assert_true(roaring_bitmap_contains_range(r1, bignumber - s,
+                                                      bignumber - 1));
+        }
+        assert_true(
+            roaring_bitmap_contains_range(r1, bignumber - s, bignumber));
+        assert_false(
+            roaring_bitmap_contains_range(r1, bignumber - s - 1, bignumber));
+        assert_true(roaring_bitmap_get_cardinality(r1) == s);
+        roaring_bitmap_free(r1);
+    }
 }
 
 DEFINE_TEST(check_full_flip) {
-  roaring_bitmap_t *rorg = roaring_bitmap_create();
-  uint64_t bignumber = UINT64_C(0x100000000);
-  roaring_bitmap_t *r1 = roaring_bitmap_flip(rorg, 0, bignumber);
-  assert_true(roaring_bitmap_get_cardinality(r1) == bignumber);
-  roaring_bitmap_free(r1);
-  roaring_bitmap_free(rorg);
+    roaring_bitmap_t *rorg = roaring_bitmap_create();
+    uint64_t bignumber = UINT64_C(0x100000000);
+    roaring_bitmap_t *r1 = roaring_bitmap_flip(rorg, 0, bignumber);
+    assert_true(roaring_bitmap_get_cardinality(r1) == bignumber);
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(rorg);
 }
 
 void test_stress_memory(bool copy_on_write) {
-	for (size_t i = 0; i < 5; i++) {
-		roaring_bitmap_t *r1 = roaring_bitmap_create();
-    roaring_bitmap_set_copy_on_write(r1, copy_on_write);
-		assert_non_null(r1);
-		for (size_t k = 0; k < 1000000; k++) {
-			uint32_t j = rand() % (100000000);
-			roaring_bitmap_add(r1, j);
-		}
-		roaring_bitmap_run_optimize(r1);
-		uint32_t compact_size = roaring_bitmap_portable_size_in_bytes(r1);
-		char * serializedbytes = (char *) malloc(compact_size);
-		size_t actualsize = roaring_bitmap_portable_serialize(r1, serializedbytes);
-		assert_int_equal(actualsize, compact_size);
-    roaring_bitmap_t *t = roaring_bitmap_portable_deserialize(serializedbytes);
-    assert_true(roaring_bitmap_equals(r1, t));
-    roaring_bitmap_free(t);
-		free(serializedbytes);
-		roaring_bitmap_free(r1);
-	}
-}
-
-DEFINE_TEST(test_stress_memory_true) {
-  test_stress_memory(true);
+    for (size_t i = 0; i < 5; i++) {
+        roaring_bitmap_t *r1 = roaring_bitmap_create();
+        roaring_bitmap_set_copy_on_write(r1, copy_on_write);
+        assert_non_null(r1);
+        for (size_t k = 0; k < 1000000; k++) {
+            uint32_t j = rand() % (100000000);
+            roaring_bitmap_add(r1, j);
+        }
+        roaring_bitmap_run_optimize(r1);
+        uint32_t compact_size = roaring_bitmap_portable_size_in_bytes(r1);
+        char *serializedbytes = (char *)malloc(compact_size);
+        size_t actualsize =
+            roaring_bitmap_portable_serialize(r1, serializedbytes);
+        assert_int_equal(actualsize, compact_size);
+        roaring_bitmap_t *t =
+            roaring_bitmap_portable_deserialize(serializedbytes);
+        assert_true(roaring_bitmap_equals(r1, t));
+        roaring_bitmap_free(t);
+        free(serializedbytes);
+        roaring_bitmap_free(r1);
+    }
 }
 
-DEFINE_TEST(test_stress_memory_false) {
-  test_stress_memory(false);
-}
+DEFINE_TEST(test_stress_memory_true) { test_stress_memory(true); }
 
+DEFINE_TEST(test_stress_memory_false) { test_stress_memory(false); }
 
 void test_example(bool copy_on_write) {
     // create a new empty bitmap
     roaring_bitmap_t *r1 = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(r1, copy_on_write);
+    assert_bitmap_validate(r1);
     assert_non_null(r1);
 
     // then we can add values
     for (uint32_t i = 100; i < 1000; i++) {
         roaring_bitmap_add(r1, i);
+        assert_bitmap_validate(r1);
     }
 
     // check whether a value is contained
@@ -574,6 +721,7 @@ void test_example(bool copy_on_write) {
     // run_optimize
     uint32_t size = roaring_bitmap_portable_size_in_bytes(r1);
     roaring_bitmap_run_optimize(r1);
+    assert_bitmap_validate(r1);
     uint32_t compact_size = roaring_bitmap_portable_size_in_bytes(r1);
 
     printf("size before run optimize %d bytes, and after %d bytes\n", size,
@@ -581,6 +729,7 @@ void test_example(bool copy_on_write) {
 
     // create a new bitmap with varargs
     roaring_bitmap_t *r2 = roaring_bitmap_of(5, 1, 2, 3, 5, 6);
+    assert_bitmap_validate(r2);
     assert_non_null(r2);
 
     roaring_bitmap_printf(r2);
@@ -589,23 +738,24 @@ void test_example(bool copy_on_write) {
     const uint32_t values[] = {2, 3, 4};
     roaring_bitmap_t *r3 = roaring_bitmap_of_ptr(3, values);
     roaring_bitmap_set_copy_on_write(r3, copy_on_write);
+    assert_bitmap_validate(r3);
 
     // we can also go in reverse and go from arrays to bitmaps
     uint64_t card1 = roaring_bitmap_get_cardinality(r1);
     uint32_t *arr1 = (uint32_t *)malloc(card1 * sizeof(uint32_t));
-    assert(arr1 != NULL);
+    assert_true(arr1 != NULL);
     roaring_bitmap_to_uint32_array(r1, arr1);
 
     // we can go from arrays to bitmaps from "offset" by "limit"
     size_t offset = 100;
     size_t limit = 1000;
     uint32_t *arr3 = (uint32_t *)malloc(limit * sizeof(uint32_t));
-    assert(arr3 != NULL);
+    assert_true(arr3 != NULL);
     roaring_bitmap_range_uint32_array(r1, offset, limit, arr3);
     free(arr3);
 
-
     roaring_bitmap_t *r1f = roaring_bitmap_of_ptr(card1, arr1);
+    assert_bitmap_validate(r1f);
     free(arr1);
     assert_non_null(r1f);
 
@@ -616,12 +766,14 @@ void test_example(bool copy_on_write) {
     // we can copy and compare bitmaps
     roaring_bitmap_t *z = roaring_bitmap_copy(r3);
     roaring_bitmap_set_copy_on_write(z, copy_on_write);
+    assert_bitmap_validate(z);
     assert_true(roaring_bitmap_equals(r3, z));
 
     roaring_bitmap_free(z);
 
     // we can compute union two-by-two
     roaring_bitmap_t *r1_2_3 = roaring_bitmap_or(r1, r2);
+    assert_bitmap_validate(r1_2_3);
     assert_true(roaring_bitmap_get_cardinality(r1_2_3) ==
                 roaring_bitmap_or_cardinality(r1, r2));
 
@@ -631,9 +783,11 @@ void test_example(bool copy_on_write) {
     // we can compute a big union
     const roaring_bitmap_t *allmybitmaps[] = {r1, r2, r3};
     roaring_bitmap_t *bigunion = roaring_bitmap_or_many(3, allmybitmaps);
+    assert_bitmap_validate(bigunion);
     assert_true(roaring_bitmap_equals(r1_2_3, bigunion));
     roaring_bitmap_t *bigunionheap =
         roaring_bitmap_or_many_heap(3, allmybitmaps);
+    assert_bitmap_validate(bigunionheap);
     assert_true(roaring_bitmap_equals(r1_2_3, bigunionheap));
     roaring_bitmap_free(r1_2_3);
     roaring_bitmap_free(bigunion);
@@ -642,11 +796,13 @@ void test_example(bool copy_on_write) {
     // we can compute xor two-by-two
     roaring_bitmap_t *rx1_2_3 = roaring_bitmap_xor(r1, r2);
     roaring_bitmap_set_copy_on_write(rx1_2_3, copy_on_write);
+    assert_bitmap_validate(rx1_2_3);
     roaring_bitmap_xor_inplace(rx1_2_3, r3);
 
     // we can compute a big xor
     const roaring_bitmap_t *allmybitmaps_x[] = {r1, r2, r3};
     roaring_bitmap_t *bigxor = roaring_bitmap_xor_many(3, allmybitmaps_x);
+    assert_bitmap_validate(bigxor);
     assert_true(roaring_bitmap_equals(rx1_2_3, bigxor));
 
     roaring_bitmap_free(rx1_2_3);
@@ -654,6 +810,7 @@ void test_example(bool copy_on_write) {
 
     // we can compute intersection two-by-two
     roaring_bitmap_t *i1_2 = roaring_bitmap_and(r1, r2);
+    assert_bitmap_validate(i1_2);
     assert_true(roaring_bitmap_get_cardinality(i1_2) ==
                 roaring_bitmap_and_cardinality(r1, r2));
 
@@ -661,18 +818,24 @@ void test_example(bool copy_on_write) {
 
     // we can write a bitmap to a pointer and recover it later
     uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
-    char *serializedbytes = (char*)malloc(expectedsize);
+    char *serializedbytes = (char *)malloc(expectedsize);
     size_t actualsize = roaring_bitmap_portable_serialize(r1, serializedbytes);
     assert_int_equal(actualsize, expectedsize);
     roaring_bitmap_t *t = roaring_bitmap_portable_deserialize(serializedbytes);
+    assert_bitmap_validate(t);
     assert_true(roaring_bitmap_equals(r1, t));
     roaring_bitmap_free(t);
-     // we can also check whether there is a bitmap at a memory location without reading it
-    size_t sizeofbitmap = roaring_bitmap_portable_deserialize_size(serializedbytes,expectedsize);
-    assert(sizeofbitmap == expectedsize);  // sizeofbitmap would be zero if no bitmap were found
+    // we can also check whether there is a bitmap at a memory location without
+    // reading it
+    size_t sizeofbitmap =
+        roaring_bitmap_portable_deserialize_size(serializedbytes, expectedsize);
+    assert_true(
+        sizeofbitmap ==
+        expectedsize);  // sizeofbitmap would be zero if no bitmap were found
     // we can also read the bitmap "safely" by specifying a byte size limit:
-    t = roaring_bitmap_portable_deserialize_safe(serializedbytes,expectedsize);
-    assert(roaring_bitmap_equals(r1, t));  // what we recover is equal
+    t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize);
+    assert_bitmap_validate(t);
+    assert_true(roaring_bitmap_equals(r1, t));  // what we recover is equal
     roaring_bitmap_free(t);
     free(serializedbytes);
 
@@ -698,18 +861,17 @@ void test_example(bool copy_on_write) {
     roaring_free_uint32_iterator(i);
     assert_true(roaring_bitmap_get_cardinality(r1) == counter);
 
-
     // for greater speed, you can iterate over the data in bulk
     i = roaring_create_iterator(r1);
     uint32_t buffer[256];
     while (1) {
-      uint32_t ret = roaring_read_uint32_iterator(i, buffer, 256);
-      for (uint32_t j = 0; j < ret; j++) {
-             counter += buffer[j];
-      }
-      if (ret < 256) {
-             break;
-     }
+        uint32_t ret = roaring_read_uint32_iterator(i, buffer, 256);
+        for (uint32_t j = 0; j < ret; j++) {
+            counter += buffer[j];
+        }
+        if (ret < 256) {
+            break;
+        }
     }
     roaring_free_uint32_iterator(i);
 
@@ -735,12 +897,13 @@ void test_uint32_iterator(bool run) {
     for (uint32_t i = 800000; i < 900000; i += 7) {
         roaring_bitmap_add(r1, i);
     }
-    if(run) roaring_bitmap_run_optimize(r1);
+    if (run) roaring_bitmap_run_optimize(r1);
+    assert_bitmap_validate(r1);
     roaring_uint32_iterator_t *iter = roaring_create_iterator(r1);
     for (uint32_t i = 0; i < 66000; i += 3) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i);
+        roaring_move_uint32_iterator_equalorlarger(iter, i);
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
         roaring_advance_uint32_iterator(iter);
@@ -748,7 +911,7 @@ void test_uint32_iterator(bool run) {
     for (uint32_t i = 100000; i < 200000; i++) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i);
+        roaring_move_uint32_iterator_equalorlarger(iter, i);
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
         roaring_advance_uint32_iterator(iter);
@@ -756,7 +919,7 @@ void test_uint32_iterator(bool run) {
     for (uint32_t i = 300000; i < 500000; i += 100) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i);
+        roaring_move_uint32_iterator_equalorlarger(iter, i);
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
         roaring_advance_uint32_iterator(iter);
@@ -764,7 +927,7 @@ void test_uint32_iterator(bool run) {
     for (uint32_t i = 600000; i < 700000; i += 1) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i);
+        roaring_move_uint32_iterator_equalorlarger(iter, i);
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
         roaring_advance_uint32_iterator(iter);
@@ -772,66 +935,66 @@ void test_uint32_iterator(bool run) {
     for (uint32_t i = 800000; i < 900000; i += 7) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i);
+        roaring_move_uint32_iterator_equalorlarger(iter, i);
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
         roaring_advance_uint32_iterator(iter);
     }
     assert_false(iter->has_value);
-    roaring_move_uint32_iterator_equalorlarger(iter,0);
+    roaring_move_uint32_iterator_equalorlarger(iter, 0);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 0);
-    roaring_move_uint32_iterator_equalorlarger(iter,66000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 66000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 100000);
-    roaring_move_uint32_iterator_equalorlarger(iter,100000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 100000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 100000);
-    roaring_move_uint32_iterator_equalorlarger(iter,200000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 200000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 300000);
-    roaring_move_uint32_iterator_equalorlarger(iter,300000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 300000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 300000);
-    roaring_move_uint32_iterator_equalorlarger(iter,500000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 500000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 600000);
-    roaring_move_uint32_iterator_equalorlarger(iter,600000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 600000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 600000);
-    roaring_move_uint32_iterator_equalorlarger(iter,700000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 700000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 800000);
-    roaring_move_uint32_iterator_equalorlarger(iter,800000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 800000);
     assert_true(iter->has_value);
     assert_true(iter->current_value == 800000);
-    roaring_move_uint32_iterator_equalorlarger(iter,900000);
+    roaring_move_uint32_iterator_equalorlarger(iter, 900000);
     assert_false(iter->has_value);
-    roaring_move_uint32_iterator_equalorlarger(iter,0);
+    roaring_move_uint32_iterator_equalorlarger(iter, 0);
     for (uint32_t i = 0; i < 66000; i += 3) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i+1);
+        roaring_move_uint32_iterator_equalorlarger(iter, i + 1);
     }
     for (uint32_t i = 100000; i < 200000; i++) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i+1);
+        roaring_move_uint32_iterator_equalorlarger(iter, i + 1);
     }
     for (uint32_t i = 300000; i < 500000; i += 100) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i+1);
+        roaring_move_uint32_iterator_equalorlarger(iter, i + 1);
     }
     for (uint32_t i = 600000; i < 700000; i += 1) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i+1);
+        roaring_move_uint32_iterator_equalorlarger(iter, i + 1);
     }
     for (uint32_t i = 800000; i < 900000; i += 7) {
         assert_true(iter->has_value);
         assert_true(iter->current_value == i);
-        roaring_move_uint32_iterator_equalorlarger(iter,i+1);
+        roaring_move_uint32_iterator_equalorlarger(iter, i + 1);
     }
     assert_false(iter->has_value);
 
@@ -852,19 +1015,19 @@ void can_remove_from_copies(bool copy_on_write) {
     roaring_bitmap_set_copy_on_write(bm1, copy_on_write);
     roaring_bitmap_add(bm1, 3);
     roaring_bitmap_t *bm2 = roaring_bitmap_copy(bm1);
-    assert(roaring_bitmap_get_cardinality(bm1) == 1);
-    assert(roaring_bitmap_get_cardinality(bm2) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 1);
     roaring_bitmap_add(bm2, 4);
     roaring_bitmap_add(bm1, 5);
-    assert(roaring_bitmap_get_cardinality(bm1) == 2);
-    assert(roaring_bitmap_get_cardinality(bm2) == 2);
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 2);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 2);
     roaring_bitmap_remove(bm1, 5);
-    assert(roaring_bitmap_get_cardinality(bm1) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 1);
     roaring_bitmap_remove(bm1, 4);
-    assert(roaring_bitmap_get_cardinality(bm1) == 1);
-    assert(roaring_bitmap_get_cardinality(bm2) == 2);
+    assert_true(roaring_bitmap_get_cardinality(bm1) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 2);
     roaring_bitmap_remove(bm2, 4);
-    assert(roaring_bitmap_get_cardinality(bm2) == 1);
+    assert_true(roaring_bitmap_get_cardinality(bm2) == 1);
     roaring_bitmap_free(bm1);
     roaring_bitmap_free(bm2);
 }
@@ -884,6 +1047,21 @@ DEFINE_TEST(test_addremove) {
     for (uint32_t value = 33057; value < 147849; value += 8) {
         roaring_bitmap_remove(bm, value);
     }
+    assert_bitmap_validate(bm);
+    assert_true(roaring_bitmap_is_empty(bm));
+    roaring_bitmap_free(bm);
+}
+
+DEFINE_TEST(test_addremove_bulk) {
+    roaring_bitmap_t *bm = roaring_bitmap_create();
+    roaring_bulk_context_t context = {0};  // zero-initialized, reused per add
+    for (uint32_t value = 33057; value < 147849; value += 8) {
+        roaring_bitmap_add_bulk(bm, &context, value);
+    }
+    for (uint32_t value = 33057; value < 147849; value += 8) {
+        assert_true(roaring_bitmap_remove_checked(bm, value));
+    }
+    assert_bitmap_validate(bm);  // checked after all removals, before free
     assert_true(roaring_bitmap_is_empty(bm));
     roaring_bitmap_free(bm);
 }
@@ -897,6 +1075,7 @@ DEFINE_TEST(test_addremoverun) {
     for (uint32_t value = 33057; value < 147849; value += 8) {
         roaring_bitmap_remove(bm, value);
     }
+    assert_bitmap_validate(bm);
     assert_true(roaring_bitmap_is_empty(bm));
     roaring_bitmap_free(bm);
 }
@@ -911,7 +1090,7 @@ DEFINE_TEST(test_clear) {
     size_t expected_card = 0;
     for (uint32_t value = 33057; value < 147849; value += 8) {
         roaring_bitmap_add(bm, value);
-        expected_card ++;
+        expected_card++;
     }
     assert_true(roaring_bitmap_get_cardinality(bm) == expected_card);
     roaring_bitmap_clear(bm);
@@ -919,7 +1098,6 @@ DEFINE_TEST(test_clear) {
     roaring_bitmap_free(bm);
 }
 
-
 DEFINE_TEST(test_remove_from_copies_true) { can_remove_from_copies(true); }
 
 DEFINE_TEST(test_remove_from_copies_false) { can_remove_from_copies(false); }
@@ -932,6 +1110,8 @@ bool check_bitmap_from_range(uint32_t min, uint64_t max, uint32_t step) {
     for (uint32_t value = min; value < max; value += step) {
         roaring_bitmap_add(expected, value);
     }
+    assert_bitmap_validate(result);
+    assert_bitmap_validate(expected);
     bool is_equal = roaring_bitmap_equals(expected, result);
     if (!is_equal) {
         fprintf(stderr, "[ERROR] check_bitmap_from_range(%u, %u, %u)\n",
@@ -947,13 +1127,17 @@ DEFINE_TEST(test_silly_range) {
     check_bitmap_from_range(0, 2, 1);
     roaring_bitmap_t *bm1 = roaring_bitmap_from_range(0, 1, 1);
     roaring_bitmap_t *bm2 = roaring_bitmap_from_range(0, 2, 1);
+    assert_bitmap_validate(bm1);
+    assert_bitmap_validate(bm2);
     assert_false(roaring_bitmap_equals(bm1, bm2));
     roaring_bitmap_free(bm1);
     roaring_bitmap_free(bm2);
 }
 
 DEFINE_TEST(test_adversarial_range) {
-    roaring_bitmap_t *bm1 = roaring_bitmap_from_range(0, UINT64_C(0x100000000), 1);
+    roaring_bitmap_t *bm1 =
+        roaring_bitmap_from_range(0, UINT64_C(0x100000000), 1);  // i.e. 2^32
+    assert_bitmap_validate(bm1);
     assert_true(roaring_bitmap_get_cardinality(bm1) == UINT64_C(0x100000000));
     roaring_bitmap_free(bm1);
 }
@@ -961,7 +1145,7 @@ DEFINE_TEST(test_adversarial_range) {
 DEFINE_TEST(test_range_and_serialize) {
     roaring_bitmap_t *old_bm = roaring_bitmap_from_range(65520, 131057, 16);
     size_t size = roaring_bitmap_portable_size_in_bytes(old_bm);
-    char *buff = (char*)malloc(size);
+    char *buff = (char *)malloc(size);
     size_t actualsize = roaring_bitmap_portable_serialize(old_bm, buff);
     assert_int_equal(actualsize, size);
     roaring_bitmap_t *new_bm = roaring_bitmap_portable_deserialize(buff);
@@ -1003,6 +1187,7 @@ DEFINE_TEST(test_bitmap_from_range) {
 DEFINE_TEST(test_printf) {
     roaring_bitmap_t *r1 =
         roaring_bitmap_of(8, 1, 2, 3, 100, 1000, 10000, 1000000, 20000000);
+    assert_bitmap_validate(r1);
     assert_non_null(r1);
     roaring_bitmap_printf(r1);
     roaring_bitmap_free(r1);
@@ -1029,6 +1214,7 @@ DEFINE_TEST(test_printf_withrun) {
     for (int i = 100, top_val = 200; i < top_val; i++)
         roaring_bitmap_add(r1, i);
     roaring_bitmap_run_optimize(r1);
+    assert_bitmap_validate(r1);
     roaring_bitmap_printf(r1);  // does it crash?
     roaring_bitmap_free(r1);
     printf("\n");
@@ -1126,7 +1312,7 @@ DEFINE_TEST(test_portable_serialize) {
         roaring_bitmap_add(r1, 3 * i);
 
     uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
-    char *serialized = (char*)malloc(expectedsize);
+    char *serialized = (char *)malloc(expectedsize);
     serialize_len = roaring_bitmap_portable_serialize(r1, serialized);
     assert_int_equal(serialize_len, expectedsize);
     assert_int_equal(serialize_len, expectedsize);
@@ -1152,7 +1338,7 @@ DEFINE_TEST(test_portable_serialize) {
     r1 = roaring_bitmap_of(6, 2946000, 2997491, 10478289, 10490227, 10502444,
                            19866827);
     expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
-    serialized = (char*)malloc(expectedsize);
+    serialized = (char *)malloc(expectedsize);
     serialize_len = roaring_bitmap_portable_serialize(r1, serialized);
     assert_int_equal(serialize_len, expectedsize);
     assert_int_equal(serialize_len, expectedsize);
@@ -1185,7 +1371,7 @@ DEFINE_TEST(test_portable_serialize) {
 
     roaring_bitmap_run_optimize(r1);
     expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
-    serialized = (char*)malloc(expectedsize);
+    serialized = (char *)malloc(expectedsize);
     serialize_len = roaring_bitmap_portable_serialize(r1, serialized);
     assert_int_equal(serialize_len, expectedsize);
 
@@ -1200,8 +1386,8 @@ DEFINE_TEST(test_portable_serialize) {
     arr2 = (uint32_t *)malloc(card2 * sizeof(uint32_t));
     roaring_bitmap_to_uint32_array(r2, arr2);
 
-    assert(array_equals(arr1, card1, arr2, card2));
-    assert(roaring_bitmap_equals(r1, r2));
+    assert_true(array_equals(arr1, card1, arr2, card2));
+    assert_true(roaring_bitmap_equals(r1, r2));
     free(arr1);
     free(arr2);
     free(serialized);
@@ -1221,7 +1407,7 @@ DEFINE_TEST(test_serialize) {
     /* Add some values to the bitmap */
     for (int i = 0, top_val = 384000; i < top_val; i++)
         roaring_bitmap_add(r1, 3 * i);
-    serialized = (char*)malloc(roaring_bitmap_size_in_bytes(r1));
+    serialized = (char *)malloc(roaring_bitmap_size_in_bytes(r1));
     serialize_len = roaring_bitmap_serialize(r1, serialized);
     assert_int_equal(serialize_len, roaring_bitmap_size_in_bytes(r1));
     r2 = roaring_bitmap_deserialize(serialized);
@@ -1252,12 +1438,30 @@ DEFINE_TEST(test_serialize) {
     ra_append(&r1->high_low_container, 0, run, RUN_CONTAINER_TYPE);
 
     serialize_len = roaring_bitmap_size_in_bytes(r1);
-    serialized = (char*)malloc(serialize_len);
+    serialized = (char *)malloc(serialize_len);
     assert_int_equal((int32_t)serialize_len,
                      roaring_bitmap_serialize(r1, serialized));
     r2 = roaring_bitmap_deserialize(serialized);
     assert_true(roaring_bitmap_equals(r1, r2));
 
+    // Check that roaring_bitmap_deserialize_safe fails on invalid length
+
+    assert_null(roaring_bitmap_deserialize_safe(serialized, 0));
+    assert_null(roaring_bitmap_deserialize_safe(serialized, serialize_len - 1));
+
+    // Check that roaring_bitmap_deserialize_safe succeeds with valid length
+
+    roaring_bitmap_t *t_safe =
+        roaring_bitmap_deserialize_safe(serialized, serialize_len);
+    assert_true(roaring_bitmap_equals(r1, t_safe));
+    roaring_bitmap_free(t_safe);
+
+    // Check that roaring_bitmap_deserialize_safe succeeds with larger length
+
+    t_safe = roaring_bitmap_deserialize_safe(serialized, serialize_len + 10);
+    assert_true(roaring_bitmap_equals(r1, t_safe));
+    roaring_bitmap_free(t_safe);
+
     free(serialized);
     roaring_bitmap_free(r1);
     roaring_bitmap_free(r2);
@@ -1265,7 +1469,7 @@ DEFINE_TEST(test_serialize) {
     r1 = roaring_bitmap_of(6, 2946000, 2997491, 10478289, 10490227, 10502444,
                            19866827);
 
-    serialized = (char*)malloc(roaring_bitmap_size_in_bytes(r1));
+    serialized = (char *)malloc(roaring_bitmap_size_in_bytes(r1));
     serialize_len = roaring_bitmap_serialize(r1, serialized);
     assert_int_equal(serialize_len, roaring_bitmap_size_in_bytes(r1));
     r2 = roaring_bitmap_deserialize(serialized);
@@ -1294,7 +1498,7 @@ DEFINE_TEST(test_serialize) {
         roaring_bitmap_add(r1, k);
     }
     roaring_bitmap_run_optimize(r1);
-    serialized = (char*)malloc(roaring_bitmap_size_in_bytes(r1));
+    serialized = (char *)malloc(roaring_bitmap_size_in_bytes(r1));
     serialize_len = roaring_bitmap_serialize(r1, serialized);
     assert_int_equal(serialize_len, roaring_bitmap_size_in_bytes(r1));
     r2 = roaring_bitmap_deserialize(serialized);
@@ -1311,6 +1515,7 @@ DEFINE_TEST(test_serialize) {
 
     assert_true(array_equals(arr1, card1, arr2, card2));
     assert_true(roaring_bitmap_equals(r1, r2));
+
     free(arr1);
     free(arr2);
     free(serialized);
@@ -1320,10 +1525,18 @@ DEFINE_TEST(test_serialize) {
     /* ******* */
     roaring_bitmap_t *old_bm = roaring_bitmap_create();
     for (unsigned i = 0; i < 102; i++) roaring_bitmap_add(old_bm, i);
-    char *buff = (char*)malloc(roaring_bitmap_size_in_bytes(old_bm));
+    char *buff = (char *)malloc(roaring_bitmap_size_in_bytes(old_bm));
     uint32_t size = roaring_bitmap_serialize(old_bm, buff);
     assert_int_equal(size, roaring_bitmap_size_in_bytes(old_bm));
     roaring_bitmap_t *new_bm = roaring_bitmap_deserialize(buff);
+
+    // Check that roaring_bitmap_deserialize_safe fails on invalid length
+    assert_null(roaring_bitmap_deserialize_safe(buff, size - 1));
+    // Check that roaring_bitmap_deserialize_safe succeeds with valid length
+    t_safe = roaring_bitmap_deserialize_safe(buff, size);
+    assert_true(roaring_bitmap_equals(new_bm, t_safe));
+    roaring_bitmap_free(t_safe);
+
     free(buff);
     assert_true((unsigned int)roaring_bitmap_get_cardinality(old_bm) ==
                 (unsigned int)roaring_bitmap_get_cardinality(new_bm));
@@ -1393,42 +1606,47 @@ DEFINE_TEST(test_contains) {
 }
 
 DEFINE_TEST(test_contains_range) {
-    uint32_t* values = (uint32_t*)malloc(100000 * sizeof(uint32_t));
+    uint32_t *values = (uint32_t *)malloc(100000 * sizeof(uint32_t));
     assert_non_null(values);
     for (uint32_t length_range = 1; length_range <= 64; ++length_range) {
-      roaring_bitmap_t *r1 = roaring_bitmap_create();
-      assert_non_null(r1);
-      for (uint32_t i = 0; i < 100000; ++i){
+        roaring_bitmap_t *r1 = roaring_bitmap_create();
+        assert_non_null(r1);
+        for (uint32_t i = 0; i < 100000; ++i) {
             const uint32_t val = rand() % 200000;
             roaring_bitmap_add(r1, val);
             values[i] = val;
-      }
-      for (uint64_t i = 0; i < 100000; ++i){
-            if (roaring_bitmap_contains_range(r1, values[i], values[i] + length_range)){
-                for (uint32_t j = values[i]; j < values[i] + length_range; ++j) assert_true(roaring_bitmap_contains(r1, j));
-            }
-            else {
+        }
+        for (uint64_t i = 0; i < 100000; ++i) {
+            if (roaring_bitmap_contains_range(r1, values[i],
+                                              values[i] + length_range)) {
+                for (uint32_t j = values[i]; j < values[i] + length_range; ++j)
+                    assert_true(roaring_bitmap_contains(r1, j));
+            } else {
                 uint32_t count = 0;
-                for (uint32_t j = values[i]; j < values[i] + length_range; ++j){
-                    if (roaring_bitmap_contains(r1, j)) ++count;
-                    else break;
+                for (uint32_t j = values[i]; j < values[i] + length_range;
+                     ++j) {
+                    if (roaring_bitmap_contains(r1, j))
+                        ++count;
+                    else
+                        break;
                 }
                 assert_true(count != length_range);
             }
         }
-      roaring_bitmap_free(r1);
+        roaring_bitmap_free(r1);
     }
     free(values);
     for (uint32_t length_range = 1; length_range <= 64; ++length_range) {
         roaring_bitmap_t *r1 = roaring_bitmap_create();
         assert_non_null(r1);
         const uint32_t length_range_twice = length_range * 2;
-        for (uint32_t i = 0; i < 130000; i += length_range){
-            if (i % length_range_twice == 0){
-                for (uint32_t j = i; j < i + length_range; ++j) roaring_bitmap_add(r1, j);
+        for (uint32_t i = 0; i < 130000; i += length_range) {
+            if (i % length_range_twice == 0) {
+                for (uint32_t j = i; j < i + length_range; ++j)
+                    roaring_bitmap_add(r1, j);
             }
         }
-        for (uint32_t i = 0; i < 130000; i += length_range){
+        for (uint32_t i = 0; i < 130000; i += length_range) {
             bool pres = roaring_bitmap_contains_range(r1, i, i + length_range);
             assert_true(((i % length_range_twice == 0) ? pres : !pres));
         }
@@ -1436,6 +1654,14 @@ DEFINE_TEST(test_contains_range) {
     }
 }
 
+DEFINE_TEST(test_contains_range_PyRoaringBitMap_issue81) {
+    roaring_bitmap_t *r = roaring_bitmap_create();
+    roaring_bitmap_add_range(r, 1, 1900544);  // presumably half-open
+    assert_true(roaring_bitmap_contains_range(r, 1, 1900544));  // same span
+    assert_false(roaring_bitmap_contains_range(r, 1900543, 1900545));
+    roaring_bitmap_free(r);
+}
+
 DEFINE_TEST(test_intersection_array_x_array) {
     roaring_bitmap_t *r1 = roaring_bitmap_create();
     assert_non_null(r1);
@@ -1466,9 +1692,9 @@ DEFINE_TEST(test_intersection_array_x_array) {
 
 DEFINE_TEST(test_intersection_array_x_array_inplace) {
     roaring_bitmap_t *r1 = roaring_bitmap_create();
-    assert(r1);
+    assert_true(r1);
     roaring_bitmap_t *r2 = roaring_bitmap_create();
-    assert(r2);
+    assert_true(r2);
 
     for (uint32_t i = 0; i < 100; ++i) {
         roaring_bitmap_add(r1, 2 * i);
@@ -1489,9 +1715,9 @@ DEFINE_TEST(test_intersection_array_x_array_inplace) {
 
 DEFINE_TEST(test_intersection_bitset_x_bitset) {
     roaring_bitmap_t *r1 = roaring_bitmap_create();
-    assert(r1);
+    assert_true(r1);
     roaring_bitmap_t *r2 = roaring_bitmap_create();
-    assert(r2);
+    assert_true(r2);
 
     for (uint32_t i = 0; i < 20000; ++i) {
         roaring_bitmap_add(r1, 2 * i);
@@ -1521,9 +1747,9 @@ DEFINE_TEST(test_intersection_bitset_x_bitset) {
 
 DEFINE_TEST(test_intersection_bitset_x_bitset_inplace) {
     roaring_bitmap_t *r1 = roaring_bitmap_create();
-    assert(r1);
+    assert_true(r1);
     roaring_bitmap_t *r2 = roaring_bitmap_create();
-    assert(r2);
+    assert_true(r2);
 
     for (uint32_t i = 0; i < 20000; ++i) {
         roaring_bitmap_add(r1, 2 * i);
@@ -1547,10 +1773,10 @@ DEFINE_TEST(test_intersection_bitset_x_bitset_inplace) {
 void test_union(bool copy_on_write) {
     roaring_bitmap_t *r1 = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(r1, copy_on_write);
-    assert(r1);
+    assert_true(r1);
     roaring_bitmap_t *r2 = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(r2, copy_on_write);
-    assert(r2);
+    assert_true(r2);
 
     for (uint32_t i = 0; i < 100; ++i) {
         roaring_bitmap_add(r1, 2 * i);
@@ -1587,8 +1813,8 @@ static roaring_bitmap_t *gen_bitmap(double start_density,
     for (int i = 0; i < universe_size; i += run_length) {
         d = start_density + i * density_gradient;
         double r = our_rand() / (double)OUR_RAND_MAX;
-        assert(r <= 1.0);
-        assert(r >= 0);
+        assert_true(r <= 1.0);
+        assert_true(r >= 0);
         if (r < d && !(i >= blank_range_start && i < blank_range_end))
             for (int j = 0; j < run_length; ++j) roaring_bitmap_add(ans, i + j);
     }
@@ -2388,7 +2614,7 @@ static roaring_bitmap_t *make_roaring_from_array(uint32_t *a, int len) {
 
 DEFINE_TEST(test_conversion_to_int_array) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // a dense bitmap container  (best done with runs)
     for (uint32_t i = 0; i < 50000; ++i) {
@@ -2425,7 +2651,7 @@ DEFINE_TEST(test_conversion_to_int_array) {
 DEFINE_TEST(test_conversion_to_int_array_with_runoptimize) {
     roaring_bitmap_t *r1 = roaring_bitmap_create();
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // a dense bitmap container  (best done with runs)
     for (uint32_t i = 0; i < 50000; ++i) {
@@ -2463,7 +2689,7 @@ DEFINE_TEST(test_conversion_to_int_array_with_runoptimize) {
 
 DEFINE_TEST(test_array_to_run) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // array container  (best done with runs)
     for (uint32_t i = 0; i < 500; ++i) {
@@ -2488,7 +2714,7 @@ DEFINE_TEST(test_array_to_run) {
 DEFINE_TEST(test_array_to_self) {
     int ans_ctr = 0;
 
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // array container  (best not done with runs)
     for (uint32_t i = 0; i < 500; i += 2) {
@@ -2512,7 +2738,7 @@ DEFINE_TEST(test_array_to_self) {
 
 DEFINE_TEST(test_bitset_to_self) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // bitset container  (best not done with runs)
     for (uint32_t i = 0; i < 50000; i += 2) {
@@ -2536,7 +2762,7 @@ DEFINE_TEST(test_bitset_to_self) {
 
 DEFINE_TEST(test_bitset_to_run) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // bitset container  (best done with runs)
     for (uint32_t i = 0; i < 50000; i++) {
@@ -2546,7 +2772,7 @@ DEFINE_TEST(test_bitset_to_run) {
     }
 
     roaring_bitmap_t *r1 = make_roaring_from_array(ans, ans_ctr);
-    assert(roaring_bitmap_run_optimize(r1));
+    assert_true(roaring_bitmap_run_optimize(r1));
 
     uint64_t card = roaring_bitmap_get_cardinality(r1);
     uint32_t *arr = (uint32_t *)malloc(card * sizeof(uint32_t));
@@ -2562,7 +2788,7 @@ DEFINE_TEST(test_bitset_to_run) {
 
 DEFINE_TEST(test_run_to_self) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // bitset container  (best done with runs)
     for (uint32_t i = 0; i < 50000; i++) {
@@ -2588,7 +2814,7 @@ DEFINE_TEST(test_run_to_self) {
 
 DEFINE_TEST(test_remove_run_to_bitset) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // bitset container  (best done with runs)
     for (uint32_t i = 0; i < 50000; i++) {
@@ -2615,7 +2841,7 @@ DEFINE_TEST(test_remove_run_to_bitset) {
 
 DEFINE_TEST(test_remove_run_to_array) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // array  (best done with runs)
     for (uint32_t i = 0; i < 500; i++) {
@@ -2640,10 +2866,9 @@ DEFINE_TEST(test_remove_run_to_array) {
     free(ans);
 }
 
-
 DEFINE_TEST(test_remove_run_to_bitset_cow) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // bitset container  (best done with runs)
     for (uint32_t i = 0; i < 50000; i++) {
@@ -2674,7 +2899,7 @@ DEFINE_TEST(test_remove_run_to_bitset_cow) {
 
 DEFINE_TEST(test_remove_run_to_array_cow) {
     int ans_ctr = 0;
-    uint32_t *ans = (uint32_t*)calloc(100000, sizeof(int32_t));
+    uint32_t *ans = (uint32_t *)calloc(100000, sizeof(int32_t));
 
     // array  (best done with runs)
     for (uint32_t i = 0; i < 500; i++) {
@@ -2847,7 +3072,7 @@ void test_negation_helper(bool runopt, uint32_t gap) {
         assert_true(hasrun);
     }
 
-    int orig_card = (int) roaring_bitmap_get_cardinality(r1);
+    int orig_card = (int)roaring_bitmap_get_cardinality(r1);
 
     // get the first batch of ones but not the second
     roaring_bitmap_t *notted_r1 = roaring_bitmap_flip(r1, 0U, 100000U);
@@ -3051,7 +3276,7 @@ void test_inplace_negation_helper(bool runopt, uint32_t gap) {
         assert_true(hasrun);
     }
 
-    int orig_card = (int) roaring_bitmap_get_cardinality(r1);
+    int orig_card = (int)roaring_bitmap_get_cardinality(r1);
     roaring_bitmap_t *r1_orig = roaring_bitmap_copy(r1);
 
     // get the first batch of ones but not the second
@@ -3118,8 +3343,8 @@ DEFINE_TEST(test_rand_flips) {
     const int min_runs = 1;
     const int flip_trials = 5;  // these are expensive tests
     const int range = 2000000;
-    char *input = (char*)malloc(range);
-    char *output = (char*)malloc(range);
+    char *input = (char *)malloc(range);
+    char *output = (char *)malloc(range);
 
     for (int card = 2; card < 1000000; card *= 8) {
         printf("test_rand_flips with attempted card %d", card);
@@ -3132,8 +3357,8 @@ DEFINE_TEST(test_rand_flips) {
             double f3 = our_rand() / (double)OUR_RAND_MAX;
             int pos = (int)(f1 * f2 * f3 *
                             range);  // denser at the start, sparser at end
-            assert(pos < range);
-            assert(pos >= 0);
+            assert_true(pos < range);
+            assert_true(pos >= 0);
             roaring_bitmap_add(r, pos);
             input[pos] = 1;
         }
@@ -3176,8 +3401,8 @@ DEFINE_TEST(test_inplace_rand_flips) {
     const int min_runs = 1;
     const int flip_trials = 5;  // these are expensive tests
     const int range = 2000000;
-    char *input = (char*)malloc(range);
-    char *output = (char*)malloc(range);
+    char *input = (char *)malloc(range);
+    char *output = (char *)malloc(range);
 
     for (int card = 2; card < 1000000; card *= 8) {
         roaring_bitmap_t *r = roaring_bitmap_create();
@@ -3188,8 +3413,8 @@ DEFINE_TEST(test_inplace_rand_flips) {
             double f3 = our_rand() / (double)OUR_RAND_MAX;
             int pos = (int)(f1 * f2 * f3 *
                             range);  // denser at the start, sparser at end
-            assert(pos < range);
-            assert(pos >= 0);
+            assert_true(pos < range);
+            assert_true(pos >= 0);
             roaring_bitmap_add(r, pos);
             input[pos] = 1;
         }
@@ -3263,19 +3488,19 @@ DEFINE_TEST(select_test) {
     srand(1234);
     const int min_runs = 1;
     const uint32_t range = 2000000;
-    char *input = (char*)malloc(range);
+    char *input = (char *)malloc(range);
 
     for (int card = 2; card < 1000000; card *= 8) {
-
         roaring_bitmap_t *r = roaring_bitmap_create();
         memset(input, 0, range);
         for (int i = 0; i < card; ++i) {
             double f1 = our_rand() / (double)OUR_RAND_MAX;
             double f2 = our_rand() / (double)OUR_RAND_MAX;
             double f3 = our_rand() / (double)OUR_RAND_MAX;
-            uint32_t pos = (uint32_t)(f1 * f2 * f3 *
-                            range);  // denser at the start, sparser at end
-            assert(pos < range);
+            uint32_t pos =
+                (uint32_t)(f1 * f2 * f3 *
+                           range);  // denser at the start, sparser at end
+            assert_true(pos < range);
             roaring_bitmap_add(r, pos);
             input[pos] = 1;
         }
@@ -3355,6 +3580,15 @@ static uint64_t rank(uint32_t *arr, size_t length, uint32_t x) {
     return sum;
 }
 
+static int64_t get_index(uint32_t *arr, size_t length, uint32_t x) {
+    for (size_t i = 0; i < length; ++i) {  // linear scan over arr[0..length)
+        if (arr[i] == x) {
+            return i;  // index of the first element equal to x
+        }
+    }
+    return -1;  // x does not occur in arr
+}
+
 DEFINE_TEST(test_rank) {
     for (uint32_t mymin = 123; mymin < 1000000; mymin *= 2) {
         // just arrays
@@ -3364,7 +3598,7 @@ DEFINE_TEST(test_rank) {
             roaring_bitmap_add(r, x);
         }
         uint64_t card = roaring_bitmap_get_cardinality(r);
-        uint32_t *ans = (uint32_t*)malloc(card * sizeof(uint32_t));
+        uint32_t *ans = (uint32_t *)malloc(card * sizeof(uint32_t));
         roaring_bitmap_to_uint32_array(r, ans);
         for (uint32_t z = 0; z < 1000 + mymin + 10; z += 10) {
             uint64_t truerank = rank(ans, card, z);
@@ -3380,7 +3614,7 @@ DEFINE_TEST(test_rank) {
             roaring_bitmap_add(r, x);
         }
         card = roaring_bitmap_get_cardinality(r);
-        ans = (uint32_t*)malloc(card * sizeof(uint32_t));
+        ans = (uint32_t *)malloc(card * sizeof(uint32_t));
         roaring_bitmap_to_uint32_array(r, ans);
         for (uint32_t z = 0; z < 64000 + mymin + 10; z += 10) {
             uint64_t truerank = rank(ans, card, z);
@@ -3397,7 +3631,7 @@ DEFINE_TEST(test_rank) {
         }
         roaring_bitmap_run_optimize(r);
         card = roaring_bitmap_get_cardinality(r);
-        ans = (uint32_t*)malloc(card * sizeof(uint32_t));
+        ans = (uint32_t *)malloc(card * sizeof(uint32_t));
         roaring_bitmap_to_uint32_array(r, ans);
         for (uint32_t z = 0; z < 64000 + mymin + 10; z += 10) {
             uint64_t truerank = rank(ans, card, z);
@@ -3412,6 +3646,63 @@ DEFINE_TEST(test_rank) {
     }
 }
 
+DEFINE_TEST(test_get_index) {
+    for (uint32_t mymin = 123; mymin < 1000000; mymin *= 2) {
+        // sparse: values 100 apart (expected to be stored as arrays)
+        roaring_bitmap_t *r = roaring_bitmap_create();
+        uint32_t x = mymin;
+        for (; x < 1000 + mymin; x += 100) {
+            roaring_bitmap_add(r, x);
+        }
+        uint64_t card = roaring_bitmap_get_cardinality(r);
+        uint32_t *ans = (uint32_t *)malloc(card * sizeof(uint32_t));
+        roaring_bitmap_to_uint32_array(r, ans);
+        for (uint32_t z = 0; z < 1000 + mymin + 10; z += 10) {
+            int64_t trueidx = get_index(ans, card, z);  // reference answer
+            int64_t computedidx = roaring_bitmap_get_index(r, z);
+            if (trueidx != computedidx)  // print the mismatch before failing
+                printf("%d != %d \n", (int)trueidx, (int)computedidx);
+            assert_true(trueidx == computedidx);
+        }
+        free(ans);
+        // denser: add every 2nd value to the same r (expected: bitmaps)
+        x = mymin;
+        for (; x < 64000 + mymin; x += 2) {
+            roaring_bitmap_add(r, x);
+        }
+        card = roaring_bitmap_get_cardinality(r);
+        ans = (uint32_t *)malloc(card * sizeof(uint32_t));
+        roaring_bitmap_to_uint32_array(r, ans);
+        for (uint32_t z = 0; z < 64000 + mymin + 10; z += 10) {
+            int64_t trueidx = get_index(ans, card, z);
+            int64_t computedidx = roaring_bitmap_get_index(r, z);
+            if (trueidx != computedidx)
+                printf("%d != %d \n", (int)trueidx, (int)computedidx);
+            assert_true(trueidx == computedidx);
+        }
+        free(ans);
+        // contiguous: fill every gap, then run_optimize (expected: runs)
+        x = mymin;
+        for (; x < 64000 + mymin; x++) {
+            roaring_bitmap_add(r, x);
+        }
+        roaring_bitmap_run_optimize(r);
+        card = roaring_bitmap_get_cardinality(r);
+        ans = (uint32_t *)malloc(card * sizeof(uint32_t));
+        roaring_bitmap_to_uint32_array(r, ans);
+        for (uint32_t z = 0; z < 64000 + mymin + 10; z += 10) {
+            int64_t trueidx = get_index(ans, card, z);
+            int64_t computedidx = roaring_bitmap_get_index(r, z);
+            if (trueidx != computedidx)
+                printf("%d != %d \n", (int)trueidx, (int)computedidx);
+            assert_true(trueidx == computedidx);
+        }
+        free(ans);
+
+        roaring_bitmap_free(r);
+    }
+}
+
 // Return a random value which does not belong to the roaring bitmap.
 // Value will be lower than upper_bound.
 uint32_t choose_missing_value(roaring_bitmap_t *rb, uint32_t upper_bound) {
@@ -3429,7 +3720,6 @@ DEFINE_TEST(test_intersect_small_run_bitset) {
     roaring_bitmap_free(rb2);
 }
 
-
 DEFINE_TEST(issue316) {
     roaring_bitmap_t *rb1 = roaring_bitmap_create();
     roaring_bitmap_set_copy_on_write(rb1, true);
@@ -3493,9 +3783,9 @@ DEFINE_TEST(test_subset) {
 }
 
 DEFINE_TEST(test_or_many_memory_leak) {
-    for(int i=0; i<10; i++) {
+    for (int i = 0; i < 10; i++) {
         roaring_bitmap_t *bm1 = roaring_bitmap_create();
-        for(int j=0; j<10; j++) {
+        for (int j = 0; j < 10; j++) {
             roaring_bitmap_t *bm2 = roaring_bitmap_create();
             const roaring_bitmap_t *buff[] = {bm1, bm2};
             roaring_bitmap_t *bm3 = roaring_bitmap_or_many(2, buff);
@@ -3507,102 +3797,104 @@ DEFINE_TEST(test_or_many_memory_leak) {
 }
 
 void test_iterator_generate_data(uint32_t **values_out, uint32_t *count_out) {
-    const size_t capacity = 1000*1000;
-    uint32_t* values =
-             (uint32_t*)malloc(sizeof(uint32_t) * capacity);  // ascending order
+    const size_t capacity = 1000 * 1000;
+    uint32_t *values =
+        (uint32_t *)malloc(sizeof(uint32_t) * capacity);  // ascending order
     uint32_t count = 0;
-    uint32_t base = 1234; // container index
+    uint32_t base = 1234;  // container index
 
     // min allowed value
     values[count++] = 0;
 
     // only the very first value in container is set
-    values[count++] = base*65536;
+    values[count++] = base * 65536;
     base += 2;
 
     // only the very last value in container is set
-    values[count++] = base*65536 + 65535;
+    values[count++] = base * 65536 + 65535;
     base += 2;
 
     // fully filled container
     for (uint32_t i = 0; i < 65536; i++) {
-        values[count++] = base*65536 + i;
+        values[count++] = base * 65536 + i;
     }
     base += 2;
 
     // even values
     for (uint32_t i = 0; i < 65536; i += 2) {
-        values[count++] = base*65536 + i;
+        values[count++] = base * 65536 + i;
     }
     base += 2;
 
     // odd values
     for (uint32_t i = 1; i < 65536; i += 2) {
-        values[count++] = base*65536 + i;
+        values[count++] = base * 65536 + i;
     }
     base += 2;
 
     // each next 64-bit word is ROR'd by one
     for (uint32_t i = 0; i < 65536; i += 65) {
-        values[count++] = base*65536 + i;
+        values[count++] = base * 65536 + i;
     }
     base += 2;
 
     // runs of increasing length: 0, 1,0, 1,1,0, 1,1,1,0, ...
     for (uint32_t i = 0, run_index = 0; i < 65536; i++) {
-      if (i != (run_index+1)*(run_index+2)/2-1) {
-        values[count++] = base*65536 + i;
-      } else {
-        run_index++;
-      }
+        if (i != (run_index + 1) * (run_index + 2) / 2 - 1) {
+            values[count++] = base * 65536 + i;
+        } else {
+            run_index++;
+        }
     }
     base += 2;
 
     // 00000XX, XXXXXX, XX0000
-    for (uint32_t i = 65536-100; i < 65536; i++) {
-        values[count++] = base*65536 + i;
+    for (uint32_t i = 65536 - 100; i < 65536; i++) {
+        values[count++] = base * 65536 + i;
     }
     base += 1;
     for (uint32_t i = 0; i < 65536; i++) {
-        values[count++] = base*65536 + i;
+        values[count++] = base * 65536 + i;
     }
     base += 1;
     for (uint32_t i = 0; i < 100; i++) {
-        values[count++] = base*65536 + i;
+        values[count++] = base * 65536 + i;
     }
     base += 2;
 
     // random
-    for (int i = 0; i < 65536; i += our_rand()%10+1) {
-        values[count++] = base*65536 + i;
+    for (int i = 0; i < 65536; i += our_rand() % 10 + 1) {
+        values[count++] = base * 65536 + i;
     }
     base += 2;
 
     // max allowed value
     values[count++] = UINT32_MAX;
 
-    assert(count <= capacity);
+    assert_true(count <= capacity);
     *values_out = values;
     *count_out = count;
 }
 
 /*
  * Read bitmap in steps of given size, compare with reference values.
- * If step is UINT32_MAX (special value), then read single non-empty container at a time.
+ * If step is UINT32_MAX (special value), then read single non-empty container
+ * at a time.
  */
-void read_compare(roaring_bitmap_t* r, const uint32_t* ref_values, uint32_t ref_count, uint32_t step) {
+void read_compare(roaring_bitmap_t *r, const uint32_t *ref_values,
+                  uint32_t ref_count, uint32_t step) {
     roaring_uint32_iterator_t *iter = roaring_create_iterator(r);
-    uint32_t* buffer = (uint32_t*)malloc(
-            sizeof(uint32_t) * (step == UINT32_MAX ? 65536 : step));
+    uint32_t *buffer = (uint32_t *)malloc(sizeof(uint32_t) *
+                                          (step == UINT32_MAX ? 65536 : step));
     while (ref_count > 0) {
-        assert(iter->has_value == true);
-        assert(iter->current_value == ref_values[0]);
+        assert_true(iter->has_value == true);
+        assert_true(iter->current_value == ref_values[0]);
 
         uint32_t num_ask = step;
         if (step == UINT32_MAX) {
             num_ask = 0;
             for (uint32_t i = 0; i < ref_count; i++) {
-                if ((ref_values[i]>>16) == (ref_values[0]>>16)) {
+                if ((ref_values[i] >> 16) == (ref_values[0] >> 16)) {
                     num_ask++;
                 } else {
                     break;
@@ -3611,27 +3903,27 @@ void read_compare(roaring_bitmap_t* r, const uint32_t* ref_values, uint32_t ref_
         }
 
         uint32_t num_got = roaring_read_uint32_iterator(iter, buffer, num_ask);
-        assert(num_got == minimum_uint32(num_ask, ref_count));
+        assert_true(num_got == minimum_uint32(num_ask, ref_count));
         for (uint32_t i = 0; i < num_got; i++) {
-            assert(ref_values[i] == buffer[i]);
+            assert_true(ref_values[i] == buffer[i]);
         }
         ref_values += num_got;
         ref_count -= num_got;
     }
 
-    assert(iter->has_value == false);
-    assert(iter->current_value == UINT32_MAX);
+    assert_true(iter->has_value == false);
+    assert_true(iter->current_value == UINT32_MAX);
 
-    assert(roaring_read_uint32_iterator(iter, buffer, step) == 0);
-    assert(iter->has_value == false);
-    assert(iter->current_value == UINT32_MAX);
+    assert_true(roaring_read_uint32_iterator(iter, buffer, step) == 0);
+    assert_true(iter->has_value == false);
+    assert_true(iter->current_value == UINT32_MAX);
 
     free(buffer);
     roaring_free_uint32_iterator(iter);
 }
 
 void test_read_uint32_iterator(uint8_t type) {
-    uint32_t* ref_values;
+    uint32_t *ref_values;
     uint32_t ref_count;
     test_iterator_generate_data(&ref_values, &ref_count);
 
@@ -3646,9 +3938,9 @@ void test_read_uint32_iterator(uint8_t type) {
     read_compare(r, ref_values, ref_count, 1);
     read_compare(r, ref_values, ref_count, 2);
     read_compare(r, ref_values, ref_count, 7);
-    read_compare(r, ref_values, ref_count, ref_count-1);
+    read_compare(r, ref_values, ref_count, ref_count - 1);
     read_compare(r, ref_values, ref_count, ref_count);
-    read_compare(r, ref_values, ref_count, UINT32_MAX); // special value
+    read_compare(r, ref_values, ref_count, UINT32_MAX);  // special value
 
     roaring_bitmap_free(r);
     free(ref_values);
@@ -3664,11 +3956,11 @@ DEFINE_TEST(test_read_uint32_iterator_run) {
     test_read_uint32_iterator(RUN_CONTAINER_TYPE);
 }
 DEFINE_TEST(test_read_uint32_iterator_native) {
-    test_read_uint32_iterator(UINT8_MAX); // special value
+    test_read_uint32_iterator(UINT8_MAX);  // special value
 }
 
 void test_previous_iterator(uint8_t type) {
-    uint32_t* ref_values;
+    uint32_t *ref_values;
     uint32_t ref_count;
     test_iterator_generate_data(&ref_values, &ref_count);
 
@@ -3685,13 +3977,13 @@ void test_previous_iterator(uint8_t type) {
     uint32_t count = 0;
 
     do {
-        assert(iterator.has_value);
+        assert_true(iterator.has_value);
         ++count;
-        assert((int64_t)ref_count - (int64_t)count >= 0); // sanity check
-        assert(ref_values[ref_count - count] == iterator.current_value);
+        assert_true((int64_t)ref_count - (int64_t)count >= 0);  // sanity check
+        assert_true(ref_values[ref_count - count] == iterator.current_value);
     } while (roaring_previous_uint32_iterator(&iterator));
 
-    assert(ref_count == count);
+    assert_true(ref_count == count);
 
     roaring_bitmap_free(r);
     free(ref_values);
@@ -3710,17 +4002,17 @@ DEFINE_TEST(test_previous_iterator_run) {
 }
 
 DEFINE_TEST(test_previous_iterator_native) {
-    test_previous_iterator(UINT8_MAX); // special value
+    test_previous_iterator(UINT8_MAX);  // special value
 }
 
-void test_iterator_reuse_retry_count(int retry_count){
-    uint32_t* ref_values;
+void test_iterator_reuse_retry_count(int retry_count) {
+    uint32_t *ref_values;
     uint32_t ref_count;
     test_iterator_generate_data(&ref_values, &ref_count);
 
-    roaring_bitmap_t* with_edges = roaring_bitmap_create();
+    roaring_bitmap_t *with_edges = roaring_bitmap_create();
     // We don't want min and max values inside this bitmap
-    roaring_bitmap_t* without_edges = roaring_bitmap_create();
+    roaring_bitmap_t *without_edges = roaring_bitmap_create();
 
     for (uint32_t i = 0; i < ref_count; i++) {
         roaring_bitmap_add(with_edges, ref_values[i]);
@@ -3730,19 +4022,20 @@ void test_iterator_reuse_retry_count(int retry_count){
     }
 
     // sanity checks
-    assert(roaring_bitmap_contains(with_edges, 0));
-    assert(roaring_bitmap_contains(with_edges, UINT32_MAX));
-    assert(!roaring_bitmap_contains(without_edges, 0));
-    assert(!roaring_bitmap_contains(without_edges, UINT32_MAX));
-    assert(roaring_bitmap_get_cardinality(with_edges) - 2 == roaring_bitmap_get_cardinality(without_edges));
-
-    const roaring_bitmap_t* bitmaps[] = {with_edges, without_edges};
+    assert_true(roaring_bitmap_contains(with_edges, 0));
+    assert_true(roaring_bitmap_contains(with_edges, UINT32_MAX));
+    assert_true(!roaring_bitmap_contains(without_edges, 0));
+    assert_true(!roaring_bitmap_contains(without_edges, UINT32_MAX));
+    assert_true(roaring_bitmap_get_cardinality(with_edges) - 2 ==
+                roaring_bitmap_get_cardinality(without_edges));
+
+    const roaring_bitmap_t *bitmaps[] = {with_edges, without_edges};
     int num_bitmaps = sizeof(bitmaps) / sizeof(bitmaps[0]);
 
-    for (int i = 0; i < num_bitmaps; ++i){
+    for (int i = 0; i < num_bitmaps; ++i) {
         roaring_uint32_iterator_t iterator;
         roaring_init_iterator(bitmaps[i], &iterator);
-        assert(iterator.has_value);
+        assert_true(iterator.has_value);
         uint32_t first_value = iterator.current_value;
 
         uint32_t count = 0;
@@ -3750,7 +4043,7 @@ void test_iterator_reuse_retry_count(int retry_count){
             count++;
             roaring_advance_uint32_iterator(&iterator);
         }
-        assert(count == roaring_bitmap_get_cardinality(bitmaps[i]));
+        assert_true(count == roaring_bitmap_get_cardinality(bitmaps[i]));
 
         // Test advancing the iterator more times than necessary
         for (int retry = 0; retry < retry_count; ++retry) {
@@ -3764,7 +4057,7 @@ void test_iterator_reuse_retry_count(int retry_count){
             count++;
             roaring_previous_uint32_iterator(&iterator);
         }
-        assert(count == roaring_bitmap_get_cardinality(bitmaps[i]));
+        assert_true(count == roaring_bitmap_get_cardinality(bitmaps[i]));
 
         // Test decrement the iterator more times than necessary
         for (int retry = 0; retry < retry_count; ++retry) {
@@ -3772,149 +4065,148 @@ void test_iterator_reuse_retry_count(int retry_count){
         }
 
         roaring_advance_uint32_iterator(&iterator);
-        assert(iterator.has_value);
-        assert(first_value == iterator.current_value);
+        assert_true(iterator.has_value);
+        assert_true(first_value == iterator.current_value);
     }
 
-
     roaring_bitmap_free(without_edges);
     roaring_bitmap_free(with_edges);
     free(ref_values);
 }
 
-DEFINE_TEST(test_iterator_reuse) {
-    test_iterator_reuse_retry_count(0);
-}
+DEFINE_TEST(test_iterator_reuse) { test_iterator_reuse_retry_count(0); }
 
-DEFINE_TEST(test_iterator_reuse_many) {
-    test_iterator_reuse_retry_count(10);
-}
+DEFINE_TEST(test_iterator_reuse_many) { test_iterator_reuse_retry_count(10); }
 
 DEFINE_TEST(test_add_range) {
     // autoconversion: BITSET -> BITSET -> RUN
     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_value(sbs, 100);
-      sbs_convert(sbs, BITSET_CONTAINER_TYPE);
-      sbs_add_range(sbs, 0, 299);
-      assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
-      sbs_add_range(sbs, 301, 65535);
-      assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
-      // after and only after BITSET becomes [0, 65535], it is converted to RUN
-      sbs_add_range(sbs, 300, 300);
-      assert_true(sbs_check_type(sbs, RUN_CONTAINER_TYPE));
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        sbs_add_value(sbs, 100);
+        sbs_convert(sbs, BITSET_CONTAINER_TYPE);
+        sbs_add_range(sbs, 0, 299);
+        assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
+        sbs_add_range(sbs, 301, 65535);
+        assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
+        // after and only after BITSET becomes [0, 65535], it is converted to
+        // RUN
+        sbs_add_range(sbs, 300, 300);
+        assert_true(sbs_check_type(sbs, RUN_CONTAINER_TYPE));
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // autoconversion: ARRAY -> ARRAY -> BITSET
     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_value(sbs, 100);
-      sbs_convert(sbs, ARRAY_CONTAINER_TYPE);
+        sbs_t *sbs = sbs_create();
+        sbs_add_value(sbs, 100);
+        sbs_convert(sbs, ARRAY_CONTAINER_TYPE);
 
-      // unless threshold was hit, it is still ARRAY
-      for (int i = 0; i < 100; i += 2) {
-        sbs_add_value(sbs, i);
-        assert_true(sbs_check_type(sbs, ARRAY_CONTAINER_TYPE));
-      }
+        // unless threshold was hit, it is still ARRAY
+        for (int i = 0; i < 100; i += 2) {
+            sbs_add_value(sbs, i);
+            assert_true(sbs_check_type(sbs, ARRAY_CONTAINER_TYPE));
+        }
 
-      // after threshold on number of elements was hit, it is converted to BITSET
-      for (int i = 0; i < 65535; i += 2) {
-        sbs_add_value(sbs, i);
-      }
-      assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
+        // after threshold on number of elements was hit, it is converted to
+        // BITSET
+        for (int i = 0; i < 65535; i += 2) {
+            sbs_add_value(sbs, i);
+        }
+        assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
 
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
-     // autoconversion: ARRAY -> RUN
-     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_range(sbs, 0, 100);
-      sbs_convert(sbs, ARRAY_CONTAINER_TYPE);
+    // autoconversion: ARRAY -> RUN
+    {
+        sbs_t *sbs = sbs_create();
+        sbs_add_range(sbs, 0, 100);
+        sbs_convert(sbs, ARRAY_CONTAINER_TYPE);
 
-      // after ARRAY becomes full [0, 65535], it is converted to RUN
-      sbs_add_range(sbs, 100, 65535);
-      assert_true(sbs_check_type(sbs, RUN_CONTAINER_TYPE));
+        // after ARRAY becomes full [0, 65535], it is converted to RUN
+        sbs_add_range(sbs, 100, 65535);
+        assert_true(sbs_check_type(sbs, RUN_CONTAINER_TYPE));
 
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
     // autoconversion: RUN -> RUN -> BITSET
     {
-      sbs_t* sbs = sbs_create();
-      // by default, RUN container is used
-      for (int i = 0; i < 100; i += 2) {
-        sbs_add_range(sbs, 4*i, 4*i + 1);
-        assert_true(sbs_check_type(sbs, RUN_CONTAINER_TYPE));
-      }
-      // after number of RLE runs exceeded threshold, it is converted to BITSET
-      for (int i = 0; i < 65535; i += 2) {
-        sbs_add_range(sbs, i, i);
-      }
-      assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        // by default, RUN container is used
+        for (int i = 0; i < 100; i += 2) {
+            sbs_add_range(sbs, 4 * i, 4 * i + 1);
+            assert_true(sbs_check_type(sbs, RUN_CONTAINER_TYPE));
+        }
+        // after number of RLE runs exceeded threshold, it is converted to
+        // BITSET
+        for (int i = 0; i < 65535; i += 2) {
+            sbs_add_range(sbs, i, i);
+        }
+        assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // autoconversion: ARRAY -> ARRAY -> BITSET
     {
-      sbs_t* sbs = sbs_create();
-      for (int i = 0; i < 100; i += 2) {
-        sbs_add_range(sbs, i, i);
-        assert_true(sbs_check_type(sbs, ARRAY_CONTAINER_TYPE));
-      }
-      // after number of RLE runs exceeded threshold, it is converted to BITSET
-      for (int i = 0; i < 65535; i += 2) {
-        sbs_add_range(sbs, i, i);
-      }
-      assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        for (int i = 0; i < 100; i += 2) {
+            sbs_add_range(sbs, i, i);
+            assert_true(sbs_check_type(sbs, ARRAY_CONTAINER_TYPE));
+        }
+        // after number of RLE runs exceeded threshold, it is converted to
+        // BITSET
+        for (int i = 0; i < 65535; i += 2) {
+            sbs_add_range(sbs, i, i);
+        }
+        assert_true(sbs_check_type(sbs, BITSET_CONTAINER_TYPE));
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // append new container to the end
     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_value(sbs, 5);
-      sbs_add_range(sbs, 65536+5, 65536+20);
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        sbs_add_value(sbs, 5);
+        sbs_add_range(sbs, 65536 + 5, 65536 + 20);
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // prepend new container to the beginning
     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_value(sbs, 65536*1+5);
-      sbs_add_range(sbs, 5, 20);
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        sbs_add_value(sbs, 65536 * 1 + 5);
+        sbs_add_range(sbs, 5, 20);
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // add new container between existing ones
     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_value(sbs, 65536*0+5);
-      sbs_add_value(sbs, 65536*2+5);
-      sbs_add_range(sbs, 65536*1+5, 65536*1+20);
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        sbs_add_value(sbs, 65536 * 0 + 5);
+        sbs_add_value(sbs, 65536 * 2 + 5);
+        sbs_add_range(sbs, 65536 * 1 + 5, 65536 * 1 + 20);
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // invalid range
     {
-      sbs_t* sbs = sbs_create();
-      sbs_add_range(sbs, 200, 100);
-      sbs_compare(sbs);
-      sbs_free(sbs);
+        sbs_t *sbs = sbs_create();
+        sbs_add_range(sbs, 200, 100);
+        sbs_compare(sbs);
+        sbs_free(sbs);
     }
 
     // random data inside [0..span)
-    const uint32_t span = 16*65536;
+    const uint32_t span = 16 * 65536;
     for (uint32_t range_length = 1; range_length < 16384; range_length *= 3) {
-        sbs_t* sbs = sbs_create();
+        sbs_t *sbs = sbs_create();
         for (int i = 0; i < 50; i++) {
             uint32_t value = our_rand() % span;
             sbs_add_value(sbs, value);
@@ -3941,8 +4233,8 @@ DEFINE_TEST(test_add_range) {
         roaring_bitmap_set_copy_on_write(r1, true);
         roaring_bitmap_t *r2 = roaring_bitmap_copy(r1);
         roaring_bitmap_add_range(r1, 0, 1);
-        assert(roaring_bitmap_get_cardinality(r1) == 1);
-        assert(roaring_bitmap_get_cardinality(r2) == 1);
+        assert_true(roaring_bitmap_get_cardinality(r1) == 1);
+        assert_true(roaring_bitmap_get_cardinality(r2) == 1);
         roaring_bitmap_free(r2);
         roaring_bitmap_free(r1);
     }
@@ -4020,27 +4312,32 @@ DEFINE_TEST(test_remove_range) {
     // remove containers
     {
         sbs_t *sbs = sbs_create();
-        sbs_add_value(sbs, 65536*1+100);
-        sbs_add_value(sbs, 65536*3+100);
-        sbs_add_value(sbs, 65536*5+100);
-        sbs_add_value(sbs, 65536*7+100);
-        sbs_remove_range(sbs, 65536*3+0, 65536*3+65535); // from the middle
+        sbs_add_value(sbs, 65536 * 1 + 100);
+        sbs_add_value(sbs, 65536 * 3 + 100);
+        sbs_add_value(sbs, 65536 * 5 + 100);
+        sbs_add_value(sbs, 65536 * 7 + 100);
+        sbs_remove_range(sbs, 65536 * 3 + 0,
+                         65536 * 3 + 65535);  // from the middle
         sbs_compare(sbs);
-        sbs_remove_range(sbs, 65536*1+0, 65536*1+65535); // from the beginning
+        sbs_remove_range(sbs, 65536 * 1 + 0,
+                         65536 * 1 + 65535);  // from the beginning
         sbs_compare(sbs);
-        sbs_remove_range(sbs, 65536*7+0, 65536*7+65535); // from the end
+        sbs_remove_range(sbs, 65536 * 7 + 0,
+                         65536 * 7 + 65535);  // from the end
         sbs_compare(sbs);
-        sbs_remove_range(sbs, 65536*5+0, 65536*5+65535); // the last one
+        sbs_remove_range(sbs, 65536 * 5 + 0,
+                         65536 * 5 + 65535);  // the last one
         sbs_compare(sbs);
-        sbs_remove_range(sbs, 65536*9+0, 65536*9+65535); // non-existent
+        sbs_remove_range(sbs, 65536 * 9 + 0,
+                         65536 * 9 + 65535);  // non-existent
         sbs_compare(sbs);
         sbs_free(sbs);
     }
 
     // random data inside [0..span)
-    const uint32_t span = 16*65536;
+    const uint32_t span = 16 * 65536;
     for (uint32_t range_length = 3; range_length <= 16384; range_length *= 3) {
-        sbs_t* sbs = sbs_create();
+        sbs_t *sbs = sbs_create();
         for (int i = 0; i < 50; i++) {
             uint64_t range_start = our_rand() % (span - range_length);
             sbs_add_range(sbs, range_start, range_start + range_length - 1);
@@ -4058,9 +4355,10 @@ DEFINE_TEST(test_remove_many) {
     // multiple values per container (sorted)
     {
         sbs_t *sbs = sbs_create();
-        sbs_add_range(sbs, 0, 65536*2-1);
-        uint32_t values[] = {1, 3, 5, 7, 65536+1, 65536+3, 65536+5, 65536+7};
-        sbs_remove_many(sbs, sizeof(values)/sizeof(values[0]), values);
+        sbs_add_range(sbs, 0, 65536 * 2 - 1);
+        uint32_t values[] = {1,         3,         5,         7,
+                             65536 + 1, 65536 + 3, 65536 + 5, 65536 + 7};
+        sbs_remove_many(sbs, sizeof(values) / sizeof(values[0]), values);
         sbs_compare(sbs);
         sbs_free(sbs);
     }
@@ -4068,9 +4366,10 @@ DEFINE_TEST(test_remove_many) {
     // multiple values per container (interleaved)
     {
         sbs_t *sbs = sbs_create();
-        sbs_add_range(sbs, 0, 65536*2-1);
-        uint32_t values[] = {65536+7, 65536+5, 7, 5, 1, 65536+1, 65536+3, 3};
-        sbs_remove_many(sbs, sizeof(values)/sizeof(values[0]), values);
+        sbs_add_range(sbs, 0, 65536 * 2 - 1);
+        uint32_t values[] = {65536 + 7, 65536 + 5, 7,         5,
+                             1,         65536 + 1, 65536 + 3, 3};
+        sbs_remove_many(sbs, sizeof(values) / sizeof(values[0]), values);
         sbs_compare(sbs);
         sbs_free(sbs);
     }
@@ -4079,9 +4378,9 @@ DEFINE_TEST(test_remove_many) {
     {
         sbs_t *sbs = sbs_create();
         sbs_add_value(sbs, 500);
-        uint32_t values[] = {501, 80000}; // non-existent value/container
-        sbs_remove_many(sbs, sizeof(values)/sizeof(values[0]), values);
-        sbs_remove_many(sbs, 0, NULL); // NULL ptr is not dereferenced
+        uint32_t values[] = {501, 80000};  // non-existent value/container
+        sbs_remove_many(sbs, sizeof(values) / sizeof(values[0]), values);
+        sbs_remove_many(sbs, 0, NULL);  // NULL ptr is not dereferenced
         sbs_compare(sbs);
         sbs_free(sbs);
     }
@@ -4092,51 +4391,57 @@ DEFINE_TEST(test_remove_many) {
         sbs_add_range(sbs, 0, 65535);
         for (uint32_t v = 0; v <= 65535; v++) {
             sbs_remove_many(sbs, 1, &v);
-            assert(roaring_bitmap_get_cardinality(sbs->roaring) == 65535-v);
+            assert_true(roaring_bitmap_get_cardinality(sbs->roaring) ==
+                        65535 - v);
         }
-        assert(sbs_is_empty(sbs));
+        assert_true(sbs_is_empty(sbs));
         sbs_free(sbs);
     }
-
 }
 
 DEFINE_TEST(test_range_cardinality) {
     const uint64_t s = 65536;
 
     roaring_bitmap_t *r = roaring_bitmap_create();
-    roaring_bitmap_add_range(r, s*2, s*10);
+    roaring_bitmap_add_range(r, s * 2, s * 10);
 
     // single container (minhb == maxhb)
-    assert(roaring_bitmap_range_cardinality(r, s*2, s*3) == s);
-    assert(roaring_bitmap_range_cardinality(r, s*2+100, s*3) == s-100);
-    assert(roaring_bitmap_range_cardinality(r, s*2, s*3-200) == s-200);
-    assert(roaring_bitmap_range_cardinality(r, s*2+100, s*3-200) == s-300);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2, s * 3) == s);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2 + 100, s * 3) ==
+                s - 100);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2, s * 3 - 200) ==
+                s - 200);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2 + 100, s * 3 - 200) ==
+                s - 300);
 
     // multiple containers (maxhb > minhb)
-    assert(roaring_bitmap_range_cardinality(r, s*2, s*5) == s*3);
-    assert(roaring_bitmap_range_cardinality(r, s*2+100, s*5) == s*3-100);
-    assert(roaring_bitmap_range_cardinality(r, s*2, s*5-200) == s*3-200);
-    assert(roaring_bitmap_range_cardinality(r, s*2+100, s*5-200) == s*3-300);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2, s * 5) == s * 3);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2 + 100, s * 5) ==
+                s * 3 - 100);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2, s * 5 - 200) ==
+                s * 3 - 200);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 2 + 100, s * 5 - 200) ==
+                s * 3 - 300);
 
     // boundary checks
-    assert(roaring_bitmap_range_cardinality(r, s*20, s*21) == 0);
-    assert(roaring_bitmap_range_cardinality(r, 100, 100) == 0);
-    assert(roaring_bitmap_range_cardinality(r, 0, s*7) == s*5);
-    assert(roaring_bitmap_range_cardinality(r, s*7, UINT64_MAX) == s*3);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 20, s * 21) == 0);
+    assert_true(roaring_bitmap_range_cardinality(r, 100, 100) == 0);
+    assert_true(roaring_bitmap_range_cardinality(r, 0, s * 7) == s * 5);
+    assert_true(roaring_bitmap_range_cardinality(r, s * 7, UINT64_MAX) ==
+                s * 3);
 
     roaring_bitmap_free(r);
 }
 
 void frozen_serialization_compare(roaring_bitmap_t *r1) {
     size_t num_bytes = roaring_bitmap_frozen_size_in_bytes(r1);
-    char *buf = (char*)roaring_aligned_malloc(32, num_bytes);
+    char *buf = (char *)roaring_aligned_malloc(32, num_bytes);
     roaring_bitmap_frozen_serialize(r1, buf);
 
-    const roaring_bitmap_t *r2 =
-        roaring_bitmap_frozen_view(buf, num_bytes);
+    const roaring_bitmap_t *r2 = roaring_bitmap_frozen_view(buf, num_bytes);
 
-    assert(roaring_bitmap_equals(r1, r2));
-    assert(roaring_bitmap_frozen_view(buf+1, num_bytes-1) == NULL);
+    assert_true(roaring_bitmap_equals(r1, r2));
+    assert_true(roaring_bitmap_frozen_view(buf + 1, num_bytes - 1) == NULL);
 
     roaring_bitmap_free(r1);
     roaring_bitmap_free(r2);
@@ -4153,12 +4458,12 @@ DEFINE_TEST(test_frozen_serialization) {
     roaring_bitmap_add(r, 2000);
     roaring_bitmap_add(r, 100000);
     roaring_bitmap_add(r, 200000);
-    roaring_bitmap_add_range(r, s*10 + 100, s*13 - 100);
-    for (uint64_t i = 0; i < s*3; i += 2) {
-        roaring_bitmap_add(r, s*20 + i);
+    roaring_bitmap_add_range(r, s * 10 + 100, s * 13 - 100);
+    for (uint64_t i = 0; i < s * 3; i += 2) {
+        roaring_bitmap_add(r, s * 20 + i);
     }
     roaring_bitmap_run_optimize(r);
-    //roaring_bitmap_printf_describe(r);
+    // roaring_bitmap_printf_describe(r);
     frozen_serialization_compare(r);
 }
 
@@ -4167,21 +4472,188 @@ DEFINE_TEST(test_frozen_serialization_max_containers) {
     for (int64_t i = 0; i < 65536; i++) {
         roaring_bitmap_add(r, 65536 * i);
     }
-    assert(r->high_low_container.size == 65536);
+    assert_true(r->high_low_container.size == 65536);
     frozen_serialization_compare(r);
 }
 
+DEFINE_TEST(test_portable_deserialize_frozen) {
+    // Serialize three bitmaps; each frozen view must equal its source.
+    roaring_bitmap_t *r1 =
+        roaring_bitmap_of(8, 1, 2, 3, 100, 1000, 10000, 1000000, 20000000);
+    assert_non_null(r1);
+    uint32_t serialize_len;
+    roaring_bitmap_t *r2;
+
+    for (int i = 0, top_val = 384000; i < top_val; i++)
+        roaring_bitmap_add(r1, 3 * i);
+
+    uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
+    char *serialized = (char *)malloc(expectedsize);
+    serialize_len = roaring_bitmap_portable_serialize(r1, serialized);
+    assert_int_equal(serialize_len, expectedsize);
+    r2 = roaring_bitmap_portable_deserialize_frozen(serialized);
+    assert_non_null(r2);
+
+    uint64_t card1 = roaring_bitmap_get_cardinality(r1);
+    uint32_t *arr1 = (uint32_t *)malloc(card1 * sizeof(uint32_t));
+    roaring_bitmap_to_uint32_array(r1, arr1);
+
+    uint64_t card2 = roaring_bitmap_get_cardinality(r2);
+    uint32_t *arr2 = (uint32_t *)malloc(card2 * sizeof(uint32_t));
+    roaring_bitmap_to_uint32_array(r2, arr2);
+
+    assert_true(array_equals(arr1, card1, arr2, card2));
+    assert_true(roaring_bitmap_equals(r1, r2));
+    free(arr1);
+    free(arr2);
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
+    free(serialized);  // after r2: the frozen bitmap reads this buffer
+
+    r1 = roaring_bitmap_of(6, 2946000, 2997491, 10478289, 10490227, 10502444,
+                           19866827);
+    assert_non_null(r1);
+    expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
+    serialized = (char *)malloc(expectedsize);
+    serialize_len = roaring_bitmap_portable_serialize(r1, serialized);
+    assert_int_equal(serialize_len, expectedsize);
+
+    r2 = roaring_bitmap_portable_deserialize_frozen(serialized);
+    assert_non_null(r2);
+
+    card1 = roaring_bitmap_get_cardinality(r1);
+    arr1 = (uint32_t *)malloc(card1 * sizeof(uint32_t));
+    roaring_bitmap_to_uint32_array(r1, arr1);
+
+    card2 = roaring_bitmap_get_cardinality(r2);
+    arr2 = (uint32_t *)malloc(card2 * sizeof(uint32_t));
+    roaring_bitmap_to_uint32_array(r2, arr2);
+
+    assert_true(array_equals(arr1, card1, arr2, card2));
+    assert_true(roaring_bitmap_equals(r1, r2));
+    free(arr1);
+    free(arr2);
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
+    free(serialized);
+
+    r1 = roaring_bitmap_create();
+    assert_non_null(r1);
+
+    for (uint32_t k = 100; k < 100000; ++k) {
+        roaring_bitmap_add(r1, k);
+    }
+
+    roaring_bitmap_run_optimize(r1);
+    expectedsize = roaring_bitmap_portable_size_in_bytes(r1);
+    serialized = (char *)malloc(expectedsize);
+    serialize_len = roaring_bitmap_portable_serialize(r1, serialized);
+    assert_int_equal(serialize_len, expectedsize);
+
+    r2 = roaring_bitmap_portable_deserialize_frozen(serialized);
+    assert_non_null(r2);
+
+    card1 = roaring_bitmap_get_cardinality(r1);
+    arr1 = (uint32_t *)malloc(card1 * sizeof(uint32_t));
+    roaring_bitmap_to_uint32_array(r1, arr1);
+
+    card2 = roaring_bitmap_get_cardinality(r2);
+    arr2 = (uint32_t *)malloc(card2 * sizeof(uint32_t));
+    roaring_bitmap_to_uint32_array(r2, arr2);
+
+    assert_true(array_equals(arr1, card1, arr2, card2));
+    assert_true(roaring_bitmap_equals(r1, r2));
+    free(arr1);
+    free(arr2);
+    roaring_bitmap_free(r1);
+    roaring_bitmap_free(r2);
+    free(serialized);
+}
+
+DEFINE_TEST(convert_to_bitset) {
+    roaring_bitmap_t *r1 = roaring_bitmap_create();
+    for (uint32_t i = 100; i < 100000; i += 1 + (i % 5)) {
+        roaring_bitmap_add(r1, i);
+    }
+    for (uint32_t i = 100000; i < 500000; i += 100) {
+        roaring_bitmap_add(r1, i);
+    }
+    roaring_bitmap_add_range(r1, 500000, 600000);
+    bitset_t *bitset = bitset_create();
+    bool success = roaring_bitmap_to_bitset(r1, bitset);
+    assert_true(success);  // could fail due to memory allocation.
+    assert_true(bitset_count(bitset) == roaring_bitmap_get_cardinality(r1));
+    // Every value added one by one above must be set in the bitset.
+    for (uint32_t i = 100; i < 100000; i += 1 + (i % 5)) {
+        assert_true(bitset_get(bitset, i));
+    }
+    for (uint32_t i = 100000; i < 500000; i += 100) {
+        assert_true(bitset_get(bitset, i));
+    }
+    // The bitset is released separately from the bitmap it was built from.
+    bitset_free(bitset);
+    roaring_bitmap_free(r1);
+}
+
+
+bool deserialization_test(const char *data, size_t size) {
+    // Feeds `size` bytes to the safe portable deserializer; NULL is accepted.
+    // Returns false only if a validated bitmap has inconsistent cardinality.
+    bool consistent = true;
+    roaring_bitmap_t *bitmap =
+        roaring_bitmap_portable_deserialize_safe(data, size);
+    if (bitmap) {
+        // A non-NULL bitmap may still violate the format specification, so
+        // it is only exercised further when internal validation accepts it.
+        const char *reason_failure = NULL;
+        if (roaring_bitmap_internal_validate(bitmap, &reason_failure)) {
+            // Track the expected cardinality while inserting [100, 1000).
+            uint64_t cardinality = roaring_bitmap_get_cardinality(bitmap);
+            for (uint32_t i = 100; i < 1000; i++) {
+                if (!roaring_bitmap_contains(bitmap, i)) {
+                    cardinality++;
+                    roaring_bitmap_add(bitmap, i);
+                }
+            }
+
+            // No early return here: the bitmap must still be freed below.
+            uint64_t new_cardinality = roaring_bitmap_get_cardinality(bitmap);
+            consistent = (cardinality == new_cardinality);
+        }
+        roaring_bitmap_free(bitmap);
+    }
+    return consistent;
+}
+
+DEFINE_TEST(robust_deserialization) {
+    assert_true(deserialization_test(NULL, 0));
+    // contains a run container that overflows the 16-bit boundary.
+    const char test1[] = "\x3b\x30\x00\x00\x01\x00\x00\xfa\x2e\x01\x00\x00\x02\xff\xff";
+    assert_true(deserialization_test(test1, sizeof(test1)));
+}
 
 int main() {
     tellmeall();
 
     const struct CMUnitTest tests[] = {
+        cmocka_unit_test(robust_deserialization),
+        cmocka_unit_test(issue457),
+        cmocka_unit_test(convert_to_bitset),
+        cmocka_unit_test(issue440),
+        cmocka_unit_test(issue436),
+        cmocka_unit_test(issue433),
+        cmocka_unit_test(issue429),
+        cmocka_unit_test(issue431),
+        cmocka_unit_test(test_contains_range_PyRoaringBitMap_issue81),
         cmocka_unit_test(issue316),
         cmocka_unit_test(issue288),
+#if !CROARING_IS_BIG_ENDIAN
         cmocka_unit_test(issue245),
+#endif
         cmocka_unit_test(issue208),
         cmocka_unit_test(issue208b),
         cmocka_unit_test(range_contains),
+        cmocka_unit_test(contains_bulk),
         cmocka_unit_test(inplaceorwide),
         cmocka_unit_test(test_contains_range),
         cmocka_unit_test(check_range_contains_from_end),
@@ -4195,17 +4667,21 @@ int main() {
         cmocka_unit_test(test_stress_memory_false),
         cmocka_unit_test(check_interval),
         cmocka_unit_test(test_uint32_iterator_true),
+#if !CROARING_IS_BIG_ENDIAN
         cmocka_unit_test(test_example_true),
         cmocka_unit_test(test_example_false),
+#endif
         cmocka_unit_test(test_clear),
         cmocka_unit_test(can_copy_empty_true),
         cmocka_unit_test(can_copy_empty_false),
         cmocka_unit_test(test_intersect_small_run_bitset),
         cmocka_unit_test(is_really_empty),
         cmocka_unit_test(test_rank),
+        cmocka_unit_test(test_get_index),
         cmocka_unit_test(test_maximum_minimum),
         cmocka_unit_test(test_stats),
         cmocka_unit_test(test_addremove),
+        cmocka_unit_test(test_addremove_bulk),
         cmocka_unit_test(test_addremoverun),
         cmocka_unit_test(test_basic_add),
         cmocka_unit_test(test_remove_withrun),
@@ -4215,6 +4691,7 @@ int main() {
         cmocka_unit_test(test_silly_range),
         cmocka_unit_test(test_uint32_iterator_true),
         cmocka_unit_test(test_uint32_iterator_false),
+        cmocka_unit_test(with_huge_capacity),
         cmocka_unit_test(leaks_with_empty_true),
         cmocka_unit_test(leaks_with_empty_false),
         cmocka_unit_test(test_bitmap_from_range),
@@ -4225,8 +4702,10 @@ int main() {
         cmocka_unit_test(test_iterate_empty),
         cmocka_unit_test(test_iterate_withbitmap),
         cmocka_unit_test(test_iterate_withrun),
+#if !CROARING_IS_BIG_ENDIAN
         cmocka_unit_test(test_serialize),
         cmocka_unit_test(test_portable_serialize),
+#endif
         cmocka_unit_test(test_add),
         cmocka_unit_test(test_add_checked),
         cmocka_unit_test(test_remove_checked),
@@ -4299,8 +4778,11 @@ int main() {
         cmocka_unit_test(test_remove_range),
         cmocka_unit_test(test_remove_many),
         cmocka_unit_test(test_range_cardinality),
+#if !CROARING_IS_BIG_ENDIAN
         cmocka_unit_test(test_frozen_serialization),
         cmocka_unit_test(test_frozen_serialization_max_containers),
+        cmocka_unit_test(test_portable_deserialize_frozen),
+#endif
     };
 
     return cmocka_run_group_tests(tests, NULL, NULL);
diff --git a/tools/cmake/FindCTargets.cmake b/tools/cmake/FindCTargets.cmake
index 8dae8ffc3..341f9b2c2 100644
--- a/tools/cmake/FindCTargets.cmake
+++ b/tools/cmake/FindCTargets.cmake
@@ -1,6 +1,10 @@
 if (CMAKE_VERSION VERSION_GREATER 3.0.0)
   cmake_policy(VERSION 3.0.0)
 endif ()
+include(${PROJECT_SOURCE_DIR}/tools/cmake/Import.cmake)
+set(BUILD_STATIC_LIB ON)
+import_dependency(cmocka clibs/cmocka  f5e2cd7)
+add_dependency(cmocka)
 
 function(add_c_test TEST_NAME)
   if(ROARING_BUILD_C_TESTS_AS_CPP)  # under C++, container_t* != void*
@@ -9,8 +13,7 @@ function(add_c_test TEST_NAME)
 
   add_executable(${TEST_NAME} ${TEST_NAME}.c)
 
-  include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/vendor/cmocka)
-  target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka-static)
+  target_link_libraries(${TEST_NAME} roaring cmocka-static)
 
   add_test(${TEST_NAME} ${TEST_NAME})
 endfunction(add_c_test)
@@ -26,8 +29,7 @@ if (CMAKE_VERSION VERSION_GREATER 2.8.10)
     endif()
     target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/cpp)
 
-    include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/vendor/cmocka)
-    target_link_libraries(${TEST_NAME} ${ROARING_LIB_NAME} cmocka-static)
+    target_link_libraries(${TEST_NAME} roaring cmocka-static)
 
     add_test(${TEST_NAME} ${TEST_NAME})
   endfunction(add_cpp_test)
@@ -39,5 +41,15 @@ endif()
 
 function(add_c_benchmark BENCH_NAME)
   add_executable(${BENCH_NAME} ${BENCH_NAME}.c)
-  target_link_libraries(${BENCH_NAME} ${ROARING_LIB_NAME})
+  target_link_libraries(${BENCH_NAME} roaring)
 endfunction(add_c_benchmark)
+
+function(add_cpp_benchmark BENCH_NAME)
+  add_executable(${BENCH_NAME} ${BENCH_NAME}.cpp)
+  target_link_libraries(${BENCH_NAME} roaring)
+  if(ROARING_EXCEPTIONS)
+    target_compile_definitions(${BENCH_NAME} PUBLIC ROARING_EXCEPTIONS=1)
+  else()
+    target_compile_definitions(${BENCH_NAME} PUBLIC ROARING_EXCEPTIONS=0)
+  endif()
+endfunction(add_cpp_benchmark)
diff --git a/tools/cmake/FindOptions.cmake b/tools/cmake/FindOptions.cmake
index 49f797664..f7b8f9675 100644
--- a/tools/cmake/FindOptions.cmake
+++ b/tools/cmake/FindOptions.cmake
@@ -16,17 +16,6 @@ endif()
 if((NOT MSVC) AND ROARING_ARCH)
 set(OPT_FLAGS "-march=${ROARING_ARCH}")
 endif()
-if(ROARING_DISABLE_X64)
-  # we can manually disable any optimization for x64
-  set (OPT_FLAGS "${OPT_FLAGS} -DROARING_DISABLE_X64" )
-endif()
-if(ROARING_DISABLE_AVX)
-   # we can manually disable AVX by defining DISABLEAVX
-   set (OPT_FLAGS "${OPT_FLAGS} -DROARING_DISABLE_AVX" )
- endif()
-if(ROARING_DISABLE_NEON)
-  set (OPT_FLAGS "${OPT_FLAGS} -DDISABLENEON" )
-endif()
 
 if(FORCE_AVX) # some compilers like clang do not automagically define __AVX2__ and __BMI2__ even when the hardware supports it
 if(NOT MSVC)
@@ -36,6 +25,14 @@ else()
 endif()
 endif()
 
+if(FORCE_AVX512) # some compilers like clang do not automagically define __AVX512__ even when the hardware supports it
+if(NOT MSVC)
+   set (OPT_FLAGS "${OPT_FLAGS} -mbmi2 -mavx512f -mavx512bw -mavx512dq -mavx512vbmi2 -mavx512bitalg -mavx512vpopcntdq")
+else()
+   set (OPT_FLAGS "${OPT_FLAGS} /arch:AVX512")
+endif()
+endif()
+
 if(NOT MSVC)
 set(STD_FLAGS "-std=c11 -fPIC")
 set(CXXSTD_FLAGS "-std=c++11 -fPIC")
diff --git a/tools/cmake/Import.cmake b/tools/cmake/Import.cmake
index a79ed8f58..2b4e3e755 100644
--- a/tools/cmake/Import.cmake
+++ b/tools/cmake/Import.cmake
@@ -1,10 +1,9 @@
-# Based on github.com/simdjson/simdjson/blob/master/dependencies/import.cmocka by @friendlyanon
+set(dep_root "${PROJECT_SOURCE_DIR}/dependencies/.cache")
 
-set(dep_root "${CMAKE_CURRENT_SOURCE_DIR}/.cache")
 
-function(import_dependency NAME URL)
-  message(STATUS "Importing ${NAME} (${URL})")
-  set(target "${CMAKE_CURRENT_SOURCE_DIR}/${NAME}")
+function(import_dependency NAME GITHUB_REPO COMMIT)
+  message(STATUS "Importing ${NAME} (${GITHUB_REPO}@${COMMIT})")
+  set(target "${dep_root}/${NAME}")
 
   # If the folder exists in the cache, then we assume that everything is as
   # should be and do nothing
@@ -13,12 +12,12 @@ function(import_dependency NAME URL)
     return()
   endif()
 
-  set(archive "${dep_root}/archive.tar.xz")
+  set(zip_url "https://github.com/${GITHUB_REPO}/archive/${COMMIT}.zip")
+  set(archive "${dep_root}/archive.zip")
   set(dest "${dep_root}/_extract")
 
-  file(DOWNLOAD "${URL}" "${archive}")
+  file(DOWNLOAD "${zip_url}" "${archive}")
   file(MAKE_DIRECTORY "${dest}")
-  file(GLOB dir LIST_DIRECTORIES YES "${dep_root}/*")
   execute_process(
           WORKING_DIRECTORY "${dest}"
           COMMAND "${CMAKE_COMMAND}" -E tar xf "${archive}")
@@ -32,3 +31,20 @@ function(import_dependency NAME URL)
 
   set("${NAME}_SOURCE_DIR" "${target}" PARENT_SCOPE)
 endfunction()
+
+# Adds the imported dependency's source tree (<NAME>_SOURCE_DIR) to the build
+macro(add_dependency NAME)
+  if(NOT DEFINED "${NAME}_SOURCE_DIR")
+    message(FATAL_ERROR "Missing ${NAME}_SOURCE_DIR variable")
+  endif()
+
+  add_subdirectory("${${NAME}_SOURCE_DIR}" "${PROJECT_BINARY_DIR}/_deps/${NAME}" EXCLUDE_FROM_ALL)
+endmacro()
+
+function(set_off NAME)
+  set("${NAME}" OFF CACHE INTERNAL "")
+endfunction()
+
+function(set_on NAME)  # stores NAME=ON as an INTERNAL cache entry
+  set("${NAME}" ON CACHE INTERNAL "")
+endfunction()
diff --git a/tools/prepare_doxygen.sh b/tools/prepare_doxygen.sh
new file mode 100755
index 000000000..f04ff628c
--- /dev/null
+++ b/tools/prepare_doxygen.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+set -e
+# Fetches the doxygen-awesome-css theme at a fixed tag into ./theme.
+PACKAGE_URL="https://github.com/jothepro/doxygen-awesome-css.git"
+PACKAGE_VERSION="v2.1.0"
+# The theme directory is placed under the directory the script is run from.
+BASE_DIR=$(pwd)
+THEME_DIR="$BASE_DIR/theme"
+WORKSPACE=$(mktemp -d 2> /dev/null || mktemp -d -t 'tmp')
+cleanup () {
+  EXIT_CODE=$?  # status at the moment the trap fired
+  [ -d "$WORKSPACE" ] && rm -rf "$WORKSPACE"
+  exit $EXIT_CODE
+}
+# Remove the temporary workspace on normal exit as well as on INT/TERM.
+trap cleanup INT TERM EXIT
+# Clone into the workspace first, then replace any existing theme directory.
+cd "$WORKSPACE"
+git clone --depth=1 --branch "$PACKAGE_VERSION" "$PACKAGE_URL" theme
+rm -rf "$THEME_DIR"
+mv "$WORKSPACE/theme" "$THEME_DIR"
diff --git a/tools/release.py b/tools/release.py
index 8f488f67f..5d4ff71c6 100755
--- a/tools/release.py
+++ b/tools/release.py
@@ -135,6 +135,11 @@ def topaddedversionstring(major, minor, rev):
 
 print("modified "+cmakefile+", a backup was made")
 
+doxygenfile = maindir + os.sep + "doxygen"
+# Rewrite the PROJECT_NUMBER entry of the doxygen file in place (backup: .bak).
+for line in fileinput.input(doxygenfile, inplace=1, backup='.bak'):
+    line = re.sub(r'PROJECT_NUMBER         = "\d+\.\d+\.\d+','PROJECT_NUMBER         = "'+newversionstring, line.rstrip())
+    print(line)
 
 print("Please run the tests before issuing a release: "+scriptlocation + "/prereleasetests.sh \n")
 print("to issue release, enter \n git commit -a \n git push \n git tag -a v"+toversionstring(*newversion)+" -m \"version "+toversionstring(*newversion)+"\"\n git push --tags \n")