diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..b33b3b2
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+BasedOnStyle: WebKit
+AlignAfterOpenBracket: BlockIndent
+ColumnLimit: 100
+BinPackParameters: false
+BinPackArguments: false
+IncludeBlocks: Regroup
diff --git a/.gitignore b/.gitignore
index fb9f65d..2902970 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,5 @@ dmypy.json
 
 # End of https://www.toptal.com/developers/gitignore/api/python
 instances/
+
+bruteforce_wrapper_gpu.cpp
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index d4bb2cb..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = .
-BUILDDIR      = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_templates/autoapi/index.rst b/docs/_templates/autoapi/index.rst
deleted file mode 100644
index 87a7ff9..0000000
--- a/docs/_templates/autoapi/index.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-API Reference
-=============
-
-This page contains auto-generated API reference documentation.
-
-.. toctree::
-   :titlesonly:
-
-   {% for page in pages | sort %}
-   {% if (page.top_level_object or page.name.split('.') | length == 3) and page.display %}
-   {{ page.short_name | capitalize }} <{{ page.include_path }}>
-   {% endif %}
-   {% endfor %}
diff --git a/docs/assets/logo-large.png b/docs/assets/logo-large.png
new file mode 100644
index 0000000..2d16045
Binary files /dev/null and b/docs/assets/logo-large.png differ
diff --git a/docs/assets/logo.png b/docs/assets/logo.png
new file mode 100644
index 0000000..1e22174
Binary files /dev/null and b/docs/assets/logo.png differ
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index b6b4a53..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = "Omnisolver Bruteforce"
-copyright = "2021, Konrad Jałowiecki, Łukasz Pawela"
-author = "Konrad Jałowiecki, Łukasz Pawela"
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = ["myst_parser", "sphinx_term.termynal", "autoapi.extension"]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = "pydata_sphinx_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-autoapi_dirs = ["../omnisolver"]
-autoapi_python_use_implicit_namespaces = True
-autoapi_template_dir = "_templates/autoapi"
diff --git a/docs/index.md b/docs/index.md
index e078135..f99fb01 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,49 +1,91 @@
-# Welcome to omnisolver-pt documentation!
+---
+hide:
+  - navigation
+---
 
-The `omnisolver-bruteforce` is Omnisolver plugin implementing the bruteforce (a.k.a exhaustive 
-search) algorithm.
+<p align="center">
+    <a href="https://github.com/euro-hpc-pl/omnisolver"><img src="assets/logo-large.png" alt="Omnisolver"></a>
+</p>
 
-## Installation
+---
 
-Currently, the `omnisolver-bruteforce` package requires working CUDA installation.
+<h1></h1>
 
-To install the plugin first set the `CUDAHOME` environmental library to your CUDA instalaltion location, e.g.:
+<p align="center">
+    <a href="https://github.com/euro-hpc-pl/omnisolver/actions/workflows/quality_checks.yml">
+    <img src="https://github.com/euro-hpc-pl/omnisolver/actions/workflows/quality_checks.yml/badge.svg" alt="Tests"/>
+    </a>
+<a href="https://euro-hpc-pl.github.io/omnisolver">
+<img alt="Docs" src="https://img.shields.io/github/actions/workflow/status/euro-hpc-pl/omnisolver/quality_checks.yml?label=Docs">
+</a>
+</p>
 
-```shell
-# Rmember, your actual location  may vary!
-export CUDAHOME=/usr/local/cuda
-```
+**Documentation:** https://euro-hpc-pl.github.io/omnisolver 
 
-and then run:
+**Source code:** https://github.com/euro-hpc-pl/omnisolver
 
-```shell
-pip install omnisolver-bruteforce
-```
+---
 
-> **Warning**
-> If you don't set the `CUDAHOME` directory, an attempt will be made to deduce it based on the location of your `nvcc` compiler.
-> However, this process might not work in all the cases and should not be relied on.
+Omnisolver is a collection of Binary Quadratic Model solvers and a framework for implementing them.
 
-## Command line usage
-```text
-usage: omnisolver bruteforce-gpu [-h] [--output OUTPUT] [--vartype {SPIN,BINARY}] [--num_states NUM_STATES] [--suffix_size SUFFIX_SIZE] [--grid_size GRID_SIZE]
-                                 [--block_size BLOCK_SIZE] [--dtype {float,double}]
-                                 input
 
-Bruteforce (a.k.a exhaustive search) sampler using CUA-enabled GPU
 
-positional arguments:
-  input                 Path of the input BQM file in COO format. If not specified, stdin is used.
+## Why Omnisolver?
+
+### Benefits for the end-users
+
+Omnisolver contains a selection of standard and more sophisticated algorithms for solving BQMs. All solvers are available through intuitive CLI or from Python scripts as dimod based Samplers.
+
+### Benefits for solver creators
+
+Omnisolver allows developer to focus on algorithms instead of common tasks like handling input/output or creating CLI.
+
+## Quickstart
 
-optional arguments:
-  -h, --help            show this help message and exit
-  --output OUTPUT       Path of the output file. If not specified, stdout is used.
-  --vartype {SPIN,BINARY}
-                        Variable type
-  --num_states NUM_STATES
+<!-- termynal -->
+
+```
+# Install base omnisolver package
+$ pip install omnisolver
+---> 100%
+Successfuly installed omnisolver
+# Install chosen plugins (e.g. parallel-tempering solver)
+$ pip install omnisolver-pt
+---> 100%
+Successfuly installed omnisolver-pt
+# Create an instance file in COOrdinate format
+$ echo "0 1 1.0
+> 1 2 1.0
+> 2 0 1.0" > instance.txt
+# Run solver
+$ omnisolver pt --vartype SPIN instance.txt
+0,1,2,energy,num_occurrences
+1,-1,-1,-1.0,1
 ```
 
-## Documentation
-```{toctree}
-:maxdepth: 1
+## What's next?
+
+Here are some resources to get you started:
+
+- Start with user guide to learn about the installation methods and general usage patterns.
+- Discover available solvers in our plugin list.
+- If you are interested in developing your own solver, or are interested in in-depth details of how the Omnisolver
+  works, check our solver creator guide and reference manual.
+
+## Citing
+
+If you used Omnisolver in your research, consider citing it in your paper.
+You can use the following BibTeX entry:
+
+```text
+@article{omnisolver2023,
+    title = {Omnisolver: An extensible interface to Ising spin–glass and QUBO solvers},
+    journal = {SoftwareX},
+    volume = {24},
+    pages = {101559},
+    year = {2023},
+    doi = {https://doi.org/10.1016/j.softx.2023.101559},
+    url = {https://www.softxjournal.com/article/S2352-7110(23)00255-8/},
+    author = {Konrad Jałowiecki and {\L}ukasz Pawela},
+}
 ```
diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js
new file mode 100644
index 0000000..396f03b
--- /dev/null
+++ b/docs/javascripts/mathjax.js
@@ -0,0 +1,18 @@
+window.MathJax = {
+  tex: {
+    inlineMath: [["\\(", "\\)"]],
+    displayMath: [["\\[", "\\]"]],
+    processEscapes: true,
+    processEnvironments: true
+  },
+  options: {
+    ignoreHtmlClass: ".*|",
+    processHtmlClass: "arithmatex"
+  }
+};
+
+document$.subscribe(() => {
+
+
+  MathJax.typesetPromise()
+})
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 8084272..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/reference/omnisolver_bruteforce_distributed.md b/docs/reference/omnisolver_bruteforce_distributed.md
new file mode 100644
index 0000000..8860640
--- /dev/null
+++ b/docs/reference/omnisolver_bruteforce_distributed.md
@@ -0,0 +1,2 @@
+# ::: omnisolver.bruteforce.gpu.distributed
+    handler: python
diff --git a/docs/reference/omnisolver_bruteforce_sampler.md b/docs/reference/omnisolver_bruteforce_sampler.md
new file mode 100644
index 0000000..84cc6a5
--- /dev/null
+++ b/docs/reference/omnisolver_bruteforce_sampler.md
@@ -0,0 +1,2 @@
+# ::: omnisolver.bruteforce.gpu.sampler
+    handler: python
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index 09f95f3..0000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-sphinx~=6.1.3
-sphinx-autoapi~=2.0.1
-pydata-sphinx-theme~=0.13.1
-sphinx-term~=0.1
-myst-parser~=1.0.0
diff --git a/examples/distributed.py b/examples/distributed.py
new file mode 100644
index 0000000..aab4250
--- /dev/null
+++ b/examples/distributed.py
@@ -0,0 +1,55 @@
+from dimod import BQM
+from numpy.random import default_rng
+
+from omnisolver.bruteforce.gpu import BruteforceGPUSampler
+from omnisolver.bruteforce.gpu.distributed import DistributedBruteforceGPUSampler
+
+
+def random_bqm(num_variables, vartype, offset, rng):
+    linear = {
+        i: coef for i, coef in zip(range(num_variables), rng.uniform(-2, 2, size=num_variables))
+    }
+    quadratic = {
+        (i, j): coef
+        for (i, j), coef in zip(
+            [(i, j) for i in range(num_variables) for j in range(i + 1, num_variables)],
+            rng.uniform(-1, -1, size=(num_variables - 1) * num_variables // 2),
+        )
+    }
+    return BQM(linear, quadratic, offset, vartype=vartype)
+
+
+NUM_VARIABLES = 40
+
+
+def main():
+    sampler = DistributedBruteforceGPUSampler()
+    sampler2 = BruteforceGPUSampler()
+    rng = default_rng(1234)
+
+    bqm = random_bqm(NUM_VARIABLES, "BINARY", 0.0, rng)
+
+    import time
+
+    start = time.perf_counter()
+    result = sampler.sample(
+        bqm, num_states=100, num_fixed_vars=1, suffix_size=25, grid_size=1024, block_size=1024
+    )
+    duration = time.perf_counter() - start
+    distributed_en = [entry.energy for entry in result.data()]
+
+    print(f"Distributed finished in : {duration}s")
+
+    start = time.perf_counter()
+    result2 = sampler2.sample(
+        bqm, num_states=100, suffix_size=25, grid_size=2**12, block_size=512
+    )
+    duration = time.perf_counter() - start
+    single_en = [entry.energy for entry in result2.data()]
+    print(f"Single finished in: {duration}s")
+
+    print(max(abs(en1 - en2) for en1, en2 in zip(distributed_en, single_en)))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..2d13988
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,66 @@
+# yaml-language-server: $schema=https://squidfunk.github.io/mkdocs-material/schema.json
+site_name: Omnisolver-Bruteforce
+site_author: "Konrad Jałowiecki & Łukasz Pawela"
+copyright: ""
+theme:
+  name: material
+  logo: assets/logo.png
+  palette:
+    primary: custom
+  features:
+    - navigation.tabs
+    - navigation.tracking
+extra:
+  version:
+    provider: mike
+extra_css:
+  - stylesheets/extra.css
+nav:
+  - User guide: userguide.md
+  #- Solver creator guide: creatorguide.md
+  - Reference manual:
+    - omnisolver.bruteforce.gpu.sampler: reference/omnisolver_bruteforce_sampler.md
+    - omnisolver.bruteforce.gpu.distributed: reference/omnisolver_bruteforce_distributed.md
+  - Plugin list: plugins.md
+plugins:
+  - search
+  - mike:
+      canonical_version: latest
+      version_selector: true
+  - termynal:
+      prompt_literal_start:
+        - "$"
+  - mkdocstrings:
+      handlers:
+        python:
+          options:
+            annotation_path: brief
+            heading_level: 2
+            show_source: false
+            show_root_toc_entry: false
+            show_signature_annotations: true
+            members_order: source
+            separate_signature: true
+            docstring_style: sphinx
+            show_if_no_docstring: true
+            docstring_options:
+              ignore_init_summary: true
+              merge_init_into_class: true
+              show_docstring_returns: true
+
+  - with-pdf:
+       render_js: true
+       cover_subtitle: "Exhaustive search plugin for Omnisolver"
+repo_name: Omnisolver
+repo_url: https://github.com/euro-hpc-pl/omnisolver
+markdown_extensions:
+  - pymdownx.superfences
+  - pymdownx.arithmatex:
+      generic: true
+  - admonition
+  - pymdownx.tabbed:
+      alternate_style: true
+extra_javascript:
+  - javascripts/mathjax.js
+  - https://polyfill.io/v3/polyfill.min.js?features=es6
+  - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
diff --git a/omnisolver/bruteforce/gpu/__init__.py b/omnisolver/bruteforce/gpu/__init__.py
index d0c35ef..5b868f2 100644
--- a/omnisolver/bruteforce/gpu/__init__.py
+++ b/omnisolver/bruteforce/gpu/__init__.py
@@ -1,12 +1,11 @@
 """Implementation of parallel tempering plugin for Omnisolver."""
-from omnisolver.common.plugin import Plugin, plugin_from_specification, plugin_impl
+from omnisolver.common.plugin import Plugin, plugin_from_specification
 from pkg_resources import resource_stream
 from yaml import safe_load
 
 from .sampler import BruteforceGPUSampler
 
 
-@plugin_impl
 def get_plugin() -> Plugin:
     """Get package name and resource path."""
     specification = safe_load(resource_stream("omnisolver.bruteforce.gpu", "gpu.yml"))
diff --git a/omnisolver/bruteforce/gpu/distributed.py b/omnisolver/bruteforce/gpu/distributed.py
new file mode 100644
index 0000000..9729a0c
--- /dev/null
+++ b/omnisolver/bruteforce/gpu/distributed.py
@@ -0,0 +1,108 @@
+import typing
+from itertools import product
+from time import perf_counter
+
+import numpy as np
+import ray
+from dimod import Sampler, Vartype, append_variables, concatenate
+
+from .sampler import BruteforceGPUSampler
+
+
+@ray.remote(num_gpus=1)
+def _solve_subproblem(
+    bqm,
+    num_states,
+    fixed_vars,
+    suffix_size,
+    grid_size,
+    block_size,
+    num_steps_per_kernel,
+    partial_diff_buffer_depth,
+    dtype,
+):
+    new_bqm = bqm.copy()
+    new_bqm.fix_variables(fixed_vars)
+
+    sampler = BruteforceGPUSampler()
+    result = sampler.sample(
+        new_bqm,
+        num_states,
+        suffix_size,
+        grid_size,
+        block_size,
+        num_steps_per_kernel,
+        partial_diff_buffer_depth,
+        dtype,
+    )
+
+    return append_variables(result, fixed_vars)
+
+
+class DistributedBruteforceGPUSampler(Sampler):
+    def sample(
+        self,
+        bqm,
+        num_states,
+        num_fixed_vars,
+        suffix_size,
+        grid_size,
+        block_size,
+        num_steps_per_kernel=1,
+        partial_diff_buffer_depth=1,
+        dtype=np.float32,
+    ):
+        if bqm.vartype == Vartype.SPIN:
+            return self.sample(
+                bqm.change_vartype("BINARY", inplace=False),
+                num_states,
+                num_fixed_vars,
+                suffix_size,
+                grid_size,
+                block_size,
+                num_steps_per_kernel,
+                partial_diff_buffer_depth,
+            ).change_vartype("SPIN", inplace=False)
+
+        bqm, mapping = bqm.relabel_variables_as_integers()
+
+        start_counter = perf_counter()
+
+        subproblems = [
+            {i: v for i, v in enumerate(vals)} for vals in product([0, 1], repeat=num_fixed_vars)
+        ]
+
+        refs = [
+            _solve_subproblem.remote(
+                bqm,
+                num_states,
+                fixed_vars,
+                suffix_size,
+                grid_size,
+                block_size,
+                num_steps_per_kernel,
+                partial_diff_buffer_depth,
+                dtype,
+            )
+            for fixed_vars in subproblems
+        ]
+
+        subsolutions = [ray.get(ref) for ref in refs]
+        solve_time_in_seconds = perf_counter() - start_counter
+        result = concatenate(subsolutions).truncate(num_states)
+        result.info["solve_time_in_seconds"] = solve_time_in_seconds
+        return result.relabel_variables(mapping)
+
+    @property
+    def parameters(self) -> typing.Dict[str, typing.Any]:
+        return {
+            "num_states": [],
+            "suffix_size": [],
+            "grid_size": [],
+            "block_size": [],
+            "dtype": [],
+        }
+
+    @property
+    def properties(self) -> typing.Dict[str, typing.Any]:
+        return {}
diff --git a/omnisolver/bruteforce/gpu/sampler.py b/omnisolver/bruteforce/gpu/sampler.py
index d26e8c4..81a960c 100644
--- a/omnisolver/bruteforce/gpu/sampler.py
+++ b/omnisolver/bruteforce/gpu/sampler.py
@@ -4,7 +4,7 @@
 import numpy as np
 from dimod import Sampler, SampleSet, Vartype
 
-from omnisolver.bruteforce.ext.gpu import gpu_search
+from omnisolver.bruteforce.ext.gpu import gpu_search, gpu_search_ground_only
 
 
 def _convert_int_to_sample(val, num_variables):
@@ -16,7 +16,17 @@ def _convert_int_to_sample(val, num_variables):
 
 
 class BruteforceGPUSampler(Sampler):
-    def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.float32):
+    def sample(
+        self,
+        bqm,
+        num_states,
+        suffix_size,
+        grid_size,
+        block_size,
+        num_steps_per_kernel=16,
+        partial_diff_buffer_depth=1,
+        dtype=np.float32,
+    ):
         """Solve Binary Quadratic Model using exhaustive (bruteforce) search on the GPU.
 
         :param bqm: Binary Quadratic Model instance to solve.
@@ -38,6 +48,9 @@ def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.f
                 suffix_size,
                 grid_size,
                 block_size,
+                num_steps_per_kernel,
+                partial_diff_buffer_depth,
+                dtype,
             ).change_vartype("SPIN", inplace=False)
 
         bqm, mapping = bqm.relabel_variables_as_integers()
@@ -56,15 +69,27 @@ def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.f
 
         start_counter = perf_counter()
 
-        gpu_search(
-            qubo_mat,
-            num_states,
-            states_out,
-            energies_out,
-            grid_size,
-            block_size,
-            suffix_size,
-        )
+        if num_states == 1:  # Shortcut if we are only looking for a ground state
+            gpu_search_ground_only(
+                qubo_mat,
+                states_out,
+                energies_out,
+                grid_size,
+                block_size,
+                suffix_size,
+                num_steps_per_kernel,
+                partial_diff_buffer_depth,
+            )
+        else:
+            gpu_search(
+                qubo_mat,
+                num_states,
+                states_out,
+                energies_out,
+                grid_size,
+                block_size,
+                suffix_size,
+            )
 
         solve_time_in_seconds = perf_counter() - start_counter
 
diff --git a/omnisolver/extensions/bruteforce_gpu.cu b/omnisolver/extensions/bruteforce_gpu.cu
index 03d62d1..1adc65d 100644
--- a/omnisolver/extensions/bruteforce_gpu.cu
+++ b/omnisolver/extensions/bruteforce_gpu.cu
@@ -1,30 +1,36 @@
 // Implementation of exhaustive search using a CUDA--enabled GPU.
-#include <bit>
-#include <random>
-#include <stdexcept>
+#include "vectors.h"
+
 #include <sstream>
+#include <stdexcept>
 #include <thrust/execution_policy.h>
 #include <thrust/sort.h>
-#include "vectors.h"
 
 // Basic error checking (used only for kernel launches)
 // It should throw an instance of std::runtime_error containing the decoded
 // cuda runtime error message and also filename and line at which the error
 // occurred.
-#define cuda_error_check(code) { cuda_assert((code), __FILE__, __LINE__); }
+#define cuda_error_check(code)                                                                     \
+    {                                                                                              \
+        cuda_assert((code), __FILE__, __LINE__);                                                   \
+    }
+
+__device__ __forceinline__ int thread_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
+
+__device__ __forceinline__ int grid_size() { return blockDim.x * gridDim.x; }
 
-inline void cuda_assert(cudaError_t code, const char *file, int line)
+inline void cuda_assert(cudaError_t code, const char* file, int line)
 {
-   if (code != cudaSuccess)
-   {
-      std::stringstream what;
-      what << "File: " << __FILE__ << " at " << __LINE__ << ", error: " << cudaGetErrorString(code);
-      throw std::runtime_error(what.str());
-   }
+    if (code != cudaSuccess) {
+        std::stringstream what;
+        what << "File: " << __FILE__ << " at " << __LINE__
+             << ", error: " << cudaGetErrorString(code);
+        throw std::runtime_error(what.str());
+    }
 }
 
 // Compute i-th Gray code.
-uint64_t gray(uint64_t index) { return index ^ (index >> 1); }
+__host__ __device__ uint64_t gray(uint64_t index) { return index ^ (index >> 1); }
 
 // Find First bit Set in a number.
 // Following common convention for this function it returns bits indexed from 1
@@ -33,12 +39,13 @@ uint64_t gray(uint64_t index) { return index ^ (index >> 1); }
 //     ffs(7) = 1
 //     ffs(10) = 2
 //     ffs(24) = 4
-int ffs(uint64_t number) {
-    if(number == 0) {
+int ffs(uint64_t number)
+{
+    if (number == 0) {
         return 0;
     }
     int result = 1;
-    while((number & 1) != 1) {
+    while ((number & 1) != 1) {
         number >>= 1;
         result++;
     }
@@ -46,23 +53,62 @@ int ffs(uint64_t number) {
     return result;
 }
 
-// Given a QUBO and a state, compute how the energy changes if the i-th bit of state
-// is flipped.
-// Specifically, E(.) is energy function of the qubo, then this function computes
-// E(s) - E(s'), where s' is the state after flipping i-th bit of s.
-// One can easily verify that to compute the difference above we need only the
-// i-th row of QUBO, and hence we already pass this row instead of computign the offset.
+// Given a QUBO and a state, compute how the energy changes if the i-th bit of
+// state is flipped. Specifically, E(.) is energy function of the qubo, then
+// this function computes E(s) - E(s'), where s' is the state after flipping
+// i-th bit of s. One can easily verify that to compute the difference above we
+// need only the i-th row of QUBO, and hence we already pass this row instead
+// of computign the offset.
 template <typename T>
-__device__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) {
+__host__ __device__ __forceinline__ T energy_diff(T* qubo_row, int N, uint64_t state, int i)
+{
     int qi = (state >> i) & 1; // This is the i-th bit of state
-    T total = qubo_row[i];     // Start accumulating from the lineaer term
+    T total = qubo_row[i]; // Start accumulating from the linear term
+
+    uint32_t word = state & 0xFFFFFFFF;
 
     // Go through each bit of state
-    for(int j = 0; j < N; j++) {
-        if(i != j) { // except the one to flip, it's already accounted for
-            total += qubo_row[j] * (state & 1);
+    for (int j = 0; j < 32 && j < N; j++) {
+        if (i != j && (word % 2)) { // except the one to flip, it's already accounted for
+            total += qubo_row[j];
         }
-        state >>= 1; // move to next bit
+        word /= 2; //>>= 1; // move to next bit
+    }
+
+    word = state >> 32;
+    for (int j = 32; j < N; j++) {
+        if (i != j && (word % 2)) { // except the one to flip, it's already accounted for
+            total += qubo_row[j];
+        }
+        word /= 2; // move to next bit
+    }
+
+    // When flipping from 0 to 1 there is nothing to do, otherwise we need
+    // to multiply the total by -1.
+    return (2 * qi - 1) * total;
+}
+
+template <typename T>
+__device__ __forceinline__ T
+energy_diff_reduced(T* qubo_row, int N, int offset, uint64_t state, int i, T common_part)
+{
+    int qi = (state >> i) & 1; // This is the i-th bit of state
+    T total = common_part; // Start accumulating from the linear term
+
+    uint32_t word = (state >> offset) & 0xFFFFFFFF;
+
+    // Go through each bit of state
+    for (int j = offset; j < 32 + offset && j < N; j++) {
+        if (word % 2)
+            total += qubo_row[j];
+        word /= 2; // move to next bit
+    }
+
+    word = state >> (32 + offset);
+    for (int j = 32 + offset; j < N; j++) {
+        if (word % 2)
+            total += qubo_row[j];
+        word /= 2; // move to next bit
     }
 
     // When flipping from 0 to 1 there is nothing to do, otherwise we need
@@ -71,14 +117,14 @@ __device__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) {
 }
 
 // Initialize first states and energies for further processing.
-// The idea is as follows. Each state (which is N-bit number) is (logically) split into
-// two parts:
-// l-bit part specific to state
-// k-bit part that changes in all states in the same way for all states in given iteration.
-// This function computes energy of a state with some l-bits set according to index,
+// The idea is as follows. Each state (which is N-bit number) is (logically)
+// split into two parts: l-bit part specific to state k-bit part that changes
+// in all states in the same way for all states in given iteration. This
+// function computes energy of a state with some l-bits set according to index,
 // and k least significant bits set to 0.
 template <typename T>
-__device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t index) {
+__device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t index)
+{
     T energy = 0.0;
     uint64_t state = 0, suffix = index, offset = 0;
 
@@ -86,9 +132,9 @@ __device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t in
     // Instead we enumerate bits from the most significant one.
     // For instance, if index=12, which is 110 in binary, we will set the three
     // most significant bits of state to 011.
-    while(suffix > 0) {
+    while (suffix > 0) {
         // Find first set (note: not our function, this one is CUDA's intrinsic)
-        uint64_t bit_in_suffix = __ffs(suffix);
+        uint64_t bit_in_suffix = __ffsll(suffix);
         // Compute bit index from the most significant position
         // Since we reduce suffix in each iteration, we have to store the offset
         // travelled so far in the `offset` variable.
@@ -108,36 +154,28 @@ __device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t in
     states[index] = state;
 }
 
-// Kernel that basically launched the _init function above for all states and energies
+// Kernel that basically launches the _init function above for all states and
+// energies
 template <typename T>
-__global__ void init(
-    T* qubo,
-    int N,
-    T* energies,
-    uint64_t* states,
-    uint64_t states_in_chunk
-) {
-    for(int i=blockIdx.x * blockDim.x + threadIdx.x; i < states_in_chunk; i += blockDim.x * gridDim.x) {
+__global__ void init(T* qubo, int N, T* energies, uint64_t* states, uint64_t states_in_chunk)
+{
+    for (uint64_t i = thread_id(); i < states_in_chunk; i += grid_size()) {
         _init(qubo, N, states, energies, i);
     }
 }
 
 // A single step of bruteforce algorithm.
-// Recall that we split all states (logically) into l-most significant fixed bits
-// and k-least significant bits that are modified, one at a time, in each iteration.
-// This kernel flips one of those k-significant bits.
+// Recall that we split all states (logically) into l-most significant fixed
+// bits and k-least significant bits that are modified, one at a time, in each
+// iteration. This kernel flips one of those k-significant bits.
 template <typename T>
 __global__ void single_step(
-    T* qubo,
-    int N,
-    T* energies,
-    uint64_t* states,
-    int bit_to_flip,
-    uint64_t states_in_chunk
-) {
+    T* qubo, int N, T* energies, uint64_t* states, int bit_to_flip, uint64_t states_in_chunk
+)
+{
     T* qubo_row = qubo + N * bit_to_flip;
 
-    for(int i=blockIdx.x * blockDim.x + threadIdx.x; i < states_in_chunk; i += blockDim.x * gridDim.x) {
+    for (uint64_t i = thread_id(); i < states_in_chunk; i += grid_size()) {
         uint64_t state = states[i];
         T energy = energies[i];
         energies[i] = energy - energy_diff(qubo_row, N, state, bit_to_flip);
@@ -145,29 +183,131 @@ __global__ void single_step(
     }
 }
 
+template <typename T>
+__device__ __forceinline__ T* row_ptr(T* array_2d, uint64_t n_cols, uint64_t row_id)
+{
+    return array_2d + n_cols * row_id;
+}
+
+// Kernel performing whole ground state (and only ground state) computation in
+// a single step
+template <typename T>
+__global__ void find_ground(
+    T* qubo,
+    int N,
+    T* energies,
+    uint64_t* states,
+    T* best_energies,
+    uint64_t* best_states,
+    uint64_t suffix_size,
+    uint64_t num_iterations,
+    int* bit_buffer,
+    T* prefix_diff_buffer,
+    T* partial_diff_buffer,
+    int partial_diff_buffer_depth
+)
+{
+    uint64_t chunk_size = 1L << suffix_size;
+    T tmp_energy, candidate_energy;
+    uint64_t tmp_state, candidate_state;
+    int bit_to_flip;
+    int qi;
+
+    for (uint64_t i = thread_id(); i < chunk_size; i += grid_size()) {
+        candidate_energy = best_energies[i];
+        tmp_energy = energies[i];
+        candidate_state = best_states[i];
+        tmp_state = states[i];
+        for (uint64_t j = 0; j < num_iterations; j++) {
+            bit_to_flip = bit_buffer[j];
+            if (bit_to_flip < partial_diff_buffer_depth) {
+                qi = (tmp_state >> bit_to_flip) & 1;
+                tmp_energy = tmp_energy
+                    - (2 * qi - 1)
+                        * (row_ptr(partial_diff_buffer, chunk_size, bit_to_flip)[i]
+                           + prefix_diff_buffer[j]);
+            } else {
+                tmp_energy -= energy_diff_reduced(
+                    row_ptr(qubo, N, bit_to_flip),
+                    N,
+                    N - suffix_size,
+                    tmp_state,
+                    bit_to_flip,
+                    prefix_diff_buffer[j]
+                );
+            }
+
+            tmp_state = tmp_state ^ (1L << bit_to_flip);
+            if (tmp_energy < candidate_energy) {
+                candidate_energy = tmp_energy;
+                candidate_state = tmp_state;
+            }
+        }
+        states[i] = tmp_state;
+        energies[i] = tmp_energy;
+        best_states[i] = candidate_state;
+        best_energies[i] = candidate_energy;
+    }
+}
+
+template <typename T>
+__global__ void _initialize_partial_diff_buffer(
+    T* qubo,
+    int N,
+    uint64_t* states,
+    uint64_t suffix_size,
+    T* partial_diff_buffer,
+    int partial_diff_buffer_depth
+)
+{
+    uint64_t chunk_size = 1L << suffix_size;
+    uint64_t shifted_state, tmp_state;
+    T total;
+    T* qubo_row;
+
+    for (uint64_t i = thread_id(); i < chunk_size; i += grid_size()) {
+        shifted_state = states[i] >> (N - suffix_size);
+        for (int bit_to_flip = 0; bit_to_flip < partial_diff_buffer_depth; bit_to_flip++) {
+            total = 0;
+            tmp_state = shifted_state;
+            qubo_row = qubo + N * bit_to_flip;
+            for (int k = N - suffix_size; k < N; k++) {
+                if (tmp_state % 2)
+                    total += qubo_row[k];
+                tmp_state /= 2;
+            }
+
+            partial_diff_buffer[bit_to_flip * chunk_size + i] = total;
+        }
+    }
+}
+
 // Predicate used in thrust::copy_if
 // Given a tuple (state, energy), copy this copule iff energy < threshold.
-template <typename T>
-struct energy_less_then {
-    energy_less_then(T threshold) : threshold(threshold) {};
+template <typename T> struct energy_less_then {
+    energy_less_then(T threshold)
+        : threshold(threshold) {};
+
     T threshold;
 
-    __host__ __device__
-    inline bool operator () (const thrust::tuple<uint64_t, T>& pair) {
+    __host__ __device__ inline bool operator()(const thrust::tuple<uint64_t, T>& pair)
+    {
         return thrust::get<1>(pair) < threshold;
     }
 };
 
 // The pièce de résistance of this module, the search function.
-// QUBO: N x N matrix of floats or doubles (depending on the template parameter T)
-// num_states: requested size of the low energy spectrum
-// states_out, energies_out: output buffers, should be of size num_states
-// grid_size, block_size: definition of grid on which init and single_step kernels will
-//   be launched. Warning: other kernels might be launched by thrust, and we cannot control
-//   their grid (and frankly, thrust's judgement is probably better then our judgement)
-// suffix_size: the number l determining how many most significant bits in each state are
-//   fixed. Since there are 2 ** l possible configurations of l-bits, the temporary buffers
-//   for energies and states will also have 2 ** l items.
+// QUBO: N x N matrix of floats or doubles (depending on the template parameter
+// T) num_states: requested size of the low energy spectrum states_out,
+// energies_out: output buffers, should be of size num_states grid_size,
+// block_size: definition of grid on which init and single_step kernels will
+//   be launched. Warning: other kernels might be launched by thrust, and we
+//   cannot control their grid (and frankly, thrust's judgement is probably
+//   better then our judgement)
+// suffix_size: the number l determining how many most significant bits in each
+// state are
+//   fixed. Since there are 2 ** l possible configurations of l-bits, the
+//   temporary buffers for energies and states will also have 2 ** l items.
 template <typename T>
 void search(
     T* qubo,
@@ -178,7 +318,8 @@ void search(
     int grid_size,
     int block_size,
     int suffix_size
-) {
+)
+{
     // chunk_size = 2 ** suffix_size
     uint64_t chunk_size = 1 << suffix_size;
     // Device vectors with qubo
@@ -208,15 +349,14 @@ void search(
     thrust::sort_by_key(energies.begin(), energies.end(), states.begin());
 
     // The initial states and energies are becoming the first approximation of
-    // the low energy spectrum. We copy first num_states of them into best_spectrum_it.
-    thrust::copy(
-        current_spectrum_it, current_spectrum_it + num_states, best_spectrum_it
-    );
+    // the low energy spectrum. We copy first num_states of them into
+    // best_spectrum_it.
+    thrust::copy(current_spectrum_it, current_spectrum_it + num_states, best_spectrum_it);
 
     // Iterate in chunks in gray code order.
-    for(int i = 1; i < num_chunks; i++) {
-        // To compute bit to flip compute two consecutive gray codes and see at which
-        // bit they differ. Subtract 1 because ffs counts from 1.
+    for (uint64_t i = 1; i < num_chunks; i++) {
+        // To compute bit to flip compute two consecutive gray codes and see at
+        // which bit they differ. Subtract 1 because ffs counts from 1.
         int bit_to_flip = ffs(gray(i) ^ gray(i - 1)) - 1;
         // Perform single step of energy computation and check if it succeeded.
         single_step<<<grid_size, block_size>>>(
@@ -224,20 +364,20 @@ void search(
         );
         cuda_error_check(cudaGetLastError());
 
-
-        // Find the maximum energy in current approximation if low enrgy spectrum.
+        // Find the maximum energy in current approximation if low enrgy
+        // spectrum.
         T threshold = *thrust::max_element(
             thrust::device, best_energies.begin(), best_energies.begin() + num_states
         );
 
-        // From the currently computed chunk, the only candidates to enter the low energy
-        // spectrum are the ones that have energy < threshold.
-        // Hence, we copy only those states and energies into the range directly being
+        // From the currently computed chunk, the only candidates to enter the
+        // low energy spectrum are the ones that have energy < threshold. Hence,
+        // we copy only those states and energies into the range directly being
         // the current approximation of low energy spectrum.
         auto candidates_it = best_spectrum_it + num_states;
 
-        // We also store the position at which we placed the last candidate state.
-        // This way, we can sometimes sort much smaller range.
+        // We also store the position at which we placed the last candidate
+        // state. This way, we can sometimes sort much smaller range.
         auto end = thrust::copy_if(
             thrust::device,
             current_spectrum_it,
@@ -246,7 +386,8 @@ void search(
             energy_less_then<T>(threshold)
         );
 
-        // This sort effectively combines the current approximation and candidates.
+        // This sort effectively combines the current approximation and
+        // candidates.
         thrust::sort_by_key(
             best_energies.begin(),
             best_energies.begin() + num_states + (end - candidates_it),
@@ -281,3 +422,125 @@ template void search(
     int block_size,
     int suffix_size
 );
+
+template <typename T>
+void search_ground_only(
+    T* qubo,
+    int N,
+    uint64_t* states_out,
+    T* energies_out,
+    int grid_size,
+    int block_size,
+    int suffix_size,
+    int num_steps_per_kernel,
+    int partial_diff_buffer_depth
+)
+{
+    uint64_t chunk_size = 1L << suffix_size;
+    device_vector<T> qubo_gpu(qubo, qubo + N * N);
+
+    energy_vector<T> energies(chunk_size);
+    state_vector states(chunk_size);
+    energy_vector<T> best_energies(chunk_size);
+    state_vector best_states(chunk_size);
+
+    device_vector<int> bit_flip_buffer(num_steps_per_kernel);
+    energy_vector<T> prefix_diff_buffer(num_steps_per_kernel);
+    energy_vector<T> partial_diff_buffer(partial_diff_buffer_depth * chunk_size);
+
+    num_steps_per_kernel = std::min(long(num_steps_per_kernel), 1L << (N - suffix_size));
+
+
+    std::vector<int> src_bit_flip_buffer(num_steps_per_kernel);
+    std::vector<T> src_prefix_diff_buffer(num_steps_per_kernel);
+
+    uint64_t last_gray_code, next_gray_code;
+    uint64_t counter;
+    int64_t base_state;
+    T base_energy;
+
+    init<<<grid_size, block_size>>>((T*)qubo_gpu, N, (T*)energies, states, chunk_size);
+    cuda_error_check(cudaGetLastError());
+    cudaDeviceSynchronize();
+
+    best_energies = energies;
+    best_states = states;
+
+    _initialize_partial_diff_buffer<<<grid_size, block_size>>>(
+        (T*)qubo_gpu, N, states, suffix_size, (T*)partial_diff_buffer, partial_diff_buffer_depth
+    );
+    cuda_error_check(cudaGetLastError());
+    cudaDeviceSynchronize();
+
+    counter = 0;
+    last_gray_code = 0;
+
+    base_energy = 0;
+    base_state = 0;
+
+    for (uint64_t i = 0; i < (1L << (N - suffix_size)) / num_steps_per_kernel; i++) {
+        cudaDeviceSynchronize();
+        for (uint64_t j = 0; j < num_steps_per_kernel; j++) {
+            counter += 1;
+            next_gray_code = gray(counter);
+            int bit_to_flip = ffs(last_gray_code ^ next_gray_code) - 1;
+            src_bit_flip_buffer[j] = bit_to_flip;
+            src_prefix_diff_buffer[j] = qubo[(N + 1) * bit_to_flip];
+            for (int k = 0; k < N - suffix_size; k++) {
+                if (((base_state >> k) % 2) && k != bit_to_flip) {
+                    src_prefix_diff_buffer[j] += qubo[N * bit_to_flip + k];
+                }
+            }
+            last_gray_code = next_gray_code;
+            base_state ^= (1L << src_bit_flip_buffer[j]);
+        }
+        bit_flip_buffer = src_bit_flip_buffer;
+        prefix_diff_buffer = src_prefix_diff_buffer;
+        cuda_error_check(cudaGetLastError());
+        cudaDeviceSynchronize();
+        find_ground<<<grid_size, block_size>>>(
+            (T*)qubo_gpu,
+            N,
+            (T*)energies,
+            states,
+            (T*)best_energies,
+            best_states,
+            suffix_size,
+            num_steps_per_kernel,
+            (int*)bit_flip_buffer,
+            (T*)prefix_diff_buffer,
+            (T*)partial_diff_buffer,
+            partial_diff_buffer_depth
+        );
+    }
+
+    cuda_error_check(cudaGetLastError());
+    cudaDeviceSynchronize();
+    thrust::sort_by_key(best_energies.begin(), best_energies.end(), best_states.begin());
+    thrust::copy(best_states.begin(), best_states.begin() + 1, states_out);
+    thrust::copy(best_energies.begin(), best_energies.begin() + 1, energies_out);
+}
+
+template void search_ground_only(
+    float* qubo,
+    int N,
+    uint64_t* states_out,
+    float* energies_out,
+    int grid_size,
+    int block_size,
+    int suffix_size,
+    int num_steps_per_kernel,
+    int partial_diff_buffer_depth
+);
+
+template void search_ground_only(
+    double* qubo,
+    int N,
+    uint64_t* states_out,
+    double* energies_out,
+    int grid_size,
+    int block_size,
+    int suffix_size,
+    int num_steps_per_kernel,
+    int partial_diff_buffer_depth
+);
diff --git a/omnisolver/extensions/bruteforce_gpu.h b/omnisolver/extensions/bruteforce_gpu.h
index 6b7ff56..d424c47 100644
--- a/omnisolver/extensions/bruteforce_gpu.h
+++ b/omnisolver/extensions/bruteforce_gpu.h
@@ -11,12 +11,14 @@ void search(
 );
 
 template <typename T>
-void search(
+void search_ground_only(
     T* qubo,
     int N,
-    T* energies_out,
     uint64_t* states_out,
-    uint64_t num_states_in_chunk,
+    T* energies_out,
     int blocks_per_grid,
-    int threads_per_block
+    int threads_per_block,
+    int suffix_size,
+    int num_steps_per_kernel,
+    int partial_diff_buff_depth
 );
diff --git a/omnisolver/extensions/bruteforce_wrapper_gpu.pyx b/omnisolver/extensions/bruteforce_wrapper_gpu.pyx
index 307c4ef..e0291b4 100644
--- a/omnisolver/extensions/bruteforce_wrapper_gpu.pyx
+++ b/omnisolver/extensions/bruteforce_wrapper_gpu.pyx
@@ -21,6 +21,17 @@ cdef extern from "bruteforce_gpu.h":
         int suffix_size
     ) except+
 
+    void search_ground_only[T](
+        T* qubo,
+        int N,
+        uint64_t* states_out,
+        T* energies_out,
+        int blocks_per_grid,
+        int threads_per_block,
+        int suffix_size,
+        int num_steps_per_kernel,
+        int partial_diff_buff_depth
+    ) except+
 
 def gpu_search(
     real[:,:] qubo,
@@ -41,3 +52,25 @@ def gpu_search(
         block_size,
         suffix_size
     )
+
+def gpu_search_ground_only(
+    real[:,:] qubo,
+    uint64_t[::1] states_out,
+    real[::1] energies_out,
+    int grid_size,
+    int block_size,
+    int suffix_size,
+    int num_steps_per_kernel=16,
+    int partial_diff_buffer_depth=1
+):
+    search_ground_only(
+        &qubo[0, 0],
+        qubo.shape[0],
+        &states_out[0],
+        &energies_out[0],
+        grid_size,
+        block_size,
+        suffix_size,
+        num_steps_per_kernel,
+        partial_diff_buffer_depth
+    )
diff --git a/pyproject.toml b/pyproject.toml
index faabd03..caaa913 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,15 +4,14 @@ build-backend = "setuptools.build_meta"
 
 [project]
 readme = "README.md"
-requires-python = ">=3.7,<3.10"
+requires-python = ">=3.7"
 name = "omnisolver-bruteforce"
 description = "Bruteforce (a.k.a. exhaustive search) Plugin for Omnisolver"
 dependencies = [
-    "omnisolver ~= 0.0.3",
+    "omnisolver >= 0.0.3",
     "dimod ~= 0.12",
-    "numba ~= 0.56.4",
-    "pluggy ~= 0.13.1",
-    "numpy ~= 1.19.4"
+    "numba ~= 0.58",
+    "numpy ~= 1.26"
 ]
 dynamic = ["version"]
 
@@ -22,7 +21,7 @@ docs = [
     "sphinx-autoapi~=2.0.1",
     "pydata-sphinx-theme~=0.13.1",
     "sphinx-term~=0.1",
-    "myst-parser~=1.0.0"
+    "myst-parser~=1.0.0",
 ]
 
 [project.entry-points."omnisolver"]
diff --git a/tests/cpu/test_gray_code.py b/tests/cpu/test_gray_code.py
index 0a7964f..aaff5be 100644
--- a/tests/cpu/test_gray_code.py
+++ b/tests/cpu/test_gray_code.py
@@ -33,14 +33,10 @@ def test_gray_code_index_is_computed_correctly(self, binary_number, gray_code):
 @pytest.mark.parametrize("num_bits", [10, 12, 16])
 class TestBijectionBetweenGrayAndBinary:
     def test_nth_gray_code_inverts_gray_code_index(self, num_bits):
-        assert all(
-            gray_code_index(nth_gray_number(n)) == n for n in range(2 ** num_bits)
-        )
+        assert all(gray_code_index(nth_gray_number(n)) == n for n in range(2**num_bits))
 
     def test_gray_code_index_inverts_nth_gray_code(self, num_bits):
-        assert all(
-            nth_gray_number(gray_code_index(n)) == n for n in range(2 ** num_bits)
-        )
+        assert all(nth_gray_number(gray_code_index(n)) == n for n in range(2**num_bits))
 
 
 def _binary_array_to_number(arr):
@@ -57,12 +53,12 @@ def test_iterating_algorithm_l_yields_all_gray_codes(num_bits):
     state = np.zeros(num_bits, dtype=np.int8)
     produced_numbers = [_binary_array_to_number(state)]
 
-    for _ in range(2 ** num_bits - 1):
+    for _ in range(2**num_bits - 1):
         bit_to_flip = focus_vector[0] % num_bits
         state[bit_to_flip] = 1 - state[bit_to_flip]
         advance_focus_vector(focus_vector)
         produced_numbers.append(_binary_array_to_number(state))
 
     np.testing.assert_array_equal(
-        produced_numbers, [nth_gray_number(n) for n in range(2 ** num_bits)]
+        produced_numbers, [nth_gray_number(n) for n in range(2**num_bits)]
     )