diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..b33b3b2
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+BasedOnStyle: WebKit
+AlignAfterOpenBracket: BlockIndent
+ColumnLimit: 100
+BinPackParameters: false
+BinPackArguments: false
+IncludeBlocks: Regroup
diff --git a/.gitignore b/.gitignore
index fb9f65d..2902970 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,5 @@ dmypy.json
# End of https://www.toptal.com/developers/gitignore/api/python
instances/
+
+bruteforce_wrapper_gpu.cpp
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index d4bb2cb..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = .
-BUILDDIR = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_templates/autoapi/index.rst b/docs/_templates/autoapi/index.rst
deleted file mode 100644
index 87a7ff9..0000000
--- a/docs/_templates/autoapi/index.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-API Reference
-=============
-
-This page contains auto-generated API reference documentation.
-
-.. toctree::
- :titlesonly:
-
- {% for page in pages | sort %}
- {% if (page.top_level_object or page.name.split('.') | length == 3) and page.display %}
- {{ page.short_name | capitalize }} <{{ page.include_path }}>
- {% endif %}
- {% endfor %}
diff --git a/docs/assets/logo-large.png b/docs/assets/logo-large.png
new file mode 100644
index 0000000..2d16045
Binary files /dev/null and b/docs/assets/logo-large.png differ
diff --git a/docs/assets/logo.png b/docs/assets/logo.png
new file mode 100644
index 0000000..1e22174
Binary files /dev/null and b/docs/assets/logo.png differ
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index b6b4a53..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = "Omnisolver Bruteforce"
-copyright = "2021, Konrad Jałowiecki, Łukasz Pawela"
-author = "Konrad Jałowiecki, Łukasz Pawela"
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = ["myst_parser", "sphinx_term.termynal", "autoapi.extension"]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ["_templates"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-html_theme = "pydata_sphinx_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-
-autoapi_dirs = ["../omnisolver"]
-autoapi_python_use_implicit_namespaces = True
-autoapi_template_dir = "_templates/autoapi"
diff --git a/docs/index.md b/docs/index.md
index e078135..f99fb01 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,49 +1,91 @@
-# Welcome to omnisolver-pt documentation!
+---
+hide:
+ - navigation
+---
-The `omnisolver-bruteforce` is Omnisolver plugin implementing the bruteforce (a.k.a exhaustive
-search) algorithm.
+
+
+
-## Installation
+---
-Currently, the `omnisolver-bruteforce` package requires working CUDA installation.
+
-To install the plugin first set the `CUDAHOME` environmental library to your CUDA instalaltion location, e.g.:
+
+
+
+
+
+
+
+
-```shell
-# Rmember, your actual location may vary!
-export CUDAHOME=/usr/local/cuda
-```
+**Documentation:** https://euro-hpc-pl.github.io/omnisolver
-and then run:
+**Source code:** https://github.com/euro-hpc-pl/omnisolver
-```shell
-pip install omnisolver-bruteforce
-```
+---
-> **Warning**
-> If you don't set the `CUDAHOME` directory, an attempt will be made to deduce it based on the location of your `nvcc` compiler.
-> However, this process might not work in all the cases and should not be relied on.
+Omnisolver is a collection of Binary Quadratic Model solvers and a framework for implementing them.
-## Command line usage
-```text
-usage: omnisolver bruteforce-gpu [-h] [--output OUTPUT] [--vartype {SPIN,BINARY}] [--num_states NUM_STATES] [--suffix_size SUFFIX_SIZE] [--grid_size GRID_SIZE]
- [--block_size BLOCK_SIZE] [--dtype {float,double}]
- input
-Bruteforce (a.k.a exhaustive search) sampler using CUA-enabled GPU
-positional arguments:
- input Path of the input BQM file in COO format. If not specified, stdin is used.
+## Why Omnisolver?
+
+### Benefits for the end-users
+
+Omnisolver contains a selection of standard and more sophisticated algorithms for solving BQMs. All solvers are available through intuitive CLI or from Python scripts as dimod based Samplers.
+
+### Benefits for solver creators
+
+Omnisolver allows developer to focus on algorithms instead of common tasks like handling input/output or creating CLI.
+
+## Quickstart
-optional arguments:
- -h, --help show this help message and exit
- --output OUTPUT Path of the output file. If not specified, stdout is used.
- --vartype {SPIN,BINARY}
- Variable type
- --num_states NUM_STATES
+
+
+```
+# Install base omnisolver package
+$ pip install omnisolver
+---> 100%
+Successfuly installed omnisolver
+# Install chosen plugins (e.g. parallel-tempering solver)
+$ pip install omnisolver-pt
+---> 100%
+Successfuly installed omnisolver-pt
+# Create an instance file in COOrdinate format
+$ echo "0 1 1.0
+> 1 2 1.0
+> 2 0 1.0" > instance.txt
+# Run solver
+$ omnisolver pt --vartype SPIN instance.txt
+0,1,2,energy,num_occurrences
+1,-1,-1,-1.0,1
```
-## Documentation
-```{toctree}
-:maxdepth: 1
+## What's next?
+
+Here are some resources to get you started:
+
+- Start with user guide to learn about the installation methods and general usage patterns.
+- Discover available solvers in our plugin list.
+- If you are interested in developing your own solver, or are interested in in-depth details of how the Omnisolver
+ works, check our solver creator guide and reference manual.
+
+## Citing
+
+If you used Omnisolver in your research, consider citing it in your paper.
+You can use the following BibTeX entry:
+
+```text
+@article{omnisolver2023,
+ title = {Omnisolver: An extensible interface to Ising spin–glass and QUBO solvers},
+ journal = {SoftwareX},
+ volume = {24},
+ pages = {101559},
+ year = {2023},
+ doi = {https://doi.org/10.1016/j.softx.2023.101559},
+ url = {https://www.softxjournal.com/article/S2352-7110(23)00255-8/},
+ author = {Konrad Jałowiecki and {\L}ukasz Pawela},
+}
```
diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js
new file mode 100644
index 0000000..396f03b
--- /dev/null
+++ b/docs/javascripts/mathjax.js
@@ -0,0 +1,18 @@
+window.MathJax = {
+ tex: {
+ inlineMath: [["\\(", "\\)"]],
+ displayMath: [["\\[", "\\]"]],
+ processEscapes: true,
+ processEnvironments: true
+ },
+ options: {
+ ignoreHtmlClass: ".*|",
+ processHtmlClass: "arithmatex"
+ }
+};
+
+document$.subscribe(() => {
+
+
+ MathJax.typesetPromise()
+})
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 8084272..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.https://www.sphinx-doc.org/
- exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/reference/omnisolver_bruteforce_distributed.md b/docs/reference/omnisolver_bruteforce_distributed.md
new file mode 100644
index 0000000..8860640
--- /dev/null
+++ b/docs/reference/omnisolver_bruteforce_distributed.md
@@ -0,0 +1,2 @@
+# ::: omnisolver.bruteforce.gpu.distributed
+ handler: python
diff --git a/docs/reference/omnisolver_bruteforce_sampler.md b/docs/reference/omnisolver_bruteforce_sampler.md
new file mode 100644
index 0000000..84cc6a5
--- /dev/null
+++ b/docs/reference/omnisolver_bruteforce_sampler.md
@@ -0,0 +1,2 @@
+# ::: omnisolver.bruteforce.gpu.sampler
+ handler: python
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index 09f95f3..0000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-sphinx~=6.1.3
-sphinx-autoapi~=2.0.1
-pydata-sphinx-theme~=0.13.1
-sphinx-term~=0.1
-myst-parser~=1.0.0
diff --git a/examples/distributed.py b/examples/distributed.py
new file mode 100644
index 0000000..aab4250
--- /dev/null
+++ b/examples/distributed.py
@@ -0,0 +1,55 @@
+from dimod import BQM
+from numpy.random import default_rng
+
+from omnisolver.bruteforce.gpu import BruteforceGPUSampler
+from omnisolver.bruteforce.gpu.distributed import DistributedBruteforceGPUSampler
+
+
+def random_bqm(num_variables, vartype, offset, rng):
+ linear = {
+ i: coef for i, coef in zip(range(num_variables), rng.uniform(-2, 2, size=num_variables))
+ }
+ quadratic = {
+ (i, j): coef
+ for (i, j), coef in zip(
+ [(i, j) for i in range(num_variables) for j in range(i + 1, num_variables)],
+ rng.uniform(-1, -1, size=(num_variables - 1) * num_variables // 2),
+ )
+ }
+ return BQM(linear, quadratic, offset, vartype=vartype)
+
+
+NUM_VARIABLES = 40
+
+
+def main():
+ sampler = DistributedBruteforceGPUSampler()
+ sampler2 = BruteforceGPUSampler()
+ rng = default_rng(1234)
+
+ bqm = random_bqm(NUM_VARIABLES, "BINARY", 0.0, rng)
+
+ import time
+
+ start = time.perf_counter()
+ result = sampler.sample(
+ bqm, num_states=100, num_fixed_vars=1, suffix_size=25, grid_size=1024, block_size=1024
+ )
+ duration = time.perf_counter() - start
+ distributed_en = [entry.energy for entry in result.data()]
+
+ print(f"Distributed finished in : {duration}s")
+
+ start = time.perf_counter()
+ result2 = sampler2.sample(
+ bqm, num_states=100, suffix_size=25, grid_size=2**12, block_size=512
+ )
+ duration = time.perf_counter() - start
+ single_en = [entry.energy for entry in result2.data()]
+ print(f"Single finished in: {duration}s")
+
+ print(max(abs(en1 - en2) for en1, en2 in zip(distributed_en, single_en)))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..2d13988
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,66 @@
+# yaml-language-server: $schema=https://squidfunk.github.io/mkdocs-material/schema.json
+site_name: Omnisolver-Bruteforce
+site_author: "Konrad Jałowiecki & Łukasz Pawela"
+copyright: ""
+theme:
+ name: material
+ logo: assets/logo.png
+ palette:
+ primary: custom
+ features:
+ - navigation.tabs
+ - navigation.tracking
+extra:
+ version:
+ provider: mike
+extra_css:
+ - stylesheets/extra.css
+nav:
+ - User guide: userguide.md
+ #- Solver creator guide: creatorguide.md
+ - Reference manual:
+ - omnisolver.bruteforce.gpu.sampler: reference/omnisolver_bruteforce_sampler.md
+ - omnisolver.bruteforce.gpu.distributed: reference/omnisolver_bruteforce_distributed.md
+ - Plugin list: plugins.md
+plugins:
+ - search
+ - mike:
+ canonical_version: latest
+ version_selector: true
+ - termynal:
+ prompt_literal_start:
+ - "$"
+ - mkdocstrings:
+ handlers:
+ python:
+ options:
+ annotation_path: brief
+ heading_level: 2
+ show_source: false
+ show_root_toc_entry: false
+ show_signature_annotations: true
+ members_order: source
+ separate_signature: true
+ docstring_style: sphinx
+ show_if_no_docstring: true
+ docstring_options:
+ ignore_init_summary: true
+ merge_init_into_class: true
+ show_docstring_returns: true
+
+ - with-pdf:
+ render_js: true
+ cover_subtitle: "Exhaustive search plugin for Omnisolver"
+repo_name: Omnisolver
+repo_url: https://github.com/euro-hpc-pl/omnisolver
+markdown_extensions:
+ - pymdownx.superfences
+ - pymdownx.arithmatex:
+ generic: true
+ - admonition
+ - pymdownx.tabbed:
+ alternate_style: true
+extra_javascript:
+ - javascripts/mathjax.js
+ - https://polyfill.io/v3/polyfill.min.js?features=es6
+ - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
diff --git a/omnisolver/bruteforce/gpu/__init__.py b/omnisolver/bruteforce/gpu/__init__.py
index d0c35ef..5b868f2 100644
--- a/omnisolver/bruteforce/gpu/__init__.py
+++ b/omnisolver/bruteforce/gpu/__init__.py
@@ -1,12 +1,11 @@
"""Implementation of parallel tempering plugin for Omnisolver."""
-from omnisolver.common.plugin import Plugin, plugin_from_specification, plugin_impl
+from omnisolver.common.plugin import Plugin, plugin_from_specification
from pkg_resources import resource_stream
from yaml import safe_load
from .sampler import BruteforceGPUSampler
-@plugin_impl
def get_plugin() -> Plugin:
"""Get package name and resource path."""
specification = safe_load(resource_stream("omnisolver.bruteforce.gpu", "gpu.yml"))
diff --git a/omnisolver/bruteforce/gpu/distributed.py b/omnisolver/bruteforce/gpu/distributed.py
new file mode 100644
index 0000000..9729a0c
--- /dev/null
+++ b/omnisolver/bruteforce/gpu/distributed.py
@@ -0,0 +1,108 @@
+import typing
+from itertools import product
+from time import perf_counter
+
+import numpy as np
+import ray
+from dimod import Sampler, Vartype, append_variables, concatenate
+
+from .sampler import BruteforceGPUSampler
+
+
+@ray.remote(num_gpus=1)
+def _solve_subproblem(
+ bqm,
+ num_states,
+ fixed_vars,
+ suffix_size,
+ grid_size,
+ block_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth,
+ dtype,
+):
+ new_bqm = bqm.copy()
+ new_bqm.fix_variables(fixed_vars)
+
+ sampler = BruteforceGPUSampler()
+ result = sampler.sample(
+ new_bqm,
+ num_states,
+ suffix_size,
+ grid_size,
+ block_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth,
+ dtype,
+ )
+
+ return append_variables(result, fixed_vars)
+
+
+class DistributedBruteforceGPUSampler(Sampler):
+ def sample(
+ self,
+ bqm,
+ num_states,
+ num_fixed_vars,
+ suffix_size,
+ grid_size,
+ block_size,
+ num_steps_per_kernel=1,
+ partial_diff_buffer_depth=1,
+ dtype=np.float32,
+ ):
+ if bqm.vartype == Vartype.SPIN:
+ return self.sample(
+ bqm.change_vartype("BINARY", inplace=False),
+ num_states,
+ num_fixed_vars,
+ suffix_size,
+ grid_size,
+ block_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth,
+ ).change_vartype("SPIN", inplace=False)
+
+ bqm, mapping = bqm.relabel_variables_as_integers()
+
+ start_counter = perf_counter()
+
+ subproblems = [
+ {i: v for i, v in enumerate(vals)} for vals in product([0, 1], repeat=num_fixed_vars)
+ ]
+
+ refs = [
+ _solve_subproblem.remote(
+ bqm,
+ num_states,
+ fixed_vars,
+ suffix_size,
+ grid_size,
+ block_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth,
+ dtype,
+ )
+ for fixed_vars in subproblems
+ ]
+
+ subsolutions = [ray.get(ref) for ref in refs]
+ solve_time_in_seconds = perf_counter() - start_counter
+ result = concatenate(subsolutions).truncate(num_states)
+ result.info["solve_time_in_seconds"] = solve_time_in_seconds
+ return result.relabel_variables(mapping)
+
+ @property
+ def parameters(self) -> typing.Dict[str, typing.Any]:
+ return {
+ "num_states": [],
+ "suffix_size": [],
+ "grid_size": [],
+ "block_size": [],
+ "dtype": [],
+ }
+
+ @property
+ def properties(self) -> typing.Dict[str, typing.Any]:
+ return {}
diff --git a/omnisolver/bruteforce/gpu/sampler.py b/omnisolver/bruteforce/gpu/sampler.py
index d26e8c4..81a960c 100644
--- a/omnisolver/bruteforce/gpu/sampler.py
+++ b/omnisolver/bruteforce/gpu/sampler.py
@@ -4,7 +4,7 @@
import numpy as np
from dimod import Sampler, SampleSet, Vartype
-from omnisolver.bruteforce.ext.gpu import gpu_search
+from omnisolver.bruteforce.ext.gpu import gpu_search, gpu_search_ground_only
def _convert_int_to_sample(val, num_variables):
@@ -16,7 +16,17 @@ def _convert_int_to_sample(val, num_variables):
class BruteforceGPUSampler(Sampler):
- def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.float32):
+ def sample(
+ self,
+ bqm,
+ num_states,
+ suffix_size,
+ grid_size,
+ block_size,
+ num_steps_per_kernel=16,
+ partial_diff_buffer_depth=1,
+ dtype=np.float32,
+ ):
"""Solve Binary Quadratic Model using exhaustive (bruteforce) search on the GPU.
:param bqm: Binary Quadratic Model instance to solve.
@@ -38,6 +48,9 @@ def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.f
suffix_size,
grid_size,
block_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth,
+ dtype,
).change_vartype("SPIN", inplace=False)
bqm, mapping = bqm.relabel_variables_as_integers()
@@ -56,15 +69,27 @@ def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.f
start_counter = perf_counter()
- gpu_search(
- qubo_mat,
- num_states,
- states_out,
- energies_out,
- grid_size,
- block_size,
- suffix_size,
- )
+ if num_states == 1: # Shortcut if we are only looking for a ground state
+ gpu_search_ground_only(
+ qubo_mat,
+ states_out,
+ energies_out,
+ grid_size,
+ block_size,
+ suffix_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth,
+ )
+ else:
+ gpu_search(
+ qubo_mat,
+ num_states,
+ states_out,
+ energies_out,
+ grid_size,
+ block_size,
+ suffix_size,
+ )
solve_time_in_seconds = perf_counter() - start_counter
diff --git a/omnisolver/extensions/bruteforce_gpu.cu b/omnisolver/extensions/bruteforce_gpu.cu
index 03d62d1..1adc65d 100644
--- a/omnisolver/extensions/bruteforce_gpu.cu
+++ b/omnisolver/extensions/bruteforce_gpu.cu
@@ -1,30 +1,36 @@
// Implementation of exhaustive search using a CUDA--enabled GPU.
-#include
-#include
-#include
+#include "vectors.h"
+
#include
+#include
#include
#include
-#include "vectors.h"
// Basic error checking (used only for kernel launches)
// It should throw an instance of std::runtime_error containing the decoded
// cuda runtime error message and also filename and line at which the error
// occurred.
-#define cuda_error_check(code) { cuda_assert((code), __FILE__, __LINE__); }
+#define cuda_error_check(code) \
+ { \
+ cuda_assert((code), __FILE__, __LINE__); \
+ }
+
+__device__ __forceinline__ int thread_id() { return blockIdx.x * blockDim.x + threadIdx.x; }
+
+__device__ __forceinline__ int grid_size() { return blockDim.x * gridDim.x; }
-inline void cuda_assert(cudaError_t code, const char *file, int line)
+inline void cuda_assert(cudaError_t code, const char* file, int line)
{
- if (code != cudaSuccess)
- {
- std::stringstream what;
- what << "File: " << __FILE__ << " at " << __LINE__ << ", error: " << cudaGetErrorString(code);
- throw std::runtime_error(what.str());
- }
+ if (code != cudaSuccess) {
+ std::stringstream what;
+ what << "File: " << __FILE__ << " at " << __LINE__
+ << ", error: " << cudaGetErrorString(code);
+ throw std::runtime_error(what.str());
+ }
}
// Compute i-th Gray code.
-uint64_t gray(uint64_t index) { return index ^ (index >> 1); }
+__host__ __device__ uint64_t gray(uint64_t index) { return index ^ (index >> 1); }
// Find First bit Set in a number.
// Following common convention for this function it returns bits indexed from 1
@@ -33,12 +39,13 @@ uint64_t gray(uint64_t index) { return index ^ (index >> 1); }
// ffs(7) = 1
// ffs(10) = 2
// ffs(24) = 4
-int ffs(uint64_t number) {
- if(number == 0) {
+int ffs(uint64_t number)
+{
+ if (number == 0) {
return 0;
}
int result = 1;
- while((number & 1) != 1) {
+ while ((number & 1) != 1) {
number >>= 1;
result++;
}
@@ -46,23 +53,62 @@ int ffs(uint64_t number) {
return result;
}
-// Given a QUBO and a state, compute how the energy changes if the i-th bit of state
-// is flipped.
-// Specifically, E(.) is energy function of the qubo, then this function computes
-// E(s) - E(s'), where s' is the state after flipping i-th bit of s.
-// One can easily verify that to compute the difference above we need only the
-// i-th row of QUBO, and hence we already pass this row instead of computign the offset.
+// Given a QUBO and a state, compute how the energy changes if the i-th bit of
+// state is flipped. Specifically, E(.) is energy function of the qubo, then
+// this function computes E(s) - E(s'), where s' is the state after flipping
+// i-th bit of s. One can easily verify that to compute the difference above we
+// need only the i-th row of QUBO, and hence we already pass this row instead
+// of computign the offset.
template
-__device__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) {
+__host__ __device__ __forceinline__ T energy_diff(T* qubo_row, int N, uint64_t state, int i)
+{
int qi = (state >> i) & 1; // This is the i-th bit of state
- T total = qubo_row[i]; // Start accumulating from the lineaer term
+ T total = qubo_row[i]; // Start accumulating from the linear term
+
+ uint32_t word = state & 0xFFFFFFFF;
// Go through each bit of state
- for(int j = 0; j < N; j++) {
- if(i != j) { // except the one to flip, it's already accounted for
- total += qubo_row[j] * (state & 1);
+ for (int j = 0; j < 32 && j < N; j++) {
+ if (i != j && (word % 2)) { // except the one to flip, it's already accounted for
+ total += qubo_row[j];
}
- state >>= 1; // move to next bit
+ word /= 2; //>>= 1; // move to next bit
+ }
+
+ word = state >> 32;
+ for (int j = 32; j < N; j++) {
+ if (i != j && (word % 2)) { // except the one to flip, it's already accounted for
+ total += qubo_row[j];
+ }
+ word /= 2; // move to next bit
+ }
+
+ // When flipping from 0 to 1 there is nothing to do, otherwise we need
+ // to multiply the total by -1.
+ return (2 * qi - 1) * total;
+}
+
+template
+__device__ __forceinline__ T
+energy_diff_reduced(T* qubo_row, int N, int offset, uint64_t state, int i, T common_part)
+{
+ int qi = (state >> i) & 1; // This is the i-th bit of state
+ T total = common_part; // Start accumulating from the linear term
+
+ uint32_t word = (state >> offset) & 0xFFFFFFFF;
+
+ // Go through each bit of state
+ for (int j = offset; j < 32 + offset && j < N; j++) {
+ if (word % 2)
+ total += qubo_row[j];
+ word /= 2; // move to next bit
+ }
+
+ word = state >> (32 + offset);
+ for (int j = 32 + offset; j < N; j++) {
+ if (word % 2)
+ total += qubo_row[j];
+ word /= 2; // move to next bit
}
// When flipping from 0 to 1 there is nothing to do, otherwise we need
@@ -71,14 +117,14 @@ __device__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) {
}
// Initialize first states and energies for further processing.
-// The idea is as follows. Each state (which is N-bit number) is (logically) split into
-// two parts:
-// l-bit part specific to state
-// k-bit part that changes in all states in the same way for all states in given iteration.
-// This function computes energy of a state with some l-bits set according to index,
+// The idea is as follows. Each state (which is N-bit number) is (logically)
+// split into two parts: l-bit part specific to state k-bit part that changes
+// in all states in the same way for all states in given iteration. This
+// function computes energy of a state with some l-bits set according to index,
// and k least significant bits set to 0.
template
-__device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t index) {
+__device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t index)
+{
T energy = 0.0;
uint64_t state = 0, suffix = index, offset = 0;
@@ -86,9 +132,9 @@ __device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t in
// Instead we enumerate bits from the most significant one.
// For instance, if index=12, which is 110 in binary, we will set the three
// most significant bits of state to 011.
- while(suffix > 0) {
+ while (suffix > 0) {
// Find first set (note: not our function, this one is CUDA's intrinsic)
- uint64_t bit_in_suffix = __ffs(suffix);
+ uint64_t bit_in_suffix = __ffsll(suffix);
// Compute bit index from the most significant position
// Since we reduce suffix in each iteration, we have to store the offset
// travelled so far in the `offset` variable.
@@ -108,36 +154,28 @@ __device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t in
states[index] = state;
}
-// Kernel that basically launched the _init function above for all states and energies
+// Kernel that basically launches the _init function above for all states and
+// energies
template
-__global__ void init(
- T* qubo,
- int N,
- T* energies,
- uint64_t* states,
- uint64_t states_in_chunk
-) {
- for(int i=blockIdx.x * blockDim.x + threadIdx.x; i < states_in_chunk; i += blockDim.x * gridDim.x) {
+__global__ void init(T* qubo, int N, T* energies, uint64_t* states, uint64_t states_in_chunk)
+{
+ for (uint64_t i = thread_id(); i < states_in_chunk; i += grid_size()) {
_init(qubo, N, states, energies, i);
}
}
// A single step of bruteforce algorithm.
-// Recall that we split all states (logically) into l-most significant fixed bits
-// and k-least significant bits that are modified, one at a time, in each iteration.
-// This kernel flips one of those k-significant bits.
+// Recall that we split all states (logically) into l-most significant fixed
+// bits and k-least significant bits that are modified, one at a time, in each
+// iteration. This kernel flips one of those k-significant bits.
template
__global__ void single_step(
- T* qubo,
- int N,
- T* energies,
- uint64_t* states,
- int bit_to_flip,
- uint64_t states_in_chunk
-) {
+ T* qubo, int N, T* energies, uint64_t* states, int bit_to_flip, uint64_t states_in_chunk
+)
+{
T* qubo_row = qubo + N * bit_to_flip;
- for(int i=blockIdx.x * blockDim.x + threadIdx.x; i < states_in_chunk; i += blockDim.x * gridDim.x) {
+ for (uint64_t i = thread_id(); i < states_in_chunk; i += grid_size()) {
uint64_t state = states[i];
T energy = energies[i];
energies[i] = energy - energy_diff(qubo_row, N, state, bit_to_flip);
@@ -145,29 +183,131 @@ __global__ void single_step(
}
}
+template
+__device__ __forceinline__ T* row_ptr(T* array_2d, uint64_t n_cols, uint64_t row_id)
+{
+ return array_2d + n_cols * row_id;
+}
+
+// Kernel performing whole ground state (and only ground state) computation in
+// a single step
+template
+__global__ void find_ground(
+ T* qubo,
+ int N,
+ T* energies,
+ uint64_t* states,
+ T* best_energies,
+ uint64_t* best_states,
+ uint64_t suffix_size,
+ uint64_t num_iterations,
+ int* bit_buffer,
+ T* prefix_diff_buffer,
+ T* partial_diff_buffer,
+ int partial_diff_buffer_depth
+)
+{
+ uint64_t chunk_size = 1L << suffix_size;
+ T tmp_energy, candidate_energy;
+ uint64_t tmp_state, candidate_state;
+ int bit_to_flip;
+ int qi;
+
+ for (uint64_t i = thread_id(); i < chunk_size; i += grid_size()) {
+ candidate_energy = best_energies[i];
+ tmp_energy = energies[i];
+ candidate_state = best_states[i];
+ tmp_state = states[i];
+ for (uint64_t j = 0; j < num_iterations; j++) {
+ bit_to_flip = bit_buffer[j];
+ if (bit_to_flip < partial_diff_buffer_depth) {
+ qi = (tmp_state >> bit_to_flip) & 1;
+ tmp_energy = tmp_energy
+ - (2 * qi - 1)
+ * (row_ptr(partial_diff_buffer, chunk_size, bit_to_flip)[i]
+ + prefix_diff_buffer[j]);
+ } else {
+ tmp_energy -= energy_diff_reduced(
+ row_ptr(qubo, N, bit_to_flip),
+ N,
+ N - suffix_size,
+ tmp_state,
+ bit_to_flip,
+ prefix_diff_buffer[j]
+ );
+ }
+
+ tmp_state = tmp_state ^ (1L << bit_to_flip);
+ if (tmp_energy < candidate_energy) {
+ candidate_energy = tmp_energy;
+ candidate_state = tmp_state;
+ }
+ }
+ states[i] = tmp_state;
+ energies[i] = tmp_energy;
+ best_states[i] = candidate_state;
+ best_energies[i] = candidate_energy;
+ }
+}
+
+template
+__global__ void _initialize_partial_diff_buffer(
+ T* qubo,
+ int N,
+ uint64_t* states,
+ uint64_t suffix_size,
+ T* partial_diff_buffer,
+ int partial_diff_buffer_depth
+)
+{
+ uint64_t chunk_size = 1L << suffix_size;
+ uint64_t shifted_state, tmp_state;
+ T total;
+ T* qubo_row;
+
+ for (uint64_t i = thread_id(); i < chunk_size; i += grid_size()) {
+ shifted_state = states[i] >> (N - suffix_size);
+ for (int bit_to_flip = 0; bit_to_flip < partial_diff_buffer_depth; bit_to_flip++) {
+ total = 0;
+ tmp_state = shifted_state;
+ qubo_row = qubo + N * bit_to_flip;
+ for (int k = N - suffix_size; k < N; k++) {
+ if (tmp_state % 2)
+ total += qubo_row[k];
+ tmp_state /= 2;
+ }
+
+ partial_diff_buffer[bit_to_flip * chunk_size + i] = total;
+ }
+ }
+}
+
// Predicate used in thrust::copy_if
// Given a tuple (state, energy), copy this copule iff energy < threshold.
-template
-struct energy_less_then {
- energy_less_then(T threshold) : threshold(threshold) {};
+template struct energy_less_then {
+ energy_less_then(T threshold)
+ : threshold(threshold) {};
+
T threshold;
- __host__ __device__
- inline bool operator () (const thrust::tuple& pair) {
+ __host__ __device__ inline bool operator()(const thrust::tuple& pair)
+ {
return thrust::get<1>(pair) < threshold;
}
};
// The pièce de résistance of this module, the search function.
-// QUBO: N x N matrix of floats or doubles (depending on the template parameter T)
-// num_states: requested size of the low energy spectrum
-// states_out, energies_out: output buffers, should be of size num_states
-// grid_size, block_size: definition of grid on which init and single_step kernels will
-// be launched. Warning: other kernels might be launched by thrust, and we cannot control
-// their grid (and frankly, thrust's judgement is probably better then our judgement)
-// suffix_size: the number l determining how many most significant bits in each state are
-// fixed. Since there are 2 ** l possible configurations of l-bits, the temporary buffers
-// for energies and states will also have 2 ** l items.
+// QUBO: N x N matrix of floats or doubles (depending on the template parameter
+// T) num_states: requested size of the low energy spectrum states_out,
+// energies_out: output buffers, should be of size num_states grid_size,
+// block_size: definition of grid on which init and single_step kernels will
+// be launched. Warning: other kernels might be launched by thrust, and we
+// cannot control their grid (and frankly, thrust's judgement is probably
+// better then our judgement)
+// suffix_size: the number l determining how many most significant bits in each
+// state are
+// fixed. Since there are 2 ** l possible configurations of l-bits, the
+// temporary buffers for energies and states will also have 2 ** l items.
template
void search(
T* qubo,
@@ -178,7 +318,8 @@ void search(
int grid_size,
int block_size,
int suffix_size
-) {
+)
+{
// chunk_size = 2 ** suffix_size
uint64_t chunk_size = 1 << suffix_size;
// Device vectors with qubo
@@ -208,15 +349,14 @@ void search(
thrust::sort_by_key(energies.begin(), energies.end(), states.begin());
// The initial states and energies are becoming the first approximation of
- // the low energy spectrum. We copy first num_states of them into best_spectrum_it.
- thrust::copy(
- current_spectrum_it, current_spectrum_it + num_states, best_spectrum_it
- );
+ // the low energy spectrum. We copy first num_states of them into
+ // best_spectrum_it.
+ thrust::copy(current_spectrum_it, current_spectrum_it + num_states, best_spectrum_it);
// Iterate in chunks in gray code order.
- for(int i = 1; i < num_chunks; i++) {
- // To compute bit to flip compute two consecutive gray codes and see at which
- // bit they differ. Subtract 1 because ffs counts from 1.
+ for (uint64_t i = 1; i < num_chunks; i++) {
+ // To compute bit to flip compute two consecutive gray codes and see at
+ // which bit they differ. Subtract 1 because ffs counts from 1.
int bit_to_flip = ffs(gray(i) ^ gray(i - 1)) - 1;
// Perform single step of energy computation and check if it succeeded.
single_step<<>>(
@@ -224,20 +364,20 @@ void search(
);
cuda_error_check(cudaGetLastError());
-
- // Find the maximum energy in current approximation if low enrgy spectrum.
+ // Find the maximum energy in current approximation if low enrgy
+ // spectrum.
T threshold = *thrust::max_element(
thrust::device, best_energies.begin(), best_energies.begin() + num_states
);
- // From the currently computed chunk, the only candidates to enter the low energy
- // spectrum are the ones that have energy < threshold.
- // Hence, we copy only those states and energies into the range directly being
+ // From the currently computed chunk, the only candidates to enter the
+ // low energy spectrum are the ones that have energy < threshold. Hence,
+ // we copy only those states and energies into the range directly being
// the current approximation of low energy spectrum.
auto candidates_it = best_spectrum_it + num_states;
- // We also store the position at which we placed the last candidate state.
- // This way, we can sometimes sort much smaller range.
+ // We also store the position at which we placed the last candidate
+ // state. This way, we can sometimes sort much smaller range.
auto end = thrust::copy_if(
thrust::device,
current_spectrum_it,
@@ -246,7 +386,8 @@ void search(
energy_less_then(threshold)
);
- // This sort effectively combines the current approximation and candidates.
+ // This sort effectively combines the current approximation and
+ // candidates.
thrust::sort_by_key(
best_energies.begin(),
best_energies.begin() + num_states + (end - candidates_it),
@@ -281,3 +422,125 @@ template void search(
int block_size,
int suffix_size
);
+
+template
+void search_ground_only(
+ T* qubo,
+ int N,
+ uint64_t* states_out,
+ T* energies_out,
+ int grid_size,
+ int block_size,
+ int suffix_size,
+ int num_steps_per_kernel,
+ int partial_diff_buffer_depth
+)
+{
+ uint64_t chunk_size = 1L << suffix_size;
+ device_vector qubo_gpu(qubo, qubo + N * N);
+
+ energy_vector energies(chunk_size);
+ state_vector states(chunk_size);
+ energy_vector best_energies(chunk_size);
+ state_vector best_states(chunk_size);
+
+ device_vector bit_flip_buffer(num_steps_per_kernel);
+ energy_vector prefix_diff_buffer(num_steps_per_kernel);
+ energy_vector partial_diff_buffer(partial_diff_buffer_depth * chunk_size);
+
+ num_steps_per_kernel = std::min(long(num_steps_per_kernel), 1L << (N - suffix_size));
+
+
+ std::vector src_bit_flip_buffer(num_steps_per_kernel);
+ std::vector src_prefix_diff_buffer(num_steps_per_kernel);
+
+ uint64_t last_gray_code, next_gray_code;
+ uint64_t counter;
+ int64_t base_state;
+ T base_energy;
+
+ init<<>>((T*)qubo_gpu, N, (T*)energies, states, chunk_size);
+ cuda_error_check(cudaGetLastError());
+ cudaDeviceSynchronize();
+
+ best_energies = energies;
+ best_states = states;
+
+ _initialize_partial_diff_buffer<<>>(
+ (T*)qubo_gpu, N, states, suffix_size, (T*)partial_diff_buffer, partial_diff_buffer_depth
+ );
+ cuda_error_check(cudaGetLastError());
+ cudaDeviceSynchronize();
+
+ counter = 0;
+ last_gray_code = 0;
+
+ base_energy = 0;
+ base_state = 0;
+
+ for (uint64_t i = 0; i < (1L << (N - suffix_size)) / num_steps_per_kernel; i++) {
+ cudaDeviceSynchronize();
+ for (uint64_t j = 0; j < num_steps_per_kernel; j++) {
+ counter += 1;
+ next_gray_code = gray(counter);
+ int bit_to_flip = ffs(last_gray_code ^ next_gray_code) - 1;
+ src_bit_flip_buffer[j] = bit_to_flip;
+ src_prefix_diff_buffer[j] = qubo[(N + 1) * bit_to_flip];
+ for (int k = 0; k < N - suffix_size; k++) {
+ if (((base_state >> k) % 2) && k != bit_to_flip) {
+ src_prefix_diff_buffer[j] += qubo[N * bit_to_flip + k];
+ }
+ }
+ last_gray_code = next_gray_code;
+ base_state ^= (1L << src_bit_flip_buffer[j]);
+ }
+ bit_flip_buffer = src_bit_flip_buffer;
+ prefix_diff_buffer = src_prefix_diff_buffer;
+ cuda_error_check(cudaGetLastError());
+ cudaDeviceSynchronize();
+ find_ground<<>>(
+ (T*)qubo_gpu,
+ N,
+ (T*)energies,
+ states,
+ (T*)best_energies,
+ best_states,
+ suffix_size,
+ num_steps_per_kernel,
+ (int*)bit_flip_buffer,
+ (T*)prefix_diff_buffer,
+ (T*)partial_diff_buffer,
+ partial_diff_buffer_depth
+ );
+ }
+
+ cuda_error_check(cudaGetLastError());
+ cudaDeviceSynchronize();
+ thrust::sort_by_key(best_energies.begin(), best_energies.end(), best_states.begin());
+ thrust::copy(best_states.begin(), best_states.begin() + 1, states_out);
+ thrust::copy(best_energies.begin(), best_energies.begin() + 1, energies_out);
+}
+
+template void search_ground_only(
+ float* qubo,
+ int N,
+ uint64_t* states_out,
+ float* energies_out,
+ int grid_size,
+ int block_size,
+ int suffix_size,
+ int num_steps_per_kernel,
+ int partial_diff_buffer_depth
+);
+
+template void search_ground_only(
+ double* qubo,
+ int N,
+ uint64_t* states_out,
+ double* energies_out,
+ int grid_size,
+ int block_size,
+ int suffix_size,
+ int num_steps_per_kernel,
+ int partial_diff_buffer_depth
+);
diff --git a/omnisolver/extensions/bruteforce_gpu.h b/omnisolver/extensions/bruteforce_gpu.h
index 6b7ff56..d424c47 100644
--- a/omnisolver/extensions/bruteforce_gpu.h
+++ b/omnisolver/extensions/bruteforce_gpu.h
@@ -11,12 +11,14 @@ void search(
);
template
-void search(
+void search_ground_only(
T* qubo,
int N,
- T* energies_out,
uint64_t* states_out,
- uint64_t num_states_in_chunk,
+ T* energies_out,
int blocks_per_grid,
- int threads_per_block
+ int threads_per_block,
+ int suffix_size,
+ int num_steps_per_kernel,
+ int partial_diff_buff_depth
);
diff --git a/omnisolver/extensions/bruteforce_wrapper_gpu.pyx b/omnisolver/extensions/bruteforce_wrapper_gpu.pyx
index 307c4ef..e0291b4 100644
--- a/omnisolver/extensions/bruteforce_wrapper_gpu.pyx
+++ b/omnisolver/extensions/bruteforce_wrapper_gpu.pyx
@@ -21,6 +21,17 @@ cdef extern from "bruteforce_gpu.h":
int suffix_size
) except+
+ void search_ground_only[T](
+ T* qubo,
+ int N,
+ uint64_t* states_out,
+ T* energies_out,
+ int blocks_per_grid,
+ int threads_per_block,
+ int suffix_size,
+ int num_steps_per_kernel,
+ int partial_diff_buff_depth
+ ) except+
def gpu_search(
real[:,:] qubo,
@@ -41,3 +52,25 @@ def gpu_search(
block_size,
suffix_size
)
+
+def gpu_search_ground_only(
+ real[:,:] qubo,
+ uint64_t[::1] states_out,
+ real[::1] energies_out,
+ int grid_size,
+ int block_size,
+ int suffix_size,
+ int num_steps_per_kernel=16,
+ int partial_diff_buffer_depth=1
+):
+ search_ground_only(
+ &qubo[0, 0],
+ qubo.shape[0],
+ &states_out[0],
+ &energies_out[0],
+ grid_size,
+ block_size,
+ suffix_size,
+ num_steps_per_kernel,
+ partial_diff_buffer_depth
+ )
diff --git a/pyproject.toml b/pyproject.toml
index faabd03..caaa913 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,15 +4,14 @@ build-backend = "setuptools.build_meta"
[project]
readme = "README.md"
-requires-python = ">=3.7,<3.10"
+requires-python = ">=3.7"
name = "omnisolver-bruteforce"
description = "Bruteforce (a.k.a. exhaustive search) Plugin for Omnisolver"
dependencies = [
- "omnisolver ~= 0.0.3",
+ "omnisolver >= 0.0.3",
"dimod ~= 0.12",
- "numba ~= 0.56.4",
- "pluggy ~= 0.13.1",
- "numpy ~= 1.19.4"
+ "numba ~= 0.58",
+ "numpy ~= 1.26"
]
dynamic = ["version"]
@@ -22,7 +21,7 @@ docs = [
"sphinx-autoapi~=2.0.1",
"pydata-sphinx-theme~=0.13.1",
"sphinx-term~=0.1",
- "myst-parser~=1.0.0"
+ "myst-parser~=1.0.0",
]
[project.entry-points."omnisolver"]
diff --git a/tests/cpu/test_gray_code.py b/tests/cpu/test_gray_code.py
index 0a7964f..aaff5be 100644
--- a/tests/cpu/test_gray_code.py
+++ b/tests/cpu/test_gray_code.py
@@ -33,14 +33,10 @@ def test_gray_code_index_is_computed_correctly(self, binary_number, gray_code):
@pytest.mark.parametrize("num_bits", [10, 12, 16])
class TestBijectionBetweenGrayAndBinary:
def test_nth_gray_code_inverts_gray_code_index(self, num_bits):
- assert all(
- gray_code_index(nth_gray_number(n)) == n for n in range(2 ** num_bits)
- )
+ assert all(gray_code_index(nth_gray_number(n)) == n for n in range(2**num_bits))
def test_gray_code_index_inverts_nth_gray_code(self, num_bits):
- assert all(
- nth_gray_number(gray_code_index(n)) == n for n in range(2 ** num_bits)
- )
+ assert all(nth_gray_number(gray_code_index(n)) == n for n in range(2**num_bits))
def _binary_array_to_number(arr):
@@ -57,12 +53,12 @@ def test_iterating_algorithm_l_yields_all_gray_codes(num_bits):
state = np.zeros(num_bits, dtype=np.int8)
produced_numbers = [_binary_array_to_number(state)]
- for _ in range(2 ** num_bits - 1):
+ for _ in range(2**num_bits - 1):
bit_to_flip = focus_vector[0] % num_bits
state[bit_to_flip] = 1 - state[bit_to_flip]
advance_focus_vector(focus_vector)
produced_numbers.append(_binary_array_to_number(state))
np.testing.assert_array_equal(
- produced_numbers, [nth_gray_number(n) for n in range(2 ** num_bits)]
+ produced_numbers, [nth_gray_number(n) for n in range(2**num_bits)]
)