diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..b33b3b2 --- /dev/null +++ b/.clang-format @@ -0,0 +1,6 @@ +BasedOnStyle: WebKit +AlignAfterOpenBracket: BlockIndent +ColumnLimit: 100 +BinPackParameters: false +BinPackArguments: false +IncludeBlocks: Regroup diff --git a/.gitignore b/.gitignore index fb9f65d..2902970 100644 --- a/.gitignore +++ b/.gitignore @@ -137,3 +137,5 @@ dmypy.json # End of https://www.toptal.com/developers/gitignore/api/python instances/ + +bruteforce_wrapper_gpu.cpp \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d4bb2cb..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/_templates/autoapi/index.rst b/docs/_templates/autoapi/index.rst deleted file mode 100644 index 87a7ff9..0000000 --- a/docs/_templates/autoapi/index.rst +++ /dev/null @@ -1,13 +0,0 @@ -API Reference -============= - -This page contains auto-generated API reference documentation. - -.. toctree:: - :titlesonly: - - {% for page in pages | sort %} - {% if (page.top_level_object or page.name.split('.') | length == 3) and page.display %} - {{ page.short_name | capitalize }} <{{ page.include_path }}> - {% endif %} - {% endfor %} diff --git a/docs/assets/logo-large.png b/docs/assets/logo-large.png new file mode 100644 index 0000000..2d16045 Binary files /dev/null and b/docs/assets/logo-large.png differ diff --git a/docs/assets/logo.png b/docs/assets/logo.png new file mode 100644 index 0000000..1e22174 Binary files /dev/null and b/docs/assets/logo.png differ diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index b6b4a53..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,55 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = "Omnisolver Bruteforce" -copyright = "2021, Konrad Jałowiecki, Łukasz Pawela" -author = "Konrad Jałowiecki, Łukasz Pawela" - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = ["myst_parser", "sphinx_term.termynal", "autoapi.extension"] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "pydata_sphinx_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -autoapi_dirs = ["../omnisolver"] -autoapi_python_use_implicit_namespaces = True -autoapi_template_dir = "_templates/autoapi" diff --git a/docs/index.md b/docs/index.md index e078135..f99fb01 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,49 +1,91 @@ -# Welcome to omnisolver-pt documentation! +--- +hide: + - navigation +--- -The `omnisolver-bruteforce` is Omnisolver plugin implementing the bruteforce (a.k.a exhaustive -search) algorithm. +

+ Omnisolver +

-## Installation +--- -Currently, the `omnisolver-bruteforce` package requires working CUDA installation. +

-To install the plugin first set the `CUDAHOME` environmental library to your CUDA instalaltion location, e.g.: +

+ + Tests + + +Docs + +

-```shell -# Rmember, your actual location may vary! -export CUDAHOME=/usr/local/cuda -``` +**Documentation:** https://euro-hpc-pl.github.io/omnisolver -and then run: +**Source code:** https://github.com/euro-hpc-pl/omnisolver -```shell -pip install omnisolver-bruteforce -``` +--- -> **Warning** -> If you don't set the `CUDAHOME` directory, an attempt will be made to deduce it based on the location of your `nvcc` compiler. -> However, this process might not work in all the cases and should not be relied on. +Omnisolver is a collection of Binary Quadratic Model solvers and a framework for implementing them. -## Command line usage -```text -usage: omnisolver bruteforce-gpu [-h] [--output OUTPUT] [--vartype {SPIN,BINARY}] [--num_states NUM_STATES] [--suffix_size SUFFIX_SIZE] [--grid_size GRID_SIZE] - [--block_size BLOCK_SIZE] [--dtype {float,double}] - input -Bruteforce (a.k.a exhaustive search) sampler using CUA-enabled GPU -positional arguments: - input Path of the input BQM file in COO format. If not specified, stdin is used. +## Why Omnisolver? + +### Benefits for the end-users + +Omnisolver contains a selection of standard and more sophisticated algorithms for solving BQMs. All solvers are available through intuitive CLI or from Python scripts as dimod based Samplers. + +### Benefits for solver creators + +Omnisolver allows developer to focus on algorithms instead of common tasks like handling input/output or creating CLI. + +## Quickstart -optional arguments: - -h, --help show this help message and exit - --output OUTPUT Path of the output file. If not specified, stdout is used. - --vartype {SPIN,BINARY} - Variable type - --num_states NUM_STATES + + +``` +# Install base omnisolver package +$ pip install omnisolver +---> 100% +Successfuly installed omnisolver +# Install chosen plugins (e.g. parallel-tempering solver) +$ pip install omnisolver-pt +---> 100% +Successfuly installed omnisolver-pt +# Create an instance file in COOrdinate format +$ echo "0 1 1.0 +> 1 2 1.0 +> 2 0 1.0" > instance.txt +# Run solver +$ omnisolver pt --vartype SPIN instance.txt +0,1,2,energy,num_occurrences +1,-1,-1,-1.0,1 ``` -## Documentation -```{toctree} -:maxdepth: 1 +## What's next? + +Here are some resources to get you started: + +- Start with user guide to learn about the installation methods and general usage patterns. +- Discover available solvers in our plugin list. +- If you are interested in developing your own solver, or are interested in in-depth details of how the Omnisolver + works, check our solver creator guide and reference manual. + +## Citing + +If you used Omnisolver in your research, consider citing it in your paper. +You can use the following BibTeX entry: + +```text +@article{omnisolver2023, + title = {Omnisolver: An extensible interface to Ising spin–glass and QUBO solvers}, + journal = {SoftwareX}, + volume = {24}, + pages = {101559}, + year = {2023}, + doi = {https://doi.org/10.1016/j.softx.2023.101559}, + url = {https://www.softxjournal.com/article/S2352-7110(23)00255-8/}, + author = {Konrad Jałowiecki and {\L}ukasz Pawela}, +} ``` diff --git a/docs/javascripts/mathjax.js b/docs/javascripts/mathjax.js new file mode 100644 index 0000000..396f03b --- /dev/null +++ b/docs/javascripts/mathjax.js @@ -0,0 +1,18 @@ +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + + + MathJax.typesetPromise() +}) diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 8084272..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/reference/omnisolver_bruteforce_distributed.md b/docs/reference/omnisolver_bruteforce_distributed.md new file mode 100644 index 0000000..8860640 --- /dev/null +++ b/docs/reference/omnisolver_bruteforce_distributed.md @@ -0,0 +1,2 @@ +# ::: omnisolver.bruteforce.gpu.distributed + handler: python diff --git a/docs/reference/omnisolver_bruteforce_sampler.md b/docs/reference/omnisolver_bruteforce_sampler.md new file mode 100644 index 0000000..84cc6a5 --- /dev/null +++ b/docs/reference/omnisolver_bruteforce_sampler.md @@ -0,0 +1,2 @@ +# ::: omnisolver.bruteforce.gpu.sampler + handler: python diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 09f95f3..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -sphinx~=6.1.3 -sphinx-autoapi~=2.0.1 -pydata-sphinx-theme~=0.13.1 -sphinx-term~=0.1 -myst-parser~=1.0.0 diff --git a/examples/distributed.py b/examples/distributed.py new file mode 100644 index 0000000..aab4250 --- /dev/null +++ b/examples/distributed.py @@ -0,0 +1,55 @@ +from dimod import BQM +from numpy.random import default_rng + +from omnisolver.bruteforce.gpu import BruteforceGPUSampler +from omnisolver.bruteforce.gpu.distributed import DistributedBruteforceGPUSampler + + +def random_bqm(num_variables, vartype, offset, rng): + linear = { + i: coef for i, coef in zip(range(num_variables), rng.uniform(-2, 2, size=num_variables)) + } + quadratic = { + (i, j): coef + for (i, j), coef in zip( + [(i, j) for i in range(num_variables) for j in range(i + 1, num_variables)], + rng.uniform(-1, -1, size=(num_variables - 1) * num_variables // 2), + ) + } + return BQM(linear, quadratic, offset, vartype=vartype) + + +NUM_VARIABLES = 40 + + +def main(): + sampler = DistributedBruteforceGPUSampler() + sampler2 = BruteforceGPUSampler() + rng = default_rng(1234) + + bqm = random_bqm(NUM_VARIABLES, "BINARY", 0.0, rng) + + import time + + start = time.perf_counter() + result = sampler.sample( + bqm, num_states=100, num_fixed_vars=1, suffix_size=25, grid_size=1024, block_size=1024 + ) + duration = time.perf_counter() - start + distributed_en = [entry.energy for entry in result.data()] + + print(f"Distributed finished in : {duration}s") + + start = time.perf_counter() + result2 = sampler2.sample( + bqm, num_states=100, suffix_size=25, grid_size=2**12, block_size=512 + ) + duration = time.perf_counter() - start + single_en = [entry.energy for entry in result2.data()] + print(f"Single finished in: {duration}s") + + print(max(abs(en1 - en2) for en1, en2 in zip(distributed_en, single_en))) + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..2d13988 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,66 @@ +# yaml-language-server: $schema=https://squidfunk.github.io/mkdocs-material/schema.json +site_name: Omnisolver-Bruteforce +site_author: "Konrad Jałowiecki & Łukasz Pawela" +copyright: "" +theme: + name: material + logo: assets/logo.png + palette: + primary: custom + features: + - navigation.tabs + - navigation.tracking +extra: + version: + provider: mike +extra_css: + - stylesheets/extra.css +nav: + - User guide: userguide.md + #- Solver creator guide: creatorguide.md + - Reference manual: + - omnisolver.bruteforce.gpu.sampler: reference/omnisolver_bruteforce_sampler.md + - omnisolver.bruteforce.gpu.distributed: reference/omnisolver_bruteforce_distributed.md + - Plugin list: plugins.md +plugins: + - search + - mike: + canonical_version: latest + version_selector: true + - termynal: + prompt_literal_start: + - "$" + - mkdocstrings: + handlers: + python: + options: + annotation_path: brief + heading_level: 2 + show_source: false + show_root_toc_entry: false + show_signature_annotations: true + members_order: source + separate_signature: true + docstring_style: sphinx + show_if_no_docstring: true + docstring_options: + ignore_init_summary: true + merge_init_into_class: true + show_docstring_returns: true + + - with-pdf: + render_js: true + cover_subtitle: "Exhaustive search plugin for Omnisolver" +repo_name: Omnisolver +repo_url: https://github.com/euro-hpc-pl/omnisolver +markdown_extensions: + - pymdownx.superfences + - pymdownx.arithmatex: + generic: true + - admonition + - pymdownx.tabbed: + alternate_style: true +extra_javascript: + - javascripts/mathjax.js + - https://polyfill.io/v3/polyfill.min.js?features=es6 + - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js diff --git a/omnisolver/bruteforce/gpu/__init__.py b/omnisolver/bruteforce/gpu/__init__.py index d0c35ef..5b868f2 100644 --- a/omnisolver/bruteforce/gpu/__init__.py +++ b/omnisolver/bruteforce/gpu/__init__.py @@ -1,12 +1,11 @@ """Implementation of parallel tempering plugin for Omnisolver.""" -from omnisolver.common.plugin import Plugin, plugin_from_specification, plugin_impl +from omnisolver.common.plugin import Plugin, plugin_from_specification from pkg_resources import resource_stream from yaml import safe_load from .sampler import BruteforceGPUSampler -@plugin_impl def get_plugin() -> Plugin: """Get package name and resource path.""" specification = safe_load(resource_stream("omnisolver.bruteforce.gpu", "gpu.yml")) diff --git a/omnisolver/bruteforce/gpu/distributed.py b/omnisolver/bruteforce/gpu/distributed.py new file mode 100644 index 0000000..9729a0c --- /dev/null +++ b/omnisolver/bruteforce/gpu/distributed.py @@ -0,0 +1,108 @@ +import typing +from itertools import product +from time import perf_counter + +import numpy as np +import ray +from dimod import Sampler, Vartype, append_variables, concatenate + +from .sampler import BruteforceGPUSampler + + +@ray.remote(num_gpus=1) +def _solve_subproblem( + bqm, + num_states, + fixed_vars, + suffix_size, + grid_size, + block_size, + num_steps_per_kernel, + partial_diff_buffer_depth, + dtype, +): + new_bqm = bqm.copy() + new_bqm.fix_variables(fixed_vars) + + sampler = BruteforceGPUSampler() + result = sampler.sample( + new_bqm, + num_states, + suffix_size, + grid_size, + block_size, + num_steps_per_kernel, + partial_diff_buffer_depth, + dtype, + ) + + return append_variables(result, fixed_vars) + + +class DistributedBruteforceGPUSampler(Sampler): + def sample( + self, + bqm, + num_states, + num_fixed_vars, + suffix_size, + grid_size, + block_size, + num_steps_per_kernel=1, + partial_diff_buffer_depth=1, + dtype=np.float32, + ): + if bqm.vartype == Vartype.SPIN: + return self.sample( + bqm.change_vartype("BINARY", inplace=False), + num_states, + num_fixed_vars, + suffix_size, + grid_size, + block_size, + num_steps_per_kernel, + partial_diff_buffer_depth, + ).change_vartype("SPIN", inplace=False) + + bqm, mapping = bqm.relabel_variables_as_integers() + + start_counter = perf_counter() + + subproblems = [ + {i: v for i, v in enumerate(vals)} for vals in product([0, 1], repeat=num_fixed_vars) + ] + + refs = [ + _solve_subproblem.remote( + bqm, + num_states, + fixed_vars, + suffix_size, + grid_size, + block_size, + num_steps_per_kernel, + partial_diff_buffer_depth, + dtype, + ) + for fixed_vars in subproblems + ] + + subsolutions = [ray.get(ref) for ref in refs] + solve_time_in_seconds = perf_counter() - start_counter + result = concatenate(subsolutions).truncate(num_states) + result.info["solve_time_in_seconds"] = solve_time_in_seconds + return result.relabel_variables(mapping) + + @property + def parameters(self) -> typing.Dict[str, typing.Any]: + return { + "num_states": [], + "suffix_size": [], + "grid_size": [], + "block_size": [], + "dtype": [], + } + + @property + def properties(self) -> typing.Dict[str, typing.Any]: + return {} diff --git a/omnisolver/bruteforce/gpu/sampler.py b/omnisolver/bruteforce/gpu/sampler.py index d26e8c4..81a960c 100644 --- a/omnisolver/bruteforce/gpu/sampler.py +++ b/omnisolver/bruteforce/gpu/sampler.py @@ -4,7 +4,7 @@ import numpy as np from dimod import Sampler, SampleSet, Vartype -from omnisolver.bruteforce.ext.gpu import gpu_search +from omnisolver.bruteforce.ext.gpu import gpu_search, gpu_search_ground_only def _convert_int_to_sample(val, num_variables): @@ -16,7 +16,17 @@ def _convert_int_to_sample(val, num_variables): class BruteforceGPUSampler(Sampler): - def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.float32): + def sample( + self, + bqm, + num_states, + suffix_size, + grid_size, + block_size, + num_steps_per_kernel=16, + partial_diff_buffer_depth=1, + dtype=np.float32, + ): """Solve Binary Quadratic Model using exhaustive (bruteforce) search on the GPU. :param bqm: Binary Quadratic Model instance to solve. @@ -38,6 +48,9 @@ def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.f suffix_size, grid_size, block_size, + num_steps_per_kernel, + partial_diff_buffer_depth, + dtype, ).change_vartype("SPIN", inplace=False) bqm, mapping = bqm.relabel_variables_as_integers() @@ -56,15 +69,27 @@ def sample(self, bqm, num_states, suffix_size, grid_size, block_size, dtype=np.f start_counter = perf_counter() - gpu_search( - qubo_mat, - num_states, - states_out, - energies_out, - grid_size, - block_size, - suffix_size, - ) + if num_states == 1: # Shortcut if we are only looking for a ground state + gpu_search_ground_only( + qubo_mat, + states_out, + energies_out, + grid_size, + block_size, + suffix_size, + num_steps_per_kernel, + partial_diff_buffer_depth, + ) + else: + gpu_search( + qubo_mat, + num_states, + states_out, + energies_out, + grid_size, + block_size, + suffix_size, + ) solve_time_in_seconds = perf_counter() - start_counter diff --git a/omnisolver/extensions/bruteforce_gpu.cu b/omnisolver/extensions/bruteforce_gpu.cu index 03d62d1..1adc65d 100644 --- a/omnisolver/extensions/bruteforce_gpu.cu +++ b/omnisolver/extensions/bruteforce_gpu.cu @@ -1,30 +1,36 @@ // Implementation of exhaustive search using a CUDA--enabled GPU. -#include -#include -#include +#include "vectors.h" + #include +#include #include #include -#include "vectors.h" // Basic error checking (used only for kernel launches) // It should throw an instance of std::runtime_error containing the decoded // cuda runtime error message and also filename and line at which the error // occurred. -#define cuda_error_check(code) { cuda_assert((code), __FILE__, __LINE__); } +#define cuda_error_check(code) \ + { \ + cuda_assert((code), __FILE__, __LINE__); \ + } + +__device__ __forceinline__ int thread_id() { return blockIdx.x * blockDim.x + threadIdx.x; } + +__device__ __forceinline__ int grid_size() { return blockDim.x * gridDim.x; } -inline void cuda_assert(cudaError_t code, const char *file, int line) +inline void cuda_assert(cudaError_t code, const char* file, int line) { - if (code != cudaSuccess) - { - std::stringstream what; - what << "File: " << __FILE__ << " at " << __LINE__ << ", error: " << cudaGetErrorString(code); - throw std::runtime_error(what.str()); - } + if (code != cudaSuccess) { + std::stringstream what; + what << "File: " << __FILE__ << " at " << __LINE__ + << ", error: " << cudaGetErrorString(code); + throw std::runtime_error(what.str()); + } } // Compute i-th Gray code. -uint64_t gray(uint64_t index) { return index ^ (index >> 1); } +__host__ __device__ uint64_t gray(uint64_t index) { return index ^ (index >> 1); } // Find First bit Set in a number. // Following common convention for this function it returns bits indexed from 1 @@ -33,12 +39,13 @@ uint64_t gray(uint64_t index) { return index ^ (index >> 1); } // ffs(7) = 1 // ffs(10) = 2 // ffs(24) = 4 -int ffs(uint64_t number) { - if(number == 0) { +int ffs(uint64_t number) +{ + if (number == 0) { return 0; } int result = 1; - while((number & 1) != 1) { + while ((number & 1) != 1) { number >>= 1; result++; } @@ -46,23 +53,62 @@ int ffs(uint64_t number) { return result; } -// Given a QUBO and a state, compute how the energy changes if the i-th bit of state -// is flipped. -// Specifically, E(.) is energy function of the qubo, then this function computes -// E(s) - E(s'), where s' is the state after flipping i-th bit of s. -// One can easily verify that to compute the difference above we need only the -// i-th row of QUBO, and hence we already pass this row instead of computign the offset. +// Given a QUBO and a state, compute how the energy changes if the i-th bit of +// state is flipped. Specifically, E(.) is energy function of the qubo, then +// this function computes E(s) - E(s'), where s' is the state after flipping +// i-th bit of s. One can easily verify that to compute the difference above we +// need only the i-th row of QUBO, and hence we already pass this row instead +// of computign the offset. template -__device__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) { +__host__ __device__ __forceinline__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) +{ int qi = (state >> i) & 1; // This is the i-th bit of state - T total = qubo_row[i]; // Start accumulating from the lineaer term + T total = qubo_row[i]; // Start accumulating from the linear term + + uint32_t word = state & 0xFFFFFFFF; // Go through each bit of state - for(int j = 0; j < N; j++) { - if(i != j) { // except the one to flip, it's already accounted for - total += qubo_row[j] * (state & 1); + for (int j = 0; j < 32 && j < N; j++) { + if (i != j && (word % 2)) { // except the one to flip, it's already accounted for + total += qubo_row[j]; } - state >>= 1; // move to next bit + word /= 2; //>>= 1; // move to next bit + } + + word = state >> 32; + for (int j = 32; j < N; j++) { + if (i != j && (word % 2)) { // except the one to flip, it's already accounted for + total += qubo_row[j]; + } + word /= 2; // move to next bit + } + + // When flipping from 0 to 1 there is nothing to do, otherwise we need + // to multiply the total by -1. + return (2 * qi - 1) * total; +} + +template +__device__ __forceinline__ T +energy_diff_reduced(T* qubo_row, int N, int offset, uint64_t state, int i, T common_part) +{ + int qi = (state >> i) & 1; // This is the i-th bit of state + T total = common_part; // Start accumulating from the linear term + + uint32_t word = (state >> offset) & 0xFFFFFFFF; + + // Go through each bit of state + for (int j = offset; j < 32 + offset && j < N; j++) { + if (word % 2) + total += qubo_row[j]; + word /= 2; // move to next bit + } + + word = state >> (32 + offset); + for (int j = 32 + offset; j < N; j++) { + if (word % 2) + total += qubo_row[j]; + word /= 2; // move to next bit } // When flipping from 0 to 1 there is nothing to do, otherwise we need @@ -71,14 +117,14 @@ __device__ T energy_diff(T* qubo_row, int N, uint64_t state, int i) { } // Initialize first states and energies for further processing. -// The idea is as follows. Each state (which is N-bit number) is (logically) split into -// two parts: -// l-bit part specific to state -// k-bit part that changes in all states in the same way for all states in given iteration. -// This function computes energy of a state with some l-bits set according to index, +// The idea is as follows. Each state (which is N-bit number) is (logically) +// split into two parts: l-bit part specific to state k-bit part that changes +// in all states in the same way for all states in given iteration. This +// function computes energy of a state with some l-bits set according to index, // and k least significant bits set to 0. template -__device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t index) { +__device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t index) +{ T energy = 0.0; uint64_t state = 0, suffix = index, offset = 0; @@ -86,9 +132,9 @@ __device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t in // Instead we enumerate bits from the most significant one. // For instance, if index=12, which is 110 in binary, we will set the three // most significant bits of state to 011. - while(suffix > 0) { + while (suffix > 0) { // Find first set (note: not our function, this one is CUDA's intrinsic) - uint64_t bit_in_suffix = __ffs(suffix); + uint64_t bit_in_suffix = __ffsll(suffix); // Compute bit index from the most significant position // Since we reduce suffix in each iteration, we have to store the offset // travelled so far in the `offset` variable. @@ -108,36 +154,28 @@ __device__ void _init(T* qubo, int N, uint64_t* states, T* energies, uint64_t in states[index] = state; } -// Kernel that basically launched the _init function above for all states and energies +// Kernel that basically launches the _init function above for all states and +// energies template -__global__ void init( - T* qubo, - int N, - T* energies, - uint64_t* states, - uint64_t states_in_chunk -) { - for(int i=blockIdx.x * blockDim.x + threadIdx.x; i < states_in_chunk; i += blockDim.x * gridDim.x) { +__global__ void init(T* qubo, int N, T* energies, uint64_t* states, uint64_t states_in_chunk) +{ + for (uint64_t i = thread_id(); i < states_in_chunk; i += grid_size()) { _init(qubo, N, states, energies, i); } } // A single step of bruteforce algorithm. -// Recall that we split all states (logically) into l-most significant fixed bits -// and k-least significant bits that are modified, one at a time, in each iteration. -// This kernel flips one of those k-significant bits. +// Recall that we split all states (logically) into l-most significant fixed +// bits and k-least significant bits that are modified, one at a time, in each +// iteration. This kernel flips one of those k-significant bits. template __global__ void single_step( - T* qubo, - int N, - T* energies, - uint64_t* states, - int bit_to_flip, - uint64_t states_in_chunk -) { + T* qubo, int N, T* energies, uint64_t* states, int bit_to_flip, uint64_t states_in_chunk +) +{ T* qubo_row = qubo + N * bit_to_flip; - for(int i=blockIdx.x * blockDim.x + threadIdx.x; i < states_in_chunk; i += blockDim.x * gridDim.x) { + for (uint64_t i = thread_id(); i < states_in_chunk; i += grid_size()) { uint64_t state = states[i]; T energy = energies[i]; energies[i] = energy - energy_diff(qubo_row, N, state, bit_to_flip); @@ -145,29 +183,131 @@ __global__ void single_step( } } +template +__device__ __forceinline__ T* row_ptr(T* array_2d, uint64_t n_cols, uint64_t row_id) +{ + return array_2d + n_cols * row_id; +} + +// Kernel performing whole ground state (and only ground state) computation in +// a single step +template +__global__ void find_ground( + T* qubo, + int N, + T* energies, + uint64_t* states, + T* best_energies, + uint64_t* best_states, + uint64_t suffix_size, + uint64_t num_iterations, + int* bit_buffer, + T* prefix_diff_buffer, + T* partial_diff_buffer, + int partial_diff_buffer_depth +) +{ + uint64_t chunk_size = 1L << suffix_size; + T tmp_energy, candidate_energy; + uint64_t tmp_state, candidate_state; + int bit_to_flip; + int qi; + + for (uint64_t i = thread_id(); i < chunk_size; i += grid_size()) { + candidate_energy = best_energies[i]; + tmp_energy = energies[i]; + candidate_state = best_states[i]; + tmp_state = states[i]; + for (uint64_t j = 0; j < num_iterations; j++) { + bit_to_flip = bit_buffer[j]; + if (bit_to_flip < partial_diff_buffer_depth) { + qi = (tmp_state >> bit_to_flip) & 1; + tmp_energy = tmp_energy + - (2 * qi - 1) + * (row_ptr(partial_diff_buffer, chunk_size, bit_to_flip)[i] + + prefix_diff_buffer[j]); + } else { + tmp_energy -= energy_diff_reduced( + row_ptr(qubo, N, bit_to_flip), + N, + N - suffix_size, + tmp_state, + bit_to_flip, + prefix_diff_buffer[j] + ); + } + + tmp_state = tmp_state ^ (1L << bit_to_flip); + if (tmp_energy < candidate_energy) { + candidate_energy = tmp_energy; + candidate_state = tmp_state; + } + } + states[i] = tmp_state; + energies[i] = tmp_energy; + best_states[i] = candidate_state; + best_energies[i] = candidate_energy; + } +} + +template +__global__ void _initialize_partial_diff_buffer( + T* qubo, + int N, + uint64_t* states, + uint64_t suffix_size, + T* partial_diff_buffer, + int partial_diff_buffer_depth +) +{ + uint64_t chunk_size = 1L << suffix_size; + uint64_t shifted_state, tmp_state; + T total; + T* qubo_row; + + for (uint64_t i = thread_id(); i < chunk_size; i += grid_size()) { + shifted_state = states[i] >> (N - suffix_size); + for (int bit_to_flip = 0; bit_to_flip < partial_diff_buffer_depth; bit_to_flip++) { + total = 0; + tmp_state = shifted_state; + qubo_row = qubo + N * bit_to_flip; + for (int k = N - suffix_size; k < N; k++) { + if (tmp_state % 2) + total += qubo_row[k]; + tmp_state /= 2; + } + + partial_diff_buffer[bit_to_flip * chunk_size + i] = total; + } + } +} + // Predicate used in thrust::copy_if // Given a tuple (state, energy), copy this copule iff energy < threshold. -template -struct energy_less_then { - energy_less_then(T threshold) : threshold(threshold) {}; +template struct energy_less_then { + energy_less_then(T threshold) + : threshold(threshold) {}; + T threshold; - __host__ __device__ - inline bool operator () (const thrust::tuple& pair) { + __host__ __device__ inline bool operator()(const thrust::tuple& pair) + { return thrust::get<1>(pair) < threshold; } }; // The pièce de résistance of this module, the search function. -// QUBO: N x N matrix of floats or doubles (depending on the template parameter T) -// num_states: requested size of the low energy spectrum -// states_out, energies_out: output buffers, should be of size num_states -// grid_size, block_size: definition of grid on which init and single_step kernels will -// be launched. Warning: other kernels might be launched by thrust, and we cannot control -// their grid (and frankly, thrust's judgement is probably better then our judgement) -// suffix_size: the number l determining how many most significant bits in each state are -// fixed. Since there are 2 ** l possible configurations of l-bits, the temporary buffers -// for energies and states will also have 2 ** l items. +// QUBO: N x N matrix of floats or doubles (depending on the template parameter +// T) num_states: requested size of the low energy spectrum states_out, +// energies_out: output buffers, should be of size num_states grid_size, +// block_size: definition of grid on which init and single_step kernels will +// be launched. Warning: other kernels might be launched by thrust, and we +// cannot control their grid (and frankly, thrust's judgement is probably +// better then our judgement) +// suffix_size: the number l determining how many most significant bits in each +// state are +// fixed. Since there are 2 ** l possible configurations of l-bits, the +// temporary buffers for energies and states will also have 2 ** l items. template void search( T* qubo, @@ -178,7 +318,8 @@ void search( int grid_size, int block_size, int suffix_size -) { +) +{ // chunk_size = 2 ** suffix_size uint64_t chunk_size = 1 << suffix_size; // Device vectors with qubo @@ -208,15 +349,14 @@ void search( thrust::sort_by_key(energies.begin(), energies.end(), states.begin()); // The initial states and energies are becoming the first approximation of - // the low energy spectrum. We copy first num_states of them into best_spectrum_it. - thrust::copy( - current_spectrum_it, current_spectrum_it + num_states, best_spectrum_it - ); + // the low energy spectrum. We copy first num_states of them into + // best_spectrum_it. + thrust::copy(current_spectrum_it, current_spectrum_it + num_states, best_spectrum_it); // Iterate in chunks in gray code order. - for(int i = 1; i < num_chunks; i++) { - // To compute bit to flip compute two consecutive gray codes and see at which - // bit they differ. Subtract 1 because ffs counts from 1. + for (uint64_t i = 1; i < num_chunks; i++) { + // To compute bit to flip compute two consecutive gray codes and see at + // which bit they differ. Subtract 1 because ffs counts from 1. int bit_to_flip = ffs(gray(i) ^ gray(i - 1)) - 1; // Perform single step of energy computation and check if it succeeded. single_step<<>>( @@ -224,20 +364,20 @@ void search( ); cuda_error_check(cudaGetLastError()); - - // Find the maximum energy in current approximation if low enrgy spectrum. + // Find the maximum energy in current approximation if low enrgy + // spectrum. T threshold = *thrust::max_element( thrust::device, best_energies.begin(), best_energies.begin() + num_states ); - // From the currently computed chunk, the only candidates to enter the low energy - // spectrum are the ones that have energy < threshold. - // Hence, we copy only those states and energies into the range directly being + // From the currently computed chunk, the only candidates to enter the + // low energy spectrum are the ones that have energy < threshold. Hence, + // we copy only those states and energies into the range directly being // the current approximation of low energy spectrum. auto candidates_it = best_spectrum_it + num_states; - // We also store the position at which we placed the last candidate state. - // This way, we can sometimes sort much smaller range. + // We also store the position at which we placed the last candidate + // state. This way, we can sometimes sort much smaller range. auto end = thrust::copy_if( thrust::device, current_spectrum_it, @@ -246,7 +386,8 @@ void search( energy_less_then(threshold) ); - // This sort effectively combines the current approximation and candidates. + // This sort effectively combines the current approximation and + // candidates. thrust::sort_by_key( best_energies.begin(), best_energies.begin() + num_states + (end - candidates_it), @@ -281,3 +422,125 @@ template void search( int block_size, int suffix_size ); + +template +void search_ground_only( + T* qubo, + int N, + uint64_t* states_out, + T* energies_out, + int grid_size, + int block_size, + int suffix_size, + int num_steps_per_kernel, + int partial_diff_buffer_depth +) +{ + uint64_t chunk_size = 1L << suffix_size; + device_vector qubo_gpu(qubo, qubo + N * N); + + energy_vector energies(chunk_size); + state_vector states(chunk_size); + energy_vector best_energies(chunk_size); + state_vector best_states(chunk_size); + + device_vector bit_flip_buffer(num_steps_per_kernel); + energy_vector prefix_diff_buffer(num_steps_per_kernel); + energy_vector partial_diff_buffer(partial_diff_buffer_depth * chunk_size); + + num_steps_per_kernel = std::min(long(num_steps_per_kernel), 1L << (N - suffix_size)); + + + std::vector src_bit_flip_buffer(num_steps_per_kernel); + std::vector src_prefix_diff_buffer(num_steps_per_kernel); + + uint64_t last_gray_code, next_gray_code; + uint64_t counter; + int64_t base_state; + T base_energy; + + init<<>>((T*)qubo_gpu, N, (T*)energies, states, chunk_size); + cuda_error_check(cudaGetLastError()); + cudaDeviceSynchronize(); + + best_energies = energies; + best_states = states; + + _initialize_partial_diff_buffer<<>>( + (T*)qubo_gpu, N, states, suffix_size, (T*)partial_diff_buffer, partial_diff_buffer_depth + ); + cuda_error_check(cudaGetLastError()); + cudaDeviceSynchronize(); + + counter = 0; + last_gray_code = 0; + + base_energy = 0; + base_state = 0; + + for (uint64_t i = 0; i < (1L << (N - suffix_size)) / num_steps_per_kernel; i++) { + cudaDeviceSynchronize(); + for (uint64_t j = 0; j < num_steps_per_kernel; j++) { + counter += 1; + next_gray_code = gray(counter); + int bit_to_flip = ffs(last_gray_code ^ next_gray_code) - 1; + src_bit_flip_buffer[j] = bit_to_flip; + src_prefix_diff_buffer[j] = qubo[(N + 1) * bit_to_flip]; + for (int k = 0; k < N - suffix_size; k++) { + if (((base_state >> k) % 2) && k != bit_to_flip) { + src_prefix_diff_buffer[j] += qubo[N * bit_to_flip + k]; + } + } + last_gray_code = next_gray_code; + base_state ^= (1L << src_bit_flip_buffer[j]); + } + bit_flip_buffer = src_bit_flip_buffer; + prefix_diff_buffer = src_prefix_diff_buffer; + cuda_error_check(cudaGetLastError()); + cudaDeviceSynchronize(); + find_ground<<>>( + (T*)qubo_gpu, + N, + (T*)energies, + states, + (T*)best_energies, + best_states, + suffix_size, + num_steps_per_kernel, + (int*)bit_flip_buffer, + (T*)prefix_diff_buffer, + (T*)partial_diff_buffer, + partial_diff_buffer_depth + ); + } + + cuda_error_check(cudaGetLastError()); + cudaDeviceSynchronize(); + thrust::sort_by_key(best_energies.begin(), best_energies.end(), best_states.begin()); + thrust::copy(best_states.begin(), best_states.begin() + 1, states_out); + thrust::copy(best_energies.begin(), best_energies.begin() + 1, energies_out); +} + +template void search_ground_only( + float* qubo, + int N, + uint64_t* states_out, + float* energies_out, + int grid_size, + int block_size, + int suffix_size, + int num_steps_per_kernel, + int partial_diff_buffer_depth +); + +template void search_ground_only( + double* qubo, + int N, + uint64_t* states_out, + double* energies_out, + int grid_size, + int block_size, + int suffix_size, + int num_steps_per_kernel, + int partial_diff_buffer_depth +); diff --git a/omnisolver/extensions/bruteforce_gpu.h b/omnisolver/extensions/bruteforce_gpu.h index 6b7ff56..d424c47 100644 --- a/omnisolver/extensions/bruteforce_gpu.h +++ b/omnisolver/extensions/bruteforce_gpu.h @@ -11,12 +11,14 @@ void search( ); template -void search( +void search_ground_only( T* qubo, int N, - T* energies_out, uint64_t* states_out, - uint64_t num_states_in_chunk, + T* energies_out, int blocks_per_grid, - int threads_per_block + int threads_per_block, + int suffix_size, + int num_steps_per_kernel, + int partial_diff_buff_depth ); diff --git a/omnisolver/extensions/bruteforce_wrapper_gpu.pyx b/omnisolver/extensions/bruteforce_wrapper_gpu.pyx index 307c4ef..e0291b4 100644 --- a/omnisolver/extensions/bruteforce_wrapper_gpu.pyx +++ b/omnisolver/extensions/bruteforce_wrapper_gpu.pyx @@ -21,6 +21,17 @@ cdef extern from "bruteforce_gpu.h": int suffix_size ) except+ + void search_ground_only[T]( + T* qubo, + int N, + uint64_t* states_out, + T* energies_out, + int blocks_per_grid, + int threads_per_block, + int suffix_size, + int num_steps_per_kernel, + int partial_diff_buff_depth + ) except+ def gpu_search( real[:,:] qubo, @@ -41,3 +52,25 @@ def gpu_search( block_size, suffix_size ) + +def gpu_search_ground_only( + real[:,:] qubo, + uint64_t[::1] states_out, + real[::1] energies_out, + int grid_size, + int block_size, + int suffix_size, + int num_steps_per_kernel=16, + int partial_diff_buffer_depth=1 +): + search_ground_only( + &qubo[0, 0], + qubo.shape[0], + &states_out[0], + &energies_out[0], + grid_size, + block_size, + suffix_size, + num_steps_per_kernel, + partial_diff_buffer_depth + ) diff --git a/pyproject.toml b/pyproject.toml index faabd03..caaa913 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,15 +4,14 @@ build-backend = "setuptools.build_meta" [project] readme = "README.md" -requires-python = ">=3.7,<3.10" +requires-python = ">=3.7" name = "omnisolver-bruteforce" description = "Bruteforce (a.k.a. exhaustive search) Plugin for Omnisolver" dependencies = [ - "omnisolver ~= 0.0.3", + "omnisolver >= 0.0.3", "dimod ~= 0.12", - "numba ~= 0.56.4", - "pluggy ~= 0.13.1", - "numpy ~= 1.19.4" + "numba ~= 0.58", + "numpy ~= 1.26" ] dynamic = ["version"] @@ -22,7 +21,7 @@ docs = [ "sphinx-autoapi~=2.0.1", "pydata-sphinx-theme~=0.13.1", "sphinx-term~=0.1", - "myst-parser~=1.0.0" + "myst-parser~=1.0.0", ] [project.entry-points."omnisolver"] diff --git a/tests/cpu/test_gray_code.py b/tests/cpu/test_gray_code.py index 0a7964f..aaff5be 100644 --- a/tests/cpu/test_gray_code.py +++ b/tests/cpu/test_gray_code.py @@ -33,14 +33,10 @@ def test_gray_code_index_is_computed_correctly(self, binary_number, gray_code): @pytest.mark.parametrize("num_bits", [10, 12, 16]) class TestBijectionBetweenGrayAndBinary: def test_nth_gray_code_inverts_gray_code_index(self, num_bits): - assert all( - gray_code_index(nth_gray_number(n)) == n for n in range(2 ** num_bits) - ) + assert all(gray_code_index(nth_gray_number(n)) == n for n in range(2**num_bits)) def test_gray_code_index_inverts_nth_gray_code(self, num_bits): - assert all( - nth_gray_number(gray_code_index(n)) == n for n in range(2 ** num_bits) - ) + assert all(nth_gray_number(gray_code_index(n)) == n for n in range(2**num_bits)) def _binary_array_to_number(arr): @@ -57,12 +53,12 @@ def test_iterating_algorithm_l_yields_all_gray_codes(num_bits): state = np.zeros(num_bits, dtype=np.int8) produced_numbers = [_binary_array_to_number(state)] - for _ in range(2 ** num_bits - 1): + for _ in range(2**num_bits - 1): bit_to_flip = focus_vector[0] % num_bits state[bit_to_flip] = 1 - state[bit_to_flip] advance_focus_vector(focus_vector) produced_numbers.append(_binary_array_to_number(state)) np.testing.assert_array_equal( - produced_numbers, [nth_gray_number(n) for n in range(2 ** num_bits)] + produced_numbers, [nth_gray_number(n) for n in range(2**num_bits)] )