diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..ac68c026 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/kmc2"] + path = third_party/kmc2 + url = https://github.com/mneilly/kmc2.git diff --git a/BUILD.md b/BUILD.md new file mode 100644 index 00000000..c8845b64 --- /dev/null +++ b/BUILD.md @@ -0,0 +1,70 @@ + +## Building Bolt + +Make sure you have cmake, swig and python 3 installed. This was +tested with cmake 3.18.4, swig 4.0 and python 3.9 on Debian 11. It was +also tested with cmake 3.21.0, swig 4.0.2 and python 3.9.7 (installed +from Brew) on Mac OS X 10.14.6. Optionally, you can also use the +system Eigen3 if you have it installed. + +### Using Docker +``` +(cd docker && docker build -t bolt .) +docker run -v $PWD:$PWD -w $PWD -it bolt /bin/bash +./build.sh +source venv/bin/activate +python tests/test_encoder.py +./cpp/build-bolt/bolt amm* +``` + +### The Easy Way + +This assumes you have appropriate versions of tools, libraries, +etc. already available on your system. +``` +./build.sh +source venv/bin/activate +pytest tests +cd cpp/build-bolt +./bolt amm* +``` + + +### C++ + +``` + cd cpp + mkdir build-bolt + cd build-bolt + cmake .. + make + + ./bolt amm* +``` + +### Python + +To build the python package: + +``` + git submodule update --init # for kmc2 + + virtualenv -p $(which python3) venv + source venv/bin/activate + + pip install -r requirements.txt + pip install ./third_party/kmc2 + python setup.py install +``` + +To build with GCC instead of clang: + + `CC=gcc CXX=g++ python setup.py install` + +If you want to use the system Eigen installation, set the appropriate path for your system. E.g. 
- + + `EIGEN_INCLUDE_DIR=/usr/include/eigen3 python setup.py install` + +To test that it works: + + `pytest tests` diff --git a/README.md b/README.md index 00d04251..792cf20f 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,9 @@ EDIT: this repo now also features the source code for [MADDNESS](https://arxiv.o EDIT2: Looking for a research project? See our [list of ideas](https://github.com/dblalock/bolt/tree/master/experiments). -**NOTE: All below code refers to the Python wrapper for Bolt and has nothing to do with MADDNESS.** It also seems to be [no longer building](https://github.com/dblalock/bolt/issues/4) for many people. If you want to use MADDNESS, see the [Python Implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/vq_amm.py#L273) driven by [amm_main.py](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/amm_main.py) or [C++ implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/cpp/src/quantize/mithral.cpp). All code is ugly, but Python code should be pretty easy to add new AMM methods/variations to.** +EDIT3: See [BUILD.md](https://github.com/dblalock/bolt/blob/master/BUILD.md) for build instructions, including a working Docker setup that builds and runs Bolt, contributed by @mneilly. + +**NOTE: All below code refers to the Python wrapper for Bolt and has nothing to do with MADDNESS.** It also seems to be [no longer building](https://github.com/dblalock/bolt/issues/4) for many people. If you want to use MADDNESS, see the [Python Implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/vq_amm.py#L273) driven by [amm_main.py](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/amm_main.py) or [C++ implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/cpp/src/quantize/mithral.cpp). 
All code is ugly, but Python code should be pretty easy to add new AMM methods/variations to. diff --git a/build.sh b/build.sh new file mode 100755 index 00000000..1e170c88 --- /dev/null +++ b/build.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +MYNAME=${0##*/} +MYPATH=$(pwd -P) + +(cd docker && docker build -t bolt .) + +# Create virtual environment + +if [ ! -e venv ]; then + virtualenv -p $(which python3) venv +fi + +. venv/bin/activate + +# Build python package + +git submodule update --init +pip install -r requirements.txt +pip install ./third_party/kmc2 +# pip install --use-feature=in-tree-build -r requirements.txt +# pip install --use-feature=in-tree-build ./third_party/kmc2 +# pip install . # doesn't work due to custom install command +python setup.py install +# python tests/test_encoder.py +#--or-- +# python setup.py build_ext --inplace +# PYTHONPATH=${MYPATH}/python python tests/test_encoder.py + +# Build C++ + +mkdir -p cpp/build-bolt +cd cpp/build-bolt +cmake .. +make -j4 diff --git a/clean.sh b/clean.sh new file mode 100755 index 00000000..dad42902 --- /dev/null +++ b/clean.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +rm -rf venv + +rm -rf build/ pybolt.egg-info/ +rm -rf python/bolt/bolt.py python/bolt/native_wrap.cpp python/bolt/pybolt.egg-info + +rm -rf cpp/build-bolt diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt new file mode 100644 index 00000000..4f094b21 --- /dev/null +++ b/cpp/CMakeLists.txt @@ -0,0 +1,55 @@ +cmake_minimum_required(VERSION 3.3 FATAL_ERROR) + +project(bolt CXX) + +find_package(Eigen3 REQUIRED) + +set(sourceFiles + ${CMAKE_SOURCE_DIR}/src/quantize/bolt.cpp + ${CMAKE_SOURCE_DIR}/src/quantize/mithral.cpp + ${CMAKE_SOURCE_DIR}/src/utils/avx_utils.cpp + ${CMAKE_SOURCE_DIR}/test/main.cpp + ${CMAKE_SOURCE_DIR}/test/quantize + ${CMAKE_SOURCE_DIR}/test/test_avx_utils.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_amm.cpp + #${CMAKE_SOURCE_DIR}/test/quantize/profile_amm_old.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_bolt.cpp + 
${CMAKE_SOURCE_DIR}/test/quantize/profile_encode.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_lut_creation.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_multicodebook.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_pq.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_scan.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/test_bolt.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/test_mithral.cpp + ${CMAKE_SOURCE_DIR}/test/quantize/test_multicodebook.cpp + ) + +set(headerFiles + ${CMAKE_SOURCE_DIR}/src/include/public.hpp + ${CMAKE_SOURCE_DIR}/src/quantize/bolt.hpp + ${CMAKE_SOURCE_DIR}/src/quantize/mithral.hpp + ${CMAKE_SOURCE_DIR}/src/quantize/mithral_v1.hpp + ${CMAKE_SOURCE_DIR}/src/quantize/multi_codebook.hpp + ${CMAKE_SOURCE_DIR}/src/quantize/multisplit.hpp + ${CMAKE_SOURCE_DIR}/src/quantize/product_quantize.hpp + ${CMAKE_SOURCE_DIR}/src/utils/avx_utils.hpp + ${CMAKE_SOURCE_DIR}/src/utils/bit_ops.hpp + ${CMAKE_SOURCE_DIR}/src/utils/debug_utils.hpp + ${CMAKE_SOURCE_DIR}/src/utils/eigen_utils.hpp + ${CMAKE_SOURCE_DIR}/src/utils/memory.hpp + ${CMAKE_SOURCE_DIR}/src/utils/nn_utils.hpp + ${CMAKE_SOURCE_DIR}/src/utils/timing_utils.hpp + ${CMAKE_SOURCE_DIR}/test/external/catch.hpp + ${CMAKE_SOURCE_DIR}/test/quantize/amm_common.hpp + ${CMAKE_SOURCE_DIR}/test/quantize/profile_amm.hpp + ${CMAKE_SOURCE_DIR}/test/quantize/test_bolt.hpp + ${CMAKE_SOURCE_DIR}/test/testing_utils/testing_utils.hpp + ) + +add_executable(bolt ${sourceFiles} ${headerFiles}) +#add_library(bolt SHARED ${sourceFiles} ${headerFiles}) +set_target_properties(bolt PROPERTIES LINKER_LANGUAGE CXX) +target_compile_definitions(bolt PRIVATE "-DBLAZE") +target_link_libraries(bolt Eigen3::Eigen) +target_include_directories(bolt PUBLIC ${CMAKE_SOURCE_DIR}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -march=native -fno-rtti -ffast-math") diff --git a/cpp/src/external/eigen/.hg/dirstate b/cpp/src/external/eigen/.hg/dirstate index 92e06adc..79ccfcc8 100644 Binary files a/cpp/src/external/eigen/.hg/dirstate and 
b/cpp/src/external/eigen/.hg/dirstate differ diff --git a/cpp/test/quantize/profile_amm.hpp b/cpp/test/quantize/profile_amm.hpp index 3ede56db..3a84685f 100644 --- a/cpp/test/quantize/profile_amm.hpp +++ b/cpp/test/quantize/profile_amm.hpp @@ -54,7 +54,7 @@ struct mithral_amm_task { centroids(ncentroids * ncodebooks, D), nsplits(ncodebooks * nsplits_per_codebook), splitdims(nsplits), - splitvals(max_splitvals, nsplits), + splitvals(1 << 4, nsplits), encode_scales(nsplits), encode_offsets(nsplits), nnz_per_centroid(lut_work_const > 0 ? diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 00000000..3a20d08f --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,7 @@ +FROM debian:11.3 + +RUN apt update -y +RUN apt install -y build-essential clang python3 virtualenv git cmake swig libeigen3-dev + + + diff --git a/python/bolt/__init__.py b/python/bolt/__init__.py index b2d91184..41b3bd86 100644 --- a/python/bolt/__init__.py +++ b/python/bolt/__init__.py @@ -2,4 +2,4 @@ # note that we import module generate py file, not the generated # wrapper so (which is _bolt) -from bolt_api import * # noqa +from .bolt_api import * # noqa diff --git a/python/bolt/bolt_api.py b/python/bolt/bolt_api.py index a2d426ed..291119c5 100644 --- a/python/bolt/bolt_api.py +++ b/python/bolt/bolt_api.py @@ -3,7 +3,7 @@ # TODO maybe have sklearn transforms for dot prod and Lp dists # TODO add L1 distance -import bolt # inner bolt because of SWIG +from . 
import bolt # inner bolt because of SWIG import kmc2 # state-of-the-art kmeans initialization (as of NIPS 2016) import numpy as np @@ -209,8 +209,8 @@ def _learn_quantization_params(X, centroids, elemwise_dist_func, Q=None, """learn distros of entries in each lut""" if Q is None: - num_rows = min(10*1000, len(X) / 2) - how_many = min(1000, num_rows // 2) + num_rows = int(min(10*1000, len(X) / 2)) + how_many = int(min(1000, num_rows // 2)) _, Q = _extract_random_rows( X[num_rows:], how_many=how_many, remove_from_X=False) X = X[:num_rows] # limit to first 10k rows of X diff --git a/python/bolt/eigen.i b/python/bolt/eigen.i index f57992e5..2bc8603c 100644 --- a/python/bolt/eigen.i +++ b/python/bolt/eigen.i @@ -44,7 +44,7 @@ %{ #define SWIG_FILE_WITH_INIT - #include "eigen/Core" + #include "Eigen/Core" %} %include "numpy.i" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..a53a3c6a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +numpy +scikit-learn +Cython +pytest diff --git a/setup.py b/setup.py index 43f9cde2..37c626c3 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,7 @@ from setuptools import find_packages from setuptools import setup from setuptools import Extension +from setuptools.command.install import install # ================================ C++ extension @@ -40,8 +41,10 @@ for root, dirNames, fileNames in os.walk(srcDir): for dirName in dirNames: absPath = os.path.join(root, dirName) + if absPath.startswith("cpp/src/external"): + continue print('adding dir to path: %s' % absPath) - globStr = "%s/*.c*" % absPath + globStr = "%s/*.cpp" % absPath files = glob(globStr) if 'eigen/src' not in absPath: # just include top level includeDirs.append(absPath) @@ -59,17 +62,33 @@ os.environ['LDFLAGS'] = '-mmacosx-version-min=10.9 -stdlib=libc++ -framework Accelerate' os.environ["CC"] = "g++" # force compiling c as c++ else: # based on Issue #4 - extra_args += ['-stdlib=libc++'] - os.environ['CC'] = "clang" - os.environ['CXX'] = 
"clang++" - os.environ['LDFLAGS'] = '-lc++' + if "CC" not in os.environ: + os.environ['CC'] = "clang" + if "CXX" not in os.environ: + os.environ['CXX'] = "clang++" + # extra_args += ['-stdlib=libc++'] + # os.environ['CC'] = "clang" + # os.environ['CXX'] = "clang++" + # os.environ['LDFLAGS'] = '-lc++' # else: # os.environ["CC"] = "clang++" # force compiling c as c++ # inplace extension module -includeDirs += [join(PROJ_DIR, 'python', 'bolt')] # for swig -nativeExt = Extension("_bolt", # must match cpp header name with leading _ +includeDirs += [join(PROJ_DIR, 'python', 'bolt')] + +if 'EIGEN_INCLUDE_DIR' in os.environ: + includeDirs += [ + os.environ['EIGEN_INCLUDE_DIR'], + os.environ['EIGEN_INCLUDE_DIR'] + '/Eigen' + ] +else: + includeDirs += [ + join(PROJ_DIR, 'cpp', 'src', 'external', 'eigen'), + join(PROJ_DIR, 'cpp', 'src', 'external', 'eigen', 'Eigen') + ] + +nativeExt = Extension("bolt._bolt", # must match cpp header name with leading _ srcFiles, define_macros=[('NDEBUG', '1')], include_dirs=includeDirs, @@ -81,12 +100,21 @@ # ================================ Python modules -glob_str = join('python', 'bolt') + '*.py' -modules = [splitext(basename(path))[0] for path in glob(glob_str)] +# glob_str = join('python', 'bolt') + '*.py' +# modules = [splitext(basename(path))[0] for path in glob(glob_str)] # ================================ Call to setup() +# This ensures that the extension (which generates bolt.py) is built +# before py_modules are copied. 
+ +class CustomInstall(install): + def run(self): + self.run_command('build_ext') + self.do_egg_install() + setup( + cmdclass={'install': CustomInstall}, name='pybolt', version='0.1.4', license='MPL', @@ -97,7 +125,7 @@ download_url='https://github.com/dblalock/bolt/archive/0.1.tar.gz', packages=['bolt'], package_dir={'bolt': 'python/bolt'}, - py_modules=modules, + py_modules=['python/bolt/bolt'], include_package_data=True, zip_safe=False, classifiers=[ @@ -123,7 +151,7 @@ install_requires=[ 'numpy', 'scikit-learn', - 'kmc2' + #'kmc2' # 'sphinx_rtd_theme' # for docs ], extras_require={ diff --git a/third_party/kmc2 b/third_party/kmc2 new file mode 160000 index 00000000..e4559ca4 --- /dev/null +++ b/third_party/kmc2 @@ -0,0 +1 @@ +Subproject commit e4559ca41f04ad2286906a1f5fbed8ec83feada7