Skip to content

Commit

Permalink
Merge branch 'master' of github.com:dblalock/bolt
Browse files Browse the repository at this point in the history
  • Loading branch information
dblalock committed Jun 19, 2022
2 parents 09ce684 + e0a3af8 commit f468e9b
Show file tree
Hide file tree
Showing 15 changed files with 231 additions and 18 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "third_party/kmc2"]
path = third_party/kmc2
url = https://github.com/mneilly/kmc2.git
70 changes: 70 additions & 0 deletions BUILD.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@

## Building Bolt

Make sure you have CMake, SWIG, and Python 3 installed. This was
tested with CMake 3.18.4, SWIG 4.0, and Python 3.9 on Debian 11. It was
also tested with CMake 3.21.0, SWIG 4.0.2, and Python 3.9.7 (installed
from Homebrew) on Mac OS X 10.14.6. Optionally, you can also use the
system Eigen3 if you have it installed.

### Using Docker
```
(cd docker && docker build -t bolt .)
docker run -v $PWD:$PWD -w $PWD -it bolt /bin/bash
./build.sh
source venv/bin/activate
python tests/test_encoder.py
./cpp/build-bolt/bolt amm*
```

### The Easy Way

This assumes you have appropriate versions of tools, libraries,
etc. already available on your system.
```
./build.sh
source venv/bin/activate
pytest tests
cd cpp/build-bolt
./bolt amm*
```


### C++

```
cd cpp
mkdir build-bolt
cd build-bolt
cmake ..
make
./bolt amm*
```

### Python

To build the python package:

```
git submodule update --init # for kmc2
virtualenv -p $(which python3) venv
source venv/bin/activate
pip install -r requirements.txt
pip install ./third_party/kmc2
python setup.py install
```

To build with GCC instead of clang:

`CC=gcc CXX=g++ python setup.py install`

If you want to use the system Eigen installation, set `EIGEN_INCLUDE_DIR` to the appropriate path for your system, e.g.:

`EIGEN_INCLUDE_DIR=/usr/include/eigen3 python setup.py install`

To test that it works:

`pytest tests`
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ EDIT: this repo now also features the source code for [MADDNESS](https://arxiv.o

EDIT2: Looking for a research project? See our [list of ideas](https://github.com/dblalock/bolt/tree/master/experiments).

**NOTE: All below code refers to the Python wrapper for Bolt and has nothing to do with MADDNESS.** It also seems to be [no longer building](https://github.com/dblalock/bolt/issues/4) for many people. If you want to use MADDNESS, see the [Python Implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/vq_amm.py#L273) driven by [amm_main.py](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/amm_main.py) or [C++ implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/cpp/src/quantize/mithral.cpp). All code is ugly, but Python code should be pretty easy to add new AMM methods/variations to.**
EDIT3: See [BUILD.md](https://github.com/dblalock/bolt/blob/master/BUILD.md) for build instructions and a working Docker setup that builds and runs Bolt, contributed by @mneilly.

**NOTE: All below code refers to the Python wrapper for Bolt and has nothing to do with MADDNESS.** It also seems to be [no longer building](https://github.com/dblalock/bolt/issues/4) for many people. If you want to use MADDNESS, see the [Python Implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/vq_amm.py#L273) driven by [amm_main.py](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/experiments/python/amm_main.py) or [C++ implementation](https://github.com/dblalock/bolt/blob/45454e6cfbc9300a43da6770abf9715674b47a0f/cpp/src/quantize/mithral.cpp). All code is ugly, but Python code should be pretty easy to add new AMM methods/variations to.

<!-- NOTE: All the code, documentation, and results associated with Bolt's KDD paper can be found in the `experiments/` directory. See the README therein for details. A cleaned-up version of the paper is available [here](https://github.com/dblalock/bolt/blob/master/assets/bolt.pdf?raw=true). -->

Expand Down
35 changes: 35 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
#
# Build the Bolt Python package and the C++ `bolt` binary.
#
# Run this from the repository root: every path used below (docker/, venv/,
# requirements.txt, third_party/kmc2, cpp/) is relative to the current
# directory.

# Build the Docker image described in BUILD.md. Skipped when docker is not
# installed, e.g. when this script is run inside the container itself (the
# image built from docker/Dockerfile does not install docker).
if command -v docker >/dev/null 2>&1; then
    (cd docker && docker build -t bolt .)
fi

# Create virtual environment (reused if it already exists)

if [ ! -e venv ]; then
    virtualenv -p "$(command -v python3)" venv
fi

. venv/bin/activate

# Build python package

git submodule update --init  # fetches third_party/kmc2
pip install -r requirements.txt
pip install ./third_party/kmc2
# Alternatives kept for reference:
#   pip install --use-feature=in-tree-build -r requirements.txt
#   pip install --use-feature=in-tree-build ./third_party/kmc2
#   pip install .   # doesn't work due to custom install command
python setup.py install
# To try the installed package:
#   python tests/test_encoder.py
# --or-- build the extension in place and point PYTHONPATH at it:
#   python setup.py build_ext --inplace
#   PYTHONPATH="$(pwd -P)/python" python tests/test_encoder.py

# Build C++

mkdir -p cpp/build-bolt
# Stop here if the build directory is unusable; otherwise `cmake ..` would be
# run against whatever directory we happen to be in.
cd cpp/build-bolt || exit 1
cmake ..
make -j4
8 changes: 8 additions & 0 deletions clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash
#
# Remove build outputs. Run from the repository root; all paths are relative
# to the current directory.

# Python virtual environment created by build.sh
rm -rf venv

# Python packaging outputs and generated wrapper files
rm -rf build/ pybolt.egg-info/
rm -rf python/bolt/bolt.py python/bolt/native_wrap.cpp python/bolt/pybolt.egg-info

# C++ build tree created by build.sh
rm -rf cpp/build-bolt
55 changes: 55 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Builds the `bolt` executable from the quantization sources under src/ and
# the test/profiling sources under test/.
#
# The version range keeps 3.3 as the minimum while opting in to policies up to
# 3.21; CMake releases older than 3.12 ignore the "...3.21" part.
cmake_minimum_required(VERSION 3.3...3.21 FATAL_ERROR)

project(bolt CXX)

find_package(Eigen3 REQUIRED)

# -march=native produces a binary tuned for the build machine's CPU. It is on
# by default; pass -DBOLT_MARCH_NATIVE=OFF to build without it.
option(BOLT_MARCH_NATIVE "Compile bolt with -march=native" ON)

# Translation units compiled into the executable.
# test/quantize/profile_amm_old.cpp is deliberately not listed.
set(sourceFiles
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/bolt.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/mithral.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/avx_utils.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/main.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/test_avx_utils.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_amm.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_bolt.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_encode.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_lut_creation.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_multicodebook.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_pq.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_scan.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/test_bolt.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/test_mithral.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/test_multicodebook.cpp
)

# Headers are listed so they are attached to the target (e.g. shown by IDE
# generators); they are not compiled on their own.
set(headerFiles
  ${CMAKE_CURRENT_SOURCE_DIR}/src/include/public.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/bolt.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/mithral.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/mithral_v1.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/multi_codebook.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/multisplit.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/quantize/product_quantize.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/avx_utils.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/bit_ops.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/debug_utils.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/eigen_utils.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/memory.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/nn_utils.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/src/utils/timing_utils.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/external/catch.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/amm_common.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/profile_amm.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/quantize/test_bolt.hpp
  ${CMAKE_CURRENT_SOURCE_DIR}/test/testing_utils/testing_utils.hpp
)

add_executable(bolt ${sourceFiles} ${headerFiles})

# CXX_STANDARD 14 with extensions off makes CMake emit -std=c++14 itself.
set_target_properties(bolt PROPERTIES
  LINKER_LANGUAGE CXX
  CXX_STANDARD 14
  CXX_STANDARD_REQUIRED ON
  CXX_EXTENSIONS OFF
)

# Definitions are given without the -D prefix.
target_compile_definitions(bolt PRIVATE BLAZE)

target_compile_options(bolt PRIVATE -fno-rtti -ffast-math)
if(BOLT_MARCH_NATIVE)
  target_compile_options(bolt PRIVATE -march=native)
endif()

# Flags placed in CMAKE_CXX_FLAGS also reach the link step, whereas target
# compile options do not. -ffast-math is repeated for the linker so that its
# link-time effect is kept.
# NOTE(review): with GCC this can link startup code that changes the
# floating-point mode - confirm whether the benchmarks depend on it.
set_property(TARGET bolt APPEND_STRING PROPERTY LINK_FLAGS " -ffast-math")

target_link_libraries(bolt PRIVATE Eigen3::Eigen)
target_include_directories(bolt PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
Binary file modified cpp/src/external/eigen/.hg/dirstate
Binary file not shown.
2 changes: 1 addition & 1 deletion cpp/test/quantize/profile_amm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ struct mithral_amm_task {
centroids(ncentroids * ncodebooks, D),
nsplits(ncodebooks * nsplits_per_codebook),
splitdims(nsplits),
splitvals(max_splitvals, nsplits),
splitvals(1 << 4, nsplits),
encode_scales(nsplits),
encode_offsets(nsplits),
nnz_per_centroid(lut_work_const > 0 ?
Expand Down
7 changes: 7 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Build environment for Bolt; see the "Using Docker" section of BUILD.md.
FROM debian:11.3

# Refresh the package index and install the toolchain in one layer, so a
# cached index layer is never reused with a changed package list.
RUN apt-get update -y \
    && apt-get install -y build-essential clang python3 virtualenv git cmake swig libeigen3-dev



2 changes: 1 addition & 1 deletion python/bolt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

# note that we import module generate py file, not the generated
# wrapper so (which is _bolt)
from bolt_api import * # noqa
from .bolt_api import * # noqa
6 changes: 3 additions & 3 deletions python/bolt/bolt_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# TODO maybe have sklearn transforms for dot prod and Lp dists
# TODO add L1 distance

import bolt # inner bolt because of SWIG
from . import bolt # inner bolt because of SWIG

import kmc2 # state-of-the-art kmeans initialization (as of NIPS 2016)
import numpy as np
Expand Down Expand Up @@ -209,8 +209,8 @@ def _learn_quantization_params(X, centroids, elemwise_dist_func, Q=None,
"""learn distros of entries in each lut"""

if Q is None:
num_rows = min(10*1000, len(X) / 2)
how_many = min(1000, num_rows // 2)
num_rows = int(min(10*1000, len(X) / 2))
how_many = int(min(1000, num_rows // 2))
_, Q = _extract_random_rows(
X[num_rows:], how_many=how_many, remove_from_X=False)
X = X[:num_rows] # limit to first 10k rows of X
Expand Down
2 changes: 1 addition & 1 deletion python/bolt/eigen.i
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

%{
#define SWIG_FILE_WITH_INIT
#include "eigen/Core"
#include "Eigen/Core"
%}

%include "numpy.i"
Expand Down
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
numpy
scikit-learn
Cython
pytest
50 changes: 39 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from setuptools import find_packages
from setuptools import setup
from setuptools import Extension
from setuptools.command.install import install


# ================================ C++ extension
Expand All @@ -40,8 +41,10 @@
for root, dirNames, fileNames in os.walk(srcDir):
for dirName in dirNames:
absPath = os.path.join(root, dirName)
if absPath.startswith("cpp/src/external"):
continue
print('adding dir to path: %s' % absPath)
globStr = "%s/*.c*" % absPath
globStr = "%s/*.cpp" % absPath
files = glob(globStr)
if 'eigen/src' not in absPath: # just include top level
includeDirs.append(absPath)
Expand All @@ -59,17 +62,33 @@
os.environ['LDFLAGS'] = '-mmacosx-version-min=10.9 -stdlib=libc++ -framework Accelerate'
os.environ["CC"] = "g++" # force compiling c as c++
else: # based on Issue #4
extra_args += ['-stdlib=libc++']
os.environ['CC'] = "clang"
os.environ['CXX'] = "clang++"
os.environ['LDFLAGS'] = '-lc++'
if "CC" not in os.environ:
os.environ['CC'] = "clang"
if "CXX" not in os.environ:
os.environ['CXX'] = "clang++"
# extra_args += ['-stdlib=libc++']
# os.environ['CC'] = "clang"
# os.environ['CXX'] = "clang++"
# os.environ['LDFLAGS'] = '-lc++'
# else:
# os.environ["CC"] = "clang++" # force compiling c as c++


# inplace extension module
includeDirs += [join(PROJ_DIR, 'python', 'bolt')] # for swig
nativeExt = Extension("_bolt", # must match cpp header name with leading _
includeDirs += [join(PROJ_DIR, 'python', 'bolt')]

if 'EIGEN_INCLUDE_DIR' in os.environ:
includeDirs += [
os.environ['EIGEN_INCLUDE_DIR'],
os.environ['EIGEN_INCLUDE_DIR'] + '/Eigen'
]
else:
includeDirs += [
join(PROJ_DIR, 'cpp', 'src', 'external', 'eigen'),
join(PROJ_DIR, 'cpp', 'src', 'external', 'eigen', 'Eigen')
]

nativeExt = Extension("bolt._bolt", # must match cpp header name with leading _
srcFiles,
define_macros=[('NDEBUG', '1')],
include_dirs=includeDirs,
Expand All @@ -81,12 +100,21 @@

# ================================ Python modules

glob_str = join('python', 'bolt') + '*.py'
modules = [splitext(basename(path))[0] for path in glob(glob_str)]
# glob_str = join('python', 'bolt') + '*.py'
# modules = [splitext(basename(path))[0] for path in glob(glob_str)]

# ================================ Call to setup()

# This ensures that the extension (which generates bolt.py) is built
# before py_modules are copied.

class CustomInstall(install):
    """Install command that runs ``build_ext`` before installing.

    Building the extension first means the file it generates (bolt.py) is
    already present when the Python modules are copied.
    """

    def run(self):
        # Step 1: build the extension.
        self.run_command('build_ext')
        # Step 2: perform the install.
        self.do_egg_install()

setup(
cmdclass={'install': CustomInstall},
name='pybolt',
version='0.1.4',
license='MPL',
Expand All @@ -97,7 +125,7 @@
download_url='https://github.com/dblalock/bolt/archive/0.1.tar.gz',
packages=['bolt'],
package_dir={'bolt': 'python/bolt'},
py_modules=modules,
py_modules=['python/bolt/bolt'],
include_package_data=True,
zip_safe=False,
classifiers=[
Expand All @@ -123,7 +151,7 @@
install_requires=[
'numpy',
'scikit-learn',
'kmc2'
#'kmc2'
# 'sphinx_rtd_theme' # for docs
],
extras_require={
Expand Down
1 change: 1 addition & 0 deletions third_party/kmc2
Submodule kmc2 added at e4559c

0 comments on commit f468e9b

Please sign in to comment.