forked from simongog/sdsl-lite
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request simongog#344 from fmontoto/k2-tree
K^2 tree implementation
- Loading branch information
Showing
18 changed files
with
1,799 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
# Fetch a missing test case file $@ with curl.
# The download URL is looked up in test_case.config through config_filter
# (a helper defined outside this fragment). If no URL is found, make aborts.
# A downloaded file whose name ends in .gz is unpacked with gunzip.
../data/%:
	$(eval URL:=$(firstword $(call config_filter,test_case.config,$@,4)))
	@$(if $(URL),,\
		$(error "No download link nor generation program specified for test case $@") )
	@echo "Download input from $(URL) using curl"
	$(eval DEST_DIR:=$(@D))
	cd $(DEST_DIR) && curl -O $(URL)
	$(eval FILE:=$(DEST_DIR)/$(notdir $(URL)))
	@$(if $(filter %.gz,$(FILE)),\
		echo "Extract file $(FILE) using gunzip";\
		gunzip $(FILE))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
include ../../Make.helper | ||
# Compiler flags, directory layout and libraries.
CFLAGS = $(MY_CXX_FLAGS)
SRC_DIR = src
BIN_DIR = bin
LIBS = -lsdsl

# Entries read from the configuration files via config_ids
# (helper not defined in this file; presumably provided by Make.helper).
C_OPTIONS := $(call config_ids,compile_options.config)
TC_IDS := $(call config_ids,test_case.config)
K2_IDS := $(call config_ids,k2tree.config)

# Column 2 of test_case.config (the input path) for every test case.
DL = $(foreach TC_ID,$(TC_IDS),$(call config_select,test_case.config,$(TC_ID),2))

# One file ../tmp/K2T.[TC_ID].[K2_ID] per (test case, k2 tree) pair.
DL_K2T = $(foreach TC_ID,$(TC_IDS),$(foreach K2_ID,$(K2_IDS),../tmp/K2T.$(TC_ID).$(K2_ID)))

# One timing executable per k2 tree.
K2_EXECS = $(foreach K2_ID,$(K2_IDS),$(BIN_DIR)/build_$(K2_ID))

# One result file per (k2 tree, test case) pair; K2_ID is the outer loop.
RES_FILES = $(foreach K2_ID,$(K2_IDS),$(foreach TC_ID,$(TC_IDS),results/$(K2_ID).$(TC_ID)))

# One generator executable per k2 tree.
GEN_FILES = $(foreach K2_ID,$(K2_IDS),$(BIN_DIR)/gen_k2_$(K2_ID))

K2T_RES_FILES = $(foreach K2_ID,$(K2_IDS),$(foreach TC_ID,$(TC_IDS),results-k2t/$(K2_ID).$(TC_ID)))

# Concatenation of all result files, written by the `timing` target.
RESULT_FILE=results/all.txt
|
||
# None of these targets names a real file; declaring them phony keeps a stray
# file called `all`, `execs` or `timing` from silently disabling them.
.PHONY: all execs timing

all: execs

# Build one timing executable per configured k2 tree.
execs: $(K2_EXECS)

# Produce every result file, concatenate them into $(RESULT_FILE) and build
# the report in visualize/. $(MAKE) (not a literal `make`) is used so that
# flags such as -j and -n reach the sub-make.
timing: execs $(RES_FILES)
	@cat $(RES_FILES) > $(RESULT_FILE)
	@cd visualize && $(MAKE)
|
||
# Build ../tmp/K2T.[TC_ID].[K2_ID] by running gen_k2_[K2_ID] on the test case
# input. The stem is split into TC_ID and K2_ID with `dim` (helper defined
# outside this file). The second argument is the prefix of the intermediate
# vector files, which depends only on TC_ID.
../tmp/K2T.%: $(DL) $(GEN_FILES)
	$(eval TC_ID:=$(call dim,1,$*))
	$(eval K2_ID:=$(call dim,2,$*))
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	@$(BIN_DIR)/gen_k2_$(K2_ID) $(TC_PATH) "../tmp/K2T.$(TC_ID).VECTOR" "../tmp/K2T.$(TC_ID).$(K2_ID)"
|
||
# Execute $(BIN_DIR)/build_[K2_ID] on the k2 tree built for [TC_ID] and write
# the result to results/[K2_ID].[TC_ID]. The file starts with "# KEY = value"
# header lines, followed by the program's own output.
# NOTE(review): `execs` is not a file, so these results are presumably re-run
# on every invocation - confirm that this is intended.
results/%: test_case.config $(DL) $(DL_K2T) execs
	$(eval K2_ID:=$(call dim,1,$*))
	$(eval TC_ID:=$(call dim,2,$*))
	$(eval K2_TEX_NAME:=$(call config_select,k2tree.config,$(K2_ID),3))
	$(eval TC_TEX_NAME:=$(call config_select,test_case.config,$(TC_ID),3))
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	$(eval TC_SIZE:=$(shell wc -c <$(TC_PATH)))
	$(eval ARGS:="../tmp/K2T.$(TC_ID).$(K2_ID)")
	@echo "Running $(BIN_DIR)/build_$(K2_ID) on $(TC_ID)"
	@echo "# K2_ID = $(K2_ID)" > $@
	@echo "# TC_ID = $(TC_ID)" >> $@
	@echo "# K2_TEX_NAME = $(K2_TEX_NAME)">>$@
	@echo "# TC_TEX_NAME = $(TC_TEX_NAME)">>$@
	@echo "# TC_SIZE = $(TC_SIZE)">>$@
	@$(BIN_DIR)/build_$(K2_ID) $(ARGS) >> $@
|
||
# $(BIN_DIR)/build_[K2_ID]
# Timing executable for one k2 tree. Column 2 of k2tree.config (the sdsl
# type) is handed to the compiler as the macro K2_TYPE.
$(BIN_DIR)/build_%: $(SRC_DIR)/k2_time_and_space.cpp k2tree.config
	$(eval K2_ID:=$(call dim,1,$*))
	$(eval K2_TYPE:=$(call config_select,k2tree.config,$(K2_ID),2))
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) -DK2_TYPE="$(K2_TYPE)" -L$(LIB_DIR) \
		$< -I$(INC_DIR) -o $@ $(LIBS)
|
||
# $(BIN_DIR)/gen_k2_[K2_ID]
# Generator executable for one k2 tree. Column 2 of k2tree.config (the sdsl
# type) is handed to the compiler as the macro K2_TYPE, so k2tree.config is a
# prerequisite: editing a type must trigger a rebuild, as for build_[K2_ID].
$(BIN_DIR)/gen_k2_%: $(SRC_DIR)/gen_k2t.cpp k2tree.config
	$(eval K2_ID:=$(call dim,1,$*))
	$(eval K2_TYPE:=$(call config_select,k2tree.config,$(K2_ID),2))
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) -DK2_TYPE="$(K2_TYPE)" -L$(LIB_DIR) \
		$< -I$(INC_DIR) -o $@ $(LIBS) -ldivsufsort -ldivsufsort64
|
||
include ../Make.download | ||
|
||
# Housekeeping targets; none of them names a file.
.PHONY: clean-build clean-result cleanall

# Delete the compiled build_* and gen_* programs.
clean-build:
	@echo "Remove executables"
	$(RM) -r $(BIN_DIR)/build*
	$(RM) -r $(BIN_DIR)/gen*

# Delete the benchmark results (the glob does not match dot files).
clean-result:
	@echo "Remove results"
	$(RM) -r results/*

cleanall: clean-build clean-result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Benchmarking k2 trees | ||
|
||
## Methodology | ||
|
||
Explored dimensions: | ||
|
||
* k2 tree implementations | ||
* test cases | ||
* methods (`adj`, `neighbors`, `reverse_neighbors`) | ||
|
||
## Data | ||
|
||
* The data input for the benchmarking are arc files. An arc file is a text | ||
file where each line represents a directed edge between two nodes, the | ||
first column is the origin node and the second the target node. | ||
|
||
### Test cases | ||
|
||
* EXAMPLE test case uses a small file with a little more than 100 nodes and | ||
roughly the same number of edges from web data commons. | ||
* HOSTGRAPH is a test case where the data comes from the Web Corpus released | ||
by the Common Crawl Foundation in April 2014. The file aggregates the | ||
page graph by subdomain/host. It has 123.660.351 edges. | ||
|
||
## Directory structure | ||
|
||
* [bin](./bin): Contains the executables of the project. | ||
* `gen_*` generates the binary file with the graph from the arc files. | ||
* `build_*` executes the experiments. | ||
* [results](./results): Contains the results of the experiments. | ||
* [src](./src): Contains the source code of the benchmark. | ||
* [visualize](./visualize): Contains a `R`-script which generates | ||
a report in LaTeX format. | ||
|
||
## Prerequisites | ||
|
||
* For the visualization you need the following software: | ||
- [R][RPJ] with package `tikzDevice`. You can install the | ||
package by calling | ||
`install.packages("filehash", repos="http://cran.r-project.org")` | ||
and | ||
`install.packages("tikzDevice", repos="http://R-Forge.R-project.org")` | ||
in `R`. | ||
- [pdflatex][LT] to generate the pdf reports. | ||
|
||
## Usage | ||
|
||
* `make timing` compiles the programs, downloads or generates | ||
the test instances, builds the k2 trees, | ||
runs the performance tests and generates a report located at | ||
`visualize/k2.pdf`. The raw numbers of the timings | ||
can be found in the `results/all.txt`. The default benchmark | ||
took 75 minutes on my machine (MacBookPro Retina 2.6Ghz Intel | ||
Core i5 16GB 1600 Mhz DDR3, SSD). Have a look at the | ||
[complete report][RES]. | ||
* All created binaries and test results can be deleted | ||
by calling `make cleanall`. | ||
|
||
## Customization of the benchmark | ||
|
||
The project contains several configuration files: | ||
|
||
* [k2tree.config][K2CONFIG]: Specify different k2 tree implementations. | ||
* [test_case.config][TCCONF]: Specify test instances by ID, path, LaTeX-name | ||
for the report, and download URL. | ||
* [compile_options.config][CCONF]: Specify compile options by option string. | ||
|
||
Note that the benchmark will execute every combination of k2 trees and test cases. | ||
|
||
[RPJ]: http://www.r-project.org/ "R" | ||
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex" | ||
[K2CONFIG]: ./k2tree.config "k2tree.config" | ||
[TCCONF]: ./test_case.config "test_case.config" | ||
[CCONF]: ./compile_options.config "compile_options.config" | ||
[RES]: https://users.dcc.uchile.cl/~fmontoto/static/k2.pdf "k2.pdf" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Compile options | ||
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# This file specifies k^2 trees that are used in the benchmark. | ||
# | ||
# Each index is specified by a triple: K2_ID;SDSL_TYPE;K2_LATEX_NAME | ||
# * K2_ID : An identifier for the index. Only letters, digits and underscores are allowed in ID (e.g. K2_BV). | ||
# * SDSL_TYPE : Corresponding sdsl type. | ||
# * K2_LATEX_NAME: LaTeX name for output in the benchmark report. No underscores are allowed here. | ||
|
||
# Different k and Bit-Vectors: | ||
K2_BV;k2_tree<2, bit_vector, bit_vector::rank_1_type>;K2BV | ||
# K3_BV;k2_tree<3, bit_vector, bit_vector::rank_1_type>;K3BV | ||
# K4_BV;k2_tree<4, bit_vector, bit_vector::rank_1_type>;K4BV | ||
# K2_RRR63V;k2_tree<2, rrr_vector<63>, rrr_vector<63>::rank_1_type>;K2RRR63V | ||
# K3_RRR63V;k2_tree<3, rrr_vector<63>, rrr_vector<63>::rank_1_type>;K3RRR63V | ||
# K4_RRR63V;k2_tree<4, rrr_vector<63>, rrr_vector<63>::rank_1_type>;K4RRR63V | ||
# K2_RRR126V;k2_tree<2, rrr_vector<126>, rrr_vector<126>::rank_1_type>;K2RRR126V | ||
# K3_RRR126V;k2_tree<3, rrr_vector<126>, rrr_vector<126>::rank_1_type>;K3RRR126V | ||
# K4_RRR126V;k2_tree<4, rrr_vector<126>, rrr_vector<126>::rank_1_type>;K4RRR126V | ||
K2_ILV;k2_tree<2, bit_vector_il<512>, bit_vector_il<512>::rank_1_type>;K2ILV | ||
# K3_ILV;k2_tree<3, bit_vector_il<512>, bit_vector_il<512>::rank_1_type>;K3ILV | ||
# K4_ILV;k2_tree<4, bit_vector_il<512>, bit_vector_il<512>::rank_1_type>;K4ILV | ||
K2_SDV;k2_tree<2, sd_vector<bit_vector>, sd_vector<bit_vector>::rank_1_type>;K2SDV | ||
# K3_SDV;k2_tree<3, sd_vector<bit_vector>, sd_vector<bit_vector>::rank_1_type>;K3SDV | ||
# K4_SDV;k2_tree<4, sd_vector<bit_vector>, sd_vector<bit_vector>::rank_1_type>;K4SDV |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
* | ||
!.gitignore |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#include <iostream> | ||
#include <fstream> | ||
#include <tuple> | ||
#include <sdsl/bit_vectors.hpp> | ||
#include <sdsl/k2_tree.hpp> | ||
#include <stdexcept> | ||
#include <string> | ||
#include <vector> | ||
|
||
using namespace sdsl; | ||
|
||
void generate_bit_vector_buffers(const std::string& idx_file, | ||
const std::string& output_x_file, | ||
const std::string& output_y_file) | ||
{ | ||
std::ifstream infile(idx_file); | ||
std::string line; | ||
uint64_t cnt = 0; | ||
|
||
for (int i = 0; std::getline(infile, line); ++i) | ||
cnt++; | ||
|
||
infile.clear(); | ||
infile.seekg(0, std::ios::beg); | ||
// Set size of vector to the amount of lines in the input file. | ||
int_vector<>xv(cnt), yv(cnt); | ||
cnt = 0; | ||
|
||
while(std::getline(infile, line)) { | ||
sdsl::k2_tree_ns::idx_type x, y; | ||
std::istringstream iss(line); | ||
if(!(iss >> x >> y)) | ||
throw std::invalid_argument("Not expected line at construct"); | ||
xv[cnt] = x; | ||
yv[cnt++] = y; | ||
} | ||
|
||
store_to_file(xv, output_x_file); | ||
store_to_file(yv, output_y_file); | ||
} | ||
|
||
// Report whether the file `name` could be opened for reading.
inline bool exists(const std::string& name)
{
    std::ifstream probe(name.c_str());
    const bool readable = probe.good();
    return readable;
}
|
||
int main(int argc, char* argv[]) | ||
{ | ||
if(argc < 4) { | ||
std::cout<<"Usage: input_file output_file_prefix output_k2_file" << std::endl; | ||
} | ||
|
||
std::string out_x(argv[2]); | ||
out_x.append(".x"); | ||
std::string out_y(argv[2]); | ||
out_y.append(+ ".y"); | ||
|
||
if(!exists(out_x) || !exists(out_y)) | ||
generate_bit_vector_buffers(argv[1], out_x, out_y); | ||
|
||
K2_TYPE k2(argv[2]); | ||
std::ofstream fs; | ||
fs.open(argv[3]); | ||
k2.serialize(fs); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
#include<fstream> | ||
#include<string> | ||
#include<sstream> | ||
|
||
#include<sdsl/bit_vectors.hpp> | ||
#include<sdsl/k2_tree.hpp> | ||
|
||
using namespace std; | ||
using namespace sdsl; | ||
using namespace std::chrono; | ||
using timer = std::chrono::high_resolution_clock; | ||
|
||
typedef K2_TYPE::idx_type idx_type; | ||
typedef K2_TYPE::size_type size_type; | ||
|
||
|
||
template<class t_kt> | ||
size_type test_adj(const t_kt &tree, idx_type node, size_type neighbor, | ||
uint64_t times) | ||
{ | ||
size_type cnt = 0; | ||
for(uint64_t i = 0; i < times; i++) | ||
if(tree.adj(node, neighbor)) | ||
cnt++; | ||
return cnt; | ||
} | ||
|
||
template<class t_kt> | ||
size_type test_neighbors(const t_kt &tree, idx_type node, uint64_t times) | ||
{ | ||
size_type cnt = 0; | ||
for(uint64_t i = 0; i < times; i++) | ||
cnt += tree.neigh(node).size(); | ||
return cnt; | ||
} | ||
|
||
template<class t_kt> | ||
size_type test_reverse_neighbors(const t_kt &tree, idx_type node, uint64_t times) | ||
{ | ||
size_type cnt = 0; | ||
for(uint64_t i = 0; i < times; i++) | ||
cnt += tree.reverse_neigh(node).size(); | ||
return cnt; | ||
} | ||
|
||
int main(int argc, char* argv[]) | ||
{ | ||
if (argc < 2) { | ||
cout << "Usage: file" << endl; | ||
return 1; | ||
} | ||
|
||
const uint64_t reps = 100000; | ||
|
||
// construct | ||
memory_monitor::start(); | ||
std::ifstream is(argv[1]); | ||
auto start = timer::now(); | ||
K2_TYPE k2; | ||
k2.load(is); | ||
auto stop = timer::now(); | ||
memory_monitor::stop(); | ||
cout << "# constructs_time = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl; | ||
cout << "# constructs_space = " << memory_monitor::peak() << endl; | ||
// size | ||
cout << "# k2_size = " << size_in_bytes(k2) << endl; | ||
is.close(); | ||
|
||
// adj | ||
start = timer::now(); | ||
auto check = test_adj<K2_TYPE>(k2, 7, 5, reps); | ||
stop = timer::now(); | ||
cout << "# adj_time = " << duration_cast<microseconds>(stop-start).count()/(double)reps << endl; | ||
cout << "# adj_check = " << check << endl; | ||
|
||
// neighbors | ||
start = timer::now(); | ||
check = test_neighbors<K2_TYPE>(k2, 7, reps); | ||
stop = timer::now(); | ||
cout << "# neighbors_time = " << duration_cast<microseconds>(stop-start).count()/(double)reps << endl; | ||
cout << "# neighbors_check = " << check << endl; | ||
|
||
start = timer::now(); | ||
check = test_reverse_neighbors<K2_TYPE>(k2, 10, reps); | ||
stop = timer::now(); | ||
cout << "# reverse_neighbors_time = " << duration_cast<microseconds>(stop-start).count()/(double)reps << endl; | ||
cout << "# reverse_neighbors_check = " << check << endl; | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Configuration for test files | ||
# (1) Identifier for test file (consisting of letters, no `.`) | ||
# (2) Path to the test file | ||
# (3) LaTeX name | ||
# (4) Download link (if the test is available online) | ||
EXAMPLE;../data/example_arcs;examples;http://webdatacommons.org/hyperlinkgraph/data/example_arcs | ||
HOSTGRAPH;../data/hostgraph.arc;hostgraph;http://users.dcc.uchile.cl/~fmontoto/static/hostgraph.arc.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
* | ||
!.gitignore | ||
!Makefile | ||
!k2-footer.tex | ||
!k2-header.tex | ||
!k2.R |
Oops, something went wrong.