Skip to content

Commit

Permalink
Merge pull request simongog#344 from fmontoto/k2-tree
Browse files Browse the repository at this point in the history
K^2 tree implementation
  • Loading branch information
simongog authored Sep 26, 2016
2 parents c91314f + f042720 commit 86c5386
Show file tree
Hide file tree
Showing 18 changed files with 1,799 additions and 2 deletions.
4 changes: 2 additions & 2 deletions benchmark/Make.download
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# Download rule for test inputs stored under ../data/.
# NOTE: this span is a rendered diff, so removed and added lines are interleaved:
# the rule header appears twice, and the `filter-out ".gz"` line is the removed
# variant of the `filter %.gz` line that follows it.
# The rule takes URL from field 4 of test_case.config (via config_filter, which is
# defined elsewhere -- presumably in Make.helper), aborts with an error when no URL
# is found, downloads the file with curl into the target's directory, and runs
# gunzip on the downloaded file when its name ends in .gz.
../data/%:
../data/%:
$(eval URL:=$(firstword $(call config_filter,test_case.config,$@,4)))
@$(if $(URL),,\
$(error "No download link nor generation program specified for test case $@") )
@echo "Download input from $(URL) using curl"
$(eval DEST_DIR:=$(shell dirname $@))
cd $(DEST_DIR); curl -O $(URL)
$(eval FILE:=$(DEST_DIR)/$(notdir $(URL)))
@$(if $(filter-out ".gz",$(FILE)),\
@$(if $(filter %.gz,$(FILE)),\
echo "Extract file $(FILE) using gunzip";\
gunzip $(FILE))

90 changes: 90 additions & 0 deletions benchmark/k2_trees/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Shared helpers. The names MY_CXX, MY_CXX_FLAGS, LIB_DIR, INC_DIR, config_ids,
# config_select and dim used in this Makefile are not defined here; presumably
# they come from Make.helper -- TODO confirm.
include ../../Make.helper

CFLAGS  = $(MY_CXX_FLAGS)
SRC_DIR = src
BIN_DIR = bin
LIBS    = -lsdsl

# Identifiers read from the three configuration files.
C_OPTIONS := $(call config_ids,compile_options.config)
TC_IDS    := $(call config_ids,test_case.config)
K2_IDS    := $(call config_ids,k2tree.config)

# Field 2 of every test case: the path of its input file.
DL = $(foreach TC_ID,$(TC_IDS),$(call config_select,test_case.config,$(TC_ID),2))

# Serialized k2 trees, one per (test case, k2 tree) pair: ../tmp/K2T.<TC_ID>.<K2_ID>
DL_K2T = $(foreach TC_ID,$(TC_IDS),$(addprefix ../tmp/K2T.$(TC_ID).,$(K2_IDS)))

# Benchmark executables, one per k2 tree: bin/build_<K2_ID>
K2_EXECS = $(addprefix $(BIN_DIR)/build_,$(K2_IDS))

# Result files, one per (k2 tree, test case) pair: results/<K2_ID>.<TC_ID>
RES_FILES = $(foreach K2_ID,$(K2_IDS),$(addprefix results/$(K2_ID).,$(TC_IDS)))

# Generator executables, one per k2 tree: bin/gen_k2_<K2_ID>
GEN_FILES = $(addprefix $(BIN_DIR)/gen_k2_,$(K2_IDS))

# Same layout as RES_FILES, under results-k2t/.
K2T_RES_FILES = $(foreach K2_ID,$(K2_IDS),$(addprefix results-k2t/$(K2_ID).,$(TC_IDS)))

# Concatenation of all result files, written by the `timing` target.
RESULT_FILE = results/all.txt

# `all` and `execs` name actions, not files. Declaring them phony keeps a stray
# file called `all` or `execs` from silently disabling them.
.PHONY: all execs

# Default goal: build the benchmark executables.
all: execs

# One $(BIN_DIR)/build_<K2_ID> executable per k2 tree listed in k2tree.config.
execs: $(K2_EXECS)

# Run every benchmark, concatenate the per-run result files into $(RESULT_FILE),
# then build the report in visualize/.
# The sub-make is invoked through $(MAKE) so that -j/-n and the jobserver are
# propagated, and through -C so a missing directory fails the recipe instead of
# running make in the wrong directory.
.PHONY: timing
timing: execs $(RES_FILES)
	@cat $(RES_FILES) > $(RESULT_FILE)
	@$(MAKE) -C visualize

# Build the serialized k2 tree ../tmp/K2T.<TC_ID>.<K2_ID>.
# The stem is split with `dim` (defined elsewhere; presumably it returns the
# N-th dot-separated field -- TODO confirm) into the test case ID and the k2 tree
# ID. The matching generator is then run with:
#   1. the test case input file (field 2 of test_case.config),
#   2. the prefix "../tmp/K2T.<TC_ID>.VECTOR" for the generator's intermediate files,
#   3. the output file, i.e. this rule's target.
# NOTE(review): the intermediate-file prefix does not contain K2_ID, so
# generators for the same test case share it -- check before running with -j.
../tmp/K2T.%: $(DL) $(GEN_FILES)
	$(eval TC_ID:=$(call dim,1,$*))
	$(eval K2_ID:=$(call dim,2,$*))
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	@mkdir -p $(@D)
	@$(BIN_DIR)/gen_k2_$(K2_ID) $(TC_PATH) "../tmp/K2T.$(TC_ID).VECTOR" "$@"

# Execute $(BIN_DIR)/build_[K2_ID] and write result
#
# The target is results/<K2_ID>.<TC_ID>. The recipe writes a header of
# "# KEY = VALUE" lines (IDs, LaTeX names from field 3 of the two config files,
# and the input size in bytes from `wc -c`) and then appends the output of the
# benchmark executable, which is run on ../tmp/K2T.<TC_ID>.<K2_ID>.
results/%: test_case.config $(DL) $(DL_K2T) execs
	$(eval K2_ID:=$(call dim,1,$*))
	$(eval TC_ID:=$(call dim,2,$*))
	$(eval K2_TEX_NAME:=$(call config_select,k2tree.config,$(K2_ID),3))
	$(eval TC_TEX_NAME:=$(call config_select,test_case.config,$(TC_ID),3))
	$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
	$(eval TC_SIZE:=$(shell wc -c <$(TC_PATH)))
	$(eval ARGS:="../tmp/K2T.$(TC_ID).$(K2_ID)")
	@echo "Running bin/build_$(K2_ID) on $(TC_ID)"
	@echo "# K2_ID = $(K2_ID)" > $@
	@echo "# TC_ID = $(TC_ID)" >> $@
	@echo "# K2_TEX_NAME = $(K2_TEX_NAME)">>$@
	@echo "# TC_TEX_NAME = $(TC_TEX_NAME)">>$@
	@echo "# TC_SIZE = $(TC_SIZE)">>$@
	@$(BIN_DIR)/build_$(K2_ID) $(ARGS) >> $@

# $(BIN_DIR)/build_[K2_ID]
#
# Compile the time-and-space benchmark once per k2 tree. The sdsl type named in
# field 2 of k2tree.config is handed to the compiler as the K2_TYPE macro.
$(BIN_DIR)/build_%: $(SRC_DIR)/k2_time_and_space.cpp k2tree.config
	$(eval K2_ID:=$(call dim,1,$*))
	$(eval K2_TYPE:=$(call config_select,k2tree.config,$(K2_ID),2))
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) -DK2_TYPE="$(K2_TYPE)" -L$(LIB_DIR) \
		$< -I$(INC_DIR) -o $@ $(LIBS)

# $(BIN_DIR)/gen_k2_[K2_ID]
#
# Compile the k2 tree generator once per k2 tree. The sdsl type named in field 2
# of k2tree.config is handed to the compiler as the K2_TYPE macro.
# k2tree.config is a prerequisite (as it is for build_%) because K2_TYPE is read
# from it: editing a type definition must rebuild the generator.
$(BIN_DIR)/gen_k2_%: $(SRC_DIR)/gen_k2t.cpp k2tree.config
	$(eval K2_ID:=$(call dim,1,$*))
	$(eval K2_TYPE:=$(call config_select,k2tree.config,$(K2_ID),2))
	@$(MY_CXX) $(CFLAGS) $(C_OPTIONS) -DK2_TYPE="$(K2_TYPE)" -L$(LIB_DIR) \
		$< -I$(INC_DIR) -o $@ $(LIBS) -ldivsufsort -ldivsufsort64

include ../Make.download

# Housekeeping targets. They are declared phony so that they always run, even if
# a file with the same name exists in this directory.
.PHONY: clean-build clean-result cleanall

# Remove the compiled benchmark (build_*) and generator (gen_*) executables.
clean-build:
	@echo "Remove executables"
	rm -rf $(BIN_DIR)/build*
	rm -rf $(BIN_DIR)/gen*

# Remove everything matched by results/* (the shell glob skips dot files).
clean-result:
	@echo "Remove results"
	rm -rf results/*

# Remove both executables and results.
cleanall: clean-build clean-result
75 changes: 75 additions & 0 deletions benchmark/k2_trees/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Benchmarking k2 trees

## Methodology

Explored dimensions:

* k2 tree implementations
* test cases
* methods (`adj`, `neighbors`, `reverse_neighbors`)

## Data

* The data input for the benchmarking are arc files. An arc file is a text
file where each line represents a directed edge between two nodes, the
first column is the origin node and the second the target node.

### Test cases

* EXAMPLE test case uses a small file with a little more than 100 nodes and
roughly the same number of edges from Web Data Commons.
* HOSTGRAPH is a test case where the data comes from the Web Corpus released
by the Common Crawl Foundation in April 2014. The file aggregates the
page graph by subdomain/host. It has 123.660.351 edges.

## Directory structure

* [bin](./bin): Contains the executables of the project.
* `gen_*` generates the binary file with the graph from the arc files.
* `build_*` loads that binary file and executes the experiments.
* [results](./results): Contains the results of the experiments.
* [src](./src): Contains the source code of the benchmark.
* [visualize](./visualize): Contains a `R`-script which generates
a report in LaTeX format.

## Prerequisites

* For the visualization you need the following software:
- [R][RPJ] with package `tikzDevice`. You can install the
package by calling
`install.packages("filehash", repos="http://cran.r-project.org")`
and
`install.packages("tikzDevice", repos="http://R-Forge.R-project.org")`
in `R`.
- [pdflatex][LT] to generate the pdf reports.

## Usage

* `make timing` compiles the programs, downloads or generates
the test instances, builds the k2 trees,
runs the performance tests and generates a report located at
`visualize/k2.pdf`. The raw numbers of the timings
can be found in the `results/all.txt`. The default benchmark
took 75 minutes on my machine (MacBookPro Retina 2.6Ghz Intel
Core i5 16GB 1600 Mhz DDR3, SSD). Have a look at the
[complete report][RES].
* All created binaries and test results can be deleted
by calling `make cleanall`.

## Customization of the benchmark

The project contains several configuration files:

* [k2tree.config][K2CONFIG]: Specify different k2 tree implementations.
* [test_case.config][TCCONF]: Specify test instances by ID, path, LaTeX-name
for the report, and download URL.
* [compile_options.config][CCONF]: Specify compile options by option string.

Note that the benchmark will execute every combination of k2 trees and test cases.

[RPJ]: http://www.r-project.org/ "R"
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex"
[K2CONFIG]: ./k2tree.config "k2tree.config"
[TCCONF]: ./test_case.config "test_case.config"
[CCONF]: ./compile_options.config "compile_options.config"
[RES]: https://users.dcc.uchile.cl/~fmontoto/static/k2.pdf "k2.pdf"
2 changes: 2 additions & 0 deletions benchmark/k2_trees/bin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
2 changes: 2 additions & 0 deletions benchmark/k2_trees/compile_options.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Compile options
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG
23 changes: 23 additions & 0 deletions benchmark/k2_trees/k2tree.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# This file specifies k^2 trees that are used in the benchmark.
#
# Each index is specified by a triple: K2_ID;SDSL_TYPE;K2_LATEX_NAME
# * K2_ID : An identifier for the index. Only letters, digits and underscores are allowed in ID.
# * SDSL_TYPE : Corresponding sdsl type.
# * K2_LATEX_NAME: LaTeX name for output in the benchmark report. No underscores are allowed here.

# Different k and Bit-Vectors:
K2_BV;k2_tree<2, bit_vector, bit_vector::rank_1_type>;K2BV
# K3_BV;k2_tree<3, bit_vector, bit_vector::rank_1_type>;K3BV
# K4_BV;k2_tree<4, bit_vector, bit_vector::rank_1_type>;K4BV
# K2_RRR63V;k2_tree<2, rrr_vector<63>, rrr_vector<63>::rank_1_type>;K2RRR63V
# K3_RRR63V;k2_tree<3, rrr_vector<63>, rrr_vector<63>::rank_1_type>;K3RRR63V
# K4_RRR63V;k2_tree<4, rrr_vector<63>, rrr_vector<63>::rank_1_type>;K4RRR63V
# K2_RRR126V;k2_tree<2, rrr_vector<126>, rrr_vector<126>::rank_1_type>;K2RRR126V
# K3_RRR126V;k2_tree<3, rrr_vector<126>, rrr_vector<126>::rank_1_type>;K3RRR126V
# K4_RRR126V;k2_tree<4, rrr_vector<126>, rrr_vector<126>::rank_1_type>;K4RRR126V
K2_ILV;k2_tree<2, bit_vector_il<512>, bit_vector_il<512>::rank_1_type>;K2ILV
# K3_ILV;k2_tree<3, bit_vector_il<512>, bit_vector_il<512>::rank_1_type>;K3ILV
# K4_ILV;k2_tree<4, bit_vector_il<512>, bit_vector_il<512>::rank_1_type>;K4ILV
K2_SDV;k2_tree<2, sd_vector<bit_vector>, sd_vector<bit_vector>::rank_1_type>;K2SDV
# K3_SDV;k2_tree<3, sd_vector<bit_vector>, sd_vector<bit_vector>::rank_1_type>;K3SDV
# K4_SDV;k2_tree<4, sd_vector<bit_vector>, sd_vector<bit_vector>::rank_1_type>;K4SDV
2 changes: 2 additions & 0 deletions benchmark/k2_trees/results/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
*
!.gitignore
65 changes: 65 additions & 0 deletions benchmark/k2_trees/src/gen_k2t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#include <iostream>
#include <fstream>
#include <tuple>
#include <sdsl/bit_vectors.hpp>
#include <sdsl/k2_tree.hpp>
#include <stdexcept>
#include <string>
#include <vector>

using namespace sdsl;

/**
 * Convert an arc file into two sdsl int_vector files.
 *
 * Every line of the input must start with two integers: the first goes into the
 * "x" vector and the second into the "y" vector, in line order. The file is read
 * twice: once to count the lines (so the vectors can be sized up front) and once
 * to parse them.
 *
 * @param idx_file       Path of the arc file to read.
 * @param output_x_file  Path the vector of first columns is stored to.
 * @param output_y_file  Path the vector of second columns is stored to.
 *
 * @throws std::runtime_error     if the input cannot be opened or an output
 *                                cannot be written.
 * @throws std::invalid_argument  if a line does not start with two integers.
 */
void generate_bit_vector_buffers(const std::string& idx_file,
                                 const std::string& output_x_file,
                                 const std::string& output_y_file)
{
    std::ifstream infile(idx_file);
    // Without this check an unreadable input would be treated as an empty graph.
    if (!infile.is_open())
        throw std::runtime_error("Cannot open input file: " + idx_file);

    std::string line;
    uint64_t cnt = 0;

    // First pass: count the lines.
    while (std::getline(infile, line))
        ++cnt;

    // The first pass left the stream in a failed/eof state; reset it before rewinding.
    infile.clear();
    infile.seekg(0, std::ios::beg);
    // Set size of vector to the amount of lines in the input file.
    int_vector<> xv(cnt), yv(cnt);
    cnt = 0;

    // Second pass: parse the two columns of every line.
    while (std::getline(infile, line)) {
        sdsl::k2_tree_ns::idx_type x, y;
        std::istringstream iss(line);
        if (!(iss >> x >> y))
            throw std::invalid_argument("Not expected line at construct");
        xv[cnt] = x;
        yv[cnt++] = y;
    }

    if (!store_to_file(xv, output_x_file))
        throw std::runtime_error("Cannot write output file: " + output_x_file);
    if (!store_to_file(yv, output_y_file))
        throw std::runtime_error("Cannot write output file: " + output_y_file);
}

/// Returns true when a file called `name` can be opened for reading.
inline bool exists(const std::string& name) {
    // The temporary stream is closed again at the end of the full expression.
    return std::ifstream(name.c_str()).good();
}

/**
 * Build a k2 tree of type K2_TYPE from an arc file and serialize it.
 *
 * Arguments:
 *   argv[1]  input arc file
 *   argv[2]  prefix of the intermediate files "<prefix>.x" and "<prefix>.y"
 *   argv[3]  file the serialized k2 tree is written to
 *
 * The intermediate files are only generated when at least one of them is
 * missing; otherwise the existing ones are reused.
 *
 * @return 0 on success, 1 on wrong usage or when the output cannot be opened.
 */
int main(int argc, char* argv[])
{
    if (argc < 4) {
        std::cout << "Usage: input_file output_file_prefix output_k2_file" << std::endl;
        // Stop here: argv[2] and argv[3] must not be read when they are absent.
        return 1;
    }

    const std::string prefix(argv[2]);
    const std::string out_x = prefix + ".x";
    const std::string out_y = prefix + ".y";

    if (!exists(out_x) || !exists(out_y))
        generate_bit_vector_buffers(argv[1], out_x, out_y);

    // K2_TYPE is constructed from the prefix of the intermediate files.
    K2_TYPE k2(argv[2]);

    std::ofstream fs(argv[3]);
    if (!fs) {
        std::cerr << "Cannot open output file: " << argv[3] << std::endl;
        return 1;
    }
    k2.serialize(fs);
    return 0;
}
90 changes: 90 additions & 0 deletions benchmark/k2_trees/src/k2_time_and_space.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#include<chrono>
#include<fstream>
#include<iostream>
#include<sstream>
#include<string>

#include<sdsl/bit_vectors.hpp>
#include<sdsl/k2_tree.hpp>

using namespace std;
using namespace sdsl;
using namespace std::chrono;
// Clock used for every time measurement in this benchmark.
using timer = std::chrono::high_resolution_clock;

// K2_TYPE is not defined in this file: the benchmark Makefile supplies it on the
// compiler command line (-DK2_TYPE="...").
typedef K2_TYPE::idx_type idx_type;
typedef K2_TYPE::size_type size_type;


/**
 * Calls tree.adj(node, neighbor) `times` times.
 *
 * @return How many of the calls returned true.
 */
template<class t_kt>
size_type test_adj(const t_kt &tree, idx_type node, size_type neighbor,
                   uint64_t times)
{
    size_type hits = 0;
    for (uint64_t remaining = times; remaining != 0; --remaining) {
        if (tree.adj(node, neighbor)) {
            ++hits;
        }
    }
    return hits;
}

/**
 * Calls tree.neigh(node) `times` times.
 *
 * @return The sum of the sizes of all returned results.
 */
template<class t_kt>
size_type test_neighbors(const t_kt &tree, idx_type node, uint64_t times)
{
    size_type total = 0;
    for (uint64_t remaining = times; remaining != 0; --remaining) {
        total += tree.neigh(node).size();
    }
    return total;
}

/**
 * Calls tree.reverse_neigh(node) `times` times.
 *
 * @return The sum of the sizes of all returned results.
 */
template<class t_kt>
size_type test_reverse_neighbors(const t_kt &tree, idx_type node, uint64_t times)
{
    size_type total = 0;
    for (uint64_t remaining = times; remaining != 0; --remaining) {
        total += tree.reverse_neigh(node).size();
    }
    return total;
}

int main(int argc, char* argv[])
{
if (argc < 2) {
cout << "Usage: file" << endl;
return 1;
}

const uint64_t reps = 100000;

// construct
memory_monitor::start();
std::ifstream is(argv[1]);
auto start = timer::now();
K2_TYPE k2;
k2.load(is);
auto stop = timer::now();
memory_monitor::stop();
cout << "# constructs_time = " << duration_cast<milliseconds>(stop-start).count()/(double)1000 << endl;
cout << "# constructs_space = " << memory_monitor::peak() << endl;
// size
cout << "# k2_size = " << size_in_bytes(k2) << endl;
is.close();

// adj
start = timer::now();
auto check = test_adj<K2_TYPE>(k2, 7, 5, reps);
stop = timer::now();
cout << "# adj_time = " << duration_cast<microseconds>(stop-start).count()/(double)reps << endl;
cout << "# adj_check = " << check << endl;

// neighbors
start = timer::now();
check = test_neighbors<K2_TYPE>(k2, 7, reps);
stop = timer::now();
cout << "# neighbors_time = " << duration_cast<microseconds>(stop-start).count()/(double)reps << endl;
cout << "# neighbors_check = " << check << endl;

start = timer::now();
check = test_reverse_neighbors<K2_TYPE>(k2, 10, reps);
stop = timer::now();
cout << "# reverse_neighbors_time = " << duration_cast<microseconds>(stop-start).count()/(double)reps << endl;
cout << "# reverse_neighbors_check = " << check << endl;

return 0;
}
7 changes: 7 additions & 0 deletions benchmark/k2_trees/test_case.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Configuration for test files
# (1) Identifier for test file (consisting of letters, no `.`)
# (2) Path to the test file
# (3) LaTeX name
# (4) Download link (if the test is available online)
EXAMPLE;../data/example_arcs;examples;http://webdatacommons.org/hyperlinkgraph/data/example_arcs
HOSTGRAPH;../data/hostgraph.arc;hostgraph;http://users.dcc.uchile.cl/~fmontoto/static/hostgraph.arc.gz
6 changes: 6 additions & 0 deletions benchmark/k2_trees/visualize/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
*
!.gitignore
!Makefile
!k2-footer.tex
!k2-header.tex
!k2.R
Loading

0 comments on commit 86c5386

Please sign in to comment.