Skip to content

Commit

Permalink
tuning the wording and adding a spaceusage benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed Jan 22, 2025
1 parent 5e2aff2 commit cb64259
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 7 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,9 @@ For serialization, there is a choice between an unpacked and a packed format.
The unpacked format is roughly the same size as the in-core data, but uses the most
efficient memory copy operations.
The packed format avoids storing zero bytes and is considered near optimal (it
can not be compressed further by zlib and its required space is very close to
the theoretical lower limit), but it needs to copy individual words, so it
should be expected to be somewhat slower.
The packed format avoids storing zero bytes and relies on a bitset to locate them, so it
should be expected to be somewhat slower. The packed format might be smaller or larger than the unpacked format.
When in doubt, prefer the regular (unpacked) format.
The two formats use slightly different APIs.
Expand All @@ -77,11 +76,13 @@ You may serialize and deserialize in unpacked format as follows:
free(buffer);
```

This should be the default.

To serialize and deserialize in packed format, use the `_pack_bytes()`,
`_pack()` and `_unpack()` functions. The latter two have an additional `size_t`
argument for the buffer length. `_pack()` can be used with a buffer of arbitrary
size; it returns the used space if the serialization fits into the buffer, or 0
otherwise.
otherwise. Note that the packed format will be slower and may not save space.

For example:

Expand Down
3 changes: 3 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# Benchmark executable built from bench.c, linked against xor_singleheader.
add_executable(bench bench.c)
target_link_libraries(bench PUBLIC xor_singleheader)

# Space-usage report executable built from spaceusage.c, linked against
# xor_singleheader.
add_executable(spaceusage spaceusage.c)
target_link_libraries(spaceusage PUBLIC xor_singleheader)
119 changes: 119 additions & 0 deletions benchmarks/spaceusage.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include "binaryfusefilter.h"
#include "xorfilter.h"
#include <iso646.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Serialized footprint of one filter, in bytes, for both on-disk formats. */
typedef struct {
  size_t standard; /* byte count reported by *_serialization_bytes() */
  size_t pack;     /* byte count reported by *_pack_bytes() */
} sizes;

sizes fuse16(size_t n) {
binary_fuse16_t filter = {0};
if (! binary_fuse16_allocate(n, &filter)) {
printf("allocation failed\n");
return (sizes) {0, 0};
}
uint64_t* big_set = malloc(n * sizeof(uint64_t));
for(size_t i = 0; i < n; i++) {
big_set[i] = i;
}
bool is_ok = binary_fuse16_populate(big_set, n, &filter);
if(! is_ok ) {
printf("populating failed\n");
}
free(big_set);
sizes s = {
.standard = binary_fuse16_serialization_bytes(&filter),
.pack = binary_fuse16_pack_bytes(&filter)
};
binary_fuse16_free(&filter);
return s;
}

sizes fuse8(size_t n) {
binary_fuse8_t filter = {0};
if (! binary_fuse8_allocate(n, &filter)) {
printf("allocation failed\n");
return (sizes) {0, 0};
}
uint64_t* big_set = malloc(n * sizeof(uint64_t));
for(size_t i = 0; i < n; i++) {
big_set[i] = i;
}
bool is_ok = binary_fuse8_populate(big_set, n, &filter);
if(! is_ok ) {
printf("populating failed\n");
}
free(big_set);
sizes s = {
.standard = binary_fuse8_serialization_bytes(&filter),
.pack = binary_fuse8_pack_bytes(&filter)
};
binary_fuse8_free(&filter);
return s;
}

sizes xor16(size_t n) {
xor16_t filter = {0};
if (! xor16_allocate(n, &filter)) {
printf("allocation failed\n");
return (sizes) {0, 0};
}
uint64_t* big_set = malloc(n * sizeof(uint64_t));
for(size_t i = 0; i < n; i++) {
big_set[i] = i;
}
bool is_ok = xor16_populate(big_set, n, &filter);
if(! is_ok ) {
printf("populating failed\n");
}
free(big_set);
sizes s = {
.standard = xor16_serialization_bytes(&filter),
.pack = xor16_pack_bytes(&filter)
};
xor16_free(&filter);
return s;
}

sizes xor8(size_t n) {
xor8_t filter = {0};
if (! xor8_allocate(n, &filter)) {
printf("allocation failed\n");
return (sizes) {0, 0};
}
uint64_t* big_set = malloc(n * sizeof(uint64_t));
for(size_t i = 0; i < n; i++) {
big_set[i] = i;
}
bool is_ok = xor8_populate(big_set, n, &filter);
if(! is_ok ) {
printf("populating failed\n");
}
free(big_set);
sizes s = {
.standard = xor8_serialization_bytes(&filter),
.pack = xor8_pack_bytes(&filter)
};
xor8_free(&filter);

return s;
}

int main() {
for (size_t n = 10; n <= 10000000; n *= 2) {
printf("%-10zu ", n); // Align number to 10 characters wide
sizes f16 = fuse16(n);
sizes f8 = fuse8(n);
sizes x16 = xor16(n);
sizes x8 = xor8(n);

printf("fuse16: %5.2f %5.2f ", (double)f16.standard * 8.0 / n, (double)f16.pack * 8.0 / n);
printf("fuse8: %5.2f %5.2f ", (double)f8.standard * 8.0 / n, (double)f8.pack * 8.0 / n);
printf("xor16: %5.2f %5.2f ", (double)x16.standard * 8.0 / n, (double)x16.pack * 8.0 / n);
printf("xor8: %5.2f %5.2f ", (double)x8.standard * 8.0 / n, (double)x8.pack * 8.0 / n);
printf("\n");
}
return EXIT_SUCCESS;
}
4 changes: 2 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ else() # *nix
-Wall -Wextra -Wshadow -Wcast-qual -Wconversion -Wsign-conversion -Werror)

if (NOT MINGW) # sanitizers are not supported under mingw
list(APPEND TEST_COMPILE_OPTIONS -fsanitize=address,undefined,leak)
list(APPEND TEST_COMPILE_OPTIONS -fsanitize=address,undefined)
# sanitizers need to be specified at link time as well
target_link_options(unit PRIVATE -fsanitize=address,leak,undefined)
target_link_options(unit PRIVATE -fsanitize=address,undefined)
endif()
endif()

Expand Down

0 comments on commit cb64259

Please sign in to comment.