diff --git a/README.md b/README.md index f582d311..eb5ee927 100644 --- a/README.md +++ b/README.md @@ -410,10 +410,17 @@ int main() { // we can write a bitmap to a pointer and recover it later uint32_t expectedsize = roaring_bitmap_portable_size_in_bytes(r1); char *serializedbytes = malloc(expectedsize); + // When serializing data to a file, we recommend that you also use + // checksums so that, at deserialization, you can be confident + // that you are recovering the correct data. roaring_bitmap_portable_serialize(r1, serializedbytes); // Note: it is expected that the input follows the specification // https://github.com/RoaringBitmap/RoaringFormatSpec // otherwise the result may be unusable. + // The 'roaring_bitmap_portable_deserialize_safe' function will not read + // beyond expectedsize bytes. + // We recommend you further use checksums to make sure that the input is from + // serialized data. roaring_bitmap_t *t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize); if(t == NULL) { return EXIT_FAILURE; } const char *reason = NULL; @@ -428,7 +435,11 @@ int main() { roaring_bitmap_portable_deserialize_size(serializedbytes, expectedsize); assert(sizeofbitmap == expectedsize); // sizeofbitmap would be zero if no bitmap were found - // we can also read the bitmap "safely" by specifying a byte size limit: + // We can also read the bitmap "safely" by specifying a byte size limit. + // The 'roaring_bitmap_portable_deserialize_safe' function will not read + // beyond expectedsize bytes. + // We recommend you further use checksums to make sure that the input is from + // serialized data. t = roaring_bitmap_portable_deserialize_safe(serializedbytes, expectedsize); if(t == NULL) { printf("Problem during deserialization.\n"); @@ -500,6 +511,8 @@ We also support efficient 64-bit compressed bitmaps in C: roaring64_bitmap_free(r2); ``` +The API is similar to the conventional 32-bit bitmaps. Please see +the header file `roaring64.h` (compare with `roaring.h`). # Conventional bitsets (C) diff --git a/include/roaring/roaring.h b/include/roaring/roaring.h index 295bc3cb..4256ad0c 100644 --- a/include/roaring/roaring.h +++ b/include/roaring/roaring.h @@ -552,6 +552,10 @@ size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ size_t roaring_bitmap_serialize(const roaring_bitmap_t *r, char *buf); @@ -615,7 +619,10 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); * https://github.com/RoaringBitmap/RoaringFormatSpec * * The function itself is safe in the sense that it will not cause buffer - * overflows. However, for correct operations, it is assumed that the bitmap + * overflows: it will not read beyond the scope of the provided buffer + * (buf,maxbytes). + * + * However, for correct operations, it is assumed that the bitmap * read was once serialized from a valid bitmap (i.e., it follows the format * specification). If you provided an incorrect input (garbage), then the bitmap * read may not be in a valid state and following operations may not lead to @@ -625,8 +632,10 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf); * but not for random inputs. * * You may use roaring_bitmap_internal_validate to check the validity of the - * bitmap prior to using it. You may also use other strategies to check for - * corrupted inputs (e.g., checksums). + * bitmap prior to using it. + * + * We recommend that you use checksums to check that serialized data corresponds + * to a serialized bitmap. * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not @@ -688,6 +697,10 @@ size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *r, char *buf); @@ -722,6 +735,10 @@ size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *r, char *buf); diff --git a/include/roaring/roaring64.h b/include/roaring/roaring64.h index fea6ef90..c1f574d6 100644 --- a/include/roaring/roaring64.h +++ b/include/roaring/roaring64.h @@ -511,6 +511,10 @@ size_t roaring64_bitmap_portable_size_in_bytes(const roaring64_bitmap_t *r); * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. + * + * When serializing data to a file, we recommend that you also use + * checksums so that, at deserialization, you can be confident + * that you are recovering the correct data. */ size_t roaring64_bitmap_portable_serialize(const roaring64_bitmap_t *r, char *buf); @@ -525,14 +529,17 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes); /** - * Read a bitmap from a serialized buffer safely (reading up to maxbytes). + * Read a bitmap from a serialized buffer (reading up to maxbytes). * In case of failure, NULL is returned. * * This is meant to be compatible with other languages * https://github.com/RoaringBitmap/RoaringFormatSpec#extension-for-64-bit-implementations * * The function itself is safe in the sense that it will not cause buffer - * overflows. However, for correct operations, it is assumed that the bitmap + * overflows: it will not read beyond the scope of the provided buffer + * (buf,maxbytes). + * + * However, for correct operations, it is assumed that the bitmap * read was once serialized from a valid bitmap (i.e., it follows the format * specification). If you provided an incorrect input (garbage), then the bitmap * read may not be in a valid state and following operations may not lead to @@ -541,6 +548,12 @@ size_t roaring64_bitmap_portable_deserialize_size(const char *buf, * order. This is is guaranteed to happen when serializing an existing bitmap, * but not for random inputs. * + * You may use roaring64_bitmap_internal_validate to check the validity of the + * bitmap prior to using it. + * + * We recommend that you use checksums to check that serialized data corresponds + * to a serialized bitmap. + * * This function is endian-sensitive. If you have a big-endian system (e.g., a * mainframe IBM s390x), the data format is going to be big-endian and not * compatible with little-endian systems. diff --git a/src/roaring.c b/src/roaring.c index e3847bae..5a71fd39 100644 --- a/src/roaring.c +++ b/src/roaring.c @@ -3319,7 +3319,7 @@ roaring_bitmap_t *roaring_bitmap_portable_deserialize_frozen(const char *buf) { bool roaring_bitmap_to_bitset(const roaring_bitmap_t *r, bitset_t *bitset) { uint32_t max_value = roaring_bitmap_maximum(r); - size_t new_array_size = (size_t)(((uint64_t)max_value + 63) / 64); + size_t new_array_size = (size_t)(max_value / 64 + 1); bool resize_ok = bitset_resize(bitset, new_array_size, true); if (!resize_ok) { return false; diff --git a/tests/roaring64_unit.cpp b/tests/roaring64_unit.cpp index 6d8f2c9a..100796c9 100644 --- a/tests/roaring64_unit.cpp +++ b/tests/roaring64_unit.cpp @@ -1,6 +1,8 @@ +#include #include #include #include +#include #include #include @@ -10,6 +12,15 @@ using namespace roaring::api; +static unsigned int seed = 123456789; +static const int OUR_RAND_MAX = (1 << 30) - 1; +inline static unsigned int +our_rand() { // we do not want to depend on a system-specific + // random number generator + seed = (1103515245 * seed + 12345); + return seed & OUR_RAND_MAX; +} + namespace { void assert_vector_equal(const std::vector& lhs, @@ -37,6 +48,47 @@ void assert_r64_valid(roaring64_bitmap_t* b) { } } +bool deserialization_test(const char* data, size_t size) { + // We test that deserialization never fails. + roaring64_bitmap_t* bitmap = + roaring64_bitmap_portable_deserialize_safe(data, size); + if (bitmap) { + // The bitmap may not be usable if it does not follow the specification. + // We can validate the bitmap we recovered to make sure it is proper. + const char* reason_failure = NULL; + if (roaring64_bitmap_internal_validate(bitmap, &reason_failure)) { + // the bitmap is ok! + uint32_t cardinality = roaring64_bitmap_get_cardinality(bitmap); + + for (uint32_t i = 100; i < 1000; i++) { + if (!roaring64_bitmap_contains(bitmap, i)) { + cardinality++; + roaring64_bitmap_add(bitmap, i); + } + } + + uint32_t new_cardinality = roaring64_bitmap_get_cardinality(bitmap); + if (cardinality != new_cardinality) { + return false; + } + } + roaring64_bitmap_free(bitmap); + } + return true; +} + +DEFINE_TEST(fuzz_deserializer) { + for (size_t i = 0; i < 10000; i++) { + size_t vec_size = our_rand() % 10000; + char *buffer = (char *)malloc(vec_size); + for (size_t j = 0; j < vec_size; j++) { + buffer[j] = our_rand() % 256; + } + deserialization_test(buffer, vec_size); + free(buffer); + } +} + DEFINE_TEST(test_copy) { roaring64_bitmap_t* r1 = roaring64_bitmap_create(); assert_r64_valid(r1); @@ -1879,6 +1931,7 @@ DEFINE_TEST(test_stats) { int main() { const struct CMUnitTest tests[] = { + cmocka_unit_test(fuzz_deserializer), cmocka_unit_test(test_copy), cmocka_unit_test(test_from_range), cmocka_unit_test(test_move_from_roaring32), diff --git a/tests/toplevel_unit.c b/tests/toplevel_unit.c index 40532b73..868188e9 100644 --- a/tests/toplevel_unit.c +++ b/tests/toplevel_unit.c @@ -46,6 +46,18 @@ bool roaring_iterator_sumall(uint32_t value, void *param) { *(uint32_t *)param += value; return true; // continue till the end } + +DEFINE_TEST(issue660) { + roaring_bitmap_t *r1 = roaring_bitmap_create(); + roaring_bitmap_add(r1, 0); + bitset_t *b = bitset_create(); + bool success = + roaring_bitmap_to_bitset(r1, b); // Segfault happens on this line + assert_true(success); + bitset_free(b); + roaring_bitmap_free(r1); +} + DEFINE_TEST(issue457) { roaring_bitmap_t *r1 = roaring_bitmap_from_range(65539, 65541, 1); roaring_bitmap_printf_describe(r1); @@ -1807,7 +1819,6 @@ static roaring_bitmap_t *gen_bitmap(double start_density, double density_gradient, int run_length, int blank_range_start, int blank_range_end, int universe_size) { - srand(2345); roaring_bitmap_t *ans = roaring_bitmap_create(); double d = start_density; @@ -3340,7 +3351,6 @@ DEFINE_TEST(test_inplace_negation_run2) { // TODO it DEFINE_TEST(test_rand_flips) { - srand(1234); const int min_runs = 1; const int flip_trials = 5; // these are expensive tests const int range = 2000000; @@ -3398,7 +3408,6 @@ DEFINE_TEST(test_rand_flips) { // randomized flipping test - inplace version DEFINE_TEST(test_inplace_rand_flips) { - srand(1234); const int min_runs = 1; const int flip_trials = 5; // these are expensive tests const int range = 2000000; @@ -3486,7 +3495,6 @@ DEFINE_TEST(test_flip_run_container_removal2) { // randomized test for rank query DEFINE_TEST(select_test) { - srand(1234); const int min_runs = 1; const uint32_t range = 2000000; char *input = (char *)malloc(range); @@ -4702,6 +4710,18 @@ DEFINE_TEST(robust_deserialization) { assert_true(deserialization_test(test1, sizeof(test1))); } +DEFINE_TEST(fuzz_deserializer) { + for (size_t i = 0; i < 10000; i++) { + size_t vec_size = our_rand() % 10000; + char *buffer = malloc(vec_size); + for (size_t j = 0; j < vec_size; j++) { + buffer[j] = our_rand() % 256; + } + deserialization_test(buffer, vec_size); + free(buffer); + } +} + DEFINE_TEST(issue538) { roaring_bitmap_t *dense = roaring_bitmap_create(); int *values = (int *)malloc(4500 * sizeof(int)); @@ -4785,6 +4805,8 @@ int main() { tellmeall(); const struct CMUnitTest tests[] = { + cmocka_unit_test(fuzz_deserializer), + cmocka_unit_test(issue660), cmocka_unit_test(issue538b), cmocka_unit_test(issue538), cmocka_unit_test(simple_roaring_bitmap_or_many),