From 6445e7372d9b748ec100b6aa30035f7bc1276833 Mon Sep 17 00:00:00 2001 From: Ka Ho Ng Date: Sun, 19 Nov 2023 23:14:09 -0500 Subject: [PATCH] Add cista::*::cstring type This new type can store a trailing \0 character without sacrificing a byte of inline storage when the string is short enough for the small-string representation. NUL characters embedded within the data are also supported. This is inspired by https://github.com/felixguendling/cista/issues/187#issuecomment-1603128882. See https://github.com/felixguendling/cista/issues/187. --- include/cista/containers.h | 1 + include/cista/containers/cstring.h | 460 +++++++++++++++++++++++++++++ include/cista/hashing.h | 1 + include/cista/serialization.h | 34 +++ test/cstring_serialize_test.cc | 30 ++ test/cstring_test.cc | 85 ++++++ 6 files changed, 611 insertions(+) create mode 100644 include/cista/containers/cstring.h create mode 100644 test/cstring_serialize_test.cc create mode 100644 test/cstring_test.cc diff --git a/include/cista/containers.h b/include/cista/containers.h index ef5c177a..5c1bc0e5 100644 --- a/include/cista/containers.h +++ b/include/cista/containers.h @@ -3,6 +3,7 @@ #include "cista/containers/array.h" #include "cista/containers/bitset.h" #include "cista/containers/bitvec.h" +#include "cista/containers/cstring.h" #include "cista/containers/fws_multimap.h" #include "cista/containers/hash_map.h" #include "cista/containers/hash_set.h" diff --git a/include/cista/containers/cstring.h b/include/cista/containers/cstring.h new file mode 100644 index 00000000..ce7803a5 --- /dev/null +++ b/include/cista/containers/cstring.h @@ -0,0 +1,460 @@ +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include "cista/containers/ptr.h" +#include "cista/type_traits.h" + +namespace cista { + +// This class is a generic string container that stores an extra \0 byte after +// the last byte of the valid data. This makes sure the pointer returned by +// data() can be passed as a C-string. 
+// +// The content stored within this container can contain binary data, that is, +// any number of \0 bytes is permitted within [data(), data() + size()). +template +struct generic_cstring { + using msize_t = std::uint32_t; + using value_type = char; + + static msize_t mstrlen(char const* s) noexcept { + return static_cast(std::strlen(s)); + } + + static constexpr struct owning_t { + } owning{}; + static constexpr struct non_owning_t { + } non_owning{}; + + constexpr generic_cstring() noexcept {} + ~generic_cstring() noexcept { reset(); } + + generic_cstring(std::string_view s, owning_t const) { set_owning(s); } + generic_cstring(std::string_view s, non_owning_t const) { set_non_owning(s); } + generic_cstring(std::string const& s, owning_t const) { set_owning(s); } + generic_cstring(std::string const& s, non_owning_t const) { + set_non_owning(s); + } + generic_cstring(char const* s, owning_t const) { + set_owning(s); + } + generic_cstring(char const* s, non_owning_t const) { set_non_owning(s); } + + char* begin() noexcept { return data(); } + char* end() noexcept { return data() + size(); } + char const* begin() const noexcept { return data(); } + char const* end() const noexcept { return data() + size(); } + + friend char const* begin(generic_cstring const& s) { return s.begin(); } + friend char* begin(generic_cstring& s) { return s.begin(); } + friend char const* end(generic_cstring const& s) { return s.end(); } + friend char* end(generic_cstring& s) { return s.end(); } + + bool is_short() const noexcept { return s_.remaining_ >= 0; } + + bool is_owning() const { return is_short() || h_.self_allocated_; } + + void reset() noexcept { + if (!is_short() && h_.self_allocated_) { + std::free(data()); + } + s_ = stack{}; + } + + void set_owning(std::string const& s) { + set_owning(s.data(), static_cast(s.size())); + } + + void set_owning(std::string_view s) { + set_owning(s.data(), static_cast(s.size())); + } + + void set_owning(char const* str) { + set_owning(str, 
mstrlen(str)); + } + + static constexpr msize_t short_length_limit = 15U; + + void set_owning(char const* str, msize_t const len) { + reset(); + if (str == nullptr || len == 0U) { + return; + } + s_.remaining_ = static_cast( + std::max(static_cast(short_length_limit - len), -1)); + if (is_short()) { + std::memcpy(s_.s_, str, len); + } else { + h_ = heap(len, owning); + std::memcpy(data(), str, len); + } + } + + void set_non_owning(std::string const& v) { + set_non_owning(v.data(), static_cast(v.size())); + } + + void set_non_owning(std::string_view v) { + set_non_owning(v.data(), static_cast(v.size())); + } + + void set_non_owning(char const* str) { + set_non_owning(str, str != nullptr ? mstrlen(str) : 0); + } + + void set_non_owning(char const* str, msize_t const len) { + reset(); + h_ = heap(str, len, non_owning); + } + + void move_from(generic_cstring&& s) noexcept { + std::memcpy(static_cast(this), &s, sizeof(*this)); + if constexpr (std::is_pointer_v) { + std::memset(static_cast(&s), 0, sizeof(*this)); + } else if (!s.is_short()) { + h_.ptr_ = s.h_.ptr_; + s.s_ = stack{}; + } + } + + void copy_from(generic_cstring const& s) { + reset(); + if (s.is_short()) { + std::memcpy(static_cast(this), &s, sizeof(s)); + } else if (s.h_.self_allocated_) { + set_owning(s.data(), s.size()); + } else { + set_non_owning(s.data(), s.size()); + } + } + + bool empty() const noexcept { return size() == 0U; } + std::string_view view() const noexcept { return {data(), size()}; } + std::string str() const { return {data(), size()}; } + + operator std::string_view() const { return view(); } + + char& operator[](std::size_t const i) noexcept { return data()[i]; } + char const& operator[](std::size_t const i) const noexcept { + return data()[i]; + } + + friend std::ostream& operator<<(std::ostream& out, generic_cstring const& s) { + return out << s.view(); + } + + friend bool operator==(generic_cstring const& a, + generic_cstring const& b) noexcept { + return a.view() == b.view(); + } + 
+ friend bool operator!=(generic_cstring const& a, + generic_cstring const& b) noexcept { + return a.view() != b.view(); + } + + friend bool operator<(generic_cstring const& a, + generic_cstring const& b) noexcept { + return a.view() < b.view(); + } + + friend bool operator>(generic_cstring const& a, + generic_cstring const& b) noexcept { + return a.view() > b.view(); + } + + friend bool operator<=(generic_cstring const& a, + generic_cstring const& b) noexcept { + return a.view() <= b.view(); + } + + friend bool operator>=(generic_cstring const& a, + generic_cstring const& b) noexcept { + return a.view() >= b.view(); + } + + friend bool operator==(generic_cstring const& a, + std::string_view b) noexcept { + return a.view() == b; + } + + friend bool operator!=(generic_cstring const& a, + std::string_view b) noexcept { + return a.view() != b; + } + + friend bool operator<(generic_cstring const& a, std::string_view b) noexcept { + return a.view() < b; + } + + friend bool operator>(generic_cstring const& a, std::string_view b) noexcept { + return a.view() > b; + } + + friend bool operator<=(generic_cstring const& a, + std::string_view b) noexcept { + return a.view() <= b; + } + + friend bool operator>=(generic_cstring const& a, + std::string_view b) noexcept { + return a.view() >= b; + } + + friend bool operator==(std::string_view a, + generic_cstring const& b) noexcept { + return a == b.view(); + } + + friend bool operator!=(std::string_view a, + generic_cstring const& b) noexcept { + return a != b.view(); + } + + friend bool operator<(std::string_view a, generic_cstring const& b) noexcept { + return a < b.view(); + } + + friend bool operator>(std::string_view a, generic_cstring const& b) noexcept { + return a > b.view(); + } + + friend bool operator<=(std::string_view a, + generic_cstring const& b) noexcept { + return a <= b.view(); + } + + friend bool operator>=(std::string_view a, + generic_cstring const& b) noexcept { + return a >= b.view(); + } + + friend bool 
operator==(generic_cstring const& a, char const* b) noexcept { + return a.view() == std::string_view{b}; + } + + friend bool operator!=(generic_cstring const& a, char const* b) noexcept { + return a.view() != std::string_view{b}; + } + + friend bool operator<(generic_cstring const& a, char const* b) noexcept { + return a.view() < std::string_view{b}; + } + + friend bool operator>(generic_cstring const& a, char const* b) noexcept { + return a.view() > std::string_view{b}; + } + + friend bool operator<=(generic_cstring const& a, char const* b) noexcept { + return a.view() <= std::string_view{b}; + } + + friend bool operator>=(generic_cstring const& a, char const* b) noexcept { + return a.view() >= std::string_view{b}; + } + + friend bool operator==(char const* a, generic_cstring const& b) noexcept { + return std::string_view{a} == b.view(); + } + + friend bool operator!=(char const* a, generic_cstring const& b) noexcept { + return std::string_view{a} != b.view(); + } + + friend bool operator<(char const* a, generic_cstring const& b) noexcept { + return std::string_view{a} < b.view(); + } + + friend bool operator>(char const* a, generic_cstring const& b) noexcept { + return std::string_view{a} > b.view(); + } + + friend bool operator<=(char const* a, generic_cstring const& b) noexcept { + return std::string_view{a} <= b.view(); + } + + friend bool operator>=(char const* a, generic_cstring const& b) noexcept { + return std::string_view{a} >= b.view(); + } + + char const* internal_data() const noexcept { + if constexpr (std::is_pointer_v) { + return is_short() ? s_.s_ : h_.ptr_; + } else { + return is_short() ? s_.s_ : h_.ptr_.get(); + } + } + + char* data() noexcept { return const_cast(internal_data()); } + char const* data() const noexcept { return internal_data(); } + + msize_t size() const noexcept { return is_short() ? 
s_.size() : h_.size(); } + + struct heap { + Ptr ptr_{nullptr}; + std::uint32_t size_{0}; + bool self_allocated_{false}; + char __fill__[sizeof(uintptr_t) == 8 ? 2 : 6]{0}; + int8_t minus_one_{-1}; // The offset of this field needs to match the + // offset of stack::remaining_ below. + + heap() = default; + heap(msize_t len, owning_t) { + char* mem = static_cast(std::malloc(len + 1)); + if (mem == nullptr) { + throw std::bad_alloc{}; + } + mem[len] = '\0'; + ptr_ = mem; + size_ = len; + self_allocated_ = true; + } + heap(Ptr ptr, msize_t len, non_owning_t) { + ptr_ = ptr; + size_ = len; + } + + msize_t size() const { return size_; } + }; + + struct stack { + char s_[short_length_limit]{0}; + int8_t remaining_{ + short_length_limit}; // The remaining capacity the inline buffer still + // has. A negative value indicates the buffer is + // not inline. In case the inline buffer is fully + // occupied, this field also serves as a null + // terminator. + + msize_t size() const { + assert(remaining_ >= 0); + return short_length_limit - static_cast(remaining_); + } + }; + + union { + heap h_; + stack s_{}; + }; +}; + +template +struct basic_cstring : public generic_cstring { + using base = generic_cstring; + + using base::base; + using base::operator std::string_view; + + friend std::ostream& operator<<(std::ostream& out, basic_cstring const& s) { + return out << s.view(); + } + + explicit operator std::string() const { return {base::data(), base::size()}; } + + basic_cstring(std::string_view s) : base{s, base::owning} {} + basic_cstring(std::string const& s) : base{s, base::owning} {} + basic_cstring(char const* s) : base{s, base::owning} {} + basic_cstring(char const* s, typename base::msize_t const len) + : base{s, len, base::owning} {} + + basic_cstring(basic_cstring const& o) : base{o.view(), base::owning} {} + basic_cstring(basic_cstring&& o) { base::move_from(std::move(o)); } + + basic_cstring& operator=(basic_cstring const& o) { + base::set_owning(o.data(), 
o.size()); + return *this; + } + + basic_cstring& operator=(basic_cstring&& o) { + base::move_from(std::move(o)); + return *this; + } + + basic_cstring& operator=(char const* s) { + base::set_owning(s); + return *this; + } + basic_cstring& operator=(std::string const& s) { + base::set_owning(s); + return *this; + } + basic_cstring& operator=(std::string_view s) { + base::set_owning(s); + return *this; + } +}; + +template +struct basic_cstring_view : public generic_cstring { + using base = generic_cstring; + + using base::base; + using base::operator std::string_view; + + friend std::ostream& operator<<(std::ostream& out, + basic_cstring_view const& s) { + return out << s.view(); + } + + basic_cstring_view(std::string_view s) : base{s, base::non_owning} {} + basic_cstring_view(std::string const& s) : base{s, base::non_owning} {} + basic_cstring_view(char const* s) : base{s, base::non_owning} {} + basic_cstring_view(char const* s, typename base::msize_t const len) + : base{s, len, base::non_owning} {} + + basic_cstring_view(basic_cstring_view const& o) { + base::set_non_owning(o.data(), o.size()); + } + basic_cstring_view(basic_cstring_view&& o) { + base::set_non_owning(o.data(), o.size()); + } + basic_cstring_view& operator=(basic_cstring_view const& o) { + base::set_non_owning(o.data(), o.size()); + return *this; + } + basic_cstring_view& operator=(basic_cstring_view&& o) { + base::set_non_owning(o.data(), o.size()); + return *this; + } + + basic_cstring_view& operator=(char const* s) { + base::set_non_owning(s); + return *this; + } + basic_cstring_view& operator=(std::string_view s) { + base::set_non_owning(s); + return *this; + } + basic_cstring_view& operator=(std::string const& s) { + base::set_non_owning(s); + return *this; + } +}; + +template +struct is_string_helper> : std::true_type {}; + +template +struct is_string_helper> : std::true_type {}; + +template +struct is_string_helper> : std::true_type {}; + +namespace raw { +using generic_cstring = 
generic_cstring>; +using cstring = basic_cstring>; +} // namespace raw + +namespace offset { +using generic_cstring = generic_cstring>; +using cstring = basic_cstring>; +} // namespace offset + +} // namespace cista \ No newline at end of file diff --git a/include/cista/hashing.h b/include/cista/hashing.h index 63faa642..effc9868 100644 --- a/include/cista/hashing.h +++ b/include/cista/hashing.h @@ -8,6 +8,7 @@ #include #include +#include "cista/containers/cstring.h" #include "cista/containers/offset_ptr.h" #include "cista/containers/pair.h" #include "cista/containers/string.h" diff --git a/include/cista/serialization.h b/include/cista/serialization.h index fd9a4b19..1617e6fc 100644 --- a/include/cista/serialization.h +++ b/include/cista/serialization.h @@ -260,6 +260,35 @@ void serialize(Ctx& c, pos + cista_member_offset(Type, element_count_)); } +template +void serialize(Ctx& c, generic_cstring const* origin, offset_t const pos) { + using Type = generic_cstring; + + if (origin->is_short()) { + return; + } + + const auto* data = origin->data(); + auto size = origin->size(); + std::string buf; + if (!origin->is_owning()) { + buf = origin->str(); + data = buf.data(); + size = buf.size(); + } + auto capacity = size + 1; + + auto const start = c.write(data, capacity); + c.write(pos + cista_member_offset(Type, h_.ptr_), + convert_endian(start - cista_member_offset(Type, h_.ptr_) - + pos)); + c.write(pos + cista_member_offset(Type, h_.size_), + convert_endian(origin->h_.size_)); + c.write(pos + cista_member_offset(Type, h_.self_allocated_), false); + c.write(pos + cista_member_offset(Type, h_.minus_one_), + static_cast(-1)); +} + template void serialize(Ctx& c, basic_string const* origin, offset_t const pos) { serialize(c, static_cast const*>(origin), pos); @@ -271,6 +300,11 @@ void serialize(Ctx& c, basic_string_view const* origin, serialize(c, static_cast const*>(origin), pos); } +template +void serialize(Ctx& c, basic_cstring const* origin, offset_t const pos) { + 
serialize(c, static_cast const*>(origin), pos); +} + template void serialize(Ctx& c, basic_unique_ptr const* origin, offset_t const pos) { diff --git a/test/cstring_serialize_test.cc b/test/cstring_serialize_test.cc new file mode 100644 index 00000000..7c3e91e7 --- /dev/null +++ b/test/cstring_serialize_test.cc @@ -0,0 +1,30 @@ +#include "doctest.h" + +#ifdef SINGLE_HEADER +#include "cista.h" +#else +#include "cista/serialization.h" +#endif + +constexpr auto const LONG_STR = "aaahelloworldtestaaa"; +constexpr auto const SHORT_STR = "ahelloworldtest"; + +TEST_CASE("cstring serialization long_str") { + cista::raw::string s = LONG_STR; + + cista::byte_buf buf = cista::serialize(s); + auto const serialized = + cista::deserialize(&buf[0], &buf[0] + buf.size()); + CHECK(*serialized == std::string_view{LONG_STR}); +} + +TEST_CASE("cstring serialization short_str") { + cista::raw::string s = SHORT_STR; + + cista::byte_buf buf = cista::serialize(s); + CHECK(buf.size() == sizeof(cista::raw::string)); + + auto const serialized = + cista::deserialize(&buf[0], &buf[0] + buf.size()); + CHECK(*serialized == std::string_view{SHORT_STR}); +} \ No newline at end of file diff --git a/test/cstring_test.cc b/test/cstring_test.cc new file mode 100644 index 00000000..fed12cd8 --- /dev/null +++ b/test/cstring_test.cc @@ -0,0 +1,85 @@ +#include + +#include "doctest.h" + +#ifdef SINGLE_HEADER +#include "cista.h" +#else +#include "cista/containers/cstring.h" +#include "cista/hash.h" +#endif + +using cista::raw::cstring; + +constexpr auto const CORNER_CASE_SHORT_14 = "01234567891234"; +constexpr auto const CORNER_CASE_SHORT_15 = "012345678912345"; +constexpr auto const CORNER_CASE_LONG_16 = "0123456789123456"; +constexpr auto const LONG_STR = "hello world hello world"; +constexpr auto const SHORT_STR = "hello world"; + +TEST_CASE("cstring init") { + auto s = cstring{}; + CHECK(s.is_short()); + CHECK(s.size() == 0); + CHECK(s.data() != nullptr); +} + +TEST_CASE("cstring long short corner 14") 
{ + auto s = cstring{CORNER_CASE_SHORT_14, cstring::owning}; + CHECK(s.is_short()); + CHECK(s.size() == std::strlen(CORNER_CASE_SHORT_14)); + CHECK(s.view() == CORNER_CASE_SHORT_14); +} + +TEST_CASE("cstring long short corner 15") { + auto s = cstring{CORNER_CASE_SHORT_15, cstring::owning}; + CHECK(s.is_short()); + CHECK(s.size() == std::strlen(CORNER_CASE_SHORT_15)); + CHECK(s.view() == CORNER_CASE_SHORT_15); +} + +TEST_CASE("cstring long short corner 16") { + auto s = cstring{CORNER_CASE_LONG_16, cstring::owning}; + CHECK(!s.is_short()); + CHECK(s.size() == std::strlen(CORNER_CASE_LONG_16)); + CHECK(s.view() == CORNER_CASE_LONG_16); +} + +TEST_CASE("cstring long short") { + auto s = cstring{SHORT_STR, cstring::owning}; + CHECK(s.view() == SHORT_STR); + CHECK(s.is_short()); + + s.set_owning(CORNER_CASE_LONG_16); + CHECK(!s.is_short()); + CHECK(s.view() == CORNER_CASE_LONG_16); + + s.set_owning(LONG_STR); + CHECK(!s.is_short()); + CHECK(s.view() == LONG_STR); +} + +TEST_CASE("cstring dealloc long to short") { + cstring s = "one two"; + CHECK(s.size() == std::strlen("one two")); + CHECK(s.is_short()); + s.set_non_owning(""); +} + +TEST_CASE("cstring copy assign and copy construct") { + auto s0 = cstring{LONG_STR, cstring::owning}; + auto s1 = cstring{s0}; + CHECK(s0 == s1); + CHECK(s1.view() == LONG_STR); + + cstring s2; + s2 = s0; + CHECK(s0 == s2); + CHECK(s2.view() == LONG_STR); +} + +TEST_CASE("cstring hash") { + auto str = cstring{""}; + auto h = cista::hash(str, cista::BASE_HASH); + CHECK(cista::BASE_HASH == h); +}