diff --git a/huffman/src/table.hpp b/huffman/src/table.hpp index 779071d..fe10c1c 100644 --- a/huffman/src/table.hpp +++ b/huffman/src/table.hpp @@ -66,6 +66,11 @@ constexpr static auto find_node_if(I first, I last, P pred) /// `std::array` is used to store the Huffman tree, with the size determined by /// `Extent`. /// +/// This type uses "DEFLATE" canonical form for codes: +/// * All codes of a given bit length have lexicographically consecutive +/// values, in the same order as the symbols they represent; +/// * Shorter codes lexicographically precede longer codes. +/// template class table { @@ -155,6 +160,60 @@ class table } } + /// Update table code values to DEFLATE canonical form + /// + /// The Huffman codes used for each alphabet in the "deflate" format have two + /// additional rules: + /// * All codes of a given bit length have lexicographically consecutive + /// values, in the same order as the symbols they represent; + /// * Shorter codes lexicographically precede longer codes. + /// + /// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951 + /// + /// @note This method is called in all constructors except for the + /// table-contents constructor. + /// + constexpr auto canonicalize() & -> table& + { + using value_type = decltype(std::declval().value()); + + // set lexicographical order + std::ranges::sort( // + table_, // + [](const auto& x, const auto& y) { + return std::pair{x.bitsize(), std::ref(x.symbol)} < + std::pair{y.bitsize(), std::ref(y.symbol)}; + }); + + // used to determine initial value of next_code[bits] + // calculated in step 2 + auto base_code = value_type{}; + + // used in determining consecutive code values in step 3 + auto next_code = code{}; + + // clang-format off + for (auto& elem : table_) { + assert(next_code.bitsize() <= elem.bitsize()); + + next_code = { + elem.bitsize(), + next_code.bitsize() == elem.bitsize() + ? next_code.value() + value_type{1} // 3) next_code[len]++; + : base_code <<= (elem.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1; + }; + + static_cast(elem) = next_code; // 3) tree[n].Code = next_code[len]; + + ++base_code; // 2) (code + bl_count[bits-1]) + } + // clang-format on + + set_skip_fields(); + + return *this; + } + public: /// Symbol type /// @@ -191,7 +250,7 @@ class table : table_{detail::frequency_tag{}, frequencies, eot} { construct_table(); - set_skip_fields(); + canonicalize(); } template @@ -229,7 +288,7 @@ class table : table_{detail::data_tag{}, data, eot} { construct_table(); - set_skip_fields(); + canonicalize(); } template @@ -242,11 +301,10 @@ class table /// Constructs a `table` from the given code-symbol mapping contents /// @tparam R sized-range of code-symbol 2-tuples /// @pre all `code` and `symbol` values container in mapping are unique - /// @pre `code` values are prefix free + /// @pre `code` values are specified in DEFLATE canonical form + /// @pre `code` and `symbol` values are provided in lexicographical order /// - /// Construct a `table` with explicit contents. This constructor avoids - /// generation of prefix-free codes for symbols and assumes that the provided - /// codes have been generated correctly. + /// Construct a `table` with explicit contents. /// /// @{ @@ -260,6 +318,24 @@ class table constexpr table(table_contents_tag, const R& map) : table_{table_contents, map} { + assert( + std::ranges::is_sorted( + map, + [](const auto& x, const auto& y) { + const auto x_value = std::get(x).value(); + const auto y_value = std::get(y).value(); + + const auto x_bitsize = std::get(x).bitsize(); + const auto y_bitsize = std::get(y).bitsize(); + + const auto x_symbol = std::get(x); + const auto y_symbol = std::get(y); + + return (x_value < y_value) and + ((x_bitsize < y_bitsize) or + ((x_bitsize == y_bitsize) and (x_symbol < y_symbol))); + }) and + "table contents are not provided in DEFLATE canonical form"); set_skip_fields(); } @@ -302,8 +378,10 @@ class table /// Returns an iterator to the first `encoding` /// - /// @note elements are ordered by code bitsize. If multiple elements have the - /// same code bitsize, the order is unspecified. + /// @note + /// * All codes of a given bit length have lexicographically consecutive + /// values, in the same order as the symbols they represent; + /// * Shorter codes lexicographically precede longer codes. /// [[nodiscard]] constexpr auto begin() const -> const_iterator @@ -381,66 +459,6 @@ class table } return os; } - - /// Update table code values to DEFLATE canonical form - /// - /// The Huffman codes used for each alphabet in the "deflate" format have two - /// additional rules: - /// * All codes of a given bit length have lexicographically consecutive - /// values, in the same order as the symbols they represent; - /// * Shorter codes lexicographically precede longer codes. - /// - /// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951 - /// - /// @{ - - constexpr auto canonicalize() & -> table& - { - using value_type = decltype(std::declval().value()); - - // set lexicographical order - std::ranges::sort( // - table_, // - [](const auto& x, const auto& y) { - return std::pair{x.bitsize(), std::ref(x.symbol)} < - std::pair{y.bitsize(), std::ref(y.symbol)}; - }); - - // used to determine initial value of next_code[bits] - // calculated in step 2 - auto base_code = value_type{}; - - // used in determining consecutive code values in step 3 - auto next_code = code{}; - - // clang-format off - for (auto& elem : table_) { - assert(next_code.bitsize() <= elem.bitsize()); - - next_code = { - elem.bitsize(), - next_code.bitsize() == elem.bitsize() - ? next_code.value() + value_type{1} // 3) next_code[len]++; - : base_code <<= (elem.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1; - }; - - static_cast(elem) = next_code; // 3) tree[n].Code = next_code[len]; - - ++base_code; // 2) (code + bl_count[bits-1]) - } - // clang-format on - - set_skip_fields(); - - return *this; - } - - constexpr auto canonicalize() && -> table&& - { - return std::move(canonicalize()); - } - - /// @} }; namespace detail { diff --git a/huffman/test/BUILD.bazel b/huffman/test/BUILD.bazel index ac63579..c2554c2 100644 --- a/huffman/test/BUILD.bazel +++ b/huffman/test/BUILD.bazel @@ -20,16 +20,6 @@ cc_test( ], ) -cc_test( - name = "table_canonicalize_test", - timeout = "short", - srcs = ["table_canonicalize_test.cpp"], - deps = [ - "//:boost_ut", - "//huffman", - ], -) - cc_test( name = "table_find_code_test", timeout = "short", diff --git a/huffman/test/decode_test.cpp b/huffman/test/decode_test.cpp index b7e3d7c..01b4985 100644 --- a/huffman/test/decode_test.cpp +++ b/huffman/test/decode_test.cpp @@ -26,6 +26,7 @@ auto main() -> int namespace huffman = ::starflate::huffman; using namespace huffman::literals; + // FIXME table contents are not in canonical form test("basic") = [] { // encoded data from soxofaan/dahuffman readme.rst. // We reverse the bits in each byte to match the encoding used in DEFLATE. @@ -45,15 +46,17 @@ auto main() -> int {01_c, 'i'}, {001_c, 'n'}, {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, eot}} + {00000_c, eot}, + {00001_c, 'x'} + } }; // clang-format on constexpr std::array expected = { 'e', 'x', 'e', 'n', 'e', 'e', 'e', 'e', 'x', 'n', 'i', 'q', 'n', 'e', 'i', 'e', 'i', 'n', 'i', eot, }; - constexpr auto output_buf = [&] { + + const auto output_buf = [&] { std::array output_buf{}; auto result = decode(code_table, encoded_bytes, output_buf.begin()); // result should point to the back of output_buf. @@ -63,6 +66,6 @@ auto main() -> int return output_buf; }(); - static_assert(output_buf == expected); + expect(output_buf == expected); }; } diff --git a/huffman/test/table_canonicalize_test.cpp b/huffman/test/table_canonicalize_test.cpp deleted file mode 100644 index 025db64..0000000 --- a/huffman/test/table_canonicalize_test.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include "huffman/huffman.hpp" - -#include - -#include -#include - -auto main() -> int -{ - using ::boost::ut::expect; - using ::boost::ut::test; - - namespace huffman = ::starflate::huffman; - using namespace huffman::literals; - - test("table with DEFLATE canonical code, example 1") = [] { - static constexpr auto actual = // clang-format off - huffman::table{ - huffman::table_contents, - {std::pair{00_c, 'A'}, - {1_c, 'B'}, - {011_c, 'C'}, - {010_c, 'D'}}}.canonicalize(); - // clang-format on - - static constexpr auto expected = // clang-format off - huffman::table{ - huffman::table_contents, - {std::pair{0_c, 'B'}, - {10_c, 'A'}, - {110_c, 'C'}, - {111_c, 'D'}}}; - // clang-format on - - expect(std::ranges::equal(actual, expected)); - }; - - test("table with DEFLATE canonical code, example 2") = [] { - // NOTE: t1 is an *invalid* table (as initially specified) because - // some codes are prefixes of others. - static constexpr auto actual = // clang-format off - huffman::table{ - huffman::table_contents, - {std::pair{000_c, 'A'}, - {001_c, 'B'}, - {010_c, 'C'}, - {011_c, 'D'}, - {100_c, 'E'}, - {11_c, 'F'}, - {0111_c, 'G'}, - {1111_c, 'H'}}}.canonicalize(); - // clang-format on - - static constexpr auto expected = // clang-format off - huffman::table{ - huffman::table_contents, - {std::pair{00_c, 'F'}, - {010_c, 'A'}, - {011_c, 'B'}, - {100_c, 'C'}, - {101_c, 'D'}, - {110_c, 'E'}, - {1110_c, 'G'}, - {1111_c, 'H'}}}; - // clang-format on - - expect(std::ranges::equal(actual, expected)); - }; - - test("canonicalization is idempotent") = [] { - static constexpr auto t1 = // clang-format off - huffman::table{ - huffman::table_contents, - {std::pair{00_c, 'F'}, - {010_c, 'A'}, - {011_c, 'B'}, - {100_c, 'C'}, - {101_c, 'D'}, - {110_c, 'E'}, - {1110_c, 'G'}, - {1111_c, 'H'}}}; - // clang-format on - - auto t2 = t1; - t2.canonicalize(); - - expect(std::ranges::equal(t1, t2)); - expect(std::ranges::equal(t1, t2.canonicalize())); - }; - - test("canonicalize invocable on empty table") = [] { - static constexpr auto actual = huffman::table{}.canonicalize(); - auto expected = huffman::table{}; - - expect(std::ranges::equal(actual, expected)) << actual << '\n' << expected; - }; - - test("canonicalize invocable on single element table") = [] { - static constexpr auto actual = huffman::table{ - huffman::table_contents, - {std::pair{0_c, 'A'}}}.canonicalize(); - auto expected = huffman::table{std::array{'A'}, std::nullopt}; - - expect(std::ranges::equal(actual, expected)) << actual << '\n' << expected; - }; -} diff --git a/huffman/test/table_find_code_test.cpp b/huffman/test/table_find_code_test.cpp index 31308cf..c2e5a96 100644 --- a/huffman/test/table_find_code_test.cpp +++ b/huffman/test/table_find_code_test.cpp @@ -16,77 +16,78 @@ auto main() -> int static constexpr auto table1 = // clang-format off huffman::table{ huffman::table_contents, - {std::pair{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}} + {std::pair{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}} }; // clang-format on test("finds code in table") = [] { - static_assert('e' == table1.find(1_c).value()->symbol); - static_assert('i' == table1.find(01_c).value()->symbol); - static_assert('n' == table1.find(001_c).value()->symbol); - static_assert('q' == table1.find(0001_c).value()->symbol); - static_assert('x' == table1.find(00001_c).value()->symbol); - static_assert('\4' == table1.find(00000_c).value()->symbol); + static_assert('e' == table1.find(0_c).value()->symbol); + static_assert('i' == table1.find(10_c).value()->symbol); + static_assert('n' == table1.find(110_c).value()->symbol); + static_assert('q' == table1.find(1110_c).value()->symbol); + static_assert('\4' == table1.find(11110_c).value()->symbol); + static_assert('x' == table1.find(11111_c).value()->symbol); }; // bitsize values we compare against are derived from the code // NOLINTBEGIN(readability-magic-numbers) test("code not in table but valid prefix") = [] { - static_assert(table1.find(0_c).error()->symbol == 'i'); - static_assert(table1.find(0_c).error()->bitsize() == 2); + static_assert(table1.find(1_c).error()->symbol == 'i'); + static_assert(table1.find(1_c).error()->bitsize() == 2); - static_assert(table1.find(00_c).error()->symbol == 'n'); - static_assert(table1.find(00_c).error()->bitsize() == 3); + static_assert(table1.find(11_c).error()->symbol == 'n'); + static_assert(table1.find(11_c).error()->bitsize() == 3); - static_assert(table1.find(000_c).error()->symbol == 'q'); - static_assert(table1.find(000_c).error()->bitsize() == 4); + static_assert(table1.find(111_c).error()->symbol == 'q'); + static_assert(table1.find(111_c).error()->bitsize() == 4); - // ordering of elements with the same bitsize is unspecified - static_assert(table1.find(0000_c).error()->bitsize() == 5); + static_assert(table1.find(1111_c).error()->symbol == '\4'); + static_assert(table1.find(1111_c).error()->bitsize() == 5); }; test("code not in table but valid prefix, using explicit pos") = [] { - constexpr auto pos1 = table1.find(0_c).error(); + constexpr auto pos1 = table1.find(1_c).error(); static_assert(pos1->symbol == 'i'); static_assert(pos1->bitsize() == 2); - constexpr auto pos2 = table1.find(00_c, pos1).error(); + constexpr auto pos2 = table1.find(11_c, pos1).error(); static_assert(pos2->symbol == 'n'); static_assert(pos2->bitsize() == 3); - constexpr auto pos3 = table1.find(000_c, pos2).error(); + constexpr auto pos3 = table1.find(111_c, pos2).error(); static_assert(pos3->symbol == 'q'); static_assert(pos3->bitsize() == 4); - // ordering of elements with the same bitsize is unspecified - static_assert(table1.find(0000_c, pos3).error()->bitsize() == 5); + static_assert(table1.find(1111_c, pos3).error()->symbol == '\4'); + static_assert(table1.find(1111_c, pos3).error()->bitsize() == 5); }; // NOLINTEND(readability-magic-numbers) test("code bitsize exceeds all codes in table") = [] { static_assert(table1.find(000000_c).error() == table1.end()); + static_assert(table1.find(111111_c).error() == table1.end()); }; test("code is smaller than any code in table") = [] { static constexpr auto table = // clang-format off huffman::table{ - huffman::table_contents, - {std::pair{11_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}} - }; + huffman::table_contents, + {std::pair{00_c, 'e'}, + {110_c, 'i'}, + {1110_c, 'n'}, + {11110_c, 'q'}, + {111110_c, '\4'}, + {111111_c, 'x'}}}; // clang-format on + static_assert(table.find(0_c).error() == table.begin()); static_assert(table.find(1_c).error() == table.begin()); }; } diff --git a/huffman/test/table_from_contents_test.cpp b/huffman/test/table_from_contents_test.cpp index 792186f..9c171a1 100644 --- a/huffman/test/table_from_contents_test.cpp +++ b/huffman/test/table_from_contents_test.cpp @@ -20,12 +20,12 @@ auto main() -> int const auto t = huffman::table{ // clang-format off huffman::table_contents, - {{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + {{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format on auto ss = std::stringstream{}; @@ -33,12 +33,12 @@ auto main() -> int constexpr auto table = "Bits\tCode\tValue\tSymbol\n" - "1\t1\t1\t`e`\n" - "2\t01\t1\t`i`\n" - "3\t001\t1\t`n`\n" - "4\t0001\t1\t`q`\n" - "5\t00001\t1\t`x`\n" - "5\t00000\t0\t`\4`\n"; + "1\t0\t0\t`e`\n" + "2\t10\t2\t`i`\n" + "3\t110\t6\t`n`\n" + "4\t1110\t14\t`q`\n" + "5\t11110\t30\t`\4`\n" + "5\t11111\t31\t`x`\n"; expect(table == ss.str()) << ss.str(); }; @@ -49,24 +49,24 @@ auto main() -> int const auto t1 = // clang-format off huffman::table{ huffman::table_contents, - {{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + {{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format on const auto t2 = // clang-format off huffman::table{ huffman::table_contents, std::vector{ - std::tuple{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + std::tuple{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format on expect(std::ranges::equal(t1, t2)); @@ -77,24 +77,24 @@ auto main() -> int const auto t1 = // clang-format off huffman::table{ huffman::table_contents, - {std::pair{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + {std::pair{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format on const auto t2 = // clang-format off huffman::table{ huffman::table_contents, std::vector{ - std::tuple{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + std::tuple{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format off expect(std::ranges::equal(t1, t2)); @@ -104,23 +104,23 @@ auto main() -> int static constexpr auto t1 = // clang-format off huffman::table{ huffman::table_contents, - {std::pair{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + {std::pair{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format on static constexpr auto t2 = // clang-format off huffman::table{ huffman::table_contents, - {{1_c, 'e'}, - {01_c, 'i'}, - {001_c, 'n'}, - {0001_c, 'q'}, - {00001_c, 'x'}, - {00000_c, '\4'}}}; + {{0_c, 'e'}, + {10_c, 'i'}, + {110_c, 'n'}, + {1110_c, 'q'}, + {11110_c, '\4'}, + {11111_c, 'x'}}}; // clang-format on expect(std::ranges::equal(t1, t2)); diff --git a/huffman/test/table_from_frequencies_test.cpp b/huffman/test/table_from_frequencies_test.cpp index 237af36..ad4268e 100644 --- a/huffman/test/table_from_frequencies_test.cpp +++ b/huffman/test/table_from_frequencies_test.cpp @@ -49,7 +49,9 @@ class Country auto main() -> int { - using namespace ::boost::ut; + using ::boost::ut::aborts; + using ::boost::ut::expect; + using ::boost::ut::test; namespace huffman = ::starflate::huffman; @@ -61,12 +63,12 @@ auto main() -> int constexpr auto table = "Bits\tCode\tValue\tSymbol\n" - "1\t1\t1\t`e`\n" - "2\t01\t1\t`i`\n" - "3\t001\t1\t`n`\n" - "4\t0001\t1\t`q`\n" - "5\t00001\t1\t`x`\n" - "5\t00000\t0\t`\4`\n"; + "1\t0\t0\t`e`\n" + "2\t10\t2\t`i`\n" + "3\t110\t6\t`n`\n" + "4\t1110\t14\t`q`\n" + "5\t11110\t30\t`\4`\n" + "5\t11111\t31\t`x`\n"; auto ss = std::stringstream{}; ss << huffman::table{frequencies, eot}; @@ -80,13 +82,14 @@ auto main() -> int const auto table = huffman::table{frequencies}; - using E = huffman::encoding; + using namespace huffman::literals; + using encoding = huffman::encoding; - expect(E{"FR", {1, 1}} == table.begin()[0]); - expect(E{"IT", {2, 1}} == table.begin()[1]); - expect(E{"UK", {3, 1}} == table.begin()[2]); - expect(E{"DE", {4, 1}} == table.begin()[3]); - expect(E{"BE", {4, 0}} == table.begin()[4]); + expect(encoding{"FR", 0_c} == table.begin()[0]); + expect(encoding{"IT", 10_c} == table.begin()[1]); + expect(encoding{"UK", 110_c} == table.begin()[2]); + expect(encoding{"BE", 1110_c} == table.begin()[3]); + expect(encoding{"DE", 1111_c} == table.begin()[4]); }; test("code table can be statically sized") = [] { @@ -140,19 +143,25 @@ auto main() -> int }; test("code table constructible in constant expression context") = [] { - static constexpr auto frequencies = std::array< - std::pair, - 5>{{{'e', 100}, {'n', 20}, {'x', 1}, {'i', 40}, {'q', 3}}}; + static constexpr auto frequencies = // clang-format off + std::array, 5>{{ + {'e', 100}, + {'n', 20}, + {'x', 1}, + {'i', 40}, + {'q', 3}}}; + // clang-format on static constexpr auto table = huffman::table{frequencies}; - using E = huffman::encoding; + using namespace huffman::literals; + using huffman::encoding; - static_assert(E{'e', {1, 1}} == table.begin()[0]); - static_assert(E{'i', {2, 1}} == table.begin()[1]); - static_assert(E{'n', {3, 1}} == table.begin()[2]); - static_assert(E{'q', {4, 1}} == table.begin()[3]); - static_assert(E{'x', {4, 0}} == table.begin()[4]); + static_assert(encoding{'e', 0_c} == table.begin()[0]); + static_assert(encoding{'i', 10_c} == table.begin()[1]); + static_assert(encoding{'n', 110_c} == table.begin()[2]); + static_assert(encoding{'q', 1110_c} == table.begin()[3]); + static_assert(encoding{'x', 1111_c} == table.begin()[4]); }; test("code table constructible in constant expression context with deduced "