From be508eb2b3e2d48c4ec28d5481afd8d5fc017633 Mon Sep 17 00:00:00 2001 From: Oliver Lee Date: Fri, 22 Sep 2023 21:07:51 -0700 Subject: [PATCH] define huffman::table::canonicalize() huffman::table::canonicalize() updates the existing codes in a table to canonical form for DEFLATE: * All codes of a given bit length have lexicographically consecutive values, in the same order as the symbols they represent; * Shorter codes lexicographically precede longer codes. Change-Id: Idc3dceefe1b5d17a54f1dab29155a499c7e1d138 --- huffman/src/detail/table_storage.hpp | 2 + huffman/src/table.hpp | 73 ++++++++++++++++ huffman/test/BUILD.bazel | 10 +++ huffman/test/table_canonicalize_test.cpp | 106 +++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 huffman/test/table_canonicalize_test.cpp diff --git a/huffman/src/detail/table_storage.hpp b/huffman/src/detail/table_storage.hpp index b9928d4..74d1411 100644 --- a/huffman/src/detail/table_storage.hpp +++ b/huffman/src/detail/table_storage.hpp @@ -50,6 +50,8 @@ class table_storage : table_storage_base_t using const_iterator = typename base_type::const_iterator; + table_storage() = default; + template constexpr table_storage( frequency_tag, const R& frequencies, std::optional eot) diff --git a/huffman/src/table.hpp b/huffman/src/table.hpp index cb4cc24..c6e1fce 100644 --- a/huffman/src/table.hpp +++ b/huffman/src/table.hpp @@ -56,6 +56,14 @@ class table constexpr auto construct_table() -> void { + using size_type = decltype(table_.size()); + + if (table_.size() == size_type{1}) { + using namespace huffman::literals; + static_cast(table_.front()) = 0_c; + return; + } + std::ranges::sort(table_); assert( @@ -99,6 +107,10 @@ class table typename detail::table_storage::const_iterator>, encoding>; + /// Constructs an empty table + /// + table() = default; + /// Constructs a `table` from a symbol-frequency mapping /// @tparam R sized-range of symbol-frequency 2-tuples /// @param frequencies mapping with symbol 
frequencies @@ -266,6 +278,67 @@ class table } return os; } + + /// Update table code values to DEFLATE canonical form + /// + /// The Huffman codes used for each alphabet in the "deflate" format have two + /// additional rules: + /// * All codes of a given bit length have lexicographically consecutive + /// values, in the same order as the symbols they represent; + /// * Shorter codes lexicographically precede longer codes. + /// + /// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951 + /// + /// @{ + + constexpr auto canonicalize() & -> table& + { + using value_type = decltype(std::declval().value()); + + // elements are stored in reverse order, so we maintain that order + auto reversed = std::views::reverse(table_); + + // set lexicographical order + std::ranges::sort( // + reversed, // + [](const auto& x, const auto& y) { + return std::pair{x.bitsize(), std::ref(x.symbol)} < + std::pair{y.bitsize(), std::ref(y.symbol)}; + }); + + // used to determine initial value of next_code[bits] + // calculated in step 2 + auto base_code = value_type{}; + + // used in determining consecutive code values in step 3 + auto next_code = code{}; + + // clang-format off + for (auto& n : reversed) { + assert(next_code.bitsize() <= n.bitsize()); + + next_code = { + n.bitsize(), + next_code.bitsize() == n.bitsize() + ? next_code.value() + value_type{1} // 3) next_code[len]++; + : base_code <<= (n.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) 
<< 1; + }; + + static_cast(n) = next_code; // 3) tree[n].Code = next_code[len]; + + ++base_code; // 2) (code + bl_count[bits-1]) + } + // clang-format on + + return *this; + } + + constexpr auto canonicalize() && -> table&& + { + return std::move(canonicalize()); + } + + /// @} }; namespace detail { diff --git a/huffman/test/BUILD.bazel b/huffman/test/BUILD.bazel index 9e61c9b..bc6d00b 100644 --- a/huffman/test/BUILD.bazel +++ b/huffman/test/BUILD.bazel @@ -30,6 +30,16 @@ cc_test( ], ) +cc_test( + name = "table_canonicalize_test", + timeout = "short", + srcs = ["table_canonicalize_test.cpp"], + deps = [ + "//huffman", + "@boost_ut", + ], +) + cc_test( name = "table_from_data_test", timeout = "short", diff --git a/huffman/test/table_canonicalize_test.cpp b/huffman/test/table_canonicalize_test.cpp new file mode 100644 index 0000000..4cbb417 --- /dev/null +++ b/huffman/test/table_canonicalize_test.cpp @@ -0,0 +1,106 @@ +#include "huffman/huffman.hpp" + +#include + +#include +#include + +auto main() -> int +{ + using ::boost::ut::expect; + using ::boost::ut::test; + + namespace huffman = ::starflate::huffman; + using namespace huffman::literals; + + test("table with DEFLATE canonical code, example 1") = [] { + static constexpr auto actual = // clang-format off + huffman::table{ + huffman::table_contents, + {std::pair{010_c, 'D'}, + {011_c, 'C'}, + {00_c, 'A'}, + {1_c, 'B'}}}.canonicalize(); + // clang-format on + + static constexpr auto expected = // clang-format off + huffman::table{ + huffman::table_contents, + {std::pair{111_c, 'D'}, + {110_c, 'C'}, + {10_c, 'A'}, + {0_c, 'B'}}}; + // clang-format on + + expect(std::ranges::equal(actual, expected)); + }; + + test("table with DEFLATE canonical code, example 2") = [] { + // NOTE: the input is an *invalid* table (as initially specified) because + // some codes are prefixes of others. 
+ static constexpr auto actual = // clang-format off + huffman::table{ + huffman::table_contents, + {std::pair{1111_c, 'H'}, + {0111_c, 'G'}, + {100_c, 'E'}, + {011_c, 'D'}, + {010_c, 'C'}, + {001_c, 'B'}, + {000_c, 'A'}, + {11_c, 'F'}}}.canonicalize(); + // clang-format on + + static constexpr auto expected = // clang-format off + huffman::table{ + huffman::table_contents, + {std::pair{1111_c, 'H'}, + {1110_c, 'G'}, + {110_c, 'E'}, + {101_c, 'D'}, + {100_c, 'C'}, + {011_c, 'B'}, + {010_c, 'A'}, + {00_c, 'F'}}}; + // clang-format on + + expect(std::ranges::equal(actual, expected)); + }; + + test("canonicalization is idempotent") = [] { + static constexpr auto t1 = // clang-format off + huffman::table{ + huffman::table_contents, + {std::pair{1111_c, 'H'}, + {1110_c, 'G'}, + {110_c, 'E'}, + {101_c, 'D'}, + {100_c, 'C'}, + {011_c, 'B'}, + {010_c, 'A'}, + {00_c, 'F'}}}; + // clang-format on + + auto t2 = t1; + t2.canonicalize(); + + expect(std::ranges::equal(t1, t2)); + expect(std::ranges::equal(t1, t2.canonicalize())); + }; + + test("canonicalize invocable on empty table") = [] { + static constexpr auto actual = huffman::table{}.canonicalize(); + auto expected = huffman::table{}; + + expect(std::ranges::equal(actual, expected)) << actual << '\n' << expected; + }; + + test("canonicalize invocable on single element table") = [] { + static constexpr auto actual = huffman::table{ + huffman::table_contents, + {std::pair{0_c, 'A'}}}.canonicalize(); + auto expected = huffman::table{std::array{'A'}, std::nullopt}; + + expect(std::ranges::equal(actual, expected)) << actual << '\n' << expected; + }; +}