Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

define huffman::table::canonicalize() #88

Merged
merged 1 commit into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions huffman/src/detail/table_storage.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ class table_storage : table_storage_base_t<IntrusiveNode, Extent>

using const_iterator = typename base_type::const_iterator;

/// Default constructor (compiler-generated)
///
table_storage() = default;

template <class R>
constexpr table_storage(
frequency_tag, const R& frequencies, std::optional<symbol_type> eot)
Expand Down
73 changes: 73 additions & 0 deletions huffman/src/table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@ class table

constexpr auto construct_table() -> void
{
using size_type = decltype(table_.size());

if (table_.size() == size_type{1}) {
using namespace huffman::literals;
static_cast<code&>(table_.front()) = 0_c;
return;
}

std::ranges::sort(table_);

assert(
Expand Down Expand Up @@ -99,6 +107,10 @@ class table
typename detail::table_storage<node_type, Extent>::const_iterator>,
encoding<Symbol>>;

/// Constructs an empty table
///
table() = default;

/// Constructs a `table` from a symbol-frequency mapping
/// @tparam R sized-range of symbol-frequency 2-tuples
/// @param frequencies mapping with symbol frequencies
Expand Down Expand Up @@ -266,6 +278,67 @@ class table
}
return os;
}

/// Update table code values to DEFLATE canonical form
///
/// The Huffman codes used for each alphabet in the "deflate" format have two
/// additional rules:
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
/// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951
///
/// @{

/// Lvalue overload: rewrites the code of every element of this table in place
///
/// Elements are first ordered by (bit length, symbol); consecutive code values
/// are then assigned following the algorithm of RFC 1951 section 3.2.2. Code
/// bit lengths are never changed, only code values.
///
/// @return reference to `*this`
///
constexpr auto canonicalize() & -> table&
{
  using value_type = decltype(std::declval<code>().value());

  // elements are stored in reverse order, so we maintain that order
  auto reversed = std::views::reverse(table_);

  // set lexicographical order: shorter codes first, ties broken by symbol
  //
  // The fields are compared directly rather than through
  // `std::pair{bitsize, std::ref(symbol)}`: `std::reference_wrapper` has no
  // relational operators before C++26, so comparing such pairs only compiles
  // when `<` is reachable through the wrapper's implicit conversion. That
  // excludes symbol types whose `operator<` is a template (e.g.
  // `std::string`). The resulting ordering is the same.
  std::ranges::sort(
      reversed,
      [](const auto& x, const auto& y) -> bool {
        if (x.bitsize() != y.bitsize()) {
          return x.bitsize() < y.bitsize();
        }
        return x.symbol < y.symbol;
      });

  // used to determine initial value of next_code[bits]
  // calculated in step 2
  //
  // incremented once per visited element and left-shifted by the growth in
  // bit length whenever a longer code is reached
  auto base_code = value_type{};

  // used in determining consecutive code values in step 3
  //
  // starts as a default-constructed `code`, so the first element always takes
  // the "bit length changed" branch below
  auto next_code = code{};

  // clang-format off
  for (auto& n : reversed) {
    // guaranteed by the sort above: bit lengths are non-decreasing
    assert(next_code.bitsize() <= n.bitsize());

    next_code = {
      n.bitsize(),
      next_code.bitsize() == n.bitsize()
        ? next_code.value() + value_type{1}                  // 3) next_code[len]++;
        : base_code <<= (n.bitsize() - next_code.bitsize())  // 2) next_code[bits] = code; code = (...) << 1;
    };

    // overwrite only the `code` part of the element; the symbol is untouched
    static_cast<code&>(n) = next_code;  // 3) tree[n].Code = next_code[len];

    ++base_code;  // 2) (code + bl_count[bits-1])
  }
  // clang-format on

  return *this;
}

/// Rvalue overload: canonicalizes a temporary table and hands it back as an
/// rvalue so the result can be moved from
///
constexpr auto canonicalize() && -> table&&
{
  // `*this` is an lvalue expression inside a member function, so this call
  // resolves to the `&`-qualified overload (no recursion)
  table& self = this->canonicalize();
  return std::move(self);
}

/// @}
};

namespace detail {
Expand Down
10 changes: 10 additions & 0 deletions huffman/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,16 @@ cc_test(
],
)

# Unit tests for `huffman::table::canonicalize()`, built from
# table_canonicalize_test.cpp against the //huffman library and Boost.UT.
cc_test(
name = "table_canonicalize_test",
timeout = "short",
srcs = ["table_canonicalize_test.cpp"],
deps = [
"//huffman",
"@boost_ut",
],
)

cc_test(
name = "table_from_data_test",
timeout = "short",
Expand Down
106 changes: 106 additions & 0 deletions huffman/test/table_canonicalize_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#include "huffman/huffman.hpp"

#include <boost/ut.hpp>

#include <algorithm>
#include <utility>

// Test driver for `huffman::table::canonicalize()`.
//
// Each case is registered with Boost.UT via `test(...) = [] { ... }` and
// compares a canonicalized table against an explicitly specified expected
// table, element by element, using `std::ranges::equal`.
auto main() -> int
{
using ::boost::ut::expect;
using ::boost::ut::test;

namespace huffman = ::starflate::huffman;
using namespace huffman::literals;

// Codes of length 1, 2, 3 and 3 bits: the code values are reassigned while
// the bit length of each symbol and the element order stay the same.
test("table with DEFLATE canonical code, example 1") = [] {
static constexpr auto actual = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{010_c, 'D'},
{011_c, 'C'},
{00_c, 'A'},
{1_c, 'B'}}}.canonicalize();
// clang-format on

static constexpr auto expected = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{111_c, 'D'},
{110_c, 'C'},
{10_c, 'A'},
{0_c, 'B'}}};
// clang-format on

expect(std::ranges::equal(actual, expected));
};

// Symbols A-H with code lengths (3, 3, 3, 3, 3, 2, 4, 4).
test("table with DEFLATE canonical code, example 2") = [] {
// NOTE: the table passed to `canonicalize()` is an *invalid* table (as
// initially specified) because some codes are prefixes of others
// (e.g. `11` is a prefix of `1111`, `011` is a prefix of `0111`).
static constexpr auto actual = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{1111_c, 'H'},
{0111_c, 'G'},
{100_c, 'E'},
{011_c, 'D'},
{010_c, 'C'},
{001_c, 'B'},
{000_c, 'A'},
{11_c, 'F'}}}.canonicalize();
// clang-format on

static constexpr auto expected = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{1111_c, 'H'},
{1110_c, 'G'},
{110_c, 'E'},
{101_c, 'D'},
{100_c, 'C'},
{011_c, 'B'},
{010_c, 'A'},
{00_c, 'F'}}};
// clang-format on

expect(std::ranges::equal(actual, expected));
};

// `t1` holds the same contents as the expected table of example 2, i.e. it is
// already in canonical form; canonicalizing a copy once or twice must leave
// it equal to `t1`.
test("canonicalization is idempotent") = [] {
static constexpr auto t1 = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{1111_c, 'H'},
{1110_c, 'G'},
{110_c, 'E'},
{101_c, 'D'},
{100_c, 'C'},
{011_c, 'B'},
{010_c, 'A'},
{00_c, 'F'}}};
// clang-format on

auto t2 = t1;
// lvalue overload, result intentionally discarded
t2.canonicalize();

expect(std::ranges::equal(t1, t2));
// second canonicalization of the same object
expect(std::ranges::equal(t1, t2.canonicalize()));
};

// Exercises the rvalue overload on a default-constructed `table<char, 0>` in
// a constant expression.
test("canonicalize invocable on empty table") = [] {
static constexpr auto actual = huffman::table<char, 0>{}.canonicalize();
auto expected = huffman::table<char, 0>{};

// both tables are streamed into the failure message
expect(std::ranges::equal(actual, expected)) << actual << '\n' << expected;
};

// A one-element table given explicitly as `0_c` is compared against a
// `table<char, 1>` constructed from the single symbol 'A'.
test("canonicalize invocable on single element table") = [] {
static constexpr auto actual = huffman::table{
huffman::table_contents,
{std::pair{0_c, 'A'}}}.canonicalize();
// NOTE(review): `std::array` and `std::nullopt` are used here, but this file
// does not include <array> or <optional> directly - presumably they arrive
// transitively through "huffman/huffman.hpp"; consider including them.
auto expected = huffman::table<char, 1>{std::array{'A'}, std::nullopt};

expect(std::ranges::equal(actual, expected)) << actual << '\n' << expected;
};
}