Skip to content

Commit

Permalink
always use DEFLATE canonical form for huffman::table codes
Browse files Browse the repository at this point in the history
Update huffman::table constructors to always call canonicalize(). As
canonicalize() is now called in the constructor, this method is now
private.

Note that the table_contents constructor is an exception and does not
call canonicalize().

Change-Id: I5597dc51ea5e9f086c71e2f9396fccd5ace923b2
  • Loading branch information
oliverlee authored and garymm committed Oct 13, 2023
1 parent efcadda commit c6c647a
Show file tree
Hide file tree
Showing 7 changed files with 221 additions and 307 deletions.
154 changes: 86 additions & 68 deletions huffman/src/table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ constexpr auto to_code_symbol(const R& rng)
/// `std::array` is used to store the Huffman tree, with the size determined by
/// `Extent`.
///
/// This table type uses "DEFLATE" canonical form for codes:
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
template <symbol Symbol, std::size_t Extent = std::dynamic_extent>
class table
{
Expand Down Expand Up @@ -175,6 +180,60 @@ class table
}
}

/// Rewrite every code stored in the table so that the table is in DEFLATE
/// canonical form.
///
/// DEFLATE places two additional constraints on the Huffman codes of an
/// alphabet:
///   * codes that share a bit length take lexicographically consecutive
///     values, assigned in the order of the symbols they represent;
///   * a shorter code lexicographically precedes every longer code.
///
/// @return reference to this table
///
/// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951
///
/// @note This method is called in all constructors except for the
///   table-contents constructor.
///
constexpr auto canonicalize() & -> table&
{
  using value_type = decltype(std::declval<code>().value());

  // order entries by code length first, ties broken by symbol
  const auto by_bitsize_then_symbol = [](const auto& lhs, const auto& rhs) {
    return std::pair{lhs.bitsize(), std::ref(lhs.symbol)} <
           std::pair{rhs.bitsize(), std::ref(rhs.symbol)};
  };
  std::ranges::sort(table_, by_bitsize_then_symbol);

  // seeds the first code value of each new bit length (step 2 of the RFC
  // algorithm)
  auto base_code = value_type{};

  // code assigned to the previously visited entry (step 3 of the RFC
  // algorithm)
  auto prev_code = code{};

  for (auto& entry : table_) {
    const auto bits = entry.bitsize();

    assert(prev_code.bitsize() <= bits);

    if (prev_code.bitsize() == bits) {
      // 3) next_code[len]++;
      prev_code = {bits, prev_code.value() + value_type{1}};
    } else {
      // 2) next_code[bits] = code; code = (...) << 1;
      base_code <<= (bits - prev_code.bitsize());
      prev_code = {bits, base_code};
    }

    // 3) tree[n].Code = next_code[len];
    static_cast<code&>(entry) = prev_code;

    // 2) (code + bl_count[bits-1])
    ++base_code;
  }

  set_skip_fields();

  return *this;
}

public:
/// Symbol type
///
Expand Down Expand Up @@ -211,7 +270,7 @@ class table
: table_{detail::frequency_tag{}, frequencies, eot}
{
construct_table();
set_skip_fields();
canonicalize();
}

template <std::ranges::sized_range R>
Expand Down Expand Up @@ -249,7 +308,7 @@ class table
: table_{detail::data_tag{}, data, eot}
{
construct_table();
set_skip_fields();
canonicalize();
}

template <std::ranges::input_range R>
Expand All @@ -262,11 +321,10 @@ class table
/// Constructs a `table` from the given code-symbol mapping contents
/// @tparam R sized-range of code-symbol 2-tuples
/// @pre all `code` and `symbol` values contained in mapping are unique
/// @pre `code` values are prefix free
/// @pre `code` values are specified in DEFLATE canonical form
/// @pre `code` and `symbol` values are provided in lexicographical order
///
/// Construct a `table` with explicit contents. This constructor avoids
/// generation of prefix-free codes for symbols and assumes that the provided
/// codes have been generated correctly.
/// Construct a `table` with explicit contents.
///
/// @{

Expand All @@ -280,6 +338,24 @@ class table
constexpr table(table_contents_tag, const R& map)
: table_{table_contents, map}
{
// Debug-only validation of the caller-supplied ordering. The codes are
// stored exactly as given: this constructor does not call canonicalize().
assert(
std::ranges::is_sorted(
map,
// Ordering predicate handed to is_sorted: returns true only when x has the
// smaller code value AND x is either shorter than y or, at equal bit
// length, has the smaller symbol.
// NOTE(review): the value test is and-ed with the length/symbol test, so
// an adjacent pair that violates only one of the two is not reported by
// is_sorted -- confirm this leniency is intended.
[](const auto& x, const auto& y) {
const auto x_value = std::get<code>(x).value();
const auto y_value = std::get<code>(y).value();

const auto x_bitsize = std::get<code>(x).bitsize();
const auto y_bitsize = std::get<code>(y).bitsize();

const auto x_symbol = std::get<symbol_type>(x);
const auto y_symbol = std::get<symbol_type>(y);

return (x_value < y_value) and
((x_bitsize < y_bitsize) or
((x_bitsize == y_bitsize) and (x_symbol < y_symbol)));
}) and
// the string literal is always truthy; it only labels the failure message
"table contents are not provided in DEFLATE canonical form");
set_skip_fields();
}

Expand Down Expand Up @@ -321,8 +397,10 @@ class table

/// Returns an iterator to the first `encoding`
///
/// @note elements are ordered by code bitsize. If multiple elements have the
/// same code bitsize, the order is unspecified.
/// @note
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
[[nodiscard]]
constexpr auto begin() const -> const_iterator
Expand Down Expand Up @@ -400,66 +478,6 @@ class table
}
return os;
}

/// Update table code values to DEFLATE canonical form
///
/// The Huffman codes used for each alphabet in the "deflate" format have two
/// additional rules:
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
/// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951
///
/// @{

constexpr auto canonicalize() & -> table&
{
// integer type returned by `code::value()`
using value_type = decltype(std::declval<code>().value());

// set lexicographical order
// (entries are ordered by code bit length, ties broken by symbol)
std::ranges::sort( //
table_, //
[](const auto& x, const auto& y) {
return std::pair{x.bitsize(), std::ref(x.symbol)} <
std::pair{y.bitsize(), std::ref(y.symbol)};
});

// used to determine initial value of next_code[bits]
// calculated in step 2
auto base_code = value_type{};

// used in determining consecutive code values in step 3
// (holds the code assigned to the previously visited entry)
auto next_code = code{};

// clang-format off
for (auto& elem : table_) {
// entries are visited in the sorted order above, so bit lengths must not
// decrease from one entry to the next
// NOTE(review): on the first iteration this compares against a
// default-constructed code, presumably zero-length -- confirm
assert(next_code.bitsize() <= elem.bitsize());

// same bit length as the previous entry: previous value plus one;
// longer bit length: shift base_code left by the growth in length (this
// modifies base_code) and use the shifted value
next_code = {
elem.bitsize(),
next_code.bitsize() == elem.bitsize()
? next_code.value() + value_type{1} // 3) next_code[len]++;
: base_code <<= (elem.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1;
};

// overwrite the entry's code with the newly computed one
static_cast<code&>(elem) = next_code; // 3) tree[n].Code = next_code[len];

// advance past the value that was just assigned
++base_code; // 2) (code + bl_count[bits-1])
}
// clang-format on

// NOTE(review): set_skip_fields() is defined outside this view; presumably
// its results depend on the entry order and bit lengths changed above --
// confirm
set_skip_fields();

return *this;
}

/// Rvalue overload: canonicalizes this table in place and returns it as an
/// rvalue reference.
constexpr auto canonicalize() && -> table&&
{
  // `*this` is an lvalue inside the member function, so this call selects
  // the `&`-qualified overload
  auto& self = canonicalize();
  return std::move(self);
}

/// @}
};

namespace detail {
Expand Down
10 changes: 0 additions & 10 deletions huffman/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,6 @@ cc_test(
],
)

# Test target built from table_canonicalize_test.cpp; it depends on the
# //huffman target and the @boost_ut external repository.
cc_test(
name = "table_canonicalize_test",
timeout = "short",
srcs = ["table_canonicalize_test.cpp"],
deps = [
"//huffman",
"@boost_ut",
],
)

cc_test(
name = "table_find_code_test",
timeout = "short",
Expand Down
42 changes: 22 additions & 20 deletions huffman/test/decode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,34 +16,36 @@ auto main() -> int
using namespace huffman::literals;

test("basic") = [] {
// encoded data from dahuffman readme.rst, but in hex.
constexpr std::array encoded_bytes = {
std::byte{0x86},
std::byte{0x7c},
std::byte{0x25},
std::byte{0x13},
std::byte{0x69},
std::byte{0x40}};

constexpr char eot = {'\4'};
constexpr auto encoded_bytes = std::array{
std::byte{0b0111'1101},
std::byte{0b1000'0011},
std::byte{0b1111'1010},
std::byte{0b1110'1100},
std::byte{0b1001'0110},
std::byte{0b1101'1110}};

constexpr auto eot = char{'\4'};
static constexpr auto code_table = // clang-format off
huffman::table{
huffman::table_contents,
{std::pair{1_c, 'e'},
{01_c, 'i'},
{001_c, 'n'},
{0001_c, 'q'},
{00001_c, 'x'},
{00000_c, eot}}
{std::pair{0_c, 'e'},
{10_c, 'i'},
{110_c, 'n'},
{1110_c, 'q'},
{11110_c, eot},
{11111_c, 'x'}
}
}; // clang-format on

constexpr std::array expected = {
constexpr auto expected = std::array{
'e', 'x', 'e', 'n', 'e', 'e', 'e', 'e', 'x', 'n',
'i', 'q', 'n', 'e', 'i', 'e', 'i', 'n', 'i', eot,
'i', 'q', 'n', 'e', 'i', 'e', 'i', 'n', 'n', eot,
};
constexpr auto output_buf = [&] {
std::array<char, expected.size()> output_buf{};
auto result = decode(code_table, encoded_bytes, output_buf.begin());
auto output_buf = std::array<char, expected.size()>{};

const auto result = decode(code_table, encoded_bytes, output_buf.begin());

// result should point to the back of output_buf.
if (output_buf.end() != result) {
throw std::runtime_error("assertion failed");
Expand Down
106 changes: 0 additions & 106 deletions huffman/test/table_canonicalize_test.cpp

This file was deleted.

Loading

0 comments on commit c6c647a

Please sign in to comment.