Skip to content

Commit

Permalink
always use DEFLATE canonical form for huffman::table codes
Browse files Browse the repository at this point in the history
Update huffman::table constructors to always call canonicalize(). As
canonicalize() is now called in the constructor, this method is now
private.

Note that the table_contents constructor is an exception and does not
call canonicalize().

File decode_test.cpp does not yet update the Huffman table to use
canonical form, as this requires recalculating the encoded bits.

Change-Id: I5597dc51ea5e9f086c71e2f9396fccd5ace923b2
  • Loading branch information
oliverlee committed Nov 21, 2023
1 parent 2de2289 commit c72bc7d
Show file tree
Hide file tree
Showing 7 changed files with 206 additions and 291 deletions.
154 changes: 86 additions & 68 deletions huffman/src/table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ constexpr static auto find_node_if(I first, I last, P pred)
/// `std::array` is used to store the Huffman tree, with the size determined by
/// `Extent`.
///
/// This type uses "DEFLATE" canonical form for codes:
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
template <symbol Symbol, std::size_t Extent = std::dynamic_extent>
class table
{
Expand Down Expand Up @@ -155,6 +160,60 @@ class table
}
}

/// Rewrite every code in the table so that it is in DEFLATE canonical form
///
/// DEFLATE places two extra requirements on the Huffman codes of an alphabet:
/// * Codes that share a bit length take lexicographically consecutive values,
///   ordered the same way as the symbols they encode;
/// * A shorter code always lexicographically precedes a longer one.
///
/// Entries are first sorted by (bitsize, symbol); code values are then
/// reassigned in that order and `set_skip_fields()` is invoked.
///
/// @return reference to `*this`
///
/// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951
///
/// @note Every constructor except the table-contents constructor calls this
///   method.
///
constexpr auto canonicalize() & -> table&
{
  using value_type = decltype(std::declval<code>().value());

  // order entries by bit length, breaking ties by symbol; `std::ref` keeps
  // the temporary pairs from copying the symbols
  const auto by_bitsize_then_symbol = [](const auto& lhs, const auto& rhs) {
    return std::pair{lhs.bitsize(), std::ref(lhs.symbol)} <
           std::pair{rhs.bitsize(), std::ref(rhs.symbol)};
  };
  std::ranges::sort(table_, by_bitsize_then_symbol);

  // running value from which the first code of each new bit length is
  // derived (step 2 of the RFC algorithm)
  auto base_code = value_type{};

  // code handed out to the previous entry (step 3 of the RFC algorithm)
  auto last_code = code{};

  for (auto& entry : table_) {
    const auto prev_bits = last_code.bitsize();
    const auto bits = entry.bitsize();

    // guaranteed by the sort above
    assert(prev_bits <= bits);

    if (prev_bits == bits) {
      // 3) next_code[len]++;
      last_code = {bits, last_code.value() + value_type{1}};
    } else {
      // 2) next_code[bits] = code; code = (...) << 1;
      base_code <<= (bits - prev_bits);
      last_code = {bits, base_code};
    }

    // 3) tree[n].Code = next_code[len];
    static_cast<code&>(entry) = last_code;

    // 2) (code + bl_count[bits-1])
    ++base_code;
  }

  set_skip_fields();

  return *this;
}

public:
/// Symbol type
///
Expand Down Expand Up @@ -191,7 +250,7 @@ class table
: table_{detail::frequency_tag{}, frequencies, eot}
{
construct_table();
set_skip_fields();
canonicalize();
}

template <std::ranges::sized_range R>
Expand Down Expand Up @@ -229,7 +288,7 @@ class table
: table_{detail::data_tag{}, data, eot}
{
construct_table();
set_skip_fields();
canonicalize();
}

template <std::ranges::input_range R>
Expand All @@ -242,11 +301,10 @@ class table
/// Constructs a `table` from the given code-symbol mapping contents
/// @tparam R sized-range of code-symbol 2-tuples
/// @pre all `code` and `symbol` values contained in mapping are unique
/// @pre `code` values are prefix free
/// @pre `code` values are specified in DEFLATE canonical form
/// @pre `code` and `symbol` values are provided in lexicographical order
///
/// Construct a `table` with explicit contents. This constructor avoids
/// generation of prefix-free codes for symbols and assumes that the provided
/// codes have been generated correctly.
/// Construct a `table` with explicit contents.
///
/// @{

Expand All @@ -260,6 +318,24 @@ class table
constexpr table(table_contents_tag, const R& map)
: table_{table_contents, map}
{
// Debug-only check of the ordering precondition on `map`. The trailing
// `and "<message>"` is always true; it only attaches a readable message to a
// failed assertion.
assert(
std::ranges::is_sorted(
map,
[](const auto& x, const auto& y) {
// each element of `map` is accessed with `std::get` by type: once for the
// `code` (value and bitsize) and once for the symbol
const auto x_value = std::get<code>(x).value();
const auto y_value = std::get<code>(y).value();

const auto x_bitsize = std::get<code>(x).bitsize();
const auto y_bitsize = std::get<code>(y).bitsize();

const auto x_symbol = std::get<symbol_type>(x);
const auto y_symbol = std::get<symbol_type>(y);

// NOTE(review): this is a conjunction, so `is_sorted` rejects `map` only
// when an adjacent pair is out of order by code value AND by
// (bitsize, symbol). A pair that is out of order by just one of the two
// criteria is accepted -- confirm that this leniency is intended.
return (x_value < y_value) and
((x_bitsize < y_bitsize) or
((x_bitsize == y_bitsize) and (x_symbol < y_symbol)));
}) and
"table contents are not provided in DEFLATE canonical form");
// the provided codes are used as given: canonicalize() is not called here
set_skip_fields();
}

Expand Down Expand Up @@ -302,8 +378,10 @@ class table

/// Returns an iterator to the first `encoding`
///
/// @note elements are ordered by code bitsize. If multiple elements have the
/// same code bitsize, the order is unspecified.
/// @note
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
[[nodiscard]]
constexpr auto begin() const -> const_iterator
Expand Down Expand Up @@ -381,66 +459,6 @@ class table
}
return os;
}

/// Update table code values to DEFLATE canonical form
///
/// The Huffman codes used for each alphabet in the "deflate" format have two
/// additional rules:
/// * All codes of a given bit length have lexicographically consecutive
/// values, in the same order as the symbols they represent;
/// * Shorter codes lexicographically precede longer codes.
///
/// @see section 3.2.2 https://datatracker.ietf.org/doc/html/rfc1951
///
/// @{

constexpr auto canonicalize() & -> table&
{
// type returned by `code::value()`
using value_type = decltype(std::declval<code>().value());

// set lexicographical order: ascending bitsize, ties broken by symbol
std::ranges::sort( //
table_, //
[](const auto& x, const auto& y) {
// `std::ref` keeps the temporary pairs from copying the symbols
return std::pair{x.bitsize(), std::ref(x.symbol)} <
std::pair{y.bitsize(), std::ref(y.symbol)};
});

// used to determine initial value of next_code[bits]
// calculated in step 2
auto base_code = value_type{};

// used in determining consecutive code values in step 3
auto next_code = code{};

// clang-format off
for (auto& elem : table_) {
// entries were sorted above, so bitsize never decreases between iterations
assert(next_code.bitsize() <= elem.bitsize());

// same bitsize as the previous entry: take the next consecutive value;
// larger bitsize: shift `base_code` left by the difference (modifying
// `base_code` in place) and use the shifted value
next_code = {
elem.bitsize(),
next_code.bitsize() == elem.bitsize()
? next_code.value() + value_type{1} // 3) next_code[len]++;
: base_code <<= (elem.bitsize() - next_code.bitsize()) // 2) next_code[bits] = code; code = (...) << 1;
};

// overwrite only the `code` part of the entry
static_cast<code&>(elem) = next_code; // 3) tree[n].Code = next_code[len];

++base_code; // 2) (code + bl_count[bits-1])
}
// clang-format on

// the entries were reordered and recoded, so refresh the skip fields
set_skip_fields();

return *this;
}

/// Rvalue overload: canonicalizes in place via the lvalue overload and
/// returns `*this` as an rvalue reference.
constexpr auto canonicalize() && -> table&&
{
  // `*this` is an lvalue inside the body, so this selects the `&` overload
  table& self = this->canonicalize();
  return std::move(self);
}

/// @}
};

namespace detail {
Expand Down
10 changes: 0 additions & 10 deletions huffman/test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,6 @@ cc_test(
],
)

# Test target built from table_canonicalize_test.cpp; it depends on boost_ut
# and the huffman library and uses the "short" test timeout.
cc_test(
name = "table_canonicalize_test",
timeout = "short",
srcs = ["table_canonicalize_test.cpp"],
deps = [
"//:boost_ut",
"//huffman",
],
)

cc_test(
name = "table_find_code_test",
timeout = "short",
Expand Down
11 changes: 7 additions & 4 deletions huffman/test/decode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ auto main() -> int
namespace huffman = ::starflate::huffman;
using namespace huffman::literals;

// FIXME table contents are not in canonical form
test("basic") = [] {
// encoded data from soxofaan/dahuffman readme.rst.
// We reverse the bits in each byte to match the encoding used in DEFLATE.
Expand All @@ -45,15 +46,17 @@ auto main() -> int
{01_c, 'i'},
{001_c, 'n'},
{0001_c, 'q'},
{00001_c, 'x'},
{00000_c, eot}}
{00000_c, eot},
{00001_c, 'x'}
}
}; // clang-format on

constexpr std::array expected = {
'e', 'x', 'e', 'n', 'e', 'e', 'e', 'e', 'x', 'n',
'i', 'q', 'n', 'e', 'i', 'e', 'i', 'n', 'i', eot,
};
constexpr auto output_buf = [&] {

const auto output_buf = [&] {
std::array<char, expected.size()> output_buf{};
auto result = decode(code_table, encoded_bytes, output_buf.begin());
// result should point to the back of output_buf.
Expand All @@ -63,6 +66,6 @@ auto main() -> int
return output_buf;
}();

static_assert(output_buf == expected);
expect(output_buf == expected);
};
}
106 changes: 0 additions & 106 deletions huffman/test/table_canonicalize_test.cpp

This file was deleted.

Loading

0 comments on commit c72bc7d

Please sign in to comment.