Skip to content

Commit

Permalink
Fix TrueTypeWrapper to correctly encode multiple codepoints referring …
Browse files Browse the repository at this point in the history
…to the same glyph

When multiple codepoints are mapped to the same glyph, the result is
erroneous due to the double usage of identity maps for the charcode to
CID mapping and the CID to GID mapping.

To fix this,

* glyph caching in #glyph must be performed not only on the glyph ID but
  also on the supplied string;

* glyph encoding in #encode must be done using custom charcodes and not
  the glyph ID;

* the custom mapping of charcodes to CIDs must be encoded using a custom
  CID CMap for the /Encoding entry (in most cases, sometimes it is still
  possible to use the identity encoding).
  • Loading branch information
gettalong committed Oct 16, 2024
1 parent 54f859c commit 1bbbc57
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
* Parsing of invalid `)` character in PDF objects and content streams
* Handling of files that contain stream length values that are indirect objects
that do not exist
* [HexaPDF::Font::TrueTypeWrapper] to correctly handle the situation when
multiple codepoints refer to the same glyph ID


## 0.47.0 - 2024-09-07
Expand Down
57 changes: 41 additions & 16 deletions lib/hexapdf/font/true_type_wrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ module Font
class TrueTypeWrapper

# Represents a single glyph of the wrapped font.
#
# Since some characters/strings may be mapped to the same glyph id by the font's builtin cmap
# table, it is possible that different Glyph instances with the same #id but different #str
# exist.
class Glyph

# The associated TrueTypeWrapper object.
Expand Down Expand Up @@ -152,6 +156,7 @@ def initialize(document, font, pdf_object: nil, subset: true)
@id_to_glyph = {}
@codepoint_to_glyph = {}
@encoded_glyphs = {}
@last_char_code = 0
end

# Returns the type of the font, i.e. :TrueType.
Expand Down Expand Up @@ -179,14 +184,15 @@ def subset?
!@subsetter.nil?
end

# Returns a Glyph object for the given glyph ID.
# Returns a Glyph object for the given glyph ID and +str+ pair.
#
# The optional argument +str+ should be the string representation of the glyph. Only use it if
# it is known,
# The optional argument +str+ should be the string representation of the glyph. It is possible
# that multiple strings map to the same glyph (e.g. hyphen and soft-hyphen could be
# represented by the same glyph).
#
# Note: Although this method is public, it should normally not be used by application code!
def glyph(id, str = nil)
@id_to_glyph[id] ||=
@id_to_glyph[[id, str]] ||=
if id >= 0 && id < @wrapped_font[:maxp].num_glyphs
Glyph.new(self, id, str || (+'' << (@cmap.gid_to_code(id) || 0xFFFD)))
else
Expand Down Expand Up @@ -228,14 +234,12 @@ def decode_codepoint(codepoint)

# Encodes the glyph and returns the code string.
def encode(glyph)
(@encoded_glyphs[glyph.id] ||=
(@encoded_glyphs[glyph] ||=
begin
raise HexaPDF::MissingGlyphError.new(glyph) if glyph.kind_of?(InvalidGlyph)
if @subsetter
[[@subsetter.use_glyph(glyph.id)].pack('n'), glyph]
else
[[glyph.id].pack('n'), glyph]
end
@subsetter.use_glyph(glyph.id) if @subsetter
@last_char_code += 1
[[@last_char_code].pack('n'), @last_char_code]
end)[0]
end

Expand Down Expand Up @@ -286,14 +290,15 @@ def create_pdf_object(document)
Supplement: 0},
CIDToGIDMap: :Identity})
dict = document.add({Type: :Font, Subtype: :Type0, BaseFont: cid_font[:BaseFont],
Encoding: :'Identity-H', DescendantFonts: [cid_font]})
DescendantFonts: [cid_font]})
dict.font_wrapper = self

document.register_listener(:complete_objects) do
update_font_name(dict)
embed_font(dict, document)
complete_width_information(dict)
create_to_unicode_cmap(dict, document)
add_encoding_information_cmap(dict, document)
end

dict
Expand All @@ -306,7 +311,7 @@ def update_font_name(dict)
return unless @subsetter

tag = +''
data = @encoded_glyphs.each_with_object(''.b) {|(id, v), s| s << id.to_s << v[0] }
data = @encoded_glyphs.each_with_object(''.b) {|(g, v), s| s << g.id.to_s << v[0] }
hash = Digest::MD5.hexdigest(data << @wrapped_font.font_name).to_i(16)
while hash != 0 && tag.length < 6
hash, mod = hash.divmod(UPPERCASE_LETTERS.length)
Expand Down Expand Up @@ -336,8 +341,8 @@ def embed_font(dict, document)
# Adds the /DW and /W fields to the CIDFont dictionary.
def complete_width_information(dict)
default_width = glyph(3, " ").width.to_i
widths = @encoded_glyphs.reject {|_, v| v[1].width == default_width }.map do |id, v|
[(@subsetter ? @subsetter.subset_glyph_id(id) : id), v[1].width]
widths = @encoded_glyphs.reject {|g, _| g.width == default_width }.map do |g, _|
[(@subsetter ? @subsetter.subset_glyph_id(g.id) : g.id), g.width]
end.sort!
dict[:DescendantFonts].first.set_widths(widths, default_width: default_width)
end
Expand All @@ -346,9 +351,10 @@ def complete_width_information(dict)
# correctly.
def create_to_unicode_cmap(dict, document)
stream = HexaPDF::StreamData.new do
mapping = @encoded_glyphs.keys.map! do |id|
mapping = @encoded_glyphs.map do |glyph, (_, char_code)|
# Using 0xFFFD as mentioned in Adobe #5411, last line before section 1.5
[(@subsetter ? @subsetter.subset_glyph_id(id) : id), @cmap.gid_to_code(id) || 0xFFFD]
# TODO: glyph.str assumed to consist of single char, No support for multiple chars
[char_code, glyph.str.ord || 0xFFFD]
end.sort_by!(&:first)
HexaPDF::Font::CMap.create_to_unicode_cmap(mapping)
end
Expand All @@ -357,6 +363,25 @@ def create_to_unicode_cmap(dict, document)
dict[:ToUnicode] = stream_obj
end

# Sets the /Encoding entry of +dict+ based on the glyphs that have been encoded so far.
#
# Every encoded glyph contributes a pair consisting of its character code and its CID (the
# subset glyph ID when a subsetter is in use, the plain glyph ID otherwise). If each character
# code equals its CID, the predefined Identity-H encoding is used. Otherwise a custom CID CMap
# is created, added to +document+ as a stream with the FlateDecode filter set, and that stream
# object is stored as the /Encoding value.
def add_encoding_information_cmap(dict, document)
  code_to_cid = @encoded_glyphs.map do |glyph, (_, char_code)|
    cid = @subsetter ? @subsetter.subset_glyph_id(glyph.id) : glyph.id
    [char_code, cid]
  end
  code_to_cid.sort_by!(&:first)

  if code_to_cid.any? {|char_code, cid| char_code != cid }
    # At least one character code differs from its CID, so a custom CMap is needed.
    cmap_data = HexaPDF::StreamData.new { HexaPDF::Font::CMap.create_cid_cmap(code_to_cid) }
    cmap_stream = document.add({}, stream: cmap_data)
    cmap_stream.set_filter(:FlateDecode)
    dict[:Encoding] = cmap_stream
  else
    dict[:Encoding] = :'Identity-H'
  end
end

end

end
Expand Down
20 changes: 17 additions & 3 deletions test/hexapdf/font/test_true_type_wrapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,12 @@
glyph.inspect)
end

# Glyph lookups are cached per (id, str) pair: repeating the same lookup returns the very same
# object, while the same id combined with a different string yields a separate Glyph instance.
it "caches glyphs based on the id and string" do
glyph = @font_wrapper.glyph(17)
assert_same(glyph, @font_wrapper.glyph(17))
refute_same(glyph, @font_wrapper.glyph(17, "1"))
end

it "invokes font.on_missing_glyph for missing glyphs" do
glyph = @font_wrapper.glyph(9999)
assert_kind_of(HexaPDF::Font::InvalidGlyph, glyph)
Expand Down Expand Up @@ -99,14 +105,18 @@
assert_equal([1].pack('n'), code)
code = @font_wrapper.encode(@font_wrapper.glyph(10))
assert_equal([2].pack('n'), code)
code = @font_wrapper.encode(@font_wrapper.glyph(10, "o"))
assert_equal([3].pack('n'), code)
end

it "returns the encoded glyph ID for fonts that are not subset" do
@font_wrapper = HexaPDF::Font::TrueTypeWrapper.new(@doc, @font, subset: false)
code = @font_wrapper.encode(@font_wrapper.glyph(3))
assert_equal([3].pack('n'), code)
assert_equal([1].pack('n'), code)
code = @font_wrapper.encode(@font_wrapper.glyph(10))
assert_equal([10].pack('n'), code)
assert_equal([2].pack('n'), code)
code = @font_wrapper.encode(@font_wrapper.glyph(10, "o"))
assert_equal([3].pack('n'), code)
end

it "raises an error if an InvalidGlyph is encoded" do
Expand Down Expand Up @@ -180,14 +190,18 @@
it "with fonts that are not subset (only differences to other case)" do
@font_wrapper = HexaPDF::Font::TrueTypeWrapper.new(@doc, @font, subset: false)
@font_wrapper.encode(@font_wrapper.glyph(3))
@font_wrapper.encode(@font_wrapper.glyph(3, "-"))
glyph = @font_wrapper.decode_utf8('H').first
@font_wrapper.encode(glyph)
@doc.dispatch_message(:complete_objects)

dict = @font_wrapper.pdf_object

assert_equal(HexaPDF::Font::CMap.create_to_unicode_cmap([[3, ' '.ord], [glyph.id, 'H'.ord]]),
assert_equal(HexaPDF::Font::CMap.create_to_unicode_cmap([[1, ' '.ord], [2, '-'.ord],
[3, 'H'.ord]]),
dict[:ToUnicode].stream)
assert_equal(HexaPDF::Font::CMap.create_cid_cmap([[1, 3], [2, 3], [3, glyph.id]]),
dict[:Encoding].stream)
assert_equal([glyph.id, [glyph.width]], dict[:DescendantFonts][0][:W].value)
end
end
Expand Down

0 comments on commit 1bbbc57

Please sign in to comment.