From b1a62325ddc76d27a0a190a9a65fcfeaae332e55 Mon Sep 17 00:00:00 2001 From: Alexander Mankuta Date: Fri, 3 Nov 2023 15:14:20 +0200 Subject: [PATCH] Full font embedding This adds an option to disable font subsetting. Original fonts can be embedded in full original form. This feature can make documents substantially bigger. In addition to embedded fonts being bigger, PDF requires additional information in order to properly render text. Specifically, it requires glyph widths. Some fonts contain thousands of glyphs. A thousand glyph widths would, on average, add about 4 Kb to the size of the document. Additionally, PDF requires another mapping to make the text intelligible when copying. This additional size is much harder to estimate as it greatly depends on the font coverage, but it is usually on the order of ~1-10 Kb per font. The intended use case is a workaround for when TTFunk breaks fonts in subsetting. But this might also be useful for documents that are going to be edited. For example, documents that are templates to which more text will be added later, or documents using the AcroForm feature that allows end users to fill in forms. --- lib/prawn/font.rb | 38 ++++- lib/prawn/fonts/to_unicode_cmap.rb | 138 +++++++++++++++ lib/prawn/fonts/ttf.rb | 260 ++++++++++++++++++++++------- spec/prawn/font_spec.rb | 91 ++++++++++ 4 files changed, 459 insertions(+), 68 deletions(-) create mode 100644 lib/prawn/fonts/to_unicode_cmap.rb diff --git a/lib/prawn/font.rb b/lib/prawn/font.rb index 41281adff..4d7597463 100644 --- a/lib/prawn/font.rb +++ b/lib/prawn/font.rb @@ -145,19 +145,23 @@ def width_of(string, options = {}) end end - # Hash that maps font family names to their styled individual font names. + # Hash that maps font family names to their styled individual font + # definitions. 
# # To add support for another font family, append to this hash, e.g: # # pdf.font_families.update( - # "MyTrueTypeFamily" => { :bold => "foo-bold.ttf", - # :italic => "foo-italic.ttf", - # :bold_italic => "foo-bold-italic.ttf", - # :normal => "foo.ttf" }) + # "MyTrueTypeFamily" => { + # bold: "foo-bold.ttf", + # italic: "foo-italic.ttf", + # bold_italic: "foo-bold-italic.ttf", + # normal: "foo.ttf" + # } + # ) # # This will then allow you to use the fonts like so: # - # pdf.font("MyTrueTypeFamily", :style => :bold) + # pdf.font("MyTrueTypeFamily", style: :bold) # pdf.text "Some bold text" # pdf.font("MyTrueTypeFamily") # pdf.text "Some normal text" @@ -170,6 +174,17 @@ def width_of(string, options = {}) # defining your own font families, you can map any or all of these # styles to whatever font files you'd like. # + # A font definition can be either a hash or just a string. + # + # A hash font definition can specify a number of options: + # + # - :file -- path to the font file (required) + # - :subset -- whether to subset the font (default true). Only + # applicable to TrueType and OpenType fonts (including DFont and TTC). + # + # A string font definition is equivalent to a hash definition with only + # :file being specified. 
+ # def font_families @font_families ||= {}.merge!( 'Courier' => { @@ -339,6 +354,8 @@ def initialize(document, name, options = {}) # :nodoc: @references = {} @subset_name_cache = {} + + @full_font_embedding = options.key?(:subset) && !options[:subset] end # The size of the font ascender in PDF points @@ -401,7 +418,12 @@ def add_to_current_page(subset) end def identifier_for(subset) # :nodoc: - @subset_name_cache[subset] ||= "#{@identifier}.#{subset}".to_sym + @subset_name_cache[subset] ||= + if full_font_embedding + @identifier.to_sym + else + "#{@identifier}.#{subset}".to_sym + end end def inspect # :nodoc: @@ -426,6 +448,8 @@ def eql?(other) # :nodoc: private + attr_reader :full_font_embedding + # generate a font identifier that hasn't been used on the current page yet # def generate_unique_id diff --git a/lib/prawn/fonts/to_unicode_cmap.rb b/lib/prawn/fonts/to_unicode_cmap.rb new file mode 100644 index 000000000..b2436575c --- /dev/null +++ b/lib/prawn/fonts/to_unicode_cmap.rb @@ -0,0 +1,138 @@ +# frozen_string_literal: true + +module Prawn + module Fonts + class ToUnicodeCMap # @private + # mapping is expected to be a hash with keys being charater codes (in + # broad sense, as used in the showing operation strings) and values being + # Unicode code points + def initialize(mapping, code_space_size = nil) + @mapping = mapping + @code_space_size = code_space_size + end + + def generatate + chunks = [] + + # Header + chunks << <<~HEADER.chomp + /CIDInit /ProcSet findresource begin + 12 dict begin + begincmap + /CIDSystemInfo 3 dict dup begin + /Registry (Adobe) def + /Ordering (UCS) def + /Supplement 0 def + end def + /CMapName /Adobe-Identity-UCS def + /CMapType 2 def + HEADER + + max_glyph_index = mapping.keys.max + # Range + code_space_size = @code_space_size || (max_glyph_index.bit_length / 8.0).ceil + + # In CMap codespaces are not sequentional, they're ranges in + # a multi-dimentional space. Each byte is considered separately. 
So we + # have to maximally extend the lower bytes in order to allow for + # continuos mapping. + # We only keep the highest byte because usually it's lower than + # maximally allowed and we don't want to cover that unused space. + code_space_max = max_glyph_index | ('ff' * (code_space_size - 1)).to_i(16) + + chunks << '1 begincodespacerange' + chunks << format("<%0#{code_space_size * 2}X><%0#{code_space_size * 2}X>", 0, code_space_max) + chunks << 'endcodespacerange' + + # Mapping + all_spans = + mapping_spans( + mapping.reject { |gid, cid| gid.zero? || (0xd800..0xdfff).cover?(cid) } + ) + + short_spans, long_spans = all_spans.partition { _1[0] == :short } + + long_spans + .each_slice(100) do |spans| + chunks << "#{spans.length} beginbfrange" + + spans.each do |type, span| + case type + when :fully_sorted + chunks << format( + "<%0#{code_space_size * 2}X><%0#{code_space_size * 2}X><%s>", + span.first[0], + span.last[0], + span.first[1].chr(::Encoding::UTF_16BE).unpack1('H*') + ) + when :index_sorted + chunks << format( + "<%0#{code_space_size * 2}X><%0#{code_space_size * 2}X>[%s]", + span.first[0], + span.last[0], + span.map { |_, cid| "<#{cid.chr(::Encoding::UTF_16BE).unpack1('H*')}>" }.join('') + ) + end + end + + chunks << 'endbfrange' + end + + short_spans + .map { |_type, slice| slice.flatten(1) } + .each_slice(100) do |mapping| + chunks << "#{mapping.length} beginbfchar" + chunks.concat( + mapping.map do |(gid, cid)| + format( + "<%0#{code_space_size * 2}X><%s>", + gid, + cid.chr(::Encoding::UTF_16BE).unpack1('H*') + ) + end + ) + chunks << 'endbfchar' + end + + # Footer + chunks << <<~FOOTER.chomp + endcmap + CMapName currentdict /CMap defineresource pop + end + end + FOOTER + + chunks.join("\n") + end + + private + + attr_reader :mapping + + attr_reader :cmap, :code_space_size, :code_space_max + + def mapping_spans(mapping) + mapping + .sort + .slice_when { |a, b| (b[0] - a[0]) != 1 } # Slice at key discontinuity + .flat_map do |slice| + if slice.length == 
1 + [[:short, slice]] + else + continuous_clices, discontinuous_slices = + slice + .slice_when { |a, b| b[1] - a[1] != 1 } # Slice at value discontinuity + .partition { |subslice| subslice.length > 1 } + + discontinuous_slices + .flatten(1) # Join together + .slice_when { |a, b| (b[0] - a[0]) != 1 } # Slice at key discontinuity, again + .map { _1.length > 1 ? [:index_sorted, _1] : [:short, _1] } + + continuous_clices.map { [:fully_sorted, _1] } + end + end + .sort_by { _1[1][0][0] } # Sort span start key + end + end + end +end diff --git a/lib/prawn/fonts/ttf.rb b/lib/prawn/fonts/ttf.rb index 7a28a9ca5..01b6d94ae 100644 --- a/lib/prawn/fonts/ttf.rb +++ b/lib/prawn/fonts/ttf.rb @@ -9,6 +9,7 @@ require 'ttfunk' require 'ttfunk/subset_collection' +require 'prawn/fonts/to_unicode_cmap' module Prawn module Fonts @@ -43,11 +44,70 @@ def unicode? true end + class FullFontSubsetsCollection + FULL_FONT = Object.new.tap do |obj| + obj.singleton_class.define_method(:inspect) do + super().insert(-2, ' FULL_FONT') + end + end.freeze + + def initialize(original) + @original = original + + (@cmap ||= original.cmap.unicode.first) || raise(NoUnicodeCMap.new(font: name)) + + @code_space_size = + case cmap.code_map.keys.max + when 0..0xff then 1 + when 0x100..0xffff then 2 + when 0x10000..0xffffff then 3 + else + 4 + end + + # Codespaces are not sequentional, they're ranges in + # a multi-dimentional space. Each byte is considered separately. So we + # have to maximally extend the lower two bytes in order to allow for + # continuos Unicode mapping. + # We only keep the highest byte because Unicode only goes to 1FFFFF + # and fonts usually cover even less of the space. We don't want to + # list all those unmapped charac codes here. 
+ @code_space_max = cmap.code_map.keys.max | ('ff' * (code_space_size - 1)).to_i(16) + end + + def encode(characters) + [ + [ + FULL_FONT, + characters.map do |c| + check_bounds!(c) + [cmap[c]].pack('n') + end.join('') + ] + ] + end + + private + + attr_reader :cmap, :code_space_size, :code_space_max + + def check_bounds!(num) + if num > code_space_max + raise Error, "CID (#{num}) exceedes code space size" + end + end + end + def initialize(document, name, options = {}) super @ttf = read_ttf_file - @subsets = TTFunk::SubsetCollection.new(@ttf) + @subsets = + if full_font_embedding + FullFontSubsetsCollection.new(@ttf) + else + TTFunk::SubsetCollection.new(@ttf) + end @italic_angle = nil @attributes = {} @@ -200,7 +260,6 @@ def pdf_flags def normalize_encoding(text) text.encode(::Encoding::UTF_8) rescue StandardError => e - puts e raise Prawn::Errors::IncompatibleStringEncoding, "Encoding #{text.encoding} can not be transparently converted to UTF-8. " \ 'Please ensure the encoding of the string you are attempting ' \ @@ -289,12 +348,26 @@ def register(subset) end def embed(reference, subset) - font_content = @subsets[subset].encode + if full_font_embedding + embed_full_font(reference) + else + embed_subset(reference, subset) + end + end - # FIXME: we need postscript_name and glyph widths from the font - # subset. Perhaps this could be done by querying the subset, - # rather than by parsing the font that the subset produces? 
- font = TTFunk::File.new(font_content) + def embed_subset(reference, subset) + font = TTFunk::File.new(@subsets[subset].encode) + unicode_mapping = @subsets[subset].to_unicode_map + embed_simple_font(reference, font, unicode_mapping) + end + + def embed_simple_font(reference, font, unicode_mapping) + if font_type(font) == :unknown + raise Error, %(Simple font embedding is not uspported for font "#{font.name}.") + end + + true_type = font_type(font) == :true_type + open_type = font_type(font) == :open_type # empirically, it looks like Adobe Reader will not display fonts # if their font name is more than 33 bytes long. Strange. But true. @@ -302,14 +375,14 @@ def embed(reference, subset) raise NoPostscriptName.new(font: font) if basename.nil? - fontfile = @document.ref!(Length1: font_content.size) - fontfile.stream << font_content - fontfile.stream.compress! + fontfile = @document.ref!({}) + fontfile.data[:Length1] = font.contents.size + fontfile.stream << font.contents.string + fontfile.stream.compress! if @document.compression_enabled? descriptor = @document.ref!( Type: :FontDescriptor, FontName: basename.to_sym, - FontFile2: fontfile, FontBBox: bbox, Flags: pdf_flags, StemV: stem_v, @@ -320,10 +393,20 @@ def embed(reference, subset) XHeight: x_height ) + first_char = font.cmap.tables.first.code_map.index { |gid| !gid.zero? } + last_char = font.cmap.tables.first.code_map.rindex { |gid| !gid.zero? } hmtx = font.horizontal_metrics - widths = font.cmap.tables.first.code_map.map do |gid| - Integer(hmtx.widths[gid] * scale_factor) - end[32..] + widths = + font.cmap.tables.first.code_map[first_char..last_char].map do |gid| + if gid.zero? + # These characters are not in the document so we don't ever use + # these values but we need to encode them so let's use as little + # sapce as possible. 
+ 0 + else + Integer(hmtx.widths[gid] * scale_factor) + end + end # It would be nice to have Encoding set for the macroman subsets, # and only do a ToUnicode cmap for non-encoded unicode subsets. @@ -335,65 +418,120 @@ def embed(reference, subset) # For now, it's simplest to just create a unicode cmap for every font. # It offends my inner purist, but it'll do. - map = @subsets[subset].to_unicode_map + to_unicode = @document.ref!({}) + to_unicode << ToUnicodeCMap.new(unicode_mapping).generatate + to_unicode.stream.compress! if @document.compression_enabled? - ranges = [[]] - map.keys.sort.reduce('') do |_s, code| - ranges << [] if ranges.last.length >= 100 - unicode = map[code] - ranges.last << format( - '<%02x><%04x>', - code: code, - unicode: unicode - ) + reference.data.update( + BaseFont: basename.to_sym, + FontDescriptor: descriptor, + FirstChar: first_char, + LastChar: last_char, + Widths: @document.ref!(widths), + ToUnicode: to_unicode + ) + + if true_type + reference.data.update(Subtype: :TrueType) + descriptor.data.update(FontFile2: fontfile) + elsif open_type + @document.renderer.min_version(1.6) + reference.data.update(Subtype: :Type1) + descriptor.data.update(FontFile3: fontfile) + fontfile.data.update(Subtype: :OpenType) end + end - range_blocks = - ranges.reduce(+'') do |s, list| - s << format( - "%d beginbfchar\n%s\nendbfchar\n", - lenght: list.length, - list: list.join("\n") - ) - end + def embed_full_font(reference) + embed_composite_font(reference, @ttf) + end - to_unicode_cmap = UNICODE_CMAP_TEMPLATE % range_blocks.strip + def embed_composite_font(reference, font) + if font_type(font) == :unknown + raise Error, %(Composite font embedding is not uspported for font "#{font.name}.") + end - cmap = @document.ref!({}) - cmap << to_unicode_cmap - cmap.stream.compress! 
+ true_type = font_type(font) == :true_type + open_type = font_type(font) == :open_type - reference.data.update( - Subtype: :TrueType, + fontfile = @document.ref!({}) + fontfile.data[:Length1] = font.contents.size if true_type + fontfile.data[:Subtype] = :CIDFontType0C if open_type + fontfile.stream << font.contents.string + fontfile.stream.compress! if @document.compression_enabled? + + # empirically, it looks like Adobe Reader will not display fonts + # if their font name is more than 33 bytes long. Strange. But true. + basename = font.name.postscript_name[0, 33].delete("\0") + + descriptor = @document.ref!( + Type: :FontDescriptor, + FontName: basename.to_sym, + FontBBox: bbox, + Flags: pdf_flags, + StemV: stem_v, + ItalicAngle: italic_angle, + Ascent: @ascender, + Descent: @descender, + CapHeight: cap_height, + XHeight: x_height + ) + descriptor.data[:FontFile2] = fontfile if true_type + descriptor.data[:FontFile3] = fontfile if open_type + + to_unicode = @document.ref!({}) + to_unicode << ToUnicodeCMap.new( + font.cmap.unicode.first + .code_map + .reject { |cid, gid| gid.zero? || (0xd800..0xdfff).cover?(cid) } + .invert + .sort.to_h, + 2 # Identity-H is a 2-byte encoding + ).generatate + to_unicode.stream.compress! if @document.compression_enabled? 
+ + widths = + font.horizontal_metrics.widths.map { |w| (w * scale_factor).round } + + child_font = @document.ref!( + Type: :Font, BaseFont: basename.to_sym, + CIDSystemInfo: { + Registry: 'Adobe', + Ordering: 'Identity', + Supplement: 0 + }, FontDescriptor: descriptor, - FirstChar: 32, - LastChar: 255, - Widths: @document.ref!(widths), - ToUnicode: cmap + W: [0, widths] + ) + if true_type + child_font.data.update( + Subtype: :CIDFontType2, + CIDToGIDMap: :Identity + ) + end + if open_type + child_font.data[:Subtype] = :CIDFontType0 + end + + reference.data.update( + Subtype: :Type0, + BaseFont: basename.to_sym, + Encoding: :'Identity-H', + DescendantFonts: [child_font], + ToUnicode: to_unicode ) end - UNICODE_CMAP_TEMPLATE = <<-STR.strip.gsub(/^\s*/, '') - /CIDInit /ProcSet findresource begin - 12 dict begin - begincmap - /CIDSystemInfo << - /Registry (Adobe) - /Ordering (UCS) - /Supplement 0 - >> def - /CMapName /Adobe-Identity-UCS def - /CMapType 2 def - 1 begincodespacerange - <00> - endcodespacerange - %s - endcmap - CMapName currentdict /CMap defineresource pop - end + def font_type(font) + if font.directory.tables.key?('glyf') + :true_type + elsif font.directory.tables.key?('CFF ') + :open_type + else + :unknown end - STR + end def read_ttf_file TTFunk::File.open(@name) diff --git a/spec/prawn/font_spec.rb b/spec/prawn/font_spec.rb index 185009c75..5211b468f 100644 --- a/spec/prawn/font_spec.rb +++ b/spec/prawn/font_spec.rb @@ -469,6 +469,52 @@ def page_should_not_include_font(font) expect(original.equal?(normalized)).to eq false end end + + describe 'full font embedding' do + let(:font) { pdf.find_font "#{Prawn::DATADIR}/fonts/DejaVuSans.ttf", subset: false } + let(:ref) { pdf.ref!({}).tap { |ref| font.__send__(:embed, ref, nil) } } + + it 'is a composite font' do + font_obj = ref.data + + expect(font_obj[:Subtype]).to eq(:Type0) + expect(font_obj[:DescendantFonts]).to be_an(Array) + expect(font_obj[:DescendantFonts].length).to eq(1) + desc_font = 
font_obj[:DescendantFonts].first.data + expect(desc_font[:Type]).to eq(:Font) + expect(desc_font[:Subtype]).to eq(:CIDFontType2) + end + + it 'has proper metrics' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + expect(descriptor[:Ascent]).to eq(759) + expect(descriptor[:Descent]).to eq(-240) + expect(descriptor[:CapHeight]).to eq(759) + end + + it 'has proper encoding' do + font_obj = ref.data + expect(font_obj[:Encoding]).to eq(:'Identity-H') + desc_font = font_obj[:DescendantFonts].first.data + expect(desc_font[:CIDToGIDMap]).to eq(:Identity) + end + + it 'contains glyph widths' do + desc_font = ref.data[:DescendantFonts].first.data + expect(desc_font[:W]).to be_an(Array) + expect(desc_font[:W].length).to eq(2) + expect(desc_font[:W][0]).to eq(0) + expect(desc_font[:W][1]).to be_an(Array) + expect(desc_font[:W][1].length).to eq(6108) # All glyph metrics + end + + it 'propely embeds font data' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + expect(descriptor).to have_key(:FontFile2) + expect(descriptor[:FontFile2].data[:Length1]).to eq(741_536) + expect(descriptor[:FontFile2].stream).to_not be_empty + end + end end describe 'OTF fonts' do @@ -500,6 +546,51 @@ def page_should_not_include_font(font) expect(original).to_not be_equal(normalized) end end + + describe 'full font embedding' do + let(:font) { pdf.find_font "#{Prawn::DATADIR}/fonts/Bodoni-Book.otf", subset: false } + let(:ref) { pdf.ref!({}).tap { |ref| font.__send__(:embed, ref, nil) } } + + it 'is a composite font' do + font_obj = ref.data + + expect(font_obj[:Subtype]).to eq(:Type0) + expect(font_obj[:DescendantFonts]).to be_an(Array) + expect(font_obj[:DescendantFonts].length).to eq(1) + desc_font = font_obj[:DescendantFonts].first.data + expect(desc_font[:Type]).to eq(:Font) + expect(desc_font[:Subtype]).to eq(:CIDFontType0) + end + + it 'has proper metrics' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + 
expect(descriptor[:Ascent]).to eq(1023) + expect(descriptor[:Descent]).to eq(-200) + expect(descriptor[:CapHeight]).to eq(3072) + end + + it 'has proper encoding' do + font_obj = ref.data + expect(font_obj[:Encoding]).to eq(:'Identity-H') + desc_font = font_obj[:DescendantFonts].first.data + expect(desc_font).to_not have_key(:CIDToGIDMap) + end + + it 'contains glyph widths' do + desc_font = ref.data[:DescendantFonts].first.data + expect(desc_font[:W]).to be_an(Array) + expect(desc_font[:W].length).to eq(2) + expect(desc_font[:W][0]).to eq(0) + expect(desc_font[:W][1]).to be_an(Array) + expect(desc_font[:W][1].length).to eq(353) # All glyph metrics + end + + it 'propely embeds font data' do + descriptor = ref.data[:DescendantFonts].first.data[:FontDescriptor].data + expect(descriptor).to have_key(:FontFile3) + expect(descriptor[:FontFile3].stream).to_not be_empty + end + end end describe 'DFont fonts' do