From a000d21f59fb6015c2246fb74e2e8a44623dc0db Mon Sep 17 00:00:00 2001 From: vidy Date: Tue, 19 Nov 2024 13:20:13 +0800 Subject: [PATCH] Support font encoder --- Cargo.toml | 11 ++++++++--- src/lib.rs | 8 ++++---- src/text.rs | 3 ++- src/tree.rs | 17 +++++++++-------- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 144d34d..d84875e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,15 +9,20 @@ description = "PDF text extraction" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies.pdf] +default-features=false +features = ["cache", "dump"] +git = "https://github.com/pdf-rs/pdf" + [dependencies] -pdf = { git = "https://github.com/pdf-rs/pdf", features = ["cache"] } -pdf_render = { git = "https://github.com/pdf-rs/pdf_render" } -font = { git = "https://github.com/pdf-rs/font" } +pdf_render= { git = "https://github.com/videni/pdf_render_with_vello", rev="2aae6fbec9e8276b24e6a38595c50e181dda0141"} itertools = "*" log = "*" ordered-float = "*" serde = { version = "*", features = ["derive"] } unicode-normalization = "0.1.19" +font = { git = "https://github.com/pdf-rs/font", branch = "vello", features=['cff']} pathfinder_geometry = { git = "https://github.com/servo/pathfinder" } pathfinder_color = { git = "https://github.com/servo/pathfinder" } diff --git a/src/lib.rs b/src/lib.rs index 8407487..3bd8789 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,7 @@ use std::collections::HashSet; use entry::Flow; use pdf::{backend::Backend, object::{Page, Resolve}, PdfError}; -use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode}; +use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder}; mod tree; mod util; @@ -10,10 +10,10 @@ mod text; pub mod entry; pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &impl Resolve) -> Result { - let cache = TraceCache::new(); + let mut cache = TraceCache::new(OutlineBuilder::default()); let mut clip_paths = vec![]; - let mut tracer = Tracer::new(&cache, &mut clip_paths); + let mut tracer = Tracer::new(&mut cache, &mut clip_paths); render_page(&mut tracer, resolve, &page, Default::default())?; @@ -68,7 +68,7 @@ pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &i continue; } }; - let mut pat_tracer = Tracer::new(&cache, &mut clip_paths); + let mut pat_tracer = Tracer::new(&mut cache, &mut clip_paths); render_pattern(&mut pat_tracer, &*pattern, resolve)?; let pat_items = pat_tracer.finish(); diff --git a/src/text.rs b/src/text.rs index 2f6e6cd..fa306f4 100644 --- a/src/text.rs +++ b/src/text.rs @@ -1,10 +1,11 @@ +use font::Encoder; use pathfinder_geometry::vector::Vector2F; use pdf_render::TextSpan; use itertools::{Itertools}; use unicode_normalization::UnicodeNormalization; use crate::{util::avg, entry::Word, util::Rect}; -pub fn concat_text<'a>(out: &mut String, items: impl Iterator + Clone) -> Vec { +pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator> + Clone) -> Vec { let mut words = vec![]; let gaps = items.clone() .flat_map(|s| { diff --git a/src/tree.rs b/src/tree.rs index 2eb696c..4a2e58e 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -16,8 +16,9 @@ use crate::util::{is_number, avg, CellContent}; use crate::text::{concat_text}; use std::mem::take; use table::Table; +use font::{Encoder, Glyph}; -pub fn build(spans: &[TextSpan], bbox: RectF, lines: &[[f32; 4]]) -> Node { +pub fn build(spans: &[TextSpan], bbox: RectF, lines: &[[f32; 4]]) -> Node { if spans.len() == 0 { return Node::singleton(&[]); } @@ -180,7 +181,7 @@ impl Span { } } -pub fn split2(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines_info: &Lines) -> Node { +pub fn split2(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines_info: &Lines) -> Node { use std::mem::replace; #[derive(Debug)] @@ -383,7 +384,7 @@ pub enum NodeTag { Complex, } -pub fn items(mut flow: &mut Flow, spans: &[TextSpan], node: &Node, x_anchor: f32) { +pub fn items(mut flow: &mut Flow, spans: &[TextSpan], node: &Node, x_anchor: f32) { match *node { Node::Final { ref indices } => { if indices.len() > 0 { @@ -534,10 +535,10 @@ pub fn items(mut flow: &mut Flow, spans: &[TextSpan], node: &Node, x_anchor: f32 } -pub fn render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF) { +pub fn render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF) { _render(w, spans, node, bbox, 0) } -fn _render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: usize) { +fn _render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: usize) { use std::fmt::Write; match *node { @@ -596,7 +597,7 @@ fn _render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: } } -fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines: &Lines) -> Node { +fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines: &Lines) -> Node { let num_boxes = boxes.len(); if num_boxes < 2 { return Node::singleton(boxes); @@ -925,13 +926,13 @@ impl TriCount { } } } -fn classify<'a>(spans: impl Iterator) -> Class { +fn classify<'a, E: Encoder + 'a>(spans: impl Iterator>) -> Class { use pdf_render::FontEntry; let mut bold = TriCount::new(); let mut numeric = TriCount::new(); let mut uniform = TriCount::new(); - let mut first_font: *const FontEntry = std::ptr::null(); + let mut first_font: *const FontEntry = std::ptr::null(); for s in spans { numeric.add(is_number(&s.text));