diff --git a/Cargo.toml b/Cargo.toml index 144d34d..fc53ae1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,15 +9,18 @@ description = "PDF text extraction" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + [dependencies] -pdf = { git = "https://github.com/pdf-rs/pdf", features = ["cache"] } -pdf_render = { git = "https://github.com/pdf-rs/pdf_render" } -font = { git = "https://github.com/pdf-rs/font" } +pdf_render= { git = "https://github.com/videni/pdf_render_with_vello.git", branch="vello_wip"} +# pdf_render= { path = "../pdf_render/render"} +pdf = { git = "https://github.com/pdf-rs/pdf", features = ["cache", "dump"], default-features = false, rev = "9002322822a3773d3d265dee81d855b40f5e0d0a"} + itertools = "*" log = "*" ordered-float = "*" serde = { version = "*", features = ["derive"] } unicode-normalization = "0.1.19" +font = { git = "https://github.com/videni/font", branch = "master", features=['cff']} pathfinder_geometry = { git = "https://github.com/servo/pathfinder" } pathfinder_color = { git = "https://github.com/servo/pathfinder" } diff --git a/examples/text.rs b/examples/text.rs index 0917053..3a24b30 100644 --- a/examples/text.rs +++ b/examples/text.rs @@ -6,15 +6,35 @@ fn main() { let file = FileOptions::cached().open(&input).expect("can't read PDF"); let resolver = file.resolver(); - for (page_nr, page) in file.pages().enumerate() { - let page = page.expect("can't read page"); - let flow = pdf_text::run(&file, &page, &resolver).expect("can't render page"); - println!("# page {}", page_nr + 1); + // for (page_nr, page) in file.pages().enumerate() { + let page: pdf::object::PageRc = file.get_page(0).unwrap(); + let flow = pdf_text::run(&file, &page, &resolver, Default::default(), false).expect("can't render page"); for run in flow.runs { - for line in run.lines { - println!("{}", line.words.iter().map(|w| &w.text).format(" ")); + for line in &run.lines { + println!("{:?}", line.rect); + for word in 
&line.words { + println!("{}, {:?}", word.text.as_str(), word.rect); + dbg!(&word.chars); + + let text = &word.text; + let mut offset = 0; + let mut chars = word.chars.iter().peekable(); + let mut texts = vec![]; + + while let Some(_) = chars.next() { + // Get text for current char + let s = if let Some(next) = chars.peek() { + let s = &text[offset..next.offset]; + offset = next.offset; + s + } else { + &text[offset..] + }; + + texts.push(s); + } + } } - println!(); } - } + // } } diff --git a/src/classify.rs b/src/classify.rs new file mode 100644 index 0000000..5033738 --- /dev/null +++ b/src/classify.rs @@ -0,0 +1,81 @@ +use std::sync::Arc; + +use font::Encoder; +use pdf_render::TextSpan; + +use crate::util::is_number; + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Class { + Number, + Header, + Paragraph, + Mixed, +} + +pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator>) -> Class { + use pdf_render::FontEntry; + + let mut bold = TriCount::new(); + let mut numeric = TriCount::new(); + let mut uniform = TriCount::new(); + let mut first_font: *const FontEntry = std::ptr::null(); + + for s in spans { + numeric.add(is_number(&s.text)); + if let Some(ref font) = s.font { + bold.add(font.name.contains("Bold")); + let font_ptr = Arc::as_ptr(font); + if first_font.is_null() { + first_font = font_ptr; + } else { + uniform.add(font_ptr == first_font); + } + } + } + uniform.add(true); + + match (numeric.count(), bold.count(), uniform.count()) { + (Tri::True, _, Tri::True) => Class::Number, + (_, Tri::True, Tri::True) => Class::Header, + (_, Tri::False, Tri::True) => Class::Paragraph, + (_, Tri::False, _) => Class::Paragraph, + (_, Tri::Maybe(_), _) => Class::Paragraph, + _ => Class::Mixed + } +} + +pub enum Tri { + False, + True, + Maybe(f32), + Unknown, +} + +#[derive(Debug)] +pub struct TriCount { + tru: usize, + fal: usize, +} +impl TriCount { + fn new() -> Self { + TriCount { + tru: 0, + fal: 0 + } + } + fn add(&mut self, b: bool) { + match b { + false => 
self.fal += 1, + true => self.tru += 1, + } + } + fn count(&self) -> Tri { + match (self.fal, self.tru) { + (0, 0) => Tri::Unknown, + (0, _) => Tri::True, + (_, 0) => Tri::False, + (f, t) => Tri::Maybe(t as f32 / (t + f) as f32) + } + } +} \ No newline at end of file diff --git a/src/entry.rs b/src/entry.rs deleted file mode 100644 index 9aeff6e..0000000 --- a/src/entry.rs +++ /dev/null @@ -1,52 +0,0 @@ -use serde::{Serialize, Deserialize}; -use table::Table; - -use crate::util::{Rect, CellContent}; - -#[derive(Serialize, Deserialize)] -pub struct Word { - pub text: String, - pub rect: Rect, -} -#[derive(Serialize, Deserialize)] -pub struct Line { - pub words: Vec, -} -#[derive(Serialize, Deserialize)] -pub struct Run { - pub lines: Vec, - pub kind: RunType, -} - -#[derive(Serialize, Deserialize)] -pub struct Flow { - pub lines: Vec, - pub runs: Vec, -} -#[derive(Serialize, Deserialize)] -pub enum RunType { - ParagraphContinuation, - Paragraph, - Header, - Cell, -} - -impl Flow { - pub fn new() -> Self { - Flow { - lines: vec![], - runs: vec![] - } - } - pub fn add_line(&mut self, words: Vec, kind: RunType) { - if words.len() > 0 { - self.runs.push(Run { - lines: vec![Line { words }], - kind - }); - } - } - pub fn add_table(&mut self, table: Table) { - - } -} diff --git a/src/flow.rs b/src/flow.rs new file mode 100644 index 0000000..74b8dbd --- /dev/null +++ b/src/flow.rs @@ -0,0 +1,278 @@ +use crate::classify::{classify, Class}; +use crate::node::{Node, NodeTag}; +use crate::util::avg; +use crate::text::concat_text; +use std::iter::once; +use pathfinder_geometry::rect::RectF; +use pdf_render::TextSpan; + +use std::mem::take; +use font::Encoder; +use serde::{Serialize, Deserialize}; +use table::Table; + +#[derive(Serialize, Deserialize)] +pub struct Word { + pub text: String, + pub rect: Rect, + pub chars: Vec +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct Char { + // Byte offset + pub offset: usize, + pub pos: f32, + pub width: f32, +} + 
+#[derive(Serialize, Deserialize)] +pub struct Line { + pub words: Vec, + pub rect: Rect, +} +#[derive(Serialize, Deserialize)] +pub struct Run { + pub lines: Vec, + pub kind: RunType, +} + +#[derive(Serialize, Deserialize)] +pub enum RunType { + ParagraphContinuation, + Paragraph, + Header, + Cell, +} + + +#[derive(Copy, Clone, Debug)] +#[derive(Serialize, Deserialize)] +#[repr(C)] +pub struct Rect { + pub x: f32, + pub y: f32, + pub w: f32, + pub h: f32 +} +impl From for Rect { + fn from(r: RectF) -> Self { + Rect { + x: r.origin_x(), + y: r.origin_y(), + w: r.width(), + h: r.height() + } + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct CellContent { + pub text: String, + pub rect: Rect, +} + +#[derive(Serialize, Deserialize)] +pub struct Flow { + pub runs: Vec, +} + +impl Flow { + pub fn new() -> Self { + Flow { + runs: vec![] + } + } + pub fn add_line(&mut self, words: Vec, kind: RunType, rect: Rect) { + if words.len() > 0 { + self.runs.push(Run { + lines: vec![Line { words, rect}], + kind, + }); + } + } + pub fn add_table(&mut self, table: Table) { + + } +} + +pub(crate) fn build(mut flow: &mut Flow, spans: &[TextSpan], node: &Node, x_anchor: f32) { + match *node { + Node::Final { ref indices } => { + if indices.len() > 0 { + let node_spans = indices.iter() + .flat_map(|&i| spans.get(i)); + let bbox = node_spans.clone() + .map(|s| s.rect) + .reduce(|a, b| a.union_rect(b)) + .unwrap(); + + let class = classify(node_spans.clone()); + let mut text = String::new(); + let words = concat_text(&mut text, node_spans); + + let t = match class { + Class::Header => RunType::Header, + _ => RunType::Paragraph, + }; + + flow.add_line(words, t, bbox.into()); + } + } + Node::Grid { ref x, ref y, ref cells, tag } => { + match tag { + NodeTag::Singleton | + NodeTag::Line => { + let mut indices = vec![]; + node.indices(&mut indices); + + let line_spans = indices.iter().flat_map(|&i| spans.get(i)); + let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| 
a.union_rect(b)).unwrap().into(); + + let class = classify(line_spans.clone()); + let mut text = String::new(); + let words = concat_text(&mut text, line_spans); + + let t = match class { + Class::Header => RunType::Header, + _ => RunType::Paragraph, + }; + + flow.add_line(words, t, bbox.into()); + } + NodeTag::Paragraph => { + assert_eq!(x.len(), 0, "For paragraph x gaps must be empty"); + + let mut lines: Vec<(RectF, usize)> = vec![]; + let mut indices = vec![]; + + for n in cells { + let start: usize = indices.len(); + n.indices(&mut indices); + if indices.len() > start { + let cell_spans = indices[start..].iter().flat_map(|&i| spans.get(i)); + let bbox = cell_spans.map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); + lines.push((bbox, indices.len())); + } + } + + let para_spans = indices.iter().flat_map(|&i| spans.get(i)); + let class = classify(para_spans.clone()); + // the bounding box the paragraph + let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap(); + let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap(); + + // classify the lines by this vertical line + let left_margin = bbox.min_x() + 0.5 * line_height; + + // count how many are right and left of the split. + let mut left = 0; + let mut right = 0; + + for (line_bbox, _) in lines.iter() { + if line_bbox.min_x() >= left_margin { + right += 1; + } else { + left += 1; + } + } + //typically paragraphs are indented to the right and longer than 2 lines. + //then there will be a higher left count than right count. + let indent = left > right; + + // A paragraph with 3 lines, 3 cases: + // case 1: outdented(right > left, will get 3 runs) + // |------- + // | ---- + // | ---- + // case 2: indented (left > right, one new run) + // | ------ + // |------- + // |------- + // case 3: same x (no indentation, but left > right, right = 0, will be in the same run) + // |------ + // |------ + // |------ + + //TODO: A paragraph with two lines starts at the same x? 
then left = right. + // the second line will be treated as another run, but actually it should be + // in the same run. + + let mut para_start = 0; + let mut line_start = 0; + let mut text = String::new(); + let mut para_bbox = RectF::default(); + let mut flow_lines = vec![]; + for &(line_bbox, end) in lines.iter() { + if line_start != 0 { + //Always add a line break for new line, which will be treated as whitespace in the concat_text method + text.push('\n'); + + // if a line is indented(indent = true) or outdented(indent = false), it marks a new paragraph + // so here, save previous lines as a new run. + if (line_bbox.min_x() >= left_margin) == indent { + flow.runs.push(Run { + lines: take(&mut flow_lines), + kind: match class { + Class::Header => RunType::Header, + _ => RunType::Paragraph + }, + }); + para_start = line_start; + } + } + if end > line_start { + let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i))); + + if words.len() > 0 { + flow_lines.push(Line { words , rect: line_bbox.into()}); + } + } + if para_start == line_start { + para_bbox = line_bbox; + } else { + para_bbox = para_bbox.union_rect(line_bbox); + } + line_start = end; + } + + flow.runs.push(Run { + lines: flow_lines, + kind: match class { + Class::Header => RunType::Header, + _ => RunType::Paragraph + } + }); + } + NodeTag::Complex => { + let x_anchors = once(x_anchor).chain(x.iter().cloned()).cycle(); + for (node, x) in cells.iter().zip(x_anchors) { + build(flow, spans, node, x); + } + } + } + } + Node::Table { ref table } => { + if let Some(bbox) = table.values() + .flat_map(|v| v.value.iter().flat_map(|&i| spans.get(i).map(|s| s.rect))) + .reduce(|a, b| a.union_rect(b)) { + let table = table.flat_map(|indices| { + if indices.len() == 0 { + None + } else { + let line_spans = indices.iter().flat_map(|&i| spans.get(i)); + let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); + + let mut text = 
String::new(); + concat_text(&mut text, line_spans.clone()); + Some(CellContent { + text, + rect: bbox.into(), + }) + } + }); + flow.add_table(table); + } + } + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 8407487..f166c1b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,25 +1,28 @@ use std::collections::HashSet; -use entry::Flow; +use flow::Flow; +use pathfinder_geometry::transform2d::Transform2F; use pdf::{backend::Backend, object::{Page, Resolve}, PdfError}; -use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode}; +use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder}; -mod tree; +mod node; mod util; mod text; -pub mod entry; +mod classify; +pub mod flow; -pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &impl Resolve) -> Result { - let cache = TraceCache::new(); +pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &impl Resolve, transform: Transform2F, without_header_and_footer: bool) -> Result { + let mut cache = TraceCache::new(OutlineBuilder::default()); let mut clip_paths = vec![]; - let mut tracer = Tracer::new(&cache, &mut clip_paths); + let mut tracer = Tracer::new(&mut cache, &mut clip_paths); - render_page(&mut tracer, resolve, &page, Default::default())?; + //Get text, pattern, image by the Tracer backend. + render_page(&mut tracer, resolve, page, transform)?; let bbox = tracer.view_box(); - - let items = tracer.finish(); + let items: Vec> = tracer.finish(); + //Get all patterns which may have lines and texts inside. 
let mut patterns = HashSet::new(); for item in items.iter() { if let DrawItem::Vector(ref v) = item { @@ -34,6 +37,7 @@ pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &i let mut spans = vec![]; let mut lines = vec![]; + let mut visit_item = |item| { match item { DrawItem::Text(t, _) if bbox.intersects(t.rect) => { @@ -60,6 +64,7 @@ pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &i } }; + // Analyze patterns to get lines and texts. for &p in patterns.iter() { let pattern = match resolve.get(p) { Ok(p) => p, @@ -68,7 +73,7 @@ pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &i continue; } }; - let mut pat_tracer = Tracer::new(&cache, &mut clip_paths); + let mut pat_tracer = Tracer::new(&mut cache, &mut clip_paths); render_pattern(&mut pat_tracer, &*pattern, resolve)?; let pat_items = pat_tracer.finish(); @@ -77,12 +82,16 @@ pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &i } } + // After this loop, all the text and lines are ready for further processing. 
for item in items { visit_item(item); } - let root = tree::build(&spans, bbox, &lines); + let root = node::build(&spans, bbox, &lines, without_header_and_footer); + let mut flow = Flow::new(); - tree::items(&mut flow, &spans, &root, bbox.min_x()); + + flow::build(&mut flow, &spans, &root, bbox.min_x()); + Ok(flow) } \ No newline at end of file diff --git a/src/node.rs b/src/node.rs new file mode 100644 index 0000000..20300a4 --- /dev/null +++ b/src/node.rs @@ -0,0 +1,265 @@ +mod gap; +mod line; +mod render; +mod table; + +use gap::{dist_x, dist_y, gaps, left_right_gap, top_bottom_gap}; +use line::{analyze_lines, overlapping_lines, Lines}; +use pdf_render::TextSpan; +use pathfinder_geometry::rect::RectF; + + +use crate::classify::{classify, Class}; +use crate::util::avg; + +#[cfg(feature="ocr")] +use tesseract_plumbing::Text; + +use std::mem::take; +use font::Encoder; + +pub fn build(spans: &[TextSpan], bbox: RectF, lines: &[[f32; 4]], without_header_and_footer: bool) -> Node { + if spans.len() == 0 { + return Node::singleton(&[]); + } + + let mut boxes: Vec<(RectF, usize)> = spans.iter().enumerate().map(|(i, t)| (t.rect, i)).collect(); + let mut boxes = boxes.as_mut_slice(); + if without_header_and_footer { + boxes = exclude_header_and_footer(boxes, bbox, spans); + } + + let lines = analyze_lines(lines); + + split(&mut boxes, &spans, &lines) +} + +pub fn exclude_header_and_footer<'a, E: Encoder>(boxes: &'a mut [(RectF, usize)], bbox: RectF, spans: &[TextSpan]) -> &'a mut [(RectF, usize)] +{ + let avg_font_size: f32 = avg(spans.iter().map(|s| s.font_size)).unwrap(); + + let probably_header = |boxes: &[(RectF, usize)]| { + let class = classify(boxes.iter().filter_map(|&(_, i)| spans.get(i))); + if matches!(class, Class::Header | Class::Number) { + return true; + } + let f = avg(boxes.iter().filter_map(|&(_, i)| spans.get(i)).map(|s| s.font_size)).unwrap(); + f > avg_font_size + }; + let probably_footer = |boxes: &mut [(RectF, usize)]| { + sort_x(boxes); + let x_gaps: 
Vec = gap::gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x())) + .collect(); + + let is_footer = split_by(boxes, x_gaps.as_slice(), |r| r.min_x()) + .all(|cell| probably_header(cell)); + + is_footer + }; + + sort_y(boxes); + + let mut boxes = boxes; + let (top, bottom) = top_bottom_gap(boxes, bbox); + if let Some(bottom) = bottom { + if probably_footer(&mut boxes[bottom..]) { + boxes = &mut boxes[..bottom]; + } + } + if let Some(top) = top { + if probably_header(&mut boxes[..top]) { + boxes = &mut boxes[top..]; + } + } + sort_x(boxes); + let (left, right) = left_right_gap(boxes, bbox); + if let Some(right) = right { + if probably_header(&boxes[right..]) { + boxes = &mut boxes[..right]; + } + } + if let Some(left) = left { + if probably_header(&boxes[..left]) { + boxes = &mut boxes[left..]; + } + } + + boxes +} + + +#[derive(Debug)] +pub enum Node { + Final { indices: Vec }, + Grid { + // vertical gaps + x: Vec, + // horizontal gaps + y: Vec, + cells: Vec, + tag: NodeTag + }, + Table { table: table::Table> }, +} +impl Node { + pub fn tag(&self) -> NodeTag { + match *self { + Node::Grid { tag, .. } => tag, + Node::Table { .. } => NodeTag::Complex, + Node::Final { .. } => NodeTag::Singleton, + } + } + pub fn indices(&self, out: &mut Vec) { + match *self { + Node::Final { ref indices } => out.extend_from_slice(&indices), + Node::Grid { ref cells, .. 
} => { + for n in cells { + n.indices(out); + } + } + Node::Table { ref table } => { + out.extend( + table.values() + .flat_map(|v| v.value.iter()) + .cloned() + ); + } + } + } + pub fn singleton(nodes: &[(RectF, usize)]) -> Self { + Node::Final { indices: nodes.iter().map(|t| t.1).collect() } + } +} + +#[derive(PartialOrd, Ord, Eq, PartialEq, Clone, Copy, Debug)] +pub enum NodeTag { + Singleton, + Line, + Paragraph, + Complex, +} + +fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines: &Lines) -> Node { + let num_boxes = boxes.len(); + if num_boxes < 2 { + return Node::singleton(boxes); + } + + sort_x(boxes); + let max_x_gap = dist_x(boxes); + + sort_y(boxes); + let max_y_gap = dist_y(boxes); + + let x_y_ratio = 1.0; + + let max_gap = match (max_x_gap, max_y_gap) { + (Some((x, _)), Some((y, _))) => x.max(y * x_y_ratio), + (Some((x, _)), None) => x, + (None, Some((y, _))) => y * x_y_ratio, + (None, None) => { + sort_x(boxes); + return Node::singleton(boxes); + } + }; + let x_threshold = (max_gap * 0.5).max(1.0); + let y_threshold = (max_gap * 0.5 / x_y_ratio).max(0.1); + + let y_gaps: Vec = gaps(y_threshold, boxes, |r| (r.min_y(), r.max_y())) + .collect(); + + sort_x(boxes); + let x_gaps: Vec = gaps(x_threshold, boxes, |r| (r.min_x(), r.max_x())) + .collect(); + + if x_gaps.len() == 0 && y_gaps.len() == 0 { + return overlapping_lines(boxes); + } + + //TODO: Disable the table::split for now, because it is not accurate + // if x_gaps.len() > 1 && y_gaps.len() > 1 { + // return table::split(boxes, spans, lines); + // } + + assert!( + x_gaps.len() > 0 || y_gaps.len() > 0, + "At least one of x_gaps and y_gaps must be non-empty, otherwise the memory will be exhausted" + ); + sort_y(boxes); + + let mut cells = vec![]; + for row in split_by(boxes, &y_gaps, |r| r.min_y()) { + if x_gaps.len() > 0 { + sort_x(row); + for cell in split_by(row, &x_gaps, |r| r.min_x()) { + sort_y(cell); + assert!(cell.len() < num_boxes); + cells.push(split(cell, spans, lines)); + } + 
} else { + cells.push(split(row, spans, lines)); + } + } + + let tag = match (y_gaps.is_empty(), x_gaps.is_empty()) { + // N y gaps, whatever x_gap is, if cells are all lines, then it is a line + (true, _) if cells.iter().all(|n| n.tag() <= NodeTag::Line) => NodeTag::Line, + // N x gaps, whatever y_gap is, if cells are all lines, then it is a paragraph + (_, true) if cells.iter().all(|n| n.tag() <= NodeTag::Line) => NodeTag::Paragraph, + // Otherwise it is a complex node + _ => NodeTag::Complex + }; + + Node::Grid { + x: x_gaps, + y: y_gaps, + cells, + tag, + } +} + +fn sort_x(boxes: &mut [(RectF, usize)]) { + boxes.sort_unstable_by(|a, b| a.0.min_x().partial_cmp(&b.0.min_x()).unwrap()); +} +fn sort_y(boxes: &mut [(RectF, usize)]) { + boxes.sort_unstable_by(|a, b| a.0.min_y().partial_cmp(&b.0.min_y()).unwrap()); +} + +fn split_by<'a>(list: &'a mut [(RectF, usize)], at: &'a [f32], by: impl Fn(&RectF) -> f32) -> impl Iterator { + SplitBy { + data: list, + points: at.iter().cloned(), + by, + end: false + } +} + +struct SplitBy<'a, I, F> { + data: &'a mut [(RectF, usize)], + points: I, + by: F, + end: bool, +} +impl<'a, I, F> Iterator for SplitBy<'a, I, F> where + I: Iterator, + F: Fn(&RectF) -> f32 +{ + type Item = &'a mut [(RectF, usize)]; + fn next(&mut self) -> Option { + if self.end { + return None; + } + match self.points.next() { + Some(p) => { + let idx = self.data.iter().position(|(ref r, _)| (self.by)(r) > p).unwrap_or(self.data.len()); + let (head, tail) = take(&mut self.data).split_at_mut(idx); + self.data = tail; + Some(head) + }, + None => { + self.end = true; + Some(take(&mut self.data)) + } + } + } +} diff --git a/src/node/gap.rs b/src/node/gap.rs new file mode 100644 index 0000000..189580e --- /dev/null +++ b/src/node/gap.rs @@ -0,0 +1,133 @@ +use ordered_float::NotNan; +use pathfinder_geometry::rect::RectF; + +/// Find all the gaps in boxes +pub fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator + 
'a { + let mut boxes = boxes.iter(); + let &(ref r, _) = boxes.next().unwrap(); + let (_, mut last_max) = span(r); + + boxes.enumerate().filter_map(move |(idx, &(ref r, _))| { + let (min, max) = span(&r); + let r = if min > last_max { + Some((last_max, min, idx+1)) + } else { + None + }; + last_max = max.max(last_max); + r + }) +} + +/// Find the midpoint of every gap in boxes that is at least as wide as the threshold. +pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator + 'a { + let mut boxes = boxes.iter(); + let &(ref r, _) = boxes.next().unwrap(); + let (_, mut last_max) = span(r); + boxes.filter_map(move |&(ref r, _)| { + let (min, max) = span(&r); + let r = if min - last_max >= threshold { + // The middle position of the gap + Some(0.5 * (last_max + min)) + } else { + None + }; + last_max = max.max(last_max); + r + }) +} + +/// Return the size of the largest gap and its middle position. +pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> { + gap_list(boxes, span) + .max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap()) + .map(|(a, b, _)| (b - a, 0.5 * (a + b))) +} + +pub fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { + max_gap(boxes, |r| (r.min_x(), r.max_x())) +} +pub fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { + max_gap(boxes, |r| (r.min_y(), r.max_y())) +} + +pub fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option, Option) { + let num_boxes = boxes.len(); + if num_boxes < 2 { + return (None, None); + } + + let mut gaps = gap_list(boxes, |r| ( + // top left y + r.min_y(), + // bottom right y + r.max_y() + )); + let top_limit = bbox.min_y() + bbox.height() * 0.2; + let bottom_limit = bbox.min_y() + bbox.height() * 0.8; + + match gaps.next() { + Some((y, _, top)) if y < top_limit => { + match gaps.last() { + Some((y, _, bottom)) if y > bottom_limit => (Some(top), Some(bottom)), + _ => (Some(top), 
None) + } + } + Some((y, _, bottom)) if y > bottom_limit => (None, Some(bottom)), + _ => (None, None) + } +} + +pub fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option, Option) { + let num_boxes = boxes.len(); + if num_boxes < 2 { + return (None, None); + } + + let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x())); + let left_limit = bbox.min_x() + bbox.width() * 0.2; + let right_limit = bbox.min_x() + bbox.width() * 0.8; + match gaps.next() { + Some((x, _, left)) if x < left_limit => { + match gaps.last() { + Some((x, _, right)) if x > right_limit => (Some(left), Some(right)), + _ => (Some(left), None) + } + } + Some((x, _, right)) if x > right_limit => (None, Some(right)), + _ => (None, None) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use pathfinder_geometry::rect::RectF; + use pathfinder_geometry::vector::Vector2F; + + #[test] + fn test_the_gaps_method() { + // 3 horizontal rectangles + let boxes = vec![ + (RectF::from_points(Vector2F::new(0.0, 0.0), Vector2F::new(10.0, 10.0)), 1), // Rectangle 1 + (RectF::from_points(Vector2F::new(12.0, 0.0), Vector2F::new(22.0, 10.0)), 2), // Rectangle 2 (gap from 10 to 12) + (RectF::from_points(Vector2F::new(25.0, 0.0),Vector2F::new( 35.0, 10.0)), 3), // Rectangle 3 (gap from 22 to 25) + ]; + + // Define the threshold for gap detection + let threshold = 2.0; + + // Define the span function (maps rectangles to their min and max x-coordinates) + let span = |rect: &RectF| (rect.min_x(), rect.max_x()); + + // Call the gaps function + let gaps: Vec = gaps(threshold, &boxes, span).collect(); + + // Expected gaps are the midpoints of the gaps: [(10+12)/2 = 11, (22+25)/2 = 23.5] + let expected_gaps = vec![11.0, 23.5]; + + // Assert that the results match the expected values + assert_eq!(gaps, expected_gaps); + } +} \ No newline at end of file diff --git a/src/node/line.rs b/src/node/line.rs new file mode 100644 index 0000000..46cc0d0 --- /dev/null +++ b/src/node/line.rs @@ -0,0 +1,159 @@ + +use 
std::collections::BTreeSet; +use ordered_float::NotNan; +use pathfinder_geometry::rect::RectF; + +use crate::util::avg; + +use super::{sort_x, sort_y, Node, NodeTag}; + +pub fn analyze_lines(lines: &[[f32; 4]]) -> Lines { + let mut hlines = BTreeSet::new(); + let mut vlines = BTreeSet::new(); + + for &[x1, y1, x2, y2] in lines { + if x1 == x2 { + vlines.insert(NotNan::new(x1).unwrap()); + } else if y1 == y2 { + hlines.insert(NotNan::new(y1).unwrap()); + } + } + + let hlines = dedup(hlines.iter().cloned()); + let vlines = dedup(vlines.iter().cloned()); + + let mut line_grid = vec![false; vlines.len() * hlines.len()]; + for &[x1, y1, x2, y2] in lines { + // vertical line + if x1 == x2 { + let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len()); + let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len()); + let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len()); + for h in h_start .. h_end { + line_grid[v_idx * hlines.len() + h] = true; + } + } + // horizontal line + else if y1 == y2 { + let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len()); + let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len()); + let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len()); + for v in v_start .. v_end { + line_grid[v * hlines.len() + h_idx] = true; + } + } + } + + //println!("hlines: {:?}", hlines); + //println!("vlines: {:?}", vlines); + + Lines { hlines, vlines, line_grid } +} + +/// Group lines that are consecutive within a distance of 10.0. 
+fn dedup(lines: impl Iterator>) -> Vec<(f32, f32)> { + let threshold = 10.0; + let mut out = vec![]; + let mut lines = lines.map(|f| *f).peekable(); + while let Some(start) = lines.next() { + let mut last = start; + while let Some(&p) = lines.peek() { + if last + threshold > p { + last = p; + lines.next(); + } else { + break; + } + } + out.push((start, last)); + } + out +} + +#[derive(Debug)] +pub struct Lines { + pub hlines: Vec<(f32, f32)>, + pub vlines: Vec<(f32, f32)>, + pub line_grid: Vec, +} + +/// Deals with things like superscript and subscript, which fall outside the usual bounds +/// but need to be assigned to the correct line. +/// +/// example, two lines: +/// hello world +/// m³2 test a number℡ +pub fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node { + sort_y(boxes); + let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap(); + + let mut y_center = boxes[0].0.center().y(); + let mut lines = vec![]; + let mut y_splits = vec![]; + + let mut start = 0; + 'a: loop { + for (i, &(r, _)) in boxes[start..].iter().enumerate() { + // Superscript + if r.center().y() > 0.5 * avg_height + y_center { + let end = start + i; + sort_x(&mut boxes[start..end]); + let bbox = boxes[start..end].iter().map(|&(r, _)| r).reduce(|a, b| a.union_rect(b)).unwrap(); + + y_splits.push(bbox.max_y()); + lines.push(Node::singleton(&boxes[start..end])); + y_center = r.center().y(); + + start = end; + continue 'a; + } + } + + sort_x(&mut boxes[start..]); + lines.push(Node::singleton(&boxes[start..])); + + break; + } + match lines.len() { + 0 => Node::singleton(&[]), + 1 => lines.pop().unwrap(), + _ => Node::Grid { + x: vec![], + y: y_splits, + cells: lines, + tag: NodeTag::Paragraph + } + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use ordered_float::NotNan; + + #[test] + fn test_dedup() { + // Input data: A series of sorted `NotNan` values + let input = vec![ + NotNan::new(1.0).unwrap(), + NotNan::new(5.0).unwrap(), + NotNan::new(8.0).unwrap(), + 
NotNan::new(12.0).unwrap(), + + NotNan::new(25.0).unwrap(), + NotNan::new(28.0).unwrap(), + ]; + + // Call the dedup function + let result = dedup(input.into_iter()); + + // Expected output: + // (1.0, 12.0): All values between 1.0 and 12.0 are within a threshold of 10.0. + // (25.0, 28.0): 25.0 and 28.0 are within a threshold of 10.0. + let expected = vec![(1.0, 12.0), (25.0, 28.0)]; + + // Assert that the result matches the expected output + assert_eq!(result, expected); + } +} \ No newline at end of file diff --git a/src/node/render.rs b/src/node/render.rs new file mode 100644 index 0000000..618e581 --- /dev/null +++ b/src/node/render.rs @@ -0,0 +1,71 @@ +use font::Encoder; +use itertools::Itertools; +use pathfinder_geometry::{rect::RectF, vector::Vector2F}; +use pdf_render::TextSpan; + +use crate::classify::classify; + +use super::Node; + +pub fn render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF) { + _render(w, spans, node, bbox, 0) +} + +fn _render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: usize) { + use std::fmt::Write; + + match *node { + Node::Final { ref indices } => { + /* + for i in start..end { + if let Span::Text(ref t) = spans[i] { + write!(w, r#"").unwrap(); + } + } + */ + + if indices.len() > 0 { + let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i))); + + for &i in indices.iter() { + let r = spans[i].rect; + write!(w, r#""#, + r.min_x(), r.max_x(), r.max_y(), r.max_y(), + class + ); + } + } + } + Node::Grid { ref x, ref y, ref cells, tag } => { + use std::iter::once; + let columns = x.len() + 1; + write!(w, r#""#, + bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag + ); + + for (j, ((min_y, max_y), row)) in once(bbox.min_y()).chain(y.iter().cloned()).chain(once(bbox.max_y())).tuple_windows().zip(cells.chunks_exact(columns)).enumerate() { + if j > 0 { + writeln!(w, r#""#, + bbox.min_x(), bbox.max_x(), min_y, min_y); + } + + for (i, ((min_x, max_x), cell)) in 
once(bbox.min_x()).chain(x.iter().cloned()).chain(once(bbox.max_x())).tuple_windows().zip(row).enumerate() { + if i > 0 { + writeln!(w, r#""#, + min_x, min_x, bbox.min_y(), bbox.max_y()); + } + + let bbox = RectF::from_points(Vector2F::new(min_x, min_y), Vector2F::new(max_x, max_y)); + _render(w, spans, cell, bbox, level+1); + } + } + } + Node::Table { .. } => { + + } + } +} diff --git a/src/node/table.rs b/src/node/table.rs new file mode 100644 index 0000000..4d433f3 --- /dev/null +++ b/src/node/table.rs @@ -0,0 +1,253 @@ +use font::Encoder; +use pathfinder_geometry::rect::RectF; +use pdf_render::TextSpan; +use itertools::Itertools; +use ordered_float::NotNan; +use crate::{node::{sort_x, sort_y, NodeTag}, util::avg}; +use super::{gap::{dist_y, gaps}, line::Lines, split_by, Node}; + +pub use table::Table; + +pub fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines_info: &Lines) -> Node { + sort_y(boxes); + let mut lines = vec![]; + let mut y = Span::vert(&boxes[0].0).unwrap(); + let mut items = vec![boxes[0]]; + + let mut line = vec![boxes[0]]; + for &(rect, i) in &boxes[1..] 
{ + let y2 = Span::vert(&rect).unwrap(); + if let Some(overlap) = y.intersect(y2) { + y = overlap; + } else { + sort_x(&mut line); + lines.push(build_line(&line, spans)); + line.clear(); + y = y2 + } + line.push((rect, i)); + } + sort_x(&mut line); + lines.push(build_line(&line, spans)); + + + let mut vparts = vec![]; + let mut start = 0; + while let Some(p) = lines[start..].iter().position(|(tag, _, line)| matches!(tag, LineTag::Unknown | LineTag::Table)) { + let table_start = start + p; + let table_end = lines[table_start+1..].iter().position(|(tag, _, _)| matches!(tag, LineTag::Text)).map(|e| table_start+1+e).unwrap_or(lines.len()); + + for &(_, y, ref line) in &lines[start..table_start] { + vparts.push((y, Node::Final { indices: line.iter().flat_map(|(_, indices)| indices.iter().cloned()).collect() })); + } + + let lines = &lines[table_start..table_end]; + start = table_end; + + let mut columns: Vec = vec![]; + for (_, _, line) in lines.iter() { + for &(x, ref parts) in line.iter() { + // find any column that is contained in this + let mut found = 0; + for span in columns.iter_mut() { + if let Some(overlap) = span.intersect(x) { + *span = overlap; + found += 1; + } + } + if found == 0 { + columns.push(x); + } + } + } + let avg_vgap = avg(lines.iter().map(|(_, y, _)| y).tuple_windows().map(|(a, b)| *(b.start - a.end))); + + columns.sort_by_key(|s| s.start); + + let mut buf = String::new(); + + let d_threshold = avg_vgap.unwrap_or(0.0); + let mut prev_end = None; + + let mut table: Table> = Table::empty(lines.len() as u32, columns.len() as u32); + + let mut row = 0; + for (_, span, line) in lines { + let mut col = 0; + + let combine = prev_end.map(|y: NotNan| { + if *(span.start - y) < d_threshold { + !lines_info.hlines.iter().map(|(a, b)| 0.5 * (a+b)).any(|l| *y < l && *span.start > l) + } else { + false + } + }).unwrap_or(false); + + if !combine { + row += 1; + } + + for &(x, ref parts) in line { + let mut cols = columns.iter().enumerate() + .filter(|&(_, &x2)| 
x.intersect(x2).is_some()) + .map(|(i, _)| i); + + let first_col = cols.next().unwrap(); + let last_col = cols.last().unwrap_or(first_col); + + if let Some(cell) = combine.then(|| table.get_cell_value_mut(row, first_col as u32)).flatten() { + // append to previous line + cell.extend_from_slice(parts); + } else { + let colspan = (last_col - first_col) as u32 + 1; + let rowspan = 1; + table.set_cell(parts.clone(), row, first_col as u32, rowspan, colspan); + } + col = last_col + 1; + } + prev_end = Some(span.end); + } + let y = Span { start: lines[0].1.start, end: lines.last().unwrap().1.end }; + vparts.push((y, Node::Table { table })); + } + for &(_, y, ref line) in &lines[start..] { + vparts.push((y, Node::Final { indices: line.iter().flat_map(|(_, indices)| indices.iter().cloned()).collect() })); + } + + if vparts.len() > 1 { + let y = vparts.iter().tuple_windows().map(|(a, b)| 0.5 * (a.0.end + b.0.start).into_inner()).collect(); + Node::Grid { + tag: NodeTag::Complex, + x: vec![], + y, + cells: vparts.into_iter().map(|(_, n)| n).collect() + } + } else { + vparts.pop().unwrap().1 + } +} + +#[derive(Debug)] +enum LineTag { + Unknown, + Text, + Table, +} + +fn build_line(boxes: &[(RectF, usize)], spans: &[TextSpan]) -> (LineTag, Span, Vec<(Span, Vec)>) { + use std::mem::replace; + let mut line = vec![]; + let mut x = Span::horiz(&boxes[0].0).unwrap(); + let mut y = Span::vert(&boxes[0].0).unwrap(); + let mut items = vec![boxes[0].1]; + + for &(rect, i) in &boxes[1..] 
{ + y = y.union(Span::vert(&rect).unwrap()).unwrap(); + let x2 = Span::horiz(&rect).unwrap(); + if let Some(u) = x.union(x2) { + x = u; + items.push(i); + } else { + line.push((x, replace(&mut items, vec![i]))); + x = x2; + } + } + line.push((x, items)); + + let avg_font_size = avg(boxes.iter().filter_map(|&(_, i)| spans.get(i)).map(|s| s.font_size)).unwrap(); + + let max_gap = line.iter().tuple_windows().map(|(l, r)| r.0.start - l.0.end).max(); + let tag = match max_gap { + None => LineTag::Unknown, + Some(x) if x.into_inner() < 0.3 * avg_font_size => LineTag::Text, + Some(_) => LineTag::Table, + }; + + (tag, y, line) +} + +#[derive(Copy, Clone, Debug)] +struct Span { + start: NotNan, + end: NotNan, +} +impl Span { + fn horiz(rect: &RectF) -> Option { + Self::new(rect.min_x(), rect.max_x()) + } + fn vert(rect: &RectF) -> Option { + Self::new(rect.min_y(), rect.max_y()) + } + fn new(mut start: f32, mut end: f32) -> Option { + if start > end { + std::mem::swap(&mut start, &mut end); + } + Some(Span { + start: NotNan::new(start).ok()?, + end: NotNan::new(end).ok()?, + }) + } + // Whether two vertical or horizontal lines overlap, return the intersection. 
+ fn intersect(self, other: Span) -> Option { + if self.start <= other.end && other.start <= self.end { + Some(Span { + start: self.start.max(other.start), + end: self.end.min(other.end), + }) + } else { + None + } + } + + fn union(self, other: Span) -> Option { + if self.start <= other.end && other.start <= self.end { + Some(Span { + start: self.start.min(other.start), + end: self.end.max(other.end) + }) + } else { + None + } + } +} + +#[allow(dead_code)] +fn split_v(boxes: &mut [(RectF, usize)]) -> Node { + let num_boxes = boxes.len(); + if num_boxes < 2 { + return Node::singleton(boxes) + } + + let max_y_gap = dist_y(boxes); + + let max_gap = match max_y_gap { + Some((y, _)) => y, + None => { + sort_x(boxes); + return Node::singleton(boxes); + } + }; + let threshold = max_gap * 0.8; + let mut cells = vec![]; + + let y_gaps: Vec = gaps(threshold, boxes, |r| (r.min_y(), r.max_y())) + .collect(); + + for row in split_by(boxes, &y_gaps, |r| r.min_y()) { + assert!(row.len() < num_boxes); + cells.push(split_v(row)); + } + + let tag = if cells.iter().all(|n| n.tag() <= NodeTag::Line) { + NodeTag::Paragraph + } else { + NodeTag::Complex + }; + + Node::Grid { + x: vec![], + y: y_gaps, + cells, + tag, + } +} \ No newline at end of file diff --git a/src/text.rs b/src/text.rs index 2f6e6cd..e31fe76 100644 --- a/src/text.rs +++ b/src/text.rs @@ -1,15 +1,202 @@ +use std::mem::take; + +use font::Encoder; use pathfinder_geometry::vector::Vector2F; use pdf_render::TextSpan; -use itertools::{Itertools}; +use itertools::Itertools; use unicode_normalization::UnicodeNormalization; -use crate::{util::avg, entry::Word, util::Rect}; +use crate::{flow::{Char, Rect, Word}, util::avg}; + +pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator> + Clone) -> Vec { + let word_gap = analyze_word_gap(items.clone()); + let mut words = Vec::new(); + let mut current_word = WordBuilder::new(out.len(), 0.0); + + // Whether the last processed TextChar is a whitespace + // ' ' 
Space + // '\t' Tab + // '\n' Line feed + // '\r' Carriage return + // '\u{00A0}' Non-breaking space + let mut trailing_space = out.chars().last().map_or(true, |c| c.is_whitespace()); + + let mut end = 0.; // trailing edge of the last char + + for span in items { + let mut offset = 0; + let tr_inv = span.transform.matrix.inverse(); + // Device space to em space + let x_off = (tr_inv * span.transform.vector).x(); + + let mut chars = span.chars.iter().peekable(); -pub fn concat_text<'a>(out: &mut String, items: impl Iterator + Clone) -> Vec { - let mut words = vec![]; + while let Some(current) = chars.next() { + // Get text for current char + let text = if let Some(next) = chars.peek() { + let s = &span.text[offset..next.offset]; + offset = next.offset; + s + } else { + &span.text[offset..] + }; + + // Calculate char positions in device space + let char_start = (span.transform * Vector2F::new(current.pos, 0.0)).x(); + let char_end = (span.transform * Vector2F::new(current.pos + current.width, 0.0)).x(); + + let is_whitespace = text.chars().all(|c| c.is_whitespace()); + + // byte offsets + let bytes_offset = text.len(); + // Handle word boundaries + if trailing_space && !is_whitespace { + // Start new word after space + current_word = WordBuilder::new(out.len(),char_start); + current_word.add_char(bytes_offset, char_start, char_end); + + out.push_str(text); + } else if !trailing_space { + if is_whitespace { + // End word at space + words.push(current_word.build(out)); + + out.push(' '); + current_word = WordBuilder::new(out.len(),char_start); + } else if current.pos + x_off > end + word_gap { + + // End word at large gap + words.push(current_word.build(out)); + + current_word = WordBuilder::new(out.len(), char_start); + current_word.add_char(bytes_offset, char_start, char_end); + + out.push_str(text); + } else { + // Continue current word + current_word.add_char(bytes_offset, char_start, char_end); + + // out.extend(text.nfkc()); // nfkc will change the bytes length 
of a char. + out.push_str(text); + } + } + trailing_space = is_whitespace; + + end = current.pos + x_off + current.width; + + current_word.update_bounds(span.rect.min_y(), span.rect.max_y()); + } + } + + // Add final word if any + if !current_word.is_empty() { + words.push(current_word.build(out)); + } + + words +} + +// Helper struct to build up words +struct WordBuilder { + word_start_idx: usize, + + // For calculating the layout(position, width , height) of a word + start_pos: f32, + end_pos: f32, + y_min: f32, + y_max: f32, + + chars: Vec, + bytes_offset: usize, + + // New word + new: bool, +} + +impl WordBuilder { + fn new(word_start_idx: usize, start_pos: f32) -> Self { + Self { + word_start_idx, + start_pos, + end_pos: 0.0, + y_min: f32::INFINITY, + y_max: -f32::INFINITY, + chars: Vec::new(), + bytes_offset: 0, + new: true, + } + } + + fn add_char(&mut self, bytes_offset: usize, start: f32, end: f32) { + self.chars.push(Char { + offset: self.bytes_offset, + pos: start, + width: end - start, + }); + self.end_pos = end; + + self.bytes_offset += bytes_offset; + } + + fn update_bounds(&mut self, min_y: f32, max_y: f32) { + if self.new { + self.y_min = min_y; + self.y_max = max_y; + + self.new = false; + } else { + self.y_min = self.y_min.min(min_y); + self.y_max = self.y_max.max(max_y); + } + } + + fn is_empty(&self) -> bool { + self.chars.is_empty() + } + + fn build(mut self, out: &str) -> Word { + Word { + text: out[self.word_start_idx..].into(), + rect: Rect { + x: self.start_pos, + y: self.y_min, + h: self.y_max - self.y_min, + w: self.end_pos - self.start_pos + }, + chars: take(&mut self.chars) + } + } +} + +/// Calculate gaps between each char, the return value unit is em + +/// The most important thing here is to make sure the gap is bigger than char gap, and less than word gap. 
+/// for example: +/// think of something like "ab____________c de" +/// +/// a-b has a zero space (or 0.01) +/// b-c has a huge space of 10 +/// c-d has 0.2 +/// d-e has 0.01 +/// if we just take the average = 10.2 and divide that by 4 we get 2.5 +/// and now c-d is smaller than that and not classified as a space +/// but if b-c is capped by the threshold of 0.5, the sum is 0.7, and the avg is 0.7/4 ~ 0.18 +/// and everything is fine. + +/// 0 + min(0.5, 10) + 0.2 + 0 +/// 10 capped at 0.5 is0.5 +/// min(0, 0.5) + min(10, 0.5) + min(0.2, 0.5) + min(0, 0.5) +/// 0 + 0.5 + 0.2 + 0 +/// every value is limited to be at least 0.01 and not more than 0.5. +/// the 0.5 is 0.25 * font size of the left char and 0.25 * font size of the right char +/// if they are the same font size it is 0.5 +fn analyze_word_gap<'a, E: Encoder + 'a>(items: impl Iterator> + Clone) -> f32 { let gaps = items.clone() .flat_map(|s| { + // the transform matrix is from em space to device space + // so we need to invert it, becoming device space to em space let tr_inv = s.transform.matrix.inverse(); let pos = (tr_inv * s.transform.vector).x(); + s.chars.iter() .filter(|c| !s.text[c.offset..].chars().next().unwrap().is_whitespace()) .map(move |c| (c.pos + pos, c.pos + pos + c.width, s.font_size)) @@ -17,78 +204,82 @@ pub fn concat_text<'a>(out: &mut String, items: impl Iterator .tuple_windows() .filter(|(a, b)| b.0 > a.0) .map(|(a, b)| (b.0 - a.1).max(0.01).min(0.25 * (a.2 + b.2))); - - let font_size = avg(items.clone().map(|s| s.font_size)).unwrap(); + + let avg_font_size = avg(items.clone().map(|s| s.font_size)).unwrap(); //gaps.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); - let space_gap = (0.5 * font_size).min(2.0 * avg(gaps).unwrap_or(0.0)); //2.0 * gaps[gaps.len()/2]; - let mut end = 0.; // trailing edge of the last char - let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true); - let mut word_start_pos = 0.0; - let mut word_start_idx = out.len(); - let mut 
y_min = f32::INFINITY; - let mut y_max = -f32::INFINITY; - let mut word_start = true; - let mut word_end = 0.0; - for span in items { - let mut pos = 0; // byte index of last char into span.text - let tr_inv = span.transform.matrix.inverse(); - let x_off = (tr_inv * span.transform.vector).x(); - for c in span.chars.iter() { + (0.5 * avg_font_size).min(2.0 * avg(gaps).unwrap_or(0.0)) //2.0 * gaps[gaps.len()/2]; +} - let s = &span.text[pos..c.offset]; - if c.offset > 0 { - let is_whitespace = s.chars().all(|c| c.is_whitespace()); - if !trailing_space || !is_whitespace { - out.extend(s.nfkc()); - } - trailing_space = is_whitespace; - } - if !trailing_space && c.pos + x_off > end + space_gap { - words.push(Word { - text: out[word_start_idx..].into(), - rect: Rect { - x: word_start_pos, - y: y_min, - h: y_max - y_min, - w: word_end - word_start_pos - } - }); - - out.push(' '); - trailing_space = true; - word_start = true; - word_start_idx = out.len(); - } - pos = c.offset; - end = c.pos + x_off + c.width; - if c.offset == 0 || !trailing_space { - word_end = (span.transform.matrix * Vector2F::new(end, 0.0)).x(); - } +#[cfg(test)] +mod tests { + use pathfinder_geometry::{rect::RectF, transform2d::Transform2F}; + use pdf_render::{font::OutlineBuilder, Fill, TextChar}; - if word_start { - y_min = span.rect.min_y(); - y_max = span.rect.max_y(); - word_start_pos = (span.transform.matrix * Vector2F::new(c.pos + x_off, 0.0)).x(); - word_start = false; - } else { - y_min = y_min.min(span.rect.min_y()); - y_max = y_max.max(span.rect.max_y()); + use super::*; + + #[test] + fn test_concat_text() { + let text_span: TextSpan = TextSpan { + rect: RectF::from_points(Vector2F::new(56.8, 55.85077), Vector2F::new(136.26399, 67.85077)), + width: 79.464, + bbox: None, + font_size: 12.0, + font: None, + text: "hello world".to_string(), + chars: vec![ + TextChar { offset: 0, pos: 0.0, width: 7.224001 }, + TextChar { offset: 1, pos: 7.224001, width: 7.224001 }, + TextChar { offset: 2, pos: 
14.448002, width: 7.224001 }, + TextChar { offset: 3, pos: 21.672003, width: 7.224001 }, + TextChar { offset: 4, pos: 28.896004, width: 7.224001 }, + TextChar { offset: 5, pos: 36.120003, width: 7.224001 }, + TextChar { offset: 6, pos: 43.344, width: 7.224001 }, + TextChar { offset: 7, pos: 50.568, width: 7.224001 }, + TextChar { offset: 8, pos: 57.792, width: 7.224001 }, + TextChar { offset: 9, pos: 65.016, width: 7.224001 }, + TextChar { offset: 10, pos: 72.24, width: 7.224001 }, + ], + color: Fill::Solid(0.0, 0.5019608, 0.0), + alpha: 1.0, + transform: Transform2F::row_major(1.0, 0.0, 56.8, 0.0, 1.0, 67.85077), + mode: pdf::content::TextMode::Fill, + op_nr: 18, + }; + + let mut output = String::new(); + let words = concat_text(&mut output, vec![&text_span].into_iter()); + + // Assert the concatenated text + assert_eq!(output, "hello world"); + + // Assert the words + // Expect two words: "hello" and "world" + assert_eq!(words.len(), 2); + assert_eq!(words[0].text, "hello"); + assert_eq!(words[1].text, "world"); + + // Assert chars positions + for w in words { + let text = &w.text; + let mut offset = 0; + + let mut texts = vec![]; + + let mut chars = w.chars.iter().peekable(); + + while let Some(_) = chars.next() { + // Get text for current char + let s = if let Some(next) = chars.peek() { + let s = &text[offset..next.offset]; + offset = next.offset; + s + } else { + &text[offset..] 
+ }; + + texts.push(s); } } - trailing_space = span.text[pos..].chars().all(|c| c.is_whitespace()); - - out.extend(span.text[pos..].nfkc()); } - words.push(Word { - text: out[word_start_idx..].into(), - rect: Rect { - x: word_start_pos, - y: y_min, - h: y_max - y_min, - w: word_end - word_start_pos - } - }); - - words } \ No newline at end of file diff --git a/src/tree.rs b/src/tree.rs deleted file mode 100644 index 2eb696c..0000000 --- a/src/tree.rs +++ /dev/null @@ -1,958 +0,0 @@ -use pdf_render::TextSpan; -use pathfinder_geometry::{ - vector::Vector2F, - rect::RectF -}; -#[cfg(feature="ocr")] -use tesseract_plumbing::Text; - -use std::collections::BTreeSet; -use std::iter::once; -use std::sync::Arc; -use itertools::{Itertools}; -use ordered_float::NotNan; -use crate::entry::{Flow, Line, Run, RunType, Word}; -use crate::util::{is_number, avg, CellContent}; -use crate::text::{concat_text}; -use std::mem::take; -use table::Table; - -pub fn build(spans: &[TextSpan], bbox: RectF, lines: &[[f32; 4]]) -> Node { - if spans.len() == 0 { - return Node::singleton(&[]); - } - - let mut boxes: Vec<(RectF, usize)> = spans.iter().enumerate().map(|(i, t)| (t.rect, i)).collect(); - let mut boxes = boxes.as_mut_slice(); - - let avg_font_size = avg(spans.iter().map(|s| s.font_size)).unwrap(); - let probaby_header = |boxes: &[(RectF, usize)]| { - let class = classify(boxes.iter().filter_map(|&(_, i)| spans.get(i))); - if matches!(class, Class::Header | Class::Number) { - return true; - } - let f = avg(boxes.iter().filter_map(|&(_, i)| spans.get(i)).map(|s| s.font_size)).unwrap(); - f > avg_font_size - }; - let probably_footer = |boxes: &mut [(RectF, usize)]| { - sort_x(boxes); - let x_gaps: Vec = gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x())) - .collect(); - - let count = split_by(boxes, &x_gaps, |r| r.min_x()).filter(|cell| probaby_header(cell)).count(); - count == x_gaps.len() + 1 - }; - - sort_y(boxes); - let (top, bottom) = top_bottom_gap(boxes, bbox); - if let 
Some(bottom) = bottom { - if probably_footer(&mut boxes[bottom..]) { - boxes = &mut boxes[..bottom]; - } - } - if let Some(top) = top { - if probaby_header(&mut boxes[..top]) { - boxes = &mut boxes[top..]; - } - } - sort_x(boxes); - let (left, right) = left_right_gap(boxes, bbox); - if let Some(right) = right { - if probaby_header(&boxes[right..]) { - boxes = &mut boxes[..right]; - } - } - if let Some(left) = left { - if probaby_header(&boxes[..left]) { - boxes = &mut boxes[left..]; - } - } - let lines = analyze_lines(lines); - split(boxes, &spans, &lines) -} - -fn analyze_lines(lines: &[[f32; 4]]) -> Lines { - let mut hlines = BTreeSet::new(); - let mut vlines = BTreeSet::new(); - - for &[x1, y1, x2, y2] in lines { - if x1 == x2 { - vlines.insert(NotNan::new(x1).unwrap()); - } else if y1 == y2 { - hlines.insert(NotNan::new(y1).unwrap()); - } - } - - fn dedup(lines: impl Iterator>) -> Vec<(f32, f32)> { - let threshold = 10.0; - let mut out = vec![]; - let mut lines = lines.map(|f| *f).peekable(); - while let Some(start) = lines.next() { - let mut last = start; - while let Some(&p) = lines.peek() { - if last + threshold > p { - last = p; - lines.next(); - } else { - break; - } - } - out.push((start, last)); - } - out - } - - let hlines = dedup(hlines.iter().cloned()); - let vlines = dedup(vlines.iter().cloned()); - - let mut line_grid = vec![false; vlines.len() * hlines.len()]; - for &[x1, y1, x2, y2] in lines { - if x1 == x2 { - let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len()); - let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len()); - let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len()); - for h in h_start .. 
h_end { - line_grid[v_idx * hlines.len() + h] = true; - } - } else if y1 == y2 { - let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len()); - let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len()); - let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len()); - for v in v_start .. v_end { - line_grid[v * hlines.len() + h_idx] = true; - } - } - } - - - //println!("hlines: {:?}", hlines); - //println!("vlines: {:?}", vlines); - - Lines { hlines, vlines, line_grid } -} - -pub struct Lines { - hlines: Vec<(f32, f32)>, - vlines: Vec<(f32, f32)>, - line_grid: Vec, -} - -#[derive(Copy, Clone, Debug)] -struct Span { - start: NotNan, - end: NotNan, -} -impl Span { - fn horiz(rect: &RectF) -> Option { - Self::new(rect.min_x(), rect.max_x()) - } - fn vert(rect: &RectF) -> Option { - Self::new(rect.min_y(), rect.max_y()) - } - fn new(mut start: f32, mut end: f32) -> Option { - if start > end { - std::mem::swap(&mut start, &mut end); - } - Some(Span { - start: NotNan::new(start).ok()?, - end: NotNan::new(end).ok()?, - }) - } - fn intersect(self, other: Span) -> Option { - if self.start <= other.end && other.start <= self.end { - Some(Span { - start: self.start.max(other.start), - end: self.end.min(other.end), - }) - } else { - None - } - } - fn union(self, other: Span) -> Option { - if self.start <= other.end && other.start <= self.end { - Some(Span { - start: self.start.min(other.start), - end: self.end.max(other.end) - }) - } else { - None - } - } -} - -pub fn split2(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines_info: &Lines) -> Node { - use std::mem::replace; - - #[derive(Debug)] - enum LineTag { - Unknown, - Text, - Table, - } - - sort_y(boxes); - let mut lines = vec![]; - let mut y = Span::vert(&boxes[0].0).unwrap(); - let mut items = vec![boxes[0]]; - - let build_line = |boxes: &[(RectF, usize)]| -> (LineTag, Span, Vec<(Span, Vec)>) { - let mut line = vec![]; - let mut x = 
Span::horiz(&boxes[0].0).unwrap(); - let mut y = Span::vert(&boxes[0].0).unwrap(); - let mut items = vec![boxes[0].1]; - - for &(rect, i) in &boxes[1..] { - y = y.union(Span::vert(&rect).unwrap()).unwrap(); - let x2 = Span::horiz(&rect).unwrap(); - if let Some(u) = x.union(x2) { - x = u; - items.push(i); - } else { - line.push((x, replace(&mut items, vec![i]))); - x = x2; - } - } - line.push((x, items)); - - let f = avg(boxes.iter().filter_map(|&(_, i)| spans.get(i)).map(|s| s.font_size)).unwrap(); - - let max_gap = line.iter().tuple_windows().map(|(l, r)| r.0.start - l.0.end).max(); - let tag = match max_gap { - None => LineTag::Unknown, - Some(x) if x.into_inner() < 0.3 * f => LineTag::Text, - Some(_) => LineTag::Table, - }; - - (tag, y, line) - }; - - let mut line = vec![boxes[0]]; - for &(rect, i) in &boxes[1..] { - let y2 = Span::vert(&rect).unwrap(); - if let Some(overlap) = y.intersect(y2) { - y = overlap; - } else { - sort_x(&mut line); - lines.push(build_line(&line)); - line.clear(); - y = y2 - } - line.push((rect, i)); - } - sort_x(&mut line); - lines.push(build_line(&line)); - - - let mut vparts = vec![]; - let mut start = 0; - while let Some(p) = lines[start..].iter().position(|(tag, _, line)| matches!(tag, LineTag::Unknown | LineTag::Table)) { - let table_start = start + p; - let table_end = lines[table_start+1..].iter().position(|(tag, _, _)| matches!(tag, LineTag::Text)).map(|e| table_start+1+e).unwrap_or(lines.len()); - - for &(_, y, ref line) in &lines[start..table_start] { - vparts.push((y, Node::Final { indices: line.iter().flat_map(|(_, indices)| indices.iter().cloned()).collect() })); - } - - let lines = &lines[table_start..table_end]; - start = table_end; - - let mut columns: Vec = vec![]; - for (_, _, line) in lines.iter() { - for &(x, ref parts) in line.iter() { - // find any column that is contained in this - let mut found = 0; - for span in columns.iter_mut() { - if let Some(overlap) = span.intersect(x) { - *span = overlap; - found += 1; - 
} - } - if found == 0 { - columns.push(x); - } - } - } - let avg_vgap = avg(lines.iter().map(|(_, y, _)| y).tuple_windows().map(|(a, b)| *(b.start - a.end))); - - columns.sort_by_key(|s| s.start); - - let mut buf = String::new(); - - let d_threshold = avg_vgap.unwrap_or(0.0); - let mut prev_end = None; - - let mut table: Table> = Table::empty(lines.len() as u32, columns.len() as u32); - - let mut row = 0; - for (_, span, line) in lines { - let mut col = 0; - - let combine = prev_end.map(|y: NotNan| { - if *(span.start - y) < d_threshold { - !lines_info.hlines.iter().map(|(a, b)| 0.5 * (a+b)).any(|l| *y < l && *span.start > l) - } else { - false - } - }).unwrap_or(false); - - if !combine { - row += 1; - } - - for &(x, ref parts) in line { - let mut cols = columns.iter().enumerate() - .filter(|&(_, &x2)| x.intersect(x2).is_some()) - .map(|(i, _)| i); - - let first_col = cols.next().unwrap(); - let last_col = cols.last().unwrap_or(first_col); - - if let Some(cell) = combine.then(|| table.get_cell_value_mut(row, first_col as u32)).flatten() { - // append to previous line - cell.extend_from_slice(parts); - } else { - let colspan = (last_col - first_col) as u32 + 1; - let rowspan = 1; - table.set_cell(parts.clone(), row, first_col as u32, rowspan, colspan); - } - col = last_col + 1; - } - prev_end = Some(span.end); - } - let y = Span { start: lines[0].1.start, end: lines.last().unwrap().1.end }; - vparts.push((y, Node::Table { table })); - } - for &(_, y, ref line) in &lines[start..] 
{ - vparts.push((y, Node::Final { indices: line.iter().flat_map(|(_, indices)| indices.iter().cloned()).collect() })); - } - - if vparts.len() > 1 { - let y = vparts.iter().tuple_windows().map(|(a, b)| 0.5 * (a.0.end + b.0.start).into_inner()).collect(); - Node::Grid { - tag: NodeTag::Complex, - x: vec![], - y, - cells: vparts.into_iter().map(|(_, n)| n).collect() - } - } else { - vparts.pop().unwrap().1 - } -} - -#[derive(Debug)] -pub enum Node { - Final { indices: Vec }, - Grid { x: Vec, y: Vec, cells: Vec, tag: NodeTag }, - Table { table: Table> }, -} -impl Node { - fn tag(&self) -> NodeTag { - match *self { - Node::Grid { tag, .. } => tag, - Node::Table { .. } => NodeTag::Complex, - Node::Final { .. } => NodeTag::Singleton, - } - } - fn indices(&self, out: &mut Vec) { - match *self { - Node::Final { ref indices } => out.extend_from_slice(&indices), - Node::Grid { ref cells, .. } => { - for n in cells { - n.indices(out); - } - } - Node::Table { ref table } => { - out.extend( - table.values() - .flat_map(|v| v.value.iter()) - .cloned() - ); - } - } - } - fn singleton(nodes: &[(RectF, usize)]) -> Self { - Node::Final { indices: nodes.iter().map(|t| t.1).collect() } - } -} - -#[derive(PartialOrd, Ord, Eq, PartialEq, Clone, Copy, Debug)] -pub enum NodeTag { - Singleton, - Line, - Paragraph, - Complex, -} - -pub fn items(mut flow: &mut Flow, spans: &[TextSpan], node: &Node, x_anchor: f32) { - match *node { - Node::Final { ref indices } => { - if indices.len() > 0 { - let node_spans = indices.iter().flat_map(|&i| spans.get(i)); - let bbox = node_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap(); - let class = classify(node_spans.clone()); - let mut text = String::new(); - let words = concat_text(&mut text, node_spans); - - let t = match class { - Class::Header => RunType::Header, - _ => RunType::Paragraph, - }; - flow.add_line(words, t); - } - } - Node::Grid { ref x, ref y, ref cells, tag } => { - match tag { - NodeTag::Singleton | - NodeTag::Line 
=> { - let mut indices = vec![]; - node.indices(&mut indices); - let line_spans = indices.iter().flat_map(|&i| spans.get(i)); - let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); - - let mut text = String::new(); - let words = concat_text(&mut text, line_spans.clone()); - let class = classify(line_spans.clone()); - - let t = match class { - Class::Header => RunType::Header, - _ => RunType::Paragraph, - }; - flow.add_line(words, t); - } - NodeTag::Paragraph => { - assert_eq!(x.len(), 0); - let mut lines: Vec<(RectF, usize)> = vec![]; - let mut indices = vec![]; - for n in cells { - let start = indices.len(); - n.indices(&mut indices); - if indices.len() > start { - let cell_spans = indices[start..].iter().flat_map(|&i| spans.get(i)); - let bbox = cell_spans.map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); - lines.push((bbox, indices.len())); - } - } - - let para_spans = indices.iter().flat_map(|&i| spans.get(i)); - let class = classify(para_spans.clone()); - let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap(); - let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap(); - // classify the lines by this vertical line - let left_margin = bbox.min_x() + 0.5 * line_height; - - // count how many are right and left of the split. - let mut left = 0; - let mut right = 0; - - for (line_bbox, _) in lines.iter() { - if line_bbox.min_x() >= left_margin { - right += 1; - } else { - left += 1; - } - } - - // typically paragraphs are indented to the right and longer than 2 lines. - // then there will be a higher left count than right count. 
- let indent = left > right; - - let mut para_start = 0; - let mut line_start = 0; - let mut text = String::new(); - let mut para_bbox = RectF::default(); - let mut flow_lines = vec![]; - for &(line_bbox, end) in lines.iter() { - if line_start != 0 { - // if a line is indented (or outdented), it marks a new paragraph - if (line_bbox.min_x() >= left_margin) == indent { - flow.runs.push(Run { - lines: take(&mut flow_lines), - kind: match class { - Class::Header => RunType::Header, - _ => RunType::Paragraph - } - }); - para_start = line_start; - } else { - text.push('\n'); - } - } - if end > line_start { - let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i))); - - if words.len() > 0 { - flow_lines.push(Line { words }); - } - } - if para_start == line_start { - para_bbox = line_bbox; - } else { - para_bbox = para_bbox.union_rect(line_bbox); - } - line_start = end; - } - - flow.runs.push(Run { - lines: flow_lines, - kind: match class { - Class::Header => RunType::Header, - _ => RunType::Paragraph - } - }); - } - NodeTag::Complex => { - let x_anchors = once(x_anchor).chain(x.iter().cloned()).cycle(); - for (node, x) in cells.iter().zip(x_anchors) { - items(flow, spans, node, x); - } - } - } - } - Node::Table { ref table } => { - if let Some(bbox) = table.values() - .flat_map(|v| v.value.iter().flat_map(|&i| spans.get(i).map(|s| s.rect))) - .reduce(|a, b| a.union_rect(b)) { - let table = table.flat_map(|indices| { - if indices.len() == 0 { - None - } else { - let line_spans = indices.iter().flat_map(|&i| spans.get(i)); - let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); - - let mut text = String::new(); - concat_text(&mut text, line_spans.clone()); - Some(CellContent { - text, - rect: bbox.into(), - }) - } - }); - flow.add_table(table); - } - } - } -} - - -pub fn render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF) { - _render(w, spans, node, bbox, 0) -} -fn 
_render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: usize) { - use std::fmt::Write; - - match *node { - Node::Final { ref indices } => { - /* - for i in start..end { - if let Span::Text(ref t) = spans[i] { - write!(w, r#"").unwrap(); - } - } - */ - - if indices.len() > 0 { - let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i))); - - for &i in indices.iter() { - let r = spans[i].rect; - write!(w, r#""#, - r.min_x(), r.max_x(), r.max_y(), r.max_y(), - class - ); - } - } - } - Node::Grid { ref x, ref y, ref cells, tag } => { - use std::iter::once; - let columns = x.len() + 1; - write!(w, r#""#, - bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag - ); - - for (j, ((min_y, max_y), row)) in once(bbox.min_y()).chain(y.iter().cloned()).chain(once(bbox.max_y())).tuple_windows().zip(cells.chunks_exact(columns)).enumerate() { - if j > 0 { - writeln!(w, r#""#, - bbox.min_x(), bbox.max_x(), min_y, min_y); - } - - for (i, ((min_x, max_x), cell)) in once(bbox.min_x()).chain(x.iter().cloned()).chain(once(bbox.max_x())).tuple_windows().zip(row).enumerate() { - if i > 0 { - writeln!(w, r#""#, - min_x, min_x, bbox.min_y(), bbox.max_y()); - } - - let bbox = RectF::from_points(Vector2F::new(min_x, min_y), Vector2F::new(max_x, max_y)); - _render(w, spans, cell, bbox, level+1); - } - } - } - Node::Table { .. 
} => { - - } - } -} - -fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines: &Lines) -> Node { - let num_boxes = boxes.len(); - if num_boxes < 2 { - return Node::singleton(boxes); - } - - sort_x(boxes); - let max_x_gap = dist_x(boxes); - sort_y(boxes); - let max_y_gap = dist_y(boxes); - - let x_y_ratio = 1.0; - - let max_gap = match (max_x_gap, max_y_gap) { - (Some((x, _)), Some((y, _))) => x.max(y * x_y_ratio), - (Some((x, _)), None) => x, - (None, Some((y, _))) => y * x_y_ratio, - (None, None) => { - sort_x(boxes); - return Node::singleton(boxes); - } - }; - let x_threshold = (max_gap * 0.5).max(1.0); - let y_threshold = (max_gap * 0.5 / x_y_ratio).max(0.1); - let mut cells = vec![]; - - let y_gaps: Vec = gaps(y_threshold, boxes, |r| (r.min_y(), r.max_y())) - .collect(); - - sort_x(boxes); - let x_gaps: Vec = gaps(x_threshold, boxes, |r| (r.min_x(), r.max_x())) - .collect(); - - if x_gaps.len() == 0 && y_gaps.len() == 0 { - return overlapping_lines(boxes); - } - - if x_gaps.len() > 1 && y_gaps.len() > 1 { - return split2(boxes, spans, lines); - } - - sort_y(boxes); - for row in split_by(boxes, &y_gaps, |r| r.min_y()) { - - if x_gaps.len() > 0 { - sort_x(row); - for cell in split_by(row, &x_gaps, |r| r.min_x()) { - sort_y(cell); - assert!(cell.len() < num_boxes); - cells.push(split(cell, spans, lines)); - } - } else { - cells.push(split(row, spans, lines)); - } - } - - assert!(x_gaps.len() > 0 || y_gaps.len() > 0); - let tag = if y_gaps.len() == 0 { - if cells.iter().all(|n| n.tag() <= NodeTag::Line) { - NodeTag::Line - } else { - NodeTag::Complex - } - } else if x_gaps.len() == 0 { - if cells.iter().all(|n| n.tag() <= NodeTag::Line) { - NodeTag::Paragraph - } else { - NodeTag::Complex - } - } else { - NodeTag::Complex - }; - - Node::Grid { - x: x_gaps, - y: y_gaps, - cells, - tag, - } -} -#[allow(dead_code)] -fn split_v(boxes: &mut [(RectF, usize)]) -> Node { - let num_boxes = boxes.len(); - if num_boxes < 2 { - return Node::singleton(boxes) - } - - 
let max_y_gap = dist_y(boxes); - - let max_gap = match max_y_gap { - Some((y, _)) => y, - None => { - sort_x(boxes); - return Node::singleton(boxes); - } - }; - let threshold = max_gap * 0.8; - let mut cells = vec![]; - - let y_gaps: Vec = gaps(threshold, boxes, |r| (r.min_y(), r.max_y())) - .collect(); - - for row in split_by(boxes, &y_gaps, |r| r.min_y()) { - assert!(row.len() < num_boxes); - cells.push(split_v(row)); - } - - let tag = if cells.iter().all(|n| n.tag() <= NodeTag::Line) { - NodeTag::Paragraph - } else { - NodeTag::Complex - }; - - Node::Grid { - x: vec![], - y: y_gaps, - cells, - tag, - } -} - -fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option, Option) { - let num_boxes = boxes.len(); - if num_boxes < 2 { - return (None, None); - } - - let mut gaps = gap_list(boxes, |r| (r.min_y(), r.max_y())); - let top_limit = bbox.min_y() + bbox.height() * 0.2; - let bottom_limit = bbox.min_y() + bbox.height() * 0.8; - match gaps.next() { - Some((y, _, top)) if y < top_limit => { - match gaps.last() { - Some((y, _, bottom)) if y > bottom_limit => (Some(top), Some(bottom)), - _ => (Some(top), None) - } - } - Some((y, _, bottom)) if y > bottom_limit => (None, Some(bottom)), - _ => (None, None) - } -} -fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option, Option) { - let num_boxes = boxes.len(); - if num_boxes < 2 { - return (None, None); - } - - let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x())); - let left_limit = bbox.min_x() + bbox.width() * 0.2; - let right_limit = bbox.min_x() + bbox.width() * 0.8; - match gaps.next() { - Some((x, _, left)) if x < left_limit => { - match gaps.last() { - Some((x, _, right)) if x > right_limit => (Some(left), Some(right)), - _ => (Some(left), None) - } - } - Some((x, _, right)) if x > right_limit => (None, Some(right)), - _ => (None, None) - } -} - -fn sort_x(boxes: &mut [(RectF, usize)]) { - boxes.sort_unstable_by(|a, b| a.0.min_x().partial_cmp(&b.0.min_x()).unwrap()); -} -fn 
sort_y(boxes: &mut [(RectF, usize)]) { - boxes.sort_unstable_by(|a, b| a.0.min_y().partial_cmp(&b.0.min_y()).unwrap()); -} -fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node { - sort_y(boxes); - let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap(); - - let mut y_center = boxes[0].0.center().y(); - let mut lines = vec![]; - let mut y_splits = vec![]; - - let mut start = 0; - 'a: loop { - for (i, &(r, _)) in boxes[start..].iter().enumerate() { - if r.center().y() > 0.5 * avg_height + y_center { - let end = start + i; - sort_x(&mut boxes[start..end]); - let bbox = boxes[start..end].iter().map(|&(r, _)| r).reduce(|a, b| a.union_rect(b)).unwrap(); - - y_splits.push(bbox.max_y()); - lines.push(Node::singleton(&boxes[start..end])); - y_center = r.center().y(); - - start = end; - continue 'a; - } - } - - sort_x(&mut boxes[start..]); - lines.push(Node::singleton(&boxes[start..])); - - break; - } - match lines.len() { - 0 => Node::singleton(&[]), - 1 => lines.pop().unwrap(), - _ => Node::Grid { - x: vec![], - y: y_splits, - cells: lines, - tag: NodeTag::Paragraph - } - } -} - -fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator + 'a { - let mut boxes = boxes.iter(); - let &(ref r, _) = boxes.next().unwrap(); - let (_, mut last_max) = span(r); - boxes.enumerate().filter_map(move |(idx, &(ref r, _))| { - let (min, max) = span(&r); - let r = if min > last_max { - Some((last_max, min, idx+1)) - } else { - None - }; - last_max = max.max(last_max); - r - }) -} - -fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator + 'a { - let mut boxes = boxes.iter(); - let &(ref r, _) = boxes.next().unwrap(); - let (_, mut last_max) = span(r); - boxes.filter_map(move |&(ref r, _)| { - let (min, max) = span(&r); - let r = if min - last_max >= threshold { - Some(0.5 * (last_max + min)) - } else { - None - }; - last_max = max.max(last_max); - r - }) -} - -fn 
max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> { - gap_list(boxes, span) - .max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap()) - .map(|(a, b, _)| (b - a, 0.5 * (a + b))) -} - -fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { - max_gap(boxes, |r| (r.min_x(), r.max_x())) -} -fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { - max_gap(boxes, |r| (r.min_y(), r.max_y())) -} -fn split_by<'a>(list: &'a mut [(RectF, usize)], at: &'a [f32], by: impl Fn(&RectF) -> f32) -> impl Iterator { - SplitBy { - data: list, - points: at.iter().cloned(), - by, - end: false - } -} - -struct SplitBy<'a, I, F> { - data: &'a mut [(RectF, usize)], - points: I, - by: F, - end: bool, -} -impl<'a, I, F> Iterator for SplitBy<'a, I, F> where - I: Iterator, - F: Fn(&RectF) -> f32 -{ - type Item = &'a mut [(RectF, usize)]; - fn next(&mut self) -> Option { - if self.end { - return None; - } - match self.points.next() { - Some(p) => { - let idx = self.data.iter().position(|(ref r, _)| (self.by)(r) > p).unwrap_or(self.data.len()); - let (head, tail) = take(&mut self.data).split_at_mut(idx); - self.data = tail; - Some(head) - }, - None => { - self.end = true; - Some(take(&mut self.data)) - } - } - } -} - -use super::util::Tri; -#[derive(Copy, Clone, Debug, PartialEq)] -enum Class { - Number, - Header, - Paragraph, - Mixed, -} - -#[derive(Debug)] -struct TriCount { - tru: usize, - fal: usize, -} -impl TriCount { - fn new() -> Self { - TriCount { - tru: 0, - fal: 0 - } - } - fn add(&mut self, b: bool) { - match b { - false => self.fal += 1, - true => self.tru += 1, - } - } - fn count(&self) -> Tri { - match (self.fal, self.tru) { - (0, 0) => Tri::Unknown, - (0, _) => Tri::True, - (_, 0) => Tri::False, - (f, t) => Tri::Maybe(t as f32 / (t + f) as f32) - } - } -} -fn classify<'a>(spans: impl Iterator) -> Class { - use pdf_render::FontEntry; - - let mut bold = TriCount::new(); - let mut numeric = TriCount::new(); - let mut uniform = 
TriCount::new(); - let mut first_font: *const FontEntry = std::ptr::null(); - - for s in spans { - numeric.add(is_number(&s.text)); - if let Some(ref font) = s.font { - bold.add(font.name.contains("Bold")); - let font_ptr = Arc::as_ptr(font); - if first_font.is_null() { - first_font = font_ptr; - } else { - uniform.add(font_ptr == first_font); - } - } - } - uniform.add(true); - - match (numeric.count(), bold.count(), uniform.count()) { - (Tri::True, _, Tri::True) => Class::Number, - (_, Tri::True, Tri::True) => Class::Header, - (_, Tri::False, Tri::True) => Class::Paragraph, - (_, Tri::False, _) => Class::Paragraph, - (_, Tri::Maybe(_), _) => Class::Paragraph, - _ => Class::Mixed - } -} \ No newline at end of file diff --git a/src/util.rs b/src/util.rs index a68d533..adf68cc 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,7 +1,3 @@ -use pathfinder_geometry::rect::RectF; -use serde::{Serialize, Deserialize}; - - pub fn is_number(s: &str) -> bool { s.len() > 0 && s.chars().all(|c| ('0' ..= '9').contains(&c)) } @@ -18,37 +14,4 @@ pub fn avg(iter: impl Iterator) -> Option { } else { None } -} - -pub enum Tri { - False, - True, - Maybe(f32), - Unknown, -} - -#[derive(Copy, Clone, Debug)] -#[derive(Serialize, Deserialize)] -#[repr(C)] -pub struct Rect { - pub x: f32, - pub y: f32, - pub w: f32, - pub h: f32 -} -impl From for Rect { - fn from(r: RectF) -> Self { - Rect { - x: r.origin_x(), - y: r.origin_y(), - w: r.width(), - h: r.height() - } - } -} - -#[derive(Clone, Debug, Serialize)] -pub struct CellContent { - pub text: String, - pub rect: Rect, } \ No newline at end of file