diff --git a/src/classify.rs b/src/classify.rs index 8a10b94..5033738 100644 --- a/src/classify.rs +++ b/src/classify.rs @@ -5,8 +5,6 @@ use pdf_render::TextSpan; use crate::util::is_number; -use super::util::Tri; - #[derive(Copy, Clone, Debug, PartialEq)] pub enum Class { Number, @@ -15,33 +13,6 @@ pub enum Class { Mixed, } -#[derive(Debug)] -pub struct TriCount { - tru: usize, - fal: usize, -} -impl TriCount { - fn new() -> Self { - TriCount { - tru: 0, - fal: 0 - } - } - fn add(&mut self, b: bool) { - match b { - false => self.fal += 1, - true => self.tru += 1, - } - } - fn count(&self) -> Tri { - match (self.fal, self.tru) { - (0, 0) => Tri::Unknown, - (0, _) => Tri::True, - (_, 0) => Tri::False, - (f, t) => Tri::Maybe(t as f32 / (t + f) as f32) - } - } -} pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator>) -> Class { use pdf_render::FontEntry; @@ -72,4 +43,39 @@ pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator>) (_, Tri::Maybe(_), _) => Class::Paragraph, _ => Class::Mixed } +} + +pub enum Tri { + False, + True, + Maybe(f32), + Unknown, +} + +#[derive(Debug)] +pub struct TriCount { + tru: usize, + fal: usize, +} +impl TriCount { + fn new() -> Self { + TriCount { + tru: 0, + fal: 0 + } + } + fn add(&mut self, b: bool) { + match b { + false => self.fal += 1, + true => self.tru += 1, + } + } + fn count(&self) -> Tri { + match (self.fal, self.tru) { + (0, 0) => Tri::Unknown, + (0, _) => Tri::True, + (_, 0) => Tri::False, + (f, t) => Tri::Maybe(t as f32 / (t + f) as f32) + } + } } \ No newline at end of file diff --git a/src/flow.rs b/src/flow.rs index c203f29..93f7a28 100644 --- a/src/flow.rs +++ b/src/flow.rs @@ -1,5 +1,5 @@ use crate::classify::{classify, Class}; -use crate::tree::{Node, NodeTag}; +use crate::node::{Node, NodeTag}; use crate::util::{avg, CellContent, Rect}; use crate::text::concat_text; use std::iter::once; diff --git a/src/lib.rs b/src/lib.rs index fc4ae70..569a604 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ use pathfinder_geometry::transform2d::Transform2F; use pdf::{backend::Backend, object::{Page, Resolve}, PdfError}; use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder}; -mod tree; +mod node; mod util; mod text; mod classify; @@ -88,16 +88,8 @@ pub fn run(file: &pdf::file::CachedFile, page: &Page, resolve: &i for item in items { visit_item(item); } - - spans.sort_unstable_by(|a, b| a.rect.min_y().partial_cmp(&b.rect.min_y()).unwrap()); - spans.sort_unstable_by(|a, b| a.rect.min_x().partial_cmp(&b.rect.min_x()).unwrap()); - - for s in spans.iter().map(|s|s.text.as_str()) { - println!(":{}", s) - } - - let root = tree::build(&spans, bbox, &lines); + let root = node::build(&spans, bbox, &lines); let mut flow = Flow::new(); flow::build(&mut flow, &spans, &root, bbox.min_x()); diff --git a/src/tree.rs b/src/node.rs similarity index 100% rename from src/tree.rs rename to src/node.rs diff --git a/src/node/gap.rs b/src/node/gap.rs new file mode 100644 index 0000000..49e8212 --- /dev/null +++ b/src/node/gap.rs @@ -0,0 +1,96 @@ +use ordered_float::NotNan; +use pathfinder_geometry::rect::RectF; + +pub fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator + 'a { + let mut boxes = boxes.iter(); + let &(ref r, _) = boxes.next().unwrap(); + let (_, mut last_max) = span(r); + boxes.enumerate().filter_map(move |(idx, &(ref r, _))| { + // top left y, bottom right y + let (min, max) = span(&r); + let r = if min > last_max { + Some((last_max, min, idx+1)) + } else { + None + }; + last_max = max.max(last_max); + r + }) +} + +pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator + 'a { + let mut boxes = boxes.iter(); + let &(ref r, _) = boxes.next().unwrap(); + let (_, mut last_max) = span(r); + boxes.filter_map(move |&(ref r, _)| { + let (min, max) = span(&r); + let r = if min - last_max >= threshold { + Some(0.5 * (last_max + min)) + } else { + None + }; + last_max = max.max(last_max); + r + }) +} + +pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> { + gap_list(boxes, span) + .max_by_key(|&(a, b, _)| NotNan::new(b - a).unwrap()) + .map(|(a, b, _)| (b - a, 0.5 * (a + b))) +} + +pub fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { + max_gap(boxes, |r| (r.min_x(), r.max_x())) +} +pub fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> { + max_gap(boxes, |r| (r.min_y(), r.max_y())) +} + +pub fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option, Option) { + let num_boxes = boxes.len(); + if num_boxes < 2 { + return (None, None); + } + + let mut gaps = gap_list(boxes, |r| ( + // top left y + r.min_y(), + // bottom right y + r.max_y() + )); + let top_limit = bbox.min_y() + bbox.height() * 0.2; + let bottom_limit = bbox.min_y() + bbox.height() * 0.8; + + match gaps.next() { + Some((y, _, top)) if y < top_limit => { + match gaps.last() { + Some((y, _, bottom)) if y > bottom_limit => (Some(top), Some(bottom)), + _ => (Some(top), None) + } + } + Some((y, _, bottom)) if y > bottom_limit => (None, Some(bottom)), + _ => (None, None) + } +} + +pub fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option, Option) { + let num_boxes = boxes.len(); + if num_boxes < 2 { + return (None, None); + } + + let mut gaps = gap_list(boxes, |r| (r.min_x(), r.max_x())); + let left_limit = bbox.min_x() + bbox.width() * 0.2; + let right_limit = bbox.min_x() + bbox.width() * 0.8; + match gaps.next() { + Some((x, _, left)) if x < left_limit => { + match gaps.last() { + Some((x, _, right)) if x > right_limit => (Some(left), Some(right)), + _ => (Some(left), None) + } + } + Some((x, _, right)) if x > right_limit => (None, Some(right)), + _ => (None, None) + } +} diff --git a/src/node/line.rs b/src/node/line.rs new file mode 100644 index 0000000..40a0b2e --- /dev/null +++ b/src/node/line.rs @@ -0,0 +1,116 @@ + +use std::collections::BTreeSet; +use ordered_float::NotNan; +use pathfinder_geometry::rect::RectF; + +use crate::util::avg; + +use super::{sort_x, sort_y, Node, NodeTag}; + +pub fn analyze_lines(lines: &[[f32; 4]]) -> Lines { + let mut hlines = BTreeSet::new(); + let mut vlines = BTreeSet::new(); + + for &[x1, y1, x2, y2] in lines { + if x1 == x2 { + vlines.insert(NotNan::new(x1).unwrap()); + } else if y1 == y2 { + hlines.insert(NotNan::new(y1).unwrap()); + } + } + + fn dedup(lines: impl Iterator>) -> Vec<(f32, f32)> { + let threshold = 10.0; + let mut out = vec![]; + let mut lines = lines.map(|f| *f).peekable(); + while let Some(start) = lines.next() { + let mut last = start; + while let Some(&p) = lines.peek() { + if last + threshold > p { + last = p; + lines.next(); + } else { + break; + } + } + out.push((start, last)); + } + out + } + + let hlines = dedup(hlines.iter().cloned()); + let vlines = dedup(vlines.iter().cloned()); + + let mut line_grid = vec![false; vlines.len() * hlines.len()]; + for &[x1, y1, x2, y2] in lines { + if x1 == x2 { + let v_idx = vlines.iter().position(|&(a, b)| a <= x1 && x1 <= b).unwrap_or(vlines.len()); + let h_start = hlines.iter().position(|&(a, b)| y1 >= a).unwrap_or(hlines.len()); + let h_end = hlines.iter().position(|&(a, b)| y2 <= b).unwrap_or(hlines.len()); + for h in h_start .. h_end { + line_grid[v_idx * hlines.len() + h] = true; + } + } else if y1 == y2 { + let h_idx = hlines.iter().position(|&(a, b)| a <= y1 && y1 <= b).unwrap_or(hlines.len()); + let v_start = vlines.iter().position(|&(a, b)| x1 >= a).unwrap_or(vlines.len()); + let v_end = vlines.iter().position(|&(a, b)| x2 <= b).unwrap_or(vlines.len()); + for v in v_start .. v_end { + line_grid[v * hlines.len() + h_idx] = true; + } + } + } + + + //println!("hlines: {:?}", hlines); + //println!("vlines: {:?}", vlines); + + Lines { hlines, vlines, line_grid } +} + +pub struct Lines { + pub hlines: Vec<(f32, f32)>, + pub vlines: Vec<(f32, f32)>, + pub line_grid: Vec, +} + +pub fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node { + sort_y(boxes); + let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap(); + + let mut y_center = boxes[0].0.center().y(); + let mut lines = vec![]; + let mut y_splits = vec![]; + + let mut start = 0; + 'a: loop { + for (i, &(r, _)) in boxes[start..].iter().enumerate() { + if r.center().y() > 0.5 * avg_height + y_center { + let end = start + i; + sort_x(&mut boxes[start..end]); + let bbox = boxes[start..end].iter().map(|&(r, _)| r).reduce(|a, b| a.union_rect(b)).unwrap(); + + y_splits.push(bbox.max_y()); + lines.push(Node::singleton(&boxes[start..end])); + y_center = r.center().y(); + + start = end; + continue 'a; + } + } + + sort_x(&mut boxes[start..]); + lines.push(Node::singleton(&boxes[start..])); + + break; + } + match lines.len() { + 0 => Node::singleton(&[]), + 1 => lines.pop().unwrap(), + _ => Node::Grid { + x: vec![], + y: y_splits, + cells: lines, + tag: NodeTag::Paragraph + } + } +} \ No newline at end of file diff --git a/src/node/render.rs b/src/node/render.rs new file mode 100644 index 0000000..618e581 --- /dev/null +++ b/src/node/render.rs @@ -0,0 +1,71 @@ +use font::Encoder; +use itertools::Itertools; +use pathfinder_geometry::{rect::RectF, vector::Vector2F}; +use pdf_render::TextSpan; + +use crate::classify::classify; + +use super::Node; + +pub fn render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF) { + _render(w, spans, node, bbox, 0) +} + +fn _render(w: &mut String, spans: &[TextSpan], node: &Node, bbox: RectF, level: usize) { + use std::fmt::Write; + + match *node { + Node::Final { ref indices } => { + /* + for i in start..end { + if let Span::Text(ref t) = spans[i] { + write!(w, r#"").unwrap(); + } + } + */ + + if indices.len() > 0 { + let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i))); + + for &i in indices.iter() { + let r = spans[i].rect; + write!(w, r#""#, + r.min_x(), r.max_x(), r.max_y(), r.max_y(), + class + ); + } + } + } + Node::Grid { ref x, ref y, ref cells, tag } => { + use std::iter::once; + let columns = x.len() + 1; + write!(w, r#""#, + bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag + ); + + for (j, ((min_y, max_y), row)) in once(bbox.min_y()).chain(y.iter().cloned()).chain(once(bbox.max_y())).tuple_windows().zip(cells.chunks_exact(columns)).enumerate() { + if j > 0 { + writeln!(w, r#""#, + bbox.min_x(), bbox.max_x(), min_y, min_y); + } + + for (i, ((min_x, max_x), cell)) in once(bbox.min_x()).chain(x.iter().cloned()).chain(once(bbox.max_x())).tuple_windows().zip(row).enumerate() { + if i > 0 { + writeln!(w, r#""#, + min_x, min_x, bbox.min_y(), bbox.max_y()); + } + + let bbox = RectF::from_points(Vector2F::new(min_x, min_y), Vector2F::new(max_x, max_y)); + _render(w, spans, cell, bbox, level+1); + } + } + } + Node::Table { .. } => { + + } + } +} diff --git a/src/util.rs b/src/util.rs index a68d533..f0b7ada 100644 --- a/src/util.rs +++ b/src/util.rs @@ -1,7 +1,6 @@ use pathfinder_geometry::rect::RectF; use serde::{Serialize, Deserialize}; - pub fn is_number(s: &str) -> bool { s.len() > 0 && s.chars().all(|c| ('0' ..= '9').contains(&c)) } @@ -20,13 +19,6 @@ pub fn avg(iter: impl Iterator) -> Option { } } -pub enum Tri { - False, - True, - Maybe(f32), - Unknown, -} - #[derive(Copy, Clone, Debug)] #[derive(Serialize, Deserialize)] #[repr(C)]