Skip to content

Commit

Permalink
Reorganize the tree into several small files
Browse files Browse the repository at this point in the history
  • Loading branch information
vidy committed Dec 8, 2024
1 parent 509df96 commit 5182bab
Showing 1 changed file with 8 additions and 268 deletions.
276 changes: 8 additions & 268 deletions src/tree.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use pdf_render::TextSpan;
use pathfinder_geometry::{
vector::Vector2F,
rect::RectF
};
mod gap;
mod line;
mod render;

use std::collections::BTreeSet;
use gap::{dist_x, dist_y, gaps, left_right_gap, top_bottom_gap};
use line::{analyze_lines, overlapping_lines, Lines};
use pdf_render::TextSpan;
use pathfinder_geometry::rect::RectF;

use itertools::Itertools;
use ordered_float::NotNan;
Expand Down Expand Up @@ -37,7 +38,7 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]])
};
let probably_footer = |boxes: &mut [(RectF, usize)]| {
sort_x(boxes);
let x_gaps: Vec<f32> = gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x()))
let x_gaps: Vec<f32> = gap::gaps(avg_font_size, boxes, |r| (r.min_x(), r.max_x()))
.collect();

let count = split_by(boxes, &x_gaps, |r| r.min_x()).filter(|cell| probably_header(cell)).count();
Expand Down Expand Up @@ -72,71 +73,6 @@ pub fn build<E: Encoder>(spans: &[TextSpan<E>], bbox: RectF, lines: &[[f32; 4]])
split(boxes, &spans, &lines)
}

/// Builds a [`Lines`] index from raw line segments.
///
/// Each entry of `lines` is read as `[x1, y1, x2, y2]`. Segments with
/// `x1 == x2` are treated as vertical, segments with `y1 == y2` (and
/// `x1 != x2`) as horizontal; every other segment is ignored.
///
/// The distinct coordinates of each orientation are merged into `(start, last)`
/// bands (see the nested `dedup`), and `line_grid` records which
/// (vertical band, horizontal band) combinations are touched by a segment.
fn analyze_lines(lines: &[[f32; 4]]) -> Lines {
    /// Merges ascending coordinates into `(start, last)` bands: a coordinate
    /// joins the current band while it is less than `threshold` above the
    /// band's most recent member.
    fn dedup(lines: impl Iterator<Item=NotNan<f32>>) -> Vec<(f32, f32)> {
        let threshold = 10.0;
        let mut bands: Vec<(f32, f32)> = vec![];
        for coord in lines.map(|f| *f) {
            if let Some(band) = bands.last_mut() {
                if band.1 + threshold > coord {
                    band.1 = coord;
                    continue;
                }
            }
            bands.push((coord, coord));
        }
        bands
    }

    // The equality tests guarantee the inserted value is not NaN, so the
    // `unwrap` calls below cannot fail.
    let vertical_xs: BTreeSet<NotNan<f32>> = lines
        .iter()
        .filter(|&&[x1, _, x2, _]| x1 == x2)
        .map(|&[x1, ..]| NotNan::new(x1).unwrap())
        .collect();
    let horizontal_ys: BTreeSet<NotNan<f32>> = lines
        .iter()
        .filter(|&&[x1, y1, x2, y2]| x1 != x2 && y1 == y2)
        .map(|&[_, y1, ..]| NotNan::new(y1).unwrap())
        .collect();

    let hlines = dedup(horizontal_ys.into_iter());
    let vlines = dedup(vertical_xs.into_iter());

    let rows = hlines.len();
    let cols = vlines.len();
    let mut line_grid = vec![false; cols * rows];

    // NOTE(review): the "start" lookups match the first band whose start is
    // <= the coordinate, which is band 0 for most inputs — confirm this is
    // the intended way to find the first crossed band.
    for &[x1, y1, x2, y2] in lines {
        if x1 == x2 {
            let col = vlines.iter().position(|&(lo, hi)| lo <= x1 && x1 <= hi).unwrap_or(cols);
            let first = hlines.iter().position(|&(lo, _)| y1 >= lo).unwrap_or(rows);
            let end = hlines.iter().position(|&(_, hi)| y2 <= hi).unwrap_or(rows);
            for row in first..end {
                line_grid[col * rows + row] = true;
            }
        } else if y1 == y2 {
            let row = hlines.iter().position(|&(lo, hi)| lo <= y1 && y1 <= hi).unwrap_or(rows);
            let first = vlines.iter().position(|&(lo, _)| x1 >= lo).unwrap_or(cols);
            let end = vlines.iter().position(|&(_, hi)| x2 <= hi).unwrap_or(cols);
            for col in first..end {
                line_grid[col * rows + row] = true;
            }
        }
    }

    Lines { hlines, vlines, line_grid }
}

/// Line segments of a page, grouped into horizontal and vertical bands.
///
/// Produced by `analyze_lines`.
pub struct Lines {
    /// `(start, last)` y-coordinate bands of horizontal lines, in ascending order.
    hlines: Vec<(f32, f32)>,
    /// `(start, last)` x-coordinate bands of vertical lines, in ascending order.
    vlines: Vec<(f32, f32)>,
    /// Flat grid of `vlines.len() * hlines.len()` flags, indexed as
    /// `v * hlines.len() + h`; `true` where a segment was recorded.
    line_grid: Vec<bool>,
}

#[derive(Copy, Clone, Debug)]
struct Span {
Expand Down Expand Up @@ -384,68 +320,6 @@ pub enum NodeTag {
Complex,
}

/// Appends a markup rendering (`<rect>` / `<line>` elements) of `node` to `w`.
///
/// `spans` is the span list that the node's indices refer to and `bbox` is the
/// area covered by `node`. Rendering starts at nesting level 0.
pub fn render<E: Encoder>(w: &mut String, spans: &[TextSpan<E>], node: &Node, bbox: RectF) {
    let root_level = 0;
    _render(w, spans, node, bbox, root_level);
}
/// Recursive worker for `render`: appends markup for `node` to `w`.
///
/// * `Node::Final` – one `<line>` per span index, drawn at the span rect's
///   `max_y` from `min_x` to `max_x`, with the `classify` result as `class`.
///   Nothing is written when the node has no indices.
/// * `Node::Grid` – a `<rect>` for `bbox`, then a separator `<line>` before
///   every row and every column except the first, with each cell rendered
///   recursively inside its own sub-rectangle.
/// * `Node::Table` – nothing is written.
///
/// `level` is the current nesting depth; it is emitted as the `level`
/// attribute of the separator lines.
fn _render<E: Encoder>(w: &mut String, spans: &[TextSpan<E>], node: &Node, bbox: RectF, level: usize) {
    use std::fmt::Write;
    use std::iter::once;

    match *node {
        Node::Final { ref indices } if indices.len() > 0 => {
            let class = classify(indices.iter().cloned().filter_map(|i| spans.get(i)));

            for &i in indices.iter() {
                let rect = spans[i].rect;
                let y = rect.max_y();
                // The write result is discarded, as in the original code.
                let _ = write!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" class="{:?}" />"#,
                    rect.min_x(), rect.max_x(), y, y,
                    class
                );
            }
        }
        Node::Final { .. } => {}
        Node::Grid { ref x, ref y, ref cells, tag } => {
            let columns = x.len() + 1;
            let _ = write!(w, r#"<rect x="{}" y="{}" width="{}" height="{}" class="{:?}" />"#,
                bbox.min_x(), bbox.min_y(), bbox.width(), bbox.height(), tag
            );

            // Cell boundaries: the bbox edges with the split positions in between.
            let y_edges: Vec<f32> = once(bbox.min_y())
                .chain(y.iter().cloned())
                .chain(once(bbox.max_y()))
                .collect();
            let x_edges: Vec<f32> = once(bbox.min_x())
                .chain(x.iter().cloned())
                .chain(once(bbox.max_x()))
                .collect();

            for (j, (y_pair, row)) in y_edges.windows(2).zip(cells.chunks_exact(columns)).enumerate() {
                let (min_y, max_y) = (y_pair[0], y_pair[1]);
                if j > 0 {
                    let _ = writeln!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" level="{level}"></line>"#,
                        bbox.min_x(), bbox.max_x(), min_y, min_y);
                }

                for (i, (x_pair, cell)) in x_edges.windows(2).zip(row).enumerate() {
                    let (min_x, max_x) = (x_pair[0], x_pair[1]);
                    if i > 0 {
                        let _ = writeln!(w, r#"<line x1="{}" x2="{}" y1="{}" y2="{}" level="{level}"></line>"#,
                            min_x, min_x, bbox.min_y(), bbox.max_y());
                    }

                    let cell_bbox = RectF::from_points(
                        Vector2F::new(min_x, min_y),
                        Vector2F::new(max_x, max_y),
                    );
                    _render(w, spans, cell, cell_bbox, level + 1);
                }
            }
        }
        Node::Table { .. } => {}
    }
}

fn split<E: Encoder>(boxes: &mut [(RectF, usize)], spans: &[TextSpan<E>], lines: &Lines) -> Node {
let num_boxes = boxes.len();
if num_boxes < 2 {
Expand Down Expand Up @@ -567,147 +441,13 @@ fn split_v(boxes: &mut [(RectF, usize)]) -> Node {
}
}

/// Looks for a vertical gap near the top and near the bottom of `bbox`.
///
/// Gaps come from `gap_list` over the `(min_y, max_y)` extent of each box, in
/// slice order. The first gap counts as the "top" gap when it starts within
/// the upper 20% of `bbox`; the last remaining gap counts as the "bottom" gap
/// when it starts below 80% of the `bbox` height.
///
/// Returns `(top, bottom)`, each the index of the box that follows the gap.
/// Fewer than two boxes always yield `(None, None)`.
///
/// NOTE(review): presumably `boxes` is expected to be sorted by y — confirm
/// against callers. When the first gap lies in the middle of the page no
/// bottom gap is reported even if a later gap qualifies; verify this is intended.
fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let top_limit = bbox.min_y() + bbox.height() * 0.2;
    let bottom_limit = bbox.min_y() + bbox.height() * 0.8;
    let mut candidates = gap_list(boxes, |r| (r.min_y(), r.max_y()));

    match candidates.next() {
        None => (None, None),
        Some((start, _, idx)) => {
            if start < top_limit {
                let bottom = candidates
                    .last()
                    .filter(|&(y, _, _)| y > bottom_limit)
                    .map(|(_, _, i)| i);
                (Some(idx), bottom)
            } else if start > bottom_limit {
                (None, Some(idx))
            } else {
                (None, None)
            }
        }
    }
}

/// Looks for a horizontal gap near the left and near the right edge of `bbox`.
///
/// Gaps come from `gap_list` over the `(min_x, max_x)` extent of each box, in
/// slice order. The first gap counts as the "left" gap when it starts within
/// the leftmost 20% of `bbox`; the last remaining gap counts as the "right"
/// gap when it starts beyond 80% of the `bbox` width.
///
/// Returns `(left, right)`, each the index of the box that follows the gap.
/// Fewer than two boxes always yield `(None, None)`.
///
/// NOTE(review): presumably `boxes` is expected to be sorted by x — confirm
/// against callers. When the first gap lies in the middle no right gap is
/// reported even if a later gap qualifies; verify this is intended.
fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let left_limit = bbox.min_x() + bbox.width() * 0.2;
    let right_limit = bbox.min_x() + bbox.width() * 0.8;
    let mut candidates = gap_list(boxes, |r| (r.min_x(), r.max_x()));

    match candidates.next() {
        None => (None, None),
        Some((start, _, idx)) => {
            if start < left_limit {
                let right = candidates
                    .last()
                    .filter(|&(x, _, _)| x > right_limit)
                    .map(|(_, _, i)| i);
                (Some(idx), right)
            } else if start > right_limit {
                (None, Some(idx))
            } else {
                (None, None)
            }
        }
    }
}

/// Sorts `boxes` in place by ascending `min_x` (unstable sort).
///
/// Panics if two `min_x` values cannot be compared (i.e. one is NaN).
fn sort_x(boxes: &mut [(RectF, usize)]) {
    boxes.sort_unstable_by(|(lhs, _), (rhs, _)| {
        let (left, right) = (lhs.min_x(), rhs.min_x());
        left.partial_cmp(&right).unwrap()
    });
}
/// Sorts `boxes` in place by ascending `min_y` (unstable sort).
///
/// Panics if two `min_y` values cannot be compared (i.e. one is NaN).
fn sort_y(boxes: &mut [(RectF, usize)]) {
    boxes.sort_unstable_by(|(lhs, _), (rhs, _)| {
        let (upper, lower) = (lhs.min_y(), rhs.min_y());
        upper.partial_cmp(&lower).unwrap()
    });
}
/// Groups `boxes` into text lines by vertical position.
///
/// The boxes are sorted by `min_y`. A new line starts at the first box whose
/// vertical center lies more than half the average box height below the
/// center of the box that opened the current line. The boxes of each line are
/// then sorted by `min_x`.
///
/// Returns
/// * an empty singleton node when `boxes` is empty (previously this case
///   panicked on `boxes[0]` before the empty branch could be reached),
/// * the single line node when only one line was found,
/// * otherwise a `Node::Grid` tagged `NodeTag::Paragraph` with one cell per
///   line and the `max_y` of every line but the last as `y` splits.
fn overlapping_lines(boxes: &mut [(RectF, usize)]) -> Node {
    if boxes.is_empty() {
        return Node::singleton(&[]);
    }

    sort_y(boxes);
    // `boxes` is non-empty here; presumably `avg` only yields `None` for an
    // empty iterator — confirm against its definition.
    let avg_height = avg(boxes.iter().map(|(r, _)| r.height())).unwrap();

    let mut y_center = boxes[0].0.center().y();
    let mut lines = vec![];
    let mut y_splits = vec![];
    let mut start = 0;

    loop {
        // Offset (relative to `start`) of the first box that belongs to the next line.
        let next_line = boxes[start..]
            .iter()
            .position(|(r, _)| r.center().y() > 0.5 * avg_height + y_center);

        match next_line {
            Some(offset) => {
                let end = start + offset;
                // Read before sorting; the sort below does not touch `boxes[end]`.
                let next_center = boxes[end].0.center().y();

                sort_x(&mut boxes[start..end]);
                let bbox = boxes[start..end]
                    .iter()
                    .map(|&(r, _)| r)
                    .reduce(|a, b| a.union_rect(b))
                    .unwrap();

                y_splits.push(bbox.max_y());
                lines.push(Node::singleton(&boxes[start..end]));
                y_center = next_center;
                start = end;
            }
            None => {
                // Everything that is left forms the final line.
                sort_x(&mut boxes[start..]);
                lines.push(Node::singleton(&boxes[start..]));
                break;
            }
        }
    }

    match lines.len() {
        0 => Node::singleton(&[]),
        1 => lines.pop().unwrap(),
        _ => Node::Grid {
            x: vec![],
            y: y_splits,
            cells: lines,
            tag: NodeTag::Paragraph,
        },
    }
}

/// Lazily lists the gaps between consecutive boxes along one axis.
///
/// `span` maps a rectangle to its `(min, max)` extent on the axis of interest.
/// The boxes are walked in slice order while tracking the largest `max` seen
/// so far; whenever a box starts strictly after that running maximum, a gap
/// `(gap_start, gap_end, index)` is yielded, where `index` is the position in
/// `boxes` of the box that follows the gap.
///
/// An empty or single-element slice yields no gaps (an empty slice used to
/// panic on the `unwrap` of the first element).
fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=(f32, f32, usize)> + 'a {
    let mut rest = boxes.iter();
    // Seed the running maximum with the first box; the fallback value is never
    // observed because an empty slice leaves nothing to iterate.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);

    rest.enumerate().filter_map(move |(idx, (r, _))| {
        let (min, max) = span(r);
        let gap = if min > last_max {
            // `idx` counts from the second box, hence the `+ 1`.
            Some((last_max, min, idx + 1))
        } else {
            None
        };
        last_max = max.max(last_max);
        gap
    })
}

/// Lazily yields the midpoints of all gaps that are at least `threshold` wide.
///
/// `span` maps a rectangle to its `(min, max)` extent on the axis of interest.
/// The boxes are walked in slice order while tracking the largest `max` seen
/// so far; whenever a box starts at least `threshold` after that running
/// maximum, the midpoint between the two is yielded.
///
/// An empty or single-element slice yields nothing (an empty slice used to
/// panic on the `unwrap` of the first element).
fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=f32> + 'a {
    let mut rest = boxes.iter();
    // Seed the running maximum with the first box; the fallback value is never
    // observed because an empty slice leaves nothing to iterate.
    let mut last_max = rest.next().map_or(f32::NEG_INFINITY, |(r, _)| span(r).1);

    rest.filter_map(move |(r, _)| {
        let (min, max) = span(r);
        let midpoint = if min - last_max >= threshold {
            Some(0.5 * (last_max + min))
        } else {
            None
        };
        last_max = max.max(last_max);
        midpoint
    })
}

/// Finds the widest gap reported by `gap_list` for the given `span` extent.
///
/// Returns `(width, midpoint)` of that gap, or `None` when `gap_list` yields
/// no gap at all.
fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
    let widest = gap_list(boxes, span)
        .max_by_key(|&(lo, hi, _)| NotNan::new(hi - lo).unwrap());

    match widest {
        Some((lo, hi, _)) => Some((hi - lo, 0.5 * (lo + hi))),
        None => None,
    }
}

/// Widest horizontal gap between `boxes`, as `(width, midpoint)`.
///
/// Delegates to `max_gap` using each rectangle's `(min_x, max_x)` extent.
fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    let horizontal_extent = |r: &RectF| (r.min_x(), r.max_x());
    max_gap(boxes, horizontal_extent)
}
/// Widest vertical gap between `boxes`, as `(height, midpoint)`.
///
/// Delegates to `max_gap` using each rectangle's `(min_y, max_y)` extent.
fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    let vertical_extent = |r: &RectF| (r.min_y(), r.max_y());
    max_gap(boxes, vertical_extent)
}
fn split_by<'a>(list: &'a mut [(RectF, usize)], at: &'a [f32], by: impl Fn(&RectF) -> f32) -> impl Iterator<Item=&'a mut [(RectF, usize)]> {
SplitBy {
data: list,
Expand Down

0 comments on commit 5182bab

Please sign in to comment.