Skip to content

Commit

Permalink
Rename tree to node
Browse files Browse the repository at this point in the history
  • Loading branch information
vidy committed Dec 8, 2024
1 parent 5182bab commit 59e6331
Show file tree
Hide file tree
Showing 10 changed files with 356 additions and 86 deletions.
4 changes: 2 additions & 2 deletions examples/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ fn main() {
let resolver = file.resolver();

// for (page_nr, page) in file.pages().enumerate() {
let page = file.get_page(0).unwrap();
let page: pdf::object::PageRc = file.get_page(0).unwrap();
let flow = pdf_text::run(&file, &page, &resolver, Default::default()).expect("can't render page");
println!("# page {}", 0 + 1);
for run in flow.runs {
for line in run.lines {
for w in line.words {
// println!(": {}", w.text);
println!(": {}", w.text);
}
}
println!();
Expand Down
64 changes: 35 additions & 29 deletions src/classify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use pdf_render::TextSpan;

use crate::util::is_number;

use super::util::Tri;

#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Class {
Number,
Expand All @@ -15,33 +13,6 @@ pub enum Class {
Mixed,
}

/// Running tally of boolean observations, kept as two separate counters.
#[derive(Debug)]
pub struct TriCount {
    /// Number of `true` observations recorded so far.
    tru: usize,
    /// Number of `false` observations recorded so far.
    fal: usize,
}

impl TriCount {
    /// Creates a tally with both counters at zero.
    fn new() -> Self {
        Self { tru: 0, fal: 0 }
    }

    /// Records one observation by bumping the counter that matches `b`.
    fn add(&mut self, b: bool) {
        if b {
            self.tru += 1;
        } else {
            self.fal += 1;
        }
    }

    /// Summarises the tally as a [`Tri`].
    ///
    /// * nothing recorded -> `Tri::Unknown`
    /// * only `true` recorded -> `Tri::True`
    /// * only `false` recorded -> `Tri::False`
    /// * both recorded -> `Tri::Maybe(p)`, where `p` is the share of `true`
    ///   observations among all observations
    fn count(&self) -> Tri {
        let (yes, no) = (self.tru, self.fal);
        if yes == 0 && no == 0 {
            Tri::Unknown
        } else if no == 0 {
            Tri::True
        } else if yes == 0 {
            Tri::False
        } else {
            Tri::Maybe(yes as f32 / (yes + no) as f32)
        }
    }
}
pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>) -> Class {
use pdf_render::FontEntry;

Expand Down Expand Up @@ -72,4 +43,39 @@ pub fn classify<'a, E: Encoder + 'a>(spans: impl Iterator<Item=&'a TextSpan<E>>)
(_, Tri::Maybe(_), _) => Class::Paragraph,
_ => Class::Mixed
}
}

/// Four-valued summary of a series of boolean observations.
///
/// Produced by `TriCount::count`:
/// * `False` - every observation was `false`
/// * `True` - every observation was `true`
/// * `Maybe(p)` - observations were mixed; `p` is the fraction that were `true`
/// * `Unknown` - nothing was observed
///
/// Derives the same basic traits as `Class` so values can be copied,
/// compared, and printed in diagnostics.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Tri {
    False,
    True,
    Maybe(f32),
    Unknown,
}

/// Running tally of boolean observations, kept as two separate counters.
#[derive(Debug)]
pub struct TriCount {
    /// Number of `true` observations recorded so far.
    tru: usize,
    /// Number of `false` observations recorded so far.
    fal: usize,
}

impl TriCount {
    /// Creates a tally with both counters at zero.
    fn new() -> Self {
        Self { tru: 0, fal: 0 }
    }

    /// Records one observation by bumping the counter that matches `b`.
    fn add(&mut self, b: bool) {
        if b {
            self.tru += 1;
        } else {
            self.fal += 1;
        }
    }

    /// Summarises the tally as a [`Tri`].
    ///
    /// * nothing recorded -> `Tri::Unknown`
    /// * only `true` recorded -> `Tri::True`
    /// * only `false` recorded -> `Tri::False`
    /// * both recorded -> `Tri::Maybe(p)`, where `p` is the share of `true`
    ///   observations among all observations
    fn count(&self) -> Tri {
        let (yes, no) = (self.tru, self.fal);
        if yes == 0 && no == 0 {
            Tri::Unknown
        } else if no == 0 {
            Tri::True
        } else if yes == 0 {
            Tri::False
        } else {
            Tri::Maybe(yes as f32 / (yes + no) as f32)
        }
    }
}
31 changes: 29 additions & 2 deletions src/flow.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::classify::{classify, Class};
use crate::tree::{Node, NodeTag};
use crate::util::{avg, CellContent, Rect};
use crate::node::{Node, NodeTag};
use crate::util::avg;
use crate::text::concat_text;
use std::iter::once;
use pathfinder_geometry::rect::RectF;
Expand Down Expand Up @@ -34,6 +34,33 @@ pub enum RunType {
Cell,
}


/// Rectangle stored as an origin (`x`, `y`) plus a size (`w`, `h`), all `f32`.
///
/// Laid out with `#[repr(C)]` and (de)serializable through serde.
#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
#[repr(C)]
pub struct Rect {
    pub x: f32,
    pub y: f32,
    pub w: f32,
    pub h: f32,
}

impl From<RectF> for Rect {
    /// Copies the origin and dimensions out of a `RectF`.
    fn from(r: RectF) -> Self {
        let (x, y) = (r.origin_x(), r.origin_y());
        let (w, h) = (r.width(), r.height());
        Self { x, y, w, h }
    }
}

/// A piece of text together with the rectangle it is associated with.
///
/// Serialize-only: the type derives `Serialize` but not `Deserialize`.
// NOTE(review): presumably the content of one table cell (see `RunType::Cell`) — confirm against callers.
#[derive(Clone, Debug, Serialize)]
pub struct CellContent {
    /// The text itself.
    pub text: String,
    /// Rectangle belonging to `text`.
    pub rect: Rect,
}

#[derive(Serialize, Deserialize)]
pub struct Flow {
pub lines: Vec<Line>,
Expand Down
19 changes: 5 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use pathfinder_geometry::transform2d::Transform2F;
use pdf::{backend::Backend, object::{Page, Resolve}, PdfError};
use pdf_render::{tracer::{TraceCache, Tracer, DrawItem}, Fill, render_pattern, render_page, FillMode, font::OutlineBuilder};

mod tree;
mod node;
mod util;
mod text;
mod classify;
Expand All @@ -17,14 +17,13 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
let mut clip_paths = vec![];
let mut tracer = Tracer::new(&mut cache, &mut clip_paths);

// The tracer backend can be used to get text, pattern, image, etc.
// We will use text and pattern to do further text processing.
//Get text, pattern, image by the Tracer backend.
render_page(&mut tracer, resolve, &page, transform)?;

let bbox = tracer.view_box();

let items: Vec<DrawItem<OutlineBuilder>> = tracer.finish();
//Get patterns which may have lines and texts inside.
//Get all patterns which may have lines and texts inside.
let mut patterns = HashSet::new();
for item in items.iter() {
if let DrawItem::Vector(ref v) = item {
Expand Down Expand Up @@ -84,20 +83,12 @@ pub fn run<B: Backend>(file: &pdf::file::CachedFile<B>, page: &Page, resolve: &i
}
}

// After this loop, all the text and lines are ready
// After this loop, all the text and lines are ready for further processing.
for item in items {
visit_item(item);
}

spans.sort_unstable_by(|a, b| a.rect.min_y().partial_cmp(&b.rect.min_y()).unwrap());

spans.sort_unstable_by(|a, b| a.rect.min_x().partial_cmp(&b.rect.min_x()).unwrap());

for s in spans.iter().map(|s|s.text.as_str()) {
println!(":{}", s)
}

let root = tree::build(&spans, bbox, &lines);
let root = node::build(&spans, bbox, &lines);

let mut flow = Flow::new();
flow::build(&mut flow, &spans, &root, bbox.min_x());
Expand Down
File renamed without changes.
96 changes: 96 additions & 0 deletions src/node/gap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
use ordered_float::NotNan;
use pathfinder_geometry::rect::RectF;

/// Lazily lists the empty intervals between consecutive boxes along one axis.
///
/// `span` projects a rectangle onto the axis of interest as `(min, max)`.
/// The boxes are walked in slice order while tracking the largest `max` seen
/// so far. Whenever a box starts strictly after that running maximum, a gap
/// `(gap_start, gap_end, index)` is produced, where `gap_start` is the running
/// maximum, `gap_end` is the box's `min`, and `index` is the position in
/// `boxes` of the box that follows the gap (not the `usize` stored in the
/// tuple).
///
/// An empty `boxes` slice yields no gaps (previously this panicked on
/// `unwrap`), and so does a single-element slice.
// NOTE(review): the running-maximum logic presumably expects `boxes` to be
// sorted by `min` along the chosen axis — confirm against callers.
pub fn gap_list<'a>(boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=(f32, f32, usize)> + 'a {
    // The first box only seeds the running maximum; it can never follow a gap.
    let (mut last_max, rest) = match boxes.split_first() {
        Some((first, rest)) => (span(&first.0).1, rest),
        // `boxes` is empty here, so the seed value is never read.
        None => (0.0, boxes),
    };

    rest.iter().enumerate().filter_map(move |(idx, entry)| {
        let (min, max) = span(&entry.0);
        let gap = if min > last_max {
            // `rest` starts at slice position 1, hence the `+ 1`.
            Some((last_max, min, idx + 1))
        } else {
            None
        };
        last_max = max.max(last_max);
        gap
    })
}

/// Lazily yields the midpoints of all gaps that are at least `threshold` wide.
///
/// `span` projects a rectangle onto the axis of interest as `(min, max)`.
/// The boxes are walked in slice order while tracking the largest `max` seen
/// so far. For every box whose `min` lies at least `threshold` beyond that
/// running maximum, the centre of the empty interval is produced.
///
/// An empty `boxes` slice yields nothing (previously this panicked on
/// `unwrap`), and so does a single-element slice.
// NOTE(review): presumably expects `boxes` sorted by `min` along the chosen
// axis — confirm against callers.
pub fn gaps<'a>(threshold: f32, boxes: &'a [(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32) + 'a) -> impl Iterator<Item=f32> + 'a {
    // The first box only seeds the running maximum.
    let (mut last_max, rest) = match boxes.split_first() {
        Some((first, rest)) => (span(&first.0).1, rest),
        // `boxes` is empty here, so the seed value is never read.
        None => (0.0, boxes),
    };

    rest.iter().filter_map(move |entry| {
        let (min, max) = span(&entry.0);
        let midpoint = if min - last_max >= threshold {
            Some(0.5 * (last_max + min))
        } else {
            None
        };
        last_max = max.max(last_max);
        midpoint
    })
}

/// Finds the widest gap reported by `gap_list` for the given projection.
///
/// Returns `Some((width, centre))` of that gap, or `None` when `gap_list`
/// reports no gaps at all. Widths are compared through `NotNan`, so a NaN
/// width would panic.
pub fn max_gap(boxes: &[(RectF, usize)], span: impl Fn(&RectF) -> (f32, f32)) -> Option<(f32, f32)> {
    let (start, end, _) = gap_list(boxes, span)
        .max_by_key(|&(start, end, _)| NotNan::new(end - start).unwrap())?;
    Some((end - start, 0.5 * (start + end)))
}

/// Widest horizontal gap between the boxes, as `(width, centre_x)`.
///
/// Delegates to `max_gap` using each rectangle's `min_x()..max_x()` extent.
pub fn dist_x(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    let horizontal_extent = |rect: &RectF| (rect.min_x(), rect.max_x());
    max_gap(boxes, horizontal_extent)
}
/// Widest vertical gap between the boxes, as `(height, centre_y)`.
///
/// Delegates to `max_gap` using each rectangle's `min_y()..max_y()` extent.
pub fn dist_y(boxes: &[(RectF, usize)]) -> Option<(f32, f32)> {
    let vertical_extent = |rect: &RectF| (rect.min_y(), rect.max_y());
    max_gap(boxes, vertical_extent)
}

/// Looks for vertical gaps close to the min-y ("top") and max-y ("bottom")
/// edges of `bbox`.
///
/// Returns `(top, bottom)`; each entry is the index (as reported by
/// `gap_list`) of the box that follows the gap, if such a gap exists:
///
/// * `top` is the first gap, provided it starts within the first 20% of the
///   bounding box height.
/// * When a top gap exists, `bottom` is the last of the remaining gaps,
///   provided it starts beyond 80% of the height.
/// * When the first gap is not a top gap but itself starts beyond 80%, it is
///   reported as `bottom`.
///
/// Fewer than two boxes always give `(None, None)`.
// NOTE(review): if the first gap starts in the middle band, later gaps are not
// inspected and the result is `(None, None)` — confirm this is intended.
pub fn top_bottom_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let top_limit = bbox.min_y() + bbox.height() * 0.2;
    let bottom_limit = bbox.min_y() + bbox.height() * 0.8;

    // Project every box onto the y axis: (top-left y, bottom-right y).
    let mut vertical_gaps = gap_list(boxes, |r| (r.min_y(), r.max_y()));

    let (first_start, first_idx) = match vertical_gaps.next() {
        Some((start, _, idx)) => (start, idx),
        None => return (None, None),
    };

    if first_start < top_limit {
        // Only the very last remaining gap is considered for the bottom split.
        let bottom = vertical_gaps
            .last()
            .filter(|&(start, _, _)| start > bottom_limit)
            .map(|(_, _, idx)| idx);
        (Some(first_idx), bottom)
    } else if first_start > bottom_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}

/// Looks for horizontal gaps close to the min-x ("left") and max-x ("right")
/// edges of `bbox`.
///
/// Returns `(left, right)`; each entry is the index (as reported by
/// `gap_list`) of the box that follows the gap, if such a gap exists:
///
/// * `left` is the first gap, provided it starts within the first 20% of the
///   bounding box width.
/// * When a left gap exists, `right` is the last of the remaining gaps,
///   provided it starts beyond 80% of the width.
/// * When the first gap is not a left gap but itself starts beyond 80%, it is
///   reported as `right`.
///
/// Fewer than two boxes always give `(None, None)`.
// NOTE(review): if the first gap starts in the middle band, later gaps are not
// inspected and the result is `(None, None)` — confirm this is intended.
pub fn left_right_gap(boxes: &mut [(RectF, usize)], bbox: RectF) -> (Option<usize>, Option<usize>) {
    if boxes.len() < 2 {
        return (None, None);
    }

    let left_limit = bbox.min_x() + bbox.width() * 0.2;
    let right_limit = bbox.min_x() + bbox.width() * 0.8;

    // Project every box onto the x axis.
    let mut horizontal_gaps = gap_list(boxes, |r| (r.min_x(), r.max_x()));

    let (first_start, first_idx) = match horizontal_gaps.next() {
        Some((start, _, idx)) => (start, idx),
        None => return (None, None),
    };

    if first_start < left_limit {
        // Only the very last remaining gap is considered for the right split.
        let right = horizontal_gaps
            .last()
            .filter(|&(start, _, _)| start > right_limit)
            .map(|(_, _, idx)| idx);
        (Some(first_idx), right)
    } else if first_start > right_limit {
        (None, Some(first_idx))
    } else {
        (None, None)
    }
}
Loading

0 comments on commit 59e6331

Please sign in to comment.