Skip to content

Commit

Permalink
Add chars to word
Browse files Browse the repository at this point in the history
  • Loading branch information
vidy committed Dec 14, 2024
1 parent d22bd40 commit c4cccba
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 17 deletions.
7 changes: 6 additions & 1 deletion examples/text.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@ fn main() {

for run in flow.runs {
for line in run.lines {
println!("{}", line.words.iter().map(|w| w.text.as_str()).format(" "));
for word in line.words {
println!("{}", word.text.as_str());
for char in word.chars {
println!("{:?}", char);
}
}
}
}
// }
Expand Down
75 changes: 68 additions & 7 deletions src/flow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,33 @@ use table::Table;
/// A single word of extracted text together with its layout information.
pub struct Word {
/// The word's text, copied out of the shared output buffer by `concat_text`.
pub text: String,
/// Rectangle assigned to the word when it is emitted by `concat_text`.
pub rect: Rect,
/// Per-character entries collected while the word was being assembled.
pub chars: Vec<Char>
}

/// Horizontal placement of one character entry inside a [`Word`].
#[derive(Serialize, Deserialize, Debug)]
pub struct Char {
/// Position of this entry within its word: `concat_text` writes 0 for the
/// entry that starts a word and a running counter for the following ones.
/// NOTE(review): presumably an entry index rather than a byte offset into
/// `Word::text` — confirm against consumers.
pub offset: i32,
/// Start x coordinate, taken after applying the span's transform matrix.
pub pos: f32,
/// Transformed end x minus transformed start x.
pub width: f32,
}

/// One line of text: its words plus a rectangle for the whole line.
#[derive(Serialize, Deserialize)]
pub struct Line {
/// Words that make up the line.
pub words: Vec<Word>,
/// Rectangle for the line, supplied by whoever constructs the `Line`
/// (the visible call sites convert a layout bounding box into it).
pub rect: Rect,
}
/// A group of lines that share one `RunType` classification.
#[derive(Serialize, Deserialize)]
pub struct Run {
/// Lines belonging to this run.
pub lines: Vec<Line>,
/// Classification of the run (for example header or paragraph).
pub kind: RunType,
}

impl Run {
    /// Rectangle covering the whole run, built by uniting every line's `rect`.
    ///
    /// Returns `None` when the run contains no lines.
    pub fn rect(&self) -> Option<Rect> {
        let mut line_rects = self.lines.iter().map(|line| line.rect);
        // Seed the accumulation with the first line; an empty run yields `None`.
        let first = line_rects.next()?;
        Some(line_rects.fold(first, |acc, next| acc.union(next)))
    }
}

#[derive(Serialize, Deserialize)]
pub enum RunType {
ParagraphContinuation,
Expand Down Expand Up @@ -55,6 +71,51 @@ impl From<RectF> for Rect {
}
}

impl Rect {
    /// Returns the rectangle spanning from the smaller origin of the two
    /// inputs to the larger far edge of the two inputs on each axis.
    ///
    /// For rectangles with non-negative `w`/`h` this is the smallest
    /// rectangle containing both.
    pub fn union(self, other: Rect) -> Rect {
        let (x, y) = (self.x.min(other.x), self.y.min(other.y));
        let far_x = (self.x + self.w).max(other.x + other.w);
        let far_y = (self.y + self.h).max(other.y + other.h);

        Rect { x, y, w: far_x - x, h: far_y - y }
    }

    /// Reports whether the two rectangles overlap on both axes.
    ///
    /// The comparisons are strict, so rectangles that only share an edge
    /// or a corner are not considered intersecting.
    pub fn intersects(self, other: Rect) -> bool {
        let overlaps_x = self.x < other.x + other.w && other.x < self.x + self.w;
        let overlaps_y = self.y < other.y + other.h && other.y < self.y + self.h;

        overlaps_x && overlaps_y
    }

    /// Returns the overlapping region of the two rectangles, or `None`
    /// when [`Rect::intersects`] reports no overlap.
    pub fn intersection(self, other: Rect) -> Option<Rect> {
        if !self.intersects(other) {
            return None;
        }

        let (x, y) = (self.x.max(other.x), self.y.max(other.y));
        let far_x = (self.x + self.w).min(other.x + other.w);
        let far_y = (self.y + self.h).min(other.y + other.h);

        Some(Rect { x, y, w: far_x - x, h: far_y - y })
    }
}

#[derive(Clone, Debug, Serialize)]
pub struct CellContent {
pub text: String,
Expand All @@ -74,11 +135,11 @@ impl Flow {
runs: vec![]
}
}
pub fn add_line(&mut self, words: Vec<Word>, kind: RunType) {
pub fn add_line(&mut self, words: Vec<Word>, kind: RunType, rect: Rect) {
if words.len() > 0 {
self.runs.push(Run {
lines: vec![Line { words }],
kind
lines: vec![Line { words, rect}],
kind,
});
}
}
Expand Down Expand Up @@ -107,7 +168,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
_ => RunType::Paragraph,
};

flow.add_line(words, t);
flow.add_line(words, t, bbox.into());
}
}
Node::Grid { ref x, ref y, ref cells, tag } => {
Expand All @@ -129,7 +190,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
_ => RunType::Paragraph,
};

flow.add_line(words, t);
flow.add_line(words, t, bbox.into());
}
NodeTag::Paragraph => {
assert_eq!(x.len(), 0, "For a paragraph x gaps should be empty");
Expand Down Expand Up @@ -203,7 +264,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
kind: match class {
Class::Header => RunType::Header,
_ => RunType::Paragraph
}
},
});
para_start = line_start;
}
Expand All @@ -214,7 +275,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i)));

if words.len() > 0 {
flow_lines.push(Line { words });
flow_lines.push(Line { words , rect: line_bbox.into()});
}
}
if para_start == line_start {
Expand Down
48 changes: 39 additions & 9 deletions src/text.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
use std::mem::take;

use font::Encoder;
use pathfinder_geometry::vector::Vector2F;
use pdf_render::TextSpan;
use itertools::Itertools;
use unicode_normalization::UnicodeNormalization;
use crate::{util::avg, flow::{Word, Rect}};
use crate::{flow::{Char, Rect, Word}, util::avg};

pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<Item=&'a TextSpan<E>> + Clone) -> Vec<Word> {
let word_gap = analyze_word_gap(items.clone());
Expand All @@ -28,6 +30,8 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
let mut y_max = -f32::INFINITY;

let mut word_start = true;
let mut word_chars = vec![];
let mut word_char_idx = 0;

for span in items {
let mut offset = 0; // byte index of last char into span.text
Expand All @@ -43,15 +47,26 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
} else {
s = &span.text[offset..];
}
end = current.pos + x_off + current.width;

let char_start_pos = (span.transform.matrix * Vector2F::new(current.pos + x_off, 0.0)).x();
let char_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();

let is_whitespace = s.chars().all(|c| c.is_whitespace());

if trailing_space {
if !is_whitespace {
word_start = true;
word_start_idx = out.len();

word_chars.push(Char {
offset: 0,
pos: char_start_pos,
width: char_end_pos - char_start_pos,
});
out.extend(s.nfkc());

word_char_idx += 1;
}
} else {
if is_whitespace {
Expand All @@ -62,10 +77,12 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
y: y_min,
h: y_max - y_min,
w: word_end_pos - word_start_pos
}
},
chars: take(&mut word_chars)
});
out.push_str(" ");
word_start_idx = out.len();
word_char_idx = 0;
} else if current.pos + x_off > end + word_gap {
words.push(Word {
text: out[word_start_idx..].into(),
Expand All @@ -74,27 +91,39 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
y: y_min,
h: y_max - y_min,
w: word_end_pos - word_start_pos
}
},
chars: take(&mut word_chars)
});

word_start = true;
word_start_idx = out.len();
word_chars.push(Char {
offset: 0,
pos: char_start_pos,
width: char_end_pos - char_start_pos,
});
word_char_idx += 1;

out.extend(s.nfkc());
} else {
word_chars.push(Char {
offset: word_char_idx,
pos: char_start_pos,
width: char_end_pos - char_start_pos,
});

word_char_idx += 1;
out.extend(s.nfkc());
}
}

trailing_space = is_whitespace;

end = current.pos + x_off + current.width;
word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
word_end_pos = char_end_pos;

if word_start {
y_min = span.rect.min_y();
y_max = span.rect.max_y();
word_start_pos = (span.transform.matrix * Vector2F::new(current.pos + x_off, 0.0)).x();
word_start_pos = char_start_pos;
word_start = false;
} else {
y_min = y_min.min(span.rect.min_y());
Expand All @@ -110,7 +139,8 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
y: y_min,
h: y_max - y_min,
w: word_end_pos - word_start_pos
}
},
chars: take(&mut word_chars)
});

words
Expand Down

0 comments on commit c4cccba

Please sign in to comment.