Add chars to word

pdf-rs · Dec 14, 2024 · c4cccba · c4cccba
1 parent d22bd40
commit c4cccba
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 17 deletions.
diff --git a/examples/text.rs b/examples/text.rs
@@ -12,7 +12,12 @@ fn main() {
 
         for run in flow.runs {
             for line in run.lines {
-                println!("{}", line.words.iter().map(|w| w.text.as_str()).format(" "));
+                for word in line.words {
+                    println!("{}", word.text.as_str());
+                    for char in word.chars {
+                        println!("{:?}", char);
+                    }
+                }
             }
         }
     // }

diff --git a/src/flow.rs b/src/flow.rs
@@ -15,17 +15,33 @@ use table::Table;
 pub struct Word {
     pub text: String,
     pub rect: Rect,
+    pub chars: Vec<Char>
 }
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct Char { // One positioned text piece of a `Word`; entries are pushed by `concat_text`.
+    pub offset: i32, // 0 for the piece that starts a word, otherwise `concat_text`'s running `word_char_idx`.
+    pub pos: f32, // Start x after the span's transform matrix has been applied.
+    pub width: f32, // Transformed end x minus `pos`.
+}
+
 #[derive(Serialize, Deserialize)]
 pub struct Line { // The words of one text line together with a rect for the whole line.
     pub words: Vec<Word>,
+    pub rect: Rect, // Supplied by the callers in `build` from `bbox` / `line_bbox`.
 }
 #[derive(Serialize, Deserialize)]
 pub struct Run { // A list of lines tagged with a single `RunType`.
     pub lines: Vec<Line>,
     pub kind: RunType,
 }
 
+impl Run { // Geometry derived from the run's lines.
+    pub fn rect(&self) -> Option<Rect> { // Union of every line's rect; `None` when `lines` is empty.
+        self.lines.iter().map(|s| s.rect).reduce(|a, b| a.union(b))
+    }
+}
+
 #[derive(Serialize, Deserialize)]
 pub enum RunType {
     ParagraphContinuation,
@@ -55,6 +71,51 @@ impl From<RectF> for Rect {
     }
 }
 
+impl Rect { // Set-like helpers; every method treats a rect as origin (x, y) plus extent (w, h).
+    pub fn union(self, other: Rect) -> Rect { // Smallest rect that spans both `self` and `other`.
+        let min_x = self.x.min(other.x);
+        let min_y = self.y.min(other.y);
+        let max_x = (self.x + self.w).max(other.x + other.w);
+        let max_y = (self.y + self.h).max(other.y + other.h);
+
+        Rect {
+            x: min_x,
+            y: min_y,
+            w: max_x - min_x,
+            h: max_y - min_y
+        }
+    }
+
+    pub fn intersects(self, other: Rect) -> bool { // Overlap test on both axes.
+        let self_max_x = self.x + self.w;
+        let self_max_y = self.y + self.h;
+
+        let other_max_x = other.x + other.w;
+        let other_max_y = other.y + other.h;
+
+        self.x < other_max_x && other.x < self_max_x && 
+        self.y < other_max_y && other.y < self_max_y // Strict `<`: rects that only share an edge do not intersect.
+    }
+
+    pub fn intersection(self, other: Rect) -> Option<Rect> { // Overlapping region, or `None` when `intersects` is false.
+        if !self.intersects(other) {
+            None
+        } else {
+            let min_x = self.x.max(other.x); // Larger of the two origins on each axis...
+            let min_y = self.y.max(other.y);
+            let max_x = (self.x + self.w).min(other.x + other.w); // ...and smaller of the two far edges.
+            let max_y = (self.y + self.h).min(other.y + other.h);
+
+            Some(Rect {
+                x: min_x,
+                y: min_y,
+                w: max_x - min_x,
+                h: max_y - min_y
+            })
+        }
+    }
+}
+
 #[derive(Clone, Debug, Serialize)]
 pub struct CellContent {
     pub text: String,
@@ -74,11 +135,11 @@ impl Flow {
             runs: vec![]
         }
     }
-    pub fn add_line(&mut self, words: Vec<Word>, kind: RunType) {
+    pub fn add_line(&mut self, words: Vec<Word>, kind: RunType, rect: Rect) { // Appends a new `Run` holding a single `Line` built from `words` and `rect`.
         if words.len() > 0 { // An empty word list adds nothing to `runs`.
             self.runs.push(Run {
-                lines: vec![Line { words }], 
-                kind
+                lines: vec![Line { words, rect}], 
+                kind,
             });
         }
     }
@@ -107,7 +168,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                     _ => RunType::Paragraph,
                 };
 
-                flow.add_line(words, t);
+                flow.add_line(words, t, bbox.into());
             }
         }
         Node::Grid { ref x, ref y, ref cells, tag } => {
@@ -129,7 +190,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                         _ => RunType::Paragraph,
                     };
 
-                    flow.add_line(words, t);
+                    flow.add_line(words, t, bbox.into());
                 }
                 NodeTag::Paragraph => {
                     assert_eq!(x.len(), 0, "For a paragraph x gaps should be empty");
@@ -203,7 +264,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                                     kind: match class {
                                         Class::Header => RunType::Header,
                                         _ => RunType::Paragraph
-                                    }
+                                    },
                                 });
                                 para_start = line_start;
                             }
@@ -214,7 +275,7 @@ pub(crate) fn build<E: Encoder>(mut flow: &mut Flow, spans: &[TextSpan<E>], node
                             let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i)));
 
                             if words.len() > 0 {
-                                flow_lines.push(Line { words });
+                                flow_lines.push(Line { words , rect: line_bbox.into()});
                             }
                         }
                         if para_start == line_start {

diff --git a/src/text.rs b/src/text.rs
@@ -1,9 +1,11 @@
+use std::mem::take;
+
 use font::Encoder;
 use pathfinder_geometry::vector::Vector2F;
 use pdf_render::TextSpan;
 use itertools::Itertools;
 use unicode_normalization::UnicodeNormalization;
-use crate::{util::avg, flow::{Word, Rect}};
+use crate::{flow::{Char, Rect, Word}, util::avg};
 
 pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<Item=&'a TextSpan<E>> + Clone) -> Vec<Word> {
     let word_gap = analyze_word_gap(items.clone());
@@ -28,6 +30,8 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
     let mut y_max = -f32::INFINITY;
 
     let mut word_start = true;
+    let mut word_chars = vec![];
+    let mut word_char_idx = 0;
 
     for span in items {
         let mut offset = 0; // byte index of last char into span.text
@@ -43,15 +47,26 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
             } else {
                 s = &span.text[offset..];
             }
+            end = current.pos + x_off + current.width;
+
+            let char_start_pos = (span.transform.matrix * Vector2F::new(current.pos + x_off, 0.0)).x();
+            let char_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
 
             let is_whitespace = s.chars().all(|c| c.is_whitespace());
-            
+
             if trailing_space {
                 if !is_whitespace {
                     word_start = true;
                     word_start_idx = out.len();
 
+                    word_chars.push(Char {
+                        offset: 0,
+                        pos: char_start_pos,
+                        width: char_end_pos - char_start_pos,
+                    });
                     out.extend(s.nfkc());
+
+                    word_char_idx += 1;
                 }
             } else {
                 if is_whitespace {
@@ -62,10 +77,12 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
                             y: y_min,
                             h: y_max - y_min,
                             w: word_end_pos - word_start_pos
-                        }
+                        },
+                        chars: take(&mut word_chars)
                     });
                     out.push_str(" ");
                     word_start_idx = out.len();
+                    word_char_idx = 0;
                 } else if current.pos + x_off > end + word_gap {
                     words.push(Word {
                         text: out[word_start_idx..].into(),
@@ -74,27 +91,39 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
                             y: y_min,
                             h: y_max - y_min,
                             w: word_end_pos - word_start_pos
-                        }
+                        },
+                        chars: take(&mut word_chars)
                     });
 
                     word_start = true;
                     word_start_idx = out.len();
+                    word_chars.push(Char {
+                        offset: 0,
+                        pos: char_start_pos,
+                        width: char_end_pos - char_start_pos,
+                    });
+                    word_char_idx += 1;
 
                     out.extend(s.nfkc());
                 } else {
+                    word_chars.push(Char {
+                        offset: word_char_idx,
+                        pos: char_start_pos,
+                        width: char_end_pos - char_start_pos,
+                    });
+
+                    word_char_idx += 1;
                     out.extend(s.nfkc());
                 }
             }
-
             trailing_space = is_whitespace;
 
-            end = current.pos + x_off + current.width;
-            word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x();
+            word_end_pos = char_end_pos;
 
             if word_start {
                 y_min = span.rect.min_y();
                 y_max = span.rect.max_y();
-                word_start_pos = (span.transform.matrix * Vector2F::new(current.pos + x_off, 0.0)).x();
+                word_start_pos = char_start_pos;
                 word_start = false;
             } else {
                 y_min = y_min.min(span.rect.min_y());
@@ -110,7 +139,8 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator<I
             y: y_min,
             h: y_max - y_min,
             w: word_end_pos - word_start_pos
-        }
+        },
+        chars: take(&mut word_chars)
     });
 
     words