From 402a26b50d2c9b9e024b7bbd8bffaa5ada7bf10b Mon Sep 17 00:00:00 2001 From: vidy Date: Thu, 12 Dec 2024 23:36:16 +0800 Subject: [PATCH] Fix word gap not handle correctly --- src/flow.rs | 29 +++++++++++++++++++---------- src/node.rs | 11 +++++++++-- src/text.rs | 35 ++++++++++++++++++++--------------- 3 files changed, 48 insertions(+), 27 deletions(-) diff --git a/src/flow.rs b/src/flow.rs index d377244..b589489 100644 --- a/src/flow.rs +++ b/src/flow.rs @@ -91,8 +91,13 @@ pub(crate) fn build(mut flow: &mut Flow, spans: &[TextSpan], node match *node { Node::Final { ref indices } => { if indices.len() > 0 { - let node_spans = indices.iter().flat_map(|&i| spans.get(i)); - let bbox = node_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap(); + let node_spans = indices.iter() + .flat_map(|&i| spans.get(i)); + let bbox = node_spans.clone() + .map(|s| s.rect) + .reduce(|a, b| a.union_rect(b)) + .unwrap(); + let class = classify(node_spans.clone()); let mut text = String::new(); let words = concat_text(&mut text, node_spans); @@ -111,25 +116,26 @@ pub(crate) fn build(mut flow: &mut Flow, spans: &[TextSpan], node NodeTag::Line => { let mut indices = vec![]; node.indices(&mut indices); + let line_spans = indices.iter().flat_map(|&i| spans.get(i)); let bbox: RectF = line_spans.clone().map(|s| s.rect).reduce(|a, b| a.union_rect(b)).unwrap().into(); - let mut text = String::new(); - let words = concat_text(&mut text, line_spans.clone()); let class = classify(line_spans.clone()); + let mut text = String::new(); + let words = concat_text(&mut text, line_spans); let t = match class { Class::Header => RunType::Header, _ => RunType::Paragraph, }; - flow.add_line(words, t); } NodeTag::Paragraph => { - assert_eq!(x.len(), 0); + assert_eq!(x.len(), 0, "For a paragraph x gaps should be empty"); let mut lines: Vec<(RectF, usize)> = vec![]; let mut indices = vec![]; + for n in cells { let start = indices.len(); n.indices(&mut indices); @@ -142,8 +148,10 @@ pub(crate) fn build(mut flow: &mut Flow, spans: &[TextSpan], node let para_spans = indices.iter().flat_map(|&i| spans.get(i)); let class = classify(para_spans.clone()); + // the bounding box the paragraph let bbox = lines.iter().map(|t| t.0).reduce(|a, b| a.union_rect(b)).unwrap(); let line_height = avg(para_spans.map(|s| s.rect.height())).unwrap(); + // classify the lines by this vertical line let left_margin = bbox.min_x() + 0.5 * line_height; @@ -158,9 +166,10 @@ pub(crate) fn build(mut flow: &mut Flow, spans: &[TextSpan], node left += 1; } } + //typically paragraphs are indented to the right and longer than 2 lines. + //then there will be a higher left count than right count. - // typically paragraphs are indented to the right and longer than 2 lines. - // then there will be a higher left count than right count. + //TODO: What if a paragraph with two lines starts at the same x? It will result in left = right. let indent = left > right; let mut para_start = 0; @@ -180,9 +189,9 @@ pub(crate) fn build(mut flow: &mut Flow, spans: &[TextSpan], node } }); para_start = line_start; - } else { - text.push('\n'); } + //Always add a line break for new line, which will be treated as whitespace in concat_text method + text.push('\n'); } if end > line_start { let words = concat_text(&mut text, indices[line_start..end].iter().flat_map(|&i| spans.get(i))); diff --git a/src/node.rs b/src/node.rs index 5e3c857..9a61fbf 100644 --- a/src/node.rs +++ b/src/node.rs @@ -91,7 +91,14 @@ pub fn exclude_header_and_footer<'a, E: Encoder>(boxes: &'a mut [(RectF, usize)] #[derive(Debug)] pub enum Node { Final { indices: Vec }, - Grid { x: Vec, y: Vec, cells: Vec, tag: NodeTag }, + Grid { + // vertical gaps + x: Vec, + // horizontal gaps + y: Vec, + cells: Vec, + tag: NodeTag + }, Table { table: table::Table> }, } impl Node { @@ -170,7 +177,7 @@ fn split(boxes: &mut [(RectF, usize)], spans: &[TextSpan], lines: return overlapping_lines(boxes); } - //TODO: Disable the table::split for now,becuase it is not accurate + //TODO: Disable the table::split for now,because it is not accurate // if x_gaps.len() > 1 && y_gaps.len() > 1 { // return table::split(boxes, spans, lines); // } diff --git a/src/text.rs b/src/text.rs index 2b3771d..2e3032d 100644 --- a/src/text.rs +++ b/src/text.rs @@ -14,40 +14,39 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator(out: &mut String, items: impl Iterator end + word_gap { words.push(Word { text: out[word_start_idx..].into(), @@ -66,13 +67,17 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator(out: &mut String, items: impl Iterator(out: &mut String, items: impl Iterator