diff --git a/examples/text.rs b/examples/text.rs index beb0798..416f2cb 100644 --- a/examples/text.rs +++ b/examples/text.rs @@ -13,15 +13,8 @@ fn main() { println!("# page {}", 0 + 1); for run in flow.runs { for line in run.lines { - for w in line.words { - println!("{}", w.text); - } + println!("{}", line.words.iter().map(|w| w.text.as_str()).format(" ")); } } - // for line in flow.lines { - // for w in line.words { - // println!("{}", w.text); - // } - // } // } } diff --git a/src/text.rs b/src/text.rs index fef537b..0e08b6b 100644 --- a/src/text.rs +++ b/src/text.rs @@ -6,76 +6,74 @@ use unicode_normalization::UnicodeNormalization; use crate::{util::avg, flow::{Word, Rect}}; pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator> + Clone) -> Vec { + let word_gap = analyze_word_gap(items.clone()); let mut words: Vec = vec![]; - - // Calculate gaps between each char, the unit is em, relative to the font size. - let gaps = items.clone() - .flat_map(|s| { - // the transform matrix is from em space to device space - // so we need to invert it - let tr_inv = s.transform.matrix.inverse(); - let pos = (tr_inv * s.transform.vector).x(); - s.chars.iter() - .filter(|c| !s.text[c.offset..].chars().next().unwrap().is_whitespace()) - .map(move |c| (c.pos + pos, c.pos + pos + c.width, s.font_size)) - }) - .tuple_windows() - .filter(|(a, b)| b.0 > a.0) - .map(|(a, b)| (b.0 - a.1).max(0.01).min(0.25 * (a.2 + b.2))); - - let font_size = avg(items.clone().map(|s| s.font_size)).unwrap(); - //gaps.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); - let space_gap = (0.5 * font_size).min(2.0 * avg(gaps).unwrap_or(0.0)); //2.0 * gaps[gaps.len()/2]; - let mut end = 0.; // trailing edge of the last char - // out中最后一个字符是否是空格 + + // Whether the last processed TextChar is a space let mut trailing_space = out.chars().last().map(|c| c.is_whitespace()).unwrap_or(true); + let mut word_start_pos = 0.0; + let mut word_end_pos = 0.0; + let mut word_start_idx = out.len(); let mut y_min = f32::INFINITY; let mut y_max = -f32::INFINITY; let mut word_start = true; - let mut word_end = 0.0; for span in items { - let mut pos = 0; // byte index of last char into span.text + let mut offset = 0; // byte index of last char into span.text let tr_inv = span.transform.matrix.inverse(); let x_off = (tr_inv * span.transform.vector).x(); - for c in span.chars.iter() { - // current string of TextChar - let s = &span.text[pos..c.offset]; - if c.offset > 0 { - let is_whitespace = s.chars().all(|c| c.is_whitespace()); - // 在不为空格的时候, 将 s 写入 out. - if !trailing_space || !is_whitespace { - out.extend(s.nfkc()); + let chars = span.chars.as_slice(); + for (i, c) in chars.iter().enumerate() { + let next_offset = chars.get(i + 1).map_or(span.text.len(), |next| next.offset); + let s: &str = &span.text[offset..next_offset]; + + out.extend(s.nfkc()); + + let is_whitespace = s.chars().all(|c| c.is_whitespace()); + let len = s.chars().count(); + if trailing_space { + if !is_whitespace { + word_start = true; + word_start_idx = out.len() - len; } trailing_space = is_whitespace; + } else { + trailing_space = is_whitespace; + if is_whitespace { + words.push(Word { + text: out[word_start_idx..out.len()-len].into(), + rect: Rect { + x: word_start_pos, + y: y_min, + h: y_max - y_min, + w: word_end_pos - word_start_pos + } + }); + } else if c.pos + x_off > end + word_gap { + words.push(Word { + text: out[word_start_idx..].into(), + rect: Rect { + x: word_start_pos, + y: y_min, + h: y_max - y_min, + w: word_end_pos - word_start_pos + } + }); + + out.push(' '); + trailing_space = true; + word_start = true; + word_start_idx = out.len() - 1; + } } - // 在 s 不为空格,且有gap 的时候,记录一个 word. - if !trailing_space && c.pos + x_off > end + space_gap { - words.push(Word { - text: out[word_start_idx..].into(), - rect: Rect { - x: word_start_pos, - y: y_min, - h: y_max - y_min, - w: word_end - word_start_pos - } - }); - - out.push(' '); - trailing_space = true; - word_start = true; - word_start_idx = out.len(); - } - pos = c.offset; + end = c.pos + x_off + c.width; - if c.offset == 0 || !trailing_space { - word_end = (span.transform.matrix * Vector2F::new(end, 0.0)).x(); - } + word_end_pos = (span.transform.matrix * Vector2F::new(end, 0.0)).x(); if word_start { y_min = span.rect.min_y(); @@ -86,25 +84,68 @@ pub fn concat_text<'a, E: Encoder + 'a>(out: &mut String, items: impl Iterator(items: impl Iterator> + Clone) -> f32 { + let gaps = items.clone() + .flat_map(|s| { + // the transform matrix is from em space to device space + // so we need to invert it + let tr_inv = s.transform.matrix.inverse(); + let pos = (tr_inv * s.transform.vector).x(); + + s.chars.iter() + .filter(|c| !s.text[c.offset..].chars().next().unwrap().is_whitespace()) + .map(move |c| (c.pos + pos, c.pos + pos + c.width, s.font_size)) + }) + .tuple_windows() + .filter(|(a, b)| b.0 > a.0) + .map(|(a, b)| (b.0 - a.1).max(0.01).min(0.25 * (a.2 + b.2))); + + let avg_font_size = avg(items.clone().map(|s| s.font_size)).unwrap(); + //gaps.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap()); + + (0.5 * avg_font_size).min(2.0 * avg(gaps).unwrap_or(0.0)) //2.0 * gaps[gaps.len()/2]; +} + #[cfg(test)] mod tests { use pathfinder_geometry::{rect::RectF, transform2d::Transform2F};