From 81562da04a2fea1af9db5bc297ea5b00954340c2 Mon Sep 17 00:00:00 2001 From: benolt Date: Mon, 16 Sep 2024 14:55:47 +0200 Subject: [PATCH] Add page numbers to text groups --- .../src/file_parser/file_parser_helper.rs | 26 ++++++-- .../src/file_parser/file_parser_types.rs | 63 ++++++++++--------- .../file_parser/local_parsing/docx_parsing.rs | 13 +++- .../file_parser/local_parsing/html_parsing.rs | 5 ++ .../file_parser/local_parsing/md_parsing.rs | 4 ++ .../file_parser/local_parsing/pdf_parsing.rs | 8 ++- .../vector_resource/vector_resource_types.rs | 2 +- 7 files changed, 85 insertions(+), 36 deletions(-) diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_helper.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_helper.rs index 414ff392b..5f4e2c6b8 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_helper.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_helper.rs @@ -351,7 +351,11 @@ impl ShinkaiFileParser { (parsed_result.to_string(), serialized_metadata) } - pub fn parse_and_split_into_text_groups(text: String, max_node_text_size: u64) -> Vec { + pub fn parse_and_split_into_text_groups( + text: String, + max_node_text_size: u64, + page_number: Option, + ) -> Vec { let mut text_groups = Vec::new(); let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(&text); let (parsed_md_text, md_metadata) = ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_text); @@ -367,11 +371,23 @@ impl ShinkaiFileParser { let (parsed_chunk, metadata, _) = ShinkaiFileParser::parse_and_extract_metadata(&chunk); let (parsed_md_chunk, md_metadata) = ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_chunk); let metadata = metadata.into_iter().chain(md_metadata).collect(); - text_groups.push(TextGroup::new(parsed_md_chunk, metadata, vec![], None)); + let mut text_group = TextGroup::new(parsed_md_chunk, metadata, vec![], None); + + if let Some(page_number) = page_number { + text_group.push_page_number(page_number); + } + + text_groups.push(text_group); } } else { let metadata = metadata.into_iter().chain(md_metadata).collect(); - text_groups.push(TextGroup::new(parsed_md_text, metadata, vec![], None)); + let mut text_group = TextGroup::new(parsed_md_text, metadata, vec![], None); + + if let Some(page_number) = page_number { + text_group.push_page_number(page_number); + } + + text_groups.push(text_group); } text_groups @@ -384,9 +400,11 @@ impl ShinkaiFileParser { depth: usize, text: String, max_node_text_size: u64, + page_number: Option, ) { if !text.is_empty() { - let created_text_groups = ShinkaiFileParser::parse_and_split_into_text_groups(text, max_node_text_size); + let created_text_groups = + ShinkaiFileParser::parse_and_split_into_text_groups(text, max_node_text_size, page_number); if depth > 0 { let mut parent_group = text_groups.last_mut(); diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_types.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_types.rs index 7848905f5..843f632d8 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_types.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_types.rs @@ -75,39 +75,44 @@ impl TextGroup { } if let Some(page_number) = page_number { - let mut unique_page_numbers: HashSet = HashSet::new(); - - if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) { - let page_numbers_metadata: Result, _> = page_numbers_metadata - .trim_matches(|c| c == '[' || c == ']') - .split(",") - .map(|n| n.trim().parse::()) - .collect(); - - match page_numbers_metadata { - Ok(page_numbers) => { - for page_number in page_numbers { - unique_page_numbers.insert(page_number); - } + self.push_page_number(page_number); + } + } + + /// Pushes a page number into this TextGroup + pub fn push_page_number(&mut self, page_number: u32) { + let mut unique_page_numbers: HashSet = HashSet::new(); + + if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) { + let page_numbers_metadata: Result, _> = page_numbers_metadata + .trim_matches(|c| c == '[' || c == ']') + .split(",") + .map(|n| n.trim().parse::()) + .collect(); + + match page_numbers_metadata { + Ok(page_numbers) => { + for page_number in page_numbers { + unique_page_numbers.insert(page_number); } - Err(_) => {} } + Err(_) => {} } - - unique_page_numbers.insert(page_number); - - self.metadata.insert( - ShinkaiFileParser::page_numbers_metadata_key(), - format!( - "[{}]", - unique_page_numbers - .iter() - .map(|n| n.to_string()) - .collect::>() - .join(", ") - ), - ); } + + unique_page_numbers.insert(page_number); + + self.metadata.insert( + ShinkaiFileParser::page_numbers_metadata_key(), + format!( + "[{}]", + unique_page_numbers + .iter() + .map(|n| n.to_string()) + .collect::>() + .join(", ") + ), + ); } /// Pushes a sub-group into this TextGroup diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/docx_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/docx_parsing.rs index 9ce1d45c6..9939f8242 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/docx_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/docx_parsing.rs @@ -94,6 +94,7 @@ impl LocalFileParser { heading_depth, current_text.clone(), max_node_text_size, + None, ); current_text = "".to_string(); @@ -108,6 +109,7 @@ impl LocalFileParser { heading_depth, text, max_node_text_size, + None, ); heading_depth += 1; return; @@ -127,6 +129,7 @@ impl LocalFileParser { heading_depth, current_text.clone(), max_node_text_size, + None, ); current_text = text; } @@ -156,6 +159,7 @@ impl LocalFileParser { heading_depth, current_text.clone(), max_node_text_size, + None, ); current_text = "".to_string(); @@ -165,12 +169,19 @@ impl LocalFileParser { heading_depth, table_text, max_node_text_size, + None, ); } _ => {} }); - ShinkaiFileParser::push_text_group_by_depth(&mut text_groups, heading_depth, current_text, max_node_text_size); + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + heading_depth, + current_text, + max_node_text_size, + None, + ); Ok(text_groups) } diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/html_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/html_parsing.rs index 5f37eaf08..8c8a601c4 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/html_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/html_parsing.rs @@ -120,6 +120,7 @@ impl LocalFileParser { heading_parents.len(), node_text.trim().to_owned(), max_node_text_size, + None, ); node_text.clear(); } @@ -131,6 +132,7 @@ impl LocalFileParser { heading_parents.len(), node_text.trim().to_owned(), max_node_text_size, + None, ); node_text.clear(); @@ -229,6 +231,7 @@ impl LocalFileParser { heading_depth, inner_text.trim().to_owned(), max_node_text_size, + None, ); } "a" => { @@ -284,6 +287,7 @@ impl LocalFileParser { heading_parents.len(), inner_text.trim().to_owned(), max_node_text_size, + None, ); } "caption" => { @@ -339,6 +343,7 @@ impl LocalFileParser { heading_parents.len(), result_text.trim().to_owned(), max_node_text_size, + None, ); Ok(text_groups) diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs index 85e9d95e8..f5960eb88 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs @@ -73,6 +73,7 @@ impl LocalFileParser { heading_parents.len(), current_text.clone(), max_node_text_size, + None, ); current_text = "".to_string(); @@ -104,6 +105,7 @@ impl LocalFileParser { heading_depth, text.to_string(), max_node_text_size, + None, ); } NodeValue::Paragraph => { @@ -169,6 +171,7 @@ impl LocalFileParser { heading_parents.len(), current_text.clone(), max_node_text_size, + None, ); current_text = "".to_string(); } @@ -181,6 +184,7 @@ impl LocalFileParser { heading_parents.len(), current_text.clone(), max_node_text_size, + None, ); Ok(text_groups) diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs index a80dbca42..5a63673dc 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs @@ -20,7 +20,13 @@ impl LocalFileParser { for page in parsed_pages.into_iter() { for pdf_text in page.content.into_iter() { - ShinkaiFileParser::push_text_group_by_depth(&mut text_groups, 0, pdf_text.text, max_node_text_size); + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + 0, + pdf_text.text, + max_node_text_size, + Some(page.page_number.try_into().unwrap_or_default()), + ); } } diff --git a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs index 859b480b3..f39e81493 100644 --- a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs +++ b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs @@ -211,7 +211,7 @@ impl RetrievedNode { if let Some(metadata) = &self.node.metadata { if let Some(page_numbers) = metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) { if !page_numbers.is_empty() { - return format!("Pgs: {}", page_numbers); + return format!("Pages: {}", page_numbers); } } }