diff --git a/Cargo.lock b/Cargo.lock
index bbefdcf9d..1a37ef9ba 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6648,6 +6648,7 @@ dependencies = [
  "shinkai_sqlite",
  "tempfile",
  "thiserror 2.0.3",
+ "tokio",
  "urlencoding",
  "utoipa",
 ]
diff --git a/shinkai-libs/shinkai-embedding/src/lib.rs b/shinkai-libs/shinkai-embedding/src/lib.rs
index 883c7d517..8ad43d457 100644
--- a/shinkai-libs/shinkai-embedding/src/lib.rs
+++ b/shinkai-libs/shinkai-embedding/src/lib.rs
@@ -1,3 +1,4 @@
 pub mod embedding_generator;
 pub mod model_type;
 pub mod shinkai_embedding_errors;
+pub mod mock_generator;
\ No newline at end of file
diff --git a/shinkai-libs/shinkai-embedding/src/mock_generator.rs b/shinkai-libs/shinkai-embedding/src/mock_generator.rs
new file mode 100644
index 000000000..bd2ee8251
--- /dev/null
+++ b/shinkai-libs/shinkai-embedding/src/mock_generator.rs
@@ -0,0 +1,50 @@
+use crate::model_type::EmbeddingModelType;
+use crate::shinkai_embedding_errors::ShinkaiEmbeddingError;
+use async_trait::async_trait;
+use crate::embedding_generator::EmbeddingGenerator;
+
+#[derive(Clone)]
+pub struct MockGenerator {
+    model_type: EmbeddingModelType,
+    num_embeddings: usize,
+}
+
+impl MockGenerator {
+    pub fn new(model_type: EmbeddingModelType, num_embeddings: usize) -> Self {
+        MockGenerator {
+            model_type,
+            num_embeddings,
+        }
+    }
+}
+
+#[async_trait]
+impl EmbeddingGenerator for MockGenerator {
+    fn model_type(&self) -> EmbeddingModelType {
+        self.model_type.clone()
+    }
+
+    fn set_model_type(&mut self, model_type: EmbeddingModelType) {
+        self.model_type = model_type;
+    }
+
+    fn box_clone(&self) -> Box<dyn EmbeddingGenerator> {
+        Box::new((*self).clone())
+    }
+
+    fn generate_embedding_blocking(&self, _input_string: &str) -> Result<Vec<f32>, ShinkaiEmbeddingError> {
+        Ok(vec![0.0; self.num_embeddings])
+    }
+
+    fn generate_embeddings_blocking(&self, input_strings: &Vec<String>) -> Result<Vec<Vec<f32>>, ShinkaiEmbeddingError> {
+        Ok(input_strings.iter().map(|_| vec![0.0; self.num_embeddings]).collect())
+    }
+
+    async fn generate_embedding(&self, _input_string: &str) -> Result<Vec<f32>, ShinkaiEmbeddingError> {
+        Ok(vec![0.0; self.num_embeddings])
+    }
+
+    async fn generate_embeddings(&self, input_strings: &Vec<String>) -> Result<Vec<Vec<f32>>, ShinkaiEmbeddingError> {
+        Ok(input_strings.iter().map(|_| vec![0.0; self.num_embeddings]).collect())
+    }
+}
diff --git a/shinkai-libs/shinkai-fs/Cargo.toml b/shinkai-libs/shinkai-fs/Cargo.toml
index f57bb74a6..d1fc0d5a0 100644
--- a/shinkai-libs/shinkai-fs/Cargo.toml
+++ b/shinkai-libs/shinkai-fs/Cargo.toml
@@ -13,7 +13,7 @@ bincode = { workspace = true }
 serde_json = { workspace = true }
 rand = { workspace = true }
 blake3 = { workspace = true }
-# tokio = { workspace = true, features = ["full"] }
+tokio = { workspace = true, features = ["full"] }
 chrono = { workspace = true }
 comrak = { version = "0.22.0", default-features = true }
 thiserror = "2.0.3"
diff --git a/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs b/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs
deleted file mode 100644
index 12d0be8ad..000000000
--- a/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs
+++ /dev/null
@@ -1,280 +0,0 @@
-use shinkai_embedding::embedding_generator::EmbeddingGenerator;
-
-use super::{file_parser_types::TextGroup, utils::TextChunkingStrategy};
-use super::local_parsing::LocalFileParser;
-
-use crate::shinkai_fs_error::ShinkaiFsError;
-use std::{future::Future, pin::Pin};
-
-pub struct ShinkaiFileParser;
-
-impl ShinkaiFileParser {
-    pub async fn initialize_local_file_parser() -> Result<(), Box<dyn std::error::Error>> {
-        use shinkai_ocr::image_parser::ImageParser;
-
ImageParser::check_and_download_dependencies().await - } - - /// Processes the input file into a BaseVectorResource. - pub async fn process_file_into_resource( - file_buffer: Vec, - generator: &dyn EmbeddingGenerator, - file_name: String, - desc: Option, - parsing_tags: &Vec, // TODO: do we need this? - max_node_text_size: u64, - ) -> Result { - let cleaned_name = ShinkaiFileParser::clean_name(&file_name); - let source = VRSourceReference::from_file(&file_name, TextChunkingStrategy::V1)?; - let text_groups = - Self::process_file_into_text_groups(file_buffer, file_name, max_node_text_size, source.clone()).await?; - - ShinkaiFileParser::process_groups_into_resource( - text_groups, - generator, - cleaned_name, - desc, - source, - parsing_tags, - max_node_text_size, - ) - .await - } - - /// Processes the input file into a list of `TextGroup` with no embedding generated yet. - pub async fn process_file_into_text_groups( - file_buffer: Vec, - file_name: String, - max_node_text_size: u64, - source: VRSourceReference, - ) -> Result, ShinkaiFsError> { - LocalFileParser::process_file_into_grouped_text(file_buffer, file_name, max_node_text_size, source) - } - - /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource - pub async fn process_groups_into_resource( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: String, - desc: Option, - source: VRSourceReference, - parsing_tags: &Vec, - max_node_text_size: u64, - ) -> Result { - Self::process_groups_into_resource_with_custom_collection( - text_groups, - generator, - name, - desc, - source, - parsing_tags, - max_node_text_size, - ShinkaiFileParser::collect_texts_and_indices, - ) - .await - } - - /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource. - /// Allows specifying a custom collection function. - pub async fn process_groups_into_resource_with_custom_collection( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: String, - desc: Option, - source: VRSourceReference, - parsing_tags: &Vec, - max_node_text_size: u64, - collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), - ) -> Result { - let new_text_groups = ShinkaiFileParser::generate_text_group_embeddings( - text_groups, - generator.box_clone(), - 31, - max_node_text_size, - collect_texts_and_indices, - ) - .await?; - - let mut resource = ShinkaiFileParser::process_new_doc_resource_with_embeddings_already_generated( - new_text_groups, - &*generator, - &name, - desc, - source, - parsing_tags, - None, - ) - .await?; - resource.as_trait_object_mut().set_distribution_info(distribution_info); - Ok(resource) - } - - /// Processes an ordered list of `TextGroup`s into a - /// a ready-to-go BaseVectorResource. Allows specifying a custom collection function. 
- pub fn process_groups_into_resource_blocking_with_custom_collection( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: String, - desc: Option, - source: VRSourceReference, - parsing_tags: &Vec, - max_node_text_size: u64, - collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), - distribution_info: DistributionInfo, - ) -> Result { - // Group elements together before generating the doc - let cloned_generator = generator.box_clone(); - - // Use block_on to run the async-based batched embedding generation logic - let new_text_groups = ShinkaiFileParser::generate_text_group_embeddings_blocking( - &text_groups, - cloned_generator, - 31, - max_node_text_size, - collect_texts_and_indices, - )?; - - let mut resource = ShinkaiFileParser::process_new_doc_resource_blocking_with_embeddings_already_generated( - new_text_groups, - &*generator, - &name, - desc, - source, - parsing_tags, - None, - )?; - - resource.as_trait_object_mut().set_distribution_info(distribution_info); - Ok(resource) - } - - /// Recursively processes all text groups & their sub groups into DocumentResources. - /// This method assumes your text groups already have embeddings generated for them. - fn process_new_doc_resource_with_embeddings_already_generated<'a>( - text_groups: Vec, - generator: &'a dyn EmbeddingGenerator, - name: &'a str, - desc: Option, - source: VRSourceReference, - parsing_tags: &'a Vec, - resource_embedding: Option, - ) -> Pin> + Send + 'a>> { - Box::pin(async move { - let name = ShinkaiFileParser::clean_name(name); - let max_embedding_token_count = generator.model_type().max_input_token_count(); - let resource_desc = Self::_setup_resource_description( - desc, - &text_groups, - max_embedding_token_count, - max_embedding_token_count.checked_div(2).unwrap_or(100), - ); - let mut doc = DocumentVectorResource::new_empty(&name, resource_desc.as_deref(), source.clone(), true); - doc.set_embedding_model_used(generator.model_type()); - - // Sets the keywords - let keywords = Self::extract_keywords(&text_groups, 25); - doc.keywords_mut().set_keywords(keywords.clone()); - doc.keywords_mut().update_keywords_embedding(generator).await?; - // Sets a Resource Embedding if none provided. Primarily only used at the root level as the rest should already have them. 
- match resource_embedding { - Some(embedding) => doc.set_resource_embedding(embedding), - None => { - doc.update_resource_embedding(generator, None).await?; - } - } - - // Add each text group as either Vector Resource Nodes, - // or data-holding Nodes depending on if each has any sub-groups - for grouped_text in &text_groups { - let (_, metadata, has_sub_groups, new_name) = Self::process_grouped_text(grouped_text); - if has_sub_groups { - let new_doc = Self::process_new_doc_resource_with_embeddings_already_generated( - grouped_text.sub_groups.clone(), - generator, - &new_name, - None, - source.clone(), - parsing_tags, - grouped_text.embedding.clone(), - ) - .await?; - doc.append_vector_resource_node_auto(new_doc, metadata)?; - } else { - if grouped_text.text.len() <= 2 { - continue; - } - if let Some(embedding) = &grouped_text.embedding { - doc.append_text_node(&grouped_text.text, metadata, embedding.clone(), parsing_tags)?; - } else { - let embedding = generator.generate_embedding_default(&grouped_text.text).await?; - doc.append_text_node(&grouped_text.text, metadata, embedding, parsing_tags)?; - } - } - } - - Ok(BaseVectorResource::Document(doc)) - }) - } - - /// Recursively processes all text groups & their sub groups into DocumentResources. - /// This method assumes your text groups already have embeddings generated for them. - fn process_new_doc_resource_blocking_with_embeddings_already_generated( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: &str, - desc: Option, - source: VRSourceReference, - parsing_tags: &Vec, - resource_embedding: Option, - ) -> Result { - let name = ShinkaiFileParser::clean_name(name); - let max_embedding_token_count = generator.model_type().max_input_token_count(); - let resource_desc = Self::_setup_resource_description( - desc, - &text_groups, - max_embedding_token_count, - max_embedding_token_count / 2, - ); - let mut doc = DocumentVectorResource::new_empty(&name, resource_desc.as_deref(), source.clone(), true); - doc.set_embedding_model_used(generator.model_type()); - - // Sets the keywords and generates a keyword embedding - let keywords = Self::extract_keywords(&text_groups, 25); - doc.keywords_mut().set_keywords(keywords.clone()); - doc.keywords_mut().update_keywords_embedding_blocking(generator)?; - // Sets a Resource Embedding if none provided. Primarily only used at the root level as the rest should already have them. 
- match resource_embedding { - Some(embedding) => doc.set_resource_embedding(embedding), - None => { - doc.update_resource_embedding_blocking(generator, None)?; - } - } - - for grouped_text in &text_groups { - let (_new_resource_id, metadata, has_sub_groups, new_name) = Self::process_grouped_text(grouped_text); - if has_sub_groups { - let new_doc = Self::process_new_doc_resource_blocking_with_embeddings_already_generated( - grouped_text.sub_groups.clone(), - generator, - &new_name, - None, - source.clone(), - parsing_tags, - grouped_text.embedding.clone(), - )?; - let _ = doc.append_vector_resource_node_auto(new_doc, metadata); - } else { - if grouped_text.text.len() <= 2 { - continue; - } - if let Some(embedding) = &grouped_text.embedding { - let _ = doc.append_text_node(&grouped_text.text, metadata, embedding.clone(), parsing_tags); - } else { - let embedding = generator.generate_embedding_default_blocking(&grouped_text.text)?; - let _ = doc.append_text_node(&grouped_text.text, metadata, embedding, parsing_tags); - } - } - } - - Ok(BaseVectorResource::Document(doc)) - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_grouping.rs b/shinkai-libs/shinkai-fs/src/file_parser/file_parser_grouping.rs deleted file mode 100644 index 99db8b984..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_grouping.rs +++ /dev/null @@ -1,315 +0,0 @@ -use super::file_parser::ShinkaiFileParser; -use super::file_parser_types::TextGroup; -use crate::shinkai_fs_error::ShinkaiFsError; -use keyphrases::KeyPhraseExtractor; -use regex::Regex; -use shinkai_embedding::embedding_generator::EmbeddingGenerator; -use std::collections::HashMap; -use std::{future::Future, pin::Pin}; - -impl ShinkaiFileParser { - /// Recursive function to collect all texts from the text groups and their subgroups - pub fn collect_texts_and_indices( - text_groups: &[TextGroup], - max_node_text_size: u64, - path: Vec, - ) -> (Vec, Vec<(Vec, usize)>) { - let mut texts = Vec::new(); - let mut indices = Vec::new(); - - for (i, text_group) in text_groups.iter().enumerate() { - texts.push(text_group.format_text_for_embedding(max_node_text_size)); - let mut current_path = path.clone(); - current_path.push(i); - indices.push((current_path.clone(), texts.len() - 1)); - - for (j, sub_group) in text_group.sub_groups.iter().enumerate() { - texts.push(sub_group.format_text_for_embedding(max_node_text_size)); - let mut sub_path = current_path.clone(); - sub_path.push(j); - indices.push((sub_path.clone(), texts.len() - 1)); - - let (sub_texts, sub_indices) = - Self::collect_texts_and_indices(&sub_group.sub_groups, max_node_text_size, sub_path); - texts.extend(sub_texts); - indices.extend(sub_indices); - } - } - - (texts, indices) - } - - /// Recursive function to assign the generated embeddings back to the text groups and their subgroups - fn assign_embeddings( - text_groups: &mut [TextGroup], - embeddings: &mut Vec>, - indices: &[(Vec, usize)], - ) { - for (path, flat_index) in indices { - if let Some(embedding) = embeddings.get(*flat_index) { - let mut target = &mut text_groups[path[0]]; - for &index in &path[1..] { - target = &mut target.sub_groups[index]; - } - target.embedding = Some(embedding.clone()); - } - } - } - - /// Recursively goes through all of the text groups and batch generates embeddings - /// for all of them in parallel, processing up to 10 futures at a time. 
- pub fn generate_text_group_embeddings( - text_groups: Vec, - generator: Box, - mut max_batch_size: u64, - max_node_text_size: u64, - collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), - ) -> Pin, ShinkaiFsError>> + Send>> { - Box::pin(async move { - // Clone the input text_groups - - let mut text_groups = text_groups; - - // Collect all texts from the text groups and their subgroups - let (texts, indices) = collect_texts_and_indices(&text_groups, max_node_text_size, vec![]); - - // Generate embeddings for all texts in batches - let ids: Vec = vec!["".to_string(); texts.len()]; - let mut all_futures = Vec::new(); - let mut current_batch_futures = Vec::new(); - - for (index, batch) in texts.chunks(max_batch_size as usize).enumerate() { - let batch_texts = batch.to_vec(); - let generator_clone = generator.box_clone(); // Clone the generator for use in the future. - - // Use the `move` keyword to take ownership of `generator_clone` inside the async block. - let future = async move { generator_clone.generate_embeddings(&batch_texts).await }; - current_batch_futures.push(future); - - // If we've collected 10 futures or are at the last batch, add them to all_futures and start a new vector - if current_batch_futures.len() == 10 || index == texts.chunks(max_batch_size as usize).count() - 1 { - all_futures.push(current_batch_futures); - current_batch_futures = Vec::new(); - } - } - - // Process each group of up to 10 futures in sequence - let mut embeddings = Vec::new(); - for futures_group in all_futures { - let results = futures::future::join_all(futures_group).await; - for result in results { - match result { - Ok(batch_embeddings) => { - embeddings.extend(batch_embeddings); - } - Err(e) => { - if max_batch_size > 5 { - max_batch_size -= 5; - return Self::generate_text_group_embeddings( - text_groups, - generator, - max_batch_size, - max_node_text_size, - collect_texts_and_indices, - ) - .await; - } else { - return Err(e); - } - } - } - } - } - - // Assign the generated embeddings back to the text groups and their subgroups - Self::assign_embeddings(&mut text_groups, &mut embeddings, &indices); - - Ok(text_groups) - }) - } - - /// Recursively goes through all of the text groups and batch generates embeddings - /// for all of them. 
- pub fn generate_text_group_embeddings_blocking( - text_groups: &Vec, - generator: Box, - mut max_batch_size: u64, - max_node_text_size: u64, - collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), - ) -> Result, ShinkaiFsError> { - // Clone the input text_groups - let mut text_groups = text_groups.clone(); - - // Collect all texts from the text groups and their subgroups - let (texts, indices) = collect_texts_and_indices(&text_groups, max_node_text_size, vec![]); - - // Generate embeddings for all texts in batches - let ids: Vec = vec!["".to_string(); texts.len()]; - let mut embeddings = Vec::new(); - for batch in texts.chunks(max_batch_size as usize) { - match generator.generate_embeddings_blocking(&batch.to_vec()) { - Ok(batch_embeddings) => { - embeddings.extend(batch_embeddings); - } - Err(e) => { - if max_batch_size > 5 { - max_batch_size -= 5; - return Self::generate_text_group_embeddings_blocking( - &text_groups, - generator, - max_batch_size, - max_node_text_size, - collect_texts_and_indices, - ); - } else { - return Err(e); - } - } - } - } - - // Assign the generated embeddings back to the text groups and their subgroups - Self::assign_embeddings(&mut text_groups, &mut embeddings, &indices); - - Ok(text_groups) - } - - /// Helper method for processing a grouped text for process_new_doc_resource - pub fn process_grouped_text(grouped_text: &TextGroup) -> (String, Option>, bool, String) { - let has_sub_groups = !grouped_text.sub_groups.is_empty(); - let new_name = grouped_text.text.clone(); - let new_resource_id = Self::generate_data_hash(new_name.as_bytes()); - - let metadata = grouped_text.metadata.clone(); - - (new_resource_id, Some(metadata), has_sub_groups, new_name) - } - - /// Internal method used to push into correct group for hierarchical grouping - pub fn push_group_to_appropriate_parent( - group: TextGroup, - title_group: &mut Option, - groups: &mut Vec, - ) { - if group.text.len() <= 2 { - return; - } - - if let Some(title_group) = title_group.as_mut() { - title_group.push_sub_group(group); - } else { - groups.push(group); - } - } - - /// Splits a string into chunks at the nearest whitespace to a given size - pub fn split_into_chunks(text: &str, chunk_size: usize) -> Vec { - let mut chunks = Vec::new(); - let mut start = 0; - while start < text.len() { - let end = start + chunk_size; - let end = if end < text.len() { - let mut end = end; - while end > start && !text.as_bytes()[end].is_ascii_whitespace() { - end -= 1; - } - if end == start { - start + chunk_size - } else { - end - } - } else { - text.len() - }; - - let chunk = &text[start..end]; - chunks.push(chunk.to_string()); - - start = end; - } - - chunks - } - - /// Splits a string into chunks at the nearest whitespace to a given size avoiding splitting metadata - pub fn split_into_chunks_with_metadata(text: &str, chunk_size: usize) -> Vec { - // The regex matches both pure and replaceable metadata - let re = Regex::new(Self::METADATA_REGEX).unwrap(); - let matched_positions: Vec<(usize, usize)> = re.find_iter(text).map(|m| (m.start(), m.end())).collect(); - - let mut chunks = Vec::new(); - let mut start = 0; - while start < text.len() { - let end = start + chunk_size; - let end = if end < text.len() { - let mut end = end; - while end > start - && (!text.as_bytes()[end].is_ascii_whitespace() - || matched_positions - .iter() - .any(|(meta_start, meta_end)| end >= *meta_start && end < *meta_end)) - { - end -= 1; - } - if end == start { - start + chunk_size - } else { - end - } - } else { 
- text.len() - }; - - let chunk = &text[start..end]; - chunks.push(chunk.to_string()); - - start = end; - } - - chunks - } - - /// Extracts the most important keywords from all Groups/Sub-groups - /// using the RAKE algorithm. - pub fn extract_keywords(groups: &Vec, num: u64) -> Vec { - // Extract all the text out of all the TextGroup and its subgroups and combine them together into a single string - let text = Self::extract_all_text_from_groups(groups); - - // Create a new KeyPhraseExtractor with a maximum of num keywords - let extractor = KeyPhraseExtractor::new(&text, num as usize); - - // Get the keywords - let keywords = extractor.get_keywords(); - - // Return only the keywords, discarding the scores - keywords.into_iter().map(|(_score, keyword)| keyword).collect() - } - - /// Extracts all text from the list of groups and any sub-groups inside - fn extract_all_text_from_groups(group: &Vec) -> String { - group - .iter() - .map(|element| { - let mut text = element.text.clone(); - for sub_group in &element.sub_groups { - text.push_str(&Self::extract_all_text_from_groups(&vec![sub_group.clone()])); - } - text - }) - .collect::>() - .join(" ") - } - - /// Concatenate text up to a maximum size. - pub fn concatenate_groups_up_to_max_size(elements: &Vec, max_size: usize) -> String { - let mut desc = String::new(); - for e in elements { - if desc.len() + e.text.len() + 1 > max_size { - break; // Stop appending if adding the next element would exceed max_size - } - desc.push_str(&e.text); - desc.push('\n'); // Add a line break after each element's text - } - desc.trim_end().to_string() // Trim any trailing space before returning - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs b/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs deleted file mode 100644 index da27df820..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs +++ /dev/null @@ -1,432 +0,0 @@ -use blake3::Hasher; -use chrono::{TimeZone, Utc}; -use regex::{Captures, Regex}; -use reqwest::Url; -use std::collections::HashMap; - -use super::file_parser::ShinkaiFileParser; -use super::file_parser_types::TextGroup; - -impl ShinkaiFileParser { - pub const PURE_METADATA_REGEX: &'static str = r"!\{\{\{([^:}]+):((?:[^}]*\}{0,2}[^}]+))\}\}\}!"; - pub const METADATA_REGEX: &'static str = r"\{\{\{([^:}]+):((?:[^}]*\}{0,2}[^}]+))\}\}\}"; - pub const MD_URL_REGEX: &'static str = r"(.?)\[(.*?)\]\((.*?)\)"; - - /// Key of page numbers metadata - pub fn page_numbers_metadata_key() -> String { - "pg_nums".to_string() - } - - /// Key of datetime metadata - pub fn datetime_metadata_key() -> String { - "datetime".to_string() - } - - /// Key of timestamp metadata - pub fn timestamp_metadata_key() -> String { - "timestamp".to_string() - } - - // Key of likes metadata - pub fn likes_metadata_key() -> String { - "likes".to_string() - } - - // Key of reposts metadata - pub fn reposts_metadata_key() -> String { - "reposts".to_string() - } - - // Key of replies metadata - pub fn replies_metadata_key() -> String { - "replies".to_string() - } - - /// Clean's the file name of auxiliary data (file extension, url in front of file name, etc.) - pub fn clean_name(name: &str) -> String { - // Decode URL-encoded characters to simplify processing. - let decoded_name = urlencoding::decode(name).unwrap_or_else(|_| name.into()); - - // Check if the name ends with ".htm" or ".html" and calculate the position to avoid deletion. 
- let avoid_deletion_position = if decoded_name.ends_with(".htm") || decoded_name.ends_with(".html") { - decoded_name.len().saturating_sub(4) // Position before ".htm" - } else if decoded_name.ends_with(".html") { - decoded_name.len().saturating_sub(5) // Position before ".html" - } else if decoded_name.ends_with(".mhtml") { - decoded_name.len().saturating_sub(6) // Position before ".mhtml" - } else { - decoded_name.len() // Use the full length if not ending with ".htm" or ".html" - }; - - // Find the last occurrence of "/" or "%2F" that is not too close to the ".htm" extension. - let last_relevant_slash_position = decoded_name.rmatch_indices(&['/', '%']).find_map(|(index, _)| { - if index + 3 < avoid_deletion_position && decoded_name[index..].starts_with("%2F") { - Some(index) - } else if index + 1 < avoid_deletion_position && decoded_name[index..].starts_with("/") { - Some(index) - } else { - None - } - }); - // If a relevant slash is found, slice the string from the character immediately following this slash. - let http_cleaned = match last_relevant_slash_position { - Some(index) => decoded_name - .get((index + if decoded_name[index..].starts_with("%2F") { 3 } else { 1 })..) - .unwrap_or(&decoded_name), - None => &decoded_name, - }; - - let http_cleaned = if http_cleaned.is_empty() || http_cleaned == ".html" || http_cleaned == ".htm" { - decoded_name.to_string() - } else { - http_cleaned.to_string() - }; - - // Remove extension - let cleaned_name = SourceFileType::clean_string_of_extension(&http_cleaned); - - cleaned_name - } - - /// Helper function that processes groups into a list of descriptions. - /// Only takes the top level Group text, does not traverse deeper. - pub fn process_groups_into_descriptions_list( - groups: &Vec, - max_size: usize, - max_node_text_size: usize, - ) -> Vec { - let mut descriptions = Vec::new(); - let mut description = String::new(); - let mut total_size = 0; - - for group in groups { - let element_text = &group.text; - if description.len() + element_text.len() > max_node_text_size { - descriptions.push(description.clone()); - total_size += description.len(); - description.clear(); - } - if total_size + element_text.len() > max_size { - break; - } - description.push_str(element_text); - description.push(' '); - } - if !description.is_empty() { - descriptions.push(description); - } - - descriptions - } - - /// Processes groups into a single description string. - /// Only takes the top level Group text, does not traverse deeper. 
- pub fn process_groups_into_description( - groups: &Vec, - max_size: usize, - max_node_text_size: usize, - ) -> String { - let descriptions = Self::process_groups_into_descriptions_list(groups, max_size, max_node_text_size); - descriptions.join(" ") - } - - /// Helper method for setting a description if none provided for process_new_doc_resource - pub fn _setup_resource_description( - desc: Option, - text_groups: &Vec, - max_size: usize, - max_node_text_size: usize, - ) -> Option { - if let Some(description) = desc { - Some(description.to_string()) - } else if !text_groups.is_empty() { - Some(Self::process_groups_into_description( - text_groups, - max_size, - max_node_text_size, - )) - } else { - None - } - } - - /// Generates a Blake3 hash of the data in the buffer - pub fn generate_data_hash(buffer: &[u8]) -> String { - let mut hasher = Hasher::new(); - hasher.update(buffer); - let result = hasher.finalize(); - result.to_hex().to_string() - } - - // Parse and extract metadata in a text - // Returns the parsed text and a hashmap of metadata - pub fn parse_and_extract_metadata(input_text: &str) -> (String, HashMap, bool) { - let mut metadata = HashMap::new(); - let mut parsed_any_metadata = false; - let pure_metadata_re = Regex::new(Self::PURE_METADATA_REGEX).unwrap(); - let replaceable_metadata_re = Regex::new(Self::METADATA_REGEX).unwrap(); - - let pure_result = pure_metadata_re.replace_all(input_text, |caps: &Captures| { - Self::extract_metadata_from_capture(&mut metadata, &mut parsed_any_metadata, caps, true) - }); - - let parsed_result = replaceable_metadata_re.replace_all(&pure_result, |caps: &Captures| { - Self::extract_metadata_from_capture(&mut metadata, &mut parsed_any_metadata, caps, false) - }); - - (parsed_result.to_string(), metadata, parsed_any_metadata) - } - - // Helper function to extract metadata from a capture - // is_pure is used to determine if the metadata should be removed from the text - fn extract_metadata_from_capture( - metadata: &mut HashMap, - parsed_any_metadata: &mut bool, - caps: &Captures, - is_pure: bool, - ) -> String { - // In case extracting the capture groups fails, return the original text which is guaranteed to be valid - let key = match caps.get(1) { - Some(key) => key.as_str(), - None => return caps.get(0).unwrap().as_str().to_string(), - }; - - let value = match caps.get(2) { - Some(value) => value.as_str(), - None => return caps.get(0).unwrap().as_str().to_string(), - }; - - *parsed_any_metadata = true; - - // 1. Verify supported key value constraints and ignore invalid ones - // 2. 
Replace any matched metadata or remove if it's pure - match key { - // timestamp or datetime: RFC3339 formatted date string - _ if key == ShinkaiFileParser::datetime_metadata_key() - || key == ShinkaiFileParser::timestamp_metadata_key() => - { - let datetime = chrono::DateTime::parse_from_rfc3339(value); - - match datetime { - Ok(_) => { - metadata.insert(ShinkaiFileParser::datetime_metadata_key(), value.to_string()); - - if is_pure { - "".to_string() - } else { - value.to_string() - } - } - Err(_) => { - // Attempt to parse timestamp in a less strict format - let datetime = chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%dT%H:%M:%S%.3fZ"); - - match datetime { - Ok(parsed_datetime) => { - let formatted_datetime = Utc.from_utc_datetime(&parsed_datetime).to_rfc3339(); - metadata.insert(key.to_string(), formatted_datetime.clone()); - - if is_pure { - "".to_string() - } else { - formatted_datetime - } - } - Err(_) => value.to_string(), - } - } - } - } - // pg_nums: Array of integers - _ if key == ShinkaiFileParser::page_numbers_metadata_key() => { - let page_numbers: Result, _> = value - .trim_matches(|c| c == '[' || c == ']') - .split(",") - .map(|n| n.trim().parse::()) - .collect(); - - match page_numbers { - Ok(_) => { - metadata.insert(key.to_string(), value.to_string()); - - if is_pure { - "".to_string() - } else { - value.to_string() - } - } - Err(_) => value.to_string(), - } - } - // likes, reposts, replies: Integer - _ if key == ShinkaiFileParser::likes_metadata_key() - || key == ShinkaiFileParser::reposts_metadata_key() - || key == ShinkaiFileParser::replies_metadata_key() => - { - let number = value.parse::(); - - match number { - Ok(_) => { - metadata.insert(key.to_string(), value.to_string()); - - if is_pure { - "".to_string() - } else { - value.to_string() - } - } - Err(_) => value.to_string(), - } - } - _ => { - metadata.insert(key.to_string(), value.to_string()); - - if is_pure { - "".to_string() - } else { - value.to_string() - } - } - } - } - - pub fn parse_and_extract_md_metadata(input_text: &str) -> (String, HashMap) { - let mut metadata = HashMap::new(); - let md_url_re = Regex::new(Self::MD_URL_REGEX).unwrap(); - - let parsed_result = md_url_re.replace_all(input_text, |caps: &Captures| { - let prefix = match caps.get(1) { - Some(prefix) => prefix.as_str(), - None => return caps.get(0).unwrap().as_str().to_string(), - }; - - let text = match caps.get(2) { - Some(text) => text.as_str(), - None => return caps.get(0).unwrap().as_str().to_string(), - }; - - let url = match caps.get(3) { - Some(url) => url.as_str(), - None => return caps.get(0).unwrap().as_str().to_string(), - }; - - let mut shortened_url = Url::parse(url) - .ok() - .map(|u| { - let mut scheme = u.scheme().to_string(); - let host = u.host_str().unwrap_or("").to_string(); - - if !scheme.is_empty() { - scheme = format!("{}://", scheme); - } - - format!("{}{}", scheme, host) - }) - .unwrap_or("".to_string()); - - if shortened_url.is_empty() { - shortened_url = url.chars().take(100).collect(); - } - - match prefix { - "!" 
=> { - let image_urls_entry = metadata.entry("image-urls".to_string()).or_insert(Vec::::new()); - image_urls_entry.push(format!("![{}]({})", text, url)); - format!("![{}]({})", text, shortened_url) - } - _ => { - let link_urls_entry = metadata.entry("link-urls".to_string()).or_insert(Vec::::new()); - link_urls_entry.push(format!("[{}]({})", text, url)); - format!("{}[{}]({})", prefix, text, shortened_url) - } - } - }); - - let serialized_metadata = metadata - .into_iter() - .map(|(key, values)| (key, serde_json::to_string(&values).unwrap_or_default())) - .collect::>(); - - (parsed_result.to_string(), serialized_metadata) - } - - pub fn parse_and_split_into_text_groups( - text: String, - max_node_text_size: u64, - page_number: Option, - ) -> Vec { - let mut text_groups = Vec::new(); - let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(&text); - let (parsed_md_text, md_metadata) = ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_text); - - if parsed_md_text.len() as u64 > max_node_text_size { - let chunks = if parsed_any_metadata { - ShinkaiFileParser::split_into_chunks_with_metadata(&text, max_node_text_size as usize) - } else { - ShinkaiFileParser::split_into_chunks(&text, max_node_text_size as usize) - }; - - for chunk in chunks { - let (parsed_chunk, metadata, _) = ShinkaiFileParser::parse_and_extract_metadata(&chunk); - let (parsed_md_chunk, md_metadata) = ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_chunk); - let metadata = metadata.into_iter().chain(md_metadata).collect(); - let mut text_group = TextGroup::new(parsed_md_chunk, metadata, vec![], None); - - if let Some(page_number) = page_number { - text_group.push_page_number(page_number); - } - - text_groups.push(text_group); - } - } else { - let metadata = metadata.into_iter().chain(md_metadata).collect(); - let mut text_group = TextGroup::new(parsed_md_text, metadata, vec![], None); - - if let Some(page_number) = page_number { - text_group.push_page_number(page_number); - } - - text_groups.push(text_group); - } - - text_groups - } - - // Creates a new text group and nests it under the last group at the given depth. - // It splits text groups into chunks if needed and parses metadata in the text. 
- pub fn push_text_group_by_depth( - text_groups: &mut Vec, - depth: usize, - text: String, - max_node_text_size: u64, - page_number: Option, - ) { - if !text.is_empty() { - let created_text_groups = - ShinkaiFileParser::parse_and_split_into_text_groups(text, max_node_text_size, page_number); - - if depth > 0 { - let mut parent_group = text_groups.last_mut(); - for _ in 1..depth { - if let Some(last_group) = parent_group { - parent_group = last_group.sub_groups.last_mut(); - } - } - - if let Some(last_group) = parent_group { - for text_group in created_text_groups { - last_group.push_sub_group(text_group); - } - } else { - for text_group in created_text_groups { - text_groups.push(text_group); - } - } - } else { - for text_group in created_text_groups { - text_groups.push(text_group); - } - } - } - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_types.rs b/shinkai-libs/shinkai-fs/src/file_parser/file_parser_types.rs deleted file mode 100644 index 843f632d8..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_types.rs +++ /dev/null @@ -1,122 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; - -use crate::{embeddings::Embedding, file_parser::file_parser::ShinkaiFileParser}; - -/// An intermediary type for processing content into Node's held in VectorResources -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct TextGroup { - pub text: String, - pub metadata: HashMap, - pub sub_groups: Vec, - pub embedding: Option, -} - -impl TextGroup { - /// Creates a new instance of TextGroup - pub fn new( - text: String, - metadata: HashMap, - sub_groups: Vec, - embedding: Option, - ) -> Self { - TextGroup { - text, - metadata, - sub_groups, - embedding, - } - } - - /// Creates a new instance of TextGroup with default empty values. - pub fn new_empty() -> Self { - TextGroup { - text: String::new(), - metadata: HashMap::new(), - sub_groups: Vec::new(), - embedding: None, - } - } - - /// Prepares a string to be used to generate an Embedding for this TextGroup. - /// Extracts most prevalent keywords from all sub-groups and appends them to - /// the end of the groups actual text. 
- pub fn format_text_for_embedding(&self, max_node_text_size: u64) -> String { - let mut keyword_string = String::new(); - let base_string = &self.text; - let pre_keyword_length = base_string.len(); - - // Extract keywords from the TextGroup and its sub-groups - let keywords: Vec = ShinkaiFileParser::extract_keywords(&vec![self.clone()], 1); - - for keyword in keywords { - if pre_keyword_length + keyword_string.len() + keyword.len() <= max_node_text_size as usize { - keyword_string = format!("{}, {}", keyword_string, keyword); - } else { - break; - } - } - - format!("{} Keywords: {}", base_string, keyword_string.trim_start_matches(", ")) - } - - /// Pushes data into this TextGroup and extracts metadata - pub fn push_data(&mut self, text: &str, page_number: Option) { - if !self.text.is_empty() { - self.text.push(' '); - } - - let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(text); - if parsed_any_metadata { - self.text.push_str(&parsed_text); - self.metadata.extend(metadata); - } else { - self.text.push_str(text); - } - - if let Some(page_number) = page_number { - self.push_page_number(page_number); - } - } - - /// Pushes a page number into this TextGroup - pub fn push_page_number(&mut self, page_number: u32) { - let mut unique_page_numbers: HashSet = HashSet::new(); - - if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) { - let page_numbers_metadata: Result, _> = page_numbers_metadata - .trim_matches(|c| c == '[' || c == ']') - .split(",") - .map(|n| n.trim().parse::()) - .collect(); - - match page_numbers_metadata { - Ok(page_numbers) => { - for page_number in page_numbers { - unique_page_numbers.insert(page_number); - } - } - Err(_) => {} - } - } - - unique_page_numbers.insert(page_number); - - self.metadata.insert( - ShinkaiFileParser::page_numbers_metadata_key(), - format!( - "[{}]", - unique_page_numbers - .iter() - .map(|n| n.to_string()) - .collect::>() - .join(", ") - ), - ); - } - - /// Pushes a sub-group into this TextGroup - pub fn push_sub_group(&mut self, sub_group: TextGroup) { - self.sub_groups.push(sub_group); - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs deleted file mode 100644 index dee50d3e5..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs +++ /dev/null @@ -1,72 +0,0 @@ -use super::LocalFileParser; -use crate::{file_parser::file_parser_types::TextGroup, shinkai_fs_error::ShinkaiFsError}; -use csv::ReaderBuilder; -use std::io::Cursor; - -impl LocalFileParser { - /// Attempts to process the provided csv file into a list of TextGroups. - pub fn process_csv_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { - let csv_lines = Self::parse_csv_auto(&file_buffer).map_err(|_| ShinkaiFsError::FailedCSVParsing)?; - Self::process_table_rows(csv_lines, max_node_text_size) - } - - // /// Parse CSV data from a buffer and attempt to automatically detect - // /// headers. - pub fn parse_csv_auto(buffer: &[u8]) -> Result, ShinkaiFsError> { - let mut reader = ReaderBuilder::new().flexible(true).from_reader(Cursor::new(buffer)); - let headers = reader - .headers() - .map_err(|_| ShinkaiFsError::FailedCSVParsing)? 
- .iter() - .map(String::from) - .collect::>(); - - let likely_header = headers.iter().all(|s| { - let is_alphabetic = s.chars().all(|c| c.is_alphabetic() || c.is_whitespace() || c == '_'); - let no_duplicates = headers.iter().filter(|&item| item == s).count() == 1; - let no_prohibited_chars = !s.contains(&['@', '#', '$', '%', '^', '&', '*']); - - is_alphabetic && no_duplicates && no_prohibited_chars - }); - - Self::parse_csv(&buffer, likely_header) - } - - // /// Parse CSV data from a buffer. - // /// * `header` - A boolean indicating whether to prepend column headers to - // /// values. - pub fn parse_csv(buffer: &[u8], header: bool) -> Result, ShinkaiFsError> { - let mut reader = ReaderBuilder::new() - .flexible(true) - .has_headers(header) - .from_reader(Cursor::new(buffer)); - let headers = if header { - reader - .headers() - .map_err(|_| ShinkaiFsError::FailedCSVParsing)? - .iter() - .map(String::from) - .collect::>() - } else { - Vec::new() - }; - - let mut result = Vec::new(); - for record in reader.records() { - let record = record.map_err(|_| ShinkaiFsError::FailedCSVParsing)?; - let row: Vec = if header { - record - .iter() - .enumerate() - .map(|(i, e)| format!("{}: {}", headers[i], e)) - .collect() - } else { - record.iter().map(String::from).collect() - }; - let row_string = row.join("|"); - result.push(row_string); - } - - Ok(result) - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs deleted file mode 100644 index a0c0dc1c2..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs +++ /dev/null @@ -1,347 +0,0 @@ -use regex::Regex; -use scraper::{ElementRef, Html, Selector}; - -use crate::{file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, shinkai_fs_error::ShinkaiFsError}; - -use super::LocalFileParser; - -/// If the file provided is an html file, attempt to extract out the core content to improve overall quality. 
-pub fn extract_core_content(file_buffer: Vec, file_name: &str) -> Vec { - if file_name.ends_with(".html") || file_name.ends_with(".htm") { - let file_content = String::from_utf8_lossy(&file_buffer); - let document = Html::parse_document(&file_content); - - // If the file is from GitHub, use a specific selector for GitHub's layout - if file_name.contains("github.com") { - if let Ok(layout_selector) = Selector::parse(".entry-content") { - if let Some(layout_element) = document.select(&layout_selector).next() { - return layout_element.inner_html().into_bytes(); - } - } - } else if file_name.contains("twitter.com") || file_name.contains("x.com") { - // Selector for Twitter or X.com's layout - if let Ok(primary_column_selector) = Selector::parse("div[data-testid='primaryColumn']") { - if let Some(primary_column_element) = document.select(&primary_column_selector).next() { - return primary_column_element.inner_html().into_bytes(); - } - } - } else if file_name.contains("youtube.com") { - // Selector for YouTube's layout - let mut content = String::new(); - if let Ok(above_the_fold_selector) = Selector::parse("#above-the-fold") { - if let Some(above_the_fold_element) = document.select(&above_the_fold_selector).next() { - content += &above_the_fold_element.inner_html(); - } - } - if let Ok(comments_selector) = Selector::parse(".ytd-comments") { - if let Some(comments_element) = document.select(&comments_selector).next() { - content += &comments_element.inner_html(); - } - } - return content.into_bytes(); - } else { - // Try to select the 'main', 'article' tag or a class named 'main' - if let Ok(main_selector) = Selector::parse("main, .main, article") { - if let Some(main_element) = document.select(&main_selector).next() { - return main_element.inner_html().into_bytes(); - } - } - - if let Ok(body_selector) = Selector::parse("body") { - if let Some(body_element) = document.select(&body_selector).next() { - return body_element.inner_html().into_bytes(); - } - } - } - } - - file_buffer -} - -impl LocalFileParser { - const IGNORED_ELEMENTS: &'static [&'static str] = &[ - "base", "head", "link", "meta", "noscript", "script", "style", "svg", "template", "title", - ]; - const HTML_HEADERS: &'static [&'static str] = &["h1", "h2", "h3", "h4", "h5", "h6"]; - - pub fn process_html_file( - file_buffer: Vec, - file_name: &str, - max_node_text_size: u64, - ) -> Result, ShinkaiFsError> { - let extracted_buffer = extract_core_content(file_buffer, file_name); - let document = Html::parse_fragment(&String::from_utf8_lossy(&extracted_buffer)); - - let mut text_groups: Vec = Vec::new(); - - // to keep track of the current parent headings - let mut heading_parents: Vec = Vec::with_capacity(6); - - // Parent nodes propagate context to child nodes. - // Nodes can alter their state and propagate them to their children. 
- #[derive(Default)] - struct HTMLNodeContext { - is_preformatted: bool, // pre tags - is_ordered_list: bool, // ol tags - list_item_start: u64, // start attribute for ol tags - list_depth: u64, // nested lists - } - - // Iterate through HTML elements and text nodes in order - fn iter_nodes<'a>( - element: ElementRef<'a>, - text_groups: &mut Vec, - max_node_text_size: u64, - heading_parents: &mut Vec, - context: HTMLNodeContext, - ) -> String { - let mut node_text = "".to_string(); - let mut list_item_index = context.list_item_start; - - for node in element.children() { - match node.value() { - scraper::Node::Element(element) => { - let el_name = element.name().to_lowercase(); - - if let Some(element) = ElementRef::wrap(node) { - // Jump to next node if the element is ignored - if LocalFileParser::IGNORED_ELEMENTS.contains(&element.value().name()) { - continue; - } - - // Push current text and start a new text group on section elements - if el_name == "article" || el_name == "section" || el_name == "table" || el_name == "hr" { - ShinkaiFileParser::push_text_group_by_depth( - text_groups, - heading_parents.len(), - node_text.trim().to_owned(), - max_node_text_size, - None, - ); - node_text.clear(); - } - - // Header elements - if LocalFileParser::HTML_HEADERS.contains(&el_name.as_str()) { - ShinkaiFileParser::push_text_group_by_depth( - text_groups, - heading_parents.len(), - node_text.trim().to_owned(), - max_node_text_size, - None, - ); - node_text.clear(); - - let heading_level = el_name - .chars() - .last() - .unwrap_or_default() - .to_digit(10) - .unwrap_or_default() as usize; - - // Adjust heading_parents based on the current heading level - // Find the parent and remove previous child headings - if let Some(index) = heading_parents - .iter() - .rposition(|&parent_level| parent_level <= heading_level) - { - heading_parents.truncate(index + 1); - - if heading_parents[index] < heading_level { - heading_parents.push(heading_level); - } - } else { - heading_parents.clear(); - heading_parents.push(heading_level); - } - } - - match el_name.as_str() { - "div" | "button" | "label" | "footer" => { - if node_text.len() > 0 && !node_text.ends_with(char::is_whitespace) { - node_text.push_str(" "); - } - } - "p" | "br" | "blockquote" => { - if !node_text.is_empty() { - node_text.push_str("\n"); - } - } - "img" => { - let alt = element.attr("alt").unwrap_or(""); - let src = element.attr("src").unwrap_or(""); - - if alt.len() > 0 && src.len() > 0 { - node_text.push_str(&format!(" ![{}]({})", alt, src)); - } - } - "ol" => { - if !node_text.is_empty() && !node_text.ends_with("\n") { - node_text.push_str("\n"); - } - - let start = element.attr("start").unwrap_or("1"); - list_item_index = start.parse::().unwrap_or(1); - } - "ul" => { - if !node_text.is_empty() && !node_text.ends_with("\n") { - node_text.push_str("\n"); - } - list_item_index = 1; - } - _ => (), - } - - let list_depth = if el_name == "ol" || el_name == "ul" { - context.list_depth + 1 - } else { - context.list_depth - }; - - // Process child nodes - let inner_text = iter_nodes( - element, - text_groups, - max_node_text_size, - heading_parents, - HTMLNodeContext { - is_preformatted: context.is_preformatted || el_name == "pre", - is_ordered_list: (context.is_ordered_list || el_name == "ol") && el_name != "ul", - list_item_start: list_item_index, - list_depth, - }, - ); - - // Process inner text returned from child nodes - if inner_text.len() > 0 { - match el_name.as_str() { - "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => { - let heading_depth = 
if heading_parents.len() > 0 { - heading_parents.len() - 1 - } else { - 0 - }; - - ShinkaiFileParser::push_text_group_by_depth( - text_groups, - heading_depth, - inner_text.trim().to_owned(), - max_node_text_size, - None, - ); - } - "a" => { - let href = element.attr("href").unwrap_or(""); - - if href.len() > 0 && !href.starts_with("#") { - node_text.push_str(&format!(" [{}]({})", inner_text, href)); - } else { - node_text.push_str(&format!(" {}", inner_text)); - } - } - "blockquote" => { - inner_text.split("\n").for_each(|line| { - node_text.push_str(&format!("> {}\n", line)); - }); - } - "code" => { - if context.is_preformatted { - node_text.push_str(&format!("```\n{}\n```\n", inner_text)); - } else { - node_text.push_str(&format!("`{}`", inner_text)); - } - } - "li" => { - let list_depth = if context.list_depth > 0 { context.list_depth } else { 1 }; - let indentation = "\t".repeat((list_depth - 1) as usize); - - if !node_text.is_empty() && !node_text.ends_with("\n") { - node_text.push_str("\n"); - } - - if context.is_ordered_list { - let li_value = element.attr("value").unwrap_or(""); - if let Some(value) = li_value.parse::().ok() { - list_item_index = value; - } - - node_text.push_str(&format!( - "{}{}. {}\n", - indentation, - list_item_index, - inner_text.trim() - )); - list_item_index += 1; - } else { - node_text.push_str(&format!("{}* {}\n", indentation, inner_text.trim())); - } - } - // Push table data to a text group - "table" => { - ShinkaiFileParser::push_text_group_by_depth( - text_groups, - heading_parents.len(), - inner_text.trim().to_owned(), - max_node_text_size, - None, - ); - } - "caption" => { - node_text.push_str(&format!("{}\n", inner_text.trim())); - } - "tr" => { - let row_text = inner_text.trim(); - let row_text = row_text.trim_end_matches(';'); - node_text.push_str(&format!("{}\n", row_text)); - } - "td" | "th" => { - node_text.push_str(&format!("{}; ", inner_text)); - } - _ => { - node_text.push_str(&inner_text); - } - } - } - } - } - scraper::Node::Text(text) => { - if text.text.trim().is_empty() { - continue; - } - - // Save preformatted text as is, otherwise remove extra whitespaces - if context.is_preformatted { - node_text.push_str(&text.text); - } else { - let re = Regex::new(r"\s{2,}|\n").unwrap(); - let sanitized_text = re.replace_all(&text.text, " "); - - node_text.push_str(&sanitized_text); - } - } - _ => (), - }; - } - - node_text - } - - let result_text = iter_nodes( - document.root_element(), - &mut text_groups, - max_node_text_size, - &mut heading_parents, - HTMLNodeContext::default(), - ); - - ShinkaiFileParser::push_text_group_by_depth( - &mut text_groups, - heading_parents.len(), - result_text.trim().to_owned(), - max_node_text_size, - None, - ); - - Ok(text_groups) - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs deleted file mode 100644 index ae80cf31a..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs +++ /dev/null @@ -1,79 +0,0 @@ -use std::collections::HashMap; - -use super::LocalFileParser; -use crate::file_parser::file_parser::ShinkaiFileParser; -use crate::file_parser::file_parser_types::TextGroup; -use crate::shinkai_fs_error::ShinkaiFsError; -use serde_json::Value as JsonValue; - -impl LocalFileParser { - /// Attempts to process the provided json file into a list of TextGroups. 
- pub fn process_json_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { - let json_string = String::from_utf8(file_buffer.clone()).map_err(|_| ShinkaiFsError::FailedJSONParsing)?; - let json: JsonValue = serde_json::from_str(&json_string)?; - - let text_groups = Self::process_container_json_value(&json, max_node_text_size); - - Ok(text_groups) - } - - /// Recursively processes a JSON value to build a hierarchy of TextGroups. - pub fn process_container_json_value(json: &JsonValue, max_node_text_size: u64) -> Vec { - let fn_merge_groups = |mut acc: Vec, current_group: TextGroup| { - if let Some(prev_group) = acc.last_mut() { - if prev_group.sub_groups.is_empty() - && current_group.sub_groups.is_empty() - && prev_group.text.len() + current_group.text.len() < max_node_text_size as usize - { - prev_group.text.push_str(format!("\n{}", current_group.text).as_str()); - return acc; - } - } - - acc.push(current_group); - acc - }; - - match json { - JsonValue::Object(map) => map - .iter() - .flat_map(|(key, value)| match value { - JsonValue::Object(_) | JsonValue::Array(_) => { - let mut text_group = TextGroup::new_empty(); - text_group.text = key.clone(); - text_group.sub_groups = Self::process_container_json_value(value, max_node_text_size); - - vec![text_group] - } - _ => Self::process_content_json_value(Some(key), value, max_node_text_size), - }) - .fold(Vec::new(), fn_merge_groups), - JsonValue::Array(arr) => arr - .iter() - .flat_map(|value| Self::process_container_json_value(value, max_node_text_size)) - .fold(Vec::new(), fn_merge_groups), - _ => Self::process_content_json_value(None, json, max_node_text_size), - } - } - - fn process_content_json_value(key: Option<&str>, value: &JsonValue, max_node_text_size: u64) -> Vec { - let mut text_groups = Vec::new(); - - let text = match key { - Some(key) => format!("{}: {}", key, value.to_string()), - None => value.to_string(), - }; - - if text.len() as u64 > max_node_text_size { - let chunks = ShinkaiFileParser::split_into_chunks(&text, max_node_text_size as usize); - - for chunk in chunks { - text_groups.push(TextGroup::new(chunk, HashMap::new(), vec![], None)); - } - } else { - text_groups.push(TextGroup::new(text, HashMap::new(), vec![], None)); - } - - text_groups - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs deleted file mode 100644 index 67622d6e7..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs +++ /dev/null @@ -1,54 +0,0 @@ -use crate::file_parser::file_parser_types::TextGroup; -use crate::shinkai_fs_error::ShinkaiFsError; - -pub struct LocalFileParser {} - -impl LocalFileParser { - /// Attempts to process a file into a list of TextGroups using local processing logic - /// implemented in Rust directly without relying on external services. - /// If local processing is not available for the provided source, then returns Err. 
- pub fn process_file_into_grouped_text( - file_buffer: Vec, - file_name: String, - max_node_text_size: u64, - source: VRSourceReference, - ) -> Result, ShinkaiFsError> { - let source_base = source; - - match &source_base { - VRSourceReference::None => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), - VRSourceReference::Standard(source) => match source { - SourceReference::Other(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), - SourceReference::FileRef(file_source) => match file_source.clone().file_type { - SourceFileType::Image(_) - | SourceFileType::Code(_) - | SourceFileType::ConfigFileType(_) - | SourceFileType::Video(_) - | SourceFileType::Audio(_) - | SourceFileType::Shinkai(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), - SourceFileType::Document(doc) => match doc { - DocumentFileType::Txt => LocalFileParser::process_txt_file(file_buffer, max_node_text_size), - DocumentFileType::Json => LocalFileParser::process_json_file(file_buffer, max_node_text_size), - DocumentFileType::Csv => LocalFileParser::process_csv_file(file_buffer, max_node_text_size), - // DocumentFileType::Docx => LocalFileParser::process_docx_file(file_buffer, max_node_text_size), - DocumentFileType::Html => { - LocalFileParser::process_html_file(file_buffer, &file_name, max_node_text_size) - } - - DocumentFileType::Md => LocalFileParser::process_md_file(file_buffer, max_node_text_size), - - DocumentFileType::Pdf => LocalFileParser::process_pdf_file(file_buffer, max_node_text_size), - - DocumentFileType::Xlsx | DocumentFileType::Xls => { - LocalFileParser::process_xlsx_file(file_buffer, max_node_text_size) - } - - _ => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), - }, - }, - SourceReference::ExternalURI(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), - }, - VRSourceReference::Notarized(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), - } - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs deleted file mode 100644 index 12dd2c175..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs +++ /dev/null @@ -1,187 +0,0 @@ -use comrak::{ - nodes::{AstNode, ListDelimType, ListType, NodeValue}, - parse_document, Arena, Options, -}; - -use crate::{file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, shinkai_fs_error::ShinkaiFsError}; - -use super::LocalFileParser; - -impl LocalFileParser { - pub fn process_md_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { - let md_string = String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedMDParsing)?; - - let arena = Arena::new(); - let root = parse_document(&arena, &md_string, &Options::default()); - - // build up an AST and iterate through nodes in order - fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &mut F) - where - F: FnMut(&'a AstNode<'a>), - { - f(node); - for c in node.children() { - iter_nodes(c, f); - } - } - - let mut text_groups: Vec = Vec::new(); - let mut current_text = "".to_string(); - let mut processed_node_type = NodeValue::Document; - - // heading_parents is used to keep track of the depth of the headings - let mut heading_parents: Vec = Vec::with_capacity(6); - - iter_nodes(root, &mut |node| match &node.data.borrow().value { - // Actual text comes in the next text node, set processed_node_type to the proper type - NodeValue::Heading(ref heading) 
=> { - processed_node_type = NodeValue::Heading(heading.clone()); - } - NodeValue::Paragraph => match processed_node_type { - // paragraph inside a list item - NodeValue::Item(_) => { - return; - } - _ => { - processed_node_type = NodeValue::Paragraph; - - if current_text.len() > 0 { - current_text.push_str("\n"); - } - } - }, - NodeValue::Item(ref list_item) => { - processed_node_type = NodeValue::Item(list_item.clone()); - } - NodeValue::Link(ref link) => { - processed_node_type = NodeValue::Link(link.clone()); - } - NodeValue::Image(ref image) => { - processed_node_type = NodeValue::Image(image.clone()); - } - - NodeValue::Text(ref text) => match processed_node_type { - NodeValue::Heading(ref heading) => { - // Push previous text to a text group - ShinkaiFileParser::push_text_group_by_depth( - &mut text_groups, - heading_parents.len(), - current_text.clone(), - max_node_text_size, - None, - ); - current_text = "".to_string(); - - let level = heading.level as usize; - - // Adjust heading_parents based on the current heading level - // Find the parent and remove previous child headings - if let Some(index) = heading_parents.iter().rposition(|&parent_level| parent_level <= level) { - heading_parents.truncate(index + 1); - - if heading_parents[index] < level { - heading_parents.push(level); - } - } else { - heading_parents.clear(); - heading_parents.push(level); - } - - let heading_depth = if heading_parents.len() > 0 { - heading_parents.len() - 1 - } else { - 0 - }; - - // Create a new text group for the heading - // Upcoming content will be added to its subgroups - ShinkaiFileParser::push_text_group_by_depth( - &mut text_groups, - heading_depth, - text.to_string(), - max_node_text_size, - None, - ); - } - NodeValue::Paragraph => { - current_text.push_str(text); - } - NodeValue::Item(ref list_item) => { - let prefix = match list_item.list_type { - ListType::Bullet => format!("{} ", list_item.bullet_char as char), - ListType::Ordered => match list_item.delimiter { - ListDelimType::Period => format!("{}. 
", list_item.start), - ListDelimType::Paren => format!("{}) ", list_item.start), - }, - }; - - current_text.push_str(format!("\n{} {}", prefix, text).as_str()); - processed_node_type = NodeValue::Paragraph; - } - NodeValue::Link(ref link) => { - current_text.push_str(format!("[{}]({})", text, link.url).as_str()); - processed_node_type = NodeValue::Paragraph; - } - NodeValue::Image(ref image) => { - current_text.push_str(format!("![{}]({})", text, image.url).as_str()); - processed_node_type = NodeValue::Paragraph; - } - _ => (), - }, - NodeValue::Code(ref code) => { - let ticks = "`".repeat(code.num_backticks); - current_text.push_str(format!("{}{}{}", ticks, code.literal, ticks).as_str()); - } - NodeValue::CodeBlock(ref code_block) => { - let fence = if code_block.fenced { - format!( - "{}", - (code_block.fence_char as char) - .to_string() - .repeat(code_block.fence_length) - ) - } else { - "".to_string() - }; - - current_text - .push_str(format!("\n{}{}\n{}{}\n", fence, code_block.info, code_block.literal, fence).as_str()); - } - NodeValue::HtmlBlock(ref html_block) => { - current_text.push_str(format!("\n{}", html_block.literal).as_str()); - } - NodeValue::HtmlInline(ref html_inline) => { - current_text.push_str(html_inline.as_str()); - } - NodeValue::LineBreak => { - current_text.push_str("\n"); - } - NodeValue::SoftBreak => { - current_text.push_str("\n"); - } - // split text groups by --- - NodeValue::ThematicBreak => { - ShinkaiFileParser::push_text_group_by_depth( - &mut text_groups, - heading_parents.len(), - current_text.clone(), - max_node_text_size, - None, - ); - current_text = "".to_string(); - } - _ => (), - }); - - // Push the last text group - ShinkaiFileParser::push_text_group_by_depth( - &mut text_groups, - heading_parents.len(), - current_text.clone(), - max_node_text_size, - None, - ); - - Ok(text_groups) - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/mod.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/mod.rs deleted file mode 100644 index 361cd4e96..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -pub mod csv_parsing; -pub mod html_parsing; -pub mod json_parsing; -pub mod local_parsing; -pub mod md_parsing; -pub mod pdf_parsing; -pub mod txt_parsing; - -pub use local_parsing::*; diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs deleted file mode 100644 index 4abeb9426..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs +++ /dev/null @@ -1,30 +0,0 @@ -use crate::{file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, shinkai_fs_error::ShinkaiFsError}; - -use super::LocalFileParser; - -impl LocalFileParser { - pub fn process_pdf_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { - use shinkai_ocr::pdf_parser::PDFParser; - - let pdf_parser = PDFParser::new().map_err(|_| ShinkaiFsError::FailedPDFParsing)?; - let parsed_pages = pdf_parser - .process_pdf_file(file_buffer) - .map_err(|_| ShinkaiFsError::FailedPDFParsing)?; - - let mut text_groups = Vec::new(); - - for page in parsed_pages.into_iter() { - for pdf_text in page.content.into_iter() { - ShinkaiFileParser::push_text_group_by_depth( - &mut text_groups, - 0, - pdf_text.text, - max_node_text_size, - Some(page.page_number.try_into().unwrap_or_default()), - ); - } - } - - Ok(text_groups) - } -} diff --git 
a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs deleted file mode 100644 index a9842bad0..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs +++ /dev/null @@ -1,152 +0,0 @@ -use std::collections::HashMap; - -use regex::Regex; - -use super::LocalFileParser; -use crate::file_parser::file_parser::ShinkaiFileParser; -use crate::file_parser::file_parser_types::TextGroup; -use crate::shinkai_fs_error::ShinkaiFsError; - -impl LocalFileParser { - /// Attempts to process the provided json file into a list of TextGroups. - pub fn process_txt_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { - let txt_string = String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedTXTParsing)?; - let sentences = LocalFileParser::process_into_sentences(txt_string); - let text_groups = LocalFileParser::process_into_text_groups(sentences, max_node_text_size); - // for sentence in &sentences { - // println!("S: {}", sentence); - // } - // for text_group in &text_groups { - // println!("TG: {}", text_group.text); - // } - - Ok(text_groups) - } - - /// Build a non-hierarchical list of TextGroups using the sentences - pub fn process_into_text_groups(text_lines: Vec, max_node_text_size: u64) -> Vec { - let mut text_groups = Vec::new(); - let mut current_text = String::new(); - let mut current_metadata = HashMap::new(); - - for line in text_lines { - let (parsed_line, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(&line); - - if parsed_line.len() as u64 + current_text.len() as u64 > max_node_text_size { - if !current_text.is_empty() { - text_groups.push(TextGroup::new( - current_text.clone(), - current_metadata.clone(), - vec![], - None, - )); - current_text.clear(); - current_metadata.clear(); - } - if parsed_line.len() as u64 > max_node_text_size { - // If the line itself exceeds max_node_text_size, split it into chunks - // Split the unparsed line into chunks and parse metadata in each chunk - let chunks = if parsed_any_metadata { - ShinkaiFileParser::split_into_chunks_with_metadata(&line, max_node_text_size as usize) - } else { - ShinkaiFileParser::split_into_chunks(&line, max_node_text_size as usize) - }; - - for chunk in chunks { - let (parsed_chunk, metadata, _) = if parsed_any_metadata { - ShinkaiFileParser::parse_and_extract_metadata(&chunk) - } else { - (chunk, HashMap::new(), false) - }; - - text_groups.push(TextGroup::new(parsed_chunk, metadata, vec![], None)); - } - } else { - current_text = parsed_line; - current_metadata.extend(metadata); - } - } else { - if !current_text.is_empty() { - current_text.push(' '); // Add space between sentences - } - current_text.push_str(&parsed_line); - current_metadata.extend(metadata); - } - } - - // Don't forget to add the last accumulated text as a TextGroup if it's not empty - if !current_text.is_empty() { - text_groups.push(TextGroup::new(current_text, current_metadata.clone(), vec![], None)); - } - - text_groups - } - - /// Given a piece of text, split it into a list of sentences, doing its best to respect punctuation - /// and taking into account English-based exceptions. 
- pub fn process_into_sentences(text: String) -> Vec { - let punctuation_marks = [',', '.', ';', '-', '&', '(', '{', '<', '"', '\'', '`']; - text.split("\n") - .filter(|line| !line.trim().is_empty() && line.trim().len() > 1) // Filter out empty or nearly empty lines - .flat_map(|line| { - let trimmed_line = line.trim(); - - let re = Regex::new(ShinkaiFileParser::PURE_METADATA_REGEX).unwrap(); - let is_pure_metadata = re.is_match(trimmed_line) - && re - .find(trimmed_line) - .map(|m| m.start() == 0 && m.end() == trimmed_line.len()) - .unwrap_or(false); - - // Ensure each line ends with a punctuation mark, defaulting to '.' - let line_with_ending = - if is_pure_metadata || punctuation_marks.iter().any(|&mark| trimmed_line.ends_with(mark)) { - trimmed_line.to_string() - } else { - format!("{}\n", trimmed_line) - }; - - Self::split_line_into_sentences(&line_with_ending) - }) - .collect() - } - - /// Splits a single line into sentences, considering common exceptions for English. - fn split_line_into_sentences(line: &str) -> Vec { - let mut sentences = Vec::new(); - let mut start = 0; - - // Expanded list of exceptions in lowercase - let exceptions = [ - " mr.", " mrs.", " ms.", " dr.", " prof.", " gen.", " rep.", " sen.", " jr.", " sr.", " ave.", " blvd.", - " st.", " rd.", " ln.", " ter.", " ct.", " pl.", " p.o.", " a.m.", " p.m.", " cm.", " kg.", " lb.", " oz.", - " ft.", " in.", " mi.", " b.a.", " m.a.", " ph.d.", " m.d.", " b.sc.", " m.sc.", " inc.", " ltd.", " co.", - " corp.", " llc.", " plc.", " et al.", " e.g.", " i.e.", " vs.", " viz.", " approx.", " dept.", " div.", - " est.", - ]; - - for (index, _) in line.match_indices(". ") { - let potential_end = index + 1; // Position after the period - let sentence = &line[start..potential_end]; // Extract sentence up to and including the period - - // Convert the end of the sentence to lowercase for case-insensitive comparison - let sentence_end_lc = sentence.to_lowercase(); - - // Check if the sentence ends with an exception and not actually the end of a sentence - if exceptions.iter().any(|&exc| sentence_end_lc.ends_with(exc)) { - continue; // Skip splitting here, it's an exception - } - - // If it's a valid end of a sentence, push it to the sentences vector - sentences.push(sentence.trim().to_string()); - start = potential_end + 1; // Move start to after the space following the period - } - - // Add any remaining part of the line as the last sentence - if start < line.len() { - sentences.push(line[start..].trim().to_string()); - } - - sentences - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parser/mod.rs b/shinkai-libs/shinkai-fs/src/file_parser/mod.rs deleted file mode 100644 index 5ef7f8ea2..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -pub mod file_parser; -pub mod file_parser_grouping; -pub mod file_parser_helper; -pub mod file_parser_types; -pub mod local_parsing; -pub mod utils; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/file_parser/utils.rs b/shinkai-libs/shinkai-fs/src/file_parser/utils.rs deleted file mode 100644 index 119a45e0d..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parser/utils.rs +++ /dev/null @@ -1,8 +0,0 @@ -use serde::{Deserialize, Serialize}; -use utoipa::ToSchema; - -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, ToSchema)] -pub enum TextChunkingStrategy { - /// The default text chunking strategy implemented in VR lib using local parsing. 
- V1, -} \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs b/shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs deleted file mode 100644 index 11828b295..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs +++ /dev/null @@ -1,277 +0,0 @@ -use shinkai_embedding::embedding_generator::EmbeddingGenerator; -use std::{future::Future, pin::Pin}; - -use crate::shinkai_fs_error::ShinkaiFsError; - -use super::{file_parser_types::TextGroup, utils::TextChunkingStrategy}; -use super::local_parsing::LocalFileParser; - -pub struct ShinkaiFileParser; - -impl ShinkaiFileParser { - /// Optionally, if you need some global initialization for OCR, etc. - pub async fn initialize_local_file_parser() -> Result<(), Box> { - use shinkai_ocr::image_parser::ImageParser; - ImageParser::check_and_download_dependencies().await - } - - /// Processes the input file into a BaseVectorResource, auto-detecting extension - /// and using local parsing. Then runs embedding logic. - pub async fn process_file_into_resource( - file_buffer: Vec, - generator: &dyn EmbeddingGenerator, - file_name: String, - desc: Option, - parsing_tags: &Vec, - max_node_text_size: u64, - ) -> Result { - let cleaned_name = ShinkaiFileParser::clean_name(&file_name); - - // 1) Parse into text groups - let text_groups = Self::process_file_into_text_groups(file_buffer, file_name, max_node_text_size).await?; - - // 2) Turn those text groups into a resource - Self::process_groups_into_resource( - text_groups, - generator, - cleaned_name, - desc, - parsing_tags, - max_node_text_size, - ) - .await - } - - /// Processes the input file into a list of `TextGroup` with no embedding generated yet, - /// auto-detecting the file type by extension. - pub async fn process_file_into_text_groups( - file_buffer: Vec, - file_name: String, - max_node_text_size: u64, - ) -> Result, ShinkaiFsError> { - // The new LocalFileParser method automatically detects extension from `file_name` - LocalFileParser::process_file_into_grouped_text(file_buffer, file_name, max_node_text_size) - } - - /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource - pub async fn process_groups_into_resource( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: String, - desc: Option, - parsing_tags: &Vec, - max_node_text_size: u64, - ) -> Result { - // We keep the same pattern as before but remove references to `source` - Self::process_groups_into_resource_with_custom_collection( - text_groups, - generator, - name, - desc, - parsing_tags, - max_node_text_size, - ShinkaiFileParser::collect_texts_and_indices, - ) - .await - } - - /// Same as above, but allows a custom function for collecting text/index pairs - pub async fn process_groups_into_resource_with_custom_collection( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: String, - desc: Option, - parsing_tags: &Vec, - max_node_text_size: u64, - collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), - ) -> Result { - // Generate embeddings for all text groups - let new_text_groups = ShinkaiFileParser::generate_text_group_embeddings( - text_groups, - generator.box_clone(), - 31, - max_node_text_size, - collect_texts_and_indices, - ) - .await?; - - // Build a resource from those text groups - let mut resource = ShinkaiFileParser::process_new_doc_resource_with_embeddings_already_generated( - new_text_groups, - &*generator, - &name, - desc, - parsing_tags, - None, - ) - .await?; - - // In your code, 
presumably you have something like `distribution_info` you want to set: - // resource.as_trait_object_mut().set_distribution_info(distribution_info); - - Ok(resource) - } - - /// Blocking version - pub fn process_groups_into_resource_blocking_with_custom_collection( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: String, - desc: Option, - parsing_tags: &Vec, - max_node_text_size: u64, - collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), - distribution_info: DistributionInfo, - ) -> Result { - let cloned_generator = generator.box_clone(); - - // Generate embeddings (blocking) - let new_text_groups = ShinkaiFileParser::generate_text_group_embeddings_blocking( - &text_groups, - cloned_generator, - 31, - max_node_text_size, - collect_texts_and_indices, - )?; - - // Build the resource - let mut resource = ShinkaiFileParser::process_new_doc_resource_blocking_with_embeddings_already_generated( - new_text_groups, - &*generator, - &name, - desc, - parsing_tags, - None, - )?; - - resource.as_trait_object_mut().set_distribution_info(distribution_info); - Ok(resource) - } - - /// Async: builds a DocumentVectorResource from text groups that already have embeddings - fn process_new_doc_resource_with_embeddings_already_generated<'a>( - text_groups: Vec, - generator: &'a dyn EmbeddingGenerator, - name: &'a str, - desc: Option, - parsing_tags: &'a Vec, - resource_embedding: Option, - ) -> Pin> + Send + 'a>> { - Box::pin(async move { - let name = ShinkaiFileParser::clean_name(name); - let max_embedding_token_count = generator.model_type().max_input_token_count(); - let resource_desc = Self::_setup_resource_description( - desc, - &text_groups, - max_embedding_token_count, - max_embedding_token_count.checked_div(2).unwrap_or(100), - ); - - let mut doc = DocumentVectorResource::new_empty(&name, resource_desc.as_deref(), true); - doc.set_embedding_model_used(generator.model_type()); - - // Set keywords - let keywords = Self::extract_keywords(&text_groups, 25); - doc.keywords_mut().set_keywords(keywords.clone()); - doc.keywords_mut().update_keywords_embedding(generator).await?; - - // Possibly set the root resource embedding - match resource_embedding { - Some(embedding) => doc.set_resource_embedding(embedding), - None => { - doc.update_resource_embedding(generator, None).await?; - } - } - - // Recursively add each text group - for grouped_text in &text_groups { - let (_, metadata, has_sub_groups, new_name) = Self::process_grouped_text(grouped_text); - if has_sub_groups { - let new_doc = Self::process_new_doc_resource_with_embeddings_already_generated( - grouped_text.sub_groups.clone(), - generator, - &new_name, - None, - parsing_tags, - grouped_text.embedding.clone(), - ) - .await?; - doc.append_vector_resource_node_auto(new_doc, metadata)?; - } else { - if grouped_text.text.len() <= 2 { - continue; - } - if let Some(embedding) = &grouped_text.embedding { - doc.append_text_node(&grouped_text.text, metadata, embedding.clone(), parsing_tags)?; - } else { - let embedding = generator.generate_embedding_default(&grouped_text.text).await?; - doc.append_text_node(&grouped_text.text, metadata, embedding, parsing_tags)?; - } - } - } - - Ok(BaseVectorResource::Document(doc)) - }) - } - - /// Blocking: builds a DocumentVectorResource from text groups that already have embeddings - fn process_new_doc_resource_blocking_with_embeddings_already_generated( - text_groups: Vec, - generator: &dyn EmbeddingGenerator, - name: &str, - desc: Option, - parsing_tags: &Vec, - 
resource_embedding: Option, - ) -> Result { - let name = ShinkaiFileParser::clean_name(name); - let max_embedding_token_count = generator.model_type().max_input_token_count(); - let resource_desc = Self::_setup_resource_description( - desc, - &text_groups, - max_embedding_token_count, - max_embedding_token_count / 2, - ); - let mut doc = DocumentVectorResource::new_empty(&name, resource_desc.as_deref(), true); - doc.set_embedding_model_used(generator.model_type()); - - // keywords - let keywords = Self::extract_keywords(&text_groups, 25); - doc.keywords_mut().set_keywords(keywords.clone()); - doc.keywords_mut().update_keywords_embedding_blocking(generator)?; - - // Possibly set the resource embedding - match resource_embedding { - Some(embedding) => doc.set_resource_embedding(embedding), - None => { - doc.update_resource_embedding_blocking(generator, None)?; - } - } - - for grouped_text in &text_groups { - let (_new_resource_id, metadata, has_sub_groups, new_name) = Self::process_grouped_text(grouped_text); - if has_sub_groups { - let new_doc = Self::process_new_doc_resource_blocking_with_embeddings_already_generated( - grouped_text.sub_groups.clone(), - generator, - &new_name, - None, - parsing_tags, - grouped_text.embedding.clone(), - )?; - doc.append_vector_resource_node_auto(new_doc, metadata)?; - } else { - if grouped_text.text.len() <= 2 { - continue; - } - if let Some(embedding) = &grouped_text.embedding { - doc.append_text_node(&grouped_text.text, metadata, embedding.clone(), parsing_tags)?; - } else { - let embedding = generator.generate_embedding_default_blocking(&grouped_text.text)?; - doc.append_text_node(&grouped_text.text, metadata, embedding, parsing_tags)?; - } - } - } - - Ok(BaseVectorResource::Document(doc)) - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs b/shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs deleted file mode 100644 index 451d51afc..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs +++ /dev/null @@ -1,116 +0,0 @@ -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; - -use super::file_parser::ShinkaiFileParser; - - -/// An intermediary type for processing content into Node's held in VectorResources -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] -pub struct TextGroup { - pub text: String, - pub metadata: HashMap, - pub sub_groups: Vec, - pub embedding: Option>, -} - -impl TextGroup { - pub fn new( - text: String, - metadata: HashMap, - sub_groups: Vec, - embedding: Option>, - ) -> Self { - TextGroup { - text, - metadata, - sub_groups, - embedding, - } - } - - pub fn new_empty() -> Self { - TextGroup { - text: String::new(), - metadata: HashMap::new(), - sub_groups: Vec::new(), - embedding: None, - } - } - - /// Prepares a string to be used to generate an Embedding for this TextGroup. - /// Extracts most prevalent keywords from all sub-groups and appends them to - /// the end of the group's actual text. 
- pub fn format_text_for_embedding(&self, max_node_text_size: u64) -> String { - let mut keyword_string = String::new(); - let base_string = &self.text; - let pre_keyword_length = base_string.len(); - - // Extract keywords from the TextGroup and its sub-groups - let keywords: Vec = ShinkaiFileParser::extract_keywords(&vec![self.clone()], 1); - - for keyword in keywords { - if pre_keyword_length + keyword_string.len() + keyword.len() <= max_node_text_size as usize { - keyword_string = format!("{}, {}", keyword_string, keyword); - } else { - break; - } - } - - format!("{} Keywords: {}", base_string, keyword_string.trim_start_matches(", ")) - } - - /// Pushes data into this TextGroup and extracts metadata - pub fn push_data(&mut self, text: &str, page_number: Option) { - if !self.text.is_empty() { - self.text.push(' '); - } - - let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(text); - if parsed_any_metadata { - self.text.push_str(&parsed_text); - self.metadata.extend(metadata); - } else { - self.text.push_str(text); - } - - if let Some(page_number) = page_number { - self.push_page_number(page_number); - } - } - - pub fn push_page_number(&mut self, page_number: u32) { - let mut unique_page_numbers: HashSet = HashSet::new(); - - if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) { - let page_numbers_metadata: Result, _> = page_numbers_metadata - .trim_matches(|c| c == '[' || c == ']') - .split(',') - .map(|n| n.trim().parse::()) - .collect(); - - if let Ok(page_numbers) = page_numbers_metadata { - for pg in page_numbers { - unique_page_numbers.insert(pg); - } - } - } - - unique_page_numbers.insert(page_number); - - self.metadata.insert( - ShinkaiFileParser::page_numbers_metadata_key(), - format!( - "[{}]", - unique_page_numbers - .iter() - .map(|n| n.to_string()) - .collect::>() - .join(", ") - ), - ); - } - - pub fn push_sub_group(&mut self, sub_group: TextGroup) { - self.sub_groups.push(sub_group); - } -} diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs b/shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs deleted file mode 100644 index b4484a594..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs +++ /dev/null @@ -1,59 +0,0 @@ -// mod local_file_parser { -// use super::*; -// use crate::file_parser::file_parser_types::TextGroup; - -// pub struct LocalFileParser; - -// impl LocalFileParser { -// /// Top-level auto-detect parser: -// pub fn parse_file_auto( -// file_buffer: Vec, -// file_name: &str, -// max_node_text_size: u64, -// ) -> Result, ShinkaiFsError> { -// // Figure out extension (lowercased), then route to a specific parser -// let ext = Path::new(file_name) -// .extension() -// .and_then(|s| s.to_str()) -// .map(|s| s.to_lowercase()) -// .unwrap_or_default(); - -// match ext.as_str() { -// "txt" => Self::process_txt_file(file_buffer, max_node_text_size), -// "md" => Self::process_md_file(file_buffer, max_node_text_size), -// "csv" => Self::process_csv_file(file_buffer, max_node_text_size), -// "json"=> Self::process_json_file(file_buffer, max_node_text_size), -// "pdf" => Self::process_pdf_file(file_buffer, max_node_text_size), -// "htm" | "html" => Self::process_html_file(file_buffer, file_name, max_node_text_size), -// "xlsx" | "xls" => Self::process_xlsx_file(file_buffer, max_node_text_size), -// // fall back to txt-like processing, or return an error: -// _ => Self::process_txt_file(file_buffer, max_node_text_size), 
-// } -// } - -// // Below are minimal stubs; in your code, call into your existing specialized methods -// pub fn process_txt_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { -// // e.g. call your real .txt parser -// Ok(vec![]) -// } -// pub fn process_md_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { -// Ok(vec![]) -// } -// pub fn process_csv_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { -// Ok(vec![]) -// } -// pub fn process_json_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { -// Ok(vec![]) -// } -// pub fn process_pdf_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { -// Ok(vec![]) -// } -// pub fn process_html_file(_file_buffer: Vec, _file_name: &str, _max_node_text_size: u64) -// -> Result, ShinkaiFsError> { -// Ok(vec![]) -// } -// pub fn process_xlsx_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { -// Ok(vec![]) -// } -// } -// } \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/mod.rs b/shinkai-libs/shinkai-fs/src/file_parsing/mod.rs deleted file mode 100644 index e6ea1fc9c..000000000 --- a/shinkai-libs/shinkai-fs/src/file_parsing/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -// pub mod local_file_parser; -pub mod file_parser_types; -pub mod file_parser; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/lib.rs b/shinkai-libs/shinkai-fs/src/lib.rs index f1d88addf..b4d518692 100644 --- a/shinkai-libs/shinkai-fs/src/lib.rs +++ b/shinkai-libs/shinkai-fs/src/lib.rs @@ -1,7 +1,4 @@ -// pub mod file_parser; pub mod shinkai_fs_error; pub mod shinkai_file_manager; pub mod shinkai_file_manager_ops; -// pub mod file_parser; -// pub mod file_parsing; pub mod simple_parser; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs b/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs index 561cd1c56..2eab05d24 100644 --- a/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs +++ b/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs @@ -4,13 +4,12 @@ use std::path::Path; use std::time::SystemTime; use shinkai_embedding::embedding_generator::EmbeddingGenerator; -use shinkai_message_primitives::schemas::shinkai_fs::ParsedFile; +use shinkai_message_primitives::schemas::shinkai_fs::{ParsedFile, ShinkaiFileChunk}; use shinkai_message_primitives::shinkai_utils::shinkai_path::ShinkaiPath; use shinkai_sqlite::SqliteManager; use crate::shinkai_fs_error::ShinkaiFsError; use crate::simple_parser::simple_parser::SimpleParser; -use crate::simple_parser::text_group::TextGroup; pub struct ShinkaiFileManager; @@ -33,59 +32,75 @@ pub enum FileProcessingMode { impl ShinkaiFileManager { /// Process file: If not in DB, add it. If supported, generate chunks. /// If already processed, consider checking if file changed (not implemented here). - pub async fn process_file( + pub async fn process_embeddings_for_file( path: ShinkaiPath, base_dir: &Path, sqlite_manager: &SqliteManager, - mode: FileProcessingMode, + mode: FileProcessingMode, // TODO: maybe we dont need this? generator: &dyn EmbeddingGenerator, ) -> Result<(), ShinkaiFsError> { - // let rel_path = Self::compute_relative_path(&path, base_dir)?; - // let parsed_file = if let Some(pf) = sqlite_manager.get_parsed_file_by_rel_path(&rel_path)? 
{ - // pf - // } else { - // let original_extension = path - // .as_path() - // .extension() - // .and_then(|ext| ext.to_str()) - // .map(|s| s.to_string()); - - // let pf = ParsedFile { - // id: 0, - // relative_path: rel_path.clone(), - // original_extension, - // description: None, - // source: None, - // embedding_model_used: None, - // keywords: None, - // distribution_info: None, - // created_time: Some(Self::current_timestamp()), - // tags: None, - // total_tokens: None, - // total_characters: None, - // }; - // sqlite_manager.add_parsed_file(&pf)?; - // sqlite_manager.get_parsed_file_by_rel_path(&rel_path)?.unwrap() - // }; - - /* - File Processing: - - - we need to be able to read a file - - create the chunks - - create the embedding - - create the vector resource - - add the vector resource to the db - - add the parsed file to the db - - */ - if mode == FileProcessingMode::NoParsing { return Ok(()); } + // Compute the relative path + let rel_path = Self::compute_relative_path(&path, base_dir)?; + + // Check if the file is already processed + if let Some(_parsed_file) = sqlite_manager.get_parsed_file_by_rel_path(&rel_path)? { + // TODO: check if the file has changed since last processing + return Ok(()); + } + + // Steps to process a file: + // 1. Read the file content to ensure accessibility. + // 2. Divide the file content into manageable chunks. + // 3. Generate embeddings for each chunk using the specified model. + // 4. Construct a ParsedFile object and associate it with its chunks. + // 5. Persist the ParsedFile and its chunks into the database. + + // 1- Parse the file let max_node_text_size = generator.model_type().max_input_token_count(); - let text_groups = SimpleParser::parse_file(path, max_node_text_size.try_into().unwrap())?; + let mut text_groups = SimpleParser::parse_file(path.clone(), max_node_text_size.try_into().unwrap())?; + + // Generate embeddings for each text group and assign them directly + for text_group in &mut text_groups { + let embedding = generator.generate_embedding_default(&text_group.text).await?; + text_group.embedding = Some(embedding); + } + + // Add the parsed file to the database + let parsed_file = ParsedFile { + id: None, // Expected. The DB will auto-generate the id. + relative_path: rel_path.clone(), + original_extension: path.extension().map(|s| s.to_string()), + description: None, // TODO: connect this + source: None, // TODO: connect this + embedding_model_used: Some(generator.model_type().to_string()), + keywords: None, // TODO: connect this + distribution_info: None, // TODO: connect this + created_time: Some(Self::current_timestamp()), + tags: None, // TODO: connect this + total_tokens: None, // TODO: connect this + total_characters: None, // TODO: connect this + }; + sqlite_manager.add_parsed_file(&parsed_file)?; + + // Retrieve the parsed file ID + let parsed_file_id = sqlite_manager.get_parsed_file_by_rel_path(&rel_path)? + .ok_or(ShinkaiFsError::FailedToRetrieveParsedFileID)? + .id.unwrap(); + + // Create and add chunks to the database + for (position, text_group) in text_groups.iter().enumerate() { + let chunk = ShinkaiFileChunk { + chunk_id: None, + parsed_file_id, + position: position as i64, + content: text_group.text.clone(), + }; + sqlite_manager.add_chunk(&chunk)?; + } Ok(()) } @@ -134,11 +149,32 @@ impl ShinkaiFileManager { Ok(contents) } + + /// Save a file to disk and process it for embeddings based on the mode. 
+ pub async fn save_and_process_file( + dest_path: ShinkaiPath, + data: Vec, + base_dir: &Path, + sqlite_manager: &SqliteManager, + mode: FileProcessingMode, + generator: &dyn EmbeddingGenerator, + ) -> Result<(), ShinkaiFsError> { + // Save the file to disk + Self::add_file(dest_path.clone(), data)?; + + // Process the file for embeddings if the mode is not NoParsing + if mode != FileProcessingMode::NoParsing { + Self::process_embeddings_for_file(dest_path, base_dir, sqlite_manager, mode, generator).await?; + } + + Ok(()) + } } #[cfg(test)] mod tests { use super::*; + use shinkai_embedding::mock_generator::MockGenerator; use shinkai_embedding::model_type::{EmbeddingModelType, OllamaTextEmbeddingsInference}; use shinkai_message_primitives::schemas::shinkai_fs::ParsedFile; use std::fs::{self, File}; @@ -158,7 +194,7 @@ mod tests { fn create_test_parsed_file(id: i64, relative_path: &str) -> ParsedFile { ParsedFile { - id, + id: Some(id), relative_path: relative_path.to_string(), original_extension: None, description: None, @@ -173,6 +209,40 @@ mod tests { } } + // Helper function to set up a test environment + fn setup_test_environment() -> (SqliteManager, tempfile::TempDir, ShinkaiPath, MockGenerator) { + let db = setup_test_db(); + + // Initialize the database tables + let conn = db.get_connection().unwrap(); + SqliteManager::initialize_filesystem_tables(&conn).unwrap(); + + // Create a temporary directory and file path + let dir = tempdir().unwrap(); + let file_path = dir.path().join("test_file.txt"); + + // Create a mock embedding generator + let model_type = EmbeddingModelType::OllamaTextEmbeddingsInference(OllamaTextEmbeddingsInference::SnowflakeArcticEmbed_M); + let generator = MockGenerator::new(model_type, 128); // 128 is the number of floats in the mock embedding + + (db, dir, ShinkaiPath::from_string(file_path.to_str().unwrap().to_string()), generator) + } + + // Helper function to write large content to a file + fn write_large_content(file: &mut File) { + let large_content = [ + "This is the first part of the test file. It contains some initial text to start the file processing. ", + "Here is the second part of the test file. It adds more content to ensure the file is large enough. ", + "Finally, this is the third part of the test file. It completes the content needed for multiple chunks. ", + "Additional content to ensure the file is sufficiently large for testing purposes. This should help in generating multiple chunks. ", + "More content to further increase the size of the file. This should definitely ensure multiple chunks are created. ", + "Even more content to make sure we exceed the threshold for chunking. This is important for testing the chunking logic. ", + "Continuing to add content to ensure the file is large enough. This should be more than sufficient for the test. ", + "Final addition of content to make sure we have enough text. This should cover all bases for the chunking test." 
+ ].join(""); + writeln!(file, "{}", large_content).unwrap(); + } + #[test] fn test_list_directory_contents() { let db = setup_test_db(); @@ -277,4 +347,74 @@ mod tests { assert!(found_march, "File 'march.txt' should be found."); assert!(found_subdir, "Directory 'subdir' should be found."); } + + #[tokio::test] + async fn test_process_file() { + let (db, dir, shinkai_path, generator) = setup_test_environment(); + + // Create and write to the file + let mut file = File::create(shinkai_path.as_path()).unwrap(); + write_large_content(&mut file); + + // Call the process_embeddings_for_file function + let result = ShinkaiFileManager::process_embeddings_for_file( + shinkai_path.clone(), + dir.path(), + &db, + FileProcessingMode::Auto, + &generator, + ).await; + + // Assert the result is Ok + assert!(result.is_ok()); + + // Verify the file is added to the database + let parsed_file = db.get_parsed_file_by_rel_path("test_file.txt").unwrap(); + assert!(parsed_file.is_some()); + + // Verify the chunks are added to the database + let parsed_file_id = parsed_file.unwrap().id.unwrap(); + let chunks = db.get_chunks_for_parsed_file(parsed_file_id).unwrap(); + println!("chunks: {:?}", chunks); // Debugging output + assert!(chunks.len() >= 2, "Expected at least 2 chunks, found {}", chunks.len()); + + // Clean up + dir.close().unwrap(); + } + + #[tokio::test] + async fn test_save_and_process_file() { + let (db, dir, shinkai_path, generator) = setup_test_environment(); + + // Prepare the data to be written + let mut file = File::create(shinkai_path.as_path()).unwrap(); + write_large_content(&mut file); + let data = std::fs::read(shinkai_path.as_path()).unwrap(); + + // Call the save_and_process_file function + let result = ShinkaiFileManager::save_and_process_file( + shinkai_path.clone(), + data, + dir.path(), + &db, + FileProcessingMode::Auto, + &generator, + ).await; + + // Assert the result is Ok + assert!(result.is_ok()); + + // Verify the file is added to the database + let parsed_file = db.get_parsed_file_by_rel_path("test_file.txt").unwrap(); + assert!(parsed_file.is_some()); + + // Verify the chunks are added to the database + let parsed_file_id = parsed_file.unwrap().id.unwrap(); + let chunks = db.get_chunks_for_parsed_file(parsed_file_id).unwrap(); + println!("chunks: {:?}", chunks); // Debugging output + assert!(chunks.len() >= 2, "Expected at least 2 chunks, found {}", chunks.len()); + + // Clean up + dir.close().unwrap(); + } } diff --git a/shinkai-libs/shinkai-fs/src/shinkai_file_manager_ops.rs b/shinkai-libs/shinkai-fs/src/shinkai_file_manager_ops.rs index 5d9b037eb..ad31083ca 100644 --- a/shinkai-libs/shinkai-fs/src/shinkai_file_manager_ops.rs +++ b/shinkai-libs/shinkai-fs/src/shinkai_file_manager_ops.rs @@ -39,7 +39,11 @@ impl ShinkaiFileManager { // Update DB let rel_path = Self::compute_relative_path(&path, base_dir)?; if let Some(parsed_file) = sqlite_manager.get_parsed_file_by_rel_path(&rel_path)? 
{ - sqlite_manager.remove_parsed_file(parsed_file.id)?; + if let Some(parsed_file_id) = parsed_file.id { + sqlite_manager.remove_parsed_file(parsed_file_id)?; + } else { + return Err(ShinkaiFsError::FailedToRetrieveParsedFileID); + } } else { return Err(ShinkaiFsError::FileNotFoundInDatabase); } diff --git a/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs b/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs index 0c5be860e..eb8fc8101 100644 --- a/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs +++ b/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs @@ -103,6 +103,12 @@ pub enum ShinkaiFsError { VRPackEmbeddingModelError(String), #[error("Unsupported file type: {0}")] UnsupportedFileType(String), + #[error("Failed to retrieve parsed file ID")] + FailedToRetrieveParsedFileID, + #[error("Failed to add parsed file to database")] + FailedToAddParsedFileToDatabase, + #[error("Failed to add chunks to database")] + FailedToAddChunksToDatabase, } impl From for ShinkaiFsError { diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs b/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs index a5c92bac1..863a7c8e3 100644 --- a/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs +++ b/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs @@ -363,69 +363,4 @@ impl ShinkaiFileParser { text_groups.extend(created_text_groups); } } - - /// Split a string at the nearest whitespace boundary, producing chunks. - pub fn split_into_chunks(text: &str, chunk_size: usize) -> Vec { - let mut chunks = Vec::new(); - let mut start = 0; - while start < text.len() { - let end = { - let mut candidate_end = start + chunk_size; - if candidate_end >= text.len() { - text.len() - } else { - // Walk backward until whitespace - while candidate_end > start && !text.as_bytes()[candidate_end].is_ascii_whitespace() { - candidate_end -= 1; - } - if candidate_end == start { - // No whitespace found - start + chunk_size.min(text.len() - start) - } else { - candidate_end - } - } - }; - let chunk = &text[start..end]; - chunks.push(chunk.to_string()); - start = end; - } - chunks - } - - /// Same as `split_into_chunks`, but also avoids splitting in the middle of metadata. 
- pub fn split_into_chunks_with_metadata(text: &str, chunk_size: usize) -> Vec { - let re = Regex::new(Self::METADATA_REGEX).unwrap(); - let matched_positions: Vec<(usize, usize)> = re.find_iter(text).map(|m| (m.start(), m.end())).collect(); - - let mut chunks = Vec::new(); - let mut start = 0; - while start < text.len() { - let end = { - let mut candidate_end = start + chunk_size; - if candidate_end >= text.len() { - text.len() - } else { - // Walk backward until whitespace or we exit a metadata block - while candidate_end > start && - ( - !text.as_bytes()[candidate_end].is_ascii_whitespace() - || matched_positions.iter().any(|&(s,e)| candidate_end >= s && candidate_end < e) - ) - { - candidate_end -= 1; - } - if candidate_end == start { - start + chunk_size.min(text.len() - start) - } else { - candidate_end - } - } - }; - let chunk = &text[start..end]; - chunks.push(chunk.to_string()); - start = end; - } - chunks - } } diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs index ca81ca3d0..fb7e8b52c 100644 --- a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs @@ -149,3 +149,28 @@ impl LocalFileParser { sentences } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_process_txt_file_multiple_text_groups() { + let input_text = "This is a test sentence. This is another test sentence."; + let file_buffer = input_text.as_bytes().to_vec(); + let max_node_text_size = 10; // Low max node text size to force multiple text groups + + let result = LocalFileParser::process_txt_file(file_buffer, max_node_text_size); + + assert!(result.is_ok()); + let text_groups = result.unwrap(); + + // We expect more than one text group due to the low max_node_text_size + assert!(text_groups.len() > 1); + + // Optionally, check the content of the text groups + for text_group in text_groups { + println!("TextGroup: {}", text_group.text); + } + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs b/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs index fd90ed4c1..76ba37ed8 100644 --- a/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs +++ b/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs @@ -86,7 +86,9 @@ impl SimpleParser { let file_buffer = fs::read(&filepath.as_path()).map_err(|e| ShinkaiFsError::FailedIO(e.to_string()))?; // call the new function based on the file extension - SimpleParser::process_file_by_extension(file_buffer, file_type, max_node_text_size) + let text_groups = SimpleParser::process_file_by_extension(file_buffer, file_type, max_node_text_size)?; + + Ok(text_groups) } fn process_file_by_extension(file_buffer: Vec, file_type: SupportedFileType, max_node_text_size: u64) -> Result, ShinkaiFsError> { @@ -102,3 +104,71 @@ impl SimpleParser { } } } + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::io::Write; + use tempfile::tempdir; + + #[test] + fn test_parse_csv_file() { + // Create a temporary directory + let dir = tempdir().unwrap(); + let file_path = dir.path().join("test.csv"); + + // Write a simple CSV content to the file + let mut file = fs::File::create(&file_path).unwrap(); + writeln!(file, "header1,header2").unwrap(); + writeln!(file, "value1,value2").unwrap(); + + // Convert the path to ShinkaiPath + let shinkai_path = ShinkaiPath::from_string(file_path.to_str().unwrap().to_string()); + + // Call the 
parse_file function + let result = SimpleParser::parse_file(shinkai_path, 1024); + eprintln!("result: {:?}", result); + + // Assert the result is Ok and contains expected data + assert!(result.is_ok()); + let text_groups = result.unwrap(); + assert!(!text_groups.is_empty()); + + // Clean up + dir.close().unwrap(); + } + + #[test] + fn test_parse_large_csv_file() { + // Create a temporary directory + let dir = tempdir().unwrap(); + let file_path = dir.path().join("large_test.csv"); + + // Write a larger CSV content to the file + let mut file = fs::File::create(&file_path).unwrap(); + writeln!(file, "header1,header2,header3").unwrap(); + for i in 0..100 { + writeln!(file, "value1_{},value2_{},value3_{}", i, i, i).unwrap(); + } + + // Convert the path to ShinkaiPath + let shinkai_path = ShinkaiPath::from_string(file_path.to_str().unwrap().to_string()); + + // Call the parse_file function with a smaller max_node_text_size + let result = SimpleParser::parse_file(shinkai_path, 20); + eprintln!("result: {:?}", result); + + // Assert the result is Ok and contains expected data + assert!(result.is_ok()); + let text_groups = result.unwrap(); + + eprintln!("length: {:?}", text_groups.len()); + + + assert!(!text_groups.is_empty()); + + // Clean up + dir.close().unwrap(); + } +} diff --git a/shinkai-libs/shinkai-message-primitives/src/schemas/shinkai_fs.rs b/shinkai-libs/shinkai-message-primitives/src/schemas/shinkai_fs.rs index 4d36259f3..5219bfddb 100644 --- a/shinkai-libs/shinkai-message-primitives/src/schemas/shinkai_fs.rs +++ b/shinkai-libs/shinkai-message-primitives/src/schemas/shinkai_fs.rs @@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ParsedFile { /// Unique identifier for the parsed file entry. - pub id: i64, + pub id: Option, /// The file's path relative to some base directory (e.g., "docs/manual.txt"). pub relative_path: String, /// The original file extension (e.g., "txt", "md", "pdf"). @@ -35,7 +35,7 @@ pub struct ParsedFile { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ShinkaiFileChunk { /// Unique identifier for the file chunk. - pub chunk_id: i64, + pub chunk_id: Option, /// Identifier of the parsed file this chunk is associated with. pub parsed_file_id: i64, /// Sequence number of the chunk, indicating its order within the file. 
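
Note on the schema change above: with ParsedFile::id and ShinkaiFileChunk::chunk_id now Option<i64>, rows are inserted with the id left as None and the key is read back after SQLite assigns it. The sketch below illustrates that insert-then-fetch pattern using only APIs that already appear in this diff (add_parsed_file, get_parsed_file_by_rel_path, FailedToRetrieveParsedFileID); the helper name insert_and_fetch_id and its placement inside the shinkai-fs crate are illustrative assumptions, not part of the change.

use shinkai_message_primitives::schemas::shinkai_fs::ParsedFile;
use shinkai_sqlite::SqliteManager;
use crate::shinkai_fs_error::ShinkaiFsError;

// Sketch only: insert a ParsedFile without an id and read the row back to
// obtain the primary key that SQLite auto-generated.
fn insert_and_fetch_id(db: &SqliteManager, rel_path: &str) -> Result<i64, ShinkaiFsError> {
    let pf = ParsedFile {
        id: None, // let the database assign the primary key
        relative_path: rel_path.to_string(),
        original_extension: None,
        description: None,
        source: None,
        embedding_model_used: None,
        keywords: None,
        distribution_info: None,
        created_time: None,
        tags: None,
        total_tokens: None,
        total_characters: None,
    };
    db.add_parsed_file(&pf)?;

    // Mirrors the lookup in process_embeddings_for_file: a missing row or a
    // missing id maps to FailedToRetrieveParsedFileID.
    db.get_parsed_file_by_rel_path(rel_path)?
        .and_then(|stored| stored.id)
        .ok_or(ShinkaiFsError::FailedToRetrieveParsedFileID)
}
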
diff --git a/shinkai-libs/shinkai-sqlite/src/file_system.rs b/shinkai-libs/shinkai-sqlite/src/file_system.rs index 1a14b9a86..195f439cc 100644 --- a/shinkai-libs/shinkai-sqlite/src/file_system.rs +++ b/shinkai-libs/shinkai-sqlite/src/file_system.rs @@ -7,7 +7,7 @@ impl SqliteManager { // parsed_files table conn.execute( "CREATE TABLE IF NOT EXISTS parsed_files ( - id INTEGER PRIMARY KEY, + id INTEGER PRIMARY KEY AUTOINCREMENT, relative_path TEXT NOT NULL UNIQUE, original_extension TEXT, description TEXT, @@ -31,7 +31,7 @@ impl SqliteManager { // chunks table conn.execute( "CREATE TABLE IF NOT EXISTS chunks ( - id INTEGER PRIMARY KEY, + id INTEGER PRIMARY KEY AUTOINCREMENT, parsed_file_id INTEGER NOT NULL REFERENCES parsed_files(id) ON DELETE CASCADE, position INTEGER NOT NULL, chunk TEXT NOT NULL, @@ -218,7 +218,7 @@ impl SqliteManager { )?; let rows = stmt.query_map([parsed_file_id], |row| { Ok(ShinkaiFileChunk { - chunk_id: row.get(0)?, + chunk_id: Some(row.get(0)?), parsed_file_id: row.get(1)?, position: row.get(2)?, content: row.get(3)?, @@ -362,7 +362,7 @@ mod tests { fn create_test_parsed_file(id: i64, relative_path: &str) -> ParsedFile { ParsedFile { - id, + id: Some(id), relative_path: relative_path.to_string(), original_extension: None, description: None, @@ -488,4 +488,31 @@ mod tests { assert!(!files_in_directory.iter().any(|pf| pf.relative_path == "docs/reports/2024/march/summary.txt")); assert!(!files_in_directory.iter().any(|pf| pf.relative_path == "docs/reports/old_stuff/misc.txt")); } + + #[test] + fn test_add_chunk_auto_id() { + let db = setup_test_db(); + + // Create and add a parsed file to associate with the chunk + let parsed_file = create_test_parsed_file(1, "file.txt"); + db.add_parsed_file(&parsed_file).unwrap(); + + // Create a chunk without specifying an id + let chunk = ShinkaiFileChunk { + chunk_id: None, // No id specified + parsed_file_id: parsed_file.id.unwrap(), + position: 1, + content: "This is a test chunk.".to_string(), + }; + + // Add the chunk to the database + let result = db.add_chunk(&chunk); + assert!(result.is_ok()); + + // Retrieve the chunk to verify it was added and has an auto-generated id + let chunks = db.get_chunks_for_parsed_file(parsed_file.id.unwrap()).unwrap(); + assert_eq!(chunks.len(), 1); + assert!(chunks[0].chunk_id.is_some()); // Check that the id is auto-generated + assert_eq!(chunks[0].content, "This is a test chunk."); + } }
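
Taken together, the pieces in this diff give a single entry point for indexing a file: save it, parse it into text groups, embed each group, and persist the ParsedFile with its chunks. Below is a minimal usage sketch under the same assumptions as the tests above (a SqliteManager with the filesystem tables already initialized, and MockGenerator standing in for a real embedding backend); the function name index_one_file and the file name notes.txt are illustrative only, not part of the change.

use std::path::Path;

use shinkai_embedding::mock_generator::MockGenerator;
use shinkai_embedding::model_type::{EmbeddingModelType, OllamaTextEmbeddingsInference};
use shinkai_fs::shinkai_file_manager::{FileProcessingMode, ShinkaiFileManager};
use shinkai_fs::shinkai_fs_error::ShinkaiFsError;
use shinkai_message_primitives::shinkai_utils::shinkai_path::ShinkaiPath;
use shinkai_sqlite::SqliteManager;

// Sketch only: index one file end to end with the new API surface.
async fn index_one_file(db: &SqliteManager, base_dir: &Path) -> Result<(), ShinkaiFsError> {
    // Mock generator that returns fixed-size zero vectors (128 floats, as in
    // the tests); swap in a real EmbeddingGenerator outside of tests.
    let model_type = EmbeddingModelType::OllamaTextEmbeddingsInference(
        OllamaTextEmbeddingsInference::SnowflakeArcticEmbed_M,
    );
    let generator = MockGenerator::new(model_type, 128);

    // Destination path and raw bytes for the (illustrative) file to index.
    let dest = ShinkaiPath::from_string(base_dir.join("notes.txt").to_string_lossy().to_string());
    let data = b"Some text that will be chunked and embedded.".to_vec();

    // Writes the file to disk, then (because the mode is not NoParsing) parses
    // it, generates one embedding per chunk, and stores the ParsedFile plus
    // its ShinkaiFileChunk rows in the database.
    ShinkaiFileManager::save_and_process_file(dest, data, base_dir, db, FileProcessingMode::Auto, &generator).await
}
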