From 3e61a6f7178f5c96770839c84ebdfe77c7d6979a Mon Sep 17 00:00:00 2001 From: Nico Arqueros Date: Sun, 22 Dec 2024 23:04:46 +0530 Subject: [PATCH] update --- .gitignore | 2 + Cargo.lock | 514 +++++++++++++++++- shinkai-libs/shinkai-fs/Cargo.toml | 3 + .../shinkai-fs/src/file_parser/file_parser.rs | 2 +- .../src/file_parser/file_parser_helper.rs | 1 - .../file_parser/local_parsing/csv_parsing.rs | 16 +- .../file_parser/local_parsing/html_parsing.rs | 7 +- .../file_parser/local_parsing/json_parsing.rs | 6 +- .../local_parsing/local_parsing.rs | 25 +- .../file_parser/local_parsing/md_parsing.rs | 11 +- .../file_parser/local_parsing/pdf_parsing.rs | 13 +- .../file_parser/local_parsing/txt_parsing.rs | 6 +- .../shinkai-fs/src/file_parser/mod.rs | 1 + .../shinkai-fs/src/file_parser/utils.rs | 8 + .../src/file_parsing/file_parser.rs | 277 ++++++++++ .../src/file_parsing/file_parser_types.rs | 116 ++++ .../src/file_parsing/local_file_parser.rs | 59 ++ .../shinkai-fs/src/file_parsing/mod.rs | 3 + shinkai-libs/shinkai-fs/src/lib.rs | 4 +- .../shinkai-fs/src/shinkai_file_manager.rs | 104 ++-- .../shinkai-fs/src/shinkai_fs_error.rs | 95 +++- .../src/simple_parser/file_parser_grouping.rs | 224 ++++++++ .../src/simple_parser/file_parser_helper.rs | 431 +++++++++++++++ .../local_parsing/csv_parsing.rs | 160 ++++++ .../local_parsing/html_parsing.rs | 347 ++++++++++++ .../local_parsing/json_parsing.rs | 87 +++ .../local_parsing/local_parsing.rs | 54 ++ .../simple_parser/local_parsing/md_parsing.rs | 187 +++++++ .../src/simple_parser/local_parsing/mod.rs | 9 + .../local_parsing/pdf_parsing.rs | 30 + .../local_parsing/txt_parsing.rs | 151 +++++ .../shinkai-fs/src/simple_parser/mod.rs | 5 + .../src/simple_parser/simple_parser.rs | 104 ++++ .../src/simple_parser/text_group.rs | 113 ++++ .../src/shinkai_utils/shinkai_path.rs | 15 + .../src/embedding_generator.rs | 30 +- .../src/file_parser/file_parser.rs | 26 +- .../src/file_parser/file_parser_grouping.rs | 6 +- .../local_parsing/local_parsing.rs | 6 +- .../file_parser/local_parsing/md_parsing.rs | 4 +- .../file_parser/local_parsing/pdf_parsing.rs | 4 +- .../file_parser/local_parsing/xlsx_parsing.rs | 2 +- .../src/vector_resource/vector_resource.rs | 10 +- .../vector_resource/vector_resource_search.rs | 10 +- .../vector_resource/vector_resource_types.rs | 4 +- .../src/vector_resource/vrpack.rs | 14 +- 46 files changed, 3127 insertions(+), 179 deletions(-) create mode 100644 shinkai-libs/shinkai-fs/src/file_parser/utils.rs create mode 100644 shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs create mode 100644 shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs create mode 100644 shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs create mode 100644 shinkai-libs/shinkai-fs/src/file_parsing/mod.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/file_parser_grouping.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/csv_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/html_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/json_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/local_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/md_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/mod.rs create mode 100644 
shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/pdf_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/mod.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs create mode 100644 shinkai-libs/shinkai-fs/src/simple_parser/text_group.rs diff --git a/.gitignore b/.gitignore index e3028814a..92894e76a 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,5 @@ shinkai-bin/shinkai-node/files/shinkai_intro.vrkai shinkai-bin/shinkai-node/files/shinkai_intro.pdf shinkai-bin/shinkai-node/files/hispania_jina_es.vrkai storage_testing_debug_my_local_ai/ +output.txt +file_aggregator.sh diff --git a/Cargo.lock b/Cargo.lock index 8957ccf5c..bbefdcf9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1088,15 +1088,38 @@ checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "atty", "bitflags 1.3.2", - "clap_derive", - "clap_lex", + "clap_derive 3.2.25", + "clap_lex 0.2.4", "indexmap 1.9.3", "once_cell", - "strsim", + "strsim 0.10.0", "termcolor", "textwrap", ] +[[package]] +name = "clap" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" +dependencies = [ + "clap_builder", + "clap_derive 4.5.18", +] + +[[package]] +name = "clap_builder" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" +dependencies = [ + "anstream", + "anstyle", + "clap_lex 0.7.4", + "strsim 0.11.1", + "terminal_size", +] + [[package]] name = "clap_derive" version = "3.2.25" @@ -1110,6 +1133,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "clap_derive" +version = "4.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "clap_lex" version = "0.2.4" @@ -1119,6 +1154,12 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + [[package]] name = "coins-bip32" version = "0.8.7" @@ -1193,6 +1234,26 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "comrak" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0436149c9f6a1935b13306206c739b1ba84fa81f551b5eb87fc2ca7a13700af" +dependencies = [ + "clap 4.5.23", + "derive_builder", + "entities", + "memchr", + "once_cell", + "regex", + "shell-words", + "slug", + "syntect", + "typed-arena", + "unicode_categories", + "xdg", +] + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -1443,6 +1504,29 @@ dependencies = [ "subtle", ] +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.2", + "smallvec 1.13.2", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + 
"quote", + "syn 2.0.87", +] + [[package]] name = "csv" version = "1.3.1" @@ -1500,6 +1584,41 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1576,6 +1695,37 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + [[package]] name = "derive_more" version = "0.99.18" @@ -1609,6 +1759,12 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "deunicode" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00" + [[package]] name = "digest" version = "0.9.0" @@ -1722,6 +1878,21 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75b325c5dbd37f80359721ad39aca5a29fb04c89279657cffdda8736d0c0b9d2" +[[package]] +name = "dtoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + [[package]] name = "dunce" version = "1.0.5" @@ -1767,6 +1938,12 @@ dependencies = [ "zeroize", ] +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + [[package]] name = "either" version = "1.13.0" @@ -1828,6 +2005,12 @@ dependencies = [ "zeroize", ] +[[package]] +name = "entities" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5320ae4c3782150d900b79807611a59a99fc9a1d61d686faafc24b93fc8d7ca" + [[package]] name = "enum-as-inner" version = "0.6.1" @@ -2261,6 +2444,16 @@ version = "0.1.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fancy-regex" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b95f7c0680e4142284cf8b22c14a476e87d61b004a3a0861872b32ef7ead40a2" +dependencies = [ + "bit-set", + "regex", +] + [[package]] name = "fancy-regex" version = "0.13.0" @@ -2411,6 +2604,16 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + [[package]] name = "futures" version = "0.3.31" @@ -2553,6 +2756,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.1.16" @@ -2892,6 +3104,20 @@ dependencies = [ "winapi", ] +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 2.0.87", +] + [[package]] name = "html_parser" version = "0.7.0" @@ -3243,6 +3469,12 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.4.0" @@ -3757,6 +3989,26 @@ dependencies = [ "crc", ] +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.2", + "phf_codegen 0.11.2", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "match_cfg" version = "0.1.0" @@ -4187,6 +4439,28 @@ version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "opaque-debug" version = "0.3.1" @@ -4579,6 +4853,15 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + [[package]] name = "phf" version = "0.11.2" @@ -4589,6 +4872,36 @@ dependencies = [ "phf_shared 0.11.2", ] +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator 0.11.2", + "phf_shared 0.11.2", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", +] + [[package]] name = "phf_generator" version = "0.11.2" @@ -4605,7 +4918,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" dependencies = [ - "phf_generator", + "phf_generator 0.11.2", "phf_shared 0.11.2", "proc-macro2", "quote", @@ -4695,6 +5008,19 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" +[[package]] +name = "plist" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42cf17e9a1800f5f396bc67d193dc9411b59012a5876445ef450d449881e1016" +dependencies = [ + "base64 0.22.1", + "indexmap 2.6.0", + "quick-xml 0.32.0", + "serde", + "time", +] + [[package]] name = "png" version = "0.16.8" @@ -4992,6 +5318,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quick-xml" +version = "0.32.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3a6e5838b60e0e8fa7a43f22ade549a37d61f8bdbe636d0d7816191de969c2" +dependencies = [ + "memchr", +] + [[package]] name = "quick-xml" version = "0.35.0" @@ -5945,6 +6280,22 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "761fb705fdf625482d2ed91d3f0559dcfeab2798fe2771c69560a774865d0802" +dependencies = [ + "ahash 0.8.11", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "tendril", +] + [[package]] name = "scrypt" version = "0.10.0" @@ -6010,6 +6361,25 @@ dependencies = [ "libc", ] +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags 2.6.0", + "cssparser", + "derive_more 0.99.18", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", + "precomputed-hash", + "servo_arc", + "smallvec 1.13.2", +] + [[package]] name = "semver" version = "1.0.23" @@ -6119,6 +6489,15 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "servo_arc" 
+version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha-1" version = "0.9.8" @@ -6186,6 +6565,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shell-words" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24188a676b6ae68c3b2cb3a01be17fbf7240ce009799bb56d5b1409051e78fde" + [[package]] name = "shinkai-spreadsheet-llm" version = "0.9.3" @@ -6246,6 +6631,7 @@ dependencies = [ "bincode", "blake3", "chrono", + "comrak", "csv", "futures", "keyphrases", @@ -6253,6 +6639,7 @@ dependencies = [ "rand 0.8.5", "regex", "reqwest 0.11.27", + "scraper", "serde", "serde_json", "shinkai_embedding", @@ -6261,6 +6648,7 @@ dependencies = [ "shinkai_sqlite", "tempfile", "thiserror 2.0.3", + "urlencoding", "utoipa", ] @@ -6353,7 +6741,7 @@ dependencies = [ "blake3", "chashmap", "chrono", - "clap", + "clap 3.2.25", "console-subscriber", "cron-parser", "csv", @@ -6472,7 +6860,7 @@ name = "shinkai_tcp_relayer" version = "0.9.3" dependencies = [ "chrono", - "clap", + "clap 3.2.25", "derivative", "dotenv", "ed25519-dalek", @@ -6523,7 +6911,7 @@ dependencies = [ "nanoid", "once_cell", "patch", - "phf", + "phf 0.11.2", "reqwest 0.11.27", "serde", "serde_json", @@ -6614,6 +7002,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "slug" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882a80f72ee45de3cc9a5afeb2da0331d58df69e4e7d8eeb5d3c7784ae67e724" +dependencies = [ + "deunicode", + "wasm-bindgen", +] + [[package]] name = "smallvec" version = "0.6.14" @@ -6648,7 +7046,7 @@ dependencies = [ "itertools 0.11.0", "lalrpop", "lalrpop-util", - "phf", + "phf 0.11.2", "thiserror 1.0.69", "unicode-xid", ] @@ -6716,6 +7114,19 @@ dependencies = [ "parking_lot 0.12.3", "phf_shared 0.10.0", "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro2", + "quote", ] [[package]] @@ -6724,6 +7135,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -6820,6 +7237,29 @@ dependencies = [ "syn 2.0.87", ] +[[package]] +name = "syntect" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874dcfa363995604333cf947ae9f751ca3af4522c60886774c4963943b4746b1" +dependencies = [ + "bincode", + "bitflags 1.3.2", + "fancy-regex 0.11.0", + "flate2", + "fnv", + "once_cell", + "onig", + "plist", + "regex-syntax 0.8.5", + "serde", + "serde_derive", + "serde_json", + "thiserror 1.0.69", + "walkdir", + "yaml-rust", +] + [[package]] name = "system-configuration" version = "0.5.1" @@ -6890,6 +7330,17 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "term" version = "0.7.0" @@ -6910,6 +7361,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "terminal_size" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9" +dependencies = [ + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "textwrap" version = "0.16.1" @@ -7511,6 +7972,12 @@ dependencies = [ "utf-8", ] +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + [[package]] name = "typenum" version = "1.17.0" @@ -7549,7 +8016,7 @@ dependencies = [ "cfb", "chrono", "encoding_rs", - "fancy-regex", + "fancy-regex 0.13.0", "getrandom 0.2.15", "hashbrown 0.14.5", "hmac 0.12.1", @@ -7557,7 +8024,7 @@ dependencies = [ "image 0.25.5", "lazy_static", "md-5", - "quick-xml", + "quick-xml 0.35.0", "regex", "sha2 0.10.8", "thousands", @@ -7597,12 +8064,24 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + [[package]] name = "unicode-xid" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "universal-hash" version = "0.4.0" @@ -8309,6 +8788,21 @@ dependencies = [ "rustix", ] +[[package]] +name = "xdg" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213b7324336b53d2414b2db8537e56544d981803139155afa84f76eeebb7a546" + +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] + [[package]] name = "yansi" version = "0.5.1" diff --git a/shinkai-libs/shinkai-fs/Cargo.toml b/shinkai-libs/shinkai-fs/Cargo.toml index 86c37350a..f57bb74a6 100644 --- a/shinkai-libs/shinkai-fs/Cargo.toml +++ b/shinkai-libs/shinkai-fs/Cargo.toml @@ -15,12 +15,15 @@ rand = { workspace = true } blake3 = { workspace = true } # tokio = { workspace = true, features = ["full"] } chrono = { workspace = true } +comrak = { version = "0.22.0", default-features = true } thiserror = "2.0.3" reqwest = { workspace = true } lazy_static = "1.5.0" async-trait = { workspace = true } keyphrases = { workspace = true } futures = { workspace = true } +scraper = "0.19.0" +urlencoding = "2.1.0" csv = "1.1.6" utoipa = "4.2.3" diff --git a/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs b/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs index 9e5e0a7ca..12d0be8ad 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/file_parser.rs @@ -1,6 +1,6 @@ use shinkai_embedding::embedding_generator::EmbeddingGenerator; -use super::file_parser_types::TextGroup; +use super::{file_parser_types::TextGroup, utils::TextChunkingStrategy}; use 
super::local_parsing::LocalFileParser; use crate::shinkai_fs_error::ShinkaiFsError; diff --git a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs b/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs index 5f4e2c6b8..da27df820 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/file_parser_helper.rs @@ -6,7 +6,6 @@ use std::collections::HashMap; use super::file_parser::ShinkaiFileParser; use super::file_parser_types::TextGroup; -use crate::vector_resource::SourceFileType; impl ShinkaiFileParser { pub const PURE_METADATA_REGEX: &'static str = r"!\{\{\{([^:}]+):((?:[^}]*\}{0,2}[^}]+))\}\}\}!"; diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs index c47829af5..dee50d3e5 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/csv_parsing.rs @@ -1,22 +1,22 @@ use super::LocalFileParser; -use crate::{file_parser::file_parser_types::TextGroup, resource_errors::VRError}; +use crate::{file_parser::file_parser_types::TextGroup, shinkai_fs_error::ShinkaiFsError}; use csv::ReaderBuilder; use std::io::Cursor; impl LocalFileParser { /// Attempts to process the provided csv file into a list of TextGroups. - pub fn process_csv_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { - let csv_lines = Self::parse_csv_auto(&file_buffer).map_err(|_| VRError::FailedCSVParsing)?; + pub fn process_csv_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let csv_lines = Self::parse_csv_auto(&file_buffer).map_err(|_| ShinkaiFsError::FailedCSVParsing)?; Self::process_table_rows(csv_lines, max_node_text_size) } // /// Parse CSV data from a buffer and attempt to automatically detect // /// headers. - pub fn parse_csv_auto(buffer: &[u8]) -> Result, VRError> { + pub fn parse_csv_auto(buffer: &[u8]) -> Result, ShinkaiFsError> { let mut reader = ReaderBuilder::new().flexible(true).from_reader(Cursor::new(buffer)); let headers = reader .headers() - .map_err(|_| VRError::FailedCSVParsing)? + .map_err(|_| ShinkaiFsError::FailedCSVParsing)? .iter() .map(String::from) .collect::>(); @@ -35,7 +35,7 @@ impl LocalFileParser { // /// Parse CSV data from a buffer. // /// * `header` - A boolean indicating whether to prepend column headers to // /// values. - pub fn parse_csv(buffer: &[u8], header: bool) -> Result, VRError> { + pub fn parse_csv(buffer: &[u8], header: bool) -> Result, ShinkaiFsError> { let mut reader = ReaderBuilder::new() .flexible(true) .has_headers(header) @@ -43,7 +43,7 @@ impl LocalFileParser { let headers = if header { reader .headers() - .map_err(|_| VRError::FailedCSVParsing)? + .map_err(|_| ShinkaiFsError::FailedCSVParsing)? 
.iter() .map(String::from) .collect::>() @@ -53,7 +53,7 @@ impl LocalFileParser { let mut result = Vec::new(); for record in reader.records() { - let record = record.map_err(|_| VRError::FailedCSVParsing)?; + let record = record.map_err(|_| ShinkaiFsError::FailedCSVParsing)?; let row: Vec = if header { record .iter() diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs index f4c5aadc1..a0c0dc1c2 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/html_parsing.rs @@ -1,10 +1,7 @@ use regex::Regex; use scraper::{ElementRef, Html, Selector}; -use crate::{ - file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, - resource_errors::VRError, -}; +use crate::{file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, shinkai_fs_error::ShinkaiFsError}; use super::LocalFileParser; @@ -71,7 +68,7 @@ impl LocalFileParser { file_buffer: Vec, file_name: &str, max_node_text_size: u64, - ) -> Result, VRError> { + ) -> Result, ShinkaiFsError> { let extracted_buffer = extract_core_content(file_buffer, file_name); let document = Html::parse_fragment(&String::from_utf8_lossy(&extracted_buffer)); diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs index 15ef2224e..ae80cf31a 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/json_parsing.rs @@ -3,13 +3,13 @@ use std::collections::HashMap; use super::LocalFileParser; use crate::file_parser::file_parser::ShinkaiFileParser; use crate::file_parser::file_parser_types::TextGroup; -use crate::resource_errors::VRError; +use crate::shinkai_fs_error::ShinkaiFsError; use serde_json::Value as JsonValue; impl LocalFileParser { /// Attempts to process the provided json file into a list of TextGroups. 
- pub fn process_json_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { - let json_string = String::from_utf8(file_buffer.clone()).map_err(|_| VRError::FailedJSONParsing)?; + pub fn process_json_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let json_string = String::from_utf8(file_buffer.clone()).map_err(|_| ShinkaiFsError::FailedJSONParsing)?; let json: JsonValue = serde_json::from_str(&json_string)?; let text_groups = Self::process_container_json_value(&json, max_node_text_size); diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs index d72fb4f92..67622d6e7 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/local_parsing.rs @@ -1,9 +1,5 @@ use crate::file_parser::file_parser_types::TextGroup; -use crate::resource_errors::VRError; -use crate::source::VRSourceReference; -use crate::vector_resource::DocumentFileType; -use crate::vector_resource::SourceFileType; -use crate::vector_resource::SourceReference; +use crate::shinkai_fs_error::ShinkaiFsError; pub struct LocalFileParser {} @@ -16,46 +12,43 @@ impl LocalFileParser { file_name: String, max_node_text_size: u64, source: VRSourceReference, - ) -> Result, VRError> { + ) -> Result, ShinkaiFsError> { let source_base = source; match &source_base { - VRSourceReference::None => Err(VRError::UnsupportedFileType(file_name.to_string())), + VRSourceReference::None => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), VRSourceReference::Standard(source) => match source { - SourceReference::Other(_) => Err(VRError::UnsupportedFileType(file_name.to_string())), + SourceReference::Other(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), SourceReference::FileRef(file_source) => match file_source.clone().file_type { SourceFileType::Image(_) | SourceFileType::Code(_) | SourceFileType::ConfigFileType(_) | SourceFileType::Video(_) | SourceFileType::Audio(_) - | SourceFileType::Shinkai(_) => Err(VRError::UnsupportedFileType(file_name.to_string())), + | SourceFileType::Shinkai(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), SourceFileType::Document(doc) => match doc { DocumentFileType::Txt => LocalFileParser::process_txt_file(file_buffer, max_node_text_size), DocumentFileType::Json => LocalFileParser::process_json_file(file_buffer, max_node_text_size), DocumentFileType::Csv => LocalFileParser::process_csv_file(file_buffer, max_node_text_size), - DocumentFileType::Docx => LocalFileParser::process_docx_file(file_buffer, max_node_text_size), + // DocumentFileType::Docx => LocalFileParser::process_docx_file(file_buffer, max_node_text_size), DocumentFileType::Html => { LocalFileParser::process_html_file(file_buffer, &file_name, max_node_text_size) } - #[cfg(feature = "desktop-only")] DocumentFileType::Md => LocalFileParser::process_md_file(file_buffer, max_node_text_size), - #[cfg(feature = "desktop-only")] DocumentFileType::Pdf => LocalFileParser::process_pdf_file(file_buffer, max_node_text_size), - #[cfg(feature = "desktop-only")] DocumentFileType::Xlsx | DocumentFileType::Xls => { LocalFileParser::process_xlsx_file(file_buffer, max_node_text_size) } - _ => Err(VRError::UnsupportedFileType(file_name.to_string())), + _ => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), }, }, - SourceReference::ExternalURI(_) => 
Err(VRError::UnsupportedFileType(file_name.to_string())), + SourceReference::ExternalURI(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), }, - VRSourceReference::Notarized(_) => Err(VRError::UnsupportedFileType(file_name.to_string())), + VRSourceReference::Notarized(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), } } } diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs index f5960eb88..12dd2c175 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/md_parsing.rs @@ -1,20 +1,15 @@ -#[cfg(feature = "desktop-only")] use comrak::{ nodes::{AstNode, ListDelimType, ListType, NodeValue}, parse_document, Arena, Options, }; -use crate::{ - file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, - resource_errors::VRError, -}; +use crate::{file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, shinkai_fs_error::ShinkaiFsError}; use super::LocalFileParser; impl LocalFileParser { - #[cfg(feature = "desktop-only")] - pub fn process_md_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { - let md_string = String::from_utf8(file_buffer).map_err(|_| VRError::FailedMDParsing)?; + pub fn process_md_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let md_string = String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedMDParsing)?; let arena = Arena::new(); let root = parse_document(&arena, &md_string, &Options::default()); diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs index da7070320..4abeb9426 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/pdf_parsing.rs @@ -1,20 +1,15 @@ -#[cfg(feature = "desktop-only")] -use crate::{ - file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, - resource_errors::VRError, -}; +use crate::{file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, shinkai_fs_error::ShinkaiFsError}; use super::LocalFileParser; impl LocalFileParser { - #[cfg(feature = "desktop-only")] - pub fn process_pdf_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { + pub fn process_pdf_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { use shinkai_ocr::pdf_parser::PDFParser; - let pdf_parser = PDFParser::new().map_err(|_| VRError::FailedPDFParsing)?; + let pdf_parser = PDFParser::new().map_err(|_| ShinkaiFsError::FailedPDFParsing)?; let parsed_pages = pdf_parser .process_pdf_file(file_buffer) - .map_err(|_| VRError::FailedPDFParsing)?; + .map_err(|_| ShinkaiFsError::FailedPDFParsing)?; let mut text_groups = Vec::new(); diff --git a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs index c48ef8dcb..a9842bad0 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/local_parsing/txt_parsing.rs @@ -5,12 +5,12 @@ use regex::Regex; use super::LocalFileParser; use crate::file_parser::file_parser::ShinkaiFileParser; use crate::file_parser::file_parser_types::TextGroup; -use crate::resource_errors::VRError; +use 
crate::shinkai_fs_error::ShinkaiFsError; impl LocalFileParser { /// Attempts to process the provided json file into a list of TextGroups. - pub fn process_txt_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { - let txt_string = String::from_utf8(file_buffer).map_err(|_| VRError::FailedTXTParsing)?; + pub fn process_txt_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let txt_string = String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedTXTParsing)?; let sentences = LocalFileParser::process_into_sentences(txt_string); let text_groups = LocalFileParser::process_into_text_groups(sentences, max_node_text_size); // for sentence in &sentences { diff --git a/shinkai-libs/shinkai-fs/src/file_parser/mod.rs b/shinkai-libs/shinkai-fs/src/file_parser/mod.rs index 2ef0305f8..5ef7f8ea2 100644 --- a/shinkai-libs/shinkai-fs/src/file_parser/mod.rs +++ b/shinkai-libs/shinkai-fs/src/file_parser/mod.rs @@ -3,3 +3,4 @@ pub mod file_parser_grouping; pub mod file_parser_helper; pub mod file_parser_types; pub mod local_parsing; +pub mod utils; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/file_parser/utils.rs b/shinkai-libs/shinkai-fs/src/file_parser/utils.rs new file mode 100644 index 000000000..119a45e0d --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/file_parser/utils.rs @@ -0,0 +1,8 @@ +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, ToSchema)] +pub enum TextChunkingStrategy { + /// The default text chunking strategy implemented in VR lib using local parsing. + V1, +} \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs b/shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs new file mode 100644 index 000000000..11828b295 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/file_parsing/file_parser.rs @@ -0,0 +1,277 @@ +use shinkai_embedding::embedding_generator::EmbeddingGenerator; +use std::{future::Future, pin::Pin}; + +use crate::shinkai_fs_error::ShinkaiFsError; + +use super::{file_parser_types::TextGroup, utils::TextChunkingStrategy}; +use super::local_parsing::LocalFileParser; + +pub struct ShinkaiFileParser; + +impl ShinkaiFileParser { + /// Optionally, if you need some global initialization for OCR, etc. + pub async fn initialize_local_file_parser() -> Result<(), Box> { + use shinkai_ocr::image_parser::ImageParser; + ImageParser::check_and_download_dependencies().await + } + + /// Processes the input file into a BaseVectorResource, auto-detecting extension + /// and using local parsing. Then runs embedding logic. + pub async fn process_file_into_resource( + file_buffer: Vec, + generator: &dyn EmbeddingGenerator, + file_name: String, + desc: Option, + parsing_tags: &Vec, + max_node_text_size: u64, + ) -> Result { + let cleaned_name = ShinkaiFileParser::clean_name(&file_name); + + // 1) Parse into text groups + let text_groups = Self::process_file_into_text_groups(file_buffer, file_name, max_node_text_size).await?; + + // 2) Turn those text groups into a resource + Self::process_groups_into_resource( + text_groups, + generator, + cleaned_name, + desc, + parsing_tags, + max_node_text_size, + ) + .await + } + + /// Processes the input file into a list of `TextGroup` with no embedding generated yet, + /// auto-detecting the file type by extension. 
+ pub async fn process_file_into_text_groups( + file_buffer: Vec, + file_name: String, + max_node_text_size: u64, + ) -> Result, ShinkaiFsError> { + // The new LocalFileParser method automatically detects extension from `file_name` + LocalFileParser::process_file_into_grouped_text(file_buffer, file_name, max_node_text_size) + } + + /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource + pub async fn process_groups_into_resource( + text_groups: Vec, + generator: &dyn EmbeddingGenerator, + name: String, + desc: Option, + parsing_tags: &Vec, + max_node_text_size: u64, + ) -> Result { + // We keep the same pattern as before but remove references to `source` + Self::process_groups_into_resource_with_custom_collection( + text_groups, + generator, + name, + desc, + parsing_tags, + max_node_text_size, + ShinkaiFileParser::collect_texts_and_indices, + ) + .await + } + + /// Same as above, but allows a custom function for collecting text/index pairs + pub async fn process_groups_into_resource_with_custom_collection( + text_groups: Vec, + generator: &dyn EmbeddingGenerator, + name: String, + desc: Option, + parsing_tags: &Vec, + max_node_text_size: u64, + collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), + ) -> Result { + // Generate embeddings for all text groups + let new_text_groups = ShinkaiFileParser::generate_text_group_embeddings( + text_groups, + generator.box_clone(), + 31, + max_node_text_size, + collect_texts_and_indices, + ) + .await?; + + // Build a resource from those text groups + let mut resource = ShinkaiFileParser::process_new_doc_resource_with_embeddings_already_generated( + new_text_groups, + &*generator, + &name, + desc, + parsing_tags, + None, + ) + .await?; + + // In your code, presumably you have something like `distribution_info` you want to set: + // resource.as_trait_object_mut().set_distribution_info(distribution_info); + + Ok(resource) + } + + /// Blocking version + pub fn process_groups_into_resource_blocking_with_custom_collection( + text_groups: Vec, + generator: &dyn EmbeddingGenerator, + name: String, + desc: Option, + parsing_tags: &Vec, + max_node_text_size: u64, + collect_texts_and_indices: fn(&[TextGroup], u64, Vec) -> (Vec, Vec<(Vec, usize)>), + distribution_info: DistributionInfo, + ) -> Result { + let cloned_generator = generator.box_clone(); + + // Generate embeddings (blocking) + let new_text_groups = ShinkaiFileParser::generate_text_group_embeddings_blocking( + &text_groups, + cloned_generator, + 31, + max_node_text_size, + collect_texts_and_indices, + )?; + + // Build the resource + let mut resource = ShinkaiFileParser::process_new_doc_resource_blocking_with_embeddings_already_generated( + new_text_groups, + &*generator, + &name, + desc, + parsing_tags, + None, + )?; + + resource.as_trait_object_mut().set_distribution_info(distribution_info); + Ok(resource) + } + + /// Async: builds a DocumentVectorResource from text groups that already have embeddings + fn process_new_doc_resource_with_embeddings_already_generated<'a>( + text_groups: Vec, + generator: &'a dyn EmbeddingGenerator, + name: &'a str, + desc: Option, + parsing_tags: &'a Vec, + resource_embedding: Option, + ) -> Pin> + Send + 'a>> { + Box::pin(async move { + let name = ShinkaiFileParser::clean_name(name); + let max_embedding_token_count = generator.model_type().max_input_token_count(); + let resource_desc = Self::_setup_resource_description( + desc, + &text_groups, + max_embedding_token_count, + 
max_embedding_token_count.checked_div(2).unwrap_or(100), + ); + + let mut doc = DocumentVectorResource::new_empty(&name, resource_desc.as_deref(), true); + doc.set_embedding_model_used(generator.model_type()); + + // Set keywords + let keywords = Self::extract_keywords(&text_groups, 25); + doc.keywords_mut().set_keywords(keywords.clone()); + doc.keywords_mut().update_keywords_embedding(generator).await?; + + // Possibly set the root resource embedding + match resource_embedding { + Some(embedding) => doc.set_resource_embedding(embedding), + None => { + doc.update_resource_embedding(generator, None).await?; + } + } + + // Recursively add each text group + for grouped_text in &text_groups { + let (_, metadata, has_sub_groups, new_name) = Self::process_grouped_text(grouped_text); + if has_sub_groups { + let new_doc = Self::process_new_doc_resource_with_embeddings_already_generated( + grouped_text.sub_groups.clone(), + generator, + &new_name, + None, + parsing_tags, + grouped_text.embedding.clone(), + ) + .await?; + doc.append_vector_resource_node_auto(new_doc, metadata)?; + } else { + if grouped_text.text.len() <= 2 { + continue; + } + if let Some(embedding) = &grouped_text.embedding { + doc.append_text_node(&grouped_text.text, metadata, embedding.clone(), parsing_tags)?; + } else { + let embedding = generator.generate_embedding_default(&grouped_text.text).await?; + doc.append_text_node(&grouped_text.text, metadata, embedding, parsing_tags)?; + } + } + } + + Ok(BaseVectorResource::Document(doc)) + }) + } + + /// Blocking: builds a DocumentVectorResource from text groups that already have embeddings + fn process_new_doc_resource_blocking_with_embeddings_already_generated( + text_groups: Vec, + generator: &dyn EmbeddingGenerator, + name: &str, + desc: Option, + parsing_tags: &Vec, + resource_embedding: Option, + ) -> Result { + let name = ShinkaiFileParser::clean_name(name); + let max_embedding_token_count = generator.model_type().max_input_token_count(); + let resource_desc = Self::_setup_resource_description( + desc, + &text_groups, + max_embedding_token_count, + max_embedding_token_count / 2, + ); + let mut doc = DocumentVectorResource::new_empty(&name, resource_desc.as_deref(), true); + doc.set_embedding_model_used(generator.model_type()); + + // keywords + let keywords = Self::extract_keywords(&text_groups, 25); + doc.keywords_mut().set_keywords(keywords.clone()); + doc.keywords_mut().update_keywords_embedding_blocking(generator)?; + + // Possibly set the resource embedding + match resource_embedding { + Some(embedding) => doc.set_resource_embedding(embedding), + None => { + doc.update_resource_embedding_blocking(generator, None)?; + } + } + + for grouped_text in &text_groups { + let (_new_resource_id, metadata, has_sub_groups, new_name) = Self::process_grouped_text(grouped_text); + if has_sub_groups { + let new_doc = Self::process_new_doc_resource_blocking_with_embeddings_already_generated( + grouped_text.sub_groups.clone(), + generator, + &new_name, + None, + parsing_tags, + grouped_text.embedding.clone(), + )?; + doc.append_vector_resource_node_auto(new_doc, metadata)?; + } else { + if grouped_text.text.len() <= 2 { + continue; + } + if let Some(embedding) = &grouped_text.embedding { + doc.append_text_node(&grouped_text.text, metadata, embedding.clone(), parsing_tags)?; + } else { + let embedding = generator.generate_embedding_default_blocking(&grouped_text.text)?; + doc.append_text_node(&grouped_text.text, metadata, embedding, parsing_tags)?; + } + } + } + + 
Ok(BaseVectorResource::Document(doc)) + } +} diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs b/shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs new file mode 100644 index 000000000..451d51afc --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/file_parsing/file_parser_types.rs @@ -0,0 +1,116 @@ +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; + +use super::file_parser::ShinkaiFileParser; + + +/// An intermediary type for processing content into Node's held in VectorResources +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct TextGroup { + pub text: String, + pub metadata: HashMap, + pub sub_groups: Vec, + pub embedding: Option>, +} + +impl TextGroup { + pub fn new( + text: String, + metadata: HashMap, + sub_groups: Vec, + embedding: Option>, + ) -> Self { + TextGroup { + text, + metadata, + sub_groups, + embedding, + } + } + + pub fn new_empty() -> Self { + TextGroup { + text: String::new(), + metadata: HashMap::new(), + sub_groups: Vec::new(), + embedding: None, + } + } + + /// Prepares a string to be used to generate an Embedding for this TextGroup. + /// Extracts most prevalent keywords from all sub-groups and appends them to + /// the end of the group's actual text. + pub fn format_text_for_embedding(&self, max_node_text_size: u64) -> String { + let mut keyword_string = String::new(); + let base_string = &self.text; + let pre_keyword_length = base_string.len(); + + // Extract keywords from the TextGroup and its sub-groups + let keywords: Vec = ShinkaiFileParser::extract_keywords(&vec![self.clone()], 1); + + for keyword in keywords { + if pre_keyword_length + keyword_string.len() + keyword.len() <= max_node_text_size as usize { + keyword_string = format!("{}, {}", keyword_string, keyword); + } else { + break; + } + } + + format!("{} Keywords: {}", base_string, keyword_string.trim_start_matches(", ")) + } + + /// Pushes data into this TextGroup and extracts metadata + pub fn push_data(&mut self, text: &str, page_number: Option) { + if !self.text.is_empty() { + self.text.push(' '); + } + + let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(text); + if parsed_any_metadata { + self.text.push_str(&parsed_text); + self.metadata.extend(metadata); + } else { + self.text.push_str(text); + } + + if let Some(page_number) = page_number { + self.push_page_number(page_number); + } + } + + pub fn push_page_number(&mut self, page_number: u32) { + let mut unique_page_numbers: HashSet = HashSet::new(); + + if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) { + let page_numbers_metadata: Result, _> = page_numbers_metadata + .trim_matches(|c| c == '[' || c == ']') + .split(',') + .map(|n| n.trim().parse::()) + .collect(); + + if let Ok(page_numbers) = page_numbers_metadata { + for pg in page_numbers { + unique_page_numbers.insert(pg); + } + } + } + + unique_page_numbers.insert(page_number); + + self.metadata.insert( + ShinkaiFileParser::page_numbers_metadata_key(), + format!( + "[{}]", + unique_page_numbers + .iter() + .map(|n| n.to_string()) + .collect::>() + .join(", ") + ), + ); + } + + pub fn push_sub_group(&mut self, sub_group: TextGroup) { + self.sub_groups.push(sub_group); + } +} diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs b/shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs new file mode 100644 index 000000000..b4484a594 --- /dev/null +++ 
b/shinkai-libs/shinkai-fs/src/file_parsing/local_file_parser.rs @@ -0,0 +1,59 @@ +// mod local_file_parser { +// use super::*; +// use crate::file_parser::file_parser_types::TextGroup; + +// pub struct LocalFileParser; + +// impl LocalFileParser { +// /// Top-level auto-detect parser: +// pub fn parse_file_auto( +// file_buffer: Vec, +// file_name: &str, +// max_node_text_size: u64, +// ) -> Result, ShinkaiFsError> { +// // Figure out extension (lowercased), then route to a specific parser +// let ext = Path::new(file_name) +// .extension() +// .and_then(|s| s.to_str()) +// .map(|s| s.to_lowercase()) +// .unwrap_or_default(); + +// match ext.as_str() { +// "txt" => Self::process_txt_file(file_buffer, max_node_text_size), +// "md" => Self::process_md_file(file_buffer, max_node_text_size), +// "csv" => Self::process_csv_file(file_buffer, max_node_text_size), +// "json"=> Self::process_json_file(file_buffer, max_node_text_size), +// "pdf" => Self::process_pdf_file(file_buffer, max_node_text_size), +// "htm" | "html" => Self::process_html_file(file_buffer, file_name, max_node_text_size), +// "xlsx" | "xls" => Self::process_xlsx_file(file_buffer, max_node_text_size), +// // fall back to txt-like processing, or return an error: +// _ => Self::process_txt_file(file_buffer, max_node_text_size), +// } +// } + +// // Below are minimal stubs; in your code, call into your existing specialized methods +// pub fn process_txt_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { +// // e.g. call your real .txt parser +// Ok(vec![]) +// } +// pub fn process_md_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { +// Ok(vec![]) +// } +// pub fn process_csv_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { +// Ok(vec![]) +// } +// pub fn process_json_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { +// Ok(vec![]) +// } +// pub fn process_pdf_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { +// Ok(vec![]) +// } +// pub fn process_html_file(_file_buffer: Vec, _file_name: &str, _max_node_text_size: u64) +// -> Result, ShinkaiFsError> { +// Ok(vec![]) +// } +// pub fn process_xlsx_file(_file_buffer: Vec, _max_node_text_size: u64) -> Result, ShinkaiFsError> { +// Ok(vec![]) +// } +// } +// } \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/file_parsing/mod.rs b/shinkai-libs/shinkai-fs/src/file_parsing/mod.rs new file mode 100644 index 000000000..e6ea1fc9c --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/file_parsing/mod.rs @@ -0,0 +1,3 @@ +// pub mod local_file_parser; +pub mod file_parser_types; +pub mod file_parser; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/lib.rs b/shinkai-libs/shinkai-fs/src/lib.rs index 10924825f..f1d88addf 100644 --- a/shinkai-libs/shinkai-fs/src/lib.rs +++ b/shinkai-libs/shinkai-fs/src/lib.rs @@ -2,4 +2,6 @@ pub mod shinkai_fs_error; pub mod shinkai_file_manager; pub mod shinkai_file_manager_ops; -pub mod file_parser; +// pub mod file_parser; +// pub mod file_parsing; +pub mod simple_parser; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs b/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs index 1c6d464f2..561cd1c56 100644 --- a/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs +++ b/shinkai-libs/shinkai-fs/src/shinkai_file_manager.rs @@ -1,16 +1,16 @@ +use std::collections::HashMap; use std::fs; use std::path::Path; use std::time::SystemTime; -use 
std::collections::HashMap; +use shinkai_embedding::embedding_generator::EmbeddingGenerator; +use shinkai_message_primitives::schemas::shinkai_fs::ParsedFile; use shinkai_message_primitives::shinkai_utils::shinkai_path::ShinkaiPath; use shinkai_sqlite::SqliteManager; -use shinkai_message_primitives::schemas::shinkai_fs::ParsedFile; - use crate::shinkai_fs_error::ShinkaiFsError; -use crate::file_parser::ShinkaiFileParser; -use crate::embedding_generator::EmbeddingGenerator; +use crate::simple_parser::simple_parser::SimpleParser; +use crate::simple_parser::text_group::TextGroup; pub struct ShinkaiFileManager; @@ -23,6 +23,7 @@ pub struct FileInfo { pub has_embeddings: bool, } +#[derive(PartialEq)] pub enum FileProcessingMode { Auto, NoParsing, @@ -39,56 +40,52 @@ impl ShinkaiFileManager { mode: FileProcessingMode, generator: &dyn EmbeddingGenerator, ) -> Result<(), ShinkaiFsError> { - let rel_path = Self::compute_relative_path(&path, base_dir)?; - let parsed_file = if let Some(pf) = sqlite_manager.get_parsed_file_by_rel_path(&rel_path)? { - pf - } else { - let original_extension = path - .as_path() - .extension() - .and_then(|ext| ext.to_str()) - .map(|s| s.to_string()); - - let pf = ParsedFile { - id: 0, - relative_path: rel_path.clone(), - original_extension, - description: None, - source: None, - embedding_model_used: None, - keywords: None, - distribution_info: None, - created_time: Some(Self::current_timestamp()), - tags: None, - total_tokens: None, - total_characters: None, - }; - sqlite_manager.add_parsed_file(&pf)?; - sqlite_manager.get_parsed_file_by_rel_path(&rel_path)?.unwrap() - }; - - match mode { - FileProcessingMode::Auto => { - // Implement logic for Auto mode - let file_buffer = fs::read(path.as_path())?; - let text_groups = ShinkaiFileParser::process_file_into_text_groups( - file_buffer, - rel_path.clone(), - 1024, // Example max_node_text_size - VRSourceReference::from_file(&rel_path, TextChunkingStrategy::V1)?, - ).await?; - // Further processing... - } - FileProcessingMode::NoParsing => { - // NoParsing mode: Skip parsing logic - // You might still want to update metadata or perform other tasks - } - FileProcessingMode::MustParse => { - // Implement logic for MustParse mode - } + // let rel_path = Self::compute_relative_path(&path, base_dir)?; + // let parsed_file = if let Some(pf) = sqlite_manager.get_parsed_file_by_rel_path(&rel_path)? 
{ + // pf + // } else { + // let original_extension = path + // .as_path() + // .extension() + // .and_then(|ext| ext.to_str()) + // .map(|s| s.to_string()); + + // let pf = ParsedFile { + // id: 0, + // relative_path: rel_path.clone(), + // original_extension, + // description: None, + // source: None, + // embedding_model_used: None, + // keywords: None, + // distribution_info: None, + // created_time: Some(Self::current_timestamp()), + // tags: None, + // total_tokens: None, + // total_characters: None, + // }; + // sqlite_manager.add_parsed_file(&pf)?; + // sqlite_manager.get_parsed_file_by_rel_path(&rel_path)?.unwrap() + // }; + + /* + File Processing: + + - we need to be able to read a file + - create the chunks + - create the embedding + - create the vector resource + - add the vector resource to the db + - add the parsed file to the db + + */ + + if mode == FileProcessingMode::NoParsing { + return Ok(()); } - // TODO: Implement embedding checking with sqlite_manager + let max_node_text_size = generator.model_type().max_input_token_count(); + let text_groups = SimpleParser::parse_file(path, max_node_text_size.try_into().unwrap())?; Ok(()) } @@ -143,6 +140,7 @@ impl ShinkaiFileManager { mod tests { use super::*; use shinkai_embedding::model_type::{EmbeddingModelType, OllamaTextEmbeddingsInference}; + use shinkai_message_primitives::schemas::shinkai_fs::ParsedFile; use std::fs::{self, File}; use std::io::Write; use std::path::PathBuf; diff --git a/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs b/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs index 87abcb1d7..0c5be860e 100644 --- a/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs +++ b/shinkai-libs/shinkai-fs/src/shinkai_fs_error.rs @@ -1,18 +1,25 @@ +use regex::Error as RegexError; +use serde_json::Error as SerdeError; +use shinkai_embedding::shinkai_embedding_errors::ShinkaiEmbeddingError; use shinkai_sqlite::errors::SqliteManagerError; use std::io; use thiserror::Error; #[derive(Error, Debug)] pub enum ShinkaiFsError { + #[error("Failed to read file: {0}")] + FailedIO(String), #[error("File not found")] FileNotFound, + #[error("File not found: {0}")] + FileNotFoundWithPath(String), #[error("Invalid model architecture")] InvalidModelArchitecture, - #[error("Unimplemented model dimensions")] + #[error("Unimplemented model dimensions: {0}")] UnimplementedModelDimensions(String), - #[error("Request failed")] + #[error("Request failed: {0}")] RequestFailed(String), - #[error("Failed to generate embeddings")] + #[error("Failed to generate embeddings: {0}")] FailedEmbeddingGeneration(String), #[error("IO error occurred: {0}")] Io(#[from] io::Error), @@ -26,6 +33,82 @@ pub enum ShinkaiFsError { FolderNotFoundOnFilesystem, #[error("Cannot move folder into itself")] InvalidFolderMove, + #[error("Invalid node id: {0}")] + InvalidNodeId(String), + #[error("VectorResource is empty")] + VectorResourceEmpty, + #[error("No matching node found")] + NoNodeFound, + #[error("Failed JSON parsing")] + FailedJSONParsing, + #[error("Failed CSV parsing")] + FailedCSVParsing, + #[error("Failed DOCX parsing")] + FailedDOCXParsing, + #[error("Failed PDF parsing")] + FailedPDFParsing, + #[error("Failed MD parsing")] + FailedMDParsing, + #[error("Failed TXT parsing")] + FailedTXTParsing, + #[error("Failed XLSX parsing")] + FailedXLSXParsing, + #[error("No embedding provided")] + NoEmbeddingProvided, + #[error("The resource type does not match any of the VRBaseTypes")] + InvalidVRBaseType, + #[error("Regex error: {0}")] + RegexError(#[from] RegexError), + 
#[error("Content inside of the Node is of a different type than requested")] + ContentIsNonMatchingType, + #[error("Failed to parse Unstructed API response json: {0}")] + FailedParsingUnstructedAPIJSON(String), + #[error("File type not supported: {0}")] + FileTypeNotSupported(String), + #[error("Vector Resource reference string is invalid: {0}")] + InvalidReferenceString(String), + #[error("Provided datetime string does not match RFC3339: {0}")] + InvalidDateTimeString(String), + #[error("Failed to acquire lock for: {0}")] + LockAcquisitionFailed(String), + #[error("Missing key not found in hashmap: {0}")] + MissingKey(String), + #[error("String is not formatted as a proper path string: {0}")] + InvalidPathString(String), + #[error( + "Attempted to perform ordered operations on a resource that does not implement OrderedVectorResource: {0}" + )] + ResourceDoesNotSupportOrderedOperations(String), + #[error("Unexpected/unsupported NodeContent type for Node with id: {0}")] + InvalidNodeType(String), + #[error("The provided merkle hash String is not a validly encoded Blake3 hash: {0}")] + InvalidMerkleHashString(String), + #[error("The Vector Resource does not contain a merkle root: {0}")] + MerkleRootNotFound(String), + #[error("The Node does not contain a merkle root: {0}")] + MerkleHashNotFoundInNode(String), + #[error("The Vector Resource is not merkelized, and thus cannot perform merkel-related functionality: {0}")] + VectorResourceIsNotMerkelized(String), + #[error("Failed to parse contents into VRKai struct: {0}")] + VRKaiParsingError(String), + #[error("Failed to parse contents into VRPack struct: {0}")] + VRPackParsingError(String), + #[error("Unsupported VRKai version: {0}")] + UnsupportedVRKaiVersion(String), + #[error("Unsupported VRPack version: {0}")] + UnsupportedVRPackVersion(String), + #[error("Failed to convert SimplifiedFSEntry at path: {0}")] + InvalidSimplifiedFSEntryType(String), + #[error("Embedding Model Error: {0}")] + VRPackEmbeddingModelError(String), + #[error("Unsupported file type: {0}")] + UnsupportedFileType(String), +} + +impl From for ShinkaiFsError { + fn from(_error: SerdeError) -> Self { + ShinkaiFsError::FailedJSONParsing + } } impl From for ShinkaiFsError { @@ -33,3 +116,9 @@ impl From for ShinkaiFsError { ShinkaiFsError::RequestFailed(error.to_string()) } } + +impl From for ShinkaiFsError { + fn from(error: ShinkaiEmbeddingError) -> Self { + ShinkaiFsError::FailedEmbeddingGeneration(error.to_string()) + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_grouping.rs b/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_grouping.rs new file mode 100644 index 000000000..5bad13472 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_grouping.rs @@ -0,0 +1,224 @@ +use keyphrases::KeyPhraseExtractor; +use regex::Regex; +use shinkai_embedding::embedding_generator::EmbeddingGenerator; + +use std::{future::Future, pin::Pin}; + +use crate::shinkai_fs_error::ShinkaiFsError; + +use super::file_parser_helper::ShinkaiFileParser; +use super::text_group::TextGroup; + +impl ShinkaiFileParser { + /// Collect all texts from the TextGroups in a single dimension (no subgroups). 
+ +impl ShinkaiFileParser { + /// Collect all texts from the TextGroups in a single dimension (no subgroups). + /// Returns a tuple of: + /// - `Vec<String>` for all text + /// - `Vec<(Vec<usize>, usize)>` for the “paths” (here just `[i]`) and text index + pub fn collect_texts_and_indices( + text_groups: &[TextGroup], + max_node_text_size: u64, + path: Vec<usize>, + ) -> (Vec<String>, Vec<(Vec<usize>, usize)>) { + let mut texts = Vec::new(); + let mut indices = Vec::new(); + + for (i, text_group) in text_groups.iter().enumerate() { + // Format text with your metadata or keyword logic + let formatted_text = text_group.format_text_for_embedding(max_node_text_size); + texts.push(formatted_text); + + // Build a “path” that refers to just the top-level group (no subgroups). + let mut current_path = path.clone(); + current_path.push(i); + // The last text we pushed is at index `texts.len() - 1` + indices.push((current_path, texts.len() - 1)); + } + + (texts, indices) + } + + /// Assign generated embeddings back into the TextGroups (single dimension). + fn assign_embeddings( + text_groups: &mut [TextGroup], + embeddings: &mut Vec<Vec<f32>>, + indices: &[(Vec<usize>, usize)], + ) { + for (path, text_idx) in indices { + // We expect path = [i], but if you store deeper paths, you can interpret them differently. + let i = path[0]; + if let Some(embedding) = embeddings.get(*text_idx) { + text_groups[i].embedding = Some(embedding.clone()); + } + } + } + + /// Batch-generate embeddings for all the TextGroups (no subgroups). + pub fn generate_text_group_embeddings( + text_groups: Vec<TextGroup>, + generator: Box<dyn EmbeddingGenerator>, + mut max_batch_size: u64, + max_node_text_size: u64, + collect_texts_and_indices: fn(&[TextGroup], u64, Vec<usize>) -> (Vec<String>, Vec<(Vec<usize>, usize)>), + ) -> Pin<Box<dyn Future<Output = Result<Vec<TextGroup>, ShinkaiFsError>> + Send>> { + Box::pin(async move { + // Make a mutable copy of the incoming text groups + let mut text_groups = text_groups; + + // Collect all texts (flattened) from the text groups + let (texts, indices) = collect_texts_and_indices(&text_groups, max_node_text_size, vec![]); + + // Prepare to generate embeddings in batches + let mut embeddings = Vec::new(); + let mut all_futures = Vec::new(); + let mut current_batch_futures = Vec::new(); + + // Break texts into chunks of size `max_batch_size` + for (index, batch) in texts.chunks(max_batch_size as usize).enumerate() { + let batch_texts = batch.to_vec(); + let generator_clone = generator.box_clone(); // clone for an async block below + + let future = async move { + generator_clone.generate_embeddings(&batch_texts).await + }; + current_batch_futures.push(future); + + // If we have 10 futures queued or we're at the last batch, we gather them + if current_batch_futures.len() == 10 || + index == texts.chunks(max_batch_size as usize).count() - 1 + { + all_futures.push(current_batch_futures); + current_batch_futures = Vec::new(); + } + } + + // Run each group of futures in sequence + for futures_group in all_futures { + // Wait for them all to complete + let results = futures::future::join_all(futures_group).await; + for result in results { + match result { + Ok(batch_embeddings) => embeddings.extend(batch_embeddings), + Err(e) => { + // Attempt to reduce batch size and retry + if max_batch_size > 5 { + max_batch_size -= 5; + return Self::generate_text_group_embeddings( + text_groups, + generator, + max_batch_size, + max_node_text_size, + collect_texts_and_indices, + ) + .await; + } else { + return Err(ShinkaiFsError::FailedEmbeddingGeneration(e.to_string())); + } + } + } + } + } + + // Assign embeddings back to the flattened text_groups + Self::assign_embeddings(&mut text_groups, &mut embeddings, &indices); + Ok(text_groups) + }) + } + + /// Extracts the most important keywords from all TextGroups using the RAKE algorithm. + pub fn extract_keywords(groups: &Vec<TextGroup>, num: u64) -> Vec<String> { + // Flatten the text from all groups into one string + let text = groups + .iter() + .map(|element| element.text.clone()) + .collect::<Vec<String>>() + .join(" "); + + // Create a KeyPhraseExtractor with a maximum of `num` keywords + let extractor = KeyPhraseExtractor::new(&text, num as usize); + + // Return keywords only, discarding scores + extractor.get_keywords() + .into_iter() + .map(|(_score, keyword)| keyword) + .collect() + } + + /// Concatenate text from multiple groups up to a maximum size.
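A rough usage sketch for the grouping helpers above and the concatenation helper defined just below. The `shinkai_fs::simple_parser` module paths are assumptions and the sample texts are illustrative only:

use std::collections::HashMap;
use shinkai_fs::simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup};

fn main() {
    let groups = vec![
        TextGroup::new("Rust keeps systems code memory safe.".to_string(), HashMap::new(), None),
        TextGroup::new("Embedding models turn text chunks into vectors.".to_string(), HashMap::new(), None),
    ];
    // RAKE-based keywords over the combined text of every group.
    let keywords = ShinkaiFileParser::extract_keywords(&groups, 3);
    println!("keywords: {:?}", keywords);
    // Concatenate group texts until the 80-character budget would be exceeded.
    let preview = ShinkaiFileParser::concatenate_groups_up_to_max_size(&groups, 80);
    println!("preview: {}", preview);
}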
+ pub fn concatenate_groups_up_to_max_size(elements: &Vec, max_size: usize) -> String { + let mut desc = String::new(); + for e in elements { + if desc.len() + e.text.len() + 1 > max_size { + break; + } + desc.push_str(&e.text); + desc.push('\n'); + } + desc.trim_end().to_string() + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs b/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs new file mode 100644 index 000000000..a5c92bac1 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/file_parser_helper.rs @@ -0,0 +1,431 @@ +use blake3::Hasher; +use chrono::{TimeZone, Utc}; +use regex::{Captures, Regex}; +use reqwest::Url; +use std::collections::HashMap; + +use super::text_group::TextGroup; + +pub struct ShinkaiFileParser; + +impl ShinkaiFileParser { + pub const PURE_METADATA_REGEX: &'static str = r"!\{\{\{([^:}]+):((?:[^}]*\}{0,2}[^}]+))\}\}\}!"; + pub const METADATA_REGEX: &'static str = r"\{\{\{([^:}]+):((?:[^}]*\}{0,2}[^}]+))\}\}\}"; + pub const MD_URL_REGEX: &'static str = r"(.?)\[(.*?)\]\((.*?)\)"; + + /// Key of page numbers metadata + pub fn page_numbers_metadata_key() -> String { + "pg_nums".to_string() + } + + /// Key of datetime metadata + pub fn datetime_metadata_key() -> String { + "datetime".to_string() + } + + /// Key of timestamp metadata + pub fn timestamp_metadata_key() -> String { + "timestamp".to_string() + } + + // // Key of likes metadata + // pub fn likes_metadata_key() -> String { + // "likes".to_string() + // } + + // // Key of reposts metadata + // pub fn reposts_metadata_key() -> String { + // "reposts".to_string() + // } + + // // Key of replies metadata + // pub fn replies_metadata_key() -> String { + // "replies".to_string() + // } + + // /// Clean's the file name of auxiliary data (file extension, url in front of file name, etc.) + // pub fn clean_name(name: &str) -> String { + // // Decode URL-encoded characters to simplify processing. + // let decoded_name = urlencoding::decode(name).unwrap_or_else(|_| name.into()); + + // // Check if the name ends with ".htm" or ".html" and calculate the position to avoid deletion. + // let avoid_deletion_position = if decoded_name.ends_with(".htm") || decoded_name.ends_with(".html") { + // decoded_name.len().saturating_sub(4) // Position before ".htm" + // } else if decoded_name.ends_with(".html") { + // decoded_name.len().saturating_sub(5) // Position before ".html" + // } else if decoded_name.ends_with(".mhtml") { + // decoded_name.len().saturating_sub(6) // Position before ".mhtml" + // } else { + // decoded_name.len() // Use the full length if not ending with ".htm" or ".html" + // }; + + // // Find the last occurrence of "/" or "%2F" that is not too close to the ".htm" extension. + // let last_relevant_slash_position = decoded_name.rmatch_indices(&['/', '%']).find_map(|(index, _)| { + // if index + 3 < avoid_deletion_position && decoded_name[index..].starts_with("%2F") { + // Some(index) + // } else if index + 1 < avoid_deletion_position && decoded_name[index..].starts_with("/") { + // Some(index) + // } else { + // None + // } + // }); + // // If a relevant slash is found, slice the string from the character immediately following this slash. + // let http_cleaned = match last_relevant_slash_position { + // Some(index) => decoded_name + // .get((index + if decoded_name[index..].starts_with("%2F") { 3 } else { 1 })..) 
+ // .unwrap_or(&decoded_name), + // None => &decoded_name, + // }; + + // let http_cleaned = if http_cleaned.is_empty() || http_cleaned == ".html" || http_cleaned == ".htm" { + // decoded_name.to_string() + // } else { + // http_cleaned.to_string() + // }; + + // // Remove extension + // let cleaned_name = SourceFileType::clean_string_of_extension(&http_cleaned); + + // cleaned_name + // } + + /// Helper function that processes groups into a list of descriptions. + /// Only takes the top-level group text, does not recurse into subgroups. + pub fn process_groups_into_descriptions_list( + groups: &Vec, + max_size: usize, + max_node_text_size: usize, + ) -> Vec { + let mut descriptions = Vec::new(); + let mut description = String::new(); + let mut total_size = 0; + + for group in groups { + let element_text = &group.text; + if description.len() + element_text.len() > max_node_text_size { + descriptions.push(description.clone()); + total_size += description.len(); + description.clear(); + } + if total_size + element_text.len() > max_size { + break; + } + description.push_str(element_text); + description.push(' '); + } + if !description.is_empty() { + descriptions.push(description); + } + + descriptions + } + + /// Processes groups into a single description string. + /// Only takes the top-level `TextGroup` text, not subgroups. + pub fn process_groups_into_description( + groups: &Vec, + max_size: usize, + max_node_text_size: usize, + ) -> String { + let descriptions = Self::process_groups_into_descriptions_list(groups, max_size, max_node_text_size); + descriptions.join(" ") + } + + /// Helper method for setting a description if none is provided. + pub fn _setup_resource_description( + desc: Option, + text_groups: &Vec, + max_size: usize, + max_node_text_size: usize, + ) -> Option { + if let Some(description) = desc { + Some(description.to_string()) + } else if !text_groups.is_empty() { + Some(Self::process_groups_into_description( + text_groups, + max_size, + max_node_text_size, + )) + } else { + None + } + } + + /// Generates a Blake3 hash of the data in the buffer. + pub fn generate_data_hash(buffer: &[u8]) -> String { + let mut hasher = Hasher::new(); + hasher.update(buffer); + let result = hasher.finalize(); + result.to_hex().to_string() + } + + /// Parse and extract metadata from `input_text`. + /// Returns `(parsed_text, metadata, parsed_any_metadata)`. + pub fn parse_and_extract_metadata(input_text: &str) -> (String, HashMap, bool) { + let mut metadata = HashMap::new(); + let mut parsed_any_metadata = false; + let pure_metadata_re = Regex::new(Self::PURE_METADATA_REGEX).unwrap(); + let replaceable_metadata_re = Regex::new(Self::METADATA_REGEX).unwrap(); + + let pure_result = pure_metadata_re.replace_all(input_text, |caps: &Captures| { + Self::extract_metadata_from_capture(&mut metadata, &mut parsed_any_metadata, caps, true) + }); + + let parsed_result = replaceable_metadata_re.replace_all(&pure_result, |caps: &Captures| { + Self::extract_metadata_from_capture(&mut metadata, &mut parsed_any_metadata, caps, false) + }); + + (parsed_result.to_string(), metadata, parsed_any_metadata) + } + + /// Helper function to extract metadata from a capture. + /// If `is_pure == true`, the captured text is removed from the final string. 
+ fn extract_metadata_from_capture( + metadata: &mut HashMap, + parsed_any_metadata: &mut bool, + caps: &Captures, + is_pure: bool, + ) -> String { + let key = match caps.get(1) { + Some(key) => key.as_str(), + None => return caps.get(0).unwrap().as_str().to_string(), + }; + let value = match caps.get(2) { + Some(value) => value.as_str(), + None => return caps.get(0).unwrap().as_str().to_string(), + }; + + *parsed_any_metadata = true; + + match key { + // timestamp or datetime: RFC3339 format + _ if key == ShinkaiFileParser::datetime_metadata_key() + || key == ShinkaiFileParser::timestamp_metadata_key() => + { + let datetime = chrono::DateTime::parse_from_rfc3339(value); + match datetime { + Ok(_) => { + metadata.insert(ShinkaiFileParser::datetime_metadata_key(), value.to_string()); + if is_pure { "".to_string() } else { value.to_string() } + } + Err(_) => { + // Attempt a less strict format + let datetime = chrono::NaiveDateTime::parse_from_str(value, "%Y-%m-%dT%H:%M:%S%.3fZ"); + match datetime { + Ok(parsed_datetime) => { + let formatted_datetime = Utc.from_utc_datetime(&parsed_datetime).to_rfc3339(); + metadata.insert(key.to_string(), formatted_datetime.clone()); + if is_pure { "".to_string() } else { formatted_datetime } + } + Err(_) => value.to_string(), + } + } + } + } + // pg_nums: array of integers + _ if key == ShinkaiFileParser::page_numbers_metadata_key() => { + let page_numbers: Result, _> = value + .trim_matches(|c| c == '[' || c == ']') + .split(',') + .map(|n| n.trim().parse::()) + .collect(); + + match page_numbers { + Ok(_) => { + metadata.insert(key.to_string(), value.to_string()); + if is_pure { "".to_string() } else { value.to_string() } + } + Err(_) => value.to_string(), + } + } + // Fallback + _ => { + metadata.insert(key.to_string(), value.to_string()); + if is_pure { "".to_string() } else { value.to_string() } + } + } + } + + /// Parse and extract Markdown URLs like `[text](url)` or `![text](url)`. + pub fn parse_and_extract_md_metadata(input_text: &str) -> (String, HashMap) { + let mut metadata = HashMap::new(); + let md_url_re = Regex::new(Self::MD_URL_REGEX).unwrap(); + + let parsed_result = md_url_re.replace_all(input_text, |caps: &Captures| { + let prefix = caps.get(1).map_or("", |m| m.as_str()); + let text = caps.get(2).map_or("", |m| m.as_str()); + let url = caps.get(3).map_or("", |m| m.as_str()); + + let mut shortened_url = Url::parse(url) + .ok() + .map(|u| { + let mut scheme = u.scheme().to_string(); + let host = u.host_str().unwrap_or("").to_string(); + if !scheme.is_empty() { + scheme = format!("{}://", scheme); + } + format!("{}{}", scheme, host) + }) + .unwrap_or_else(|| "".to_string()); + + if shortened_url.is_empty() { + shortened_url = url.chars().take(100).collect(); + } + + match prefix { + "!" => { + let image_urls_entry = metadata.entry("image-urls".to_string()).or_insert(Vec::::new()); + image_urls_entry.push(format!("![{}]({})", text, url)); + format!("![{}]({})", text, shortened_url) + } + _ => { + let link_urls_entry = metadata.entry("link-urls".to_string()).or_insert(Vec::::new()); + link_urls_entry.push(format!("[{}]({})", text, url)); + format!("{}[{}]({})", prefix, text, shortened_url) + } + } + }); + + let serialized_metadata = metadata + .into_iter() + .map(|(key, values)| (key, serde_json::to_string(&values).unwrap_or_default())) + .collect::>(); + + (parsed_result.to_string(), serialized_metadata) + } + + /// Splits `text` into as many `TextGroup`s as needed, ignoring sub-groups. 
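A small, hedged sketch of the metadata flow implemented above and in `parse_and_split_into_text_groups` just below. The `shinkai_fs::simple_parser` path is an assumption and the input string is illustrative:

use shinkai_fs::simple_parser::file_parser_helper::ShinkaiFileParser;

fn main() {
    // {{{...}}} is replaceable metadata (the value stays in the text); !{{{...}}}! is pure metadata (removed).
    let input = "Parser rewrite shipped. {{{pg_nums:[2,3]}}} !{{{datetime:2024-12-22T10:00:00Z}}}! Notes continue here.";
    let (parsed, metadata, found_any) = ShinkaiFileParser::parse_and_extract_metadata(input);
    assert!(found_any);
    assert_eq!(metadata.get("pg_nums").map(String::as_str), Some("[2,3]"));
    assert_eq!(metadata.get("datetime").map(String::as_str), Some("2024-12-22T10:00:00Z"));
    // Oversized inputs are then cut into TextGroups no larger than max_node_text_size.
    for group in ShinkaiFileParser::parse_and_split_into_text_groups(parsed, 40, None) {
        println!("{} | {:?}", group.text, group.metadata);
    }
}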
+ pub fn parse_and_split_into_text_groups( + text: String, + max_node_text_size: u64, + page_number: Option, + ) -> Vec { + let mut text_groups = Vec::new(); + let (parsed_text, metadata, parsed_any_metadata) = + ShinkaiFileParser::parse_and_extract_metadata(&text); + let (parsed_md_text, md_metadata) = + ShinkaiFileParser::parse_and_extract_md_metadata(&parsed_text); + + // Merge the two sets of metadata + let all_metadata = metadata.into_iter().chain(md_metadata).collect::>(); + + if parsed_md_text.len() as u64 > max_node_text_size { + // If the text is too large, split it + let chunks = if parsed_any_metadata { + ShinkaiFileParser::split_into_chunks_with_metadata(&text, max_node_text_size as usize) + } else { + Self::split_into_chunks(&text, max_node_text_size as usize) + }; + + for chunk in chunks { + let (parsed_chunk, chunk_metadata, _) = Self::parse_and_extract_metadata(&chunk); + let (parsed_md_chunk, md_metadata_chunk) = Self::parse_and_extract_md_metadata(&parsed_chunk); + + let merged_metadata = chunk_metadata + .into_iter() + .chain(md_metadata_chunk) + .collect::>(); + + let mut text_group = TextGroup::new(parsed_md_chunk, merged_metadata, None); + if let Some(page_number) = page_number { + text_group.push_page_number(page_number); + } + text_groups.push(text_group); + } + } else { + // Single chunk + let mut text_group = TextGroup::new(parsed_md_text, all_metadata, None); + if let Some(page_number) = page_number { + text_group.push_page_number(page_number); + } + text_groups.push(text_group); + } + + text_groups + } + + /// Previously, this method would nest groups at `depth`. + /// Now, we flatten everything and simply append the created groups. + pub fn push_text_group_by_depth( + text_groups: &mut Vec, + _depth: usize, // ignore depth + text: String, + max_node_text_size: u64, + page_number: Option, + ) { + if !text.is_empty() { + let created_text_groups = Self::parse_and_split_into_text_groups( + text, + max_node_text_size, + page_number + ); + // Just extend the top-level list, ignoring `_depth`. + text_groups.extend(created_text_groups); + } + } + + /// Split a string at the nearest whitespace boundary, producing chunks. + pub fn split_into_chunks(text: &str, chunk_size: usize) -> Vec { + let mut chunks = Vec::new(); + let mut start = 0; + while start < text.len() { + let end = { + let mut candidate_end = start + chunk_size; + if candidate_end >= text.len() { + text.len() + } else { + // Walk backward until whitespace + while candidate_end > start && !text.as_bytes()[candidate_end].is_ascii_whitespace() { + candidate_end -= 1; + } + if candidate_end == start { + // No whitespace found + start + chunk_size.min(text.len() - start) + } else { + candidate_end + } + } + }; + let chunk = &text[start..end]; + chunks.push(chunk.to_string()); + start = end; + } + chunks + } + + /// Same as `split_into_chunks`, but also avoids splitting in the middle of metadata. 
+ pub fn split_into_chunks_with_metadata(text: &str, chunk_size: usize) -> Vec { + let re = Regex::new(Self::METADATA_REGEX).unwrap(); + let matched_positions: Vec<(usize, usize)> = re.find_iter(text).map(|m| (m.start(), m.end())).collect(); + + let mut chunks = Vec::new(); + let mut start = 0; + while start < text.len() { + let end = { + let mut candidate_end = start + chunk_size; + if candidate_end >= text.len() { + text.len() + } else { + // Walk backward until whitespace or we exit a metadata block + while candidate_end > start && + ( + !text.as_bytes()[candidate_end].is_ascii_whitespace() + || matched_positions.iter().any(|&(s,e)| candidate_end >= s && candidate_end < e) + ) + { + candidate_end -= 1; + } + if candidate_end == start { + start + chunk_size.min(text.len() - start) + } else { + candidate_end + } + } + }; + let chunk = &text[start..end]; + chunks.push(chunk.to_string()); + start = end; + } + chunks + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/csv_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/csv_parsing.rs new file mode 100644 index 000000000..b8e6339ff --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/csv_parsing.rs @@ -0,0 +1,160 @@ +use crate::{ + shinkai_fs_error::ShinkaiFsError, + simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup}, +}; + +use csv::ReaderBuilder; +use std::{collections::HashMap, io::Cursor}; + +use super::LocalFileParser; + +impl LocalFileParser { + /// Attempts to process the provided csv file into a list of TextGroups. + pub fn process_csv_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let csv_lines = Self::parse_csv_auto(&file_buffer).map_err(|_| ShinkaiFsError::FailedCSVParsing)?; + Self::process_table_rows(csv_lines, max_node_text_size) + } + + // /// Parse CSV data from a buffer and attempt to automatically detect + // /// headers. + pub fn parse_csv_auto(buffer: &[u8]) -> Result, ShinkaiFsError> { + let mut reader = ReaderBuilder::new().flexible(true).from_reader(Cursor::new(buffer)); + let headers = reader + .headers() + .map_err(|_| ShinkaiFsError::FailedCSVParsing)? + .iter() + .map(String::from) + .collect::>(); + + let likely_header = headers.iter().all(|s| { + let is_alphabetic = s.chars().all(|c| c.is_alphabetic() || c.is_whitespace() || c == '_'); + let no_duplicates = headers.iter().filter(|&item| item == s).count() == 1; + let no_prohibited_chars = !s.contains(&['@', '#', '$', '%', '^', '&', '*']); + + is_alphabetic && no_duplicates && no_prohibited_chars + }); + + Self::parse_csv(&buffer, likely_header) + } + + // /// Parse CSV data from a buffer. + // /// * `header` - A boolean indicating whether to prepend column headers to + // /// values. + pub fn parse_csv(buffer: &[u8], header: bool) -> Result, ShinkaiFsError> { + let mut reader = ReaderBuilder::new() + .flexible(true) + .has_headers(header) + .from_reader(Cursor::new(buffer)); + let headers = if header { + reader + .headers() + .map_err(|_| ShinkaiFsError::FailedCSVParsing)? 
+ .iter() + .map(String::from) + .collect::>() + } else { + Vec::new() + }; + + let mut result = Vec::new(); + for record in reader.records() { + let record = record.map_err(|_| ShinkaiFsError::FailedCSVParsing)?; + let row: Vec = if header { + record + .iter() + .enumerate() + .map(|(i, e)| format!("{}: {}", headers[i], e)) + .collect() + } else { + record.iter().map(String::from).collect() + }; + let row_string = row.join("|"); + result.push(row_string); + } + + Ok(result) + } + + pub fn process_table_rows( + table_rows: Vec, + max_node_text_size: u64, + ) -> Result, ShinkaiFsError> { + let mut table_rows_split = Vec::new(); + let mut current_group = Vec::new(); + let mut current_length = 0; + + for row in table_rows { + let line_length = row.len() as u64; + if current_length + line_length > max_node_text_size { + if !current_group.is_empty() { + table_rows_split.push(current_group); + } + current_group = Vec::new(); + current_length = 0; + } + current_group.push(row); + current_length += line_length; + } + + if !current_group.is_empty() { + table_rows_split.push(current_group); + } + + let joined_lines = table_rows_split + .into_iter() + .map(|group| group.join("\n")) + .collect::>(); + + let mut text_groups = Vec::new(); + for line in joined_lines { + let (parsed_line, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(&line); + + if parsed_line.len() as u64 > max_node_text_size { + // Instead of sub-groups, just create multiple TextGroups: + let chunks = if parsed_any_metadata { + ShinkaiFileParser::split_into_chunks_with_metadata(&line, max_node_text_size as usize) + } else { + ShinkaiFileParser::split_into_chunks(&line, max_node_text_size as usize) + }; + + for chunk in chunks { + let (parsed_chunk, chunk_metadata, _) = if parsed_any_metadata { + ShinkaiFileParser::parse_and_extract_metadata(&chunk) + } else { + (chunk.to_owned(), HashMap::new(), false) + }; + text_groups.push(TextGroup::new(parsed_chunk, chunk_metadata, None)); + } + } else if !parsed_line.is_empty() { + text_groups.push(TextGroup::new(parsed_line, metadata, None)); + } + } + + Ok(text_groups) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_process_csv_file() { + // Sample CSV data + let csv_data = b"header1,header2\nvalue1,value2\nvalue3,value4"; + let max_node_text_size = 10; + + // Call the function + let result = LocalFileParser::process_csv_file(csv_data.to_vec(), max_node_text_size); + eprintln!("result: {:?}", result); + + // Check the result + assert!(result.is_ok()); + let text_groups = result.unwrap(); + + // Verify the output + assert_eq!(text_groups.len(), 1); + let expected_text = "header1|header2\nvalue1|value2\nvalue3|value4"; + assert_eq!(text_groups[0].text, expected_text); + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/html_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/html_parsing.rs new file mode 100644 index 000000000..2c363dcee --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/html_parsing.rs @@ -0,0 +1,347 @@ +use regex::Regex; +use scraper::{ElementRef, Html, Selector}; + +use crate::{shinkai_fs_error::ShinkaiFsError, simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup}}; + +use super::LocalFileParser; + +/// If the file provided is an html file, attempt to extract out the core content to improve overall quality. 
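Before the HTML walker itself, a hedged sketch of how `process_html_file` below might be driven. The crate paths are assumed and the snippet is illustrative, not part of the patch:

use shinkai_fs::simple_parser::local_parsing::LocalFileParser;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let html = br#"<html><body><main><h1>Release notes</h1><p>The parser flattens HTML into text groups.</p></main></body></html>"#;
    // The file name steers site-specific extraction (github.com, twitter.com/x.com, youtube.com, generic).
    let groups = LocalFileParser::process_html_file(html.to_vec(), "notes.html", 512)?;
    for group in &groups {
        println!("{}", group.text);
    }
    Ok(())
}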
+pub fn extract_core_content(file_buffer: Vec, file_name: &str) -> Vec { + if file_name.ends_with(".html") || file_name.ends_with(".htm") { + let file_content = String::from_utf8_lossy(&file_buffer); + let document = Html::parse_document(&file_content); + + // If the file is from GitHub, use a specific selector for GitHub's layout + if file_name.contains("github.com") { + if let Ok(layout_selector) = Selector::parse(".entry-content") { + if let Some(layout_element) = document.select(&layout_selector).next() { + return layout_element.inner_html().into_bytes(); + } + } + } else if file_name.contains("twitter.com") || file_name.contains("x.com") { + // Selector for Twitter or X.com's layout + if let Ok(primary_column_selector) = Selector::parse("div[data-testid='primaryColumn']") { + if let Some(primary_column_element) = document.select(&primary_column_selector).next() { + return primary_column_element.inner_html().into_bytes(); + } + } + } else if file_name.contains("youtube.com") { + // Selector for YouTube's layout + let mut content = String::new(); + if let Ok(above_the_fold_selector) = Selector::parse("#above-the-fold") { + if let Some(above_the_fold_element) = document.select(&above_the_fold_selector).next() { + content += &above_the_fold_element.inner_html(); + } + } + if let Ok(comments_selector) = Selector::parse(".ytd-comments") { + if let Some(comments_element) = document.select(&comments_selector).next() { + content += &comments_element.inner_html(); + } + } + return content.into_bytes(); + } else { + // Try to select the 'main', 'article' tag or a class named 'main' + if let Ok(main_selector) = Selector::parse("main, .main, article") { + if let Some(main_element) = document.select(&main_selector).next() { + return main_element.inner_html().into_bytes(); + } + } + + if let Ok(body_selector) = Selector::parse("body") { + if let Some(body_element) = document.select(&body_selector).next() { + return body_element.inner_html().into_bytes(); + } + } + } + } + + file_buffer +} + +impl LocalFileParser { + const IGNORED_ELEMENTS: &'static [&'static str] = &[ + "base", "head", "link", "meta", "noscript", "script", "style", "svg", "template", "title", + ]; + const HTML_HEADERS: &'static [&'static str] = &["h1", "h2", "h3", "h4", "h5", "h6"]; + + pub fn process_html_file( + file_buffer: Vec, + file_name: &str, + max_node_text_size: u64, + ) -> Result, ShinkaiFsError> { + let extracted_buffer = extract_core_content(file_buffer, file_name); + let document = Html::parse_fragment(&String::from_utf8_lossy(&extracted_buffer)); + + let mut text_groups: Vec = Vec::new(); + + // to keep track of the current parent headings + let mut heading_parents: Vec = Vec::with_capacity(6); + + // Parent nodes propagate context to child nodes. + // Nodes can alter their state and propagate them to their children. 
+ #[derive(Default)] + struct HTMLNodeContext { + is_preformatted: bool, // pre tags + is_ordered_list: bool, // ol tags + list_item_start: u64, // start attribute for ol tags + list_depth: u64, // nested lists + } + + // Iterate through HTML elements and text nodes in order + fn iter_nodes<'a>( + element: ElementRef<'a>, + text_groups: &mut Vec, + max_node_text_size: u64, + heading_parents: &mut Vec, + context: HTMLNodeContext, + ) -> String { + let mut node_text = "".to_string(); + let mut list_item_index = context.list_item_start; + + for node in element.children() { + match node.value() { + scraper::Node::Element(element) => { + let el_name = element.name().to_lowercase(); + + if let Some(element) = ElementRef::wrap(node) { + // Jump to next node if the element is ignored + if LocalFileParser::IGNORED_ELEMENTS.contains(&element.value().name()) { + continue; + } + + // Push current text and start a new text group on section elements + if el_name == "article" || el_name == "section" || el_name == "table" || el_name == "hr" { + ShinkaiFileParser::push_text_group_by_depth( + text_groups, + heading_parents.len(), + node_text.trim().to_owned(), + max_node_text_size, + None, + ); + node_text.clear(); + } + + // Header elements + if LocalFileParser::HTML_HEADERS.contains(&el_name.as_str()) { + ShinkaiFileParser::push_text_group_by_depth( + text_groups, + heading_parents.len(), + node_text.trim().to_owned(), + max_node_text_size, + None, + ); + node_text.clear(); + + let heading_level = el_name + .chars() + .last() + .unwrap_or_default() + .to_digit(10) + .unwrap_or_default() as usize; + + // Adjust heading_parents based on the current heading level + // Find the parent and remove previous child headings + if let Some(index) = heading_parents + .iter() + .rposition(|&parent_level| parent_level <= heading_level) + { + heading_parents.truncate(index + 1); + + if heading_parents[index] < heading_level { + heading_parents.push(heading_level); + } + } else { + heading_parents.clear(); + heading_parents.push(heading_level); + } + } + + match el_name.as_str() { + "div" | "button" | "label" | "footer" => { + if node_text.len() > 0 && !node_text.ends_with(char::is_whitespace) { + node_text.push_str(" "); + } + } + "p" | "br" | "blockquote" => { + if !node_text.is_empty() { + node_text.push_str("\n"); + } + } + "img" => { + let alt = element.attr("alt").unwrap_or(""); + let src = element.attr("src").unwrap_or(""); + + if alt.len() > 0 && src.len() > 0 { + node_text.push_str(&format!(" ![{}]({})", alt, src)); + } + } + "ol" => { + if !node_text.is_empty() && !node_text.ends_with("\n") { + node_text.push_str("\n"); + } + + let start = element.attr("start").unwrap_or("1"); + list_item_index = start.parse::().unwrap_or(1); + } + "ul" => { + if !node_text.is_empty() && !node_text.ends_with("\n") { + node_text.push_str("\n"); + } + list_item_index = 1; + } + _ => (), + } + + let list_depth = if el_name == "ol" || el_name == "ul" { + context.list_depth + 1 + } else { + context.list_depth + }; + + // Process child nodes + let inner_text = iter_nodes( + element, + text_groups, + max_node_text_size, + heading_parents, + HTMLNodeContext { + is_preformatted: context.is_preformatted || el_name == "pre", + is_ordered_list: (context.is_ordered_list || el_name == "ol") && el_name != "ul", + list_item_start: list_item_index, + list_depth, + }, + ); + + // Process inner text returned from child nodes + if inner_text.len() > 0 { + match el_name.as_str() { + "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => { + let heading_depth = 
if heading_parents.len() > 0 { + heading_parents.len() - 1 + } else { + 0 + }; + + ShinkaiFileParser::push_text_group_by_depth( + text_groups, + heading_depth, + inner_text.trim().to_owned(), + max_node_text_size, + None, + ); + } + "a" => { + let href = element.attr("href").unwrap_or(""); + + if href.len() > 0 && !href.starts_with("#") { + node_text.push_str(&format!(" [{}]({})", inner_text, href)); + } else { + node_text.push_str(&format!(" {}", inner_text)); + } + } + "blockquote" => { + inner_text.split("\n").for_each(|line| { + node_text.push_str(&format!("> {}\n", line)); + }); + } + "code" => { + if context.is_preformatted { + node_text.push_str(&format!("```\n{}\n```\n", inner_text)); + } else { + node_text.push_str(&format!("`{}`", inner_text)); + } + } + "li" => { + let list_depth = if context.list_depth > 0 { context.list_depth } else { 1 }; + let indentation = "\t".repeat((list_depth - 1) as usize); + + if !node_text.is_empty() && !node_text.ends_with("\n") { + node_text.push_str("\n"); + } + + if context.is_ordered_list { + let li_value = element.attr("value").unwrap_or(""); + if let Some(value) = li_value.parse::().ok() { + list_item_index = value; + } + + node_text.push_str(&format!( + "{}{}. {}\n", + indentation, + list_item_index, + inner_text.trim() + )); + list_item_index += 1; + } else { + node_text.push_str(&format!("{}* {}\n", indentation, inner_text.trim())); + } + } + // Push table data to a text group + "table" => { + ShinkaiFileParser::push_text_group_by_depth( + text_groups, + heading_parents.len(), + inner_text.trim().to_owned(), + max_node_text_size, + None, + ); + } + "caption" => { + node_text.push_str(&format!("{}\n", inner_text.trim())); + } + "tr" => { + let row_text = inner_text.trim(); + let row_text = row_text.trim_end_matches(';'); + node_text.push_str(&format!("{}\n", row_text)); + } + "td" | "th" => { + node_text.push_str(&format!("{}; ", inner_text)); + } + _ => { + node_text.push_str(&inner_text); + } + } + } + } + } + scraper::Node::Text(text) => { + if text.text.trim().is_empty() { + continue; + } + + // Save preformatted text as is, otherwise remove extra whitespaces + if context.is_preformatted { + node_text.push_str(&text.text); + } else { + let re = Regex::new(r"\s{2,}|\n").unwrap(); + let sanitized_text = re.replace_all(&text.text, " "); + + node_text.push_str(&sanitized_text); + } + } + _ => (), + }; + } + + node_text + } + + let result_text = iter_nodes( + document.root_element(), + &mut text_groups, + max_node_text_size, + &mut heading_parents, + HTMLNodeContext::default(), + ); + + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + heading_parents.len(), + result_text.trim().to_owned(), + max_node_text_size, + None, + ); + + Ok(text_groups) + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/json_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/json_parsing.rs new file mode 100644 index 000000000..70cba4f68 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/json_parsing.rs @@ -0,0 +1,87 @@ +use std::collections::HashMap; + +use super::LocalFileParser; +use crate::{shinkai_fs_error::ShinkaiFsError, simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup}}; +use serde_json::Value as JsonValue; + +impl LocalFileParser { + /// Attempts to process the provided json file into a list of TextGroups. 
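A minimal sketch of what calling `process_json_file` (defined just below) might look like, assuming the same crate paths as in the earlier sketches:

use shinkai_fs::simple_parser::local_parsing::LocalFileParser;

fn main() {
    let json = br#"{"title": "Parser rewrite", "tags": ["fs", "embeddings"]}"#;
    // Objects and arrays are flattened into one list of TextGroups; small neighbours are merged
    // until max_node_text_size would be exceeded.
    let groups = LocalFileParser::process_json_file(json.to_vec(), 256).expect("valid JSON input");
    for group in &groups {
        println!("{}", group.text);
    }
}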
+ pub fn process_json_file( + file_buffer: Vec, + max_node_text_size: u64, + ) -> Result, ShinkaiFsError> { + let json_string = + String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedJSONParsing)?; + let json: JsonValue = serde_json::from_str(&json_string)?; + + let text_groups = Self::process_container_json_value(&json, max_node_text_size); + Ok(text_groups) + } + + /// Recursively processes a JSON value into a *flat* list of TextGroups. + pub fn process_container_json_value(json: &JsonValue, max_node_text_size: u64) -> Vec { + // Helper to merge small TextGroups + let fn_merge_groups = |mut acc: Vec, current_group: TextGroup| { + if let Some(prev_group) = acc.last_mut() { + if prev_group.text.len() + current_group.text.len() < max_node_text_size as usize { + prev_group + .text + .push_str(format!("\n{}", current_group.text).as_str()); + return acc; + } + } + acc.push(current_group); + acc + }; + + match json { + JsonValue::Object(map) => { + // For each (key, value), produce a TextGroup for `key`, plus sub-groups from `value`. + let mut result = Vec::new(); + for (key, value) in map { + // Optionally create a TextGroup for the key itself + result.push(TextGroup::new(key.clone(), HashMap::new(), None)); + // Then flatten out whatever the value contains + let sub_result = Self::process_container_json_value(value, max_node_text_size); + result.extend(sub_result); + } + result.into_iter().fold(Vec::new(), fn_merge_groups) + } + JsonValue::Array(arr) => { + // Flatten all elements + let mut result = Vec::new(); + for value in arr { + let sub_result = Self::process_container_json_value(value, max_node_text_size); + result.extend(sub_result); + } + result.into_iter().fold(Vec::new(), fn_merge_groups) + } + // Base case: it’s a primitive (string, number, bool, null) + _ => Self::process_content_json_value(None, json, max_node_text_size), + } + } + + /// Processes a single JSON value (primitive) into one or more TextGroups. + fn process_content_json_value( + key: Option<&str>, + value: &JsonValue, + max_node_text_size: u64, + ) -> Vec { + let mut text_groups = Vec::new(); + let text = match key { + Some(k) => format!("{}: {}", k, value.to_string()), + None => value.to_string(), + }; + + if text.len() as u64 > max_node_text_size { + let chunks = ShinkaiFileParser::split_into_chunks(&text, max_node_text_size as usize); + for chunk in chunks { + text_groups.push(TextGroup::new(chunk, HashMap::new(), None)); + } + } else { + text_groups.push(TextGroup::new(text, HashMap::new(), None)); + } + + text_groups + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/local_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/local_parsing.rs new file mode 100644 index 000000000..c887b9187 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/local_parsing.rs @@ -0,0 +1,54 @@ +// use crate::file_parser::file_parser_types::TextGroup; +// use crate::shinkai_fs_error::ShinkaiFsError; + +// pub struct LocalFileParser {} + +// impl LocalFileParser { +// /// Attempts to process a file into a list of TextGroups using local processing logic +// /// implemented in Rust directly without relying on external services. +// /// If local processing is not available for the provided source, then returns Err. 
+// pub fn process_file_into_grouped_text( +// file_buffer: Vec, +// file_name: String, +// max_node_text_size: u64, +// source: VRSourceReference, +// ) -> Result, ShinkaiFsError> { +// let source_base = source; + +// match &source_base { +// VRSourceReference::None => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), +// VRSourceReference::Standard(source) => match source { +// SourceReference::Other(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), +// SourceReference::FileRef(file_source) => match file_source.clone().file_type { +// SourceFileType::Image(_) +// | SourceFileType::Code(_) +// | SourceFileType::ConfigFileType(_) +// | SourceFileType::Video(_) +// | SourceFileType::Audio(_) +// | SourceFileType::Shinkai(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), +// SourceFileType::Document(doc) => match doc { +// DocumentFileType::Txt => LocalFileParser::process_txt_file(file_buffer, max_node_text_size), +// DocumentFileType::Json => LocalFileParser::process_json_file(file_buffer, max_node_text_size), +// DocumentFileType::Csv => LocalFileParser::process_csv_file(file_buffer, max_node_text_size), +// // DocumentFileType::Docx => LocalFileParser::process_docx_file(file_buffer, max_node_text_size), +// DocumentFileType::Html => { +// LocalFileParser::process_html_file(file_buffer, &file_name, max_node_text_size) +// } + +// DocumentFileType::Md => LocalFileParser::process_md_file(file_buffer, max_node_text_size), + +// DocumentFileType::Pdf => LocalFileParser::process_pdf_file(file_buffer, max_node_text_size), + +// DocumentFileType::Xlsx | DocumentFileType::Xls => { +// LocalFileParser::process_xlsx_file(file_buffer, max_node_text_size) +// } + +// _ => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), +// }, +// }, +// SourceReference::ExternalURI(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), +// }, +// VRSourceReference::Notarized(_) => Err(ShinkaiFsError::UnsupportedFileType(file_name.to_string())), +// } +// } +// } diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/md_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/md_parsing.rs new file mode 100644 index 000000000..ed59f0c3f --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/md_parsing.rs @@ -0,0 +1,187 @@ +use comrak::{ + nodes::{AstNode, ListDelimType, ListType, NodeValue}, + parse_document, Arena, Options, +}; + +use crate::{shinkai_fs_error::ShinkaiFsError, simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup}}; + +use super::LocalFileParser; + +impl LocalFileParser { + pub fn process_md_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let md_string = String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedMDParsing)?; + + let arena = Arena::new(); + let root = parse_document(&arena, &md_string, &Options::default()); + + // build up an AST and iterate through nodes in order + fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &mut F) + where + F: FnMut(&'a AstNode<'a>), + { + f(node); + for c in node.children() { + iter_nodes(c, f); + } + } + + let mut text_groups: Vec = Vec::new(); + let mut current_text = "".to_string(); + let mut processed_node_type = NodeValue::Document; + + // heading_parents is used to keep track of the depth of the headings + let mut heading_parents: Vec = Vec::with_capacity(6); + + iter_nodes(root, &mut |node| match &node.data.borrow().value { + // Actual text comes in the next text 
node, set processed_node_type to the proper type + NodeValue::Heading(ref heading) => { + processed_node_type = NodeValue::Heading(heading.clone()); + } + NodeValue::Paragraph => match processed_node_type { + // paragraph inside a list item + NodeValue::Item(_) => { + return; + } + _ => { + processed_node_type = NodeValue::Paragraph; + + if current_text.len() > 0 { + current_text.push_str("\n"); + } + } + }, + NodeValue::Item(ref list_item) => { + processed_node_type = NodeValue::Item(list_item.clone()); + } + NodeValue::Link(ref link) => { + processed_node_type = NodeValue::Link(link.clone()); + } + NodeValue::Image(ref image) => { + processed_node_type = NodeValue::Image(image.clone()); + } + + NodeValue::Text(ref text) => match processed_node_type { + NodeValue::Heading(ref heading) => { + // Push previous text to a text group + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + heading_parents.len(), + current_text.clone(), + max_node_text_size, + None, + ); + current_text = "".to_string(); + + let level = heading.level as usize; + + // Adjust heading_parents based on the current heading level + // Find the parent and remove previous child headings + if let Some(index) = heading_parents.iter().rposition(|&parent_level| parent_level <= level) { + heading_parents.truncate(index + 1); + + if heading_parents[index] < level { + heading_parents.push(level); + } + } else { + heading_parents.clear(); + heading_parents.push(level); + } + + let heading_depth = if heading_parents.len() > 0 { + heading_parents.len() - 1 + } else { + 0 + }; + + // Create a new text group for the heading + // Upcoming content will be added to its subgroups + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + heading_depth, + text.to_string(), + max_node_text_size, + None, + ); + } + NodeValue::Paragraph => { + current_text.push_str(text); + } + NodeValue::Item(ref list_item) => { + let prefix = match list_item.list_type { + ListType::Bullet => format!("{} ", list_item.bullet_char as char), + ListType::Ordered => match list_item.delimiter { + ListDelimType::Period => format!("{}. 
", list_item.start), + ListDelimType::Paren => format!("{}) ", list_item.start), + }, + }; + + current_text.push_str(format!("\n{} {}", prefix, text).as_str()); + processed_node_type = NodeValue::Paragraph; + } + NodeValue::Link(ref link) => { + current_text.push_str(format!("[{}]({})", text, link.url).as_str()); + processed_node_type = NodeValue::Paragraph; + } + NodeValue::Image(ref image) => { + current_text.push_str(format!("![{}]({})", text, image.url).as_str()); + processed_node_type = NodeValue::Paragraph; + } + _ => (), + }, + NodeValue::Code(ref code) => { + let ticks = "`".repeat(code.num_backticks); + current_text.push_str(format!("{}{}{}", ticks, code.literal, ticks).as_str()); + } + NodeValue::CodeBlock(ref code_block) => { + let fence = if code_block.fenced { + format!( + "{}", + (code_block.fence_char as char) + .to_string() + .repeat(code_block.fence_length) + ) + } else { + "".to_string() + }; + + current_text + .push_str(format!("\n{}{}\n{}{}\n", fence, code_block.info, code_block.literal, fence).as_str()); + } + NodeValue::HtmlBlock(ref html_block) => { + current_text.push_str(format!("\n{}", html_block.literal).as_str()); + } + NodeValue::HtmlInline(ref html_inline) => { + current_text.push_str(html_inline.as_str()); + } + NodeValue::LineBreak => { + current_text.push_str("\n"); + } + NodeValue::SoftBreak => { + current_text.push_str("\n"); + } + // split text groups by --- + NodeValue::ThematicBreak => { + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + heading_parents.len(), + current_text.clone(), + max_node_text_size, + None, + ); + current_text = "".to_string(); + } + _ => (), + }); + + // Push the last text group + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + heading_parents.len(), + current_text.clone(), + max_node_text_size, + None, + ); + + Ok(text_groups) + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/mod.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/mod.rs new file mode 100644 index 000000000..ad6412e97 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/mod.rs @@ -0,0 +1,9 @@ +pub mod csv_parsing; +pub mod html_parsing; +pub mod json_parsing; +pub mod md_parsing; +pub mod pdf_parsing; +pub mod txt_parsing; + + +pub struct LocalFileParser {} \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/pdf_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/pdf_parsing.rs new file mode 100644 index 000000000..b374bd78c --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/pdf_parsing.rs @@ -0,0 +1,30 @@ +use crate::{shinkai_fs_error::ShinkaiFsError, simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup}}; + +use super::LocalFileParser; + +impl LocalFileParser { + pub fn process_pdf_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + use shinkai_ocr::pdf_parser::PDFParser; + + let pdf_parser = PDFParser::new().map_err(|_| ShinkaiFsError::FailedPDFParsing)?; + let parsed_pages = pdf_parser + .process_pdf_file(file_buffer) + .map_err(|_| ShinkaiFsError::FailedPDFParsing)?; + + let mut text_groups = Vec::new(); + + for page in parsed_pages.into_iter() { + for pdf_text in page.content.into_iter() { + ShinkaiFileParser::push_text_group_by_depth( + &mut text_groups, + 0, + pdf_text.text, + max_node_text_size, + Some(page.page_number.try_into().unwrap_or_default()), + ); + } + } + + Ok(text_groups) + } +} diff --git 
a/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs new file mode 100644 index 000000000..ca81ca3d0 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/local_parsing/txt_parsing.rs @@ -0,0 +1,151 @@ +use std::collections::HashMap; + +use regex::Regex; + +use super::LocalFileParser; +use crate::{shinkai_fs_error::ShinkaiFsError, simple_parser::{file_parser_helper::ShinkaiFileParser, text_group::TextGroup}}; + +impl LocalFileParser { + /// Attempts to process the provided json file into a list of TextGroups. + pub fn process_txt_file(file_buffer: Vec, max_node_text_size: u64) -> Result, ShinkaiFsError> { + let txt_string = String::from_utf8(file_buffer).map_err(|_| ShinkaiFsError::FailedTXTParsing)?; + let sentences = LocalFileParser::process_into_sentences(txt_string); + let text_groups = LocalFileParser::process_into_text_groups(sentences, max_node_text_size); + // for sentence in &sentences { + // println!("S: {}", sentence); + // } + // for text_group in &text_groups { + // println!("TG: {}", text_group.text); + // } + + Ok(text_groups) + } + + /// Build a non-hierarchical list of TextGroups using the sentences + pub fn process_into_text_groups(text_lines: Vec, max_node_text_size: u64) -> Vec { + let mut text_groups = Vec::new(); + let mut current_text = String::new(); + let mut current_metadata = HashMap::new(); + + for line in text_lines { + let (parsed_line, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(&line); + + if (parsed_line.len() as u64 + current_text.len() as u64) > max_node_text_size { + if !current_text.is_empty() { + text_groups.push( + TextGroup::new(current_text.clone(), current_metadata.clone(), None) + ); + current_text.clear(); + current_metadata.clear(); + } + if parsed_line.len() as u64 > max_node_text_size { + // If the line itself exceeds max_node_text_size, split it into chunks. + let chunks = if parsed_any_metadata { + ShinkaiFileParser::split_into_chunks_with_metadata( + &line, + max_node_text_size as usize + ) + } else { + ShinkaiFileParser::split_into_chunks(&line, max_node_text_size as usize) + }; + + for chunk in chunks { + let (parsed_chunk, chunk_metadata, _) = if parsed_any_metadata { + ShinkaiFileParser::parse_and_extract_metadata(&chunk) + } else { + (chunk, HashMap::new(), false) + }; + text_groups.push(TextGroup::new(parsed_chunk, chunk_metadata, None)); + } + } else { + current_text = parsed_line; + current_metadata.extend(metadata); + } + } else { + if !current_text.is_empty() { + current_text.push(' '); // space between sentences + } + current_text.push_str(&parsed_line); + current_metadata.extend(metadata); + } + } + + // Push the last segment + if !current_text.is_empty() { + text_groups.push(TextGroup::new(current_text, current_metadata.clone(), None)); + } + + text_groups + } + + /// Given a piece of text, split it into a list of sentences, doing its best to respect punctuation + /// and taking into account English-based exceptions. 
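To illustrate the sentence pipeline described above, a hedged sketch of driving `process_into_sentences` and `process_into_text_groups` directly (module paths assumed, sample text illustrative):

use shinkai_fs::simple_parser::local_parsing::LocalFileParser;

fn main() {
    let text = "The fix was merged at 9 a.m. today. It splits plain text into sentences. Short sentences are merged into larger groups.";
    // The splitter tries not to break on abbreviations such as "a.m." or "e.g.".
    let sentences = LocalFileParser::process_into_sentences(text.to_string());
    for sentence in &sentences {
        println!("- {}", sentence);
    }
    // Sentences are then packed into TextGroups bounded by max_node_text_size.
    let groups = LocalFileParser::process_into_text_groups(sentences, 80);
    println!("{} group(s)", groups.len());
}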
+ pub fn process_into_sentences(text: String) -> Vec { + let punctuation_marks = [ + ',', '.', ';', '-', '&', '(', '{', '<', '"', '\'', '`' + ]; + text.split('\n') + .filter(|line| !line.trim().is_empty() && line.trim().len() > 1) + .flat_map(|line| { + let trimmed_line = line.trim(); + + let re = Regex::new(ShinkaiFileParser::PURE_METADATA_REGEX).unwrap(); + let is_pure_metadata = re.is_match(trimmed_line) + && re + .find(trimmed_line) + .map(|m| m.start() == 0 && m.end() == trimmed_line.len()) + .unwrap_or(false); + + // Ensure each line ends with punctuation, or default to a newline + let line_with_ending = if is_pure_metadata + || punctuation_marks + .iter() + .any(|&mark| trimmed_line.ends_with(mark)) + { + trimmed_line.to_string() + } else { + format!("{}\n", trimmed_line) + }; + + Self::split_line_into_sentences(&line_with_ending) + }) + .collect() + } + + /// Splits a single line into sentences, considering common exceptions for English. + fn split_line_into_sentences(line: &str) -> Vec { + let mut sentences = Vec::new(); + let mut start = 0; + + // Common abbreviations in lowercase + let exceptions = [ + " mr.", " mrs.", " ms.", " dr.", " prof.", " gen.", " rep.", " sen.", " jr.", " sr.", + " ave.", " blvd.", " st.", " rd.", " ln.", " ter.", " ct.", " pl.", " p.o.", " a.m.", + " p.m.", " cm.", " kg.", " lb.", " oz.", " ft.", " in.", " mi.", " b.a.", " m.a.", + " ph.d.", " m.d.", " b.sc.", " m.sc.", " inc.", " ltd.", " co.", " corp.", " llc.", + " plc.", " et al.", " e.g.", " i.e.", " vs.", " viz.", " approx.", " dept.", " div.", + " est.", + ]; + + for (index, _) in line.match_indices(". ") { + let potential_end = index + 1; + let sentence = &line[start..potential_end]; + let sentence_end_lc = sentence.to_lowercase(); + + // Skip splitting if it matches an abbreviation + if exceptions.iter().any(|&exc| sentence_end_lc.ends_with(exc)) { + continue; + } + + sentences.push(sentence.trim().to_string()); + start = potential_end + 1; + } + + // Final leftover + if start < line.len() { + sentences.push(line[start..].trim().to_string()); + } + + sentences + } +} diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/mod.rs b/shinkai-libs/shinkai-fs/src/simple_parser/mod.rs new file mode 100644 index 000000000..595c80a8a --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/mod.rs @@ -0,0 +1,5 @@ +pub mod simple_parser; +pub mod local_parsing; +pub mod file_parser_helper; +pub mod text_group; +pub mod file_parser_grouping; \ No newline at end of file diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs b/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs new file mode 100644 index 000000000..fd90ed4c1 --- /dev/null +++ b/shinkai-libs/shinkai-fs/src/simple_parser/simple_parser.rs @@ -0,0 +1,104 @@ +/* + +- takes a file (filepath) +- checks if it exists +- reads the filetype and redirects to the appropriate parser depending on the filetype +- it gets a vec of chunks (or another structure) +- it returns that + +Use generator: &dyn EmbeddingGenerator for converting chunks to embeddings +also use the generator to know how big the chunks could be +*/ + +use shinkai_message_primitives::shinkai_utils::shinkai_path::ShinkaiPath; + +use crate::shinkai_fs_error::ShinkaiFsError; + +use std::{fmt, fs}; + +use super::{local_parsing::LocalFileParser, text_group::TextGroup}; + +pub struct SimpleParser; + +#[derive(Debug, PartialEq, Eq)] +enum SupportedFileType { + Txt, + Json, + Csv, + Html, + Md, + Pdf, + Xlsx, + Xls, +} + +impl SupportedFileType { + fn 
+
+impl SupportedFileType {
+    fn from_extension(extension: &str) -> Option<SupportedFileType> {
+        match extension {
+            "txt" => Some(SupportedFileType::Txt),
+            "json" => Some(SupportedFileType::Json),
+            "csv" => Some(SupportedFileType::Csv),
+            "html" => Some(SupportedFileType::Html),
+            "md" => Some(SupportedFileType::Md),
+            "pdf" => Some(SupportedFileType::Pdf),
+            "xlsx" => Some(SupportedFileType::Xlsx),
+            "xls" => Some(SupportedFileType::Xls),
+            _ => None,
+        }
+    }
+}
+
+impl fmt::Display for SupportedFileType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let file_type_str = match self {
+            SupportedFileType::Txt => "txt",
+            SupportedFileType::Json => "json",
+            SupportedFileType::Csv => "csv",
+            SupportedFileType::Html => "html",
+            SupportedFileType::Md => "md",
+            SupportedFileType::Pdf => "pdf",
+            SupportedFileType::Xlsx => "xlsx",
+            SupportedFileType::Xls => "xls",
+        };
+        write!(f, "{}", file_type_str)
+    }
+}
+
+impl SimpleParser {
+    pub fn parse_file(filepath: ShinkaiPath, max_node_text_size: u64) -> Result<Vec<TextGroup>, ShinkaiFsError> {
+        // check if file exists
+        if !filepath.exists() {
+            return Err(ShinkaiFsError::FileNotFoundWithPath(filepath.to_string()));
+        }
+
+        // extract file extension
+        let extension = filepath.extension();
+
+        if extension.is_none() {
+            return Err(ShinkaiFsError::UnsupportedFileType(filepath.to_string()));
+        }
+
+        // check if the file extension is supported
+        let file_type = SupportedFileType::from_extension(extension.unwrap())
+            .ok_or_else(|| ShinkaiFsError::UnsupportedFileType(filepath.to_string()))?;
+
+        // read file into memory
+        let file_buffer = fs::read(&filepath.as_path()).map_err(|e| ShinkaiFsError::FailedIO(e.to_string()))?;
+
+        // call the parser that matches the file extension
+        SimpleParser::process_file_by_extension(file_buffer, file_type, max_node_text_size)
+    }
+
+    fn process_file_by_extension(file_buffer: Vec<u8>, file_type: SupportedFileType, max_node_text_size: u64) -> Result<Vec<TextGroup>, ShinkaiFsError> {
+        match file_type {
+            SupportedFileType::Txt => LocalFileParser::process_txt_file(file_buffer, max_node_text_size),
+            SupportedFileType::Json => LocalFileParser::process_json_file(file_buffer, max_node_text_size),
+            SupportedFileType::Csv => LocalFileParser::process_csv_file(file_buffer, max_node_text_size),
+            SupportedFileType::Html => LocalFileParser::process_html_file(file_buffer, "filename", max_node_text_size),
+            SupportedFileType::Md => LocalFileParser::process_md_file(file_buffer, max_node_text_size),
+            SupportedFileType::Pdf => LocalFileParser::process_pdf_file(file_buffer, max_node_text_size),
+            _ => Err(ShinkaiFsError::UnsupportedFileType(file_type.to_string())),
+            // SupportedFileType::Xlsx | SupportedFileType::Xls => LocalFileParser::process_xlsx_file(file_buffer, max_node_text_size),
+        }
+    }
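+
+    // Hypothetical caller-side sketch (illustrative only; the path and the 400-byte
+    // limit below are made-up values):
+    //
+    //     let path = ShinkaiPath::from_string("docs/notes.txt".to_string());
+    //     let groups = SimpleParser::parse_file(path, 400)?;
+    //     for group in &groups {
+    //         println!("{} bytes: {}", group.text.len(), group.text);
+    //     }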
+}
diff --git a/shinkai-libs/shinkai-fs/src/simple_parser/text_group.rs b/shinkai-libs/shinkai-fs/src/simple_parser/text_group.rs
new file mode 100644
index 000000000..04fba8da1
--- /dev/null
+++ b/shinkai-libs/shinkai-fs/src/simple_parser/text_group.rs
@@ -0,0 +1,113 @@
+use serde::{Deserialize, Serialize};
+use std::collections::{HashMap, HashSet};
+
+use crate::simple_parser::file_parser_helper::ShinkaiFileParser;
+
+/// An intermediary type for processing content into Nodes held in VectorResources
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct TextGroup {
+    pub text: String,
+    pub metadata: HashMap<String, String>,
+    pub embedding: Option<Vec<f32>>,
+}
+
+impl TextGroup {
+    /// Creates a new instance of TextGroup
+    pub fn new(
+        text: String,
+        metadata: HashMap<String, String>,
+        embedding: Option<Vec<f32>>,
+    ) -> Self {
+        TextGroup {
+            text,
+            metadata,
+            embedding,
+        }
+    }
+
+    /// Creates a new instance of TextGroup with default empty values.
+    pub fn new_empty() -> Self {
+        TextGroup {
+            text: String::new(),
+            metadata: HashMap::new(),
+            embedding: None,
+        }
+    }
+
+    /// Prepares a string to be used to generate an Embedding for this TextGroup.
+    /// Extracts the most prevalent keywords from all sub-groups and appends them to
+    /// the end of the group's actual text.
+    pub fn format_text_for_embedding(&self, max_node_text_size: u64) -> String {
+        let mut keyword_string = String::new();
+        let base_string = &self.text;
+        let pre_keyword_length = base_string.len();
+
+        // Extract keywords from the TextGroup and its sub-groups
+        let keywords: Vec<String> = ShinkaiFileParser::extract_keywords(&vec![self.clone()], 1);
+
+        for keyword in keywords {
+            if pre_keyword_length + keyword_string.len() + keyword.len() <= max_node_text_size as usize {
+                keyword_string = format!("{}, {}", keyword_string, keyword);
+            } else {
+                break;
+            }
+        }
+
+        format!("{} Keywords: {}", base_string, keyword_string.trim_start_matches(", "))
+    }
+
+    /// Pushes data into this TextGroup and extracts metadata
+    pub fn push_data(&mut self, text: &str, page_number: Option<u32>) {
+        if !self.text.is_empty() {
+            self.text.push(' ');
+        }
+
+        let (parsed_text, metadata, parsed_any_metadata) = ShinkaiFileParser::parse_and_extract_metadata(text);
+        if parsed_any_metadata {
+            self.text.push_str(&parsed_text);
+            self.metadata.extend(metadata);
+        } else {
+            self.text.push_str(text);
+        }
+
+        if let Some(page_number) = page_number {
+            self.push_page_number(page_number);
+        }
+    }
+
+    /// Pushes a page number into this TextGroup
+    pub fn push_page_number(&mut self, page_number: u32) {
+        let mut unique_page_numbers: HashSet<u32> = HashSet::new();
+
+        if let Some(page_numbers_metadata) = self.metadata.get(&ShinkaiFileParser::page_numbers_metadata_key()) {
+            let page_numbers_metadata: Result<Vec<u32>, _> = page_numbers_metadata
+                .trim_matches(|c| c == '[' || c == ']')
+                .split(",")
+                .map(|n| n.trim().parse::<u32>())
+                .collect();
+
+            match page_numbers_metadata {
+                Ok(page_numbers) => {
+                    for page_number in page_numbers {
+                        unique_page_numbers.insert(page_number);
+                    }
+                }
+                Err(_) => {}
+            }
+        }
+
+        unique_page_numbers.insert(page_number);
+
+        self.metadata.insert(
+            ShinkaiFileParser::page_numbers_metadata_key(),
+            format!(
+                "[{}]",
+                unique_page_numbers
+                    .iter()
+                    .map(|n| n.to_string())
+                    .collect::<Vec<String>>()
+                    .join(", ")
+            ),
+        );
+    }
+}
diff --git a/shinkai-libs/shinkai-message-primitives/src/shinkai_utils/shinkai_path.rs b/shinkai-libs/shinkai-message-primitives/src/shinkai_utils/shinkai_path.rs
index a1b8186ae..80eb59d4c 100644
--- a/shinkai-libs/shinkai-message-primitives/src/shinkai_utils/shinkai_path.rs
+++ b/shinkai-libs/shinkai-message-primitives/src/shinkai_utils/shinkai_path.rs
@@ -77,6 +77,11 @@ impl ShinkaiPath {
             self.as_str()
         }
     }
+
+    /// Returns the extension of the path, if any.
+ pub fn extension(&self) -> Option<&str> { + self.path.extension().and_then(|ext| ext.to_str()) + } } // Implement Display for ShinkaiPath to easily print it @@ -135,4 +140,14 @@ mod tests { assert_eq!(absolute_outside.relative_path(), "/some/other/path"); env::remove_var("NODE_STORAGE_PATH"); } + + #[test] + #[serial] + fn test_extension() { + let path_with_extension = ShinkaiPath::from_string("word_files/christmas.docx".to_string()); + assert_eq!(path_with_extension.extension(), Some("docx")); + + let path_without_extension = ShinkaiPath::from_string("word_files/christmas".to_string()); + assert_eq!(path_without_extension.extension(), None); + } } diff --git a/shinkai-libs/shinkai-vector-resources/src/embedding_generator.rs b/shinkai-libs/shinkai-vector-resources/src/embedding_generator.rs index c7ef66880..c79d6c89d 100644 --- a/shinkai-libs/shinkai-vector-resources/src/embedding_generator.rs +++ b/shinkai-libs/shinkai-vector-resources/src/embedding_generator.rs @@ -4,9 +4,9 @@ use crate::resource_errors::VRError; use async_trait::async_trait; use lazy_static::lazy_static; -#[cfg(feature = "desktop-only")] + use reqwest::blocking::Client; -#[cfg(feature = "desktop-only")] + use reqwest::Client as AsyncClient; use serde::{Deserialize, Serialize}; use reqwest::ClientBuilder; @@ -72,14 +72,14 @@ pub trait EmbeddingGenerator: Sync + Send { } #[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)] -#[cfg(feature = "desktop-only")] + pub struct RemoteEmbeddingGenerator { pub model_type: EmbeddingModelType, pub api_url: String, pub api_key: Option, } -#[cfg(feature = "desktop-only")] + #[async_trait] impl EmbeddingGenerator for RemoteEmbeddingGenerator { /// Clones self and wraps it in a Box @@ -87,7 +87,7 @@ impl EmbeddingGenerator for RemoteEmbeddingGenerator { Box::new(self.clone()) } - #[cfg(feature = "desktop-only")] + /// Generate Embeddings for an input list of strings by using the external API. /// This method batch generates whenever possible to increase speed. /// Note this method is blocking. @@ -124,7 +124,7 @@ impl EmbeddingGenerator for RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generate an Embedding for an input string by using the external API. /// Note this method is blocking. fn generate_embedding_blocking(&self, input_string: &str, id: &str) -> Result { @@ -145,7 +145,7 @@ impl EmbeddingGenerator for RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generate an Embedding for an input string by using the external API. /// This method batch generates whenever possible to increase speed. async fn generate_embeddings( @@ -183,7 +183,7 @@ impl EmbeddingGenerator for RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generate an Embedding for an input string by using the external API. 
async fn generate_embedding(&self, input_string: &str, id: &str) -> Result { let input_strings = [input_string.to_string()]; @@ -214,7 +214,7 @@ impl EmbeddingGenerator for RemoteEmbeddingGenerator { } } -#[cfg(feature = "desktop-only")] + impl RemoteEmbeddingGenerator { /// Create a RemoteEmbeddingGenerator pub fn new(model_type: EmbeddingModelType, api_url: &str, api_key: Option) -> RemoteEmbeddingGenerator { @@ -266,7 +266,7 @@ impl RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generates embeddings using Hugging Face's Text Embedding Interface server /// pub async fn generate_embedding_open_ai(&self, input_string: &str, id: &str) -> Result { pub async fn generate_embedding_ollama( @@ -366,7 +366,7 @@ impl RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generate an Embedding for an input string by using the external Ollama API. fn generate_embedding_ollama_blocking(&self, input_string: &str, id: &str) -> Result { // Prepare the request body @@ -416,7 +416,7 @@ impl RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generates embeddings using Hugging Face's Text Embedding Interface server pub async fn generate_embedding_tei( &self, @@ -528,7 +528,7 @@ impl RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generates embeddings using a Hugging Face Text Embeddings Inference server fn generate_embedding_tei_blocking( &self, @@ -621,7 +621,7 @@ impl RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generate an Embedding for an input string by using the external OpenAI-matching API. pub async fn generate_embedding_open_ai(&self, input_string: &str, id: &str) -> Result { // Prepare the request body @@ -673,7 +673,7 @@ impl RemoteEmbeddingGenerator { } } - #[cfg(feature = "desktop-only")] + /// Generate an Embedding for an input string by using the external OpenAI-matching API. fn generate_embedding_open_ai_blocking(&self, input_string: &str, id: &str) -> Result { // Prepare the request body diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser.rs index 741462ad1..69d3319ed 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser.rs @@ -1,6 +1,6 @@ use super::file_parser_types::TextGroup; use super::local_parsing::LocalFileParser; -#[cfg(feature = "desktop-only")] + use crate::data_tags::DataTag; use crate::embedding_generator::EmbeddingGenerator; use crate::embeddings::Embedding; @@ -9,20 +9,20 @@ use crate::source::DistributionInfo; use crate::source::TextChunkingStrategy; use crate::source::VRSourceReference; use crate::vector_resource::{BaseVectorResource, DocumentVectorResource, VectorResourceCore}; -#[cfg(feature = "desktop-only")] + use std::{future::Future, pin::Pin}; pub struct ShinkaiFileParser; impl ShinkaiFileParser { - #[cfg(feature = "desktop-only")] + pub async fn initialize_local_file_parser() -> Result<(), Box> { use shinkai_ocr::image_parser::ImageParser; ImageParser::check_and_download_dependencies().await } - #[cfg(feature = "desktop-only")] + /// Processes the input file into a BaseVectorResource. pub async fn process_file_into_resource( file_buffer: Vec, @@ -51,7 +51,7 @@ impl ShinkaiFileParser { .await } - #[cfg(feature = "desktop-only")] + /// Processes the input file into a BaseVectorResource. 
pub fn process_file_into_resource_blocking( file_buffer: Vec, @@ -84,7 +84,7 @@ impl ShinkaiFileParser { ) } - #[cfg(feature = "desktop-only")] + /// Processes the input file into a list of `TextGroup` with no embedding generated yet. pub async fn process_file_into_text_groups( file_buffer: Vec, @@ -95,7 +95,7 @@ impl ShinkaiFileParser { LocalFileParser::process_file_into_grouped_text(file_buffer, file_name, max_node_text_size, source) } - #[cfg(feature = "desktop-only")] + /// Processes the input file into a list of `TextGroup` with no embedding generated yet. pub fn process_file_into_text_groups_blocking( file_buffer: Vec, @@ -106,7 +106,7 @@ impl ShinkaiFileParser { LocalFileParser::process_file_into_grouped_text(file_buffer, file_name, max_node_text_size, source) } - #[cfg(feature = "desktop-only")] + /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource pub async fn process_groups_into_resource( text_groups: Vec, @@ -132,7 +132,7 @@ impl ShinkaiFileParser { .await } - #[cfg(feature = "desktop-only")] + /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource. pub fn process_groups_into_resource_blocking( text_groups: Vec, @@ -157,7 +157,7 @@ impl ShinkaiFileParser { ) } - #[cfg(feature = "desktop-only")] + /// Processes an ordered list of `TextGroup`s into a ready-to-go BaseVectorResource. /// Allows specifying a custom collection function. pub async fn process_groups_into_resource_with_custom_collection( @@ -194,7 +194,7 @@ impl ShinkaiFileParser { Ok(resource) } - #[cfg(feature = "desktop-only")] + /// Processes an ordered list of `TextGroup`s into a /// a ready-to-go BaseVectorResource. Allows specifying a custom collection function. pub fn process_groups_into_resource_blocking_with_custom_collection( @@ -234,7 +234,7 @@ impl ShinkaiFileParser { Ok(resource) } - #[cfg(feature = "desktop-only")] + /// Recursively processes all text groups & their sub groups into DocumentResources. /// This method assumes your text groups already have embeddings generated for them. fn process_new_doc_resource_with_embeddings_already_generated<'a>( @@ -303,7 +303,7 @@ impl ShinkaiFileParser { }) } - #[cfg(feature = "desktop-only")] + /// Recursively processes all text groups & their sub groups into DocumentResources. /// This method assumes your text groups already have embeddings generated for them. fn process_new_doc_resource_blocking_with_embeddings_already_generated( diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_grouping.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_grouping.rs index a7ce5547e..fc815872e 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_grouping.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/file_parser_grouping.rs @@ -6,7 +6,7 @@ use crate::resource_errors::VRError; use keyphrases::KeyPhraseExtractor; use regex::Regex; use std::collections::HashMap; -#[cfg(feature = "desktop-only")] + use std::{future::Future, pin::Pin}; impl ShinkaiFileParser { @@ -58,7 +58,7 @@ impl ShinkaiFileParser { } } - #[cfg(feature = "desktop-only")] + /// Recursively goes through all of the text groups and batch generates embeddings /// for all of them in parallel, processing up to 10 futures at a time. pub fn generate_text_group_embeddings( @@ -132,7 +132,7 @@ impl ShinkaiFileParser { }) } - #[cfg(feature = "desktop-only")] + /// Recursively goes through all of the text groups and batch generates embeddings /// for all of them. 
pub fn generate_text_group_embeddings_blocking( diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs index d72fb4f92..8cc48e6d7 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/local_parsing.rs @@ -39,13 +39,13 @@ impl LocalFileParser { LocalFileParser::process_html_file(file_buffer, &file_name, max_node_text_size) } - #[cfg(feature = "desktop-only")] + DocumentFileType::Md => LocalFileParser::process_md_file(file_buffer, max_node_text_size), - #[cfg(feature = "desktop-only")] + DocumentFileType::Pdf => LocalFileParser::process_pdf_file(file_buffer, max_node_text_size), - #[cfg(feature = "desktop-only")] + DocumentFileType::Xlsx | DocumentFileType::Xls => { LocalFileParser::process_xlsx_file(file_buffer, max_node_text_size) } diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs index f5960eb88..90265bc20 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/md_parsing.rs @@ -1,4 +1,4 @@ -#[cfg(feature = "desktop-only")] + use comrak::{ nodes::{AstNode, ListDelimType, ListType, NodeValue}, parse_document, Arena, Options, @@ -12,7 +12,7 @@ use crate::{ use super::LocalFileParser; impl LocalFileParser { - #[cfg(feature = "desktop-only")] + pub fn process_md_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { let md_string = String::from_utf8(file_buffer).map_err(|_| VRError::FailedMDParsing)?; diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs index da7070320..e9ccadde8 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/pdf_parsing.rs @@ -1,4 +1,4 @@ -#[cfg(feature = "desktop-only")] + use crate::{ file_parser::{file_parser::ShinkaiFileParser, file_parser_types::TextGroup}, resource_errors::VRError, @@ -7,7 +7,7 @@ use crate::{ use super::LocalFileParser; impl LocalFileParser { - #[cfg(feature = "desktop-only")] + pub fn process_pdf_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { use shinkai_ocr::pdf_parser::PDFParser; diff --git a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/xlsx_parsing.rs b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/xlsx_parsing.rs index f96db0b06..cb749d244 100644 --- a/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/xlsx_parsing.rs +++ b/shinkai-libs/shinkai-vector-resources/src/file_parser/local_parsing/xlsx_parsing.rs @@ -9,7 +9,7 @@ use crate::{ use super::LocalFileParser; impl LocalFileParser { - #[cfg(feature = "desktop-only")] + pub fn process_xlsx_file(file_buffer: Vec, max_node_text_size: u64) -> Result, VRError> { let spreadsheet = umya_spreadsheet::reader::xlsx::read_reader(Cursor::new(file_buffer), true) .map_err(|_| VRError::FailedXLSXParsing)?; diff --git a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource.rs b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource.rs index 
2ca20b128..c36536f77 100644 --- a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource.rs +++ b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource.rs @@ -1,9 +1,9 @@ pub use super::vector_resource_search::VectorResourceSearch; use super::OrderedVectorResource; use crate::data_tags::DataTagIndex; -#[cfg(feature = "desktop-only")] + use crate::embedding_generator::EmbeddingGenerator; -#[cfg(feature = "desktop-only")] + use crate::embedding_generator::RemoteEmbeddingGenerator; use crate::embeddings::Embedding; use crate::metadata_index::MetadataIndex; @@ -206,7 +206,7 @@ pub trait VectorResourceCore: Send + Sync { self.set_merkle_root(root_hash.to_hex().to_string()) } - #[cfg(feature = "desktop-only")] + /// Regenerates and updates the resource's embedding using the name/description/source and the provided keywords. /// If keyword_list is None, will use the resource's set keywords (enables flexibility of which keywords get added to which embedding) async fn update_resource_embedding( @@ -221,7 +221,7 @@ pub trait VectorResourceCore: Send + Sync { Ok(()) } - #[cfg(feature = "desktop-only")] + /// Regenerates and updates the resource's embedding using the name/description/source and the provided keywords. /// If keyword_list is None, will use the resource's set keywords (enables flexibility of which keywords get added to which embedding) fn update_resource_embedding_blocking( @@ -251,7 +251,7 @@ pub trait VectorResourceCore: Send + Sync { self.set_resource_id(hashed_string) } - #[cfg(feature = "desktop-only")] + /// Initializes a `RemoteEmbeddingGenerator` that is compatible with this VectorResource /// (targets the same model and interface for embedding generation). Of note, you need /// to make sure the api_url/api_key match for the model used. diff --git a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_search.rs b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_search.rs index c8b355eec..ddd4e8b23 100644 --- a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_search.rs +++ b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_search.rs @@ -1,7 +1,7 @@ use super::VectorResourceCore; -#[cfg(feature = "desktop-only")] + use crate::embedding_generator::EmbeddingGenerator; -#[cfg(feature = "desktop-only")] + use crate::embedding_generator::RemoteEmbeddingGenerator; use crate::embeddings::Embedding; use crate::file_parser::file_parser::ShinkaiFileParser; @@ -19,7 +19,7 @@ use std::vec; #[async_trait] pub trait VectorResourceSearch: VectorResourceCore { - #[cfg(feature = "desktop-only")] + /// Fetches percent_to_verify (between 0.0 - 1.0) of random nodes from within the VectorResource /// and validates that said node's included embeddings in the VectorResource are correct. async fn verify_internal_embeddings_coherence( @@ -192,7 +192,7 @@ pub trait VectorResourceSearch: VectorResourceCore { } } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic vector search that returns the most similar nodes based on the input query String. /// Dynamic Vector Searches support internal VectorResources with different Embedding models by automatically generating /// the query Embedding from the input_query for each model. Dynamic Vector Searches are always Exhaustive. 
@@ -214,7 +214,7 @@ pub trait VectorResourceSearch: VectorResourceCore { .await } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic vector search that returns the most similar nodes based on the input query String. /// Dynamic Vector Searches support internal VectorResources with different Embedding models by automatically generating /// the query Embedding from the input_query for each model. Dynamic Vector Searches are always Exhaustive. diff --git a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs index c15abcca7..85c9ac7b1 100644 --- a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs +++ b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vector_resource_types.rs @@ -909,7 +909,7 @@ impl VRKeywords { self.keywords_embedding.take() } - #[cfg(feature = "desktop-only")] + /// Asynchronously regenerates and updates the keywords' embedding using the provided keywords. pub async fn update_keywords_embedding(&mut self, generator: &dyn EmbeddingGenerator) -> Result<(), VRError> { let formatted_keywords = format!("Keywords: [{}]", self.keyword_list.join(",")); @@ -918,7 +918,7 @@ impl VRKeywords { Ok(()) } - #[cfg(feature = "desktop-only")] + /// Synchronously regenerates and updates the keywords' embedding using the provided keywords. pub fn update_keywords_embedding_blocking(&mut self, generator: &dyn EmbeddingGenerator) -> Result<(), VRError> { let formatted_keywords = format!("Keywords: [{}]", self.keyword_list.join(",")); diff --git a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vrpack.rs b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vrpack.rs index 43f5bffdb..f21879ca1 100644 --- a/shinkai-libs/shinkai-vector-resources/src/vector_resource/vrpack.rs +++ b/shinkai-libs/shinkai-vector-resources/src/vector_resource/vrpack.rs @@ -4,7 +4,7 @@ use super::{ deep_search_scores_average_out, BaseVectorResource, MapVectorResource, Node, NodeContent, RetrievedNode, ScoringMode, TraversalMethod, TraversalOption, VRKai, VRPath, VRSourceReference, VectorSearchMode, }; -#[cfg(feature = "desktop-only")] + use crate::embedding_generator::{EmbeddingGenerator, RemoteEmbeddingGenerator}; use crate::model_type::EmbeddingModelTypeString; use crate::{embeddings::Embedding, resource_errors::VRError}; @@ -559,7 +559,7 @@ impl VRPack { Ok(sorted_retrieved_nodes) } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic vector search within the VRPack and returns the most similar VRKais based on the input query String. /// This allows for multiple embedding models to be used within the VRPack, as it automatically generates the input query embedding. pub async fn dynamic_vector_search_vrkai( @@ -580,7 +580,7 @@ impl VRPack { .await } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic vector search within the VRPack and returns the most similar VRKais based on the input query String. /// Supports customizing the search starting path/traversal options. /// This allows for multiple embedding models to be used within the VRPack, as it automatically generates the input query embedding. @@ -607,7 +607,7 @@ impl VRPack { Ok(vrkais) } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic vector search within the VRPack and returns the most similar (VRKai, score) based on the input query String. /// Supports customizing the search starting path/traversal options. 
/// This allows for multiple embedding models to be used within the VRPack, as it automatically generates the input query embedding. @@ -649,7 +649,7 @@ impl VRPack { Ok(vrkais_with_score) } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic deep vector search within the VRPack, returning the highest scored `RetrievedNode`s across /// the VRKais stored in the VRPack. /// This allows for multiple embedding models to be used within the VRPack, as it automatically generates the input query embedding. @@ -676,7 +676,7 @@ impl VRPack { .await } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic deep vector search within the VRPack, returning the highest scored `RetrievedNode`s across /// the VRKais stored in the VRPack. This allows for multiple embedding models to be used within the VRPack, as it automatically generates the input query embedding. /// Customized allows specifying options for the first top-level search for VRKais, and then "deep" options/method for the vector searches into the VRKais to acquire the `RetrievedNode`s. @@ -710,7 +710,7 @@ impl VRPack { .map(|retrieved_nodes| retrieved_nodes.into_iter().map(|(ret_node, _)| ret_node).collect()) } - #[cfg(feature = "desktop-only")] + /// Performs a dynamic deep vector search within the VRPack, returning the highest scored `RetrievedNode`s across /// the VRKais stored in the VRPack (with the relative VRPath of the VRKai in the VRPack). This allows for multiple embedding models to be used within the VRPack, as it automatically generates the input query embedding. /// Customized allows specifying options for the first top-level search for VRKais, and then "deep" options/method for the vector searches into the VRKais to acquire the `RetrievedNode`s.