embedding super naive approach (#90)
* so far

* Small Fixes (#92)

* Updated db keys to all use `:::`

* VRSource update & added doc/image distinction

* Implemented methods for fetching all res pointers

* Added VRSource to resource pointers

* embeddings in the query!

* checkpointing

* Vector Resource Path System (#93)

* Updated db keys to all use `:::`

* VRSource update & added doc/image distinction

* Implemented methods for fetching all res pointers

* Added VRSource to resource pointers

* Implemented basic path logic on vector searches

* Implemented VRPath struct

* Removed needless retrieval depth + path bugfix

* Moved to id based paths

* Fixed/added depth checking tests

* Fixed exhaustive search logic

* Added path search tests

* fix tests

---------

Co-authored-by: Robert Kornacki <[email protected]>
nicarq and robkorn authored Oct 6, 2023
1 parent 42c0a14 commit 7e38b98
Showing 21 changed files with 927 additions and 455 deletions.
Binary file added files/Zeko_Mina_Rollup.pdf
@@ -1,12 +1,14 @@
use serde::{Deserialize, Serialize};
use shinkai_vector_resources::{
base_vector_resources::BaseVectorResource, vector_resource_types::VectorResourcePointer, source::VRSource,
base_vector_resources::BaseVectorResource,
source::{SourceFile, VRSource},
vector_resource_types::VectorResourcePointer,
};

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LocalScopeEntry {
pub resource: BaseVectorResource,
pub source: VRSource,
pub source: SourceFile,
}

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
13 changes: 9 additions & 4 deletions shinkai-libs/shinkai-vector-resources/src/document_resource.rs
@@ -4,7 +4,7 @@ use crate::embeddings::Embedding;
use crate::model_type::{EmbeddingModelType, RemoteModel};
use crate::resource_errors::VectorResourceError;
use crate::source::VRSource;
use crate::vector_resource::{DataChunk, DataContent, RetrievedDataChunk, TraversalMethod, VectorResource};
use crate::vector_resource::{DataChunk, DataContent, RetrievedDataChunk, TraversalMethod, VRPath, VectorResource};
use serde_json;
use std::collections::HashMap;

@@ -94,6 +94,11 @@ impl VectorResource for DocumentVectorResource {
let index = id.checked_sub(1).ok_or(VectorResourceError::InvalidChunkId)? as usize;
Ok(self.data_chunks[index].clone())
}

/// Returns all data chunks in the DocumentVectorResource
fn get_all_data_chunks(&self) -> Vec<DataChunk> {
self.data_chunks.clone()
}
}

impl DocumentVectorResource {
@@ -148,7 +153,7 @@ impl DocumentVectorResource {
query: Embedding,
proximity_window: u64,
) -> Result<Vec<RetrievedDataChunk>, VectorResourceError> {
let search_results = self.vector_search_with_traversal(query, 1, &TraversalMethod::UntilDepth(0));
let search_results = self.vector_search_with_options(query, 1, &TraversalMethod::UntilDepth(0), None);
let most_similar_chunk = search_results.first().ok_or(VectorResourceError::VectorResourceEmpty)?;
let most_similar_id = most_similar_chunk
.chunk
@@ -180,7 +185,7 @@ impl DocumentVectorResource {
chunk: chunk.clone(),
score: 0.00,
resource_pointer: self.get_resource_pointer(),
retrieval_depth: 0,
retrieval_path: VRPath::new(),
});
}
}
@@ -204,7 +209,7 @@ impl DocumentVectorResource {
chunk: chunk.clone(),
score: 0.00,
resource_pointer: self.get_resource_pointer(),
retrieval_depth: 0,
retrieval_path: VRPath::new(),
}),
_ => (),
}
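The changes above give DocumentVectorResource a `get_all_data_chunks` trait method and switch RetrievedDataChunk from a numeric `retrieval_depth` to a full `retrieval_path: VRPath`. The sketch below is illustrative only: it assumes `doc` is an already-populated DocumentVectorResource and `query_embedding` an existing Embedding, the module paths are inferred from the file layout, and the meaning of the trailing `None` argument of `vector_search_with_options` is not shown in this diff.

use shinkai_vector_resources::document_resource::DocumentVectorResource;
use shinkai_vector_resources::embeddings::Embedding;
use shinkai_vector_resources::vector_resource::{TraversalMethod, VectorResource};

fn inspect(doc: &DocumentVectorResource, query_embedding: Embedding) {
    // New trait method: clones out every DataChunk held by the resource.
    let all_chunks = doc.get_all_data_chunks();
    println!("resource holds {} chunks", all_chunks.len());

    // vector_search_with_options supersedes vector_search_with_traversal;
    // the trailing None leaves the new optional argument unset (semantics not shown in this hunk).
    let results = doc.vector_search_with_options(query_embedding, 1, &TraversalMethod::UntilDepth(0), None);
    if let Some(top) = results.first() {
        // RetrievedDataChunk now reports where the chunk was found as a VRPath.
        println!("score {} at path {}", top.score, top.retrieval_path);
    }
}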
9 changes: 7 additions & 2 deletions shinkai-libs/shinkai-vector-resources/src/map_resource.rs
@@ -4,7 +4,7 @@ use crate::embeddings::Embedding;
use crate::model_type::{EmbeddingModelType, RemoteModel};
use crate::resource_errors::VectorResourceError;
use crate::source::VRSource;
use crate::vector_resource::{DataChunk, DataContent, RetrievedDataChunk, VectorResource};
use crate::vector_resource::{DataChunk, DataContent, RetrievedDataChunk, VRPath, VectorResource};
use serde_json;
use std::collections::HashMap;

@@ -91,6 +91,11 @@ impl VectorResource for MapVectorResource {
.ok_or(VectorResourceError::InvalidChunkId)?
.clone())
}

/// Returns all data chunks in the MapVectorResource
fn get_all_data_chunks(&self) -> Vec<DataChunk> {
self.data_chunks.values().cloned().collect()
}
}

impl MapVectorResource {
@@ -151,7 +156,7 @@ impl MapVectorResource {
chunk: chunk.clone(),
score: 0.00,
resource_pointer: self.get_resource_pointer(),
retrieval_depth: 0,
retrieval_path: VRPath::new(),
}),
_ => (),
}
4 changes: 4 additions & 0 deletions shinkai-libs/shinkai-vector-resources/src/resource_errors.rs
@@ -2,6 +2,8 @@ use serde_json::Error as SerdeError;
use std::error::Error;
use std::fmt;

use crate::vector_resource::VRPath;

#[derive(Debug, PartialEq)]
pub enum VectorResourceError {
InvalidChunkId,
@@ -17,6 +19,7 @@ pub enum VectorResourceError {
RequestFailed(String),
NoEmbeddingProvided,
DataIsNonMatchingType,
InvalidVRPath(VRPath),
}

impl fmt::Display for VectorResourceError {
@@ -42,6 +45,7 @@ impl fmt::Display for VectorResourceError {
VectorResourceError::DataIsNonMatchingType => {
write!(f, "Data inside of the DataChunk is of a different type than requested.")
}
VectorResourceError::InvalidVRPath(ref p) => write!(f, "Vector Resource Path is invalid: {}", p),
}
}
}
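The new `InvalidVRPath` variant carries the offending path so it can be reported verbatim. A minimal illustration of how it renders (the construction below is made up for demonstration, not taken from this commit; module paths assumed from the file layout):

use shinkai_vector_resources::resource_errors::VectorResourceError;
use shinkai_vector_resources::vector_resource::VRPath;

fn main() {
    // VRPath::new() builds an empty path, as also used for `retrieval_path` elsewhere in this commit.
    let err = VectorResourceError::InvalidVRPath(VRPath::new());
    // Uses the Display arm added above; the exact rendering of an empty path is not shown in this diff.
    println!("{}", err);
}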
111 changes: 82 additions & 29 deletions shinkai-libs/shinkai-vector-resources/src/source.rs
@@ -1,11 +1,12 @@
use serde::{Deserialize, Serialize};
use std::fmt;

use crate::resource_errors::VectorResourceError;

/// The source of a Vector Resource as either the file contents of the source file itself,
/// or a pointer to the source file (either external such as URL, or a FileRef)
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub enum VRSource {
File(SourceFile),
Reference(SourceReference),
None,
}
@@ -14,7 +15,6 @@ impl VRSource {
/// Formats a printable string based on the source
pub fn format_source_string(&self) -> String {
match self {
VRSource::File(file) => file.format_source_string(),
VRSource::Reference(pointer) => pointer.format_source_string(),
VRSource::None => String::from("None"),
}
@@ -39,10 +39,19 @@
VRSource::Reference(SourceReference::Other(other))
}

/// Creates a VRSource reference using a SourceFile itself
/// Do note, this will store the SourceFile
pub fn new_file(file: SourceFile) -> Self {
VRSource::File(file)
/// Creates a VRSource which represents no/unknown source.
pub fn none() -> Self {
VRSource::None
}

/// Serializes the VRSource to a JSON string
pub fn to_json(&self) -> Result<String, VectorResourceError> {
serde_json::to_string(self).map_err(|_| VectorResourceError::FailedJSONParsing)
}

/// Deserializes a VRSource from a JSON string
pub fn from_json(json: &str) -> Result<Self, VectorResourceError> {
serde_json::from_str(json).map_err(|_| VectorResourceError::FailedJSONParsing)
}
}
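With the `File` variant gone, a VRSource is now either a `Reference` or `None`, and the new `to_json`/`from_json` helpers cover persistence. A minimal round-trip sketch using only the items visible in this hunk (module paths assumed from the file layout):

use shinkai_vector_resources::resource_errors::VectorResourceError;
use shinkai_vector_resources::source::VRSource;

fn roundtrip() -> Result<(), VectorResourceError> {
    // none() is the new constructor for an unknown/absent source.
    let source = VRSource::none();
    let json = source.to_json()?;
    let restored = VRSource::from_json(&json)?;
    // VRSource derives PartialEq, so the round trip can be checked directly.
    assert_eq!(source, restored);
    Ok(())
}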

@@ -101,7 +110,7 @@ pub struct SourceFileReference {
impl SourceFileReference {
/// The default key for this file in the Shinkai DB
pub fn shinkai_db_key(&self) -> String {
format!("{}:{}", self.file_name, self.content_hash)
format!("{}:::{}", self.file_name, self.content_hash)
}

pub fn format_source_string(&self) -> String {
@@ -111,42 +120,86 @@ impl SourceFileReference {

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub enum SourceFileType {
Document(SourceDocumentType),
Image(SourceImageType),
}

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub enum SourceImageType {
Png,
Jpeg,
Gif,
Bmp,
Tiff,
Svg,
Webp,
}

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub enum SourceDocumentType {
Pdf,
Md,
Txt,
Epub,
Doc,
Docx,
Rtf, // Rich Text Format
Odt, // OpenDocument Text Document
Html, // HTML Document
Csv, // Comma-Separated Values
Xls, // Excel Spreadsheet
Xlsx, // Excel Open XML Spreadsheet
Ppt, // PowerPoint Presentation
Pptx, // PowerPoint Open XML Presentation
Rtf,
Odt,
Html,
Csv,
Xls,
Xlsx,
Ppt,
Pptx,
}

impl fmt::Display for SourceFileType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SourceFileType::Document(doc_type) => write!(f, "{}", doc_type),
SourceFileType::Image(img_type) => write!(f, "{}", img_type),
}
}
}

impl fmt::Display for SourceImageType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}",
match self {
SourceImageType::Png => "png",
SourceImageType::Jpeg => "jpeg",
SourceImageType::Gif => "gif",
SourceImageType::Bmp => "bmp",
SourceImageType::Tiff => "tiff",
SourceImageType::Svg => "svg",
SourceImageType::Webp => "webp",
}
)
}
}

impl fmt::Display for SourceDocumentType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}",
match self {
SourceFileType::Pdf => "pdf",
SourceFileType::Md => "md",
SourceFileType::Txt => "txt",
SourceFileType::Epub => "epub",
SourceFileType::Doc => "doc",
SourceFileType::Docx => "docx",
SourceFileType::Rtf => "rtf",
SourceFileType::Odt => "odt",
SourceFileType::Html => "html",
SourceFileType::Csv => "csv",
SourceFileType::Xls => "xls",
SourceFileType::Xlsx => "xlsx",
SourceFileType::Ppt => "ppt",
SourceFileType::Pptx => "pptx",
SourceDocumentType::Pdf => "pdf",
SourceDocumentType::Md => "md",
SourceDocumentType::Txt => "txt",
SourceDocumentType::Epub => "epub",
SourceDocumentType::Doc => "doc",
SourceDocumentType::Docx => "docx",
SourceDocumentType::Rtf => "rtf",
SourceDocumentType::Odt => "odt",
SourceDocumentType::Html => "html",
SourceDocumentType::Csv => "csv",
SourceDocumentType::Xls => "xls",
SourceDocumentType::Xlsx => "xlsx",
SourceDocumentType::Ppt => "ppt",
SourceDocumentType::Pptx => "pptx",
}
)
}
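The flat SourceFileType enum is now split into Document and Image sub-enums, with SourceFileType's Display delegating to whichever inner enum is held. A quick illustrative check (not part of the commit; module path assumed from the file layout):

use shinkai_vector_resources::source::{SourceDocumentType, SourceFileType, SourceImageType};

fn main() {
    let doc = SourceFileType::Document(SourceDocumentType::Pdf);
    let img = SourceFileType::Image(SourceImageType::Png);
    // Both still render as the bare extension string.
    assert_eq!(doc.to_string(), "pdf");
    assert_eq!(img.to_string(), "png");
}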