Skip to content

Commit

Permalink
LanceDB
Browse files Browse the repository at this point in the history
  • Loading branch information
awhiteside1 committed Sep 16, 2024
1 parent 10adc6d commit 187412d
Show file tree
Hide file tree
Showing 13 changed files with 7,529 additions and 621 deletions.
1 change: 1 addition & 0 deletions biome.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"$schema": "https://biomejs.dev/schemas/1.8.3/schema.json",
"files": {
"ignore": ["playground/**"],
"ignoreUnknown": true
},
"vcs": {
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"vitest": "^2.1.1",
"typescript": "^5.6.2",
"unbuild": "^2.0.0",
"payload": "^3.0.0-beta.106"
"payload": "beta"
},
"prettier": {}
}
7 changes: 5 additions & 2 deletions packages/semantic-search/e2e/_setup/createConfig.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import { mongooseAdapter } from '@payloadcms/db-mongodb'
import defu from 'defu'
import { MongoMemoryReplSet } from 'mongodb-memory-server'
import type { CollectionConfig, Config } from 'payload'
import { uid } from 'radash'
import { inject } from 'vitest'
import type { VectorDB } from '../../src'

Expand All @@ -26,6 +24,11 @@ export const givenAnEnvironment = (base: Partial<Config>): Config => {
// }),
db: mongooseAdapter({ url: inject('mongoURL') }),
plugins: [],
logger: {
options: {
enabled: false,
},
},
}

return defu(base, config) as Config
Expand Down
112 changes: 98 additions & 14 deletions packages/semantic-search/e2e/index.spec.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import * as fs from 'node:fs/promises'
import * as os from 'node:os'
import { mongooseAdapter } from '@payloadcms/db-mongodb'
import { MongoMemoryReplSet } from 'mongodb-memory-server'
import { join } from 'pathe'
import payload, { buildConfig, type Payload } from 'payload'
import { list } from 'radash'
import { afterAll, beforeAll, expect } from 'vitest'
import { list, sleep, uid } from 'radash'
import { afterEach, beforeEach, expect } from 'vitest'
import { semanticSearchPlugin } from '../src'
import { LanceDB } from '../src/components/lancedb'
import {
givenACollectionConfig,
givenAVectorDB,
Expand All @@ -12,7 +16,7 @@ import {

describe('Semantic Search', async () => {
let instance: Payload
const mongo = await MongoMemoryReplSet.create()
let mongo: MongoMemoryReplSet

const spys = {
embeddingSpy: vi.fn(() =>
Expand All @@ -21,7 +25,13 @@ describe('Semantic Search', async () => {
upsertSpy: vi.fn(),
}

beforeAll(async () => {
beforeEach(async () => {
mongo = await MongoMemoryReplSet.create({ replSet: { dbName: uid(7) } })
})
afterEach(async () => {
await mongo.stop()
})
it('should insert a vector on create', async () => {
const environment = givenAnEnvironment({
collections: [givenACollectionConfig({ slug: 'myCollection' })],
plugins: [
Expand All @@ -30,33 +40,107 @@ describe('Semantic Search', async () => {
indexableFields: ['myCollection.description'],
enabled: true,
dimensions: 768,
embeddingFn: spys.embeddingSpy,
}),
],
db: mongooseAdapter({ mongoMemoryServer: mongo, url: mongo.getUri() }),
db: mongooseAdapter({
mongoMemoryServer: mongo,
url: mongo.getUri(uid(7)),
}),
})
instance = await payload.init({
config: buildConfig(environment),
loggerOptions: { enabled: false },
})
})

afterAll(async () => {
await mongo.stop()
})
it('should insert a vector on create', async () => {
const item = await instance.create({
collection: 'myCollection',
data: { description: 'hello' },
})
expect(item.id).toBeTruthy()
expect(spys.embeddingSpy).toHaveBeenCalledWith('hello')
expect(spys.upsertSpy).toHaveBeenCalledWith(
expect.objectContaining({
documentId: item.id,
collection: 'myCollection',
field: 'description',
}),
)
// @ts-ignore
await payload.db?.destroy()
})
it.skipIf(process.env.CI)('should work with LanceDB', async () => {
const tmp = await fs.mkdtemp(join(os.tmpdir(), uid(5)))
const lance = await LanceDB.create(tmp)
const environment = givenAnEnvironment({
collections: [givenACollectionConfig({ slug: 'myCollection' })],
plugins: [
semanticSearchPlugin({
vectorDB: lance,
indexableFields: ['myCollection.description'],
enabled: true,
dimensions: 768,
}),
],
db: mongooseAdapter({
mongoMemoryServer: mongo,
url: mongo.getUri(uid(7)),
}),
})
instance = await payload.init({
config: buildConfig(environment),
})
const spy = vi.spyOn(lance, 'upsert')

const descriptions = [
'For a gooey rich soup, add some bourbon and lime.',
'Lobster salad has to have a chilled, dark blood oranges component.',
'Honey soup is just not the same without sugar and ripe fluffy pork butts.',
'Treasure, greed, and power. All seas view dark, coal-black swabbies.',
'Landlubbers only dream of the minty fresh fix us swashbuckers have for halitosis!',
]
for (const description of descriptions) {
await instance.create({
data: { description },
collection: 'myCollection',
})
}

await sleep(800)
expect(spy).toHaveBeenCalled()

const raw = lance.getTable()
const all = await raw.toArrow()
const allArray = all.toArray()
expect(allArray.length).toEqual(5)

const query = 'What Crustaceans can be served cold with citrus?'
const result = await raw
.search(query)
.select(['text', 'documentId', '_distance'])
.limit(1)
.toArrow()
const data = result
.toArray()
.map((x) => x.toJSON())
.pop()
expect(data).toMatchObject(
expect.objectContaining({
text: 'Lobster salad has to have a chilled, dark blood oranges component.',
}),
)

const query2 = 'How can pirates get minty clean breath?'
const result2 = await raw
.search(query2)
.select(['text', 'documentId', '_distance'])
.limit(1)
.toArrow()
const data2 = result2
.toArray()
.map((x) => x.toJSON())
.pop()

expect(data2).toMatchObject(
expect.objectContaining({
text: 'Landlubbers only dream of the minty fresh fix us swashbuckers have for halitosis!',
}),
)
})
})
7 changes: 4 additions & 3 deletions packages/semantic-search/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,22 @@
"postinstall": "pnpm build:stub"
},
"devDependencies": {
"@apache-arrow/ts": "^17.0.0",
"@lancedb/lancedb": "^0.10.0",
"@lancedb/lancedb": "0.10.0",
"@payloadcms/db-mongodb": "^3.0.0-beta.107",
"@payloadcms/db-postgres": "^3.0.0-beta.107",
"@types/dockerode": "^3.3.31",
"apache-arrow": "^17.0.0",
"dockerode": "^4.0.2",
"mongodb-memory-server": "^10.0.0",
"ollama": "^0.5.9"
},
"dependencies": {
"@workspace/llm-utils": "workspace:*",
"defu": "^6.1.4",
"pathe": "^1.1.2",
"radash": "^12.1.0"
},
"peerDependencies": {
"payload": "^3.0.0-beta.107"
"payload": "beta"
}
}
57 changes: 57 additions & 0 deletions packages/semantic-search/src/components/lancedb/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import { type Table, connect } from '@lancedb/lancedb'
import { LanceSchema } from '@lancedb/lancedb/embedding'
import { Utf8 } from 'apache-arrow'
import type { Identifier, InsertFields, VectorDB } from '../../types'
import { OllamaEmbeddings } from '../ollama'

/**
 * `VectorDB` implementation backed by a LanceDB table.
 *
 * The table schema declares `text` as the embedding function's source field
 * and `vector` as its vector field, alongside the `documentId`, `field` and
 * `collection` columns used to identify a row.
 */
export class LanceDB implements VectorDB {
  public name = 'lancedb'

  constructor(private table: Table) {}

  /**
   * Builds a LanceDB SQL filter that requires every given column to equal its
   * value.
   *
   * Column names are wrapped in backticks because LanceDB requires escaping
   * for names containing upper-case characters (e.g. `documentId`). Values are
   * emitted as single-quoted string literals with embedded quotes doubled, so
   * a value can never terminate the literal early.
   */
  private static equalsAll(
    conditions: Record<string, string | number>,
  ): string {
    return Object.entries(conditions)
      .map(([column, value]) => {
        const literal = String(value).replace(/'/g, "''")
        return '`' + column + "` = '" + literal + "'"
      })
      .join(' AND ')
  }

  /**
   * Searches the table for the rows closest to `query`, restricted to one
   * collection/field pair.
   *
   * @param index - Collection slug and field name to restrict the search to.
   * @param query - Free text handed to the table's `search`.
   * @returns At most five matching rows, each converted with `toJSON()`.
   */
  async search(index: { collection: string; field: string }, query: string) {
    // `.where()` expects a bare filter expression, without a `WHERE` keyword.
    const predicate = LanceDB.equalsAll({
      field: index.field,
      collection: index.collection,
    })
    const result = await this.table
      .search(query)
      .where(predicate)
      .limit(5)
      .toArray()
    return result.map((res) => res.toJSON())
  }

  /** Exposes the underlying LanceDB table for direct access. */
  getTable() {
    return this.table
  }

  /**
   * Connects to a LanceDB database and opens (or creates) the
   * `payloadDocuments` table.
   *
   * @param path - Location passed to LanceDB `connect`.
   * @param embeddingOptions - Optional overrides for the Ollama embedding
   *   settings; any key left out keeps the built-in default below.
   * @returns A `LanceDB` instance wrapping the table.
   */
  static async create(
    path = './lancedb',
    embeddingOptions: Partial<
      ConstructorParameters<typeof OllamaEmbeddings>[0]
    > = {},
  ) {
    // NOTE(review): the default host is a fixed address kept only for
    // backward compatibility; callers should pass their own `host`.
    const func = new OllamaEmbeddings({
      host: 'http://100.67.29.127:11434',
      model: 'nomic-embed-text',
      timeout: 10000,
      ...embeddingOptions,
    })
    const schema = LanceSchema({
      text: func.sourceField(),
      documentId: new Utf8(),
      field: new Utf8(),
      collection: new Utf8(),
      vector: func.vectorField(),
    })

    const connection = await connect(path)
    const table = await connection.createEmptyTable(
      'payloadDocuments',
      schema,
      { existOk: true },
    )
    return new LanceDB(table)
  }

  /**
   * Deletes the rows matching the given document/field/collection triple.
   *
   * @param record - Identifies the rows to remove.
   */
  async delete(record: Identifier) {
    // `table.delete()` expects a bare filter expression, without `WHERE`.
    const predicate = LanceDB.equalsAll({
      documentId: record.documentId,
      field: record.field,
      collection: record.collection,
    })
    await this.table.delete(predicate)
  }

  /**
   * Adds a row to the table.
   *
   * NOTE(review): this only appends; an existing row for the same
   * document/field/collection is not replaced — confirm whether callers
   * expect replace-on-update semantics.
   *
   * @param fields - Row to add.
   */
  async upsert(fields: InsertFields) {
    await this.table.add([fields])
  }
}
59 changes: 59 additions & 0 deletions packages/semantic-search/src/components/ollama/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import { embedding } from '@lancedb/lancedb'
import {
EmbeddingFunction,
TextEmbeddingFunction,
getRegistry,
} from '@lancedb/lancedb/embedding'
import type { Float } from 'apache-arrow'
import { Ollama } from 'ollama'

/** Connection and model settings for `OllamaEmbeddings`. */
interface Options {
  /** Name of the Ollama model used to embed text. */
  model: string
  /** Maximum time, in milliseconds, to wait for one embedding request. */
  timeout: number
  /** Host passed to the Ollama client. */
  host: string
}
/**
 * LanceDB text embedding function that obtains vectors from an Ollama server.
 *
 * Registered with LanceDB's embedding registry under the alias `ollama`.
 */
// @ts-ignore
@embedding.register('ollama')
export class OllamaEmbeddings extends TextEmbeddingFunction<Partial<Options>> {
  private client: Ollama

  /**
   * @param modelOptions - Host, model and timeout used for every request.
   */
  constructor(private modelOptions: Options) {
    super()
    this.client = new Ollama({
      host: modelOptions.host,
    })
  }

  /** Defers to the base class for the vector element type. */
  override embeddingDataType(): Float {
    return super.embeddingDataType()
  }

  /**
   * Number of dimensions of the produced vectors.
   *
   * NOTE(review): fixed at 768 regardless of the configured model — confirm
   * this matches every model that may be passed in.
   */
  override ndims() {
    return 768
  }

  /** Serialises the options together with the `ollama` type tag. */
  toJSON(): object {
    return {
      ...this.modelOptions,
      type: 'ollama',
    }
  }

  /**
   * Embeds every text with one request per text, issued concurrently.
   *
   * @param texts - Texts to embed.
   * @returns One embedding per input text, in input order.
   * @throws Error when a request exceeds the configured `timeout`.
   */
  async generateEmbeddings(
    texts: string[],
  ): Promise<number[][] | Float32Array[] | Float64Array[]> {
    return Promise.all(texts.map((text) => this.embedOne(text)))
  }

  /** Requests the embedding for one text, bounded by the configured timeout. */
  private async embedOne(text: string): Promise<number[]> {
    const request = this.client
      .embeddings({
        model: this.modelOptions.model ?? 'nomic-embed-text',
        prompt: text,
      })
      .then((response) => response.embedding)

    const { timeout } = this.modelOptions
    // No usable timeout configured: wait for the request indefinitely.
    if (!Number.isFinite(timeout) || timeout <= 0) return request

    let timer: ReturnType<typeof setTimeout> | undefined
    const deadline = new Promise<never>((_, reject) => {
      timer = setTimeout(
        () =>
          reject(
            new Error(`Ollama embedding request timed out after ${timeout}ms`),
          ),
        timeout,
      )
    })
    try {
      return await Promise.race([request, deadline])
    } finally {
      // Always release the timer so it cannot keep the process alive.
      clearTimeout(timer)
    }
  }
}

/**
 * Ensures `OllamaEmbeddings` is registered with LanceDB's embedding registry
 * under the alias `ollama`.
 *
 * The class is also decorated with `@embedding.register('ollama')`, so the
 * alias may already be present when this runs; LanceDB's registry is expected
 * to reject a duplicate alias, hence the lookup before registering. This makes
 * the function safe to call any number of times.
 */
export const register = () => {
  const registry = getRegistry()
  // Already registered (by the decorator or an earlier call): nothing to do.
  if (registry.get('ollama')) return
  // @ts-ignore
  registry.register('ollama')(OllamaEmbeddings)
}
3 changes: 1 addition & 2 deletions packages/semantic-search/src/hooks/afterChangeHook.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@ const insertEmbedding = async ({
)
return

const vector = await semanticSearch.embeddingFn(value)
await semanticSearch.vectorDB.upsert({
text: value,
collection: collection.slug,
vector,
field: field.name,
documentId: originalDoc.id as string | number,
})
Expand Down
2 changes: 1 addition & 1 deletion packages/semantic-search/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export const semanticSearchPlugin =

return setupSemanticSearchCustom(config, {
vectorDB: incomingPluginConfig.vectorDB,
embeddingFn: incomingPluginConfig.embeddingFn,
embeddingFn: incomingPluginConfig.vectorDB,
})
}

Expand Down
Loading

0 comments on commit 187412d

Please sign in to comment.