Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

aiv2 #181

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft

aiv2 #181

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions ai-v2/db.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- Enable pgvector so the "vector" type and its distance operators are available.
create extension if not exists vector with schema public;

-- One row per documentation page, keyed by its public route.
create table "public"."page" (
id bigserial primary key,
path text not null unique, -- public route of the page, e.g. "/guides/intro"
checksum text, -- base64 sha-256 of the page source; used for change detection
meta jsonb
);

-- A page is split into sections; each section stores its text and embedding.
create table "public"."page_section" (
id bigserial primary key,
page_id bigint not null references public.page on delete cascade,
content text, -- raw section text
token_count int, -- tokens consumed when embedding this section
embedding vector(1536) -- text-embedding-ada-002 output dimension
);

-- Approximate nearest-neighbour index over section embeddings (cosine ops).
-- NOTE(review): ivfflat indexes are normally built after the table is
-- populated and with an explicit "lists" parameter -- confirm ingestion order.
CREATE INDEX ON page_section USING ivfflat (embedding vector_cosine_ops);

-- Return up to match_count page sections most similar to the query embedding,
-- best match first.
--
-- Parameters:
--   embedding          query vector (1536 dims, text-embedding-ada-002)
--   match_threshold    minimum similarity required to include a row
--   match_count        maximum number of rows to return
--   min_content_length skip sections shorter than this many characters
create or replace function match_page_sections(embedding vector(1536), match_threshold float, match_count int, min_content_length int)
returns table (path text, content text, similarity float)
language plpgsql
as $$
#variable_conflict use_variable
-- The parameter "embedding" collides with the page_section.embedding column;
-- the directive above makes unqualified references resolve to the parameter.
begin
return query
select
page.path,
page_section.content,
(page_section.embedding <#> embedding) * -1 as similarity
from page_section
join page
on page_section.page_id = page.id

-- We only care about sections that have a useful amount of content
where length(page_section.content) >= min_content_length

-- The dot product is negative because of a Postgres limitation, so we negate it
and (page_section.embedding <#> embedding) * -1 > match_threshold

-- OpenAI embeddings are normalized to length 1, so
-- cosine similarity and dot product will produce the same results.
-- Using dot product which can be computed slightly faster.
--
-- For the different syntaxes, see https://github.com/pgvector/pgvector
order by page_section.embedding <#> embedding

limit match_count;
end;
$$;
89 changes: 89 additions & 0 deletions ai-v2/ingest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import * as fs from 'fs';
import glob from "glob"
import path from "path"
import { fileURLToPath } from 'url';
import dotenv from "dotenv"
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { createHash } from 'crypto'
import postgres from 'postgres'
import { Configuration, OpenAIApi } from 'openai';
// Load environment variables from the repository-root .env file.
dotenv.config({ path: "../.env" })

// Postgres client for the vector store.
const sql = postgres(process.env.POSTGRES_AI_URL as string)

// ESM has no __dirname; derive it from this module's URL.
const getDirName = () => path.dirname(fileURLToPath(import.meta.url))

const __dirname = getDirName();

// Routes that must never be ingested.
const ignoredFiles = ['/ai']

// Find every markdown page under /content and derive its public route:
// strip the content prefix, the .md extension, numeric ordering prefixes
// (e.g. "100."), and index/trailing-slash suffixes.
const files = glob.sync(path.resolve(__dirname, "../content/**/*.md"))
  .map((markdownFile) => {
    const route = markdownFile
      .replace(/.*\/content/, '')
      .replace(/\.md?$/, '')
      .replace(/[0-9]+\./g, '')
      .replace('/index', '/')
      .replace(/\/$/, "")
    return { path: route || '/', filename: markdownFile }
  })
  .filter((page) => !ignoredFiles.includes(page.path))

console.log(`Discovered ${files.length} pages`)
console.log('Checking which pages are new or have changed')

// OpenAI client used to embed each page section.
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY
})
const openai = new OpenAIApi(configuration)


// The splitter configuration never changes, so one instance serves all files.
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });

// For each discovered page: skip it when unchanged, otherwise re-split,
// re-embed, and replace its sections in the database.
for (const file of files) {
  const content = fs.readFileSync(file.filename, { encoding: "utf8" });
  // BUG FIX: createDocuments is async; without `await` we were iterating a
  // Promise instead of the section list.
  const sections = await textSplitter.createDocuments([content])
  const checksum = createHash('sha256').update(content).digest('base64')
  let [page] = await sql`SELECT * from page where path = ${file.path} limit 1`
  let isNew = false;

  if (!page) {
    isNew = true;
    await sql`INSERT INTO "public"."page" ("path", "checksum") VALUES (${file.path}, ${checksum});`;
    page = await sql`SELECT * from page where path = ${file.path} limit 1`.then(r => r[0])
  }

  // Unchanged existing page: nothing to re-embed.
  if (!isNew && page.checksum === checksum) {
    continue
  }

  // BUG FIX: this previously deleted by "id" (a section whose id happened to
  // equal the page id); we must remove all sections BELONGING to this page.
  await sql`DELETE FROM "public"."page_section" WHERE "page_id" = ${page.id};`

  console.log(`Adding ${sections.length} page sections (with embeddings) for '${page.path}'`)

  for (const section of sections) {
    // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
    const input = section.pageContent.replace(/\n/g, ' ')

    const embeddingResponse = await openai.createEmbedding({
      model: 'text-embedding-ada-002',
      input,
    })

    const [responseData] = embeddingResponse.data.data

    const data = {
      page_id: page.id,
      content: section.pageContent,
      token_count: embeddingResponse.data.usage.total_tokens,
      // pgvector accepts the vector literal as a bracketed string: "[1,2,...]".
      embedding: `[${responseData.embedding}]`,
    }
    await sql`insert into "public"."page_section" ${sql(data)}`
  }

  // Record the new checksum only after all sections were stored.
  await sql`UPDATE "public"."page" SET "checksum" = ${checksum} WHERE "id" = ${page.id};`

}

// The postgres client keeps the event loop alive; exit explicitly.
process.exit()
103 changes: 103 additions & 0 deletions ai-v2/prompt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import dotenv from "dotenv"
import postgres from 'postgres'
import { Configuration, OpenAIApi } from 'openai';
import GPT3Tokenizer from 'gpt3-tokenizer'
import { oneLine, stripIndent } from 'common-tags'
// The question is required as the first CLI argument.
const rawQuery = process.argv[2]
if (!rawQuery) {
  console.log("Usage: npx tsx prompt.ts 'What is Instadapp?'")
  process.exit()
}
const sanitizedQuery = rawQuery.trim()

// Load environment variables from the repository-root .env file.
dotenv.config({ path: "../.env" })

// Postgres client for similarity search over embedded page sections.
const sql = postgres(process.env.POSTGRES_AI_URL as string)

// OpenAI client for embedding the question and generating the answer.
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY
})
const openai = new OpenAIApi(configuration)

// Embed the question. Newlines are replaced with spaces per OpenAI's
// embedding guidance.
const embeddingResponse = await openai.createEmbedding({
  model: 'text-embedding-ada-002',
  input: sanitizedQuery.replaceAll('\n', ' '),
})

const [{ embedding }] = embeddingResponse.data.data

// pgvector expects the vector literal as a bracketed string: "[1,2,...]".
const embeddingData = `[${embedding}]`;

console.time("match_page_sections")
// BUG FIX: `select public.match_page_sections(...) as content` yields ONE
// composite-record column per row, so `.content` downstream was the whole
// "(path,content,similarity)" tuple rendered as text. Selecting FROM the
// set-returning function exposes the real path/content/similarity columns.
const pageSections = await sql`select * from public.match_page_sections(${embeddingData}, 0.78, 10, 50)`;
console.timeEnd("match_page_sections")

console.time("tokenizing")

// gpt3-tokenizer ships a CJS default export; unwrap it when present.
//@ts-ignore
const tokenizer = new (GPT3Tokenizer.default || GPT3Tokenizer)({ type: 'gpt3' })

// Concatenate matched sections into the prompt context, stopping before the
// running token count reaches the ~3500-token budget.
let tokenCount = 0
let contextText = ''

for (const pageSection of pageSections) {
  const sectionContent = pageSection.content
  const encoded = tokenizer.encode(sectionContent)
  tokenCount += encoded.text.length

  if (tokenCount >= 3500) {
    break
  }

  contextText += `${sectionContent.trim()}\n---\n`
}

console.timeEnd("tokenizing")


// Completion prompt: persona/instructions (collapsed to one line), the
// retrieved context sections, then the user's question. The whitespace inside
// this template literal is part of the prompt string sent to the model.
const prompt = stripIndent`
${oneLine`
You are a very enthusiastic Instadapp representative who loves
to help people! Given the following sections from the Instadapp
documentation, answer the question using only that information,
outputted in markdown format. If you are unsure and the answer
is not explicitly written in the documentation, say
"Sorry, I don't know how to help with that."
`}
Context sections:
${contextText}
Question: """
${sanitizedQuery}
"""
Answer as markdown (including related code snippets if available):
`
// Request the completion as a server-sent-event stream so tokens can be
// printed as they arrive.
const response: any = await openai.createCompletion({
  model: 'text-davinci-003',
  prompt,
  max_tokens: 512,
  temperature: 0,
  stream: true,
}, { responseType: 'stream' })

console.log("\nAnswer:")
response.data.on('data', (data: any) => {
  // A chunk may carry several "data: {...}" SSE lines; handle each.
  const lines = data.toString().split('\n').filter((line: string) => line.trim() !== '');
  for (const line of lines) {
    const message = line.replace(/^data: /, '');
    if (message === '[DONE]') {
      process.stdout.write('\n')
      return; // Stream finished
    }
    try {
      const parsed = JSON.parse(message);
      // BUG FIX: console.log appended a newline after EVERY streamed token,
      // mangling the answer; write each token verbatim instead.
      process.stdout.write(parsed.choices[0].text);
    } catch (error) {
      console.error('Could not JSON parse stream message', message, error);
    }
  }
});
68 changes: 68 additions & 0 deletions components/global/AiSearch.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<script setup lang="ts">
import { SSE } from 'sse.js'

// True while a completion stream is in flight.
const loading = ref(false)
const query = ref("")
// Raw streamed answer text; answerInMD holds its parsed-markdown form.
const answer = ref('')
const answerInMD = ref()

watch(answer, async () => {
  // Collapse doubled quotes the model sometimes emits before parsing.
  answerInMD.value = await parseMarkdown(answer.value.replaceAll('""', '"'))
})

let eventSource: SSE;

const submit = async () => {
  answer.value = ''

  if (loading.value) {
    return;
  }

  // Abort any previous stream before starting a new one.
  if (eventSource) {
    eventSource.close()
  }

  loading.value = true

  eventSource = new SSE(`api/search`, {
    headers: {
      'Content-Type': 'application/json',
    },
    payload: JSON.stringify({ query: query.value }),
  })

  // BUG FIX: on a stream error the component previously stayed in the
  // "Thinking..." state forever; reset the loading flag before logging.
  eventSource.addEventListener('error', (err: any) => {
    loading.value = false
    console.error(err)
  })
  eventSource.addEventListener('message', (e: any) => {
    try {

      if (e.data === '[DONE]') {
        loading.value = false
        return
      }

      const completionResponse = JSON.parse(e.data)
      const [{ text }] = completionResponse.choices

      // Append the streamed token to the running answer.
      answer.value = (answer.value ?? '') + text
    } catch (err) {
      console.log(err)
    }
  })

  eventSource.stream()

}
</script>
<template>
  <div>
    <input type="text" placeholder="Ask me anything about Instadapp" style="width:100%; padding: 10px;" v-model="query"
      @keyup.enter.prevent="submit">

    <div style="margin-top: 20px;">
      {{ loading ? 'Thinking...' : '' }}
    </div>

    <div v-if="answerInMD" style="background-color: #22293a;width:100%; padding: 10px;margin-top: 20px;">
      <ContentRendererMarkdown :value="answerInMD" />
    </div>
  </div>
</template>
6 changes: 6 additions & 0 deletions content/100.search/0.index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
layout: page
title: Search
---

<AiSearch />
1 change: 1 addition & 0 deletions content/100.search/_dir.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
navigation: false
1 change: 1 addition & 0 deletions nuxt.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export default defineNuxtConfig({

runtimeConfig: {
opeanAiKey: process.env.OPENAI_API_KEY || "",
postgresAiUrl: process.env.POSTGRES_AI_URL || "",
pinceconeApiKey: process.env.PINECONE_API_KEY || "",
pinceconeBaseUrl: process.env.PINECONE_BASE_URL || "",
public: {
Expand Down
7 changes: 7 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,23 @@
"devDependencies": {
"@docsearch/js": "^3.3.3",
"@nuxt-themes/docus": "^1.8.1",
"@types/common-tags": "^1.8.1",
"@types/glob": "^8.1.0",
"axios": "^0.21.1",
"cohere-ai": "^5.0.2",
"common-tags": "^1.8.2",
"del": "^6.0.0",
"dotenv": "^8.2.0",
"github-trees": "^0.2.0",
"glob": "^8.1.0",
"gpt3-tokenizer": "^1.1.5",
"inquirer": "^8.0.0",
"langchain": "^0.0.9",
"nuxt": "^3.1.2",
"openai": "^3.1.0",
"pinecone-client": "^1.0.1",
"postgres": "^3.3.3",
"sse.js": "^0.6.1",
"web3": "^1.8.2"
}
}
Loading