Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

aiv2 #181

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft

aiv2 #181

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions ai-v2/db.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- Enable pgvector so the "vector" type and its distance operators are available.
create extension if not exists vector with schema public;

-- One row per documentation page, keyed by its public route.
create table "public"."page" (
id bigserial primary key,
path text not null unique, -- public route of the page, e.g. "/guides/intro"
checksum text, -- base64 sha-256 of the page source; used for change detection
meta jsonb
);

-- A page is split into sections; each section stores its text and embedding.
create table "public"."page_section" (
id bigserial primary key,
page_id bigint not null references public.page on delete cascade,
content text, -- raw section text
token_count int, -- tokens consumed when embedding this section
embedding vector(1536) -- text-embedding-ada-002 output dimension
);

-- Approximate nearest-neighbour index over section embeddings (cosine ops).
-- NOTE(review): ivfflat indexes are normally built after the table is
-- populated and with an explicit "lists" parameter -- confirm ingestion order.
CREATE INDEX ON page_section USING ivfflat (embedding vector_cosine_ops);

-- Return up to match_count page sections most similar to the query embedding,
-- best match first.
--
-- Parameters:
--   embedding          query vector (1536 dims, text-embedding-ada-002)
--   match_threshold    minimum similarity required to include a row
--   match_count        maximum number of rows to return
--   min_content_length skip sections shorter than this many characters
create or replace function match_page_sections(embedding vector(1536), match_threshold float, match_count int, min_content_length int)
returns table (path text, content text, similarity float)
language plpgsql
as $$
#variable_conflict use_variable
-- The parameter "embedding" collides with the page_section.embedding column;
-- the directive above makes unqualified references resolve to the parameter.
begin
return query
select
page.path,
page_section.content,
(page_section.embedding <#> embedding) * -1 as similarity
from page_section
join page
on page_section.page_id = page.id

-- We only care about sections that have a useful amount of content
where length(page_section.content) >= min_content_length

-- The dot product is negative because of a Postgres limitation, so we negate it
and (page_section.embedding <#> embedding) * -1 > match_threshold

-- OpenAI embeddings are normalized to length 1, so
-- cosine similarity and dot product will produce the same results.
-- Using dot product which can be computed slightly faster.
--
-- For the different syntaxes, see https://github.com/pgvector/pgvector
order by page_section.embedding <#> embedding

limit match_count;
end;
$$;
89 changes: 89 additions & 0 deletions ai-v2/ingest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import * as fs from 'fs';
import glob from "glob"
import path from "path"
import { fileURLToPath } from 'url';
import dotenv from "dotenv"
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { createHash } from 'crypto'
import postgres from 'postgres'
import { Configuration, OpenAIApi } from 'openai';
// Load environment variables from the repository-root .env file.
dotenv.config({ path: "../.env" })

// Postgres client for the vector store.
const sql = postgres(process.env.POSTGRES_AI_URL as string)

// ESM has no __dirname; derive it from this module's URL.
const getDirName = () => path.dirname(fileURLToPath(import.meta.url))

const __dirname = getDirName();

// Routes that must never be ingested.
const ignoredFiles = ['/ai']

// Find every markdown page under /content and derive its public route:
// strip the content prefix, the .md extension, numeric ordering prefixes
// (e.g. "100."), and index/trailing-slash suffixes.
const files = glob.sync(path.resolve(__dirname, "../content/**/*.md"))
  .map((markdownFile) => {
    const route = markdownFile
      .replace(/.*\/content/, '')
      .replace(/\.md?$/, '')
      .replace(/[0-9]+\./g, '')
      .replace('/index', '/')
      .replace(/\/$/, "")
    return { path: route || '/', filename: markdownFile }
  })
  .filter((page) => !ignoredFiles.includes(page.path))

console.log(`Discovered ${files.length} pages`)
console.log('Checking which pages are new or have changed')

// OpenAI client used to embed each page section.
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY
})
const openai = new OpenAIApi(configuration)


// The splitter configuration never changes, so one instance serves all files.
const textSplitter = new RecursiveCharacterTextSplitter({ chunkSize: 1000, chunkOverlap: 200 });

// For each discovered page: skip it when unchanged, otherwise re-split,
// re-embed, and replace its sections in the database.
for (const file of files) {
  const content = fs.readFileSync(file.filename, { encoding: "utf8" });
  // BUG FIX: createDocuments is async; without `await` we were iterating a
  // Promise instead of the section list.
  const sections = await textSplitter.createDocuments([content])
  const checksum = createHash('sha256').update(content).digest('base64')
  let [page] = await sql`SELECT * from page where path = ${file.path} limit 1`
  let isNew = false;

  if (!page) {
    isNew = true;
    await sql`INSERT INTO "public"."page" ("path", "checksum") VALUES (${file.path}, ${checksum});`;
    page = await sql`SELECT * from page where path = ${file.path} limit 1`.then(r => r[0])
  }

  // Unchanged existing page: nothing to re-embed.
  if (!isNew && page.checksum === checksum) {
    continue
  }

  // BUG FIX: this previously deleted by "id" (a section whose id happened to
  // equal the page id); we must remove all sections BELONGING to this page.
  await sql`DELETE FROM "public"."page_section" WHERE "page_id" = ${page.id};`

  console.log(`Adding ${sections.length} page sections (with embeddings) for '${page.path}'`)

  for (const section of sections) {
    // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
    const input = section.pageContent.replace(/\n/g, ' ')

    const embeddingResponse = await openai.createEmbedding({
      model: 'text-embedding-ada-002',
      input,
    })

    const [responseData] = embeddingResponse.data.data

    const data = {
      page_id: page.id,
      content: section.pageContent,
      token_count: embeddingResponse.data.usage.total_tokens,
      // pgvector accepts the vector literal as a bracketed string: "[1,2,...]".
      embedding: `[${responseData.embedding}]`,
    }
    await sql`insert into "public"."page_section" ${sql(data)}`
  }

  // Record the new checksum only after all sections were stored.
  await sql`UPDATE "public"."page" SET "checksum" = ${checksum} WHERE "id" = ${page.id};`

}

// The postgres client keeps the event loop alive; exit explicitly.
process.exit()
103 changes: 103 additions & 0 deletions ai-v2/prompt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import dotenv from "dotenv"
import postgres from 'postgres'
import { Configuration, OpenAIApi } from 'openai';
import GPT3Tokenizer from 'gpt3-tokenizer'
import { oneLine, stripIndent } from 'common-tags'
// The question is required as the first CLI argument.
const rawQuery = process.argv[2]
if (!rawQuery) {
  console.log("Usage: npx tsx prompt.ts 'What is Instadapp?'")
  process.exit()
}
const sanitizedQuery = rawQuery.trim()

// Load environment variables from the repository-root .env file.
dotenv.config({ path: "../.env" })

// Postgres client for similarity search over embedded page sections.
const sql = postgres(process.env.POSTGRES_AI_URL as string)

// OpenAI client for embedding the question and generating the answer.
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY
})
const openai = new OpenAIApi(configuration)

// Embed the question. Newlines are replaced with spaces per OpenAI's
// embedding guidance.
const embeddingResponse = await openai.createEmbedding({
  model: 'text-embedding-ada-002',
  input: sanitizedQuery.replaceAll('\n', ' '),
})

const [{ embedding }] = embeddingResponse.data.data

// pgvector expects the vector literal as a bracketed string: "[1,2,...]".
const embeddingData = `[${embedding}]`;

console.time("match_page_sections")
// BUG FIX: `select public.match_page_sections(...) as content` yields ONE
// composite-record column per row, so `.content` downstream was the whole
// "(path,content,similarity)" tuple rendered as text. Selecting FROM the
// set-returning function exposes the real path/content/similarity columns.
const pageSections = await sql`select * from public.match_page_sections(${embeddingData}, 0.78, 10, 50)`;
console.timeEnd("match_page_sections")

console.time("tokenizing")

// gpt3-tokenizer ships a CJS default export; unwrap it when present.
//@ts-ignore
const tokenizer = new (GPT3Tokenizer.default || GPT3Tokenizer)({ type: 'gpt3' })

// Concatenate matched sections into the prompt context, stopping before the
// running token count reaches the ~3500-token budget.
let tokenCount = 0
let contextText = ''

for (const pageSection of pageSections) {
  const sectionContent = pageSection.content
  const encoded = tokenizer.encode(sectionContent)
  tokenCount += encoded.text.length

  if (tokenCount >= 3500) {
    break
  }

  contextText += `${sectionContent.trim()}\n---\n`
}

console.timeEnd("tokenizing")


// Completion prompt: persona/instructions (collapsed to one line), the
// retrieved context sections, then the user's question. The whitespace inside
// this template literal is part of the prompt string sent to the model.
const prompt = stripIndent`
${oneLine`
You are a very enthusiastic Instadapp representative who loves
to help people! Given the following sections from the Instadapp
documentation, answer the question using only that information,
outputted in markdown format. If you are unsure and the answer
is not explicitly written in the documentation, say
"Sorry, I don't know how to help with that."
`}
Context sections:
${contextText}
Question: """
${sanitizedQuery}
"""
Answer as markdown (including related code snippets if available):
`
// Request the completion as a server-sent-event stream so tokens can be
// printed as they arrive.
const response: any = await openai.createCompletion({
  model: 'text-davinci-003',
  prompt,
  max_tokens: 512,
  temperature: 0,
  stream: true,
}, { responseType: 'stream' })

console.log("\nAnswer:")
response.data.on('data', (data: any) => {
  // A chunk may carry several "data: {...}" SSE lines; handle each.
  const lines = data.toString().split('\n').filter((line: string) => line.trim() !== '');
  for (const line of lines) {
    const message = line.replace(/^data: /, '');
    if (message === '[DONE]') {
      process.stdout.write('\n')
      return; // Stream finished
    }
    try {
      const parsed = JSON.parse(message);
      // BUG FIX: console.log appended a newline after EVERY streamed token,
      // mangling the answer; write each token verbatim instead.
      process.stdout.write(parsed.choices[0].text);
    } catch (error) {
      console.error('Could not JSON parse stream message', message, error);
    }
  }
});
68 changes: 68 additions & 0 deletions components/global/AiSearch.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<script setup lang="ts">
import { SSE } from 'sse.js'

// True while a completion stream is in flight.
const loading = ref(false)
const query = ref("")
// Raw streamed answer text; answerInMD holds its parsed-markdown form.
const answer = ref('')
const answerInMD = ref()

watch(answer, async () => {
  // Collapse doubled quotes the model sometimes emits before parsing.
  answerInMD.value = await parseMarkdown(answer.value.replaceAll('""', '"'))
})

let eventSource: SSE;

const submit = async () => {
  answer.value = ''

  if (loading.value) {
    return;
  }

  // Abort any previous stream before starting a new one.
  if (eventSource) {
    eventSource.close()
  }

  loading.value = true

  eventSource = new SSE(`api/search`, {
    headers: {
      'Content-Type': 'application/json',
    },
    payload: JSON.stringify({ query: query.value }),
  })

  // BUG FIX: on a stream error the component previously stayed in the
  // "Thinking..." state forever; reset the loading flag before logging.
  eventSource.addEventListener('error', (err: any) => {
    loading.value = false
    console.error(err)
  })
  eventSource.addEventListener('message', (e: any) => {
    try {

      if (e.data === '[DONE]') {
        loading.value = false
        return
      }

      const completionResponse = JSON.parse(e.data)
      const [{ text }] = completionResponse.choices

      // Append the streamed token to the running answer.
      answer.value = (answer.value ?? '') + text
    } catch (err) {
      console.log(err)
    }
  })

  eventSource.stream()

}
</script>
<template>
  <div>
    <input type="text" placeholder="Ask me anything about Instadapp" style="width:100%; padding: 10px;" v-model="query"
      @keyup.enter.prevent="submit">

    <div style="margin-top: 20px;">
      {{ loading ? 'Thinking...' : '' }}
    </div>

    <div v-if="answerInMD" style="background-color: #22293a;width:100%; padding: 10px;margin-top: 20px;">
      <ContentRendererMarkdown :value="answerInMD" />
    </div>
  </div>
</template>
6 changes: 6 additions & 0 deletions content/100.search/0.index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
layout: page
title: Search
---

<AiSearch />
1 change: 1 addition & 0 deletions content/100.search/_dir.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
navigation: false
1 change: 1 addition & 0 deletions nuxt.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export default defineNuxtConfig({

runtimeConfig: {
opeanAiKey: process.env.OPENAI_API_KEY || "",
postgresAiUrl: process.env.POSTGRES_AI_URL || "",
pinceconeApiKey: process.env.PINECONE_API_KEY || "",
pinceconeBaseUrl: process.env.PINECONE_BASE_URL || "",
public: {
Expand Down
7 changes: 7 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,23 @@
"devDependencies": {
"@docsearch/js": "^3.3.3",
"@nuxt-themes/docus": "^1.8.1",
"@types/common-tags": "^1.8.1",
"@types/glob": "^8.1.0",
"axios": "^0.21.1",
"cohere-ai": "^5.0.2",
"common-tags": "^1.8.2",
"del": "^6.0.0",
"dotenv": "^8.2.0",
"github-trees": "^0.2.0",
"glob": "^8.1.0",
"gpt3-tokenizer": "^1.1.5",
"inquirer": "^8.0.0",
"langchain": "^0.0.9",
"nuxt": "^3.1.2",
"openai": "^3.1.0",
"pinecone-client": "^1.0.1",
"postgres": "^3.3.3",
"sse.js": "^0.6.1",
"web3": "^1.8.2"
}
}
Loading