ubiquity-os-marketplace · 0x4007 · Oct 28, 2024 · Oct 27, 2024 · Oct 27, 2024 · Oct 27, 2024
diff --git a/package.json b/package.json
@@ -17,7 +17,7 @@
     "knip-ci": "knip --no-exit-code --reporter json --config .github/knip.ts",
     "prepare": "husky install",
     "test": "jest --setupFiles dotenv/config --coverage",
-    "worker": "wrangler dev --env dev --port 4000"
+    "worker": "wrangler dev --env dev --port 5000"
   },
   "keywords": [
     "typescript",
@@ -34,6 +34,8 @@
     "@supabase/supabase-js": "^2.45.4",
     "@ubiquity-dao/ubiquibot-logger": "^1.3.0",
     "dotenv": "^16.4.5",
+    "github-diff-tool": "^1.0.6",
+    "gpt-tokenizer": "^2.5.1",
     "openai": "^4.63.0",
     "typebox-validators": "0.3.5",
     "voyageai": "^0.0.1-5"

diff --git a/src/adapters/openai/helpers/completions.ts b/src/adapters/openai/helpers/completions.ts
@@ -2,7 +2,7 @@ import OpenAI from "openai";
 import { Context } from "../../../types";
 import { SuperOpenAi } from "./openai";
 import { CompletionsModelHelper, ModelApplications } from "../../../types/llm";
-const MAX_TOKENS = 7000;
+import { encode } from "gpt-tokenizer";
 
 export interface CompletionsType {
   answer: string;
@@ -28,7 +28,8 @@ export class Completions extends SuperOpenAi {
     additionalContext: string[],
     localContext: string[],
     groundTruths: string[],
-    botName: string
+    botName: string,
+    maxTokens: number
   ): Promise<CompletionsType> {
     const res: OpenAI.Chat.Completions.ChatCompletion = await this.client.chat.completions.create({
       model: model,
@@ -46,10 +47,10 @@ export class Completions extends SuperOpenAi {
                 "Your name is : " +
                 botName +
                 "\n" +
-                "Primary Context: " +
-                additionalContext.join("\n") +
-                "\nLocal Context: " +
-                localContext.join("\n"),
+                "Main Context (Provide additional precedence in terms of information): " +
+                localContext.join("\n") +
+                "Secondary Context: " +
+                additionalContext.join("\n"),
             },
           ],
         },
@@ -64,7 +65,7 @@ export class Completions extends SuperOpenAi {
         },
       ],
       temperature: 0.2,
-      max_tokens: MAX_TOKENS,
+      max_tokens: maxTokens,
       top_p: 0.5,
       frequency_penalty: 0,
       presence_penalty: 0,
@@ -117,4 +118,8 @@ export class Completions extends SuperOpenAi {
 
     return res.choices[0].message.content;
   }
+
+  async findTokenLength(prompt: string, additionalContext: string[] = [], localContext: string[] = [], groundTruths: string[] = []): Promise<number> {
+    return encode(prompt + additionalContext.join("\n") + localContext.join("\n") + groundTruths.join("\n")).length;
+  }
 }
diff --git a/src/handlers/ask-llm.ts b/src/handlers/ask-llm.ts
@@ -4,7 +4,6 @@ import { CommentSimilaritySearchResult } from "../adapters/supabase/helpers/comm
 import { IssueSimilaritySearchResult } from "../adapters/supabase/helpers/issues";
 import { recursivelyFetchLinkedIssues } from "../helpers/issue-fetching";
 import { formatChatHistory } from "../helpers/format-chat-history";
-import { optimizeContext } from "../helpers/issue";
 import { fetchRepoDependencies, fetchRepoLanguageStats } from "./ground-truths/chat-bot";
 import { findGroundTruths } from "./ground-truths/find-ground-truths";
 
@@ -39,7 +38,7 @@ export async function askQuestion(context: Context, question: string) {
 export async function askGpt(context: Context, question: string, formattedChat: string[]): Promise<CompletionsType> {
   const {
     env: { UBIQUITY_OS_APP_NAME },
-    config: { model, similarityThreshold },
+    config: { model, similarityThreshold, maxTokens },
   } = context;
   let similarComments: CommentSimilaritySearchResult[] = [];
   let similarIssues: IssueSimilaritySearchResult[] = [];
@@ -58,20 +57,17 @@ export async function askGpt(context: Context, question: string, formattedChat:
   // Remove Null Results (Private Comments)
   similarText = similarText.filter((text) => text !== null);
   formattedChat = formattedChat.filter((text) => text !== null);
-  // Optimize the context
-  formattedChat = optimizeContext(formattedChat);
-  // ReRank the results based on the question
-  // const reRankedChat = formattedChat.length > 0 ? await context.adapters.voyage.reranker.reRankResults(formattedChat.filter(text => text !== ""), question, 300) : [];
   similarText = similarText.filter((text) => text !== "");
   const rerankedText = similarText.length > 0 ? await context.adapters.voyage.reranker.reRankResults(similarText, question) : [];
-
   const languages = await fetchRepoLanguageStats(context);
   const { dependencies, devDependencies } = await fetchRepoDependencies(context);
   const groundTruths = await findGroundTruths(context, "chat-bot", {
     languages,
     dependencies,
     devDependencies,
   });
-
-  return context.adapters.openai.completions.createCompletion(question, model, rerankedText, formattedChat, groundTruths, UBIQUITY_OS_APP_NAME);
+  //Calculate the current context size in tokens
+  const numTokens = await context.adapters.openai.completions.findTokenLength(question, rerankedText, formattedChat, groundTruths);
+  context.logger.info(`Number of tokens: ${numTokens}`);
+  return context.adapters.openai.completions.createCompletion(question, model, rerankedText, formattedChat, groundTruths, UBIQUITY_OS_APP_NAME, maxTokens);
 }
diff --git a/src/helpers/format-chat-history.ts b/src/helpers/format-chat-history.ts
@@ -1,7 +1,7 @@
 import { Context } from "../types";
 import { StreamlinedComment, StreamlinedComments } from "../types/llm";
 import { createKey, streamlineComments } from "../handlers/comments";
-import { fetchPullRequestDiff, fetchIssue, fetchIssueComments } from "./issue-fetching";
+import { fetchPullRequestDiff, fetchIssue, fetchIssueComments, fetchLinkedPullRequests } from "./issue-fetching";
 import { splitKey } from "./issue";
 
 /**
@@ -18,10 +18,13 @@ export async function formatChatHistory(
   specAndBodies: Record<string, string>
 ): Promise<string[]> {
   const keys = new Set([...Object.keys(streamlined), ...Object.keys(specAndBodies), createKey(context.payload.issue.html_url)]);
+  let runningTokenCount = 0;
   const chatHistory = await Promise.all(
     Array.from(keys).map(async (key) => {
       const isCurrentIssue = key === createKey(context.payload.issue.html_url);
-      return createContextBlockSection(context, key, streamlined, specAndBodies, isCurrentIssue);
+      const [currentTokenCount, result] = await createContextBlockSection(context, key, streamlined, specAndBodies, isCurrentIssue, runningTokenCount);
+      runningTokenCount += currentTokenCount;
+      return result;
     })
   );
   return Array.from(new Set(chatHistory));
@@ -65,8 +68,10 @@ async function createContextBlockSection(
   key: string,
   streamlined: Record<string, StreamlinedComment[]>,
   specAndBodies: Record<string, string>,
-  isCurrentIssue: boolean
-) {
+  isCurrentIssue: boolean,
+  currentContextTokenCount: number = 0
+): Promise<[number, string]> {
+  const maxTokens = context.config.maxTokens;
   let comments = streamlined[key];
   if (!comments || comments.length === 0) {
     const [owner, repo, number] = splitKey(key);
@@ -83,7 +88,18 @@ async function createContextBlockSection(
   if (!issueNumber || isNaN(issueNumber)) {
     throw context.logger.error("Issue number is not valid");
   }
-  const prDiff = await fetchPullRequestDiff(context, org, repo, issueNumber);
+  const pulls = (await fetchLinkedPullRequests(org, repo, issueNumber, context)) || [];
+  const prDiffs = await Promise.all(pulls.map((pull) => fetchPullRequestDiff(context, org, repo, pull.number)));
+  let prDiff: string | null = null;
+  for (const pullDiff of prDiffs.flat()) {
+    if (currentContextTokenCount > maxTokens) break;
+    if (pullDiff) {
+      const tokenLength = await context.adapters.openai.completions.findTokenLength(pullDiff.diff);
+      if (currentContextTokenCount + tokenLength > maxTokens) break;
+      currentContextTokenCount += tokenLength;
+      prDiff = (prDiff ? prDiff + "\n" : "") + pullDiff.diff;
+    }
+  }
   const specHeader = getCorrectHeaderString(prDiff, issueNumber, isCurrentIssue, false);
   let specOrBody = specAndBodies[key];
   if (!specOrBody) {
@@ -98,14 +114,16 @@ async function createContextBlockSection(
       )?.body || "No specification or body available";
   }
   const specOrBodyBlock = [createHeader(specHeader, key), createSpecOrBody(specOrBody), createFooter(specHeader)];
+  currentContextTokenCount += await context.adapters.openai.completions.findTokenLength(specOrBody);
   const header = getCorrectHeaderString(prDiff, issueNumber, isCurrentIssue, true);
   const repoString = `${org}/${repo} #${issueNumber}`;
   const block = [specOrBodyBlock.join(""), createHeader(header, repoString), createComment({ issueNumber, repo, org, comments }), createFooter(header)];
+  currentContextTokenCount += await context.adapters.openai.completions.findTokenLength(block.join(" "));
   if (!prDiff) {
-    return block.join("");
+    return [currentContextTokenCount, block.join("")];
   }
-  const diffBlock = [createHeader("Linked Pull Request Code Diff", repoString), prDiff, createFooter("Linked Pull Request Code Diff")];
-  return block.concat(diffBlock).join("");
+  const diffBlock = [createHeader("Linked Pull Request Code Diff", repoString), prDiff, createFooter("\nLinked Pull Request Code Diff")];
+  return [currentContextTokenCount, block.join("") + diffBlock.join("")];
 }
 
 /**

diff --git a/src/helpers/issue-fetching.ts b/src/helpers/issue-fetching.ts
@@ -1,6 +1,7 @@
+import { GithubDiff } from "github-diff-tool";
 import { createKey, getAllStreamlinedComments } from "../handlers/comments";
 import { Context } from "../types";
-import { IssueWithUser, SimplifiedComment, User } from "../types/github-types";
+import { IssueWithUser, LinkedPullsToIssue, SimplifiedComment, User } from "../types/github-types";
 import { FetchParams, Issue, Comments, LinkedIssues } from "../types/github-types";
 import { StreamlinedComment } from "../types/llm";
 import {
@@ -164,18 +165,40 @@ export async function mergeCommentsAndFetchSpec(
  * @param issue - The pull request number.
  * @returns A promise that resolves to the diff of the pull request as a string, or null if an error occurs.
  */
-export async function fetchPullRequestDiff(context: Context, org: string, repo: string, issue: number): Promise<string | null> {
+export async function fetchPullRequestDiff(context: Context, org: string, repo: string, issue: number): Promise<{ diff: string; diffSize: number }[] | null> {
   const { octokit, logger } = context;
   try {
-    const { data } = await octokit.pulls.get({
-      owner: org,
-      repo,
-      pull_number: issue,
-      mediaType: {
-        format: "diff",
-      },
-    });
-    return data as unknown as string;
+    const githubDiff = new GithubDiff(octokit);
+    //Fetch the statistics of the pull request
+    const stats = await githubDiff.getPullRequestStats(org, repo, issue);
+    //Ignore files like in dist or build or .lock files
+    const ignoredFiles = (await buildIgnoreFilesFromGitIgnore(context, org, repo)) || [];
+    const files = stats
+      .filter((file) => !ignoredFiles.some((pattern) => file.filename.includes(pattern)))
+      .map((file) => ({ filename: file.filename, diffSizeInBytes: file.diffSizeInBytes }));
+    //Fetch the diff of the files
+    const prDiffs = await Promise.all(
+      files.map(async (file) => {
+        let diff = null;
+        try {
+          diff = await githubDiff.getPullRequestDiff({
+            owner: org,
+            repo,
+            pullNumber: issue,
+            filePath: file.filename,
+          });
+        } catch {
+          logger.error(`Error fetching pull request diff for the file`, {
+            owner: org,
+            repo,
+            pull_number: issue,
+            file: file.filename,
+          });
+        }
+        return diff ? { diff: file.filename + diff, diffSize: file.diffSizeInBytes } : null;
+      })
+    );
+    return prDiffs.filter((diff): diff is { diff: string; diffSize: number } => diff !== null).sort((a, b) => a.diffSize - b.diffSize);
   } catch (error) {
     logger.error(`Error fetching pull request diff`, {
       error: error as Error,
@@ -188,10 +211,9 @@ export async function fetchPullRequestDiff(context: Context, org: string, repo:
 }
 
 /**
- * Fetches the details of a pull request.
- *
- * @param params - The parameters required to fetch the pull request, including context and other details.
- * @returns A promise that resolves to the pull request details or null if an error occurs.
+ * Fetches an issue from the GitHub API.
+ * @param params - Context
+ * @returns A promise that resolves to an issue object or null if an error occurs.
  */
 export async function fetchIssue(params: FetchParams): Promise<Issue | null> {
   const { octokit, payload, logger } = params.context;
@@ -296,3 +318,68 @@ function castCommentsToSimplifiedComments(comments: Comments, params: FetchParam
       url: comment.html_url,
     }));
 }
+
+export async function fetchLinkedPullRequests(owner: string, repo: string, issueNumber: number, context: Context) {
+  const query = `
+    query($owner: String!, $repo: String!, $issueNumber: Int!) {
+      repository(owner: $owner, name: $repo) {
+        issue(number: $issueNumber) {
+          closedByPullRequestsReferences(first: 100) {
+            nodes {
+              number
+              title
+              state
+              merged
+              url
+            }
+          }
+        }
+      }
+    }
+  `;
+
+  try {
+    const { repository } = await context.octokit.graphql<LinkedPullsToIssue>(query, {
+      owner,
+      repo,
+      issueNumber,
+    });
+    return repository.issue.closedByPullRequestsReferences.nodes;
+  } catch (error) {
+    context.logger.error(`Error fetching linked PRs from issue`, {
+      error: error as Error,
+      owner,
+      repo,
+      issueNumber,
+    });
+    return null;
+  }
+}
+
+async function buildIgnoreFilesFromGitIgnore(context: Context, owner: string, repo: string): Promise<string[] | null> {
+  try {
+    const gitignore = await context.octokit.rest.repos.getContent({
+      owner,
+      repo,
+      path: ".gitignore",
+    });
+    // Build an array of files to ignore
+    const ignoreFiles: string[] = [];
+    if ("content" in gitignore.data) {
+      const content = Buffer.from(gitignore.data.content, "base64").toString();
+      content.split("\n").forEach((line) => {
+        if (line && !line.startsWith("#")) {
+          ignoreFiles.push(line);
+        }
+      });
+    }
+    return ignoreFiles;
+  } catch (error) {
+    context.logger.error(`Error fetching .gitignore file`, {
+      error: error as Error,
+      owner,
+      repo,
+    });
+    return null;
+  }
+}
diff --git a/src/helpers/issue.ts b/src/helpers/issue.ts
@@ -166,61 +166,6 @@ export async function fetchCodeLinkedFromIssue(
     }));
 }
 
-/**
- * Optimizes the context strings by removing duplicates and sorting by information density.
- * Removes exact duplicates and sorts by information density and length.
- *
- * @param strings - The array of context strings to optimize.
- * @returns The optimized array of context strings.
- */
-export function optimizeContext(strings: string[]): string[] {
-  // Helper function to clean strings while preserving links
-  function cleanString(inputString: string): string {
-    // Preserve links by temporarily replacing them
-    const links: string[] = [];
-    inputString = inputString.replace(/https?:\/\/\S+/g, (match) => {
-      links.push(match);
-      return `__LINK${links.length - 1}__`;
-    });
-    // Clean the string
-    inputString = inputString
-      .replace(/[^\w\s-/]|_/g, "") // Remove punctuation except '-' and '/'
-      .replace(/\s+/g, " ")
-      .trim()
-      .toLowerCase();
-    // Restore links
-    inputString = inputString.replace(/__LINK(\d+)__/g, (i) => links[parseInt(i)]);
-
-    return inputString;
-  }
-  // Helper function to calculate information density
-  function informationDensity(s: string): number {
-    const words = s.split(/\s+/);
-    const uniqueWords = new Set(words);
-    return uniqueWords.size / words.length;
-  }
-  // Clean and remove empty strings
-  const cleanedStrings = strings.map(cleanString).filter((s) => s.length > 0);
-  // Remove exact duplicates
-  const uniqueStrings = Array.from(new Set(cleanedStrings));
-  // Sort strings by information density and length
-  uniqueStrings.sort((a, b) => {
-    const densityDiff = informationDensity(b) - informationDensity(a);
-    return densityDiff !== 0 ? densityDiff : b.length - a.length;
-  });
-  const result: string[] = [];
-  const wordSet = new Set<string>();
-  for (const str of uniqueStrings) {
-    const words = str.split(/\s+/);
-    const newWords = words.filter((word) => !wordSet.has(word) && !word.startsWith("http"));
-    if (newWords.length > 0 || str.includes("http")) {
-      result.push(str);
-      newWords.forEach((word) => wordSet.add(word));
-    }
-  }
-  return result;
-}
-
 /**
  * Extracts and returns the README content from the repository associated with the given issue.
  *