Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: ignore file path and diffs #21

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"knip-ci": "knip --no-exit-code --reporter json --config .github/knip.ts",
"prepare": "husky install",
"test": "jest --setupFiles dotenv/config --coverage",
"worker": "wrangler dev --env dev --port 4000"
"worker": "wrangler dev --env dev --port 5000"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a specific reason that you change this? The template uses :4000 so all other plugins also, it's tedious having to change it back for no reason

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is port 4000 required by some other service/kernel? I'm currently unable to use it but can switch back if needed

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The plugin-template comes with :4000 as standard so it's habitual for plugin devs to copy paste that url into -config.yml and expecting it to work thinking something is broken for it to be the dev port lmao.

Ideally don't commit it if you want to use another port, like running multiple plugins locally for example, always change it back to the template port if you can remember to.

},
"keywords": [
"typescript",
Expand All @@ -34,6 +34,8 @@
"@supabase/supabase-js": "^2.45.4",
"@ubiquity-dao/ubiquibot-logger": "^1.3.0",
"dotenv": "^16.4.5",
"github-diff-tool": "^1.0.6",
"gpt-tokenizer": "^2.5.1",
"openai": "^4.63.0",
"typebox-validators": "0.3.5",
"voyageai": "^0.0.1-5"
Expand Down
19 changes: 12 additions & 7 deletions src/adapters/openai/helpers/completions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import OpenAI from "openai";
import { Context } from "../../../types";
import { SuperOpenAi } from "./openai";
import { CompletionsModelHelper, ModelApplications } from "../../../types/llm";
const MAX_TOKENS = 7000;
import { encode } from "gpt-tokenizer";

export interface CompletionsType {
answer: string;
Expand All @@ -28,7 +28,8 @@ export class Completions extends SuperOpenAi {
additionalContext: string[],
localContext: string[],
groundTruths: string[],
botName: string
botName: string,
maxTokens: number
): Promise<CompletionsType> {
const res: OpenAI.Chat.Completions.ChatCompletion = await this.client.chat.completions.create({
model: model,
Expand All @@ -46,10 +47,10 @@ export class Completions extends SuperOpenAi {
"Your name is : " +
botName +
"\n" +
"Primary Context: " +
additionalContext.join("\n") +
"\nLocal Context: " +
localContext.join("\n"),
"Main Context (Provide additional precedence in terms of information): " +
localContext.join("\n") +
"Secondary Context: " +
additionalContext.join("\n"),
},
],
},
Expand All @@ -64,7 +65,7 @@ export class Completions extends SuperOpenAi {
},
],
temperature: 0.2,
max_tokens: MAX_TOKENS,
max_tokens: maxTokens,
top_p: 0.5,
frequency_penalty: 0,
presence_penalty: 0,
Expand Down Expand Up @@ -117,4 +118,8 @@ export class Completions extends SuperOpenAi {

return res.choices[0].message.content;
}

async findTokenLength(prompt: string, additionalContext: string[] = [], localContext: string[] = [], groundTruths: string[] = []): Promise<number> {
return encode(prompt + additionalContext.join("\n") + localContext.join("\n") + groundTruths.join("\n")).length;
}
}
14 changes: 5 additions & 9 deletions src/handlers/ask-llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import { CommentSimilaritySearchResult } from "../adapters/supabase/helpers/comm
import { IssueSimilaritySearchResult } from "../adapters/supabase/helpers/issues";
import { recursivelyFetchLinkedIssues } from "../helpers/issue-fetching";
import { formatChatHistory } from "../helpers/format-chat-history";
import { optimizeContext } from "../helpers/issue";
import { fetchRepoDependencies, fetchRepoLanguageStats } from "./ground-truths/chat-bot";
import { findGroundTruths } from "./ground-truths/find-ground-truths";

Expand Down Expand Up @@ -39,7 +38,7 @@ export async function askQuestion(context: Context, question: string) {
export async function askGpt(context: Context, question: string, formattedChat: string[]): Promise<CompletionsType> {
const {
env: { UBIQUITY_OS_APP_NAME },
config: { model, similarityThreshold },
config: { model, similarityThreshold, maxTokens },
} = context;
let similarComments: CommentSimilaritySearchResult[] = [];
let similarIssues: IssueSimilaritySearchResult[] = [];
Expand All @@ -58,20 +57,17 @@ export async function askGpt(context: Context, question: string, formattedChat:
// Remove Null Results (Private Comments)
similarText = similarText.filter((text) => text !== null);
formattedChat = formattedChat.filter((text) => text !== null);
// Optimize the context
formattedChat = optimizeContext(formattedChat);
// ReRank the results based on the question
// const reRankedChat = formattedChat.length > 0 ? await context.adapters.voyage.reranker.reRankResults(formattedChat.filter(text => text !== ""), question, 300) : [];
similarText = similarText.filter((text) => text !== "");
const rerankedText = similarText.length > 0 ? await context.adapters.voyage.reranker.reRankResults(similarText, question) : [];

const languages = await fetchRepoLanguageStats(context);
const { dependencies, devDependencies } = await fetchRepoDependencies(context);
const groundTruths = await findGroundTruths(context, "chat-bot", {
languages,
dependencies,
devDependencies,
});

return context.adapters.openai.completions.createCompletion(question, model, rerankedText, formattedChat, groundTruths, UBIQUITY_OS_APP_NAME);
//Calculate the current context size in tokens
const numTokens = await context.adapters.openai.completions.findTokenLength(question, rerankedText, formattedChat, groundTruths);
context.logger.info(`Number of tokens: ${numTokens}`);
return context.adapters.openai.completions.createCompletion(question, model, rerankedText, formattedChat, groundTruths, UBIQUITY_OS_APP_NAME, maxTokens);
}
34 changes: 26 additions & 8 deletions src/helpers/format-chat-history.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { Context } from "../types";
import { StreamlinedComment, StreamlinedComments } from "../types/llm";
import { createKey, streamlineComments } from "../handlers/comments";
import { fetchPullRequestDiff, fetchIssue, fetchIssueComments } from "./issue-fetching";
import { fetchPullRequestDiff, fetchIssue, fetchIssueComments, fetchLinkedPullRequests } from "./issue-fetching";
import { splitKey } from "./issue";

/**
Expand All @@ -18,10 +18,13 @@ export async function formatChatHistory(
specAndBodies: Record<string, string>
): Promise<string[]> {
const keys = new Set([...Object.keys(streamlined), ...Object.keys(specAndBodies), createKey(context.payload.issue.html_url)]);
let runningTokenCount = 0;
const chatHistory = await Promise.all(
Array.from(keys).map(async (key) => {
const isCurrentIssue = key === createKey(context.payload.issue.html_url);
return createContextBlockSection(context, key, streamlined, specAndBodies, isCurrentIssue);
const [currentTokenCount, result] = await createContextBlockSection(context, key, streamlined, specAndBodies, isCurrentIssue, runningTokenCount);
runningTokenCount += currentTokenCount;
return result;
})
);
return Array.from(new Set(chatHistory));
Expand Down Expand Up @@ -65,8 +68,10 @@ async function createContextBlockSection(
key: string,
streamlined: Record<string, StreamlinedComment[]>,
specAndBodies: Record<string, string>,
isCurrentIssue: boolean
) {
isCurrentIssue: boolean,
currentContextTokenCount: number = 0
): Promise<[number, string]> {
const maxTokens = context.config.maxTokens;
let comments = streamlined[key];
if (!comments || comments.length === 0) {
const [owner, repo, number] = splitKey(key);
Expand All @@ -83,7 +88,18 @@ async function createContextBlockSection(
if (!issueNumber || isNaN(issueNumber)) {
throw context.logger.error("Issue number is not valid");
}
const prDiff = await fetchPullRequestDiff(context, org, repo, issueNumber);
const pulls = (await fetchLinkedPullRequests(org, repo, issueNumber, context)) || [];
const prDiffs = await Promise.all(pulls.map((pull) => fetchPullRequestDiff(context, org, repo, pull.number)));
let prDiff: string | null = null;
for (const pullDiff of prDiffs.flat()) {
if (currentContextTokenCount > maxTokens) break;
if (pullDiff) {
const tokenLength = await context.adapters.openai.completions.findTokenLength(pullDiff.diff);
if (currentContextTokenCount + tokenLength > maxTokens) break;
currentContextTokenCount += tokenLength;
prDiff = (prDiff ? prDiff + "\n" : "") + pullDiff.diff;
}
}
const specHeader = getCorrectHeaderString(prDiff, issueNumber, isCurrentIssue, false);
let specOrBody = specAndBodies[key];
if (!specOrBody) {
Expand All @@ -98,14 +114,16 @@ async function createContextBlockSection(
)?.body || "No specification or body available";
}
const specOrBodyBlock = [createHeader(specHeader, key), createSpecOrBody(specOrBody), createFooter(specHeader)];
currentContextTokenCount += await context.adapters.openai.completions.findTokenLength(specOrBody);
const header = getCorrectHeaderString(prDiff, issueNumber, isCurrentIssue, true);
const repoString = `${org}/${repo} #${issueNumber}`;
const block = [specOrBodyBlock.join(""), createHeader(header, repoString), createComment({ issueNumber, repo, org, comments }), createFooter(header)];
currentContextTokenCount += await context.adapters.openai.completions.findTokenLength(block.join(" "));
if (!prDiff) {
return block.join("");
return [currentContextTokenCount, block.join("")];
}
const diffBlock = [createHeader("Linked Pull Request Code Diff", repoString), prDiff, createFooter("Linked Pull Request Code Diff")];
return block.concat(diffBlock).join("");
const diffBlock = [createHeader("Linked Pull Request Code Diff", repoString), prDiff, createFooter("\nLinked Pull Request Code Diff")];
return [currentContextTokenCount, block.join("") + diffBlock.join("")];
}

/**
Expand Down
117 changes: 102 additions & 15 deletions src/helpers/issue-fetching.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { GithubDiff } from "github-diff-tool";
import { createKey, getAllStreamlinedComments } from "../handlers/comments";
import { Context } from "../types";
import { IssueWithUser, SimplifiedComment, User } from "../types/github-types";
import { IssueWithUser, LinkedPullsToIssue, SimplifiedComment, User } from "../types/github-types";
import { FetchParams, Issue, Comments, LinkedIssues } from "../types/github-types";
import { StreamlinedComment } from "../types/llm";
import {
Expand Down Expand Up @@ -164,18 +165,40 @@ export async function mergeCommentsAndFetchSpec(
* @param issue - The pull request number.
* @returns A promise that resolves to the diff of the pull request as a string, or null if an error occurs.
*/
export async function fetchPullRequestDiff(context: Context, org: string, repo: string, issue: number): Promise<string | null> {
export async function fetchPullRequestDiff(context: Context, org: string, repo: string, issue: number): Promise<{ diff: string; diffSize: number }[] | null> {
const { octokit, logger } = context;
try {
const { data } = await octokit.pulls.get({
owner: org,
repo,
pull_number: issue,
mediaType: {
format: "diff",
},
});
return data as unknown as string;
const githubDiff = new GithubDiff(octokit);
//Fetch the statistics of the pull request
const stats = await githubDiff.getPullRequestStats(org, repo, issue);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This lib is only called twice and can be done just as simply with one or two rest calls I'm sure, it's not clear if this is ideal relying on a lib for this. Nothing to hold back the PR for just an observation.

//Ignore files like in dist or build or .lock files
const ignoredFiles = (await buildIgnoreFilesFromGitIgnore(context, org, repo)) || [];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey i just realized this is pointless.

If its ignored it wont be on git or on the diff. Remove all this logic and just rely on filtering files out based on largest amount of changes to smallest.

const files = stats
.filter((file) => !ignoredFiles.some((pattern) => file.filename.includes(pattern)))
.map((file) => ({ filename: file.filename, diffSizeInBytes: file.diffSizeInBytes }));
//Fetch the diff of the files
const prDiffs = await Promise.all(
files.map(async (file) => {
let diff = null;
try {
diff = await githubDiff.getPullRequestDiff({
owner: org,
repo,
pullNumber: issue,
filePath: file.filename,
});
} catch {
logger.error(`Error fetching pull request diff for the file`, {
owner: org,
repo,
pull_number: issue,
file: file.filename,
});
}
return diff ? { diff: file.filename + diff, diffSize: file.diffSizeInBytes } : null;
})
);
return prDiffs.filter((diff): diff is { diff: string; diffSize: number } => diff !== null).sort((a, b) => a.diffSize - b.diffSize);
} catch (error) {
logger.error(`Error fetching pull request diff`, {
error: error as Error,
Expand All @@ -188,10 +211,9 @@ export async function fetchPullRequestDiff(context: Context, org: string, repo:
}

/**
* Fetches the details of a pull request.
*
* @param params - The parameters required to fetch the pull request, including context and other details.
* @returns A promise that resolves to the pull request details or null if an error occurs.
* Fetches an issue from the GitHub API.
* @param params - Context
* @returns A promise that resolves to an issue object or null if an error occurs.
*/
export async function fetchIssue(params: FetchParams): Promise<Issue | null> {
const { octokit, payload, logger } = params.context;
Expand Down Expand Up @@ -296,3 +318,68 @@ function castCommentsToSimplifiedComments(comments: Comments, params: FetchParam
url: comment.html_url,
}));
}

export async function fetchLinkedPullRequests(owner: string, repo: string, issueNumber: number, context: Context) {
const query = `
query($owner: String!, $repo: String!, $issueNumber: Int!) {
repository(owner: $owner, name: $repo) {
issue(number: $issueNumber) {
closedByPullRequestsReferences(first: 100) {
nodes {
number
title
state
merged
url
}
}
}
}
}
`;

try {
const { repository } = await context.octokit.graphql<LinkedPullsToIssue>(query, {
owner,
repo,
issueNumber,
});
return repository.issue.closedByPullRequestsReferences.nodes;
} catch (error) {
context.logger.error(`Error fetching linked PRs from issue`, {
error: error as Error,
owner,
repo,
issueNumber,
});
return null;
}
}

async function buildIgnoreFilesFromGitIgnore(context: Context, owner: string, repo: string): Promise<string[] | null> {
try {
const gitignore = await context.octokit.rest.repos.getContent({
owner,
repo,
path: ".gitignore",
});
// Build an array of files to ignore
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Get rid of all this logic too

const ignoreFiles: string[] = [];
if ("content" in gitignore.data) {
const content = Buffer.from(gitignore.data.content, "base64").toString();
content.split("\n").forEach((line) => {
if (line && !line.startsWith("#")) {
ignoreFiles.push(line);
}
});
}
return ignoreFiles;
} catch (error) {
context.logger.error(`Error fetching .gitignore file`, {
error: error as Error,
owner,
repo,
});
return null;
}
}
55 changes: 0 additions & 55 deletions src/helpers/issue.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,61 +166,6 @@ export async function fetchCodeLinkedFromIssue(
}));
}

/**
* Optimizes the context strings by removing duplicates and sorting by information density.
* Removes exact duplicates and sorts by information density and length.
*
* @param strings - The array of context strings to optimize.
* @returns The optimized array of context strings.
*/
export function optimizeContext(strings: string[]): string[] {
// Helper function to clean strings while preserving links
function cleanString(inputString: string): string {
// Preserve links by temporarily replacing them
const links: string[] = [];
inputString = inputString.replace(/https?:\/\/\S+/g, (match) => {
links.push(match);
return `__LINK${links.length - 1}__`;
});
// Clean the string
inputString = inputString
.replace(/[^\w\s-/]|_/g, "") // Remove punctuation except '-' and '/'
.replace(/\s+/g, " ")
.trim()
.toLowerCase();
// Restore links
inputString = inputString.replace(/__LINK(\d+)__/g, (i) => links[parseInt(i)]);

return inputString;
}
// Helper function to calculate information density
function informationDensity(s: string): number {
const words = s.split(/\s+/);
const uniqueWords = new Set(words);
return uniqueWords.size / words.length;
}
// Clean and remove empty strings
const cleanedStrings = strings.map(cleanString).filter((s) => s.length > 0);
// Remove exact duplicates
const uniqueStrings = Array.from(new Set(cleanedStrings));
// Sort strings by information density and length
uniqueStrings.sort((a, b) => {
const densityDiff = informationDensity(b) - informationDensity(a);
return densityDiff !== 0 ? densityDiff : b.length - a.length;
});
const result: string[] = [];
const wordSet = new Set<string>();
for (const str of uniqueStrings) {
const words = str.split(/\s+/);
const newWords = words.filter((word) => !wordSet.has(word) && !word.startsWith("http"));
if (newWords.length > 0 || str.includes("http")) {
result.push(str);
newWords.forEach((word) => wordSet.add(word));
}
}
return result;
}

/**
* Extracts and returns the README content from the repository associated with the given issue.
*
Expand Down
Loading