Skip to content

Commit

Permalink
adding deepseek direct route and prompt caching experiment (#5246)
Browse files Browse the repository at this point in the history
## Context
Adds the changes for an experiment with:
1. Prompt caching: uses the pre-computed KV cache for common context.
2. Direct routing to the Fireworks model: skips the Fireworks middle layer and
directly queries the on-demand deployment.

## Dependent PR
1. Sourcegraph backend:
sourcegraph/sourcegraph#26
2. Infra: sourcegraph/infrastructure#6266

## Test plan
Manual Testing:
1. Testing that we are hitting the direct-route endpoint for DeepSeek
2. Unit test
  • Loading branch information
hitesh-1997 committed Aug 27, 2024
1 parent 0f383a4 commit 29a8099
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 30 deletions.
14 changes: 7 additions & 7 deletions lib/shared/src/experimentation/FeatureFlagProvider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ export enum FeatureFlag {
CodyAutocompleteDeepseekV2LiteBase = 'cody-autocomplete-deepseek-v2-lite-base',

// Enable various feature flags to experiment with FIM trained fine-tuned models via Fireworks
CodyAutocompleteFIMModelExperimentBaseFeatureFlag = 'cody-autocomplete-fim-model-experiment-flag-v1',
CodyAutocompleteFIMModelExperimentControl = 'cody-autocomplete-fim-model-experiment-control-v1',
CodyAutocompleteFIMModelExperimentCurrentBest = 'cody-autocomplete-fim-model-experiment-current-best-v1',
CodyAutocompleteFIMModelExperimentVariant1 = 'cody-autocomplete-fim-model-experiment-variant-1-v1',
CodyAutocompleteFIMModelExperimentVariant2 = 'cody-autocomplete-fim-model-experiment-variant-2-v1',
CodyAutocompleteFIMModelExperimentVariant3 = 'cody-autocomplete-fim-model-experiment-variant-3-v1',
CodyAutocompleteFIMModelExperimentVariant4 = 'cody-autocomplete-fim-model-experiment-variant-4-v1',
CodyAutocompleteFIMModelExperimentBaseFeatureFlag = 'cody-autocomplete-fim-model-experiment-flag-v2',
CodyAutocompleteFIMModelExperimentControl = 'cody-autocomplete-fim-model-experiment-control-v2',
CodyAutocompleteFIMModelExperimentCurrentBest = 'cody-autocomplete-fim-model-experiment-current-best-v2',
CodyAutocompleteFIMModelExperimentVariant1 = 'cody-autocomplete-fim-model-experiment-variant-1-v2',
CodyAutocompleteFIMModelExperimentVariant2 = 'cody-autocomplete-fim-model-experiment-variant-2-v2',
CodyAutocompleteFIMModelExperimentVariant3 = 'cody-autocomplete-fim-model-experiment-variant-3-v2',
CodyAutocompleteFIMModelExperimentVariant4 = 'cody-autocomplete-fim-model-experiment-variant-4-v2',

// Enables Claude 3 if the user is in our holdout group
CodyAutocompleteClaude3 = 'cody-autocomplete-claude-3',
Expand Down
11 changes: 10 additions & 1 deletion vscode/src/completions/context/context-mixer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,9 @@ describe('ContextMixer', () => {
duration: 0,
retrieverStats: {},
strategy: 'none',
totalChars: 0,
totalChars: 8,
prefixChars: 8,
suffixChars: 0,
})
})
})
Expand Down Expand Up @@ -120,10 +122,13 @@ describe('ContextMixer', () => {
positionBitmap: 3,
retrievedItems: 2,
suggestedItems: 2,
retrieverChars: 34,
},
},
strategy: 'jaccard-similarity',
totalChars: 42,
prefixChars: 8,
suffixChars: 0,
})
})
})
Expand Down Expand Up @@ -218,16 +223,20 @@ describe('ContextMixer', () => {
positionBitmap: 0b00101,
retrievedItems: 2,
suggestedItems: 2,
retrieverChars: 36,
},
retriever2: {
duration: expect.any(Number),
positionBitmap: 0b11010,
retrievedItems: 3,
suggestedItems: 3,
retrieverChars: 92,
},
},
strategy: 'jaccard-similarity',
totalChars: 136,
prefixChars: 8,
suffixChars: 0,
})
})

Expand Down
14 changes: 13 additions & 1 deletion vscode/src/completions/context/context-mixer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,19 @@ export interface ContextSummary {
duration: number
/** Total characters of combined context snippets */
totalChars: number
/** The number of characters in the prompt used from the document prefix. */
prefixChars: number
/** The number of characters in the prompt used from the document suffix. */
suffixChars: number
/** Detailed information for each retriever that has run */
retrieverStats: {
[identifier: string]: {
/** Number of items that ended up being suggested for use by the context mixer */
suggestedItems: number
/** Number of total snippets */
retrievedItems: number
/** Number of characters in the suggested items from the retriever */
retrieverChars: number
/** Duration of the individual retriever */
duration: number
/**
Expand Down Expand Up @@ -75,7 +81,9 @@ export class ContextMixer implements vscode.Disposable {
context: [],
logSummary: {
strategy: 'none',
totalChars: 0,
totalChars: options.docContext.prefix.length + options.docContext.suffix.length,
prefixChars: options.docContext.prefix.length,
suffixChars: options.docContext.suffix.length,
duration: 0,
retrieverStats: {},
},
Expand Down Expand Up @@ -147,11 +155,13 @@ export class ContextMixer implements vscode.Disposable {
retrieverStats[retrieverId] = {
suggestedItems: 0,
positionBitmap: 0,
retrieverChars: 0,
retrievedItems:
results.find(r => r.identifier === retrieverId)?.snippets.size ?? 0,
duration: results.find(r => r.identifier === retrieverId)?.duration ?? 0,
}
}
retrieverStats[retrieverId].retrieverChars += snippet.content.length
retrieverStats[retrieverId].suggestedItems++
// Only log the position for the first 32 results to avoid overflowing the bitmap
if (position < 32) {
Expand All @@ -166,6 +176,8 @@ export class ContextMixer implements vscode.Disposable {
strategy,
duration: performance.now() - start,
totalChars,
prefixChars: options.docContext.prefix.length,
suffixChars: options.docContext.suffix.length,
retrieverStats,
}

Expand Down
2 changes: 1 addition & 1 deletion vscode/src/completions/fast-path-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ export function createFastPathClient(
stop: [...(requestParams.stopSequences || []), ...(fireworksConfig?.parameters?.stop || [])],
stream: true,
languageId: providerOptions.document.languageId,
anonymousUserID,
user: anonymousUserID,
}
const headers = new Headers(customHeaders)
// Force HTTP connection reuse to reduce latency.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,11 @@ describe('[getInlineCompletions] completion event', () => {
"artificialDelay": undefined,
"completionIntent": "function.body",
"contextSummary": {
"prefixChars": 16,
"retrieverStats": {},
"strategy": "none",
"totalChars": 0,
"suffixChars": 1,
"totalChars": 17,
},
"id": "stable-uuid",
"isFuzzyMatch": false,
Expand Down Expand Up @@ -196,9 +198,11 @@ describe('[getInlineCompletions] completion event', () => {
"artificialDelay": undefined,
"completionIntent": "return_statement",
"contextSummary": {
"prefixChars": 25,
"retrieverStats": {},
"strategy": "none",
"totalChars": 0,
"suffixChars": 1,
"totalChars": 26,
},
"id": "stable-uuid",
"isFuzzyMatch": false,
Expand Down
2 changes: 2 additions & 0 deletions vscode/src/completions/logger.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ const defaultContextSummary = {
strategy: 'none',
duration: 0.1337,
totalChars: 3,
prefixChars: 0,
suffixChars: 3,
retrieverStats: {},
} satisfies ContextSummary

Expand Down
17 changes: 8 additions & 9 deletions vscode/src/completions/providers/create-provider.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ import { createProviderConfig as createExperimentalOllamaProviderConfig } from '
import { createProviderConfig as createExperimentalOpenAICompatibleProviderConfig } from './expopenaicompatible'
import {
DEEPSEEK_CODER_V2_LITE_BASE,
DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_4096,
DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_8192,
DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_16384,
DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_32768,
DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE,
FIREWORKS_DEEPSEEK_7B_LANG_ALL,
FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V0,
FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V1,
type FireworksOptions,
createProviderConfig as createFireworksProviderConfig,
} from './fireworks'
Expand Down Expand Up @@ -167,17 +167,16 @@ async function resolveFIMModelExperimentFromFeatureFlags(): ReturnType<
),
])
if (fimModelVariant1) {
// Variant 1: Current production model with +200msec latency to quantify the effect of latency increase while keeping same quality
return { provider: 'fireworks', model: DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_4096 }
return { provider: 'fireworks', model: DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE }
}
if (fimModelVariant2) {
return { provider: 'fireworks', model: DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_8192 }
return { provider: 'fireworks', model: FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V0 }
}
if (fimModelVariant3) {
return { provider: 'fireworks', model: DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_16384 }
return { provider: 'fireworks', model: FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V1 }
}
if (fimModelVariant4) {
return { provider: 'fireworks', model: DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_32768 }
return { provider: 'fireworks', model: FIREWORKS_DEEPSEEK_7B_LANG_ALL }
}
if (fimModelCurrentBest) {
return { provider: 'fireworks', model: DEEPSEEK_CODER_V2_LITE_BASE }
Expand Down
37 changes: 28 additions & 9 deletions vscode/src/completions/providers/fireworks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,11 @@ export interface FireworksOptions {

const PROVIDER_IDENTIFIER = 'fireworks'

export const FIREWORKS_DEEPSEEK_7B_LANG_STACK_FINETUNED =
'fim-lang-specific-model-deepseek-stack-trained'
export const FIREWORKS_DEEPSEEK_7B_LANG_LOG_FINETUNED = 'fim-lang-specific-model-deepseek-logs-trained'
export const FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V0 = 'deepseek-finetuned-lang-specific-v0'
export const FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V1 = 'deepseek-finetuned-lang-specific-v1'
export const FIREWORKS_DEEPSEEK_7B_LANG_ALL = 'deepseek-finetuned-lang-all-v0'

export const DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE = 'deepseek-coder-v2-lite-base-direct-route'
export const DEEPSEEK_CODER_V2_LITE_BASE = 'deepseek-coder-v2-lite-base'

// Context window experiments with DeepSeek Model
Expand All @@ -71,9 +72,12 @@ const MODEL_MAP = {
// Fireworks model identifiers
'llama-code-13b': 'fireworks/accounts/fireworks/models/llama-v2-13b-code',

[FIREWORKS_DEEPSEEK_7B_LANG_LOG_FINETUNED]: FIREWORKS_DEEPSEEK_7B_LANG_LOG_FINETUNED,
[FIREWORKS_DEEPSEEK_7B_LANG_STACK_FINETUNED]: FIREWORKS_DEEPSEEK_7B_LANG_STACK_FINETUNED,
[FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V0]: 'finetuned-fim-lang-specific-model-ds2-v0',
[FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V1]: 'finetuned-fim-lang-specific-model-ds2-v1',
[FIREWORKS_DEEPSEEK_7B_LANG_ALL]: 'accounts/sourcegraph/models/finetuned-fim-lang-all-model-ds2-v0',
[DEEPSEEK_CODER_V2_LITE_BASE]: 'fireworks/deepseek-coder-v2-lite-base',
[DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE]:
'accounts/sourcegraph/models/deepseek-coder-v2-lite-base',
[DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_4096]: 'accounts/sourcegraph/models/deepseek-coder-v2-lite-base',
[DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_8192]: 'accounts/sourcegraph/models/deepseek-coder-v2-lite-base',
[DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_16384]:
Expand Down Expand Up @@ -101,9 +105,11 @@ function getMaxContextTokens(model: FireworksModel): number {
// Llama 2 on Fireworks supports up to 4k tokens. We're constraining it here to better
// compare the results
return 2048
case FIREWORKS_DEEPSEEK_7B_LANG_STACK_FINETUNED:
case FIREWORKS_DEEPSEEK_7B_LANG_LOG_FINETUNED:
case DEEPSEEK_CODER_V2_LITE_BASE: {
case FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V0:
case FIREWORKS_DEEPSEEK_7B_LANG_SPECIFIC_V1:
case FIREWORKS_DEEPSEEK_7B_LANG_ALL:
case DEEPSEEK_CODER_V2_LITE_BASE:
case DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE: {
return 2048
}
case DEEPSEEK_CODER_V2_LITE_BASE_WINDOW_4096:
Expand Down Expand Up @@ -137,6 +143,7 @@ class FireworksProvider extends Provider {
private fireworksConfig?: ClientConfiguration['autocompleteExperimentalFireworksOptions']
private modelHelper: DefaultModel
private anonymousUserID: string | undefined
private shouldEnableDirectRoute = false

constructor(
options: ProviderOptions,
Expand All @@ -156,6 +163,7 @@ class FireworksProvider extends Provider {
this.client = client
this.authStatus = authStatus
this.anonymousUserID = anonymousUserID
this.shouldEnableDirectRoute = this.checkIfDirectRouteShouldBeEnabled()
this.isLocalInstance = Boolean(
this.authStatus.endpoint?.includes('sourcegraph.test') ||
this.authStatus.endpoint?.includes('localhost')
Expand All @@ -181,6 +189,10 @@ class FireworksProvider extends Provider {
}
}

/**
 * Reports whether this provider's model is the DeepSeek direct-route variant.
 *
 * @returns `true` when `this.model` equals `DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE`.
 */
private checkIfDirectRouteShouldBeEnabled(): boolean {
    const directRouteModel = DEEPSEEK_CODER_V2_LITE_BASE_DIRECT_ROUTE
    return directRouteModel === this.model
}

public generateCompletions(
abortSignal: AbortSignal,
snippets: AutocompleteContextSnippet[],
Expand Down Expand Up @@ -254,7 +266,14 @@ class FireworksProvider extends Provider {
private getCustomHeaders = (): Record<string, string> => {
// Enable Fireworks tracing for Sourcegraph teammates.
// https://readme.fireworks.ai/docs/enabling-tracing
return this.authStatus.isFireworksTracingEnabled ? { 'X-Fireworks-Genie': 'true' } : {}
const customHeader: Record<string, string> = {}
if (this.authStatus.isFireworksTracingEnabled) {
customHeader['X-Fireworks-Genie'] = 'true'
}
if (this.shouldEnableDirectRoute) {
customHeader['X-Sourcegraph-Use-Direct-Route'] = 'true'
}
return customHeader
}

private createClient(
Expand Down

0 comments on commit 29a8099

Please sign in to comment.