Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenizer: Support GPT4o o200k encoding #748

Merged
merged 7 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
package com.xebia.functional.tokenizer

import com.xebia.functional.tokenizer.internal.SPECIAL_TOKENS_CL100K_BASE
import com.xebia.functional.tokenizer.internal.SPECIAL_TOKENS_O200K_BASE
import com.xebia.functional.tokenizer.internal.SPECIAL_TOKENS_P50K_EDIT
import com.xebia.functional.tokenizer.internal.SPECIAL_TOKENS_X50K_BASE
import com.xebia.functional.tokenizer.internal.cl100k_base
import com.xebia.functional.tokenizer.internal.o200k_base
import com.xebia.functional.tokenizer.internal.p50k_base
import com.xebia.functional.tokenizer.internal.r50k_base
import kotlin.io.encoding.Base64
Expand Down Expand Up @@ -43,9 +45,19 @@ enum class EncodingType(@Suppress("UNUSED_PARAMETER") name: String) {
}
},
CL100K_BASE("cl100k_base") {
override val base: String = cl100k_base
override val regex: Regex = cl100k_base_regex
override val specialTokensBase: Map<String, Int> = SPECIAL_TOKENS_CL100K_BASE
override val base: String = cl100k_base
override val regex: Regex = cl100k_base_regex
override val specialTokensBase: Map<String, Int> = SPECIAL_TOKENS_CL100K_BASE
override val encoding by lazy {
EncodingFactory.fromPredefinedParameters(
name, regex, base, specialTokensBase
)
}
},
O200K_BASE("o200k_base") {
override val base: String = o200k_base
override val regex: Regex = o200k_base_regex
override val specialTokensBase: Map<String, Int> = SPECIAL_TOKENS_O200K_BASE
override val encoding by lazy {
EncodingFactory.fromPredefinedParameters(
name, regex, base, specialTokensBase
Expand Down Expand Up @@ -73,6 +85,7 @@ private object EncodingFactory {
private fun fromParameters(parameters: GptBytePairEncodingParams): Encoding =
GptBytePairEncoding(parameters)

@OptIn(ExperimentalEncodingApi::class)
fun loadMergeableRanks(base: String): Map<ByteArray, Int> =
buildMap {
base.lineSequence().forEach { line ->
Expand All @@ -82,5 +95,6 @@ private object EncodingFactory {
}
}

expect val cl100k_base_regex: Regex
expect val p50k_regex: Regex
expect val cl100k_base_regex: Regex
expect val o200k_base_regex: Regex
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.xebia.functional.tokenizer

import com.xebia.functional.tokenizer.EncodingType.CL100K_BASE
import com.xebia.functional.tokenizer.EncodingType.O200K_BASE
import com.xebia.functional.tokenizer.EncodingType.P50K_BASE
import com.xebia.functional.tokenizer.EncodingType.R50K_BASE
import kotlin.jvm.JvmStatic
Expand Down Expand Up @@ -146,11 +147,10 @@ sealed class ModelType(
ModelType("gpt-4-vision-preview", CL100K_BASE, 128000, tokensPerMessage = 3, tokensPerName = 2, tokenPadding = 5)

object GPT_4O :
ModelType("gpt-4o", CL100K_BASE, 128000, tokensPerMessage = 3, tokensPerName = 2, tokenPadding = 5)
ModelType("gpt-4o", O200K_BASE, 128000, tokensPerMessage = 3, tokensPerName = 2, tokenPadding = 5)

object GPT_4O_2024_05_13 :
ModelType("gpt-4o-2024-05-13", CL100K_BASE, 128000, tokensPerMessage = 3, tokensPerName = 2, tokenPadding = 5)

ModelType("gpt-4o-2024-05-13", O200K_BASE, 128000, tokensPerMessage = 3, tokensPerName = 2, tokenPadding = 5)

// text
object TEXT_DAVINCI_003 : ModelType("text-davinci-003", P50K_BASE, 4097)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,8 @@ internal val SPECIAL_TOKENS_CL100K_BASE: Map<String, Int> = HashMap<String, Int>
put(FIM_SUFFIX, 100260)
put(ENDOFPROMPT, 100276)
}

// Special tokens for the o200k_base encoding (GPT-4o family).
// Token ids match OpenAI's tiktoken o200k_base definition:
// "<|endoftext|>" -> 199999, "<|endofprompt|>" -> 200018.
// Kept as a pre-sized HashMap built via apply {} for consistency with
// SPECIAL_TOKENS_CL100K_BASE above; trailing semicolons removed (unidiomatic Kotlin).
internal val SPECIAL_TOKENS_O200K_BASE: Map<String, Int> = HashMap<String, Int>(2).apply {
    put(ENDOFTEXT, 199999)
    put(ENDOFPROMPT, 200018)
}
Loading
Loading