Skip to content

Commit

Permalink
#28813: applying feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
victoralfaro-dotcms committed Jul 30, 2024
1 parent 29e7807 commit 65240a8
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ private String getTextPrompt(final String prompt, final String supportingContent
}

private int countTokens(final String testString) {
return EncodingUtil.registry
return EncodingUtil.REGISTRY
.getEncodingForModel(config.get().getModel().getCurrentModel())
.map(enc -> enc.countTokens(testString))
.orElseThrow(() -> new DotRuntimeException("Encoder not found"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ public Tuple2<Integer, List<Float>> pullOrGenerateEmbeddings(final String conten
return cachedEmbeddings;
}

final List<Integer> tokens = EncodingUtil.encoding.get().encode(content);
final List<Integer> tokens = EncodingUtil.ENCODING.get().encode(content);
if (tokens.isEmpty()) {
debugLogger(this.getClass(), () -> String.format("No tokens for content ID '%s' were encoded: %s", contentId, content));
return Tuple.of(0, List.of());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ public void run() {
int totalTokens = 0;
for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) {
final String sentence = cleanContent.substring(start, end);
final int tokenCount = EncodingUtil.encoding.get().countTokens(sentence);
final int tokenCount = EncodingUtil.ENCODING.get().countTokens(sentence);
totalTokens += tokenCount;

if (totalTokens < splitAtTokens) {
Expand Down
8 changes: 7 additions & 1 deletion dotCMS/src/main/java/com/dotcms/ai/model/OpenAIModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,13 @@

import java.io.Serializable;

public class OpenAIModel implements Serializable {
/**
* Represents an OpenAI model with details such as ID, object type, creation timestamp, and owner.
* This class is immutable and uses Jackson annotations for JSON serialization and deserialization.
*
* @author vico
*/
public class OpenAIModel implements Serializable {

private final String id;
private final String object;
Expand Down
6 changes: 6 additions & 0 deletions dotCMS/src/main/java/com/dotcms/ai/model/OpenAIModels.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
import java.io.Serializable;
import java.util.List;

/**
* Represents a collection of OpenAI models with details such as the type of object and the list of models.
* This class is immutable and uses Jackson annotations for JSON serialization and deserialization.
*
* @author vico
*/
public class OpenAIModels implements Serializable {

private final String object;
Expand Down
16 changes: 10 additions & 6 deletions dotCMS/src/main/java/com/dotcms/ai/util/EncodingUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,18 @@
import com.knuddels.jtokkit.api.EncodingRegistry;
import io.vavr.Lazy;

/**
* Utility class for handling encoding operations related to AI models.
* It exposes a shared encoding registry and a lazily created encoding instance for the
* embeddings model that the ConfigService reports as current.
* The model name is read from the ConfigService once, in a static initializer.
*
* NOTE(review): this listing appears to interleave the pre-rename declarations
* (registry / model / encoding) with the post-rename ones (REGISTRY / MODEL / ENCODING);
* confirm which set is actually live before relying on either.
*/
public class EncodingUtil {

// Default encoding registry under its earlier lower-case name.
public static final EncodingRegistry registry = Encodings.newDefaultEncodingRegistry();
// Default encoding registry, built by a static initializer.
public static final EncodingRegistry REGISTRY = Encodings.newDefaultEncodingRegistry();
// Current embeddings model name, read from the ConfigService by a static initializer.
public static final String MODEL = ConfigService.INSTANCE.config().getEmbeddingsModel().getCurrentModel();
// Encoding for MODEL, wrapped in a Lazy. The bare get() on the lookup result has no fallback,
// so it presumably fails when the registry has no encoding for MODEL — TODO confirm.
public static final Lazy<Encoding> ENCODING = Lazy.of(() -> REGISTRY.getEncodingForModel(MODEL).get());

// Earlier lower-case spelling of the embeddings model name constant.
public static final String model = ConfigService.INSTANCE.config().getEmbeddingsModel().getCurrentModel();

// Earlier lower-case, non-final spelling of the lazy encoding; same lookup with the same bare get().
public static Lazy<Encoding> encoding = Lazy.of(()->
registry.getEncodingForModel(model).get()
);
// Private constructor: the class only holds static members and is not meant to be instantiated.
private EncodingUtil() {
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public void init(Object initData) {
* @return The number of tokens in the prompt, or -1 if no encoding is found for the model.
*/
public int countTokens(final String prompt) {
return EncodingUtil.registry
return EncodingUtil.REGISTRY
.getEncodingForModel(appConfig.getModel().getCurrentModel())
.map(encoding -> encoding.countTokens(prompt))
.orElse(-1);
Expand Down

0 comments on commit 65240a8

Please sign in to comment.