Skip to content

Commit

Permalink
#10 updating the EmbeddingContentListener to automatically embed cont…
Browse files Browse the repository at this point in the history
…ent based on config
  • Loading branch information
wezell committed Jan 5, 2024
1 parent 3e4c752 commit aafdd48
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 9 deletions.
24 changes: 18 additions & 6 deletions src/main/java/com/dotcms/ai/listener/EmbeddingContentListener.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import com.dotcms.ai.db.EmbeddingsDTO;
import com.dotcms.ai.util.Logger;
import com.dotcms.content.elasticsearch.business.event.ContentletArchiveEvent;
import com.dotcms.content.elasticsearch.business.event.ContentletCheckinEvent;
import com.dotcms.content.elasticsearch.business.event.ContentletDeletedEvent;
import com.dotcms.content.elasticsearch.business.event.ContentletPublishEvent;
import com.dotcms.contenttype.model.field.Field;
Expand All @@ -16,11 +15,14 @@
import com.dotmarketing.portlets.contentlet.model.Contentlet;
import com.dotmarketing.portlets.contentlet.model.ContentletListener;
import com.dotmarketing.util.json.JSONObject;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import io.vavr.control.Try;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.TimeUnit;


public class EmbeddingContentListener implements ContentletListener<Contentlet> {
Expand All @@ -32,6 +34,10 @@ public String getId() {
}


// Per-host cache of the listener-indexer config JSON. A missing entry is loaded on demand by
// getConfigJson(Host), which reads the AppKeys.LISTENER_INDEXER setting for that host.
// Entries expire one minute after they are written; the cache is configured with a maximum size of 10,000.
LoadingCache<Host, JSONObject> cache = Caffeine.newBuilder()
.expireAfterWrite(1, TimeUnit.MINUTES)
.maximumSize(10_000)
.build(this::getConfigJson);


@Override
Expand Down Expand Up @@ -79,15 +85,15 @@ public void onDeleted(ContentletDeletedEvent<Contentlet> contentletDeletedEvent)
deleteFromIndexes(contentlet);
}


/**
* JSONObject that has a list of indexes and the content types that should be indexed in them.
*
* @param contentlet
* @param host
* @return
*/
JSONObject getConfigJson(Contentlet contentlet) {
Host host = Try.of(() -> APILocator.getHostAPI().find(contentlet.getHost(), APILocator.systemUser(), false))
.getOrElse(APILocator.systemHost());
JSONObject getConfigJson(Host host) {


return Try.of(() -> new JSONObject(ConfigService.INSTANCE.config(host).getConfig(AppKeys.LISTENER_INDEXER)))
.onFailure(e->Logger.warn(EmbeddingContentListener.class, "error in json config from app:" + e.getMessage()))
Expand All @@ -106,7 +112,9 @@ void addToIndexesIfNeeded(Contentlet contentlet) {
if (contentType == null) {
return;
}
JSONObject config = getConfigJson(contentlet);
Host host = Try.of(() -> APILocator.getHostAPI().find(contentlet.getHost(), APILocator.systemUser(), false))
.getOrElse(APILocator.systemHost());
JSONObject config = cache.get(host);

for (Entry<String, Object> entry : (Set<Entry<String, Object>>) config.entrySet()) {
final String indexName = entry.getKey();
Expand All @@ -121,6 +129,10 @@ void addToIndexesIfNeeded(Contentlet contentlet) {
}
}

/**
* If a contentlet is unpublished, we delete it from dot_embeddings, no matter which index it is part of.
* @param contentlet the contentlet whose embeddings should be removed
*/
void deleteFromIndexes(Contentlet contentlet) {
EmbeddingsDTO dto = new EmbeddingsDTO.Builder()
.withIdentifier(contentlet.getIdentifier())
Expand Down
8 changes: 5 additions & 3 deletions src/main/resources/dotAI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,14 @@ params:
type: "STRING"
label: "Auto Index Content Config"
hint: |
this is a json map that automatically maps indexes:contentType and tells dotCMS which content types should be indexed and where, e.g.
A json map that automatically maps indexes->contentTypes and tells dotCMS which content types should be indexed and where, e.g.
```
{
"default": "blog,news,webPageContent",
"blogsOnly": "blog"
"blogsOnly": "blog.blogcontent"
}
```
The list of content types is a comma separated list of glob patterns which will matched against when a contentlet is published (will be indexed) or unpublished (will be removed from the index).
This means that blog, news and webPageContent will be indexed in the `default` index and the blog field `blog.blogcontent` will be
indexed into the `blogsOnly` index. Each value is a comma-separated list of content types, and each entry can optionally
include the field that should be indexed when a contentlet is published. All unpublished content will be removed from the index.
required: false

0 comments on commit aafdd48

Please sign in to comment.