From 58a32fb397abeec674a240c6cc7124f1827be902 Mon Sep 17 00:00:00 2001 From: Laura Trotta Date: Fri, 25 Oct 2024 17:17:28 +0200 Subject: [PATCH] addressed reviews, using boot autoconfig --- examples/rag-spring-article/pom.xml | 18 +-- .../elastic/clients/rag/article/Config.java | 65 --------- .../rag/article/PageContentHandler.java | 128 ------------------ .../clients/rag/article/RagService.java | 93 ++++--------- .../src/main/resources/application.properties | 8 ++ 5 files changed, 42 insertions(+), 270 deletions(-) delete mode 100644 examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/Config.java delete mode 100644 examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/PageContentHandler.java diff --git a/examples/rag-spring-article/pom.xml b/examples/rag-spring-article/pom.xml index 3bf33b87d..68b4b4126 100644 --- a/examples/rag-spring-article/pom.xml +++ b/examples/rag-spring-article/pom.xml @@ -39,27 +39,29 @@ org.springframework.ai spring-ai-bom - 1.0.0-SNAPSHOT + 1.0.0-M3 pom import + org.springframework.ai - spring-ai-elasticsearch-store + spring-ai-spring-boot-autoconfigure 1.0.0-SNAPSHOT + - org.apache.tika - tika-core - 2.9.2 + org.springframework.ai + spring-ai-elasticsearch-store + 1.0.0-SNAPSHOT - org.apache.tika - tika-parser-pdf-module - 2.9.2 + org.springframework.ai + spring-ai-pdf-document-reader + 1.0.0-SNAPSHOT diff --git a/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/Config.java b/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/Config.java deleted file mode 100644 index 1568c9dbc..000000000 --- a/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/Config.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to Elasticsearch B.V. under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch B.V. licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package co.elastic.clients.rag.article; - -import org.apache.http.Header; -import org.apache.http.HttpHost; -import org.apache.http.message.BasicHeader; -import org.elasticsearch.client.RestClient; -import org.springframework.ai.chat.model.ChatModel; -import org.springframework.ai.embedding.EmbeddingModel; -import org.springframework.ai.openai.OpenAiChatModel; -import org.springframework.ai.openai.OpenAiEmbeddingModel; -import org.springframework.ai.openai.api.OpenAiApi; -import org.springframework.ai.vectorstore.ElasticsearchVectorStore; -import org.springframework.ai.vectorstore.ElasticsearchVectorStoreOptions; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Configuration; - -@Configuration -public class Config { - - @Bean - public ElasticsearchVectorStore vectorStoreDefault(EmbeddingModel embeddingModel, RestClient restClient) { - ElasticsearchVectorStoreOptions options = new ElasticsearchVectorStoreOptions(); - return new ElasticsearchVectorStore(options,restClient, embeddingModel, true); - } - - @Bean - public EmbeddingModel embeddingModel() { - return new OpenAiEmbeddingModel(new OpenAiApi(System.getenv("OPENAI_API_KEY"))); - } - - @Bean - public ChatModel chatModel() { - return new OpenAiChatModel(new OpenAiApi(System.getenv("OPENAI_API_KEY"))); - } - - @Bean - RestClient restClient() { - - return RestClient - .builder(HttpHost.create(System.getenv("ES_SERVER_URL"))) - .setDefaultHeaders(new Header[]{ - new BasicHeader("Authorization", "ApiKey " + System.getenv("ES_API_KEY")) - }) - .build(); - } - -} diff --git a/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/PageContentHandler.java b/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/PageContentHandler.java deleted file mode 100644 index 0fb58b0ba..000000000 --- a/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/PageContentHandler.java +++ /dev/null @@ -1,128 +0,0 @@ -package co.elastic.clients.rag.article; - -import org.apache.tika.sax.ToTextContentHandler; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -// taken from: -// https://github.com/mkalus/tika-page-extractor/blob/master/src/main/java/de/auxnet/PageContentHandler.java -public class PageContentHandler extends ToTextContentHandler { - /** - * logger - */ - private static final Logger logger = LoggerFactory.getLogger(PageContentHandler.class); - - final static private String pageTag = "div"; - final static private String pageClass = "page"; - - /** - * StringBuilder of current page - */ - private StringBuilder builder; - - /** - * page counter - */ - private int pageNumber = 0; - - /** - * page map - setting the initial capacity to 500 will enhance speed by a tiny bit up to 500 bits, but will require - * more RAM - */ - private Map pages = new HashMap<>(500); - - /** - * flag telling to compress text information by stripping whitespace? - */ - private final boolean compress; - - /** - * Default constructor - */ - public PageContentHandler() { - this.compress = true; - } - - /** - * Constructor - * - * @param compress text information by stripping whitespace? - */ - public PageContentHandler(boolean compress) { - this.compress = compress; - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { - if (pageTag.endsWith(qName) && pageClass.equals(atts.getValue("class"))) - startPage(); - } - - @Override - public void endElement(String uri, String localName, String qName) throws SAXException { - if (pageTag.endsWith(qName)) - endPage(); - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - // append data - if (length > 0 && builder != null) { - builder.append(ch); - } - } - - protected void startPage() throws SAXException { - builder = new StringBuilder(); - pageNumber++; - if (logger.isDebugEnabled()) - logger.debug("Page: " + pageNumber); - } - - protected void endPage() throws SAXException { - String page = builder.toString(); - builder = new StringBuilder(); - - // if compression has been turned on, compact whitespace and trim string - if (compress) - page = page.replaceAll("\\s+", " ").trim(); - - // page number already exists? - if (pages.containsKey(pageNumber)) { - if (page.isEmpty()) return; // do not add empty pages to map - - page = pages.get(pageNumber) + " " + page; // concatenate pages - page = page.trim(); - } - - // add to page list - pages.put(pageNumber, page); - } - - /** - * @return all extracted pages - */ - public List getPages() { - List pagesReal = new ArrayList<>(pageNumber); - - // convert to list - for (int i = 1; i <= pageNumber; i++) { - String page = pages.get(i); - if (page == null) page = ""; - - pagesReal.add(page); - } - - if (logger.isDebugEnabled()) - logger.debug("Returning " + pageNumber + " page(s)."); - - return pagesReal; - } -} diff --git a/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/RagService.java b/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/RagService.java index 73e7a9912..50a76b7c8 100644 --- a/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/RagService.java +++ b/examples/rag-spring-article/src/main/java/co/elastic/clients/rag/article/RagService.java @@ -18,79 +18,34 @@ */ package co.elastic.clients.rag.article; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.AutoDetectParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.pdf.PDFParserConfig; -import org.springframework.ai.chat.messages.Message; -import org.springframework.ai.chat.messages.UserMessage; -import org.springframework.ai.chat.model.ChatModel; -import org.springframework.ai.chat.model.ChatResponse; -import org.springframework.ai.chat.prompt.Prompt; -import org.springframework.ai.chat.prompt.SystemPromptTemplate; +import org.springframework.ai.chat.client.ChatClient; import org.springframework.ai.document.Document; +import org.springframework.ai.reader.pdf.PagePdfDocumentReader; import org.springframework.ai.transformer.splitter.TokenTextSplitter; import org.springframework.ai.vectorstore.ElasticsearchVectorStore; import org.springframework.ai.vectorstore.SearchRequest; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; -import org.xml.sax.SAXException; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; import java.util.stream.Collectors; @Service public class RagService { + // Both beans autowired from default configuration private ElasticsearchVectorStore vectorStore; - private ChatModel chatModel; + private ChatClient chatClient; - @Autowired - public RagService(ElasticsearchVectorStore vectorStore, ChatModel model) { + public RagService(ElasticsearchVectorStore vectorStore, ChatClient.Builder clientBuilder) { this.vectorStore = vectorStore; - this.chatModel = model; + this.chatClient = clientBuilder.build(); } - public void ingestPDF(String path) throws IOException, TikaException, SAXException { - // Initializing the PDF parser - // Keep in mind that AutoDetectParser is not thread safe - Parser parser = new AutoDetectParser(); - // Using our custom single page handler class - PageContentHandler handler = new PageContentHandler(); + public void ingestPDF(String path) { - // No need for any other specific PDF configuration - ParseContext parseContext = new ParseContext(); - parseContext.set(PDFParserConfig.class, new PDFParserConfig()); - - // The metadata contain information such as creation date, creation tool used, etc... which we - // don't need - Metadata metadata = new Metadata(); - - // Reading the file - try (FileInputStream stream = new FileInputStream(path)) { - parser.parse(stream, handler, metadata, parseContext); - } - - // Getting the result as a list of Strings with the content of the pages - List allPages = handler.getPages(); - List docbatch = new ArrayList<>(); - - // Converting pages to Documents - for (int i = 0; i < allPages.size(); i++) { - Map docMetadata = new HashMap<>(); - // The page number will be used in the response - docMetadata.put("page", i + 1); - - Document doc = new Document(allPages.get(i), docMetadata); - docbatch.add(doc); - } + // Spring AI utility class to read a PDF file page by page + PagePdfDocumentReader pdfReader = new PagePdfDocumentReader(path); + List docbatch = pdfReader.read(); // Sending batch of documents to vector store // applying tokenizer @@ -109,31 +64,31 @@ public String queryLLM(String question) { .map(Document::getContent) .collect(Collectors.joining(System.lineSeparator())); - // Setting the prompt - String basePrompt = """ + // Setting the prompt with the context + String prompt = """ You're assisting with providing the rules of the tabletop game Runewars. - Use the information from the DOCUMENTS section to provide accurate answers. + Use the information from the DOCUMENTS section to provide accurate answers to the + question in the QUESTION section. If unsure, simply state that you don't know. DOCUMENTS: - {documents} - """; - - // Preparing the question for the LLM - SystemPromptTemplate systemPromptTemplate = new SystemPromptTemplate(basePrompt); - Message systemMessage = systemPromptTemplate.createMessage(Map.of("documents", documents)); + """ + documents + + """ + QUESTION: + """ + question; - UserMessage userMessage = new UserMessage(question); - Prompt prompt = new Prompt(List.of(systemMessage, userMessage)); // Calling the chat model with the question - ChatResponse response = chatModel.call(prompt); + String response = chatClient.prompt() + .user(prompt) + .call() + .content(); - return response.getResult().getOutput().getContent() + + return response + System.lineSeparator() + "Found at page: " + // Retrieving the first ranked page number from the document metadata - vectorStoreResult.get(0).getMetadata().get("page") + + vectorStoreResult.get(0).getMetadata().get(PagePdfDocumentReader.METADATA_START_PAGE_NUMBER) + " of the manual"; } } diff --git a/examples/rag-spring-article/src/main/resources/application.properties b/examples/rag-spring-article/src/main/resources/application.properties index 2b7cf4008..ffc9d4b5c 100644 --- a/examples/rag-spring-article/src/main/resources/application.properties +++ b/examples/rag-spring-article/src/main/resources/application.properties @@ -1 +1,9 @@ spring.application.name=rag + +spring.ai.openai.api-key=${OPENAI_API_KEY} +spring.ai.chat.client.enabled=true + +spring.elasticsearch.uris=${ES_SERVER_URL} +spring.elasticsearch.username=${ES_USERNAME} +spring.elasticsearch.password=${ES_PASSWORD} +spring.ai.vectorstore.elasticsearch.initialize-schema=true