feat: use Tika document reader

rajadilipkolli · Mar 28, 2024 · 4931dae
1 parent 96093eb
commit 4931dae
Show file tree

Hide file tree

Showing 5 changed files with 20 additions and 15 deletions.
diff --git a/rag-springai-openai-llm/pom.xml b/rag-springai-openai-llm/pom.xml
@@ -43,7 +43,7 @@
         </dependency>
         <dependency>
             <groupId>org.springframework.ai</groupId>
-            <artifactId>spring-ai-pdf-document-reader</artifactId>
+            <artifactId>spring-ai-tika-document-reader</artifactId>
         </dependency>
         <dependency>
             <groupId>org.springframework.retry</groupId>

diff --git a/...pringai-openai-llm/src/main/java/com/learning/ai/llmragwithspringai/config/AppConfig.java b/...pringai-openai-llm/src/main/java/com/learning/ai/llmragwithspringai/config/AppConfig.java
@@ -3,8 +3,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.ai.reader.ExtractedTextFormatter;
-import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
-import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
+import org.springframework.ai.reader.tika.TikaDocumentReader;
 import org.springframework.ai.transformer.splitter.TokenTextSplitter;
 import org.springframework.ai.vectorstore.VectorStore;
 import org.springframework.beans.factory.annotation.Value;
@@ -18,7 +17,7 @@
 public class AppConfig {
     private static final Logger log = LoggerFactory.getLogger(AppConfig.class);
 
-    @Value("classpath:Rohit_Gurunath_Sharma.pdf")
+    @Value("classpath:Rohit_Gurunath_Sharma.docx")
     private Resource resource;
 
     @Bean
@@ -30,14 +29,11 @@ TokenTextSplitter tokenTextSplitter() {
     ApplicationRunner runner(VectorStore vectorStore, JdbcTemplate template, TokenTextSplitter tokenTextSplitter) {
         return args -> {
             log.info("Loading file(s) as Documents");
-            PdfDocumentReaderConfig config = PdfDocumentReaderConfig.builder()
-                    .withPageExtractedTextFormatter(new ExtractedTextFormatter.Builder()
-                            .withNumberOfBottomTextLinesToDelete(3)
-                            .withNumberOfTopPagesToSkipBeforeDelete(1)
-                            .build())
-                    .withPagesPerDocument(1)
+            ExtractedTextFormatter textFormatter = ExtractedTextFormatter.builder()
+                    .withNumberOfBottomTextLinesToDelete(3)
+                    .withNumberOfTopPagesToSkipBeforeDelete(1)
                     .build();
-            PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(resource, config);
+            TikaDocumentReader pagePdfDocumentReader = new TikaDocumentReader(resource, textFormatter);
             template.update("delete from vector_store");
             vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get()));
             log.info("Loaded document to database.");

diff --git a/rag-springai-openai-llm/src/main/resources/Rohit_Gurunath_Sharma.docx b/rag-springai-openai-llm/src/main/resources/Rohit_Gurunath_Sharma.docx
diff --git a/rag-springai-openai-llm/src/main/resources/Rohit_Gurunath_Sharma.pdf b/rag-springai-openai-llm/src/main/resources/Rohit_Gurunath_Sharma.pdf
diff --git a/...rc/test/java/com/learning/ai/llmragwithspringai/LlmRagWithSpringAiApplicationIntTest.java b/...rc/test/java/com/learning/ai/llmragwithspringai/LlmRagWithSpringAiApplicationIntTest.java
@@ -1,9 +1,7 @@
 package com.learning.ai.llmragwithspringai;
 
 import static io.restassured.RestAssured.given;
-import static org.hamcrest.Matchers.containsString;
-import static org.hamcrest.Matchers.hasSize;
-import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.*;
 
 import com.learning.ai.llmragwithspringai.config.AbstractIntegrationTest;
 import io.restassured.RestAssured;
@@ -30,7 +28,18 @@ void testRag() {
                 .get("/api/ai/chat")
                 .then()
                 .statusCode(200)
-                .body("response", containsString("2007 T20 World Cup and the 2013 ICC Champions Trophy"));
+                .body("response", containsString("2007 T20 World Cup"))
+                .body("response", containsString("2013 ICC Champions Trophy"));
+    }
+
+    @Test
+    void testRag2() {
+        given().param("question", "Who is successful IPL captain")
+                .when()
+                .get("/api/ai/chat")
+                .then()
+                .statusCode(200)
+                .body("response", containsString("Rohit Sharma"));
     }
 
     @Test