Skip to content

Commit

Permalink
feat: use Tika document reader
Browse files Browse the repository at this point in the history
  • Loading branch information
rajadilipkolli committed Mar 28, 2024
1 parent 96093eb commit 4931dae
Show file tree
Hide file tree
Showing 5 changed files with 20 additions and 15 deletions.
2 changes: 1 addition & 1 deletion rag-springai-openai-llm/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
</dependency>
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-pdf-document-reader</artifactId>
<artifactId>spring-ai-tika-document-reader</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.retry</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.reader.ExtractedTextFormatter;
import org.springframework.ai.reader.pdf.PagePdfDocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
import org.springframework.ai.reader.tika.TikaDocumentReader;
import org.springframework.ai.transformer.splitter.TokenTextSplitter;
import org.springframework.ai.vectorstore.VectorStore;
import org.springframework.beans.factory.annotation.Value;
Expand All @@ -18,7 +17,7 @@
public class AppConfig {
private static final Logger log = LoggerFactory.getLogger(AppConfig.class);

@Value("classpath:Rohit_Gurunath_Sharma.pdf")
@Value("classpath:Rohit_Gurunath_Sharma.docx")
private Resource resource;

@Bean
Expand All @@ -30,14 +29,11 @@ TokenTextSplitter tokenTextSplitter() {
ApplicationRunner runner(VectorStore vectorStore, JdbcTemplate template, TokenTextSplitter tokenTextSplitter) {
return args -> {
log.info("Loading file(s) as Documents");
PdfDocumentReaderConfig config = PdfDocumentReaderConfig.builder()
.withPageExtractedTextFormatter(new ExtractedTextFormatter.Builder()
.withNumberOfBottomTextLinesToDelete(3)
.withNumberOfTopPagesToSkipBeforeDelete(1)
.build())
.withPagesPerDocument(1)
ExtractedTextFormatter textFormatter = ExtractedTextFormatter.builder()
.withNumberOfBottomTextLinesToDelete(3)
.withNumberOfTopPagesToSkipBeforeDelete(1)
.build();
PagePdfDocumentReader pagePdfDocumentReader = new PagePdfDocumentReader(resource, config);
TikaDocumentReader pagePdfDocumentReader = new TikaDocumentReader(resource, textFormatter);
template.update("delete from vector_store");
vectorStore.accept(tokenTextSplitter.apply(pagePdfDocumentReader.get()));
log.info("Loaded document to database.");
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
package com.learning.ai.llmragwithspringai;

import static io.restassured.RestAssured.given;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.*;

import com.learning.ai.llmragwithspringai.config.AbstractIntegrationTest;
import io.restassured.RestAssured;
Expand All @@ -30,7 +28,18 @@ void testRag() {
.get("/api/ai/chat")
.then()
.statusCode(200)
.body("response", containsString("2007 T20 World Cup and the 2013 ICC Champions Trophy"));
.body("response", containsString("2007 T20 World Cup"))
.body("response", containsString("2013 ICC Champions Trophy"));
}

/**
 * Asks the chat endpoint who the successful IPL captain is and verifies that
 * the call succeeds with HTTP 200 and that the {@code response} field of the
 * body mentions the expected player name.
 */
@Test
void testRag2() {
    final String question = "Who is successful IPL captain";
    final String expectedName = "Rohit Sharma";

    given().param("question", question)
            .when()
            .get("/api/ai/chat")
            .then()
            .statusCode(200)
            .body("response", containsString(expectedName));
}

@Test
Expand Down

0 comments on commit 4931dae

Please sign in to comment.