diff --git a/community/document-parsers/document-parser-apache-pdfbox/pom.xml b/community/document-parsers/document-parser-apache-pdfbox/pom.xml new file mode 100644 index 00000000..7a1a0b9d --- /dev/null +++ b/community/document-parsers/document-parser-apache-pdfbox/pom.xml @@ -0,0 +1,76 @@ + + + 4.0.0 + + com.alibaba.cloud.ai + spring-ai-alibaba + ${revision} + ../../../pom.xml + + + document-parser-apache-pdfbox + document-parser-apache-pdfbox + document-parser-apache-pdfbox for Spring AI Alibaba + jar + https://github.com/alibaba/spring-ai-alibaba + + https://github.com/alibaba/spring-ai-alibaba + git://github.com/alibaba/spring-ai-alibaba.git + git@github.com:alibaba/spring-ai-alibaba.git + + + + 17 + 17 + UTF-8 + 2.0.32 + + + + + com.alibaba.cloud.ai + spring-ai-alibaba-core + ${project.parent.version} + + + + org.apache.pdfbox + pdfbox + ${pdfbox.version} + + + commons-logging + commons-logging + + + + + + + org.springframework.ai + spring-ai-test + test + + + + org.springframework.boot + spring-boot-starter-test + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.assertj + assertj-core + test + + + + \ No newline at end of file diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParser.java b/community/document-parsers/document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParser.java new file mode 100644 index 00000000..a8b6f257 --- /dev/null +++ b/community/document-parsers/document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParser.java @@ -0,0 +1,60 @@ +package com.alibaba.cloud.ai.parser.apache.pdfbox; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.ai.document.Document; +import org.springframework.util.Assert; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @author HeYQ + * @since 2024-12-08 22:34 + */ + +public class ApachePdfBoxDocumentParser implements DocumentParser { + + private final boolean includeMetadata; + + public ApachePdfBoxDocumentParser() { + this(false); + } + + public ApachePdfBoxDocumentParser(boolean includeMetadata) { + this.includeMetadata = includeMetadata; + } + + @Override + public List parse(InputStream inputStream) { + try (PDDocument pdfDocument = PDDocument.load(inputStream)) { + PDFTextStripper stripper = new PDFTextStripper(); + String text = stripper.getText(pdfDocument); + Assert.notNull(text, "Text cannot be null"); + return includeMetadata ? Collections.singletonList(new Document(text, toMetadata(pdfDocument))) + : Collections.singletonList(new Document(text)); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Map toMetadata(PDDocument pdDocument) { + PDDocumentInformation documentInformation = pdDocument.getDocumentInformation(); + Map metadata = new HashMap<>(); + for (String metadataKey : documentInformation.getMetadataKeys()) { + String value = documentInformation.getCustomMetadataValue(metadataKey); + if (value != null) { + metadata.put(metadataKey, value); + } + } + return metadata; + } + +} diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/test/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParserTest.java b/community/document-parsers/document-parser-apache-pdfbox/src/test/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParserTest.java new file mode 100644 index 00000000..ec7f4e4a --- /dev/null +++ b/community/document-parsers/document-parser-apache-pdfbox/src/test/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParserTest.java @@ -0,0 +1,45 @@ +package com.alibaba.cloud.ai.parser.apache.pdfbox; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.junit.jupiter.api.Test; +import org.springframework.ai.document.Document; + +import java.io.IOException; +import java.io.InputStream; + +import static org.assertj.core.api.Assertions.assertThat; + +class ApachePdfBoxDocumentParserTest { + + @Test + void should_parse_pdf_file() { + try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file.pdf")) { + DocumentParser parser = new ApachePdfBoxDocumentParser(); + Document document = parser.parse(inputStream).get(0); + + assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content"); + assertThat(document.getMetadata()).isEmpty(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Test + void should_parse_pdf_file_include_metadata() { + try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file.pdf")) { + DocumentParser parser = new ApachePdfBoxDocumentParser(true); + Document document = parser.parse(inputStream).get(0); + + assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content"); + assertThat(document.getMetadata()).containsEntry("Author", "ljuba") + .containsEntry("Creator", "WPS Writer") + .containsEntry("CreationDate", "D:20230608171011+15'10'") + .containsEntry("SourceModified", "D:20230608171011+15'10'"); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + +} \ No newline at end of file diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/blank-file.pdf b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/blank-file.pdf new file mode 100644 index 00000000..757bb1f3 Binary files /dev/null and b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/blank-file.pdf differ diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/test-file.pdf b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/test-file.pdf new file mode 100644 index 00000000..920f1fce Binary files /dev/null and b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/test-file.pdf differ diff --git a/community/document-parsers/document-parser-markdown/pom.xml b/community/document-parsers/document-parser-markdown/pom.xml new file mode 100644 index 00000000..0b8b47ce --- /dev/null +++ b/community/document-parsers/document-parser-markdown/pom.xml @@ -0,0 +1,71 @@ + + + 4.0.0 + + com.alibaba.cloud.ai + spring-ai-alibaba + ${revision} + ../../../pom.xml + + + document-parser-markdown + document-parser-markdown + document-parser-markdown for Spring AI Alibaba + jar + https://github.com/alibaba/spring-ai-alibaba + + https://github.com/alibaba/spring-ai-alibaba + git://github.com/alibaba/spring-ai-alibaba.git + git@github.com:alibaba/spring-ai-alibaba.git + + + + + 17 + 17 + UTF-8 + 0.22.0 + + + + + com.alibaba.cloud.ai + spring-ai-alibaba-core + ${project.parent.version} + + + + org.commonmark + commonmark + ${commonmark.version} + + + + + org.springframework.ai + spring-ai-test + test + + + + org.springframework.boot + spring-boot-starter-test + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.assertj + assertj-core + test + + + + \ No newline at end of file diff --git a/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParser.java b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParser.java new file mode 100644 index 00000000..56ec3dae --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParser.java @@ -0,0 +1,206 @@ +package com.alibaba.cloud.ai.parser.markdown; + +import com.alibaba.cloud.ai.document.DocumentParser; +import com.alibaba.cloud.ai.parser.markdown.config.MarkdownDocumentParserConfig; +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.BlockQuote; +import org.commonmark.node.Code; +import org.commonmark.node.FencedCodeBlock; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Heading; +import org.commonmark.node.ListItem; +import org.commonmark.node.Node; +import org.commonmark.node.SoftLineBreak; +import org.commonmark.node.Text; +import org.commonmark.node.ThematicBreak; +import org.commonmark.parser.Parser; +import org.springframework.ai.document.Document; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +/** + * @author HeYQ + * @since 2024-12-08 21:32 + */ + +public class MarkdownDocumentParser implements DocumentParser { + + /** + * Configuration to a parsing process. + */ + private final MarkdownDocumentParserConfig config; + + /** + * Markdown parser. + */ + private final Parser parser; + + public MarkdownDocumentParser() { + this(MarkdownDocumentParserConfig.defaultConfig()); + } + + /** + * Create a new {@link MarkdownDocumentParser} instance. + * + */ + public MarkdownDocumentParser(MarkdownDocumentParserConfig config) { + this.config = config; + this.parser = Parser.builder().build(); + } + + @Override + public List parse(InputStream inputStream) { + try (var input = inputStream) { + Node node = this.parser.parseReader(new InputStreamReader(input)); + + DocumentVisitor documentVisitor = new DocumentVisitor(this.config); + node.accept(documentVisitor); + + return documentVisitor.getDocuments(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * A convenient class for visiting handled nodes in the Markdown document. + */ + static class DocumentVisitor extends AbstractVisitor { + + private final List documents = new ArrayList<>(); + + private final List currentParagraphs = new ArrayList<>(); + + private final MarkdownDocumentParserConfig config; + + private Document.Builder currentDocumentBuilder; + + DocumentVisitor(MarkdownDocumentParserConfig config) { + this.config = config; + } + + /** + * Visits the document node and initializes the current document builder. + */ + @Override + public void visit(org.commonmark.node.Document document) { + this.currentDocumentBuilder = Document.builder(); + super.visit(document); + } + + @Override + public void visit(Heading heading) { + buildAndFlush(); + super.visit(heading); + } + + @Override + public void visit(ThematicBreak thematicBreak) { + if (this.config.horizontalRuleCreateDocument) { + buildAndFlush(); + } + super.visit(thematicBreak); + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + translateLineBreakToSpace(); + super.visit(softLineBreak); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + translateLineBreakToSpace(); + super.visit(hardLineBreak); + } + + @Override + public void visit(ListItem listItem) { + translateLineBreakToSpace(); + super.visit(listItem); + } + + @Override + public void visit(BlockQuote blockQuote) { + if (!this.config.includeBlockquote) { + buildAndFlush(); + } + + translateLineBreakToSpace(); + this.currentDocumentBuilder.withMetadata("category", "blockquote"); + super.visit(blockQuote); + } + + @Override + public void visit(Code code) { + this.currentParagraphs.add(code.getLiteral()); + this.currentDocumentBuilder.withMetadata("category", "code_inline"); + super.visit(code); + } + + @Override + public void visit(FencedCodeBlock fencedCodeBlock) { + if (!this.config.includeCodeBlock) { + buildAndFlush(); + } + + translateLineBreakToSpace(); + this.currentParagraphs.add(fencedCodeBlock.getLiteral()); + this.currentDocumentBuilder.withMetadata("category", "code_block"); + this.currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo()); + + buildAndFlush(); + + super.visit(fencedCodeBlock); + } + + @Override + public void visit(Text text) { + if (text.getParent() instanceof Heading heading) { + this.currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel())) + .withMetadata("title", text.getLiteral()); + } + else { + this.currentParagraphs.add(text.getLiteral()); + } + + super.visit(text); + } + + public List getDocuments() { + buildAndFlush(); + + return this.documents; + } + + private void buildAndFlush() { + if (!this.currentParagraphs.isEmpty()) { + String content = String.join("", this.currentParagraphs); + + Document.Builder builder = this.currentDocumentBuilder.withContent(content); + + this.config.additionalMetadata.forEach(builder::withMetadata); + + Document document = builder.build(); + + this.documents.add(document); + + this.currentParagraphs.clear(); + } + this.currentDocumentBuilder = Document.builder(); + } + + private void translateLineBreakToSpace() { + if (!this.currentParagraphs.isEmpty()) { + this.currentParagraphs.add(" "); + } + } + + } + +} diff --git a/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/config/MarkdownDocumentParserConfig.java b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/config/MarkdownDocumentParserConfig.java new file mode 100644 index 00000000..1db101e7 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/config/MarkdownDocumentParserConfig.java @@ -0,0 +1,122 @@ +package com.alibaba.cloud.ai.parser.markdown.config; + +import org.springframework.ai.document.Document; +import org.springframework.util.Assert; + +import java.util.HashMap; +import java.util.Map; + +/** + * @author HeYQ + * @since 2024-12-08 21:38 + */ + +public class MarkdownDocumentParserConfig { + + public final boolean horizontalRuleCreateDocument; + + public final boolean includeCodeBlock; + + public final boolean includeBlockquote; + + public final Map additionalMetadata; + + public MarkdownDocumentParserConfig(Builder builder) { + this.horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; + this.includeCodeBlock = builder.includeCodeBlock; + this.includeBlockquote = builder.includeBlockquote; + this.additionalMetadata = builder.additionalMetadata; + } + + /** + * @return the default configuration + */ + public static MarkdownDocumentParserConfig defaultConfig() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static final class Builder { + + private boolean horizontalRuleCreateDocument = false; + + private boolean includeCodeBlock = false; + + private boolean includeBlockquote = false; + + private Map additionalMetadata = new HashMap<>(); + + private Builder() { + } + + /** + * Text divided by horizontal lines will create new {@link Document}s. The default + * is {@code false}, meaning text separated by horizontal lines won't create a new + * document. + * @param horizontalRuleCreateDocument flag to determine whether new documents are + * created from text divided by horizontal line + * @return this builder + */ + public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) { + this.horizontalRuleCreateDocument = horizontalRuleCreateDocument; + return this; + } + + /** + * Whatever to include code blocks in {@link Document}s. The default is + * {@code false}, which means all code blocks are in separate documents. + * @param includeCodeBlock flag to include code block into paragraph document or + * create new with code only + * @return this builder + */ + public Builder withIncludeCodeBlock(boolean includeCodeBlock) { + this.includeCodeBlock = includeCodeBlock; + return this; + } + + /** + * Whatever to include blockquotes in {@link Document}s. The default is + * {@code false}, which means all blockquotes are in separate documents. + * @param includeBlockquote flag to include blockquotes into paragraph document or + * create new with blockquote only + * @return this builder + */ + public Builder withIncludeBlockquote(boolean includeBlockquote) { + this.includeBlockquote = includeBlockquote; + return this; + } + + /** + * Adds this additional metadata to the all built {@link Document}s. + * @return this builder + */ + public Builder withAdditionalMetadata(String key, Object value) { + Assert.notNull(key, "key must not be null"); + Assert.notNull(value, "value must not be null"); + this.additionalMetadata.put(key, value); + return this; + } + + /** + * Adds this additional metadata to the all built {@link Document}s. + * @return this builder + */ + public Builder withAdditionalMetadata(Map additionalMetadata) { + Assert.notNull(additionalMetadata, "additionalMetadata must not be null"); + this.additionalMetadata = additionalMetadata; + return this; + } + + /** + * @return the immutable configuration + */ + public MarkdownDocumentParserConfig build() { + return new MarkdownDocumentParserConfig(this); + } + + } + +} diff --git a/community/document-parsers/document-parser-markdown/src/test/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParserTest.java b/community/document-parsers/document-parser-markdown/src/test/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParserTest.java new file mode 100644 index 00000000..230af67a --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParserTest.java @@ -0,0 +1,263 @@ +/* + * Copyright 2023-2024 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.cloud.ai.parser.markdown; + +import com.alibaba.cloud.ai.parser.markdown.config.MarkdownDocumentParserConfig; +import org.junit.jupiter.api.Test; +import org.springframework.ai.document.Document; +import org.springframework.core.io.DefaultResourceLoader; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.groups.Tuple.tuple; + +/** + * @author HeYQ + * @since 2024-12-08 21:38 + */ +class MarkdownDocumentParserTest { + + @Test + void testOnlyHeadersWithParagraphs() throws IOException { + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/only-headers.md").getInputStream()); + + assertThat(documents).hasSize(4) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_1", "title", "Header 1a"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "header_1", "title", "Header 1b"), + "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."), + tuple(Map.of("category", "header_2", "title", "Header 2b"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."), + tuple(Map.of("category", "header_2", "title", "Header 2c"), + "Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + + @Test + void testWithFormatting() throws IOException { + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/with-formatting.md").getInputStream()); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_1", "title", "This is a fancy header name"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of("category", "header_3", "title", "Header 3"), + "Aenean eu leo eu nibh tristique posuere quis quis massa.")); + } + + @Test + void testDocumentDividedViaHorizontalRules() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/horizontal-rules.md").getInputStream()); + + assertThat(documents).hasSize(7) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida."), + tuple(Map.of(), + "Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of(), + "Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna."), + tuple(Map.of(), + "Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit amet sagittis."), + tuple(Map.of(), + "Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."), + tuple(Map.of(), + "Aenean quis vulputate mi. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus."), + tuple(Map.of(), "Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi.")); + } + + @Test + void testDocumentNotDividedViaHorizontalRulesWhenIsDisabled() throws IOException { + + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(false) + .build(); + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/horizontal-rules.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEmpty(); + assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit") + .endsWith("Phasellus eget tellus sed nibh ornare interdum eu eu mi."); + } + + @Test + void testSimpleMarkdownDocumentWithHardAndSoftLineBreaks() throws IOException { + + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/simple.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEmpty(); + assertThat(documentsFirst.getContent()).isEqualTo( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus.Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."); + } + + @Test + void testCode() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/code.md").getInputStream()); + + assertThat(documents).satisfiesExactly(document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of()); + assertThat(document.getContent()).isEqualTo("This is a Java sample application:"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block")); + assertThat(document.getContent()).startsWith("package com.example.demo;") + .contains("SpringApplication.run(DemoApplication.class, args);"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline")); + assertThat(document.getContent()).isEqualTo( + "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of()); + assertThat(document.getContent()) + .isEqualTo("Another possibility is to set block code without specific highlighting:"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block")); + assertThat(document.getContent()).isEqualTo("./mvnw spring-javaformat:apply\n"); + }); + } + + @Test + void testCodeWhenCodeBlockShouldNotBeSeparatedDocument() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(true) + .withIncludeCodeBlock(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/code.md").getInputStream()); + + assertThat(documents).satisfiesExactly(document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block")); + assertThat(document.getContent()).startsWith("This is a Java sample application: package com.example.demo") + .contains("SpringApplication.run(DemoApplication.class, args);"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline")); + assertThat(document.getContent()).isEqualTo( + "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block")); + assertThat(document.getContent()).isEqualTo( + "Another possibility is to set block code without specific highlighting: ./mvnw spring-javaformat:apply\n"); + }); + } + + @Test + void testBlockquote() throws IOException { + + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/blockquote.md").getInputStream()); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "blockquote"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + + @Test + void testBlockquoteWhenBlockquoteShouldNotBeSeparatedDocument() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withIncludeBlockquote(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/blockquote.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("category", "blockquote")); + assertThat(documentsFirst.getContent()).isEqualTo( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."); + } + + @Test + void testLists() throws IOException { + + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/lists.md").getInputStream()); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_2", "title", "Ordered list"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor."), + tuple(Map.of("category", "header_2", "title", "Unordered list"), + "Aenean eu leo eu nibh tristique posuere quis quis massa. Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus tortor. Etiam facilisis enim in egestas dictum.")); + } + + @Test + void testWithAdditionalMetadata() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withAdditionalMetadata("service", "some-service-name") + .withAdditionalMetadata("env", "prod") + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/simple.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("service", "some-service-name", "env", "prod")); + assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit."); + } + +} diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/blockquote.md b/community/document-parsers/document-parser-markdown/src/test/resources/blockquote.md new file mode 100644 index 00000000..d92ac44f --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/blockquote.md @@ -0,0 +1,8 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget +> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a +> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum +> suscipit. + diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/code.md b/community/document-parsers/document-parser-markdown/src/test/resources/code.md new file mode 100644 index 00000000..31d7c7b0 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/code.md @@ -0,0 +1,25 @@ +This is a Java sample application: + +```java +package com.example.demo; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + +@SpringBootApplication +public class DemoApplication { + public static void main(String[] args) { + SpringApplication.run(DemoApplication.class, args); + } +} +``` + +Markdown also provides the possibility to `use inline code formatting throughout` the entire sentence. + +--- + +Another possibility is to set block code without specific highlighting: + +``` +./mvnw spring-javaformat:apply +``` diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/horizontal-rules.md b/community/document-parsers/document-parser-markdown/src/test/resources/horizontal-rules.md new file mode 100644 index 00000000..f7affefc --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/horizontal-rules.md @@ -0,0 +1,27 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. + +--- + +Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu +elementum dignissim. + +*** +Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis +et magna. + +* * * + +Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit +amet sagittis. + +***** + +Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula. + +--------------------------------------- + +Aenean quis vulputate mi. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus. + +- - - + +Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi. diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/lists.md b/community/document-parsers/document-parser-markdown/src/test/resources/lists.md new file mode 100644 index 00000000..f82e7e34 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/lists.md @@ -0,0 +1,17 @@ +## Ordered list + +1. Lorem ipsum dolor sit *amet*, consectetur adipiscing elit. **Curabitur** diam eros, laoreet sit _amet_ cursus vitae, + varius sed nisi. +2. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. +3. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget + sapien odio. + 1. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum + suscipit. + 2. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. + +## Unordered list + +* Aenean eu leo eu nibh tristique posuere quis quis massa. +* Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus + tortor. + * Etiam facilisis enim in egestas dictum. diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/only-headers.md b/community/document-parsers/document-parser-markdown/src/test/resources/only-headers.md new file mode 100644 index 00000000..81c770e8 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/only-headers.md @@ -0,0 +1,20 @@ +# Header 1a + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +# Header 1b + +Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed +sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh. + +## Header 2b + +Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien +odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. + +# Header 1c + +## Header 2c + +Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/simple.md b/community/document-parsers/document-parser-markdown/src/test/resources/simple.md new file mode 100644 index 00000000..3275c89b --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/simple.md @@ -0,0 +1,8 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim. + +Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus. + +Aenean eu leo eu nibh tristique posuere quis quis massa.\ +Nullam lacinia luctus sem ut vehicula. + diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/with-formatting.md b/community/document-parsers/document-parser-markdown/src/test/resources/with-formatting.md new file mode 100644 index 00000000..963743ec --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/with-formatting.md @@ -0,0 +1,9 @@ +# This is a fancy header name + +Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum*** +dignissim. + +### Header 3 + +Aenean eu leo eu nibh tristique _posuere quis quis massa_. diff --git a/community/document-parsers/document-parser-tika/pom.xml b/community/document-parsers/document-parser-tika/pom.xml new file mode 100644 index 00000000..93492ee8 --- /dev/null +++ b/community/document-parsers/document-parser-tika/pom.xml @@ -0,0 +1,77 @@ + + + 4.0.0 + + com.alibaba.cloud.ai + spring-ai-alibaba + ${revision} + ../../../pom.xml + + + document-parser-tika + document-parser-tika + document-parser-tika for Spring AI Alibaba + jar + https://github.com/alibaba/spring-ai-alibaba + + https://github.com/alibaba/spring-ai-alibaba + git://github.com/alibaba/spring-ai-alibaba.git + git@github.com:alibaba/spring-ai-alibaba.git + + + + 17 + 17 + UTF-8 + 2.9.1 + + + + + com.alibaba.cloud.ai + spring-ai-alibaba-core + ${project.parent.version} + + + + org.apache.tika + tika-core + ${apache-tika.version} + + + + org.apache.tika + tika-parsers-standard-package + ${apache-tika.version} + + + + + + org.springframework.ai + spring-ai-test + test + + + + org.springframework.boot + spring-boot-starter-test + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.assertj + assertj-core + test + + + + \ No newline at end of file diff --git a/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java b/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java new file mode 100644 index 00000000..b3a8a79a --- /dev/null +++ b/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java @@ -0,0 +1,132 @@ +package com.alibaba.cloud.ai.parser.tika; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.springframework.ai.document.Document; +import org.springframework.ai.reader.ExtractedTextFormatter; +import org.xml.sax.ContentHandler; + +import java.io.InputStream; +import java.util.Collections; +import java.util.List; +import java.util.Objects; +import java.util.function.Supplier; + +/** + * Parses files into {@link Document}s using Apache Tika library, automatically detecting + * the file format. This parser supports various file formats, including PDF, DOC, PPT, + * XLS. For detailed information on supported formats, please refer to the + * Apache Tika documentation. + * + * @author HeYQ + * @since 2024-12-02 11:32 + */ + +public class TikaDocumentParser implements DocumentParser { + + private static final int NO_WRITE_LIMIT = -1; + + public static final Supplier DEFAULT_PARSER_SUPPLIER = AutoDetectParser::new; + + public static final Supplier DEFAULT_METADATA_SUPPLIER = Metadata::new; + + public static final Supplier DEFAULT_PARSE_CONTEXT_SUPPLIER = ParseContext::new; + + public static final Supplier DEFAULT_CONTENT_HANDLER_SUPPLIER = () -> new BodyContentHandler( + NO_WRITE_LIMIT); + + private final Supplier parserSupplier; + + private final Supplier contentHandlerSupplier; + + private final Supplier metadataSupplier; + + private final Supplier parseContextSupplier; + + private final ExtractedTextFormatter textFormatter; + + public TikaDocumentParser() { + this((Supplier) null, null, null, null, ExtractedTextFormatter.defaults()); + } + + public TikaDocumentParser(ExtractedTextFormatter textFormatter) { + this((Supplier) null, null, null, null, textFormatter); + } + + public TikaDocumentParser(Supplier contentHandlerSupplier, ExtractedTextFormatter textFormatter) { + this((Supplier) null, contentHandlerSupplier, null, null, textFormatter); + } + + public TikaDocumentParser(Supplier parserSupplier, Supplier contentHandlerSupplier, + Supplier metadataSupplier, Supplier parseContextSupplier) { + this(parserSupplier, contentHandlerSupplier, metadataSupplier, parseContextSupplier, + ExtractedTextFormatter.defaults()); + } + + /** + * Creates an instance of an {@code ApacheTikaDocumentParser} with the provided + * suppliers for Tika components. If some of the suppliers are not provided + * ({@code null}), the defaults will be used. + * @param parserSupplier Supplier for Tika parser to use. Default: + * {@link AutoDetectParser} + * @param contentHandlerSupplier Supplier for Tika content handler. Default: + * {@link BodyContentHandler} without write limit + * @param metadataSupplier Supplier for Tika metadata. Default: empty {@link Metadata} + * @param parseContextSupplier Supplier for Tika parse context. Default: empty + * {@link ParseContext} + * @param textFormatter Formatter for extracted text. Default: + * {@link ExtractedTextFormatter#defaults()} + */ + public TikaDocumentParser(Supplier parserSupplier, Supplier contentHandlerSupplier, + Supplier metadataSupplier, Supplier parseContextSupplier, + ExtractedTextFormatter textFormatter) { + this.parserSupplier = getOrDefault(parserSupplier, () -> DEFAULT_PARSER_SUPPLIER); + this.contentHandlerSupplier = getOrDefault(contentHandlerSupplier, () -> DEFAULT_CONTENT_HANDLER_SUPPLIER); + this.metadataSupplier = getOrDefault(metadataSupplier, () -> DEFAULT_METADATA_SUPPLIER); + this.parseContextSupplier = getOrDefault(parseContextSupplier, () -> DEFAULT_PARSE_CONTEXT_SUPPLIER); + this.textFormatter = textFormatter; + } + + @Override + public List parse(InputStream inputStream) { + try { + Parser parser = parserSupplier.get(); + ContentHandler contentHandler = contentHandlerSupplier.get(); + Metadata metadata = metadataSupplier.get(); + ParseContext parseContext = parseContextSupplier.get(); + + parser.parse(inputStream, contentHandler, metadata, parseContext); + String text = contentHandler.toString(); + + if (Objects.isNull(text)) { + throw new ZeroByteFileException("The content is blank!"); + } + + return Collections.singletonList(toDocument(text)); + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + + /** + * Converts the given text to a {@link Document}. + * @param docText Text to be converted + * @return Converted document + */ + private Document toDocument(String docText) { + docText = Objects.requireNonNullElse(docText, ""); + docText = this.textFormatter.format(docText); + return new Document(docText); + } + + private static T getOrDefault(T value, Supplier defaultValueSupplier) { + return value != null ? value : defaultValueSupplier.get(); + } + +} diff --git a/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java b/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java new file mode 100644 index 00000000..bf98c0cf --- /dev/null +++ b/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java @@ -0,0 +1,72 @@ +package com.alibaba.cloud.ai.parser.tika; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.apache.tika.exception.ZeroByteFileException; +import org.apache.tika.parser.AutoDetectParser; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; +import org.springframework.ai.document.Document; + +import java.io.InputStream; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +class ApacheTikaDocumentParserTest { + + @ParameterizedTest + @ValueSource(strings = { "test-file.doc", "test-file.docx", "test-file.ppt", "test-file.pptx", "test-file.pdf" }) + void should_parse_doc_ppt_and_pdf_files(String fileName) { + + DocumentParser parser = new TikaDocumentParser(); + InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName); + + Document document = parser.parse(inputStream).get(0); + + assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content"); + assertThat(document.getMetadata()).isEmpty(); + } + + @ParameterizedTest + @ValueSource(strings = { "test-file.xls", "test-file.xlsx" }) + void should_parse_xls_files(String fileName) { + + DocumentParser parser = new TikaDocumentParser(AutoDetectParser::new, null, null, null); + InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName); + + Document document = parser.parse(inputStream).get(0); + + assertThat(document.getContent()).isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content"); + assertThat(document.getMetadata()).isEmpty(); + } + + @Test + void should_parse_files_stateless() { + + DocumentParser parser = new TikaDocumentParser(); + InputStream inputStream1 = getClass().getClassLoader().getResourceAsStream("test-file.xls"); + InputStream inputStream2 = getClass().getClassLoader().getResourceAsStream("test-file.xls"); + + Document document1 = parser.parse(inputStream1).get(0); + Document document2 = parser.parse(inputStream2).get(0); + + assertThat(document1.getContent()).isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content"); + assertThat(document2.getContent()).isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content"); + assertThat(document1.getMetadata()).isEmpty(); + assertThat(document2.getMetadata()).isEmpty(); + } + + @ParameterizedTest + @ValueSource(strings = { "empty-file.txt", "blank-file.txt", "blank-file.docx", "blank-file.pptx" + // "blank-file.xlsx" TODO + }) + void should_throw_BlankDocumentException(String fileName) { + + DocumentParser parser = new TikaDocumentParser(); + InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName); + + assertThatThrownBy(() -> parser.parse(inputStream)).isExactlyInstanceOf(ZeroByteFileException.class); + } + +} \ No newline at end of file diff --git a/community/document-parsers/document-parser-tika/src/test/resources/blank-file.docx b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.docx new file mode 100644 index 00000000..f211f65f Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.docx differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/blank-file.pptx b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.pptx new file mode 100644 index 00000000..beb1acaf Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.pptx differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/blank-file.txt b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.txt new file mode 100644 index 00000000..e69de29b diff --git a/community/document-parsers/document-parser-tika/src/test/resources/blank-file.xlsx b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.xlsx new file mode 100644 index 00000000..998e5c76 Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/blank-file.xlsx differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/empty-file.txt b/community/document-parsers/document-parser-tika/src/test/resources/empty-file.txt new file mode 100644 index 00000000..e69de29b diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.doc b/community/document-parsers/document-parser-tika/src/test/resources/test-file.doc new file mode 100644 index 00000000..f4f15a04 Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.doc differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.docx b/community/document-parsers/document-parser-tika/src/test/resources/test-file.docx new file mode 100644 index 00000000..57f47fd0 Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.docx differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.pdf b/community/document-parsers/document-parser-tika/src/test/resources/test-file.pdf new file mode 100644 index 00000000..920f1fce Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.pdf differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.ppt b/community/document-parsers/document-parser-tika/src/test/resources/test-file.ppt new file mode 100644 index 00000000..02a35da1 Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.ppt differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.pptx b/community/document-parsers/document-parser-tika/src/test/resources/test-file.pptx new file mode 100644 index 00000000..9e20e3c7 Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.pptx differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.xls b/community/document-parsers/document-parser-tika/src/test/resources/test-file.xls new file mode 100644 index 00000000..49847c0f Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.xls differ diff --git a/community/document-parsers/document-parser-tika/src/test/resources/test-file.xlsx b/community/document-parsers/document-parser-tika/src/test/resources/test-file.xlsx new file mode 100644 index 00000000..a2ba64dd Binary files /dev/null and b/community/document-parsers/document-parser-tika/src/test/resources/test-file.xlsx differ diff --git a/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java b/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java index 01204571..5d3253d6 100644 --- a/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java +++ b/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java @@ -4,13 +4,14 @@ import org.springframework.ai.document.Document; import org.springframework.ai.document.DocumentReader; -import org.springframework.ai.reader.ExtractedTextFormatter; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; -import com.alibaba.cloud.ai.reader.DocumentParser; +import com.alibaba.cloud.ai.document.DocumentParser; /** * @author HeYQ @@ -18,46 +19,62 @@ */ public class GitHubDocumentReader implements DocumentReader { - private final DocumentReader parser; + private final DocumentParser parser; - private final GitHubResource gitHubResource; + private GitHubResource gitHubResource; - public GitHubDocumentReader(GitHubResource gitHubResource, DocumentParser parserType) { - this(gitHubResource, parserType.getParser(gitHubResource)); - } + private List gitHubResourceList; - public GitHubDocumentReader(GitHubResource gitHubResource, DocumentParser parserType, - ExtractedTextFormatter formatter) { - this(gitHubResource, parserType.getParser(gitHubResource, formatter)); + public GitHubDocumentReader(GitHubResource gitHubResource, DocumentParser parser) { + this.gitHubResource = gitHubResource; + this.parser = parser; } - public GitHubDocumentReader(GitHubResource gitHubResource, DocumentReader parser) { - this.gitHubResource = gitHubResource; + public GitHubDocumentReader(List gitHubResourceList, DocumentParser parser) { + this.gitHubResourceList = gitHubResourceList; this.parser = parser; } @Override public List get() { - GHContent ghContent = gitHubResource.getContent(); - List documents = parser.get(); - for (Document document : documents) { - Map metadata = document.getMetadata(); - metadata.put("github_git_url", ghContent.getGitUrl()); - try { + List documents = new ArrayList<>(); + if (!Objects.isNull(gitHubResourceList) && !gitHubResourceList.isEmpty()) { + processResourceList(documents); + } + else if (gitHubResource != null) { + loadDocuments(documents, gitHubResource); + } + + return documents; + } + + private void processResourceList(List documents) { + for (GitHubResource resource : gitHubResourceList) { + loadDocuments(documents, resource); + } + } + + private void loadDocuments(List documents, GitHubResource gitHubResource) { + try { + List documentList = parser.parse(gitHubResource.getInputStream()); + for (Document document : documentList) { + GHContent ghContent = gitHubResource.getContent(); + Map metadata = document.getMetadata(); + metadata.put("github_git_url", ghContent.getGitUrl()); metadata.put("github_download_url", ghContent.getDownloadUrl()); + metadata.put("github_html_url", ghContent.getHtmlUrl()); + metadata.put("github_url", ghContent.getUrl()); + metadata.put("github_file_name", ghContent.getName()); + metadata.put("github_file_path", ghContent.getPath()); + metadata.put("github_file_sha", ghContent.getSha()); + metadata.put("github_file_size", Long.toString(ghContent.getSize())); + metadata.put("github_file_encoding", ghContent.getEncoding()); + documents.add(document); } - catch (IOException e) { - // Ignore if download_url is not available - } - metadata.put("github_html_url", ghContent.getHtmlUrl()); - metadata.put("github_url", ghContent.getUrl()); - metadata.put("github_file_name", ghContent.getName()); - metadata.put("github_file_path", ghContent.getPath()); - metadata.put("github_file_sha", ghContent.getSha()); - metadata.put("github_file_size", Long.toString(ghContent.getSize())); - metadata.put("github_file_encoding", ghContent.getEncoding()); } - return documents; + catch (IOException ioException) { + throw new RuntimeException("Failed to load document from GitHub: {}", ioException); + } } } diff --git a/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubResource.java b/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubResource.java index 937cf624..914de26d 100644 --- a/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubResource.java +++ b/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubResource.java @@ -3,6 +3,8 @@ import org.kohsuke.github.GHContent; import org.kohsuke.github.GitHub; import org.kohsuke.github.GitHubBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.core.io.Resource; import org.springframework.util.Assert; @@ -11,6 +13,9 @@ import java.io.InputStream; import java.net.URI; import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; /** * @author HeYQ @@ -18,35 +23,17 @@ */ public class GitHubResource implements Resource { + private static final Logger logger = LoggerFactory.getLogger(GitHubResource.class); + private final InputStream inputStream; private final GHContent content; - public GitHubResource(String gitHubToken, String gitHubTokenOrganization, String owner, String repo, String branch, - String path) { - this(null, gitHubToken, gitHubTokenOrganization, owner, repo, branch, path); - } - - public GitHubResource(String gitHubToken, String gitHubTokenOrganization, String owner, String repo, String path) { - this(null, gitHubToken, gitHubTokenOrganization, owner, repo, "main", path); - } - - public GitHubResource(String apiUrl, String gitHubToken, String gitHubTokenOrganization, String owner, String repo, - String branch, String path) { - GitHubBuilder gitHubBuilder = new GitHubBuilder(); - if (apiUrl != null) { - gitHubBuilder.withEndpoint(apiUrl); - } - if (gitHubToken != null) { - if (gitHubTokenOrganization == null) { - gitHubBuilder.withOAuthToken(gitHubToken); - } - else { - gitHubBuilder.withOAuthToken(gitHubToken, gitHubTokenOrganization); - } + public GitHubResource(GitHub gitHub, String owner, String repo, String branch, String path) { + if (Objects.isNull(branch)) { + branch = "main"; } try { - GitHub gitHub = gitHubBuilder.build(); content = gitHub.getRepository(owner + "/" + repo).getFileContent(path, branch); Assert.isTrue(content.isFile(), "Path must be a file"); inputStream = content.read(); @@ -56,10 +43,74 @@ public GitHubResource(String apiUrl, String gitHubToken, String gitHubTokenOrgan } } + public GitHubResource(GHContent content) { + try { + this.content = content; + inputStream = content.read(); + } + catch (IOException ioException) { + throw new RuntimeException(ioException); + } + } + + public static GitHubResource getInstance(GHContent content) { + return new GitHubResource(content); + } + public GHContent getContent() { return content; } + @Override + public boolean exists() { + return false; + } + + @Override + public URL getURL() throws IOException { + return null; + } + + @Override + public URI getURI() throws IOException { + return null; + } + + @Override + public File getFile() throws IOException { + return null; + } + + @Override + public long contentLength() throws IOException { + return 0; + } + + @Override + public long lastModified() throws IOException { + return 0; + } + + @Override + public Resource createRelative(String relativePath) throws IOException { + return null; + } + + @Override + public String getFilename() { + return ""; + } + + @Override + public String getDescription() { + return ""; + } + + @Override + public InputStream getInputStream() throws IOException { + return inputStream; + } + public static Builder builder() { return new Builder(); } @@ -72,6 +123,8 @@ public static class Builder { private String gitHubTokenOrganization; + private GitHub gitHub; + private String owner; private String repo; @@ -95,6 +148,11 @@ public Builder gitHubTokenOrganization(String gitHubTokenOrganization) { return this; } + public Builder gitHub(GitHub gitHub) { + this.gitHub = gitHub; + return this; + } + public Builder owner(String owner) { this.owner = owner; return this; @@ -116,62 +174,79 @@ public Builder path(String path) { } public GitHubResource build() { + createGithub(); + return new GitHubResource(gitHub, owner, repo, branch, path); + } + + public List buildBatch() { + createGithub(); + return loadGitHubResources(); + } + + private void createGithub() { Assert.notNull(owner, "Owner must not be null"); Assert.notNull(repo, "Repo must not be null"); Assert.notNull(path, "Path must not be null"); - return new GitHubResource(apiUrl, gitHubToken, gitHubTokenOrganization, owner, repo, branch, path); + if (Objects.isNull(gitHub)) { + Assert.notNull(gitHubToken, "GitHub token must not be null"); + GitHubBuilder gitHubBuilder = new GitHubBuilder(); + if (apiUrl != null) { + gitHubBuilder.withEndpoint(apiUrl); + } + if (gitHubToken != null) { + if (gitHubTokenOrganization == null) { + gitHubBuilder.withOAuthToken(gitHubToken); + } + else { + gitHubBuilder.withOAuthToken(gitHubToken, gitHubTokenOrganization); + } + } + try { + this.gitHub = gitHubBuilder.build(); + } + catch (IOException ioException) { + throw new RuntimeException(ioException); + } + } } - } - - @Override - public boolean exists() { - return false; - } - - @Override - public URL getURL() throws IOException { - return null; - } - - @Override - public URI getURI() throws IOException { - return null; - } - - @Override - public File getFile() throws IOException { - return null; - } - - @Override - public long contentLength() throws IOException { - return 0; - } - - @Override - public long lastModified() throws IOException { - return 0; - } - - @Override - public Resource createRelative(String relativePath) throws IOException { - return null; - } - - @Override - public String getFilename() { - return ""; - } + private List loadGitHubResources() { + List gitHubResources = new ArrayList<>(); + try { + gitHub.getRepository(owner + "/" + repo) + .getDirectoryContent(path, branch) + .forEach(ghDirectoryContent -> Builder.scanDirectory(ghDirectoryContent, gitHubResources)); + } + catch (IOException ioException) { + throw new RuntimeException(ioException); + } + return gitHubResources; + } - @Override - public String getDescription() { - return ""; - } + private static void scanDirectory(GHContent ghContent, List gitHubResources) { + if (ghContent.isDirectory()) { + try { + ghContent.listDirectoryContent() + .forEach(ghDirectoryContent -> Builder.scanDirectory(ghDirectoryContent, gitHubResources)); + } + catch (IOException ioException) { + logger.error("Failed to read directory from GitHub: {}", ghContent.getHtmlUrl(), ioException); + } + } + else { + GitHubResource gitHubResource = null; + try { + gitHubResource = GitHubResource.getInstance(ghContent); + } + catch (RuntimeException runtimeException) { + logger.error("Failed to read document from GitHub: {}", ghContent.getHtmlUrl(), runtimeException); + } + if (gitHubResource != null) { + gitHubResources.add(gitHubResource); + } + } + } - @Override - public InputStream getInputStream() throws IOException { - return inputStream; } } diff --git a/community/document-readers/github-reader/src/test/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentLoaderIT.java b/community/document-readers/github-reader/src/test/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentLoaderIT.java index 7a9e8176..f6b72021 100644 --- a/community/document-readers/github-reader/src/test/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentLoaderIT.java +++ b/community/document-readers/github-reader/src/test/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentLoaderIT.java @@ -1,11 +1,11 @@ package com.alibaba.cloud.ai.reader.github; -import com.alibaba.cloud.ai.reader.DocumentParser; +import com.alibaba.cloud.ai.document.TextDocumentParser; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; import org.springframework.ai.document.Document; - +import com.alibaba.cloud.ai.document.DocumentParser; import java.util.List; @EnabledIfEnvironmentVariable(named = "GITHUB_TOKEN", matches = ".+") @@ -25,9 +25,11 @@ class GitHubDocumentLoaderIT { .path("Mergekit.ipynb") // Mergekit.ipynb //LICENSE .build(); + DocumentParser parser = new TextDocumentParser(); + @BeforeEach public void beforeEach() { - reader = new GitHubDocumentReader(source, DocumentParser.TEXT_PARSER); + reader = new GitHubDocumentReader(source, parser); } @Test diff --git a/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java b/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java index d7c46ff2..38cb4d71 100644 --- a/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java +++ b/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java @@ -1,11 +1,10 @@ package com.alibaba.cloud.ai.tencent.cos; -import com.alibaba.cloud.ai.reader.DocumentParser; +import com.alibaba.cloud.ai.document.DocumentParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.ai.document.Document; import org.springframework.ai.document.DocumentReader; -import org.springframework.ai.reader.ExtractedTextFormatter; import java.util.ArrayList; import java.util.List; @@ -20,46 +19,20 @@ public class TencentCosDocumentReader implements DocumentReader { private static final Logger log = LoggerFactory.getLogger(TencentCosDocumentReader.class); - private DocumentReader parser; + private final DocumentParser parser; private TencentCosResource tencentCosResource; private List tencentCosResourceList; - private List parserList; - - public TencentCosDocumentReader(TencentCosResource tencentCosResource, DocumentParser parserType) { - this(tencentCosResource, parserType.getParser(tencentCosResource)); - - } - - public TencentCosDocumentReader(TencentCosResource tencentCosResource, DocumentParser parserType, - ExtractedTextFormatter formatter) { - this(tencentCosResource, parserType.getParser(tencentCosResource, formatter)); - } - - public TencentCosDocumentReader(TencentCosResource tencentCosResource, DocumentReader parser) { + public TencentCosDocumentReader(TencentCosResource tencentCosResource, DocumentParser parser) { this.tencentCosResource = tencentCosResource; this.parser = parser; } - public TencentCosDocumentReader(List tencentCosResourceList, DocumentParser parserType, - ExtractedTextFormatter formatter) { - this.tencentCosResourceList = tencentCosResourceList; - List parserList = new ArrayList<>(); - for (TencentCosResource tencentCosResource : tencentCosResourceList) { - parserList.add(parserType.getParser(tencentCosResource, formatter)); - } - this.parserList = parserList; - } - - public TencentCosDocumentReader(List tencentCosResourceList, DocumentParser parserType) { + public TencentCosDocumentReader(List tencentCosResourceList, DocumentParser parser) { this.tencentCosResourceList = tencentCosResourceList; - List parserList = new ArrayList<>(); - for (TencentCosResource tencentCosResource : tencentCosResourceList) { - parserList.add(parserType.getParser(tencentCosResource)); - } - this.parserList = parserList; + this.parser = parser; } @Override @@ -69,44 +42,28 @@ public List get() { processResourceList(documents); } else if (tencentCosResource != null) { - processSingleResource(documents); + loadDocuments(documents, tencentCosResource); } return documents; } private void processResourceList(List documents) { - for (int i = 0; i < tencentCosResourceList.size(); i++) { - TencentCosResource resource = tencentCosResourceList.get(i); - String key = resource.getKey(); - String bucket = resource.getBucket(); - String source = format("cos://%s/%s", bucket, key); - - try { - List document = parserList.get(i).get(); - for (Document doc : document) { - doc.getMetadata().put(TencentCosResource.SOURCE, source); - } - documents.addAll(document); - } - catch (Exception e) { - log.warn("Failed to load an object with key '{}' from bucket '{}', skipping it. Stack trace: {}", key, - bucket, e.getMessage(), e); - } + for (TencentCosResource resource : tencentCosResourceList) { + loadDocuments(documents, resource); } } - private void processSingleResource(List documents) { - String key = tencentCosResource.getKey(); - String bucket = tencentCosResource.getBucket(); + private void loadDocuments(List documents, TencentCosResource resource) { + String key = resource.getKey(); + String bucket = resource.getBucket(); String source = format("cos://%s/%s", bucket, key); - try { - List document = parser.get(); - for (Document doc : document) { - doc.getMetadata().put(TencentCosResource.SOURCE, source); + List documentList = parser.parse(resource.getInputStream()); + for (Document document : documentList) { + document.getMetadata().put(TencentCosResource.SOURCE, source); + documents.add(document); } - documents.addAll(document); } catch (Exception e) { log.warn("Failed to load an object with key '{}' from bucket '{}', skipping it. Stack trace: {}", key, diff --git a/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java b/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java index 74f10f94..7d592f65 100644 --- a/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java +++ b/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java @@ -1,6 +1,7 @@ package com.alibaba.cloud.ai.tencent.cos; -import com.alibaba.cloud.ai.reader.DocumentParser; +import com.alibaba.cloud.ai.document.TextDocumentParser; +import com.alibaba.cloud.ai.document.DocumentParser; import com.qcloud.cos.COSClient; import com.qcloud.cos.ClientConfig; import com.qcloud.cos.model.PutObjectRequest; @@ -39,6 +40,8 @@ class TencentCosDocumentLoaderIT { static COSClient cosClient; + DocumentParser parser = new TextDocumentParser(); + @BeforeAll public static void beforeAll() { TencentCredentials tencentCredentials = new TencentCredentials(System.getenv("TENCENT_SECRET_ID"), @@ -72,7 +75,7 @@ void should_load_single_document() { TencentCosResource tencentCosResource3 = TencentCosResource.builder().cosClient(cosClient).build(); - loader = new TencentCosDocumentReader(tencentCosResource, DocumentParser.TEXT_PARSER); + loader = new TencentCosDocumentReader(tencentCosResource, parser); // when Document document = loader.get().get(0); @@ -87,9 +90,11 @@ void should_load_multiple_documents() { // given URL url = getClass().getClassLoader().getResource("test.txt"); + assert url != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY, new File(url.getFile()))); URL url2 = getClass().getClassLoader().getResource("test2.txt"); + assert url2 != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY_2, new File(url2.getFile()))); List tencentCosResourceList = TencentCosResource.builder() @@ -99,7 +104,7 @@ void should_load_multiple_documents() { .bucket(TEST_BUCKET) .buildBatch(); - batchLoader = new TencentCosDocumentReader(tencentCosResourceList, DocumentParser.TEXT_PARSER); + batchLoader = new TencentCosDocumentReader(tencentCosResourceList, parser); // when List documents = batchLoader.get(); @@ -123,13 +128,16 @@ void should_load_multiple_documents_with_prefix() { // given URL otherUrl = getClass().getClassLoader().getResource("other.txt"); + assert otherUrl != null; cosClient .putObject(new PutObjectRequest(TEST_BUCKET, "other_directory/file.txt", new File(otherUrl.getFile()))); URL url = getClass().getClassLoader().getResource("test.txt"); + assert url != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY, new File(url.getFile()))); URL url2 = getClass().getClassLoader().getResource("test2.txt"); + assert url2 != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY_2, new File(url2.getFile()))); List tencentCosResourceList = TencentCosResource.builder() @@ -140,7 +148,7 @@ void should_load_multiple_documents_with_prefix() { .prefix("test") .buildBatch(); - batchLoader = new TencentCosDocumentReader(tencentCosResourceList, DocumentParser.TEXT_PARSER); + batchLoader = new TencentCosDocumentReader(tencentCosResourceList, parser); // when List documents = batchLoader.get(); diff --git a/pom.xml b/pom.xml index 57541cb5..1594913a 100644 --- a/pom.xml +++ b/pom.xml @@ -41,10 +41,6 @@ community/plugins/spring-ai-alibaba-starter-plugin-gaode community/plugins/spring-ai-alibaba-starter-plugin-weather community/plugins/spring-ai-alibaba-starter-plugin-larksuite - - community/document-readers/github-reader - community/document-readers/poi-document-reader - community/document-readers/tencent-cos-reader diff --git a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java new file mode 100644 index 00000000..c93e49e3 --- /dev/null +++ b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java @@ -0,0 +1,27 @@ +package com.alibaba.cloud.ai.document; + +import org.springframework.ai.document.Document; + +import java.io.InputStream; +import java.util.List; + +/** + * @author HeYQ + * @since 2024-12-02 11:25 + */ + +public interface DocumentParser { + + /** + * Parses a given {@link InputStream} into a {@link Document}. The specific + * implementation of this method will depend on the type of the document being parsed. + *

+ * Note: This method does not close the provided {@link InputStream} - it is the + * caller's responsibility to manage the lifecycle of the stream. + * @param inputStream The {@link InputStream} that contains the content of the + * {@link Document}. + * @return The parsed {@link Document}. + */ + List parse(InputStream inputStream); + +} diff --git a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java new file mode 100644 index 00000000..2cdc8dfe --- /dev/null +++ b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java @@ -0,0 +1,112 @@ +package com.alibaba.cloud.ai.document; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.springframework.ai.document.Document; +import org.springframework.ai.reader.EmptyJsonMetadataGenerator; +import org.springframework.ai.reader.JsonMetadataGenerator; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.StreamSupport; + +/** + * @author HeYQ + * @since 2024-12-08 21:13 + */ + +public class JsonDocumentParser implements DocumentParser { + + private final JsonMetadataGenerator jsonMetadataGenerator; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + /** + * The key from the JSON that we will use as the text to parse into the Document text + */ + private final List jsonKeysToUse; + + public JsonDocumentParser(String... jsonKeysToUse) { + this(new EmptyJsonMetadataGenerator(), jsonKeysToUse); + } + + public JsonDocumentParser(JsonMetadataGenerator jsonMetadataGenerator, String... jsonKeysToUse) { + Objects.requireNonNull(jsonKeysToUse, "keys must not be null"); + Objects.requireNonNull(jsonMetadataGenerator, "jsonMetadataGenerator must not be null"); + this.jsonMetadataGenerator = jsonMetadataGenerator; + this.jsonKeysToUse = List.of(jsonKeysToUse); + } + + @Override + public List parse(InputStream inputStream) { + try { + JsonNode rootNode = this.objectMapper.readTree(inputStream); + + if (rootNode.isArray()) { + return StreamSupport.stream(rootNode.spliterator(), true) + .map(jsonNode -> parseJsonNode(jsonNode, this.objectMapper)) + .toList(); + } + else { + return Collections.singletonList(parseJsonNode(rootNode, this.objectMapper)); + } + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Document parseJsonNode(JsonNode jsonNode, ObjectMapper objectMapper) { + Map item = objectMapper.convertValue(jsonNode, new TypeReference>() { + + }); + var sb = new StringBuilder(); + + this.jsonKeysToUse.stream() + .filter(item::containsKey) + .forEach(key -> sb.append(key).append(": ").append(item.get(key)).append(System.lineSeparator())); + + Map metadata = this.jsonMetadataGenerator.generate(item); + String content = sb.isEmpty() ? item.toString() : sb.toString(); + return new Document(content, metadata); + } + + protected List get(JsonNode rootNode) { + if (rootNode.isArray()) { + return StreamSupport.stream(rootNode.spliterator(), true) + .map(jsonNode -> parseJsonNode(jsonNode, this.objectMapper)) + .toList(); + } + else { + return Collections.singletonList(parseJsonNode(rootNode, this.objectMapper)); + } + } + + /** + * Retrieves documents from the JSON resource using a JSON Pointer. + * @param pointer A JSON Pointer string (RFC 6901) to locate the desired element + * @return A list of Documents parsed from the located JSON element + * @throws RuntimeException if the JSON cannot be parsed or the pointer is invalid + */ + public List get(String pointer, InputStream inputStream) { + try { + JsonNode rootNode = this.objectMapper.readTree(inputStream); + JsonNode targetNode = rootNode.at(pointer); + + if (targetNode.isMissingNode()) { + throw new IllegalArgumentException("Invalid JSON Pointer: " + pointer); + } + + return get(targetNode); + } + catch (IOException e) { + throw new RuntimeException("Error reading JSON resource", e); + } + } + +} diff --git a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java new file mode 100644 index 00000000..9a895a00 --- /dev/null +++ b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java @@ -0,0 +1,44 @@ +package com.alibaba.cloud.ai.document; + +import org.springframework.ai.document.Document; +import org.springframework.util.Assert; + +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.Collections; +import java.util.List; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * @author HeYQ + * @since 2024-12-08 21:13 + */ +public class TextDocumentParser implements DocumentParser { + + private final Charset charset; + + public TextDocumentParser() { + this(UTF_8); + } + + public TextDocumentParser(Charset charset) { + Assert.notNull(charset, "charset"); + this.charset = charset; + } + + @Override + public List parse(InputStream inputStream) { + try { + String text = new String(inputStream.readAllBytes(), charset); + if (text.isBlank()) { + throw new Exception(); + } + return Collections.singletonList(new Document(text)); + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java b/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java new file mode 100644 index 00000000..1e2d00f5 --- /dev/null +++ b/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java @@ -0,0 +1,92 @@ +package com.alibaba.cloud.ai.dashscope.rag; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; +import org.springframework.ai.document.Document; +import org.springframework.ai.vectorstore.SearchRequest; +import org.springframework.boot.test.context.SpringBootTest; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +@SpringBootTest +@EnabledIfEnvironmentVariable(named = "ANALYTICDB_SECRET_KEY", matches = ".+") +class AnalyticdbVectorTest { + + AnalyticdbVector analyticdbVector; + + @BeforeEach + public void init() throws Exception { + AnalyticdbConfig config = new AnalyticdbConfig(); + config.setRegionId("cn-beijing"); + config.setDBInstanceId("gp-2ze41j8y0ry4spfev"); + config.setAccessKeyId(System.getenv("ANALYTICDB_SECRET_ID")); + config.setAccessKeySecret(System.getenv("ANALYTICDB_SECRET_KEY")); + config.setManagerAccount("hyq"); // admin0 + config.setManagerAccountPassword("hdcHDC1997@@@"); // 123456 + config.setNamespace("llama"); + config.setNamespacePassword("llamapassword"); + config.setEmbeddingDimension(3L); + analyticdbVector = new AnalyticdbVector("test_llama", config); + } + + @Test + void testGetInstance() throws Exception { + List list = new ArrayList<>(10); + Map metadata = new HashMap<>(); + metadata.put("docId", "1"); // 123 //12344 + Document document = new Document("hello1234you arewomen12334444", metadata); + int length = 1536; // Array length + float min = 0f; // smallest value + float max = 1f; // the largest value + float[] em = new float[length]; // create float array + Random random = new Random(); + for (int i = 0; i < length; i++) { + em[i] = min + (max - min) * random.nextFloat(); + } + document.setEmbedding(em); + list.add(document); + analyticdbVector.add(list); + SearchRequest searchRequest = SearchRequest.query("hello"); + List documents = analyticdbVector.similaritySearch(searchRequest); + System.out.println(documents.get(0).getContent()); + + // analyticdbVector.delete(List.of("1")); + + } + + @Test + void testSearchByVector() { + // Suppose we have a known vector and some preset parameters. + // List queryVector = Arrays.asList(0.1f, 0.2f, 0.3f); + // Map kwargs = new HashMap<>(); + // kwargs.put("score_threshold", 0.5f); + SearchRequest searchRequest = SearchRequest.query("hello"); + searchRequest.withTopK(5); + searchRequest.withSimilarityThreshold(0.5f); + + // Call the method and verify the return result. + List results = analyticdbVector.similaritySearch(searchRequest); + + // There should be some assertions here to verify that the results meet + // expectations. + Assertions.assertNotNull(results); + // The more specific assertions can be added based on your needs. + } + + @Test + void testDelete() { + // Call the delete method. + analyticdbVector.delete(List.of("1")); + + // Based on your actual situation, you can add logic here to verify + // whether the delete operation was successful. + // For example, check whether the collection exists in the database. + } + +}