diff --git a/community/document-parsers/document-parser-apache-pdfbox/pom.xml b/community/document-parsers/document-parser-apache-pdfbox/pom.xml new file mode 100644 index 00000000..7a1a0b9d --- /dev/null +++ b/community/document-parsers/document-parser-apache-pdfbox/pom.xml @@ -0,0 +1,76 @@ + + + 4.0.0 + + com.alibaba.cloud.ai + spring-ai-alibaba + ${revision} + ../../../pom.xml + + + document-parser-apache-pdfbox + document-parser-apache-pdfbox + document-parser-apache-pdfbox for Spring AI Alibaba + jar + https://github.com/alibaba/spring-ai-alibaba + + https://github.com/alibaba/spring-ai-alibaba + git://github.com/alibaba/spring-ai-alibaba.git + git@github.com:alibaba/spring-ai-alibaba.git + + + + 17 + 17 + UTF-8 + 2.0.32 + + + + + com.alibaba.cloud.ai + spring-ai-alibaba-core + ${project.parent.version} + + + + org.apache.pdfbox + pdfbox + ${pdfbox.version} + + + commons-logging + commons-logging + + + + + + + org.springframework.ai + spring-ai-test + test + + + + org.springframework.boot + spring-boot-starter-test + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.assertj + assertj-core + test + + + + \ No newline at end of file diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParser.java b/community/document-parsers/document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParser.java new file mode 100644 index 00000000..a8b6f257 --- /dev/null +++ b/community/document-parsers/document-parser-apache-pdfbox/src/main/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParser.java @@ -0,0 +1,60 @@ +package com.alibaba.cloud.ai.parser.apache.pdfbox; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.ai.document.Document; +import org.springframework.util.Assert; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * @author HeYQ + * @since 2024-12-08 22:34 + */ + +public class ApachePdfBoxDocumentParser implements DocumentParser { + + private final boolean includeMetadata; + + public ApachePdfBoxDocumentParser() { + this(false); + } + + public ApachePdfBoxDocumentParser(boolean includeMetadata) { + this.includeMetadata = includeMetadata; + } + + @Override + public List parse(InputStream inputStream) { + try (PDDocument pdfDocument = PDDocument.load(inputStream)) { + PDFTextStripper stripper = new PDFTextStripper(); + String text = stripper.getText(pdfDocument); + Assert.notNull(text, "Text cannot be null"); + return includeMetadata ? Collections.singletonList(new Document(text, toMetadata(pdfDocument))) + : Collections.singletonList(new Document(text)); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Map toMetadata(PDDocument pdDocument) { + PDDocumentInformation documentInformation = pdDocument.getDocumentInformation(); + Map metadata = new HashMap<>(); + for (String metadataKey : documentInformation.getMetadataKeys()) { + String value = documentInformation.getCustomMetadataValue(metadataKey); + if (value != null) { + metadata.put(metadataKey, value); + } + } + return metadata; + } + +} diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/test/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParserTest.java b/community/document-parsers/document-parser-apache-pdfbox/src/test/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParserTest.java new file mode 100644 index 00000000..ec7f4e4a --- /dev/null +++ b/community/document-parsers/document-parser-apache-pdfbox/src/test/java/com/alibaba/cloud/ai/parser/apache/pdfbox/ApachePdfBoxDocumentParserTest.java @@ -0,0 +1,45 @@ +package com.alibaba.cloud.ai.parser.apache.pdfbox; + +import com.alibaba.cloud.ai.document.DocumentParser; +import org.junit.jupiter.api.Test; +import org.springframework.ai.document.Document; + +import java.io.IOException; +import java.io.InputStream; + +import static org.assertj.core.api.Assertions.assertThat; + +class ApachePdfBoxDocumentParserTest { + + @Test + void should_parse_pdf_file() { + try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file.pdf")) { + DocumentParser parser = new ApachePdfBoxDocumentParser(); + Document document = parser.parse(inputStream).get(0); + + assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content"); + assertThat(document.getMetadata()).isEmpty(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Test + void should_parse_pdf_file_include_metadata() { + try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream("test-file.pdf")) { + DocumentParser parser = new ApachePdfBoxDocumentParser(true); + Document document = parser.parse(inputStream).get(0); + + assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content"); + assertThat(document.getMetadata()).containsEntry("Author", "ljuba") + .containsEntry("Creator", "WPS Writer") + .containsEntry("CreationDate", "D:20230608171011+15'10'") + .containsEntry("SourceModified", "D:20230608171011+15'10'"); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + +} \ No newline at end of file diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/blank-file.pdf b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/blank-file.pdf new file mode 100644 index 00000000..757bb1f3 Binary files /dev/null and b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/blank-file.pdf differ diff --git a/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/test-file.pdf b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/test-file.pdf new file mode 100644 index 00000000..920f1fce Binary files /dev/null and b/community/document-parsers/document-parser-apache-pdfbox/src/test/resources/test-file.pdf differ diff --git a/community/document-parsers/document-parser-markdown/pom.xml b/community/document-parsers/document-parser-markdown/pom.xml new file mode 100644 index 00000000..0b8b47ce --- /dev/null +++ b/community/document-parsers/document-parser-markdown/pom.xml @@ -0,0 +1,71 @@ + + + 4.0.0 + + com.alibaba.cloud.ai + spring-ai-alibaba + ${revision} + ../../../pom.xml + + + document-parser-markdown + document-parser-markdown + document-parser-markdown for Spring AI Alibaba + jar + https://github.com/alibaba/spring-ai-alibaba + + https://github.com/alibaba/spring-ai-alibaba + git://github.com/alibaba/spring-ai-alibaba.git + git@github.com:alibaba/spring-ai-alibaba.git + + + + + 17 + 17 + UTF-8 + 0.22.0 + + + + + com.alibaba.cloud.ai + spring-ai-alibaba-core + ${project.parent.version} + + + + org.commonmark + commonmark + ${commonmark.version} + + + + + org.springframework.ai + spring-ai-test + test + + + + org.springframework.boot + spring-boot-starter-test + test + + + + org.junit.jupiter + junit-jupiter-engine + test + + + + org.assertj + assertj-core + test + + + + \ No newline at end of file diff --git a/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParser.java b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParser.java new file mode 100644 index 00000000..56ec3dae --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParser.java @@ -0,0 +1,206 @@ +package com.alibaba.cloud.ai.parser.markdown; + +import com.alibaba.cloud.ai.document.DocumentParser; +import com.alibaba.cloud.ai.parser.markdown.config.MarkdownDocumentParserConfig; +import org.commonmark.node.AbstractVisitor; +import org.commonmark.node.BlockQuote; +import org.commonmark.node.Code; +import org.commonmark.node.FencedCodeBlock; +import org.commonmark.node.HardLineBreak; +import org.commonmark.node.Heading; +import org.commonmark.node.ListItem; +import org.commonmark.node.Node; +import org.commonmark.node.SoftLineBreak; +import org.commonmark.node.Text; +import org.commonmark.node.ThematicBreak; +import org.commonmark.parser.Parser; +import org.springframework.ai.document.Document; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; + +/** + * @author HeYQ + * @since 2024-12-08 21:32 + */ + +public class MarkdownDocumentParser implements DocumentParser { + + /** + * Configuration to a parsing process. + */ + private final MarkdownDocumentParserConfig config; + + /** + * Markdown parser. + */ + private final Parser parser; + + public MarkdownDocumentParser() { + this(MarkdownDocumentParserConfig.defaultConfig()); + } + + /** + * Create a new {@link MarkdownDocumentParser} instance. + * + */ + public MarkdownDocumentParser(MarkdownDocumentParserConfig config) { + this.config = config; + this.parser = Parser.builder().build(); + } + + @Override + public List parse(InputStream inputStream) { + try (var input = inputStream) { + Node node = this.parser.parseReader(new InputStreamReader(input)); + + DocumentVisitor documentVisitor = new DocumentVisitor(this.config); + node.accept(documentVisitor); + + return documentVisitor.getDocuments(); + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * A convenient class for visiting handled nodes in the Markdown document. + */ + static class DocumentVisitor extends AbstractVisitor { + + private final List documents = new ArrayList<>(); + + private final List currentParagraphs = new ArrayList<>(); + + private final MarkdownDocumentParserConfig config; + + private Document.Builder currentDocumentBuilder; + + DocumentVisitor(MarkdownDocumentParserConfig config) { + this.config = config; + } + + /** + * Visits the document node and initializes the current document builder. + */ + @Override + public void visit(org.commonmark.node.Document document) { + this.currentDocumentBuilder = Document.builder(); + super.visit(document); + } + + @Override + public void visit(Heading heading) { + buildAndFlush(); + super.visit(heading); + } + + @Override + public void visit(ThematicBreak thematicBreak) { + if (this.config.horizontalRuleCreateDocument) { + buildAndFlush(); + } + super.visit(thematicBreak); + } + + @Override + public void visit(SoftLineBreak softLineBreak) { + translateLineBreakToSpace(); + super.visit(softLineBreak); + } + + @Override + public void visit(HardLineBreak hardLineBreak) { + translateLineBreakToSpace(); + super.visit(hardLineBreak); + } + + @Override + public void visit(ListItem listItem) { + translateLineBreakToSpace(); + super.visit(listItem); + } + + @Override + public void visit(BlockQuote blockQuote) { + if (!this.config.includeBlockquote) { + buildAndFlush(); + } + + translateLineBreakToSpace(); + this.currentDocumentBuilder.withMetadata("category", "blockquote"); + super.visit(blockQuote); + } + + @Override + public void visit(Code code) { + this.currentParagraphs.add(code.getLiteral()); + this.currentDocumentBuilder.withMetadata("category", "code_inline"); + super.visit(code); + } + + @Override + public void visit(FencedCodeBlock fencedCodeBlock) { + if (!this.config.includeCodeBlock) { + buildAndFlush(); + } + + translateLineBreakToSpace(); + this.currentParagraphs.add(fencedCodeBlock.getLiteral()); + this.currentDocumentBuilder.withMetadata("category", "code_block"); + this.currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo()); + + buildAndFlush(); + + super.visit(fencedCodeBlock); + } + + @Override + public void visit(Text text) { + if (text.getParent() instanceof Heading heading) { + this.currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel())) + .withMetadata("title", text.getLiteral()); + } + else { + this.currentParagraphs.add(text.getLiteral()); + } + + super.visit(text); + } + + public List getDocuments() { + buildAndFlush(); + + return this.documents; + } + + private void buildAndFlush() { + if (!this.currentParagraphs.isEmpty()) { + String content = String.join("", this.currentParagraphs); + + Document.Builder builder = this.currentDocumentBuilder.withContent(content); + + this.config.additionalMetadata.forEach(builder::withMetadata); + + Document document = builder.build(); + + this.documents.add(document); + + this.currentParagraphs.clear(); + } + this.currentDocumentBuilder = Document.builder(); + } + + private void translateLineBreakToSpace() { + if (!this.currentParagraphs.isEmpty()) { + this.currentParagraphs.add(" "); + } + } + + } + +} diff --git a/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/config/MarkdownDocumentParserConfig.java b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/config/MarkdownDocumentParserConfig.java new file mode 100644 index 00000000..1db101e7 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/main/java/com/alibaba/cloud/ai/parser/markdown/config/MarkdownDocumentParserConfig.java @@ -0,0 +1,122 @@ +package com.alibaba.cloud.ai.parser.markdown.config; + +import org.springframework.ai.document.Document; +import org.springframework.util.Assert; + +import java.util.HashMap; +import java.util.Map; + +/** + * @author HeYQ + * @since 2024-12-08 21:38 + */ + +public class MarkdownDocumentParserConfig { + + public final boolean horizontalRuleCreateDocument; + + public final boolean includeCodeBlock; + + public final boolean includeBlockquote; + + public final Map additionalMetadata; + + public MarkdownDocumentParserConfig(Builder builder) { + this.horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; + this.includeCodeBlock = builder.includeCodeBlock; + this.includeBlockquote = builder.includeBlockquote; + this.additionalMetadata = builder.additionalMetadata; + } + + /** + * @return the default configuration + */ + public static MarkdownDocumentParserConfig defaultConfig() { + return builder().build(); + } + + public static Builder builder() { + return new Builder(); + } + + public static final class Builder { + + private boolean horizontalRuleCreateDocument = false; + + private boolean includeCodeBlock = false; + + private boolean includeBlockquote = false; + + private Map additionalMetadata = new HashMap<>(); + + private Builder() { + } + + /** + * Text divided by horizontal lines will create new {@link Document}s. The default + * is {@code false}, meaning text separated by horizontal lines won't create a new + * document. + * @param horizontalRuleCreateDocument flag to determine whether new documents are + * created from text divided by horizontal line + * @return this builder + */ + public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) { + this.horizontalRuleCreateDocument = horizontalRuleCreateDocument; + return this; + } + + /** + * Whatever to include code blocks in {@link Document}s. The default is + * {@code false}, which means all code blocks are in separate documents. + * @param includeCodeBlock flag to include code block into paragraph document or + * create new with code only + * @return this builder + */ + public Builder withIncludeCodeBlock(boolean includeCodeBlock) { + this.includeCodeBlock = includeCodeBlock; + return this; + } + + /** + * Whatever to include blockquotes in {@link Document}s. The default is + * {@code false}, which means all blockquotes are in separate documents. + * @param includeBlockquote flag to include blockquotes into paragraph document or + * create new with blockquote only + * @return this builder + */ + public Builder withIncludeBlockquote(boolean includeBlockquote) { + this.includeBlockquote = includeBlockquote; + return this; + } + + /** + * Adds this additional metadata to the all built {@link Document}s. + * @return this builder + */ + public Builder withAdditionalMetadata(String key, Object value) { + Assert.notNull(key, "key must not be null"); + Assert.notNull(value, "value must not be null"); + this.additionalMetadata.put(key, value); + return this; + } + + /** + * Adds this additional metadata to the all built {@link Document}s. + * @return this builder + */ + public Builder withAdditionalMetadata(Map additionalMetadata) { + Assert.notNull(additionalMetadata, "additionalMetadata must not be null"); + this.additionalMetadata = additionalMetadata; + return this; + } + + /** + * @return the immutable configuration + */ + public MarkdownDocumentParserConfig build() { + return new MarkdownDocumentParserConfig(this); + } + + } + +} diff --git a/community/document-parsers/document-parser-markdown/src/test/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParserTest.java b/community/document-parsers/document-parser-markdown/src/test/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParserTest.java new file mode 100644 index 00000000..230af67a --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/java/com/alibaba/cloud/ai/parser/markdown/MarkdownDocumentParserTest.java @@ -0,0 +1,263 @@ +/* + * Copyright 2023-2024 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.cloud.ai.parser.markdown; + +import com.alibaba.cloud.ai.parser.markdown.config.MarkdownDocumentParserConfig; +import org.junit.jupiter.api.Test; +import org.springframework.ai.document.Document; +import org.springframework.core.io.DefaultResourceLoader; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.groups.Tuple.tuple; + +/** + * @author HeYQ + * @since 2024-12-08 21:38 + */ +class MarkdownDocumentParserTest { + + @Test + void testOnlyHeadersWithParagraphs() throws IOException { + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/only-headers.md").getInputStream()); + + assertThat(documents).hasSize(4) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_1", "title", "Header 1a"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "header_1", "title", "Header 1b"), + "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."), + tuple(Map.of("category", "header_2", "title", "Header 2b"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."), + tuple(Map.of("category", "header_2", "title", "Header 2c"), + "Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + + @Test + void testWithFormatting() throws IOException { + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/with-formatting.md").getInputStream()); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_1", "title", "This is a fancy header name"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of("category", "header_3", "title", "Header 3"), + "Aenean eu leo eu nibh tristique posuere quis quis massa.")); + } + + @Test + void testDocumentDividedViaHorizontalRules() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/horizontal-rules.md").getInputStream()); + + assertThat(documents).hasSize(7) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida."), + tuple(Map.of(), + "Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."), + tuple(Map.of(), + "Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna."), + tuple(Map.of(), + "Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit amet sagittis."), + tuple(Map.of(), + "Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."), + tuple(Map.of(), + "Aenean quis vulputate mi. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus."), + tuple(Map.of(), "Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi.")); + } + + @Test + void testDocumentNotDividedViaHorizontalRulesWhenIsDisabled() throws IOException { + + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(false) + .build(); + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/horizontal-rules.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEmpty(); + assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit") + .endsWith("Phasellus eget tellus sed nibh ornare interdum eu eu mi."); + } + + @Test + void testSimpleMarkdownDocumentWithHardAndSoftLineBreaks() throws IOException { + + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/simple.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEmpty(); + assertThat(documentsFirst.getContent()).isEqualTo( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus.Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."); + } + + @Test + void testCode() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/code.md").getInputStream()); + + assertThat(documents).satisfiesExactly(document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of()); + assertThat(document.getContent()).isEqualTo("This is a Java sample application:"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block")); + assertThat(document.getContent()).startsWith("package com.example.demo;") + .contains("SpringApplication.run(DemoApplication.class, args);"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline")); + assertThat(document.getContent()).isEqualTo( + "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of()); + assertThat(document.getContent()) + .isEqualTo("Another possibility is to set block code without specific highlighting:"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block")); + assertThat(document.getContent()).isEqualTo("./mvnw spring-javaformat:apply\n"); + }); + } + + @Test + void testCodeWhenCodeBlockShouldNotBeSeparatedDocument() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withHorizontalRuleCreateDocument(true) + .withIncludeCodeBlock(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/code.md").getInputStream()); + + assertThat(documents).satisfiesExactly(document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block")); + assertThat(document.getContent()).startsWith("This is a Java sample application: package com.example.demo") + .contains("SpringApplication.run(DemoApplication.class, args);"); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline")); + assertThat(document.getContent()).isEqualTo( + "Markdown also provides the possibility to use inline code formatting throughout the entire sentence."); + }, document -> { + assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block")); + assertThat(document.getContent()).isEqualTo( + "Another possibility is to set block code without specific highlighting: ./mvnw spring-javaformat:apply\n"); + }); + } + + @Test + void testBlockquote() throws IOException { + + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/blockquote.md").getInputStream()); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of(), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."), + tuple(Map.of("category", "blockquote"), + "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.")); + } + + @Test + void testBlockquoteWhenBlockquoteShouldNotBeSeparatedDocument() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withIncludeBlockquote(true) + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/blockquote.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("category", "blockquote")); + assertThat(documentsFirst.getContent()).isEqualTo( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."); + } + + @Test + void testLists() throws IOException { + + MarkdownDocumentParser reader = new MarkdownDocumentParser(); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/lists.md").getInputStream()); + + assertThat(documents).hasSize(2) + .extracting(Document::getMetadata, Document::getContent) + .containsOnly(tuple(Map.of("category", "header_2", "title", "Ordered list"), + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor."), + tuple(Map.of("category", "header_2", "title", "Unordered list"), + "Aenean eu leo eu nibh tristique posuere quis quis massa. Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus tortor. Etiam facilisis enim in egestas dictum.")); + } + + @Test + void testWithAdditionalMetadata() throws IOException { + MarkdownDocumentParserConfig config = MarkdownDocumentParserConfig.builder() + .withAdditionalMetadata("service", "some-service-name") + .withAdditionalMetadata("env", "prod") + .build(); + + MarkdownDocumentParser reader = new MarkdownDocumentParser(config); + + List documents = reader + .parse(new DefaultResourceLoader().getResource("classpath:/simple.md").getInputStream()); + + assertThat(documents).hasSize(1); + + Document documentsFirst = documents.get(0); + assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("service", "some-service-name", "env", "prod")); + assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit."); + } + +} diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/blockquote.md b/community/document-parsers/document-parser-markdown/src/test/resources/blockquote.md new file mode 100644 index 00000000..d92ac44f --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/blockquote.md @@ -0,0 +1,8 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget +> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a +> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum +> suscipit. + diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/code.md b/community/document-parsers/document-parser-markdown/src/test/resources/code.md new file mode 100644 index 00000000..31d7c7b0 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/code.md @@ -0,0 +1,25 @@ +This is a Java sample application: + +```java +package com.example.demo; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + +@SpringBootApplication +public class DemoApplication { + public static void main(String[] args) { + SpringApplication.run(DemoApplication.class, args); + } +} +``` + +Markdown also provides the possibility to `use inline code formatting throughout` the entire sentence. + +--- + +Another possibility is to set block code without specific highlighting: + +``` +./mvnw spring-javaformat:apply +``` diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/horizontal-rules.md b/community/document-parsers/document-parser-markdown/src/test/resources/horizontal-rules.md new file mode 100644 index 00000000..f7affefc --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/horizontal-rules.md @@ -0,0 +1,27 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. + +--- + +Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu +elementum dignissim. + +*** +Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis +et magna. + +* * * + +Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit +amet sagittis. + +***** + +Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula. + +--------------------------------------- + +Aenean quis vulputate mi. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus. + +- - - + +Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi. diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/lists.md b/community/document-parsers/document-parser-markdown/src/test/resources/lists.md new file mode 100644 index 00000000..f82e7e34 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/lists.md @@ -0,0 +1,17 @@ +## Ordered list + +1. Lorem ipsum dolor sit *amet*, consectetur adipiscing elit. **Curabitur** diam eros, laoreet sit _amet_ cursus vitae, + varius sed nisi. +2. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. +3. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget + sapien odio. + 1. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum + suscipit. + 2. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. + +## Unordered list + +* Aenean eu leo eu nibh tristique posuere quis quis massa. +* Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus + tortor. + * Etiam facilisis enim in egestas dictum. diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/only-headers.md b/community/document-parsers/document-parser-markdown/src/test/resources/only-headers.md new file mode 100644 index 00000000..81c770e8 --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/only-headers.md @@ -0,0 +1,20 @@ +# Header 1a + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed +nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. + +# Header 1b + +Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed +sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh. + +## Header 2b + +Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien +odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. + +# Header 1c + +## Header 2c + +Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/simple.md b/community/document-parsers/document-parser-markdown/src/test/resources/simple.md new file mode 100644 index 00000000..3275c89b --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/simple.md @@ -0,0 +1,8 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim. + +Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus. + +Aenean eu leo eu nibh tristique posuere quis quis massa.\ +Nullam lacinia luctus sem ut vehicula. + diff --git a/community/document-parsers/document-parser-markdown/src/test/resources/with-formatting.md b/community/document-parsers/document-parser-markdown/src/test/resources/with-formatting.md new file mode 100644 index 00000000..963743ec --- /dev/null +++ b/community/document-parsers/document-parser-markdown/src/test/resources/with-formatting.md @@ -0,0 +1,9 @@ +# This is a fancy header name + +Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan +tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum*** +dignissim. + +### Header 3 + +Aenean eu leo eu nibh tristique _posuere quis quis massa_. diff --git a/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java b/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java index cdd2d648..b3a8a79a 100644 --- a/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java +++ b/community/document-parsers/document-parser-tika/src/main/java/com/alibaba/cloud/ai/parser/tika/TikaDocumentParser.java @@ -12,16 +12,19 @@ import org.xml.sax.ContentHandler; import java.io.InputStream; +import java.util.Collections; +import java.util.List; import java.util.Objects; import java.util.function.Supplier; /** + * Parses files into {@link Document}s using Apache Tika library, automatically detecting + * the file format. This parser supports various file formats, including PDF, DOC, PPT, + * XLS. For detailed information on supported formats, please refer to the + * Apache Tika documentation. + * * @author HeYQ - * @since 2024-12-02 11:32 Parses files into {@link Document}s using Apache Tika library, - * automatically detecting the file format. This parser supports various file formats, - * including PDF, DOC, PPT, XLS. For detailed information on supported formats, please - * refer to the Apache Tika - * documentation. + * @since 2024-12-02 11:32 */ public class TikaDocumentParser implements DocumentParser { @@ -90,7 +93,7 @@ public TikaDocumentParser(Supplier parserSupplier, Supplier parse(InputStream inputStream) { try { Parser parser = parserSupplier.get(); ContentHandler contentHandler = contentHandlerSupplier.get(); @@ -104,7 +107,7 @@ public Document parse(InputStream inputStream) { throw new ZeroByteFileException("The content is blank!"); } - return toDocument(text); + return Collections.singletonList(toDocument(text)); } catch (Exception e) { throw new RuntimeException(e); diff --git a/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java b/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java index e7c13664..bf98c0cf 100644 --- a/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java +++ b/community/document-parsers/document-parser-tika/src/test/java/com/alibaba/cloud/ai/parser/tika/ApacheTikaDocumentParserTest.java @@ -22,7 +22,7 @@ void should_parse_doc_ppt_and_pdf_files(String fileName) { DocumentParser parser = new TikaDocumentParser(); InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName); - Document document = parser.parse(inputStream); + Document document = parser.parse(inputStream).get(0); assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content"); assertThat(document.getMetadata()).isEmpty(); @@ -35,7 +35,7 @@ void should_parse_xls_files(String fileName) { DocumentParser parser = new TikaDocumentParser(AutoDetectParser::new, null, null, null); InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName); - Document document = parser.parse(inputStream); + Document document = parser.parse(inputStream).get(0); assertThat(document.getContent()).isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content"); assertThat(document.getMetadata()).isEmpty(); @@ -48,8 +48,8 @@ void should_parse_files_stateless() { InputStream inputStream1 = getClass().getClassLoader().getResourceAsStream("test-file.xls"); InputStream inputStream2 = getClass().getClassLoader().getResourceAsStream("test-file.xls"); - Document document1 = parser.parse(inputStream1); - Document document2 = parser.parse(inputStream2); + Document document1 = parser.parse(inputStream1).get(0); + Document document2 = parser.parse(inputStream2).get(0); assertThat(document1.getContent()).isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content"); assertThat(document2.getContent()).isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content"); diff --git a/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java b/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java index 1782fcb8..5d3253d6 100644 --- a/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java +++ b/community/document-readers/github-reader/src/main/java/com/alibaba/cloud/ai/reader/github/GitHubDocumentReader.java @@ -56,19 +56,21 @@ private void processResourceList(List documents) { private void loadDocuments(List documents, GitHubResource gitHubResource) { try { - Document document = parser.parse(gitHubResource.getInputStream()); - GHContent ghContent = gitHubResource.getContent(); - Map metadata = document.getMetadata(); - metadata.put("github_git_url", ghContent.getGitUrl()); - metadata.put("github_download_url", ghContent.getDownloadUrl()); - metadata.put("github_html_url", ghContent.getHtmlUrl()); - metadata.put("github_url", ghContent.getUrl()); - metadata.put("github_file_name", ghContent.getName()); - metadata.put("github_file_path", ghContent.getPath()); - metadata.put("github_file_sha", ghContent.getSha()); - metadata.put("github_file_size", Long.toString(ghContent.getSize())); - metadata.put("github_file_encoding", ghContent.getEncoding()); - documents.add(document); + List documentList = parser.parse(gitHubResource.getInputStream()); + for (Document document : documentList) { + GHContent ghContent = gitHubResource.getContent(); + Map metadata = document.getMetadata(); + metadata.put("github_git_url", ghContent.getGitUrl()); + metadata.put("github_download_url", ghContent.getDownloadUrl()); + metadata.put("github_html_url", ghContent.getHtmlUrl()); + metadata.put("github_url", ghContent.getUrl()); + metadata.put("github_file_name", ghContent.getName()); + metadata.put("github_file_path", ghContent.getPath()); + metadata.put("github_file_sha", ghContent.getSha()); + metadata.put("github_file_size", Long.toString(ghContent.getSize())); + metadata.put("github_file_encoding", ghContent.getEncoding()); + documents.add(document); + } } catch (IOException ioException) { throw new RuntimeException("Failed to load document from GitHub: {}", ioException); diff --git a/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java b/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java index df17eb5f..38cb4d71 100644 --- a/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java +++ b/community/document-readers/tencent-cos-reader/src/main/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentReader.java @@ -59,9 +59,11 @@ private void loadDocuments(List documents, TencentCosResource resource String bucket = resource.getBucket(); String source = format("cos://%s/%s", bucket, key); try { - Document document = parser.parse(resource.getInputStream()); - document.getMetadata().put(TencentCosResource.SOURCE, source); - documents.add(document); + List documentList = parser.parse(resource.getInputStream()); + for (Document document : documentList) { + document.getMetadata().put(TencentCosResource.SOURCE, source); + documents.add(document); + } } catch (Exception e) { log.warn("Failed to load an object with key '{}' from bucket '{}', skipping it. Stack trace: {}", key, diff --git a/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java b/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java index 7de87251..7d592f65 100644 --- a/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java +++ b/community/document-readers/tencent-cos-reader/src/test/java/com/alibaba/cloud/ai/tencent/cos/TencentCosDocumentLoaderIT.java @@ -90,9 +90,11 @@ void should_load_multiple_documents() { // given URL url = getClass().getClassLoader().getResource("test.txt"); + assert url != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY, new File(url.getFile()))); URL url2 = getClass().getClassLoader().getResource("test2.txt"); + assert url2 != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY_2, new File(url2.getFile()))); List tencentCosResourceList = TencentCosResource.builder() @@ -126,13 +128,16 @@ void should_load_multiple_documents_with_prefix() { // given URL otherUrl = getClass().getClassLoader().getResource("other.txt"); + assert otherUrl != null; cosClient .putObject(new PutObjectRequest(TEST_BUCKET, "other_directory/file.txt", new File(otherUrl.getFile()))); URL url = getClass().getClassLoader().getResource("test.txt"); + assert url != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY, new File(url.getFile()))); URL url2 = getClass().getClassLoader().getResource("test2.txt"); + assert url2 != null; cosClient.putObject(new PutObjectRequest(TEST_BUCKET, TEST_KEY_2, new File(url2.getFile()))); List tencentCosResourceList = TencentCosResource.builder() diff --git a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java index c5cb8cf8..c93e49e3 100644 --- a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java +++ b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/DocumentParser.java @@ -3,6 +3,7 @@ import org.springframework.ai.document.Document; import java.io.InputStream; +import java.util.List; /** * @author HeYQ @@ -21,6 +22,6 @@ public interface DocumentParser { * {@link Document}. * @return The parsed {@link Document}. */ - Document parse(InputStream inputStream); + List parse(InputStream inputStream); } diff --git a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java new file mode 100644 index 00000000..2cdc8dfe --- /dev/null +++ b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/JsonDocumentParser.java @@ -0,0 +1,112 @@ +package com.alibaba.cloud.ai.document; + +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.springframework.ai.document.Document; +import org.springframework.ai.reader.EmptyJsonMetadataGenerator; +import org.springframework.ai.reader.JsonMetadataGenerator; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.stream.StreamSupport; + +/** + * @author HeYQ + * @since 2024-12-08 21:13 + */ + +public class JsonDocumentParser implements DocumentParser { + + private final JsonMetadataGenerator jsonMetadataGenerator; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + /** + * The key from the JSON that we will use as the text to parse into the Document text + */ + private final List jsonKeysToUse; + + public JsonDocumentParser(String... jsonKeysToUse) { + this(new EmptyJsonMetadataGenerator(), jsonKeysToUse); + } + + public JsonDocumentParser(JsonMetadataGenerator jsonMetadataGenerator, String... jsonKeysToUse) { + Objects.requireNonNull(jsonKeysToUse, "keys must not be null"); + Objects.requireNonNull(jsonMetadataGenerator, "jsonMetadataGenerator must not be null"); + this.jsonMetadataGenerator = jsonMetadataGenerator; + this.jsonKeysToUse = List.of(jsonKeysToUse); + } + + @Override + public List parse(InputStream inputStream) { + try { + JsonNode rootNode = this.objectMapper.readTree(inputStream); + + if (rootNode.isArray()) { + return StreamSupport.stream(rootNode.spliterator(), true) + .map(jsonNode -> parseJsonNode(jsonNode, this.objectMapper)) + .toList(); + } + else { + return Collections.singletonList(parseJsonNode(rootNode, this.objectMapper)); + } + } + catch (IOException e) { + throw new RuntimeException(e); + } + } + + private Document parseJsonNode(JsonNode jsonNode, ObjectMapper objectMapper) { + Map item = objectMapper.convertValue(jsonNode, new TypeReference>() { + + }); + var sb = new StringBuilder(); + + this.jsonKeysToUse.stream() + .filter(item::containsKey) + .forEach(key -> sb.append(key).append(": ").append(item.get(key)).append(System.lineSeparator())); + + Map metadata = this.jsonMetadataGenerator.generate(item); + String content = sb.isEmpty() ? item.toString() : sb.toString(); + return new Document(content, metadata); + } + + protected List get(JsonNode rootNode) { + if (rootNode.isArray()) { + return StreamSupport.stream(rootNode.spliterator(), true) + .map(jsonNode -> parseJsonNode(jsonNode, this.objectMapper)) + .toList(); + } + else { + return Collections.singletonList(parseJsonNode(rootNode, this.objectMapper)); + } + } + + /** + * Retrieves documents from the JSON resource using a JSON Pointer. + * @param pointer A JSON Pointer string (RFC 6901) to locate the desired element + * @return A list of Documents parsed from the located JSON element + * @throws RuntimeException if the JSON cannot be parsed or the pointer is invalid + */ + public List get(String pointer, InputStream inputStream) { + try { + JsonNode rootNode = this.objectMapper.readTree(inputStream); + JsonNode targetNode = rootNode.at(pointer); + + if (targetNode.isMissingNode()) { + throw new IllegalArgumentException("Invalid JSON Pointer: " + pointer); + } + + return get(targetNode); + } + catch (IOException e) { + throw new RuntimeException("Error reading JSON resource", e); + } + } + +} diff --git a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java index 89ff98a6..9a895a00 100644 --- a/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java +++ b/spring-ai-alibaba-core/src/main/java/com/alibaba/cloud/ai/document/TextDocumentParser.java @@ -5,9 +5,15 @@ import java.io.InputStream; import java.nio.charset.Charset; +import java.util.Collections; +import java.util.List; import static java.nio.charset.StandardCharsets.UTF_8; +/** + * @author HeYQ + * @since 2024-12-08 21:13 + */ public class TextDocumentParser implements DocumentParser { private final Charset charset; @@ -22,13 +28,13 @@ public TextDocumentParser(Charset charset) { } @Override - public Document parse(InputStream inputStream) { + public List parse(InputStream inputStream) { try { String text = new String(inputStream.readAllBytes(), charset); if (text.isBlank()) { throw new Exception(); } - return new Document(text); + return Collections.singletonList(new Document(text)); } catch (Exception e) { throw new RuntimeException(e); diff --git a/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java b/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java index e1473930..1e2d00f5 100644 --- a/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java +++ b/spring-ai-alibaba-core/src/test/java/com/alibaba/cloud/ai/dashscope/rag/AnalyticdbVectorTest.java @@ -40,11 +40,11 @@ void testGetInstance() throws Exception { List list = new ArrayList<>(10); Map metadata = new HashMap<>(); metadata.put("docId", "1"); // 123 //12344 - Document document = new Document("你好吗1234你是women12334444", metadata); - int length = 1536; // 数组长度 - float min = 0f; // 最小值 - float max = 1f; // 最大值 - float[] em = new float[length]; // 创建 float 数组 + Document document = new Document("hello1234you arewomen12334444", metadata); + int length = 1536; // Array length + float min = 0f; // smallest value + float max = 1f; // the largest value + float[] em = new float[length]; // create float array Random random = new Random(); for (int i = 0; i < length; i++) { em[i] = min + (max - min) * random.nextFloat(); @@ -52,7 +52,7 @@ void testGetInstance() throws Exception { document.setEmbedding(em); list.add(document); analyticdbVector.add(list); - SearchRequest searchRequest = SearchRequest.query("你好"); + SearchRequest searchRequest = SearchRequest.query("hello"); List documents = analyticdbVector.similaritySearch(searchRequest); System.out.println(documents.get(0).getContent()); @@ -62,29 +62,31 @@ void testGetInstance() throws Exception { @Test void testSearchByVector() { - // 假设我们有一个已知的向量和一些预设的参数 + // Suppose we have a known vector and some preset parameters. // List queryVector = Arrays.asList(0.1f, 0.2f, 0.3f); // Map kwargs = new HashMap<>(); // kwargs.put("score_threshold", 0.5f); - SearchRequest searchRequest = SearchRequest.query("你好"); + SearchRequest searchRequest = SearchRequest.query("hello"); searchRequest.withTopK(5); searchRequest.withSimilarityThreshold(0.5f); - // 调用方法并验证返回结果 + // Call the method and verify the return result. List results = analyticdbVector.similaritySearch(searchRequest); - // 这里应该有一些断言来验证结果是否符合预期 + // There should be some assertions here to verify that the results meet + // expectations. Assertions.assertNotNull(results); - // 更具体的断言可以根据你的需求添加 + // The more specific assertions can be added based on your needs. } @Test void testDelete() { - // 调用 delete 方法 + // Call the delete method. analyticdbVector.delete(List.of("1")); - // 根据你的实际情况,这里可以添加验证删除操作是否成功的逻辑 - // 例如,检查数据库中是否存在该集合 + // Based on your actual situation, you can add logic here to verify + // whether the delete operation was successful. + // For example, check whether the collection exists in the database. } }