Skip to content

Commit

Permalink
update: Document parser
Browse files Browse the repository at this point in the history
  • Loading branch information
sincerity-being committed Dec 8, 2024
1 parent 7a30024 commit 2beae9f
Show file tree
Hide file tree
Showing 25 changed files with 1,134 additions and 44 deletions.
76 changes: 76 additions & 0 deletions community/document-parsers/document-parser-apache-pdfbox/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>document-parser-apache-pdfbox</artifactId>
<name>document-parser-apache-pdfbox</name>
<description>document-parser-apache-pdfbox for Spring AI Alibaba</description>
<packaging>jar</packaging>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<scm>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<connection>git://github.com/alibaba/spring-ai-alibaba.git</connection>
<developerConnection>git@github.com:alibaba/spring-ai-alibaba.git</developerConnection>
</scm>

<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<pdfbox.version>2.0.32</pdfbox.version>
</properties>

<dependencies>
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
</exclusions>
</dependency>

<!-- test dependencies -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package com.alibaba.cloud.ai.parser.apache.pdfbox;

import com.alibaba.cloud.ai.document.DocumentParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.ai.document.Document;
import org.springframework.util.Assert;

import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * {@link DocumentParser} that extracts the text of a PDF with Apache PDFBox and returns
 * it as a single {@link Document}.
 * <p>
 * When metadata inclusion is enabled, the entries of the PDF's document information for
 * which PDFBox returns a non-null value are attached to the resulting document.
 *
 * @author HeYQ
 * @since 2024-12-08 22:34
 */

public class ApachePdfBoxDocumentParser implements DocumentParser {

	/** Whether the PDF's document information is copied into the document metadata. */
	private final boolean includeMetadata;

	/**
	 * Creates a parser that does not attach PDF metadata to the parsed document.
	 */
	public ApachePdfBoxDocumentParser() {
		this(false);
	}

	/**
	 * Creates a parser.
	 * @param includeMetadata {@code true} to attach the PDF's document information
	 * entries to the parsed document as metadata
	 */
	public ApachePdfBoxDocumentParser(boolean includeMetadata) {
		this.includeMetadata = includeMetadata;
	}

	/**
	 * Loads the PDF from the given stream and extracts its text.
	 * @param inputStream the PDF content; must not be {@code null}
	 * @return a single-element list holding the document with the extracted text
	 * @throws IllegalArgumentException if {@code inputStream} is {@code null} or the
	 * extracted text is {@code null}
	 * @throws RuntimeException if PDFBox fails with an {@link IOException}; the original
	 * exception is kept as the cause
	 */
	@Override
	public List<Document> parse(InputStream inputStream) {
		// Fail fast with a clear message instead of an error from inside PDFBox.
		Assert.notNull(inputStream, "InputStream cannot be null");
		try (PDDocument pdfDocument = PDDocument.load(inputStream)) {
			String text = new PDFTextStripper().getText(pdfDocument);
			Assert.notNull(text, "Text cannot be null");
			Document document = this.includeMetadata ? new Document(text, toMetadata(pdfDocument))
					: new Document(text);
			return Collections.singletonList(document);
		}
		catch (IOException e) {
			throw new RuntimeException("Failed to parse PDF document", e);
		}
	}

	/**
	 * Copies the document information entries of the PDF into a map.
	 * @param pdDocument the loaded PDF
	 * @return a mutable map of metadata key to value; entries whose value is
	 * {@code null} are skipped
	 */
	private Map<String, Object> toMetadata(PDDocument pdDocument) {
		PDDocumentInformation documentInformation = pdDocument.getDocumentInformation();
		Map<String, Object> metadata = new HashMap<>();
		for (String metadataKey : documentInformation.getMetadataKeys()) {
			String value = documentInformation.getCustomMetadataValue(metadataKey);
			if (value != null) {
				metadata.put(metadataKey, value);
			}
		}
		return metadata;
	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package com.alibaba.cloud.ai.parser.apache.pdfbox;

import com.alibaba.cloud.ai.document.DocumentParser;
import org.junit.jupiter.api.Test;
import org.springframework.ai.document.Document;

import java.io.IOException;
import java.io.InputStream;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Tests for {@link ApachePdfBoxDocumentParser} using the {@code test-file.pdf} classpath
 * resource.
 */
class ApachePdfBoxDocumentParserTest {

	/** Name of the PDF fixture looked up on the test classpath. */
	private static final String TEST_FILE = "test-file.pdf";

	/** Without metadata inclusion, only the text is extracted and metadata is empty. */
	@Test
	void should_parse_pdf_file() throws IOException {
		Document document = parseTestFile(new ApachePdfBoxDocumentParser());

		assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content");
		assertThat(document.getMetadata()).isEmpty();
	}

	/** With metadata inclusion, the document information entries are exposed as metadata. */
	@Test
	void should_parse_pdf_file_include_metadata() throws IOException {
		Document document = parseTestFile(new ApachePdfBoxDocumentParser(true));

		assertThat(document.getContent()).isEqualToIgnoringWhitespace("test content");
		assertThat(document.getMetadata()).containsEntry("Author", "ljuba")
			.containsEntry("Creator", "WPS Writer")
			.containsEntry("CreationDate", "D:20230608171011+15'10'")
			.containsEntry("SourceModified", "D:20230608171011+15'10'");
	}

	/**
	 * Parses the PDF fixture with the given parser and returns the first document.
	 * @param parser the parser under test
	 * @return the first document produced by the parser
	 * @throws IOException if closing the fixture stream fails
	 */
	private Document parseTestFile(DocumentParser parser) throws IOException {
		try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream(TEST_FILE)) {
			// getResourceAsStream returns null for a missing resource; report that explicitly.
			assertThat(inputStream).as("classpath resource %s", TEST_FILE).isNotNull();
			return parser.parse(inputStream).get(0);
		}
	}

}
Binary file not shown.
Binary file not shown.
71 changes: 71 additions & 0 deletions community/document-parsers/document-parser-markdown/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba</artifactId>
<version>${revision}</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>document-parser-markdown</artifactId>
<name>document-parser-markdown</name>
<description>document-parser-markdown for Spring AI Alibaba</description>
<packaging>jar</packaging>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<scm>
<url>https://github.com/alibaba/spring-ai-alibaba</url>
<connection>git://github.com/alibaba/spring-ai-alibaba.git</connection>
<developerConnection>git@github.com:alibaba/spring-ai-alibaba.git</developerConnection>
</scm>


<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<commonmark.version>0.22.0</commonmark.version>
</properties>

<dependencies>
<dependency>
<groupId>com.alibaba.cloud.ai</groupId>
<artifactId>spring-ai-alibaba-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<dependency>
<groupId>org.commonmark</groupId>
<artifactId>commonmark</artifactId>
<version>${commonmark.version}</version>
</dependency>

<!-- test dependencies -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>

</project>
Loading

0 comments on commit 2beae9f

Please sign in to comment.