From 7726ca0aab7ae9d1c709c64a15b40f85b9a22300 Mon Sep 17 00:00:00 2001 From: Andrew Charneski Date: Sun, 8 Sep 2024 15:10:40 -0400 Subject: [PATCH] abstracted parsing model --- .../skyenet/apps/general/DocumentParserApp.kt | 17 ++-- .../DefaultParsingModel.kt} | 78 ++++++++++--------- .../apps/general/{ => parsers}/PDFReader.kt | 5 +- .../apps/general/parsers/ParsingModel.kt | 14 ++++ .../apps/general/{ => parsers}/TextReader.kt | 3 +- 5 files changed, 70 insertions(+), 47 deletions(-) rename webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/{ParsingModel.kt => parsers/DefaultParsingModel.kt} (68%) rename webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/{ => parsers}/PDFReader.kt (88%) create mode 100644 webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt rename webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/{ => parsers}/TextReader.kt (83%) diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt index fc5b93b6..88938d65 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt @@ -4,7 +4,11 @@ import com.simiacryptus.jopenai.API import com.simiacryptus.jopenai.ChatClient import com.simiacryptus.jopenai.models.ChatModels import com.simiacryptus.jopenai.util.JsonUtil +import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel +import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel import com.simiacryptus.skyenet.TabbedDisplay +import com.simiacryptus.skyenet.apps.general.parsers.PDFReader +import com.simiacryptus.skyenet.apps.general.parsers.TextReader import com.simiacryptus.skyenet.core.platform.Session import com.simiacryptus.skyenet.core.platform.User import com.simiacryptus.skyenet.webui.application.ApplicationInterface @@ -25,7 +29,7 @@ class DocumentParserApp( applicationName: String = "Document Extractor", path: String = "/pdfExtractor", val api: API = ChatClient(), - val parsingModel: ParsingModel = ParsingModel(ChatModels.Claude35Sonnet, 0.1), + val parsingModel: ParsingModel = DefaultParsingModel(ChatModels.Claude35Sonnet, 0.1), val reader: (File) -> DocumentReader = { when { it.name.endsWith(".pdf", ignoreCase = true) -> PDFReader(it) @@ -104,8 +108,7 @@ class DocumentParserApp( val outputDir = root.resolve("output").apply { mkdirs() } lateinit var runningDocument: ParsingModel.DocumentData reader(pdfFile).use { reader -> - runningDocument = - ParsingModel.DocumentData(id = pdfFile.name, content = ArrayList(), entities = mutableMapOf()) + runningDocument = parsingModel.newDocument() var previousPageText = "" // Keep this for context task.add( MarkdownUtil.renderMarkdown( @@ -158,9 +161,9 @@ class DocumentParserApp( |``` """.trimMargin() ) - @Language("Markdown") val jsonResult = parsingModel.getParser(api).apply( - promptList.toList().joinToString("\n\n") - ) + @Language("Markdown") val jsonResult = parsingModel.getParser(api).let { + it(promptList.toList().joinToString("\n\n")) + } val jsonFile = outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_content.json") jsonFile.writeText(JsonUtil.toJson(jsonResult)) ui.newTask(false).apply { @@ -240,7 +243,7 @@ class DocumentParserApp( task.error(ui, e) } } - + data class Settings( val dpi: Float = 120f, val maxPages: Int = Int.MAX_VALUE, diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/ParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt similarity index 68% rename from webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/ParsingModel.kt rename to webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt index dc2c082d..5a7c45fc 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/ParsingModel.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt @@ -1,25 +1,29 @@ -package com.simiacryptus.skyenet.apps.general +package com.simiacryptus.skyenet.apps.general.parsers import com.simiacryptus.jopenai.API import com.simiacryptus.jopenai.describe.Description import com.simiacryptus.jopenai.models.ChatModels import com.simiacryptus.skyenet.core.actors.ParsedActor -import java.time.LocalDateTime -class ParsingModel( +class DefaultParsingModel( private val chatModels: ChatModels, private val temperature: Double -) { +) : ParsingModel { + + override fun merge( + runningDocument: ParsingModel.DocumentData, + newData: ParsingModel.DocumentData + ) : ParsingModel.DocumentData { + val runningDocument = runningDocument as DocumentData + val newData = newData as DocumentData + return DocumentData( + id = newData.id ?: runningDocument.id, + content = mergeContent(runningDocument.content, newData.content).takeIf { it.isNotEmpty() }, + entities = mergeEntities(runningDocument.entities, newData.entities).takeIf { it.isNotEmpty() }, + metadata = mergeMetadata(runningDocument.metadata, newData.metadata) + ) + } - fun merge( - runningDocument: DocumentData, - newData: DocumentData - ) = DocumentData( - id = newData.id ?: runningDocument.id, - content = mergeContent(runningDocument.content, newData.content).takeIf { it.isNotEmpty() }, - entities = mergeEntities(runningDocument.entities, newData.entities).takeIf { it.isNotEmpty() }, - metadata = mergeMetadata(runningDocument.metadata, newData.metadata) - ) private fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata { return DocumentMetadata( title = new?.title ?: existing?.title, @@ -28,7 +32,6 @@ class ParsingModel( ) } - private fun mergeContent( existingContent: List?, newContent: List? @@ -52,7 +55,6 @@ class ParsingModel( tags = ((existing.tags ?: emptyList()) + (new.tags ?: emptyList())).distinct().takeIf { it.isNotEmpty() } ) - private fun mergeEntities( existingEntities: Map?, newEntities: Map? @@ -74,13 +76,36 @@ class ParsingModel( type = new.type ?: existing.type ) + override fun getParser(api: API): (String) -> DocumentData { + val parser = ParsedActor( + resultClass = DocumentData::class.java, + prompt = "", + parsingModel = chatModels, + temperature = temperature + ).getParser( + api, promptSuffix = """ + Parse the text into a hierarchical structure that describes the content of the page: + 1. Separate the content into sections, paragraphs, statements, etc. + 2. The final level of the hierarchy should contain singular, short, standalone sentences. + 3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s). + 4. For each entity, include mentions with their exact text and location (start and end indices) in the document. + 5. Extract document metadata such as title, author, creation date, and keywords if available. + 6. Assign relevant tags to each content section to improve searchability and categorization. + 7. Do not copy data from the accumulated document JSON to your response; it is provided for context only. + """.trimIndent() + ) + return { text -> parser.apply(text) } + } + + override fun newDocument() = DocumentData() + data class DocumentData( @Description("Document/Page identifier") val id: String? = null, @Description("Entities extracted") val entities: Map? = null, @Description("Hierarchical structure and data") val content: List? = null, @Description("Document metadata") val metadata: DocumentMetadata? = null - ) + ) : ParsingModel.DocumentData data class EntityData( @Description("Aliases for the entity") val aliases: List? = null, @@ -102,25 +127,4 @@ class ParsingModel( @Description("Other metadata") val properties: Map? = null, ) - - fun getParser(api: API) = ParsedActor( - resultClass = DocumentData::class.java, - prompt = "", - parsingModel = chatModels, - temperature = temperature - ).getParser(api, promptSuffix = """ - |Parse the text into a hierarchical structure that describes the content of the page: - |1. Separate the content into sections, paragraphs, statements, etc. - |2. The final level of the hierarchy should contain singular, short, standalone sentences. - |3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s). - |4. For each entity, include mentions with their exact text and location (start and end indices) in the document. - |5. Extract document metadata such as title, author, creation date, and keywords if available. - |6. Assign relevant tags to each content section to improve searchability and categorization. - |7. Do not copy data from the accumulated document JSON to your response; it is provided for context only. - """.trimMargin() - ) - - companion object { - private val log = org.slf4j.LoggerFactory.getLogger(ParsingModel::class.java) - } } \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/PDFReader.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt similarity index 88% rename from webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/PDFReader.kt rename to webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt index 52491184..f30634d9 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/PDFReader.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt @@ -1,12 +1,13 @@ -package com.simiacryptus.skyenet.apps.general +package com.simiacryptus.skyenet.apps.general.parsers +import com.simiacryptus.skyenet.apps.general.DocumentParserApp import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.rendering.PDFRenderer import org.apache.pdfbox.text.PDFTextStripper import java.awt.image.BufferedImage import java.io.File -class PDFReader(pdfFile: File) : DocumentParserApp.DocumentReader, AutoCloseable { +class PDFReader(pdfFile: File) : DocumentParserApp.DocumentReader { private val document: PDDocument = PDDocument.load(pdfFile) private val renderer: PDFRenderer = PDFRenderer(document) private val stripper: PDFTextStripper = PDFTextStripper().apply { sortByPosition = true } diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt new file mode 100644 index 00000000..f73f3d90 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt @@ -0,0 +1,14 @@ +package com.simiacryptus.skyenet.apps.general.parsers + +import com.simiacryptus.jopenai.API + +interface ParsingModel { + fun merge(runningDocument: DocumentData, newData: DocumentData): DocumentData + fun getParser(api: API): (String) -> DocumentData + fun newDocument(): DocumentData + + interface DocumentData + companion object { + private val log = org.slf4j.LoggerFactory.getLogger(ParsingModel::class.java) + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/TextReader.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt similarity index 83% rename from webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/TextReader.kt rename to webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt index 945d5d10..8292ff1c 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/TextReader.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt @@ -1,5 +1,6 @@ -package com.simiacryptus.skyenet.apps.general +package com.simiacryptus.skyenet.apps.general.parsers +import com.simiacryptus.skyenet.apps.general.DocumentParserApp import java.awt.image.BufferedImage import java.io.File