Skip to content

Commit

Permalink
abstracted parsing model
Browse files Browse the repository at this point in the history
  • Loading branch information
acharneski committed Sep 8, 2024
1 parent c401929 commit 7726ca0
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 47 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.ChatClient
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.jopenai.util.JsonUtil
import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel
import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel
import com.simiacryptus.skyenet.TabbedDisplay
import com.simiacryptus.skyenet.apps.general.parsers.PDFReader
import com.simiacryptus.skyenet.apps.general.parsers.TextReader
import com.simiacryptus.skyenet.core.platform.Session
import com.simiacryptus.skyenet.core.platform.User
import com.simiacryptus.skyenet.webui.application.ApplicationInterface
Expand All @@ -25,7 +29,7 @@ class DocumentParserApp(
applicationName: String = "Document Extractor",
path: String = "/pdfExtractor",
val api: API = ChatClient(),
val parsingModel: ParsingModel = ParsingModel(ChatModels.Claude35Sonnet, 0.1),
val parsingModel: ParsingModel = DefaultParsingModel(ChatModels.Claude35Sonnet, 0.1),
val reader: (File) -> DocumentReader = {
when {
it.name.endsWith(".pdf", ignoreCase = true) -> PDFReader(it)
Expand Down Expand Up @@ -104,8 +108,7 @@ class DocumentParserApp(
val outputDir = root.resolve("output").apply { mkdirs() }
lateinit var runningDocument: ParsingModel.DocumentData
reader(pdfFile).use { reader ->
runningDocument =
ParsingModel.DocumentData(id = pdfFile.name, content = ArrayList(), entities = mutableMapOf())
runningDocument = parsingModel.newDocument()
var previousPageText = "" // Keep this for context
task.add(
MarkdownUtil.renderMarkdown(
Expand Down Expand Up @@ -158,9 +161,9 @@ class DocumentParserApp(
|```
""".trimMargin()
)
@Language("Markdown") val jsonResult = parsingModel.getParser(api).apply(
promptList.toList().joinToString("\n\n")
)
@Language("Markdown") val jsonResult = parsingModel.getParser(api).let {
it(promptList.toList().joinToString("\n\n"))
}
val jsonFile = outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_content.json")
jsonFile.writeText(JsonUtil.toJson(jsonResult))
ui.newTask(false).apply {
Expand Down Expand Up @@ -240,7 +243,7 @@ class DocumentParserApp(
task.error(ui, e)
}
}

data class Settings(
val dpi: Float = 120f,
val maxPages: Int = Int.MAX_VALUE,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,29 @@
package com.simiacryptus.skyenet.apps.general
package com.simiacryptus.skyenet.apps.general.parsers

import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.describe.Description
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.skyenet.core.actors.ParsedActor
import java.time.LocalDateTime

class ParsingModel(
class DefaultParsingModel(
private val chatModels: ChatModels,
private val temperature: Double
) {
) : ParsingModel {

/**
 * Merges a newly parsed fragment into the accumulated document and returns the combined result.
 *
 * The id of [newData] wins when present, otherwise the id of [runningDocument] is kept. Content and
 * entities are combined through `mergeContent` / `mergeEntities`, and an empty merge result is stored
 * as `null` rather than as an empty collection. Metadata is combined through `mergeMetadata`.
 *
 * @param runningDocument the document accumulated so far; must be this model's [DocumentData]
 * @param newData the freshly parsed data to fold in; must be this model's [DocumentData]
 * @return a new [DocumentData] holding the merged values
 * @throws IllegalArgumentException if either argument is not a [DocumentData] of this model
 */
override fun merge(
    runningDocument: ParsingModel.DocumentData,
    newData: ParsingModel.DocumentData
): ParsingModel.DocumentData {
    // Validate instead of blindly casting: the interface type admits documents created by other
    // ParsingModel implementations, and a bare `as` would fail with an unhelpful ClassCastException.
    // `require` also smart-casts the parameters, so no shadowing locals are needed.
    require(runningDocument is DocumentData) {
        "runningDocument must be a DefaultParsingModel.DocumentData but was ${runningDocument::class.java.name}"
    }
    require(newData is DocumentData) {
        "newData must be a DefaultParsingModel.DocumentData but was ${newData::class.java.name}"
    }
    return DocumentData(
        id = newData.id ?: runningDocument.id,
        content = mergeContent(runningDocument.content, newData.content).takeIf { it.isNotEmpty() },
        entities = mergeEntities(runningDocument.entities, newData.entities).takeIf { it.isNotEmpty() },
        metadata = mergeMetadata(runningDocument.metadata, newData.metadata)
    )
}

/**
 * Combines the accumulated document with newly parsed data into a fresh [DocumentData].
 *
 * The new id is preferred and falls back to the running document's id. Merged content and entities
 * are replaced by `null` when the merge yields nothing.
 */
fun merge(
    runningDocument: DocumentData,
    newData: DocumentData
): DocumentData {
    val combinedContent = mergeContent(runningDocument.content, newData.content)
    val combinedEntities = mergeEntities(runningDocument.entities, newData.entities)
    val combinedMetadata = mergeMetadata(runningDocument.metadata, newData.metadata)
    return DocumentData(
        id = newData.id ?: runningDocument.id,
        content = combinedContent.takeIf { it.isNotEmpty() },
        entities = combinedEntities.takeIf { it.isNotEmpty() },
        metadata = combinedMetadata
    )
}
private fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata {
return DocumentMetadata(
title = new?.title ?: existing?.title,
Expand All @@ -28,7 +32,6 @@ class ParsingModel(
)
}


private fun mergeContent(
existingContent: List<ContentData>?,
newContent: List<ContentData>?
Expand All @@ -52,7 +55,6 @@ class ParsingModel(
tags = ((existing.tags ?: emptyList()) + (new.tags ?: emptyList())).distinct().takeIf { it.isNotEmpty() }
)


private fun mergeEntities(
existingEntities: Map<String, EntityData>?,
newEntities: Map<String, EntityData>?
Expand All @@ -74,13 +76,36 @@ class ParsingModel(
type = new.type ?: existing.type
)

/**
 * Builds the function that parses text into a [DocumentData].
 *
 * A [ParsedActor] is configured with this model's chat model and temperature and an empty prompt;
 * the extraction instructions are handed to it as the prompt suffix. The actor's parser for [api]
 * is then exposed as a plain Kotlin lambda.
 *
 * @param api the API instance passed through to the actor's parser
 * @return a function that applies the parser to the given text
 */
override fun getParser(api: API): (String) -> DocumentData {
    val instructions = """
        Parse the text into a hierarchical structure that describes the content of the page:
        1. Separate the content into sections, paragraphs, statements, etc.
        2. The final level of the hierarchy should contain singular, short, standalone sentences.
        3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s).
        4. For each entity, include mentions with their exact text and location (start and end indices) in the document.
        5. Extract document metadata such as title, author, creation date, and keywords if available.
        6. Assign relevant tags to each content section to improve searchability and categorization.
        7. Do not copy data from the accumulated document JSON to your response; it is provided for context only.
    """.trimIndent()
    val actor = ParsedActor(
        resultClass = DocumentData::class.java,
        prompt = "",
        parsingModel = chatModels,
        temperature = temperature
    )
    val delegate = actor.getParser(api, promptSuffix = instructions)
    return { input -> delegate.apply(input) }
}

/** Creates a blank [DocumentData]; every field is left at its `null` default. */
override fun newDocument(): DocumentData {
    return DocumentData()
}


data class DocumentData(
@Description("Document/Page identifier") val id: String? = null,
@Description("Entities extracted") val entities: Map<String, EntityData>? = null,
@Description("Hierarchical structure and data") val content: List<ContentData>? = null,
@Description("Document metadata") val metadata: DocumentMetadata? = null
)
) : ParsingModel.DocumentData

data class EntityData(
@Description("Aliases for the entity") val aliases: List<String>? = null,
Expand All @@ -102,25 +127,4 @@ class ParsingModel(
@Description("Other metadata") val properties: Map<String, Any>? = null,
)


/**
 * Creates the parser that turns page text into a [DocumentData] structure.
 *
 * A [ParsedActor] is built for [DocumentData] with an empty prompt and this model's `chatModels` and
 * `temperature`; its parser for [api] is requested with the extraction instructions below supplied
 * as `promptSuffix`.
 *
 * @param api the API instance handed to the actor's parser
 * @return whatever `ParsedActor.getParser` returns (the return type is inferred)
 */
fun getParser(api: API) = ParsedActor(
resultClass = DocumentData::class.java,
prompt = "",
parsingModel = chatModels,
temperature = temperature
// The margin-prefixed raw string is normalised by trimMargin() before being passed on.
).getParser(api, promptSuffix = """
|Parse the text into a hierarchical structure that describes the content of the page:
|1. Separate the content into sections, paragraphs, statements, etc.
|2. The final level of the hierarchy should contain singular, short, standalone sentences.
|3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s).
|4. For each entity, include mentions with their exact text and location (start and end indices) in the document.
|5. Extract document metadata such as title, author, creation date, and keywords if available.
|6. Assign relevant tags to each content section to improve searchability and categorization.
|7. Do not copy data from the accumulated document JSON to your response; it is provided for context only.
""".trimMargin()
)

companion object {
    /** Logger obtained from SLF4J for [ParsingModel]. */
    private val log: org.slf4j.Logger =
        org.slf4j.LoggerFactory.getLogger(ParsingModel::class.java)
}
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
package com.simiacryptus.skyenet.apps.general
package com.simiacryptus.skyenet.apps.general.parsers

import com.simiacryptus.skyenet.apps.general.DocumentParserApp
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.PDFRenderer
import org.apache.pdfbox.text.PDFTextStripper
import java.awt.image.BufferedImage
import java.io.File

class PDFReader(pdfFile: File) : DocumentParserApp.DocumentReader, AutoCloseable {
class PDFReader(pdfFile: File) : DocumentParserApp.DocumentReader {
private val document: PDDocument = PDDocument.load(pdfFile)
private val renderer: PDFRenderer = PDFRenderer(document)
private val stripper: PDFTextStripper = PDFTextStripper().apply { sortByPosition = true }
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.simiacryptus.skyenet.apps.general.parsers

import com.simiacryptus.jopenai.API

/**
 * Abstraction over how document text is parsed into structured data and how successive parse
 * results are folded into one accumulated document.
 */
interface ParsingModel {

    /** Marker type for the structured document an implementation produces; it declares no members. */
    interface DocumentData

    /** Returns a new document instance for callers to start from. */
    fun newDocument(): DocumentData

    /**
     * Returns a function that parses a piece of text into a [DocumentData].
     *
     * @param api the API instance the parser is created for
     */
    fun getParser(api: API): (String) -> DocumentData

    /**
     * Combines [newData] with [runningDocument] and returns the resulting document.
     *
     * @param runningDocument the document accumulated so far
     * @param newData the newly parsed data to merge in
     */
    fun merge(runningDocument: DocumentData, newData: DocumentData): DocumentData

    companion object {
        private val log = org.slf4j.LoggerFactory.getLogger(ParsingModel::class.java)
    }
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.simiacryptus.skyenet.apps.general
package com.simiacryptus.skyenet.apps.general.parsers

import com.simiacryptus.skyenet.apps.general.DocumentParserApp
import java.awt.image.BufferedImage
import java.io.File

Expand Down

0 comments on commit 7726ca0

Please sign in to comment.