Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
acharneski committed Sep 8, 2024
1 parent 7726ca0 commit c6a53f5
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.ChatClient
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.jopenai.util.JsonUtil
import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel
import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel
import com.simiacryptus.skyenet.TabbedDisplay
import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel
import com.simiacryptus.skyenet.apps.general.parsers.PDFReader
import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel
import com.simiacryptus.skyenet.apps.general.parsers.TextReader
import com.simiacryptus.skyenet.core.platform.Session
import com.simiacryptus.skyenet.core.platform.User
Expand All @@ -25,7 +25,7 @@ import java.util.*
import javax.imageio.ImageIO
import kotlin.math.min

class DocumentParserApp(
open class DocumentParserApp(
applicationName: String = "Document Extractor",
path: String = "/pdfExtractor",
val api: API = ChatClient(),
Expand Down Expand Up @@ -130,7 +130,9 @@ class DocumentParserApp(
})
try {
val text = reader.getText(batchStart + 1, batchEnd)
outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_text.txt").writeText(text)
if (settings.saveTextFiles) {
outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_text.txt").writeText(text)
}
val promptList = mutableListOf<String>()
promptList.add(
"""
Expand Down Expand Up @@ -194,14 +196,16 @@ class DocumentParserApp(
image(image)
}
}
val imageFile =
outputDir.resolve("page_${pageIndex + 1}.${settings.outputFormat.lowercase(Locale.getDefault())}")
when (settings.outputFormat.uppercase(Locale.getDefault())) {
"PNG" -> ImageIO.write(image, "PNG", imageFile)
"JPEG", "JPG" -> ImageIO.write(image, "JPEG", imageFile)
"GIF" -> ImageIO.write(image, "GIF", imageFile)
"BMP" -> ImageIO.write(image, "BMP", imageFile)
else -> throw IllegalArgumentException("Unsupported output format: ${settings.outputFormat}")
if (settings.saveImageFiles) {
val imageFile =
outputDir.resolve("page_${pageIndex + 1}.${settings.outputFormat.lowercase(Locale.getDefault())}")
when (settings.outputFormat.uppercase(Locale.getDefault())) {
"PNG" -> ImageIO.write(image, "PNG", imageFile)
"JPEG", "JPG" -> ImageIO.write(image, "JPEG", imageFile)
"GIF" -> ImageIO.write(image, "GIF", imageFile)
"BMP" -> ImageIO.write(image, "BMP", imageFile)
else -> throw IllegalArgumentException("Unsupported output format: ${settings.outputFormat}")
}
}
}
runningDocument = parsingModel.merge(runningDocument, jsonResult)
Expand Down Expand Up @@ -238,6 +242,17 @@ class DocumentParserApp(
""".trimMargin(), ui = ui
)
)
// Save final JSON if enabled in settings
if (settings.saveFinalJson) {
val finalJsonFile = outputDir.resolve("final_document.json")
finalJsonFile.writeText(JsonUtil.toJson(runningDocument))
task.add(
MarkdownUtil.renderMarkdown(
"Final JSON saved to: ${finalJsonFile.absolutePath}",
ui = ui
)
)
}
}
} catch (e: Throwable) {
task.error(ui, e)
Expand All @@ -250,7 +265,10 @@ class DocumentParserApp(
val outputFormat: String = "PNG",
val fileInput: String? = "",
val showImages: Boolean = true,
val pagesPerBatch: Int = 1
val pagesPerBatch: Int = 1,
val saveImageFiles: Boolean = true,
val saveTextFiles: Boolean = true,
val saveFinalJson: Boolean = false
)

/** Runtime class of the settings object used by this application: the nested [Settings] type. */
override val settingsClass: Class<*>
    get() = Settings::class.java
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import com.simiacryptus.jopenai.describe.Description
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.skyenet.core.actors.ParsedActor

class DefaultParsingModel(
open class DefaultParsingModel(
private val chatModels: ChatModels,
private val temperature: Double
) : ParsingModel {
Expand All @@ -24,15 +24,15 @@ class DefaultParsingModel(
)
}

private fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata {
/**
 * Builds a new [DocumentMetadata] from two optional metadata records.
 *
 * - `title`: taken from [new] when it has one, otherwise from [existing].
 * - `keywords`: concatenation of both keyword lists (existing first) with duplicates removed;
 *   always a list, possibly empty.
 * - `properties`: both property maps combined, entries from [new] replacing entries with the
 *   same key in [existing]; `null` when the combined map is empty.
 *
 * @param existing metadata accumulated so far, or `null` if there is none
 * @param new metadata from the most recent parse, or `null` if there is none
 * @return the merged metadata
 */
protected open fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata {
    val resolvedTitle = new?.title ?: existing?.title
    val allKeywords = existing?.keywords.orEmpty() + new?.keywords.orEmpty()
    val allProperties = existing?.properties.orEmpty() + new?.properties.orEmpty()
    return DocumentMetadata(
        title = resolvedTitle,
        keywords = allKeywords.distinct(),
        properties = allProperties.takeIf { it.isNotEmpty() }
    )
}

private fun mergeContent(
protected open fun mergeContent(
existingContent: List<ContentData>?,
newContent: List<ContentData>?
): List<ContentData> {
Expand All @@ -48,14 +48,14 @@ class DefaultParsingModel(
return mergedContent
}

private fun mergeContentData(existing: ContentData, new: ContentData) = existing.copy(
/**
 * Merges two [ContentData] nodes into a copy of [existing].
 *
 * Child content is merged through [mergeContent]; entity and tag lists are concatenated
 * (existing first) and de-duplicated. Each of the three fields becomes `null` when its
 * merged value is empty. All other fields keep the values from [existing].
 *
 * @param existing the node already present in the accumulated document
 * @param new the node produced by the most recent parse
 * @return a copy of [existing] carrying the merged content, entities and tags
 */
protected open fun mergeContentData(existing: ContentData, new: ContentData): ContentData {
    val mergedChildren = mergeContent(existing.content, new.content)
    val mergedEntities = (existing.entities.orEmpty() + new.entities.orEmpty()).distinct()
    val mergedTags = (existing.tags.orEmpty() + new.tags.orEmpty()).distinct()
    return existing.copy(
        content = mergedChildren.takeIf { it.isNotEmpty() },
        entities = mergedEntities.takeIf { it.isNotEmpty() },
        tags = mergedTags.takeIf { it.isNotEmpty() }
    )
}

private fun mergeEntities(
protected open fun mergeEntities(
existingEntities: Map<String, EntityData>?,
newEntities: Map<String, EntityData>?
) = ((existingEntities?.keys ?: emptySet()) + (newEntities?.keys ?: emptySet())).associateWith { key ->
Expand All @@ -68,38 +68,40 @@ class DefaultParsingModel(
}
}

private fun mergeEntityData(existing: EntityData, new: EntityData) = existing.copy(
/**
 * Merges two [EntityData] records into a copy of [existing].
 *
 * - `aliases`: both alias lists concatenated (existing first) and de-duplicated.
 * - `properties` / `relations`: both maps combined, entries from [new] replacing entries with
 *   the same key in [existing].
 * - `type`: taken from [new] when set, otherwise from [existing].
 *
 * Aliases, properties and relations become `null` when their merged value is empty.
 *
 * @param existing the entity already present in the accumulated document
 * @param new the entity produced by the most recent parse
 * @return a copy of [existing] carrying the merged fields
 */
protected open fun mergeEntityData(existing: EntityData, new: EntityData): EntityData {
    val mergedAliases = (existing.aliases.orEmpty() + new.aliases.orEmpty()).distinct()
    val mergedProperties = existing.properties.orEmpty() + new.properties.orEmpty()
    val mergedRelations = existing.relations.orEmpty() + new.relations.orEmpty()
    return existing.copy(
        aliases = mergedAliases.takeIf { it.isNotEmpty() },
        properties = mergedProperties.takeIf { it.isNotEmpty() },
        relations = mergedRelations.takeIf { it.isNotEmpty() },
        type = new.type ?: existing.type
    )
}

/**
 * Instruction text handed to the parser as its `promptSuffix` argument in [getParser].
 *
 * Declared `open` so a subclass can supply different parsing instructions. The leading `|`
 * on each line is a margin marker removed by `trimMargin()`; it is not part of the prompt.
 */
open val promptSuffix = """
|Parse the text into a hierarchical structure that describes the content of the page:
|1. Separate the content into sections, paragraphs, statements, etc.
|2. The final level of the hierarchy should contain singular, short, standalone sentences.
|3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s).
|4. For each entity, include mentions with their exact text and location (start and end indices) in the document.
|5. Extract document metadata such as title, author, creation date, and keywords if available.
|6. Assign relevant tags to each content section to improve searchability and categorization.
|7. Do not copy data from the accumulated document JSON to your response; it is provided for context only.
""".trimMargin()
/**
 * Example value passed to `ParsedActor` as its `exampleInstance` in [getParser].
 * Defaults to a [DocumentData] built with all default arguments; `open` so subclasses can replace it.
 */
open val exampleInstance = DocumentData()
/**
 * Builds the function that turns raw page text into a [DocumentData].
 *
 * A `ParsedActor` is configured with this model's chat model and temperature, an empty base
 * prompt, and [exampleInstance] as the sample result. Its parser is obtained for the given
 * [api] with [promptSuffix] as the parsing instructions, so subclasses can change the
 * instructions by overriding that property rather than this method.
 *
 * @param api the API client the underlying parser uses for its calls
 * @return a function mapping input text to the parsed [DocumentData]
 */
override fun getParser(api: API): (String) -> DocumentData {
    val parser = ParsedActor(
        resultClass = DocumentData::class.java,
        exampleInstance = exampleInstance,
        prompt = "",
        parsingModel = chatModels,
        temperature = temperature
    ).getParser(api, promptSuffix = promptSuffix)
    // Adapt the actor's parser object to a plain Kotlin function type.
    return { text -> parser.apply(text) }
}

/** Creates the starting document for a parse run: a [DocumentData] with every field at its default. */
override fun newDocument(): DocumentData {
    return DocumentData()
}


data class DocumentData(
@Description("Document/Page identifier") val id: String? = null,
@Description("Entities extracted") val entities: Map<String, EntityData>? = null,
Expand Down Expand Up @@ -127,4 +129,8 @@ class DefaultParsingModel(
@Description("Other metadata") val properties: Map<String, Any>? = null,
)

companion object {
// SLF4J logger shared by all instances, named after DefaultParsingModel.
val log = org.slf4j.LoggerFactory.getLogger(DefaultParsingModel::class.java)
}

}

0 comments on commit c6a53f5

Please sign in to comment.