diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt index 88938d65..974ade15 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt @@ -4,10 +4,10 @@ import com.simiacryptus.jopenai.API import com.simiacryptus.jopenai.ChatClient import com.simiacryptus.jopenai.models.ChatModels import com.simiacryptus.jopenai.util.JsonUtil -import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel -import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel import com.simiacryptus.skyenet.TabbedDisplay +import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel import com.simiacryptus.skyenet.apps.general.parsers.PDFReader +import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel import com.simiacryptus.skyenet.apps.general.parsers.TextReader import com.simiacryptus.skyenet.core.platform.Session import com.simiacryptus.skyenet.core.platform.User @@ -25,7 +25,7 @@ import java.util.* import javax.imageio.ImageIO import kotlin.math.min -class DocumentParserApp( +open class DocumentParserApp( applicationName: String = "Document Extractor", path: String = "/pdfExtractor", val api: API = ChatClient(), @@ -130,7 +130,9 @@ class DocumentParserApp( }) try { val text = reader.getText(batchStart + 1, batchEnd) - outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_text.txt").writeText(text) + if (settings.saveTextFiles) { + outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_text.txt").writeText(text) + } val promptList = mutableListOf() promptList.add( """ @@ -194,14 +196,16 @@ class DocumentParserApp( image(image) } } - val imageFile = - outputDir.resolve("page_${pageIndex + 1}.${settings.outputFormat.lowercase(Locale.getDefault())}") - when (settings.outputFormat.uppercase(Locale.getDefault())) { - "PNG" -> ImageIO.write(image, "PNG", imageFile) - "JPEG", "JPG" -> ImageIO.write(image, "JPEG", imageFile) - "GIF" -> ImageIO.write(image, "GIF", imageFile) - "BMP" -> ImageIO.write(image, "BMP", imageFile) - else -> throw IllegalArgumentException("Unsupported output format: ${settings.outputFormat}") + if (settings.saveImageFiles) { + val imageFile = + outputDir.resolve("page_${pageIndex + 1}.${settings.outputFormat.lowercase(Locale.getDefault())}") + when (settings.outputFormat.uppercase(Locale.getDefault())) { + "PNG" -> ImageIO.write(image, "PNG", imageFile) + "JPEG", "JPG" -> ImageIO.write(image, "JPEG", imageFile) + "GIF" -> ImageIO.write(image, "GIF", imageFile) + "BMP" -> ImageIO.write(image, "BMP", imageFile) + else -> throw IllegalArgumentException("Unsupported output format: ${settings.outputFormat}") + } } } runningDocument = parsingModel.merge(runningDocument, jsonResult) @@ -238,6 +242,17 @@ class DocumentParserApp( """.trimMargin(), ui = ui ) ) + // Save final JSON if enabled in settings + if (settings.saveFinalJson) { + val finalJsonFile = outputDir.resolve("final_document.json") + finalJsonFile.writeText(JsonUtil.toJson(runningDocument)) + task.add( + MarkdownUtil.renderMarkdown( + "Final JSON saved to: ${finalJsonFile.absolutePath}", + ui = ui + ) + ) + } } } catch (e: Throwable) { task.error(ui, e) @@ -250,7 +265,10 @@ class DocumentParserApp( val outputFormat: String = "PNG", val fileInput: String? = "", val showImages: Boolean = true, - val pagesPerBatch: Int = 1 + val pagesPerBatch: Int = 1, + val saveImageFiles: Boolean = true, + val saveTextFiles: Boolean = true, + val saveFinalJson: Boolean = false ) override val settingsClass: Class<*> get() = Settings::class.java diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt index 5a7c45fc..24bf413e 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt @@ -5,7 +5,7 @@ import com.simiacryptus.jopenai.describe.Description import com.simiacryptus.jopenai.models.ChatModels import com.simiacryptus.skyenet.core.actors.ParsedActor -class DefaultParsingModel( +open class DefaultParsingModel( private val chatModels: ChatModels, private val temperature: Double ) : ParsingModel { @@ -24,7 +24,7 @@ class DefaultParsingModel( ) } - private fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata { + protected open fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata { return DocumentMetadata( title = new?.title ?: existing?.title, keywords = ((existing?.keywords ?: emptyList()) + (new?.keywords ?: emptyList())).distinct(), @@ -32,7 +32,7 @@ class DefaultParsingModel( ) } - private fun mergeContent( + protected open fun mergeContent( existingContent: List?, newContent: List? ): List { @@ -48,14 +48,14 @@ class DefaultParsingModel( return mergedContent } - private fun mergeContentData(existing: ContentData, new: ContentData) = existing.copy( + protected open fun mergeContentData(existing: ContentData, new: ContentData) = existing.copy( content = mergeContent(existing.content, new.content).takeIf { it.isNotEmpty() }, entities = ((existing.entities ?: emptyList()) + (new.entities ?: emptyList())).distinct() .takeIf { it.isNotEmpty() }, tags = ((existing.tags ?: emptyList()) + (new.tags ?: emptyList())).distinct().takeIf { it.isNotEmpty() } ) - private fun mergeEntities( + protected open fun mergeEntities( existingEntities: Map?, newEntities: Map? ) = ((existingEntities?.keys ?: emptySet()) + (newEntities?.keys ?: emptySet())).associateWith { key -> @@ -68,7 +68,7 @@ class DefaultParsingModel( } } - private fun mergeEntityData(existing: EntityData, new: EntityData) = existing.copy( + protected open fun mergeEntityData(existing: EntityData, new: EntityData) = existing.copy( aliases = ((existing.aliases ?: emptyList()) + (new.aliases ?: emptyList())).distinct() .takeIf { it.isNotEmpty() }, properties = ((existing.properties ?: emptyMap()) + (new.properties ?: emptyMap())).takeIf { it.isNotEmpty() }, @@ -76,30 +76,32 @@ class DefaultParsingModel( type = new.type ?: existing.type ) + open val promptSuffix = """ + |Parse the text into a hierarchical structure that describes the content of the page: + |1. Separate the content into sections, paragraphs, statements, etc. + |2. The final level of the hierarchy should contain singular, short, standalone sentences. + |3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s). + |4. For each entity, include mentions with their exact text and location (start and end indices) in the document. + |5. Extract document metadata such as title, author, creation date, and keywords if available. + |6. Assign relevant tags to each content section to improve searchability and categorization. + |7. Do not copy data from the accumulated document JSON to your response; it is provided for context only. + """.trimMargin() + open val exampleInstance = DocumentData() override fun getParser(api: API): (String) -> DocumentData { val parser = ParsedActor( resultClass = DocumentData::class.java, + exampleInstance = exampleInstance, prompt = "", parsingModel = chatModels, temperature = temperature ).getParser( - api, promptSuffix = """ - Parse the text into a hierarchical structure that describes the content of the page: - 1. Separate the content into sections, paragraphs, statements, etc. - 2. The final level of the hierarchy should contain singular, short, standalone sentences. - 3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s). - 4. For each entity, include mentions with their exact text and location (start and end indices) in the document. - 5. Extract document metadata such as title, author, creation date, and keywords if available. - 6. Assign relevant tags to each content section to improve searchability and categorization. - 7. Do not copy data from the accumulated document JSON to your response; it is provided for context only. - """.trimIndent() + api, promptSuffix = promptSuffix ) return { text -> parser.apply(text) } } override fun newDocument() = DocumentData() - data class DocumentData( @Description("Document/Page identifier") val id: String? = null, @Description("Entities extracted") val entities: Map? = null, @@ -127,4 +129,8 @@ class DefaultParsingModel( @Description("Other metadata") val properties: Map? = null, ) + companion object { + val log = org.slf4j.LoggerFactory.getLogger(DefaultParsingModel::class.java) + } + } \ No newline at end of file