From e3a20fba6399d8c83437e985b60b56bdca49f5b0 Mon Sep 17 00:00:00 2001 From: Andrew Charneski Date: Sun, 8 Sep 2024 21:56:20 -0400 Subject: [PATCH] 1.2.1 (#99) * 1.2.1 * wip * wip * abstracted parsing model * wip * wip * Update DefaultParsingModel.kt --- README.md | 6 +- core/build.gradle.kts | 2 +- .../skyenet/core/actors/ParsedActor.kt | 10 +- gradle.properties | 2 +- webui/build.gradle.kts | 10 +- .../skyenet/apps/general/DocumentParserApp.kt | 287 ++++++++++++++++++ .../general/parsers/DefaultParsingModel.kt | 233 ++++++++++++++ .../skyenet/apps/general/parsers/PDFReader.kt | 30 ++ .../apps/general/parsers/ParsingModel.kt | 14 + .../apps/general/parsers/TextReader.kt | 23 ++ .../skyenet/webui/session/SessionTask.kt | 2 +- .../skyenet/webui/ActorTestAppServer.kt | 5 +- 12 files changed, 611 insertions(+), 13 deletions(-) create mode 100644 webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt create mode 100644 webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt create mode 100644 webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt create mode 100644 webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt create mode 100644 webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt diff --git a/README.md b/README.md index b4744299..b22896a2 100644 --- a/README.md +++ b/README.md @@ -76,18 +76,18 @@ Maven: <dependency> <groupId>com.simiacryptus</groupId> <artifactId>skyenet-webui</artifactId> - <version>1.1.0</version> + <version>1.1.1</version> </dependency> ``` Gradle: ```groovy -implementation group: 'com.simiacryptus', name: 'skyenet', version: '1.1.0' +implementation group: 'com.simiacryptus', name: 'skyenet', version: '1.1.1' ``` ```kotlin -implementation("com.simiacryptus:skyenet:1.1.0") +implementation("com.simiacryptus:skyenet:1.1.1") ``` ### 🌟 To Use diff --git a/core/build.gradle.kts b/core/build.gradle.kts index aa5e70c9..3af12976 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -33,7 +33,7 @@ val hsqldb_version = "2.7.2" dependencies { - implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.0") + implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.1") implementation(group = "org.hsqldb", name = "hsqldb", version = hsqldb_version) implementation("org.apache.commons:commons-text:1.11.0") diff --git a/core/src/main/kotlin/com/simiacryptus/skyenet/core/actors/ParsedActor.kt b/core/src/main/kotlin/com/simiacryptus/skyenet/core/actors/ParsedActor.kt index ec7686fb..de227cc6 100644 --- a/core/src/main/kotlin/com/simiacryptus/skyenet/core/actors/ParsedActor.kt +++ b/core/src/main/kotlin/com/simiacryptus/skyenet/core/actors/ParsedActor.kt @@ -58,7 +58,7 @@ open class ParsedActor( override val obj get() = _obj } - fun getParser(api: API) = Function<String, T> { input -> + fun getParser(api: API, promptSuffix: String?
= null) = Function<String, T> { input -> describer.coverMethods = false val describe = resultClass?.let { describer.describe(it) } ?: "" val exceptions = mutableListOf<Exception>() @@ -73,6 +73,7 @@ open class ParsedActor( |```json |${JsonUtil.toJson(exampleInstance!!)/*.indent(" ")*/} |``` + |${promptSuffix?.let { "\n$it" } ?: ""} | """.trimMargin() for (i in 0 until deserializerRetries) { @@ -100,7 +101,12 @@ open class ParsedActor( // if input is wrapped in a ```json block, remove the block if (contentUnwrapped.startsWith("```json")) { - contentUnwrapped = contentUnwrapped.substring(7, contentUnwrapped.lastIndexOf("```")) + val endIndex = contentUnwrapped.lastIndexOf("```") + if (endIndex > 7) { + contentUnwrapped = contentUnwrapped.substring(7, endIndex) + } else { + throw RuntimeException("Failed to parse response: ${contentUnwrapped.replace("\n", "\n ")}") + } } contentUnwrapped.let { diff --git a/gradle.properties b/gradle.properties index 37bb4de2..b024eb62 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,5 +1,5 @@ # Gradle Releases -> https://github.com/gradle/gradle/releases libraryGroup = com.simiacryptus.skyenet -libraryVersion = 1.2.0 +libraryVersion = 1.2.1 gradleVersion = 7.6.1 kotlin.daemon.jvmargs=-Xmx2g diff --git a/webui/build.gradle.kts b/webui/build.gradle.kts index e19909c8..b9aa7f37 100644 --- a/webui/build.gradle.kts +++ b/webui/build.gradle.kts @@ -33,15 +33,17 @@ kotlin { val kotlin_version = "2.0.0-Beta5" val jetty_version = "11.0.18" val jackson_version = "2.17.0" + dependencies { - implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.0") { + implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.1") { exclude(group = "org.slf4j", module = "slf4j-api") } implementation(project(":core")) implementation(project(":kotlin")) + implementation("org.apache.pdfbox:pdfbox:2.0.27") implementation("org.seleniumhq.selenium:selenium-chrome-driver:4.16.1") compileOnly("org.jsoup:jsoup:1.17.2") @@ -61,6 +63,11 @@ dependencies { } testRuntimeOnly("org.openapitools:openapi-generator-cli:7.3.0") + implementation("org.apache.parquet:parquet-common:1.12.3") + implementation("org.apache.parquet:parquet-avro:1.12.3") + implementation("org.apache.hadoop:hadoop-common:3.3.4") + implementation("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.4") + implementation(group = "org.eclipse.jetty", name = "jetty-server", version = jetty_version) implementation(group = "org.eclipse.jetty", name = "jetty-servlet", version = jetty_version) implementation(group = "org.eclipse.jetty", name = "jetty-annotations", version = jetty_version) @@ -114,7 +121,6 @@ sass { tasks { - compileKotlin { compilerOptions { javaParameters.set(true) diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt new file mode 100644 index 00000000..0b75d046 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt @@ -0,0 +1,287 @@ +package com.simiacryptus.skyenet.apps.general + +import com.simiacryptus.jopenai.API +import com.simiacryptus.jopenai.ChatClient +import com.simiacryptus.jopenai.models.ChatModels +import com.simiacryptus.jopenai.util.JsonUtil +import com.simiacryptus.skyenet.TabbedDisplay +import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel +import com.simiacryptus.skyenet.apps.general.parsers.PDFReader +import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel +import
com.simiacryptus.skyenet.apps.general.parsers.TextReader +import com.simiacryptus.skyenet.core.platform.Session +import com.simiacryptus.skyenet.core.platform.User +import com.simiacryptus.skyenet.webui.application.ApplicationInterface +import com.simiacryptus.skyenet.webui.application.ApplicationServer +import com.simiacryptus.skyenet.webui.application.ApplicationSocketManager +import com.simiacryptus.skyenet.webui.session.SessionTask +import com.simiacryptus.skyenet.webui.session.SocketManager +import com.simiacryptus.skyenet.webui.util.MarkdownUtil +import org.intellij.lang.annotations.Language +import java.awt.image.BufferedImage +import java.io.File +import java.nio.file.Path +import java.util.* +import javax.imageio.ImageIO +import kotlin.io.path.name +import kotlin.math.min + +open class DocumentParserApp( + applicationName: String = "Document Extractor", + path: String = "/pdfExtractor", + val api: API = ChatClient(), + val parsingModel: ParsingModel = DefaultParsingModel(ChatModels.Claude35Sonnet, 0.1), + val reader: (File) -> DocumentReader = { + when { + it.name.endsWith(".pdf", ignoreCase = true) -> PDFReader(it) + it.name.endsWith(".txt", ignoreCase = true) -> TextReader(it) + it.name.endsWith(".md", ignoreCase = true) -> TextReader(it) + it.name.endsWith(".html", ignoreCase = true) -> TextReader(it) + else -> throw IllegalArgumentException("Unsupported file type") + } + }, + val fileInput: Path? = null, +) : ApplicationServer( + applicationName = applicationName, + path = path, + showMenubar = true +) { + override val singleInput: Boolean = true + override val stickyInput: Boolean = false + + override fun newSession(user: User?, session: Session): SocketManager { + val socketManager = super.newSession(user, session) + val ui = (socketManager as ApplicationSocketManager).applicationInterface + val settings = getSettings(session, user, Settings::class.java) ?: Settings() + if (null == (fileInput ?: settings.fileInput)) { + log.info("No file input provided") + } else socketManager.pool.submit { + run( + task = ui.newTask(), + ui = ui, + fileInput = (this.fileInput ?: settings.fileInput?.let { File(it).toPath() } + ?: error("File input not provided")).apply { + if (!toFile().exists()) error("File not found: $this") + }, + maxPages = settings.maxPages.coerceAtMost(Int.MAX_VALUE), + settings = settings, + pagesPerBatch = settings.pagesPerBatch, + ) + } + return socketManager + } + + override fun userMessage(session: Session, user: User?, userMessage: String, ui: ApplicationInterface, api: API) { + val settings = getSettings(session, user, Settings::class.java) ?: Settings() + val fileInput = + (fileInput ?: settings.fileInput?.let { File(it).toPath() } ?: error("File input not provided")).apply { + if (!toFile().exists()) error("File not found: $this") + } + ui.socketManager!!.pool.submit { + run( + task = ui.newTask(), + ui = ui, + fileInput = fileInput, + maxPages = settings.maxPages.coerceAtMost(Int.MAX_VALUE), + settings = settings, + pagesPerBatch = settings.pagesPerBatch, + ) + } + } + + private fun run( + task: SessionTask, + ui: ApplicationInterface, + fileInput: Path, + maxPages: Int, + settings: Settings, + pagesPerBatch: Int, + ) { + try { + val pdfFile = fileInput.toFile() + if (!pdfFile.exists() || !pdfFile.isFile || !pdfFile.name.endsWith(".pdf", ignoreCase = true)) { + throw IllegalArgumentException("Invalid PDF file: $pdfFile") + } + task.add(MarkdownUtil.renderMarkdown("# PDF Extractor", ui = ui)) + val outputDir = root.resolve("output").apply { mkdirs() } + 
lateinit var runningDocument: ParsingModel.DocumentData + reader(pdfFile).use { reader -> + runningDocument = parsingModel.newDocument() + var previousPageText = "" // Keep this for context + task.add( + MarkdownUtil.renderMarkdown( + """ + ## Processing PDF: ${pdfFile.name} + Total pages: ${reader.getPageCount()} + """.trimIndent(), ui = ui + ) + ) + val pageCount = minOf(reader.getPageCount(), maxPages) + val tabs = TabbedDisplay(task) + for (batchStart in 0 until pageCount step pagesPerBatch) { + val batchEnd = min(batchStart + pagesPerBatch, pageCount) + val pageTask = ui.newTask(false) + val pageTabs = TabbedDisplay(pageTask.apply { + val label = + if ((batchStart + 1) != batchEnd) "Pages ${batchStart + 1}-${batchEnd}" else "Page ${batchStart + 1}" + tabs[label] = this.placeholder + }) + try { + val text = reader.getText(batchStart + 1, batchEnd) + if (settings.saveTextFiles) { + outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_text.txt").writeText(text) + } + val promptList = mutableListOf<String>() + promptList.add( + """ + |# Accumulated Prior JSON: + | + |FOR INFORMATIVE CONTEXT ONLY. DO NOT COPY TO OUTPUT. + |```json + |${JsonUtil.toJson(runningDocument)} + |``` + """.trimMargin() + ) + promptList.add( + """ + |# Prior Text + | + |FOR INFORMATIVE CONTEXT ONLY. DO NOT COPY TO OUTPUT. + |```text + |$previousPageText + |``` + |""".trimMargin() + ) + promptList.add( + """ + |# Current Page + | + |```text + |$text + |``` + """.trimMargin() + ) + @Language("Markdown") val jsonResult = parsingModel.getParser(api).let { + it(promptList.toList().joinToString("\n\n")) + } + val jsonFile = outputDir.resolve("pages_${batchStart + 1}_to_${batchEnd}_content.json") + jsonFile.writeText(JsonUtil.toJson(jsonResult)) + ui.newTask(false).apply { + pageTabs["Text"] = this.placeholder + add( + MarkdownUtil.renderMarkdown( + "\n```text\n${ + text + }\n```\n", ui = ui + ) + ) + } + ui.newTask(false).apply { + pageTabs["JSON"] = this.placeholder + add( + MarkdownUtil.renderMarkdown( + "\n```json\n${ + JsonUtil.toJson(jsonResult) + }\n```\n", ui = ui + ) + ) + } + for (pageIndex in batchStart until batchEnd) { + val image = reader.renderImage(pageIndex, settings.dpi) + if (settings.showImages) { + ui.newTask(false).apply { + pageTabs["Image ${pageIndex + 1}"] = this.placeholder + image(image) + } + } + if (settings.saveImageFiles) { + val imageFile = + outputDir.resolve("page_${pageIndex + 1}.${settings.outputFormat.lowercase(Locale.getDefault())}") + when (settings.outputFormat.uppercase(Locale.getDefault())) { + "PNG" -> ImageIO.write(image, "PNG", imageFile) + "JPEG", "JPG" -> ImageIO.write(image, "JPEG", imageFile) + "GIF" -> ImageIO.write(image, "GIF", imageFile) + "BMP" -> ImageIO.write(image, "BMP", imageFile) + else -> throw IllegalArgumentException("Unsupported output format: ${settings.outputFormat}") + } + } + } + runningDocument = parsingModel.merge(runningDocument, jsonResult) + ui.newTask(false).apply { + pageTabs["Accumulator"] = this.placeholder + add( + MarkdownUtil.renderMarkdown( + """ + |## Accumulated Document JSON + | + |```json + |${JsonUtil.toJson(runningDocument)} + |``` + """.trimMargin(), ui = ui + ) + ) + } + previousPageText = text.takeLast(1000) + } catch (e: Throwable) { + pageTask.error(ui, e) + continue + } + } + task.add( + MarkdownUtil.renderMarkdown( + """ + |## Document JSON + | + |```json + |${JsonUtil.toJson(runningDocument)} + |``` + | + |Extracted files are saved in: ${outputDir.absolutePath} + """.trimMargin(), ui = ui + ) + ) + // Save final JSON if enabled in
settings + if (settings.saveFinalJson) { + val finalJsonFile = root.resolve(fileInput.name.reversed().split(delimiters = arrayOf("."), false, 2)[1].reversed() + ".parsed.json") + finalJsonFile.writeText(JsonUtil.toJson(runningDocument)) + task.add( + MarkdownUtil.renderMarkdown( + "Final JSON saved to: ${finalJsonFile.absolutePath}", + ui = ui + ) + ) + } + } + } catch (e: Throwable) { + task.error(ui, e) + } + } + + data class Settings( + val dpi: Float = 120f, + val maxPages: Int = Int.MAX_VALUE, + val outputFormat: String = "PNG", + val fileInput: String? = "", + val showImages: Boolean = true, + val pagesPerBatch: Int = 1, + val saveImageFiles: Boolean = true, + val saveTextFiles: Boolean = true, + val saveFinalJson: Boolean = false + ) + + override val settingsClass: Class<*> get() = Settings::class.java + + @Suppress("UNCHECKED_CAST") + override fun <T : Any> initSettings(session: Session): T = Settings() as T + + companion object { + private val log = org.slf4j.LoggerFactory.getLogger(DocumentParserApp::class.java) + } + + interface DocumentReader : AutoCloseable { + fun getPageCount(): Int + fun getText(startPage: Int, endPage: Int): String + fun renderImage(pageIndex: Int, dpi: Float): BufferedImage + } + +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt new file mode 100644 index 00000000..16703931 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt @@ -0,0 +1,233 @@ +package com.simiacryptus.skyenet.apps.general.parsers + +import com.simiacryptus.jopenai.API +import com.simiacryptus.jopenai.ApiModel +import com.simiacryptus.jopenai.OpenAIClient +import com.simiacryptus.jopenai.describe.Description +import com.simiacryptus.jopenai.models.ChatModels +import com.simiacryptus.jopenai.models.EmbeddingModels +import com.simiacryptus.jopenai.util.JsonUtil +import com.simiacryptus.skyenet.core.actors.ParsedActor +import org.apache.avro.Schema +import org.apache.avro.generic.GenericData +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.avro.AvroParquetWriter +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import java.io.File + +open class DefaultParsingModel( + private val chatModels: ChatModels, + private val temperature: Double +) : ParsingModel { + + override fun merge( + runningDocument: ParsingModel.DocumentData, + newData: ParsingModel.DocumentData + ) : ParsingModel.DocumentData { + val runningDocument = runningDocument as DocumentData + val newData = newData as DocumentData + return DocumentData( + id = newData.id ?: runningDocument.id, + content = mergeContent(runningDocument.content, newData.content).takeIf { it.isNotEmpty() }, + entities = mergeEntities(runningDocument.entities, newData.entities).takeIf { it.isNotEmpty() }, + metadata = mergeMetadata(runningDocument.metadata, newData.metadata) + ) + } + + protected open fun mergeMetadata(existing: DocumentMetadata?, new: DocumentMetadata?): DocumentMetadata { + return DocumentMetadata( + title = new?.title ?: existing?.title, + keywords = ((existing?.keywords ?: emptyList()) + (new?.keywords ?: emptyList())).distinct(), + properties = ((existing?.properties ?: emptyMap()) + (new?.properties ?: emptyMap())).takeIf { it.isNotEmpty() } + ) + } + + protected open fun mergeContent( + 
existingContent: List<ContentData>?, + newContent: List<ContentData>? + ): List<ContentData> { + val mergedContent = (existingContent ?: emptyList()).toMutableList() + (newContent ?: emptyList()).forEach { newItem -> + val existingIndex = mergedContent.indexOfFirst { it.type == newItem.type && it.text?.trim() == newItem.text?.trim() } + if (existingIndex != -1) { + mergedContent[existingIndex] = mergeContentData(mergedContent[existingIndex], newItem) + } else { + mergedContent.add(newItem) + } + } + return mergedContent + } + + protected open fun mergeContentData(existing: ContentData, new: ContentData) = existing.copy( + content = mergeContent(existing.content, new.content).takeIf { it.isNotEmpty() }, + entities = ((existing.entities ?: emptyList()) + (new.entities ?: emptyList())).distinct() + .takeIf { it.isNotEmpty() }, + tags = ((existing.tags ?: emptyList()) + (new.tags ?: emptyList())).distinct().takeIf { it.isNotEmpty() } + ) + + protected open fun mergeEntities( + existingEntities: Map<String, EntityData>?, + newEntities: Map<String, EntityData>? + ) = ((existingEntities?.keys ?: emptySet()) + (newEntities?.keys ?: emptySet())).associateWith { key -> + val existing = existingEntities?.get(key) + val new = newEntities?.get(key) + when { + existing == null -> new!! + new == null -> existing + else -> mergeEntityData(existing, new) + } + } + + protected open fun mergeEntityData(existing: EntityData, new: EntityData) = existing.copy( + aliases = ((existing.aliases ?: emptyList()) + (new.aliases ?: emptyList())).distinct() + .takeIf { it.isNotEmpty() }, + properties = ((existing.properties ?: emptyMap()) + (new.properties ?: emptyMap())).takeIf { it.isNotEmpty() }, + relations = ((existing.relations ?: emptyMap()) + (new.relations ?: emptyMap())).takeIf { it.isNotEmpty() }, + type = new.type ?: existing.type + ) + + open val promptSuffix = """ + |Parse the text into a hierarchical structure that describes the content of the page: + |1. Separate the content into sections, paragraphs, statements, etc. + |2. The final level of the hierarchy should contain singular, short, standalone sentences. + |3. Capture any entities, relationships, and properties that can be extracted from the text of the current page(s). + |4. For each entity, include mentions with their exact text and location (start and end indices) in the document. + |5. Extract document metadata such as title, author, creation date, and keywords if available. + |6. Assign relevant tags to each content section to improve searchability and categorization. + |7. Do not copy data from the accumulated document JSON to your response; it is provided for context only. + """.trimMargin() + open val exampleInstance = DocumentData() + override fun getParser(api: API): (String) -> DocumentData { + val parser = ParsedActor( + resultClass = DocumentData::class.java, + exampleInstance = exampleInstance, + prompt = "", + parsingModel = chatModels, + temperature = temperature + ).getParser( + api, promptSuffix = promptSuffix + ) + return { text -> parser.apply(text) } + } + + override fun newDocument() = DocumentData() + + data class DocumentData( + @Description("Document/Page identifier") val id: String? = null, + @Description("Entities extracted") val entities: Map<String, EntityData>? = null, + @Description("Hierarchical structure and data") val content: List<ContentData>? = null, + @Description("Document metadata") val metadata: DocumentMetadata? = null + ) : ParsingModel.DocumentData + + data class EntityData( + @Description("Aliases for the entity") val aliases: List<String>? 
= null, + @Description("Entity attributes extracted from the page") val properties: Map<String, Any>? = null, + @Description("Entity relationships extracted from the page") val relations: Map<String, String>? = null, + @Description("Entity type (e.g., person, organization, location)") val type: String? = null + ) + + data class ContentData( + @Description("Content type, e.g. heading, paragraph, statement, list") val type: String = "", + @Description("Brief, self-contained text either copied, paraphrased, or summarized") val text: String? = null, + @Description("Sub-elements") val content: List<ContentData>? = null, + @Description("Related entities by ID") val entities: List<String>? = null, + @Description("Tags - related topics and non-entity indexing") val tags: List<String>? = null + ) + data class DocumentMetadata( + @Description("Document title") val title: String? = null, + @Description("Keywords or tags associated with the document") val keywords: List<String>? = null, + @Description("Other metadata") val properties: Map<String, Any>? = null, + ) + + companion object { + val log = org.slf4j.LoggerFactory.getLogger(DefaultParsingModel::class.java) + + fun saveAsParquet(outputPath: String, openAIClient: OpenAIClient, vararg inputPaths: String) { + val schema = Schema.Parser().parse(File("document_schema.avsc")) + val rows = mutableListOf<GenericData.Record>() + inputPaths.forEach { inputPath -> + processDocument( + inputPath, + JsonUtil.fromJson(File(inputPath).readText(), DocumentData::class.java), + schema, + rows, + openAIClient + ) + } + writeParquet(outputPath, schema, rows) + } + + private fun processDocument( + inputPath: String, + document: DocumentData, + schema: Schema, + rows: MutableList<GenericData.Record>, + openAIClient: OpenAIClient + ) { + fun processContent(content: ContentData, parentId: String? = null, depth: Int = 0, path: String = "") { + val record = GenericData.Record(schema) + record.put("id", content.hashCode().toString()) + record.put("parent_id", parentId) + record.put("type", content.type) + record.put("text", content.text) + record.put("entities", content.entities?.joinToString(",")) + record.put("tags", content.tags?.joinToString(",")) + record.put("source_path", inputPath) + record.put("depth", depth) + record.put("json_path", path) + // Generate vector embedding + val textToEmbed = "${content.type}: ${content.text}" + val embedding: DoubleArray = openAIClient.createEmbedding( + ApiModel.EmbeddingRequest( + EmbeddingModels.Large.modelName, textToEmbed + ) + ).data.get(0).embedding ?: DoubleArray(0) + record.put("vector", embedding) + + rows.add(record) + content.content?.forEachIndexed { index, childContent -> + processContent(childContent, content.hashCode().toString(), depth + 1, "$path.content[$index]") + } + } + document.content?.forEachIndexed { index, content -> + processContent(content, null, 0, "content[$index]") + } + document.entities?.forEach { (entityId, entityData) -> + val record = GenericData.Record(schema) + record.put("id", entityId) + record.put("type", "entity") + record.put("text", entityData.aliases?.joinToString(", ")) + record.put("properties", entityData.properties?.entries?.joinToString(", ") { "${it.key}:${it.value}" }) + record.put("relations", entityData.relations?.entries?.joinToString(", ") { "${it.key}:${it.value}" }) + record.put("source_path", inputPath) + record.put("depth", -1) // Use -1 to indicate it's an entity, not part of the content hierarchy + record.put("json_path", "entities.$entityId") + // Generate vector embedding for entity + val textToEmbed = "Entity ${entityData.type}: ${entityData.aliases?.joinToString(", ")}" + val 
embedding = openAIClient.createEmbedding( + ApiModel.EmbeddingRequest( + EmbeddingModels.Large.modelName, textToEmbed + ) + ).data.get(0).embedding ?: DoubleArray(0) + record.put("vector", embedding) + + rows.add(record) + } + } + + private fun writeParquet(outputPath: String, schema: Schema, rows: List<GenericData.Record>) { + log.info("Writing ${rows.size} rows to $outputPath") + val conf = Configuration() + val writer: ParquetWriter<GenericData.Record> = AvroParquetWriter.builder<GenericData.Record>(Path(outputPath)) + .withSchema(schema) + .withConf(conf) + .withCompressionCodec(CompressionCodecName.SNAPPY) + .build() + rows.forEach { writer.write(it) } + writer.close() + } + } + +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt new file mode 100644 index 00000000..f30634d9 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/PDFReader.kt @@ -0,0 +1,30 @@ +package com.simiacryptus.skyenet.apps.general.parsers + +import com.simiacryptus.skyenet.apps.general.DocumentParserApp +import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.rendering.PDFRenderer +import org.apache.pdfbox.text.PDFTextStripper +import java.awt.image.BufferedImage +import java.io.File + +class PDFReader(pdfFile: File) : DocumentParserApp.DocumentReader { + private val document: PDDocument = PDDocument.load(pdfFile) + private val renderer: PDFRenderer = PDFRenderer(document) + private val stripper: PDFTextStripper = PDFTextStripper().apply { sortByPosition = true } + + override fun getPageCount(): Int = document.numberOfPages + + override fun getText(startPage: Int, endPage: Int): String { + stripper.startPage = startPage + stripper.endPage = endPage + return stripper.getText(document) + } + + override fun renderImage(pageIndex: Int, dpi: Float): BufferedImage { + return renderer.renderImageWithDPI(pageIndex, dpi) + } + + override fun close() { + document.close() + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt new file mode 100644 index 00000000..f73f3d90 --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/ParsingModel.kt @@ -0,0 +1,14 @@ +package com.simiacryptus.skyenet.apps.general.parsers + +import com.simiacryptus.jopenai.API + +interface ParsingModel { + fun merge(runningDocument: DocumentData, newData: DocumentData): DocumentData + fun getParser(api: API): (String) -> DocumentData + fun newDocument(): DocumentData + + interface DocumentData + companion object { + private val log = org.slf4j.LoggerFactory.getLogger(ParsingModel::class.java) + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt new file mode 100644 index 00000000..8292ff1c --- /dev/null +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/TextReader.kt @@ -0,0 +1,23 @@ +package com.simiacryptus.skyenet.apps.general.parsers + +import com.simiacryptus.skyenet.apps.general.DocumentParserApp +import java.awt.image.BufferedImage +import java.io.File + +class TextReader(private val textFile: File) : DocumentParserApp.DocumentReader { + private val content: List<String> = textFile.readLines() + + override fun getPageCount(): Int = 1 + 
override fun getText(startPage: Int, endPage: Int): String { + return content.joinToString("\n") + } + + override fun renderImage(pageIndex: Int, dpi: Float): BufferedImage { + throw UnsupportedOperationException("Text files do not support image rendering") + } + + override fun close() { + // No resources to close for text files + } +} \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt index 130d79f0..2baeed13 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt @@ -187,7 +187,7 @@ abstract class SessionTask( fun image( @Description("The image to display") image: BufferedImage - ) = add("""""") + ) = add("""""") companion object { val log = LoggerFactory.getLogger(SessionTask::class.java) diff --git a/webui/src/test/kotlin/com/simiacryptus/skyenet/webui/ActorTestAppServer.kt b/webui/src/test/kotlin/com/simiacryptus/skyenet/webui/ActorTestAppServer.kt index 16a19108..421146b8 100644 --- a/webui/src/test/kotlin/com/simiacryptus/skyenet/webui/ActorTestAppServer.kt +++ b/webui/src/test/kotlin/com/simiacryptus/skyenet/webui/ActorTestAppServer.kt @@ -2,9 +2,9 @@ package com.simiacryptus.skyenet.webui import com.simiacryptus.jopenai.models.ChatModels import com.simiacryptus.jopenai.util.ClientUtil.keyTxt +import com.simiacryptus.skyenet.apps.general.DocumentParserApp import com.simiacryptus.skyenet.apps.general.PlanAheadApp import com.simiacryptus.skyenet.apps.general.StressTestApp -import com.simiacryptus.skyenet.apps.plan.PlanCoordinator import com.simiacryptus.skyenet.apps.plan.PlanSettings import com.simiacryptus.skyenet.apps.plan.PlanUtil.isWindows import com.simiacryptus.skyenet.core.actors.CodingActor @@ -80,7 +80,6 @@ object ActorTestAppServer : com.simiacryptus.skyenet.webui.application.Applicati CodingActorTestApp(CodingActor(GroovyInterpreter::class, model = ChatModels.GPT35Turbo)) ), ChildWebApp("/test_file_patch", FilePatchTestApp()), - /*PlanAheadApp*/ ChildWebApp( "/taskDev", PlanAheadApp( @@ -106,8 +105,8 @@ object ActorTestAppServer : com.simiacryptus.skyenet.webui.application.Applicati parsingModel = ChatModels.GPT4oMini, ) ), - /*StressTestApp*/ ChildWebApp("/stressTest", StressTestApp()), + ChildWebApp("/pdfExtractor", DocumentParserApp()), ) }
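
Usage note: the test server above registers the new app as `ChildWebApp("/pdfExtractor", DocumentParserApp())`. For a quick local run against a known file, a minimal sketch of constructing the app directly is shown below; the input path, model choice, and temperature are illustrative assumptions rather than values taken from this patch.

```kotlin
import com.simiacryptus.jopenai.ChatClient
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.skyenet.apps.general.DocumentParserApp
import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel
import java.io.File

// Sketch only: build the extractor app against a fixed local file.
// The path, model, and temperature below are assumptions for illustration.
fun pdfExtractorApp() = DocumentParserApp(
    applicationName = "Document Extractor",
    path = "/pdfExtractor",
    api = ChatClient(),                                            // assumes an API key is configured elsewhere
    parsingModel = DefaultParsingModel(ChatModels.GPT4oMini, 0.1), // any available ChatModels value should work
    fileInput = File("sample.pdf").toPath()                        // hypothetical input document
)
```

At runtime the same knobs are normally supplied through the per-session `Settings` class (fileInput, pagesPerBatch, dpi, saveTextFiles, and so on) defined in `DocumentParserApp`.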
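Schema note: `DefaultParsingModel.saveAsParquet` parses its Avro schema from a `document_schema.avsc` file that this patch does not add. A sketch of a compatible schema, built with Avro's `SchemaBuilder`, is shown below; the field names come from the `record.put` calls above, while the concrete Avro types are assumptions.

```kotlin
import org.apache.avro.Schema
import org.apache.avro.SchemaBuilder

// Sketch of a schema matching the record.put(...) calls in processDocument.
// Field names are taken from the patch; the Avro types are assumptions.
val documentSchema: Schema = SchemaBuilder.record("DocumentRow")
    .namespace("com.simiacryptus.skyenet")
    .fields()
    .optionalString("id")
    .optionalString("parent_id")
    .optionalString("type")
    .optionalString("text")
    .optionalString("entities")
    .optionalString("tags")
    .optionalString("properties")
    .optionalString("relations")
    .optionalString("source_path")
    .requiredInt("depth")
    .optionalString("json_path")
    .name("vector").type().array().items().doubleType().noDefault() // embedding vector
    .endRecord()
```

Writing `documentSchema.toString(true)` to `document_schema.avsc` would satisfy the `Schema.Parser().parse(File("document_schema.avsc"))` call in `saveAsParquet`.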