Skip to content

Commit

Permalink
1.2.1 (#99)
Browse files Browse the repository at this point in the history
* 1.2.1

* wip

* wip

* abstracted parsing model

* wip

* wip

* Update DefaultParsingModel.kt
  • Loading branch information
acharneski authored Sep 9, 2024
1 parent 375042b commit e3a20fb
Show file tree
Hide file tree
Showing 12 changed files with 611 additions and 13 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,18 +76,18 @@ Maven:
<dependency>
<groupId>com.simiacryptus</groupId>
<artifactId>skyenet-webui</artifactId>
<version>1.1.0</version>
<version>1.1.1</version>
</dependency>
```

Gradle:

```groovy
implementation group: 'com.simiacryptus', name: 'skyenet', version: '1.1.0'
implementation group: 'com.simiacryptus', name: 'skyenet', version: '1.1.1'
```

```kotlin
implementation("com.simiacryptus:skyenet:1.1.0")
implementation("com.simiacryptus:skyenet:1.1.1")
```

### 🌟 To Use
Expand Down
2 changes: 1 addition & 1 deletion core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ val hsqldb_version = "2.7.2"

dependencies {

implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.0")
implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.1")
implementation(group = "org.hsqldb", name = "hsqldb", version = hsqldb_version)

implementation("org.apache.commons:commons-text:1.11.0")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ open class ParsedActor<T : Any>(
override val obj get() = _obj
}

fun getParser(api: API) = Function<String, T> { input ->
fun getParser(api: API, promptSuffix: String? = null) = Function<String, T> { input ->
describer.coverMethods = false
val describe = resultClass?.let { describer.describe(it) } ?: ""
val exceptions = mutableListOf<Exception>()
Expand All @@ -73,6 +73,7 @@ open class ParsedActor<T : Any>(
|```json
|${JsonUtil.toJson(exampleInstance!!)/*.indent(" ")*/}
|```
|${promptSuffix?.let { "\n$it" } ?: ""}
|
""".trimMargin()
for (i in 0 until deserializerRetries) {
Expand Down Expand Up @@ -100,7 +101,12 @@ open class ParsedActor<T : Any>(

// if input is wrapped in a ```json block, remove the block
if (contentUnwrapped.startsWith("```json")) {
contentUnwrapped = contentUnwrapped.substring(7, contentUnwrapped.lastIndexOf("```"))
val endIndex = contentUnwrapped.lastIndexOf("```")
if (endIndex > 7) {
contentUnwrapped = contentUnwrapped.substring(7, endIndex)
} else {
throw RuntimeException("Failed to parse response: ${contentUnwrapped.replace("\n", "\n ")}")
}
}

contentUnwrapped.let {
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Gradle Releases -> https://github.com/gradle/gradle/releases
libraryGroup = com.simiacryptus.skyenet
libraryVersion = 1.2.0
libraryVersion = 1.2.1
gradleVersion = 7.6.1
kotlin.daemon.jvmargs=-Xmx2g
10 changes: 8 additions & 2 deletions webui/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,17 @@ kotlin {
val kotlin_version = "2.0.0-Beta5"
val jetty_version = "11.0.18"
val jackson_version = "2.17.0"

dependencies {

implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.0") {
implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.1") {
exclude(group = "org.slf4j", module = "slf4j-api")
}

implementation(project(":core"))
implementation(project(":kotlin"))

implementation("org.apache.pdfbox:pdfbox:2.0.27")
implementation("org.seleniumhq.selenium:selenium-chrome-driver:4.16.1")
compileOnly("org.jsoup:jsoup:1.17.2")

Expand All @@ -61,6 +63,11 @@ dependencies {
}
testRuntimeOnly("org.openapitools:openapi-generator-cli:7.3.0")

implementation("org.apache.parquet:parquet-common:1.12.3")
implementation("org.apache.parquet:parquet-avro:1.12.3")
implementation("org.apache.hadoop:hadoop-common:3.3.4")
implementation("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.4")

implementation(group = "org.eclipse.jetty", name = "jetty-server", version = jetty_version)
implementation(group = "org.eclipse.jetty", name = "jetty-servlet", version = jetty_version)
implementation(group = "org.eclipse.jetty", name = "jetty-annotations", version = jetty_version)
Expand Down Expand Up @@ -114,7 +121,6 @@ sass {


tasks {

compileKotlin {
compilerOptions {
javaParameters.set(true)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
package com.simiacryptus.skyenet.apps.general

import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.ChatClient
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.jopenai.util.JsonUtil
import com.simiacryptus.skyenet.TabbedDisplay
import com.simiacryptus.skyenet.apps.general.parsers.DefaultParsingModel
import com.simiacryptus.skyenet.apps.general.parsers.PDFReader
import com.simiacryptus.skyenet.apps.general.parsers.ParsingModel
import com.simiacryptus.skyenet.apps.general.parsers.TextReader
import com.simiacryptus.skyenet.core.platform.Session
import com.simiacryptus.skyenet.core.platform.User
import com.simiacryptus.skyenet.webui.application.ApplicationInterface
import com.simiacryptus.skyenet.webui.application.ApplicationServer
import com.simiacryptus.skyenet.webui.application.ApplicationSocketManager
import com.simiacryptus.skyenet.webui.session.SessionTask
import com.simiacryptus.skyenet.webui.session.SocketManager
import com.simiacryptus.skyenet.webui.util.MarkdownUtil
import org.intellij.lang.annotations.Language
import java.awt.image.BufferedImage
import java.io.File
import java.nio.file.Path
import java.util.*
import javax.imageio.ImageIO
import kotlin.io.path.name
import kotlin.math.min

/**
 * Application server that reads a document in page batches, passes each batch through
 * [parsingModel], and accumulates the parsed results into a single document.
 *
 * @param applicationName display name forwarded to [ApplicationServer].
 * @param path URL path forwarded to [ApplicationServer].
 * @property api client handed to `parsingModel.getParser(...)` when a batch is parsed.
 * @property parsingModel supplies the empty document, the parser, and the merge step.
 * @property reader factory that opens a [DocumentReader] for a file. The default picks
 * [PDFReader] for `.pdf` and [TextReader] for `.txt`, `.md` and `.html` (case-insensitive),
 * and throws [IllegalArgumentException] for any other name.
 * @property fileInput optional fixed input file; when null, `Settings.fileInput` is consulted.
 */
open class DocumentParserApp(
applicationName: String = "Document Extractor",
path: String = "/pdfExtractor",
val api: API = ChatClient(),
val parsingModel: ParsingModel = DefaultParsingModel(ChatModels.Claude35Sonnet, 0.1),
val reader: (File) -> DocumentReader = {
// Dispatch on the file-name extension only; file content is not inspected here.
when {
it.name.endsWith(".pdf", ignoreCase = true) -> PDFReader(it)
it.name.endsWith(".txt", ignoreCase = true) -> TextReader(it)
it.name.endsWith(".md", ignoreCase = true) -> TextReader(it)
it.name.endsWith(".html", ignoreCase = true) -> TextReader(it)
else -> throw IllegalArgumentException("Unsupported file type")
}
},
val fileInput: Path? = null,
) : ApplicationServer(
applicationName = applicationName,
path = path,
showMenubar = true
) {
// Input-mode flags overridden from ApplicationServer (exact UI semantics are defined there).
override val singleInput: Boolean = true
override val stickyInput: Boolean = false

/**
 * Creates the session and, when an input file is configured, immediately schedules a
 * parsing run on the session's thread pool.
 *
 * The input is taken from the constructor's `fileInput` first, then from the session
 * settings. A blank settings path (the `Settings` default is `""`) is treated as
 * "not configured" so that a fresh session does not start a run that can only fail.
 *
 * @return the socket manager produced by the superclass.
 */
override fun newSession(user: User?, session: Session): SocketManager {
    val socketManager = super.newSession(user, session)
    val ui = (socketManager as ApplicationSocketManager).applicationInterface
    val settings = getSettings(session, user, Settings::class.java) ?: Settings()
    val inputPath = fileInput
        ?: settings.fileInput?.takeIf { it.isNotBlank() }?.let { File(it).toPath() }
    if (inputPath == null) {
        log.info("No file input provided")
        return socketManager
    }
    socketManager.pool.submit {
        val task = ui.newTask()
        try {
            if (!inputPath.toFile().exists()) error("File not found: $inputPath")
            run(
                task = task,
                ui = ui,
                fileInput = inputPath,
                maxPages = settings.maxPages,
                settings = settings,
                pagesPerBatch = settings.pagesPerBatch,
            )
        } catch (e: Throwable) {
            // Anything thrown here would otherwise vanish into the unobserved Future.
            task.error(ui, e)
        }
    }
    return socketManager
}

/**
 * Starts a parsing run in response to a user message.
 *
 * The message text itself is not used; it only triggers the run. The input file is the
 * constructor's `fileInput` if set, otherwise the settings path. A blank settings path
 * (the `Settings` default is `""`) counts as missing, so the caller gets
 * "File input not provided" rather than a "File not found" error with an empty path.
 *
 * NOTE(review): the [api] argument is not forwarded; `run` uses the constructor's `api`
 * property. Confirm whether the per-call client should be used instead.
 *
 * @throws IllegalStateException if no input file is configured, the file does not exist,
 * or [ui] has no socket manager.
 */
override fun userMessage(session: Session, user: User?, userMessage: String, ui: ApplicationInterface, api: API) {
    val settings = getSettings(session, user, Settings::class.java) ?: Settings()
    val inputPath = fileInput
        ?: settings.fileInput?.takeIf { it.isNotBlank() }?.let { File(it).toPath() }
        ?: error("File input not provided")
    if (!inputPath.toFile().exists()) error("File not found: $inputPath")
    val socketManager = checkNotNull(ui.socketManager) { "No socket manager attached to the application interface" }
    socketManager.pool.submit {
        run(
            task = ui.newTask(),
            ui = ui,
            fileInput = inputPath,
            maxPages = settings.maxPages,
            settings = settings,
            pagesPerBatch = settings.pagesPerBatch,
        )
    }
}

/**
 * Parses [fileInput] in batches of pages and streams progress into [task].
 *
 * For each batch the page text is extracted, combined with the accumulated document and
 * the tail of the previous batch's text into a prompt, parsed by [parsingModel], and merged
 * into the running document. Text, per-batch JSON and page images are written under
 * `root/output` according to [settings]. A failure inside one batch is reported on that
 * batch's tab and processing continues with the next batch; any other failure is reported
 * on [task].
 *
 * Which file types are accepted is decided by [reader]; this method only requires an
 * existing regular file.
 *
 * @param task task that receives the overall progress and the final document.
 * @param ui interface used to create per-batch sub-tasks and render markdown.
 * @param fileInput document to parse.
 * @param maxPages upper bound on the number of pages processed.
 * @param settings output options (images, text files, final JSON, dpi, format).
 * @param pagesPerBatch pages sent to the parser per call; values below 1 are treated as 1.
 */
private fun run(
    task: SessionTask,
    ui: ApplicationInterface,
    fileInput: Path,
    maxPages: Int,
    settings: Settings,
    pagesPerBatch: Int,
) {
    try {
        val inputFile = fileInput.toFile()
        if (!inputFile.exists() || !inputFile.isFile) {
            throw IllegalArgumentException("Invalid input file: $inputFile")
        }
        // Validate the image format once, before any model call, so a bad setting fails fast
        // instead of failing every batch after its parse and skipping the merge.
        val imageFormat = if (settings.saveImageFiles) imageWriterFormat(settings.outputFormat) else null
        val imageExtension = settings.outputFormat.lowercase(Locale.ROOT)
        // `step` rejects non-positive values.
        val batchSize = pagesPerBatch.coerceAtLeast(1)
        task.add(MarkdownUtil.renderMarkdown("# PDF Extractor", ui = ui))
        val outputDir = root.resolve("output").apply { mkdirs() }
        reader(inputFile).use { docReader ->
            var runningDocument: ParsingModel.DocumentData = parsingModel.newDocument()
            var previousPageText = "" // Tail of the previous batch, given to the parser as context
            task.add(
                MarkdownUtil.renderMarkdown(
                    """
                    ## Processing PDF: ${inputFile.name}
                    Total pages: ${docReader.getPageCount()}
                    """.trimIndent(), ui = ui
                )
            )
            val pageCount = min(docReader.getPageCount(), maxPages)
            val tabs = TabbedDisplay(task)
            for (batchStart in 0 until pageCount step batchSize) {
                val batchEnd = min(batchStart + batchSize, pageCount)
                val firstPage = batchStart + 1
                val pageTask = ui.newTask(false)
                val label = if (firstPage != batchEnd) "Pages $firstPage-$batchEnd" else "Page $firstPage"
                tabs[label] = pageTask.placeholder
                val pageTabs = TabbedDisplay(pageTask)
                try {
                    val text = docReader.getText(firstPage, batchEnd)
                    if (settings.saveTextFiles) {
                        outputDir.resolve("pages_${firstPage}_to_${batchEnd}_text.txt").writeText(text)
                    }
                    val parser = parsingModel.getParser(api)
                    val jsonResult = parser(buildPrompt(runningDocument, previousPageText, text))
                    val jsonFile = outputDir.resolve("pages_${firstPage}_to_${batchEnd}_content.json")
                    jsonFile.writeText(JsonUtil.toJson(jsonResult))
                    ui.newTask(false).apply {
                        pageTabs["Text"] = this.placeholder
                        add(MarkdownUtil.renderMarkdown("\n```text\n${text}\n```\n", ui = ui))
                    }
                    ui.newTask(false).apply {
                        pageTabs["JSON"] = this.placeholder
                        add(MarkdownUtil.renderMarkdown("\n```json\n${JsonUtil.toJson(jsonResult)}\n```\n", ui = ui))
                    }
                    // Rendering is skipped entirely when images are neither shown nor saved.
                    if (settings.showImages || imageFormat != null) {
                        for (pageIndex in batchStart until batchEnd) {
                            val pageImage = docReader.renderImage(pageIndex, settings.dpi)
                            if (settings.showImages) {
                                ui.newTask(false).apply {
                                    pageTabs["Image ${pageIndex + 1}"] = this.placeholder
                                    image(pageImage)
                                }
                            }
                            if (imageFormat != null) {
                                val imageFile = outputDir.resolve("page_${pageIndex + 1}.$imageExtension")
                                // ImageIO.write returns false (without throwing) when no writer accepts the image.
                                if (!ImageIO.write(pageImage, imageFormat, imageFile)) {
                                    log.warn("No image writer could encode page ${pageIndex + 1} as $imageFormat")
                                }
                            }
                        }
                    }
                    runningDocument = parsingModel.merge(runningDocument, jsonResult)
                    ui.newTask(false).apply {
                        pageTabs["Accumulator"] = this.placeholder
                        add(
                            MarkdownUtil.renderMarkdown(
                                """
                                |## Accumulated Document JSON
                                |
                                |```json
                                |${JsonUtil.toJson(runningDocument)}
                                |```
                                """.trimMargin(), ui = ui
                            )
                        )
                    }
                    previousPageText = text.takeLast(1000)
                } catch (e: Throwable) {
                    // Report on the batch's own tab and move on to the next batch.
                    pageTask.error(ui, e)
                }
            }
            task.add(
                MarkdownUtil.renderMarkdown(
                    """
                    |## Document JSON
                    |
                    |```json
                    |${JsonUtil.toJson(runningDocument)}
                    |```
                    |
                    |Extracted files are saved in: ${outputDir.absolutePath}
                    """.trimMargin(), ui = ui
                )
            )
            // Save final JSON if enabled in settings
            if (settings.saveFinalJson) {
                // Drop only the last extension; names without a dot are used as-is.
                val finalJsonFile = root.resolve(fileInput.name.substringBeforeLast(".") + ".parsed.json")
                finalJsonFile.writeText(JsonUtil.toJson(runningDocument))
                task.add(
                    MarkdownUtil.renderMarkdown(
                        "Final JSON saved to: ${finalJsonFile.absolutePath}",
                        ui = ui
                    )
                )
            }
        }
    } catch (e: Throwable) {
        task.error(ui, e)
    }
}

/** Builds the parser prompt from the accumulated document, the previous batch's text tail and the current text. */
private fun buildPrompt(
    accumulated: ParsingModel.DocumentData,
    previousText: String,
    currentText: String,
): String = listOf(
    """
    |# Accumulated Prior JSON:
    |
    |FOR INFORMATIVE CONTEXT ONLY. DO NOT COPY TO OUTPUT.
    |```json
    |${JsonUtil.toJson(accumulated)}
    |```
    """.trimMargin(),
    """
    |# Prior Text
    |
    |FOR INFORMATIVE CONTEXT ONLY. DO NOT COPY TO OUTPUT.
    |```text
    |$previousText
    |```
    |""".trimMargin(),
    """
    |# Current Page
    |
    |```text
    |$currentText
    |```
    """.trimMargin(),
).joinToString("\n\n")

/** Maps a user-supplied format name (case-insensitive, locale-independent) to the ImageIO format name. */
private fun imageWriterFormat(outputFormat: String): String = when (outputFormat.uppercase(Locale.ROOT)) {
    "PNG" -> "PNG"
    "JPEG", "JPG" -> "JPEG"
    "GIF" -> "GIF"
    "BMP" -> "BMP"
    else -> throw IllegalArgumentException("Unsupported output format: $outputFormat")
}

/**
 * Per-session options for a parsing run.
 *
 * @property dpi resolution passed to `DocumentReader.renderImage` when pages are rendered.
 * @property maxPages upper bound on the number of pages processed.
 * @property outputFormat image format for saved page images; PNG, JPEG/JPG, GIF and BMP
 * are recognised (case-insensitive), and the lowercased value is used as the file extension.
 * @property fileInput path of the file to parse, consulted when the app was constructed
 * without a fixed `fileInput`.
 * @property showImages whether rendered page images are added as tabs in the UI.
 * @property pagesPerBatch number of pages sent to the parser per call.
 * @property saveImageFiles whether rendered page images are written to the output directory.
 * @property saveTextFiles whether the extracted text of each batch is written to the output directory.
 * @property saveFinalJson whether the accumulated document is written as `<name>.parsed.json`.
 */
data class Settings(
val dpi: Float = 120f,
val maxPages: Int = Int.MAX_VALUE,
val outputFormat: String = "PNG",
val fileInput: String? = "",
val showImages: Boolean = true,
val pagesPerBatch: Int = 1,
val saveImageFiles: Boolean = true,
val saveTextFiles: Boolean = true,
val saveFinalJson: Boolean = false
)

/** The settings type this application reads and writes: [Settings]. */
override val settingsClass: Class<*>
    get() {
        return Settings::class.java
    }

/**
 * Supplies the initial settings for a new session: a [Settings] instance with all defaults.
 *
 * The cast is unchecked; the result is a [Settings] regardless of the requested [T].
 */
@Suppress("UNCHECKED_CAST")
override fun <T : Any> initSettings(session: Session): T {
    val defaults = Settings()
    return defaults as T
}

companion object {
    /** Logger shared by all instances of [DocumentParserApp]. */
    private val log: org.slf4j.Logger =
        org.slf4j.LoggerFactory.getLogger(DocumentParserApp::class.java)
}

/**
 * Page-oriented access to an opened document. Instances hold a resource and must be
 * closed (see [AutoCloseable]); the app uses them inside `use { }`.
 */
interface DocumentReader : AutoCloseable {
/** Number of pages in the document. */
fun getPageCount(): Int
/**
 * Text of a page range.
 * NOTE(review): the app calls this with `batchStart + 1` and `batchEnd`, which suggests
 * 1-based page numbers with an inclusive end — confirm against the implementations.
 */
fun getText(startPage: Int, endPage: Int): String
/**
 * Renders one page as an image at the given [dpi].
 * NOTE(review): the app passes indices starting at 0 here, unlike [getText] — confirm.
 */
fun renderImage(pageIndex: Int, dpi: Float): BufferedImage
}

}
Loading

0 comments on commit e3a20fb

Please sign in to comment.