diff --git a/README.md b/README.md
index fad9ef9e..06d49e17 100644
--- a/README.md
+++ b/README.md
@@ -76,18 +76,18 @@ Maven:
com.simiacryptus
skyenet-webui
- 1.1.6
+ 1.1.7
```
Gradle:
```groovy
-implementation group: 'com.simiacryptus', name: 'skyenet', version: '1.1.6'
+implementation group: 'com.simiacryptus', name: 'skyenet', version: '1.1.7'
```
```kotlin
-implementation("com.simiacryptus:skyenet:1.1.6")
+implementation("com.simiacryptus:skyenet:1.1.7")
```
### 🌟 To Use
diff --git a/core/build.gradle.kts b/core/build.gradle.kts
index 13e45b77..e27b0704 100644
--- a/core/build.gradle.kts
+++ b/core/build.gradle.kts
@@ -33,7 +33,7 @@ val hsqldb_version = "2.7.2"
dependencies {
- implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.6")
+ implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.7")
implementation(group = "org.hsqldb", name = "hsqldb", version = hsqldb_version)
implementation("org.apache.commons:commons-text:1.11.0")
diff --git a/gradle.properties b/gradle.properties
index ac1df5c0..e504bdd7 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
# Gradle Releases -> https://github.com/gradle/gradle/releases
libraryGroup = com.simiacryptus.skyenet
-libraryVersion = 1.2.7
+libraryVersion = 1.2.8
gradleVersion = 7.6.1
kotlin.daemon.jvmargs=-Xmx4g
diff --git a/webui/build.gradle.kts b/webui/build.gradle.kts
index b6b60618..f432d91b 100644
--- a/webui/build.gradle.kts
+++ b/webui/build.gradle.kts
@@ -36,7 +36,7 @@ val jackson_version = "2.17.2"
dependencies {
- implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.6") {
+ implementation(group = "com.simiacryptus", name = "jo-penai", version = "1.1.7") {
exclude(group = "org.slf4j")
}
diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/CodeParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/CodeParsingModel.kt
new file mode 100644
index 00000000..691d4747
--- /dev/null
+++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/CodeParsingModel.kt
@@ -0,0 +1,137 @@
+package com.simiacryptus.skyenet.apps.parsers
+
+import com.simiacryptus.jopenai.API
+import com.simiacryptus.jopenai.describe.Description
+import com.simiacryptus.jopenai.models.ChatModels
+import com.simiacryptus.skyenet.core.actors.ParsedActor
+
+open class CodeParsingModel( // ParsingModel for source code: merges CodeData documents and builds a ParsedActor-based parser
+ private val parsingModel: ChatModels, // model passed to ParsedActor in getParser
+ private val temperature: Double // temperature passed to ParsedActor in getParser
+) : ParsingModel {
+ /** Merges [newData] into [runningDocument] and returns the result as a new [CodeData]. */
+ override fun merge(
+ runningDocument: ParsingModel.DocumentData, // accumulated document; must actually be a CodeData
+ newData: ParsingModel.DocumentData // freshly parsed data; must actually be a CodeData
+ ): ParsingModel.DocumentData {
+ val runningDocument = runningDocument as CodeData // shadows the parameter; the cast fails for any other DocumentData type
+ val newData = newData as CodeData // same unchecked narrowing as above
+ return CodeData(
+ id = newData.id ?: runningDocument.id, // the newer id wins when it is non-null
+ content = mergeContent(runningDocument.content, newData.content).takeIf { it.isNotEmpty() }, // an empty merge result is stored as null
+ entities = mergeEntities(runningDocument.entities, newData.entities).takeIf { it.isNotEmpty() }, // an empty merge result is stored as null
+ metadata = mergeMetadata(runningDocument.metadata, newData.metadata) // never null, even when both inputs are null
+ )
+ }
+ /** Field-wise merge of two optional metadata objects; always returns a non-null [CodeMetadata]. */
+ protected open fun mergeMetadata(existing: CodeMetadata?, new: CodeMetadata?): CodeMetadata {
+ return CodeMetadata(
+ language = new?.language ?: existing?.language, // the newer language wins when it is non-null
+ libraries = ((existing?.libraries ?: emptyList()) + (new?.libraries ?: emptyList())).distinct(), // NOTE(review): yields an empty list rather than null, unlike the other merged fields — confirm this is intended
+ properties = ((existing?.properties ?: emptyMap()) + (new?.properties ?: emptyMap())).takeIf { it.isNotEmpty() } // on key clashes the value from new replaces the one from existing; null when empty
+ )
+ }
+ /** Appends [newContent] to [existingContent], merging any new item that duplicates one already present. */
+ protected open fun mergeContent(
+ existingContent: List?, // NOTE(review): type arguments look lost in this rendering (presumably a list of CodeContent) — confirm against the real file
+ newContent: List?
+ ): List {
+ val mergedContent = (existingContent ?: emptyList()).toMutableList() // work on a copy so the input list is left untouched
+ (newContent ?: emptyList()).forEach { newItem ->
+ val existingIndex = mergedContent.indexOfFirst { it.type == newItem.type && it.text?.trim() == newItem.text?.trim() } // duplicate = same type and same trimmed text (two null texts also match)
+ if (existingIndex != -1) {
+ mergedContent[existingIndex] = mergeContentData(mergedContent[existingIndex], newItem) // merge in place, keeping the existing position
+ } else {
+ mergedContent.add(newItem) // no match found: append at the end
+ }
+ }
+ return mergedContent
+ }
+ /** Merges two content items judged to be duplicates: fields not listed below stay as in [existing]. */
+ protected open fun mergeContentData(existing: CodeContent, new: CodeContent) = existing.copy(
+ content = mergeContent(existing.content, new.content).takeIf { it.isNotEmpty() }, // children are merged recursively; null when empty
+ entities = ((existing.entities ?: emptyList()) + (new.entities ?: emptyList())).distinct() // de-duplicated union of both entity lists
+ .takeIf { it.isNotEmpty() },
+ tags = ((existing.tags ?: emptyList()) + (new.tags ?: emptyList())).distinct().takeIf { it.isNotEmpty() } // de-duplicated union of both tag lists; null when empty
+ )
+ /** Merges two optional entity maps over the union of their keys; entries present in both go through [mergeEntityData]. */
+ protected open fun mergeEntities(
+ existingEntities: Map?,
+ newEntities: Map?
+ ) = ((existingEntities?.keys ?: emptySet()) + (newEntities?.keys ?: emptySet())).associateWith { key -> // visits every key found in either map
+ val existing = existingEntities?.get(key)
+ val new = newEntities?.get(key)
+ when {
+ existing == null -> new!! // the key came from one of the two maps, so it is expected in newEntities here; NOTE(review): would throw if a map held a null value — confirm value nullability
+ new == null -> existing // only the existing map has this key
+ else -> mergeEntityData(existing, new) // both maps have it: combine
+ }
+ }
+ /** Combines two descriptions of the same entity, using [existing] as the base. */
+ protected open fun mergeEntityData(existing: CodeEntity, new: CodeEntity) = existing.copy(
+ aliases = ((existing.aliases ?: emptyList()) + (new.aliases ?: emptyList())).distinct() // de-duplicated union of both alias lists
+ .takeIf { it.isNotEmpty() },
+ properties = ((existing.properties ?: emptyMap()) + (new.properties ?: emptyMap())).takeIf { it.isNotEmpty() }, // on key clashes the value from new replaces the one from existing
+ relations = ((existing.relations ?: emptyMap()) + (new.relations ?: emptyMap())).takeIf { it.isNotEmpty() }, // same overlay rule as properties
+ type = new.type ?: existing.type // the newer type wins when it is non-null
+ )
+ /** Instructions passed as promptSuffix to the parser built in [getParser]. */
+ open val promptSuffix = """
+Parse the code into a structured format that describes its components:
+1. Identify functions, classes, and other code structures.
+2. Extract comments and document them with their associated code.
+3. Capture any dependencies or libraries used in the code.
+4. Extract metadata such as programming language and version if available.
+5. Assign relevant tags to each code section to improve searchability and categorization.
+6. Do not copy data from the accumulated code JSON to your response; it is provided for context only.
+ """.trimMargin() // NOTE(review): no line above starts with a '|' margin, so trimMargin only drops the blank first and last lines
+ /** Example instance given to ParsedActor in [getParser]; an empty [CodeData] unless overridden. */
+ open val exampleInstance = CodeData()
+ /** Builds a function that parses raw text into [CodeData] via a ParsedActor configured from this model's settings. */
+ override fun getParser(api: API): (String) -> CodeData {
+ val parser = ParsedActor(
+ resultClass = CodeData::class.java,
+ exampleInstance = exampleInstance,
+ prompt = "", // base prompt left empty; the instructions are supplied through promptSuffix below
+ parsingModel = parsingModel,
+ temperature = temperature
+ ).getParser(
+ api, promptSuffix = promptSuffix
+ )
+ return { text -> parser.apply(text) } // wraps parser.apply in a Kotlin lambda to match the declared return type
+ }
+ /** Returns an empty [CodeData]. */
+ override fun newDocument() = CodeData()
+ /** Root document type returned by [merge], [newDocument] and the parser from [getParser]. */
+ data class CodeData(
+ @Description("Code identifier") override val id: String? = null,
+ @Description("Entities extracted") val entities: Map? = null,
+ @Description("Hierarchical structure and data") override val content: List? = null,
+ @Description("Code metadata") override val metadata: CodeMetadata? = null
+ ) : ParsingModel.DocumentData
+ /** One extracted entity; instances are the values merged by [mergeEntityData]. */
+ data class CodeEntity(
+ @Description("Aliases for the entity") val aliases: List? = null,
+ @Description("Entity attributes extracted from the code") val properties: Map? = null,
+ @Description("Entity relationships extracted from the code") val relations: Map? = null,
+ @Description("Entity type (e.g., function, class, variable)") val type: String? = null
+ )
+ /** One node of the content tree; [content] holds its sub-elements. */
+ data class CodeContent(
+ @Description("Content type, e.g. function, class, comment") override val type: String = "",
+ @Description("Brief, self-contained text either copied, paraphrased, or summarized") override val text: String? = null,
+ @Description("Sub-elements") override val content: List? = null,
+ @Description("Related entities by ID") val entities: List? = null,
+ @Description("Tags - related topics and non-entity indexing") override val tags: List? = null
+ ) : ParsingModel.ContentData
+ /** Document-level metadata carried by [CodeData.metadata]. */
+ data class CodeMetadata(
+ @Description("Programming language") val language: String? = null,
+ @Description("Libraries or dependencies associated with the code") val libraries: List? = null,
+ @Description("Other metadata") val properties: Map? = null,
+ ) : ParsingModel.DocumentMetadata
+ // NOTE(review): log is not referenced anywhere else in this class as shown here.
+ companion object {
+ val log = org.slf4j.LoggerFactory.getLogger(CodeParsingModel::class.java)
+ }
+}
\ No newline at end of file
diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParserApp.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParserApp.kt
index 477a4b36..6f0e73d4 100644
--- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParserApp.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParserApp.kt
@@ -2,7 +2,6 @@ package com.simiacryptus.skyenet.apps.parsers
import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.ChatClient
-import com.simiacryptus.jopenai.models.AnthropicModels
import com.simiacryptus.util.JsonUtil
import com.simiacryptus.skyenet.TabbedDisplay
import com.simiacryptus.skyenet.core.platform.Session
@@ -26,14 +25,11 @@ open class DocumentParserApp(
applicationName: String = "Document Extractor",
path: String = "/pdfExtractor",
val api: API = ChatClient(),
- val parsingModel: ParsingModel = DefaultParsingModel(AnthropicModels.Claude35Sonnet, 0.1),
+ val parsingModel: ParsingModel,
val reader: (File) -> DocumentReader = {
when {
it.name.endsWith(".pdf", ignoreCase = true) -> PDFReader(it)
- it.name.endsWith(".txt", ignoreCase = true) -> TextReader(it)
- it.name.endsWith(".md", ignoreCase = true) -> TextReader(it)
- it.name.endsWith(".html", ignoreCase = true) -> TextReader(it)
- else -> throw IllegalArgumentException("Unsupported file type")
+ else -> TextReader(it)
}
},
val fileInput: Path? = null,
@@ -262,9 +258,9 @@ open class DocumentParserApp(
val fileInput: String? = "",
val showImages: Boolean = true,
val pagesPerBatch: Int = 1,
- val saveImageFiles: Boolean = true,
- val saveTextFiles: Boolean = true,
- val saveFinalJson: Boolean = false
+ val saveImageFiles: Boolean = false,
+ val saveTextFiles: Boolean = false,
+ val saveFinalJson: Boolean = true
)
override val settingsClass: Class<*> get() = Settings::class.java
diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DefaultParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParsingModel.kt
similarity index 89%
rename from webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DefaultParsingModel.kt
rename to webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParsingModel.kt
index 589bb93a..61190375 100644
--- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DefaultParsingModel.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentParsingModel.kt
@@ -6,7 +6,7 @@ import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.skyenet.core.actors.ParsedActor
-open class DefaultParsingModel(
+open class DocumentParsingModel(
private val parsingModel: ChatModels,
private val temperature: Double
) : ParsingModel {
@@ -14,7 +14,7 @@ open class DefaultParsingModel(
override fun merge(
runningDocument: ParsingModel.DocumentData,
newData: ParsingModel.DocumentData
- ) : ParsingModel.DocumentData {
+ ): ParsingModel.DocumentData {
val runningDocument = runningDocument as DocumentData
val newData = newData as DocumentData
return DocumentData(
@@ -87,7 +87,9 @@ open class DefaultParsingModel(
|6. Assign relevant tags to each content section to improve searchability and categorization.
|7. Do not copy data from the accumulated document JSON to your response; it is provided for context only.
""".trimMargin()
+
open val exampleInstance = DocumentData()
+
override fun getParser(api: API): (String) -> DocumentData {
val parser = ParsedActor(
resultClass = DocumentData::class.java,
@@ -104,10 +106,10 @@ open class DefaultParsingModel(
override fun newDocument() = DocumentData()
data class DocumentData(
- @Description("Document/Page identifier") val id: String? = null,
+ @Description("Document/Page identifier") override val id: String? = null,
@Description("Entities extracted") val entities: Map? = null,
- @Description("Hierarchical structure and data") val content: List? = null,
- @Description("Document metadata") val metadata: DocumentMetadata? = null
+ @Description("Hierarchical structure and data") override val content: List? = null,
+ @Description("Document metadata") override val metadata: DocumentMetadata? = null
) : ParsingModel.DocumentData
data class EntityData(
@@ -118,20 +120,21 @@ open class DefaultParsingModel(
)
data class ContentData(
- @Description("Content type, e.g. heading, paragraph, statement, list") val type: String = "",
- @Description("Brief, self-contained text either copied, paraphrased, or summarized") val text: String? = null,
- @Description("Sub-elements") val content: List? = null,
+ @Description("Content type, e.g. heading, paragraph, statement, list") override val type: String = "",
+ @Description("Brief, self-contained text either copied, paraphrased, or summarized") override val text: String? = null,
+ @Description("Sub-elements") override val content: List? = null,
@Description("Related entities by ID") val entities: List? = null,
- @Description("Tags - related topics and non-entity indexing") val tags: List? = null
- )
+ @Description("Tags - related topics and non-entity indexing") override val tags: List? = null
+ ) : ParsingModel.ContentData
+
data class DocumentMetadata(
@Description("Document title") val title: String? = null,
@Description("Keywords or tags associated with the document") val keywords: List? = null,
@Description("Other metadata") val properties: Map? = null,
- )
+ ) : ParsingModel.DocumentMetadata
companion object {
- val log = org.slf4j.LoggerFactory.getLogger(DefaultParsingModel::class.java)
+ val log = org.slf4j.LoggerFactory.getLogger(DocumentParsingModel::class.java)
}
diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentRecord.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentRecord.kt
index 996b76d0..3a96076b 100644
--- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentRecord.kt
+++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parsers/DocumentRecord.kt
@@ -1,10 +1,16 @@
package com.simiacryptus.skyenet.apps.parsers
-import com.simiacryptus.jopenai.models.ApiModel
import com.simiacryptus.jopenai.OpenAIClient
+import com.simiacryptus.jopenai.models.ApiModel
import com.simiacryptus.jopenai.models.EmbeddingModels
import com.simiacryptus.util.JsonUtil
-import java.io.*
+import java.io.File
+import java.io.FileInputStream
+import java.io.FileOutputStream
+import java.io.IOException
+import java.io.ObjectInputStream
+import java.io.ObjectOutputStream
+import java.io.Serializable
import java.util.concurrent.ExecutorService
import java.util.concurrent.TimeUnit
@@ -13,7 +19,6 @@ data class DocumentRecord(
val parentId: String?,
val type: String,
val text: String?,
- val entities: String?,
val tags: String?,
val sourcePath: String,
val depth: Int,
@@ -28,7 +33,6 @@ data class DocumentRecord(
out.writeObject(parentId)
out.writeUTF(type)
out.writeObject(text)
- out.writeObject(entities)
out.writeObject(tags)
out.writeUTF(sourcePath)
out.writeInt(depth)
@@ -37,6 +41,7 @@ data class DocumentRecord(
out.writeObject(properties)
out.writeObject(relations)
}
+
@Throws(IOException::class, ClassNotFoundException::class)
fun readObject(input: ObjectInputStream): DocumentRecord {
val id = input.readUTF()
@@ -56,7 +61,6 @@ data class DocumentRecord(
parentId,
type,
text,
- entities,
tags,
sourcePath,
depth,
@@ -66,20 +70,21 @@ data class DocumentRecord(
relations
)
}
+
companion object {
val log = org.slf4j.LoggerFactory.getLogger(DocumentRecord::class.java)
- fun saveAsBinary(
+ fun saveAsBinary(
openAIClient: OpenAIClient,
outputPath: String,
pool: ExecutorService,
- vararg inputPaths: String
+ vararg inputPaths: String,
) {
val records = mutableListOf()
inputPaths.forEach { inputPath ->
processDocument(
inputPath,
- JsonUtil.fromJson(File(inputPath).readText(), DefaultParsingModel.DocumentData::class.java),
+ JsonUtil.fromJson(File(inputPath).readText(), Map::class.java) as T,
records,
openAIClient,
pool
@@ -88,21 +93,20 @@ data class DocumentRecord(
writeBinary(outputPath, records)
}
- private fun processDocument(
+ private fun processDocument(
inputPath: String,
- document: DefaultParsingModel.DocumentData,
+ document: T,
records: MutableList,
openAIClient: OpenAIClient,
pool: ExecutorService
) {
- fun processContent(content: DefaultParsingModel.ContentData, parentId: String? = null, depth: Int = 0, path: String = "") {
+ fun processContent(content: Map, parentId: String? = null, depth: Int = 0, path: String = "") {
val record = DocumentRecord(
id = content.hashCode().toString(),
parentId = parentId,
- type = content.type,
- text = content.text,
- entities = content.entities?.joinToString(","),
- tags = content.tags?.joinToString(","),
+ type = content["type"] as? String ?: "",
+ text = content["text"] as? String,
+ tags = (content["tags"] as? List<*>)?.joinToString(","),
sourcePath = inputPath,
depth = depth,
jsonPath = path,
@@ -111,28 +115,14 @@ data class DocumentRecord(
relations = null
)
records.add(record)
- content.content?.forEachIndexed { index, childContent ->
+ (content["content"] as? List