From 8c607039638b80f91331d905212b29b7d80c81be Mon Sep 17 00:00:00 2001 From: Andrew Charneski Date: Sun, 8 Sep 2024 16:35:14 -0400 Subject: [PATCH] wip --- webui/build.gradle.kts | 5 +++ .../skyenet/apps/general/DocumentParserApp.kt | 6 +-- .../general/parsers/DefaultParsingModel.kt | 44 +++++++++++++++++++ .../skyenet/webui/session/SessionTask.kt | 2 +- 4 files changed, 52 insertions(+), 5 deletions(-) diff --git a/webui/build.gradle.kts b/webui/build.gradle.kts index 0d910d3b..b9aa7f37 100644 --- a/webui/build.gradle.kts +++ b/webui/build.gradle.kts @@ -63,6 +63,11 @@ dependencies { } testRuntimeOnly("org.openapitools:openapi-generator-cli:7.3.0") + implementation("org.apache.parquet:parquet-common:1.12.3") + implementation("org.apache.parquet:parquet-avro:1.12.3") + implementation("org.apache.hadoop:hadoop-common:3.3.4") + implementation("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.4") + implementation(group = "org.eclipse.jetty", name = "jetty-server", version = jetty_version) implementation(group = "org.eclipse.jetty", name = "jetty-servlet", version = jetty_version) implementation(group = "org.eclipse.jetty", name = "jetty-annotations", version = jetty_version) diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt index 974ade15..0b75d046 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/DocumentParserApp.kt @@ -23,6 +23,7 @@ import java.io.File import java.nio.file.Path import java.util.* import javax.imageio.ImageIO +import kotlin.io.path.name import kotlin.math.min open class DocumentParserApp( @@ -65,7 +66,6 @@ open class DocumentParserApp( maxPages = settings.maxPages.coerceAtMost(Int.MAX_VALUE), settings = settings, pagesPerBatch = settings.pagesPerBatch, - root = dataStorage.getDataDir(user, session) ) } return socketManager @@ -85,7 +85,6 @@ open class DocumentParserApp( maxPages = settings.maxPages.coerceAtMost(Int.MAX_VALUE), settings = settings, pagesPerBatch = settings.pagesPerBatch, - root = dataStorage.getDataDir(user, session) ) } } @@ -97,7 +96,6 @@ open class DocumentParserApp( maxPages: Int, settings: Settings, pagesPerBatch: Int, - root: File ) { try { val pdfFile = fileInput.toFile() @@ -244,7 +242,7 @@ open class DocumentParserApp( ) // Save final JSON if enabled in settings if (settings.saveFinalJson) { - val finalJsonFile = outputDir.resolve("final_document.json") + val finalJsonFile = root.resolve(fileInput.name.reversed().split(delimiters = arrayOf("."), false, 2)[1].reversed() + ".parsed.json") finalJsonFile.writeText(JsonUtil.toJson(runningDocument)) task.add( MarkdownUtil.renderMarkdown( diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt index 24bf413e..bd6674b0 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/apps/general/parsers/DefaultParsingModel.kt @@ -4,6 +4,14 @@ import com.simiacryptus.jopenai.API import com.simiacryptus.jopenai.describe.Description import com.simiacryptus.jopenai.models.ChatModels import com.simiacryptus.skyenet.core.actors.ParsedActor +import org.apache.avro.Schema +import org.apache.avro.generic.GenericData +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.avro.AvroParquetWriter +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import java.io.File open class DefaultParsingModel( private val chatModels: ChatModels, @@ -131,6 +139,42 @@ open class DefaultParsingModel( companion object { val log = org.slf4j.LoggerFactory.getLogger(DefaultParsingModel::class.java) + + fun saveAsParquet(document: DocumentData, outputPath: String) { + val schema = Schema.Parser().parse(File("document_schema.avsc")) + val rows = mutableListOf() + fun processContent(content: ContentData, parentId: String? = null) { + val record = GenericData.Record(schema) + record.put("id", content.hashCode().toString()) + record.put("parent_id", parentId) + record.put("type", content.type) + record.put("text", content.text) + record.put("entities", content.entities?.joinToString(",")) + record.put("tags", content.tags?.joinToString(",")) + rows.add(record) + content.content?.forEach { childContent -> + processContent(childContent, content.hashCode().toString()) + } + } + document.content?.forEach { processContent(it) } + document.entities?.forEach { (entityId, entityData) -> + val record = GenericData.Record(schema) + record.put("id", entityId) + record.put("type", "entity") + record.put("text", entityData.aliases?.joinToString(", ")) + record.put("properties", entityData.properties?.entries?.joinToString(", ") { "${it.key}:${it.value}" }) + record.put("relations", entityData.relations?.entries?.joinToString(", ") { "${it.key}:${it.value}" }) + rows.add(record) + } + val conf = Configuration() + val writer: ParquetWriter = AvroParquetWriter.builder(Path(outputPath)) + .withSchema(schema) + .withConf(conf) + .withCompressionCodec(CompressionCodecName.SNAPPY) + .build() + rows.forEach { writer.write(it) } + writer.close() + } } } \ No newline at end of file diff --git a/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt b/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt index 130d79f0..2baeed13 100644 --- a/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt +++ b/webui/src/main/kotlin/com/simiacryptus/skyenet/webui/session/SessionTask.kt @@ -187,7 +187,7 @@ abstract class SessionTask( fun image( @Description("The image to display") image: BufferedImage - ) = add("""""") + ) = add("""""") companion object { val log = LoggerFactory.getLogger(SessionTask::class.java)