Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
acharneski committed Sep 8, 2024
1 parent c6a53f5 commit 8c60703
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 5 deletions.
5 changes: 5 additions & 0 deletions webui/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ dependencies {
}
testRuntimeOnly("org.openapitools:openapi-generator-cli:7.3.0")

implementation("org.apache.parquet:parquet-common:1.12.3")
implementation("org.apache.parquet:parquet-avro:1.12.3")
implementation("org.apache.hadoop:hadoop-common:3.3.4")
implementation("org.apache.hadoop:hadoop-mapreduce-client-core:3.3.4")

implementation(group = "org.eclipse.jetty", name = "jetty-server", version = jetty_version)
implementation(group = "org.eclipse.jetty", name = "jetty-servlet", version = jetty_version)
implementation(group = "org.eclipse.jetty", name = "jetty-annotations", version = jetty_version)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import java.io.File
import java.nio.file.Path
import java.util.*
import javax.imageio.ImageIO
import kotlin.io.path.name
import kotlin.math.min

open class DocumentParserApp(
Expand Down Expand Up @@ -65,7 +66,6 @@ open class DocumentParserApp(
maxPages = settings.maxPages.coerceAtMost(Int.MAX_VALUE),
settings = settings,
pagesPerBatch = settings.pagesPerBatch,
root = dataStorage.getDataDir(user, session)
)
}
return socketManager
Expand All @@ -85,7 +85,6 @@ open class DocumentParserApp(
maxPages = settings.maxPages.coerceAtMost(Int.MAX_VALUE),
settings = settings,
pagesPerBatch = settings.pagesPerBatch,
root = dataStorage.getDataDir(user, session)
)
}
}
Expand All @@ -97,7 +96,6 @@ open class DocumentParserApp(
maxPages: Int,
settings: Settings,
pagesPerBatch: Int,
root: File
) {
try {
val pdfFile = fileInput.toFile()
Expand Down Expand Up @@ -244,7 +242,7 @@ open class DocumentParserApp(
)
// Save final JSON if enabled in settings
if (settings.saveFinalJson) {
val finalJsonFile = outputDir.resolve("final_document.json")
val finalJsonFile = root.resolve(fileInput.name.reversed().split(delimiters = arrayOf("."), false, 2)[1].reversed() + ".parsed.json")
finalJsonFile.writeText(JsonUtil.toJson(runningDocument))
task.add(
MarkdownUtil.renderMarkdown(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.describe.Description
import com.simiacryptus.jopenai.models.ChatModels
import com.simiacryptus.skyenet.core.actors.ParsedActor
import org.apache.avro.Schema
import org.apache.avro.generic.GenericData
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import java.io.File

open class DefaultParsingModel(
private val chatModels: ChatModels,
Expand Down Expand Up @@ -131,6 +139,42 @@ open class DefaultParsingModel(

companion object {
    val log = org.slf4j.LoggerFactory.getLogger(DefaultParsingModel::class.java)

    /**
     * Flattens [document] into rows and writes them to a Snappy-compressed
     * Parquet file at [outputPath].
     *
     * Each content node becomes one row, linked to its parent via `parent_id`;
     * each entity becomes one additional row with `type = "entity"`.
     *
     * NOTE(review): row ids are derived from `hashCode()`, so they are stable
     * only within a single JVM run — confirm no consumer relies on them across runs.
     */
    fun saveAsParquet(document: DocumentData, outputPath: String) {
        // Avro schema is read from the process working directory.
        // NOTE(review): hard-coded relative path — confirm deployment layout.
        val schema = Schema.Parser().parse(File("document_schema.avsc"))
        val rows = mutableListOf<GenericData.Record>()

        // Depth-first flattening of the content tree; parentId links a child
        // row back to the row of its enclosing content node.
        fun processContent(content: ContentData, parentId: String? = null) {
            val record = GenericData.Record(schema)
            record.put("id", content.hashCode().toString())
            record.put("parent_id", parentId)
            record.put("type", content.type)
            record.put("text", content.text)
            record.put("entities", content.entities?.joinToString(","))
            record.put("tags", content.tags?.joinToString(","))
            rows.add(record)
            content.content?.forEach { childContent ->
                processContent(childContent, content.hashCode().toString())
            }
        }
        document.content?.forEach { processContent(it) }

        // Entities are emitted as top-level rows (no parent_id).
        document.entities?.forEach { (entityId, entityData) ->
            val record = GenericData.Record(schema)
            record.put("id", entityId)
            record.put("type", "entity")
            record.put("text", entityData.aliases?.joinToString(", "))
            record.put("properties", entityData.properties?.entries?.joinToString(", ") { "${it.key}:${it.value}" })
            record.put("relations", entityData.relations?.entries?.joinToString(", ") { "${it.key}:${it.value}" })
            rows.add(record)
        }

        // use {} guarantees the writer is closed (flushing the Parquet footer)
        // even when a write throws — the original leaked the writer on failure,
        // which also leaves a footerless, unreadable Parquet file behind.
        AvroParquetWriter.builder<GenericData.Record>(Path(outputPath))
            .withSchema(schema)
            .withConf(Configuration())
            .withCompressionCodec(CompressionCodecName.SNAPPY)
            .build()
            .use { writer -> rows.forEach { writer.write(it) } }
    }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ abstract class SessionTask(
/** Saves [image] as a PNG under `images/` and renders it as an inline `<img>` tag. */
fun image(
    @Description("The image to display")
    image: BufferedImage
) = "images/${long64()}.png"
    .let { path -> add("""<img src="${saveFile(path, image.toPng())}" />""") }

companion object {
val log = LoggerFactory.getLogger(SessionTask::class.java)
Expand Down

0 comments on commit 8c60703

Please sign in to comment.