-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
59f93bd
commit f8467e1
Showing
11 changed files
with
399 additions
and
87 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# Gradle Releases -> https://github.com/gradle/gradle/releases | ||
libraryGroup=com.simiacryptus.skyenet | ||
libraryVersion=1.2.17 | ||
libraryVersion=1.2.18 | ||
gradleVersion=7.6.1 | ||
kotlin.daemon.jvmargs=-Xmx4g |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
82 changes: 82 additions & 0 deletions
82
webui/src/main/kotlin/com/simiacryptus/skyenet/apps/parse/HTMLReader.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package com.simiacryptus.skyenet.apps.parse | ||
|
||
import org.jsoup.Jsoup | ||
import org.jsoup.nodes.Document | ||
import java.awt.image.BufferedImage | ||
import java.io.File | ||
|
||
/**
 * [DocumentParserApp.DocumentReader] implementation for HTML files.
 *
 * The file is parsed once with Jsoup at construction time; the text of the `<body>` element is
 * then cut into "pages" of at most [DEFAULT_MAX_PAGE_CHARS] characters so it can be consumed
 * through the page-oriented reader interface.
 *
 * NOTE(review): Jsoup's `Element.text()` returns whitespace-normalized text, so blank-line
 * paragraph breaks are most likely never present in the input to [splitIntoPages] and the
 * word-level fallback does the actual work. Confirm before relying on paragraph-aligned pages.
 *
 * @param htmlFile the HTML file to read; it is decoded as UTF-8.
 */
class HTMLReader(private val htmlFile: File) : DocumentParserApp.DocumentReader {
    private val document: Document = Jsoup.parse(htmlFile, "UTF-8")
    private val pages: List<String> = splitIntoPages(document.body().text())
    private lateinit var settings: DocumentParserApp.Settings

    /**
     * Supplies the parser settings. Calling this is optional: until it has been called,
     * [getText] returns plain text without line numbers.
     */
    fun configure(settings: DocumentParserApp.Settings) {
        this.settings = settings
    }

    /** Number of pages the body text was split into (always at least one). */
    override fun getPageCount(): Int = pages.size

    /**
     * Returns the text of pages `startPage` (inclusive) to `endPage` (exclusive), joined with
     * a newline.
     *
     * Both bounds are clamped to the available page range, so an out-of-range or inverted
     * request yields the overlapping pages (possibly none) instead of throwing.
     * When settings were supplied via [configure] and `addLineNumbers` is set, every line of
     * the result is prefixed with its 1-based number, right-aligned to six columns.
     */
    override fun getText(startPage: Int, endPage: Int): String {
        val from = startPage.coerceIn(0, pages.size)
        val to = endPage.coerceIn(from, pages.size)
        val text = pages.subList(from, to).joinToString("\n")
        val numberLines = ::settings.isInitialized && settings.addLineNumbers
        if (!numberLines) return text
        return text.lines()
            .mapIndexed { index, line -> "${(index + 1).toString().padStart(6)}: $line" }
            .joinToString("\n")
    }

    /** Not supported for HTML input; always throws [UnsupportedOperationException]. */
    override fun renderImage(pageIndex: Int, dpi: Float): BufferedImage {
        throw UnsupportedOperationException("HTML files do not support image rendering")
    }

    override fun close() {
        // No resources to close for HTML files
    }

    /**
     * Splits [text] into pages of at most [maxChars] characters each.
     *
     * Text that already fits is returned as a single page (also when it is empty). Otherwise
     * whole paragraphs (separated by a blank line) are packed greedily into pages and re-joined
     * with a blank line; a paragraph that is too long on its own is broken up by
     * [splitOversizedParagraph]. Separator characters count towards the limit.
     */
    private fun splitIntoPages(text: String, maxChars: Int = DEFAULT_MAX_PAGE_CHARS): List<String> {
        require(maxChars > 0) { "maxChars must be positive, was $maxChars" }
        if (text.length <= maxChars) return listOf(text)

        val pages = mutableListOf<String>()
        val currentPage = StringBuilder()

        fun flushPage() {
            if (currentPage.isNotEmpty()) {
                pages.add(currentPage.toString())
                currentPage.setLength(0)
            }
        }

        for (paragraph in text.split(PARAGRAPH_BREAK)) {
            val separatorLength = if (currentPage.isEmpty()) 0 else PARAGRAPH_SEPARATOR.length
            when {
                // Paragraph fits on the open page, separator included.
                currentPage.length + separatorLength + paragraph.length <= maxChars -> {
                    if (currentPage.isNotEmpty()) currentPage.append(PARAGRAPH_SEPARATOR)
                    currentPage.append(paragraph)
                }
                // Paragraph fits on a page of its own: start a new page with it.
                paragraph.length <= maxChars -> {
                    flushPage()
                    currentPage.append(paragraph)
                }
                // Paragraph is longer than a page: emit its full chunks as pages and keep the
                // trailing remainder open so following paragraphs can share that page.
                else -> {
                    flushPage()
                    currentPage.append(splitOversizedParagraph(paragraph, maxChars, pages))
                }
            }
        }

        flushPage()
        return pages
    }

    /**
     * Breaks a paragraph longer than [maxChars] into chunks of at most [maxChars] characters,
     * preferring to cut at single spaces. A single word that is longer than [maxChars] is cut
     * at fixed offsets, since there is no better boundary available.
     *
     * Every completed chunk is appended to [completedPages]; empty chunks are never emitted.
     *
     * @return the trailing, not yet completed chunk (may be empty).
     */
    private fun splitOversizedParagraph(
        paragraph: String,
        maxChars: Int,
        completedPages: MutableList<String>,
    ): String {
        val chunk = StringBuilder()
        for (word in paragraph.split(" ")) {
            val separatorLength = if (chunk.isEmpty()) 0 else 1
            if (chunk.length + separatorLength + word.length > maxChars && chunk.isNotEmpty()) {
                completedPages.add(chunk.toString())
                chunk.setLength(0)
            }
            if (word.length > maxChars) {
                // The chunk is empty here: an over-long word always triggers the flush above.
                val pieces = word.chunked(maxChars)
                completedPages.addAll(pieces.dropLast(1))
                chunk.append(pieces.last())
            } else {
                if (chunk.isNotEmpty()) chunk.append(' ')
                chunk.append(word)
            }
        }
        return chunk.toString()
    }

    private companion object {
        /** Default upper bound, in characters, for a single page. */
        const val DEFAULT_MAX_PAGE_CHARS = 16000

        /** Separator used when re-joining paragraphs that share a page. */
        const val PARAGRAPH_SEPARATOR = "\n\n"

        /** A paragraph break: two newlines with optional whitespace in between. */
        val PARAGRAPH_BREAK = Regex("\\n\\s*\\n")
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.