Skip to content

Commit

Permalink
1.2.7 (#106)
Browse files Browse the repository at this point in the history
* 1.2.7

* update gradle

* logback

* Update settings.gradle.kts

* fix build oom

* fix build oom

* fix 4g build

* disable sass in github actions
  • Loading branch information
acharneski authored Oct 6, 2024
1 parent 9a5eac1 commit 3f301cb
Show file tree
Hide file tree
Showing 18 changed files with 455 additions and 45 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,7 @@ jobs:
distribution: 'temurin'
- name: Grant execute permissions for gradlew
run: chmod +x ./gradlew
- name: Set Gradle options
run: echo "GRADLE_OPTS='-Xmx4g'" >> $GITHUB_ENV
- name: Build and test
run: ./gradlew build
run: ./gradlew build -PskipSass
6 changes: 4 additions & 2 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,19 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Set up JDK 11
uses: actions/setup-java@v2
- uses: actions/setup-java@v2
with:
java-version: 11
distribution: 'temurin'
- name: Grant execute permissions for gradlew
run: chmod +x ./gradlew
- name: Set Gradle options
run: echo "GRADLE_OPTS='-Xmx4g'" >> $GITHUB_ENV
- name: Publish to Maven Central
env:
GPG_PRIVATE_KEY: ${{ secrets.GPG_PRIVATE_KEY }}
GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
OSSRH_USERNAME: ${{ secrets.OSSRH_USERNAME }}
OSSRH_PASSWORD: ${{ secrets.OSSRH_PASSWORD }}
GITHUB_TOKEN: ${{ secrets._GITHUB_TOKEN }}
run: ./gradlew publish -x test --no-configuration-cache --no-daemon --no-build-cache --no-parallel
run: ./gradlew publish -PskipSass -x test --no-configuration-cache --no-daemon --no-build-cache --no-parallel
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ openai.key
*.log
*.log.*
client_secret_google_oauth.json
settings.gradle.kts
2 changes: 1 addition & 1 deletion core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ kotlin {
}

val junit_version = "5.10.1"
val logback_version = "1.4.11"
val logback_version = "1.5.8"
val jackson_version = "2.17.2"
val hsqldb_version = "2.7.2"

Expand Down
63 changes: 63 additions & 0 deletions docs/Document_Indexing.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# User Guide: Document Data Extraction and Query Index Creation

This guide covers two main features: Document Data Extraction and Query Index Creation. These tools are designed to help you extract structured data from various document types and create searchable indexes for efficient querying.

## 1. Document Data Extraction

### Overview
The Document Data Extractor allows you to parse and extract structured information from PDF, TXT, MD, and HTML files. It uses AI to analyze the content and create a hierarchical JSON representation of the document's structure, entities, and metadata.

### How to Use
1. In your IDE, right-click on a supported file (PDF, TXT, MD, or HTML) in the project explorer.
2. Select the "Document Data Extractor" option from the context menu.
3. A configuration dialog will appear with the following options:
   - DPI: Set the resolution for image rendering (for PDFs).
   - Max Pages: Limit the number of pages to process.
   - Output Format: Choose the format for saved images (PNG, JPEG, GIF, BMP).
   - Pages Per Batch: Set how many pages to process in each batch.
   - Show Images: Toggle whether to display rendered images in the results.
   - Save Image Files: Choose to save rendered images to disk.
   - Save Text Files: Choose to save extracted text to disk.
   - Save Final JSON: Choose to save the final parsed JSON to disk.
4. Click "OK" to start the extraction process.
5. A new browser window will open, showing the progress and results of the extraction.

### Output
- The extracted data will be displayed in the browser, organized by pages or batches.
- If enabled, image files, text files, and the final JSON will be saved in an "output" directory next to the source file.
- The final JSON file will have a ".parsed.json" extension.

## 2. Query Index Creation

### Overview
The Query Index Creator takes the parsed JSON files produced by the Document Data Extractor and builds a binary index file that supports efficient embedding-based similarity search.

### How to Use
1. In your IDE, select one or more ".parsed.json" files in the project explorer.
2. Right-click and choose the "Save As Query Index" option from the context menu.
3. A file chooser dialog will appear. Select the directory where you want to save the index file.
4. Click "OK" to start the conversion process.
5. A progress bar will show the status of the index creation.

### Output
- A binary index file named "document.index.data" will be created in the selected output directory.
- This index file can be used for fast similarity searches on the extracted document data.

## Using the Query Index

Once you have created the query index, you can use it with the EmbeddingSearchTask to perform similarity searches on your document data. This allows you to quickly find relevant information across all your indexed documents.

To use the EmbeddingSearchTask:
1. Set up your search query and parameters (e.g., distance type, number of results).
2. Point the task to your "document.index.data" file.
3. Run the search to get the most relevant results based on embedding similarity.

## Tips and Best Practices

1. For large documents, consider processing them in smaller batches by adjusting the "Max Pages" and "Pages Per Batch" settings.
2. Save the final JSON files when extracting data, as these are required to create the query index.
3. Organize your parsed JSON files in a dedicated folder to make it easier to select them when creating the query index.
4. When creating the query index, choose an output location that is easily accessible for your search tasks.
5. Experiment with different DPI settings for PDFs to balance image quality and processing speed.
6. Use the "Show Images" option during extraction to visually verify the content being processed, especially for PDFs.

4 changes: 2 additions & 2 deletions gradle.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Gradle Releases -> https://github.com/gradle/gradle/releases
libraryGroup = com.simiacryptus.skyenet
libraryVersion = 1.2.6
libraryVersion = 1.2.7
gradleVersion = 7.6.1
kotlin.daemon.jvmargs=-Xmx2g
kotlin.daemon.jvmargs=-Xmx4g
Binary file modified gradle/wrapper/gradle-wrapper.jar
Binary file not shown.
2 changes: 1 addition & 1 deletion gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.2-bin.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
Expand Down
2 changes: 1 addition & 1 deletion gradlew
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
Expand Down
20 changes: 10 additions & 10 deletions gradlew.bat
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if %ERRORLEVEL% equ 0 goto execute

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
echo. 1>&2
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2
echo. 1>&2
echo Please set the JAVA_HOME variable in your environment to match the 1>&2
echo location of your Java installation. 1>&2

goto fail

Expand All @@ -57,11 +57,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto execute

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
echo. 1>&2
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2
echo. 1>&2
echo Please set the JAVA_HOME variable in your environment to match the 1>&2
echo location of your Java installation. 1>&2

goto fail

Expand Down
4 changes: 2 additions & 2 deletions kotlin/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ dependencies {
testRuntimeOnly(group = "org.junit.jupiter", name = "junit-jupiter-engine", version = "5.10.1")

implementation(group = "org.slf4j", name = "slf4j-api", version = "2.0.16")
testImplementation(group = "ch.qos.logback", name = "logback-classic", version = "1.4.11")
testImplementation(group = "ch.qos.logback", name = "logback-core", version = "1.4.11")
testImplementation(group = "ch.qos.logback", name = "logback-classic", version = "1.5.8")
testImplementation(group = "ch.qos.logback", name = "logback-core", version = "1.5.8")
testImplementation("org.ow2.asm:asm:9.6")


Expand Down
3 changes: 3 additions & 0 deletions webui/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ tasks {
)
}
}
tasks.withType<io.freefair.gradle.plugins.sass.SassCompile>().configureEach {
onlyIf { !project.hasProperty("skipSass") }
}


val javadocJar by tasks.registering(Jar::class) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class TaskType<out T : PlanTaskBase>(

val TaskPlanning = TaskType("TaskPlanning", PlanningTaskData::class.java)
val Inquiry = TaskType("Inquiry", InquiryTaskData::class.java)
val Search = TaskType("Search", SearchTask.SearchTaskData::class.java)
val EmbeddingSearch = TaskType("EmbeddingSearch", EmbeddingSearchTask.EmbeddingSearchTaskData::class.java)
val FileModification = TaskType("FileModification", FileModificationTaskData::class.java)
val Documentation = TaskType("Documentation", DocumentationTaskData::class.java)
val CodeReview = TaskType("CodeReview", CodeReviewTaskData::class.java)
Expand All @@ -54,6 +56,8 @@ class TaskType<out T : PlanTaskBase>(
init {
registerConstructor(CommandAutoFix) { settings, task -> CommandAutoFixTask(settings, task) }
registerConstructor(Inquiry) { settings, task -> InquiryTask(settings, task) }
registerConstructor(Search) { settings, task -> SearchTask(settings, task) }
registerConstructor(EmbeddingSearch) { settings, task -> EmbeddingSearchTask(settings, task) }
registerConstructor(FileModification) { settings, task -> FileModificationTask(settings, task) }
registerConstructor(Documentation) { settings, task -> DocumentationTask(settings, task) }
registerConstructor(RunShellCommand) { settings, task -> RunShellCommandTask(settings, task) }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.simiacryptus.skyenet.apps.plan.file

import com.simiacryptus.diff.FileValidationUtils
import com.simiacryptus.jopenai.describe.Description
import com.simiacryptus.skyenet.apps.plan.AbstractTask
import com.simiacryptus.skyenet.apps.plan.PlanSettings
import com.simiacryptus.skyenet.apps.plan.PlanTaskBase
Expand All @@ -18,7 +19,9 @@ abstract class AbstractFileTask<T : FileTaskBase>(
task_type: String,
task_description: String? = null,
task_dependencies: List<String>? = null,
@Description("The specific files to be used as input for the task")
val input_files: List<String>? = null,
@Description("The specific files to be generated as output for the task")
val output_files: List<String>? = null,
state: TaskState? = TaskState.Pending,
) : PlanTaskBase(
Expand Down Expand Up @@ -62,7 +65,7 @@ abstract class AbstractFileTask<T : FileTaskBase>(

companion object {
private val log = org.slf4j.LoggerFactory.getLogger(AbstractFileTask::class.java)
private const val TRIPLE_TILDE = "```"
const val TRIPLE_TILDE = "```"

}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
package com.simiacryptus.skyenet.apps.plan.file
import com.simiacryptus.diff.addApplyFileDiffLinks

import com.simiacryptus.jopenai.API
import com.simiacryptus.jopenai.describe.Description
Expand Down Expand Up @@ -37,6 +38,7 @@ class DocumentationTask(
return """
Documentation - Generate documentation
** List input files/tasks to be examined
** List output files to be modified or created with documentation
""".trimMargin()
}

Expand All @@ -48,6 +50,14 @@ class DocumentationTask(
Use a structured and consistent format that facilitates easy understanding and navigation.
Include code examples where applicable, and explain the rationale behind key design decisions and algorithm choices.
Document any known issues or areas for improvement, providing guidance for future developers on how to extend or maintain the code.
For existing files, provide documentation in the form of comments within the code.
For new files, create separate markdown files with the documentation.
Response format:
For existing files: Use ${TRIPLE_TILDE}diff code blocks with a header specifying the file path.
For new files: Use $TRIPLE_TILDE markdown blocks with a header specifying the new file path.
The diff format should use + for line additions, - for line deletions.
Include 2 lines of context before and after every change in diffs.
Separate code blocks with a single blank line.
""".trimMargin(),
model = planSettings.getTaskSettings(TaskType.Documentation).model ?: planSettings.defaultModel,
temperature = planSettings.temperature,
Expand All @@ -64,6 +74,10 @@ class DocumentationTask(
api: API,
resultFn: (String) -> Unit
) {
if (((planTask?.input_files ?: listOf()) + (planTask?.output_files ?: listOf())).isEmpty()) {
task.complete("No input or output files specified")
return
}
val semaphore = Semaphore(0)
val onComplete = {
semaphore.release()
Expand All @@ -76,22 +90,44 @@ class DocumentationTask(
JsonUtil.toJson(plan),
getPriorCode(planProcessingState),
getInputFileCode(),
"Items to document: ${itemsToDocument.joinToString(", ")}"
"Items to document: ${itemsToDocument.joinToString(", ")}",
"Output files: ${planTask?.output_files?.joinToString(", ") ?: ""}"
).filter { it.isNotBlank() }, api
)
resultFn(docResult)
if (agent.planSettings.autoFix) {
val diffLinks = agent.ui.socketManager!!.addApplyFileDiffLinks(
root = agent.root,
response = docResult,
handle = { newCodeMap ->
newCodeMap.forEach { (path, newCode) ->
task.complete("<a href='${"fileIndex/${agent.session}/$path"}'>$path</a> Updated")
}
},
ui = agent.ui,
api = api,
shouldAutoApply = { agent.planSettings.autoFix }
)
task.complete()
onComplete()
MarkdownUtil.renderMarkdown("## Generated Documentation\n$docResult\nAuto-accepted", ui = agent.ui)
MarkdownUtil.renderMarkdown(diffLinks + "\n\n## Auto-applied documentation changes", ui = agent.ui)
} else {
MarkdownUtil.renderMarkdown(
"## Generated Documentation\n$docResult",
ui = agent.ui
) + acceptButtonFooter(agent.ui) {
task.complete()
onComplete()
}
agent.ui.socketManager!!.addApplyFileDiffLinks(
root = agent.root,
response = docResult,
handle = { newCodeMap ->
newCodeMap.forEach { (path, newCode) ->
task.complete("<a href='${"fileIndex/${agent.session}/$path"}'>$path</a> Updated")
}
},
ui = agent.ui,
api = api
) + acceptButtonFooter(agent.ui) {
task.complete()
onComplete()
}, ui = agent.ui
)
}
}
Retryable(agent.ui, task = task, process = process)
Expand Down
Loading

0 comments on commit 3f301cb

Please sign in to comment.