Skip to content

Commit

Permalink
change: use new website-cleaner (#793)
Browse files Browse the repository at this point in the history
  • Loading branch information
iwilltry42 authored Dec 16, 2024
1 parent e4a854b commit 30dc574
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 27 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
**/node_modules/
otto8-tools/
.local/
acorn-tools/
44 changes: 17 additions & 27 deletions pkg/controller/handlers/knowledgefile/knowledgefile.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func shouldReIngest(file *v1.KnowledgeFile) bool {
}

func cleanInput(filename string) string {
return path.Join(".conversion", filename)
return strings.TrimSuffix(path.Join(".conversion", filename), ".md") + ".md" // migration - before, the cleaning step accepted md and outputted md, now it's html -> md
}

func outputFile(filename string) string {
Expand Down Expand Up @@ -193,39 +193,29 @@ func (h *Handler) ingest(ctx context.Context, client kclient.Client, file *v1.Kn
inputName := file.Spec.FileName

// Clean website content (remove headers, footers, etc.)
if source.Spec.Manifest.GetType() == types.KnowledgeSourceTypeWebsite && strings.HasSuffix(file.Spec.FileName, ".md") {
content, err := h.gptScript.ReadFileInWorkspace(ctx, file.Spec.FileName, gptscript.ReadFileInWorkspaceOptions{
WorkspaceID: thread.Status.WorkspaceID,
if source.Spec.Manifest.GetType() == types.KnowledgeSourceTypeWebsite && strings.HasSuffix(inputName, ".html") {
mdOutput := cleanInput(file.Spec.FileName) + ".md"
task, err := h.invoker.SystemTask(ctx, thread, system.WebsiteCleanTool, map[string]any{
"input": inputName,
"output": mdOutput,
})
if err != nil {
return err
}
if len(content) < 100_000 {
task, err := h.invoker.SystemTask(ctx, thread, system.WebsiteCleanTool, string(content))
if err != nil {
return err
}
defer task.Close()

file.Status.RunNames = append(file.Status.RunNames, task.Run.Name)
if err := client.Status().Update(ctx, file); err != nil {
return err
}
defer task.Close()

result, err := task.Result(ctx)
if err != nil {
return fmt.Errorf("failed to clean website content: %v", err)
}
file.Status.RunNames = append(file.Status.RunNames, task.Run.Name)
if err := client.Status().Update(ctx, file); err != nil {
return err
}

if result.Output != "" {
inputName = cleanInput(file.Spec.FileName)
if err := h.gptScript.WriteFileInWorkspace(ctx, inputName, []byte(result.Output), gptscript.WriteFileInWorkspaceOptions{
WorkspaceID: thread.Status.WorkspaceID,
}); err != nil {
return err
}
}
_, err = task.Result(ctx)
if err != nil {
return fmt.Errorf("failed to clean website content: %v", err)
}

inputName = mdOutput

}

Check failure on line 219 in pkg/controller/handlers/knowledgefile/knowledgefile.go

View workflow job for this annotation

GitHub Actions / lint

unnecessary trailing newline (whitespace)

loadTask, err := h.invoker.SystemTask(ctx, thread, system.KnowledgeLoadTool, map[string]any{
Expand Down

0 comments on commit 30dc574

Please sign in to comment.