diff --git a/documentloaders/ms_office.go b/documentloaders/ms_office.go
new file mode 100644
index 000000000..265598c29
--- /dev/null
+++ b/documentloaders/ms_office.go
@@ -0,0 +1,269 @@
+package documentloaders
+
+import (
+	"archive/zip"
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"path/filepath"
+	"strings"
+
+	"github.com/richardlehane/mscfb"
+	"github.com/tealeg/xlsx"
+	"github.com/tmc/langchaingo/schema"
+	"github.com/tmc/langchaingo/textsplitter"
+)
+
+var _ Loader = Office{}
+
+// Names of the compound-file streams that carry the main content of legacy
+// Word and PowerPoint files.
+const (
+	wordDocumentStream = "WordDocument"
+	powerPointStream   = "PowerPoint Document"
+)
+
+// Office loads text data from an MS Office file accessed through an
+// io.ReaderAt.
+type Office struct {
+	reader   io.ReaderAt
+	size     int64
+	fileType string
+}
+
+// NewOffice creates a new MS Office loader from an io.ReaderAt, the file name
+// and the file size in bytes. The parser is chosen from the lower-cased
+// extension of filename.
+func NewOffice(reader io.ReaderAt, filename string, size int64) Office {
+	return Office{
+		reader:   reader,
+		size:     size,
+		fileType: strings.ToLower(filepath.Ext(filename)),
+	}
+}
+
+// Load reads the MS Office data and returns the extracted documents. It
+// returns an error for file extensions other than the ones handled below.
+// nolint
+func (loader Office) Load(ctx context.Context) ([]schema.Document, error) {
+	switch loader.fileType {
+	case ".doc":
+		return loader.loadDoc()
+	case ".docx":
+		return loader.loadDocx()
+	case ".xls", ".xlsx":
+		// NOTE(review): loadExcel goes through xlsx.OpenBinary; confirm that
+		// it can actually parse legacy binary .xls input.
+		return loader.loadExcel()
+	case ".ppt":
+		return loader.loadPPT()
+	case ".pptx":
+		return loader.loadPPTX()
+	default:
+		return nil, fmt.Errorf("unsupported file type: %s", loader.fileType)
+	}
+}
+
+// LoadAndSplit reads the MS Office data, then splits the extracted documents
+// into multiple documents using the given text splitter.
+func (loader Office) LoadAndSplit(ctx context.Context, splitter textsplitter.TextSplitter) ([]schema.Document, error) {
+	docs, err := loader.Load(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	return textsplitter.SplitDocuments(splitter, docs)
+}
+
+// loadDoc extracts the text of a legacy Word document.
+func (loader Office) loadDoc() ([]schema.Document, error) {
+	return loader.loadCompoundStream(wordDocumentStream)
+}
+
+// loadPPT extracts the text of a legacy PowerPoint presentation. The container
+// is the same as for old DOCs, but the content lives in its own stream, so
+// looking for the Word stream would always yield an empty document.
+func (loader Office) loadPPT() ([]schema.Document, error) {
+	return loader.loadCompoundStream(powerPointStream)
+}
+
+// loadCompoundStream opens the input as a compound file and collects the
+// printable ASCII text of every stream named streamName into one document.
+func (loader Office) loadCompoundStream(streamName string) ([]schema.Document, error) {
+	kind := strings.ToUpper(strings.TrimPrefix(loader.fileType, "."))
+
+	doc, err := mscfb.New(io.NewSectionReader(loader.reader, 0, loader.size))
+	if err != nil {
+		return nil, fmt.Errorf("failed to read %s file: %w", kind, err)
+	}
+
+	var text strings.Builder
+	for {
+		entry, err := doc.Next()
+		if errors.Is(err, io.EOF) {
+			break
+		}
+		if err != nil {
+			// Anything but io.EOF means the walk failed; do not return a
+			// silently truncated document.
+			return nil, fmt.Errorf("failed to read %s file: %w", kind, err)
+		}
+		if entry.Name != streamName {
+			continue
+		}
+
+		// io.ReadAll loops until the stream is exhausted, so a short read
+		// cannot drop the tail and an empty stream is not an error.
+		raw, err := io.ReadAll(entry)
+		if err != nil {
+			return nil, fmt.Errorf("error reading %s stream: %w", streamName, err)
+		}
+		writePrintableASCII(&text, raw)
+	}
+
+	return loader.singleDocument(text.String()), nil
+}
+
+// writePrintableASCII appends the printable ASCII bytes of raw to dst, maps CR
+// and LF to a newline and drops every other byte.
+func writePrintableASCII(dst *strings.Builder, raw []byte) {
+	for _, b := range raw {
+		switch {
+		case b >= 32 && b <= 126:
+			dst.WriteByte(b)
+		case b == '\r' || b == '\n':
+			dst.WriteByte('\n')
+		}
+	}
+}
+
+// loadExcel returns one document per sheet: cells are tab-terminated and rows
+// are newline-terminated.
+func (loader Office) loadExcel() ([]schema.Document, error) {
+	// io.ReadAll sizes its own buffer, so a bogus (e.g. negative) size cannot
+	// panic in make the way a pre-sized buffer would.
+	data, err := io.ReadAll(io.NewSectionReader(loader.reader, 0, loader.size))
+	if err != nil {
+		return nil, fmt.Errorf("failed to copy Excel content: %w", err)
+	}
+
+	xlFile, err := xlsx.OpenBinary(data)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read Excel file: %w", err)
+	}
+
+	docs := make([]schema.Document, 0, len(xlFile.Sheets))
+	for i, sheet := range xlFile.Sheets {
+		var text strings.Builder
+		for _, row := range sheet.Rows {
+			for _, cell := range row.Cells {
+				text.WriteString(cell.String())
+				text.WriteByte('\t')
+			}
+			text.WriteByte('\n')
+		}
+
+		docs = append(docs, schema.Document{
+			PageContent: text.String(),
+			Metadata: map[string]any{
+				"fileType":   loader.fileType,
+				"sheetName":  sheet.Name,
+				"sheetIndex": i,
+			},
+		})
+	}
+
+	return docs, nil
+}
+
+// loadPPTX returns a single document holding the XML of every
+// ppt/slides/slide*.xml part in archive order, each one followed by a
+// "--- Next Slide ---" marker.
+func (loader Office) loadPPTX() ([]schema.Document, error) {
+	// The loader already has random access to the file, which is all zip
+	// needs; copying the whole input into memory first is wasted work.
+	zipReader, err := zip.NewReader(loader.reader, loader.size)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read PPTX file as ZIP: %w", err)
+	}
+
+	var text strings.Builder
+	for _, file := range zipReader.File {
+		// PPTX stores slide content in ppt/slides/slide*.xml files
+		if !strings.HasPrefix(file.Name, "ppt/slides/slide") || !strings.HasSuffix(file.Name, ".xml") {
+			continue
+		}
+
+		content, err := readZipEntry(file, "slide XML")
+		if err != nil {
+			return nil, err
+		}
+		text.Write(padXMLTags(content))
+		text.WriteString("\n--- Next Slide ---\n")
+	}
+
+	return loader.singleDocument(text.String()), nil
+}
+
+// loadDocx returns a single document holding the XML of word/document.xml.
+func (loader Office) loadDocx() ([]schema.Document, error) {
+	zipReader, err := zip.NewReader(loader.reader, loader.size)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read DOCX file as ZIP: %w", err)
+	}
+
+	var text strings.Builder
+	for _, file := range zipReader.File {
+		if file.Name != "word/document.xml" {
+			continue
+		}
+
+		content, err := readZipEntry(file, "document.xml")
+		if err != nil {
+			return nil, err
+		}
+		text.Write(padXMLTags(content))
+	}
+
+	return loader.singleDocument(text.String()), nil
+}
+
+// readZipEntry returns the uncompressed content of file. The entry is closed
+// before returning, so a caller looping over many entries does not pile up
+// open readers in deferred calls. label names the entry in the open error.
+func readZipEntry(file *zip.File, label string) ([]byte, error) {
+	rc, err := file.Open()
+	if err != nil {
+		return nil, fmt.Errorf("error opening %s: %w", label, err)
+	}
+	defer rc.Close()
+
+	content, err := io.ReadAll(rc)
+	if err != nil {
+		return nil, fmt.Errorf("error reading content: %w", err)
+	}
+
+	return content, nil
+}
+
+// padXMLTags puts a space in front of every "<" and after every ">". The tags
+// themselves are kept; the padding only separates them from the text.
+func padXMLTags(content []byte) []byte {
+	content = bytes.ReplaceAll(content, []byte("<"), []byte(" <"))
+	return bytes.ReplaceAll(content, []byte(">"), []byte("> "))
+}
+
+// singleDocument wraps content in a one-element slice tagged with the file
+// type of the loader.
+func (loader Office) singleDocument(content string) []schema.Document {
+	return []schema.Document{
+		{
+			PageContent: content,
+			Metadata: map[string]any{
+				"fileType": loader.fileType,
+			},
+		},
+	}
+}
diff --git a/documentloaders/ms_office_test.go b/documentloaders/ms_office_test.go
new file mode 100644
index 000000000..b85873a7b
--- /dev/null
+++ b/documentloaders/ms_office_test.go
@@ -0,0 +1,92 @@
+package documentloaders
+
+import (
+	"context"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestMSOfficeLoader(test *testing.T) {
+	test.Parallel()
+
+	docExpectedContent := "This is a .doc test file."
+	docxExpectedContent := "This is a .docx test file."
+	xlsxExpectedContent := "This is an .xlsx test file"
+	pptxExpectedContent := "This is a .pptx test file"
+
+	test.Run("Load .doc", func(t *testing.T) {
+		t.Parallel()
+
+		file, err := os.Open("./testdata/test.doc")
+		require.NoError(t, err)
+		defer file.Close()
+
+		fileInfo, err := file.Stat()
+		require.NoError(t, err)
+
+		loader := NewOffice(file, fileInfo.Name(), fileInfo.Size())
+		docs, err := loader.Load(context.Background())
+		require.NoError(t, err)
+
+		assert.Len(t, docs, 1)
+		assert.True(t, strings.Contains(docs[0].PageContent, docExpectedContent))
+	})
+
+	test.Run("Load .docx", func(t *testing.T) {
+		t.Parallel()
+
+		file, err := os.Open("./testdata/test.docx")
+		require.NoError(t, err)
+		defer file.Close()
+
+		fileInfo, err := file.Stat()
+		require.NoError(t, err)
+
+		loader := NewOffice(file, fileInfo.Name(), fileInfo.Size())
+		docs, err := loader.Load(context.Background())
+		require.NoError(t, err)
+
+		assert.Len(t, docs, 1)
+		assert.True(t, strings.Contains(docs[0].PageContent, docxExpectedContent))
+	})
+
+	test.Run("Load .xlsx", func(t *testing.T) {
+		t.Parallel()
+
+		file, err := os.Open("./testdata/test.xlsx")
+		require.NoError(t, err)
+		defer file.Close()
+
+		fileInfo, err := file.Stat()
+		require.NoError(t, err)
+
+		loader := NewOffice(file, fileInfo.Name(), fileInfo.Size())
+		docs, err := loader.Load(context.Background())
+		require.NoError(t, err)
+
+		assert.Len(t, docs, 1)
+		assert.True(t, strings.Contains(docs[0].PageContent, xlsxExpectedContent))
+	})
+
+	test.Run("Load .pptx", func(t *testing.T) {
+		t.Parallel()
+
+		file, err := os.Open("./testdata/test.pptx")
+		require.NoError(t, err)
+		defer file.Close()
+
+		fileInfo, err := file.Stat()
+		require.NoError(t, err)
+
+		loader := NewOffice(file, fileInfo.Name(), fileInfo.Size())
+		docs, err := loader.Load(context.Background())
+		require.NoError(t, err)
+
+		assert.Len(t, docs, 1)
+		assert.True(t, strings.Contains(docs[0].PageContent, pptxExpectedContent))
+	})
+}
diff --git
a/documentloaders/testdata/test.doc b/documentloaders/testdata/test.doc new file mode 100644 index 000000000..7a02661d4 Binary files /dev/null and b/documentloaders/testdata/test.doc differ diff --git a/documentloaders/testdata/test.docx b/documentloaders/testdata/test.docx new file mode 100644 index 000000000..65858f722 Binary files /dev/null and b/documentloaders/testdata/test.docx differ diff --git a/documentloaders/testdata/test.pptx b/documentloaders/testdata/test.pptx new file mode 100644 index 000000000..151caafa8 Binary files /dev/null and b/documentloaders/testdata/test.pptx differ diff --git a/documentloaders/testdata/test.xlsx b/documentloaders/testdata/test.xlsx new file mode 100644 index 000000000..8f9f5afc1 Binary files /dev/null and b/documentloaders/testdata/test.xlsx differ diff --git a/go.mod b/go.mod index f3270f4c2..d34a4b3fa 100644 --- a/go.mod +++ b/go.mod @@ -134,6 +134,7 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect + github.com/richardlehane/msoleps v1.0.1 // indirect github.com/rogpeppe/go-internal v1.11.0 // indirect github.com/rs/zerolog v1.31.0 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect @@ -158,7 +159,6 @@ require ( gitlab.com/golang-commonmark/linkify v0.0.0-20191026162114-a0c2df6c8f82 // indirect gitlab.com/golang-commonmark/mdurl v0.0.0-20191124015652-932350d1cb84 // indirect gitlab.com/golang-commonmark/puny v0.0.0-20191124015043-9f83538fa04f // indirect - go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect @@ -216,10 +216,13 @@ require ( github.com/pinecone-io/go-pinecone v0.4.1 github.com/pkoukk/tiktoken-go v0.1.6 
github.com/redis/rueidis v1.0.34 + github.com/richardlehane/mscfb v1.0.4 + github.com/tealeg/xlsx v1.0.5 github.com/weaviate/weaviate v1.24.1 github.com/weaviate/weaviate-go-client/v4 v4.13.1 gitlab.com/golang-commonmark/markdown v0.0.0-20211110145824-bf3e522c626a go.mongodb.org/mongo-driver v1.14.0 + go.mongodb.org/mongo-driver/v2 v2.0.0-beta1 go.starlark.net v0.0.0-20230302034142-4b1e35fe2254 golang.org/x/exp v0.0.0-20230713183714-613f0c0eb8a1 golang.org/x/tools v0.14.0 diff --git a/go.sum b/go.sum index 1579d1c52..41eb8bccc 100644 --- a/go.sum +++ b/go.sum @@ -602,6 +602,10 @@ github.com/qdrant/go-client v1.7.0 h1:2TeeWyZAWIup7vvD7Ne6aAvo0H+F5OUb1pB9Z8Y4pF github.com/qdrant/go-client v1.7.0/go.mod h1:680gkxNAsVtre0Z8hAQmtPzJtz1xFAyCu2TUxULtnoE= github.com/redis/rueidis v1.0.34 h1:cdggTaDDoqLNeoKMoew8NQY3eTc83Kt6XyfXtoCO2Wc= github.com/redis/rueidis v1.0.34/go.mod h1:g8nPmgR4C68N3abFiOc/gUOSEKw3Tom6/teYMehg4RE= +github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM= +github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk= +github.com/richardlehane/msoleps v1.0.1 h1:RfrALnSNXzmXLbGct/P2b4xkFz4e8Gmj/0Vj9M9xC1o= +github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.2.2/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -668,6 +672,8 @@ github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tealeg/xlsx v1.0.5 
h1:+f8oFmvY8Gw1iUXzPk+kz+4GpbDZPK1FhPiQRd+ypgE= +github.com/tealeg/xlsx v1.0.5/go.mod h1:btRS8dz54TDnvKNosuAqxrM1QgN1udgk9O34bDCnORM= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= github.com/testcontainers/testcontainers-go v0.31.0 h1:W0VwIhcEVhRflwL9as3dhY6jXjVCA27AkmbnZ+UTh3U=