From 279547fc72c619960fd342a22e4676d24e8c12c2 Mon Sep 17 00:00:00 2001
From: Gao Hongtao
Date: Mon, 2 Dec 2024 20:49:11 +0800
Subject: [PATCH] feat: improve "InsertIfAbsent" to update the same document when a new field is introduced

- Added a new test function `TestBatch_InsertAndUpdateContent` to verify the update of a document with new content.
- Ensured that the document count remains the same after updating the document.
- Verified the updated content field in the document.

Signed-off-by: Gao Hongtao
---
 index/batch.go       |   4 +-
 index/writer.go      |  36 +++++++++++---
 index/writer_test.go | 109 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 140 insertions(+), 9 deletions(-)

diff --git a/index/batch.go b/index/batch.go
index ab26188..083d38e 100644
--- a/index/batch.go
+++ b/index/batch.go
@@ -21,6 +21,7 @@ type Batch struct {
 	ids               []segment.Term
 	unparsedDocuments []segment.Document
 	unparsedIDs       []segment.Term
+	fieldNames        [][]string
 	persistedCallback func(error)
 }
 
@@ -32,9 +33,10 @@ func (b *Batch) Insert(doc segment.Document) {
 	b.documents = append(b.documents, doc)
 }
 
-func (b *Batch) InsertIfAbsent(id segment.Term, doc segment.Document) {
+func (b *Batch) InsertIfAbsent(id segment.Term, fieldNames []string, doc segment.Document) {
 	b.unparsedDocuments = append(b.unparsedDocuments, doc)
 	b.unparsedIDs = append(b.unparsedIDs, id)
+	b.fieldNames = append(b.fieldNames, fieldNames)
 }
 
 func (b *Batch) Update(id segment.Term, doc segment.Document) {
diff --git a/index/writer.go b/index/writer.go
index cc82d51..f7e3aec 100644
--- a/index/writer.go
+++ b/index/writer.go
@@ -307,14 +307,22 @@ func (s *Writer) removeExistingDocuments(batch *Batch) error {
 		}
 
 		for i := 0; i < len(batch.unparsedIDs); i++ {
-			if ok, _ := dict.Contains(batch.unparsedIDs[i].Term()); ok {
-				batch.unparsedDocuments = append(batch.unparsedDocuments[:i], batch.unparsedDocuments[i+1:]...)
-				batch.unparsedIDs = append(batch.unparsedIDs[:i], batch.unparsedIDs[i+1:]...)
-				i--
-				if len(batch.unparsedDocuments) == 0 {
-					return nil
+			if ok, _ := dict.Contains(batch.unparsedIDs[i].Term()); !ok {
+				continue
+			}
+			fn := batch.fieldNames[i]
+			if len(fn) > 0 {
+				if anyItemNotExist(fn, seg.segment.Fields()) {
+					continue
 				}
 			}
+			batch.unparsedDocuments = append(batch.unparsedDocuments[:i], batch.unparsedDocuments[i+1:]...)
+			batch.unparsedIDs = append(batch.unparsedIDs[:i], batch.unparsedIDs[i+1:]...)
+			batch.fieldNames = append(batch.fieldNames[:i], batch.fieldNames[i+1:]...)
+			i--
+			if len(batch.unparsedDocuments) == 0 {
+				return nil
+			}
 		}
 	}
 	if len(batch.unparsedDocuments) > 0 {
@@ -324,6 +332,22 @@ func (s *Writer) removeExistingDocuments(batch *Batch) error {
 	return nil
 }
 
+func anyItemNotExist(newFields, existedFields []string) bool {
+	for _, item := range newFields {
+		found := false
+		for _, field := range existedFields {
+			if item == field {
+				found = true
+				break
+			}
+		}
+		if !found {
+			return true
+		}
+	}
+	return false
+}
+
 func (s *Writer) prepareSegment(newSegment *segmentWrapper, idTerms []segment.Term,
 	internalOps map[string][]byte, persistedCallback func(error)) error {
 	// new introduction
diff --git a/index/writer_test.go b/index/writer_test.go
index dd42cd7..2b8c2a7 100644
--- a/index/writer_test.go
+++ b/index/writer_test.go
@@ -1678,7 +1678,7 @@ func TestBatch_InsertIfAbsent(t *testing.T) {
 		NewFakeField("title", "mister", false, false, true),
 	}
 	batch := NewBatch()
-	batch.InsertIfAbsent(testIdentifier(docID), doc)
+	batch.InsertIfAbsent(testIdentifier(docID), []string{"title"}, doc)
 
 	// Apply the batch
 	if err := idx.Batch(batch); err != nil {
@@ -1709,7 +1709,7 @@ func TestBatch_InsertIfAbsent(t *testing.T) {
 		NewFakeField("title", "mister2", true, false, true),
 	}
 	batchDuplicate := NewBatch()
-	batchDuplicate.InsertIfAbsent(testIdentifier(docID), docDuplicate)
+	batchDuplicate.InsertIfAbsent(testIdentifier(docID), []string{"title"}, docDuplicate)
 
 	// Apply the duplicate batch
 	if err := idx.Batch(batchDuplicate); err != nil {
@@ -1767,3 +1767,108 @@ func TestBatch_InsertIfAbsent(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestBatch_InsertAndUpdateContent(t *testing.T) {
+	cfg, cleanup := CreateConfig("TestBatch_InsertAndUpdateContent")
+	defer func() {
+		err := cleanup()
+		if err != nil {
+			t.Log(err)
+		}
+	}()
+
+	idx, err := OpenWriter(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	var expectedCount uint64
+
+	// Insert a document
+	docID := "doc-1"
+	doc := &FakeDocument{
+		NewFakeField("_id", docID, true, false, false),
+		NewFakeField("title", "mister", false, false, true),
+	}
+	batch := NewBatch()
+	batch.InsertIfAbsent(testIdentifier(docID), []string{"title"}, doc)
+
+	// Apply the batch
+	if err := idx.Batch(batch); err != nil {
+		t.Fatalf("failed to apply batch: %v", err)
+	}
+	expectedCount++
+
+	// Verify document count after insertion
+	reader, err := idx.Reader()
+	if err != nil {
+		t.Fatal(err)
+	}
+	docCount, err := reader.Count()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+	err = reader.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Update the document with new content
+	docUpdated := &FakeDocument{
+		NewFakeField("_id", docID, true, false, false),
+		NewFakeField("title", "mister", false, false, true),
+		NewFakeField("content", "updated content", false, false, true),
+	}
+	batchUpdate := NewBatch()
+	batchUpdate.InsertIfAbsent(testIdentifier(docID), []string{"title", "content"}, docUpdated)
+
+	// Apply the update batch
+	if err := idx.Batch(batchUpdate); err != nil {
+		t.Fatalf("failed to apply update batch: %v", err)
+	}
+
+	// Verify document count remains the same
+	reader, err = idx.Reader()
+	if err != nil {
+		t.Fatal(err)
+	}
+	docCount, err = reader.Count()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d after update, got %d", expectedCount, docCount)
+	}
+
+	docNum1, err := findNumberByID(reader, docID)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Verify the updated content
+	err = reader.VisitStoredFields(docNum1, func(field string, value []byte) bool {
+		if field == "content" {
+			if string(value) != "updated content" {
+				t.Errorf("expected content to be 'updated content', got '%s'", string(value))
+			}
+		}
+		return true
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = reader.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+}