diff --git a/read.go b/read.go index e8f98116..167c94b4 100644 --- a/read.go +++ b/read.go @@ -21,6 +21,7 @@ import ( "hash/crc64" "log" "os" + "slices" "sort" "github.com/rs/xid" @@ -94,20 +95,15 @@ func (r *reader) Str() (string, error) { } func (r *reader) readTOC(toc *indexTOC) error { - sz, err := r.r.Size() - if err != nil { - return err - } - r.off = sz - 8 - - var tocSection simpleSection - if err := tocSection.read(r); err != nil { - return err - } - - r.seek(tocSection.off) + return r.readTOCSections(toc, []string{}) +} - sectionCount, err := r.U32() +// readTOCSections reads the table of contents of the index file. +// +// If the tags parameter is non-empty, it reads only those tagged sections for efficiency +// and does not populate the other sections. +func (r *reader) readTOCSections(toc *indexTOC, tags []string) error { + tocSection, sectionCount, err := r.readHeader() if err != nil { return err } @@ -126,11 +122,14 @@ func (r *reader) readTOC(toc *indexTOC) error { return err } + skipSection := len(tags) > 0 && !slices.Contains(tags, tag) sec := secs[tag] if sec == nil || sec.kind() != sectionKind(kind) { // If we don't recognize the section, we may be reading a newer index than the current version. Use // a "dummy section" struct to skip over it. + skipSection = true log.Printf("encountered malformed index section (%s), skipping over it", tag) + switch sectionKind(kind) { case sectionKindSimple: sec = &simpleSection{} @@ -143,8 +142,14 @@ func (r *reader) readTOC(toc *indexTOC) error { } } - if err := sec.read(r); err != nil { - return err + if skipSection { + if err := sec.skip(r); err != nil { + return err + } + } else { + if err := sec.read(r); err != nil { + return err + } } } } else { @@ -169,6 +174,27 @@ func (r *reader) readTOC(toc *indexTOC) error { return nil } +func (r *reader) readHeader() (simpleSection, uint32, error) { + sz, err := r.r.Size() + if err != nil { + return simpleSection{}, 0, err + } + r.off = sz - 8 + + var tocSection simpleSection + if err := tocSection.read(r); err != nil { + return simpleSection{}, 0, err + } + + r.seek(tocSection.off) + + sectionCount, err := r.U32() + if err != nil { + return simpleSection{}, 0, err + } + return tocSection, sectionCount, nil +} + func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) { return r.file.Read(sec.off, sec.sz) } @@ -573,10 +599,10 @@ func NewSearcher(r IndexFile) (Searcher, error) { func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) { rd := &reader{r: inf} var toc indexTOC - if err := rd.readTOC(&toc); err != nil { + err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"}) + if err != nil { return nil, nil, err } - return rd.parseMetadata(toc.metaData, toc.repoMetaData) } diff --git a/read_test.go b/read_test.go index 9e7acd13..c230ed9c 100644 --- a/read_test.go +++ b/read_test.go @@ -32,7 +32,6 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - "github.com/sourcegraph/zoekt/query" ) @@ -467,3 +466,32 @@ func TestEncodeRanks(t *testing.T) { return true }, nil) } + +func BenchmarkReadMetadata(b *testing.B) { + file, err := os.Open("testdata/benchmark/zoekt_v16.00000.zoekt") + if err != nil { + b.Fatalf("Failed to open test file: %v", err) + } + defer file.Close() + + indexFile, err := NewIndexFile(file) + if err != nil { + b.Fatalf("could not open index: %v", err) + } + + b.ReportAllocs() + b.ResetTimer() + + for i := 0; i < b.N; i++ { + repos, metadata, err := ReadMetadata(indexFile) + if err != nil { + b.Fatalf("ReadMetadata failed: %v", err) + } + if len(repos) != 1 { + b.Fatalf("expected 1 repository") + } + if metadata == nil { + b.Fatalf("expected non-nil metadata") + } + } +} diff --git a/section.go b/section.go index c686faaa..adb758e3 100644 --- a/section.go +++ b/section.go @@ -127,8 +127,12 @@ func (s *simpleSection) end(w *writer) { // section is a range of bytes in the index file. type section interface { read(*reader) error + // skip advances over the data in the section without reading it. + // NOTE: the section will not contain valid data after this call, and it should not be used. + skip(*reader) error write(*writer) - kind() sectionKind // simple or complex, used in serialization + // kind encodes whether the section is simple or compound, and is used in serialization + kind() sectionKind } type sectionKind int @@ -156,10 +160,17 @@ func (s *simpleSection) read(r *reader) error { return err } s.sz, err = r.U32() + return err +} + +func (s *simpleSection) skip(r *reader) error { + var err error + _, err = r.U32() if err != nil { return err } - return nil + _, err = r.U32() + return err } func (s *simpleSection) write(w *writer) { @@ -215,6 +226,18 @@ func (s *compoundSection) read(r *reader) error { return err } +func (s *compoundSection) skip(r *reader) error { + if err := s.data.skip(r); err != nil { + return err + } + if err := s.index.read(r); err != nil { + return err + } + + _, err := r.r.Read(s.index.off, s.index.sz) + return err +} + // relativeIndex returns the relative offsets of the items (first // element is 0), plus a final marking the end of the last item. func (s *compoundSection) relativeIndex() []uint32 {