Skip to content

Commit

Permalink
Skip other sections when reading metadata (#826)
Browse files Browse the repository at this point in the history
Looking at heap profiles, the `ReadMetadata` function creates a large number of
garbage objects. The main contributor is the decoding of the other sections in
the TOC, specifically `compoundSection.offsets`. However, to read metadata, we
only need to parse the metadata sections.

This PR introduces a `skip` method that skips over a section without reading
it. This greatly reduces the allocations from `ReadMetadata`.
  • Loading branch information
jtibshirani authored Sep 17, 2024
1 parent 9aeedd4 commit be438ef
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 27 deletions.
74 changes: 50 additions & 24 deletions read.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"hash/crc64"
"log"
"os"
"slices"
"sort"

"github.com/rs/xid"
Expand Down Expand Up @@ -94,20 +95,15 @@ func (r *reader) Str() (string, error) {
}

// readTOC reads the complete table of contents of the index file into toc.
//
// It delegates to readTOCSections with an empty tag filter, so no section is
// skipped on account of its tag. Locating and parsing the TOC header is left
// entirely to readTOCSections (via readHeader); doing it here as well would
// only repeat the same reads.
func (r *reader) readTOC(toc *indexTOC) error {
	return r.readTOCSections(toc, []string{})
}

sectionCount, err := r.U32()
// readTOCSections reads the table of contents of the index file.
//
// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
// and does not populate the other sections.
func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
tocSection, sectionCount, err := r.readHeader()
if err != nil {
return err
}
Expand All @@ -126,11 +122,14 @@ func (r *reader) readTOC(toc *indexTOC) error {
return err
}

skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
sec := secs[tag]
if sec == nil || sec.kind() != sectionKind(kind) {
// If we don't recognize the section, we may be reading a newer index than the current version. Use
// a "dummy section" struct to skip over it.
log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
skipSection = true
log.Printf("encountered malformed index section (%s), skipping over it", tag)

switch sectionKind(kind) {
case sectionKindSimple:
sec = &simpleSection{}
Expand All @@ -143,8 +142,14 @@ func (r *reader) readTOC(toc *indexTOC) error {
}
}

if err := sec.read(r); err != nil {
return err
if skipSection {
if err := sec.skip(r); err != nil {
return err
}
} else {
if err := sec.read(r); err != nil {
return err
}
}
}
} else {
Expand All @@ -169,6 +174,27 @@ func (r *reader) readTOC(toc *indexTOC) error {
return nil
}

// readHeader positions the reader at the start of the table of contents and
// returns the TOC's own section descriptor together with the number of
// sections it lists.
//
// The descriptor is read from the last 8 bytes of the file; the reader is
// then moved to the descriptor's offset, where the section count is stored as
// a uint32. On any error the zero simpleSection and a zero count are returned.
func (r *reader) readHeader() (simpleSection, uint32, error) {
	var none simpleSection

	size, err := r.r.Size()
	if err != nil {
		return none, 0, err
	}

	// The TOC descriptor occupies the final 8 bytes of the file.
	r.off = size - 8
	var header simpleSection
	if err = header.read(r); err != nil {
		return none, 0, err
	}

	// Jump to the TOC itself; its first field is the section count.
	r.seek(header.off)
	count, err := r.U32()
	if err != nil {
		return none, 0, err
	}

	return header, count, nil
}

// readSectionBlob returns the raw bytes of sec, read from the index file at
// the section's offset and size.
func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
	start, length := sec.off, sec.sz
	return r.file.Read(start, length)
}
Expand Down Expand Up @@ -205,7 +231,7 @@ func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
return arr, nil
}

func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
func (r *reader) readJSON(data interface{}, sec simpleSection) error {
blob, err := r.r.Read(sec.off, sec.sz)
if err != nil {
return err
Expand All @@ -228,7 +254,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
branchNames: []map[uint]string{},
}

repos, md, err := r.readMetadata(toc)
repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
if md != nil && !canReadVersion(md) {
return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
} else if err != nil {
Expand Down Expand Up @@ -395,9 +421,9 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
return &d, nil
}

func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) {
func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) {
var md IndexMetadata
if err := r.readJSON(&md, &toc.metaData); err != nil {
if err := r.readJSON(&md, metaData); err != nil {
return nil, nil, err
}

Expand All @@ -410,7 +436,7 @@ func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, err
}

if len(blob) == 0 {
blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz)
blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
if err != nil {
return nil, &md, err
}
Expand Down Expand Up @@ -573,11 +599,11 @@ func NewSearcher(r IndexFile) (Searcher, error) {
func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
rd := &reader{r: inf}
var toc indexTOC
if err := rd.readTOC(&toc); err != nil {
err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
if err != nil {
return nil, nil, err
}

return rd.readMetadata(&toc)
return rd.parseMetadata(toc.metaData, toc.repoMetaData)
}

// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
Expand Down
30 changes: 29 additions & 1 deletion read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"

"github.com/sourcegraph/zoekt/query"
)

Expand Down Expand Up @@ -467,3 +466,32 @@ func TestEncodeRanks(t *testing.T) {
return true
}, nil)
}

// BenchmarkReadMetadata measures the time and allocations of ReadMetadata
// against a checked-in benchmark shard. Each iteration also sanity-checks the
// result (exactly one repository, non-nil metadata) so a regression in
// behavior fails the benchmark instead of silently producing fast numbers.
func BenchmarkReadMetadata(b *testing.B) {
	f, err := os.Open("testdata/benchmark/zoekt_v16.00000.zoekt")
	if err != nil {
		b.Fatalf("Failed to open test file: %v", err)
	}
	defer f.Close()

	shard, err := NewIndexFile(f)
	if err != nil {
		b.Fatalf("could not open index: %v", err)
	}

	// Exclude the setup above from the measurement.
	b.ReportAllocs()
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		repos, metadata, err := ReadMetadata(shard)
		switch {
		case err != nil:
			b.Fatalf("ReadMetadata failed: %v", err)
		case len(repos) != 1:
			b.Fatalf("expected 1 repository")
		case metadata == nil:
			b.Fatalf("expected non-nil metadata")
		}
	}
}
27 changes: 25 additions & 2 deletions section.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,12 @@ func (s *simpleSection) end(w *writer) {
// section is a range of bytes in the index file.
type section interface {
	// read populates the section from the reader's current position.
	read(*reader) error
	// skip advances over the data in the section without reading it.
	// NOTE: the section will not contain valid data after this call, and it should not be used.
	skip(*reader) error
	write(*writer)
	// kind encodes whether the section is simple or compound, and is used in serialization
	kind() sectionKind
}

type sectionKind int
Expand Down Expand Up @@ -156,10 +160,17 @@ func (s *simpleSection) read(r *reader) error {
return err
}
s.sz, err = r.U32()
return err
}

// skip advances the reader past a serialized simpleSection without storing
// anything in s. It consumes two consecutive uint32 values and discards them.
//
// Both values must be consumed: stopping after the first would leave the
// reader in the middle of the entry and misalign every following TOC entry.
func (s *simpleSection) skip(r *reader) error {
	if _, err := r.U32(); err != nil {
		return err
	}
	_, err := r.U32()
	return err
}

func (s *simpleSection) write(w *writer) {
Expand Down Expand Up @@ -215,6 +226,18 @@ func (s *compoundSection) read(r *reader) error {
return err
}

// skip advances the reader past a serialized compoundSection — its data
// descriptor followed by its index descriptor — without populating s.
//
// The index blob itself is deliberately not fetched: blob reads are addressed
// by explicit offset and size and do not move the reader's cursor, so loading
// the blob contributes nothing to skipping and only adds I/O and a possible
// failure for a section the caller does not want.
func (s *compoundSection) skip(r *reader) error {
	if err := s.data.skip(r); err != nil {
		return err
	}
	return s.index.skip(r)
}

// relativeIndex returns the relative offsets of the items (first
// element is 0), plus a final marking the end of the last item.
func (s *compoundSection) relativeIndex() []uint32 {
Expand Down

0 comments on commit be438ef

Please sign in to comment.