Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip other sections when reading metadata #826

Merged
merged 1 commit into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 50 additions & 24 deletions read.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"hash/crc64"
"log"
"os"
"slices"
"sort"

"github.com/rs/xid"
Expand Down Expand Up @@ -94,20 +95,15 @@ func (r *reader) Str() (string, error) {
}

func (r *reader) readTOC(toc *indexTOC) error {
sz, err := r.r.Size()
if err != nil {
return err
}
r.off = sz - 8

var tocSection simpleSection
if err := tocSection.read(r); err != nil {
return err
}

r.seek(tocSection.off)
return r.readTOCSections(toc, []string{})
}

sectionCount, err := r.U32()
// readTOCSections reads the table of contents of the index file.
//
// If the tags parameter is non-empty, it reads only those tagged sections for efficiency
// and does not populate the other sections.
func (r *reader) readTOCSections(toc *indexTOC, tags []string) error {
tocSection, sectionCount, err := r.readHeader()
if err != nil {
return err
}
Expand All @@ -126,11 +122,14 @@ func (r *reader) readTOC(toc *indexTOC) error {
return err
}

skipSection := len(tags) > 0 && !slices.Contains(tags, tag)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of introducing the "skip" concept, I could have taken advantage of the fact that the metadata sections are always first in the TOC. However, our index reading code is structured around flexible "section tags", and I got the feeling that section ordering wasn't an invariant we wanted to rely on.

sec := secs[tag]
if sec == nil || sec.kind() != sectionKind(kind) {
// If we don't recognize the section, we may be reading a newer index than the current version. Use
// a "dummy section" struct to skip over it.
log.Printf("encountered unrecognized index section (%s), skipping over it", tag)
skipSection = true
log.Printf("encountered malformed index section (%s), skipping over it", tag)

switch sectionKind(kind) {
case sectionKindSimple:
sec = &simpleSection{}
Expand All @@ -143,8 +142,14 @@ func (r *reader) readTOC(toc *indexTOC) error {
}
}

if err := sec.read(r); err != nil {
return err
if skipSection {
if err := sec.skip(r); err != nil {
return err
}
} else {
if err := sec.read(r); err != nil {
return err
}
}
}
} else {
Expand All @@ -169,6 +174,27 @@ func (r *reader) readTOC(toc *indexTOC) error {
return nil
}

func (r *reader) readHeader() (simpleSection, uint32, error) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I factored out the first part of readTOC (now readTOCSections). This wasn't critical for the change.

sz, err := r.r.Size()
if err != nil {
return simpleSection{}, 0, err
}
r.off = sz - 8

var tocSection simpleSection
if err := tocSection.read(r); err != nil {
return simpleSection{}, 0, err
}

r.seek(tocSection.off)

sectionCount, err := r.U32()
if err != nil {
return simpleSection{}, 0, err
}
return tocSection, sectionCount, nil
}

// readSectionBlob reads the byte range described by sec (sec.sz bytes
// starting at sec.off) from the index file backing r and returns it
// together with any error reported by the file.
func (r *indexData) readSectionBlob(sec simpleSection) ([]byte, error) {
	start, length := sec.off, sec.sz
	return r.file.Read(start, length)
}
Expand Down Expand Up @@ -205,7 +231,7 @@ func readSectionU64(f IndexFile, sec simpleSection) ([]uint64, error) {
return arr, nil
}

func (r *reader) readJSON(data interface{}, sec *simpleSection) error {
func (r *reader) readJSON(data interface{}, sec simpleSection) error {
blob, err := r.r.Read(sec.off, sec.sz)
if err != nil {
return err
Expand All @@ -228,7 +254,7 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
branchNames: []map[uint]string{},
}

repos, md, err := r.readMetadata(toc)
repos, md, err := r.parseMetadata(toc.metaData, toc.repoMetaData)
if md != nil && !canReadVersion(md) {
return nil, fmt.Errorf("file is v%d, want v%d", md.IndexFormatVersion, IndexFormatVersion)
} else if err != nil {
Expand Down Expand Up @@ -395,9 +421,9 @@ func (r *reader) readIndexData(toc *indexTOC) (*indexData, error) {
return &d, nil
}

func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, error) {
func (r *reader) parseMetadata(metaData simpleSection, repoMetaData simpleSection) ([]*Repository, *IndexMetadata, error) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also simplified this method, as it's not a big deal to be copying simpleSection. Not critical for the change.

var md IndexMetadata
if err := r.readJSON(&md, &toc.metaData); err != nil {
if err := r.readJSON(&md, metaData); err != nil {
return nil, nil, err
}

Expand All @@ -410,7 +436,7 @@ func (r *reader) readMetadata(toc *indexTOC) ([]*Repository, *IndexMetadata, err
}

if len(blob) == 0 {
blob, err = r.r.Read(toc.repoMetaData.off, toc.repoMetaData.sz)
blob, err = r.r.Read(repoMetaData.off, repoMetaData.sz)
if err != nil {
return nil, &md, err
}
Expand Down Expand Up @@ -573,11 +599,11 @@ func NewSearcher(r IndexFile) (Searcher, error) {
func ReadMetadata(inf IndexFile) ([]*Repository, *IndexMetadata, error) {
rd := &reader{r: inf}
var toc indexTOC
if err := rd.readTOC(&toc); err != nil {
err := rd.readTOCSections(&toc, []string{"metaData", "repoMetaData"})
if err != nil {
return nil, nil, err
}

return rd.readMetadata(&toc)
return rd.parseMetadata(toc.metaData, toc.repoMetaData)
}

// ReadMetadataPathAlive is like ReadMetadataPath except that it only returns
Expand Down
30 changes: 29 additions & 1 deletion read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"

"github.com/sourcegraph/zoekt/query"
)

Expand Down Expand Up @@ -467,3 +466,32 @@ func TestEncodeRanks(t *testing.T) {
return true
}, nil)
}

// BenchmarkReadMetadata times ReadMetadata on the index file at
// testdata/benchmark/zoekt_v16.00000.zoekt. Opening the file and wrapping
// it in an IndexFile happen before the timer is reset, so only the
// ReadMetadata calls are measured. Each iteration also checks that exactly
// one repository and a non-nil metadata value come back.
func BenchmarkReadMetadata(b *testing.B) {
	f, openErr := os.Open("testdata/benchmark/zoekt_v16.00000.zoekt")
	if openErr != nil {
		b.Fatalf("Failed to open test file: %v", openErr)
	}
	defer f.Close()

	idx, idxErr := NewIndexFile(f)
	if idxErr != nil {
		b.Fatalf("could not open index: %v", idxErr)
	}

	b.ReportAllocs()
	b.ResetTimer()

	for n := 0; n < b.N; n++ {
		repos, md, err := ReadMetadata(idx)
		switch {
		case err != nil:
			b.Fatalf("ReadMetadata failed: %v", err)
		case len(repos) != 1:
			b.Fatalf("expected 1 repository")
		case md == nil:
			b.Fatalf("expected non-nil metadata")
		}
	}
}
27 changes: 25 additions & 2 deletions section.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,12 @@ func (s *simpleSection) end(w *writer) {
// section is a range of bytes in the index file.
type section interface {
read(*reader) error
// skip advances over the data in the section without reading it.
// NOTE: the section will not contain valid data after this call, and it should not be used.
skip(*reader) error
write(*writer)
kind() sectionKind // simple or complex, used in serialization
// kind encodes whether the section is simple or compound, and is used in serialization
kind() sectionKind
}

type sectionKind int
Expand Down Expand Up @@ -156,10 +160,17 @@ func (s *simpleSection) read(r *reader) error {
return err
}
s.sz, err = r.U32()
return err
}

// skip consumes the two uint32 values of a serialized simple section from r
// and discards them; s itself is left untouched. Both values must be
// consumed, otherwise the reader would be left in the middle of the entry
// and the next section would be decoded from the wrong position.
//
// It returns the first error reported by the reader, or nil.
func (s *simpleSection) skip(r *reader) error {
	if _, err := r.U32(); err != nil {
		return err
	}
	// The second word has to be consumed as well; returning before this
	// read would leave it pending in the reader.
	_, err := r.U32()
	return err
}

func (s *simpleSection) write(w *writer) {
Expand Down Expand Up @@ -215,6 +226,18 @@ func (s *compoundSection) read(r *reader) error {
return err
}

// skip moves r past a compound section entry. The data sub-section is
// skipped, the index sub-section is read into s.index, and the bytes that
// s.index points at are then fetched from the underlying file and thrown
// away. Only the error of each step is reported; the first failure wins.
//
// NOTE(review): the final blob read is given an explicit offset and its
// result is discarded — presumably it is not needed to advance the reader;
// confirm before removing it.
func (s *compoundSection) skip(r *reader) error {
	err := s.data.skip(r)
	if err == nil {
		err = s.index.read(r)
	}
	if err != nil {
		return err
	}

	_, err = r.r.Read(s.index.off, s.index.sz)
	return err
}

// relativeIndex returns the relative offsets of the items (first
// element is 0), plus a final marking the end of the last item.
func (s *compoundSection) relativeIndex() []uint32 {
Expand Down
Loading