From 5c51794c39d0d81f7c82b04aefcc04766dec6cbb Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Wed, 22 Nov 2023 14:15:29 -0800 Subject: [PATCH] Indexing: use one ctags process per shard (#702) Currently, we use a single ctags process for indexing an entire repository. Even though we build shards in parallel, they all share the same (single threaded) ctags process. Since ctags is one of the most expensive parts of shard building, this creates a bottleneck that can really slow down indexing. This change proposes to launch a new ctags process per shard. For `sgtest/megarepo`, this speeds up indexing by almost 2x (enabling scip-ctags and setting `-parallelism=4`): * Before: took 4 min 48 sec to index repo * After: took 2 min 30 sec to index repo --- build/builder.go | 17 +++-- build/ctags.go | 16 +++-- build/ctags_test.go | 8 ++- ctags/json.go | 46 -------------- ctags/json_test.go | 5 +- ctags/parser_factory.go | 133 ++++++++++++++++++++++++++++++++++++++++ ctags/parser_map.go | 89 --------------------------- 7 files changed, 161 insertions(+), 153 deletions(-) create mode 100644 ctags/parser_factory.go delete mode 100644 ctags/parser_map.go diff --git a/build/builder.go b/build/builder.go index 82aba3bef..1838b777a 100644 --- a/build/builder.go +++ b/build/builder.go @@ -249,8 +249,8 @@ type Builder struct { docChecker zoekt.DocChecker size int - parserMap ctags.ParserMap - building sync.WaitGroup + parserFactory ctags.ParserFactory + building sync.WaitGroup errMu sync.Mutex buildError error @@ -563,20 +563,17 @@ func NewBuilder(opts Options) (*Builder, error) { finishedShards: map[string]string{}, } - parserMap, err := ctags.NewParserMap( - ctags.ParserBinMap{ - ctags.UniversalCTags: b.opts.CTagsPath, - ctags.ScipCTags: b.opts.ScipCTagsPath, - }, + parserFactory, err := ctags.NewParserFactory( + b.opts.CTagsPath, + b.opts.ScipCTagsPath, opts.LanguageMap, b.opts.CTagsMustSucceed, ) - if err != nil { return nil, err } - b.parserMap = parserMap + b.parserFactory = parserFactory b.shardLogger = &lumberjack.Logger{ Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"), @@ -1021,7 +1018,7 @@ func sortDocuments(todo []*zoekt.Document) { func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) { if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") { - err := ctagsAddSymbolsParserMap(todo, b.opts.LanguageMap, b.parserMap) + err := parseSymbols(todo, b.opts.LanguageMap, b.parserFactory) if b.opts.CTagsMustSucceed && err != nil { return nil, err } diff --git a/build/ctags.go b/build/ctags.go index 7308371f0..48a655776 100644 --- a/build/ctags.go +++ b/build/ctags.go @@ -42,12 +42,14 @@ func normalizeLanguage(filetype string) string { return normalized } -func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserMap ctags.ParserMap) error { +func parseSymbols(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserFactory ctags.ParserFactory) error { monitor := newMonitor() defer monitor.Stop() var tagsToSections tagsToSections + parsers := make(map[ctags.CTagsParserType]ctags.Parser) + for _, doc := range todo { if len(doc.Content) == 0 || doc.Symbols != nil { continue @@ -65,10 +67,16 @@ func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.Language parserKind = ctags.UniversalCTags } - parser := parserMap[parserKind] + parser := parsers[parserKind] if parser == nil { - // this happens if CTagsMustSucceed is false and we didn't find the binary - continue + // Spin up a new parser for this parser kind + parser = parserFactory.NewParser(parserKind) + if parser == nil { + // this happens if CTagsMustSucceed is false and we didn't find the binary + continue + } + parsers[parserKind] = parser + defer parser.Close() } monitor.BeginParsing(doc) diff --git a/build/ctags_test.go b/build/ctags_test.go index b052eecca..c694d433b 100644 --- a/build/ctags_test.go +++ b/build/ctags_test.go @@ -257,18 +257,22 @@ func BenchmarkTagsToSections(b *testing.B) { requireCTags(b) file, err := os.ReadFile("./testdata/large_file.cc") - parser, err := ctags.NewParser(ctags.UniversalCTags, "universal-ctags") if err != nil { b.Fatal(err) } - var tagsToSections tagsToSections + factory, err := ctags.NewParserFactory("universal-ctags", "", ctags.LanguageMap{}, true) + if err != nil { + b.Fatal(err) + } + parser := factory.NewParser(ctags.UniversalCTags) entries, err := parser.Parse("./testdata/large_file.cc", file) if err != nil { b.Fatal(err) } + var tagsToSections tagsToSections secs, _, err := tagsToSections.Convert(file, entries) if err != nil { b.Fatal(err) diff --git a/ctags/json.go b/ctags/json.go index bdff60d69..a08c92eae 100644 --- a/ctags/json.go +++ b/ctags/json.go @@ -15,12 +15,7 @@ package ctags import ( - "bytes" "fmt" - "log" - "os" - "os/exec" - "strings" "sync" "time" @@ -118,44 +113,3 @@ func (lp *lockedParser) close() { lp.send = nil lp.recv = nil } - -// NewParser creates a parser that is implemented by the given -// universal-ctags binary. The parser is safe for concurrent use. -func NewParser(parserType CTagsParserType, bin string) (Parser, error) { - if err := checkBinary(parserType, bin); err != nil { - return nil, err - } - - opts := goctags.Options{ - Bin: bin, - } - if debug { - opts.Info = log.New(os.Stderr, "CTAGS INF: ", log.LstdFlags) - opts.Debug = log.New(os.Stderr, "CTAGS DBG: ", log.LstdFlags) - } - return &lockedParser{ - opts: opts, - }, nil -} - -// checkBinary does checks on bin to ensure we can correctly use the binary -// for symbols. It is more user friendly to fail early in this case. -func checkBinary(typ CTagsParserType, bin string) error { - switch typ { - case UniversalCTags: - helpOutput, err := exec.Command(bin, "--help").CombinedOutput() - if err != nil { - return fmt.Errorf("failed to check if %s is universal-ctags: %w\n--help output:\n%s", bin, err, string(helpOutput)) - } - if !bytes.Contains(helpOutput, []byte("+interactive")) { - return fmt.Errorf("ctags binary is not universal-ctags or is not compiled with +interactive feature: bin=%s", bin) - } - - case ScipCTags: - if !strings.Contains(bin, "scip-ctags") { - return fmt.Errorf("only supports scip-ctags, not %s", bin) - } - } - - return nil -} diff --git a/ctags/json_test.go b/ctags/json_test.go index 5a6304f43..21dabac78 100644 --- a/ctags/json_test.go +++ b/ctags/json_test.go @@ -27,11 +27,12 @@ func TestJSON(t *testing.T) { t.Skip(err) } - p, err := NewParser(UniversalCTags, "universal-ctags") + factory, err := NewParserFactory("universal-ctags", "", LanguageMap{}, true) if err != nil { - t.Fatal("newProcess", err) + t.Fatal(err) } + p := factory.NewParser(UniversalCTags) defer p.Close() java := ` diff --git a/ctags/parser_factory.go b/ctags/parser_factory.go new file mode 100644 index 000000000..ac56d27b3 --- /dev/null +++ b/ctags/parser_factory.go @@ -0,0 +1,133 @@ +// Copyright 2017 Google Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ctags + +import ( + "bytes" + "fmt" + "log" + "os" + "os/exec" + "strings" + + goctags "github.com/sourcegraph/go-ctags" +) + +type CTagsParserType uint8 + +const ( + UnknownCTags CTagsParserType = iota + NoCTags + UniversalCTags + ScipCTags +) + +type LanguageMap = map[string]CTagsParserType + +func ParserToString(parser CTagsParserType) string { + switch parser { + case UnknownCTags: + return "unknown" + case NoCTags: + return "no" + case UniversalCTags: + return "universal" + case ScipCTags: + return "scip" + default: + panic("Reached impossible CTagsParserType state") + } +} + +func StringToParser(str string) CTagsParserType { + switch str { + case "no": + return NoCTags + case "universal": + return UniversalCTags + case "scip": + return ScipCTags + default: + return UniversalCTags + } +} + +type ParserFactory map[CTagsParserType]string + +func NewParserFactory( + ctagsPath string, + scipCTagsPath string, + languageMap LanguageMap, + cTagsMustSucceed bool, +) (ParserFactory, error) { + validBins := make(map[CTagsParserType]string) + requiredBins := map[CTagsParserType]string{UniversalCTags: ctagsPath} + for _, parserType := range languageMap { + if parserType == ScipCTags { + requiredBins[ScipCTags] = scipCTagsPath + break + } + } + + for parserType, bin := range requiredBins { + if bin == "" && cTagsMustSucceed { + return nil, fmt.Errorf("ctags binary not found for %s parser type", ParserToString(parserType)) + } + if err := checkBinary(parserType, bin); err != nil && cTagsMustSucceed { + return nil, fmt.Errorf("ctags.NewParserFactory: %v", err) + } + validBins[parserType] = bin + } + + return validBins, nil +} + +// checkBinary does checks on bin to ensure we can correctly use the binary +// for symbols. It is more user friendly to fail early in this case. +func checkBinary(typ CTagsParserType, bin string) error { + switch typ { + case UniversalCTags: + helpOutput, err := exec.Command(bin, "--help").CombinedOutput() + if err != nil { + return fmt.Errorf("failed to check if %s is universal-ctags: %w\n--help output:\n%s", bin, err, string(helpOutput)) + } + if !bytes.Contains(helpOutput, []byte("+interactive")) { + return fmt.Errorf("ctags binary is not universal-ctags or is not compiled with +interactive feature: bin=%s", bin) + } + + case ScipCTags: + if !strings.Contains(bin, "scip-ctags") { + return fmt.Errorf("only supports scip-ctags, not %s", bin) + } + } + + return nil +} + +// NewParser creates a parser that is implemented by the given +// ctags binary. The parser is safe for concurrent use. +func (p ParserFactory) NewParser(typ CTagsParserType) Parser { + bin := p[typ] + if bin == "" { + return nil + } + + opts := goctags.Options{Bin: bin} + if debug { + opts.Info = log.New(os.Stderr, "CTAGS INF: ", log.LstdFlags) + opts.Debug = log.New(os.Stderr, "CTAGS DBG: ", log.LstdFlags) + } + return &lockedParser{opts: opts} +} diff --git a/ctags/parser_map.go b/ctags/parser_map.go deleted file mode 100644 index c30fa9013..000000000 --- a/ctags/parser_map.go +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2017 Google Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package ctags - -import ( - "fmt" -) - -type CTagsParserType uint8 - -const ( - UnknownCTags CTagsParserType = iota - NoCTags - UniversalCTags - ScipCTags -) - -type LanguageMap = map[string]CTagsParserType - -func ParserToString(parser CTagsParserType) string { - switch parser { - case UnknownCTags: - return "unknown" - case NoCTags: - return "no" - case UniversalCTags: - return "universal" - case ScipCTags: - return "scip" - default: - panic("Reached impossible CTagsParserType state") - } -} - -func StringToParser(str string) CTagsParserType { - switch str { - case "no": - return NoCTags - case "universal": - return UniversalCTags - case "scip": - return ScipCTags - default: - return UniversalCTags - } -} - -type ParserMap map[CTagsParserType]Parser -type ParserBinMap map[CTagsParserType]string - -func NewParserMap(bins ParserBinMap, languageMap LanguageMap, cTagsMustSucceed bool) (ParserMap, error) { - parsers := make(ParserMap) - - requiredTypes := []CTagsParserType{UniversalCTags} - for _, parserType := range languageMap { - if parserType == ScipCTags { - requiredTypes = append(requiredTypes, ScipCTags) - break - } - } - - for _, parserType := range requiredTypes { - bin := bins[parserType] - if bin == "" && cTagsMustSucceed { - return nil, fmt.Errorf("ctags binary not found for %s parser type", ParserToString(parserType)) - } else { - parser, err := NewParser(parserType, bin) - if err != nil && cTagsMustSucceed { - return nil, fmt.Errorf("ctags.NewParserMap: %v", err) - } - - parsers[parserType] = parser - } - } - - return parsers, nil -}