Skip to content

Commit

Permalink
Indexing: use one ctags process per shard (#702)
Browse files Browse the repository at this point in the history
Currently, we use a single ctags process for indexing an entire repository.
Even though we build shards in parallel, they all share the same (single
threaded) ctags process. Since ctags is one of the most expensive parts of
shard building, this creates a bottleneck that can really slow down indexing.

This change proposes to launch a new ctags process per shard. For
`sgtest/megarepo`, this speeds up indexing by almost 2x (enabling scip-ctags
and setting `-parallelism=4`):
* Before: took 4 min 48 sec to index repo
* After: took 2 min 30 sec to index repo
  • Loading branch information
jtibshirani authored Nov 22, 2023
1 parent d982320 commit 5c51794
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 153 deletions.
17 changes: 7 additions & 10 deletions build/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,8 @@ type Builder struct {
docChecker zoekt.DocChecker
size int

parserMap ctags.ParserMap
building sync.WaitGroup
parserFactory ctags.ParserFactory
building sync.WaitGroup

errMu sync.Mutex
buildError error
Expand Down Expand Up @@ -563,20 +563,17 @@ func NewBuilder(opts Options) (*Builder, error) {
finishedShards: map[string]string{},
}

parserMap, err := ctags.NewParserMap(
ctags.ParserBinMap{
ctags.UniversalCTags: b.opts.CTagsPath,
ctags.ScipCTags: b.opts.ScipCTagsPath,
},
parserFactory, err := ctags.NewParserFactory(
b.opts.CTagsPath,
b.opts.ScipCTagsPath,
opts.LanguageMap,
b.opts.CTagsMustSucceed,
)

if err != nil {
return nil, err
}

b.parserMap = parserMap
b.parserFactory = parserFactory

b.shardLogger = &lumberjack.Logger{
Filename: filepath.Join(opts.IndexDir, "zoekt-builder-shard-log.tsv"),
Expand Down Expand Up @@ -1021,7 +1018,7 @@ func sortDocuments(todo []*zoekt.Document) {

func (b *Builder) buildShard(todo []*zoekt.Document, nextShardNum int) (*finishedShard, error) {
if !b.opts.DisableCTags && (b.opts.CTagsPath != "" || b.opts.ScipCTagsPath != "") {
err := ctagsAddSymbolsParserMap(todo, b.opts.LanguageMap, b.parserMap)
err := parseSymbols(todo, b.opts.LanguageMap, b.parserFactory)
if b.opts.CTagsMustSucceed && err != nil {
return nil, err
}
Expand Down
16 changes: 12 additions & 4 deletions build/ctags.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,14 @@ func normalizeLanguage(filetype string) string {
return normalized
}

func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserMap ctags.ParserMap) error {
func parseSymbols(todo []*zoekt.Document, languageMap ctags.LanguageMap, parserFactory ctags.ParserFactory) error {
monitor := newMonitor()
defer monitor.Stop()

var tagsToSections tagsToSections

parsers := make(map[ctags.CTagsParserType]ctags.Parser)

for _, doc := range todo {
if len(doc.Content) == 0 || doc.Symbols != nil {
continue
Expand All @@ -65,10 +67,16 @@ func ctagsAddSymbolsParserMap(todo []*zoekt.Document, languageMap ctags.Language
parserKind = ctags.UniversalCTags
}

parser := parserMap[parserKind]
parser := parsers[parserKind]
if parser == nil {
// this happens if CTagsMustSucceed is false and we didn't find the binary
continue
// Spin up a new parser for this parser kind
parser = parserFactory.NewParser(parserKind)
if parser == nil {
// this happens if CTagsMustSucceed is false and we didn't find the binary
continue
}
parsers[parserKind] = parser
defer parser.Close()
}

monitor.BeginParsing(doc)
Expand Down
8 changes: 6 additions & 2 deletions build/ctags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,18 +257,22 @@ func BenchmarkTagsToSections(b *testing.B) {
requireCTags(b)

file, err := os.ReadFile("./testdata/large_file.cc")
parser, err := ctags.NewParser(ctags.UniversalCTags, "universal-ctags")
if err != nil {
b.Fatal(err)
}

var tagsToSections tagsToSections
factory, err := ctags.NewParserFactory("universal-ctags", "", ctags.LanguageMap{}, true)
if err != nil {
b.Fatal(err)
}

parser := factory.NewParser(ctags.UniversalCTags)
entries, err := parser.Parse("./testdata/large_file.cc", file)
if err != nil {
b.Fatal(err)
}

var tagsToSections tagsToSections
secs, _, err := tagsToSections.Convert(file, entries)
if err != nil {
b.Fatal(err)
Expand Down
46 changes: 0 additions & 46 deletions ctags/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,7 @@
package ctags

import (
"bytes"
"fmt"
"log"
"os"
"os/exec"
"strings"
"sync"
"time"

Expand Down Expand Up @@ -118,44 +113,3 @@ func (lp *lockedParser) close() {
lp.send = nil
lp.recv = nil
}

// NewParser creates a parser that is implemented by the given
// universal-ctags binary. The parser is safe for concurrent use.
func NewParser(parserType CTagsParserType, bin string) (Parser, error) {
if err := checkBinary(parserType, bin); err != nil {
return nil, err
}

opts := goctags.Options{
Bin: bin,
}
if debug {
opts.Info = log.New(os.Stderr, "CTAGS INF: ", log.LstdFlags)
opts.Debug = log.New(os.Stderr, "CTAGS DBG: ", log.LstdFlags)
}
return &lockedParser{
opts: opts,
}, nil
}

// checkBinary does checks on bin to ensure we can correctly use the binary
// for symbols. It is more user friendly to fail early in this case.
func checkBinary(typ CTagsParserType, bin string) error {
switch typ {
case UniversalCTags:
helpOutput, err := exec.Command(bin, "--help").CombinedOutput()
if err != nil {
return fmt.Errorf("failed to check if %s is universal-ctags: %w\n--help output:\n%s", bin, err, string(helpOutput))
}
if !bytes.Contains(helpOutput, []byte("+interactive")) {
return fmt.Errorf("ctags binary is not universal-ctags or is not compiled with +interactive feature: bin=%s", bin)
}

case ScipCTags:
if !strings.Contains(bin, "scip-ctags") {
return fmt.Errorf("only supports scip-ctags, not %s", bin)
}
}

return nil
}
5 changes: 3 additions & 2 deletions ctags/json_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ func TestJSON(t *testing.T) {
t.Skip(err)
}

p, err := NewParser(UniversalCTags, "universal-ctags")
factory, err := NewParserFactory("universal-ctags", "", LanguageMap{}, true)
if err != nil {
t.Fatal("newProcess", err)
t.Fatal(err)
}

p := factory.NewParser(UniversalCTags)
defer p.Close()

java := `
Expand Down
133 changes: 133 additions & 0 deletions ctags/parser_factory.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright 2017 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ctags

import (
"bytes"
"fmt"
"log"
"os"
"os/exec"
"strings"

goctags "github.com/sourcegraph/go-ctags"
)

type CTagsParserType uint8

const (
UnknownCTags CTagsParserType = iota
NoCTags
UniversalCTags
ScipCTags
)

type LanguageMap = map[string]CTagsParserType

func ParserToString(parser CTagsParserType) string {
switch parser {
case UnknownCTags:
return "unknown"
case NoCTags:
return "no"
case UniversalCTags:
return "universal"
case ScipCTags:
return "scip"
default:
panic("Reached impossible CTagsParserType state")
}
}

func StringToParser(str string) CTagsParserType {
switch str {
case "no":
return NoCTags
case "universal":
return UniversalCTags
case "scip":
return ScipCTags
default:
return UniversalCTags
}
}

type ParserFactory map[CTagsParserType]string

func NewParserFactory(
ctagsPath string,
scipCTagsPath string,
languageMap LanguageMap,
cTagsMustSucceed bool,
) (ParserFactory, error) {
validBins := make(map[CTagsParserType]string)
requiredBins := map[CTagsParserType]string{UniversalCTags: ctagsPath}
for _, parserType := range languageMap {
if parserType == ScipCTags {
requiredBins[ScipCTags] = scipCTagsPath
break
}
}

for parserType, bin := range requiredBins {
if bin == "" && cTagsMustSucceed {
return nil, fmt.Errorf("ctags binary not found for %s parser type", ParserToString(parserType))
}
if err := checkBinary(parserType, bin); err != nil && cTagsMustSucceed {
return nil, fmt.Errorf("ctags.NewParserFactory: %v", err)
}
validBins[parserType] = bin
}

return validBins, nil
}

// checkBinary does checks on bin to ensure we can correctly use the binary
// for symbols. It is more user friendly to fail early in this case.
func checkBinary(typ CTagsParserType, bin string) error {
switch typ {
case UniversalCTags:
helpOutput, err := exec.Command(bin, "--help").CombinedOutput()
if err != nil {
return fmt.Errorf("failed to check if %s is universal-ctags: %w\n--help output:\n%s", bin, err, string(helpOutput))
}
if !bytes.Contains(helpOutput, []byte("+interactive")) {
return fmt.Errorf("ctags binary is not universal-ctags or is not compiled with +interactive feature: bin=%s", bin)
}

case ScipCTags:
if !strings.Contains(bin, "scip-ctags") {
return fmt.Errorf("only supports scip-ctags, not %s", bin)
}
}

return nil
}

// NewParser creates a parser that is implemented by the given
// ctags binary. The parser is safe for concurrent use.
func (p ParserFactory) NewParser(typ CTagsParserType) Parser {
bin := p[typ]
if bin == "" {
return nil
}

opts := goctags.Options{Bin: bin}
if debug {
opts.Info = log.New(os.Stderr, "CTAGS INF: ", log.LstdFlags)
opts.Debug = log.New(os.Stderr, "CTAGS DBG: ", log.LstdFlags)
}
return &lockedParser{opts: opts}
}
Loading

0 comments on commit 5c51794

Please sign in to comment.