From aa050d9af9d85b2b71828c0fa871b086a5a3c8b4 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Sun, 26 Nov 2023 19:14:24 -0800 Subject: [PATCH] Search: add index shard concurrency to site config (#58514) In sourcegraph/zoekt#702, we updated indexserver to parse symbols in parallel by spawning a new ctags process per shard. By default, indexing uses all available CPUs to create shards in parallel, so now it will create many more processes than before. As a safeguard, we're exposing a site config setting to reduce the indexing concurrency. It's not intended to be set by users, but will let us experiment and make sure the defaults are solid. As part of this change, I bumped the Zoekt dependency to pull in the change to IndexOptions. --- deps.bzl | 4 ++-- go.mod | 2 +- go.sum | 4 ++-- internal/search/backend/index_options.go | 5 +++++ internal/search/backend/index_options_test.go | 21 +++++++++++++++++++ schema/schema.go | 3 +++ schema/site.schema.json | 8 ++++++- 7 files changed, 41 insertions(+), 6 deletions(-) diff --git a/deps.bzl b/deps.bzl index baef898ed72a..a458d4dcb8f4 100644 --- a/deps.bzl +++ b/deps.bzl @@ -6030,8 +6030,8 @@ def go_dependencies(): name = "com_github_sourcegraph_zoekt", build_file_proto_mode = "disable_global", importpath = "github.com/sourcegraph/zoekt", - sum = "h1:Pn9dVnAOZ7z8p+4BH32G1U9XtMCBVATy8vAq0ObHFdo=", - version = "v0.0.0-20231121165958-0959170c1623", + sum = "h1:2xAotLrNXGdj1x8I5yPh89qsesICseLEfEdKpmY3V90=", + version = "v0.0.0-20231122214222-d982320abe7b", ) go_repository( diff --git a/go.mod b/go.mod index d68028a03ca1..2ed003d069f9 100644 --- a/go.mod +++ b/go.mod @@ -569,7 +569,7 @@ require ( github.com/sourcegraph/managed-services-platform-cdktf/gen/postgresql v0.0.0-20231121191755-214be625af21 github.com/sourcegraph/mountinfo v0.0.0-20231018142932-e00da332dac5 github.com/sourcegraph/sourcegraph/monitoring v0.0.0-20230124144931-b2d81b1accb6 - github.com/sourcegraph/zoekt v0.0.0-20231121165958-0959170c1623 + github.com/sourcegraph/zoekt v0.0.0-20231122214222-d982320abe7b github.com/spf13/cobra v1.7.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.5.0 // indirect diff --git a/go.sum b/go.sum index 812e5d186e2c..a15843a3b02c 100644 --- a/go.sum +++ b/go.sum @@ -1631,8 +1631,8 @@ github.com/sourcegraph/tiktoken-go v0.0.0-20230905173153-caab340cf008 h1:Wu8W50q github.com/sourcegraph/tiktoken-go v0.0.0-20230905173153-caab340cf008/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= github.com/sourcegraph/yaml v1.0.1-0.20200714132230-56936252f152 h1:z/MpntplPaW6QW95pzcAR/72Z5TWDyDnSo0EOcyij9o= github.com/sourcegraph/yaml v1.0.1-0.20200714132230-56936252f152/go.mod h1:GIjDIg/heH5DOkXY3YJ/wNhfHsQHoXGjl8G8amsYQ1I= -github.com/sourcegraph/zoekt v0.0.0-20231121165958-0959170c1623 h1:Pn9dVnAOZ7z8p+4BH32G1U9XtMCBVATy8vAq0ObHFdo= -github.com/sourcegraph/zoekt v0.0.0-20231121165958-0959170c1623/go.mod h1:WVDDy51tFgeKy8zXtujTSbqzgyJrqhrLC9sjWiEfAII= +github.com/sourcegraph/zoekt v0.0.0-20231122214222-d982320abe7b h1:2xAotLrNXGdj1x8I5yPh89qsesICseLEfEdKpmY3V90= +github.com/sourcegraph/zoekt v0.0.0-20231122214222-d982320abe7b/go.mod h1:WVDDy51tFgeKy8zXtujTSbqzgyJrqhrLC9sjWiEfAII= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v0.0.0-20170901052352-ee1bd8ee15a1/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= diff --git a/internal/search/backend/index_options.go b/internal/search/backend/index_options.go index ccb619ae690f..5cd580af1e5a 100644 --- a/internal/search/backend/index_options.go +++ b/internal/search/backend/index_options.go @@ -58,6 +58,8 @@ type ZoektIndexOptions struct { Error string `json:",omitempty"` LanguageMap map[string]ctags_config.ParserType + + ShardConcurrency int32 `json:",omitempty"` } func (o *ZoektIndexOptions) FromProto(p *proto.ZoektIndexOptions) { @@ -87,6 +89,7 @@ func (o *ZoektIndexOptions) FromProto(p *proto.ZoektIndexOptions) { languageMap[entry.Language] = uint8(entry.Ctags.Number()) } o.LanguageMap = languageMap + o.ShardConcurrency = p.GetShardConcurrency() } func (o *ZoektIndexOptions) ToProto() *proto.ZoektIndexOptions { @@ -116,6 +119,7 @@ func (o *ZoektIndexOptions) ToProto() *proto.ZoektIndexOptions { DocumentRanksVersion: o.DocumentRanksVersion, Error: o.Error, LanguageMap: languageMap, + ShardConcurrency: o.ShardConcurrency, } } @@ -212,6 +216,7 @@ func getIndexOptions( DocumentRanksVersion: opts.DocumentRanksVersion, LanguageMap: ctags_config.CreateEngineMap(*c), + ShardConcurrency: int32(c.SearchIndexShardConcurrency), } // Set of branch names. Always index HEAD diff --git a/internal/search/backend/index_options_test.go b/internal/search/backend/index_options_test.go index 339a23abd658..2ff79d4ac8ee 100644 --- a/internal/search/backend/index_options_test.go +++ b/internal/search/backend/index_options_test.go @@ -440,3 +440,24 @@ func TestGetIndexOptions_batch(t *testing.T) { t.Fatal("mismatch (-want, +got):\n", diff) } } +func TestGetIndexOptions_concurrency(t *testing.T) { + repos := []api.RepoID{1, 2, 3} + getRepoIndexOptions := func(repo api.RepoID) (*RepoIndexOptions, error) { + return &RepoIndexOptions{ + GetVersion: func(branch string) (string, error) { + return fmt.Sprintf("!%s-%d", branch, repo), nil + }, + }, nil + } + getSearchContextRevs := func(api.RepoID) ([]string, error) { return nil, nil } + + wantConcurrency := 27 + config := &schema.SiteConfiguration{SearchIndexShardConcurrency: wantConcurrency} + options := GetIndexOptions(config, getRepoIndexOptions, getSearchContextRevs, repos...) + + for _, got := range options { + if wantConcurrency != int(got.ShardConcurrency) { + t.Fatalf("wrong shard concurrency, want: %d, got: %d", wantConcurrency, got.ShardConcurrency) + } + } +} diff --git a/schema/schema.go b/schema/schema.go index bf6383331ede..5de0e1b80d37 100644 --- a/schema/schema.go +++ b/schema/schema.go @@ -2767,6 +2767,8 @@ type SiteConfiguration struct { ScimAuthToken string `json:"scim.authToken,omitempty"` // ScimIdentityProvider description: Identity provider used for SCIM support. "STANDARD" should be used unless a more specific value is available ScimIdentityProvider string `json:"scim.identityProvider,omitempty"` + // SearchIndexShardConcurrency description: The number of threads each indexserver should use to index shards. If not set, indexserver will use the number of available CPUs. This is exposed as a safeguard and should usually not require being set. + SearchIndexShardConcurrency int `json:"search.index.shardConcurrency,omitempty"` // SearchIndexSymbolsEnabled description: Whether indexed symbol search is enabled. This is contingent on the indexed search configuration, and is true by default for instances with indexed search enabled. Enabling this will cause every repository to re-index, which is a time consuming (several hours) operation. Additionally, it requires more storage and ram to accommodate the added symbols information in the search index. SearchIndexSymbolsEnabled *bool `json:"search.index.symbols.enabled,omitempty"` // SearchLargeFiles description: A list of file glob patterns where matching files will be indexed and searched regardless of their size. Files still need to be valid utf-8 to be indexed. The glob pattern syntax can be found here: https://github.com/bmatcuk/doublestar#patterns. @@ -2936,6 +2938,7 @@ func (v *SiteConfiguration) UnmarshalJSON(data []byte) error { delete(m, "repoPurgeWorker") delete(m, "scim.authToken") delete(m, "scim.identityProvider") + delete(m, "search.index.shardConcurrency") delete(m, "search.index.symbols.enabled") delete(m, "search.largeFiles") delete(m, "search.limits") diff --git a/schema/site.schema.json b/schema/site.schema.json index 09ed90f5f81a..95dc4a3f2f93 100644 --- a/schema/site.schema.json +++ b/schema/site.schema.json @@ -16,6 +16,12 @@ "group": "Search", "examples": [true] }, + "search.index.shardConcurrency": { + "description": "The number of threads each indexserver should use to index shards. If not set, indexserver will use the number of available CPUs. This is exposed as a safeguard and should usually not require being set.", + "type": "integer", + "group": "Search", + "examples": ["10"] + }, "search.largeFiles": { "description": "A list of file glob patterns where matching files will be indexed and searched regardless of their size. Files still need to be valid utf-8 to be indexed. The glob pattern syntax can be found here: https://github.com/bmatcuk/doublestar#patterns.", "type": "array", @@ -29,7 +35,7 @@ "description": "(debug) controls the amount of symbol search parallelism. Defaults to 20. It is not recommended to change this outside of debugging scenarios. This option will be removed in a future version.", "type": "integer", "group": "Debug", - "examples": [["20"]] + "examples": ["20"] }, "cloneProgress.log": { "description": "Whether clone progress should be logged to a file. If enabled, logs are written to files in the OS default path for temporary files.",