Skip to content

Commit

Permalink
feat: Minimize allocations in SCIP symbol parsing.
Browse files Browse the repository at this point in the history
See PR for benchmarks.
  • Loading branch information
varungandhi-src committed Nov 30, 2024
1 parent 79fb777 commit 910753a
Show file tree
Hide file tree
Showing 26 changed files with 3,436 additions and 2,413 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golang.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ jobs:
- uses: ./.github/actions/asdf
with:
golang: true
- run: go test ./... -v
- run: go test ./... -v -tags asserts
2 changes: 1 addition & 1 deletion .tool-versions
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
golang 1.20.14
golang 1.22.0
nodejs 16.20.2
shellcheck 0.7.1
yarn 1.22.22
Expand Down
9 changes: 9 additions & 0 deletions bindings/go/scip/assertions.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
//go:build asserts

package scip

func assert(cond bool, msg string) {
if !cond {
panic(msg)
}
}
5 changes: 5 additions & 0 deletions bindings/go/scip/assertions_noop.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
//go:build !asserts

package scip

func assert(cond bool, msg string) {}
63 changes: 63 additions & 0 deletions bindings/go/scip/internal/compat_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package internal

import (
"io"
"os"
"path/filepath"
"sync/atomic"
"testing"

"github.com/sourcegraph/beaut"
"github.com/sourcegraph/beaut/lib/knownwf"
conciter "github.com/sourcegraph/conc/iter"
"github.com/sourcegraph/scip/bindings/go/scip"
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
"github.com/stretchr/testify/require"
"google.golang.org/protobuf/proto"
)

func TestParseCompat(t *testing.T) {
for _, path := range shared.SampleIndexes() {
t.Run(filepath.Base(path), func(t *testing.T) {
t.Parallel()
scipReader, err := os.Open(path)
require.Nil(t, err)
scipBytes, err := io.ReadAll(scipReader)
require.Nil(t, err)
scipIndex := scip.Index{}
require.NoError(t, proto.Unmarshal(scipBytes, &scipIndex))
var total atomic.Int64
conciter.ForEach(scipIndex.Documents, func(docPtr **scip.Document) {
document := *docPtr
if total.Load() > 1000*1000 {
return
}
total.Add(int64(len(document.Occurrences)))
var sym scip.Symbol
for i := 0; i < len(document.Occurrences); i++ {
occ := document.Occurrences[i]
old, err := ParsePartialSymbolV1ToBeDeleted(occ.Symbol, true)
require.NoError(t, err)
require.NotPanics(t, func() {
str := beaut.NewUTF8StringUnchecked(occ.Symbol, knownwf.UTF8DeserializedFromProtobufString)
err = scip.ParseSymbolUTF8With(str, scip.ParseSymbolOptions{
IncludeDescriptors: true,
RecordOutput: &sym,
})
}, "panic for symbol: %q", occ.Symbol)
require.NoError(t, err)
require.Equal(t, old.Scheme, sym.Scheme)
require.Equal(t, old.Package, sym.Package)
require.Equalf(t, len(old.Descriptors), len(sym.Descriptors), "symbol: %v, d1: %+v, d2: %+v", occ.Symbol,
old.Descriptors, sym.Descriptors)
for i, d := range old.Descriptors {
dnew := sym.Descriptors[i]
require.Equal(t, d.Name, dnew.Name, "symbol: %v", occ.Symbol)
require.Equal(t, d.Suffix, dnew.Suffix, "symbol: %v", occ.Symbol)
require.Equal(t, d.Disambiguator, dnew.Disambiguator, "symbol: %v", occ.Symbol)
}
}
})
})
}
}
243 changes: 243 additions & 0 deletions bindings/go/scip/internal/old_symbol_parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
package internal

import (
"fmt"
"strings"

"github.com/cockroachdb/errors"
"github.com/sourcegraph/scip/bindings/go/scip"
"github.com/sourcegraph/scip/bindings/go/scip/internal/shared"
)

func tryParseLocalSymbol(symbol string) (string, error) {
if !strings.HasPrefix(symbol, "local ") {
return "", nil
}
suffix := symbol[6:]
if len(suffix) > 0 && shared.IsSimpleIdentifier(suffix) {
return suffix, nil
}
return "", errors.Newf("expected format 'local <simple-identifier>' but got: %v", symbol)
}

// ParsePartialSymbolV1ToBeDeleted parses an SCIP string into the Symbol message
// with the option to exclude the `.Descriptor` field.
//
// Nov 30 2024: This is currently only present for benchmarking + compatibility
// reasons. We can remove this in the future once we're confident that the new
// parser handles everything correctly.
func ParsePartialSymbolV1ToBeDeleted(symbol string, includeDescriptors bool) (*scip.Symbol, error) {
// TODO: Rip out this, and call
local, err := tryParseLocalSymbol(symbol)
if err != nil {
return nil, err
}
if local != "" {
return newLocalSymbol(local), nil
}
s := newSymbolParser(symbol)
scheme, err := s.acceptSpaceEscapedIdentifier("scheme")
if err != nil {
return nil, err
}
manager, err := s.acceptSpaceEscapedIdentifier("package manager")
if err != nil {
return nil, err
}
if manager == "." {
manager = ""
}
packageName, err := s.acceptSpaceEscapedIdentifier("package name")
if err != nil {
return nil, err
}
if packageName == "." {
packageName = ""
}
packageVersion, err := s.acceptSpaceEscapedIdentifier("package version")
if err != nil {
return nil, err
}
if packageVersion == "." {
packageVersion = ""
}
var descriptors []*scip.Descriptor
if includeDescriptors {
descriptors, err = s.parseDescriptors()
}
return &scip.Symbol{
Scheme: scheme,
Package: &scip.Package{
Manager: manager,
Name: packageName,
Version: packageVersion,
},
Descriptors: descriptors,
}, err
}

func newLocalSymbol(id string) *scip.Symbol {
return &scip.Symbol{
Scheme: "local",
Descriptors: []*scip.Descriptor{
{Name: id, Suffix: scip.Descriptor_Local},
},
}
}

type symbolParser struct {
Symbol []rune
index int
SymbolString string
}

func newSymbolParser(symbol string) *symbolParser {
return &symbolParser{
SymbolString: symbol,
Symbol: []rune(symbol),
index: 0,
}
}

func (s *symbolParser) error(message string) error {
return errors.Newf("%s\n%s\n%s^", message, s.SymbolString, strings.Repeat("_", s.index))
}

func (s *symbolParser) current() rune {
if s.index < len(s.Symbol) {
return s.Symbol[s.index]
}
return '\x00'
}

func (s *symbolParser) peekNext() rune {
if s.index+1 < len(s.Symbol) {
return s.Symbol[s.index]
}
return 0
}

func (s *symbolParser) parseDescriptors() ([]*scip.Descriptor, error) {
var result []*scip.Descriptor
for s.index < len(s.Symbol) {
descriptor, err := s.parseDescriptor()
if err != nil {
return nil, err
}
result = append(result, descriptor)
}
return result, nil
}

func (s *symbolParser) parseDescriptor() (*scip.Descriptor, error) {
start := s.index
switch s.peekNext() {
case '(':
s.index++
name, err := s.acceptIdentifier("parameter name")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Parameter}, s.acceptCharacter(')', "closing parameter name")
case '[':
s.index++
name, err := s.acceptIdentifier("type parameter name")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_TypeParameter}, s.acceptCharacter(']', "closing type parameter name")
default:
name, err := s.acceptIdentifier("descriptor name")
if err != nil {
return nil, err
}
suffix := s.current()
s.index++
switch suffix {
case '(':
disambiguator := ""
if s.peekNext() != ')' {
disambiguator, err = s.acceptIdentifier("method disambiguator")
if err != nil {
return nil, err
}
}
err = s.acceptCharacter(')', "closing method")
if err != nil {
return nil, err
}
return &scip.Descriptor{Name: name, Disambiguator: disambiguator, Suffix: scip.Descriptor_Method}, s.acceptCharacter('.', "closing method")
case '/':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Namespace}, nil
case '.':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Term}, nil
case '#':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Type}, nil
case ':':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Meta}, nil
case '!':
return &scip.Descriptor{Name: name, Suffix: scip.Descriptor_Macro}, nil
default:
}
}

end := s.index
if s.index > len(s.Symbol) {
end = len(s.Symbol)
}
return nil, errors.Newf("unrecognized descriptor %q", string(s.Symbol[start:end]))
}

func (s *symbolParser) acceptIdentifier(what string) (string, error) {
if s.current() == '`' {
s.index++
return s.acceptBacktickEscapedIdentifier(what)
}
start := s.index
for s.index < len(s.Symbol) && shared.IsSimpleIdentifierCharacter(s.current()) {
s.index++
}
if start == s.index {
return "", s.error("empty identifier")
}
return string(s.Symbol[start:s.index]), nil
}

func (s *symbolParser) acceptSpaceEscapedIdentifier(what string) (string, error) {
return s.acceptEscapedIdentifier(what, ' ')
}

func (s *symbolParser) acceptBacktickEscapedIdentifier(what string) (string, error) {
return s.acceptEscapedIdentifier(what, '`')
}

func (s *symbolParser) acceptEscapedIdentifier(what string, escapeCharacter rune) (string, error) {
builder := strings.Builder{}
for s.index < len(s.Symbol) {
ch := s.current()
if ch == escapeCharacter {
s.index++
if s.index >= len(s.Symbol) {
break
}
if s.current() == escapeCharacter {
// Escaped space character.
builder.WriteRune(ch)
} else {
return builder.String(), nil
}
} else {
builder.WriteRune(ch)
}
s.index++
}
return "", s.error(fmt.Sprintf("reached end of symbol while parsing <%s>, expected a '%v' character", what, string(escapeCharacter)))
}

func (s *symbolParser) acceptCharacter(r rune, what string) error {
if s.current() == r {
s.index++
return nil
}
return s.error(fmt.Sprintf("expected '%v', obtained '%v', while parsing %v", string(r), string(s.current()), what))
}
37 changes: 37 additions & 0 deletions bindings/go/scip/internal/shared/sample_indexes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package shared

import (
"fmt"
"os"
"path"
"path/filepath"
"slices"
"strings"
)

// SampleIndexes returns a list of paths to SCIP indexes for testing/benchmarking.
func SampleIndexes() []string {
workDir, err := os.Getwd()
if err != nil {
panic(fmt.Sprintf("failed to get working directory: %v", err))
}
components := strings.Split(workDir, "/")
idx := slices.Index(components, "scip")
if idx == -1 {
panic(fmt.Sprintf("failed to locate scip directory in working directory: %q", workDir))
}
indexesDir := filepath.Join("/", filepath.Join(components[:idx+1]...), "dev", "sample_indexes")
dirEntries, err := os.ReadDir(indexesDir)
if err != nil {
wd, _ := os.Getwd()
panic(fmt.Sprintf("failed to locate %q: %v - not running benchmark proper directory? (working directory = %s)",
indexesDir, err.Error(), wd))
}
out := []string{}
for _, entry := range dirEntries {
if strings.HasSuffix(entry.Name(), ".scip") {
out = append(out, path.Join(indexesDir, entry.Name()))
}
}
return out
}
17 changes: 17 additions & 0 deletions bindings/go/scip/internal/shared/shared.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Package shared has functions used both by the old parser and the new parser.
package shared

func IsSimpleIdentifierCharacter(c rune) bool {
return c == '_' || c == '+' || c == '-' || c == '$' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || ('0' <= c && c <= '9')
}

// IsSimpleIdentifier matches the definition of <simple-identifier> in scip.proto.
func IsSimpleIdentifier(s string) bool {
for _, c := range s {
if IsSimpleIdentifierCharacter(c) {
continue
}
return false
}
return true
}
Loading

0 comments on commit 910753a

Please sign in to comment.