add String.index and String.count, fix grapheme boundary functions

onflow · Jul 9, 2024 · a369c56 · a369c56
1 parent 1fcafe0
commit a369c56
Show file tree

Hide file tree

Showing 5 changed files with 486 additions and 29 deletions.
diff --git a/runtime/interpreter/value.go b/runtime/interpreter/value.go
@@ -1246,8 +1246,11 @@ var EmptyString = NewUnmeteredStringValue("")
 
 func (v *StringValue) Slice(from IntValue, to IntValue, locationRange LocationRange) Value {
 	fromIndex := from.ToInt(locationRange)
-
 	toIndex := to.ToInt(locationRange)
+	return v.slice(fromIndex, toIndex, locationRange)
+}
+
+func (v *StringValue) slice(fromIndex int, toIndex int, locationRange LocationRange) *StringValue {
 
 	length := v.Length()
 
@@ -1394,6 +1397,40 @@ func (v *StringValue) GetMember(interpreter *Interpreter, locationRange Location
 			},
 		)
 
+	case sema.StringTypeIndexFunctionName:
+		return NewBoundHostFunctionValue(
+			interpreter,
+			v,
+			sema.StringTypeIndexFunctionType,
+			func(invocation Invocation) Value {
+				other, ok := invocation.Arguments[0].(*StringValue)
+				if !ok {
+					panic(errors.NewUnreachableError())
+				}
+
+				return v.IndexOf(invocation.Interpreter, other)
+			},
+		)
+
+	case sema.StringTypeCountFunctionName:
+		return NewBoundHostFunctionValue(
+			interpreter,
+			v,
+			sema.StringTypeIndexFunctionType,
+			func(invocation Invocation) Value {
+				other, ok := invocation.Arguments[0].(*StringValue)
+				if !ok {
+					panic(errors.NewUnreachableError())
+				}
+
+				return v.Count(
+					invocation.Interpreter,
+					invocation.LocationRange,
+					other,
+				)
+			},
+		)
+
 	case sema.StringTypeDecodeHexFunctionName:
 		return NewBoundHostFunctionValue(
 			interpreter,
@@ -1703,36 +1740,59 @@ func (v *StringValue) ForEach(
 	}
 }
 
-func (v *StringValue) IsBoundaryStart(start int) bool {
+func (v *StringValue) IsGraphemeBoundaryStart(startOffset int) bool {
 	v.prepareGraphemes()
-	return v.isGraphemeBoundaryStartPrepared(start)
+
+	var characterIndex int
+	return v.seekGraphemeBoundaryStartPrepared(startOffset, &characterIndex)
 }
 
-func (v *StringValue) isGraphemeBoundaryStartPrepared(start int) bool {
+func (v *StringValue) seekGraphemeBoundaryStartPrepared(startOffset int, characterIndex *int) bool {
 
-	for {
-		boundaryStart, _ := v.graphemes.Positions()
-		if start == boundaryStart {
-			return true
-		} else if boundaryStart > start {
-			return false
+	for ; v.graphemes.Next(); *characterIndex++ {
+
+		boundaryStart, boundaryEnd := v.graphemes.Positions()
+		if boundaryStart == boundaryEnd {
+			// Graphemes.Positions() should never return a zero-length grapheme,
+			// and only does so if the grapheme iterator
+			// - is at the beginning of the string and has not been initialized (i.e. Next() has not been called); or
+			// - is at the end of the string and has been exhausted (i.e. Next() has returned false)
+			panic(errors.NewUnreachableError())
 		}
 
-		if !v.graphemes.Next() {
+		if startOffset == boundaryStart {
+			return true
+		} else if boundaryStart > startOffset {
 			return false
 		}
 	}
+
+	return false
 }
 
-func (v *StringValue) IsBoundaryEnd(end int) bool {
+func (v *StringValue) IsGraphemeBoundaryEnd(end int) bool {
 	v.prepareGraphemes()
+	v.graphemes.Next()
+
 	return v.isGraphemeBoundaryEndPrepared(end)
 }
 
 func (v *StringValue) isGraphemeBoundaryEndPrepared(end int) bool {
+	// Empty strings have no grapheme clusters, and therefore no boundaries
+	if len(v.Str) == 0 {
+		return false
+	}
 
 	for {
-		_, boundaryEnd := v.graphemes.Positions()
+		boundaryStart, boundaryEnd := v.graphemes.Positions()
+		if boundaryStart == boundaryEnd {
+			// Graphemes.Positions() should never return a zero-length grapheme,
+			// and only does so if the grapheme iterator
+			// - is at the beginning of the string and has not been initialized (i.e. Next() has not been called); or
+			// - is at the end of the string and has been exhausted (i.e. Next() has returned false)
+			panic(errors.NewUnreachableError())
+		}
+
 		if end == boundaryEnd {
 			return true
 		} else if boundaryEnd > end {
@@ -1745,30 +1805,110 @@ func (v *StringValue) isGraphemeBoundaryEndPrepared(end int) bool {
 	}
 }
 
-func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue {
+func (v *StringValue) IndexOf(inter *Interpreter, other *StringValue) IntValue {
+	index := v.indexOf(inter, other)
+	return NewIntValueFromInt64(inter, int64(index))
+}
+
+func (v *StringValue) indexOf(inter *Interpreter, other *StringValue) int {
+
+	if len(other.Str) == 0 {
+		return 0
+	}
 
 	// Meter computation as if the string was iterated.
 	// This is a conservative over-estimation.
 	inter.ReportComputation(common.ComputationKindLoop, uint(len(v.Str)*len(other.Str)))
 
 	v.prepareGraphemes()
 
-	for start := 0; start < len(v.Str); start++ {
+	// We are dealing with two different positions / indices / measures:
+	// - 'CharacterIndex' indicates Cadence characters (grapheme clusters)
+	// - 'ByteOffset' indicates bytes
+
+	// The resulting index, in terms of Cadence characters (grapheme clusters)
+	var characterIndex int
+
+	// Find the position of the substring in the string,
+	// by using strings.Index with an increasing start byte offset.
+	//
+	// The byte offset returned from strings.Index is the start of the substring in the string,
+	// but it may not be at a grapheme boundary, so we need to check
+	// that both the start and end byte offsets are grapheme boundaries.
+	//
+	// We do not have a way to translate a byte offset into a character index.
+	// Instead, we iterate over the grapheme clusters until we reach the byte offset,
+	// keeping track of the character index.
+	//
+	// We need to back up and restore the grapheme iterator and character index
+	// when either the start or the end byte offset are not grapheme boundaries,
+	// so the next iteration can start from the correct position.
+
+	for searchStartByteOffset := 0; searchStartByteOffset < len(v.Str); searchStartByteOffset++ {
 
-		start = strings.Index(v.Str[start:], other.Str)
-		if start < 0 {
+		relativeFoundByteOffset := strings.Index(v.Str[searchStartByteOffset:], other.Str)
+		if relativeFoundByteOffset < 0 {
 			break
 		}
 
-		if v.isGraphemeBoundaryStartPrepared(start) &&
-			v.isGraphemeBoundaryEndPrepared(start+len(other.Str)) {
+		// The resulting found byte offset is relative to the search start byte offset,
+		// so we need to add the search start byte offset to get the absolute byte offset
+		absoluteFoundByteOffset := searchStartByteOffset + relativeFoundByteOffset
+
+		// Back up the grapheme iterator and character index,
+		// so the iteration state can be restored
+		// in case the byte offset is not at a grapheme boundary
+		graphemesBackup := *v.graphemes
+		characterIndexBackup := characterIndex
 
-			return TrueValue
+		if v.seekGraphemeBoundaryStartPrepared(absoluteFoundByteOffset, &characterIndex) &&
+			v.isGraphemeBoundaryEndPrepared(absoluteFoundByteOffset+len(other.Str)) {
+
+			return characterIndex
 		}
+
+		// Restore the grapheme iterator and character index
+		v.graphemes = &graphemesBackup
+		characterIndex = characterIndexBackup
 	}
 
-	return FalseValue
+	return -1
+}
+
+func (v *StringValue) Contains(inter *Interpreter, other *StringValue) BoolValue {
+	return AsBoolValue(v.indexOf(inter, other) >= 0)
+}
+
+func (v *StringValue) Count(inter *Interpreter, locationRange LocationRange, other *StringValue) IntValue {
+	index := v.count(inter, locationRange, other)
+	return NewIntValueFromInt64(inter, int64(index))
+}
+
+func (v *StringValue) count(inter *Interpreter, locationRange LocationRange, other *StringValue) int {
+	if other.Length() == 0 {
+		return 1 + v.Length()
+	}
+
+	// Meter computation as if the string was iterated.
+	inter.ReportComputation(common.ComputationKindLoop, uint(len(v.Str)))
+
+	remaining := v
+	count := 0
+
+	for {
+		index := remaining.indexOf(inter, other)
+		if index == -1 {
+			return count
+		}
+
+		count++
 
+		remaining = remaining.slice(
+			index+other.Length(),
+			remaining.Length(),
+			locationRange,
+		)
+	}
 }
 
 type StringValueIterator struct {

diff --git a/runtime/interpreter/value_test.go b/runtime/interpreter/value_test.go
@@ -4392,7 +4392,7 @@ func TestValue_ConformsToStaticType(t *testing.T) {
 
 }
 
-func TestStringIsBoundaryStart(t *testing.T) {
+func TestStringIsGraphemeBoundaryStart(t *testing.T) {
 
 	t.Parallel()
 
@@ -4402,11 +4402,11 @@ func TestStringIsBoundaryStart(t *testing.T) {
 
 		t.Run(name, func(t *testing.T) {
 			str := NewUnmeteredStringValue(s)
-			assert.Equal(t, expected, str.IsBoundaryStart(i))
+			assert.Equal(t, expected, str.IsGraphemeBoundaryStart(i))
 		})
 	}
 
-	test("", 0, true)
+	test("", 0, false)
 	test("a", 0, true)
 	test("a", 1, false)
 	test("ab", 1, true)
@@ -4433,7 +4433,7 @@ func TestStringIsBoundaryStart(t *testing.T) {
 	test(flagESflagEE, 15, false)
 }
 
-func TestStringIsBoundaryEnd(t *testing.T) {
+func TestStringIsGraphemeBoundaryEnd(t *testing.T) {
 
 	t.Parallel()
 
@@ -4443,19 +4443,19 @@ func TestStringIsBoundaryEnd(t *testing.T) {
 
 		t.Run(name, func(t *testing.T) {
 			str := NewUnmeteredStringValue(s)
-			assert.Equal(t, expected, str.IsBoundaryEnd(i))
+			assert.Equal(t, expected, str.IsGraphemeBoundaryEnd(i))
 		})
 	}
 
-	test("", 0, true)
-	test("a", 0, true)
+	test("", 0, false)
+	test("a", 0, false)
 	test("a", 1, true)
 	test("ab", 1, true)
 
 	// 🇪🇸🇪🇪 ("ES", "EE")
 	flagESflagEE := "\U0001F1EA\U0001F1F8\U0001F1EA\U0001F1EA"
 	require.Len(t, flagESflagEE, 16)
-	test(flagESflagEE, 0, true)
+	test(flagESflagEE, 0, false)
 	test(flagESflagEE, 1, false)
 	test(flagESflagEE, 2, false)
 	test(flagESflagEE, 3, false)
@@ -4472,4 +4472,7 @@ func TestStringIsBoundaryEnd(t *testing.T) {
 	test(flagESflagEE, 13, false)
 	test(flagESflagEE, 14, false)
 	test(flagESflagEE, 15, false)
+
+	test(flagESflagEE, 16, true)
+
 }
diff --git a/runtime/sema/string_type.go b/runtime/sema/string_type.go
@@ -129,6 +129,18 @@ func init() {
 				StringTypeContainsFunctionType,
 				stringTypeContainsFunctionDocString,
 			),
+			NewUnmeteredPublicFunctionMember(
+				t,
+				StringTypeIndexFunctionName,
+				StringTypeIndexFunctionType,
+				stringTypeIndexFunctionDocString,
+			),
+			NewUnmeteredPublicFunctionMember(
+				t,
+				StringTypeCountFunctionName,
+				StringTypeCountFunctionType,
+				stringTypeCountFunctionDocString,
+			),
 		})
 	}
 }
@@ -194,6 +206,46 @@ const stringTypeContainsFunctionDocString = `
 Returns true if this string contains the given other string as a substring.
 `
 
+var StringTypeIndexFunctionType = NewSimpleFunctionType(
+	FunctionPurityView,
+	[]Parameter{
+		{
+			Label:          "of",
+			Identifier:     "other",
+			TypeAnnotation: StringTypeAnnotation,
+		},
+	},
+	IntTypeAnnotation,
+)
+
+const StringTypeIndexFunctionName = "index"
+
+const stringTypeIndexFunctionDocString = `
+Returns the index within this string of the first occurrence of the given substring.
+
+If the substring is not found, the function returns -1.
+`
+
+var StringTypeCountFunctionType = NewSimpleFunctionType(
+	FunctionPurityView,
+	[]Parameter{
+		{
+			Label:          ArgumentLabelNotRequired,
+			Identifier:     "other",
+			TypeAnnotation: StringTypeAnnotation,
+		},
+	},
+	IntTypeAnnotation,
+)
+
+const StringTypeCountFunctionName = "count"
+
+const stringTypeCountFunctionDocString = `
+Returns the number of non-overlapping instances of the given substring in this string.
+
+If the given substring is an empty string, the function returns 1 + the number of characters in this string.
+`
+
 const StringTypeReplaceAllFunctionName = "replaceAll"
 const StringTypeReplaceAllFunctionDocString = `
 Returns a new string after replacing all the occurrences of parameter ` + "`of` with the parameter `with`" + `.