refined implementation and fixed perf issue
susanhuhu committed Jun 16, 2023
1 parent e0cacaa commit 594b02c
Showing 5 changed files with 115 additions and 132 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -4,8 +4,8 @@ sentencepiece/sentencepiece_model.pb.go: sentencepiece/sentencepiece_model.proto
	protoc --go_out=. $<


-cmd/dumpspm/dumpspm: cmd/dumpspm/main.go
-	cd cmd/dumpspm && go build
+cmd: cmd/main.go
+	cd cmd && go build

test:
	go test -cover -coverprofile=c.out ./sentencepiece && go tool cover -html=c.out -o coverage.html
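With this change the demo binary is built from the consolidated cmd/main.go. Since go build names its output after the directory, `make cmd && ./cmd/cmd 8` builds and runs the harness below with eight goroutines.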
54 changes: 54 additions & 0 deletions cmd/main.go
@@ -0,0 +1,54 @@
package main

import (
	"fmt"
	"os"
	"strconv"
	"sync"
	"time"

	"github.com/susanhuhu/go-sentencepiece-encoder/sentencepiece"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Println("Please provide an integer argument.")
		return
	}

	// Get the argument from the command-line
	arg := os.Args[1]

	// Parse the argument as an integer
	count, _ := strconv.Atoi(arg)

	sp, err := sentencepiece.NewSentencepieceFromFile("../sentencepiece/test_data/spm1.model", false)
	if err != nil {
		panic(fmt.Sprintf("Unable to create sentencepiece: %v", err))
	}

	// Create a wait group to wait for all goroutines to finish
	var wg sync.WaitGroup

	// Set the number of goroutines to wait for
	wg.Add(count)

	data := ""
	// Launch count goroutines
	for i := 0; i < count; i++ {
		go func(i int) {
			for j := 0; j < 1; j++ {
				// Call the method
				start := time.Now()
				sp.TokenizeToOffsets(data)
				latency := time.Since(start)
				fmt.Printf("%d:%d: tokenize %d data used %v \n", i, j, len(data), latency)
			}
			// Notify the wait group that this goroutine has finished
			wg.Done()
		}(i)
	}

	// Wait for all goroutines to finish
	wg.Wait()
}
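For context, a minimal single-call sketch of the public API this harness exercises. Only NewSentencepieceFromFile and TokenizeToOffsets are taken from the code above; the assumption that TokenizeToOffsets returns a slice of TokenOffset values follows from makeTokens in sentencepiece.go, and the boolean flag is presumed to be the lowercase switch used throughout the package:

```go
package main

import (
	"fmt"
	"log"

	"github.com/susanhuhu/go-sentencepiece-encoder/sentencepiece"
)

func main() {
	// Second argument presumed to toggle lowercasing (see s.lowercase).
	sp, err := sentencepiece.NewSentencepieceFromFile("sentencepiece/test_data/spm1.model", false)
	if err != nil {
		log.Fatalf("unable to create sentencepiece: %v", err)
	}

	// Assumed to return []TokenOffset; the harness above discards the result.
	offsets := sp.TokenizeToOffsets("hello world")
	fmt.Printf("got %d token offsets: %+v\n", len(offsets), offsets)
}
```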
56 changes: 0 additions & 56 deletions sentencepiece/normalize.go

This file was deleted.

114 changes: 55 additions & 59 deletions sentencepiece/sentencepiece.go
@@ -3,7 +3,6 @@ package sentencepiece
import (
	"fmt"
	"math"
-	"strings"
	"unicode"
	"unicode/utf8"
)
@@ -19,25 +18,6 @@ type slice struct {
	end int
}

-func findOffset(position int, q string) int {
-	count := 0
-	for i := range q {
-		if count == position {
-			return i
-		}
-		count++
-	}
-	return -1
-}
-
-func text(s slice, q string) string {
-	startOffset := findOffset(s.start, q)
-	endOffset := findOffset(s.end, q)
-	if startOffset == -1 || endOffset == -1 {
-		return ""
-	}
-	return q[startOffset:endOffset]
-}
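The two deleted helpers above are the likely source of the perf issue named in the commit message: findOffset rescans q from the start for every position, so text() costs a linear scan per slice and converting a whole token sequence back to byte offsets becomes quadratic. A minimal sketch of a single-pass alternative (buildOffsetTable is a hypothetical name, not from this repository):

```go
// buildOffsetTable maps rune positions to byte offsets in one pass:
// table[i] is the byte index where the i-th rune of q starts, with a
// final sentinel entry of len(q) so table[start:end] slicing works.
func buildOffsetTable(q string) []int {
	table := make([]int, 0, len(q)+1)
	for i := range q { // ranging a string yields byte indices of rune starts
		table = append(table, i)
	}
	return append(table, len(q))
}
```

With such a table, the old text(s, q) reduces to q[table[s.start]:table[s.end]]: two constant-time lookups instead of two linear scans.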

type trieNode struct {
	text  string
	level int
@@ -127,14 +107,27 @@ func (s *Sentencepiece) tokenizeToOffsets(runes []rune, adjustFirstPadding bool) {
}

func (s *Sentencepiece) prepareFortokenize(text string) []rune {
-	text = normalize(text)
-	if s.lowercase {
-		text = strings.ToLower(text)
-	}
+	runes := make([]rune, 0, len(text)+1)
+	first, _ := utf8.DecodeRuneInString(text)
+	if first != sep {
+		runes = append(runes, sep)
+	}
+
+	for _, r := range text {
+		if isControl(r) || r == 0 {
+			runes = append(runes, ' ')
+		} else if unicode.IsSpace(r) {
+			runes = append(runes, sep)
+		} else if s.lowercase {
+			runes = append(runes, unicode.ToLower(r))
+		} else {
+			runes = append(runes, r)
+		}
+	}
-	runes := torunes(text)
-	replaceWhiteSpace(runes)

	return runes
}
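The rewritten prepareFortokenize folds normalization, whitespace-to-separator replacement, and optional lowercasing into a single pass, replacing the old normalize/torunes/replaceWhiteSpace pipeline. A package-internal sketch of the expected behavior, assuming sep is the SentencePiece meta symbol '▁' (U+2581) and reusing the NewEmptySentencepiece constructor seen in the tests:

```go
func ExamplePrepareFortokenize() {
	sp := NewEmptySentencepiece(true) // true presumed to enable lowercasing
	runes := sp.prepareFortokenize("Hello\tWorld")
	// 'H' != sep, so a leading separator is prepended; the tab satisfies
	// unicode.IsSpace and becomes sep; letters are lowercased inline.
	fmt.Println(string(runes))
	// Assumed output: ▁hello▁world
}
```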

func (s *Sentencepiece) insert(word string, score float32, index int32) {
	_, size := utf8.DecodeLastRuneInString(word)
	charCount := len(word)
@@ -157,7 +150,7 @@ func (s *Sentencepiece) insert(word string, score float32, index int32) {
}

func (s *Sentencepiece) commonPrefixSearch(runes []rune) []trieNode {
-	output := make([]trieNode, 0, len(runes))
+	var output []trieNode
	node := &s.root
	for _, r := range runes {
		cnode, ok := node.children[r]
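The one-line change above is plausibly part of the perf fix: the old code pre-allocated a trieNode slice with capacity len(runes) on every commonPrefixSearch call, even though most prefix searches match only a handful of vocabulary entries; a nil slice defers allocation until append actually needs it.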
@@ -251,39 +244,6 @@ func (s *Sentencepiece) initSlices(len int) []slice {
	return slices
}

-func replaceWhiteSpace(runes []rune) {
-	for i, r := range runes {
-		if unicode.IsSpace(r) {
-			runes[i] = sep
-		}
-	}
-}
-
-func replaceSeperator(s string) string {
-	replacer := func(r rune) rune {
-		if r == sep {
-			return ' '
-		}
-		return r
-	}
-	return strings.Map(replacer, s)
-}
-
-func torunes(text string) []rune {
-	runes := make([]rune, 0, len(text)+1)
-
-	first, _ := utf8.DecodeRuneInString(text)
-	if first != sep {
-		runes = append(runes, sep)
-	}
-
-	for _, r := range text {
-		runes = append(runes, r)
-	}
-
-	return runes
-}

func makeTokens(offsets []TokenOffset, runes []rune) []Token {
	tokens := make([]Token, len(offsets))
	for i, offset := range offsets {
@@ -295,3 +255,39 @@ func makeTokens(offsets []TokenOffset, runes []rune) []Token {
func addChar(s string, r rune) string {
	return fmt.Sprintf("%s%c", s, r)
}

+func isControl(c rune) bool {
+	if c == ' ' || c == '\n' || c == '\r' || c == '\t' {
+		return false
+	}
+	if c <= 0x001F || (c >= 0x0080 && c <= 0x009F) ||
+		(c >= 0xE0020 && c <= 0xE007F) ||
+		(c >= 0xE000 && c <= 0xF8FF) ||
+		(c >= 0xF0000 && c <= 0xFFFFD) ||
+		(c >= 0x100000 && c <= 0x10FFFD) ||
+		(c >= 0xD800 && c <= 0xDB7F) ||
+		(c >= 0xDB80 && c <= 0xDBFF) ||
+		(c >= 0xDC00 && c <= 0xDFFF) ||
+		isControlChar(c) {
+		return true
+	}
+	return false
+}
+
+func isControlChar(c rune) bool {
+	controlChars := []rune{
+		0x007F, 0x00AD, 0x0600, 0x0601, 0x0602, 0x0603, 0x0604, 0x0605, 0x061C, 0x06DD, 0x070F,
+		0x08E2, 0x180E, 0x200B, 0x200C, 0x200D, 0x200E, 0x200F, 0x202A, 0x202B, 0x202C, 0x202D,
+		0x202E, 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x2066, 0x2067, 0x2068, 0x2069, 0x206A,
+		0x206B, 0x206C, 0x206D, 0x206E, 0x206F, 0xFEFF, 0xFFF9, 0xFFFA, 0xFFFB, 0x110BD,
+		0x110CD, 0x13430, 0x13431, 0x13432, 0x13433, 0x13434, 0x13435, 0x13436, 0x13437,
+		0x13438, 0x1BCA0, 0x1BCA1, 0x1BCA2, 0x1BCA3, 0x1D173, 0x1D174, 0x1D175, 0x1D176,
+		0x1D177, 0x1D178, 0x1D179, 0x1D17A, 0xE0001,
+	}
+	for _, ch := range controlChars {
+		if ch == c {
+			return true
+		}
+	}
+	return false
+}
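A few spot checks of the new classifier as a package-internal sketch (the wrapper function is mine; the expected values follow directly from the ranges and list above):

```go
func checkIsControl() {
	fmt.Println(isControl('\t'))     // false: tab is exempted up front as whitespace
	fmt.Println(isControl('\u200D')) // true: zero-width joiner is in controlChars
	fmt.Println(isControl('\uE001')) // true: falls in the private-use range U+E000..U+F8FF
}
```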
19 changes: 4 additions & 15 deletions sentencepiece/sentencepiece_test.go
@@ -349,23 +349,12 @@ func TestRunLengthchange(t *testing.T) {

func testRunLengthchange(t *testing.T, text string) {
	originalLen := len([]rune(text))
-	text = normalize(text)
-
-	lenAfterNorm := len([]rune(text))
-	if originalLen != lenAfterNorm {
-		t.Errorf("text length %d changed after normalization: %d", originalLen, lenAfterNorm)
-	}
-	runes := torunes(text)
-	padding := len(runes) - originalLen
-	lenAfterPadding := len(runes)
+	sp := NewEmptySentencepiece(true)
+	runes := sp.prepareFortokenize(text)
+	padding := len(runes) - originalLen
	if padding != 0 && padding != 1 {
		t.Errorf("padding should be 0 or 1")
	}
-	replaceWhiteSpace(runes)
-	if len(runes) != lenAfterPadding {
-		t.Errorf("replacing white space shouldn't change length")
-	}
}

func TestTokenOffset(t *testing.T) {
	spm, err := NewSentencepieceFromFile("test_data/spm1.model", false)
