Skip to content

Commit

Permalink
Fix support for user-defined tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
vikesh-raj committed Jan 21, 2021
1 parent d8a7484 commit 1fff6b0
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 2 deletions.
2 changes: 1 addition & 1 deletion cmd/dumpspm/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func dumpWords(filename string) error {
count := 0
for i, piece := range model.GetPieces() {
word := piece.GetPiece()
- fmt.Println(word, piece.GetScore(), i)
+ fmt.Println(word, piece.GetScore(), "(", piece.GetType(), ")", i)
count++
}

Expand Down
2 changes: 1 addition & 1 deletion sentencepiece/sentencepiece_proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ func NewSentencepieceFromFile(filename string, lowercase bool) (Sentencepiece, e
typ := piece.GetType()
word := piece.GetPiece()
switch typ {
- case ModelProto_SentencePiece_NORMAL:
+ case ModelProto_SentencePiece_NORMAL, ModelProto_SentencePiece_USER_DEFINED:
s.insert(word, piece.GetScore(), int32(i))
case ModelProto_SentencePiece_UNKNOWN:
s.SetUnknownIndex(int32(i))
Expand Down
9 changes: 9 additions & 0 deletions sentencepiece/sentencepiece_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,15 @@ func TestTokenizationSPM(t *testing.T) {
{ID: 103, Text: "n"},
{ID: 1333, Text: "ized"},
}},
+ {text: ".", tokens: []Token{{ID: 13, Text: "▁"}, {ID: 9, Text: "."}}},
+ {text: "this is a dot .", tokens: []Token{
+ {ID: 48, Text: "▁this"},
+ {ID: 25, Text: "▁is"},
+ {ID: 21, Text: "▁a"},
+ {ID: 14123, Text: "▁dot"},
+ {ID: 13, Text: "▁"},
+ {ID: 9, Text: "."},
+ }},
{text: "compose email to john saying i will be running late to office today because i am not feeling well, my head is aching and in the body add shall we meet next week and when we go to the office lets reach by around 10 am and go for a movie in the evening, may be Spiderman which seems to be a very good movie which got 5 star review from rottentomatoes and imdb", tokens: []Token{
{ID: 18217, Text: "▁compose"},
{ID: 8517, Text: "▁email"},
Expand Down

0 comments on commit 1fff6b0

Please sign in to comment.