forked from hermanschaaf/cedict
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cedict.go
293 lines (258 loc) · 7.94 KB
/
cedict.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
// Copyright 2014 Herman Schaaf. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package cedict provides a parser / tokenizer for reading entries from the CEDict
Chinese dictionary project.

Tokenizing is done by creating a CEDict for an io.Reader r. It is the
caller's responsibility to ensure that r provides a CEDict-formatted dictionary.

	import "github.com/hermanschaaf/cedict"
	...
	c := cedict.New(r) // r is an io.Reader to the cedict file

Given a CEDict c, the dictionary is tokenized by repeatedly calling c.NextEntry(),
which parses until it reaches the next entry, or returns an error if no more
entries are found:

	for {
		err := c.NextEntry()
		if err != nil {
			break
		}
		entry := c.Entry()
		fmt.Println(entry.Simplified, entry.Definitions[0])
	}

To retrieve the current entry, the Entry method can be called. There is also
a lower-level API available, using the bufio.Scanner Scan method. Using this
lower-level API is the recommended way to read comments from the CEDict, should
that be necessary.
*/
package cedict
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"regexp"
"strings"
)
// Token types stored in CEDict.TokenType by the split function that New
// installs on the scanner.
const (
// EntryToken is set when the scanned data does not start with '#'.
EntryToken = iota
// CommentToken is set when the scanned data starts with '#'.
CommentToken
// ErrorToken is declared with the other token types; nothing in this file
// assigns it.
ErrorToken
)
// CEDict is the basic tokenizer struct we use to read and parse
// new dictionary instances. It embeds a *bufio.Scanner, so the scanner's
// methods (Scan, Text, Err, ...) can be called directly on a CEDict.
type CEDict struct {
*bufio.Scanner
// TokenType is EntryToken or CommentToken, as set by the split function
// installed in New each time the scanner invokes it.
TokenType int
// entry is the Entry stored by the last successful NextEntry call.
entry *Entry
}
// Entry represents a single entry in the cedict dictionary.
// All fields are filled in by parseEntry.
type Entry struct {
// Simplified is the second whitespace-delimited field of the entry line.
Simplified string
// Traditional is the first whitespace-delimited field of the entry line.
Traditional string
// Pinyin is the raw text found between the square brackets, e.g. "yi1 zhi1".
Pinyin string
// PinyinWithTones is the result of ToPinyinTonemarks(Pinyin).
PinyinWithTones string
// PinyinNoTones is the result of pinyinNoTones(Pinyin).
PinyinNoTones string
// Definitions is the definition section of the line split on "/".
Definitions []string
}
// consumeComment reads from the data byte slice until a new line is found.
// It returns the number of bytes to advance (including the newline), a copy
// of the line's bytes without the newline, and a nil error. This is done in
// accordance with the SplitFunc type defined in bufio.
//
// If no newline is present and atEOF is false it returns (0, nil, nil) to ask
// the scanner for more data. If atEOF is true the remaining data is returned
// in full as the final line.
func consumeComment(data []byte, atEOF bool) (int, []byte, error) {
	for i, b := range data {
		if b == '\n' {
			// Copy so the token does not alias the scanner's buffer; appending
			// nothing to a nil slice keeps an empty line as a nil token.
			return i + 1, append([]byte(nil), data[:i]...), nil
		}
	}
	if atEOF {
		// Last line of the input without a trailing newline: keep every byte,
		// including the final one.
		return len(data), append([]byte(nil), data...), nil
	}
	// No complete line yet; request more data.
	return 0, nil, nil
}
// consumeEntry scans data for the first newline and hands back the bytes in
// front of it as the token, advancing past the newline. The line is returned
// raw; no attempt is made to parse it as a dictionary entry.
//
// When no newline is present, the remaining data is returned as the token if
// atEOF is set; otherwise (0, nil, nil) is returned to request more data.
// An empty line produces a nil token.
func consumeEntry(data []byte, atEOF bool) (int, []byte, error) {
	for pos := 0; pos < len(data); pos++ {
		if data[pos] != '\n' {
			continue
		}
		// Appending to a nil slice yields a fresh copy, or nil when the
		// line is empty.
		return pos + 1, append([]byte(nil), data[:pos]...), nil
	}
	if !atEOF {
		return 0, nil, nil
	}
	return len(data), append([]byte(nil), data...), nil
}
// New takes an io.Reader and creates a new CEDict instance.
//
// The returned CEDict wraps a bufio.Scanner over r with a custom split
// function: data starting with '#' is consumed as a comment line and
// anything else as an entry line. The split function records which kind it
// saw in c.TokenType.
func New(r io.Reader) *CEDict {
	s := bufio.NewScanner(r)
	c := &CEDict{
		Scanner: s,
	}
	// splitFunc defines how we split our tokens
	splitFunc := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
		if len(data) == 0 {
			// bufio.Scanner may call the split function with an empty slice
			// once the reader is exhausted; indexing data[0] would panic.
			// Returning no token lets the scanner finish cleanly.
			return 0, nil, nil
		}
		if data[0] == '#' {
			c.TokenType = CommentToken
			return consumeComment(data, atEOF)
		}
		c.TokenType = EntryToken
		return consumeEntry(data, atEOF)
	}
	s.Split(splitFunc)
	return c
}
// toneMarkTable lists, for every pinyin vowel, its spelling for tones 0-5.
// Index 0 (no tone given) and index 5 (neutral tone) are the bare vowel.
var toneMarkTable = map[string][]string{
	"a": {"a", "ā", "á", "ǎ", "à", "a"},
	"e": {"e", "ē", "é", "ě", "è", "e"},
	"i": {"i", "ī", "í", "ǐ", "ì", "i"},
	"o": {"o", "ō", "ó", "ǒ", "ò", "o"},
	"u": {"u", "ū", "ú", "ǔ", "ù", "u"},
	"ü": {"ü", "ǖ", "ǘ", "ǚ", "ǜ", "ü"},
}

// toneLookupTable returns a lookup table to replace a specified tone number with
// its appropriate UTF-8 character with tone marks. The keys are the lower-case
// vowels a, e, i, o, u and ü; the values are those vowels carrying the mark
// for the requested tone.
//
// It returns an error if tone is outside the range 0-5.
func toneLookupTable(tone int) (map[string]string, error) {
	if tone < 0 || tone > 5 {
		// %d, not %i: Go has no %i verb, so the tone value was never printed.
		return nil, fmt.Errorf("Tried to create tone lookup table with tone %d", tone)
	}
	toneLookup := make(map[string]string, len(toneMarkTable))
	for vowel, marked := range toneMarkTable {
		toneLookup[vowel] = marked[tone]
	}
	return toneLookup, nil
}
// extractTone splits the tone number and the pinyin syllable returning a string
// and an integer, e.g., dong1 => dong, 1.
//
// If p is empty or does not end in one of the digits 0-5, p is returned
// unchanged together with tone 0.
func extractTone(p string) (string, int) {
	if p == "" {
		// Nothing to index; avoids a panic on p[len(p)-1].
		return p, 0
	}
	last := p[len(p)-1]
	if last < '0' || last > '5' {
		return p, 0
	}
	return p[:len(p)-1], int(last - '0')
}
// replaceWithToneMark returns the UTF-8 representation of a pinyin syllable with
// the appropriate tone, e.g., dong1 => dōng, using the pinyin accent placement rules:
//
//  1. if the syllable contains an "a", the "a" takes the mark;
//  2. otherwise, if it contains an "e", the "e" takes the mark;
//  3. otherwise, if it contains "ou", the "o" takes the mark;
//  4. otherwise the last of the vowels i, ü, o, u takes the mark.
//
// Vowels are matched case-insensitively and the marked vowel keeps the case of
// the original, so capitalised syllables such as "An" or "Ou" are handled
// (An, 1 => Ān).
//
// It returns an error if tone is rejected by toneLookupTable or if s contains
// no vowel that can carry a mark.
func replaceWithToneMark(s string, tone int) (string, error) {
	lookup, err := toneLookupTable(tone)
	if err != nil {
		return "", err
	}

	// Lower-case every rune individually so that lower[i] always corresponds
	// to runes[i], whatever the byte widths are.
	runes := []rune(s)
	lower := make([]string, len(runes))
	for i, r := range runes {
		lower[i] = strings.ToLower(string(r))
	}
	folded := strings.Join(lower, "")

	target := "" // vowel to mark wherever it occurs (rules 1-3)
	last := -1   // rune index of the single vowel to mark (rule 4)
	switch {
	case strings.Contains(folded, "a"):
		target = "a"
	case strings.Contains(folded, "e"):
		target = "e"
	case strings.Contains(folded, "ou"):
		target = "o"
	default:
		for i, l := range lower {
			switch l {
			case "i", "ü", "o", "u":
				last = i
			}
		}
		if last == -1 {
			return "", fmt.Errorf("No tone match")
		}
	}

	var out bytes.Buffer
	for i, r := range runes {
		if lower[i] != target && i != last {
			out.WriteRune(r)
			continue
		}
		marked := lookup[lower[i]]
		if lower[i] != string(r) {
			// The original vowel was upper case; keep it that way.
			marked = strings.ToUpper(marked)
		}
		out.WriteString(marked)
	}
	return out.String(), nil
}
// ToPinyinTonemarks takes a CEDICT pinyin representation and returns the concatenated
// pinyin version with tone marks, e.g., yi1 lan3 zi5 => yīlǎnzi. This function
// is useful for customizing pinyin conversion for your own application. For example,
// if you wish to get the tone pinyin of each character, you may pass in each
// section of the original word separately, as in yi1 => yī, lan3 => lǎn, zi5 => zi.
//
// The CEDICT spelling "u:" is converted to "ü" before processing. Syllables
// are separated by whitespace; empty input yields an empty string. If any
// syllable cannot be converted (replaceWithToneMark returns an error, e.g.
// because the syllable contains no vowel), the empty string is returned.
func ToPinyinTonemarks(p string) string {
	pv := strings.Replace(p, "u:", "ü", -1)
	var out bytes.Buffer
	// Fields, unlike Split(pv, " "), never yields empty syllables for empty
	// input or repeated/leading/trailing spaces.
	for _, pySyllable := range strings.Fields(pv) {
		pyNoTone, tone := extractTone(pySyllable)
		pyWithTone, err := replaceWithToneMark(pyNoTone, tone)
		if err != nil {
			return ""
		}
		out.WriteString(pyWithTone)
	}
	return out.String()
}
// pinyinNoTones takes a CEDICT pinyin representation and returns the concatenated
// pinyin version without tone marks, e.g., yi1 lan3 zi5 => yilanzi
// This representation is useful for building a search interface to the CEDICT database
// for user pinyin input.
// Note: This substitutes the more common search term "v" for "ü"
//
// Syllables are separated by whitespace; empty input yields an empty string.
func pinyinNoTones(p string) string {
	pv := strings.Replace(p, "u:", "v", -1)
	var out bytes.Buffer
	// Fields, unlike Split(pv, " "), never yields empty syllables for empty
	// input or repeated/leading/trailing spaces.
	for _, pySyllable := range strings.Fields(pv) {
		pyNoTone, _ := extractTone(pySyllable)
		out.WriteString(pyNoTone)
	}
	return out.String()
}
// reEntry matches a CEDict entry line. It captures, as named groups, the
// traditional form (trad), the simplified form (simp), the text between the
// square brackets (pinyin) and the text between the first and last slash of
// the definition section (defs). The pattern is not anchored.
var reEntry = regexp.MustCompile(`(?P<trad>\S*?) (?P<simp>\S*?) \[(?P<pinyin>.+)\] \/(?P<defs>.+)\/`)
// parseEntry turns one CEDict line into an Entry. Lines look like:
// 一之為甚 一之为甚 [yi1 zhi1 wei2 shen4] /Once is enough (idiom)/
//
// The line is matched against reEntry; a line that does not match yields a
// "Badly formatted entry" error. On success the returned Entry carries the
// captured fields, the definitions split on "/", and the two derived pinyin
// spellings.
func parseEntry(s string) (*Entry, error) {
	groups := reEntry.FindStringSubmatch(s)
	if groups == nil {
		return nil, fmt.Errorf("Badly formatted entry: %v", s)
	}

	// Index the captured text by group name; position 0 is the whole match
	// and unnamed groups carry no meaning here.
	captured := make(map[string]string)
	for pos, key := range reEntry.SubexpNames() {
		if pos > 0 && key != "" {
			captured[key] = groups[pos]
		}
	}

	entry := &Entry{
		Simplified:  captured["simp"],
		Traditional: captured["trad"],
		Pinyin:      captured["pinyin"],
	}
	if defs, found := captured["defs"]; found {
		entry.Definitions = strings.Split(defs, "/")
	}
	entry.PinyinWithTones = ToPinyinTonemarks(entry.Pinyin)
	entry.PinyinNoTones = pinyinNoTones(entry.Pinyin)
	return entry, nil
}
// NoMoreEntries is the error returned by NextEntry when scanning stops
// without a scanner error and without finding another entry token.
var NoMoreEntries error = errors.New("No more entries to read")
// NextEntry advances the scanner until it produces an entry token, skipping
// any other tokens on the way. The entry line is parsed and, on success,
// stored so that it can be retrieved with Entry; nil is returned.
//
// If the line cannot be parsed, the parse error is returned and the stored
// entry is left untouched. When scanning stops, the scanner's error is
// returned if there is one, otherwise NoMoreEntries.
func (c *CEDict) NextEntry() error {
	for c.Scan() {
		if c.TokenType != EntryToken {
			continue
		}
		parsed, err := parseEntry(c.Text())
		if err == nil {
			c.entry = parsed
		}
		return err
	}
	if scanErr := c.Err(); scanErr != nil {
		return scanErr
	}
	return NoMoreEntries
}
// Entry returns a pointer to the most recently parsed Entry struct, i.e. the
// one stored by the last successful NextEntry call. It returns nil if no
// entry has been stored yet.
func (c *CEDict) Entry() *Entry {
return c.entry
}