-
Notifications
You must be signed in to change notification settings - Fork 3
/
normalizer.go
64 lines (54 loc) · 1.58 KB
/
normalizer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
package assocentity
import (
"strings"
"github.com/ndabAP/assocentity/v14/tokenize"
)
// Normalizer normalizes tokens like lower casing them to increase the overall
// token quality. It maps a token to a canonical replacement token.
// NOTE(review): callers such as Normalize appear to assume a normalized token
// is a fixed point (normalizing twice gives the same result) — confirm when
// writing custom implementations.
type Normalizer func(tokenize.Token) tokenize.Token
// HumanReadableNormalizer normalizes tokens through lower casing them and
// replacing them with their synonyms. Note: It assumes English as input
// language
var HumanReadableNormalizer Normalizer = func(tok tokenize.Token) tokenize.Token {
	res := tokenize.Token{
		PoS:  tok.PoS,
		Text: strings.ToLower(tok.Text),
	}
	// Substitute well-known symbols with their word equivalent; the match is
	// done on the original (pre-lowercasing) text. Extending this table can
	// increase the result data quality
	if tok.Text == "&" {
		res.Text = "and"
	}
	return res
}
// Normalize normalizes the keys of dists in place with the provided
// normalizer. When several tokens normalize to the same token, their
// distance slices are merged under the single normalized key.
func Normalize(dists map[tokenize.Token][]float64, norm Normalizer) {
	for tok, d := range dists {
		t := norm(tok)
		// Nothing to do if normalization left the token unchanged
		if t == tok {
			continue
		}
		// Merge with any distances already collected under the normalized
		// token. Appending to the existing entry (not to d, which aliases
		// dists[tok]) avoids both duplicating d and dropping prior values.
		if existing, ok := dists[t]; ok {
			dists[t] = append(existing, d...)
		} else {
			dists[t] = d
		}
		delete(dists, tok)
		// NOTE(review): inserting dists[t] while ranging means the new key
		// may be revisited; that is harmless as long as norm is idempotent.
	}
}
// Threshold excludes results that are below the given threshold. The threshold
// is described through the amount of distances per token relative to the total
// amount of tokens
func Threshold(dists map[tokenize.Token][]float64, threshold float64) {
	// Capture the token count up front so deletions below don't shift the
	// denominator mid-loop
	total := float64(len(dists))
	for tok, distances := range dists {
		// Percentage share of this token's distances among all tokens
		share := 100 * float64(len(distances)) / total
		if share < threshold {
			delete(dists, tok)
		}
	}
}