-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1178 from nyaruka/input_cleaning_take2
Input cleaning take 2
- Loading branch information
Showing
69 changed files
with
403 additions
and
243 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package envs | ||
|
||
import ( | ||
"strings" | ||
|
||
"github.com/nyaruka/gocommon/stringsx" | ||
"golang.org/x/text/unicode/norm" | ||
) | ||
|
||
// Collation identifies the strategy used to normalize input text before it is compared.
type Collation string

// Supported input collations.
const (
	// CollationDefault compares strings case-insensitively.
	CollationDefault Collation = "default"

	// CollationConfusables additionally reduces strings to their confusables skeleton so that
	// visually similar characters compare as equal.
	CollationConfusables Collation = "confusables"

	// CollationArabicFarsi additionally treats Arabic digits and letters as equal to their
	// Farsi counterparts.
	CollationArabicFarsi Collation = "arabic_farsi"
)
|
||
// collateTransformer converts a string into the form used for comparison under a collation.
type collateTransformer func(string) string
|
||
// arabicToFarsi maps Arabic digits and letters to the Farsi code points they are treated as
// equal to. Each comment reads "source > replacement".
//
// See https://en.wikipedia.org/wiki/Persian_alphabet#Deviations_from_the_Arabic_script
var arabicToFarsi = map[rune]rune{
	'٠': '۰', // U+0660 > U+06F0 (0)
	'١': '۱', // U+0661 > U+06F1 (1)
	'٢': '۲', // U+0662 > U+06F2 (2)
	'٣': '۳', // U+0663 > U+06F3 (3)
	'٤': '۴', // U+0664 > U+06F4 (4)
	'٥': '۵', // U+0665 > U+06F5 (5)
	'٦': '۶', // U+0666 > U+06F6 (6)
	'٧': '۷', // U+0667 > U+06F7 (7)
	'٨': '۸', // U+0668 > U+06F8 (8)
	'٩': '۹', // U+0669 > U+06F9 (9)
	'ى': 'ی', // U+0649 > U+06CC (alef maksura)
	'ي': 'ی', // U+064A > U+06CC (yeh)
	'ك': 'ک', // U+0643 > U+06A9 (kāf)
}
|
||
var transformers = map[Collation]collateTransformer{ | ||
CollationDefault: func(s string) string { | ||
return strings.ToLower(s) | ||
}, | ||
CollationConfusables: func(s string) string { | ||
return strings.ToLower(stringsx.Skeleton(s)) | ||
}, | ||
CollationArabicFarsi: func(s string) string { | ||
return strings.ToLower(replaceRunes(norm.NFKD.String(s), arabicToFarsi)) | ||
}, | ||
} | ||
|
||
// CollateEquals returns true if the given strings are equal in the given environment's collation | ||
func CollateEquals(env Environment, s, t string) bool { | ||
return CollateTransform(env, s) == CollateTransform(env, t) | ||
} | ||
|
||
// CollateTransform transforms the given string into it's form to be used for collation. | ||
func CollateTransform(env Environment, s string) string { | ||
return transformers[env.InputCollation()](s) | ||
} | ||
|
||
// replaceRunes returns a copy of s in which every rune that is a key in mapping has been
// replaced by the corresponding value. Runes not present in mapping are copied unchanged.
func replaceRunes(s string, mapping map[rune]rune) string {
	var sb strings.Builder
	sb.Grow(len(s)) // output is usually about the same size as the input

	for _, r := range s {
		if repl, found := mapping[r]; found {
			r = repl
		}
		sb.WriteRune(r)
	}
	return sb.String()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
package envs_test | ||
|
||
import ( | ||
"strconv" | ||
"testing" | ||
|
||
"github.com/nyaruka/goflow/envs" | ||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestCollation(t *testing.T) { | ||
|
||
tcs := []struct { | ||
collation envs.Collation | ||
input string | ||
transform string | ||
equals map[string]bool | ||
}{ | ||
{envs.CollationDefault, "AbcD", "abcd", map[string]bool{ | ||
"acde": false, | ||
"aBCd": true, | ||
}}, | ||
{envs.CollationConfusables, "𝕟𝔂𝛼𝐫ᴜ𝞳𝕒", "nyaruka", map[string]bool{ | ||
"trileet": false, | ||
"Nyaruka": true, | ||
"𝒩ɣaruka": true, | ||
}}, | ||
{envs.CollationArabicFarsi, "٠١٢٣٤٥٦۷٨٩", "۰۱۲۳۴۵۶۷۸۹", map[string]bool{ | ||
"٤٥٦۷": false, | ||
"٠١٢٣٤٥٦۷٨٩": true, | ||
"۰۱۲۳۴۵۶۷۸۹": true, | ||
}}, | ||
{envs.CollationArabicFarsi, "\u0628\u0644\u06CC", "\u0628\u0644\u06CC", map[string]bool{ // ends with farsi yeh (unchanged) | ||
"\u0628\u0644": false, | ||
"\u0628\u0644\u0649": true, // ends with alef maksura | ||
"\u0628\u0644\u064A": true, // ends with arabic yeh | ||
}}, | ||
{envs.CollationArabicFarsi, "\u0628\u0644\u0649", "\u0628\u0644\u06CC", map[string]bool{ // ends with alef maksura | ||
"\u0628\u0644\u06CC": true, // ends with farsi yeh | ||
"\u0628\u0644\u064A": true, // ends with arabic yeh | ||
}}, | ||
{envs.CollationArabicFarsi, "\u0628\u0644\u064A", "\u0628\u0644\u06CC", map[string]bool{ // ends with arabic yeh | ||
"\u0628\u0644\u06CC": true, // ends with farsi yeh | ||
"\u0628\u0644\u0649": true, // ends with alef maksura | ||
}}, | ||
{envs.CollationArabicFarsi, "\u0643\u0627\u0641", "\u06A9\u0627\u0641", map[string]bool{ // starts with arabic kaf | ||
"\u0643\u0627\u0641": true, // starts with arabic kaf | ||
"\u06A9\u0627\u0641": true, // starts with farsi kaf | ||
"\uFEDB\u0627\u0641": true, // starts with explicit initial form kaf | ||
}}, | ||
{envs.CollationArabicFarsi, "YES", "yes", map[string]bool{"yes": true, "no": false}}, | ||
} | ||
|
||
for _, tc := range tcs { | ||
env := envs.NewBuilder().WithInputCollation(tc.collation).Build() | ||
|
||
assert.Equal(t, tc.transform, envs.CollateTransform(env, tc.input), "%s transform mismatch for input %s (%s)", | ||
tc.collation, strconv.QuoteToASCII(tc.input), strconv.QuoteToASCII(tc.transform)) | ||
|
||
for eqStr, eqResult := range tc.equals { | ||
assert.Equal(t, eqResult, envs.CollateEquals(env, tc.input, eqStr)) | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.