Skip to content

Commit

Permalink
Merge pull request #1178 from nyaruka/input_cleaning_take2
Browse files Browse the repository at this point in the history
Input cleaning take 2
  • Loading branch information
rowanseymour authored Aug 18, 2023
2 parents c3696a4 + dba3fc1 commit ee18400
Show file tree
Hide file tree
Showing 69 changed files with 403 additions and 243 deletions.
2 changes: 1 addition & 1 deletion assets/location.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,5 @@ import "github.com/nyaruka/goflow/envs"
// @asset location
type LocationHierarchy interface {
FindByPath(path envs.LocationPath) *envs.Location
FindByName(name string, level envs.LocationLevel, parent *envs.Location) []*envs.Location
FindByName(env envs.Environment, name string, level envs.LocationLevel, parent *envs.Location) []*envs.Location
}
106 changes: 0 additions & 106 deletions envs/cleaner.go

This file was deleted.

46 changes: 0 additions & 46 deletions envs/cleaner_test.go

This file was deleted.

69 changes: 69 additions & 0 deletions envs/collate.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package envs

import (
"strings"

"github.com/nyaruka/gocommon/stringsx"
"golang.org/x/text/unicode/norm"
)

// Collation identifies the strategy used to transform input strings before they are compared.
type Collation string

// Supported collations. Each one is a key in the transformers map.
const (
// CollationDefault only lowercases the string.
CollationDefault Collation = "default"
// CollationConfusables reduces the string with stringsx.Skeleton and then lowercases it.
CollationConfusables Collation = "confusables"
// CollationArabicFarsi applies NFKD normalization, replaces runes using the arabicToFarsi
// table and then lowercases the result.
CollationArabicFarsi Collation = "arabic_farsi"
)

// collateTransformer is a function which converts a string into the form compared under a collation.
type collateTransformer func(string) string

// arabicToFarsi maps Arabic characters to the Farsi characters they are replaced with by the
// arabic_farsi collation. Each trailing comment reads "source code point > replacement code point".
//
// https://en.wikipedia.org/wiki/Persian_alphabet#Deviations_from_the_Arabic_script
var arabicToFarsi = map[rune]rune{
'٠': '۰', // U+0660 > U+06F0 (0)
'١': '۱', // U+0661 > U+06F1 (1)
'٢': '۲', // U+0662 > U+06F2 (2)
'٣': '۳', // U+0663 > U+06F3 (3)
'٤': '۴', // U+0664 > U+06F4 (4)
'٥': '۵', // U+0665 > U+06F5 (5)
'٦': '۶', // U+0666 > U+06F6 (6)
'٧': '۷', // U+0667 > U+06F7 (7)
'٨': '۸', // U+0668 > U+06F8 (8)
'٩': '۹', // U+0669 > U+06F9 (9)
'ى': 'ی', // U+0649 > U+06CC (alef maksura)
'ي': 'ی', // U+064A > U+06CC (yeh)
'ك': 'ک', // U+0643 > U+06A9 (kāf)
}

// transformers holds the transform function for each supported collation. Every transform
// finishes by lowercasing so that comparisons are case-insensitive.
var transformers = map[Collation]collateTransformer{
	// lowercasing is the whole transform, so the stdlib function is used directly
	CollationDefault: strings.ToLower,

	// reduce to the skeleton form first, then lowercase
	CollationConfusables: func(s string) string {
		skeleton := stringsx.Skeleton(s)
		return strings.ToLower(skeleton)
	},

	// decompose compatibility forms, swap Arabic characters for Farsi ones, then lowercase
	CollationArabicFarsi: func(s string) string {
		decomposed := norm.NFKD.String(s)
		farsi := replaceRunes(decomposed, arabicToFarsi)
		return strings.ToLower(farsi)
	},
}

// CollateEquals reports whether s and t are identical once each has been passed through
// CollateTransform for the given environment.
func CollateEquals(env Environment, s, t string) bool {
	left := CollateTransform(env, s)
	right := CollateTransform(env, t)
	return left == right
}

// CollateTransform transforms the given string into its form to be used for collation.
//
// The transform is selected by the environment's input collation. If that collation has no
// registered transformer (e.g. it is empty or an unrecognized value), the default collation's
// transformer is used instead of calling a nil function, which would panic.
func CollateTransform(env Environment, s string) string {
	transform, found := transformers[env.InputCollation()]
	if !found {
		transform = transformers[CollationDefault]
	}
	return transform(s)
}

// replaceRunes returns a copy of s in which every rune that is a key in mapping has been
// swapped for its mapped value. Runes without an entry are copied through unchanged.
func replaceRunes(s string, mapping map[rune]rune) string {
	var out strings.Builder
	for _, orig := range s {
		mapped, ok := mapping[orig]
		if !ok {
			mapped = orig
		}
		out.WriteRune(mapped)
	}
	return out.String()
}
64 changes: 64 additions & 0 deletions envs/collate_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package envs_test

import (
"strconv"
"testing"

"github.com/nyaruka/goflow/envs"
"github.com/stretchr/testify/assert"
)

// TestCollation checks, for each collation, that CollateTransform produces the expected
// transformed string and that CollateEquals agrees with the expected result for a set of
// comparison strings.
func TestCollation(t *testing.T) {

	tcs := []struct {
		collation envs.Collation
		input     string
		transform string          // expected result of CollateTransform(input)
		equals    map[string]bool // other string -> expected result of CollateEquals(input, other)
	}{
		{envs.CollationDefault, "AbcD", "abcd", map[string]bool{
			"acde": false,
			"aBCd": true,
		}},
		{envs.CollationConfusables, "𝕟𝔂𝛼𝐫ᴜ𝞳𝕒", "nyaruka", map[string]bool{
			"trileet": false,
			"Nyaruka": true,
			"𝒩ɣaruka": true,
		}},
		{envs.CollationArabicFarsi, "٠١٢٣٤٥٦۷٨٩", "۰۱۲۳۴۵۶۷۸۹", map[string]bool{
			"٤٥٦۷":       false,
			"٠١٢٣٤٥٦۷٨٩": true,
			"۰۱۲۳۴۵۶۷۸۹": true,
		}},
		{envs.CollationArabicFarsi, "\u0628\u0644\u06CC", "\u0628\u0644\u06CC", map[string]bool{ // ends with farsi yeh (unchanged)
			"\u0628\u0644":       false,
			"\u0628\u0644\u0649": true, // ends with alef maksura
			"\u0628\u0644\u064A": true, // ends with arabic yeh
		}},
		{envs.CollationArabicFarsi, "\u0628\u0644\u0649", "\u0628\u0644\u06CC", map[string]bool{ // ends with alef maksura
			"\u0628\u0644\u06CC": true, // ends with farsi yeh
			"\u0628\u0644\u064A": true, // ends with arabic yeh
		}},
		{envs.CollationArabicFarsi, "\u0628\u0644\u064A", "\u0628\u0644\u06CC", map[string]bool{ // ends with arabic yeh
			"\u0628\u0644\u06CC": true, // ends with farsi yeh
			"\u0628\u0644\u0649": true, // ends with alef maksura
		}},
		{envs.CollationArabicFarsi, "\u0643\u0627\u0641", "\u06A9\u0627\u0641", map[string]bool{ // starts with arabic kaf
			"\u0643\u0627\u0641": true, // starts with arabic kaf
			"\u06A9\u0627\u0641": true, // starts with farsi kaf
			"\uFEDB\u0627\u0641": true, // starts with explicit initial form kaf
		}},
		{envs.CollationArabicFarsi, "YES", "yes", map[string]bool{"yes": true, "no": false}},
	}

	for _, tc := range tcs {
		env := envs.NewBuilder().WithInputCollation(tc.collation).Build()

		assert.Equal(t, tc.transform, envs.CollateTransform(env, tc.input), "%s transform mismatch for input %s (%s)",
			tc.collation, strconv.QuoteToASCII(tc.input), strconv.QuoteToASCII(tc.transform))

		for eqStr, eqResult := range tc.equals {
			// quote to ASCII so that failures involving look-alike characters are unambiguous
			assert.Equal(t, eqResult, envs.CollateEquals(env, tc.input, eqStr), "%s equals mismatch for inputs %s and %s",
				tc.collation, strconv.QuoteToASCII(tc.input), strconv.QuoteToASCII(eqStr))
		}
	}
}
22 changes: 11 additions & 11 deletions envs/environment.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ type Environment interface {
AllowedLanguages() []Language
DefaultCountry() Country
NumberFormat() *NumberFormat
InputCollation() Collation
RedactionPolicy() RedactionPolicy
InputCleaners() []Cleaner

DefaultLanguage() Language
DefaultLocale() Locale
Expand All @@ -56,7 +56,7 @@ type environment struct {
defaultCountry Country
numberFormat *NumberFormat
redactionPolicy RedactionPolicy
inputCleaners []Cleaner
inputCollation Collation
}

func (e *environment) DateFormat() DateFormat { return e.dateFormat }
Expand All @@ -65,8 +65,8 @@ func (e *environment) Timezone() *time.Location { return e.timezone }
func (e *environment) AllowedLanguages() []Language { return e.allowedLanguages }
func (e *environment) DefaultCountry() Country { return e.defaultCountry }
func (e *environment) NumberFormat() *NumberFormat { return e.numberFormat }
func (e *environment) InputCollation() Collation { return e.inputCollation }
func (e *environment) RedactionPolicy() RedactionPolicy { return e.redactionPolicy }
func (e *environment) InputCleaners() []Cleaner { return e.inputCleaners }

// DefaultLanguage is the first allowed language
func (e *environment) DefaultLanguage() Language {
Expand Down Expand Up @@ -104,8 +104,8 @@ type envEnvelope struct {
AllowedLanguages []Language `json:"allowed_languages,omitempty" validate:"omitempty,dive,language"`
NumberFormat *NumberFormat `json:"number_format,omitempty"`
DefaultCountry Country `json:"default_country,omitempty" validate:"omitempty,country"`
InputCollation Collation `json:"input_collation"`
RedactionPolicy RedactionPolicy `json:"redaction_policy" validate:"omitempty,eq=none|eq=urns"`
InputCleaners []Cleaner `json:"input_cleaners,omitempty"`
}

// ReadEnvironment reads an environment from the given JSON
Expand All @@ -123,8 +123,8 @@ func ReadEnvironment(data json.RawMessage) (Environment, error) {
env.allowedLanguages = envelope.AllowedLanguages
env.defaultCountry = envelope.DefaultCountry
env.numberFormat = envelope.NumberFormat
env.inputCollation = envelope.InputCollation
env.redactionPolicy = envelope.RedactionPolicy
env.inputCleaners = envelope.InputCleaners

tz, err := time.LoadLocation(envelope.Timezone)
if err != nil {
Expand All @@ -143,8 +143,8 @@ func (e *environment) toEnvelope() *envEnvelope {
AllowedLanguages: e.allowedLanguages,
DefaultCountry: e.defaultCountry,
NumberFormat: e.numberFormat,
InputCollation: e.inputCollation,
RedactionPolicy: e.redactionPolicy,
InputCleaners: e.inputCleaners,
}
}

Expand Down Expand Up @@ -172,8 +172,8 @@ func NewBuilder() *EnvironmentBuilder {
allowedLanguages: nil,
defaultCountry: NilCountry,
numberFormat: DefaultNumberFormat,
inputCollation: CollationDefault,
redactionPolicy: RedactionPolicyNone,
inputCleaners: nil,
},
}
}
Expand Down Expand Up @@ -210,13 +210,13 @@ func (b *EnvironmentBuilder) WithNumberFormat(numberFormat *NumberFormat) *Envir
return b
}

func (b *EnvironmentBuilder) WithRedactionPolicy(redactionPolicy RedactionPolicy) *EnvironmentBuilder {
b.env.redactionPolicy = redactionPolicy
func (b *EnvironmentBuilder) WithInputCollation(col Collation) *EnvironmentBuilder {
b.env.inputCollation = col
return b
}

func (b *EnvironmentBuilder) WithInputCleaners(cs ...Cleaner) *EnvironmentBuilder {
b.env.inputCleaners = cs
func (b *EnvironmentBuilder) WithRedactionPolicy(redactionPolicy RedactionPolicy) *EnvironmentBuilder {
b.env.redactionPolicy = redactionPolicy
return b
}

Expand Down
4 changes: 3 additions & 1 deletion envs/environment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,13 @@ func TestEnvironmentMarshaling(t *testing.T) {
assert.Equal(t, []envs.Language{envs.Language("eng"), envs.Language("fra")}, env.AllowedLanguages())
assert.Equal(t, envs.Country("RW"), env.DefaultCountry())
assert.Equal(t, "en-RW", env.DefaultLocale().ToBCP47())
assert.Equal(t, envs.CollationDefault, env.InputCollation())
assert.Equal(t, envs.RedactionPolicyNone, env.RedactionPolicy())
assert.Nil(t, env.LocationResolver())

data, err := jsonx.Marshal(env)
require.NoError(t, err)
assert.Equal(t, string(data), `{"date_format":"DD-MM-YYYY","time_format":"tt:mm:ss","timezone":"Africa/Kigali","allowed_languages":["eng","fra"],"number_format":{"decimal_symbol":".","digit_grouping_symbol":","},"default_country":"RW","redaction_policy":"none"}`)
assert.Equal(t, string(data), `{"date_format":"DD-MM-YYYY","time_format":"tt:mm:ss","timezone":"Africa/Kigali","allowed_languages":["eng","fra"],"number_format":{"decimal_symbol":".","digit_grouping_symbol":","},"default_country":"RW","input_collation":"default","redaction_policy":"none"}`)
}

func TestEnvironmentEqual(t *testing.T) {
Expand Down
Loading

0 comments on commit ee18400

Please sign in to comment.