From 2c7b647971471351a014ebd52c60c253512ef900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Taylor?= Date: Tue, 6 Feb 2024 09:11:03 +0100 Subject: [PATCH] make sure to handle unsupported collations well (#15134) Signed-off-by: Andres Taylor --- go/mysql/collations/env.go | 5 +++ go/vt/vtgate/engine/aggregations.go | 2 +- go/vt/vtgate/executor_select_test.go | 28 +++++++------- go/vt/vtgate/planbuilder/collations_test.go | 6 +-- go/vt/vtgate/planbuilder/plan_test.go | 2 +- .../planbuilder/testdata/aggr_cases.json | 14 ++++--- .../testdata/postprocess_cases.json | 37 +++++++++++++++++-- .../planbuilder/testdata/vschemas/schema.json | 3 +- go/vt/vtgate/semantics/analyzer.go | 10 ++++- go/vt/vtgate/semantics/analyzer_test.go | 9 ++++- go/vt/vtgate/semantics/real_table.go | 1 - go/vt/vtgate/semantics/semantic_state.go | 8 +++- go/vt/vtgate/semantics/typer_test.go | 36 ++++++++++++++++++ go/vt/vtgate/vindexes/vschema.go | 21 +++++------ 14 files changed, 135 insertions(+), 47 deletions(-) diff --git a/go/mysql/collations/env.go b/go/mysql/collations/env.go index 9fe87230649..ae5419a5797 100644 --- a/go/mysql/collations/env.go +++ b/go/mysql/collations/env.go @@ -301,3 +301,8 @@ func (env *Environment) LookupByCharset(name string) *colldefaults { func (env *Environment) LookupCharsetName(coll ID) string { return env.byCharsetName[coll] } + +func (env *Environment) IsSupported(coll ID) bool { + _, supported := env.byID[coll] + return supported +} diff --git a/go/vt/vtgate/engine/aggregations.go b/go/vt/vtgate/engine/aggregations.go index 96e8cc294a9..3413234c84f 100644 --- a/go/vt/vtgate/engine/aggregations.go +++ b/go/vt/vtgate/engine/aggregations.go @@ -76,7 +76,7 @@ func (ap *AggregateParams) String() string { if ap.WAssigned() { keyCol = fmt.Sprintf("%s|%d", keyCol, ap.WCol) } - if sqltypes.IsText(ap.Type.Type()) && ap.Type.Collation() != collations.Unknown { + if sqltypes.IsText(ap.Type.Type()) && ap.CollationEnv.IsSupported(ap.Type.Collation()) { keyCol += " COLLATE " + ap.CollationEnv.LookupName(ap.Type.Collation()) } dispOrigOp := "" diff --git a/go/vt/vtgate/executor_select_test.go b/go/vt/vtgate/executor_select_test.go index 9603b5da5bc..dc4f1e0e726 100644 --- a/go/vt/vtgate/executor_select_test.go +++ b/go/vt/vtgate/executor_select_test.go @@ -26,28 +26,26 @@ import ( "testing" "time" - _flag "vitess.io/vitess/go/internal/flag" - "vitess.io/vitess/go/mysql/collations" - "vitess.io/vitess/go/streamlog" - "vitess.io/vitess/go/vt/topo/topoproto" - "vitess.io/vitess/go/vt/vtenv" - "vitess.io/vitess/go/vt/vtgate/logstats" - "github.com/google/go-cmp/cmp" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + _flag "vitess.io/vitess/go/internal/flag" + "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/sqltypes" + "vitess.io/vitess/go/streamlog" "vitess.io/vitess/go/test/utils" "vitess.io/vitess/go/vt/discovery" - "vitess.io/vitess/go/vt/vterrors" - _ "vitess.io/vitess/go/vt/vtgate/vindexes" - "vitess.io/vitess/go/vt/vttablet/sandboxconn" - querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/topo/topoproto" + "vitess.io/vitess/go/vt/vtenv" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vtgate/logstats" + _ "vitess.io/vitess/go/vt/vtgate/vindexes" + "vitess.io/vitess/go/vt/vttablet/sandboxconn" ) func TestSelectNext(t *testing.T) { @@ -1982,6 +1980,7 @@ func TestSelectScatterOrderByVarChar(t *testing.T) { Fields: []*querypb.Field{ {Name: "col1", Type: sqltypes.Int32, Charset: collations.CollationBinaryID, Flags: uint32(querypb.MySqlFlag_NUM_FLAG)}, {Name: "textcol", Type: sqltypes.VarChar, Charset: uint32(collations.MySQL8().DefaultConnectionCharset())}, + {Name: "weight_string(textcol)", Type: sqltypes.VarBinary, Charset: collations.CollationBinaryID}, }, InsertID: 0, Rows: [][]sqltypes.Value{{ @@ -1990,6 +1989,7 @@ func TestSelectScatterOrderByVarChar(t *testing.T) { // This will allow us to test that cross-shard ordering // still works correctly. sqltypes.NewVarChar(fmt.Sprintf("%d", i%4)), + sqltypes.NewVarBinary(fmt.Sprintf("%d", i%4)), }}, }}) conns = append(conns, sbc) @@ -2005,7 +2005,7 @@ func TestSelectScatterOrderByVarChar(t *testing.T) { require.NoError(t, err) wantQueries := []*querypb.BoundQuery{{ - Sql: "select col1, textcol from `user` order by textcol desc", + Sql: "select col1, textcol, weight_string(textcol) from `user` order by textcol desc", BindVariables: map[string]*querypb.BindVariable{}, }} for _, conn := range conns { @@ -2114,11 +2114,13 @@ func TestStreamSelectScatterOrderByVarChar(t *testing.T) { Fields: []*querypb.Field{ {Name: "id", Type: sqltypes.Int32, Charset: collations.CollationBinaryID, Flags: uint32(querypb.MySqlFlag_NUM_FLAG)}, {Name: "textcol", Type: sqltypes.VarChar, Charset: uint32(collations.MySQL8().DefaultConnectionCharset())}, + {Name: "weight_string(textcol)", Type: sqltypes.VarBinary, Charset: collations.CollationBinaryID}, }, InsertID: 0, Rows: [][]sqltypes.Value{{ sqltypes.NewInt32(1), sqltypes.NewVarChar(fmt.Sprintf("%d", i%4)), + sqltypes.NewVarBinary(fmt.Sprintf("%d", i%4)), }}, }}) conns = append(conns, sbc) @@ -2131,7 +2133,7 @@ func TestStreamSelectScatterOrderByVarChar(t *testing.T) { require.NoError(t, err) wantQueries := []*querypb.BoundQuery{{ - Sql: "select id, textcol from `user` order by textcol desc", + Sql: "select id, textcol, weight_string(textcol) from `user` order by textcol desc", BindVariables: map[string]*querypb.BindVariable{}, }} for _, conn := range conns { diff --git a/go/vt/vtgate/planbuilder/collations_test.go b/go/vt/vtgate/planbuilder/collations_test.go index 8e90bdf5fcc..b393e186679 100644 --- a/go/vt/vtgate/planbuilder/collations_test.go +++ b/go/vt/vtgate/planbuilder/collations_test.go @@ -20,12 +20,11 @@ import ( "fmt" "testing" - "vitess.io/vitess/go/test/vschemawrapper" - "vitess.io/vitess/go/vt/vtenv" - "github.com/stretchr/testify/require" "vitess.io/vitess/go/mysql/collations" + "vitess.io/vitess/go/test/vschemawrapper" + "vitess.io/vitess/go/vt/vtenv" "vitess.io/vitess/go/vt/vtgate/engine" ) @@ -61,7 +60,6 @@ func (tc *collationTestCase) addCollationsToSchema(vschema *vschemawrapper.VSche for i, c := range tbl.Columns { if c.Name.EqualString(collation.colName) { tbl.Columns[i].CollationName = collation.collationName - break } } } diff --git a/go/vt/vtgate/planbuilder/plan_test.go b/go/vt/vtgate/planbuilder/plan_test.go index 1a09aa555ec..3c484b6a664 100644 --- a/go/vt/vtgate/planbuilder/plan_test.go +++ b/go/vt/vtgate/planbuilder/plan_test.go @@ -586,7 +586,7 @@ func loadSchema(t testing.TB, filename string, setCollation bool) *vindexes.VSch if setCollation { for _, table := range ks.Tables { for i, col := range table.Columns { - if sqltypes.IsText(col.Type) { + if sqltypes.IsText(col.Type) && col.CollationName == "" { table.Columns[i].CollationName = "latin1_swedish_ci" } } diff --git a/go/vt/vtgate/planbuilder/testdata/aggr_cases.json b/go/vt/vtgate/planbuilder/testdata/aggr_cases.json index 4a1c8fa1559..64764abc802 100644 --- a/go/vt/vtgate/planbuilder/testdata/aggr_cases.json +++ b/go/vt/vtgate/planbuilder/testdata/aggr_cases.json @@ -4840,7 +4840,8 @@ "Instructions": { "OperatorType": "Aggregate", "Variant": "Scalar", - "Aggregates": "min(0 COLLATE latin1_swedish_ci) AS min(textcol1), max(1 COLLATE latin1_swedish_ci) AS max(textcol2), sum_distinct(2 COLLATE latin1_swedish_ci) AS sum(distinct textcol1), count_distinct(3 COLLATE latin1_swedish_ci) AS count(distinct textcol1)", + "Aggregates": "min(0 COLLATE latin1_swedish_ci) AS min(textcol1), max(1|4) AS max(textcol2), sum_distinct(2 COLLATE latin1_swedish_ci) AS sum(distinct textcol1), count_distinct(3 COLLATE latin1_swedish_ci) AS count(distinct textcol1)", + "ResultColumns": 4, "Inputs": [ { "OperatorType": "Route", @@ -4849,9 +4850,9 @@ "Name": "user", "Sharded": true }, - "FieldQuery": "select min(textcol1), max(textcol2), textcol1, textcol1 from `user` where 1 != 1 group by textcol1", + "FieldQuery": "select min(textcol1), max(textcol2), textcol1, textcol1, weight_string(textcol2) from `user` where 1 != 1 group by textcol1, weight_string(textcol2)", "OrderBy": "2 ASC COLLATE latin1_swedish_ci", - "Query": "select min(textcol1), max(textcol2), textcol1, textcol1 from `user` group by textcol1 order by textcol1 asc", + "Query": "select min(textcol1), max(textcol2), textcol1, textcol1, weight_string(textcol2) from `user` group by textcol1, weight_string(textcol2) order by textcol1 asc", "Table": "`user`" } ] @@ -4870,8 +4871,9 @@ "Instructions": { "OperatorType": "Aggregate", "Variant": "Ordered", - "Aggregates": "min(1 COLLATE latin1_swedish_ci) AS min(textcol1), max(2 COLLATE latin1_swedish_ci) AS max(textcol2), sum_distinct(3 COLLATE latin1_swedish_ci) AS sum(distinct textcol1), count_distinct(4 COLLATE latin1_swedish_ci) AS count(distinct textcol1)", + "Aggregates": "min(1 COLLATE latin1_swedish_ci) AS min(textcol1), max(2|5) AS max(textcol2), sum_distinct(3 COLLATE latin1_swedish_ci) AS sum(distinct textcol1), count_distinct(4 COLLATE latin1_swedish_ci) AS count(distinct textcol1)", "GroupBy": "0", + "ResultColumns": 5, "Inputs": [ { "OperatorType": "Route", @@ -4880,9 +4882,9 @@ "Name": "user", "Sharded": true }, - "FieldQuery": "select col, min(textcol1), max(textcol2), textcol1, textcol1 from `user` where 1 != 1 group by col, textcol1", + "FieldQuery": "select col, min(textcol1), max(textcol2), textcol1, textcol1, weight_string(textcol2) from `user` where 1 != 1 group by col, textcol1, weight_string(textcol2)", "OrderBy": "0 ASC, 3 ASC COLLATE latin1_swedish_ci", - "Query": "select col, min(textcol1), max(textcol2), textcol1, textcol1 from `user` group by col, textcol1 order by col asc, textcol1 asc", + "Query": "select col, min(textcol1), max(textcol2), textcol1, textcol1, weight_string(textcol2) from `user` group by col, textcol1, weight_string(textcol2) order by col asc, textcol1 asc", "Table": "`user`" } ] diff --git a/go/vt/vtgate/planbuilder/testdata/postprocess_cases.json b/go/vt/vtgate/planbuilder/testdata/postprocess_cases.json index 0ec62ed2574..6697da92c23 100644 --- a/go/vt/vtgate/planbuilder/testdata/postprocess_cases.json +++ b/go/vt/vtgate/planbuilder/testdata/postprocess_cases.json @@ -342,9 +342,9 @@ "Name": "user", "Sharded": true }, - "FieldQuery": "select a, textcol1, b, textcol2, weight_string(a), weight_string(b) from `user` where 1 != 1", - "OrderBy": "(0|4) ASC, 1 ASC COLLATE latin1_swedish_ci, (2|5) ASC, 3 ASC COLLATE latin1_swedish_ci", - "Query": "select a, textcol1, b, textcol2, weight_string(a), weight_string(b) from `user` order by a asc, textcol1 asc, b asc, textcol2 asc", + "FieldQuery": "select a, textcol1, b, textcol2, weight_string(a), weight_string(b), weight_string(textcol2) from `user` where 1 != 1", + "OrderBy": "(0|4) ASC, 1 ASC COLLATE latin1_swedish_ci, (2|5) ASC, (3|6) ASC COLLATE ", + "Query": "select a, textcol1, b, textcol2, weight_string(a), weight_string(b), weight_string(textcol2) from `user` order by a asc, textcol1 asc, b asc, textcol2 asc", "ResultColumns": 4, "Table": "`user`" }, @@ -2174,5 +2174,36 @@ "user.user" ] } + }, + { + "comment": "DISTINCT on an unsupported collation should fall back on weightstrings", + "query": "select distinct textcol2 from user", + "plan": { + "QueryType": "SELECT", + "Original": "select distinct textcol2 from user", + "Instructions": { + "OperatorType": "Distinct", + "Collations": [ + "(0:1): " + ], + "ResultColumns": 1, + "Inputs": [ + { + "OperatorType": "Route", + "Variant": "Scatter", + "Keyspace": { + "Name": "user", + "Sharded": true + }, + "FieldQuery": "select textcol2, weight_string(textcol2) from `user` where 1 != 1", + "Query": "select distinct textcol2, weight_string(textcol2) from `user`", + "Table": "`user`" + } + ] + }, + "TablesUsed": [ + "user.user" + ] + } } ] diff --git a/go/vt/vtgate/planbuilder/testdata/vschemas/schema.json b/go/vt/vtgate/planbuilder/testdata/vschemas/schema.json index 3579205ba70..70f55c62f1f 100644 --- a/go/vt/vtgate/planbuilder/testdata/vschemas/schema.json +++ b/go/vt/vtgate/planbuilder/testdata/vschemas/schema.json @@ -219,7 +219,8 @@ }, { "name": "textcol2", - "type": "VARCHAR" + "type": "VARCHAR", + "collation_name": "big5_bin" } ] }, diff --git a/go/vt/vtgate/semantics/analyzer.go b/go/vt/vtgate/semantics/analyzer.go index 546cdf76186..3c7470026d2 100644 --- a/go/vt/vtgate/semantics/analyzer.go +++ b/go/vt/vtgate/semantics/analyzer.go @@ -78,7 +78,7 @@ func Analyze(statement sqlparser.Statement, currentDb string, si SchemaInformati } // Creation of the semantic table - return analyzer.newSemTable(statement, si.ConnCollation(), si.GetForeignKeyChecksState()) + return analyzer.newSemTable(statement, si.ConnCollation(), si.GetForeignKeyChecksState(), si.Environment().CollationEnv()) } // AnalyzeStrict analyzes the parsed query, and fails the analysis for any possible errors @@ -98,7 +98,12 @@ func AnalyzeStrict(statement sqlparser.Statement, currentDb string, si SchemaInf return st, nil } -func (a *analyzer) newSemTable(statement sqlparser.Statement, coll collations.ID, fkChecksState *bool) (*SemTable, error) { +func (a *analyzer) newSemTable( + statement sqlparser.Statement, + coll collations.ID, + fkChecksState *bool, + env *collations.Environment, +) (*SemTable, error) { var comments *sqlparser.ParsedComments commentedStmt, isCommented := statement.(sqlparser.Commented) if isCommented { @@ -133,6 +138,7 @@ func (a *analyzer) newSemTable(statement sqlparser.Statement, coll collations.ID childForeignKeysInvolved: childFks, parentForeignKeysInvolved: parentFks, childFkToUpdExprs: childFkToUpdExprs, + collEnv: env, }, nil } diff --git a/go/vt/vtgate/semantics/analyzer_test.go b/go/vt/vtgate/semantics/analyzer_test.go index 1df1ec7fd87..0257aad8a72 100644 --- a/go/vt/vtgate/semantics/analyzer_test.go +++ b/go/vt/vtgate/semantics/analyzer_test.go @@ -1630,8 +1630,13 @@ func fakeSchemaInfo() *FakeSI { Name: sqlparser.NewIdentifierCI("uid"), Type: querypb.Type_INT64, }, { - Name: sqlparser.NewIdentifierCI("name"), - Type: querypb.Type_VARCHAR, + Name: sqlparser.NewIdentifierCI("name"), + Type: querypb.Type_VARCHAR, + CollationName: "utf8_bin", + }, { + Name: sqlparser.NewIdentifierCI("textcol"), + Type: querypb.Type_VARCHAR, + CollationName: "big5_bin", }} si := &FakeSI{ diff --git a/go/vt/vtgate/semantics/real_table.go b/go/vt/vtgate/semantics/real_table.go index 72549b98e8c..d12df6acfa9 100644 --- a/go/vt/vtgate/semantics/real_table.go +++ b/go/vt/vtgate/semantics/real_table.go @@ -124,7 +124,6 @@ func vindexTableToColumnInfo(tbl *vindexes.Table, collationEnv *collations.Envir nameMap := map[string]any{} cols := make([]ColumnInfo, 0, len(tbl.Columns)) for _, col := range tbl.Columns { - cols = append(cols, ColumnInfo{ Name: col.Name.String(), Type: col.ToEvalengineType(collationEnv), diff --git a/go/vt/vtgate/semantics/semantic_state.go b/go/vt/vtgate/semantics/semantic_state.go index 9067e77dd88..7327fed9d67 100644 --- a/go/vt/vtgate/semantics/semantic_state.go +++ b/go/vt/vtgate/semantics/semantic_state.go @@ -143,6 +143,7 @@ type ( childForeignKeysInvolved map[TableSet][]vindexes.ChildFKInfo parentForeignKeysInvolved map[TableSet][]vindexes.ParentFKInfo childFkToUpdExprs map[string]sqlparser.UpdateExprs + collEnv *collations.Environment } columnName struct { @@ -640,7 +641,12 @@ func (st *SemTable) NeedsWeightString(e sqlparser.Expr) bool { if !found { return true } - return typ.Collation() == collations.Unknown && !sqltypes.IsNumber(typ.Type()) + + if !sqltypes.IsText(typ.Type()) { + return false + } + + return !st.collEnv.IsSupported(typ.Collation()) } } diff --git a/go/vt/vtgate/semantics/typer_test.go b/go/vt/vtgate/semantics/typer_test.go index c87d5672dab..7de5ecf1340 100644 --- a/go/vt/vtgate/semantics/typer_test.go +++ b/go/vt/vtgate/semantics/typer_test.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/require" + "vitess.io/vitess/go/mysql/collations/colldata" querypb "vitess.io/vitess/go/vt/proto/query" "vitess.io/vitess/go/vt/sqlparser" ) @@ -54,5 +55,40 @@ func TestNormalizerAndSemanticAnalysisIntegration(t *testing.T) { require.Equal(t, test.typ, typ.Type().String()) }) } +} + +// Tests that the types correctly picks up and sets the collation on columns +func TestColumnCollations(t *testing.T) { + tests := []struct { + query, collation string + }{ + {query: "select textcol from t2"}, + {query: "select name from t2", collation: "utf8mb3_bin"}, + } + + for _, test := range tests { + t.Run(test.query, func(t *testing.T) { + parse, err := sqlparser.NewTestParser().Parse(test.query) + require.NoError(t, err) + err = sqlparser.Normalize(parse, sqlparser.NewReservedVars("bv", sqlparser.BindVars{}), map[string]*querypb.BindVariable{}) + require.NoError(t, err) + + st, err := Analyze(parse, "d", fakeSchemaInfo()) + require.NoError(t, err) + col := extract(parse.(*sqlparser.Select), 0) + typ, found := st.TypeForExpr(col) + require.True(t, found, "column was not typed") + + require.Equal(t, "VARCHAR", typ.Type().String()) + collation := colldata.Lookup(typ.Collation()) + if test.collation != "" { + collation := colldata.Lookup(typ.Collation()) + require.NotNil(t, collation) + require.Equal(t, test.collation, collation.Name()) + } else { + require.Nil(t, collation) + } + }) + } } diff --git a/go/vt/vtgate/vindexes/vschema.go b/go/vt/vtgate/vindexes/vschema.go index e1044d0136d..8559cabc447 100644 --- a/go/vt/vtgate/vindexes/vschema.go +++ b/go/vt/vtgate/vindexes/vschema.go @@ -25,19 +25,17 @@ import ( "strings" "time" + "vitess.io/vitess/go/json2" "vitess.io/vitess/go/mysql/collations" "vitess.io/vitess/go/sqlescape" - vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" - "vitess.io/vitess/go/vt/vterrors" - "vitess.io/vitess/go/vt/vtgate/evalengine" - - "vitess.io/vitess/go/json2" "vitess.io/vitess/go/sqltypes" - "vitess.io/vitess/go/vt/sqlparser" - querypb "vitess.io/vitess/go/vt/proto/query" topodatapb "vitess.io/vitess/go/vt/proto/topodata" vschemapb "vitess.io/vitess/go/vt/proto/vschema" + vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc" + "vitess.io/vitess/go/vt/sqlparser" + "vitess.io/vitess/go/vt/vterrors" + "vitess.io/vitess/go/vt/vtgate/evalengine" ) // TabletTypeSuffix maps the tablet type to its suffix string. @@ -220,12 +218,11 @@ func (col *Column) MarshalJSON() ([]byte, error) { } func (col *Column) ToEvalengineType(collationEnv *collations.Environment) evalengine.Type { - collation := collations.CollationForType(col.Type, collationEnv.DefaultConnectionCharset()) + var collation collations.ID if sqltypes.IsText(col.Type) { - coll, found := collationEnv.LookupID(col.CollationName) - if found { - collation = coll - } + collation, _ = collationEnv.LookupID(col.CollationName) + } else { + collation = collations.CollationForType(col.Type, collationEnv.DefaultConnectionCharset()) } return evalengine.NewTypeEx(col.Type, collation, col.Nullable, col.Size, col.Scale) }