Skip to content

Commit

Permalink
[Enhancement] Optimize the performance of variables like variables. (#…
Browse files Browse the repository at this point in the history
…50416)

Signed-off-by: stdpain <[email protected]>
  • Loading branch information
stdpain authored Aug 30, 2024
1 parent 690a988 commit 3c11722
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 10 deletions.
100 changes: 90 additions & 10 deletions be/src/exprs/like_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,48 @@ StatusOr<ColumnPtr> LikePredicate::_predicate_const_regex(FunctionContext* conte
return result->build(value_column->is_constant());
}

/// Shape of a LIKE pattern as classified by extract_fast_path().
/// Enumerators are numbered consecutively starting at 0.
enum class FastPathType {
    /// No wildcard at either end: the whole value is compared with the pattern.
    EQUALS = 0,
    /// Pattern ends with '%': the value is tested for a prefix.
    START_WITH,
    /// Pattern begins with '%': the value is tested for a suffix.
    END_WITH,
    /// Pattern both begins and ends with '%': the value is searched for a substring.
    SUBSTRING,
    /// Anything else: the pattern is handled by the general regex path.
    REGEX,
};

/// Classifies a raw LIKE pattern so that the common shapes `abc`, `abc%`,
/// `%abc` and `%abc%` can be evaluated without compiling a regex.
///
/// A backslash is treated as an escape that consumes the byte after it. The
/// function only inspects the pattern; it does not strip escapes.
///
/// FastPathType::REGEX is returned whenever the pattern cannot be classified
/// safely:
///   - the pattern is shorter than two bytes;
///   - the first or last byte is '_';
///   - an unescaped '%' or '_' appears strictly between the first and last byte;
///   - the first byte is a backslash (it escapes the second byte, which the
///     interior scan would otherwise misread as a live wildcard or escape);
///   - the last byte is consumed by an escape (e.g. `abc\%`), so a trailing '%'
///     is a literal and must not be reported as a wildcard.
///
/// @param pattern raw LIKE pattern, escapes still present.
/// @return the fast-path kind, or FastPathType::REGEX for the general path.
FastPathType extract_fast_path(const Slice& pattern) {
    // Covers the empty pattern as well as single-byte patterns.
    if (pattern.size < 2) {
        return FastPathType::REGEX;
    }

    const auto* data = pattern.data;
    const size_t last = pattern.size - 1;

    if (data[0] == '_' || data[last] == '_') {
        return FastPathType::REGEX;
    }

    // A leading backslash pairs with data[1]; starting the interior scan at
    // index 1 would break escape pairing (e.g. `\\_b` would hide the live '_').
    if (data[0] == '\\') {
        return FastPathType::REGEX;
    }

    // Walk the interior bytes, skipping every escaped byte.
    size_t i = 1;
    while (i < last) {
        const char c = data[i];
        if (c == '\\') {
            i += 2;
            continue;
        }
        if (c == '%' || c == '_') {
            return FastPathType::REGEX;
        }
        ++i;
    }

    // i > last means an escape at index last - 1 swallowed the final byte, so
    // that byte is a literal. Reporting it as a trailing wildcard would make
    // the caller strip it and match far too much (`abc\%` matching "abcdef").
    if (i != last) {
        return FastPathType::REGEX;
    }

    const bool is_end_with = data[0] == '%';
    const bool is_start_with = data[last] == '%';

    if (is_end_with && is_start_with) {
        return FastPathType::SUBSTRING;
    }
    if (is_end_with) {
        return FastPathType::END_WITH;
    }
    if (is_start_with) {
        return FastPathType::START_WITH;
    }
    return FastPathType::EQUALS;
}

StatusOr<ColumnPtr> LikePredicate::regex_match_full(FunctionContext* context, const starrocks::Columns& columns) {
const auto& value_column = VECTORIZED_FN_ARGS(0);
const auto& pattern_column = VECTORIZED_FN_ARGS(1);
Expand Down Expand Up @@ -478,18 +520,56 @@ StatusOr<ColumnPtr> LikePredicate::regex_match_full(FunctionContext* context, co
continue;
}

auto re_pattern = LikePredicate::template convert_like_pattern<false>(context, pattern_viewer.value(row));

re2::RE2 re(re_pattern, opts);

if (!re.ok()) {
context->set_error(strings::Substitute("Invalid regex: $0", re_pattern).c_str());
result.append_null();
continue;
Slice pattern = pattern_viewer.value(row);
FastPathType val = extract_fast_path(pattern);
switch (val) {
case FastPathType::EQUALS: {
std::string str_pattern = pattern.to_string();
remove_escape_character(&str_pattern);
result.append(value_viewer.value(row) == str_pattern);
break;
}
case FastPathType::START_WITH: {
std::string str_pattern = pattern.to_string();
remove_escape_character(&str_pattern);
auto pattern_slice = Slice(str_pattern);
pattern_slice.remove_suffix(1);
result.append(ConstantStartsImpl::apply<Slice, Slice, bool>(value_viewer.value(row), pattern_slice));
break;
}
case FastPathType::END_WITH: {
std::string str_pattern = pattern.to_string();
remove_escape_character(&str_pattern);
auto pattern_slice = Slice(str_pattern);
pattern_slice.remove_prefix(1);
result.append(ConstantEndsImpl::apply<Slice, Slice, bool>(value_viewer.value(row), pattern_slice));
break;
}
case FastPathType::SUBSTRING: {
std::string str_pattern = pattern.to_string();
remove_escape_character(&str_pattern);
auto pattern_slice = Slice(str_pattern);
pattern_slice.remove_prefix(1);
pattern_slice.remove_suffix(1);
auto searcher = LibcASCIICaseSensitiveStringSearcher(pattern_slice.get_data(), pattern_slice.get_size());
/// searcher returns a pointer to the found substring or to the end of `haystack`.
const Slice& value = value_viewer.value(row);
const char* res_pointer = searcher.search(value.data, value.size);
result.append(!!res_pointer);
break;
}
case FastPathType::REGEX: {
auto re_pattern = LikePredicate::template convert_like_pattern<false>(context, pattern);

auto v = RE2::FullMatch(re2::StringPiece(value_viewer.value(row).data, value_viewer.value(row).size), re);
result.append(v);
re2::RE2 re(re_pattern, opts);
if (!re.ok()) {
return Status::InvalidArgument(strings::Substitute("Invalid regex: $0", re_pattern));
}
auto v = RE2::FullMatch(re2::StringPiece(value_viewer.value(row).data, value_viewer.value(row).size), re);
result.append(v);
break;
}
}
}

return result.build(all_const);
Expand Down
64 changes: 64 additions & 0 deletions test/sql/test_join/R/test_join_with_other_predicate
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
-- name: test_join_with_other_predicate
-- Recorded statements and expected results for a join whose extra predicate is
-- a LIKE with a pattern taken from a column of the other table.
-- t0 and t1 have the same columns; t0 uses 1 bucket and t1 uses 48.
CREATE TABLE `t0` (
`c0` int(11) NULL COMMENT "",
`c1` varchar(20) NULL COMMENT "",
`c2` varchar(200) NULL COMMENT "",
`c3` int(11) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`c0`, `c1`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0`, `c1`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"enable_persistent_index" = "false",
"replicated_storage" = "true",
"compression" = "LZ4"
);
-- result:
-- !result
CREATE TABLE `t1` (
`c0` int(11) NULL COMMENT "",
`c1` varchar(20) NULL COMMENT "",
`c2` varchar(200) NULL COMMENT "",
`c3` int(11) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`c0`, `c1`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0`, `c1`) BUCKETS 48
PROPERTIES (
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"enable_persistent_index" = "false",
"replicated_storage" = "true",
"compression" = "LZ4"
);
-- result:
-- !result
-- Fill t0 with the values 1..40960 in every column, add one all-NULL row,
-- then copy everything into t1.
insert into t0 SELECT generate_series, generate_series, generate_series, generate_series FROM TABLE(generate_series(1, 40960));
-- result:
-- !result
insert into t0 values (null,null,null,null);
-- result:
-- !result
insert into t1 SELECT * FROM t0;
-- result:
-- !result
-- Pattern is t1.c1 as-is (no wildcard). Every query below expects 40960.
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like t1.c1;
-- result:
40960
-- !result
-- Pattern is t1.c1 followed by '%'.
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like concat(t1.c1, '%');
-- result:
40960
-- !result
-- Pattern is t1.c1 wrapped in '%' on both sides.
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like concat('%', t1.c1, '%');
-- result:
40960
-- !result
-- Pattern is '%' followed by t1.c1.
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like concat('%', t1.c1);
-- result:
40960
-- !result
47 changes: 47 additions & 0 deletions test/sql/test_join/T/test_join_with_other_predicate
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
-- name: test_join_with_other_predicate

-- Joins two tables on c0 and adds a LIKE predicate whose pattern comes from a
-- column of the other table (t1.c1) rather than from a literal.
-- t0 and t1 have the same columns; t0 uses 1 bucket and t1 uses 48.
CREATE TABLE `t0` (
`c0` int(11) NULL COMMENT "",
`c1` varchar(20) NULL COMMENT "",
`c2` varchar(200) NULL COMMENT "",
`c3` int(11) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`c0`, `c1`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0`, `c1`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"enable_persistent_index" = "false",
"replicated_storage" = "true",
"compression" = "LZ4"
);

CREATE TABLE `t1` (
`c0` int(11) NULL COMMENT "",
`c1` varchar(20) NULL COMMENT "",
`c2` varchar(200) NULL COMMENT "",
`c3` int(11) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`c0`, `c1`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`c0`, `c1`) BUCKETS 48
PROPERTIES (
"replication_num" = "1",
"in_memory" = "false",
"storage_format" = "DEFAULT",
"enable_persistent_index" = "false",
"replicated_storage" = "true",
"compression" = "LZ4"
);

-- Fill t0 with the values 1..40960 in every column, add one all-NULL row
-- (presumably to cover NULL inputs; confirm), then copy everything into t1.
insert into t0 SELECT generate_series, generate_series, generate_series, generate_series FROM TABLE(generate_series(1, 40960));
insert into t0 values (null,null,null,null);
insert into t1 SELECT * FROM t0;


-- Four pattern shapes built from t1.c1: as-is, with a trailing '%',
-- wrapped in '%' on both sides, and with a leading '%'.
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like t1.c1;
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like concat(t1.c1, '%');
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like concat('%', t1.c1, '%');
select count(*) from t0 join t1 on t0.c0=t1.c0 where t0.c1 like concat('%', t1.c1);

0 comments on commit 3c11722

Please sign in to comment.