Skip to content

Commit

Permalink
fix pr comments/suggestions
Browse files Browse the repository at this point in the history
  • Loading branch information
kgopal492 committed Jul 28, 2023
1 parent f33a49e commit a3fd147
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 82 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from abc import ABCMeta, abstractmethod
from typing import List, Tuple
from sqlglot import Tokenizer
from sqlglot.tokens import Token

from lib.query_analysis.validation.base_query_validator import (
QueryValidationResult,
QueryValidationResultObjectType,
QueryValidationSeverity,
)


class BaseSQLGlotValidator(metaclass=ABCMeta):
@property
@abstractmethod
def message(self) -> str:
raise NotImplementedError()

@property
@abstractmethod
def severity(self) -> QueryValidationSeverity:
raise NotImplementedError()

@property
@abstractmethod
def tokenizer(self) -> Tokenizer:
raise NotImplementedError()

def _tokenize_query(self, query: str) -> List[Token]:
return self.tokenizer.tokenize(query)

def _get_query_coordinate_by_index(self, query: str, index: int) -> Tuple[int, int]:
rows = query[: index + 1].splitlines(keepends=False)
return len(rows) - 1, len(rows[-1]) - 1

def _get_query_validation_result(
self,
query: str,
start_index: int,
end_index: int,
suggestion: str = None,
validation_result_object_type=QueryValidationResultObjectType.SUGGESTION,
):
start_line, start_ch = self._get_query_coordinate_by_index(query, start_index)
end_line, end_ch = self._get_query_coordinate_by_index(query, end_index)

return QueryValidationResult(
start_line,
start_ch,
self.severity,
self.message,
validation_result_object_type,
end_line=end_line,
end_ch=end_ch,
suggestion=suggestion,
)

@abstractmethod
def get_query_validation_results(
self, query: str, raw_tokens: List[Token] = None
) -> List[QueryValidationResult]:
raise NotImplementedError()
Original file line number Diff line number Diff line change
@@ -1,83 +1,30 @@
from abc import ABCMeta, abstractmethod
from typing import List, Tuple
from sqlglot import TokenType
from typing import List
from sqlglot import TokenType, Tokenizer
from sqlglot.dialects import Trino
from sqlglot.tokens import Token

from lib.query_analysis.validation.base_query_validator import (
QueryValidationResult,
QueryValidationResultObjectType,
QueryValidationSeverity,
)
from lib.query_analysis.validation.validators.presto_explain_validator import (
PrestoExplainValidator,
)

UNION_ALL_SUGGESTION = "UNION ALL"
APPROX_DISTINCT_SUGGESTION = "APPROX_DISTINCT("

UNION_ALL_MESSAGE = "Using UNION ALL instead of UNION will execute faster"
REGEXP_LIKE_MESSAGE = (
"Combining multiple LIKEs into one REGEXP_LIKE will execute faster"
)
APPROX_DISTINCT_MESSAGE = (
"Using APPROX_DISTINCT(x) instead of COUNT(DISTINCT x) will execute faster"
from lib.query_analysis.validation.validators.base_sqlglot_validator import (
BaseSQLGlotValidator,
)


def _get_query_coordinate_by_index(query: str, index: int) -> Tuple[int, int]:
rows = query[: index + 1].splitlines(keepends=False)
return len(rows) - 1, len(rows[-1]) - 1


def _tokenize_query(query: str) -> List[Token]:
return Trino.Tokenizer().tokenize(query)


class BaseSQLGlotValidator(metaclass=ABCMeta):
class BasePrestoSQLGlotValidator(BaseSQLGlotValidator):
@property
@abstractmethod
def message(self) -> str:
raise NotImplementedError()
def tokenizer(self) -> Tokenizer:
return Trino.Tokenizer()

@property
@abstractmethod
def severity(self) -> QueryValidationSeverity:
raise NotImplementedError()

def _get_query_validation_result(
self,
query: str,
start_index: int,
end_index: int,
suggestion: str = None,
validation_result_object_type=QueryValidationResultObjectType.SUGGESTION,
):
start_line, start_ch = _get_query_coordinate_by_index(query, start_index)
end_line, end_ch = _get_query_coordinate_by_index(query, end_index)

return QueryValidationResult(
start_line,
start_ch,
self.severity,
self.message,
validation_result_object_type,
end_line=end_line,
end_ch=end_ch,
suggestion=suggestion,
)

@abstractmethod
def get_query_validation_results(
self, query: str, raw_tokens: List[Token] = None
) -> List[QueryValidationResult]:
raise NotImplementedError()


class UnionAllValidator(BaseSQLGlotValidator):
class UnionAllValidator(BasePrestoSQLGlotValidator):
@property
def message(self):
return UNION_ALL_MESSAGE
return "Using UNION ALL instead of UNION will execute faster"

@property
def severity(self) -> str:
Expand All @@ -87,7 +34,7 @@ def get_query_validation_results(
self, query: str, raw_tokens: List[Token] = None
) -> List[QueryValidationResult]:
if raw_tokens is None:
raw_tokens = _tokenize_query(query)
raw_tokens = self._tokenize_query(query)
validation_errors = []
for i, token in enumerate(raw_tokens):
if token.token_type == TokenType.UNION:
Expand All @@ -97,16 +44,18 @@ def get_query_validation_results(
):
validation_errors.append(
self._get_query_validation_result(
query, token.start, token.end, UNION_ALL_SUGGESTION
query, token.start, token.end, "UNION ALL"
)
)
return validation_errors


class ApproxDistinctValidator(BaseSQLGlotValidator):
class ApproxDistinctValidator(BasePrestoSQLGlotValidator):
@property
def message(self):
return APPROX_DISTINCT_MESSAGE
return (
"Using APPROX_DISTINCT(x) instead of COUNT(DISTINCT x) will execute faster"
)

@property
def severity(self) -> str:
Expand All @@ -116,7 +65,7 @@ def get_query_validation_results(
self, query: str, raw_tokens: List[Token] = None
) -> List[QueryValidationResult]:
if raw_tokens is None:
raw_tokens = _tokenize_query(query)
raw_tokens = self._tokenize_query(query)

validation_errors = []
for i, token in enumerate(raw_tokens):
Expand All @@ -132,16 +81,16 @@ def get_query_validation_results(
query,
token.start,
raw_tokens[i + 2].end,
APPROX_DISTINCT_SUGGESTION,
"APPROX_DISTINCT(",
)
)
return validation_errors


class RegexpLikeValidator(BaseSQLGlotValidator):
class RegexpLikeValidator(BasePrestoSQLGlotValidator):
@property
def message(self):
return REGEXP_LIKE_MESSAGE
return "Combining multiple LIKEs into one REGEXP_LIKE will execute faster"

@property
def severity(self) -> str:
Expand All @@ -157,7 +106,7 @@ def get_query_validation_results(
self, query: str, raw_tokens: List[Token] = None
) -> List[QueryValidationResult]:
if raw_tokens is None:
raw_tokens = _tokenize_query(query)
raw_tokens = self._tokenize_query(query)

validation_errors = []

Expand Down Expand Up @@ -235,8 +184,10 @@ def _get_sqlglot_validators(self) -> List[BaseSQLGlotValidator]:
def _get_validation_suggestions(self, query: str) -> List[QueryValidationResult]:
validation_suggestions = []

query_raw_tokens = _tokenize_query(query)
query_raw_tokens = None
for validator in self._get_sqlglot_validators():
if query_raw_tokens is None:
query_raw_tokens = validator._tokenize_query(query)
validation_suggestions.extend(
validator.get_query_validation_results(
query, raw_tokens=query_raw_tokens
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,6 @@
QueryValidationSeverity,
)
from lib.query_analysis.validation.validators.presto_optimizing_validator import (
UNION_ALL_SUGGESTION,
APPROX_DISTINCT_SUGGESTION,
UNION_ALL_MESSAGE,
REGEXP_LIKE_MESSAGE,
APPROX_DISTINCT_MESSAGE,
ApproxDistinctValidator,
RegexpLikeValidator,
UnionAllValidator,
Expand Down Expand Up @@ -42,7 +37,7 @@ def _get_regexp_like_validation_result(
start_line,
start_ch,
QueryValidationSeverity.WARNING,
REGEXP_LIKE_MESSAGE,
"Combining multiple LIKEs into one REGEXP_LIKE will execute faster",
QueryValidationResultObjectType.SUGGESTION,
end_line=end_line,
end_ch=end_ch,
Expand All @@ -56,11 +51,11 @@ def _get_union_all_validation_result(
start_line,
start_ch,
QueryValidationSeverity.WARNING,
UNION_ALL_MESSAGE,
"Using UNION ALL instead of UNION will execute faster",
QueryValidationResultObjectType.SUGGESTION,
end_line=end_line,
end_ch=end_ch,
suggestion=UNION_ALL_SUGGESTION,
suggestion="UNION ALL",
)

def _get_approx_distinct_validation_result(
Expand All @@ -70,11 +65,11 @@ def _get_approx_distinct_validation_result(
start_line,
start_ch,
QueryValidationSeverity.WARNING,
APPROX_DISTINCT_MESSAGE,
"Using APPROX_DISTINCT(x) instead of COUNT(DISTINCT x) will execute faster",
QueryValidationResultObjectType.SUGGESTION,
end_line=end_line,
end_ch=end_ch,
suggestion=APPROX_DISTINCT_SUGGESTION,
suggestion="APPROX_DISTINCT(",
)


Expand Down Expand Up @@ -292,3 +287,16 @@ def test_all_errors(self):
),
],
)

def test_extra_whitespace(self):
query = "SELECT \n COUNT( DISTINCT x) from a WHERE \n\t x LIKE 'foo' or x like \n'bar' and y like 'foo' \n UNION select * from b"
self._verify_query_validation_results(
self._validator._get_validation_suggestions(query),
[
self._get_union_all_validation_result(4, 5, 4, 9),
self._get_approx_distinct_validation_result(1, 2, 1, 16),
self._get_regexp_like_validation_result(
2, 3, 3, 4, "REGEXP_LIKE(x, 'foo|bar')"
),
],
)

0 comments on commit a3fd147

Please sign in to comment.