From 991c72427790f73606ba482ec14548a4a4a9a077 Mon Sep 17 00:00:00 2001
From: Daniel Thorn
Date: Wed, 6 Nov 2024 15:43:44 -0800
Subject: [PATCH] address review

---
 socorro/external/es/crashstorage.py           |   4 +-
 socorro/external/es/super_search_fields.py    |   5 +-
 socorro/external/es/supersearch.py            |  63 ++++----
 socorro/tests/conftest.py                     |   2 +-
 .../tests/external/es/test_crashstorage.py    |   2 +-
 .../external/es/test_super_search_fields.py   |   2 +-
 socorro/tests/external/es/test_supersearch.py | 139 ++++++++----
 7 files changed, 93 insertions(+), 124 deletions(-)

diff --git a/socorro/external/es/crashstorage.py b/socorro/external/es/crashstorage.py
index 2db4d41660..4714e35fc6 100644
--- a/socorro/external/es/crashstorage.py
+++ b/socorro/external/es/crashstorage.py
@@ -250,8 +250,6 @@ def build_document(src, crash_document, fields, all_keys):
         elif storage_type == "boolean":
             value = fix_boolean(value)
 
-        if value is None:
-            continue
 
         for dest_key in get_destination_keys(field):
             if dest_key in all_keys:
@@ -315,7 +313,7 @@ def build_supersearch(self):
     def build_search(self, **kwargs):
         """Return new instance of elasticsearch_dsl's Search."""
         with self.client() as conn:
-            return Search(using=conn, doc_type=self.get_doctype(), **kwargs)
+            return Search(using=conn, **kwargs)
 
     @staticmethod
     def get_source_key(field):
diff --git a/socorro/external/es/super_search_fields.py b/socorro/external/es/super_search_fields.py
index 965c67d678..86fc9f8e22 100644
--- a/socorro/external/es/super_search_fields.py
+++ b/socorro/external/es/super_search_fields.py
@@ -289,7 +289,7 @@ def keyword_field(
         "is_exposed": True,
         "is_returned": True,
         "query_type": "string",
-        "storage_mapping": {"type": "keyword", "ignore_above": 10_000},
+        "storage_mapping": {"type": "keyword"},
     }
 
 
@@ -1743,8 +1743,7 @@ def apply_schema_properties(fields, schema):
         "namespace": "processed_crash",
         "query_type": "string",
         "storage_mapping": {
-            "fielddata": True,  # FIXME(relud): this may be required in more fields?
-            "type": "text",
+            "type": "keyword",
         },
     },
     "processor_notes": {
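For reference, a minimal sketch (editor's illustration, not part of the patch; the nested property layout follows Socorro's processed_crash namespace) of the ES index mapping that a `{"type": "keyword"}` storage_mapping translates to. Keyword fields are indexed verbatim for exact matching, and with "ignore_above" removed, long values are now indexed rather than stored but left out of the index:

    mapping = {
        "properties": {
            "processed_crash": {
                "properties": {
                    # plain keyword: exact-match indexing, no length cutoff
                    "release_channel": {"type": "keyword"},
                }
            }
        }
    }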
diff --git a/socorro/external/es/supersearch.py b/socorro/external/es/supersearch.py
index ed144f6f03..090b865d55 100644
--- a/socorro/external/es/supersearch.py
+++ b/socorro/external/es/supersearch.py
@@ -142,11 +142,11 @@ def _format(aggregation):
             return aggregation
 
         for i, bucket in enumerate(aggregation["buckets"]):
-            if "key_as_string" in bucket:
+            if bucket["key"] in (True, False):
+                # Restore es 1.4 format for boolean terms as string
+                term = "T" if bucket["key"] else "F"
+            elif "key_as_string" in bucket:
                 term = bucket["key_as_string"]
-                if term in ("true", "false"):
-                    # Restore es 1.4 format for boolean terms as string
-                    term = term[:1].upper()
             else:
                 term = bucket["key"]
@@ -270,7 +270,7 @@ def get(self, **kwargs):
             operator_range = {">": "gt", "<": "lt", ">=": "gte", "<=": "lte"}
 
             args = {}
-            filter_type = "term"
+            query_name = "term"
             filter_value = None
 
             if not param.operator:
                 # to match for analyzed and non-analyzed supersearch fields.
                 if len(param.value) == 1:
                     # Only one value, so we only do a single match
-                    filter_type = "match"
-                    args = Q({"match": {search_key: param.value[0]}}).to_dict()[
-                        "match"
-                    ]
+                    query_name = "match"
+                    args = {search_key: param.value[0]}
                 else:
                     # Multiple values, so we do multiple matches wrapped in a bool
                     # query where at least one of them should match
-                    filter_type = "bool"
-                    args = Q(
-                        {
-                            "bool": {
-                                "should": [
-                                    {"match": {search_key: param_value}}
-                                    for param_value in param.value
-                                ],
-                                "minimum_should_match": 1,
-                            }
-                        }
-                    ).to_dict()["bool"]
+                    query_name = "bool"
+                    args = {
+                        "should": [
+                            {"match": {search_key: param_value}}
+                            for param_value in param.value
+                        ],
+                        "minimum_should_match": 1,
+                    }
             elif param.operator == "=":
                 # is exactly
                 if field_data["has_full_version"]:
                     search_key = f"{search_key}.full"
                 filter_value = param.value
             elif param.operator in operator_range:
-                filter_type = "range"
+                query_name = "range"
                 filter_value = {operator_range[param.operator]: param.value}
             elif param.operator == "__null__":
-                filter_type = "bool"
+                query_name = "bool"
                 args["must_not"] = [Q("exists", field=search_key)]
             elif param.operator == "__true__":
-                filter_type = "term"
+                query_name = "term"
                 filter_value = True
             elif param.operator == "@":
-                filter_type = "regexp"
+                query_name = "regexp"
                 if field_data["has_full_version"]:
                     search_key = f"{search_key}.full"
                 filter_value = param.value
             elif param.operator in operator_wildcards:
-                filter_type = "wildcard"
+                query_name = "wildcard"
 
                 # Wildcard operations are better applied to a non-analyzed
                 # field (called "full") if there is one.
                 if field_data["has_full_version"]:
                     search_key = f"{search_key}.full"
-                q_args = {}
-                q_args[search_key] = (
-                    operator_wildcards[param.operator] % param.value
-                )
-                query = Q("wildcard", **q_args)
-                args = query.to_dict()["wildcard"]
+                args = {
+                    search_key: (operator_wildcards[param.operator] % param.value)
+                }
 
             if filter_value is not None:
                 args[search_key] = filter_value
 
             if args:
-                new_filter = Q(filter_type, **args)
+                new_filter = Q(query_name, **args)
                 if param.operator_not:
                     new_filter = ~new_filter
 
                 if sub_filters is None:
                     sub_filters = new_filter
-                elif filter_type == "range":
+                elif query_name == "range":
                     sub_filters &= new_filter
                 else:
                     sub_filters |= new_filter
@@ -569,8 +560,8 @@ def _get_histogram_agg(self, field, intervals):
         # Setting min_doc_count makes ES return only non-empty buckets.
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-histogram-aggregation.html
         kwargs = {"min_doc_count": 1}
-        # NOTE(krzepka) "The date_histogram aggregation’s interval parameter is no longer valid."
-        # https://www.elastic.co/guide/en/elasticsearch/reference/master/migrating-8.0.html
+        # NOTE(relud) We want to use "calendar_interval" for date_histogram and
+        # "interval" for everything else.
         if histogram_type == "date_histogram":
             kwargs["calendar_interval"] = intervals[field]
         else:
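Two quick sketches (editor's illustration only; field names and values are hypothetical) of the objects the rewritten supersearch code hands to elasticsearch_dsl, first for the no-operator filters and then for the two histogram shapes:

    from elasticsearch_dsl import A, Q

    # No-operator filters now build plain kwargs for Q() instead of
    # round-tripping through Q(...).to_dict():
    match_one = Q("match", **{"processed_crash.signature": "OOM | small"})
    match_any = Q(
        "bool",
        should=[
            {"match": {"processed_crash.signature": value}}
            for value in ["OOM | small", "OOM | large"]
        ],
        minimum_should_match=1,
    )

    # Date histograms take calendar_interval in ES 8, while numeric
    # histograms keep the interval parameter.
    date_agg = A(
        "date_histogram",
        field="processed_crash.date_processed",
        calendar_interval="1d",
        min_doc_count=1,
    )
    numeric_agg = A(
        "histogram", field="processed_crash.uptime", interval=3600, min_doc_count=1
    )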
diff --git a/socorro/tests/conftest.py b/socorro/tests/conftest.py
index 47493ae79c..c0a62e9da1 100644
--- a/socorro/tests/conftest.py
+++ b/socorro/tests/conftest.py
@@ -282,7 +282,7 @@ def get_crash_data(self, crash_id):
         with self.conn() as conn:
             search = Search(using=conn, index=index)
 
-            search = search.filter("term", **{"processed_crash.uuid": crash_id})
+            search = search.filter({"term": {"processed_crash.uuid": crash_id}})
             results = search.execute().to_dict()
 
             if results["hits"]["hits"]:
diff --git a/socorro/tests/external/es/test_crashstorage.py b/socorro/tests/external/es/test_crashstorage.py
index 508a970058..9d18a95fed 100644
--- a/socorro/tests/external/es/test_crashstorage.py
+++ b/socorro/tests/external/es/test_crashstorage.py
@@ -109,7 +109,7 @@ def test_index_crash_indexable_keys(self, es_helper):
             "another_invalid_key": "alpha",
             "date_processed": date_to_string(utc_now()),
             "uuid": "936ce666-ff3b-4c7a-9674-367fe2120408",
-            "dom_fission_enabled": False,
+            "dom_fission_enabled": "1",
         }
 
         crashstorage = self.build_crashstorage()
diff --git a/socorro/tests/external/es/test_super_search_fields.py b/socorro/tests/external/es/test_super_search_fields.py
index 7132ef65de..8e25f52779 100644
--- a/socorro/tests/external/es/test_super_search_fields.py
+++ b/socorro/tests/external/es/test_super_search_fields.py
@@ -101,7 +101,7 @@ def test_get_mapping(self):
         assert "os_name" in processed_crash
         assert "platform" not in processed_crash
 
-        # Those fields have a `storage_mapping`.
+        # Test a field that has a storage_mapping.
         assert processed_crash["release_channel"] == {
             "type": "keyword",
         }
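The conftest.py change above swaps the keyword-argument filter for the raw-dict form, which elasticsearch_dsl accepts as well. A sketch (editor's illustration; the index name is hypothetical and the uuid is borrowed from the test fixture above) showing that the two spellings produce the same query:

    from elasticsearch_dsl import Search

    uuid = "936ce666-ff3b-4c7a-9674-367fe2120408"
    base = Search(index="testindex")
    # keyword-argument form; the dotted field name has to be unpacked:
    kwarg_form = base.filter("term", **{"processed_crash.uuid": uuid})
    # raw-dict form, as now used in get_crash_data:
    dict_form = base.filter({"term": {"processed_crash.uuid": uuid}})
    assert kwarg_form.to_dict() == dict_form.to_dict()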
diff --git a/socorro/tests/external/es/test_supersearch.py b/socorro/tests/external/es/test_supersearch.py
index bfddc22e17..4bd261a514 100644
--- a/socorro/tests/external/es/test_supersearch.py
+++ b/socorro/tests/external/es/test_supersearch.py
@@ -7,6 +7,7 @@
 import http.server
 import threading
 from contextlib import contextmanager
+from copy import deepcopy
 
 import pytest
 
@@ -1227,14 +1228,11 @@ def test_get_with_date_histogram(self, es_helper):
         assert "histogram_date" in res["facets"]
 
         def dt_to_midnight(date):
-            return date.replace(hour=0, minute=0, second=0, microsecond=0)
+            return f"{date.replace(hour=0, minute=0, second=0, microsecond=0):%FT%H:%M:%S.000Z}"
 
-        # NOTE(relud) this used to use .isoformat() but with elasticsearch 8 dates are
-        # returned with millisecond precision and indicate time zone with Z instead of
-        # +00:00, so we have to carefully format the dates here.
-        today_str = f"{dt_to_midnight(now):%FT%H:%M:%S.000Z}"
-        yesterday_str = f"{dt_to_midnight(yesterday):%FT%H:%M:%S.000Z}"
-        day_before_str = f"{dt_to_midnight(the_day_before):%FT%H:%M:%S.000Z}"
+        today_str = dt_to_midnight(now)
+        yesterday_str = dt_to_midnight(yesterday)
+        day_before_str = dt_to_midnight(the_day_before)
 
         expected_terms = [
             {
@@ -1784,7 +1782,27 @@ def test_get_with_bad_regex(self, es_helper):
         with pytest.raises(BadArgumentError):
             api.get(signature=r'@"')
 
-    def test_get_with_failing_shards(self):
+    def test_get_with_failing_shards(self, es_helper):
+        # Generate a shard failure response via bad regex, like test_get_with_bad_regex
+        # above. Modify this response to avoid BadArgumentError and test generic shard
+        # failure.
+        crashstorage = self.build_crashstorage()
+        api = SuperSearchWithFields(crashstorage=crashstorage)
+        now = utc_now()
+        es_helper.index_crash(
+            processed_crash={
+                "uuid": create_new_ooid(timestamp=now),
+                "signature": "test.dll",
+                "date_processed": now,
+            },
+        )
+        bad_regex_query = api.get(signature=r'@"', _return_query=True)
+        search = crashstorage.build_search(index=bad_regex_query["indices"])
+        search = search.filter(bad_regex_query["query"]["query"])
+        search = search.source(bad_regex_query["query"]["_source"])
+        bad_regex_results = search.execute().to_dict()
+
+        # Use a mock es server to return custom results
         ip, port = "127.0.0.1", 9999
         with settings.override(
             **{
@@ -1796,33 +1814,16 @@ def test_get_with_failing_shards(self):
             api = SuperSearchWithFields(crashstorage=crashstorage)
 
             # Test with one failing shard.
-            es_results = {
-                "hits": {
-                    "hits": [],
-                    "total": {"value": 0, "relation": "eq"},
-                    "max_score": None,
-                },
-                "timed_out": False,
-                "took": 194,
-                "_shards": {
-                    "successful": 9,
-                    "failed": 1,
-                    "total": 10,
-                    "failures": [
-                        {
-                            "status": 500,
-                            "index": "fake_index",
-                            "reason": {
-                                "type": "fake_exception",
-                                "reason": "foo bar gone bad",
-                            },
-                            "shard": 3,
-                        }
-                    ],
-                },
-            }
+            mock_results = deepcopy(bad_regex_results)
+            mock_results["_shards"]["failures"][0]["index"] = "fake_index"
+            mock_results["_shards"]["failures"][0]["reason"]["reason"] = (
+                "foo bar gone bad"
+            )
+            mock_results["_shards"]["failures"][0]["reason"]["caused_by"]["type"] = (
+                "foo_bar_exception"
+            )
 
-            with mock_es_server(ip, port, es_results):
+            with mock_es_server(ip, port, mock_results):
                 res = api.get()
 
                 assert "errors" in res
@@ -1831,51 +1832,31 @@
                 assert res["errors"] == errors_exp
 
             # Test with several failures.
-            es_results = {
-                "hits": {
-                    "hits": [],
-                    "total": {"value": 0, "relation": "eq"},
-                    "max_score": None,
-                },
-                "timed_out": False,
-                "took": 194,
-                "_shards": {
-                    "successful": 9,
-                    "failed": 3,
-                    "total": 10,
-                    "failures": [
-                        {
-                            "status": 500,
-                            "index": "fake_index",
-                            "reason": {
-                                "type": "fake_exception",
-                                "reason": "foo bar gone bad",
-                            },
-                            "shard": 2,
-                        },
-                        {
-                            "status": 500,
-                            "index": "fake_index",
-                            "reason": {
-                                "type": "fake_exception",
-                                "reason": "foo bar gone bad",
-                            },
-                            "shard": 3,
-                        },
-                        {
-                            "status": 500,
-                            "index": "other_index",
-                            "reason": {
-                                "type": "fake_exception",
-                                "reason": "foo bar gone bad",
-                            },
-                            "shard": 1,
-                        },
-                    ],
-                },
-            }
+            mock_results = deepcopy(bad_regex_results)
+            mock_results["_shards"]["failed"] += 1
+            mock_results["_shards"]["successful"] -= 1
+            mock_results["_shards"]["failures"][0]["index"] = "fake_index"
+            mock_results["_shards"]["failures"][0]["reason"]["reason"] = (
+                "foo bar gone bad"
+            )
+            mock_results["_shards"]["failures"][0]["reason"]["caused_by"]["type"] = (
+                "foo_bar_exception"
+            )
+            # add failure on different shard
+            mock_results["_shards"]["failures"].append(
+                deepcopy(mock_results["_shards"]["failures"][0])
+            )
+            mock_results["_shards"]["failures"][-1]["shard"] += 1
+            mock_results["_shards"]["failures"][-1]["shard"] %= mock_results["_shards"][
+                "total"
+            ]
+            # add failure on different index
+            mock_results["_shards"]["failures"].append(
+                deepcopy(mock_results["_shards"]["failures"][0])
+            )
+            mock_results["_shards"]["failures"][-1]["index"] = "other_index"
 
-            with mock_es_server(ip, port, es_results):
+            with mock_es_server(ip, port, mock_results):
                 res = api.get()
 
                 assert "errors" in res
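For reference, the slice of the ES response that the mock-server tests mutate; the shape below is assembled from the old hard-coded fixture removed above plus the reason.caused_by key the rewritten test touches (values are illustrative, taken from the diff):

    shard_failure_shape = {
        "_shards": {
            "total": 10,
            "successful": 9,
            "failed": 1,
            "failures": [
                {
                    "shard": 3,
                    "status": 500,
                    "index": "fake_index",
                    "reason": {
                        "type": "fake_exception",
                        "reason": "foo bar gone bad",
                        # the rewritten test also sets reason.caused_by.type
                        "caused_by": {"type": "foo_bar_exception"},
                    },
                }
            ],
        }
    }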