From 71ead56d91e9efa6d826826d25a2ea58cb1dc642 Mon Sep 17 00:00:00 2001 From: mutantsan Date: Tue, 30 Jul 2024 14:22:23 +0300 Subject: [PATCH] rewrite dataset resource search --- ckanext/bulk/const.py | 3 + ckanext/bulk/entity_manager.py | 106 ++++++++++++++++----- ckanext/bulk/logic/action.py | 7 +- ckanext/bulk/logic/schema.py | 12 ++- ckanext/bulk/tests/test_entity_managers.py | 81 ++++++++++++++++ pyproject.toml | 3 +- setup.cfg | 2 +- 7 files changed, 183 insertions(+), 31 deletions(-) diff --git a/ckanext/bulk/const.py b/ckanext/bulk/const.py index da18174..676484b 100644 --- a/ckanext/bulk/const.py +++ b/ckanext/bulk/const.py @@ -6,3 +6,6 @@ OP_ENDS_WITH = "ends_with" OP_IS_EMPTY = "is_empty" OP_IS_NOT_EMPTY = "is_not_empty" + +GLOBAL_AND = "AND" +GLOBAL_OR = "OR" diff --git a/ckanext/bulk/entity_manager.py b/ckanext/bulk/entity_manager.py index 6e2ce3c..bef9dca 100644 --- a/ckanext/bulk/entity_manager.py +++ b/ckanext/bulk/entity_manager.py @@ -2,12 +2,14 @@ import json from abc import abstractmethod -from typing import Any, TypedDict +from typing import Any, TypedDict, cast -from attr import field +from sqlalchemy import and_, or_ +from sqlalchemy.orm.attributes import InstrumentedAttribute import ckan.plugins.toolkit as tk -from ckan import model +from ckan import model, types +from ckan.lib.dictization import model_dictize from ckan.lib.redis import connect_to_redis from ckanext.bulk import const @@ -54,7 +56,7 @@ def get_fields(cls) -> list[FieldItem]: @classmethod @abstractmethod def search_entities_by_filters( - cls, filters: list[FilterItem], global_operator: str = "AND" + cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND ) -> list[dict[str, Any]]: pass @@ -179,7 +181,7 @@ def get_fields(cls) -> list[FieldItem]: @classmethod def search_entities_by_filters( - cls, filters: list[FilterItem], global_operator: str = "AND" + cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND ) -> list[dict[str, Any]]: """Search entities by the provided filters. @@ -215,7 +217,7 @@ def search_entities_by_filters( q_list.append(f"{f['field']}:[* TO *]") query = f" {global_operator} ".join(q_list) - query = f"type:\"{cls.entity_type}\" AND {query}" + query = f'type:"{cls.entity_type}" AND {query}' start = 0 rows = 1000 @@ -256,18 +258,9 @@ def get_fields(cls) -> list[FieldItem]: if fields := cls.get_fields_from_redis(): return fields - resource: model.Resource | None = ( - model.Session.query(model.Resource) - .join(model.Package) - .filter(model.Package.type == "dataset") - .first() - ) - - if not resource: - return [] - fields = [ - FieldItem(value=field, text=field) for field in resource.get_columns() + FieldItem(value=field, text=field) + for field in model.Resource.get_columns(extra_columns=True) ] cls.cache_fields_to_redis(fields) @@ -276,7 +269,7 @@ def get_fields(cls) -> list[FieldItem]: @classmethod def search_entities_by_filters( - cls, filters: list[FilterItem], global_operator: str = "AND" + cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND ) -> list[dict[str, Any]]: """Search for entities by the provided filters. @@ -293,13 +286,74 @@ def search_entities_by_filters( if operator not in supported_operators: raise ValueError(f"Operator {operator} not supported") - # TODO: throw away filters without value, because resource_search - # could throw a DatabaseError for an empty filter query - query = [f"{f['field']}:{f['value']}" for f in filters if f["value"]] + return cls.resource_search(filters, global_operator) + + @classmethod + def resource_search( + cls, filters: list[FilterItem], global_operator: str + ) -> list[dict[str, Any]]: + """This is a custom version of the core resource_search action. + + Some things we don't need were removed (hash search, term escaping), + and some things were added (exact search for core fields, working with + FitlerItem's). + + Note: Due to a Resource's extra fields being stored as a json blob, the + match is made against the json string representation. As such, false + positives may occur: + + If the search criteria is: :: + + query = "field1:term1" + + Then a json blob with the string representation of: :: + + {"field1": "foo", "field2": "term1"} + + will match the search criteria! This is a known short-coming of this + approach. + """ + context: types.Context = cast( + types.Context, + {"model": model, "session": model.Session, "user": tk.current_user}, + ) + + base_query = model.Session.query(model.Resource).join(model.Package) + queries = [] + + for filter_item in filters: + if not filter_item["value"]: + continue + + field = filter_item["field"] + value = filter_item["value"] + + # TODO: do we need escaping? The interface is available only for + # sysadmins, so it should be safe. + # value = misc.escape_sql_like_special_characters(value) + + if field in model.Resource.get_columns(): + queries.append(getattr(model.Resource, field) == value) + + # Resource extras are stored in a json blob. So searching for + # matching fields is a bit trickier. + else: + model_attr = cast(InstrumentedAttribute, model.Resource.extras) + + queries.append( + or_( + model_attr.ilike("""%%"%s": "%%%s%%",%%""" % (field, value)), + model_attr.ilike("""%%"%s": "%%%s%%"}""" % (field, value)), + ) + ) + + if not queries: + return [] + + op_func = and_ if global_operator == const.GLOBAL_AND else or_ + base_query = base_query.filter(op_func(*queries)) - return tk.get_action("resource_search")( - {"ignore_auth": True}, {"query": query, "include_private": True} - )["results"] + return model_dictize.resource_list_dictize(list(base_query), context) class GroupEntityManager(EntityManager): @@ -330,7 +384,7 @@ def get_fields(cls) -> list[FieldItem]: @classmethod def search_entities_by_filters( - cls, filters: list[FilterItem], global_operator: str = "AND" + cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND ) -> list[dict[str, Any]]: """Search entities by the provided filters. @@ -354,7 +408,7 @@ def search_entities_by_filters( filltered_items = [] combined_filters = cls.combine_filters(filters) - check_func = all if global_operator == "AND" else any + check_func = all if global_operator == const.GLOBAL_AND else any for item in item_list: add_item = False diff --git a/ckanext/bulk/logic/action.py b/ckanext/bulk/logic/action.py index 32a32ac..f34e890 100644 --- a/ckanext/bulk/logic/action.py +++ b/ckanext/bulk/logic/action.py @@ -28,9 +28,12 @@ def bulk_update_entity(context: Context, data_dict: dict[str, Any]): data_dict["entity_id"], data_dict["update_on"] ) except tk.ValidationError as e: - error = str(e) + error = str(e.error_dict) elif data_dict["action"] == "delete": - result = entity_manager.delete_entity(data_dict["entity_id"]) + try: + result = entity_manager.delete_entity(data_dict["entity_id"]) + except tk.ValidationError as e: + error = str(e.error_dict) else: error = "Action is not supported" diff --git a/ckanext/bulk/logic/schema.py b/ckanext/bulk/logic/schema.py index 1f34848..9ce0073 100644 --- a/ckanext/bulk/logic/schema.py +++ b/ckanext/bulk/logic/schema.py @@ -4,6 +4,7 @@ from ckan import types from ckan.logic.schema import validator_args +from ckanext.bulk import const from ckanext.bulk.entity_manager import get_entity_managers @@ -26,7 +27,16 @@ def bulk_get_entities_by_filters( "operator": [not_empty, unicode_safe, one_of(operators)], # type: ignore "value": [default(""), unicode_safe], # type: ignore }, - "global_operator": [not_empty, unicode_safe, one_of(["AND", "OR"])], # type: ignore + "global_operator": [ + not_empty, + unicode_safe, + one_of( + [ + const.GLOBAL_AND, + const.GLOBAL_OR, + ] + ), # type: ignore + ], } diff --git a/ckanext/bulk/tests/test_entity_managers.py b/ckanext/bulk/tests/test_entity_managers.py index f470541..280a680 100644 --- a/ckanext/bulk/tests/test_entity_managers.py +++ b/ckanext/bulk/tests/test_entity_managers.py @@ -2,6 +2,7 @@ import pytest +import ckan.model as model import ckan.plugins.toolkit as tk from ckanext.bulk import const @@ -357,6 +358,82 @@ def test_operator_is_not_supported( [{"field": "format", "operator": const.OP_CONTAINS, "value": "test"}] ) + @pytest.mark.parametrize( + ("field_name", "value"), + [ + ("name", "new_name"), + ("format", "new_format"), + ("url", "http://example.com"), + ("description", "test"), + ], + ) + def test_search_by_field( + self, field_name, value, dataset_resource_entity_manager, resource_factory + ): + resource_factory(**{field_name: value}) + + result = dataset_resource_entity_manager.search_entities_by_filters( + [{"field": field_name, "operator": const.OP_IS, "value": value}] + ) + + assert result + + def test_search_similar_titles( + self, dataset_resource_entity_manager, resource_factory + ): + resource_factory(name="csv data") + resource_factory(name="information csv") + + result = dataset_resource_entity_manager.search_entities_by_filters( + [{"field": "name", "operator": const.OP_IS, "value": "csv data"}] + ) + + assert len(result) == 1 + + def test_search_title_exact_match( + self, dataset_resource_entity_manager, resource_factory + ): + resource_factory(name="csv data") + + result = dataset_resource_entity_manager.search_entities_by_filters( + [{"field": "name", "operator": const.OP_IS, "value": "csv data"}] + ) + + assert len(result) == 1 + + result = dataset_resource_entity_manager.search_entities_by_filters( + [{"field": "name", "operator": const.OP_IS, "value": "csv"}] + ) + + assert not result + + def test_search_by_extra_field( + self, dataset_resource_entity_manager, resource_factory + ): + resource_factory(attribution="XXX111") + + result = dataset_resource_entity_manager.search_entities_by_filters( + [{"field": "attribution", "operator": const.OP_IS, "value": "XXX111"}] + ) + + assert result + + def test_search_with_or_operator( + self, dataset_resource_entity_manager, resource_factory + ): + resource_factory(format="CSV") + resource_factory(format="XLSX") + + result = dataset_resource_entity_manager.search_entities_by_filters( + [ + {"field": "format", "operator": const.OP_IS, "value": "CSV"}, + {"field": "format", "operator": const.OP_IS, "value": "XLSX"}, + ], + const.GLOBAL_OR, + ) + + assert result + @pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index") class TestDatasetEntityManagerUpdate: @@ -538,6 +615,7 @@ def test_delete_dataset(self, dataset_entity_manager, package_factory): dataset = package_factory() assert dataset_entity_manager.delete_entity(dataset["id"]) is True + assert model.Package.get(dataset["id"]).state == model.State.DELETED # type: ignore def test_delete_dataset_doesnt_exist(self, dataset_entity_manager, package_factory): package_factory() @@ -554,6 +632,7 @@ def test_delete_dataset_resource( resource = resource_factory() assert dataset_resource_entity_manager.delete_entity(resource["id"]) is True + assert model.Resource.get(resource["id"]).state == model.State.DELETED # type: ignore def test_delete_dataset_resource_doesnt_exist( self, dataset_resource_entity_manager, resource_factory @@ -570,6 +649,7 @@ def test_delete_group(self, group_entity_manager, group_factory): group = group_factory() assert group_entity_manager.delete_entity(group["id"]) is True + assert model.Group.get(group["id"]).state == model.State.DELETED # type: ignore def test_delete_group_doesnt_exist(self, group_entity_manager, group_factory): group_factory() @@ -586,6 +666,7 @@ def test_delete_organization( organization = organization_factory() assert organization_entity_manager.delete_entity(organization["id"]) is True + assert model.Group.get(organization["id"]).state == model.State.DELETED # type: ignore def test_delete_organization_doesnt_exist( self, organization_entity_manager, organization_factory diff --git a/pyproject.toml b/pyproject.toml index 3b4d7b0..28e5a32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,8 @@ ignore = [ ] "ckanext/bulk/entity_manager.py" = [ "C901", # we have kinda complex function here, but can't simplify it - "PLR0912" # too much if else statements, but it's ok here + "PLR0912", # too much if else statements, but it's ok here + "UP031", # ilike search is used here ] [tool.ruff.lint.flake8-import-conventions.aliases] diff --git a/setup.cfg b/setup.cfg index 49d5171..9e2d663 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = ckanext-bulk -version = 0.1.5 +version = 0.2.0 description = Bulk update for CKAN datasets/resources long_description = file: README.md long_description_content_type = text/markdown