rewrite dataset resource search

DataShades · Jul 30, 2024 · 71ead56 · 71ead56
1 parent f75e345
commit 71ead56
Show file tree

Hide file tree

Showing 7 changed files with 183 additions and 31 deletions.
diff --git a/ckanext/bulk/const.py b/ckanext/bulk/const.py
@@ -6,3 +6,6 @@
 OP_ENDS_WITH = "ends_with"
 OP_IS_EMPTY = "is_empty"
 OP_IS_NOT_EMPTY = "is_not_empty"
+
+GLOBAL_AND = "AND"
+GLOBAL_OR = "OR"
diff --git a/ckanext/bulk/entity_manager.py b/ckanext/bulk/entity_manager.py
@@ -2,12 +2,14 @@
 
 import json
 from abc import abstractmethod
-from typing import Any, TypedDict
+from typing import Any, TypedDict, cast
 
-from attr import field
+from sqlalchemy import and_, or_
+from sqlalchemy.orm.attributes import InstrumentedAttribute
 
 import ckan.plugins.toolkit as tk
-from ckan import model
+from ckan import model, types
+from ckan.lib.dictization import model_dictize
 from ckan.lib.redis import connect_to_redis
 
 from ckanext.bulk import const
@@ -54,7 +56,7 @@ def get_fields(cls) -> list[FieldItem]:
     @classmethod
     @abstractmethod
     def search_entities_by_filters(
-        cls, filters: list[FilterItem], global_operator: str = "AND"
+        cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
     ) -> list[dict[str, Any]]:
         pass
 
@@ -179,7 +181,7 @@ def get_fields(cls) -> list[FieldItem]:
 
     @classmethod
     def search_entities_by_filters(
-        cls, filters: list[FilterItem], global_operator: str = "AND"
+        cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
     ) -> list[dict[str, Any]]:
         """Search entities by the provided filters.
 
@@ -215,7 +217,7 @@ def search_entities_by_filters(
                 q_list.append(f"{f['field']}:[* TO *]")
 
         query = f" {global_operator} ".join(q_list)
-        query = f"type:\"{cls.entity_type}\" AND {query}"
+        query = f'type:"{cls.entity_type}" AND {query}'
 
         start = 0
         rows = 1000
@@ -256,18 +258,9 @@ def get_fields(cls) -> list[FieldItem]:
         if fields := cls.get_fields_from_redis():
             return fields
 
-        resource: model.Resource | None = (
-            model.Session.query(model.Resource)
-            .join(model.Package)
-            .filter(model.Package.type == "dataset")
-            .first()
-        )
-
-        if not resource:
-            return []
-
         fields = [
-            FieldItem(value=field, text=field) for field in resource.get_columns()
+            FieldItem(value=field, text=field)
+            for field in model.Resource.get_columns(extra_columns=True)
         ]
 
         cls.cache_fields_to_redis(fields)
@@ -276,7 +269,7 @@ def get_fields(cls) -> list[FieldItem]:
 
     @classmethod
     def search_entities_by_filters(
-        cls, filters: list[FilterItem], global_operator: str = "AND"
+        cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
     ) -> list[dict[str, Any]]:
         """Search for entities by the provided filters.
 
@@ -293,13 +286,74 @@ def search_entities_by_filters(
             if operator not in supported_operators:
                 raise ValueError(f"Operator {operator} not supported")
 
-        # TODO: throw away filters without value, because resource_search
-        # could throw a DatabaseError for an empty filter query
-        query = [f"{f['field']}:{f['value']}" for f in filters if f["value"]]
+        return cls.resource_search(filters, global_operator)
+
+    @classmethod
+    def resource_search(
+        cls, filters: list[FilterItem], global_operator: str
+    ) -> list[dict[str, Any]]:
+        """This is a custom version of the core resource_search action.
+
+        Some things we don't need were removed (hash search, term escaping),
+        and some things were added (exact search for core fields, working with
+        FitlerItem's).
+
+        Note: Due to a Resource's extra fields being stored as a json blob, the
+        match is made against the json string representation.  As such, false
+        positives may occur:
+
+        If the search criteria is: ::
+
+            query = "field1:term1"
+
+        Then a json blob with the string representation of: ::
+
+            {"field1": "foo", "field2": "term1"}
+
+        will match the search criteria!  This is a known short-coming of this
+        approach.
+        """
+        context: types.Context = cast(
+            types.Context,
+            {"model": model, "session": model.Session, "user": tk.current_user},
+        )
+
+        base_query = model.Session.query(model.Resource).join(model.Package)
+        queries = []
+
+        for filter_item in filters:
+            if not filter_item["value"]:
+                continue
+
+            field = filter_item["field"]
+            value = filter_item["value"]
+
+            # TODO: do we need escaping? The interface is available only for
+            # sysadmins, so it should be safe.
+            # value = misc.escape_sql_like_special_characters(value)
+
+            if field in model.Resource.get_columns():
+                queries.append(getattr(model.Resource, field) == value)
+
+            # Resource extras are stored in a json blob.  So searching for
+            # matching fields is a bit trickier.
+            else:
+                model_attr = cast(InstrumentedAttribute, model.Resource.extras)
+
+                queries.append(
+                    or_(
+                        model_attr.ilike("""%%"%s": "%%%s%%",%%""" % (field, value)),
+                        model_attr.ilike("""%%"%s": "%%%s%%"}""" % (field, value)),
+                    )
+                )
+
+        if not queries:
+            return []
+
+        op_func = and_ if global_operator == const.GLOBAL_AND else or_
+        base_query = base_query.filter(op_func(*queries))
 
-        return tk.get_action("resource_search")(
-            {"ignore_auth": True}, {"query": query, "include_private": True}
-        )["results"]
+        return model_dictize.resource_list_dictize(list(base_query), context)
 
 
 class GroupEntityManager(EntityManager):
@@ -330,7 +384,7 @@ def get_fields(cls) -> list[FieldItem]:
 
     @classmethod
     def search_entities_by_filters(
-        cls, filters: list[FilterItem], global_operator: str = "AND"
+        cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
     ) -> list[dict[str, Any]]:
         """Search entities by the provided filters.
 
@@ -354,7 +408,7 @@ def search_entities_by_filters(
 
         filltered_items = []
         combined_filters = cls.combine_filters(filters)
-        check_func = all if global_operator == "AND" else any
+        check_func = all if global_operator == const.GLOBAL_AND else any
 
         for item in item_list:
             add_item = False

diff --git a/ckanext/bulk/logic/action.py b/ckanext/bulk/logic/action.py
@@ -28,9 +28,12 @@ def bulk_update_entity(context: Context, data_dict: dict[str, Any]):
                 data_dict["entity_id"], data_dict["update_on"]
             )
         except tk.ValidationError as e:
-            error = str(e)
+            error = str(e.error_dict)
     elif data_dict["action"] == "delete":
-        result = entity_manager.delete_entity(data_dict["entity_id"])
+        try:
+            result = entity_manager.delete_entity(data_dict["entity_id"])
+        except tk.ValidationError as e:
+            error = str(e.error_dict)
     else:
         error = "Action is not supported"
 

diff --git a/ckanext/bulk/logic/schema.py b/ckanext/bulk/logic/schema.py
@@ -4,6 +4,7 @@
 from ckan import types
 from ckan.logic.schema import validator_args
 
+from ckanext.bulk import const
 from ckanext.bulk.entity_manager import get_entity_managers
 
 
@@ -26,7 +27,16 @@ def bulk_get_entities_by_filters(
             "operator": [not_empty, unicode_safe, one_of(operators)],  # type: ignore
             "value": [default(""), unicode_safe],  # type: ignore
         },
-        "global_operator": [not_empty, unicode_safe, one_of(["AND", "OR"])],  # type: ignore
+        "global_operator": [
+            not_empty,
+            unicode_safe,
+            one_of(
+                [
+                    const.GLOBAL_AND,
+                    const.GLOBAL_OR,
+                ]
+            ),  # type: ignore
+        ],
     }
 
 

diff --git a/ckanext/bulk/tests/test_entity_managers.py b/ckanext/bulk/tests/test_entity_managers.py
@@ -2,6 +2,7 @@
 
 import pytest
 
+import ckan.model as model
 import ckan.plugins.toolkit as tk
 
 from ckanext.bulk import const
@@ -357,6 +358,82 @@ def test_operator_is_not_supported(
                 [{"field": "format", "operator": const.OP_CONTAINS, "value": "test"}]
             )
 
+    @pytest.mark.parametrize(
+        ("field_name", "value"),
+        [
+            ("name", "new_name"),
+            ("format", "new_format"),
+            ("url", "http://example.com"),
+            ("description", "test"),
+        ],
+    )
+    def test_search_by_field(
+        self, field_name, value, dataset_resource_entity_manager, resource_factory
+    ):
+        resource_factory(**{field_name: value})
+
+        result = dataset_resource_entity_manager.search_entities_by_filters(
+            [{"field": field_name, "operator": const.OP_IS, "value": value}]
+        )
+
+        assert result
+
+    def test_search_similar_titles(
+        self, dataset_resource_entity_manager, resource_factory
+    ):
+        resource_factory(name="csv data")
+        resource_factory(name="information csv")
+
+        result = dataset_resource_entity_manager.search_entities_by_filters(
+            [{"field": "name", "operator": const.OP_IS, "value": "csv data"}]
+        )
+
+        assert len(result) == 1
+
+    def test_search_title_exact_match(
+        self, dataset_resource_entity_manager, resource_factory
+    ):
+        resource_factory(name="csv data")
+
+        result = dataset_resource_entity_manager.search_entities_by_filters(
+            [{"field": "name", "operator": const.OP_IS, "value": "csv data"}]
+        )
+
+        assert len(result) == 1
+
+        result = dataset_resource_entity_manager.search_entities_by_filters(
+            [{"field": "name", "operator": const.OP_IS, "value": "csv"}]
+        )
+
+        assert not result
+
+    def test_search_by_extra_field(
+        self, dataset_resource_entity_manager, resource_factory
+    ):
+        resource_factory(attribution="XXX111")
+
+        result = dataset_resource_entity_manager.search_entities_by_filters(
+            [{"field": "attribution", "operator": const.OP_IS, "value": "XXX111"}]
+        )
+
+        assert result
+
+    def test_search_with_or_operator(
+        self, dataset_resource_entity_manager, resource_factory
+    ):
+        resource_factory(format="CSV")
+        resource_factory(format="XLSX")
+
+        result = dataset_resource_entity_manager.search_entities_by_filters(
+            [
+                {"field": "format", "operator": const.OP_IS, "value": "CSV"},
+                {"field": "format", "operator": const.OP_IS, "value": "XLSX"},
+            ],
+            const.GLOBAL_OR,
+        )
+
+        assert result
+
 
 @pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index")
 class TestDatasetEntityManagerUpdate:
@@ -538,6 +615,7 @@ def test_delete_dataset(self, dataset_entity_manager, package_factory):
         dataset = package_factory()
 
         assert dataset_entity_manager.delete_entity(dataset["id"]) is True
+        assert model.Package.get(dataset["id"]).state == model.State.DELETED  # type: ignore
 
     def test_delete_dataset_doesnt_exist(self, dataset_entity_manager, package_factory):
         package_factory()
@@ -554,6 +632,7 @@ def test_delete_dataset_resource(
         resource = resource_factory()
 
         assert dataset_resource_entity_manager.delete_entity(resource["id"]) is True
+        assert model.Resource.get(resource["id"]).state == model.State.DELETED  # type: ignore
 
     def test_delete_dataset_resource_doesnt_exist(
         self, dataset_resource_entity_manager, resource_factory
@@ -570,6 +649,7 @@ def test_delete_group(self, group_entity_manager, group_factory):
         group = group_factory()
 
         assert group_entity_manager.delete_entity(group["id"]) is True
+        assert model.Group.get(group["id"]).state == model.State.DELETED  # type: ignore
 
     def test_delete_group_doesnt_exist(self, group_entity_manager, group_factory):
         group_factory()
@@ -586,6 +666,7 @@ def test_delete_organization(
         organization = organization_factory()
 
         assert organization_entity_manager.delete_entity(organization["id"]) is True
+        assert model.Group.get(organization["id"]).state == model.State.DELETED  # type: ignore
 
     def test_delete_organization_doesnt_exist(
         self, organization_entity_manager, organization_factory

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,7 +51,8 @@ ignore = [
 ]
 "ckanext/bulk/entity_manager.py" = [
     "C901", # we have kinda complex function here, but can't simplify it
-    "PLR0912" # too much if else statements, but it's ok here
+    "PLR0912", # too much if else statements, but it's ok here
+    "UP031", # ilike search is used here
 ]
 
 [tool.ruff.lint.flake8-import-conventions.aliases]

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = ckanext-bulk
-version = 0.1.5
+version = 0.2.0
 description = Bulk update for CKAN datasets/resources
 long_description = file: README.md
 long_description_content_type = text/markdown