Skip to content

Commit

Permalink
rewrite dataset resource search
Browse files Browse the repository at this point in the history
  • Loading branch information
mutantsan committed Jul 30, 2024
1 parent f75e345 commit 71ead56
Show file tree
Hide file tree
Showing 7 changed files with 183 additions and 31 deletions.
3 changes: 3 additions & 0 deletions ckanext/bulk/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@
OP_ENDS_WITH = "ends_with"
OP_IS_EMPTY = "is_empty"
OP_IS_NOT_EMPTY = "is_not_empty"

GLOBAL_AND = "AND"
GLOBAL_OR = "OR"
106 changes: 80 additions & 26 deletions ckanext/bulk/entity_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

import json
from abc import abstractmethod
from typing import Any, TypedDict
from typing import Any, TypedDict, cast

from attr import field
from sqlalchemy import and_, or_
from sqlalchemy.orm.attributes import InstrumentedAttribute

import ckan.plugins.toolkit as tk
from ckan import model
from ckan import model, types
from ckan.lib.dictization import model_dictize
from ckan.lib.redis import connect_to_redis

from ckanext.bulk import const
Expand Down Expand Up @@ -54,7 +56,7 @@ def get_fields(cls) -> list[FieldItem]:
@classmethod
@abstractmethod
def search_entities_by_filters(
cls, filters: list[FilterItem], global_operator: str = "AND"
cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
) -> list[dict[str, Any]]:
pass

Expand Down Expand Up @@ -179,7 +181,7 @@ def get_fields(cls) -> list[FieldItem]:

@classmethod
def search_entities_by_filters(
cls, filters: list[FilterItem], global_operator: str = "AND"
cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
) -> list[dict[str, Any]]:
"""Search entities by the provided filters.
Expand Down Expand Up @@ -215,7 +217,7 @@ def search_entities_by_filters(
q_list.append(f"{f['field']}:[* TO *]")

query = f" {global_operator} ".join(q_list)
query = f"type:\"{cls.entity_type}\" AND {query}"
query = f'type:"{cls.entity_type}" AND {query}'

start = 0
rows = 1000
Expand Down Expand Up @@ -256,18 +258,9 @@ def get_fields(cls) -> list[FieldItem]:
if fields := cls.get_fields_from_redis():
return fields

resource: model.Resource | None = (
model.Session.query(model.Resource)
.join(model.Package)
.filter(model.Package.type == "dataset")
.first()
)

if not resource:
return []

fields = [
FieldItem(value=field, text=field) for field in resource.get_columns()
FieldItem(value=field, text=field)
for field in model.Resource.get_columns(extra_columns=True)
]

cls.cache_fields_to_redis(fields)
Expand All @@ -276,7 +269,7 @@ def get_fields(cls) -> list[FieldItem]:

@classmethod
def search_entities_by_filters(
cls, filters: list[FilterItem], global_operator: str = "AND"
cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
) -> list[dict[str, Any]]:
"""Search for entities by the provided filters.
Expand All @@ -293,13 +286,74 @@ def search_entities_by_filters(
if operator not in supported_operators:
raise ValueError(f"Operator {operator} not supported")

# TODO: throw away filters without value, because resource_search
# could throw a DatabaseError for an empty filter query
query = [f"{f['field']}:{f['value']}" for f in filters if f["value"]]
return cls.resource_search(filters, global_operator)

@classmethod
def resource_search(
cls, filters: list[FilterItem], global_operator: str
) -> list[dict[str, Any]]:
"""This is a custom version of the core resource_search action.
Some things we don't need were removed (hash search, term escaping),
and some things were added (exact search for core fields, working with
FitlerItem's).
Note: Due to a Resource's extra fields being stored as a json blob, the
match is made against the json string representation. As such, false
positives may occur:
If the search criteria is: ::
query = "field1:term1"
Then a json blob with the string representation of: ::
{"field1": "foo", "field2": "term1"}
will match the search criteria! This is a known short-coming of this
approach.
"""
context: types.Context = cast(
types.Context,
{"model": model, "session": model.Session, "user": tk.current_user},
)

base_query = model.Session.query(model.Resource).join(model.Package)
queries = []

for filter_item in filters:
if not filter_item["value"]:
continue

field = filter_item["field"]
value = filter_item["value"]

# TODO: do we need escaping? The interface is available only for
# sysadmins, so it should be safe.
# value = misc.escape_sql_like_special_characters(value)

if field in model.Resource.get_columns():
queries.append(getattr(model.Resource, field) == value)

# Resource extras are stored in a json blob. So searching for
# matching fields is a bit trickier.
else:
model_attr = cast(InstrumentedAttribute, model.Resource.extras)

queries.append(
or_(
model_attr.ilike("""%%"%s": "%%%s%%",%%""" % (field, value)),
model_attr.ilike("""%%"%s": "%%%s%%"}""" % (field, value)),
)
)

if not queries:
return []

op_func = and_ if global_operator == const.GLOBAL_AND else or_
base_query = base_query.filter(op_func(*queries))

return tk.get_action("resource_search")(
{"ignore_auth": True}, {"query": query, "include_private": True}
)["results"]
return model_dictize.resource_list_dictize(list(base_query), context)


class GroupEntityManager(EntityManager):
Expand Down Expand Up @@ -330,7 +384,7 @@ def get_fields(cls) -> list[FieldItem]:

@classmethod
def search_entities_by_filters(
cls, filters: list[FilterItem], global_operator: str = "AND"
cls, filters: list[FilterItem], global_operator: str = const.GLOBAL_AND
) -> list[dict[str, Any]]:
"""Search entities by the provided filters.
Expand All @@ -354,7 +408,7 @@ def search_entities_by_filters(

filltered_items = []
combined_filters = cls.combine_filters(filters)
check_func = all if global_operator == "AND" else any
check_func = all if global_operator == const.GLOBAL_AND else any

for item in item_list:
add_item = False
Expand Down
7 changes: 5 additions & 2 deletions ckanext/bulk/logic/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@ def bulk_update_entity(context: Context, data_dict: dict[str, Any]):
data_dict["entity_id"], data_dict["update_on"]
)
except tk.ValidationError as e:
error = str(e)
error = str(e.error_dict)
elif data_dict["action"] == "delete":
result = entity_manager.delete_entity(data_dict["entity_id"])
try:
result = entity_manager.delete_entity(data_dict["entity_id"])
except tk.ValidationError as e:
error = str(e.error_dict)
else:
error = "Action is not supported"

Expand Down
12 changes: 11 additions & 1 deletion ckanext/bulk/logic/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from ckan import types
from ckan.logic.schema import validator_args

from ckanext.bulk import const
from ckanext.bulk.entity_manager import get_entity_managers


Expand All @@ -26,7 +27,16 @@ def bulk_get_entities_by_filters(
"operator": [not_empty, unicode_safe, one_of(operators)], # type: ignore
"value": [default(""), unicode_safe], # type: ignore
},
"global_operator": [not_empty, unicode_safe, one_of(["AND", "OR"])], # type: ignore
"global_operator": [
not_empty,
unicode_safe,
one_of(
[
const.GLOBAL_AND,
const.GLOBAL_OR,
]
), # type: ignore
],
}


Expand Down
81 changes: 81 additions & 0 deletions ckanext/bulk/tests/test_entity_managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pytest

import ckan.model as model
import ckan.plugins.toolkit as tk

from ckanext.bulk import const
Expand Down Expand Up @@ -357,6 +358,82 @@ def test_operator_is_not_supported(
[{"field": "format", "operator": const.OP_CONTAINS, "value": "test"}]
)

@pytest.mark.parametrize(
("field_name", "value"),
[
("name", "new_name"),
("format", "new_format"),
("url", "http://example.com"),
("description", "test"),
],
)
def test_search_by_field(
self, field_name, value, dataset_resource_entity_manager, resource_factory
):
resource_factory(**{field_name: value})

result = dataset_resource_entity_manager.search_entities_by_filters(
[{"field": field_name, "operator": const.OP_IS, "value": value}]
)

assert result

def test_search_similar_titles(
self, dataset_resource_entity_manager, resource_factory
):
resource_factory(name="csv data")
resource_factory(name="information csv")

result = dataset_resource_entity_manager.search_entities_by_filters(
[{"field": "name", "operator": const.OP_IS, "value": "csv data"}]
)

assert len(result) == 1

def test_search_title_exact_match(
self, dataset_resource_entity_manager, resource_factory
):
resource_factory(name="csv data")

result = dataset_resource_entity_manager.search_entities_by_filters(
[{"field": "name", "operator": const.OP_IS, "value": "csv data"}]
)

assert len(result) == 1

result = dataset_resource_entity_manager.search_entities_by_filters(
[{"field": "name", "operator": const.OP_IS, "value": "csv"}]
)

assert not result

def test_search_by_extra_field(
self, dataset_resource_entity_manager, resource_factory
):
resource_factory(attribution="XXX111")

result = dataset_resource_entity_manager.search_entities_by_filters(
[{"field": "attribution", "operator": const.OP_IS, "value": "XXX111"}]
)

assert result

def test_search_with_or_operator(
self, dataset_resource_entity_manager, resource_factory
):
resource_factory(format="CSV")
resource_factory(format="XLSX")

result = dataset_resource_entity_manager.search_entities_by_filters(
[
{"field": "format", "operator": const.OP_IS, "value": "CSV"},
{"field": "format", "operator": const.OP_IS, "value": "XLSX"},
],
const.GLOBAL_OR,
)

assert result


@pytest.mark.usefixtures("with_plugins", "clean_db", "clean_index")
class TestDatasetEntityManagerUpdate:
Expand Down Expand Up @@ -538,6 +615,7 @@ def test_delete_dataset(self, dataset_entity_manager, package_factory):
dataset = package_factory()

assert dataset_entity_manager.delete_entity(dataset["id"]) is True
assert model.Package.get(dataset["id"]).state == model.State.DELETED # type: ignore

def test_delete_dataset_doesnt_exist(self, dataset_entity_manager, package_factory):
package_factory()
Expand All @@ -554,6 +632,7 @@ def test_delete_dataset_resource(
resource = resource_factory()

assert dataset_resource_entity_manager.delete_entity(resource["id"]) is True
assert model.Resource.get(resource["id"]).state == model.State.DELETED # type: ignore

def test_delete_dataset_resource_doesnt_exist(
self, dataset_resource_entity_manager, resource_factory
Expand All @@ -570,6 +649,7 @@ def test_delete_group(self, group_entity_manager, group_factory):
group = group_factory()

assert group_entity_manager.delete_entity(group["id"]) is True
assert model.Group.get(group["id"]).state == model.State.DELETED # type: ignore

def test_delete_group_doesnt_exist(self, group_entity_manager, group_factory):
group_factory()
Expand All @@ -586,6 +666,7 @@ def test_delete_organization(
organization = organization_factory()

assert organization_entity_manager.delete_entity(organization["id"]) is True
assert model.Group.get(organization["id"]).state == model.State.DELETED # type: ignore

def test_delete_organization_doesnt_exist(
self, organization_entity_manager, organization_factory
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ ignore = [
]
"ckanext/bulk/entity_manager.py" = [
"C901", # we have kinda complex function here, but can't simplify it
"PLR0912" # too much if else statements, but it's ok here
"PLR0912", # too much if else statements, but it's ok here
"UP031", # ilike search is used here
]

[tool.ruff.lint.flake8-import-conventions.aliases]
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = ckanext-bulk
version = 0.1.5
version = 0.2.0
description = Bulk update for CKAN datasets/resources
long_description = file: README.md
long_description_content_type = text/markdown
Expand Down

0 comments on commit 71ead56

Please sign in to comment.