Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store content object collections in search index [FC-0062] #680

Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions openedx/core/djangoapps/content/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
Fields.context_key,
Fields.org,
Fields.tags,
Fields.tags + "." + Fields.tags_collections,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Expand All @@ -339,6 +340,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
# are searchable only if at least one document in the index has a value. If we didn't list them here and,
# say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
# these sub-fields: "Attribute `tags.level3` is not searchable."
Fields.tags + "." + Fields.tags_collections,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Expand Down
35 changes: 24 additions & 11 deletions openedx/core/djangoapps/content/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
from hashlib import blake2b

from django.utils.text import slugify
from django.core.exceptions import ObjectDoesNotExist
from opaque_keys.edx.keys import LearningContextKey, UsageKey
from openedx_learning.api import authoring as authoring_api

from openedx.core.djangoapps.content.search.models import SearchAccess
from openedx.core.djangoapps.content_libraries import api as lib_api
Expand Down Expand Up @@ -52,6 +54,7 @@ class Fields:
tags_level1 = "level1"
tags_level2 = "level2"
tags_level3 = "level3"
tags_collections = "collections" # subfield of tags, i.e. tags.collections
# The "content" field is a dictionary of arbitrary data, depending on the block_type.
# It comes from each XBlock's index_dictionary() method (if present) plus some processing.
# Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on.
Expand Down Expand Up @@ -164,10 +167,11 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:

See the comments above on "Field.tags" for an explanation of the format.

e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver" this
would return:
e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver",
and in Collections 3 and 4, this would return:
{
"tags": {
"collections": [3, 4],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm still not thrilled with storing collections together with tags. If we store it in the same property, I think we couldn't use the reason field to do partial updates inside this doc property (although we can do partial updates at the doc level).

Despite being synchronous, I will not object because these operations are cheap. I'm just failing to see any advantage in this approach. 🤔

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rpenido Noted..

I'm still not thrilled with storing collections together with tags. If we store it in the same property, I think we couldn't use the reason field to do partial updates inside this doc property (although we can do partial updates at the doc level).

@bradenmacdonald What do you think? Should we be storing collections in the tags index structure, or should it be its own field?

Despite being synchronous, I will not object because these operations are cheap. I'm just failing to see any advantage in this approach. 🤔

I'm pretty sure it's asynchronous outside of the devstack -- events are handled by the "worker" processes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would make it it's own field, unless that's a difficult change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do, it's not difficult at all.

"taxonomy": ["Location", "Difficulty"],
"level0": ["Location > North America", "Difficulty > Hard"],
"level1": ["Location > North America > Canada"],
Expand All @@ -182,21 +186,16 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
strings in a particular format that the frontend knows how to render to
support hierarchical refinement by tag.
"""
result = {}

# Note that we could improve performance for indexing many components from the same library/course,
# if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the
# tags for each component separately.
all_tags = tagging_api.get_object_tags(str(object_id)).all()
if not all_tags:
# Clear out tags in the index when unselecting all tags for the block, otherwise
# it would remain the last value if a cleared Fields.tags field is not included
return {Fields.tags: {}}
result = {
Fields.tags_taxonomy: [],
Fields.tags_level0: [],
# ... other levels added as needed
}
for obj_tag in all_tags:
# Add the taxonomy name:
if Fields.tags_taxonomy not in result:
result[Fields.tags_taxonomy] = []
if obj_tag.taxonomy.name not in result[Fields.tags_taxonomy]:
result[Fields.tags_taxonomy].append(obj_tag.taxonomy.name)
# Taxonomy name plus each level of tags, in a list: # e.g. ["Location", "North America", "Canada", "Vancouver"]
Expand All @@ -220,6 +219,20 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
if len(parts) == level + 2:
break # We have all the levels for this tag now (e.g. parts=["Difficulty", "Hard"] -> need level0 only)

# Gather the collections associated with this object
collections = []
try:
component = lib_api.get_component_from_usage_key(object_id)
collections = authoring_api.get_entity_collections(
component.learning_package_id,
component.key,
).values_list("key", flat=True)
except ObjectDoesNotExist:
log.warning(f"No component found for {object_id}")

if collections:
result[Fields.tags_collections] = list(collections)

return {Fields.tags: result}


Expand Down
28 changes: 17 additions & 11 deletions openedx/core/djangoapps/content/search/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,14 @@

from django.db.models.signals import post_delete
from django.dispatch import receiver
from openedx_events.content_authoring.data import ContentLibraryData, ContentObjectData, LibraryBlockData, XBlockData
from openedx_events.content_authoring.data import (
ContentLibraryData,
ContentObjectChangedData,
LibraryBlockData,
XBlockData,
)
from opaque_keys import InvalidKeyError
from opaque_keys.edx.keys import UsageKey
from openedx_events.content_authoring.signals import (
CONTENT_LIBRARY_DELETED,
CONTENT_LIBRARY_UPDATED,
Expand All @@ -16,9 +23,8 @@
XBLOCK_CREATED,
XBLOCK_DELETED,
XBLOCK_UPDATED,
CONTENT_OBJECT_TAGS_CHANGED,
CONTENT_OBJECT_ASSOCIATIONS_CHANGED,
)
from openedx.core.djangoapps.content_tagging.utils import get_content_key_from_string

from openedx.core.djangoapps.content.course_overviews.models import CourseOverview
from openedx.core.djangoapps.content.search.models import SearchAccess
Expand Down Expand Up @@ -139,22 +145,22 @@ def content_library_updated_handler(**kwargs) -> None:
update_content_library_index_docs.delay(str(content_library_data.library_key))


@receiver(CONTENT_OBJECT_TAGS_CHANGED)
@receiver(CONTENT_OBJECT_ASSOCIATIONS_CHANGED)
@only_if_meilisearch_enabled
def content_object_tags_changed_handler(**kwargs) -> None:
def content_object_associations_changed_handler(**kwargs) -> None:
"""
Update the tags data in the index for the Content Object
Update the collections/tags data in the index for the Content Object
"""
content_object_tags = kwargs.get("content_object", None)
if not content_object_tags or not isinstance(content_object_tags, ContentObjectData):
content_object = kwargs.get("content_object", None)
if not content_object or not isinstance(content_object, ContentObjectChangedData):
log.error("Received null or incorrect data for event")
return

try:
# Check if valid if course or library block
get_content_key_from_string(content_object_tags.object_id)
except ValueError:
usage_key = UsageKey.from_string(str(content_object.object_id))
except InvalidKeyError:
log.error("Received invalid content object id")
return

upsert_block_tags_index_docs(content_object_tags.object_id)
upsert_block_tags_index_docs(usage_key)
46 changes: 45 additions & 1 deletion openedx/core/djangoapps/content/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def setUp(self):
tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three")
tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four")

# Create a collection:
# Create collections:
self.learning_package = authoring_api.get_learning_package_by_key(self.library.key)
with freeze_time(created_date):
self.collection = authoring_api.create_collection(
Expand Down Expand Up @@ -379,11 +379,37 @@ def test_index_library_block_tags(self, mock_meilisearch):
"""
Test indexing an Library Block with tags.
"""
collection1 = authoring_api.create_collection(
learning_package_id=self.library.learning_package.id,
key="COL1",
title="Collection 1",
created_by=None,
description="First Collection",
)

collection2 = authoring_api.create_collection(
learning_package_id=self.library.learning_package.id,
key="COL2",
title="Collection 2",
created_by=None,
description="Second Collection",
)

# Tag XBlock (these internally call `upsert_block_tags_index_docs`)
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyA, ["one", "two"])
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyB, ["three", "four"])

# Add Problem1 to both Collections (these internally call `upsert_block_tags_index_docs`)
# (adding in reverse order to test sorting of collection tag))
for collection in (collection2, collection1):
library_api.update_library_collection_components(
self.library.key,
collection_key=collection.key,
usage_keys=[
self.problem1.usage_key,
],
)

# Build expected docs with tags at each stage
doc_problem_with_tags1 = {
"id": self.doc_problem1["id"],
Expand All @@ -399,11 +425,29 @@ def test_index_library_block_tags(self, mock_meilisearch):
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}
doc_problem_with_collection2 = {
"id": self.doc_problem1["id"],
"tags": {
'collections': [collection2.key],
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}
doc_problem_with_collection1 = {
"id": self.doc_problem1["id"],
"tags": {
'collections': [collection1.key, collection2.key],
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}

mock_meilisearch.return_value.index.return_value.update_documents.assert_has_calls(
[
call([doc_problem_with_tags1]),
call([doc_problem_with_tags2]),
call([doc_problem_with_collection2]),
call([doc_problem_with_collection1]),
],
any_order=True,
)
Expand Down
105 changes: 102 additions & 3 deletions openedx/core/djangoapps/content/search/tests/test_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,26 @@
from openedx_learning.api import authoring as authoring_api

from openedx.core.djangoapps.content_tagging import api as tagging_api
from openedx.core.djangoapps.content_libraries import api as library_api
from openedx.core.djangolib.testing.utils import skip_unless_cms
from xmodule.modulestore.django import modulestore
from xmodule.modulestore.tests.django_utils import SharedModuleStoreTestCase
from xmodule.modulestore.tests.factories import BlockFactory, ToyCourseFactory

try:
# This import errors in the lms because content.search is not an installed app there.
from ..documents import searchable_doc_for_course_block, searchable_doc_tags, searchable_doc_for_collection
from ..documents import (
searchable_doc_for_course_block,
searchable_doc_tags,
searchable_doc_for_collection,
searchable_doc_for_library_block,
)
from ..models import SearchAccess
except RuntimeError:
searchable_doc_for_course_block = lambda x: x
searchable_doc_tags = lambda x: x
searchable_doc_for_collection = lambda x: x
searchable_doc_for_library_block = lambda x: x
SearchAccess = {}


Expand All @@ -38,21 +45,52 @@ class StudioDocumentsTest(SharedModuleStoreTestCase):
def setUpClass(cls):
super().setUpClass()
cls.store = modulestore()
cls.org = Organization.objects.create(name="edX", short_name="edX")
cls.toy_course = ToyCourseFactory.create() # See xmodule/modulestore/tests/sample_courses.py
cls.toy_course_key = cls.toy_course.id

# Get references to some blocks in the toy course
cls.html_block_key = cls.toy_course_key.make_usage_key("html", "toyjumpto")
# Create a problem in library
# Create a problem in course
cls.problem_block = BlockFactory.create(
category="problem",
parent_location=cls.toy_course_key.make_usage_key("vertical", "vertical_test"),
display_name='Test Problem',
data="<problem>What is a test?<multiplechoiceresponse></multiplechoiceresponse></problem>",
)

# Create a library and collection with a block
created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)
with freeze_time(created_date):
cls.library = library_api.create_library(
org=cls.org,
slug="2012_Fall",
title="some content_library",
description="some description",
)
cls.collection = library_api.create_library_collection(
cls.library.key,
collection_key="TOY_COLLECTION",
title="Toy Collection",
created_by=None,
description="my toy collection description"
)
cls.library_block = library_api.create_library_block(
cls.library.key,
"html",
"text2",
)

# Add the problem block to the collection
library_api.update_library_collection_components(
cls.library.key,
collection_key="TOY_COLLECTION",
usage_keys=[
cls.library_block.usage_key,
]
)

# Create a couple taxonomies and some tags
cls.org = Organization.objects.create(name="edX", short_name="edX")
cls.difficulty_tags = tagging_api.create_taxonomy(name="Difficulty", orgs=[cls.org], allow_multiple=False)
tagging_api.add_tag_to_taxonomy(cls.difficulty_tags, tag="Easy")
tagging_api.add_tag_to_taxonomy(cls.difficulty_tags, tag="Normal")
Expand All @@ -69,6 +107,7 @@ def setUpClass(cls):
tagging_api.tag_object(str(cls.problem_block.usage_key), cls.difficulty_tags, tags=["Easy"])
tagging_api.tag_object(str(cls.html_block_key), cls.subject_tags, tags=["Chinese", "Jump Links"])
tagging_api.tag_object(str(cls.html_block_key), cls.difficulty_tags, tags=["Normal"])
tagging_api.tag_object(str(cls.library_block.usage_key), cls.difficulty_tags, tags=["Normal"])

@property
def toy_course_access_id(self):
Expand All @@ -80,6 +119,16 @@ def toy_course_access_id(self):
"""
return SearchAccess.objects.get(context_key=self.toy_course_key).id

@property
def library_access_id(self):
"""
Returns the SearchAccess.id created for the library.

This SearchAccess object is created when documents are added to the search index, so this method must be called
after this step, or risk a DoesNotExist error.
"""
return SearchAccess.objects.get(context_key=self.library.key).id

def test_problem_block(self):
"""
Test how a problem block gets represented in the search index
Expand Down Expand Up @@ -205,6 +254,56 @@ def test_video_block_untagged(self):
# This video has no tags.
}

def test_html_library_block(self):
"""
Test how a library block gets represented in the search index
"""
doc = {}
doc.update(searchable_doc_for_library_block(self.library_block))
doc.update(searchable_doc_tags(self.library_block.usage_key))
assert doc == {
"id": "lbedx2012_fallhtmltext2-4bb47d67",
"type": "library_block",
"block_type": "html",
"usage_key": "lb:edX:2012_Fall:html:text2",
"block_id": "text2",
"context_key": "lib:edX:2012_Fall",
"org": "edX",
"access_id": self.library_access_id,
"display_name": "Text",
"breadcrumbs": [
{
"display_name": "some content_library",
},
],
"last_published": None,
"created": 1680674828.0,
"modified": 1680674828.0,
"content": {
"html_content": "",
},
"tags": {
"collections": ["TOY_COLLECTION"],
"taxonomy": ["Difficulty"],
"level0": ["Difficulty > Normal"],
},
}

def test_collection_with_library(self):
doc = searchable_doc_for_collection(self.collection)
assert doc == {
"id": "TOY_COLLECTION",
"type": "collection",
"org": "edX",
"display_name": "Toy Collection",
"description": "my toy collection description",
"context_key": "lib:edX:2012_Fall",
"access_id": self.library_access_id,
"breadcrumbs": [{"display_name": "some content_library"}],
"created": 1680674828.0,
"modified": 1680674828.0,
}

def test_collection_with_no_library(self):
created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc)
with freeze_time(created_date):
Expand Down
Loading
Loading