Skip to content

Commit

Permalink
Handle alternate unique id fields better in load_course (#1342)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Jul 31, 2024
1 parent 41770f4 commit f7c8e69
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 120 deletions.
210 changes: 114 additions & 96 deletions learning_resources/etl/loaders.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""learning_resources data loaders"""

import datetime
import json
import logging
from decimal import Decimal

Expand Down Expand Up @@ -259,45 +258,41 @@ def load_run(
return learning_resource_run


def load_course( # noqa: C901
def upsert_course_or_program(
resource_data: dict,
blocklist: list[str],
duplicates: list[dict],
resource_type: str,
*,
config=CourseLoaderConfig(),
) -> LearningResource:
config: CourseLoaderConfig = None,
) -> tuple[LearningResource, bool]:
"""
Load the course into the database
Return a resource object and whether it was created or not
Args:
resource_data (dict):
a dict of course data values
a dict of course/program data values
blocklist (list of str):
list of course ids not to load
list of course/program ids not to load
duplicates (list of dict):
list of duplicate course data
list of duplicate course/program data
resource_type (str):
the type of resource to load (course or program)
config (CourseLoaderConfig):
configuration on how to load this program
configuration on how to load a course
Returns:
Course:
the created/updated course
tuple(LearningResource, bool):
the created/updated course and whether it was created or not
"""
platform_name = resource_data.pop("platform")
runs_data = resource_data.pop("runs", [])
topics_data = resource_data.pop("topics", None)
offered_bys_data = resource_data.pop("offered_by", None)
image_data = resource_data.pop("image", None)
course_data = resource_data.pop("course", None)
department_data = resource_data.pop("departments", [])
content_tags_data = resource_data.pop("content_tags", [])
resource_data.setdefault("learning_format", [LearningResourceFormat.online.name])

unique_field_name = resource_data.pop("unique_field", READABLE_ID_FIELD)
unique_field_value = resource_data.get(unique_field_name)
readable_id = resource_data.pop("readable_id")
runs = resource_data.pop("runs", [])

if readable_id in blocklist or not runs_data:
if readable_id in blocklist or not runs:
resource_data["published"] = False

deduplicated_course_id = next(
Expand All @@ -308,67 +303,105 @@ def load_course( # noqa: C901
),
None,
)

with transaction.atomic():
platform = LearningResourcePlatform.objects.filter(code=platform_name).first()
if not platform:
log.exception(
"Platform %s is null or not in database: %s",
platform_name,
json.dumps(readable_id),
)
return None

if deduplicated_course_id and readable_id != deduplicated_course_id:
duplicate_resource = LearningResource.objects.filter(
platform=platform, readable_id=readable_id
).first()
if duplicate_resource:
duplicate_resource.published = False
duplicate_resource.save()
resource_unpublished_actions(duplicate_resource)

log.info(
"Loading course: %s:%s=%s",
platform = LearningResourcePlatform.objects.filter(code=platform_name).first()
if not platform:
log.exception(
"Platform %s is null or not in database: %s",
platform_name,
readable_id,
unique_field_name,
unique_field_value,
)
resource_id = deduplicated_course_id or readable_id

if config.fetch_only:
# Do not upsert the course, it should already exist.
# Just find it and return it.
resource = LearningResource.objects.filter(
readable_id=resource_id,
platform=platform,
resource_type=LearningResourceType.course.name,
published=True,
).first()
if not resource:
log.warning("No published resource found for %s", resource_id)
return resource

if unique_field_name != READABLE_ID_FIELD:
# Some dupes may result, so we need to unpublish resources
# with matching unique values and different readable_ids
for resource in LearningResource.objects.filter(
**{unique_field_name: unique_field_value},
platform=platform,
resource_type=LearningResourceType.course.name,
).exclude(readable_id=resource_id):
resource.published = False
resource.save()
resource_unpublished_actions(resource)
(
learning_resource,
created,
) = LearningResource.objects.select_for_update().update_or_create(
return None, None

if deduplicated_course_id and readable_id != deduplicated_course_id:
duplicate_resource = LearningResource.objects.filter(
platform=platform, readable_id=readable_id
).first()
if duplicate_resource:
duplicate_resource.published = False
duplicate_resource.save()
resource_unpublished_actions(duplicate_resource)

resource_id = deduplicated_course_id or readable_id

if config and config.fetch_only:
# Do not upsert the course, it should already exist.
# Just find it and return it.
resource = LearningResource.objects.filter(
readable_id=resource_id,
platform=platform,
resource_type=resource_type,
published=True,
).first()
if not resource:
log.warning("No published resource found for %s", resource_id)
return resource, False

if unique_field_name != READABLE_ID_FIELD:
resource_data[READABLE_ID_FIELD] = resource_id
# Some dupes may result, so we should delete all but the
# most recently updated resource w/matching unique value
existing_courses = LearningResource.objects.filter(
**{unique_field_name: unique_field_value},
platform=platform,
resource_type=LearningResourceType.course.name,
defaults=resource_data,
).order_by("-updated_on")
if existing_courses.count() > 1:
for course in existing_courses[1:]:
course.delete()
else:
unique_field_value = resource_id
return LearningResource.objects.select_for_update().update_or_create(
**{unique_field_name: unique_field_value},
platform=platform,
resource_type=resource_type,
defaults=resource_data,
)


def load_course(
resource_data: dict,
blocklist: list[str],
duplicates: list[dict],
*,
config=CourseLoaderConfig(),
) -> LearningResource:
"""
Load the course into the database
Args:
resource_data (dict):
a dict of course data values
blocklist (list of str):
list of course ids not to load
duplicates (list of dict):
list of duplicate course data
config (CourseLoaderConfig):
configuration on how to load this program
Returns:
Course:
the created/updated course
"""

topics_data = resource_data.pop("topics", None)
offered_bys_data = resource_data.pop("offered_by", None)
image_data = resource_data.pop("image", None)
course_data = resource_data.pop("course", None)
department_data = resource_data.pop("departments", [])
content_tags_data = resource_data.pop("content_tags", [])
resource_data.setdefault("learning_format", [LearningResourceFormat.online.name])
runs_data = resource_data.get("runs", [])

with transaction.atomic():
learning_resource, created = upsert_course_or_program(
resource_data,
blocklist,
duplicates,
LearningResourceType.course.name,
config=config,
)
if config.fetch_only or not learning_resource:
return learning_resource

Course.objects.get_or_create(
learning_resource=learning_resource, defaults=course_data
Expand Down Expand Up @@ -469,36 +502,21 @@ def load_program(
"""
# pylint: disable=too-many-locals

readable_id = program_data.pop("readable_id")
courses_data = program_data.pop("courses")
topics_data = program_data.pop("topics", [])
runs_data = program_data.pop("runs", [])
offered_by_data = program_data.pop("offered_by", None)
departments_data = program_data.pop("departments", None)
image_data = program_data.pop("image", None)
platform_code = program_data.pop("platform")
program_data.setdefault("learning_format", [LearningResourceFormat.online.name])
runs_data = program_data.get("runs", [])

course_resources = []
with transaction.atomic():
# lock on the program record
platform = LearningResourcePlatform.objects.filter(code=platform_code).first()
if not platform:
log.exception(
"Platform %s is null or not in database: %s",
platform_code,
json.dumps(program_data),
)
return None
(
learning_resource,
created, # pylint: disable=unused-variable
) = LearningResource.objects.select_for_update().update_or_create(
readable_id=readable_id,
platform=platform,
resource_type=LearningResourceType.program.name,
defaults=program_data,
learning_resource, created = upsert_course_or_program(
program_data, [], [], LearningResourceType.program.name
)
if not learning_resource:
return None

load_topics(learning_resource, topics_data)
load_image(learning_resource, image_data)
Expand Down
56 changes: 32 additions & 24 deletions learning_resources/etl/loaders_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Tests for ETL loaders"""

import json
from datetime import timedelta
from decimal import Decimal

Expand Down Expand Up @@ -228,20 +227,6 @@ def test_load_program( # noqa: PLR0913
[],
)

if program_exists and not is_published:
mock_upsert_tasks.deindex_learning_resource.assert_called_with(
result.id, result.resource_type
)
elif is_published:
if program_exists:
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
else:
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
result.id
)
else:
mock_upsert_tasks.upsert_learning_resource.assert_not_called()

assert Program.objects.count() == 1
assert Course.objects.count() == after_course_count

Expand Down Expand Up @@ -273,6 +258,20 @@ def test_load_program( # noqa: PLR0913
assert isinstance(relationship.child, LearningResource)
assert relationship.child.readable_id == data.learning_resource.readable_id

if program_exists and not is_published:
mock_upsert_tasks.deindex_learning_resource.assert_called_with(
result.id, result.resource_type
)
elif is_published:
if program_exists:
mock_upsert_tasks.upsert_learning_resource.assert_called_with(result.id)
else:
mock_upsert_tasks.upsert_learning_resource_immutable_signature.assert_called_with(
result.id
)
else:
mock_upsert_tasks.upsert_learning_resource.assert_not_called()


def test_load_program_bad_platform(mocker):
"""A bad platform should log an exception and not create the program"""
Expand All @@ -292,7 +291,7 @@ def test_load_program_bad_platform(mocker):
result = load_program(props, [], [], config=ProgramLoaderConfig(prune=True))
assert result is None
mock_log.assert_called_once_with(
"Platform %s is null or not in database: %s", bad_platform, json.dumps(props)
"Platform %s is null or not in database: %s", bad_platform, "abc123"
)


Expand Down Expand Up @@ -466,7 +465,7 @@ def test_load_course_bad_platform(mocker):
result = load_course(props, [], [], config=CourseLoaderConfig(prune=True))
assert result is None
mock_log.assert_called_once_with(
"Platform %s is null or not in database: %s", bad_platform, '"abc123"'
"Platform %s is null or not in database: %s", bad_platform, "abc123"
)


Expand Down Expand Up @@ -564,13 +563,19 @@ def test_load_duplicate_course(


@pytest.mark.parametrize("unique_url", [True, False])
def test_load_course_dupe_urls(unique_url):
"""If url is supposed to be unique field, unpublish old courses with same url"""
def test_load_course_unique_urls(unique_url):
"""
If url is supposed to be unique field, unpublish unpublished courses with same url
and update the published course with the new readable id
"""
unique_url = "https://mit.edu/unique.html"
readable_id = "new_unique_course_id"
platform = LearningResourcePlatformFactory.create(code=PlatformType.ocw.name)
old_courses = LearningResourceFactory.create_batch(
2, url=unique_url, platform=platform, is_course=True
old_unpublished_courses = LearningResourceFactory.create_batch(
2, url=unique_url, platform=platform, is_course=True, published=False
)
old_course = LearningResourceFactory.create(
url=unique_url, platform=platform, is_course=True
)
props = {
"readable_id": readable_id,
Expand All @@ -593,9 +598,12 @@ def test_load_course_dupe_urls(unique_url):
assert result.readable_id == readable_id
assert result.url == unique_url
assert result.published is True
for course in old_courses:
course.refresh_from_db()
assert course.published is (unique_url is False)
for unpublished_course in old_unpublished_courses:
assert (
LearningResource.objects.filter(pk=unpublished_course.id).exists() is False
)
old_course.refresh_from_db()
assert old_course == result


@pytest.mark.parametrize("course_exists", [True, False])
Expand Down

0 comments on commit f7c8e69

Please sign in to comment.