Skip to content

Commit

Permalink
Avoid course overwrites in program ETL pipelines (#1332)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Jul 30, 2024
1 parent ed4ce1e commit 00bf0f3
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 11 deletions.
4 changes: 2 additions & 2 deletions learning_resources/etl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@

CourseLoaderConfig = namedtuple( # noqa: PYI024
"CourseLoaderConfig",
["prune", "offered_by", "runs"],
defaults=[True, OfferedByLoaderConfig(), LearningResourceRunLoaderConfig()],
["prune", "offered_by", "runs", "fetch_only"],
defaults=[True, OfferedByLoaderConfig(), LearningResourceRunLoaderConfig(), False],
)

ProgramLoaderConfig = namedtuple( # noqa: PYI024
Expand Down
20 changes: 19 additions & 1 deletion learning_resources/etl/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def load_run(
return learning_resource_run


def load_course(
def load_course( # noqa: C901
resource_data: dict,
blocklist: list[str],
duplicates: list[dict],
Expand Down Expand Up @@ -335,6 +335,20 @@ def load_course(
unique_field_value,
)
resource_id = deduplicated_course_id or readable_id

if config.fetch_only:
# Do not upsert the course, it should already exist.
# Just find it and return it.
resource = LearningResource.objects.filter(
readable_id=resource_id,
platform=platform,
resource_type=LearningResourceType.course.name,
published=True,
).first()
if not resource:
log.warning("No published resource found for %s", resource_id)
return resource

if unique_field_name != READABLE_ID_FIELD:
# Some dupes may result, so we need to unpublish resources
# with matching unique values and different readable_ids
Expand Down Expand Up @@ -367,6 +381,10 @@ def load_course(

if config.prune:
# mark runs no longer included here as unpublished
# This generally should not be done when loading courses
# from a program (config.prune=False).
# The course ETL should be the ultimate source of truth for
# courses and their runs.
for run in learning_resource.runs.exclude(
run_id__in=run_ids_to_update_or_create
).filter(published=True):
Expand Down
30 changes: 30 additions & 0 deletions learning_resources/etl/loaders_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,36 @@ def test_load_course_dupe_urls(unique_url):
assert course.published is (unique_url is False)


@pytest.mark.parametrize("course_exists", [True, False])
def test_load_course_fetch_only(mocker, course_exists):
"""When fetch_only is True, course should just be fetched from db"""
mock_next_runs_prices = mocker.patch(
"learning_resources.etl.loaders.load_next_start_date_and_prices"
)
mock_warn = mocker.patch("learning_resources.etl.loaders.log.warning")
platform = LearningResourcePlatformFactory.create(code=PlatformType.mitpe.name)
if course_exists:
resource = LearningResourceFactory.create(is_course=True, platform=platform)
else:
resource = LearningResourceFactory.build(is_course=True, platform=platform)

props = {
"readable_id": resource.readable_id,
"platform": platform.code,
"offered_by": {"code": OfferedBy.ocw.name},
}
result = load_course(props, [], [], config=CourseLoaderConfig(fetch_only=True))
if course_exists:
assert result == resource
mock_warn.assert_not_called()
else:
assert result is None
mock_warn.assert_called_once_with(
"No published resource found for %s", resource.readable_id
)
mock_next_runs_prices.assert_not_called()


@pytest.mark.parametrize("run_exists", [True, False])
@pytest.mark.parametrize(
"availability", [RunAvailability.archived.value, RunAvailability.current.value]
Expand Down
14 changes: 10 additions & 4 deletions learning_resources/etl/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
micromasters_etl = compose(
load_programs(
ETLSource.micromasters.name,
config=ProgramLoaderConfig(prune=True, courses=CourseLoaderConfig()),
config=ProgramLoaderConfig(
prune=True, courses=CourseLoaderConfig(fetch_only=True)
),
),
micromasters.transform,
micromasters.extract,
Expand All @@ -53,7 +55,9 @@
mitxonline_programs_etl = compose(
load_programs(
ETLSource.mitxonline.name,
config=ProgramLoaderConfig(courses=CourseLoaderConfig(prune=True), prune=True),
config=ProgramLoaderConfig(
courses=CourseLoaderConfig(fetch_only=True), prune=True
),
),
mitxonline.transform_programs,
mitxonline.extract_programs,
Expand All @@ -74,7 +78,7 @@
prolearn_programs_etl = compose(
load_programs(
ETLSource.prolearn.name,
config=ProgramLoaderConfig(courses=CourseLoaderConfig(prune=True)),
config=ProgramLoaderConfig(courses=CourseLoaderConfig(fetch_only=True)),
),
prolearn.transform_programs,
prolearn.extract_programs,
Expand All @@ -91,7 +95,9 @@
xpro_programs_etl = compose(
load_programs(
ETLSource.xpro.name,
config=ProgramLoaderConfig(courses=CourseLoaderConfig(prune=True)),
config=ProgramLoaderConfig(
courses=CourseLoaderConfig(fetch_only=True), prune=True
),
),
xpro.transform_programs,
xpro.extract_programs,
Expand Down
14 changes: 10 additions & 4 deletions learning_resources/etl/pipelines_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ def test_mitxonline_programs_etl():
mock_load_programs.assert_called_once_with(
ETLSource.mitxonline.name,
mock_transform.return_value,
config=ProgramLoaderConfig(courses=CourseLoaderConfig(prune=True), prune=True),
config=ProgramLoaderConfig(
courses=CourseLoaderConfig(fetch_only=True), prune=True
),
)

assert result == mock_load_programs.return_value
Expand Down Expand Up @@ -146,7 +148,9 @@ def test_xpro_programs_etl():
mock_load_programs.assert_called_once_with(
ETLSource.xpro.name,
mock_transform.return_value,
config=ProgramLoaderConfig(courses=CourseLoaderConfig(prune=True)),
config=ProgramLoaderConfig(
courses=CourseLoaderConfig(fetch_only=True), prune=True
),
)

assert result == mock_load_programs.return_value
Expand Down Expand Up @@ -296,7 +300,9 @@ def test_micromasters_etl():
mock_load_programs.assert_called_once_with(
ETLSource.micromasters.name,
mock_transform.return_value,
config=ProgramLoaderConfig(prune=True, courses=CourseLoaderConfig()),
config=ProgramLoaderConfig(
prune=True, courses=CourseLoaderConfig(fetch_only=True)
),
)

assert result == mock_load_programs.return_value
Expand All @@ -317,7 +323,7 @@ def test_prolearn_programs_etl():
mock_load_programs.assert_called_once_with(
ETLSource.prolearn.name,
mock_transform.return_value,
config=ProgramLoaderConfig(courses=CourseLoaderConfig(prune=True)),
config=ProgramLoaderConfig(courses=CourseLoaderConfig(fetch_only=True)),
)

assert result == mock_load_programs.return_value
Expand Down

0 comments on commit 00bf0f3

Please sign in to comment.