Plain text news/events titles/authors; standardize html cleanup (#1248)

mitodl · Jul 11, 2024 · 6c65b1e · 6c65b1e
1 parent fec3034
commit 6c65b1e
Show file tree

Hide file tree

Showing 27 changed files with 123 additions and 147 deletions.
diff --git a/learning_resources/etl/constants.py b/learning_resources/etl/constants.py
@@ -80,33 +80,3 @@ class CourseNumberType(Enum):
         for value in LearningResourceFormat.values()
     },
 }
-
-
-ALLOWED_HTML_TAGS = {
-    "b",
-    "blockquote",
-    "br",
-    "caption",
-    "center",
-    "cite",
-    "code",
-    "div",
-    "em",
-    "hr",
-    "i",
-    "li",
-    "ol",
-    "p",
-    "pre",
-    "q",
-    "small",
-    "span",
-    "strike",
-    "strong",
-    "sub",
-    "sup",
-    "u",
-    "ul",
-}
-
-ALLOWED_HTML_ATTRIBUTES = {}
diff --git a/learning_resources/etl/mitxonline.py b/learning_resources/etl/mitxonline.py
@@ -19,12 +19,12 @@
 )
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.utils import (
-    clean_data,
     extract_valid_department_from_id,
     generate_course_numbers_json,
     parse_certification,
     transform_topics,
 )
+from main.utils import clean_data
 
 log = logging.getLogger(__name__)
 

diff --git a/learning_resources/etl/mitxonline_test.py b/learning_resources/etl/mitxonline_test.py
@@ -31,11 +31,11 @@
 )
 from learning_resources.etl.utils import (
     UCC_TOPIC_MAPPINGS,
-    clean_data,
     extract_valid_department_from_id,
     parse_certification,
 )
 from main.test_utils import any_instance_of
+from main.utils import clean_data
 
 pytestmark = pytest.mark.django_db
 

diff --git a/learning_resources/etl/ocw.py b/learning_resources/etl/ocw.py
@@ -26,7 +26,6 @@
 )
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.utils import (
-    clean_data,
     extract_text_metadata,
     generate_course_numbers_json,
     get_content_type,
@@ -38,6 +37,7 @@
     parse_instructors,
     safe_load_json,
 )
+from main.utils import clean_data
 
 log = logging.getLogger(__name__)
 

diff --git a/learning_resources/etl/ocw_test.py b/learning_resources/etl/ocw_test.py
@@ -16,14 +16,13 @@
     transform_contentfile,
     transform_course,
 )
-from learning_resources.etl.utils import clean_data
 from learning_resources.factories import ContentFileFactory
 from learning_resources.models import ContentFile
 from learning_resources.utils import (
     get_s3_object_and_read,
     safe_load_json,
 )
-from main.utils import now_in_utc
+from main.utils import clean_data, now_in_utc
 
 pytestmark = pytest.mark.django_db
 

diff --git a/learning_resources/etl/openedx.py b/learning_resources/etl/openedx.py
@@ -17,14 +17,14 @@
 )
 from learning_resources.etl.constants import COMMON_HEADERS
 from learning_resources.etl.utils import (
-    clean_data,
     extract_valid_department_from_id,
     generate_course_numbers_json,
     parse_certification,
     transform_levels,
     without_none,
 )
 from learning_resources.utils import get_year_and_semester
+from main.utils import clean_data
 
 MIT_OWNER_KEYS = ["MITx", "MITx_PRO"]
 

diff --git a/learning_resources/etl/podcast.py b/learning_resources/etl/podcast.py
@@ -12,9 +12,8 @@
 
 from learning_resources.constants import LearningResourceType
 from learning_resources.etl.constants import ETLSource
-from learning_resources.etl.utils import clean_data
 from learning_resources.models import PodcastEpisode
-from main.utils import frontend_absolute_url, now_in_utc
+from main.utils import clean_data, frontend_absolute_url, now_in_utc
 
 CONFIG_FILE_REPO = "mitodl/open-podcast-data"
 CONFIG_FILE_FOLDER = "podcasts"

diff --git a/learning_resources/etl/prolearn.py b/learning_resources/etl/prolearn.py
@@ -11,9 +11,9 @@
 
 from learning_resources.constants import CertificationType
 from learning_resources.etl.constants import ETLSource
-from learning_resources.etl.utils import clean_data, transform_format, transform_topics
+from learning_resources.etl.utils import transform_format, transform_topics
 from learning_resources.models import LearningResourceOfferor, LearningResourcePlatform
-from main.utils import now_in_utc
+from main.utils import clean_data, now_in_utc
 
 log = logging.getLogger(__name__)
 

diff --git a/learning_resources/etl/prolearn_test.py b/learning_resources/etl/prolearn_test.py
@@ -29,13 +29,14 @@
     transform_programs,
     update_format,
 )
-from learning_resources.etl.utils import clean_data, transform_format
+from learning_resources.etl.utils import transform_format
 from learning_resources.factories import (
     LearningResourceOfferorFactory,
     LearningResourcePlatformFactory,
 )
 from learning_resources.models import LearningResourceOfferor, LearningResourcePlatform
 from main.test_utils import assert_json_equal
+from main.utils import clean_data
 
 pytestmark = pytest.mark.django_db
 

diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py
@@ -23,7 +23,6 @@
 from django.conf import settings
 from django.utils.functional import SimpleLazyObject
 from django.utils.text import slugify
-from nh3 import nh3
 from tika import parser as tika_parser
 from xbundle import XBundle
 
@@ -40,8 +39,6 @@
     OfferedBy,
 )
 from learning_resources.etl.constants import (
-    ALLOWED_HTML_ATTRIBUTES,
-    ALLOWED_HTML_TAGS,
     RESOURCE_FORMAT_MAPPING,
     CourseNumberType,
     ETLSource,
@@ -692,12 +689,3 @@ def parse_certification(offeror, runs_data):
             if (availability and availability != AvailabilityType.archived.value)
         ]
     )
-
-
-def clean_data(data: str) -> str:
-    """Remove unwanted html tags from text"""
-    if data:
-        return nh3.clean(
-            data, tags=ALLOWED_HTML_TAGS, attributes=ALLOWED_HTML_ATTRIBUTES
-        )
-    return ""
diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py
@@ -435,35 +435,6 @@ def test_parse_certification(offered_by, availability, has_cert):
     )
 
 
-@pytest.mark.parametrize(
-    ("input_text", "output_text"),
-    [
-        ("The cat sat on the mat & spat.\n", "The cat sat on the mat &amp; spat.\n"),
-        (
-            "the <b class='foo'>dog</b> chased a <a href='http://hog.mit.edu'>hog</a>",
-            "the <b>dog</b> chased a hog",
-        ),
-        (
-            "<p><style type='text/css'> <!--/*--><![CDATA[/* ><!--*/ <!--td {border: 1px solid #ccc;}br {mso-data-placement:same-cell;}--> /*--><!]]>*/ </style>What a mess</p>",
-            "<p>What a mess</p>",
-        ),
-        (
-            "<script type='javascript'>alert('xss');</script><style>\nh1 {color:red;}\np {color:blue;}\n</style><p>Some text</p>",
-            "<p>Some text</p>",
-        ),
-        (
-            "<p><img src='' onerror='alert(\"xss!\")'/>Hello, world!</p>",
-            "<p>Hello, world!</p>",
-        ),
-        (None, ""),
-        ("", ""),
-    ],
-)
-def test_clean_data(input_text, output_text):
-    """clean_data function should return expected output"""
-    assert utils.clean_data(input_text) == output_text
-
-
 @pytest.mark.parametrize(
     ("previous_archive", "identical"),
     [

diff --git a/learning_resources/etl/xpro.py b/learning_resources/etl/xpro.py
@@ -16,11 +16,11 @@
 )
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.utils import (
-    clean_data,
     generate_course_numbers_json,
     transform_format,
     transform_topics,
 )
+from main.utils import clean_data
 
 log = logging.getLogger(__name__)
 

diff --git a/learning_resources/etl/youtube.py b/learning_resources/etl/youtube.py
@@ -22,9 +22,8 @@
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.exceptions import ExtractException
 from learning_resources.etl.loaders import update_index
-from learning_resources.etl.utils import clean_data
 from learning_resources.models import LearningResource
-from main.utils import now_in_utc
+from main.utils import clean_data, now_in_utc
 
 CONFIG_FILE_REPO = "mitodl/open-video-data"
 CONFIG_FILE_FOLDER = "youtube"

diff --git a/learning_resources/etl/youtube_test.py b/learning_resources/etl/youtube_test.py
@@ -16,8 +16,8 @@
 from learning_resources.etl import youtube
 from learning_resources.etl.constants import ETLSource
 from learning_resources.etl.exceptions import ExtractException
-from learning_resources.etl.utils import clean_data
 from learning_resources.factories import VideoFactory
+from main.utils import clean_data
 
 
 @pytest.fixture()

diff --git a/main/constants.py b/main/constants.py
@@ -11,3 +11,30 @@
 
 ISOFORMAT = "%Y-%m-%dT%H:%M:%SZ"
 VALID_HTTP_METHODS = ["get", "post", "patch", "delete"]
+ALLOWED_HTML_TAGS = {
+    "b",
+    "blockquote",
+    "br",
+    "caption",
+    "center",
+    "cite",
+    "code",
+    "div",
+    "em",
+    "hr",
+    "i",
+    "li",
+    "ol",
+    "p",
+    "pre",
+    "q",
+    "small",
+    "span",
+    "strike",
+    "strong",
+    "sub",
+    "sup",
+    "u",
+    "ul",
+}
+ALLOWED_HTML_ATTRIBUTES = {}
diff --git a/main/utils.py b/main/utils.py
@@ -10,6 +10,9 @@
 import markdown2
 from bs4 import BeautifulSoup
 from django.conf import settings
+from nh3 import nh3
+
+from main.constants import ALLOWED_HTML_ATTRIBUTES, ALLOWED_HTML_TAGS
 
 log = logging.getLogger(__name__)
 
@@ -310,3 +313,12 @@ def frontend_absolute_url(relative_path: str) -> str:
         str: absolute url path to the frontend
     """
     return urljoin(settings.APP_BASE_URL, relative_path)
+
+
+def clean_data(data: str) -> str:
+    """Remove unwanted html tags from text"""
+    if data:
+        return nh3.clean(
+            data, tags=ALLOWED_HTML_TAGS, attributes=ALLOWED_HTML_ATTRIBUTES
+        )
+    return ""
diff --git a/main/utils_test.py b/main/utils_test.py
@@ -10,6 +10,7 @@
 from main.factories import UserFactory
 from main.utils import (
     chunks,
+    clean_data,
     extract_values,
     filter_dict_keys,
     filter_dict_with_renamed_keys,
@@ -211,3 +212,32 @@ def test_frontend_absolute_url(settings):
     assert frontend_absolute_url("/") == "http://example.com/"
     assert frontend_absolute_url("/path") == "http://example.com/path"
     assert frontend_absolute_url("path") == "http://example.com/path"
+
+
+@pytest.mark.parametrize(
+    ("input_text", "output_text"),
+    [
+        ("The cat sat on the mat & spat.\n", "The cat sat on the mat &amp; spat.\n"),
+        (
+            "the <b class='foo'>dog</b> chased a <a href='http://hog.mit.edu'>hog</a>",
+            "the <b>dog</b> chased a hog",
+        ),
+        (
+            "<p><style type='text/css'> <!--/*--><![CDATA[/* ><!--*/ <!--td {border: 1px solid #ccc;}br {mso-data-placement:same-cell;}--> /*--><!]]>*/ </style>What a mess</p>",
+            "<p>What a mess</p>",
+        ),
+        (
+            "<script type='javascript'>alert('xss');</script><style>\nh1 {color:red;}\np {color:blue;}\n</style><p>Some text</p>",
+            "<p>Some text</p>",
+        ),
+        (
+            "<p><img src='' onerror='alert(\"xss!\")'/>Hello, world!</p>",
+            "<p>Hello, world!</p>",
+        ),
+        (None, ""),
+        ("", ""),
+    ],
+)
+def test_clean_data(input_text, output_text):
+    """clean_data function should return expected output"""
+    assert clean_data(input_text) == output_text
diff --git a/news_events/etl/medium_mit_news.py b/news_events/etl/medium_mit_news.py
@@ -3,6 +3,7 @@
 import feedparser
 from bs4 import BeautifulSoup
 
+from main.utils import clean_data
 from news_events.constants import FeedType
 from news_events.etl.utils import stringify_time_struct
 
@@ -65,8 +66,8 @@ def transform_items(items_data: list[dict]) -> list[dict]:
             "guid": item.get("id"),
             "title": item.get("title", ""),
             "url": item.get("link", None),
-            "summary": item.get("summary", ""),
-            "content": content,
+            "summary": clean_data(item.get("summary", "")),
+            "content": clean_data(content),
             "image": image_data,
             "detail": {
                 "authors": [

diff --git a/news_events/etl/medium_mit_news_test.py b/news_events/etl/medium_mit_news_test.py
@@ -71,8 +71,8 @@ def test_transform_items(mocker, medium_mit_rss_data):
         "guid": "https://medium.com/p/a685e2b89c6a",
         "title": "Meet 8 MIT women faculty who teach MITx courses and lead cutting-edge research",
         "url": "https://medium.com/open-learning/meet-8-mit-women-faculty",
-        "summary": "<h4>Celebrating Women\u2019s History Month with MIT women\u2019s truncated</h4>",
-        "content": "<h4>Celebrating Women\u2019s History Month with MIT women\u2019s truncated</h4>",
+        "summary": "Celebrating Women\u2019s History Month with MIT women\u2019s truncated",
+        "content": "Celebrating Women\u2019s History Month with MIT women\u2019s truncated",
         "image": None,
         "detail": {
             "authors": ["MIT Open Learning"],

diff --git a/news_events/etl/ol_events.py b/news_events/etl/ol_events.py
@@ -6,9 +6,8 @@
 from zoneinfo import ZoneInfo
 
 from dateutil import parser
-from django.utils.html import strip_tags
 
-from main.utils import now_in_utc
+from main.utils import clean_data, now_in_utc
 from news_events.constants import FeedType
 from news_events.etl.utils import get_request_json
 
@@ -173,8 +172,8 @@ def transform_event(event_data: dict) -> dict or None:
                 event_data.get("relationships", {}).get("field_event_image", {})
             )
         ),
-        "summary": strip_tags(attributes.get("body", {}).get("value") or ""),
-        "content": strip_tags(attributes.get("body", {}).get("value") or ""),
+        "summary": clean_data(attributes.get("body", {}).get("value") or ""),
+        "content": clean_data(attributes.get("body", {}).get("value") or ""),
         "detail": {
             "location": transform_relationship(
                 extract_relationship(event_data, "field_location_tag")