Skip to content

Commit

Permalink
Plain text news/events titles/authors; standardize html cleanup (#1248)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Jul 11, 2024
1 parent fec3034 commit 6c65b1e
Show file tree
Hide file tree
Showing 27 changed files with 123 additions and 147 deletions.
30 changes: 0 additions & 30 deletions learning_resources/etl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,33 +80,3 @@ class CourseNumberType(Enum):
for value in LearningResourceFormat.values()
},
}


ALLOWED_HTML_TAGS = {
"b",
"blockquote",
"br",
"caption",
"center",
"cite",
"code",
"div",
"em",
"hr",
"i",
"li",
"ol",
"p",
"pre",
"q",
"small",
"span",
"strike",
"strong",
"sub",
"sup",
"u",
"ul",
}

ALLOWED_HTML_ATTRIBUTES = {}
2 changes: 1 addition & 1 deletion learning_resources/etl/mitxonline.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@
)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import (
clean_data,
extract_valid_department_from_id,
generate_course_numbers_json,
parse_certification,
transform_topics,
)
from main.utils import clean_data

log = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion learning_resources/etl/mitxonline_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@
)
from learning_resources.etl.utils import (
UCC_TOPIC_MAPPINGS,
clean_data,
extract_valid_department_from_id,
parse_certification,
)
from main.test_utils import any_instance_of
from main.utils import clean_data

pytestmark = pytest.mark.django_db

Expand Down
2 changes: 1 addition & 1 deletion learning_resources/etl/ocw.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import (
clean_data,
extract_text_metadata,
generate_course_numbers_json,
get_content_type,
Expand All @@ -38,6 +37,7 @@
parse_instructors,
safe_load_json,
)
from main.utils import clean_data

log = logging.getLogger(__name__)

Expand Down
3 changes: 1 addition & 2 deletions learning_resources/etl/ocw_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,13 @@
transform_contentfile,
transform_course,
)
from learning_resources.etl.utils import clean_data
from learning_resources.factories import ContentFileFactory
from learning_resources.models import ContentFile
from learning_resources.utils import (
get_s3_object_and_read,
safe_load_json,
)
from main.utils import now_in_utc
from main.utils import clean_data, now_in_utc

pytestmark = pytest.mark.django_db

Expand Down
2 changes: 1 addition & 1 deletion learning_resources/etl/openedx.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
)
from learning_resources.etl.constants import COMMON_HEADERS
from learning_resources.etl.utils import (
clean_data,
extract_valid_department_from_id,
generate_course_numbers_json,
parse_certification,
transform_levels,
without_none,
)
from learning_resources.utils import get_year_and_semester
from main.utils import clean_data

MIT_OWNER_KEYS = ["MITx", "MITx_PRO"]

Expand Down
3 changes: 1 addition & 2 deletions learning_resources/etl/podcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@

from learning_resources.constants import LearningResourceType
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import clean_data
from learning_resources.models import PodcastEpisode
from main.utils import frontend_absolute_url, now_in_utc
from main.utils import clean_data, frontend_absolute_url, now_in_utc

CONFIG_FILE_REPO = "mitodl/open-podcast-data"
CONFIG_FILE_FOLDER = "podcasts"
Expand Down
4 changes: 2 additions & 2 deletions learning_resources/etl/prolearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@

from learning_resources.constants import CertificationType
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import clean_data, transform_format, transform_topics
from learning_resources.etl.utils import transform_format, transform_topics
from learning_resources.models import LearningResourceOfferor, LearningResourcePlatform
from main.utils import now_in_utc
from main.utils import clean_data, now_in_utc

log = logging.getLogger(__name__)

Expand Down
3 changes: 2 additions & 1 deletion learning_resources/etl/prolearn_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@
transform_programs,
update_format,
)
from learning_resources.etl.utils import clean_data, transform_format
from learning_resources.etl.utils import transform_format
from learning_resources.factories import (
LearningResourceOfferorFactory,
LearningResourcePlatformFactory,
)
from learning_resources.models import LearningResourceOfferor, LearningResourcePlatform
from main.test_utils import assert_json_equal
from main.utils import clean_data

pytestmark = pytest.mark.django_db

Expand Down
12 changes: 0 additions & 12 deletions learning_resources/etl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from django.conf import settings
from django.utils.functional import SimpleLazyObject
from django.utils.text import slugify
from nh3 import nh3
from tika import parser as tika_parser
from xbundle import XBundle

Expand All @@ -40,8 +39,6 @@
OfferedBy,
)
from learning_resources.etl.constants import (
ALLOWED_HTML_ATTRIBUTES,
ALLOWED_HTML_TAGS,
RESOURCE_FORMAT_MAPPING,
CourseNumberType,
ETLSource,
Expand Down Expand Up @@ -692,12 +689,3 @@ def parse_certification(offeror, runs_data):
if (availability and availability != AvailabilityType.archived.value)
]
)


def clean_data(data: str) -> str:
"""Remove unwanted html tags from text"""
if data:
return nh3.clean(
data, tags=ALLOWED_HTML_TAGS, attributes=ALLOWED_HTML_ATTRIBUTES
)
return ""
29 changes: 0 additions & 29 deletions learning_resources/etl/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,35 +435,6 @@ def test_parse_certification(offered_by, availability, has_cert):
)


@pytest.mark.parametrize(
("input_text", "output_text"),
[
("The cat sat on the mat & spat.\n", "The cat sat on the mat & spat.\n"),
(
"the <b class='foo'>dog</b> chased a <a href='http://hog.mit.edu'>hog</a>",
"the <b>dog</b> chased a hog",
),
(
"<p><style type='text/css'> <!--/*--><![CDATA[/* ><!--*/ <!--td {border: 1px solid #ccc;}br {mso-data-placement:same-cell;}--> /*--><!]]>*/ </style>What a mess</p>",
"<p>What a mess</p>",
),
(
"<script type='javascript'>alert('xss');</script><style>\nh1 {color:red;}\np {color:blue;}\n</style><p>Some text</p>",
"<p>Some text</p>",
),
(
"<p><img src='' onerror='alert(\"xss!\")'/>Hello, world!</p>",
"<p>Hello, world!</p>",
),
(None, ""),
("", ""),
],
)
def test_clean_data(input_text, output_text):
"""clean_data function should return expected output"""
assert utils.clean_data(input_text) == output_text


@pytest.mark.parametrize(
("previous_archive", "identical"),
[
Expand Down
2 changes: 1 addition & 1 deletion learning_resources/etl/xpro.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
)
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.utils import (
clean_data,
generate_course_numbers_json,
transform_format,
transform_topics,
)
from main.utils import clean_data

log = logging.getLogger(__name__)

Expand Down
3 changes: 1 addition & 2 deletions learning_resources/etl/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,8 @@
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.exceptions import ExtractException
from learning_resources.etl.loaders import update_index
from learning_resources.etl.utils import clean_data
from learning_resources.models import LearningResource
from main.utils import now_in_utc
from main.utils import clean_data, now_in_utc

CONFIG_FILE_REPO = "mitodl/open-video-data"
CONFIG_FILE_FOLDER = "youtube"
Expand Down
2 changes: 1 addition & 1 deletion learning_resources/etl/youtube_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
from learning_resources.etl import youtube
from learning_resources.etl.constants import ETLSource
from learning_resources.etl.exceptions import ExtractException
from learning_resources.etl.utils import clean_data
from learning_resources.factories import VideoFactory
from main.utils import clean_data


@pytest.fixture()
Expand Down
27 changes: 27 additions & 0 deletions main/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,30 @@

ISOFORMAT = "%Y-%m-%dT%H:%M:%SZ"
VALID_HTTP_METHODS = ["get", "post", "patch", "delete"]
ALLOWED_HTML_TAGS = {
"b",
"blockquote",
"br",
"caption",
"center",
"cite",
"code",
"div",
"em",
"hr",
"i",
"li",
"ol",
"p",
"pre",
"q",
"small",
"span",
"strike",
"strong",
"sub",
"sup",
"u",
"ul",
}
ALLOWED_HTML_ATTRIBUTES = {}
12 changes: 12 additions & 0 deletions main/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
import markdown2
from bs4 import BeautifulSoup
from django.conf import settings
from nh3 import nh3

from main.constants import ALLOWED_HTML_ATTRIBUTES, ALLOWED_HTML_TAGS

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -310,3 +313,12 @@ def frontend_absolute_url(relative_path: str) -> str:
str: absolute url path to the frontend
"""
return urljoin(settings.APP_BASE_URL, relative_path)


def clean_data(data: str) -> str:
"""Remove unwanted html tags from text"""
if data:
return nh3.clean(
data, tags=ALLOWED_HTML_TAGS, attributes=ALLOWED_HTML_ATTRIBUTES
)
return ""
30 changes: 30 additions & 0 deletions main/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from main.factories import UserFactory
from main.utils import (
chunks,
clean_data,
extract_values,
filter_dict_keys,
filter_dict_with_renamed_keys,
Expand Down Expand Up @@ -211,3 +212,32 @@ def test_frontend_absolute_url(settings):
assert frontend_absolute_url("/") == "http://example.com/"
assert frontend_absolute_url("/path") == "http://example.com/path"
assert frontend_absolute_url("path") == "http://example.com/path"


@pytest.mark.parametrize(
("input_text", "output_text"),
[
("The cat sat on the mat & spat.\n", "The cat sat on the mat &amp; spat.\n"),
(
"the <b class='foo'>dog</b> chased a <a href='http://hog.mit.edu'>hog</a>",
"the <b>dog</b> chased a hog",
),
(
"<p><style type='text/css'> <!--/*--><![CDATA[/* ><!--*/ <!--td {border: 1px solid #ccc;}br {mso-data-placement:same-cell;}--> /*--><!]]>*/ </style>What a mess</p>",
"<p>What a mess</p>",
),
(
"<script type='javascript'>alert('xss');</script><style>\nh1 {color:red;}\np {color:blue;}\n</style><p>Some text</p>",
"<p>Some text</p>",
),
(
"<p><img src='' onerror='alert(\"xss!\")'/>Hello, world!</p>",
"<p>Hello, world!</p>",
),
(None, ""),
("", ""),
],
)
def test_clean_data(input_text, output_text):
"""clean_data function should return expected output"""
assert clean_data(input_text) == output_text
5 changes: 3 additions & 2 deletions news_events/etl/medium_mit_news.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import feedparser
from bs4 import BeautifulSoup

from main.utils import clean_data
from news_events.constants import FeedType
from news_events.etl.utils import stringify_time_struct

Expand Down Expand Up @@ -65,8 +66,8 @@ def transform_items(items_data: list[dict]) -> list[dict]:
"guid": item.get("id"),
"title": item.get("title", ""),
"url": item.get("link", None),
"summary": item.get("summary", ""),
"content": content,
"summary": clean_data(item.get("summary", "")),
"content": clean_data(content),
"image": image_data,
"detail": {
"authors": [
Expand Down
4 changes: 2 additions & 2 deletions news_events/etl/medium_mit_news_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def test_transform_items(mocker, medium_mit_rss_data):
"guid": "https://medium.com/p/a685e2b89c6a",
"title": "Meet 8 MIT women faculty who teach MITx courses and lead cutting-edge research",
"url": "https://medium.com/open-learning/meet-8-mit-women-faculty",
"summary": "<h4>Celebrating Women\u2019s History Month with MIT women\u2019s truncated</h4>",
"content": "<h4>Celebrating Women\u2019s History Month with MIT women\u2019s truncated</h4>",
"summary": "Celebrating Women\u2019s History Month with MIT women\u2019s truncated",
"content": "Celebrating Women\u2019s History Month with MIT women\u2019s truncated",
"image": None,
"detail": {
"authors": ["MIT Open Learning"],
Expand Down
7 changes: 3 additions & 4 deletions news_events/etl/ol_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@
from zoneinfo import ZoneInfo

from dateutil import parser
from django.utils.html import strip_tags

from main.utils import now_in_utc
from main.utils import clean_data, now_in_utc
from news_events.constants import FeedType
from news_events.etl.utils import get_request_json

Expand Down Expand Up @@ -173,8 +172,8 @@ def transform_event(event_data: dict) -> dict or None:
event_data.get("relationships", {}).get("field_event_image", {})
)
),
"summary": strip_tags(attributes.get("body", {}).get("value") or ""),
"content": strip_tags(attributes.get("body", {}).get("value") or ""),
"summary": clean_data(attributes.get("body", {}).get("value") or ""),
"content": clean_data(attributes.get("body", {}).get("value") or ""),
"detail": {
"location": transform_relationship(
extract_relationship(event_data, "field_location_tag")
Expand Down
Loading

0 comments on commit 6c65b1e

Please sign in to comment.