-
Notifications
You must be signed in to change notification settings - Fork 114
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7981 from cfpb/report-raw-html
Add `report_raw_html` management command
- Loading branch information
Showing 2 changed files with 271 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
import collections
import collections.abc
import csv
import re
|
||
from django.apps import apps | ||
from django.core.management.base import BaseCommand | ||
|
||
from wagtail.models import Site, get_page_models | ||
|
||
|
||
def find_pattern(data, pattern, key, path=None):
    """Yield the path and matches for a regular expression in JSON-like data.

    Recursively descend through ``data`` looking for strings that match the
    ``re.compile()``ed ``pattern``. For every string with at least one match,
    the path to that string is yielded along with the matching strings.

    If the path goes through a sequence (JSON array), it includes the index
    in the sequence, as a string.

    If the path goes through a mapping (JSON object), the value stored under
    ``key``, if it exists on the mapping, is used to identify it in the path.

    Args:
        data: A string, mapping, or sequence, nested arbitrarily deep. Any
            other value (numbers, ``None``, ...) is skipped.
        pattern: A compiled regular expression; ``pattern.findall()`` is
            applied to each string encountered.
        key: The mapping key whose value labels a mapping in the path.
        path: The path accumulated so far; defaults to an empty list.

    Yields:
        ``(path, matches)`` tuples, where ``path`` is a list of path
        components and ``matches`` is the non-empty ``findall()`` result.
    """
    if path is None:
        path = []

    # If it's a string, try to match it
    if isinstance(data, str):
        matches = pattern.findall(data)
        if matches:
            yield path, matches

    # If it's a mapping, iterate over its values. The ABCs live in
    # collections.abc; the old aliases on the collections module itself were
    # removed in Python 3.10.
    elif isinstance(data, collections.abc.Mapping):
        local_path = path + [data[key]] if key in data else path

        for value in data.values():
            yield from find_pattern(value, pattern, key, path=local_path)

    # If it's a sequence, iterate over its members. Strings are sequences
    # too, but they were already handled by the first branch.
    elif isinstance(data, collections.abc.Sequence):
        for index, item in enumerate(data):
            local_path = path + [str(index)]
            yield from find_pattern(item, pattern, key, path=local_path)
|
||
|
||
class Command(BaseCommand):
    """Report raw HTML tags found in page fields as CSV on stdout.

    Each positional argument names one ``app_name.page_type.field`` to
    inspect; with no arguments, every streamfield of every page model
    returned by ``get_page_models()`` is inspected.
    """

    help = (
        "Discover raw HTML tags within < > entities in page fields. "
        "Pass a list in the form of app_name.page_type.field for each page "
        "type and field you want to report on."
    )

    # Match "<", then one or more ASCII letters that are not immediately
    # followed by ">", then (lazily) any characters up to and including the
    # next ">".
    # NOTE(review): because of the negative lookahead, a single-letter tag
    # that closes immediately (e.g. "<p>") does not match, while "<br>" does
    # (the letter run backtracks to "b"). Confirm this is intended.
    html_tag_entity_pattern = r"<[a-zA-Z]+(?!>).*?>"
    html_tag_entity_re = re.compile(
        html_tag_entity_pattern, re.MULTILINE | re.DOTALL
    )

    def add_arguments(self, parser):
        """Register the optional, repeatable ``pagetype`` positional."""
        parser.add_argument(
            "pagetype",
            nargs="*",
            help=(
                "Specify the page type(s) and field to check. "
                "This should be given in the form app_name.page_type.field "
                "to include a page type in the given app with the given "
                "field. "
                "For example, v1.BrowsePage.content."
            ),
        )

    def handle(self, *args, **options):
        """Write a CSV header, then one row per field value with matches."""
        raw_html_writer = csv.writer(self.stdout)
        raw_html_writer.writerow(
            (
                "Page ID",
                "Page Type",
                "Page Title",
                "Page URL",
                "Raw HTML Block Path",
                "Raw HTML",
            )
        )

        # With no explicit page types, fall back to every streamfield on
        # every page model.
        pagetypes = options["pagetype"]
        if not pagetypes:
            pagetypes = self.get_all_page_models_and_stream_fields()

        for app_name_page_type_field in pagetypes:
            # Each entry must have exactly three dot-separated parts;
            # anything else makes this unpacking raise ValueError.
            app_name, page_type, field = app_name_page_type_field.split(".")
            for match_data in self.get_matches(app_name, page_type, field):
                raw_html_writer.writerow(match_data)

    def get_all_page_models_and_stream_fields(self):
        """Return ``app_label.ModelName.field`` for every page streamfield."""
        return [
            f"{page_model._meta.app_label}."
            f"{page_model._meta.object_name}."
            f"{streamfield_name}"
            for page_model in get_page_models()
            for streamfield_name in page_model.get_streamfield_names()
        ]

    def get_matches(self, app_name, page_type, field):
        """Yield one CSV row per matching string in ``field`` of each page.

        Live pages of exactly the given type in the default site are first
        narrowed with an ``__iregex`` lookup on ``field``, then the field's
        raw data is walked with ``find_pattern`` to locate each match.

        Yields:
            A list of page id, page type, title, URL, and the dot-joined
            path to the matching string, followed by every match found in
            that string.
        """
        field_filter = {f"{field}__iregex": self.html_tag_entity_pattern}
        PageModel = apps.get_model(app_label=app_name, model_name=page_type)

        site = Site.objects.get(is_default_site=True)
        queryset = (
            PageModel.objects.live()
            .in_site(site)
            .filter(**field_filter)
            .exact_type(PageModel)
        )

        for p in queryset:
            json_field = getattr(p, field)

            # "type" is the key whose value labels each mapping in the path.
            for path, matches in find_pattern(
                json_field.raw_data, self.html_tag_entity_re, "type"
            ):
                yield [
                    p.id,
                    page_type,
                    p.title,
                    p.get_url(),
                    ".".join(path),
                ] + matches
133 changes: 133 additions & 0 deletions
133
cfgov/v1/tests/management/commands/test_report_raw_html.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
import json | ||
import re | ||
from io import StringIO | ||
from unittest import TestCase | ||
|
||
from django.core.management import call_command | ||
from django.test import TestCase as DjangoTestCase | ||
|
||
from wagtail.models import Site | ||
|
||
from v1.management.commands.report_raw_html import find_pattern | ||
from v1.models import BrowsePage | ||
|
||
|
||
class FindPatternTestCase(TestCase):
    """Unit tests for ``find_pattern`` over strings, mappings and sequences."""

    def setUp(self):
        self.pattern = re.compile(r"(dol\w+em)")
        self.expected_matches = ["dolorem", "dolurem"]
        self.str_data_matching = (
            "Neque porro quisquam est qui dolorem ipsum dolurem"
        )
        self.str_data_not_matching = (
            "quia dolor sit amet, consectetur, adipisci velit"
        )
        self.object_data_matching = {
            "id": "abcdefg0123456789",
            "type": "myblock",
            "value": self.str_data_matching,
        }
        self.object_data_not_matching = {
            "id": "abcdefg0123456789",
            "type": "myblock",
            "value": self.str_data_not_matching,
        }
        # Same as object_data_matching but lacking the "type" key, so the
        # mapping contributes nothing to the path.
        self.object_data_without_key = {
            "id": "abcdefg0123456789",
            "value": self.str_data_matching,
        }

    def test_string_matches_with_path(self):
        path_to_match = ["first"]
        path, matches = next(
            find_pattern(
                self.str_data_matching,
                self.pattern,
                "type",
                path=path_to_match,
            )
        )
        self.assertEqual(path, path_to_match)
        self.assertEqual(matches, self.expected_matches)

    def test_string_matches_without_path(self):
        path, matches = next(
            find_pattern(self.str_data_matching, self.pattern, "type")
        )
        self.assertEqual(path, [])
        self.assertEqual(matches, self.expected_matches)

    def test_string_does_not_match(self):
        with self.assertRaises(StopIteration):
            next(
                find_pattern(self.str_data_not_matching, self.pattern, "type")
            )

    def test_object_with_key_and_path(self):
        path_to_match = ["first"]
        path, matches = next(
            find_pattern(
                self.object_data_matching,
                self.pattern,
                "type",
                path=path_to_match,
            )
        )
        self.assertEqual(path, path_to_match + ["myblock"])
        self.assertEqual(matches, self.expected_matches)

    def test_object_without_key(self):
        # A mapping lacking the key must leave the path untouched.
        path_to_match = ["first"]
        path, matches = next(
            find_pattern(
                self.object_data_without_key,
                self.pattern,
                "type",
                path=path_to_match,
            )
        )
        self.assertEqual(path, path_to_match)
        self.assertEqual(matches, self.expected_matches)

    def test_object_does_not_match(self):
        with self.assertRaises(StopIteration):
            next(
                find_pattern(
                    self.object_data_not_matching, self.pattern, "type"
                )
            )

    def test_sequence_of_objects(self):
        path_to_match = ["first"]
        path, matches = next(
            find_pattern(
                [self.object_data_matching],
                self.pattern,
                "type",
                path=path_to_match,
            )
        )
        self.assertEqual(path, path_to_match + ["0", "myblock"])
        self.assertEqual(matches, self.expected_matches)
|
||
|
||
class ReportRawHTMLTestCase(DjangoTestCase):
    """Run the ``report_raw_html`` command against one published page."""

    def setUp(self):
        """Publish a BrowsePage whose content holds an HTML string."""
        stream_data = [
            {
                "type": "full_width_text",
                "value": [
                    {
                        "type": "content",
                        "value": "<p>Some rich text<br>here.</p>",
                    },
                ],
            },
        ]

        browse_page = BrowsePage(title="test", slug="testpage")
        browse_page.content = json.dumps(stream_data)

        default_site = Site.objects.get(is_default_site=True)
        default_site.root_page.add_child(instance=browse_page)

        revision = browse_page.save_revision()
        revision.publish()

    def _run_report(self, *command_args):
        """Invoke the command and return everything it wrote to stdout."""
        buffer = StringIO()
        call_command("report_raw_html", *command_args, stdout=buffer)
        return buffer.getvalue()

    def test_report_raw_html_with_page_type_and_field(self):
        report = self._run_report("v1.BrowsePage.content")
        self.assertIn("0.full_width_text.0.content", report)

    def test_report_raw_html(self):
        report = self._run_report()
        self.assertIn("0.full_width_text.0.content", report)