diff --git a/cfgov/v1/management/commands/report_raw_html.py b/cfgov/v1/management/commands/report_raw_html.py new file mode 100644 index 00000000000..f6d68820c4d --- /dev/null +++ b/cfgov/v1/management/commands/report_raw_html.py @@ -0,0 +1,138 @@ +import collections +import csv +import re + +from django.apps import apps +from django.core.management.base import BaseCommand + +from wagtail.models import Site, get_page_models + + +def find_pattern(data, pattern, key, path=None): + """Return the path and match for a given regular expression in JSON + + For the given JSON-like data descend through the data structures looking + for strings that match the `re.compile()`ed pattern. + + When one is found, the path to the match will be yielded along with any + matching strings. + + If the path goes through a JSON array, it will include + the index in the array. + + If the path goes through a JSON object, the value of the given "key" if + it exists on the object will be used to identify it in the path. + """ + if path is None: + path = [] + + # If it's a string, try to match it + if isinstance(data, str): + matches = pattern.findall(data) + if len(matches) > 0: + yield path, matches + + # If it's a mapping, iterate over its key/value pairs + elif isinstance(data, collections.Mapping): + local_path = path + if key in data: + local_path = path + [data[key]] + + for item in data: + yield from find_pattern(data[item], pattern, key, path=local_path) + + # If it's a sequence, iterate over its members + elif isinstance(data, collections.Sequence) and not isinstance(data, str): + for index, item in enumerate(data): + local_path = path + [str(index)] + yield from find_pattern(item, pattern, key, path=local_path) + + +class Command(BaseCommand): + help = ( + "Discover raw HTML tags within < > entities in page fields. " + "Pass a list in the form of app_name.page_type.field for each page " + "type and field you want to report on." + ) + + # Match "<" followed by any 0 or more characters that are not ">", + # followed by ">" This should match HTML that's encoded in HTML entities + # in the given fields. + html_tag_entity_pattern = r"<[a-zA-Z]+(?!>).*?>" + html_tag_entity_re = re.compile( + html_tag_entity_pattern, re.MULTILINE | re.DOTALL + ) + + def add_arguments(self, parser): + parser.add_argument( + "pagetype", + nargs="*", + help=( + "Specify the page type(s) and field to check." + "This should be given in the form app_name.page_type.field " + "to include a page type in the given app with the given field." + "For example, v1.BrowsePage.content." + ), + ) + + def handle(self, *args, **options): + raw_html_writer = csv.writer(self.stdout) + raw_html_writer.writerow( + ( + "Page ID", + "Page Type", + "Page Title", + "Page URL", + "Raw HTML Block Path", + "Raw HTML", + ) + ) + + pagetypes = options["pagetype"] + if len(pagetypes) == 0: + pagetypes = self.get_all_page_models_and_stream_fields() + + for app_name_page_type_field in pagetypes: + app_name, page_type, field = app_name_page_type_field.split(".") + for match_data in self.get_matches(app_name, page_type, field): + raw_html_writer.writerow(match_data) + + def get_all_page_models_and_stream_fields(self): + page_models = get_page_models() + + pagetypes = [] + for page_model in page_models: + for streamfield_name in page_model.get_streamfield_names(): + pagetypes.append( + f"{page_model._meta.app_label}." + f"{page_model._meta.object_name}." + f"{streamfield_name}" + ) + + return pagetypes + + def get_matches(self, app_name, page_type, field): + field_filter = {f"{field}__iregex": self.html_tag_entity_pattern} + PageModel = apps.get_model(app_label=app_name, model_name=page_type) + + site = Site.objects.get(is_default_site=True) + queryset = ( + PageModel.objects.live() + .in_site(site) + .filter(**field_filter) + .exact_type(PageModel) + ) + + for p in queryset: + json_field = getattr(p, field) + + for path, matches in find_pattern( + json_field.raw_data, self.html_tag_entity_re, "type" + ): + yield [ + p.id, + page_type, + p.title, + p.get_url(), + ".".join(path), + ] + matches diff --git a/cfgov/v1/tests/management/commands/test_report_raw_html.py b/cfgov/v1/tests/management/commands/test_report_raw_html.py new file mode 100644 index 00000000000..fc3beca6ed9 --- /dev/null +++ b/cfgov/v1/tests/management/commands/test_report_raw_html.py @@ -0,0 +1,133 @@ +import json +import re +from io import StringIO +from unittest import TestCase + +from django.core.management import call_command +from django.test import TestCase as DjangoTestCase + +from wagtail.models import Site + +from v1.management.commands.report_raw_html import find_pattern +from v1.models import BrowsePage + + +class FindPatternTestCase(TestCase): + def setUp(self): + self.pattern = re.compile(r"(dol\w+em)") + self.str_data_matching = ( + "Neque porro quisquam est qui dolorem ipsum dolurem" + ) + self.str_data_not_matching = ( + "quia dolor sit amet, consectetur, adipisci velit" + ) + self.object_data_matching = { + "id": "abcdefg0123456789", + "type": "myblock", + "value": self.str_data_matching, + } + self.object_data_not_matching = { + "id": "abcdefg0123456789", + "type": "myblock", + "value": self.str_data_not_matching, + } + + def test_string_matches_with_path(self): + path_to_match = ["first"] + path, matches = next( + find_pattern( + self.str_data_matching, + self.pattern, + "type", + path=path_to_match, + ) + ) + self.assertEqual(path, path_to_match) + self.assertEqual(matches, ["dolorem", "dolurem"]) + + def test_string_matches_without_path(self): + path, matches = next( + find_pattern(self.str_data_matching, self.pattern, "type") + ) + self.assertEqual(path, []) + + def test_string_does_not_match(self): + with self.assertRaises(StopIteration): + next( + find_pattern(self.str_data_not_matching, self.pattern, "type") + ) + + def test_object_with_key_and_path(self): + path_to_match = ["first"] + path, matches = next( + find_pattern( + self.object_data_matching, + self.pattern, + "type", + path=path_to_match, + ) + ) + self.assertEqual(path, path_to_match + ["myblock"]) + + def test_object_without_key(self): + with self.assertRaises(StopIteration): + next( + find_pattern( + self.object_data_not_matching, self.pattern, "type" + ) + ) + + def test_sequence_of_objects(self): + path_to_match = ["first"] + path, matches = next( + find_pattern( + [self.object_data_matching], + self.pattern, + "type", + path=path_to_match, + ) + ) + self.assertEqual(path, path_to_match + ["0", "myblock"]) + + +class ReportRawHTMLTestCase(DjangoTestCase): + def setUp(self): + html_text = "
Some rich text<br>here.
" + + full_width_text = json.dumps( + [ + { + "type": "full_width_text", + "value": [ + { + "type": "content", + "value": html_text, + }, + ], + }, + ] + ) + + page = BrowsePage(title="test", slug="testpage") + page.content = full_width_text + + site_root = Site.objects.get(is_default_site=True).root_page + site_root.add_child(instance=page) + + page.save_revision().publish() + + def test_report_raw_html_with_page_type_and_field(self): + output = StringIO() + call_command("report_raw_html", "v1.BrowsePage.content", stdout=output) + self.assertIn( + "0.full_width_text.0.content", + output.getvalue(), + ) + + def test_report_raw_html(self): + output = StringIO() + call_command("report_raw_html", stdout=output) + self.assertIn( + "0.full_width_text.0.content", + output.getvalue(), + )