diff --git a/features/news_api_item.feature b/features/news_api_item.feature index 893de640..32b0adbb 100644 --- a/features/news_api_item.feature +++ b/features/news_api_item.feature @@ -204,4 +204,72 @@ Feature: News API Item "headline": "headline 1", "associations": {"featuremedia": {"renditions": {"original": {}} }} } + """ + + Scenario: Item request response strips embeds + Given "items" + """ + [{"_id": "111", "body_html": "
Once upon a time there was
who could swim
", + "headline": "headline 1", + "firstpublished": "#DATE-1#", "versioncreated": "#DATE#", + "associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}] + """ + Given "products" + """ + [{"name": "A fishy Product", + "decsription": "a product for those interested in fish", + "companies" : [ + "#companies._id#" + ], + "query": "Once upon a time", + "product_type": "news_api" + }, + {"name": "A fishy superdesk product", + "description": "a superdesk product restricting images in the atom feed", + "companies" : [ + "#companies._id#" + ], + "sd_product_id": "1234", + "product_type": "news_api" + } + ] + """ + When we get "/news/item/111?format=NINJSFormatter&no_embeds=true&no_media=1" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "Once upon a time there was
who could swim
" + } + """ + When we get "/news/item/111?format=NINJSFormatter2&no_embeds=true" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "Once upon a time there was
who could swim
", + "associations": {"editor_19": {"renditions": {"original": {}}}} + } + """ + When we get "/news/item/111?format=NINJSFormatter2&no_media=true" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "Once upon a time there was
who could swim
", + "associations": {} + } + """ + When we get "/news/item/111?format=NINJSFormatter3" + Then we get existing resource + """ + { + "guid": "111", + "headline": "headline 1", + "body_html": "Once upon a time there was
who could swim
", + "associations": {} + } """ \ No newline at end of file diff --git a/newsroom/monitoring/utils.py b/newsroom/monitoring/utils.py index 9d7799b9..b5f5c611 100644 --- a/newsroom/monitoring/utils.py +++ b/newsroom/monitoring/utils.py @@ -1,9 +1,7 @@ from flask import current_app as app -from lxml import html as lxml_html -import re import collections from superdesk.text_utils import get_text -from newsroom.utils import get_items_by_id +from newsroom.utils import get_items_by_id, remove_all_embeds from superdesk import etree as sd_etree @@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False): items = get_items_by_id(_ids, 'items') truncate_article_body(items, monitoring_profile, full_text) return items - - -def remove_all_embeds(item): - """ - Remove the all embeds from the body of the article - :param item: - :return: - """ - root_elem = lxml_html.fromstring(item.get('body_html') or '') - regex = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)" - html_updated = False - comments = root_elem.xpath('//comment()') - for comment in comments: - m = re.search(regex, comment.text) - # if we've found an Embed Start comment - if m and m.group(1): - parent = comment.getparent() - for elem in comment.itersiblings(): - parent.remove(elem) - if elem.text and ' EMBED END ' in elem.text: - break - parent.remove(comment) - html_updated = True - if html_updated: - item["body_html"] = sd_etree.to_string(root_elem, method="html") diff --git a/newsroom/utils.py b/newsroom/utils.py index 7a06beb5..a8e430a9 100644 --- a/newsroom/utils.py +++ b/newsroom/utils.py @@ -5,6 +5,7 @@ import pytz import re from lxml import html as lxml_html +from lxml.html import clean from superdesk.etree import to_string from superdesk.utc import utcnow @@ -460,3 +461,39 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid body_updated = update_video(item, elem, m.group(1)) or body_updated if body_updated: item['body_html'] = 
to_string(root_elem, method="html") + + +def remove_all_embeds(item, remove_by_class=True, remove_media_embeds=True): + """ + Remove all the embeds from the body of the article, including any divs with the embed-block class + :param item: + :param remove_by_class: If true removes any divs that have the embed-block class, should remove such things as + embedded tweets + :param remove_media_embeds: Remove any figure tags if the passed value is true + :return: + """ + if not item.get("body_html", ""): + return + + root_elem = lxml_html.fromstring(item.get("body_html", "")) + + if remove_by_class: + # all embedded tweets etc should be in a div with the class embed-block, these are removed + embeds = root_elem.xpath('//div[@class=\'embed-block\']') + for embed in embeds: + embed.getparent().remove(embed) + + if not remove_media_embeds: + item["body_html"] = to_string(root_elem, encoding="unicode", method='html') + return + + # clean all the embedded figures from the html, it will remove the comments as well + cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"]) + cleaned_xhtml = cleaner.clean_html(root_elem) + + # remove the associations relating to the embeds + kill_keys = [key for key in item.get("associations", {}) if key.startswith("editor_")] + for key in kill_keys: + item.get("associations", {}).pop(key, None) + + item["body_html"] = to_string(cleaned_xhtml, encoding="unicode", method='html') diff --git a/newsroom/wire/formatters/html.py b/newsroom/wire/formatters/html.py index 78b3b760..07e0c930 100644 --- a/newsroom/wire/formatters/html.py +++ b/newsroom/wire/formatters/html.py @@ -1,8 +1,6 @@ import flask from .base import BaseFormatter -from lxml import html as lxml_html -from lxml.html import clean -from lxml import etree +from newsroom.utils import remove_all_embeds class HTMLFormatter(BaseFormatter): @@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter): MIMETYPE = 'text/html' def format_item(self, item, item_type='items'): - - # clean 
all the embedded figures from the html - blacklist = ["figure"] - root_elem = lxml_html.fromstring(item.get("body_html", "")) - cleaner = clean.Cleaner( - add_nofollow=False, - kill_tags=blacklist - ) - cleaned_xhtml = cleaner.clean_html(root_elem) - - item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html') + remove_all_embeds(item) if item_type == 'items': return str.encode(flask.render_template('download_item.html', item=item), 'utf-8') diff --git a/newsroom/wire/formatters/newsmlg2.py b/newsroom/wire/formatters/newsmlg2.py index 5a54102a..80159b6a 100644 --- a/newsroom/wire/formatters/newsmlg2.py +++ b/newsroom/wire/formatters/newsmlg2.py @@ -4,6 +4,7 @@ from superdesk.publish.formatters.nitf_formatter import NITFFormatter from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter +from newsroom.utils import remove_all_embeds from .base import BaseFormatter @@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter): nitf_formatter = NITFFormatter() def format_item(self, item, item_type='items'): + remove_all_embeds(item) item = item.copy() item.setdefault('guid', item['_id']) item.setdefault('_current_version', item['version']) diff --git a/newsroom/wire/formatters/ninjs.py b/newsroom/wire/formatters/ninjs.py index 24514e33..6d723e42 100644 --- a/newsroom/wire/formatters/ninjs.py +++ b/newsroom/wire/formatters/ninjs.py @@ -1,6 +1,8 @@ +import flask import json from .base import BaseFormatter from superdesk.utils import json_serialize_datetime_objectId +from newsroom.utils import remove_all_embeds class NINJSFormatter(BaseFormatter): @@ -20,7 +22,21 @@ def format_item(self, item, item_type='items'): return json.dumps(ninjs, default=json_serialize_datetime_objectId) + @staticmethod + def test_for_true(value): + """ + Test if the value indicates true + :param value: + :return: + """ + return value.lower() == 'true' or value == '1' + def _transform_to_ninjs(self, item): + no_embeds = 
flask.request.args.get('no_embeds', default=False, type=self.test_for_true) + no_media = flask.request.args.get('no_media', default=False, type=self.test_for_true) + if no_media or no_embeds: + remove_all_embeds(item, remove_media_embeds=no_media, remove_by_class=no_embeds) + ninjs = { 'guid': item.get('_id'), 'version': str(item.get('version', 1)), diff --git a/newsroom/wire/formatters/ninjs2.py b/newsroom/wire/formatters/ninjs2.py index e7601802..cdb1b0f3 100644 --- a/newsroom/wire/formatters/ninjs2.py +++ b/newsroom/wire/formatters/ninjs2.py @@ -1,6 +1,7 @@ from .ninjs import NINJSFormatter from newsroom.news_api.utils import check_featuremedia_association_permission from newsroom.wire.formatters.utils import remove_internal_renditions +from newsroom.utils import remove_all_embeds class NINJSFormatter2(NINJSFormatter): @@ -18,3 +19,14 @@ def _transform_to_ninjs(self, item): if not item.get('associations'): item.pop('associations', None) return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True) + + +class NINJSFormatter3(NINJSFormatter2): + """ + Format with no Embeds + """ + + def _transform_to_ninjs(self, item): + remove_all_embeds(item) + ninjs = super()._transform_to_ninjs(item) + return ninjs diff --git a/newsroom/wire/formatters/nitf.py b/newsroom/wire/formatters/nitf.py index af44987e..7dd87761 100644 --- a/newsroom/wire/formatters/nitf.py +++ b/newsroom/wire/formatters/nitf.py @@ -1,6 +1,7 @@ from lxml import etree from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter +from newsroom.utils import remove_all_embeds from .base import BaseFormatter @@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter): formatter = SuperdeskNITFFormatter() def format_item(self, item, item_type='items'): + remove_all_embeds(item) dest = {} nitf = self.formatter.get_nitf(item, dest, '') return etree.tostring(nitf, xml_declaration=True, pretty_print=True, encoding=self.encoding) diff --git 
a/newsroom/wire/formatters/text.py b/newsroom/wire/formatters/text.py index eea01599..d7a36d8f 100644 --- a/newsroom/wire/formatters/text.py +++ b/newsroom/wire/formatters/text.py @@ -1,6 +1,7 @@ import flask from .base import BaseFormatter +from newsroom.utils import remove_all_embeds class TextFormatter(BaseFormatter): @@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter): MIMETYPE = 'text/plain' def format_item(self, item, item_type='items'): + remove_all_embeds(item) if item_type == 'items': return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8') else: diff --git a/newsroom/wire/views.py b/newsroom/wire/views.py index a6031575..32801e3d 100644 --- a/newsroom/wire/views.py +++ b/newsroom/wire/views.py @@ -253,7 +253,8 @@ def download(_ids): update_action_list(_ids.split(','), 'downloads', force_insert=True) get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire')) - return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True) + return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True, + cache_timeout=0) @blueprint.route('/wire_share', methods=['POST'])