Remove embeds from selected downloads (#1181)

* Remove embeds from selected downloads * Additional options on NINJS item endpoints
superdesk · Dec 14, 2023 · ff632eb · ff632eb
1 parent 59fde73
commit ff632eb
Show file tree

Hide file tree

Showing 10 changed files with 144 additions and 43 deletions.
diff --git a/features/news_api_item.feature b/features/news_api_item.feature
@@ -204,4 +204,72 @@ Feature: News API Item
       "headline": "headline 1",
       "associations": {"featuremedia": {"renditions": {"original": {}} }}
     }
+    """
+
+  Scenario: Item request response strips embeds
+    Given "items"
+        """
+        [{"_id": "111", "body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\"<figcaption>Some caption</figcaption></figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
+         "headline": "headline 1",
+         "firstpublished": "#DATE-1#", "versioncreated": "#DATE#",
+         "associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}]
+        """
+    Given "products"
+        """
+        [{"name": "A fishy Product",
+        "decsription": "a product for those interested in fish",
+        "companies" : [
+          "#companies._id#"
+        ],
+        "query": "Once upon a time",
+        "product_type": "news_api"
+        },
+        {"name": "A fishy superdesk product",
+        "description": "a superdesk product restricting images in the atom feed",
+        "companies" : [
+          "#companies._id#"
+        ],
+        "sd_product_id": "1234",
+        "product_type": "news_api"
+        }
+        ]
+        """
+    When we get "/news/item/111?format=NINJSFormatter&no_embeds=true&no_media=1"
+    Then we get existing resource
+    """
+    {
+      "guid": "111",
+      "headline": "headline 1",
+      "body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>"
+    }
+    """
+    When we get "/news/item/111?format=NINJSFormatter2&no_embeds=true"
+    Then we get existing resource
+    """
+    {
+      "guid": "111",
+      "headline": "headline 1",
+      "body_html": "<p>Once upon a time there was </p><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\">Some caption</figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
+      "associations": {"editor_19": {"renditions": {"original": {}}}}
+    }
+    """
+    When we get "/news/item/111?format=NINJSFormatter2&no_media=true"
+    Then we get existing resource
+    """
+    {
+      "guid": "111",
+      "headline": "headline 1",
+      "body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p></p>",
+      "associations": {}
+    }
+    """
+    When we get "/news/item/111?format=NINJSFormatter3"
+    Then we get existing resource
+    """
+    {
+      "guid": "111",
+      "headline": "headline 1",
+      "body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>",
+      "associations": {}
+    }
     """
diff --git a/newsroom/monitoring/utils.py b/newsroom/monitoring/utils.py
@@ -1,9 +1,7 @@
 from flask import current_app as app
-from lxml import html as lxml_html
-import re
 import collections
 from superdesk.text_utils import get_text
-from newsroom.utils import get_items_by_id
+from newsroom.utils import get_items_by_id, remove_all_embeds
 from superdesk import etree as sd_etree
 
 
@@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False):
     items = get_items_by_id(_ids, 'items')
     truncate_article_body(items, monitoring_profile, full_text)
     return items
-
-
-def remove_all_embeds(item):
-    """
-    Remove the all embeds from the body of the article
-    :param item:
-    :return:
-    """
-    root_elem = lxml_html.fromstring(item.get('body_html') or '<p></p>')
-    regex = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)"
-    html_updated = False
-    comments = root_elem.xpath('//comment()')
-    for comment in comments:
-        m = re.search(regex, comment.text)
-        # if we've found an Embed Start comment
-        if m and m.group(1):
-            parent = comment.getparent()
-            for elem in comment.itersiblings():
-                parent.remove(elem)
-                if elem.text and ' EMBED END ' in elem.text:
-                    break
-            parent.remove(comment)
-            html_updated = True
-    if html_updated:
-        item["body_html"] = sd_etree.to_string(root_elem, method="html")
diff --git a/newsroom/utils.py b/newsroom/utils.py
@@ -5,6 +5,7 @@
 import pytz
 import re
 from lxml import html as lxml_html
+from lxml.html import clean
 
 from superdesk.etree import to_string
 from superdesk.utc import utcnow
@@ -460,3 +461,39 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid
                     body_updated = update_video(item, elem, m.group(1)) or body_updated
     if body_updated:
         item['body_html'] = to_string(root_elem, method="html")
+
+
+def remove_all_embeds(item, remove_by_class=True, remove_media_embeds=True):
+    """
+    Remove the all embeds from the body of the article, including any divs with the embed_block attribute
+    :param item:
+    :param remove_by_class: If true removes any divs that have the embed-block class, should remove such things as
+    embedded tweets
+    :param remove_media_embeds: Remove any figure tags if the passed value is true
+    :return:
+    """
+    if not item.get("body_html", ""):
+        return
+
+    root_elem = lxml_html.fromstring(item.get("body_html", ""))
+
+    if remove_by_class:
+        # all embedded tweets etc should be in a div with the class embeded-block, these are removed
+        embeds = root_elem.xpath('//div[@class=\'embed-block\']')
+        for embed in embeds:
+            embed.getparent().remove(embed)
+
+    if not remove_media_embeds:
+        item["body_html"] = to_string(root_elem, encoding="unicode", method='html')
+        return
+
+    # clean all the embedded figures from the html, it will remove the comments as well
+    cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"])
+    cleaned_xhtml = cleaner.clean_html(root_elem)
+
+    # remove the associations relating to the embeds
+    kill_keys = [key for key in item.get("associations", {}) if key.startswith("editor_")]
+    for key in kill_keys:
+        item.get("associations", {}).pop(key, None)
+
+    item["body_html"] = to_string(cleaned_xhtml, encoding="unicode", method='html')
diff --git a/newsroom/wire/formatters/html.py b/newsroom/wire/formatters/html.py
@@ -1,8 +1,6 @@
 import flask
 from .base import BaseFormatter
-from lxml import html as lxml_html
-from lxml.html import clean
-from lxml import etree
+from newsroom.utils import remove_all_embeds
 
 
 class HTMLFormatter(BaseFormatter):
@@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter):
     MIMETYPE = 'text/html'
 
     def format_item(self, item, item_type='items'):
-
-        # clean all the embedded figures from the html
-        blacklist = ["figure"]
-        root_elem = lxml_html.fromstring(item.get("body_html", ""))
-        cleaner = clean.Cleaner(
-            add_nofollow=False,
-            kill_tags=blacklist
-        )
-        cleaned_xhtml = cleaner.clean_html(root_elem)
-
-        item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html')
+        remove_all_embeds(item)
 
         if item_type == 'items':
             return str.encode(flask.render_template('download_item.html', item=item), 'utf-8')

diff --git a/newsroom/wire/formatters/newsmlg2.py b/newsroom/wire/formatters/newsmlg2.py
@@ -4,6 +4,7 @@
 
 from superdesk.publish.formatters.nitf_formatter import NITFFormatter
 from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter
+from newsroom.utils import remove_all_embeds
 
 from .base import BaseFormatter
 
@@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter):
     nitf_formatter = NITFFormatter()
 
     def format_item(self, item, item_type='items'):
+        remove_all_embeds(item)
         item = item.copy()
         item.setdefault('guid', item['_id'])
         item.setdefault('_current_version', item['version'])

diff --git a/newsroom/wire/formatters/ninjs.py b/newsroom/wire/formatters/ninjs.py
@@ -1,6 +1,8 @@
+import flask
 import json
 from .base import BaseFormatter
 from superdesk.utils import json_serialize_datetime_objectId
+from newsroom.utils import remove_all_embeds
 
 
 class NINJSFormatter(BaseFormatter):
@@ -20,7 +22,21 @@ def format_item(self, item, item_type='items'):
 
         return json.dumps(ninjs, default=json_serialize_datetime_objectId)
 
+    @staticmethod
+    def test_for_true(value):
+        """
+        Test if the value indicates false
+        :param value:
+        :return:
+        """
+        return value.lower() == 'true' or value == '1'
+
     def _transform_to_ninjs(self, item):
+        no_embeds = flask.request.args.get('no_embeds', default=False, type=self.test_for_true)
+        no_media = flask.request.args.get('no_media', default=False, type=self.test_for_true)
+        if no_media or no_embeds:
+            remove_all_embeds(item, remove_media_embeds=no_media, remove_by_class=no_embeds)
+
         ninjs = {
             'guid': item.get('_id'),
             'version': str(item.get('version', 1)),

diff --git a/newsroom/wire/formatters/ninjs2.py b/newsroom/wire/formatters/ninjs2.py
@@ -1,6 +1,7 @@
 from .ninjs import NINJSFormatter
 from newsroom.news_api.utils import check_featuremedia_association_permission
 from newsroom.wire.formatters.utils import remove_internal_renditions
+from newsroom.utils import remove_all_embeds
 
 
 class NINJSFormatter2(NINJSFormatter):
@@ -18,3 +19,14 @@ def _transform_to_ninjs(self, item):
             if not item.get('associations'):
                 item.pop('associations', None)
         return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True)
+
+
+class NINJSFormatter3(NINJSFormatter2):
+    """
+    Format with no Embeds
+    """
+
+    def _transform_to_ninjs(self, item):
+        remove_all_embeds(item)
+        ninjs = super()._transform_to_ninjs(item)
+        return ninjs
diff --git a/newsroom/wire/formatters/nitf.py b/newsroom/wire/formatters/nitf.py
@@ -1,6 +1,7 @@
 
 from lxml import etree
 from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter
+from newsroom.utils import remove_all_embeds
 
 from .base import BaseFormatter
 
@@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter):
     formatter = SuperdeskNITFFormatter()
 
     def format_item(self, item, item_type='items'):
+        remove_all_embeds(item)
         dest = {}
         nitf = self.formatter.get_nitf(item, dest, '')
         return etree.tostring(nitf, xml_declaration=True, pretty_print=True, encoding=self.encoding)
diff --git a/newsroom/wire/formatters/text.py b/newsroom/wire/formatters/text.py
@@ -1,6 +1,7 @@
 
 import flask
 from .base import BaseFormatter
+from newsroom.utils import remove_all_embeds
 
 
 class TextFormatter(BaseFormatter):
@@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter):
     MIMETYPE = 'text/plain'
 
     def format_item(self, item, item_type='items'):
+        remove_all_embeds(item)
         if item_type == 'items':
             return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8')
         else:

diff --git a/newsroom/wire/views.py b/newsroom/wire/views.py
@@ -253,7 +253,8 @@ def download(_ids):
 
     update_action_list(_ids.split(','), 'downloads', force_insert=True)
     get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire'))
-    return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True)
+    return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True,
+                           cache_timeout=0)
 
 
 @blueprint.route('/wire_share', methods=['POST'])