Skip to content

Commit

Permalink
Remove embeds from selected downloads (#1181)
Browse files Browse the repository at this point in the history
* Remove embeds from selected downloads

* Additional options on NINJS item endpoints
  • Loading branch information
marwoodandrew authored Dec 14, 2023
1 parent 59fde73 commit ff632eb
Show file tree
Hide file tree
Showing 10 changed files with 144 additions and 43 deletions.
68 changes: 68 additions & 0 deletions features/news_api_item.feature
Original file line number Diff line number Diff line change
Expand Up @@ -204,4 +204,72 @@ Feature: News API Item
"headline": "headline 1",
"associations": {"featuremedia": {"renditions": {"original": {}} }}
}
"""

Scenario: Item request response strips embeds
Given "items"
"""
[{"_id": "111", "body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\"<figcaption>Some caption</figcaption></figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
"headline": "headline 1",
"firstpublished": "#DATE-1#", "versioncreated": "#DATE#",
"associations": {"editor_19": {"products": [{"code": "1234"}], "renditions": {"original": {}} }}}]
"""
Given "products"
"""
[{"name": "A fishy Product",
"description": "a product for those interested in fish",
"companies" : [
"#companies._id#"
],
"query": "Once upon a time",
"product_type": "news_api"
},
{"name": "A fishy superdesk product",
"description": "a superdesk product restricting images in the atom feed",
"companies" : [
"#companies._id#"
],
"sd_product_id": "1234",
"product_type": "news_api"
}
]
"""
When we get "/news/item/111?format=NINJSFormatter&no_embeds=true&no_media=1"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>"
}
"""
When we get "/news/item/111?format=NINJSFormatter2&no_embeds=true"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p><!-- EMBED START Image {id: \"editor_19\"} --><figure><img src=\"somthing\" alt=\"alt text\" id=\"editor_19\">Some caption</figure><!-- EMBED END Image {id: \"editor_19\"} --></p>",
"associations": {"editor_19": {"renditions": {"original": {}}}}
}
"""
When we get "/news/item/111?format=NINJSFormatter2&no_media=true"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><div class=\"embed-block\">a fish</div><p> who could swim</p><p></p>",
"associations": {}
}
"""
When we get "/news/item/111?format=NINJSFormatter3"
Then we get existing resource
"""
{
"guid": "111",
"headline": "headline 1",
"body_html": "<p>Once upon a time there was </p><p> who could swim</p><p></p>",
"associations": {}
}
"""
29 changes: 1 addition & 28 deletions newsroom/monitoring/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from flask import current_app as app
from lxml import html as lxml_html
import re
import collections
from superdesk.text_utils import get_text
from newsroom.utils import get_items_by_id
from newsroom.utils import get_items_by_id, remove_all_embeds
from superdesk import etree as sd_etree


Expand Down Expand Up @@ -69,28 +67,3 @@ def get_items_for_monitoring_report(_ids, monitoring_profile, full_text=False):
items = get_items_by_id(_ids, 'items')
truncate_article_body(items, monitoring_profile, full_text)
return items


def remove_all_embeds(item):
    """
    Remove all the media embeds from the body of the article.

    An embed is located by its ``EMBED START`` comment (Image, Video or Audio); the comment and the
    siblings that follow it, up to and including the one containing ``EMBED END``, are removed.
    The item's ``body_html`` is only rewritten if at least one embed was found.

    :param item: the item whose ``body_html`` is updated in place
    :return: None
    """
    tree = lxml_html.fromstring(item.get('body_html') or '<p></p>')
    embed_start = r" EMBED START (?:Image|Video|Audio) {id: \"editor_([0-9]+)"
    changed = False
    for marker in tree.xpath('//comment()'):
        found = re.search(embed_start, marker.text)
        # only the Embed Start comments are of interest
        if not found or not found.group(1):
            continue
        container = marker.getparent()
        for sibling in marker.itersiblings():
            container.remove(sibling)
            # stop once the matching end marker has been removed
            if sibling.text and ' EMBED END ' in sibling.text:
                break
        container.remove(marker)
        changed = True
    if changed:
        item["body_html"] = sd_etree.to_string(tree, method="html")
37 changes: 37 additions & 0 deletions newsroom/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytz
import re
from lxml import html as lxml_html
from lxml.html import clean

from superdesk.etree import to_string
from superdesk.utc import utcnow
Expand Down Expand Up @@ -460,3 +461,39 @@ def update_embeds_in_body(item, update_image=None, update_audio=None, update_vid
body_updated = update_video(item, elem, m.group(1)) or body_updated
if body_updated:
item['body_html'] = to_string(root_elem, method="html")


def remove_all_embeds(item, remove_by_class=True, remove_media_embeds=True):
    """
    Remove the embeds from the body of the article, the item is updated in place.

    :param item: the item to update, an item with no body_html is left untouched
    :param remove_by_class: If true removes any divs that have the embed-block class, should remove such things as
                            embedded tweets
    :param remove_media_embeds: If true removes any figure tags from the body and drops the associations whose
                                key starts with "editor_"
    :return: None
    """
    if not item.get("body_html"):
        return

    root_elem = lxml_html.fromstring(item["body_html"])

    if remove_by_class:
        # all embedded tweets etc should be in a div with the class embed-block, these are removed
        for embed in root_elem.xpath('//div[@class=\'embed-block\']'):
            # drop_tree() keeps any text trailing the div, getparent().remove() would discard it
            embed.drop_tree()

    if remove_media_embeds:
        # clean all the embedded figures from the html, it will remove the comments as well
        cleaner = clean.Cleaner(add_nofollow=False, kill_tags=["figure"])
        root_elem = cleaner.clean_html(root_elem)

        # remove the associations relating to the embeds, the associations may be missing or None
        associations = item.get("associations") or {}
        for key in [key for key in associations if key.startswith("editor_")]:
            associations.pop(key, None)

    item["body_html"] = to_string(root_elem, encoding="unicode", method='html')
16 changes: 2 additions & 14 deletions newsroom/wire/formatters/html.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import flask
from .base import BaseFormatter
from lxml import html as lxml_html
from lxml.html import clean
from lxml import etree
from newsroom.utils import remove_all_embeds


class HTMLFormatter(BaseFormatter):
Expand All @@ -14,17 +12,7 @@ class HTMLFormatter(BaseFormatter):
MIMETYPE = 'text/html'

def format_item(self, item, item_type='items'):

# clean all the embedded figures from the html
blacklist = ["figure"]
root_elem = lxml_html.fromstring(item.get("body_html", ""))
cleaner = clean.Cleaner(
add_nofollow=False,
kill_tags=blacklist
)
cleaned_xhtml = cleaner.clean_html(root_elem)

item["body_html"] = etree.tostring(cleaned_xhtml, encoding="unicode", method='html')
remove_all_embeds(item)

if item_type == 'items':
return str.encode(flask.render_template('download_item.html', item=item), 'utf-8')
Expand Down
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/newsmlg2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from superdesk.publish.formatters.nitf_formatter import NITFFormatter
from superdesk.publish.formatters.newsml_g2_formatter import NewsMLG2Formatter as SuperdeskFormatter
from newsroom.utils import remove_all_embeds

from .base import BaseFormatter

Expand Down Expand Up @@ -34,6 +35,7 @@ class NewsMLG2Formatter(BaseFormatter):
nitf_formatter = NITFFormatter()

def format_item(self, item, item_type='items'):
remove_all_embeds(item)
item = item.copy()
item.setdefault('guid', item['_id'])
item.setdefault('_current_version', item['version'])
Expand Down
16 changes: 16 additions & 0 deletions newsroom/wire/formatters/ninjs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import flask
import json
from .base import BaseFormatter
from superdesk.utils import json_serialize_datetime_objectId
from newsroom.utils import remove_all_embeds


class NINJSFormatter(BaseFormatter):
Expand All @@ -20,7 +22,21 @@ def format_item(self, item, item_type='items'):

return json.dumps(ninjs, default=json_serialize_datetime_objectId)

@staticmethod
def test_for_true(value):
"""
Test if the value indicates false
:param value:
:return:
"""
return value.lower() == 'true' or value == '1'

def _transform_to_ninjs(self, item):
no_embeds = flask.request.args.get('no_embeds', default=False, type=self.test_for_true)
no_media = flask.request.args.get('no_media', default=False, type=self.test_for_true)
if no_media or no_embeds:
remove_all_embeds(item, remove_media_embeds=no_media, remove_by_class=no_embeds)

ninjs = {
'guid': item.get('_id'),
'version': str(item.get('version', 1)),
Expand Down
12 changes: 12 additions & 0 deletions newsroom/wire/formatters/ninjs2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .ninjs import NINJSFormatter
from newsroom.news_api.utils import check_featuremedia_association_permission
from newsroom.wire.formatters.utils import remove_internal_renditions
from newsroom.utils import remove_all_embeds


class NINJSFormatter2(NINJSFormatter):
Expand All @@ -18,3 +19,14 @@ def _transform_to_ninjs(self, item):
if not item.get('associations'):
item.pop('associations', None)
return remove_internal_renditions(super()._transform_to_ninjs(item), remove_media=True)


class NINJSFormatter3(NINJSFormatter2):
    """
    NINJS format with no Embeds, they are removed from the item before it is transformed
    """

    def _transform_to_ninjs(self, item):
        # strip the embeds first, then let the parent formatter build the ninjs
        remove_all_embeds(item)
        return super()._transform_to_ninjs(item)
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/nitf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

from lxml import etree
from superdesk.publish.formatters.nitf_formatter import NITFFormatter as SuperdeskNITFFormatter
from newsroom.utils import remove_all_embeds

from .base import BaseFormatter

Expand All @@ -14,6 +15,7 @@ class NITFFormatter(BaseFormatter):
formatter = SuperdeskNITFFormatter()

def format_item(self, item, item_type='items'):
    """
    Format the item as NITF, the embeds are removed from the item beforehand

    :param item: the item to format, it is updated in place by the embed removal
    :param item_type: not used by this formatter
    :return: the serialised NITF including the xml declaration
    """
    remove_all_embeds(item)
    nitf = self.formatter.get_nitf(item, {}, '')
    return etree.tostring(nitf, encoding=self.encoding, pretty_print=True, xml_declaration=True)
2 changes: 2 additions & 0 deletions newsroom/wire/formatters/text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

import flask
from .base import BaseFormatter
from newsroom.utils import remove_all_embeds


class TextFormatter(BaseFormatter):
Expand All @@ -9,6 +10,7 @@ class TextFormatter(BaseFormatter):
MIMETYPE = 'text/plain'

def format_item(self, item, item_type='items'):
remove_all_embeds(item)
if item_type == 'items':
return str.encode(flask.render_template('download_item.txt', item=item), 'utf-8')
else:
Expand Down
3 changes: 2 additions & 1 deletion newsroom/wire/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ def download(_ids):

update_action_list(_ids.split(','), 'downloads', force_insert=True)
get_resource_service('history').create_history_record(items, 'download', user, request.args.get('type', 'wire'))
return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True)
return flask.send_file(_file, mimetype=mimetype, attachment_filename=attachment_filename, as_attachment=True,
cache_timeout=0)


@blueprint.route('/wire_share', methods=['POST'])
Expand Down

0 comments on commit ff632eb

Please sign in to comment.