Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix : Enabling urls without http in front of them [STTNHUB-252] #42

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions server/stt/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,57 @@
from superdesk import get_resource_service
from superdesk.metadata.item import ITEM_TYPE, ITEM_STATE
from planning.common import WORKFLOW_STATE, POST_STATE, update_post_item
import re


def planning_xml_contains_remove_signal(xml: Element) -> bool:
    """Returns ``True`` if the ``sttinstruct:remove`` signal is included, ``False`` otherwise

    :param xml: parsed document; its ``xpath`` method is queried for an
        ``itemMeta/signal`` element whose ``qcode`` is ``sttinstruct:remove``
        (``iptc`` namespace).
    :return: ``True`` when the query yields a truthy result, ``False`` otherwise.
    """

    namespaces = {"iptc": "http://iptc.org/std/nar/2006-10-01/"}
    # A truthy xpath result means at least one matching signal was found, so
    # the result is converted straight to a bool instead of branching on it.
    return bool(
        xml.xpath(
            "//iptc:itemMeta/iptc:signal[@qcode='sttinstruct:remove']",
            namespaces=namespaces,
        )
    )


def unpost_or_spike_event_or_planning(item: Dict[str, Any]):
    """Spike or cancel the stored event/planning item that matches ``item``.

    The stored document is looked up by ``item["guid"]`` in the ``events``
    resource when ``item`` has item type ``"event"``, otherwise in ``planning``.

    * No ``pubstatus`` and a state of ingested, draft, postponed or cancelled:
      the document is patched through the ``<resource>_spike`` service.
    * Otherwise, unless its ``pubstatus`` is already cancelled:
      ``update_post_item`` is called with ``pubstatus`` set to cancelled.

    :param item: ingested item; must contain ``guid``.
    :return: ``None``. Does nothing when no stored document is found.
    """
    item_resource = "events" if item.get(ITEM_TYPE) == "event" else "planning"
    original: Dict[str, Any] = (
        get_resource_service(item_resource).find_one(req=None, _id=item["guid"]) or {}
    )

    if not original:
        # Nothing is stored under this guid. Without this guard the empty dict
        # falls through to the ``elif`` below and ``original["_etag"]`` raises
        # ``KeyError``.
        return

    if not original.get("pubstatus") and original.get(ITEM_STATE) in [
        WORKFLOW_STATE.INGESTED,
        WORKFLOW_STATE.DRAFT,
        WORKFLOW_STATE.POSTPONED,
        WORKFLOW_STATE.CANCELLED,
    ]:
        get_resource_service(item_resource + "_spike").patch(
            original[config.ID_FIELD], original
        )
    elif original.get("pubstatus") != POST_STATE.CANCELLED:
        update_post_item(
            {"pubstatus": POST_STATE.CANCELLED, "_etag": original["_etag"]}, original
        )


# Matches either an existing anchor element (returned untouched, so running the
# transformation twice does not nest links) or a URL candidate: an explicit
# http(s) URL, a "www."-prefixed host, or a bare domain ending in an alphabetic
# TLD. The lookbehind stops a match from starting in the middle of a token such
# as an e-mail address or a decimal number.
_LINK_PATTERN = re.compile(
    r"(?P<anchor><a\b[^>]*>.*?</a>)"
    r"|(?<![\w@./-])(?P<url>"
    r"https?://[^\s<>\"]+"
    r"|www\.[^\s<>\"]+"
    r"|(?:[\w-]+\.)+[^\W\d_]{2,}(?![\w-])(?:[:/?#][^\s<>\"]*)?"
    r")",
    re.IGNORECASE | re.DOTALL,
)

# Closing brackets are only treated as trailing punctuation when unbalanced.
_CLOSERS = {")": "(", "]": "[", "}": "{"}


def _split_trailing_punctuation(url):
    """Split ``url`` into ``(link, tail)`` where ``tail`` is sentence punctuation."""
    end = len(url)
    while end:
        char = url[end - 1]
        if char in ".,;:!?'":
            end -= 1
        elif char in _CLOSERS and url.count(char, 0, end) > url.count(_CLOSERS[char], 0, end):
            end -= 1
        else:
            break
    return url[:end], url[end:]


def transform_link_from_text(item, fields):
    """Wrap URLs found in the given text fields of ``item`` in ``<a>`` tags.

    URLs without a scheme (``www.stt.fi``, ``stt.fi``) get ``https://``
    prepended in the ``href``; the visible link text is left as written.
    Detection is heuristic: a bare domain must end in a label of two or more
    letters, so tokens such as ``3.14``, ``e.g.`` or e-mail addresses are not
    linked.

    NOTE (from code review): superdesk-core already contains equivalent
    plain-text-to-rich-text conversion used for Editor3 fields
    (https://github.com/superdesk/superdesk-core/blob/develop/superdesk/text_utils.py#L193);
    the reviewer's position is that the underlying bug should be fixed there.

    :param item: dict-like item, modified in place.
    :param fields: iterable of field names to process; missing, empty and
        non-string values are skipped.
    :return: ``None``.
    """
    from html import escape  # local import; the module header is not touched

    def replace(match):
        if match.group("anchor"):
            # Already a link: keep as-is so the function is idempotent.
            return match.group(0)
        url, tail = _split_trailing_punctuation(match.group("url"))
        if "." not in url:
            # Only punctuation followed the scheme/prefix; not a usable URL.
            return match.group(0)
        href = url
        if not url.lower().startswith(("http://", "https://")):
            href = "https://" + url
        # Escape the attribute value so "&" or "'" cannot break out of href.
        return f'<a href="{escape(href, quote=True)}">{url}</a>{tail}'

    for field in fields:
        value = item.get(field)
        if value and isinstance(value, str):
            item[field] = _LINK_PATTERN.sub(replace, value)
126 changes: 88 additions & 38 deletions server/stt/stt_events_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
from superdesk.errors import SuperdeskApiError
from planning.feed_parsers.events_ml import EventsMLParser

from .common import planning_xml_contains_remove_signal, unpost_or_spike_event_or_planning
from .common import (
planning_xml_contains_remove_signal,
unpost_or_spike_event_or_planning,
transform_link_from_text,
)

logger = logging.getLogger(__name__)
TIMEZONE = "Europe/Helsinki"
Expand All @@ -27,7 +31,19 @@ def search_existing_contacts(contact: Dict[str, Any]) -> Optional[Dict[str, Any]
contacts_service = get_resource_service("contacts")
if len(contact.get("contact_email") or []):
cursor = contacts_service.search(
{"query": {"bool": {"must": [{"term": {"contact_email.keyword": contact["contact_email"][0]}}]}}}
{
"query": {
"bool": {
"must": [
{
"term": {
"contact_email.keyword": contact["contact_email"][0]
}
}
]
}
}
}
)
if cursor.count():
return list(cursor)[0]
Expand All @@ -36,31 +52,33 @@ def search_existing_contacts(contact: Dict[str, Any]) -> Optional[Dict[str, Any]
first_name = contact["first_name"].lower()
last_name = contact["last_name"].lower()

cursor = contacts_service.search({
"query": {
"bool": {
"must": [
{
"match": {
"first_name": {
"query": first_name.lower(),
"operator": "AND",
cursor = contacts_service.search(
{
"query": {
"bool": {
"must": [
{
"match": {
"first_name": {
"query": first_name.lower(),
"operator": "AND",
},
},
},
},
{
"match": {
"last_name": {
"query": last_name.lower(),
"operator": "AND",
{
"match": {
"last_name": {
"query": last_name.lower(),
"operator": "AND",
},
},
},
},
],
],
},
},
},
"sort": ["_score"]
})
"sort": ["_score"],
}
)
if cursor.count():
return list(cursor)[0]

Expand All @@ -86,6 +104,20 @@ def parse(self, tree: Element, provider=None):
# If the item contains the ``sttinstruct:remove`` signal, no need to ingest this one
continue
self.set_extra_fields(item, tree)

fields = (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This functionality already exists in the EventsML parser from superdesk-planning. We also have to check whether the field uses Editor3; otherwise, adding HTML tags to plain text would not be ideal.

As we support this as a core feature, we should update the EventsML and PlanningML parsers to support all possible fields. See https://github.com/superdesk/superdesk-planning/blob/develop/server/planning/feed_parsers/events_ml.py#L252 for example.

We first check whether the field is Editor3 in the content profile, and then attempt to convert it to HTML. So it might be good to implement a utility in superdesk-planning that converts multiple fields to Editor3 on ingest, checking its content profile.

"definition_long",
"definition_short",
"ednote",
"internal_note",
"name",
"slugline",
"registration_details",
"invitation_details",
"accreditation_info",
)
transform_link_from_text(item, fields)

items_to_ingest.append(item)

return items_to_ingest
Expand Down Expand Up @@ -139,17 +171,25 @@ def set_extra_fields(self, item, xml):
if related is not None and related.get("rel", "") == "sttnat:sttEventType":
qcode_parts = related.get("qcode", "").split(":")
qcode = qcode_parts[1] if len(qcode_parts) == 2 else qcode_parts
qcode = f"type{qcode}" # add prefix to avoid conflict with sttdepartment
name = self.getVocabulary("event_type", qcode, related.find(self.qname("name")).text)
item.setdefault("subject", []).append({
"qcode": qcode,
"name": name,
"scheme": "event_type",
})
qcode = (
f"type{qcode}" # add prefix to avoid conflict with sttdepartment
)
name = self.getVocabulary(
"event_type", qcode, related.find(self.qname("name")).text
)
item.setdefault("subject", []).append(
{
"qcode": qcode,
"name": name,
"scheme": "event_type",
}
)
except AttributeError:
pass

self.set_location_details(item, event_details.find(self.qname("location")), location_notes)
self.set_location_details(
item, event_details.find(self.qname("location")), location_notes
)
self.set_contact_details(item, event_details)

def set_location_details(self, item, location_xml, notes):
Expand Down Expand Up @@ -195,13 +235,19 @@ def set_location_details(self, item, location_xml, notes):
elif values[0] == "sttcountry":
location["address"]["extra"]["sttcountry"] = values[1]
try:
location["address"]["country"] = broader.find(self.qname("name")).text
location["address"]["extra"]["iso3166"] = broader.find(self.qname("sameAs")).get("qcode")
location["address"]["country"] = broader.find(
self.qname("name")
).text
location["address"]["extra"]["iso3166"] = broader.find(
self.qname("sameAs")
).get("qcode")
except AttributeError:
continue

try:
address = location_xml.find(self.qname("POIDetails")).find(self.qname("address"))
address = location_xml.find(self.qname("POIDetails")).find(
self.qname("address")
)
except AttributeError:
address = None

Expand All @@ -212,7 +258,9 @@ def set_location_details(self, item, location_xml, notes):
pass

try:
location["address"]["postal_code"] = address.find(self.qname("postalCode")).text
location["address"]["postal_code"] = address.find(
self.qname("postalCode")
).text
except AttributeError:
pass

Expand Down Expand Up @@ -242,10 +290,12 @@ def set_contact_details(self, item: Dict[str, Any], event_details: Element):
if job_title is not None and job_title.text:
contact["job_title"] = job_title.text
if phone is not None and phone.text:
contact["contact_phone"] = [{
"number": phone.text,
"public": True,
}]
contact["contact_phone"] = [
{
"number": phone.text,
"public": True,
}
]
if email is not None and email.text:
contact["contact_email"] = [email.text.lower()]
if web is not None and web.text:
Expand Down
5 changes: 5 additions & 0 deletions server/stt/stt_planning_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .common import (
planning_xml_contains_remove_signal,
unpost_or_spike_event_or_planning,
transform_link_from_text,
)

TIMEZONE = "Europe/Helsinki"
Expand Down Expand Up @@ -41,6 +42,10 @@ def parse(self, tree: Element, provider=None):
) if planning_item else self.set_placeholder_coverage(item, tree)

self.set_extra_fields(item, tree)

fields = ("description_text", "headline", "slugline", "ednote", "abstract")
transform_link_from_text(item, fields)

items_to_ingest.append(item)

return items_to_ingest
Expand Down
75 changes: 75 additions & 0 deletions server/tests/fixtures/planning_ml_link.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="UTF-8"?>
<planningItem xmlns="http://iptc.org/std/nar/2006-10-01/" xmlns:stt="http://www.stt-lehtikuva.fi/NewsML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://iptc.org/std/nar/2006-10-01/ http://www.iptc.org/std/NewsML-G2/2.12/specification/NewsML-G2_2.12-spec-All-Power.xsd http://www.stt-lehtikuva.fi/NewsML http://www.stt-lehtikuva.fi/newsml/schema/STT-Lehtikuva_NewsML_G2.xsd" guid="urn:newsml:stt.fi:20280911:631023" version="1" standard="NewsML-G2" standardversion="2.12" conformance="power" xml:lang="fi">
<catalogRef href="http://www.iptc.org/std/catalog/catalog.IPTC-G2-Standards_18.xml"/>
<catalogRef href="http://www.stt-lehtikuva.fi/newsml/doc/stt-NewsCodesCatalog_1.xml"/>
<itemMeta>
<itemClass qcode="plinat:newscoverage"/>
<provider literal="STT"/>
<versionCreated>2023-10-02T12:23:48+02:00</versionCreated>
<pubStatus qcode="stat:usable"/>
<edNote role="sttdescription:additionalinfo">Testi: Lisätietoja aiheesta erityisesti media-asiakkaille

www.stt.fi STT

stt.fi

https://stt.fi/

STT</edNote>
</itemMeta>
<contentMeta>
<urgency>1</urgency>
<contentCreated>2023-10-02T12:23:48+02:00</contentCreated>
<contentModified>2023-10-02T12:23:48+02:00</contentModified>
<headline>Testi/Luotsi</headline>
<description role="drol:summary">www.stt.fi

stt.fi

https://stt.fi/</description>
<subject qcode="stt-topics:631023">
<note role="sttpresent:1">Toimittaja paikalla</note>
<related rel="sttrel:assigneddate" value="2028-09-11" valuedatatype="Date"/>
</subject>
<subject type="cpnat:department" qcode="sttdepartment:3">
<name>Kotimaa</name>
</subject>
</contentMeta>
<assert qcode="stt-topics:631023">
<newsCoverageStatus qcode="ncostat:int"/>
</assert>
<newsCoverageSet>
<newsCoverage id="ID_EVENT_286323" modified="2023-10-02T12:23:57+02:00">
<planning>
<g2contentType>application/vnd.iptc.g2.newsitem+xml</g2contentType>
<itemClass qcode="ninat:text"/>
<headline>Testi/Luotsi</headline>
<description>www.stt.fi

stt.fi

https://stt.fi/</description>

<subject type="cpnat:event" qcode="urn:newsml:stt.fi:20280911:286323">
<name>Testi/Luotsi</name>
</subject>
</planning>
</newsCoverage>
<newsCoverage id="ID_WORKREQUEST_187845">
<planning>
<g2contentType>application/vnd.iptc.g2.newsitem+xml</g2contentType>
<itemClass qcode="ninat:picture"/>
<scheduled>2023-10-04T00:00:00+02:00</scheduled>
<headline>Testi/Luotsi</headline>
<subject type="ninat:text" qcode="urn:newsml:stt.fi:20280911000000:187845">
<definition role="sttdescription:imagetarget">.</definition>
<stt:workstartdate>2028-09-11T00:00:00+02:00</stt:workstartdate>
</subject>
<genre qcode="sttimage:20">
<name>Kuvaaja paikalla</name>
<definition role="sttdescription:imagetype">-</definition>
</genre>
</planning>
</newsCoverage>
</newsCoverageSet>
</planningItem>
12 changes: 12 additions & 0 deletions server/tests/stt_planning_ml_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,15 @@ def test_update_planning(self):
"placeholder_urn:newsml:stt.fi:20230529:620121",
dest["coverages"][0]["coverage_id"],
)

def test_text_link(self):
    """Parse ``planning_ml_link.xml`` and check that URLs in ``ednote`` and
    ``description_text`` come out wrapped in anchor tags."""
    expected_ednote = """Testi: Lisätietoja aiheesta erityisesti media-asiakkaille\n\n<a href="https://www.stt.fi">www.stt.fi</a> STT\n\n<a href="https://stt.fi">stt.fi</a>\n\n<a href="https://stt.fi/">https://stt.fi/</a>\n\nSTT"""  # noqa
    expected_description = '<a href="https://www.stt.fi">www.stt.fi</a> \n\n<a href="https://stt.fi">stt.fi</a> \n\n<a href="https://stt.fi/">https://stt.fi/</a>'  # noqa

    self.fixture = "planning_ml_link.xml"
    self.parse_source_content()

    # Scheme-less URLs are expected to get an https:// href while the visible
    # link text stays exactly as written in the fixture.
    self.assertEqual(self.item["ednote"], expected_ednote)
    self.assertEqual(self.item["description_text"], expected_description)