Skip to content

Commit

Permalink
LinkDetailsExtractor adjustments (mastodon#31357)
Browse files Browse the repository at this point in the history
  • Loading branch information
c960657 authored Nov 22, 2024
1 parent 68c7782 commit 0518613
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 9 deletions.
18 changes: 12 additions & 6 deletions app/lib/link_details_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def height
end

# Returns the page title: the first truthy value among the structured-data
# headline, the og:title tag and the <title> element, passed through
# html_entities.decode and then stripped (&.strip is skipped when decode
# returns nil).
#
# NOTE: this block is rendered from a commit diff whose +/- markers were lost.
# The first statement is the line the commit removes, the second is the line
# it adds.
def title
# Removed line: collected every <title> anywhere in the document and took the first.
html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)&.strip
# Added line: 'title' is a relative path, so only a <title> that is a direct child of head is considered.
html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || head.at_xpath('title')&.content)&.strip
end

def description
Expand Down Expand Up @@ -205,11 +205,11 @@ def embed_url
end

# Returns the page language after filtering through valid_locale_or_nil,
# preferring the structured-data language, then the og:locale tag, then the
# lang attribute of the document element.
#
# NOTE: this block is rendered from a commit diff whose +/- markers were lost.
# The first statement is the line the commit removes, the second is the line
# it adds.
def language
# Removed line: selected <html> through an XPath search and picked its lang attribute.
valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang'))
# Added line: reads lang straight from the root element.
# NOTE(review): assumes document.root is non-nil — TODO confirm for empty or unparseable bodies.
valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.root.attr('lang'))
end

# Returns the site icon URL after filtering through valid_url_or_nil,
# preferring the structured-data publisher icon, then an apple-touch-icon
# link, then a generic icon link.
#
# NOTE: this block is rendered from a commit diff whose +/- markers were lost.
# The first statement is the line the commit removes, the second is the line
# it adds.
def icon
# Removed line: asked link_tag for the two-word value 'shortcut icon'.
# link_rel_include splits rel into tokens on WHITE_SPACE, so a value containing
# a space presumably never equals a single token — verify against WHITE_SPACE.
valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
# Added line: asks for the single token 'icon', which rel="shortcut icon" also contains.
valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('icon'))
end

private
Expand Down Expand Up @@ -237,18 +237,20 @@ def valid_url_or_nil(str, same_origin_only: false)
end

# Returns the href attribute of the first <link> element whose rel attribute
# satisfies the custom XPath function nokogiri:link_rel_include for `name`
# (the function is supplied by NokogiriHandler).
#
# `name` is interpolated into the XPath expression inside single quotes; the
# callers visible here pass literal strings only.
#
# NOTE: this block is rendered from a commit diff whose +/- markers were lost.
# The first statement is the line the commit removes, the second is the line
# it adds.
def link_tag(name)
# Removed line: searched from the document and picked href from the result set.
document.xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler).pick('href')
# Added line: takes the first match only; &. yields nil when nothing matches.
# NOTE(review): a path beginning with // is absolute in XPath, so this still
# searches the whole document even though it is called on head; './/link'
# would restrict it to head — confirm which is intended.
# NOTE(review): assumes head is non-nil — TODO confirm for documents without <head>.
head.at_xpath("//link[nokogiri:link_rel_include(@rel, '#{name}')]", NokogiriHandler)&.attr('href')
end

# Returns the content attribute of a <meta> element whose property or name
# attribute matches `name`.
#
# `name` is interpolated into the XPath expression; the callers visible here
# pass literal strings only.
#
# NOTE: this block is rendered from a commit diff whose +/- markers were lost.
# The first statement is the line the commit removes, the second is the line
# it adds.
def opengraph_tag(name)
# Removed line: exact (case-sensitive) XPath equality on @property / @name.
document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content')
# Added line: compares through the custom function nokogiri:casecmp from
# NokogiriHandler, so the match ignores case; &. yields nil when nothing matches.
# NOTE(review): a path beginning with // is absolute in XPath, so this still
# searches the whole document even though it is called on head; './/meta'
# would restrict it to head — confirm which is intended.
head.at_xpath("//meta[nokogiri:casecmp(@property, '#{name}') or nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
end

# Returns the content attribute of a <meta> element whose name attribute
# matches `name`.
#
# `name` is interpolated into the XPath expression, so it must not contain
# quote characters.
#
# NOTE: this block is rendered from a commit diff whose +/- markers were lost.
# The first statement is the line the commit removes, the second is the line
# it adds.
def meta_tag(name)
# Removed line: exact (case-sensitive) XPath equality on @name.
document.xpath("//meta[@name=\"#{name}\"]").pick('content')
# Added line: compares through the custom function nokogiri:casecmp from
# NokogiriHandler, so the match ignores case; &. yields nil when nothing matches.
# NOTE(review): a path beginning with // is absolute in XPath, so this still
# searches the whole document even though it is called on head; './/meta'
# would restrict it to head — confirm which is intended.
head.at_xpath("//meta[nokogiri:casecmp(@name, '#{name}')]", NokogiriHandler)&.attr('content')
end

def structured_data
return @structured_data if defined?(@structured_data)

# Some publications have more than one JSON-LD definition on the page,
# and some of those definitions aren't valid JSON either, so we have
# to loop through here until we find something that is the right type
Expand All @@ -273,6 +275,10 @@ def document
@document ||= detect_encoding_and_parse_document
end

# Returns the node found at /html/head in the parsed document, caching it in
# @head. A nil (or false) lookup result is not cached, so the next call
# performs the lookup again.
def head
  return @head if @head

  @head = document.at_xpath('/html/head')
end

def detect_encoding_and_parse_document
html = nil
encoding = nil
Expand Down
4 changes: 4 additions & 0 deletions app/lib/nokogiri_handler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,9 @@ class << self
# Reports whether `token` appears, ignoring case, among the tokens of
# `token_list`. The list is converted with to_s, downcased and split on
# WHITE_SPACE; the comparison is against whole tokens, never substrings.
def link_rel_include(token_list, token)
  rel_tokens = token_list.to_s.downcase.split(WHITE_SPACE)
  wanted = token.downcase
  rel_tokens.any? { |rel_token| rel_token == wanted }
end

# Compares the string forms (to_s) of both arguments with String#casecmp?,
# i.e. ignoring case, and returns that method's result unchanged.
def casecmp(str1, str2)
  left = str1.to_s
  right = str2.to_s
  left.casecmp?(right)
end
end
end
8 changes: 5 additions & 3 deletions spec/lib/link_details_extractor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@
<html lang="en">
<head>
<title>Man bites dog</title>
<meta name="description" content="A dog&#39;s tale">
<meta name="descripTION" content="A dog&#39;s tale">
<link rel="pretty IcoN" href="/favicon.ico">
</head>
</html>
HTML
Expand All @@ -59,7 +60,8 @@
.to have_attributes(
title: eq('Man bites dog'),
description: eq("A dog's tale"),
language: eq('en')
language: eq('en'),
icon: eq('https://example.com/favicon.ico')
)
end
end
Expand Down Expand Up @@ -256,7 +258,7 @@
<head>
<meta property="og:url" content="https://example.com/dog.html">
<meta property="og:title" content="Man bites dog">
<meta property="og:description" content="A dog's tale">
<meta property="OG:description" content="A dog's tale">
<meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
<meta property="og:author" content="Charlie Brown">
<meta property="og:locale" content="en">
Expand Down

0 comments on commit 0518613

Please sign in to comment.