PageInfo: Switch to JSoup and improve Bluesky title and text

nla · Nov 27, 2024 · c4f6361 · c4f6361
1 parent 40f1552
commit c4f6361
Show file tree

Hide file tree

Showing 6 changed files with 45 additions and 237 deletions.
diff --git a/ui/pom.xml b/ui/pom.xml
@@ -280,7 +280,7 @@
         <dependency>
             <groupId>org.jsoup</groupId>
             <artifactId>jsoup</artifactId>
-            <version>1.15.3</version>
+            <version>1.18.2</version>
         </dependency>
         <dependency>
             <groupId>com.googlecode.flyway</groupId>

diff --git a/ui/src/pandas/render/HtmlCharset.java b/ui/src/pandas/render/HtmlCharset.java
diff --git a/ui/src/pandas/render/PageInfo.java b/ui/src/pandas/render/PageInfo.java
@@ -1,15 +1,12 @@
 package pandas.render;
 
-import org.attoparser.AbstractMarkupHandler;
-import org.attoparser.ParseException;
-import org.attoparser.util.TextUtil;
-import org.jsoup.parser.Parser;
-
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Pattern;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.StreamParser;
 
 public class PageInfo {
+    private static final int MAX_TITLE_LEN = 1000;
+    private static final int MAX_TEXT_LEN = 3000;
+
     private final int status;
     private final String reason;
     private final String contentType;
@@ -28,6 +25,33 @@ public PageInfo(int status, String reason, String contentType, String charset, S
         this.text = text;
     }
 
+    /**
+     * Builds page info from a parsed HTML document.
+     *
+     * <p>The charset is taken from the document. The title is the cleaned {@code <title>} text,
+     * except when the document location starts with {@code https://bsky.app/profile/}: there a
+     * non-blank {@code og:title} meta tag is used with " on Bluesky" appended. The text is the meta
+     * description (when present) followed by the document text, capped at MAX_TEXT_LEN characters.
+     * The location field is always set to {@code null} by this constructor.
+     *
+     * @param status      HTTP status code of the response
+     * @param reason      reason phrase for the status, may be {@code null}
+     * @param contentType content type string, stored as given
+     * @param document    parsed document; must not be {@code null}
+     */
+    public PageInfo(int status, String reason, String contentType, Document document) {
+        this.status = status;
+        this.reason = reason;
+        this.contentType = contentType;
+        charset = document.charset().name();
+        String title = null;
+        if (document.location().startsWith("https://bsky.app/profile/")) {
+            var ogTitle = document.selectFirst("meta[property=og:title]");
+            // Ignore a blank og:title so the title never degrades to just " on Bluesky".
+            if (ogTitle != null && !ogTitle.attr("content").isBlank()) {
+                title = ogTitle.attr("content") + " on Bluesky";
+            }
+        }
+        if (title == null) title = cleanTitle(document.title());
+        this.title = title;
+        location = null;
+        // The description is carried in the content attribute of <meta name="description"> (or the
+        // Open Graph og:description property); a <meta> element has no inner HTML to read.
+        var description = document.selectFirst("meta[name=description], meta[property=og:description]");
+        String descriptionText = description != null ? description.attr("content").strip() : "";
+        String pageText = descriptionText.isEmpty()
+                ? document.text()
+                : descriptionText + " " + document.text();
+        // Keep the stored text bounded regardless of page size.
+        if (pageText.length() > MAX_TEXT_LEN) {
+            pageText = pageText.substring(0, MAX_TEXT_LEN);
+        }
+        text = pageText;
+    }
+
+    /**
+     * Normalises a page title: collapses every run of whitespace (including a lone tab or
+     * newline) into a single space, trims both ends, and truncates titles longer than
+     * MAX_TITLE_LEN characters, appending "..." to mark the cut.
+     *
+     * @param title raw title text; must not be {@code null}
+     * @return the cleaned title
+     */
+    private static String cleanTitle(String title) {
+        // "\\s+" rather than "\\s\\s+": a single tab or newline must also become a plain space.
+        title = title.replaceAll("\\s+", " ").trim();
+        if (title.length() > MAX_TITLE_LEN) {
+            title = title.substring(0, MAX_TITLE_LEN) + "...";
+        }
+        return title;
+    }
+
     public int weight() {
         int weight = 20;
         if (reason != null) weight += reason.length();
@@ -47,84 +71,6 @@ public String getText() {
         return text;
     }
 
-    static class TitleHandler extends AbstractMarkupHandler {
-        private static final Pattern WHITESPACE = Pattern.compile("\\s+");
-        private static final Set BLOCK_TAGS = Set.of(
-                "html", "head", "body", "frameset", "style", "meta", "link", "title", "frame",
-                "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
-                "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
-                "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
-                "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
-                "svg", "math", "center", "dir", "applet", "marquee", "listing");
-
-        final char[] TITLE = "title".toCharArray();
-        final int maxTitleLen = 1000;
-        final int maxTextLen = 3000;
-        boolean withinTitle = false;
-        String title;
-        final StringBuilder textBuffer = new StringBuilder();
-
-        @Override
-        public void handleCloseElementStart(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException {
-            withinTitle = false;
-            addSpaceIfBlockTag(buffer, nameOffset, nameLen);
-        }
-
-        @Override
-        public void handleOpenElementEnd(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException {
-            withinTitle = TextUtil.equals(false, buffer, nameOffset, nameLen, TITLE, 0, TITLE.length);
-            addSpaceIfBlockTag(buffer, nameOffset, nameLen);
-        }
-
-        private void addSpaceIfBlockTag(char[] buffer, int nameOffset, int nameLen) {
-            String tag = new String(buffer, nameOffset, nameLen).toLowerCase(Locale.ROOT);
-            if (BLOCK_TAGS.contains(tag)) {
-                if (!endsWithWhitespace(textBuffer)) {
-                    textBuffer.append(" ");
-                }
-            }
-        }
-
-        @Override
-        public void handleText(char[] buffer, int offset, int len, int line, int col) throws ParseException {
-            if (withinTitle && title == null) {
-                title = decodeText(buffer, offset, Math.min(len, maxTitleLen));
-            }
-            int remaining = maxTextLen - textBuffer.length();
-            if (remaining > 0) {
-                String text = decodeText(buffer, offset, Math.min(len, remaining));
-                if (startsWithWhitespace(text) && (textBuffer.isEmpty() || endsWithWhitespace(textBuffer))) {
-                    textBuffer.append(text, 1, text.length());
-                } else {
-                    textBuffer.append(text);
-                }
-            }
-
-        }
-
-        private static boolean startsWithWhitespace(CharSequence s) {
-            return !s.isEmpty() && Character.isWhitespace(s.charAt(0));
-        }
-
-        private static boolean endsWithWhitespace(CharSequence s) {
-            return !s.isEmpty() && Character.isWhitespace(s.charAt(s.length() - 1));
-        }
-
-        private String decodeText(char[] buffer, int offset, int len) {
-            String text = Parser.unescapeEntities(new String(buffer, offset, len), false);
-            text = WHITESPACE.matcher(text).replaceAll(" ");
-            return text;
-        }
-
-        public String getCleanTitle() {
-            String title = this.title.replaceAll("\\s\\s+", " ").trim();
-            if (title.length() > maxTitleLen) {
-                title = title.substring(0, maxTitleLen) + "...";
-            }
-            return title;
-        }
-    }
-
     public int getStatus() {
         return status;
     }

diff --git a/ui/src/pandas/render/PageInfoController.java b/ui/src/pandas/render/PageInfoController.java
@@ -6,12 +6,9 @@
 import com.google.common.cache.CacheBuilder;
 import com.google.common.cache.CacheLoader;
 import com.google.common.cache.LoadingCache;
-import org.attoparser.MarkupParser;
-import org.attoparser.ParseException;
-import org.attoparser.config.ParseConfiguration;
-import org.attoparser.discard.DiscardMarkupHandler;
-import org.attoparser.select.BlockSelectorMarkupHandler;
+import org.apache.commons.io.input.BoundedInputStream;
 import org.jetbrains.annotations.NotNull;
+import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.beans.factory.annotation.Autowired;
@@ -26,18 +23,14 @@
 import pandas.collection.Subject;
 import pandas.collection.SubjectRepository;
 
-import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.net.URI;
 import java.net.UnknownHostException;
 import java.net.http.HttpClient;
 import java.net.http.HttpRequest;
 import java.net.http.HttpResponse;
 import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.nio.charset.UnsupportedCharsetException;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
@@ -155,34 +148,12 @@ private PageInfo fetchPageInfo(String url) throws IOException, InterruptedExcept
             String text = null;
             HttpStatus status = HttpStatus.resolve(response.statusCode());
             String reason = status == null ? null : status.getReasonPhrase();
-            if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) {
-                PageInfo.TitleHandler handler = new PageInfo.TitleHandler();
-
-                InputStream stream = body;
-                // if there was no charset in the Content-Type header, probe for meta tags near the top of the file
-                if (charset == null) {
-                    BufferedInputStream bis = new BufferedInputStream(stream);
-                    String charsetName = HtmlCharset.detect(bis);
-                    if (charsetName != null) {
-                        try {
-                            charset = Charset.forName(charsetName);
-                        } catch (UnsupportedCharsetException e) {
-                            log.warn("Unsupported charset {}, defaulting to iso-8859-1", charsetName);
-                            charset = StandardCharsets.ISO_8859_1;
-                        }
-                    }
-                    stream = bis;
-                }
 
-                try {
-                    new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(new InputStreamReader(stream, charset),
-                            new BlockSelectorMarkupHandler(new DiscardMarkupHandler(), handler,
-                                    new String[]{"script", "noscript", "style"}));
-                } catch (ParseException e) {
-                    log.warn("Exception parsing " + url, e);
-                }
-                title = handler.getCleanTitle();
-                text = handler.textBuffer.toString();
+            if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) {
+                String charsetName = mediaType.getParameter("charset");
+                var boundedStream = new BoundedInputStream(body, 10 * 1024 * 1024);
+                var document = Jsoup.parse(boundedStream, charsetName, url);
+                return new PageInfo(response.statusCode(), reason, contentType, document);
             }
             String location = null;
             if (response.previousResponse().isPresent()) {

diff --git a/ui/test/pandas/render/HtmlCharsetTest.java b/ui/test/pandas/render/HtmlCharsetTest.java
diff --git a/ui/test/pandas/render/PageInfoTest.java b/ui/test/pandas/render/PageInfoTest.java
@@ -3,6 +3,7 @@
 import org.attoparser.MarkupParser;
 import org.attoparser.ParseException;
 import org.attoparser.config.ParseConfiguration;
+import org.jsoup.Jsoup;
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
@@ -11,9 +12,10 @@ public class PageInfoTest {
 
     @Test
     public void testTitleHandler() throws ParseException {
-        PageInfo.TitleHandler handler = new PageInfo.TitleHandler();
-        new MarkupParser(ParseConfiguration.htmlConfiguration()).parse("<html><head><title>   \t\ttest\n123\t\t\n  456</title></head><body><h1>h1", handler);
-        assertEquals("test 123 456", handler.getCleanTitle());
+        // The <title> below has leading spaces plus tabs and newlines between the words; the
+        // assertion expects getTitle() to return the words separated by single spaces.
+        String html = "<html><head><title>   \t\ttest\n123\t\t\n  456</title></head><body><h1>h1";
+
+        var pageInfo = new PageInfo(200, "OK", "text/html", Jsoup.parse(html));
+        assertEquals("test 123 456", pageInfo.getTitle());
     }