diff --git a/ui/pom.xml b/ui/pom.xml
index 148e04e..4bb7b48 100644
--- a/ui/pom.xml
+++ b/ui/pom.xml
@@ -280,7 +280,7 @@
org.jsoup
jsoup
- 1.15.3
+ 1.18.2
com.googlecode.flyway
diff --git a/ui/src/pandas/render/HtmlCharset.java b/ui/src/pandas/render/HtmlCharset.java
deleted file mode 100644
index bdefcdd..0000000
--- a/ui/src/pandas/render/HtmlCharset.java
+++ /dev/null
@@ -1,86 +0,0 @@
-package pandas.render;
-
-import org.apache.commons.io.input.BoundedInputStream;
-import org.attoparser.AbstractMarkupHandler;
-import org.attoparser.MarkupParser;
-import org.attoparser.ParseException;
-import org.attoparser.config.ParseConfiguration;
-import org.attoparser.util.TextUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.springframework.http.InvalidMediaTypeException;
-import org.springframework.http.MediaType;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-
-public class HtmlCharset {
- private static final Logger log = LoggerFactory.getLogger(HtmlCharset.class);
-
- /**
- * Probes the start of a stream for HTML meta charset tag. Uses mark() and reset() so as not to consume the actual
- * stream content. Returns null if no meta charset tag was found or if we encountered an unrecoverable HTML parse
- * error.
- */
- public static String detect(BufferedInputStream stream) throws IOException {
- int limit = 4096;
- stream.mark(limit);
- HtmlCharset.Handler handler = new HtmlCharset.Handler();
- BoundedInputStream bounded = new BoundedInputStream(stream, limit);
- bounded.setPropagateClose(false);
- try {
- new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(new InputStreamReader(bounded, StandardCharsets.ISO_8859_1), handler);
- } catch (ParseException e) {
- log.warn("charset detection error", e);
- // ignore
- }
- stream.reset();
- return handler.charset;
- }
-
- static class Handler extends AbstractMarkupHandler {
- private static final char[] META = "meta".toCharArray();
- private static final char[] CHARSET = "charset".toCharArray();
- private static final char[] CONTENT = "content".toCharArray();
- private static final char[] HTTP_EQUIV = "http-equiv".toCharArray();
- private static final char[] CONTENT_TYPE = "content-type".toCharArray();
-
- boolean withinMeta = false;
- boolean httpEquivContentType = false;
- String charset;
- String content;
-
- @Override
- public void handleStandaloneElementStart(char[] buffer, int nameOffset, int nameLen, boolean minimized, int line, int col) throws ParseException {
- withinMeta = TextUtil.equals(false, buffer, nameOffset, nameLen, META, 0, META.length);
- content = null;
- httpEquivContentType = false;
- }
-
- @Override
- public void handleStandaloneElementEnd(char[] buffer, int nameOffset, int nameLen, boolean minimized, int line, int col) throws ParseException {
- if (withinMeta && httpEquivContentType) {
- try {
- charset = MediaType.parseMediaType(content).getParameter("charset");
- } catch (InvalidMediaTypeException e) {
- charset = null;
- }
- }
- }
-
- @Override
- public void handleAttribute(char[] buffer, int nameOffset, int nameLen, int nameLine, int nameCol, int operatorOffset, int operatorLen, int operatorLine, int operatorCol, int valueContentOffset, int valueContentLen, int valueOuterOffset, int valueOuterLen, int valueLine, int valueCol) throws ParseException {
- if (withinMeta && TextUtil.equals(false, buffer, nameOffset, nameLen, CHARSET, 0, CHARSET.length)) {
- charset = new String(buffer, valueContentOffset, valueContentLen);
- } else if (withinMeta && TextUtil.equals(false, buffer, nameOffset, nameLen, CONTENT, 0, CONTENT.length)){
- content = new String(buffer, valueContentOffset, valueContentLen);
- } else if (withinMeta && TextUtil.equals(false, buffer, nameOffset, nameLen, HTTP_EQUIV, 0, HTTP_EQUIV.length)
- && TextUtil.equals(false, buffer, valueContentOffset, valueContentLen, CONTENT_TYPE, 0, CONTENT_TYPE.length)) {
- httpEquivContentType = true;
- }
- }
- }
-
-}
diff --git a/ui/src/pandas/render/PageInfo.java b/ui/src/pandas/render/PageInfo.java
index 158f7f8..7a8b3da 100644
--- a/ui/src/pandas/render/PageInfo.java
+++ b/ui/src/pandas/render/PageInfo.java
@@ -1,15 +1,12 @@
package pandas.render;
-import org.attoparser.AbstractMarkupHandler;
-import org.attoparser.ParseException;
-import org.attoparser.util.TextUtil;
-import org.jsoup.parser.Parser;
-
-import java.util.Locale;
-import java.util.Set;
-import java.util.regex.Pattern;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.StreamParser;
public class PageInfo {
+ private static final int MAX_TITLE_LEN = 1000;
+ private static final int MAX_TEXT_LEN = 3000;
+
private final int status;
private final String reason;
private final String contentType;
@@ -28,6 +25,33 @@ public PageInfo(int status, String reason, String contentType, String charset, S
this.text = text;
}
+    /**
+     * Builds page info from an already-parsed jsoup document.
+     *
+     * <p>The charset is taken from the document. The title is the document title, collapsed and capped by
+     * {@code cleanTitle}, except for Bluesky profile pages (location starting with
+     * {@code https://bsky.app/profile/}) that carry an {@code og:title} meta tag, where that tag's content plus
+     * {@code " on Bluesky"} is used instead. The text is the meta description (if any) followed by the document
+     * text, capped at {@code MAX_TEXT_LEN} characters. The redirect location is always {@code null} for page info
+     * built this way.
+     *
+     * @param status      HTTP status code of the response
+     * @param reason      HTTP reason phrase, may be null
+     * @param contentType value of the Content-Type header
+     * @param document    parsed HTML document
+     */
+    public PageInfo(int status, String reason, String contentType, Document document) {
+        this.status = status;
+        this.reason = reason;
+        this.contentType = contentType;
+        this.charset = document.charset().name();
+        String title = null;
+        if (document.location().startsWith("https://bsky.app/profile/")) {
+            var ogTitle = document.selectFirst("meta[property=og:title]");
+            if (ogTitle != null) {
+                title = ogTitle.attr("content") + " on Bluesky";
+            }
+        }
+        if (title == null) {
+            title = cleanTitle(document.title());
+        }
+        this.title = title;
+        this.location = null;
+        // A <meta> element has no child nodes, so html() is always empty; the description lives in the
+        // "content" attribute. Match the standard name=description form as well as property=description.
+        var description = document.selectFirst("meta[name=description], meta[property=description]");
+        String descriptionText = description != null ? description.attr("content").trim() : "";
+        String bodyText = document.text();
+        // Keep a space between the two parts so the last word of the description does not fuse with the body.
+        String combined = descriptionText.isEmpty() ? bodyText : descriptionText + " " + bodyText;
+        // Cap the stored text, as the previous attoparser-based extraction did.
+        if (combined.length() > MAX_TEXT_LEN) {
+            combined = combined.substring(0, MAX_TEXT_LEN);
+        }
+        this.text = combined;
+    }
+
+    /**
+     * Tidies a raw page title: every run of two or more whitespace characters becomes a single space, leading and
+     * trailing whitespace is trimmed, and anything longer than {@code MAX_TITLE_LEN} characters is cut to that
+     * length with {@code "..."} appended.
+     */
+    private static String cleanTitle(String title) {
+        String collapsed = title.replaceAll("\\s\\s+", " ").trim();
+        return collapsed.length() > MAX_TITLE_LEN
+                ? collapsed.substring(0, MAX_TITLE_LEN) + "..."
+                : collapsed;
+    }
+
public int weight() {
int weight = 20;
if (reason != null) weight += reason.length();
@@ -47,84 +71,6 @@ public String getText() {
return text;
}
- static class TitleHandler extends AbstractMarkupHandler {
- private static final Pattern WHITESPACE = Pattern.compile("\\s+");
- private static final Set BLOCK_TAGS = Set.of(
- "html", "head", "body", "frameset", "style", "meta", "link", "title", "frame",
- "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
- "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
- "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
- "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
- "svg", "math", "center", "dir", "applet", "marquee", "listing");
-
- final char[] TITLE = "title".toCharArray();
- final int maxTitleLen = 1000;
- final int maxTextLen = 3000;
- boolean withinTitle = false;
- String title;
- final StringBuilder textBuffer = new StringBuilder();
-
- @Override
- public void handleCloseElementStart(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException {
- withinTitle = false;
- addSpaceIfBlockTag(buffer, nameOffset, nameLen);
- }
-
- @Override
- public void handleOpenElementEnd(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException {
- withinTitle = TextUtil.equals(false, buffer, nameOffset, nameLen, TITLE, 0, TITLE.length);
- addSpaceIfBlockTag(buffer, nameOffset, nameLen);
- }
-
- private void addSpaceIfBlockTag(char[] buffer, int nameOffset, int nameLen) {
- String tag = new String(buffer, nameOffset, nameLen).toLowerCase(Locale.ROOT);
- if (BLOCK_TAGS.contains(tag)) {
- if (!endsWithWhitespace(textBuffer)) {
- textBuffer.append(" ");
- }
- }
- }
-
- @Override
- public void handleText(char[] buffer, int offset, int len, int line, int col) throws ParseException {
- if (withinTitle && title == null) {
- title = decodeText(buffer, offset, Math.min(len, maxTitleLen));
- }
- int remaining = maxTextLen - textBuffer.length();
- if (remaining > 0) {
- String text = decodeText(buffer, offset, Math.min(len, remaining));
- if (startsWithWhitespace(text) && (textBuffer.isEmpty() || endsWithWhitespace(textBuffer))) {
- textBuffer.append(text, 1, text.length());
- } else {
- textBuffer.append(text);
- }
- }
-
- }
-
- private static boolean startsWithWhitespace(CharSequence s) {
- return !s.isEmpty() && Character.isWhitespace(s.charAt(0));
- }
-
- private static boolean endsWithWhitespace(CharSequence s) {
- return !s.isEmpty() && Character.isWhitespace(s.charAt(s.length() - 1));
- }
-
- private String decodeText(char[] buffer, int offset, int len) {
- String text = Parser.unescapeEntities(new String(buffer, offset, len), false);
- text = WHITESPACE.matcher(text).replaceAll(" ");
- return text;
- }
-
- public String getCleanTitle() {
- String title = this.title.replaceAll("\\s\\s+", " ").trim();
- if (title.length() > maxTitleLen) {
- title = title.substring(0, maxTitleLen) + "...";
- }
- return title;
- }
- }
-
public int getStatus() {
return status;
}
diff --git a/ui/src/pandas/render/PageInfoController.java b/ui/src/pandas/render/PageInfoController.java
index 9882089..e6a7d8d 100644
--- a/ui/src/pandas/render/PageInfoController.java
+++ b/ui/src/pandas/render/PageInfoController.java
@@ -6,12 +6,9 @@
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
-import org.attoparser.MarkupParser;
-import org.attoparser.ParseException;
-import org.attoparser.config.ParseConfiguration;
-import org.attoparser.discard.DiscardMarkupHandler;
-import org.attoparser.select.BlockSelectorMarkupHandler;
+import org.apache.commons.io.input.BoundedInputStream;
import org.jetbrains.annotations.NotNull;
+import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
@@ -26,18 +23,14 @@
import pandas.collection.Subject;
import pandas.collection.SubjectRepository;
-import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.io.InputStreamReader;
import java.net.URI;
import java.net.UnknownHostException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.Charset;
-import java.nio.charset.StandardCharsets;
-import java.nio.charset.UnsupportedCharsetException;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@@ -155,34 +148,12 @@ private PageInfo fetchPageInfo(String url) throws IOException, InterruptedExcept
String text = null;
HttpStatus status = HttpStatus.resolve(response.statusCode());
String reason = status == null ? null : status.getReasonPhrase();
- if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) {
- PageInfo.TitleHandler handler = new PageInfo.TitleHandler();
-
- InputStream stream = body;
- // if there was no charset in the Content-Type header, probe for meta tags near the top of the file
- if (charset == null) {
- BufferedInputStream bis = new BufferedInputStream(stream);
- String charsetName = HtmlCharset.detect(bis);
- if (charsetName != null) {
- try {
- charset = Charset.forName(charsetName);
- } catch (UnsupportedCharsetException e) {
- log.warn("Unsupported charset {}, defaulting to iso-8859-1", charsetName);
- charset = StandardCharsets.ISO_8859_1;
- }
- }
- stream = bis;
- }
- try {
- new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(new InputStreamReader(stream, charset),
- new BlockSelectorMarkupHandler(new DiscardMarkupHandler(), handler,
- new String[]{"script", "noscript", "style"}));
- } catch (ParseException e) {
- log.warn("Exception parsing " + url, e);
- }
- title = handler.getCleanTitle();
- text = handler.textBuffer.toString();
+ if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) {
+ String charsetName = mediaType.getParameter("charset");
+ var boundedStream = new BoundedInputStream(body, 10 * 1024 * 1024);
+ var document = Jsoup.parse(boundedStream, charsetName, url);
+ return new PageInfo(response.statusCode(), reason, contentType, document);
}
String location = null;
if (response.previousResponse().isPresent()) {
diff --git a/ui/test/pandas/render/HtmlCharsetTest.java b/ui/test/pandas/render/HtmlCharsetTest.java
deleted file mode 100644
index 3353a24..0000000
--- a/ui/test/pandas/render/HtmlCharsetTest.java
+++ /dev/null
@@ -1,25 +0,0 @@
-package pandas.render;
-
-import org.attoparser.ParseException;
-import org.junit.Test;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-
-import static org.junit.Assert.assertEquals;
-
-public class HtmlCharsetTest {
- private static String detect(String html) throws ParseException, IOException {
- return HtmlCharset.detect(new BufferedInputStream(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))));
- }
-
- @Test
- public void testDetect() throws ParseException, IOException {
- assertEquals("utf-8", detect(""));
- assertEquals("utf-8", detect(""));
- assertEquals(null, detect(""));
- assertEquals(null, detect("\1\0garbage<<<<<<<<<<<<<<<<"));
- }
-}
\ No newline at end of file
diff --git a/ui/test/pandas/render/PageInfoTest.java b/ui/test/pandas/render/PageInfoTest.java
index d1b52c5..16921b3 100644
--- a/ui/test/pandas/render/PageInfoTest.java
+++ b/ui/test/pandas/render/PageInfoTest.java
@@ -3,6 +3,7 @@
import org.attoparser.MarkupParser;
import org.attoparser.ParseException;
import org.attoparser.config.ParseConfiguration;
+import org.jsoup.Jsoup;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
@@ -11,9 +12,10 @@ public class PageInfoTest {
@Test
public void testTitleHandler() throws ParseException {
- PageInfo.TitleHandler handler = new PageInfo.TitleHandler();
- new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(" \t\ttest\n123\t\t\n 456h1", handler);
- assertEquals("test 123 456", handler.getCleanTitle());
+ String html = " \t\ttest\n123\t\t\n 456h1";
+
+ var pageInfo = new PageInfo(200, "OK", "text/html", Jsoup.parse(html)); // title now comes from PageInfo's Document constructor rather than a TitleHandler
+ assertEquals("test 123 456", pageInfo.getTitle()); // tabs/newlines in the input title are expected back as single spaces, trimmed
}