diff --git a/ui/pom.xml b/ui/pom.xml index 148e04e..4bb7b48 100644 --- a/ui/pom.xml +++ b/ui/pom.xml @@ -280,7 +280,7 @@ org.jsoup jsoup - 1.15.3 + 1.18.2 com.googlecode.flyway diff --git a/ui/src/pandas/render/HtmlCharset.java b/ui/src/pandas/render/HtmlCharset.java deleted file mode 100644 index bdefcdd..0000000 --- a/ui/src/pandas/render/HtmlCharset.java +++ /dev/null @@ -1,86 +0,0 @@ -package pandas.render; - -import org.apache.commons.io.input.BoundedInputStream; -import org.attoparser.AbstractMarkupHandler; -import org.attoparser.MarkupParser; -import org.attoparser.ParseException; -import org.attoparser.config.ParseConfiguration; -import org.attoparser.util.TextUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.http.InvalidMediaTypeException; -import org.springframework.http.MediaType; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; - -public class HtmlCharset { - private static final Logger log = LoggerFactory.getLogger(HtmlCharset.class); - - /** - * Probes the start of a stream for HTML meta charset tag. Uses mark() and reset() so as not to consume the actual - * stream content. Returns null if no meta charset tag was found or if we encountered an unrecoverable HTML parse - * error. - */ - public static String detect(BufferedInputStream stream) throws IOException { - int limit = 4096; - stream.mark(limit); - HtmlCharset.Handler handler = new HtmlCharset.Handler(); - BoundedInputStream bounded = new BoundedInputStream(stream, limit); - bounded.setPropagateClose(false); - try { - new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(new InputStreamReader(bounded, StandardCharsets.ISO_8859_1), handler); - } catch (ParseException e) { - log.warn("charset detection error", e); - // ignore - } - stream.reset(); - return handler.charset; - } - - static class Handler extends AbstractMarkupHandler { - private static final char[] META = "meta".toCharArray(); - private static final char[] CHARSET = "charset".toCharArray(); - private static final char[] CONTENT = "content".toCharArray(); - private static final char[] HTTP_EQUIV = "http-equiv".toCharArray(); - private static final char[] CONTENT_TYPE = "content-type".toCharArray(); - - boolean withinMeta = false; - boolean httpEquivContentType = false; - String charset; - String content; - - @Override - public void handleStandaloneElementStart(char[] buffer, int nameOffset, int nameLen, boolean minimized, int line, int col) throws ParseException { - withinMeta = TextUtil.equals(false, buffer, nameOffset, nameLen, META, 0, META.length); - content = null; - httpEquivContentType = false; - } - - @Override - public void handleStandaloneElementEnd(char[] buffer, int nameOffset, int nameLen, boolean minimized, int line, int col) throws ParseException { - if (withinMeta && httpEquivContentType) { - try { - charset = MediaType.parseMediaType(content).getParameter("charset"); - } catch (InvalidMediaTypeException e) { - charset = null; - } - } - } - - @Override - public void handleAttribute(char[] buffer, int nameOffset, int nameLen, int nameLine, int nameCol, int operatorOffset, int operatorLen, int operatorLine, int operatorCol, int valueContentOffset, int valueContentLen, int valueOuterOffset, int valueOuterLen, int valueLine, int valueCol) throws ParseException { - if (withinMeta && TextUtil.equals(false, buffer, nameOffset, nameLen, CHARSET, 0, CHARSET.length)) { - charset = new String(buffer, valueContentOffset, valueContentLen); - } else if (withinMeta && TextUtil.equals(false, buffer, nameOffset, nameLen, CONTENT, 0, CONTENT.length)){ - content = new String(buffer, valueContentOffset, valueContentLen); - } else if (withinMeta && TextUtil.equals(false, buffer, nameOffset, nameLen, HTTP_EQUIV, 0, HTTP_EQUIV.length) - && TextUtil.equals(false, buffer, valueContentOffset, valueContentLen, CONTENT_TYPE, 0, CONTENT_TYPE.length)) { - httpEquivContentType = true; - } - } - } - -} diff --git a/ui/src/pandas/render/PageInfo.java b/ui/src/pandas/render/PageInfo.java index 158f7f8..7a8b3da 100644 --- a/ui/src/pandas/render/PageInfo.java +++ b/ui/src/pandas/render/PageInfo.java @@ -1,15 +1,12 @@ package pandas.render; -import org.attoparser.AbstractMarkupHandler; -import org.attoparser.ParseException; -import org.attoparser.util.TextUtil; -import org.jsoup.parser.Parser; - -import java.util.Locale; -import java.util.Set; -import java.util.regex.Pattern; +import org.jsoup.nodes.Document; +import org.jsoup.parser.StreamParser; public class PageInfo { + private static final int MAX_TITLE_LEN = 1000; + private static final int MAX_TEXT_LEN = 3000; + private final int status; private final String reason; private final String contentType; @@ -28,6 +25,33 @@ public PageInfo(int status, String reason, String contentType, String charset, S this.text = text; } + public PageInfo(int status, String reason, String contentType, Document document) { + this.status = status; + this.reason = reason; + this.contentType = contentType; + charset = document.charset().name(); + String title = null; + if (document.location().startsWith("https://bsky.app/profile/")) { + var ogTitle = document.selectFirst("meta[property=og:title]"); + if (ogTitle != null) { + title = ogTitle.attr("content") + " on Bluesky"; + } + } + if (title == null) title = cleanTitle(document.title()); + this.title = title; + location = null; + var description = document.selectFirst("meta[property=description]"); + text = (description != null ? description.html() : "") + document.text(); + } + + private static String cleanTitle(String title) { + title = title.replaceAll("\\s\\s+", " ").trim(); + if (title.length() > MAX_TITLE_LEN) { + title = title.substring(0, MAX_TITLE_LEN) + "..."; + } + return title; + } + public int weight() { int weight = 20; if (reason != null) weight += reason.length(); @@ -47,84 +71,6 @@ public String getText() { return text; } - static class TitleHandler extends AbstractMarkupHandler { - private static final Pattern WHITESPACE = Pattern.compile("\\s+"); - private static final Set BLOCK_TAGS = Set.of( - "html", "head", "body", "frameset", "style", "meta", "link", "title", "frame", - "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", - "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", - "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", - "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main", - "svg", "math", "center", "dir", "applet", "marquee", "listing"); - - final char[] TITLE = "title".toCharArray(); - final int maxTitleLen = 1000; - final int maxTextLen = 3000; - boolean withinTitle = false; - String title; - final StringBuilder textBuffer = new StringBuilder(); - - @Override - public void handleCloseElementStart(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException { - withinTitle = false; - addSpaceIfBlockTag(buffer, nameOffset, nameLen); - } - - @Override - public void handleOpenElementEnd(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException { - withinTitle = TextUtil.equals(false, buffer, nameOffset, nameLen, TITLE, 0, TITLE.length); - addSpaceIfBlockTag(buffer, nameOffset, nameLen); - } - - private void addSpaceIfBlockTag(char[] buffer, int nameOffset, int nameLen) { - String tag = new String(buffer, nameOffset, nameLen).toLowerCase(Locale.ROOT); - if (BLOCK_TAGS.contains(tag)) { - if (!endsWithWhitespace(textBuffer)) { - textBuffer.append(" "); - } - } - } - - @Override - public void handleText(char[] buffer, int offset, int len, int line, int col) throws ParseException { - if (withinTitle && title == null) { - title = decodeText(buffer, offset, Math.min(len, maxTitleLen)); - } - int remaining = maxTextLen - textBuffer.length(); - if (remaining > 0) { - String text = decodeText(buffer, offset, Math.min(len, remaining)); - if (startsWithWhitespace(text) && (textBuffer.isEmpty() || endsWithWhitespace(textBuffer))) { - textBuffer.append(text, 1, text.length()); - } else { - textBuffer.append(text); - } - } - - } - - private static boolean startsWithWhitespace(CharSequence s) { - return !s.isEmpty() && Character.isWhitespace(s.charAt(0)); - } - - private static boolean endsWithWhitespace(CharSequence s) { - return !s.isEmpty() && Character.isWhitespace(s.charAt(s.length() - 1)); - } - - private String decodeText(char[] buffer, int offset, int len) { - String text = Parser.unescapeEntities(new String(buffer, offset, len), false); - text = WHITESPACE.matcher(text).replaceAll(" "); - return text; - } - - public String getCleanTitle() { - String title = this.title.replaceAll("\\s\\s+", " ").trim(); - if (title.length() > maxTitleLen) { - title = title.substring(0, maxTitleLen) + "..."; - } - return title; - } - } - public int getStatus() { return status; } diff --git a/ui/src/pandas/render/PageInfoController.java b/ui/src/pandas/render/PageInfoController.java index 9882089..e6a7d8d 100644 --- a/ui/src/pandas/render/PageInfoController.java +++ b/ui/src/pandas/render/PageInfoController.java @@ -6,12 +6,9 @@ import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; -import org.attoparser.MarkupParser; -import org.attoparser.ParseException; -import org.attoparser.config.ParseConfiguration; -import org.attoparser.discard.DiscardMarkupHandler; -import org.attoparser.select.BlockSelectorMarkupHandler; +import org.apache.commons.io.input.BoundedInputStream; import org.jetbrains.annotations.NotNull; +import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; @@ -26,18 +23,14 @@ import pandas.collection.Subject; import pandas.collection.SubjectRepository; -import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.net.URI; import java.net.UnknownHostException; import java.net.http.HttpClient; import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.nio.charset.UnsupportedCharsetException; import java.util.*; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -155,34 +148,12 @@ private PageInfo fetchPageInfo(String url) throws IOException, InterruptedExcept String text = null; HttpStatus status = HttpStatus.resolve(response.statusCode()); String reason = status == null ? null : status.getReasonPhrase(); - if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) { - PageInfo.TitleHandler handler = new PageInfo.TitleHandler(); - - InputStream stream = body; - // if there was no charset in the Content-Type header, probe for meta tags near the top of the file - if (charset == null) { - BufferedInputStream bis = new BufferedInputStream(stream); - String charsetName = HtmlCharset.detect(bis); - if (charsetName != null) { - try { - charset = Charset.forName(charsetName); - } catch (UnsupportedCharsetException e) { - log.warn("Unsupported charset {}, defaulting to iso-8859-1", charsetName); - charset = StandardCharsets.ISO_8859_1; - } - } - stream = bis; - } - try { - new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(new InputStreamReader(stream, charset), - new BlockSelectorMarkupHandler(new DiscardMarkupHandler(), handler, - new String[]{"script", "noscript", "style"})); - } catch (ParseException e) { - log.warn("Exception parsing " + url, e); - } - title = handler.getCleanTitle(); - text = handler.textBuffer.toString(); + if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) { + String charsetName = mediaType.getParameter("charset"); + var boundedStream = new BoundedInputStream(body, 10 * 1024 * 1024); + var document = Jsoup.parse(boundedStream, charsetName, url); + return new PageInfo(response.statusCode(), reason, contentType, document); } String location = null; if (response.previousResponse().isPresent()) { diff --git a/ui/test/pandas/render/HtmlCharsetTest.java b/ui/test/pandas/render/HtmlCharsetTest.java deleted file mode 100644 index 3353a24..0000000 --- a/ui/test/pandas/render/HtmlCharsetTest.java +++ /dev/null @@ -1,25 +0,0 @@ -package pandas.render; - -import org.attoparser.ParseException; -import org.junit.Test; - -import java.io.BufferedInputStream; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; - -import static org.junit.Assert.assertEquals; - -public class HtmlCharsetTest { - private static String detect(String html) throws ParseException, IOException { - return HtmlCharset.detect(new BufferedInputStream(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)))); - } - - @Test - public void testDetect() throws ParseException, IOException { - assertEquals("utf-8", detect("")); - assertEquals("utf-8", detect("")); - assertEquals(null, detect("")); - assertEquals(null, detect("\1\0garbage<<<<<<<<<<<<<<<<")); - } -} \ No newline at end of file diff --git a/ui/test/pandas/render/PageInfoTest.java b/ui/test/pandas/render/PageInfoTest.java index d1b52c5..16921b3 100644 --- a/ui/test/pandas/render/PageInfoTest.java +++ b/ui/test/pandas/render/PageInfoTest.java @@ -3,6 +3,7 @@ import org.attoparser.MarkupParser; import org.attoparser.ParseException; import org.attoparser.config.ParseConfiguration; +import org.jsoup.Jsoup; import org.junit.Test; import static org.junit.Assert.assertEquals; @@ -11,9 +12,10 @@ public class PageInfoTest { @Test public void testTitleHandler() throws ParseException { - PageInfo.TitleHandler handler = new PageInfo.TitleHandler(); - new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(" \t\ttest\n123\t\t\n 456

h1", handler); - assertEquals("test 123 456", handler.getCleanTitle()); + String html = " \t\ttest\n123\t\t\n 456

h1"; + + var pageInfo = new PageInfo(200, "OK", "text/html", Jsoup.parse(html)); + assertEquals("test 123 456", pageInfo.getTitle()); }