Skip to content

Commit

Permalink
PageInfo: Switch to JSoup and improve Bluesky title and text
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Nov 27, 2024
1 parent 40f1552 commit c4f6361
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 237 deletions.
2 changes: 1 addition & 1 deletion ui/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.15.3</version>
<version>1.18.2</version>
</dependency>
<dependency>
<groupId>com.googlecode.flyway</groupId>
Expand Down
86 changes: 0 additions & 86 deletions ui/src/pandas/render/HtmlCharset.java

This file was deleted.

118 changes: 32 additions & 86 deletions ui/src/pandas/render/PageInfo.java
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
package pandas.render;

import org.attoparser.AbstractMarkupHandler;
import org.attoparser.ParseException;
import org.attoparser.util.TextUtil;
import org.jsoup.parser.Parser;

import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;
import org.jsoup.parser.StreamParser;

public class PageInfo {
private static final int MAX_TITLE_LEN = 1000;
private static final int MAX_TEXT_LEN = 3000;

private final int status;
private final String reason;
private final String contentType;
Expand All @@ -28,6 +25,33 @@ public PageInfo(int status, String reason, String contentType, String charset, S
this.text = text;
}

/**
 * Builds page info from an already-parsed HTML document.
 *
 * <p>The title is normally the cleaned {@code <title>} text. For Bluesky profile pages
 * (location starting with {@code https://bsky.app/profile/}) the {@code og:title} meta tag
 * is used instead, suffixed with " on Bluesky", when it has non-blank content.
 *
 * <p>The text is the meta description (if any) followed by the document's visible text,
 * capped at {@code MAX_TEXT_LEN} characters.
 *
 * @param status      HTTP status code of the response
 * @param reason      HTTP reason phrase, may be null
 * @param contentType value of the response's content type
 * @param document    parsed document to extract charset, title and text from
 */
public PageInfo(int status, String reason, String contentType, Document document) {
    this.status = status;
    this.reason = reason;
    this.contentType = contentType;
    this.charset = document.charset().name();
    this.location = null;

    String title = null;
    if (document.location().startsWith("https://bsky.app/profile/")) {
        var ogTitle = document.selectFirst("meta[property=og:title]");
        // attr() returns "" when the attribute is missing; don't produce a bare " on Bluesky"
        if (ogTitle != null && !ogTitle.attr("content").isBlank()) {
            title = ogTitle.attr("content") + " on Bluesky";
        }
    }
    if (title == null) title = cleanTitle(document.title());
    this.title = title;

    // The standard description tag is <meta name="description" content="...">; the property=
    // form is still accepted for pages that use it. <meta> is a void element, so the value
    // lives in the content attribute, not in the element's inner HTML.
    var description = document.selectFirst("meta[name=description], meta[property=description]");
    String descriptionText = description != null ? description.attr("content").trim() : "";
    String bodyText = document.text();
    String combined = descriptionText.isEmpty() ? bodyText : descriptionText + " " + bodyText;
    if (combined.length() > MAX_TEXT_LEN) {
        combined = combined.substring(0, MAX_TEXT_LEN);
    }
    this.text = combined;
}

/**
 * Collapses every run of two or more whitespace characters into a single space, trims the
 * result, and shortens it to {@code MAX_TITLE_LEN} characters followed by "..." when longer.
 */
private static String cleanTitle(String title) {
    String collapsed = title.replaceAll("\\s\\s+", " ").trim();
    return collapsed.length() <= MAX_TITLE_LEN
            ? collapsed
            : collapsed.substring(0, MAX_TITLE_LEN) + "...";
}

public int weight() {
int weight = 20;
if (reason != null) weight += reason.length();
Expand All @@ -47,84 +71,6 @@ public String getText() {
return text;
}

/**
 * Markup handler that records the first chunk of text seen inside a {@code <title>} element
 * and accumulates whitespace-normalised page text into {@link #textBuffer}.
 *
 * <p>A single space is inserted around block-level tags so that text from adjacent blocks
 * does not run together. No further text is appended once the buffer has reached
 * {@code maxTextLen} characters.
 */
static class TitleHandler extends AbstractMarkupHandler {
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    private static final Set<String> BLOCK_TAGS = Set.of(
            "html", "head", "body", "frameset", "style", "meta", "link", "title", "frame",
            "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6",
            "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
            "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
            "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
            "svg", "math", "center", "dir", "applet", "marquee", "listing");

    final char[] TITLE = "title".toCharArray();
    final int maxTitleLen = 1000;
    final int maxTextLen = 3000;
    boolean withinTitle = false;
    /** First text chunk seen inside a title element; stays null if the document has none. */
    String title;
    final StringBuilder textBuffer = new StringBuilder();

    /** Any closing tag ends title capture; block-level closing tags also add a separator. */
    @Override
    public void handleCloseElementStart(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException {
        withinTitle = false;
        addSpaceIfBlockTag(buffer, nameOffset, nameLen);
    }

    /** Marks whether the element just opened is a title element and adds a block separator. */
    @Override
    public void handleOpenElementEnd(char[] buffer, int nameOffset, int nameLen, int line, int col) throws ParseException {
        // first argument false: presumably a case-insensitive comparison -- confirm against TextUtil
        withinTitle = TextUtil.equals(false, buffer, nameOffset, nameLen, TITLE, 0, TITLE.length);
        addSpaceIfBlockTag(buffer, nameOffset, nameLen);
    }

    /** Appends one space to the text buffer for block-level tags unless it already ends in whitespace. */
    private void addSpaceIfBlockTag(char[] buffer, int nameOffset, int nameLen) {
        String tag = new String(buffer, nameOffset, nameLen).toLowerCase(Locale.ROOT);
        if (BLOCK_TAGS.contains(tag) && !endsWithWhitespace(textBuffer)) {
            textBuffer.append(" ");
        }
    }

    /**
     * Captures the title (first text chunk within a title element only) and appends the
     * decoded text to the buffer while there is room left.
     */
    @Override
    public void handleText(char[] buffer, int offset, int len, int line, int col) throws ParseException {
        if (withinTitle && title == null) {
            title = decodeText(buffer, offset, Math.min(len, maxTitleLen));
        }
        int remaining = maxTextLen - textBuffer.length();
        if (remaining > 0) {
            String text = decodeText(buffer, offset, Math.min(len, remaining));
            // avoid doubling up whitespace at the join between chunks
            if (startsWithWhitespace(text) && (textBuffer.isEmpty() || endsWithWhitespace(textBuffer))) {
                textBuffer.append(text, 1, text.length());
            } else {
                textBuffer.append(text);
            }
        }
    }

    private static boolean startsWithWhitespace(CharSequence s) {
        return !s.isEmpty() && Character.isWhitespace(s.charAt(0));
    }

    private static boolean endsWithWhitespace(CharSequence s) {
        return !s.isEmpty() && Character.isWhitespace(s.charAt(s.length() - 1));
    }

    /** Unescapes HTML entities and collapses each whitespace run to a single space. */
    private String decodeText(char[] buffer, int offset, int len) {
        String text = Parser.unescapeEntities(new String(buffer, offset, len), false);
        text = WHITESPACE.matcher(text).replaceAll(" ");
        return text;
    }

    /**
     * Returns the captured title with whitespace runs collapsed, trimmed, and shortened to
     * {@code maxTitleLen} characters plus "..." when longer.
     *
     * @return the cleaned title, or null if no title text was encountered
     */
    public String getCleanTitle() {
        // Documents without a <title> leave the field null; previously this threw an NPE.
        if (this.title == null) {
            return null;
        }
        String title = this.title.replaceAll("\\s\\s+", " ").trim();
        if (title.length() > maxTitleLen) {
            title = title.substring(0, maxTitleLen) + "...";
        }
        return title;
    }
}

public int getStatus() {
return status;
}
Expand Down
43 changes: 7 additions & 36 deletions ui/src/pandas/render/PageInfoController.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,9 @@
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import org.attoparser.MarkupParser;
import org.attoparser.ParseException;
import org.attoparser.config.ParseConfiguration;
import org.attoparser.discard.DiscardMarkupHandler;
import org.attoparser.select.BlockSelectorMarkupHandler;
import org.apache.commons.io.input.BoundedInputStream;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
Expand All @@ -26,18 +23,14 @@
import pandas.collection.Subject;
import pandas.collection.SubjectRepository;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.UnknownHostException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -155,34 +148,12 @@ private PageInfo fetchPageInfo(String url) throws IOException, InterruptedExcept
String text = null;
HttpStatus status = HttpStatus.resolve(response.statusCode());
String reason = status == null ? null : status.getReasonPhrase();
if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) {
PageInfo.TitleHandler handler = new PageInfo.TitleHandler();

InputStream stream = body;
// if there was no charset in the Content-Type header, probe for meta tags near the top of the file
if (charset == null) {
BufferedInputStream bis = new BufferedInputStream(stream);
String charsetName = HtmlCharset.detect(bis);
if (charsetName != null) {
try {
charset = Charset.forName(charsetName);
} catch (UnsupportedCharsetException e) {
log.warn("Unsupported charset {}, defaulting to iso-8859-1", charsetName);
charset = StandardCharsets.ISO_8859_1;
}
}
stream = bis;
}

try {
new MarkupParser(ParseConfiguration.htmlConfiguration()).parse(new InputStreamReader(stream, charset),
new BlockSelectorMarkupHandler(new DiscardMarkupHandler(), handler,
new String[]{"script", "noscript", "style"}));
} catch (ParseException e) {
log.warn("Exception parsing " + url, e);
}
title = handler.getCleanTitle();
text = handler.textBuffer.toString();
if (mediaType.equalsTypeAndSubtype(MediaType.TEXT_HTML) && body != null) {
String charsetName = mediaType.getParameter("charset");
var boundedStream = new BoundedInputStream(body, 10 * 1024 * 1024);
var document = Jsoup.parse(boundedStream, charsetName, url);
return new PageInfo(response.statusCode(), reason, contentType, document);
}
String location = null;
if (response.previousResponse().isPresent()) {
Expand Down
25 changes: 0 additions & 25 deletions ui/test/pandas/render/HtmlCharsetTest.java

This file was deleted.

8 changes: 5 additions & 3 deletions ui/test/pandas/render/PageInfoTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.attoparser.MarkupParser;
import org.attoparser.ParseException;
import org.attoparser.config.ParseConfiguration;
import org.jsoup.Jsoup;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
Expand All @@ -11,9 +12,10 @@ public class PageInfoTest {

// Checks that leading/trailing whitespace and internal tabs/newlines in a <title> are
// reduced to single spaces in the cleaned title.
@Test
public void testTitleHandler() throws ParseException {
// Path 1: feed the markup to a PageInfo.TitleHandler through the attoparser MarkupParser.
PageInfo.TitleHandler handler = new PageInfo.TitleHandler();
new MarkupParser(ParseConfiguration.htmlConfiguration()).parse("<html><head><title> \t\ttest\n123\t\t\n 456</title></head><body><h1>h1", handler);
assertEquals("test 123 456", handler.getCleanTitle());
// Path 2: parse the same markup with Jsoup and build a PageInfo from the Document.
String html = "<html><head><title> \t\ttest\n123\t\t\n 456</title></head><body><h1>h1";

var pageInfo = new PageInfo(200, "OK", "text/html", Jsoup.parse(html));
assertEquals("test 123 456", pageInfo.getTitle());
}


Expand Down

0 comments on commit c4f6361

Please sign in to comment.