Skip to content

Commit

Permalink
PageSearch: Group results by URL when doing a prefix search
Browse files Browse the repository at this point in the history
  • Loading branch information
ato committed Nov 29, 2023
1 parent d069f5b commit a30cb77
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 18 deletions.
18 changes: 18 additions & 0 deletions ui/resources/static/assets/Global.css
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,24 @@ button.title-flag:active:after, button.title-flag.active:after {
border-bottom: 1px solid black;
}

/**
 * CDX Results Table
 *
 * Styles for tables carrying the cdx-results class (capture listings on the
 * page search screens).
 */
.cdx-results {
    /* Keep every row on a single line. */
    white-space: nowrap;
}

.cdx-results thead td {
    /* Header cells are plain td elements, so they need explicit emphasis. */
    font-weight: bold;
}

.cdx-results td {
    /* Margins do not apply to table cells, so the column gap has to be
       expressed as padding for it to have any effect. */
    padding-right: 16px;
}

.cdx-results .url {
    color: darkgreen;
}

/**
* HTTP status codes
*/
Expand Down
16 changes: 0 additions & 16 deletions ui/resources/templates/PageSearch.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,6 @@
<title>Pages - PANDAS</title>
<link rel="stylesheet" href="../static/assets/Global.css" th:href="@{/assets/Global.css}">
<script src="../static/assets/Global.js" th:src="@{/assets/Global.js}" defer></script>
<style>
.cdx-results {
white-space: nowrap;
}

.cdx-results thead td {
font-weight: bold;
}
.cdx-results td {
margin-right: 16px;
}

.cdx-results .url {
color: darkgreen;
}
</style>
</head>
<body>
<nav th:replace="~{_layout.html :: sidebar (activeItem='Pages')}"></nav>
Expand Down
48 changes: 48 additions & 0 deletions ui/resources/templates/PageSearchGrouped.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<!doctype html>
<html lang=en xmlns:th="http://www.thymeleaf.org">
<!--/*
  Grouped page search results: one table row per group of captures.
  Model attributes read by this template: "groups" (the rows), "url" (the
  current search text) and "dateTimeFormat" (used to format the From/To dates).
*/-->
<head>
<meta charset="UTF-8">
<title>Pages - PANDAS</title>
<link rel="stylesheet" href="../static/assets/Global.css" th:href="@{/assets/Global.css}">
<script src="../static/assets/Global.js" th:src="@{/assets/Global.js}" defer></script>
</head>
<body>
<nav th:replace="~{_layout.html :: sidebar (activeItem='Pages')}"></nav>
<main class="search">
<!--/* The form itself is empty; the search box below is attached to it through its form=sf attribute. */-->
<form id="sf" th:action="@{/pages}">
</form>

<header>
<nav></nav>
<input type=search name="url" th:value="${url}" placeholder="URL" data-keybind="f" form=sf style="width: 100%">
</header>

<table class="cdx-results">
<thead>
<tr>
<td>From</td>
<td>To</td>
<td>Status</td>
<td>Type</td>
<td><abbr title="Unique snapshots (by content digest)">Unq</abbr></td>
<td><abbr title="Total snapshots">Tot</abbr></td>
<td>URL</td>
</tr>
</thead>
<tbody>
<!--/* "capture" is an alias for the group's last capture; the Status, Type and URL columns are taken from it. */-->
<tr th:each="group : ${groups}" th:with="capture = ${group.last()}">
<!--/* From / To: replay links for the group's first and last capture, labelled with their formatted dates. */-->
<td><a th:href="@{/replay(url=${group.first.url},date=${group.first.date})}"
th:text="${dateTimeFormat.format(group.first.date)}">First</a></td>
<td><a th:href="@{/replay(url=${group.last.url},date=${group.last.date})}"
th:text="${dateTimeFormat.format(group.last.date)}">Last</a></td>
<!--/* Status code styled via the format bean; the "redirect" link is only rendered when the capture has a redirectUrl and searches for that target. */-->
<td><span th:text="${capture.status}" th:class="${@format.statusClass(capture.status)}"
th:title="${capture.statusPhrase}"></span>
<a th:if="${capture.redirectUrl}" th:href="@{/pages(url=${capture.redirectUrl})}">redirect</a></td>
<td th:text="${capture.contentType}">text/html</td>
<td th:text="${group.uniqueCount()}"></td>
<td th:text="${group.count()}"></td>
<!--/* The URL links back to the page search for this URL. */-->
<td><a th:href="@{/pages(url=${capture.url})}" th:text="${capture.url}"></a></td>
</tr>
</tbody>
</table>
</main>
4 changes: 4 additions & 0 deletions ui/src/pandas/collection/CaptureGroup.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package pandas.collection;

/**
 * Summary of a run of captures that share the same URL.
 *
 * @param first       the first capture encountered for the URL
 * @param last        the last capture encountered for the URL
 * @param count       total number of captures in the group
 * @param uniqueCount how many of those captures are counted as unique, judged by content digest
 */
public record CaptureGroup(
        Capture first,
        Capture last,
        int count,
        int uniqueCount) {
}
36 changes: 35 additions & 1 deletion ui/src/pandas/collection/CaptureIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
public class CaptureIndex {
private String baseUrl = "http://winch.nla.gov.au:9901/trove";

public List<Capture> query(String q) {
public List<Capture> query(String q, boolean excludeErrors) {
List<String> urls = new ArrayList<>();
Set<String> digests = new HashSet<>();
for (String word : q.split(" ")) {
Expand All @@ -31,6 +31,7 @@ public List<Capture> query(String q) {
List<Capture> captures = new ArrayList<>();
for (String url : urls) {
String qUrl = baseUrl + "?url=" + URLEncoder.encode(url.toString(), UTF_8);
if (excludeErrors) qUrl += "&filter=!status:[45]..";
try (var reader = new BufferedReader(new InputStreamReader(new URL(qUrl).openStream(), UTF_8))) {
while (true) {
String line = reader.readLine();
Expand All @@ -45,8 +46,41 @@ public List<Capture> query(String q) {
throw new UncheckedIOException(e);
}
}
return captures;
}

/**
 * Runs {@code query} without excluding error responses and orders the result by URL,
 * with the newest capture first within each URL.
 *
 * @param q the query string handed to {@code query}
 * @return the list returned by {@code query}, sorted in place
 */
public List<Capture> queryDateDesc(String q) {
    List<Capture> results = query(q, false);
    Comparator<Capture> newestFirst = Comparator.comparing(Capture::getDate).reversed();
    Comparator<Capture> byUrlThenNewest = Comparator.comparing(Capture::getUrl).thenComparing(newestFirst);
    results.sort(byUrlThenNewest);
    return results;
}

/**
 * Runs {@code query} with error responses excluded and collapses the captures into one
 * {@link CaptureGroup} per run of adjacent captures that share a URL.
 *
 * <p>Grouping is by adjacency: a new group starts whenever a capture's URL differs from the
 * URL of the first capture in the current group, so captures for one URL must be adjacent in
 * the query result to end up in a single group.
 *
 * <p>The unique count of a group is the number of distinct content digests seen in it. Content
 * that changes and later reverts to an earlier digest is therefore counted once, not once per
 * change. A capture with no digest is tolerated and counts as one distinct value.
 *
 * @param q the query string handed to {@code query}
 * @return the groups in the order they were encountered; empty when there are no captures
 */
public List<CaptureGroup> queryGrouped(String q) {
    List<CaptureGroup> groups = new ArrayList<>();
    Capture firstInGroup = null;
    Capture lastInGroup = null;
    int groupCount = 0;
    // Distinct digests of the group currently being built; reset at each group boundary.
    Set<String> groupDigests = new HashSet<>();
    for (Capture capture : query(q, true)) {
        boolean startsNewGroup = firstInGroup == null
                || !capture.getUrl().equals(firstInGroup.getUrl());
        if (startsNewGroup) {
            if (firstInGroup != null) {
                // Flush the finished group before starting the next one.
                groups.add(new CaptureGroup(firstInGroup, lastInGroup, groupCount, groupDigests.size()));
            }
            firstInGroup = capture;
            groupCount = 0;
            groupDigests.clear();
        }
        groupCount++;
        groupDigests.add(capture.getDigest());
        lastInGroup = capture;
    }
    if (firstInGroup != null) {
        // The final group has no following capture to trigger its flush.
        groups.add(new CaptureGroup(firstInGroup, lastInGroup, groupCount, groupDigests.size()));
    }
    return groups;
}
}
10 changes: 9 additions & 1 deletion ui/src/pandas/collection/PagesController.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ public PagesController(Config config, CaptureIndex captureIndex) {

@GetMapping("/pages")
public String search(@RequestParam(value = "url", required = false) String url, Model model) {
List<Capture> captures = url == null || url.isBlank() ? List.of() : captureIndex.query(url);
if (url != null && url.endsWith("*")) return searchGrouped(url, model);
List<Capture> captures = url == null || url.isBlank() ? List.of() : captureIndex.queryDateDesc(url);
String queryString = captures.stream()
.map(Capture::getFile)
.distinct()
Expand Down Expand Up @@ -70,6 +71,13 @@ public String search(@RequestParam(value = "url", required = false) String url,
return "PageSearch";
}

/**
 * Fills the model for the grouped results view: the capture groups for {@code url}, the
 * search text itself, and a {@code dd/MM/yyyy} formatter in the system default time zone.
 *
 * @param url   the search text, passed unchanged to the capture index
 * @param model the model to populate
 * @return the name of the grouped results template
 */
private String searchGrouped(String url, Model model) {
    List<CaptureGroup> groups = captureIndex.queryGrouped(url);
    model.addAttribute("groups", groups);
    model.addAttribute("url", url);

    DateTimeFormatter dayFormat = DateTimeFormatter.ofPattern("dd/MM/yyyy")
            .withZone(ZoneId.systemDefault());
    model.addAttribute("dateTimeFormat", dayFormat);

    return "PageSearchGrouped";
}

/**
 * Pairs a WARC file (id and filename) with a crawl (id and name).
 *
 * <p>{@code pandasInstanceId} is a boxed {@code Long}, so unlike the primitive ids it can be
 * {@code null}. NOTE(review): how these rows are produced is outside this excerpt; confirm
 * when an instance id is absent.
 */
record CrawlsByFilename(
        long warcId,
        String filename,
        long crawlId,
        String crawlName,
        Long pandasInstanceId) {
}

Expand Down

0 comments on commit a30cb77

Please sign in to comment.