diff --git a/ui/resources/static/assets/Global.css b/ui/resources/static/assets/Global.css
index 29ef4f5a..298a0218 100644
--- a/ui/resources/static/assets/Global.css
+++ b/ui/resources/static/assets/Global.css
@@ -747,6 +747,24 @@ button.title-flag:active:after, button.title-flag.active:after {
border-bottom: 1px solid black;
}
+/**
+ * CDX Results Table
+ */
+.cdx-results {
+    white-space: nowrap; /* never wrap cell text onto a second line */
+}
+
+.cdx-results thead td {
+    font-weight: bold;
+}
+
+.cdx-results td {
+    /* margin is ignored on table cells, so the column gap has to come from padding */
+    padding-right: 16px;
+}
+
+.cdx-results .url {
+    color: darkgreen;
+}
+
/**
* HTTP status codes
*/
diff --git a/ui/resources/templates/PageSearch.html b/ui/resources/templates/PageSearch.html
index f20abedc..14b79b01 100644
--- a/ui/resources/templates/PageSearch.html
+++ b/ui/resources/templates/PageSearch.html
@@ -5,22 +5,6 @@
Pages - PANDAS
-
diff --git a/ui/resources/templates/PageSearchGrouped.html b/ui/resources/templates/PageSearchGrouped.html
new file mode 100644
index 00000000..c708def6
--- /dev/null
+++ b/ui/resources/templates/PageSearchGrouped.html
@@ -0,0 +1,48 @@
+
+
+
+
+ Pages - PANDAS
+
+
+
+
+
+
+
+
+
+
+
+
+
+ From |
+ To |
+ Status |
+ Type |
+ Unq |
+ Tot |
+ URL |
+
+
+
+
+ First |
+ Last |
+
+ redirect |
+ text/html |
+ |
+ |
+ |
+
+
+
+
\ No newline at end of file
diff --git a/ui/src/pandas/collection/CaptureGroup.java b/ui/src/pandas/collection/CaptureGroup.java
new file mode 100644
index 00000000..29b51f59
--- /dev/null
+++ b/ui/src/pandas/collection/CaptureGroup.java
@@ -0,0 +1,4 @@
+package pandas.collection;
+
+/**
+ * Summary of one group of captures, as built by {@code CaptureIndex.queryGrouped}.
+ *
+ * @param first       the capture that opened the group
+ * @param last        the final capture added to the group; the same capture as {@code first}
+ *                    when the group holds a single capture
+ * @param count       total number of captures in the group
+ * @param uniqueCount number of captures whose digest differs from the capture immediately
+ *                    before them in the group (the first capture always counts)
+ */
+public record CaptureGroup(Capture first, Capture last, int count, int uniqueCount) {
+}
diff --git a/ui/src/pandas/collection/CaptureIndex.java b/ui/src/pandas/collection/CaptureIndex.java
index c36fb6d1..e12714fb 100644
--- a/ui/src/pandas/collection/CaptureIndex.java
+++ b/ui/src/pandas/collection/CaptureIndex.java
@@ -17,7 +17,7 @@
public class CaptureIndex {
private String baseUrl = "http://winch.nla.gov.au:9901/trove";
- public List query(String q) {
+ public List query(String q, boolean excludeErrors) {
List urls = new ArrayList<>();
Set digests = new HashSet<>();
for (String word : q.split(" ")) {
@@ -31,6 +31,7 @@ public List query(String q) {
List captures = new ArrayList<>();
for (String url : urls) {
String qUrl = baseUrl + "?url=" + URLEncoder.encode(url.toString(), UTF_8);
+ if (excludeErrors) qUrl += "&filter=!status:[45]..";
try (var reader = new BufferedReader(new InputStreamReader(new URL(qUrl).openStream(), UTF_8))) {
while (true) {
String line = reader.readLine();
@@ -45,8 +46,41 @@ public List query(String q) {
throw new UncheckedIOException(e);
}
}
+ return captures;
+ }
+
+ /**
+  * Looks up captures via {@code query(q, false)}, i.e. without the status filter that
+  * {@code excludeErrors} appends, and sorts them by URL, then by date descending within
+  * each URL.
+  *
+  * @param q search string, passed to {@code query} unchanged
+  * @return the list returned by {@code query}, sorted in place
+  */
+ public List queryDateDesc(String q) {
+ var captures = query(q, false);
captures.sort(Comparator.comparing(Capture::getUrl)
.thenComparing(Comparator.comparing(Capture::getDate).reversed()));
return captures;
}
+
+ /**
+  * Collapses the captures returned by {@code query(q, true)} into one {@link CaptureGroup}
+  * per run of consecutive captures that share a URL.
+  *
+  * <p>Each group records the first and last capture of the run, the run length, and how many
+  * captures in the run have a digest different from the capture directly before them (the
+  * first capture of a run always counts).
+  *
+  * <p>Grouping is by adjacency only: captures with the same URL that are not next to each
+  * other in the list end up in separate groups.
+  * NOTE(review): the visible part of {@code query} does not sort its result, so this relies
+  * on the order the index returns - confirm that order keeps equal URLs together.
+  *
+  * @param q search string, passed to {@code query} unchanged
+  * @return the groups in the order their first capture was encountered; empty if there are
+  *         no captures
+  */
+ public List queryGrouped(String q) {
+     List<Capture> captures = query(q, true);
+     List<CaptureGroup> result = new ArrayList<>();
+     int total = captures.size();
+     int start = 0;
+     while (start < total) {
+         // Open a new run at 'start'; the opening capture is always counted as unique.
+         Capture head = captures.get(start);
+         Capture tail = head;
+         int distinctRuns = 1;
+         int end = start + 1;
+         // Extend the run while the following captures carry the same URL as the head.
+         while (end < total) {
+             Capture next = captures.get(end);
+             if (!next.getUrl().equals(head.getUrl())) {
+                 break;
+             }
+             if (!next.getDigest().equals(tail.getDigest())) {
+                 distinctRuns++;
+             }
+             tail = next;
+             end++;
+         }
+         result.add(new CaptureGroup(head, tail, end - start, distinctRuns));
+         start = end;
+     }
+     return result;
+ }
}
diff --git a/ui/src/pandas/collection/PagesController.java b/ui/src/pandas/collection/PagesController.java
index b2122be0..452e0d9f 100644
--- a/ui/src/pandas/collection/PagesController.java
+++ b/ui/src/pandas/collection/PagesController.java
@@ -42,7 +42,8 @@ public PagesController(Config config, CaptureIndex captureIndex) {
@GetMapping("/pages")
public String search(@RequestParam(value = "url", required = false) String url, Model model) {
- List captures = url == null || url.isBlank() ? List.of() : captureIndex.query(url);
+ if (url != null && url.endsWith("*")) return searchGrouped(url, model);
+ List captures = url == null || url.isBlank() ? List.of() : captureIndex.queryDateDesc(url);
String queryString = captures.stream()
.map(Capture::getFile)
.distinct()
@@ -70,6 +71,13 @@ public String search(@RequestParam(value = "url", required = false) String url,
return "PageSearch";
}
+ /**
+  * Populates the model for the grouped results view. {@code search} delegates here when the
+  * {@code url} parameter ends with {@code *}.
+  *
+  * @param url   the search string, echoed back to the view and passed to
+  *              {@code captureIndex.queryGrouped}
+  * @param model receives the {@code groups}, {@code url} and {@code dateTimeFormat} attributes
+  * @return the name of the grouped-results template
+  */
+ private String searchGrouped(String url, Model model) {
+     var groups = captureIndex.queryGrouped(url);
+     model.addAttribute("groups", groups);
+     model.addAttribute("url", url);
+     // Day-level formatter in the server's default time zone, handed to the view.
+     DateTimeFormatter dayFormat = DateTimeFormatter.ofPattern("dd/MM/yyyy").withZone(ZoneId.systemDefault());
+     model.addAttribute("dateTimeFormat", dayFormat);
+     return "PageSearchGrouped";
+ }
+
record CrawlsByFilename(long warcId, String filename, long crawlId, String crawlName, Long pandasInstanceId) {
}