From a30cb779017b2b45287471db8924ffad2bf359cc Mon Sep 17 00:00:00 2001
From: Alex Osborne <aosborne@nla.gov.au>
Date: Wed, 29 Nov 2023 11:34:08 +0900
Subject: [PATCH] PageSearch: Group results by URL when doing a prefix search

---
 ui/resources/static/assets/Global.css         | 18 +++++++
 ui/resources/templates/PageSearch.html        | 16 -------
 ui/resources/templates/PageSearchGrouped.html | 48 +++++++++++++++++++
 ui/src/pandas/collection/CaptureGroup.java    |  4 ++
 ui/src/pandas/collection/CaptureIndex.java    | 36 +++++++++++++-
 ui/src/pandas/collection/PagesController.java | 10 +++-
 6 files changed, 114 insertions(+), 18 deletions(-)
 create mode 100644 ui/resources/templates/PageSearchGrouped.html
 create mode 100644 ui/src/pandas/collection/CaptureGroup.java
diff --git a/ui/resources/static/assets/Global.css b/ui/resources/static/assets/Global.css
index 29ef4f5a..298a0218 100644
--- a/ui/resources/static/assets/Global.css
+++ b/ui/resources/static/assets/Global.css
@@ -747,6 +747,24 @@ button.title-flag:active:after, button.title-flag.active:after {
     border-bottom: 1px solid black;
 }
 
+/**
+ * CDX Results Table
+ */
+.cdx-results {
+    white-space: nowrap; /* keep the contents of every cell on a single line */
+}
+
+.cdx-results thead td {
+    font-weight: bold; /* header cells are plain td elements, so they are bolded explicitly */
+}
+.cdx-results td {
+    padding-right: 16px; /* margins do not apply to table cells; padding provides the column gap */
+}
+
+.cdx-results .url {
+    color: darkgreen;
+}
+
 /**
  * HTTP status codes
  */
diff --git a/ui/resources/templates/PageSearch.html b/ui/resources/templates/PageSearch.html
index f20abedc..14b79b01 100644
--- a/ui/resources/templates/PageSearch.html
+++ b/ui/resources/templates/PageSearch.html
@@ -5,22 +5,6 @@
     <title>Pages - PANDAS</title>
     <link rel="stylesheet" href="../static/assets/Global.css" th:href="@{/assets/Global.css}">
     <script src="../static/assets/Global.js" th:src="@{/assets/Global.js}" defer></script>
-    <style>
-        .cdx-results {
-            white-space: nowrap;
-        }
-
-        .cdx-results thead td {
-            font-weight: bold;
-        }
-        .cdx-results td {
-            margin-right: 16px;
-        }
-
-        .cdx-results .url {
-            color: darkgreen;
-        }
-    </style>
 </head>
 <body>
 <nav th:replace="~{_layout.html :: sidebar (activeItem='Pages')}"></nav>
diff --git a/ui/resources/templates/PageSearchGrouped.html b/ui/resources/templates/PageSearchGrouped.html
new file mode 100644
index 00000000..c708def6
--- /dev/null
+++ b/ui/resources/templates/PageSearchGrouped.html
@@ -0,0 +1,48 @@
+<!doctype html>
+<html lang=en xmlns:th="http://www.thymeleaf.org">
+<head>
+    <meta charset="UTF-8">
+    <title>Pages - PANDAS</title>
+    <link rel="stylesheet" href="../static/assets/Global.css" th:href="@{/assets/Global.css}">
+    <script src="../static/assets/Global.js" th:src="@{/assets/Global.js}" defer></script>
+</head>
+<body>
+<nav th:replace="~{_layout.html :: sidebar (activeItem='Pages')}"></nav>
+<main class="search">
+    <form id="sf" th:action="@{/pages}">
+    </form>
+    <!--/* The form has no children; the search box in the header below is attached to it through its form=sf attribute. */-->
+    <header>
+        <nav></nav>
+        <input type=search name="url" th:value="${url}" placeholder="URL" data-keybind="f" form=sf style="width: 100%">
+    </header>
+    <!--/* One row per URL group: status, redirect link and type are taken from the group's last capture (the "capture" variable). */-->
+    <table class="cdx-results">
+        <thead>
+        <tr>
+            <td>From</td>
+            <td>To</td>
+            <td>Status</td>
+            <td>Type</td>
+            <td><abbr title="Unique snapshots (by content digest)">Unq</abbr></td>
+            <td><abbr title="Total snapshots">Tot</abbr></td>
+            <td>URL</td>
+        </tr>
+        </thead>
+        <tbody>
+        <tr th:each="group : ${groups}" th:with="capture = ${group.last()}">
+            <td><a th:href="@{/replay(url=${group.first.url},date=${group.first.date})}"
+                   th:text="${dateTimeFormat.format(group.first.date)}">First</a></td>
+            <td><a th:href="@{/replay(url=${group.last.url},date=${group.last.date})}"
+                   th:text="${dateTimeFormat.format(group.last.date)}">Last</a></td>
+            <td><span th:text="${capture.status}" th:class="${@format.statusClass(capture.status)}"
+                      th:title="${capture.statusPhrase}"></span>
+                <a th:if="${capture.redirectUrl}" th:href="@{/pages(url=${capture.redirectUrl})}">redirect</a></td>
+            <td th:text="${capture.contentType}">text/html</td>
+            <td th:text="${group.uniqueCount()}"></td>
+            <td th:text="${group.count()}"></td>
+            <td><a th:href="@{/pages(url=${capture.url})}" th:text="${capture.url}"></a></td>
+        </tr>
+        </tbody>
+    </table>
+</main>
\ No newline at end of file
diff --git a/ui/src/pandas/collection/CaptureGroup.java b/ui/src/pandas/collection/CaptureGroup.java
new file mode 100644
index 00000000..29b51f59
--- /dev/null
+++ b/ui/src/pandas/collection/CaptureGroup.java
@@ -0,0 +1,4 @@
+package pandas.collection;
+/** Summary of a run of captures sharing one URL: the first and last capture seen, the number of captures, and how many were counted as unique by digest. */
+public record CaptureGroup(Capture first, Capture last, int count, int uniqueCount) {
+}
diff --git a/ui/src/pandas/collection/CaptureIndex.java b/ui/src/pandas/collection/CaptureIndex.java
index c36fb6d1..e12714fb 100644
--- a/ui/src/pandas/collection/CaptureIndex.java
+++ b/ui/src/pandas/collection/CaptureIndex.java
@@ -17,7 +17,7 @@
 public class CaptureIndex {
     private String baseUrl = "http://winch.nla.gov.au:9901/trove";
 
-    public List<Capture> query(String q) {
+    public List<Capture> query(String q, boolean excludeErrors) {
         List<String> urls = new ArrayList<>();
         Set<String> digests = new HashSet<>();
         for (String word : q.split(" ")) {
@@ -31,6 +31,7 @@ public List<Capture> query(String q) {
         List<Capture> captures = new ArrayList<>();
         for (String url : urls) {
             String qUrl = baseUrl + "?url=" + URLEncoder.encode(url.toString(), UTF_8);
+            if (excludeErrors) qUrl += "&filter=" + URLEncoder.encode("!status:[45]..", UTF_8); // encoded like url= above; raw '[' and ']' are not valid in a query string
             try (var reader = new BufferedReader(new InputStreamReader(new URL(qUrl).openStream(), UTF_8))) {
                 while (true) {
                     String line = reader.readLine();
@@ -45,8 +46,41 @@ public List<Capture> query(String q) {
                 throw new UncheckedIOException(e);
             }
         }
+        return captures;
+    }
+
+    public List<Capture> queryDateDesc(String q) {
+        var captures = query(q, false); // false: error captures are kept; sorted below by URL, then newest first
         captures.sort(Comparator.comparing(Capture::getUrl)
                 .thenComparing(Comparator.comparing(Capture::getDate).reversed()));
         return captures;
     }
+
+    /** Groups the non-error captures matching {@code q} into one CaptureGroup per run of consecutive
+     *  captures with the same URL; assumes the index returns each URL's captures together (TODO confirm).
+     *  uniqueCount is the number of distinct digests, so content that reverts is not counted twice. */
+    public List<CaptureGroup> queryGrouped(String q) {
+        List<CaptureGroup> groups = new ArrayList<>();
+        Set<String> digests = new HashSet<>(); // distinct content digests seen in the current group
+        Capture firstInGroup = null;
+        Capture prev = null; // most recently visited capture; becomes the group's "last" when flushed
+        int groupCount = 0;
+        for (Capture capture : query(q, true)) {
+            if (firstInGroup == null || !capture.getUrl().equals(firstInGroup.getUrl())) {
+                if (firstInGroup != null) {
+                    groups.add(new CaptureGroup(firstInGroup, prev, groupCount, digests.size()));
+                }
+                firstInGroup = capture;
+                groupCount = 0;
+                digests.clear();
+            }
+            groupCount++;
+            digests.add(capture.getDigest());
+            prev = capture;
+        }
+        if (firstInGroup != null) {
+            groups.add(new CaptureGroup(firstInGroup, prev, groupCount, digests.size()));
+        }
+        return groups;
+    }
 }
diff --git a/ui/src/pandas/collection/PagesController.java b/ui/src/pandas/collection/PagesController.java
index b2122be0..452e0d9f 100644
--- a/ui/src/pandas/collection/PagesController.java
+++ b/ui/src/pandas/collection/PagesController.java
@@ -42,7 +42,8 @@ public PagesController(Config config, CaptureIndex captureIndex) {
 
     @GetMapping("/pages")
     public String search(@RequestParam(value = "url", required = false) String url, Model model) {
-        List<Capture> captures = url == null || url.isBlank() ? List.of() : captureIndex.query(url);
+        if (url != null && url.endsWith("*")) return searchGrouped(url, model);
+        List<Capture> captures = url == null || url.isBlank() ? List.of() : captureIndex.queryDateDesc(url);
         String queryString = captures.stream()
                 .map(Capture::getFile)
                 .distinct()
@@ -70,6 +71,13 @@ public String search(@RequestParam(value = "url", required = false) String url,
         return "PageSearch";
     }
 
+    private String searchGrouped(String url, Model model) {
+        // Prefix search: hand the view the grouped results, the query text and a day-level date formatter.
+        model.addAttribute("groups", captureIndex.queryGrouped(url)).addAttribute("url", url)
+                .addAttribute("dateTimeFormat", DateTimeFormatter.ofPattern("dd/MM/yyyy").withZone(ZoneId.systemDefault()));
+        return "PageSearchGrouped";
+    }
+
     record CrawlsByFilename(long warcId, String filename, long crawlId, String crawlName, Long pandasInstanceId) {
     }