From a30cb779017b2b45287471db8924ffad2bf359cc Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Wed, 29 Nov 2023 11:34:08 +0900 Subject: [PATCH] PageSearch: Group results by URL when doing a prefix search --- ui/resources/static/assets/Global.css | 18 +++++++ ui/resources/templates/PageSearch.html | 16 ------- ui/resources/templates/PageSearchGrouped.html | 48 +++++++++++++++++++ ui/src/pandas/collection/CaptureGroup.java | 4 ++ ui/src/pandas/collection/CaptureIndex.java | 36 +++++++++++++- ui/src/pandas/collection/PagesController.java | 10 +++- 6 files changed, 114 insertions(+), 18 deletions(-) create mode 100644 ui/resources/templates/PageSearchGrouped.html create mode 100644 ui/src/pandas/collection/CaptureGroup.java diff --git a/ui/resources/static/assets/Global.css b/ui/resources/static/assets/Global.css index 29ef4f5a..298a0218 100644 --- a/ui/resources/static/assets/Global.css +++ b/ui/resources/static/assets/Global.css @@ -747,6 +747,24 @@ button.title-flag:active:after, button.title-flag.active:after { border-bottom: 1px solid black; } +/** + * CDX Results Table + */ +.cdx-results { + white-space: nowrap; +} + +.cdx-results thead td { + font-weight: bold; +} +.cdx-results td { + margin-right: 16px; +} + +.cdx-results .url { + color: darkgreen; +} + /** * HTTP status codes */ diff --git a/ui/resources/templates/PageSearch.html b/ui/resources/templates/PageSearch.html index f20abedc..14b79b01 100644 --- a/ui/resources/templates/PageSearch.html +++ b/ui/resources/templates/PageSearch.html @@ -5,22 +5,6 @@ Pages - PANDAS - diff --git a/ui/resources/templates/PageSearchGrouped.html b/ui/resources/templates/PageSearchGrouped.html new file mode 100644 index 00000000..c708def6 --- /dev/null +++ b/ui/resources/templates/PageSearchGrouped.html @@ -0,0 +1,48 @@ + + + + + Pages - PANDAS + + + + + +
+
+
+ +
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + +
FromToStatusTypeUnqTotURL
FirstLast + redirecttext/html
+
\ No newline at end of file diff --git a/ui/src/pandas/collection/CaptureGroup.java b/ui/src/pandas/collection/CaptureGroup.java new file mode 100644 index 00000000..29b51f59 --- /dev/null +++ b/ui/src/pandas/collection/CaptureGroup.java @@ -0,0 +1,4 @@ +package pandas.collection; + +public record CaptureGroup(Capture first, Capture last, int count, int uniqueCount) { +} diff --git a/ui/src/pandas/collection/CaptureIndex.java b/ui/src/pandas/collection/CaptureIndex.java index c36fb6d1..e12714fb 100644 --- a/ui/src/pandas/collection/CaptureIndex.java +++ b/ui/src/pandas/collection/CaptureIndex.java @@ -17,7 +17,7 @@ public class CaptureIndex { private String baseUrl = "http://winch.nla.gov.au:9901/trove"; - public List query(String q) { + public List query(String q, boolean excludeErrors) { List urls = new ArrayList<>(); Set digests = new HashSet<>(); for (String word : q.split(" ")) { @@ -31,6 +31,7 @@ public List query(String q) { List captures = new ArrayList<>(); for (String url : urls) { String qUrl = baseUrl + "?url=" + URLEncoder.encode(url.toString(), UTF_8); + if (excludeErrors) qUrl += "&filter=!status:[45].."; try (var reader = new BufferedReader(new InputStreamReader(new URL(qUrl).openStream(), UTF_8))) { while (true) { String line = reader.readLine(); @@ -45,8 +46,41 @@ public List query(String q) { throw new UncheckedIOException(e); } } + return captures; + } + + public List queryDateDesc(String q) { + var captures = query(q, false); captures.sort(Comparator.comparing(Capture::getUrl) .thenComparing(Comparator.comparing(Capture::getDate).reversed())); return captures; } + + public List queryGrouped(String q) { + List groups = new ArrayList<>(); + Capture firstInGroup = null; + Capture prev = null; + int groupCount = 0; + int uniqueCount = 0; + for (Capture capture : query(q, true)) { + if (firstInGroup == null || !capture.getUrl().equals(firstInGroup.getUrl())) { + if (firstInGroup != null) { + groups.add(new CaptureGroup(firstInGroup, prev, groupCount, uniqueCount)); + } + firstInGroup = capture; + prev = null; + groupCount = 0; + uniqueCount = 0; + } + groupCount++; + if (prev == null || !capture.getDigest().equals(prev.getDigest())) { + uniqueCount++; + } + prev = capture; + } + if (firstInGroup != null) { + groups.add(new CaptureGroup(firstInGroup, prev, groupCount, uniqueCount)); + } + return groups; + } } diff --git a/ui/src/pandas/collection/PagesController.java b/ui/src/pandas/collection/PagesController.java index b2122be0..452e0d9f 100644 --- a/ui/src/pandas/collection/PagesController.java +++ b/ui/src/pandas/collection/PagesController.java @@ -42,7 +42,8 @@ public PagesController(Config config, CaptureIndex captureIndex) { @GetMapping("/pages") public String search(@RequestParam(value = "url", required = false) String url, Model model) { - List captures = url == null || url.isBlank() ? List.of() : captureIndex.query(url); + if (url != null && url.endsWith("*")) return searchGrouped(url, model); + List captures = url == null || url.isBlank() ? List.of() : captureIndex.queryDateDesc(url); String queryString = captures.stream() .map(Capture::getFile) .distinct() @@ -70,6 +71,13 @@ public String search(@RequestParam(value = "url", required = false) String url, return "PageSearch"; } + private String searchGrouped(String url, Model model) { + model.addAttribute("groups", captureIndex.queryGrouped(url)); + model.addAttribute("url", url); + model.addAttribute("dateTimeFormat", DateTimeFormatter.ofPattern("dd/MM/yyyy").withZone(ZoneId.systemDefault())); + return "PageSearchGrouped"; + } + record CrawlsByFilename(long warcId, String filename, long crawlId, String crawlName, Long pandasInstanceId) { }