From 7f0d99e759455863875cb9bc0b0421f8162473ac Mon Sep 17 00:00:00 2001 From: Andrew Jackson Date: Wed, 4 Mar 2020 23:24:43 +0000 Subject: [PATCH] Bring up to date with Heritrix 3.4.0-20200304 --- integration-test/robot/Dockerfile | 2 +- .../robot/tests/crawl-test-site.robot | 2 +- jobs/frequent/crawler-beans.cxml | 2 +- jobs/frequent/sheets.xml | 3 -- pom.xml | 4 +-- .../crawler/h3/frontier/RedisFrontier.java | 28 +++++++++++++++++++ .../h3/frontier/RedisWorkQueueFrontier.java | 28 +++++++++++++++++++ .../postprocessor/KafkaKeyedCrawlLogFeed.java | 4 +-- 8 files changed, 63 insertions(+), 10 deletions(-) diff --git a/integration-test/robot/Dockerfile b/integration-test/robot/Dockerfile index 23ea9d8..e7d96e0 100644 --- a/integration-test/robot/Dockerfile +++ b/integration-test/robot/Dockerfile @@ -1,4 +1,4 @@ -FROM ukwa/ukwa-manage +FROM ukwa/crawl-streams COPY requirements.txt /tmp/requirements.txt RUN pip install -r /tmp/requirements.txt diff --git a/integration-test/robot/tests/crawl-test-site.robot b/integration-test/robot/tests/crawl-test-site.robot index 4855b68..6c4aef5 100644 --- a/integration-test/robot/tests/crawl-test-site.robot +++ b/integration-test/robot/tests/crawl-test-site.robot @@ -5,7 +5,7 @@ Library Process *** Test Cases *** Launch first test crawl Sleep 30s Waiting for 20s to give Kafka time to start up... - ${result}= Run Process submit -k kafka:9092 -S -R fc.tocrawl http://crawl-test-site.webarchive.org.uk shell=yes + ${result}= Run Process submit -k kafka:9092 -S -R fc.tocrawl -p 2 http://crawl-test-site.webarchive.org.uk shell=yes Should Not Contain ${result.stderr} Traceback Log ${result.stdout} Log ${result.stderr} diff --git a/jobs/frequent/crawler-beans.cxml b/jobs/frequent/crawler-beans.cxml index aec4743..b80747e 100644 --- a/jobs/frequent/crawler-beans.cxml +++ b/jobs/frequent/crawler-beans.cxml @@ -817,7 +817,7 @@ - + diff --git a/jobs/frequent/sheets.xml b/jobs/frequent/sheets.xml index fde9d60..2d784f2 100644 --- a/jobs/frequent/sheets.xml +++ b/jobs/frequent/sheets.xml @@ -456,7 +456,6 @@ - @@ -464,7 +463,6 @@ - @@ -472,7 +470,6 @@ - diff --git a/pom.xml b/pom.xml index d5e58d6..bd2d098 100755 --- a/pom.xml +++ b/pom.xml @@ -7,8 +7,8 @@ 1.8 UTF-8 - - 3.4.0-SNAPSHOT + 3.4.0-20200304 + diff --git a/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisFrontier.java b/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisFrontier.java index 900db3d..5a95312 100644 --- a/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisFrontier.java +++ b/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisFrontier.java @@ -13,6 +13,8 @@ import java.io.IOException; import java.io.PrintWriter; import java.util.Map; +import java.util.Set; +import java.util.concurrent.BlockingQueue; import java.util.logging.Level; import java.util.logging.Logger; @@ -21,8 +23,10 @@ import org.archive.crawler.datamodel.UriUniqFilter; import org.archive.crawler.event.CrawlURIDispositionEvent; import org.archive.crawler.frontier.AbstractFrontier; +import org.archive.crawler.frontier.WorkQueue; import org.archive.modules.CrawlURI; import org.archive.spring.KeyedProperties; +import org.archive.util.ObjectIdentityCache; import org.springframework.beans.BeansException; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.ApplicationContext; @@ -501,4 +505,28 @@ private void setQueueDelay(CrawlURI curi, long fetchTime) { } } + @Override + public long exportPendingUris(PrintWriter writer) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public ObjectIdentityCache getAllQueues() { + // TODO Auto-generated method stub + return null; + } + + @Override + public BlockingQueue getReadyClassQueues() { + // TODO Auto-generated method stub + return null; + } + + @Override + public Set getInProcessQueues() { + // TODO Auto-generated method stub + return null; + } + } diff --git a/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisWorkQueueFrontier.java b/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisWorkQueueFrontier.java index 363f433..63dc157 100644 --- a/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisWorkQueueFrontier.java +++ b/src/main/java/uk/bl/wap/crawler/h3/frontier/RedisWorkQueueFrontier.java @@ -3,8 +3,11 @@ */ package uk.bl.wap.crawler.h3.frontier; +import java.io.PrintWriter; import java.util.Queue; +import java.util.Set; import java.util.SortedMap; +import java.util.concurrent.BlockingQueue; import javax.management.openmbean.CompositeData; @@ -12,6 +15,7 @@ import org.archive.crawler.frontier.WorkQueue; import org.archive.crawler.frontier.WorkQueueFrontier; import org.archive.modules.CrawlURI; +import org.archive.util.ObjectIdentityCache; import com.sleepycat.je.DatabaseException; @@ -108,4 +112,28 @@ protected boolean workQueueDataOnDisk() { return true; } + @Override + public long exportPendingUris(PrintWriter writer) { + // TODO Auto-generated method stub + return 0; + } + + @Override + public ObjectIdentityCache getAllQueues() { + // TODO Auto-generated method stub + return null; + } + + @Override + public BlockingQueue getReadyClassQueues() { + // TODO Auto-generated method stub + return null; + } + + @Override + public Set getInProcessQueues() { + // TODO Auto-generated method stub + return null; + } + } diff --git a/src/main/java/uk/bl/wap/crawler/postprocessor/KafkaKeyedCrawlLogFeed.java b/src/main/java/uk/bl/wap/crawler/postprocessor/KafkaKeyedCrawlLogFeed.java index c836fac..1543bae 100644 --- a/src/main/java/uk/bl/wap/crawler/postprocessor/KafkaKeyedCrawlLogFeed.java +++ b/src/main/java/uk/bl/wap/crawler/postprocessor/KafkaKeyedCrawlLogFeed.java @@ -46,7 +46,6 @@ import org.archive.modules.Processor; import org.archive.modules.net.ServerCache; import org.archive.modules.postprocessor.CrawlLogJsonBuilder; -import org.archive.modules.postprocessor.KafkaCrawlLogFeed; import org.json.JSONObject; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.Lifecycle; @@ -62,7 +61,8 @@ * Sends messages with a key (the CrawlURI classKey by default). * * - * @see KafkaCrawlLogFeed (which this implementation is based upon) + * @see org.archive.modules.postprocessor.KafkaCrawlLogFeed (which this + * implementation is based upon) * @see UriProcessingFormatter * @author nlevitt, anjackson */