Skip to content

Commit

Permalink
Bring up to date with Heritrix 3.4.0-20200304
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Mar 4, 2020
1 parent 035fc00 commit 7f0d99e
Show file tree
Hide file tree
Showing 8 changed files with 63 additions and 10 deletions.
2 changes: 1 addition & 1 deletion integration-test/robot/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM ukwa/ukwa-manage
FROM ukwa/crawl-streams

COPY requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
Expand Down
2 changes: 1 addition & 1 deletion integration-test/robot/tests/crawl-test-site.robot
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Library Process
*** Test Cases ***
Launch first test crawl
Sleep 30s Waiting for 20s to give Kafka time to start up...
${result}= Run Process submit -k kafka:9092 -S -R fc.tocrawl http://crawl-test-site.webarchive.org.uk shell=yes
${result}= Run Process submit -k kafka:9092 -S -R fc.tocrawl -p 2 http://crawl-test-site.webarchive.org.uk shell=yes
Should Not Contain ${result.stderr} Traceback
Log ${result.stdout}
Log ${result.stderr}
Expand Down
2 changes: 1 addition & 1 deletion jobs/frequent/crawler-beans.cxml
Original file line number Diff line number Diff line change
Expand Up @@ -817,7 +817,7 @@
<property name="forceQueueAssignment" value="" />
<property name="deferToPrevious" value="true" />
<property name="parallelQueues" value="1" />
<property name="parallelQueuesRandomAssignment" value="true"/>
<property name="parallelQueuesRandomAssignment" value="true" />
</bean>

<!-- URI PRECEDENCE POLICY -->
Expand Down
3 changes: 0 additions & 3 deletions jobs/frequent/sheets.xml
Original file line number Diff line number Diff line change
Expand Up @@ -456,23 +456,20 @@
<property name="map">
<map>
<entry key="queueAssignmentPolicy.parallelQueues" value="2" />
<entry key="queueAssignmentPolicy.parallelQueuesRandomAssignment" value="true" />
</map>
</property>
</bean>
<bean id="parallel-queues-4" class="org.archive.spring.Sheet">
<property name="map">
<map>
<entry key="queueAssignmentPolicy.parallelQueues" value="4" />
<entry key="queueAssignmentPolicy.parallelQueuesRandomAssignment" value="true" />
</map>
</property>
</bean>
<bean id="parallel-queues-8" class="org.archive.spring.Sheet">
<property name="map">
<map>
<entry key="queueAssignmentPolicy.parallelQueues" value="8" />
<entry key="queueAssignmentPolicy.parallelQueuesRandomAssignment" value="true" />
</map>
</property>
</bean>
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
<properties>
<jdk.version>1.8</jdk.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- heritrix.version>3.4.0-20190418</heritrix.version -->
<heritrix.version>3.4.0-SNAPSHOT</heritrix.version>
<heritrix.version>3.4.0-20200304</heritrix.version>
<!-- heritrix.version>3.4.0-SNAPSHOT</heritrix.version -->
</properties>
<build>
<plugins>
Expand Down
28 changes: 28 additions & 0 deletions src/main/java/uk/bl/wap/crawler/h3/frontier/RedisFrontier.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

Expand All @@ -21,8 +23,10 @@
import org.archive.crawler.datamodel.UriUniqFilter;
import org.archive.crawler.event.CrawlURIDispositionEvent;
import org.archive.crawler.frontier.AbstractFrontier;
import org.archive.crawler.frontier.WorkQueue;
import org.archive.modules.CrawlURI;
import org.archive.spring.KeyedProperties;
import org.archive.util.ObjectIdentityCache;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
Expand Down Expand Up @@ -501,4 +505,28 @@ private void setQueueDelay(CrawlURI curi, long fetchTime) {
}
}

/**
 * Placeholder implementation of the pending-URI export: it exports nothing.
 *
 * @param writer destination for the export; this implementation never
 *               writes to it
 * @return always {@code 0} (presumably the number of URIs exported; confirm
 *         against the overridden declaration)
 */
@Override
public long exportPendingUris(PrintWriter writer) {
// TODO Not implemented for this frontier yet: the writer is left untouched.
return 0;
}

/**
 * Placeholder implementation: this frontier does not expose a queue cache.
 *
 * @return always {@code null}
 */
@Override
public ObjectIdentityCache<WorkQueue> getAllQueues() {
// TODO Not implemented for this frontier yet.
// NOTE(review): any caller that dereferences the result will throw a
// NullPointerException; confirm no such caller is reachable.
return null;
}

/**
 * Placeholder implementation: this frontier does not expose a ready queue.
 *
 * @return always {@code null}
 */
@Override
public BlockingQueue<String> getReadyClassQueues() {
// TODO Not implemented for this frontier yet.
// NOTE(review): any caller that dereferences the result will throw a
// NullPointerException; confirm no such caller is reachable.
return null;
}

/**
 * Placeholder implementation: this frontier does not expose its in-process
 * queues.
 *
 * @return always {@code null}
 */
@Override
public Set<WorkQueue> getInProcessQueues() {
// TODO Not implemented for this frontier yet.
// NOTE(review): any caller that dereferences the result will throw a
// NullPointerException; confirm no such caller is reachable.
return null;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
*/
package uk.bl.wap.crawler.h3.frontier;

import java.io.PrintWriter;
import java.util.Queue;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.BlockingQueue;

import javax.management.openmbean.CompositeData;

//import org.apache.commons.collections4.trie.PatriciaTrie;
import org.archive.crawler.frontier.WorkQueue;
import org.archive.crawler.frontier.WorkQueueFrontier;
import org.archive.modules.CrawlURI;
import org.archive.util.ObjectIdentityCache;

import com.sleepycat.je.DatabaseException;

Expand Down Expand Up @@ -108,4 +112,28 @@ protected boolean workQueueDataOnDisk() {
return true;
}

/**
 * Placeholder implementation of the pending-URI export: it exports nothing.
 *
 * @param writer destination for the export; this implementation never
 *               writes to it
 * @return always {@code 0} (presumably the number of URIs exported; confirm
 *         against the overridden declaration)
 */
@Override
public long exportPendingUris(PrintWriter writer) {
// TODO Not implemented for this frontier yet: the writer is left untouched.
return 0;
}

/**
 * Placeholder implementation: no queue cache is returned.
 *
 * @return always {@code null}
 */
@Override
public ObjectIdentityCache<WorkQueue> getAllQueues() {
// TODO Not implemented for this frontier yet.
// NOTE(review): any caller that dereferences the result will throw a
// NullPointerException; confirm no such caller is reachable.
return null;
}

/**
 * Placeholder implementation: no ready queue is returned.
 *
 * @return always {@code null}
 */
@Override
public BlockingQueue<String> getReadyClassQueues() {
// TODO Not implemented for this frontier yet.
// NOTE(review): any caller that dereferences the result will throw a
// NullPointerException; confirm no such caller is reachable.
return null;
}

/**
 * Placeholder implementation: no set of in-process queues is returned.
 *
 * @return always {@code null}
 */
@Override
public Set<WorkQueue> getInProcessQueues() {
// TODO Not implemented for this frontier yet.
// NOTE(review): any caller that dereferences the result will throw a
// NullPointerException; confirm no such caller is reachable.
return null;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
import org.archive.modules.Processor;
import org.archive.modules.net.ServerCache;
import org.archive.modules.postprocessor.CrawlLogJsonBuilder;
import org.archive.modules.postprocessor.KafkaCrawlLogFeed;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;
Expand All @@ -62,7 +61,8 @@
* Sends messages with a key (the CrawlURI classKey by default).
*
*
* @see KafkaCrawlLogFeed (which this implementation is based upon)
* @see org.archive.modules.postprocessor.KafkaCrawlLogFeed (which this
* implementation is based upon)
* @see UriProcessingFormatter
* @author nlevitt, anjackson
*/
Expand Down

0 comments on commit 7f0d99e

Please sign in to comment.