Skip to content

Commit

Permalink
browsertrix: Ensure a default time and size limit are applied
Browse files (browse the repository at this point in the history)
  • Loading branch information
ato committed Nov 21, 2024
1 parent d7cbd63 commit 40f1552
Showing 1 changed file with 11 additions and 4 deletions.
15 changes: 11 additions & 4 deletions gatherer/src/pandas/gatherer/heritrix/BrowsertrixGatherer.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,20 +92,27 @@ public void gather(Instance instance) throws Exception {
command.add("prefix");
}

command.add("--timeLimit");
command.add(String.valueOf(titleGather.getCrawlTimeLimitSeconds()));
long timeLimit = titleGather.getCrawlTimeLimitSeconds();
long sizeLimit = config.getDefaultCrawlLimitBytes();

Profile profile = titleGather.getActiveProfile();
if (profile != null) {
if (profile.getCrawlLimitSeconds() != null) {
timeLimit = profile.getCrawlLimitSeconds();
}

if (profile.getCrawlLimitBytes() != null) {
command.add("--sizeLimit");
command.add(String.valueOf(profile.getCrawlLimitBytes()));
sizeLimit = profile.getCrawlLimitBytes();
}
}

command.add("--timeLimit");
command.add(String.valueOf(timeLimit));

command.add("--sizeLimit");
command.add(String.valueOf(sizeLimit));


if (!Strings.isNullOrBlank(config.getUserAgentSuffix())) {
command.add("--userAgentSuffix");
command.add(config.getUserAgentSuffix());
Expand Down

0 comments on commit 40f1552

Please sign in to comment.