Skip to content

Commit

Permalink
browsertrix: Remove page limit
Browse files Browse the repository at this point in the history
The time and size limits should be enough, and the page limit is
restricting large crawls too much.
  • Loading branch information
ato committed Nov 21, 2024
1 parent b1b3e8b commit d7cbd63
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 10 deletions.
9 changes: 0 additions & 9 deletions gatherer/src/pandas/gatherer/heritrix/BrowsertrixConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
@ConfigurationProperties(prefix = "browsertrix")
public class BrowsertrixConfig {
private String podmanOptions;
private int pageLimit = 1000;
private String userAgentSuffix = "nla.gov.au_bot (National Library of Australia Legal Deposit Request; +http://www.nla.gov.au/legal-deposit/request)";
private int workers = 4;

Expand All @@ -24,14 +23,6 @@ public void setPodmanOptions(String podmanOptions) {
this.podmanOptions = podmanOptions;
}

/**
 * Returns the page limit configured for Browsertrix crawls.
 *
 * @return the current value of the {@code pageLimit} property
 */
public int getPageLimit() {
    return this.pageLimit;
}

/**
 * Sets the page limit configured for Browsertrix crawls.
 *
 * @param pageLimit the new value of the {@code pageLimit} property; stored
 *                  as given, with no validation performed here
 */
public void setPageLimit(int pageLimit) {
this.pageLimit = pageLimit;
}

/**
 * Returns the user-agent suffix configured for Browsertrix crawls.
 *
 * @return the current value of the {@code userAgentSuffix} property
 */
public String getUserAgentSuffix() {
    return this.userAgentSuffix;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ public void gather(Instance instance) throws Exception {
"--generatecdx",
"--logging", "none",
"--saveState", "always",
"--limit", String.valueOf(config.getPageLimit()),
"--depth", String.valueOf(depth)));

if (scope != null && scope.isIncludeSubdomains()) {
Expand Down

0 comments on commit d7cbd63

Please sign in to comment.