Minimum delay between requests based on download finished time
- Modified PolitenessScheduler to compute the delay between
  same-domain requests based on the time when the download
  finished
- Refactored FetchedResultHandler to simply notify the
  LinkStorage that the download finished and to delegate data
  processing to other handlers for the appropriate link type
- Fixed unit and integration tests to account for the new
  scheduler workflow, which requires a notification when the
  download has finished

Fixes #179.
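
The gist of the scheduler change, as a minimal sketch in plain Java (the class and method names below are illustrative, not ACHE's actual PolitenessScheduler API): the minimum per-domain delay is now measured from the moment the previous download finished, which is why the handlers in this commit call LinkStorage.notifyDownloadFinished(link) as their first step.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch: enforce a minimum delay per domain counted from the
// time the previous download *finished*, not from when it was scheduled.
public class FinishTimeDelayTracker {

    private final long minDelayMillis;
    private final Map<String, Long> lastFinishTimes = new ConcurrentHashMap<>();

    public FinishTimeDelayTracker(long minDelayMillis) {
        this.minDelayMillis = minDelayMillis;
    }

    // Called when a download for this domain completes, successfully or not.
    public void notifyDownloadFinished(String domain) {
        lastFinishTimes.put(domain, System.currentTimeMillis());
    }

    // A link from this domain may be selected again only after the minimum
    // delay has elapsed since the last download finished.
    public boolean canDownloadNow(String domain) {
        Long finishedAt = lastFinishTimes.get(domain);
        return finishedAt == null
                || System.currentTimeMillis() - finishedAt >= minDelayMillis;
    }
}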
aecio committed Apr 22, 2019
1 parent 4e1f46d commit 5a0f441
Showing 19 changed files with 539 additions and 338 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -18,6 +18,12 @@
- Upgrade `commons-validator` library to version 1.6
- Upgrade `okhttp3` library to version 3.14.0
- Fix issue #177: Links from recent TLDs are considered invalid
- Minimum delay between requests now considers the time when the download is
  actually finished, not the time when the URL was initially scheduled to be
  downloaded (which disregarded other processing time between scheduling and
  the actual download)
- Refactor `FetchedResultHandler` to simply notify the LinkStorage that
  the download finished and to delegate data processing to other handlers
  for the appropriate link type

## Version 0.11.0

24 changes: 8 additions & 16 deletions src/main/java/focusedCrawler/crawler/async/AsyncCrawler.java
@@ -2,15 +2,13 @@

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.util.concurrent.AbstractExecutionThreadService;

import focusedCrawler.config.Configuration;
import focusedCrawler.crawler.async.HttpDownloader.Callback;
import focusedCrawler.crawler.cookies.Cookie;
import focusedCrawler.crawler.cookies.CookieUtils;
import focusedCrawler.link.LinkStorage;
@@ -26,9 +24,9 @@ public class AsyncCrawler extends AbstractExecutionThreadService {
private final TargetStorage targetStorage;
private final LinkStorage linkStorage;
private final HttpDownloader downloader;
private final Map<LinkRelevance.Type, HttpDownloader.Callback> handlers = new HashMap<>();
private MetricsManager metricsManager;
private Configuration config;
private final FetchedResultHandler fetchedResultHandler;
private final MetricsManager metricsManager;
private final Configuration config;

public AsyncCrawler(String crawlerId, TargetStorage targetStorage, LinkStorage linkStorage,
Configuration config, String dataPath, MetricsManager metricsManager) {
@@ -41,10 +39,9 @@ public AsyncCrawler(String crawlerId, TargetStorage targetStorage, LinkStorage l
HttpDownloaderConfig downloaderConfig = config.getCrawlerConfig().getDownloaderConfig();
this.downloader = new HttpDownloader(downloaderConfig, dataPath, metricsManager);

this.handlers.put(LinkRelevance.Type.FORWARD, new FetchedResultHandler(crawlerId, targetStorage));
this.handlers.put(LinkRelevance.Type.SITEMAP, new SitemapXmlHandler(linkStorage));
this.handlers.put(LinkRelevance.Type.ROBOTS, new RobotsTxtHandler(linkStorage,
downloaderConfig.getUserAgentName()));
String userAgentName = downloaderConfig.getUserAgentName();
this.fetchedResultHandler = new FetchedResultHandler(crawlerId, targetStorage, linkStorage,
userAgentName);

Runtime.getRuntime().addShutdownHook(new Thread() {
public void run() {
@@ -58,14 +55,9 @@ public void run() {
protected void run() {
while (isRunning()) {
try {
LinkRelevance link = (LinkRelevance) linkStorage.select(null);
LinkRelevance link = linkStorage.select();
if (link != null) {
Callback handler = handlers.get(link.getType());
if (handler == null) {
logger.error("No registered handler for link type: " + link.getType());
continue;
}
downloader.dipatchDownload(link, handler);
downloader.dipatchDownload(link, fetchedResultHandler);
}
} catch (DataNotFoundException e) {
// There are no more links available in the frontier right now
src/main/java/focusedCrawler/crawler/async/AsyncCrawlerConfig.java
@@ -3,7 +3,6 @@
import java.io.IOException;

import com.fasterxml.jackson.annotation.JsonUnwrapped;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

@@ -16,7 +15,7 @@ public AsyncCrawlerConfig() {
// Required for de-serialization
}

public AsyncCrawlerConfig(JsonNode config, ObjectMapper objectMapper) throws JsonProcessingException, IOException {
public AsyncCrawlerConfig(JsonNode config, ObjectMapper objectMapper) throws IOException {
objectMapper.readerForUpdating(this).readValue(config);
}

src/main/java/focusedCrawler/crawler/async/FetchedResultHandler.java
@@ -1,64 +1,65 @@
package focusedCrawler.crawler.async;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.crawler.async.HttpDownloader.Callback;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.TargetStorage;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.PaginaURL;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FetchedResultHandler implements HttpDownloader.Callback {

private static final Logger logger = LoggerFactory.getLogger(FetchedResultHandler.class);

private String crawlerId;
private TargetStorage targetStorage;

public FetchedResultHandler(String crawlerId, TargetStorage targetStorage) {
this.crawlerId = crawlerId;
this.targetStorage = targetStorage;
private final SitemapXmlHandler sitemapXmlHandler;
private final ForwardLinkHandler forwardLinkHandler;
private final RobotsTxtHandler robotsTxtHandler;
private LinkStorage linkStorage;

public FetchedResultHandler(String crawlerId, TargetStorage targetStorage,
LinkStorage linkStorage, String userAgentName) {
this.linkStorage = linkStorage;
this.forwardLinkHandler = new ForwardLinkHandler(crawlerId, targetStorage);
this.sitemapXmlHandler = new SitemapXmlHandler(linkStorage);
this.robotsTxtHandler = new RobotsTxtHandler(linkStorage, userAgentName);
}

@Override
public void completed(LinkRelevance link, FetchedResult response) {

int statusCode = response.getStatusCode();
if(statusCode >= 200 && statusCode < 300) {
processData(link, response);
}
//else {
// TODO: Update metadata about page visits in link storage
//}
linkStorage.notifyDownloadFinished(link);
Callback handler = getDownloadHandler(link);
handler.completed(link, response);
}

@Override
public void failed(LinkRelevance link, Exception e) {
if(e instanceof AbortedFetchException) {
linkStorage.notifyDownloadFinished(link);
if (e instanceof AbortedFetchException) {
AbortedFetchException afe = (AbortedFetchException) e;
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(), afe.getAbortReason());
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(),
afe.getAbortReason());
} else {
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(), e.getMessage());
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(),
e.getMessage());
}
Callback handler = getDownloadHandler(link);
handler.failed(link, e);
}

private void processData(LinkRelevance link, FetchedResult response) {
try {
Page page = new Page(response);
page.setLinkRelevance(link);
page.setCrawlerId(crawlerId);
if (page.isHtml()) {
PaginaURL pageParser = new PaginaURL(page);
page.setParsedData(new ParsedData(pageParser));
}
targetStorage.insert(page);

} catch (Exception e) {
logger.error("Problem while processing data.", e);

private Callback getDownloadHandler(LinkRelevance link) {
switch (link.getType()) {
case FORWARD:
return forwardLinkHandler;
case ROBOTS:
return robotsTxtHandler;
case SITEMAP:
return sitemapXmlHandler;
default:
// There should be a handler for each link type, so this shouldn't happen
throw new IllegalStateException("No handler for link type: " + link.getType());
}
}

}
}
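
For reference, each of these handlers implements the same HttpDownloader.Callback contract, whose two methods can be read off the @Override signatures in the diff above. A minimal skeleton (the class name and comment bodies are illustrative, not part of this commit):

package focusedCrawler.crawler.async;

import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;

// Hypothetical no-op handler showing the Callback contract; real handlers,
// such as ForwardLinkHandler below, check the HTTP status code and then
// process the fetched payload.
public class NoOpResultHandler implements HttpDownloader.Callback {

    @Override
    public void completed(LinkRelevance link, FetchedResult response) {
        // Inspect response.getStatusCode() and process the content here.
    }

    @Override
    public void failed(LinkRelevance link, Exception e) {
        // Record or log the failure here.
    }
}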
57 changes: 57 additions & 0 deletions src/main/java/focusedCrawler/crawler/async/ForwardLinkHandler.java
@@ -0,0 +1,57 @@
package focusedCrawler.crawler.async;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.target.TargetStorage;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.PaginaURL;

public class ForwardLinkHandler implements HttpDownloader.Callback {

private static final Logger logger = LoggerFactory.getLogger(ForwardLinkHandler.class);

private String crawlerId;
private TargetStorage targetStorage;

public ForwardLinkHandler(String crawlerId, TargetStorage targetStorage) {
this.crawlerId = crawlerId;
this.targetStorage = targetStorage;
}

@Override
public void completed(LinkRelevance link, FetchedResult response) {

int statusCode = response.getStatusCode();
if (statusCode >= 200 && statusCode < 300) {
processPage(link, response);
}
//else {
// TODO: Update metadata about page visits in link storage
//}
}

@Override
public void failed(LinkRelevance link, Exception e) {
}

private void processPage(LinkRelevance link, FetchedResult response) {
try {
Page page = new Page(response);
page.setLinkRelevance(link);
page.setCrawlerId(crawlerId);
if (page.isHtml()) {
PaginaURL pageParser = new PaginaURL(page);
page.setParsedData(new ParsedData(pageParser));
}
targetStorage.insert(page);

} catch (Exception e) {
logger.error("Problem while processing data.", e);
}
}

}
46 changes: 18 additions & 28 deletions src/main/java/focusedCrawler/crawler/async/RobotsTxtHandler.java
@@ -8,7 +8,6 @@

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException;
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.LinkRelevance;
@@ -26,70 +25,61 @@ public RobotsData(LinkRelevance link, SimpleRobotRules robotRules) {
this.robotRules = robotRules;
}
}

private static final Logger logger = LoggerFactory.getLogger(RobotsTxtHandler.class);

private SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
private LinkStorage linkStorage;
private String userAgentName;

public RobotsTxtHandler(LinkStorage linkStorage, String userAgentName) {
this.linkStorage = linkStorage;
this.userAgentName = userAgentName;
}

@Override
public void completed(LinkRelevance link, FetchedResult response) {
int statusCode = response.getStatusCode();
if(statusCode >= 200 && statusCode < 300) {
logger.info("Successfully downloaded URL=["+response.getBaseUrl()+"] HTTP-Response-Code="+statusCode);
if (statusCode >= 200 && statusCode < 300) {
// HTTP 2xx means the request was successful
processRobot(link, response, false);
} else {
logger.info("Server returned bad code for URL=["+response.getBaseUrl()+"] HTTP-Response-Code="+statusCode);
processRobot(link, response, true);
}
}

@Override
public void failed(LinkRelevance link, Exception e) {
if(e instanceof AbortedFetchException) {
AbortedFetchException afe = (AbortedFetchException) e;
logger.info("Download aborted: \n>URL: {}\n>Reason: {}",
link.getURL().toString(), afe.getAbortReason());
} else {
logger.info("Failed to download URL: "+link.getURL().toString(), e.getMessage());
}
processRobot(link, null, true);
}

private void processRobot(LinkRelevance link, FetchedResult response, boolean fetchFailed) {

SimpleRobotRules robotRules;
if(fetchFailed || response == null) {
robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
}
else {
if (fetchFailed || response == null) {
robotRules = parser.failedFetch(HttpStatus.SC_GONE);
} else {
String contentType = response.getContentType();
boolean isPlainText = (contentType != null) && (contentType.startsWith("text/plain"));
if ((response.getNumRedirects() > 0) && !isPlainText) {
robotRules = (SimpleRobotRules) parser.failedFetch(HttpStatus.SC_GONE);
robotRules = parser.failedFetch(HttpStatus.SC_GONE);
} else {
robotRules = (SimpleRobotRules) parser.parseContent(
robotRules = parser.parseContent(
response.getFetchedUrl(),
response.getContent(),
response.getContentType(),
userAgentName
userAgentName
);
}
}

try {
RobotsData robotsData = new RobotsData(link, robotRules);
linkStorage.insert(robotsData);
} catch (Exception e) {
logger.error("Failed to insert robots.txt data into link storage.", e);
}

}

}