-
Notifications
You must be signed in to change notification settings - Fork 134
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Minimum delay between requests based on download finished time
- Modified PolitenessScheduler to compute the delay between same-domain requests based on the time when the download finished. - Refactored FetchedResultHandler to simply notify the LinkStorage that the download finished and to delegate data processing to other handlers for the appropriate link type. - Fixed unit and integration tests to consider the new scheduler workflow, which requires a notification when the download has finished. Fixes #179.
- Loading branch information
Showing
19 changed files
with
539 additions
and
338 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 42 additions & 41 deletions
83
src/main/java/focusedCrawler/crawler/async/FetchedResultHandler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,64 +1,65 @@ | ||
package focusedCrawler.crawler.async; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import focusedCrawler.crawler.async.HttpDownloader.Callback; | ||
import focusedCrawler.crawler.crawlercommons.fetcher.AbortedFetchException; | ||
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult; | ||
import focusedCrawler.link.LinkStorage; | ||
import focusedCrawler.link.frontier.LinkRelevance; | ||
import focusedCrawler.target.TargetStorage; | ||
import focusedCrawler.target.model.Page; | ||
import focusedCrawler.target.model.ParsedData; | ||
import focusedCrawler.util.parser.PaginaURL; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class FetchedResultHandler implements HttpDownloader.Callback { | ||
|
||
private static final Logger logger = LoggerFactory.getLogger(FetchedResultHandler.class); | ||
|
||
private String crawlerId; | ||
private TargetStorage targetStorage; | ||
|
||
public FetchedResultHandler(String crawlerId, TargetStorage targetStorage) { | ||
this.crawlerId = crawlerId; | ||
this.targetStorage = targetStorage; | ||
private final SitemapXmlHandler sitemapXmlHandler; | ||
private final ForwardLinkHandler forwardLinkHandler; | ||
private final RobotsTxtHandler robotsTxtHandler; | ||
private LinkStorage linkStorage; | ||
|
||
public FetchedResultHandler(String crawlerId, TargetStorage targetStorage, | ||
LinkStorage linkStorage, String userAgentName) { | ||
this.linkStorage = linkStorage; | ||
this.forwardLinkHandler = new ForwardLinkHandler(crawlerId, targetStorage); | ||
this.sitemapXmlHandler = new SitemapXmlHandler(linkStorage); | ||
this.robotsTxtHandler = new RobotsTxtHandler(linkStorage, userAgentName); | ||
} | ||
|
||
@Override | ||
public void completed(LinkRelevance link, FetchedResult response) { | ||
|
||
int statusCode = response.getStatusCode(); | ||
if(statusCode >= 200 && statusCode < 300) { | ||
processData(link, response); | ||
} | ||
//else { | ||
// TODO: Update metadata about page visits in link storage | ||
//} | ||
linkStorage.notifyDownloadFinished(link); | ||
Callback handler = getDownloadHandler(link); | ||
handler.completed(link, response); | ||
} | ||
|
||
@Override | ||
public void failed(LinkRelevance link, Exception e) { | ||
if(e instanceof AbortedFetchException) { | ||
linkStorage.notifyDownloadFinished(link); | ||
if (e instanceof AbortedFetchException) { | ||
AbortedFetchException afe = (AbortedFetchException) e; | ||
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(), afe.getAbortReason()); | ||
logger.info("Download aborted: \n>URL: {}\n>Reason: {}", link.getURL().toString(), | ||
afe.getAbortReason()); | ||
} else { | ||
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(), e.getMessage()); | ||
logger.info("Failed to download URL: {}\n>Reason: {}", link.getURL().toString(), | ||
e.getMessage()); | ||
} | ||
Callback handler = getDownloadHandler(link); | ||
handler.failed(link, e); | ||
} | ||
|
||
private void processData(LinkRelevance link, FetchedResult response) { | ||
try { | ||
Page page = new Page(response); | ||
page.setLinkRelevance(link); | ||
page.setCrawlerId(crawlerId); | ||
if (page.isHtml()) { | ||
PaginaURL pageParser = new PaginaURL(page); | ||
page.setParsedData(new ParsedData(pageParser)); | ||
} | ||
targetStorage.insert(page); | ||
|
||
} catch (Exception e) { | ||
logger.error("Problem while processing data.", e); | ||
|
||
private Callback getDownloadHandler(LinkRelevance link) { | ||
switch (link.getType()) { | ||
case FORWARD: | ||
return forwardLinkHandler; | ||
case ROBOTS: | ||
return robotsTxtHandler; | ||
case SITEMAP: | ||
return sitemapXmlHandler; | ||
default: | ||
// There should be a handler for each link type, so this shouldn't happen | ||
throw new IllegalStateException("No handler for link type: " + link.getType()); | ||
} | ||
} | ||
|
||
} | ||
} |
57 changes: 57 additions & 0 deletions
57
src/main/java/focusedCrawler/crawler/async/ForwardLinkHandler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package focusedCrawler.crawler.async; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import focusedCrawler.crawler.crawlercommons.fetcher.FetchedResult; | ||
import focusedCrawler.link.frontier.LinkRelevance; | ||
import focusedCrawler.target.TargetStorage; | ||
import focusedCrawler.target.model.Page; | ||
import focusedCrawler.target.model.ParsedData; | ||
import focusedCrawler.util.parser.PaginaURL; | ||
|
||
public class ForwardLinkHandler implements HttpDownloader.Callback { | ||
|
||
private static final Logger logger = LoggerFactory.getLogger(ForwardLinkHandler.class); | ||
|
||
private String crawlerId; | ||
private TargetStorage targetStorage; | ||
|
||
public ForwardLinkHandler(String crawlerId, TargetStorage targetStorage) { | ||
this.crawlerId = crawlerId; | ||
this.targetStorage = targetStorage; | ||
} | ||
|
||
@Override | ||
public void completed(LinkRelevance link, FetchedResult response) { | ||
|
||
int statusCode = response.getStatusCode(); | ||
if (statusCode >= 200 && statusCode < 300) { | ||
processPage(link, response); | ||
} | ||
//else { | ||
// TODO: Update metadata about page visits in link storage | ||
//} | ||
} | ||
|
||
@Override | ||
public void failed(LinkRelevance link, Exception e) { | ||
} | ||
|
||
private void processPage(LinkRelevance link, FetchedResult response) { | ||
try { | ||
Page page = new Page(response); | ||
page.setLinkRelevance(link); | ||
page.setCrawlerId(crawlerId); | ||
if (page.isHtml()) { | ||
PaginaURL pageParser = new PaginaURL(page); | ||
page.setParsedData(new ParsedData(pageParser)); | ||
} | ||
targetStorage.insert(page); | ||
|
||
} catch (Exception e) { | ||
logger.error("Problem while processing data.", e); | ||
} | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.