Skip to content

Commit

Permalink
Abort processing of documents that are too long
Browse files: browse the repository at this point in the history
  • Loading branch information
Long Pham committed May 24, 2018
1 parent 8e4390f commit 27f85ca
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 21 deletions.
40 changes: 21 additions & 19 deletions .idea/workspace.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,10 @@ public static void main(String[] args) {
WebDriver driver = new ChromeDriver(options);

String baseUrl = "http://www.forwarddatalab.org/kevinchang";
baseUrl = "https://charm.cs.illinois.edu/";
baseUrl = "http://codingspectator.cs.illinois.edu/updates/helios/";
// baseUrl = "https://charm.cs.illinois.edu/";
// baseUrl = "http://codingspectator.cs.illinois.edu/updates/helios/";
// baseUrl = "http://nlp.cs.illinois.edu/HockenmaierGroup/8k-pictures.html";
baseUrl = "http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html";
// WebDriverWait wait = new WebDriverWait(driver, 10);
// wait.until(new ExpectedCondition<Boolean>() {
// @Override
Expand All @@ -72,6 +74,8 @@ public static void main(String[] args) {
// });
// launch browser and direct it to the Base URL
driver.get(baseUrl);
System.out.println(driver.getPageSource().length());
System.exit(0);

// get the title and print it
String pageTitle = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
public class HTMLDocumentIngestionManager {

public static final String WEB_DOCUMENT_CANNOT_BE_OPENED_ERROR = "oOo This web document cannot be opened by browser oOo";
public static final int MAX_LENGTH_OF_PAGE_SOURCE = 400000;
public static ArrayList<String> LIST_OF_TAGS_CREATING_NEW_LINES = new ArrayList<>();

static {
Expand Down Expand Up @@ -460,6 +461,11 @@ private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url,
} else {
CUR_URL = currentUrl;
}
if (driver.getPageSource().length() > MAX_LENGTH_OF_PAGE_SOURCE) {
System.out.println("This document is too long!");
System.err.println("This document is too long " + url);
return null;
}
String pageTitle = driver.getTitle();
List<CoreLabel> allTokens = new ArrayList<>();
RemoteWebElement e;
Expand Down
1 change: 1 addition & 0 deletions test_urls.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
1 https://relate.cs.illinois.edu/video/cs598ak-f13/html/player.html?descriptor=metadata/lec07.json
2 http://codingspectator.cs.illinois.edu/composite-refactorings-updates/
3 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb
5 http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html
4 http://www.forwarddatalab.org/kevinchang

0 comments on commit 27f85ca

Please sign in to comment.