From 27f85ca425173a2ee6011a255b5522cdeec4c003 Mon Sep 17 00:00:00 2001 From: Long Pham Date: Thu, 24 May 2018 14:06:58 -0500 Subject: [PATCH] add abortion for too long docs --- .idea/workspace.xml | 40 ++++++++++--------- .../entitysearch/experiment/TestSelenium.java | 8 +++- .../HTMLDocumentIngestionManager.java | 6 +++ test_urls.csv | 1 + 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index ad17103..c2cf28f 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,7 +2,9 @@ + + @@ -29,11 +31,11 @@ - + - - + + @@ -41,11 +43,11 @@ - + - - + + @@ -156,7 +158,7 @@ System. System myLabel - + After annotation serial im total @@ -222,9 +224,9 @@ @@ -452,12 +454,12 @@ - + - @@ -922,13 +924,6 @@ - - - - - - - @@ -941,14 +936,21 @@ - - + + + + + + + + + diff --git a/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java b/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java index 44762f8..5f3fe0e 100644 --- a/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java +++ b/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java @@ -61,8 +61,10 @@ public static void main(String[] args) { WebDriver driver = new ChromeDriver(options); String baseUrl = "http://www.forwarddatalab.org/kevinchang"; - baseUrl = "https://charm.cs.illinois.edu/"; - baseUrl = "http://codingspectator.cs.illinois.edu/updates/helios/"; +// baseUrl = "https://charm.cs.illinois.edu/"; +// baseUrl = "http://codingspectator.cs.illinois.edu/updates/helios/"; +// baseUrl = "http://nlp.cs.illinois.edu/HockenmaierGroup/8k-pictures.html"; + baseUrl = "http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html"; // WebDriverWait wait = new WebDriverWait(driver, 10); // wait.until(new ExpectedCondition() { // @Override @@ -72,6 +74,8 @@ public static void main(String[] args) { // }); // launch browser and direct it to the Base URL driver.get(baseUrl); + System.out.println(driver.getPageSource().length()); + System.exit(0); // get the title and print it String pageTitle = null; diff --git a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java index 06f7f48..57305b2 100644 --- a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java +++ b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java @@ -23,6 +23,7 @@ public class HTMLDocumentIngestionManager { public static final String WEB_DOCUMENT_CANNOT_BE_OPENED_ERROR = "oOo This web document cannot be opened by browser oOo"; + public static final int MAX_LENGTH_OF_PAGE_SOURCE = 400000; public static ArrayList LIST_OF_TAGS_CREATING_NEW_LINES = new ArrayList<>(); static { @@ -460,6 +461,11 @@ private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url, } else { CUR_URL = currentUrl; } + if (driver.getPageSource().length() > MAX_LENGTH_OF_PAGE_SOURCE) { + System.out.println("This document is too long!"); + System.err.println("This document is too long " + url); + return null; + } String pageTitle = driver.getTitle(); List allTokens = new ArrayList<>(); RemoteWebElement e; diff --git a/test_urls.csv b/test_urls.csv index 6a0edff..36b5f4d 100644 --- a/test_urls.csv +++ b/test_urls.csv @@ -2,4 +2,5 @@ 1 https://relate.cs.illinois.edu/video/cs598ak-f13/html/player.html?descriptor=metadata/lec07.json 2 http://codingspectator.cs.illinois.edu/composite-refactorings-updates/ 3 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb +5 http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html 4 http://www.forwarddatalab.org/kevinchang \ No newline at end of file