From f62451498dbbbd79e61b687b7882c57975a59817 Mon Sep 17 00:00:00 2001 From: Long Pham Date: Thu, 24 May 2018 00:12:17 -0500 Subject: [PATCH] minor change --- .idea/workspace.xml | 101 +++++++++--------- .../experiment/TestReadObject.java | 2 +- .../entitysearch/experiment/TestSelenium.java | 11 +- .../HTMLDocumentIngestionManager.java | 27 +++-- test_urls copy.csv | 3 + test_urls.csv | 4 +- 6 files changed, 85 insertions(+), 63 deletions(-) create mode 100644 test_urls copy.csv diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 80e769b..538a749 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -3,6 +3,7 @@ + @@ -34,8 +35,8 @@ - - + + @@ -43,6 +44,15 @@ + + + + + + + + + @@ -93,11 +103,11 @@ - + - - + + @@ -114,15 +124,6 @@ - - - - - - - - - @@ -214,7 +215,6 @@ @@ -362,7 +363,7 @@ - + - - @@ -454,12 +455,12 @@ - + - @@ -589,13 +590,6 @@ - - - - - - - @@ -822,13 +816,6 @@ - - - - - - - @@ -912,6 +899,20 @@ + + + + + + + + + + + + + + @@ -924,30 +925,30 @@ - + - - + + + + + - - + + - + - - - - - + + diff --git a/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java b/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java index ad3982d..42a3f16 100644 --- a/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java +++ b/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java @@ -14,7 +14,7 @@ public static void main (String[] args) { FileInputStream fin = null; ObjectInputStream ois = null; try { - fin = new FileInputStream("serialized/2.ser"); + fin = new FileInputStream("serialized/0.ser"); ois = new ObjectInputStream(fin); doc = (ESAnnotatedHTMLDocument) ois.readObject(); } catch (Exception e) { diff --git a/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java b/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java index f7bcb7b..57ba261 100644 --- a/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java +++ b/src/main/java/org/forward/entitysearch/experiment/TestSelenium.java @@ -7,6 +7,8 @@ import org.openqa.selenium.chrome.ChromeDriver; import org.openqa.selenium.chrome.ChromeOptions; import org.openqa.selenium.remote.RemoteWebElement; +import org.openqa.selenium.support.ui.ExpectedCondition; +import org.openqa.selenium.support.ui.WebDriverWait; import java.util.ArrayList; import java.util.List; @@ -59,7 +61,14 @@ public static void main(String[] args) { WebDriver driver = new ChromeDriver(options); String baseUrl = "http://www.forwarddatalab.org/kevinchang"; - + baseUrl = "https://charm.cs.illinois.edu/"; + WebDriverWait wait = new WebDriverWait(driver, 10); + wait.until(new ExpectedCondition() { + @Override + public Boolean apply(WebDriver webDriver) { + return driver.findElement(By.xpath("/html/body")).getText().length() != 0; + } + }); // launch browser and direct it to the Base URL driver.get(baseUrl); diff --git a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java index 13b0fd9..22ec64d 100644 --- a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java +++ b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java @@ -27,6 +27,7 @@ import java.util.stream.Stream; import org.apache.commons.cli.*; +import org.openqa.selenium.support.ui.WebDriverWait; public class HTMLDocumentIngestionManager { @@ -296,6 +297,11 @@ public static void main(String[] args) { System.out.println("Finish creating document for annotation " + (time-start)/1000 + " seconds"); start = time; + if (document == null) { + System.err.println("This URL cannot be rendered by Selenium " + document.getURL()); + continue; + } + if (document.get(CoreAnnotations.TokensAnnotation.class).size() <= 1) { System.err.println("This URL is probably not a web page " + document.getURL()); continue; @@ -400,18 +406,23 @@ private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url, driver.get(url); String pageTitle = driver.getTitle(); List allTokens = new ArrayList<>(); - RemoteWebElement e = (RemoteWebElement) driver.findElement(By.xpath("/html/body")); - travelDOMTreeWithSelenium(e,null,allTokens, driver); - ESAnnotatedHTMLDocument document = new ESAnnotatedHTMLDocument(allTokens); + try{ + RemoteWebElement e = (RemoteWebElement) driver.findElement(By.xpath("/html/body")); + travelDOMTreeWithSelenium(e,null,allTokens, driver); + ESAnnotatedHTMLDocument document = new ESAnnotatedHTMLDocument(allTokens); // List> allTokens = new ArrayList<>(); // travelDOMTreeWithSelenium2((RemoteWebElement)driver.findElement(By.xpath("/html/body")),null,allTokens, driver); // ESAnnotatedHTMLDocument document = new ESAnnotatedHTMLDocument(); // document.loadFromTokens(allTokens); - document.setURL(url); - document.setTitle(pageTitle); - document.setHeight(e.getSize().height); - document.setWidth(e.getSize().width); - return document; + document.setURL(url); + document.setTitle(pageTitle); + document.setHeight(e.getSize().height); + document.setWidth(e.getSize().width); + return document; + } catch (Exception ex) { + ex.printStackTrace(); + } + return null; } private static void printAnnotatedDocument(ESAnnotatedHTMLDocument document) { diff --git a/test_urls copy.csv b/test_urls copy.csv new file mode 100644 index 0000000..5516c0a --- /dev/null +++ b/test_urls copy.csv @@ -0,0 +1,3 @@ +0 https://cs.illinois.edu/about-us/awards/faculty-awards/chairs-and-professorships/founder-professor-engineering +1 http://www.forwarddatalab.org/kevinchang +2 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb diff --git a/test_urls.csv b/test_urls.csv index d45c627..c923aaa 100644 --- a/test_urls.csv +++ b/test_urls.csv @@ -1,3 +1 @@ -0 http://www.forwarddatalab.org/kevinchang -1 http://www.forwarddatalab.org/research -2 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb +0 http://charm.cs.illinois.edu/research/episim \ No newline at end of file