diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 3c9101d..80e769b 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,14 +2,9 @@ - - - - - - - + + @@ -36,11 +31,11 @@ - + - - + + @@ -78,7 +73,7 @@ - + @@ -98,11 +93,11 @@ - + - - + + @@ -229,9 +224,9 @@ @@ -367,7 +362,7 @@ - + - @@ -882,18 +877,6 @@ - - - - - - - - - - - - @@ -915,41 +898,53 @@ - + - - + + - + - - + + + + + + + + + + + - + - - + + - + - - + + + + + - + - - + + diff --git a/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java b/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java index a520fbe..ad3982d 100644 --- a/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java +++ b/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java @@ -14,13 +14,14 @@ public static void main (String[] args) { FileInputStream fin = null; ObjectInputStream ois = null; try { - fin = new FileInputStream("serialized/kevin.ser"); + fin = new FileInputStream("serialized/2.ser"); ois = new ObjectInputStream(fin); doc = (ESAnnotatedHTMLDocument) ois.readObject(); } catch (Exception e) { e.printStackTrace(); } if (doc != null) { + System.out.println(doc.getTitle()); System.out.println(doc.get(CoreAnnotations.TokensAnnotation.class).size()); for (CoreLabel token : doc.get(CoreAnnotations.TokensAnnotation.class)) { System.out.println(token.word() + " " + token.ner() + " " + diff --git a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java index bbfdc88..13b0fd9 100644 --- a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java +++ b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java @@ -266,12 +266,6 @@ public static void main(String[] args) { long time = System.currentTimeMillis(); long start = time; - WebDriver driver = createChromeDriver(); -// System.out.println(getAllTextWithLayout(driver,baseUrl)); - time = System.currentTimeMillis(); - System.out.println("Finish loading web driver " + (time-start)/1000 + " seconds"); - start = time; - AnnotatorFactory.getInstance().getAnnotationPipeline(); time = System.currentTimeMillis(); System.out.println("Finish loading the default annotation pipeline " + (time-start)/1000 + " seconds"); @@ -291,11 +285,22 @@ public static void main(String[] args) { System.out.println(i + "\t" + baseUrl); + WebDriver driver = createChromeDriver(); +// System.out.println(getAllTextWithLayout(driver,baseUrl)); + time = System.currentTimeMillis(); + System.out.println("Finish loading web driver " + (time-start)/1000 + " seconds"); + start = time; + ESAnnotatedHTMLDocument document = getHTMLDocumentForAnnotation(baseUrl, driver); time = System.currentTimeMillis(); System.out.println("Finish creating document for annotation " + (time-start)/1000 + " seconds"); start = time; + if (document.get(CoreAnnotations.TokensAnnotation.class).size() <= 1) { + System.err.println("This URL is probably not a web page " + document.getURL()); + continue; + } + AnnotatorFactory.getInstance().getAnnotationPipeline().annotate(document); time = System.currentTimeMillis(); System.out.println("Finish annotation " + (time-start)/1000 + " seconds"); @@ -324,9 +329,10 @@ public static void main(String[] args) { printAnnotatedDocument(document); PipelineHelper.printAnnotatedDocument(document, fields); } + + driver.close(); } - driver.close(); // time = System.currentTimeMillis(); // start = time; diff --git a/test_urls.csv b/test_urls.csv index a96ef23..d45c627 100644 --- a/test_urls.csv +++ b/test_urls.csv @@ -1,2 +1,3 @@ 0 http://www.forwarddatalab.org/kevinchang 1 http://www.forwarddatalab.org/research +2 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb