diff --git a/.gitignore b/.gitignore
index 5a7e6ca..4175871 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
*.class
*.jar
+*.ser
lib/chromedriver
diff --git a/.idea/libraries/Maven__commons_cli_commons_cli_1_3_1.xml b/.idea/libraries/Maven__commons_cli_commons_cli_1_3_1.xml
new file mode 100644
index 0000000..a1510b9
--- /dev/null
+++ b/.idea/libraries/Maven__commons_cli_commons_cli_1_3_1.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
index 079e6fb..3c9101d 100644
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
@@ -2,6 +2,13 @@
+
+
+
+
+
+
+
@@ -32,8 +39,8 @@
-
-
+
+
@@ -41,11 +48,20 @@
+
+
+
+
+
+
+
+
+
-
-
+
+
@@ -53,7 +69,7 @@
-
+
@@ -74,10 +90,22 @@
-
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -100,28 +128,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -133,7 +139,6 @@
- regexne
RegexNER
ner
Stanford
@@ -163,6 +168,7 @@
serial
im
total
+ Data
CustomizedCoreAnnotations.NamedEntityTagAnnotation
@@ -175,7 +181,6 @@
-
+
-
-
+
+
-
+
-
-
+
+
-
+
@@ -339,7 +404,7 @@
-
+
@@ -347,6 +412,7 @@
+
@@ -359,18 +425,18 @@
-
+
+
-
@@ -393,12 +459,12 @@
-
+
-
+
@@ -528,21 +594,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -824,13 +875,6 @@
-
-
-
-
-
-
-
@@ -838,13 +882,6 @@
-
-
-
-
-
-
-
@@ -864,17 +901,55 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
-
-
+
+
diff --git a/EntityAnnotation.iml b/EntityAnnotation.iml
index bd21cd2..eabd5ec 100644
--- a/EntityAnnotation.iml
+++ b/EntityAnnotation.iml
@@ -57,5 +57,6 @@
+
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 4cf1908..2d40515 100644
--- a/pom.xml
+++ b/pom.xml
@@ -67,6 +67,12 @@
3.12.0
compile
+
+ commons-cli
+ commons-cli
+ 1.3.1
+ compile
+
diff --git a/serialized/kevin.ser b/serialized/kevin.ser
index 5e8587a..efc1bfd 100644
Binary files a/serialized/kevin.ser and b/serialized/kevin.ser differ
diff --git a/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java b/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java
new file mode 100644
index 0000000..a520fbe
--- /dev/null
+++ b/src/main/java/org/forward/entitysearch/experiment/TestReadObject.java
@@ -0,0 +1,33 @@
+package org.forward.entitysearch.experiment;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.ling.CustomizableCoreAnnotations;
+import org.forward.entitysearch.ingestion.ESAnnotatedHTMLDocument;
+
+import java.io.FileInputStream;
+import java.io.ObjectInputStream;
+
+/** Manual check: deserializes a stored ESAnnotatedHTMLDocument and dumps its tokens to stdout. */
+public class TestReadObject {
+    /** Reads serialized/kevin.ser; prints token count, each token's word/NER/layout size, then doc size. */
+    public static void main (String[] args) {
+        ESAnnotatedHTMLDocument doc = null;
+        // try-with-resources closes both streams even when opening or deserialization fails
+        try (FileInputStream fin = new FileInputStream("serialized/kevin.ser");
+             ObjectInputStream ois = new ObjectInputStream(fin)) {
+            doc = (ESAnnotatedHTMLDocument) ois.readObject();
+        } catch (Exception e) {
+            e.printStackTrace();
+        }
+        if (doc != null) {
+            System.out.println(doc.get(CoreAnnotations.TokensAnnotation.class).size());
+            for (CoreLabel token : doc.get(CoreAnnotations.TokensAnnotation.class)) {
+                System.out.println(token.word() + " " + token.ner() + " " +
+                        token.get(CustomizableCoreAnnotations.LayoutHeightAnnotation.class) + " " +
+                        token.get(CustomizableCoreAnnotations.LayoutWidthAnnotation.class));
+            }
+            System.out.println(doc.getHeight() + " " + doc.getWidth());
+        }
+    }
+}
diff --git a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java
index 8dfa786..bbfdc88 100644
--- a/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java
+++ b/src/main/java/org/forward/entitysearch/ingestion/HTMLDocumentIngestionManager.java
@@ -5,9 +5,10 @@
import edu.stanford.nlp.ling.CustomizableCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.PipelineHelper;
+import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TypesafeMap;
import org.forward.entitysearch.AnnotationProperties;
-import org.forward.entitysearch.experiment.AnnotatorFactory;
+import org.forward.entitysearch.ner.annotation.AnnotatorFactory;
import org.openqa.selenium.By;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.Rectangle;
@@ -16,11 +17,16 @@
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebElement;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.ObjectOutputStream;
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.cli.*;
public class HTMLDocumentIngestionManager {
@@ -210,43 +216,118 @@ private static List tokenizeText(String txt) {
public static void main(String[] args) {
+ Options options = new Options();
+
+ Option input = new Option("i", "input", true, "input url");
+ input.setRequired(true);
+ options.addOption(input);
+
+ Option output = new Option("o", "output", true, "output folder for serialized files");
+ output.setRequired(true);
+ options.addOption(output);
+
+ Option verbose = new Option("v", "verbose", true, "print additional message for debugging");
+ verbose.setRequired(false);
+ options.addOption(verbose);
+
+ CommandLineParser parser = new DefaultParser();
+ HelpFormatter formatter = new HelpFormatter();
+ CommandLine cmd;
+
+ try {
+ cmd = parser.parse(options, args);
+ } catch (ParseException e) {
+ System.out.println(e.getMessage());
+ formatter.printHelp("utility-name", options);
+
+ System.exit(1);
+ return;
+ }
+
+ boolean VERBOSE = false;
+ if (cmd.hasOption("verbose"))
+ VERBOSE = Boolean.parseBoolean(cmd.getOptionValue("verbose"));
+ String inputFile = cmd.getOptionValue("input");
+ String outputFolder = cmd.getOptionValue("output");
+
+ List> urls = new ArrayList<>();
+
+ try (BufferedReader br = new BufferedReader(new FileReader(inputFile))) {
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] tmp = line.split("\t");
+ urls.add(new Pair<>(tmp[0],tmp[1]));
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
long time = System.currentTimeMillis();
long start = time;
- String baseUrl = "http://www.forwarddatalab.org/kevinchang";
+
WebDriver driver = createChromeDriver();
// System.out.println(getAllTextWithLayout(driver,baseUrl));
- System.out.println("After creating driver " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
+ System.out.println("Finish loading web driver " + (time-start)/1000 + " seconds");
+ start = time;
+
AnnotatorFactory.getInstance().getAnnotationPipeline();
- System.out.println("After loading the default annotation pipeline " + (System.currentTimeMillis()-time));
- time = System.currentTimeMillis();
- ESAnnotatedHTMLDocument document = getHTMLDocumentForAnnotation(baseUrl, driver);
- System.out.println("After creating document for annotation " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
+ System.out.println("Finish loading the default annotation pipeline " + (time-start)/1000 + " seconds");
+ start = time;
+
List>> fields = PipelineHelper.addPopularRegexRuleAnnotators(AnnotatorFactory.getInstance().getAnnotationPipeline());
- System.out.println("After loading extra components for annotation pipeline " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
- AnnotatorFactory.getInstance().getAnnotationPipeline().annotate(document);
- System.out.println("After annotation " + (System.currentTimeMillis()-time));
- System.out.println("Total time: " + (System.currentTimeMillis() - start));
- System.out.println(document.getTitle());
- System.out.println(document.getURL());
- System.out.println(document.getHeight() + " " + document.getWidth());
- try {
- FileOutputStream fileOut =
- new FileOutputStream("serialized/kevin.ser");
- ObjectOutputStream out = new ObjectOutputStream(fileOut);
- out.writeObject(document);
- out.close();
- fileOut.close();
- System.out.printf("Serialized data is saved in serialized/kevin.ser");
- } catch (IOException i) {
- i.printStackTrace();
+ System.out.println("Finish loading extra annotators " + (time-start)/1000 + " seconds");
+ start = time;
+
+ System.out.println("Ready to download and annotate HTML documents");
+ System.out.println("----------------------------------------------");
+
+ for (int i = 0; i < urls.size(); i++) {
+ String filename = urls.get(i).first;
+ String baseUrl = urls.get(i).second;
+
+ System.out.println(i + "\t" + baseUrl);
+
+ ESAnnotatedHTMLDocument document = getHTMLDocumentForAnnotation(baseUrl, driver);
+ time = System.currentTimeMillis();
+ System.out.println("Finish creating document for annotation " + (time-start)/1000 + " seconds");
+ start = time;
+
+ AnnotatorFactory.getInstance().getAnnotationPipeline().annotate(document);
+ time = System.currentTimeMillis();
+ System.out.println("Finish annotation " + (time-start)/1000 + " seconds");
+ start = time;
+
+ try {
+ String path = outputFolder + filename + ".ser";
+ FileOutputStream fileOut =
+ new FileOutputStream(path);
+ ObjectOutputStream out = new ObjectOutputStream(fileOut);
+ out.writeObject(document);
+ out.close();
+ fileOut.close();
+ if (VERBOSE)
+ System.out.println("Serialized data is saved to " + path);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ time = System.currentTimeMillis();
+ if (VERBOSE) {
+ System.out.println("Finish serialization " + (time-start)/1000 + " seconds");
+ System.out.println("Done with " + document.getTitle() + " with size " + document.getHeight() + " " + document.getWidth());
+ }
+
+ if (VERBOSE) {
+ printAnnotatedDocument(document);
+ PipelineHelper.printAnnotatedDocument(document, fields);
+ }
}
- time = System.currentTimeMillis();
- printAnnotatedDocument(document);
-// PipelineHelper.printAnnotatedDocument(document, fields);
- System.out.println("After printing results " + (System.currentTimeMillis()-time));
+
+ driver.close();
+
// time = System.currentTimeMillis();
// start = time;
// document = getHTMLDocumentForAnnotation("https://cs.illinois.edu/directory/profile/kcchang", driver);
@@ -260,7 +341,6 @@ public static void main(String[] args) {
// System.out.println("After printing results " + (System.currentTimeMillis()-time));
// printAnnotatedDocument(document);
- driver.close();
// List el = driver.findElements(By.cssSelector("*"));
// It is not working because it will miss text nodes
@@ -313,7 +393,6 @@ public static void main(String[] args) {
private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url, WebDriver driver) {
driver.get(url);
String pageTitle = driver.getTitle();
- System.out.println(url + " " + pageTitle);
List allTokens = new ArrayList<>();
RemoteWebElement e = (RemoteWebElement) driver.findElement(By.xpath("/html/body"));
travelDOMTreeWithSelenium(e,null,allTokens, driver);
diff --git a/src/main/java/org/forward/entitysearch/experiment/AnnotatorFactory.java b/src/main/java/org/forward/entitysearch/ner/annotation/AnnotatorFactory.java
similarity index 97%
rename from src/main/java/org/forward/entitysearch/experiment/AnnotatorFactory.java
rename to src/main/java/org/forward/entitysearch/ner/annotation/AnnotatorFactory.java
index c8663c7..e95517b 100644
--- a/src/main/java/org/forward/entitysearch/experiment/AnnotatorFactory.java
+++ b/src/main/java/org/forward/entitysearch/ner/annotation/AnnotatorFactory.java
@@ -1,4 +1,4 @@
-package org.forward.entitysearch.experiment;
+package org.forward.entitysearch.ner.annotation;
import edu.stanford.nlp.ling.CustomizableCoreAnnotations;
import edu.stanford.nlp.pipeline.*;
diff --git a/test_urls.csv b/test_urls.csv
new file mode 100644
index 0000000..a96ef23
--- /dev/null
+++ b/test_urls.csv
@@ -0,0 +1,2 @@
+0 http://www.forwarddatalab.org/kevinchang
+1 http://www.forwarddatalab.org/research