Skip to content

Commit

Permalink
prepare for bulk processing
Browse files Browse the repository at this point in the history
  • Loading branch information
Long Pham committed May 24, 2018
1 parent fffe91c commit 770a7e6
Show file tree
Hide file tree
Showing 10 changed files with 321 additions and 111 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.class
*.jar
*.ser
lib/chromedriver
13 changes: 13 additions & 0 deletions .idea/libraries/Maven__commons_cli_commons_cli_1_3_1.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

229 changes: 152 additions & 77 deletions .idea/workspace.xml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions EntityAnnotation.iml
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,6 @@
<orderEntry type="library" name="Maven: org.codehaus.mojo:animal-sniffer-annotations:1.14" level="project" />
<orderEntry type="library" name="Maven: com.squareup.okhttp3:okhttp:3.9.1" level="project" />
<orderEntry type="library" name="Maven: com.squareup.okio:okio:1.13.0" level="project" />
<orderEntry type="library" name="Maven: commons-cli:commons-cli:1.3.1" level="project" />
</component>
</module>
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@
<version>3.12.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.3.1</version>
<scope>compile</scope>
</dependency>
</dependencies>


Expand Down
Binary file modified serialized/kevin.ser
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package org.forward.entitysearch.experiment;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CustomizableCoreAnnotations;
import org.forward.entitysearch.ingestion.ESAnnotatedHTMLDocument;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;

/**
 * Manual check that a serialized {@link ESAnnotatedHTMLDocument} can be read back.
 *
 * <p>Deserializes {@code serialized/kevin.ser} and prints the token count, one line per
 * token (word, NER tag, layout height, layout width), and finally the document's height
 * and width.
 */
public class TestReadObject {

    /** Path, relative to the working directory, of the serialized document to read. */
    private static final String SERIALIZED_PATH = "serialized/kevin.ser";

    /**
     * Reads the serialized document and dumps its tokens to standard output.
     *
     * <p>If the file cannot be read or its class cannot be resolved, the stack trace is
     * printed and nothing else is output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ESAnnotatedHTMLDocument doc = null;

        // try-with-resources closes both streams on every path; the previous version
        // never closed them.
        // NOTE(review): native Java deserialization must only be used on trusted files.
        try (FileInputStream fin = new FileInputStream(SERIALIZED_PATH);
             ObjectInputStream ois = new ObjectInputStream(fin)) {
            doc = (ESAnnotatedHTMLDocument) ois.readObject();
        } catch (IOException | ClassNotFoundException e) {
            e.printStackTrace();
        }

        if (doc == null) {
            return;
        }

        System.out.println(doc.get(CoreAnnotations.TokensAnnotation.class).size());
        for (CoreLabel token : doc.get(CoreAnnotations.TokensAnnotation.class)) {
            System.out.println(token.word() + " " + token.ner() + " " +
                    token.get(CustomizableCoreAnnotations.LayoutHeightAnnotation.class) + " " +
                    token.get(CustomizableCoreAnnotations.LayoutWidthAnnotation.class));
        }
        System.out.println(doc.getHeight() + " " + doc.getWidth());
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import edu.stanford.nlp.ling.CustomizableCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.PipelineHelper;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TypesafeMap;
import org.forward.entitysearch.AnnotationProperties;
import org.forward.entitysearch.experiment.AnnotatorFactory;
import org.forward.entitysearch.ner.annotation.AnnotatorFactory;
import org.openqa.selenium.By;
import org.openqa.selenium.Dimension;
import org.openqa.selenium.Rectangle;
Expand All @@ -16,11 +17,16 @@
import org.openqa.selenium.chrome.ChromeOptions;
import org.openqa.selenium.remote.RemoteWebElement;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.cli.*;

public class HTMLDocumentIngestionManager {

Expand Down Expand Up @@ -210,43 +216,118 @@ private static List<CoreLabel> tokenizeText(String txt) {

public static void main(String[] args) {

Options options = new Options();

Option input = new Option("i", "input", true, "input url");
input.setRequired(true);
options.addOption(input);

Option output = new Option("o", "output", true, "output folder for serialized files");
output.setRequired(true);
options.addOption(output);

Option verbose = new Option("v", "verbose", true, "print additional message for debugging");
verbose.setRequired(false);
options.addOption(verbose);

CommandLineParser parser = new DefaultParser();
HelpFormatter formatter = new HelpFormatter();
CommandLine cmd;

try {
cmd = parser.parse(options, args);
} catch (ParseException e) {
System.out.println(e.getMessage());
formatter.printHelp("utility-name", options);

System.exit(1);
return;
}

boolean VERBOSE = false;
if (cmd.hasOption("verbose"))
VERBOSE = Boolean.parseBoolean(cmd.getOptionValue("verbose"));
String inputFile = cmd.getOptionValue("input");
String outputFolder = cmd.getOptionValue("output");

List<Pair<String,String>> urls = new ArrayList<>();

try (BufferedReader br = new BufferedReader(new FileReader(inputFile))) {
String line;
while ((line = br.readLine()) != null) {
String[] tmp = line.split("\t");
urls.add(new Pair<>(tmp[0],tmp[1]));
}

} catch (IOException e) {
e.printStackTrace();
}

long time = System.currentTimeMillis();
long start = time;
String baseUrl = "http://www.forwarddatalab.org/kevinchang";

WebDriver driver = createChromeDriver();
// System.out.println(getAllTextWithLayout(driver,baseUrl));
System.out.println("After creating driver " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
System.out.println("Finish loading web driver " + (time-start)/1000 + " seconds");
start = time;

AnnotatorFactory.getInstance().getAnnotationPipeline();
System.out.println("After loading the default annotation pipeline " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
ESAnnotatedHTMLDocument document = getHTMLDocumentForAnnotation(baseUrl, driver);
System.out.println("After creating document for annotation " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
System.out.println("Finish loading the default annotation pipeline " + (time-start)/1000 + " seconds");
start = time;

List<Class<? extends TypesafeMap.Key<String>>> fields = PipelineHelper.addPopularRegexRuleAnnotators(AnnotatorFactory.getInstance().getAnnotationPipeline());
System.out.println("After loading extra components for annotation pipeline " + (System.currentTimeMillis()-time));
time = System.currentTimeMillis();
AnnotatorFactory.getInstance().getAnnotationPipeline().annotate(document);
System.out.println("After annotation " + (System.currentTimeMillis()-time));
System.out.println("Total time: " + (System.currentTimeMillis() - start));
System.out.println(document.getTitle());
System.out.println(document.getURL());
System.out.println(document.getHeight() + " " + document.getWidth());
try {
FileOutputStream fileOut =
new FileOutputStream("serialized/kevin.ser");
ObjectOutputStream out = new ObjectOutputStream(fileOut);
out.writeObject(document);
out.close();
fileOut.close();
System.out.printf("Serialized data is saved in serialized/kevin.ser");
} catch (IOException i) {
i.printStackTrace();
System.out.println("Finish loading extra annotators " + (time-start)/1000 + " seconds");
start = time;

System.out.println("Ready to download and annotate HTML documents");
System.out.println("----------------------------------------------");

for (int i = 0; i < urls.size(); i++) {
String filename = urls.get(i).first;
String baseUrl = urls.get(i).second;

System.out.println(i + "\t" + baseUrl);

ESAnnotatedHTMLDocument document = getHTMLDocumentForAnnotation(baseUrl, driver);
time = System.currentTimeMillis();
System.out.println("Finish creating document for annotation " + (time-start)/1000 + " seconds");
start = time;

AnnotatorFactory.getInstance().getAnnotationPipeline().annotate(document);
time = System.currentTimeMillis();
System.out.println("Finish annotation " + (time-start)/1000 + " seconds");
start = time;

try {
String path = outputFolder + filename + ".ser";
FileOutputStream fileOut =
new FileOutputStream(path);
ObjectOutputStream out = new ObjectOutputStream(fileOut);
out.writeObject(document);
out.close();
fileOut.close();
if (VERBOSE)
System.out.println("Serialized data is saved to " + path);
} catch (IOException e) {
e.printStackTrace();
}
time = System.currentTimeMillis();
if (VERBOSE) {
System.out.println("Finish serialization " + (time-start)/1000 + " seconds");
System.out.println("Done with " + document.getTitle() + " with size " + document.getHeight() + " " + document.getWidth());
}

if (VERBOSE) {
printAnnotatedDocument(document);
PipelineHelper.printAnnotatedDocument(document, fields);
}
}
time = System.currentTimeMillis();
printAnnotatedDocument(document);
// PipelineHelper.printAnnotatedDocument(document, fields);
System.out.println("After printing results " + (System.currentTimeMillis()-time));

driver.close();

// time = System.currentTimeMillis();
// start = time;
// document = getHTMLDocumentForAnnotation("https://cs.illinois.edu/directory/profile/kcchang", driver);
Expand All @@ -260,7 +341,6 @@ public static void main(String[] args) {
// System.out.println("After printing results " + (System.currentTimeMillis()-time));

// printAnnotatedDocument(document);
driver.close();

// List<WebElement> el = driver.findElements(By.cssSelector("*"));
// It is not working because it will miss text nodes
Expand Down Expand Up @@ -313,7 +393,6 @@ public static void main(String[] args) {
private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url, WebDriver driver) {
driver.get(url);
String pageTitle = driver.getTitle();
System.out.println(url + " " + pageTitle);
List<CoreLabel> allTokens = new ArrayList<>();
RemoteWebElement e = (RemoteWebElement) driver.findElement(By.xpath("/html/body"));
travelDOMTreeWithSelenium(e,null,allTokens, driver);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.forward.entitysearch.experiment;
package org.forward.entitysearch.ner.annotation;

import edu.stanford.nlp.ling.CustomizableCoreAnnotations;
import edu.stanford.nlp.pipeline.*;
Expand Down
2 changes: 2 additions & 0 deletions test_urls.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
0 http://www.forwarddatalab.org/kevinchang
1 http://www.forwarddatalab.org/research

0 comments on commit 770a7e6

Please sign in to comment.