Skip to content

Commit

Permalink
Fix http bad requests
Browse files Browse the repository at this point in the history
  • Loading branch information
Long Pham committed May 26, 2018
1 parent 27f85ca commit 4fd9cfd
Show file tree
Hide file tree
Showing 8 changed files with 136 additions and 102 deletions.
112 changes: 51 additions & 61 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Empty file added run.sh
Empty file.
Binary file removed serialized/kevin.ser
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public static void main (String[] args) {
FileInputStream fin = null;
ObjectInputStream ois = null;
try {
fin = new FileInputStream("serialized/4.ser");
fin = new FileInputStream("serialized/00000.ser");
ois = new ObjectInputStream(fin);
doc = (ESAnnotatedHTMLDocument) ois.readObject();
} catch (Exception e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import org.openqa.selenium.support.ui.ExpectedCondition;
import org.openqa.selenium.support.ui.WebDriverWait;

import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

Expand Down Expand Up @@ -61,10 +63,15 @@ public static void main(String[] args) {
WebDriver driver = new ChromeDriver(options);

String baseUrl = "http://www.forwarddatalab.org/kevinchang";
baseUrl = "https://ece.illinois.edu/directory/profile/carney%3D";
baseUrl = "http://ece.illinois.edu";
baseUrl = "http://www.ece.illinois.edu/academics/courses/profile/ECE487&secM";

// baseUrl = "https://charm.cs.illinois.edu/";
// baseUrl = "http://codingspectator.cs.illinois.edu/updates/helios/";
// baseUrl = "http://nlp.cs.illinois.edu/HockenmaierGroup/8k-pictures.html";
baseUrl = "http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html";
// baseUrl = "http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html";
// baseUrl = "http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index100.html";
// WebDriverWait wait = new WebDriverWait(driver, 10);
// wait.until(new ExpectedCondition<Boolean>() {
// @Override
Expand All @@ -74,7 +81,8 @@ public static void main(String[] args) {
// });
// launch browser and direct it to the Base URL
driver.get(baseUrl);
System.out.println(driver.getPageSource().length());
System.out.println(driver.getCurrentUrl());
System.out.println(driver.getPageSource());
System.exit(0);

// get the title and print it
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@
import org.openqa.selenium.remote.RemoteWebElement;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;

import org.apache.commons.cli.*;
Expand Down Expand Up @@ -297,6 +300,8 @@ public static void main(String[] args) {
System.out.println("Ready to download and annotate HTML documents");
System.out.println("----------------------------------------------");

HashSet<String> seenUrls = new HashSet<>();

for (int i = 0; i < urls.size(); i++) {
String filename = urls.get(i).first;
String baseUrl = urls.get(i).second;
Expand All @@ -312,44 +317,74 @@ public static void main(String[] args) {

System.out.println(filename + "\t" + baseUrl);

Integer responseCode = getHttpResponseCode(baseUrl);
if (responseCode == null || responseCode >= 400) {
System.out.println("Bad Request!");
System.err.println("Bad Request: " + baseUrl);
continue;
}

driver.get(baseUrl);
String currentUrl; // final url after redirects
try {
currentUrl = driver.getCurrentUrl();
} catch (Exception ex) {
System.out.println("This web document cannot be opened by browser!");
System.err.println("This web document cannot be opened by browser: " + baseUrl);
driver.close();
driver = createChromeDriver();
continue;
}
if (seenUrls.contains(currentUrl)) {
// for deduplication
// and also for avoiding the case when the new URL is a file which does not navigate the driver to a new page
System.out.println("This URL has been rendered " + currentUrl);
System.err.println("This URL has been rendered " + currentUrl);
if (!baseUrl.equalsIgnoreCase(currentUrl)) {
System.out.println("It has been redirected from the base URL: " + baseUrl);
System.err.println("It has been redirected from the base URL: " + baseUrl);
}
continue;
}
if (driver.getPageSource().length() > MAX_LENGTH_OF_PAGE_SOURCE) {
System.out.println("This document is too long!");
System.err.println("This document is too long " + baseUrl);
continue;
}

ESAnnotatedHTMLDocument document = null;
try {
document = getHTMLDocumentForAnnotation(baseUrl, driver);
document = getHTMLDocumentForAnnotation(driver);
} catch(Exception ex) {
System.out.println("There is some exception when parsing the document ");
System.err.println("There is some exception when parsing the document in this URL: " + baseUrl);
System.err.println(ex.getClass());
continue;
}
time = System.currentTimeMillis();
System.out.println("Finish creating document for annotation " + (time-start)/1000 + " seconds");
start = time;

if (document == null) {
System.out.println("This URL cannot be rendered by Selenium!");
System.err.println("This URL cannot be rendered by Selenium " + baseUrl);
continue;
}

if (document.getTitle().equals(WEB_DOCUMENT_CANNOT_BE_OPENED_ERROR)) {
System.out.println("This web document cannot be opened by browser!");
System.err.println("This web document cannot be opened by browser: " + baseUrl);
driver.close();
driver = createChromeDriver();
continue;
}

if (document.get(CoreAnnotations.TokensAnnotation.class).size() <= 1) {
System.out.println("This URL is probably not a web page!");
System.err.println("This URL is probably not a web page " + document.getURL());
System.err.println("This URL is probably not a web page " + baseUrl);
continue;
}

time = System.currentTimeMillis();
System.out.println("Finish creating document for annotation " + (time-start)/1000 + " seconds");
start = time;

seenUrls.add(currentUrl);

try {
AnnotatorFactory.getInstance().getAnnotationPipeline().annotate(document);
} catch(StaleElementReferenceException ex) {
} catch(Exception ex) {
System.out.println("There is an exception when annotating this document");
System.err.println("There is an exception when the document in this URL: " + baseUrl);
System.err.println("There is an exception when annotating the document in this URL: " + baseUrl);
continue;
}

Expand Down Expand Up @@ -446,27 +481,21 @@ public static void main(String[] args) {

}

private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url, WebDriver driver) throws StaleElementReferenceException{
driver.get(url);
String currentUrl;
public static Integer getHttpResponseCode(String baseUrl) {
try {
currentUrl = driver.getCurrentUrl();
} catch (Exception ex) {
ESAnnotatedHTMLDocument doc = new ESAnnotatedHTMLDocument();
doc.setTitle(WEB_DOCUMENT_CANNOT_BE_OPENED_ERROR);
return doc;
}
if (currentUrl.equalsIgnoreCase(CUR_URL)) {
return null; // avoid the case when the new URL is a file which does not navigate the driver to a new page
} else {
CUR_URL = currentUrl;
}
if (driver.getPageSource().length() > MAX_LENGTH_OF_PAGE_SOURCE) {
System.out.println("This document is too long!");
System.err.println("This document is too long " + url);
return null;
URL url = new URL(baseUrl);
HttpURLConnection connection = (HttpURLConnection)url.openConnection();
connection.setRequestMethod("GET");
connection.connect();
return connection.getResponseCode();
} catch (Exception e) {
System.err.println("This url cannot be opened: " + baseUrl);
}
String pageTitle = driver.getTitle();
return null;
}

private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(WebDriver driver) throws StaleElementReferenceException{

List<CoreLabel> allTokens = new ArrayList<>();
RemoteWebElement e;
try{
Expand All @@ -481,8 +510,8 @@ private static ESAnnotatedHTMLDocument getHTMLDocumentForAnnotation(String url,
// travelDOMTreeWithSelenium2((RemoteWebElement)driver.findElement(By.xpath("/html/body")),null,allTokens, driver);
// ESAnnotatedHTMLDocument document = new ESAnnotatedHTMLDocument();
// document.loadFromTokens(allTokens);
document.setURL(url);
document.setTitle(pageTitle);
document.setURL(driver.getCurrentUrl());
document.setTitle(driver.getTitle());
document.setHeight(e.getSize().height);
document.setWidth(e.getSize().width);
return document;
Expand Down
6 changes: 5 additions & 1 deletion test_urls copy.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
0 https://cs.illinois.edu/about-us/awards/faculty-awards/chairs-and-professorships/founder-professor-engineering
1 http://www.forwarddatalab.org/kevinchang
2 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb
1 http://data-people.cs.illinois.edu/topic.html
2 http://charm.cs.illinois.edu/research/episim
https://relate.cs.illinois.edu/video/cs598ak-f13/html/player.html?descriptor=metadata/lec07.json // dangling nodes
http://codingspectator.cs.illinois.edu/composite-refactorings-updates/ // documents could not be loaded by browsers
http://keshmesh.cs.illinois.edu/updates/ //couldn't be loaded by browsers
5 changes: 4 additions & 1 deletion test_urls.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,8 @@
1 https://relate.cs.illinois.edu/video/cs598ak-f13/html/player.html?descriptor=metadata/lec07.json
2 http://codingspectator.cs.illinois.edu/composite-refactorings-updates/
3 https://relate.cs.illinois.edu/course/zuics101fa16/f/lectures/lec05.ipynb
9 https://ece.illinois.edu/directory/faculty.asp
5 http://hockenmaier.cs.illinois.edu/DenotationGraph/graph/index10.html
4 http://www.forwarddatalab.org/kevinchang
6 https://ece.illinois.edu/directory/profile/farzadk%3D
4 http://www.forwarddatalab.org/kevinchang
10 http://www.ece.illinois.edu/academics/courses/profile/ECE487&sec%3D

0 comments on commit 4fd9cfd

Please sign in to comment.