From e3c348b972ca2325b75056818d6469e587d07616 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Fri, 8 Apr 2016 17:00:49 -0400 Subject: [PATCH] Handle non-UTF-8 input files. Fixes #29. --- dkpro-c4corpus-boilerplate/pom.xml | 4 ++++ .../standalone/HTMLBoilerplateRemoval.java | 12 +++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dkpro-c4corpus-boilerplate/pom.xml b/dkpro-c4corpus-boilerplate/pom.xml index 5109cfd..0b20387 100644 --- a/dkpro-c4corpus-boilerplate/pom.xml +++ b/dkpro-c4corpus-boilerplate/pom.xml @@ -47,6 +47,10 @@ commons-io + + org.dkpro.c4corpus + dkpro-c4corpus-language + diff --git a/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/standalone/HTMLBoilerplateRemoval.java b/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/standalone/HTMLBoilerplateRemoval.java index 13d33ad..b9bc05e 100644 --- a/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/standalone/HTMLBoilerplateRemoval.java +++ b/dkpro-c4corpus-boilerplate/src/main/java/de/tudarmstadt/ukp/dkpro/c4corpus/boilerplate/standalone/HTMLBoilerplateRemoval.java @@ -19,11 +19,15 @@ import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.BoilerPlateRemoval; import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl.JusTextBoilerplateRemoval; +import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.CharsetDetector; +import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.ICUCharsetDetectorWrapper; + import org.apache.commons.io.FileUtils; import java.io.File; import java.io.IOException; import java.io.PrintWriter; +import java.nio.charset.Charset; /** * This class takes one HTML file as input, removes boilerplate for each entry, @@ -36,10 +40,14 @@ public class HTMLBoilerplateRemoval { private static final BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval(); + private final static CharsetDetector CHARSET_DETECTOR = new ICUCharsetDetectorWrapper(); public static void main(String[] args) throws IOException { + if (args.length < 3) { + System.out.println("Not enough arguments - Usage: infile outfile true/false (output HTML tags)"); + } File input = new File(args[0]); File output = new File(args[1]); boolean keepMinimalHtml = Boolean.valueOf(args[2]); @@ -51,7 +59,9 @@ public static void processHtmlFile(File input, File outFile, boolean keepMinimal throws IOException { // read the html file - String html = FileUtils.readFileToString(input, "utf-8"); + byte[] bytes = FileUtils.readFileToByteArray(input); + Charset charset = CHARSET_DETECTOR.detectCharset(bytes); + String html = new String(bytes, charset); // boilerplate removal String cleanText;