Skip to content

Commit

Permalink
Handle non-UTF-8 input files. Fixes dkpro#29.
Browse files: browse the repository at this point in the history
  • Loading branch information
tfmorris committed Jun 12, 2020
1 parent ed67700 commit e3c348b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 1 deletion.
4 changes: 4 additions & 0 deletions dkpro-c4corpus-boilerplate/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
<artifactId>commons-io</artifactId>
</dependency>

<dependency>
<groupId>org.dkpro.c4corpus</groupId>
<artifactId>dkpro-c4corpus-language</artifactId>
</dependency>
</dependencies>

<!-- for a standalone application -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,15 @@

import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.BoilerPlateRemoval;
import de.tudarmstadt.ukp.dkpro.c4corpus.boilerplate.impl.JusTextBoilerplateRemoval;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.CharsetDetector;
import de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.impl.ICUCharsetDetectorWrapper;

import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.charset.Charset;

/**
* This class takes one HTML file as input, removes boilerplate for each entry,
Expand All @@ -36,10 +40,14 @@
public class HTMLBoilerplateRemoval
{
private static final BoilerPlateRemoval boilerPlateRemoval = new JusTextBoilerplateRemoval();
private final static CharsetDetector CHARSET_DETECTOR = new ICUCharsetDetectorWrapper();

public static void main(String[] args)
throws IOException
{
if (args.length < 3) {
System.out.println("Not enough arguments - Usage: infile outfile true/false (output HTML tags)");
}
File input = new File(args[0]);
File output = new File(args[1]);
boolean keepMinimalHtml = Boolean.valueOf(args[2]);
Expand All @@ -51,7 +59,9 @@ public static void processHtmlFile(File input, File outFile, boolean keepMinimal
throws IOException
{
// read the html file
String html = FileUtils.readFileToString(input, "utf-8");
byte[] bytes = FileUtils.readFileToByteArray(input);
Charset charset = CHARSET_DETECTOR.detectCharset(bytes);
String html = new String(bytes, charset);

// boilerplate removal
String cleanText;
Expand Down

0 comments on commit e3c348b

Please sign in to comment.