diff --git a/pom.xml b/pom.xml index c1ff2a5d2..d0f26a1a0 100644 --- a/pom.xml +++ b/pom.xml @@ -91,13 +91,13 @@ maven-gpg-plugin 1.5 - + diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/CountWikipediaPages.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/CountWikipediaPages.java index 8ed6310e8..9dcb2f038 100644 --- a/src/main/java/edu/umd/cloud9/collection/wikipedia/CountWikipediaPages.java +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/CountWikipediaPages.java @@ -47,6 +47,7 @@ * * @author Jimmy Lin * @author Peter Exner + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) */ public class CountWikipediaPages extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(CountWikipediaPages.class); @@ -89,8 +90,8 @@ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path") .hasArg().withDescription("XML dump file").create(INPUT_OPTION)); - options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() - .withDescription("two-letter language code").create(LANGUAGE_OPTION)); + options.addOption(OptionBuilder.withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr").hasArg() + .withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); @@ -111,7 +112,7 @@ public int run(String[] args) throws Exception { String language = "en"; // Assume 'en' by default. if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); - if (language.length() != 2) { + if(!(language.length() == 2 || language.length() == 6)){ System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/DumpWikipediaToPlainText.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/DumpWikipediaToPlainText.java index 4a1805336..a5d2e8df7 100644 --- a/src/main/java/edu/umd/cloud9/collection/wikipedia/DumpWikipediaToPlainText.java +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/DumpWikipediaToPlainText.java @@ -45,6 +45,7 @@ * * @author Jimmy Lin * @author Peter Exner + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) */ public class DumpWikipediaToPlainText extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(DumpWikipediaToPlainText.class); @@ -54,8 +55,9 @@ private static enum PageTypes { }; private static class MyMapper extends Mapper { - private static final Text articleName = new Text(); - private static final Text articleContent = new Text(); + private static final Text articleId = new Text(); + private static final Text articleTitleAndContent = new Text(); + @Override public void map(LongWritable key, WikipediaPage p, Context context) @@ -75,10 +77,14 @@ public void map(LongWritable key, WikipediaPage p, Context context) context.getCounter(PageTypes.STUB).increment(1); } - articleName.set(p.getTitle().replaceAll("[\\r\\n]+", " ")); - articleContent.set(p.getContent().replaceAll("[\\r\\n]+", " ")); + articleId.set(p.getDocid()); + articleTitleAndContent.set( + p.getTitle().replaceAll("[\\r\\n]+", " ") + + "\t" + + p.getContent().replaceAll("[\\r\\n]+", " ") + ); - context.write(articleName, articleContent); + context.write(articleId, articleTitleAndContent); } else { context.getCounter(PageTypes.OTHER).increment(1); } @@ -97,8 +103,8 @@ public int run(String[] args) throws Exception { .withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("output path").create(OUTPUT_OPTION)); - options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() - .withDescription("two-letter language code").create(LANGUAGE_OPTION)); + options.addOption(OptionBuilder.withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr").hasArg() + .withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); @@ -119,7 +125,7 @@ public int run(String[] args) throws Exception { String language = "en"; // Assume "en" by default. if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); - if (language.length() != 2) { + if(!(language.length() == 2 || language.length() == 6)){ System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/ExtractWikipediaDisambiguations.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/ExtractWikipediaDisambiguations.java new file mode 100644 index 000000000..a41c0640f --- /dev/null +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/ExtractWikipediaDisambiguations.java @@ -0,0 +1,233 @@ +package edu.umd.cloud9.collection.wikipedia; + +import java.io.IOException; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.GnuParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.log4j.Logger; +import java.util.ArrayList; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +/** + * Tool for taking a Wikipedia XML dump file and writing out article titles and ambiguous related titles + * in a flat text file (article title and related titles, separated by tabs; related titles are '\002' separated). + * + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) + */ +public class ExtractWikipediaDisambiguations extends Configured implements Tool { + private static final Logger LOG = Logger.getLogger(ExtractWikipediaDisambiguations.class); + + private static enum PageTypes { + TOTAL, REDIRECT, DISAMBIGUATION, EMPTY, ARTICLE, STUB, OTHER + } + + private static class WikiDisambiguationMapper extends Mapper { + + private static final Pattern LANG_LINKS = Pattern.compile("\\[\\[[a-z\\-]+:[^\\]]+\\]\\]"); + private static final Pattern REF = Pattern.compile(".*?"); + private static final Pattern HTML_COMMENT = Pattern.compile("", Pattern.DOTALL); + // Sometimes, URLs bump up against comments e.g., + // So remove comments first, since the URL pattern might capture comment terminators. + private static final Pattern URL = Pattern.compile("http://[^ <]+"); + private static final Pattern DOUBLE_CURLY = Pattern.compile("\\{\\{.*?\\}\\}"); + private static final Pattern HTML_TAG = Pattern.compile("<[^!][^>]*>"); + private static final Pattern NEWLINE = Pattern.compile("[\\r\\n]+"); + private static final String SINGLE_SPACE = " "; + + private static final Pattern WIKI_TITLE = Pattern.compile("\\[\\[(.*?)\\]\\]"); + private static final Pattern WIKI_TITLE_DUPLICATED = Pattern.compile("\\|.+"); + + private static final Pattern[] patternsToCleanUp = {LANG_LINKS, REF, HTML_COMMENT, URL, DOUBLE_CURLY, HTML_TAG, NEWLINE}; + + @Override + public void map(LongWritable key, WikipediaPage p, Context context) throws IOException, InterruptedException { + context.getCounter(PageTypes.TOTAL).increment(1); + + if (p.isEmpty()) { + context.getCounter(PageTypes.EMPTY).increment(1); + } else if (p.isRedirect()) { + context.getCounter(PageTypes.REDIRECT).increment(1); + } else if (p.isDisambiguation()) { + + context.getCounter(PageTypes.DISAMBIGUATION).increment(1); + + ArrayList wikiTitleList = new ArrayList(); + Text title = new Text(); + Text similarTitles = new Text(); + + String wikiText = p.getWikiMarkup(); + + // Find 'See also' section and truncate it - related but not ambiguous terms + int seeAlsoSectionStart = wikiText.indexOf("See also"); + if (seeAlsoSectionStart >= 0) + wikiText = wikiText.substring(0,seeAlsoSectionStart); + + if (wikiText == null) { + context.getCounter(WikiDisambiguationMapper.class.getSimpleName(), "NULL_WIKITEXT").increment(1); + return; + } + + // The way the some entities are encoded, we have to unescape twice. + wikiText = StringEscapeUtils.unescapeHtml(StringEscapeUtils.unescapeHtml(wikiText)); + + for (Pattern pattern: patternsToCleanUp) { + wikiText = pattern.matcher(wikiText).replaceAll(SINGLE_SPACE); + } + + Pattern disambPattern = p.getDisambPattern(); + String disambRegex = disambPattern.toString(); + + // Format disambiguation regex to match against in-text article titles. + // Eg. Convert \{disambig\w*\} to \(disambig\w*\) + disambRegex = "\\(" + disambRegex.replaceAll("(\\\\\\{)?(\\\\\\})?", "") + "\\)"; + Pattern disambMatcher = Pattern.compile(disambRegex, Pattern.CASE_INSENSITIVE); + + // Extract the ambiguous entity from the disambiguation article title. + // Eg. Apple_Store_(disambiguation) to Apple Store + String ambiguousTitle = disambMatcher.matcher(p.getTitle()).replaceAll(SINGLE_SPACE).replaceAll("_", SINGLE_SPACE).toLowerCase().trim(); + + Matcher wikiTitleMatcher = WIKI_TITLE.matcher(wikiText); + while (wikiTitleMatcher.find()) { + String wikiTitle = wikiTitleMatcher.group(1); + + // Only pick in-text titles that are not disambiguations and contain the current page's ambiguous title. + // Eg. From the Apple disambiguation page, pick [[Apple Inc.]] and [[Big Apple]] but not [[Big Apple (disambiguation)]] or [[Apel (disambiguation)]] + if (!disambMatcher.matcher(wikiTitle).find() && wikiTitle.toLowerCase().contains(ambiguousTitle)) { + // To handle duplication like [[Alien (Britney Spears song)|"Alien" (Britney Spears song)]] in the same in-text title + wikiTitle = WIKI_TITLE_DUPLICATED.matcher(wikiTitle).replaceAll(SINGLE_SPACE).trim(); + wikiTitleList.add(wikiTitle); + } + } + + String[] wikiTitles = wikiTitleList.toArray(new String[wikiTitleList.size()]); + for (int i = 0; i < wikiTitles.length; i++) { + title.clear(); + similarTitles.clear(); + title.set(wikiTitles[i]); + String similarTitlesStr = ""; + + for (int j = 0; j < wikiTitles.length; j++) { + if (i == j || wikiTitles[j].isEmpty()) continue; + similarTitlesStr += wikiTitles[j]; + if (j < wikiTitles.length - 1) similarTitlesStr += '\002'; + } + + if (!similarTitlesStr.isEmpty()) { + similarTitles.set(similarTitlesStr); + context.write(title, similarTitles); + } + } + + } else if (p.isArticle()) { + context.getCounter(PageTypes.ARTICLE).increment(1); + if (p.isStub()) { + context.getCounter(PageTypes.STUB).increment(1); + } + } else { + context.getCounter(PageTypes.OTHER).increment(1); + } + } + } + + private static final String INPUT_OPTION = "input"; + private static final String OUTPUT_OPTION = "output"; + private static final String LANGUAGE_OPTION = "wiki_language"; + + @SuppressWarnings("static-access") + @Override + public int run(String[] args) throws Exception { + Options options = new Options(); + options.addOption(OptionBuilder.withArgName("path").hasArg() + .withDescription("XML dump file").create(INPUT_OPTION)); + options.addOption(OptionBuilder.withArgName("path").hasArg() + .withDescription("output path").create(OUTPUT_OPTION)); + options.addOption(OptionBuilder.withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr").hasArg() + .withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); + + CommandLine cmdline; + CommandLineParser parser = new GnuParser(); + try { + cmdline = parser.parse(options, args); + } catch (ParseException exp) { + LOG.error("Error parsing command line: " + exp.getMessage()); + return -1; + } + + if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp(this.getClass().getName(), options); + ToolRunner.printGenericCommandUsage(System.out); + return -1; + } + + String language = "en"; // Assume "en" by default. + if (cmdline.hasOption(LANGUAGE_OPTION)) { + language = cmdline.getOptionValue(LANGUAGE_OPTION); + if (!(language.length() == 2 || language.length() == 6)) { + LOG.error("Error: \"" + language + "\" unknown language!"); + return -1; + } + } + + String inputPath = cmdline.getOptionValue(INPUT_OPTION); + String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); + + LOG.info("Tool name: " + this.getClass().getName()); + LOG.info(" - XML dump file: " + inputPath); + LOG.info(" - output path: " + outputPath); + LOG.info(" - language: " + language); + + Job job = Job.getInstance(getConf()); + job.setJarByClass(ExtractWikipediaDisambiguations.class); + job.setJobName(String.format("ExtractWikipediaDisambiguations[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, + inputPath, OUTPUT_OPTION, outputPath, LANGUAGE_OPTION, language)); + + job.setNumReduceTasks(0); + + FileInputFormat.setInputPaths(job, new Path(inputPath)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + if (language != null) { + job.getConfiguration().set("wiki.language", language); + } + + job.setInputFormatClass(WikipediaPageInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + + job.setMapperClass(WikiDisambiguationMapper.class); + job.setOutputKeyClass(Text.class); + job.setOutputValueClass(Text.class); + + // Delete the output directory if it exists already. + FileSystem.get(getConf()).delete(new Path(outputPath), true); + + job.waitForCompletion(true); + + return 0; + } + + public ExtractWikipediaDisambiguations() { + } + + public static void main(String[] args) throws Exception { + ToolRunner.run(new ExtractWikipediaDisambiguations(), args); + } +} diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/RepackWikipedia.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/RepackWikipedia.java index 3b036f360..ff7d4edcb 100644 --- a/src/main/java/edu/umd/cloud9/collection/wikipedia/RepackWikipedia.java +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/RepackWikipedia.java @@ -46,6 +46,7 @@ * * @author Jimmy Lin * @author Peter Exner + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) */ public class RepackWikipedia extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(RepackWikipedia.class); @@ -110,8 +111,8 @@ public int run(String[] args) throws Exception { .withDescription("mapping file").create(MAPPING_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("block|record|none").hasArg() .withDescription("compression type").create(COMPRESSION_TYPE_OPTION)); - options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg() - .withDescription("two-letter language code").create(LANGUAGE_OPTION)); + options.addOption(OptionBuilder.withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr").hasArg() + .withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); @@ -143,7 +144,8 @@ public int run(String[] args) throws Exception { String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); - if(language.length()!=2){ + if(!(language.length()==2 || language.length()==6)){ + // Added length check for 6 to include languages like zh_yue System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaDocnoMappingBuilder.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaDocnoMappingBuilder.java index 96d1037c0..22fd11e8f 100644 --- a/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaDocnoMappingBuilder.java +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaDocnoMappingBuilder.java @@ -50,6 +50,7 @@ * * @author Jimmy Lin * @author Peter Exner + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) */ public class WikipediaDocnoMappingBuilder extends Configured implements Tool, DocnoMapping.Builder { private static final Logger LOG = Logger.getLogger(WikipediaDocnoMappingBuilder.class); @@ -147,8 +148,8 @@ public int run(String[] args) throws Exception { .hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path") .hasArg().withDescription("output file").create(OUTPUT_FILE_OPTION)); - options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() - .withDescription("two-letter language code").create(LANGUAGE_OPTION)); + options.addOption(OptionBuilder.withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr").hasArg() + .withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); options.addOption(KEEP_ALL_OPTION, false, "keep all pages"); CommandLine cmdline; @@ -170,7 +171,8 @@ public int run(String[] args) throws Exception { String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); - if(language.length()!=2){ + if(!(language.length()==2 || language.length()==6)){ + // Added length check for 6 to include languages like zh_yue System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaForwardIndexBuilder.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaForwardIndexBuilder.java index d2e06799e..b94990b34 100644 --- a/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaForwardIndexBuilder.java +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaForwardIndexBuilder.java @@ -56,6 +56,7 @@ * * @author Jimmy Lin * @author Peter Exner + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) */ public class WikipediaForwardIndexBuilder extends Configured implements Tool { private static final Logger LOG = Logger.getLogger(WikipediaForwardIndexBuilder.class); @@ -119,8 +120,8 @@ public int run(String[] args) throws Exception { .create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("index file") .create(INDEX_FILE_OPTION)); - options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() - .withDescription("two-letter language code").create(LANGUAGE_OPTION)); + options.addOption(OptionBuilder.withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr").hasArg() + .withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); @@ -152,7 +153,7 @@ public int run(String[] args) throws Exception { String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); - if (language.length() != 2) { + if(!(language.length() == 2 || language.length() == 6)){ System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } diff --git a/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java b/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java index ae09c2817..543b22319 100644 --- a/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java +++ b/src/main/java/edu/umd/cloud9/collection/wikipedia/WikipediaPage.java @@ -40,6 +40,7 @@ * * @author Jimmy Lin * @author Peter Exner + * @author Gaurav Ragtah (gaurav.ragtah@lithium.com) */ public abstract class WikipediaPage extends Indexable { /** @@ -102,6 +103,7 @@ public abstract class WikipediaPage extends Indexable { protected boolean isStub; protected boolean isArticle; protected String language; + protected Pattern disambPattern; private WikiModel wikiModel; private PlainTextConverter textConverter; @@ -151,6 +153,10 @@ public String getLanguage() { return this.language; } + public Pattern getDisambPattern() { + return disambPattern; + } + // Explictly remove ..., because there are screwy things like this: // [http://www.interieur.org/] // where "http://www.interieur.org/