From 4db967c654be589c16b0425b9a9d14aebc4972fe Mon Sep 17 00:00:00 2001
From: lintool
Date: Tue, 22 Jan 2013 09:18:54 -0500
Subject: [PATCH] Created word count tutorial.

---
 .gitignore                                   |   4 +-
 build.xml                                    |  78 ++---
 docs/word-count.html                         | 290 ++++++++++++++++++
 index.html                                   |   2 +-
 .../cloud9/example/simple/DemoWordCount.java |  63 +++-
 5 files changed, 369 insertions(+), 68 deletions(-)
 create mode 100644 docs/word-count.html

diff --git a/.gitignore b/.gitignore
index 1270e33a9..2f116d2ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,7 @@
 /dist/
 /lib/
 ivy/ivy.jar
-etc/junit.*
-etc/run.*
-etc/run-integration.*
+etc/*.sh
 data/bible+shakes.nopunc
 data/bible+shakes.nopunc.json.packed
 data/bible+shakes.nopunc.tuple1.packed

diff --git a/build.xml b/build.xml
index 0989b45c6..f985893c5 100644
--- a/build.xml
+++ b/build.xml
@@ -20,7 +20,10 @@
- + + + +
@@ -212,56 +215,35 @@
+ + - - - - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - + + + + + + + + + + + + + + + +

diff --git a/docs/word-count.html b/docs/word-count.html
new file mode 100644
index 000000000..b31678adb
--- /dev/null
+++ b/docs/word-count.html
@@ -0,0 +1,290 @@
+    Cloud9: A Hadoop toolkit for working with big data
+
+

Cloud9

+

A Hadoop toolkit for working with big data

+
+
+ +
+ + + +

Cloud9 is designed to work with Hadoop YARN and has been tested against Cloudera CDH 4.1.2 (on both Mac and Linux). It should work with other Hadoop distributions or on other platforms with only minor modifications; however, switching to a non-YARN version of Hadoop will require recompiling the jars. In this tutorial, we'll take you through running word count on a toy collection. First, we'll use Hadoop local mode (also called standalone mode). Running in local mode, as the name suggests, does not require setting up a cluster, but of course, you won't get the benefits of distributed processing either. Next, we'll run word count on the single-node virtual Hadoop cluster provided by Cloudera.

+ + +
Warning: Note that local mode doesn't work properly under Windows, even with Cygwin, so Windows users following this guide should skip ahead to "Running Cloud9 on a Single Node Virtual Cluster" below.
+ +

Download and Install Hadoop

+ +

Download Cloudera CDH 4.1.2 here. The easiest way is to download the tarball and unpack it on your local machine. Make sure PATH_TO_HADOOP/bin is on your path. Verify that Hadoop is running with the pi example. In a shell, run the following command:

+ +
+hadoop jar PATH_TO_HADOOP/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.0.0-cdh4.1.2.jar pi \
+  -D mapreduce.framework.name=local -D mapreduce.jobtracker.address=local -D fs.default.name=file:/// \
+  -D mapreduce.cluster.local.dir=/tmp/mapred/local \
+  -D mapreduce.cluster.temp.dir=/tmp/mapred/temp \
+  -D mapreduce.jobtracker.staging.root.dir=/tmp/mapred/staging \
+  -D mapreduce.jobtracker.system.dir=/tmp/mapred/system \
+  100 100000
+
+ +

Note that the multitude of -D options overrides the Hadoop config and forces local mode. They aren't necessary if you just downloaded the tarball straight from the site above; they're there just in case you already have Hadoop set up.

+ +

After the above Hadoop local job finishes, you should see the computed value of pi... something that's reasonably close to 3.14.
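
For reference, with the parameters above (100 maps, 100000 samples per map), the final lines of output commonly look like the following; the timing will differ from run to run, and the digits shown here are illustrative of what the estimator typically reports:

+
+Job Finished in ... seconds
+Estimated value of Pi is 3.14158440000000000000
+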

+ +

Clone the Cloud9 Repo

+ +

Open up a shell and clone the Cloud9 GitHub repo:

+ +
+git clone git://github.com/lintool/Cloud9.git
+
+ +

Go into the Cloud9/ directory and build with ant by typing ant. The build should complete without error.
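
In other words, assuming the clone landed in a directory named Cloud9/:

+
+cd Cloud9
+ant
+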

+ +

Let's now run the word count demo. Ant should have automatically created a script for you located at etc/hadoop-local.sh for running Hadoop jobs in local mode. It conveniently sets up the environment, so you shouldn't have to worry about classpaths, libjars, etc. Run the demo with the following command:

+ +
+etc/hadoop-local.sh edu.umd.cloud9.example.simple.DemoWordCount \
+  -input data/bible+shakes.nopunc.gz -output wc
+
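
If you're curious what such a wrapper does, here is a minimal sketch in the same spirit. This is illustrative only: the classpath layout and the forced local-mode properties are assumptions, not the actual contents of the generated etc/hadoop-local.sh.

+
+#!/bin/sh
+# Illustrative sketch: build a classpath from the compiled classes and
+# bundled jars, then run the given class under Hadoop in local mode.
+CP=build/classes
+for jar in lib/*.jar; do CP=$CP:$jar; done
+CLASS=$1; shift
+HADOOP_CLASSPATH=$CP hadoop "$CLASS" \
+  -D mapreduce.framework.name=local -D fs.default.name=file:/// "$@"
+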
+ +

In local mode, there is no HDFS, so you can use standard shell commands to see the output. For example:

+ +
+$ head wc/part-r-00000
+&c	70
+&c'	1
+''all	1
+''among	1
+''and	1
+''but	1
+''how	1
+''lo	2
+''look	1
+''my	1
+
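
Since the output is an ordinary local file, the usual text-processing tools apply. For example, to pull out the ten most frequent words (assuming GNU sort):

+
+sort -k2,2 -n -r wc/part-r-00000 | head
+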
+ +

And that's it!

+ +

Running Cloud9 on a Single Node Virtual Cluster

+ +

The next step is to run Cloud9 on an actual Hadoop cluster. How to set up a Hadoop cluster is beyond the scope of this tutorial, but the next best thing is to use Cloudera's virtual machine images, which come with a pre-configured single-node cluster. The images can be downloaded here.

+ +

The latest available version is CDH 4.1.1: use the VirtualBox image, since VirtualBox is freely available across all major platforms. Download the image and unpack the tarball. VirtualBox itself can be downloaded here.

+ +

Install VirtualBox and open up the application. To install the Cloudera Hadoop image, click "New" on the toolbar. For "Name and operating system", put in the following information:

+ +
    +
  • Name: Cloudera CDH 4.1.1
  • Type: Linux
  • Version: Linux 2.6 (64 bit)
+ +

Next, for "Memory size", put in as much as you can spare, with a +minimum of 3GB. Next, "Hard drive", select "Use an existing virtual +hard drive file" and select the VM image you downloaded from above. To +finish, click "Create". Back in the main window, the VM should have +been added. Select it and click "Start" in the toolbar. That'll boot +up the image.

+ + + +
Info: On Mac, if you get the error "Failed to load VMMR0.r0 (VERR_SUPLIB_WORLD_WRITABLE)" when booting up, VirtualBox is complaining because the directory /Applications is world writable. Apparently, that's bad practice, so change it: chmod 775 should do the trick.
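
Concretely, something like this (adjust the path if the error message names a different directory):

+
+sudo chmod 775 /Applications
+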
+ +

The VM is missing a few packages that we need, so open up a shell and install them from the command line:

+ +
+sudo yum install git 
+sudo yum install ant 
+sudo yum install gcc
+
+ +

Open up a shell and clone the Cloud9 GitHub repo:

+ +
+git clone git://github.com/lintool/Cloud9.git
+
+ +

As before, go into the Cloud9/ directory and build with ant by typing ant.

+ +

After that's done, we need to put the sample data onto HDFS:

+ +
+hadoop fs -put data/bible+shakes.nopunc.gz
+
+ +

You can verify that the file is there:

+ +
+hadoop fs -ls
+
+ +

Next, run the word count demo using the etc/hadoop-cluster.sh script, as follows:

+ +
+etc/hadoop-cluster.sh edu.umd.cloud9.example.simple.DemoWordCount \
+  -input data/bible+shakes.nopunc.gz -output wc -numReducers 5
+
+ +

The script is a wrapper around hadoop that sets up the environment, handles libjars, etc. If you're curious, cat it to see the details. Note that the paths here reference HDFS paths, not local paths.

+ +

After the job completes, you should be able to see the output on HDFS:

+ +
+hadoop fs -ls wc
+
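
With -numReducers 5 you should see one part file per reducer plus a _SUCCESS marker; abbreviated to just the paths, the listing looks something like:

+
+wc/_SUCCESS
+wc/part-r-00000
+wc/part-r-00001
+wc/part-r-00002
+wc/part-r-00003
+wc/part-r-00004
+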
+ +

Now copy the data from HDFS onto the local disk:

+ +
+hadoop fs -get wc/part-r-00000 .
+
+ +

From here, you should be able to examine the contents of the file using normal shell commands.
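
Alternatively, if you want all five reducer outputs in one go, hadoop fs -getmerge concatenates the part files into a single local file:

+
+hadoop fs -getmerge wc wc.txt
+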

+ +

And that's it!

+ +
diff --git a/index.html b/index.html
index bcbd7c44f..51e51ff72 100644
--- a/index.html
+++ b/index.html
@@ -58,7 +58,7 @@

Cloud9

GitHub project
- Version 1.4.2
+ Version 1.4.4
diff --git a/src/dist/edu/umd/cloud9/example/simple/DemoWordCount.java b/src/dist/edu/umd/cloud9/example/simple/DemoWordCount.java
index d12fa3f5a..33f349fb0 100644
--- a/src/dist/edu/umd/cloud9/example/simple/DemoWordCount.java
+++ b/src/dist/edu/umd/cloud9/example/simple/DemoWordCount.java
@@ -20,6 +20,13 @@
 import java.util.Iterator;
 import java.util.StringTokenizer;
 
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
@@ -36,6 +43,8 @@
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.log4j.Logger;
 
+import cern.colt.Arrays;
+
 /**
  * Simple word count demo.
  *
@@ -52,8 +61,8 @@ private static class MyMapper extends Mapper
-    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException,
-        InterruptedException {
+    public void reduce(Text key, Iterable<IntWritable> values, Context context)
+        throws IOException, InterruptedException {
       // Sum up values.
       Iterator<IntWritable> iter = values.iterator();
       int sum = 0;
@@ -86,27 +95,49 @@ public void reduce(Text key, Iterable<IntWritable> values, Context context) thro
   /**
    * Creates an instance of this tool.
    */
-  public DemoWordCount() {
-  }
+  public DemoWordCount() {}
 
-  private static int printUsage() {
-    System.out.println("usage: [input-path] [output-path] [num-reducers]");
-    ToolRunner.printGenericCommandUsage(System.out);
-    return -1;
-  }
+  private static final String INPUT = "input";
+  private static final String OUTPUT = "output";
+  private static final String NUM_REDUCERS = "numReducers";
 
   /**
    * Runs this tool.
    */
+  @SuppressWarnings({ "static-access" })
   public int run(String[] args) throws Exception {
-    if (args.length != 3) {
-      printUsage();
+    Options options = new Options();
+
+    options.addOption(OptionBuilder.withArgName("path").hasArg()
+        .withDescription("input path").create(INPUT));
+    options.addOption(OptionBuilder.withArgName("path").hasArg()
+        .withDescription("output path").create(OUTPUT));
+    options.addOption(OptionBuilder.withArgName("num").hasArg()
+        .withDescription("number of reducers").create(NUM_REDUCERS));
+
+    CommandLine cmdline;
+    CommandLineParser parser = new GnuParser();
+
+    try {
+      cmdline = parser.parse(options, args);
+    } catch (ParseException exp) {
+      System.err.println("Error parsing command line: " + exp.getMessage());
+      return -1;
+    }
+
+    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
+      System.out.println("args: " + Arrays.toString(args));
+      HelpFormatter formatter = new HelpFormatter();
+      formatter.setWidth(120);
+      formatter.printHelp(this.getClass().getName(), options);
       ToolRunner.printGenericCommandUsage(System.out);
       return -1;
     }
 
-    String inputPath = args[0];
-    String outputPath = args[1];
-    int reduceTasks = Integer.parseInt(args[2]);
+    String inputPath = cmdline.getOptionValue(INPUT);
+    String outputPath = cmdline.getOptionValue(OUTPUT);
+    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ?
+        Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;
 
     LOG.info("Tool: " + DemoWordCount.class.getSimpleName());
     LOG.info(" - input path: " + inputPath);