Skip to content
This repository has been archived by the owner on May 6, 2018. It is now read-only.

Commit

Permalink
Created word count tutorial.
Browse files Browse the repository at this point in the history
  • Loading branch information
jimmy0017 committed Jan 22, 2013
1 parent c9c76e5 commit 4db967c
Show file tree
Hide file tree
Showing 5 changed files with 369 additions and 68 deletions.
4 changes: 1 addition & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
/dist/
/lib/
ivy/ivy.jar
etc/junit.*
etc/run.*
etc/run-integration.*
etc/*.sh
data/bible+shakes.nopunc
data/bible+shakes.nopunc.json.packed
data/bible+shakes.nopunc.tuple1.packed
Expand Down
78 changes: 30 additions & 48 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,10 @@
<path id="run.path.id">
<path refid="lib.path.id" />
<fileset dir="${dist.dir}" />
<!-- path location="${build.dir}" /-->
</path>
<path id="libjars.path.id">
<fileset dir="${dist.dir}/" includes="cloud9*.jar"/>
<fileset dir="${lib.dir}/" includes="guava*.jar"/>
</path>

<property name="maven.ant.task.version" value="2.1.2"/>
Expand Down Expand Up @@ -212,56 +215,35 @@

<target name="scripts" depends="jar">
<pathconvert property="run.path.id" refid="run.path.id" targetos="${platform}" />
<pathconvert property="lib.path.id" refid="lib.path.id" targetos="${platform}" />
<pathconvert property="libjars.path.id" refid="libjars.path.id" targetos="${platform}" pathsep="," />

<condition property="suffix" value="sh">
<equals arg1="${platform}" arg2="unix" />
</condition>
<condition property="suffix" value="bat">
<equals arg1="${platform}" arg2="windows" />
</condition>
<property name="suffix" value="sh" />
<property name="param_prefix" value="$" />
<property name="cp_sep" value=":" />
<property name="java_command" value="java -Xmx4g -classpath &quot;${run.path.id}&quot;" />
<property name="script_prefix" value="#!/bin/sh${line.separator}export HADOOP_CLASSPATH=&quot;${run.path.id}:$HADOOP_CLASSPATH&quot;${line.separator}" />

<condition property="param_prefix" value="$">
<equals arg1="${platform}" arg2="unix" />
</condition>
<condition property="param_prefix" value="%">
<equals arg1="${platform}" arg2="windows" />
</condition>

<condition property="java" value="java">
<equals arg1="${platform}" arg2="unix" />
</condition>
<condition property="java" value="java">
<equals arg1="${platform}" arg2="windows" />
</condition>

<condition property="cp_sep" value=":">
<equals arg1="${platform}" arg2="unix" />
</condition>
<condition property="cp_sep" value=";">
<equals arg1="${platform}" arg2="windows" />
</condition>

<property name="java_command" value="${java} -Xmx2048m -classpath &quot;${run.path.id}&quot;" />

<condition property="script_prefix" value="#!/bin/sh${line.separator}export HADOOP_CLASSPATH=&quot;${run.path.id}:$HADOOP_CLASSPATH&quot;${line.separator}if test -s ~/.bashrc${line.separator}then${line.separator}source ~/.bashrc${line.separator}fi${line.separator}">
<equals arg1="${platform}" arg2="unix" />
</condition>
<condition property="script_prefix" value="">
<equals arg1="${platform}" arg2="windows" />
</condition>

<echo file="./etc/junit.${suffix}" message="${script_prefix}" />
<echo file="./etc/junit.${suffix}" message="${java_command} org.junit.runner.JUnitCore " append="true" />
<echo file="./etc/junit.${suffix}" message="${param_prefix}1" append="true" />

<echo file="./etc/run.${suffix}" message="${script_prefix}" />
<echo file="./etc/run.${suffix}" message="${java_command} " append="true" />
<echo file="./etc/run.${suffix}" message="${param_prefix}1 ${param_prefix}2 ${param_prefix}3 ${param_prefix}4 ${param_prefix}5 ${param_prefix}6 ${param_prefix}7 ${param_prefix}8 ${param_prefix}9" append="true" />

<echo file="./etc/run-integration.${suffix}" message="${script_prefix}" />
<echo file="./etc/run-integration.${suffix}" message="${line.separator}ant clean${line.separator}ant${line.separator}ant -lib etc/mail.jar integration${line.separator}" append="true" />
<echo file="./etc/junit.${suffix}" message="${script_prefix}" />
<echo file="./etc/junit.${suffix}" message="${java_command} org.junit.runner.JUnitCore " append="true" />
<echo file="./etc/junit.${suffix}" message="${param_prefix}1" append="true" />

</target>
<echo file="./etc/run.${suffix}" message="${script_prefix}" />
<echo file="./etc/run.${suffix}" message="${java_command} " append="true" />
<echo file="./etc/run.${suffix}" message="${param_prefix}1 ${param_prefix}2 ${param_prefix}3 ${param_prefix}4 ${param_prefix}5 ${param_prefix}6 ${param_prefix}7 ${param_prefix}8 ${param_prefix}9" append="true" />

<echo file="./etc/run-integration.${suffix}" message="${script_prefix}" />
<echo file="./etc/run-integration.${suffix}" message="${line.separator}ant clean${line.separator}ant${line.separator}ant -lib etc/mail.jar integration${line.separator}" append="true" />

<echo file="./etc/hadoop-local.${suffix}" message="${script_prefix}" />
<echo file="./etc/hadoop-local.${suffix}" message="hadoop jar ${basedir}/${dist.dir}/${artifactId}-${version}.jar ${param_prefix}1 -D mapreduce.framework.name=local -D mapreduce.jobtracker.address=local -D fs.default.name=file:/// -D mapreduce.cluster.local.dir=/tmp/mapred/local -D mapreduce.cluster.temp.dir=/tmp/mapred/temp -D mapreduce.jobtracker.staging.root.dir=/tmp/mapred/staging -D mapreduce.jobtracker.system.dir=/tmp/mapred/system ${param_prefix}2 ${param_prefix}3 ${param_prefix}4 ${param_prefix}5 ${param_prefix}6 ${param_prefix}7 ${param_prefix}8 ${param_prefix}9 ${param_prefix}{10} ${param_prefix}{11} ${param_prefix}{12} ${param_prefix}{13} ${param_prefix}{14} ${param_prefix}{15} ${param_prefix}{16} ${param_prefix}{17} ${param_prefix}{18} ${param_prefix}{19} ${param_prefix}{20}${line.separator}" append="true" />

<echo file="./etc/hadoop-cluster.${suffix}" message="${script_prefix}" />
<echo file="./etc/hadoop-cluster.${suffix}" message="hadoop jar ${basedir}/${dist.dir}/${artifactId}-${version}.jar ${param_prefix}1 -libjars ${libjars.path.id} ${param_prefix}2 ${param_prefix}3 ${param_prefix}4 ${param_prefix}5 ${param_prefix}6 ${param_prefix}7 ${param_prefix}8 ${param_prefix}9 ${param_prefix}{10} ${param_prefix}{11} ${param_prefix}{12} ${param_prefix}{13} ${param_prefix}{14} ${param_prefix}{15} ${param_prefix}{16} ${param_prefix}{17} ${param_prefix}{18} ${param_prefix}{19} ${param_prefix}{20}${line.separator}" append="true" />

<chmod dir="./etc/" perm="ugo+rx" includes="**/*.sh"/>

</target>

<target name="gen-pom" depends="install-ivy">
<ivy:makepom ivyfile="ivy/ivy.xml" pomfile="cloud9.pom" />
Expand Down
290 changes: 290 additions & 0 deletions docs/word-count.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Cloud9: A Hadoop toolkit for working with big data</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta name="description" content="">
<meta name="author" content="">

<!-- Le styles -->
<link href="assets/css/bootstrap.css" rel="stylesheet">
<link href="assets/css/bootstrap-responsive.css" rel="stylesheet">
<link href="assets/css/docs.css" rel="stylesheet">
<link href="assets/js/google-code-prettify/prettify.css" rel="stylesheet">

<!-- Le HTML5 shim, for IE6-8 support of HTML5 elements -->
<!--[if lt IE 9]>
<script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->

</head>

<body data-spy="scroll" data-target=".bs-docs-sidebar">

<!-- Navbar
================================================== -->
<div class="navbar navbar-inverse navbar-fixed-top">
<div class="navbar-inner">
<div class="container">
<button type="button" class="btn btn-navbar" data-toggle="collapse" data-target=".nav-collapse">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="nav-collapse collapse">
<ul class="nav">
<li class="">
<a href="../index.html">Home</a>
</li>
<li class="active">
<a href="./contents.html">Table of Contents</a>
</li>
<li class="">
<a href="./api/index.html">API</a>
</li>
</ul>
</div>
</div>
</div>
</div>

<!-- Subhead
================================================== -->
<header class="jumbotron subhead" id="overview">
<div class="container">
<h1>Cloud<sup>9</sup></h1>
<p class="lead">A Hadoop toolkit for working with big data</p>
</div>
</header>

<div class="container">

<div class="page-header">
<h2>Word Count Tutorial</h2>
</div>

<p>Cloud<sup>9</sup> is designed to work with Hadoop YARN and has been
tested against Cloudera CDH 4.1.2 (on both Mac and Linux). It should
work with other Hadoop distributions or on other platforms with only
minor modifications; however, switching to a non-YARN version of
Hadoop will requiring recompiling the jars. In this tutorial, we'll
take you through running word count on a toy collection. First, we'll
use Hadoop local mode (also called standalone mode). Running in local
model, as the name suggests, does not require setting up a cluster,
but of course, you won't get the benefits of distributed processing
either. Next, we'll run word count on the single node virtual Hadoop
cluster provided by Cloudera.</p>

<table><tr><td valign="top"><span class="label label-warning">Warning</span></td>
<td style="padding-left: 10px">Note that local mode doesn't work
properly under Windows, even with cygwin, so Windows users following
this guide should start with "Running Cloud<sup>9</sup> on a Single Node Virtual
Cluster"</td></tr></table>

<h3>Download and Install Hadoop</h3>

<p>Download Cloudera CDH
4.1.2 <a href="https://ccp.cloudera.com/display/SUPPORT/CDH+Downloads">here</a>. The
easiest way is to download the tarball and unpack on your local
machine. Make sure
<code>PATH_TO_HADOOP/bin</code> is on your path. Verify that Hadoop is running
with the pi example. In a shell, run the following command:</p>

<pre class="code">
hadoop jar PATH_TO_HADOOP/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.0.0-cdh4.1.2.jar pi \
-D mapreduce.framework.name=local -D mapreduce.jobtracker.address=local -D fs.default.name=file:/// \
-D mapreduce.cluster.local.dir=/tmp/mapred/local \
-D mapreduce.cluster.temp.dir=/tmp/mapred/temp\
-D mapreduce.jobtracker.staging.root.dir=/tmp/mapred/staging \
-D mapreduce.jobtracker.system.dir=/tmp/mapred/system \
100 100000
</pre>

<p>Note that the multitude of <code>-D</code> options overrides the
Hadoop config and forces local mode. It isn't necessary if you just
downloaded the tarball straight from the site above. This is just in
case you have Hadoop set up already.</p>

<p>After the above Hadoop local job finishes, you should see the
computed value of pi... something that's reasonably closer to 3.14.</p>

<h3>Clone the Cloud<sup>9</sup> Repo</h3>

<p>Open up a shell and clone the Cloud<sup>9</sup> github repo:</p>

<pre class="code">
git clone git://github.com/lintool/Cloud9.git
</pre>

<p>Go into the <code>Cloud9/</code> directory and build with ant by
typing <code>ant</code>. The build should complete without error.</p>

<p>Let's now run the word count demo. Ant should have automatically
created a script for you located at <code>etc/hadoop-local.sh</code>
for running Hadoop jobs in local mode. It conveniently sets up the
environment, so you shouldn't have to worry about classpaths, libjars,
etc. Building the index involves two separated commands:</p>

<pre class="code">
etc/hadoop-local.sh edu.umd.cloud9.example.simple.DemoWordCount \
-input data/bible+shakes.nopunc.gz -output wc
</pre>

<p>In local model, there is no HDFS, so you can use standard shell
commands to see the output. For example:</p>

<pre class="code">
$ head wc/part-r-00000
&c 70
&c' 1
''all 1
''among 1
''and 1
''but 1
''how 1
''lo 2
''look 1
''my 1
</pre>

<p>And that's it!</p>

<h3>Running Cloud<sup>9</sup> on a Single Node Virtual Cluster</h3>

<p>The next step is to run Cloud<sup>9</sup> on an actual Hadoop cluster. How to
set up a Hadoop cluster is beyond the scope of this tutorial, but the
next best thing is to use Cloudera's virtual machine images, which
come with pre-configured single-node cluster. The images can be
downloaded <a href="https://ccp.cloudera.com/display/SUPPORT/Cloudera%27s+Hadoop+Demo+VM+for+CDH4">here</a>.
</p>

<p>The latest available version is CDH 4.1.1: use the VirtualBox image,
since VirtualBox is freely available across all major
platforms. Download the image and unpack the tarball. VirtualBox
itself can be
download <a href="https://www.virtualbox.org/wiki/Downloads">here</a>.</p>

<p>Install VirtualBox and open up the application. To install the
Cloudera Hadoop image, click "New" on the tool bar. For "Name and
operating system", put in the following information:</p>

<ul>
<li>Name: Cloudera CDH 4.1.1</li>
<li>Type: Linux</li>
<li>Version: Linux 2.6 (64 bit)</li>
</ul>

<p>Next, for "Memory size", put in as much as you can spare, with a
minimum of 3GB. Next, "Hard drive", select "Use an existing virtual
hard drive file" and select the VM image you downloaded from above. To
finish, click "Create". Back in the main window, the VM should have
been added. Select it and click "Start" in the toolbar. That'll boot
up the image.</p>

<table style="margin-top: 15px; margin-bottom: 15px;">
<tr><td valign="top"><span class="label label-info">Info</span></td>
<td style="padding-left: 10px">On Mac, if you get the error "<code>Failed to
load VMMR0.r0 (VERR_SUPLIB_WORLD_WRITABLE)</code>" when booting up, it's
complaining because the directory <code>/Application</code> is world
writable. Apparently, that's bad practice, so change that: <code>chmod
775</code> should do the trick.
</td></tr></table>

<p>The VM is missing a few packages that we need, so open up a shell
and install from the command line:</p>

<pre class="code">
sudo yum install git
sudo yum install ant
sudo yum install gcc
</pre>

<p>Open up a shell and clone the Cloud<sup>9</sup> github repo:</p>

<pre class="code">
git clone git://github.com/lintool/Cloud9.git
</pre>

<p>As with before, go into the <code>Cloud9/</code> directory and build
with ant by typing <code>ant</code>.</p>

<p>After that's done, we need to put the sample data onto HDFS:</p>

<pre class="code">
hadoop fs -put data/bible+shakes.nopunc.gz
</pre>

<p>You can verify that the file is there:</p>

<pre class="code">
hadoop fs -ls
</pre>

<p>Next, run the word count demo using
the <code>etc/hadoop-cluster.sh</code> script, as follows:</p>

<pre class="code">
etc/hadoop-cluster.sh edu.umd.cloud9.example.simple.DemoWordCount \
-input data/bible+shakes.nopunc.gz -output wc -numReducers 5
</pre>

<p>The script is a wrapper around <code>hadoop</code> that sets up the
environment, handles libjars, etc. If you're curious, <code>cat</code>
it and you'll see. Note that the paths here are referencing HDFS
paths, not local paths.</p>

<p>After the job completes, you should be able to see the output on
HDFS:</p>

<pre class="code">
hadoop fs -ls wc
</pre>

<p>Now copy the data from HDFS onto the local disk:</P>

<pre class="code">
hadoop fs -get wc/part-r-00000 .
</pre>

<p>From here, you should be able to examine the contents of the file
using normal shell commands.</p>

<p>And that's it!</p>

</div>



<!-- Footer
================================================== -->
<footer class="footer">
<div class="container">
<p class="pull-right"><a href="#">Back to top</a></p>
<p>Designed using <a href="http://twitter.github.com/bootstrap/">Bootstrap</a>.</p>
<p>Code licensed under <a href="http://www.apache.org/licenses/LICENSE-2.0" target="_blank">Apache License v2.0</a>, documentation under <a href="http://creativecommons.org/licenses/by/3.0/">CC BY 3.0</a>.</p>
</div>
</footer>

<!-- Le javascript
================================================== -->
<!-- Placed at the end of the document so the pages load faster -->
<script src="assets/js/jquery.js"></script>
<script src="assets/js/google-code-prettify/prettify.js"></script>
<script src="assets/js/bootstrap-transition.js"></script>
<script src="assets/js/bootstrap-alert.js"></script>
<script src="assets/js/bootstrap-modal.js"></script>
<script src="assets/js/bootstrap-dropdown.js"></script>
<script src="assets/js/bootstrap-scrollspy.js"></script>
<script src="assets/js/bootstrap-tab.js"></script>
<script src="assets/js/bootstrap-tooltip.js"></script>
<script src="assets/js/bootstrap-popover.js"></script>
<script src="assets/js/bootstrap-button.js"></script>
<script src="assets/js/bootstrap-collapse.js"></script>
<script src="assets/js/bootstrap-carousel.js"></script>
<script src="assets/js/bootstrap-typeahead.js"></script>
<script src="assets/js/bootstrap-affix.js"></script>

</body>
</html>

Loading

0 comments on commit 4db967c

Please sign in to comment.