From 7f8541fffaf525c0baf952c15a9901211bfd4ebc Mon Sep 17 00:00:00 2001 From: "KIRSTEN W. HILDRUM" Date: Sun, 27 Sep 2015 00:11:14 -0400 Subject: [PATCH] HadoopReader (was InputFormatReader) --- .gitignore | 9 + com.ibm.streamsx.hdfs/build.xml | 24 +- .../com/ibm/streamsx/hdfs/HadoopReader.java | 320 ++++++++++++++++++ com.ibm.streamsx.hdfs/info.xml | 4 +- com.ibm.streamsx.hdfs/pom.xml | 4 +- demos/WordCount/.project | 34 ++ demos/WordCount/ReadAndTokenize.spl | 46 +++ demos/WordCount/WordCount.spl | 66 ++++ demos/WordCount/info.xml | 21 ++ .../ReadSequenceParallel/Main.spl | 41 +++ .../HadoopReader/ReadSequenceSingle/Main.spl | 33 ++ tests/HadoopReader/ReadTextParallel/Main.spl | 41 +++ tests/HadoopReader/ReadTextSingle/Main.spl | 51 +++ .../hdfs.test.common/CheckCount.spl | 27 ++ 14 files changed, 714 insertions(+), 7 deletions(-) create mode 100644 .gitignore create mode 100644 com.ibm.streamsx.hdfs/impl/java/src/com/ibm/streamsx/hdfs/HadoopReader.java create mode 100644 demos/WordCount/.project create mode 100644 demos/WordCount/ReadAndTokenize.spl create mode 100644 demos/WordCount/WordCount.spl create mode 100644 demos/WordCount/info.xml create mode 100644 tests/HadoopReader/ReadSequenceParallel/Main.spl create mode 100644 tests/HadoopReader/ReadSequenceSingle/Main.spl create mode 100644 tests/HadoopReader/ReadTextParallel/Main.spl create mode 100644 tests/HadoopReader/ReadTextSingle/Main.spl create mode 100644 tests/hdfs.test.common/hdfs.test.common/CheckCount.spl diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..28a6c73 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +com.ibm.streamsx.hdfs/opt/downloaded +com.ibm.streamsx.hdfs/opt/hadoopreader +output +toolkit.xml +data +com.ibm.streamsx.hdfs/com.ibm.streamsx.hdfs/HadoopReader +src-gen +com.ibm.streamsx.hdfs/impl/lib +.toolkitList diff --git a/com.ibm.streamsx.hdfs/build.xml b/com.ibm.streamsx.hdfs/build.xml index 513b60c..d95b1d8 100644 --- a/com.ibm.streamsx.hdfs/build.xml +++ b/com.ibm.streamsx.hdfs/build.xml @@ -17,7 +17,8 @@ - + + @@ -50,7 +51,8 @@ - + + @@ -80,7 +82,23 @@ - + + + + + + + + + + + + + + + + + diff --git a/com.ibm.streamsx.hdfs/impl/java/src/com/ibm/streamsx/hdfs/HadoopReader.java b/com.ibm.streamsx.hdfs/impl/java/src/com/ibm/streamsx/hdfs/HadoopReader.java new file mode 100644 index 0000000..e34452d --- /dev/null +++ b/com.ibm.streamsx.hdfs/impl/java/src/com/ibm/streamsx/hdfs/HadoopReader.java @@ -0,0 +1,320 @@ +/* Copyright (C) 2015, International Business Machines Corporation */ +/* All Rights Reserved */ + +package com.ibm.streamsx.hdfs; + + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.RecordReader; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; +import org.apache.log4j.Logger; + +import com.ibm.streams.operator.AbstractOperator; +import com.ibm.streams.operator.Attribute; +import com.ibm.streams.operator.OperatorContext; +import 
com.ibm.streams.operator.OperatorContext.ContextCheck;
+import com.ibm.streams.operator.OutputTuple;
+import com.ibm.streams.operator.StreamSchema;
+import com.ibm.streams.operator.StreamingData.Punctuation;
+import com.ibm.streams.operator.StreamingOutput;
+import com.ibm.streams.operator.Type.MetaType;
+import com.ibm.streams.operator.compile.OperatorContextChecker;
+import com.ibm.streams.operator.model.Libraries;
+import com.ibm.streams.operator.model.OutputPortSet;
+import com.ibm.streams.operator.model.OutputPortSet.WindowPunctuationOutputMode;
+import com.ibm.streams.operator.model.OutputPorts;
+import com.ibm.streams.operator.model.Parameter;
+import com.ibm.streams.operator.model.PrimitiveOperator;
+import java.io.File;
+
+/**
+ * Operator to read input formats.
+ */
+@Libraries({"opt/hadoopreader/*"})
+@PrimitiveOperator(name="HadoopReader", namespace="com.ibm.streamsx.hdfs",
+description="HadoopReader can read text files, compressed text files, and sequence files. If in a parallel region, each channel will handle a separate part of the input, unless unsplittable compression is used. It is based on Hadoop's InputFormat, and so each output tuple is a key-value pair.")
+@OutputPorts({@OutputPortSet(description="The operator will populate the key and value attribute on this port with the key-value pairs given by the record reader for the underlying file format. Either the key or the value can be skipped.", cardinality=1, optional=false, windowPunctuationOutputMode=WindowPunctuationOutputMode.Generating), @OutputPortSet(description="Optional output ports", optional=true, windowPunctuationOutputMode=WindowPunctuationOutputMode.Generating)})
+public class HadoopReader extends AbstractOperator {
+
+	public static final String consistentCutIntroducer="\\n\\n**Behavior in a consistent region**\\n\\n";
+
+	public static final String description = "The operator uses the InputFormat interface to read data from HDFS. Use the fileType parameter to set the input format type. Two are currently supported: text and sequence. The default is text. "+
+			"The text format supports both compressed and uncompressed text."+
+			" The operator has one output port. It has two optional attributes, key and value. These correspond to the key and value returned by the underlying RecordReader."+
+			"**Configuring the operator**"+
+			"To configure the operator, set the configResources parameter to your core-site.xml file."+
+			consistentCutIntroducer+
+			"The operator has no consistent region support.";
+	private static final String INPUT_PARAM_NAME = "file";
+	private static final String FILETYPE_PARAM_NAME="fileType";
+	Logger logger = Logger.getLogger(this.getClass());
+
+	public static enum FileType {
+		text,
+		sequence
+	}
+
+	private FileType fileType = FileType.text;
+
+	@Parameter(name=FILETYPE_PARAM_NAME,description="Type of file. Use text for compressed and uncompressed text files and sequence for sequence files",optional=true)
+	public void setFileType(FileType type) {
+		fileType = type;
+	}
+
+	private static final String CONFIG_RESOURCES_PARAM_NAME = "configResources";
+
+	List<InputSplit> splits;
+	/**
+	 * Thread for calling produceTuples() to produce tuples
+	 */
+	private Thread processThread;
+
+	private String inputFiles[];
+	private String configResources[];
+	FileInputFormat inputFormat;
+	int keyIndex = -1;
+	int valueIndex = -1;
+
+	int channel = -1;
+	int maxChannels = -1;
+	Configuration conf;
+
+	@Parameter(name=INPUT_PARAM_NAME,description="Paths to be used as input")
+	public void setFile(String[] files ) {
+		inputFiles = files;
+	}
+
+	@Parameter(name=CONFIG_RESOURCES_PARAM_NAME,optional=true,description="Resources to be added to the configuration")
+	public void setResources(String[] paths) {
+		configResources = paths;
+	}
+
+	@ContextCheck(compile=true)
+	public static void compileChecks(OperatorContextChecker checker) {
+		StreamSchema outSchema= checker.getOperatorContext().getStreamingOutputs().get(0).getStreamSchema();
+
+		Attribute keyAttr = outSchema.getAttribute("key");
+		Attribute valueAttr = outSchema.getAttribute("value");
+
+		if (keyAttr == null && valueAttr == null) {
+			checker.setInvalidContext("Either key or value must be on output stream ", new Object[0]);
+		}
+
+		if (checker.getOperatorContext().getNumberOfStreamingOutputs() != 1) {
+			checker.setInvalidContext("Number of streaming outputs must be 1",new Object[0]);
+		}
+	}
+
+	@ContextCheck(compile=false)
+	public static void runtimeCheck(OperatorContextChecker checker) {
+		StreamSchema outSchema= checker.getOperatorContext().getStreamingOutputs().get(0).getStreamSchema();
+
+		Attribute keyAttr = outSchema.getAttribute("key");
+		Attribute valueAttr = outSchema.getAttribute("value");
+
+		if (keyAttr != null && keyAttr.getType().getMetaType() != MetaType.INT64) {
+			checker.setInvalidContext("Type of key on output stream must be int64", new Object[0]);
+		}
+
+		if (valueAttr != null && valueAttr.getType().getMetaType() != MetaType.RSTRING
+				&& valueAttr.getType().getMetaType() != MetaType.USTRING) {
+			checker.setInvalidContext("Type of value on output stream must be rstring or ustring", new Object[0]);
+		}
+	}
+
+	/**
+	 * Initialize this operator. Called once before any tuples are processed.
+	 * @param context OperatorContext for this operator.
+	 * @throws Exception Operator failure, will cause the enclosing PE to terminate.
+	 */
+	@Override
+	public synchronized void initialize(OperatorContext context)
+			throws Exception {
+		// Must call super.initialize(context) to correctly setup an operator.
+		super.initialize(context);
+		Logger.getLogger(this.getClass()).trace("Operator " + context.getName() + " initializing in PE: " + context.getPE().getPEId() + " in Job: " + context.getPE().getJobId() );
+
+		StreamingOutput<OutputTuple> out = context.getStreamingOutputs().get(0);
+		StreamSchema outSchema = out.getStreamSchema();
+
+		channel = context.getChannel();
+		maxChannels = context.getMaxChannels();
+
+		// This will make the for loop in produceTuples work right--we take splits
+		// channel + k*maxChannels.
+		if (channel < 0) {
+			channel = 0;
+			maxChannels = 1;
+		}
+
+		if (outSchema.getAttribute("key") != null) {
+			keyIndex = outSchema.getAttributeIndex("key");
+		}
+		if (outSchema.getAttribute("value") != null) {
+			valueIndex = outSchema.getAttributeIndex("value");
+		}
+
+		// Establish input splits.
+		try {
+			//Class<? extends FileInputFormat> inputFormatClass = (Class<? extends FileInputFormat>) Class.forName(inputFormatClassname);
+			//FileInputFormat inputFormat = inputFormatClass.newInstance();
+
+			if (fileType == FileType.text) {
+				inputFormat = new TextInputFormat();
+			}
+			else {
+				inputFormat = new SequenceFileInputFormat();
+			}
+
+			conf = new Configuration();
+			if (configResources != null ) {
+				for (String s : configResources) {
+					File toAdd = new File(s);
+					String pathToFile;
+					if (toAdd.isAbsolute()) {
+						pathToFile = toAdd.getAbsolutePath();
+					}
+					else {
+						pathToFile = context.getPE().getApplicationDirectory()+File.separator+s;
+						toAdd = new File(pathToFile);
+					}
+					if (!toAdd.exists()) {
+						throw new Exception("Specified configuration file "+s+" not found at "+pathToFile);
+					}
+					logger.info("Adding "+pathToFile+" as config resource");
+					conf.addResource(new Path(pathToFile));
+				}
+				String defaultFS = conf.get("fs.defaultFS");
+				if (!defaultFS.startsWith("hdfs")) {
+					logger.warn("Default file system not HDFS; may be a configuration problem");
+				}
+				logger.debug("Default file system is "+defaultFS);
+			}
+			Job job = Job.getInstance(conf);
+
+			for (String p : inputFiles) {
+				FileInputFormat.addInputPath(job, new Path(p));
+			}
+
+			splits = inputFormat.getSplits(job);
+			logger.info("There are "+splits.size()+" splits");
+		}
+		catch (IOException e ) {
+			throw e;
+		}
+
+		processThread = getOperatorContext().getThreadFactory().newThread(
+				new Runnable() {
+
+					@Override
+					public void run() {
+						try {
+							produceTuples();
+						} catch (Exception e) {
+							Logger.getLogger(this.getClass()).error("Operator error", e);
+						}
+					}
+
+				});
+
+		/*
+		 * Set the thread not to be a daemon to ensure that the SPL runtime
+		 * will wait for the thread to complete before determining the
+		 * operator is complete.
+		 */
+		processThread.setDaemon(false);
+	}
+
+	/**
+	 * Notification that initialization is complete and all input and output ports
+	 * are connected and ready to receive and submit tuples.
+	 * @throws Exception Operator failure, will cause the enclosing PE to terminate.
+	 */
+	@Override
+	public synchronized void allPortsReady() throws Exception {
+		OperatorContext context = getOperatorContext();
+		Logger.getLogger(this.getClass()).trace("Operator " + context.getName() + " all ports are ready in PE: " + context.getPE().getPEId() + " in Job: " + context.getPE().getJobId() );
+		// Start a thread for producing tuples because operator
+		// implementations must not block and must return control to the caller.
+		processThread.start();
+	}
+
+	/**
+	 * Submit new tuples to the output stream.
+	 * @throws Exception if an error occurs while submitting a tuple
+	 */
+	private void produceTuples() throws Exception {
+		final StreamingOutput<OutputTuple> out = getOutput(0);
+
+		for (int i = channel; i < splits.size(); i = i + maxChannels) {
+			if (logger.isInfoEnabled()) {
+				logger.info("Handling split "+i);
+			}
+			TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID("channel "+channel+" of "+maxChannels,0,TaskType.MAP,i,0));
+
+			RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(splits.get(i),context);
+			reader.initialize(splits.get(i), context);
+
+			while (reader.nextKeyValue()) {
+				OutputTuple toSend = out.newTuple();
+				// TODO set filename, if it makes sense.
+ if (keyIndex >= 0) { + toSend.setLong(keyIndex, reader.getCurrentKey().get()); + } + if (valueIndex >= 0) { + toSend.setString(valueIndex, reader.getCurrentValue().toString()); + } + out.submit(toSend); + } + out.punctuate(Punctuation.WINDOW_MARKER); + } + out.punctuate(Punctuation.FINAL_MARKER); + } + + /** + * Shutdown this operator, which will interrupt the thread + * executing the produceTuples() method. + * @throws Exception Operator failure, will cause the enclosing PE to terminate. + */ + public synchronized void shutdown() throws Exception { + if (processThread != null) { + processThread.interrupt(); + processThread = null; + } + OperatorContext context = getOperatorContext(); + Logger.getLogger(this.getClass()).trace("Operator " + context.getName() + " shutting down in PE: " + context.getPE().getPEId() + " in Job: " + context.getPE().getJobId() ); + + // TODO: If needed, close connections or release resources related to any external system or data store. + + // Must call super.shutdown() + super.shutdown(); + } +} diff --git a/com.ibm.streamsx.hdfs/info.xml b/com.ibm.streamsx.hdfs/info.xml index 54d56f4..2f73386 100644 --- a/com.ibm.streamsx.hdfs/info.xml +++ b/com.ibm.streamsx.hdfs/info.xml @@ -156,8 +156,8 @@ Alternatively, you can fully qualify the operators that are provided by toolkit 9. Run the application. You can submit the application as a job by using the **streamtool submitjob** command or by using Streams Studio. - 2.0.0 + 2.1.0 4.0.0.0 - \ No newline at end of file + diff --git a/com.ibm.streamsx.hdfs/pom.xml b/com.ibm.streamsx.hdfs/pom.xml index 4fdcc6e..5200153 100644 --- a/com.ibm.streamsx.hdfs/pom.xml +++ b/com.ibm.streamsx.hdfs/pom.xml @@ -21,8 +21,8 @@ org.apache.hadoop - hadoop-common - 2.1.2-SNAPSHOT + hadoop-client + 2.7.1 compile diff --git a/demos/WordCount/.project b/demos/WordCount/.project new file mode 100644 index 0000000..03df5a9 --- /dev/null +++ b/demos/WordCount/.project @@ -0,0 +1,34 @@ + + + WordCount + + + + + + org.eclipse.ui.externaltools.ExternalToolBuilder + full,incremental, + + + LaunchConfigHandle + <project>/.externalToolBuilders/org.eclipse.jdt.core.javabuilder.launch + + + + + com.ibm.streams.studio.splproject.builder.SPLProjectBuilder + + + + + org.eclipse.xtext.ui.shared.xtextBuilder + + + + + + com.ibm.streams.studio.splproject.SPLProjectNature + org.eclipse.xtext.ui.shared.xtextNature + org.eclipse.jdt.core.javanature + + diff --git a/demos/WordCount/ReadAndTokenize.spl b/demos/WordCount/ReadAndTokenize.spl new file mode 100644 index 0000000..f344ed1 --- /dev/null +++ b/demos/WordCount/ReadAndTokenize.spl @@ -0,0 +1,46 @@ +use com.ibm.streamsx.hdfs::HadoopReader ; +type EmailTuple = rstring ID, rstring From, rstring Date, rstring Subject, + rstring ToList, rstring CcList, rstring BccList, rstring Body ; + +composite ReadAndTokenize(output WordStream ) +{ + param + expression $inputFile ; + expression $configFile : "etc/core-site.xml" ; + graph + stream TuplesAsString = HadoopReader() + { + param + file : $inputFile ; + configResources : $configFile ; + } + + stream Body = Functor(TuplesAsString) + { + output + Body : + // body = "This is a test"; + body =((EmailTuple) value).Body ; + } + + stream WordStream = Custom(Body) + { + logic + onTuple Body : + { + list words = tokenize(body, ",\r\n \t", false) ; + for(rstring w in words) + { + submit({ word = w }, WordStream) ; + } + + } + + onPunct Body : + { + submit(currentPunct(), WordStream) ; + } + + } + +} diff --git a/demos/WordCount/WordCount.spl b/demos/WordCount/WordCount.spl 
new file mode 100644 index 0000000..7c2d208 --- /dev/null +++ b/demos/WordCount/WordCount.spl @@ -0,0 +1,66 @@ + + +composite Reduce(input InStream ; output WordCount) +{ + graph + stream Puncted = Custom(InStream) + { + logic + onTuple InStream : submit(InStream, Puncted) ; + onPunct InStream : + { + if(currentPunct() == Sys.FinalMarker) + { + submit(Sys.WindowMarker, Puncted) ; + submit(Sys.FinalMarker, Puncted) ; + } + + } + + } + + stream WordCount = Aggregate(Puncted as I) + { + window + I : tumbling, punct(), partitioned ; + param + partitionBy : word ; + output + WordCount : count = Count() ; + } + + () as sink = FileSink(WordCount) + { + param + file : "words" +(rstring) getChannel() + ".txt" ; + } + +} + +composite WordCount +{ + param + expression $numMappers :(int32) getSubmissionTimeValue("numMappers", + "1") ; + expression $numReducers :(int32) getSubmissionTimeValue("numReducers", + "1") ; + graph + @parallel(width = $numMappers) + stream WordStream = ReadAndTokenize() + { + param + inputFile : getSubmissionTimeValue("inputFile", "allEnron.txt") ; + config + placement : partitionColocation("map") ; + } + + @parallel(width = $numReducers, partitionBy = [ { port = WordStream, + attributes = [ word ] } ]) + stream WordCount = Reduce(WordStream) + { + config + placement : partitionColocation("reduce") ; + } + +} + diff --git a/demos/WordCount/info.xml b/demos/WordCount/info.xml new file mode 100644 index 0000000..8c30255 --- /dev/null +++ b/demos/WordCount/info.xml @@ -0,0 +1,21 @@ + + + + + WordCount + Show how to implement WordCount and WordCountWithCombine + 1.0.0 + 4.0 + + + + com.ibm.streamsx.hdfs + 2.1 + + + diff --git a/tests/HadoopReader/ReadSequenceParallel/Main.spl b/tests/HadoopReader/ReadSequenceParallel/Main.spl new file mode 100644 index 0000000..c3d853d --- /dev/null +++ b/tests/HadoopReader/ReadSequenceParallel/Main.spl @@ -0,0 +1,41 @@ +/* Copyright (C) 2015, International Business Machines Corporation */ +/* All Rights Reserved */ +use com.ibm.streamsx.hdfs::HadoopReader; +use hdfs.test.common::CheckCount; + +/** + * Use to check input file reading for both compressed and uncompressed files. + * Also checks that key-value output, key only, and value only output works. + */ + +composite ReadAndCount { + +param expression $inputFile: getSubmissionTimeValue("inputFile"); +expression $configFile: getSubmissionTimeValue("configFile"); +expression> $recordCountArray: (list)getSubmissionTimeValue("recordCountArray"); + +graph + +stream Both = HadoopReader() { +param +file: $inputFile; +configResources: $configFile; +fileType:sequence; +} + +() as bothCheck = CheckCount(Both) { +param +tupleCount: $recordCountArray[getChannel()]; +} +} + +composite Main { + +graph +@parallel(width=2) +() as channel = ReadAndCount() { + +} + +} + diff --git a/tests/HadoopReader/ReadSequenceSingle/Main.spl b/tests/HadoopReader/ReadSequenceSingle/Main.spl new file mode 100644 index 0000000..47d3dc0 --- /dev/null +++ b/tests/HadoopReader/ReadSequenceSingle/Main.spl @@ -0,0 +1,33 @@ +/* Copyright (C) 2015, International Business Machines Corporation */ +/* All Rights Reserved */ + +use com.ibm.streamsx.hdfs::HadoopReader; +use hdfs.test.common::CheckCount; + +/** + * Use to check input file reading for both compressed and uncompressed files. + * Also checks that key-value output, key only, and value only output works. 
+ */ + +composite Main { + +param expression $inputFile: getSubmissionTimeValue("inputFile"); +expression $configFile: getSubmissionTimeValue("configFile"); +expression $recordCount: (int32)getSubmissionTimeValue("recordCount"); + +graph + +stream Both = HadoopReader() { +param +file: $inputFile; +configResources: $configFile; +fileType:sequence; +} + +() as bothCheck = CheckCount(Both) { +param +tupleCount: $recordCount; +} + + +} diff --git a/tests/HadoopReader/ReadTextParallel/Main.spl b/tests/HadoopReader/ReadTextParallel/Main.spl new file mode 100644 index 0000000..30f5a2b --- /dev/null +++ b/tests/HadoopReader/ReadTextParallel/Main.spl @@ -0,0 +1,41 @@ +/* Copyright (C) 2015, International Business Machines Corporation */ +/* All Rights Reserved */ + +use com.ibm.streamsx.hdfs::HadoopReader; +use hdfs.test.common::CheckCount; + +/** + * Use to check input file reading for both compressed and uncompressed files. + * Also checks that key-value output, key only, and value only output works. + */ + +composite ReadAndCount { + +param expression $inputFile: getSubmissionTimeValue("inputFile"); +expression $configFile: getSubmissionTimeValue("configFile"); +expression> $recordCountArray: (list)getSubmissionTimeValue("recordCountArray"); + +graph + +stream Both = HadoopReader() { +param +file: $inputFile; +configResources: $configFile; +} + +() as bothCheck = CheckCount(Both) { +param +tupleCount: $recordCountArray[getChannel()]; +} +} + +composite Main { + +graph +@parallel(width=2) +() as channel = ReadAndCount() { + +} + +} + diff --git a/tests/HadoopReader/ReadTextSingle/Main.spl b/tests/HadoopReader/ReadTextSingle/Main.spl new file mode 100644 index 0000000..4de746b --- /dev/null +++ b/tests/HadoopReader/ReadTextSingle/Main.spl @@ -0,0 +1,51 @@ +/* Copyright (C) 2015, International Business Machines Corporation */ +/* All Rights Reserved */ + +use com.ibm.streamsx.hdfs::HadoopReader; +use hdfs.test.common::CheckCount; + +/** + * Use to check input file reading for both compressed and uncompressed files. + * Also checks that key-value output, key only, and value only output works. 
+ */ + +composite Main { + +param expression $inputFile: getSubmissionTimeValue("inputFile"); +expression $configFile: getSubmissionTimeValue("configFile"); +expression $recordCount: (int32)getSubmissionTimeValue("recordCount"); + +graph + +stream Both = HadoopReader() { +param +file: $inputFile; +configResources: $configFile; +} + +() as bothCheck = CheckCount(Both) { +param +tupleCount: $recordCount; +} + +stream ValueOnly = HadoopReader() { +param file: $inputFile; +configResources: $configFile; +} + +() as valueCheck = CheckCount(ValueOnly) { +param +tupleCount: $recordCount; +} + + +stream KeyOnly = HadoopReader() { +param file: $inputFile; +configResources: $configFile; +} + +() as keyCheck = CheckCount(KeyOnly) { +param tupleCount: $recordCount; +} + +} diff --git a/tests/hdfs.test.common/hdfs.test.common/CheckCount.spl b/tests/hdfs.test.common/hdfs.test.common/CheckCount.spl new file mode 100644 index 0000000..0eabddd --- /dev/null +++ b/tests/hdfs.test.common/hdfs.test.common/CheckCount.spl @@ -0,0 +1,27 @@ +/* Copyright (C) 2015, International Business Machines Corporation */ +/* All Rights Reserved */ + +namespace hdfs.test.common; + +public composite CheckCount(input InStream) { + +param +expression $tupleCount; + +graph +() as checker = Custom(InStream) { + +logic state: { + mutable int32 numTuples = 0; + int32 targetNumber = $tupleCount; +} +onTuple InStream: { + numTuples++; +} +onPunct InStream: + if (currentPunct() == Sys.FinalMarker) { + assert(numTuples==targetNumber, "Received "+(rstring)numTuples+" but expected "+(rstring)targetNumber); + } +} + +}