diff --git a/pom.xml b/pom.xml
index 95af5cd..839365b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -39,6 +39,16 @@
             <version>8.2.0</version>
+
+            <!-- commons-codec provides DigestUtils, used by Metadata
+                 to write per-file MD5 checksums for generated indexes. -->
+            <dependency>
+                <groupId>commons-codec</groupId>
+                <artifactId>commons-codec</artifactId>
+                <version>1.9</version>
+            </dependency>
+
+
@@ -135,6 +145,11 @@
             <version>1.8.3</version>
+        <dependency>
+            <groupId>commons-codec</groupId>
+            <artifactId>commons-codec</artifactId>
+        </dependency>
+
diff --git a/src/main/java/com/scaleunlimited/cascading/scheme/core/Metadata.java b/src/main/java/com/scaleunlimited/cascading/scheme/core/Metadata.java
new file mode 100644
index 0000000..c7dbf64
--- /dev/null
+++ b/src/main/java/com/scaleunlimited/cascading/scheme/core/Metadata.java
@@ -0,0 +1,57 @@
+package com.scaleunlimited.cascading.scheme.core;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+
+import org.apache.commons.codec.digest.DigestUtils;
+
+/**
+ * Utility for writing index metadata alongside a generated Solr index.
+ */
+public class Metadata {
+
+    public static final String MD5_FILE_NAME = ".md5";
+
+    /**
+     * Writes a ".md5" file into <code>partDir</code> listing the MD5 hash of
+     * every file in partDir's "index" subdirectory, one "name\tmd5hex" line
+     * per file, UTF-8 encoded.
+     *
+     * @param partDir directory containing an "index" subdirectory.
+     * @return the metadata file that was written.
+     * @throws IOException if the index directory can't be listed or writing fails.
+     */
+    public static File writeMetadata(File partDir) throws IOException {
+        File md5File = new File(partDir, Metadata.MD5_FILE_NAME);
+        OutputStream fos = new FileOutputStream(md5File);
+        OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
+        try {
+            File indexDir = new File(partDir, "index");
+            File[] indexFiles = indexDir.listFiles();
+            // listFiles() returns null if the directory doesn't exist or
+            // can't be read - fail with a useful message instead of an NPE.
+            if (indexFiles == null) {
+                throw new IOException("Can't list index directory: " + indexDir);
+            }
+            for (File indexFile : indexFiles) {
+                InputStream is = new FileInputStream(indexFile);
+                String md5 = null;
+                try {
+                    md5 = DigestUtils.md5Hex(is);
+                } finally {
+                    is.close();
+                }
+                osw.write(indexFile.getName() + "\t" + md5 + "\n");
+            }
+        } finally {
+            // Closing the writer also closes the underlying output stream.
+            osw.close();
+        }
+        return md5File;
+    }
+
+}
diff --git a/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrOutputFormat.java b/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrOutputFormat.java
index 3fda3ff..5bc24c2 100644
--- a/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrOutputFormat.java
+++ b/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrOutputFormat.java
@@ -16,27 +16,30 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import cascading.flow.hadoop.util.HadoopUtil;
-import cascading.tuple.Fields;
-import cascading.tuple.Tuple;
-
import com.scaleunlimited.cascading.scheme.core.KeepAliveHook;
+import com.scaleunlimited.cascading.scheme.core.Metadata;
import com.scaleunlimited.cascading.scheme.core.SolrSchemeUtil;
import com.scaleunlimited.cascading.scheme.core.SolrWriter;
+import cascading.flow.hadoop.util.HadoopUtil;
+import cascading.tuple.Fields;
+import cascading.tuple.Tuple;
+
public class SolrOutputFormat extends FileOutputFormat<Tuple, Tuple> {
private static final Logger LOGGER = LoggerFactory.getLogger(SolrOutputFormat.class);
public static final String SOLR_CONF_PATH_KEY = "com.scaleunlimited.cascading.solr.confPath";
public static final String SINK_FIELDS_KEY = "com.scaleunlimited.cascading.solr.sinkFields";
public static final String MAX_SEGMENTS_KEY = "com.scaleunlimited.cascading.solr.maxSegments";
-
+ public static final String INCLUDE_METADATA_KEY = "com.scaleunlimited.cascading.solr.includeMetadata";
+
public static final int DEFAULT_MAX_SEGMENTS = 10;
    private static class SolrRecordWriter implements RecordWriter<Tuple, Tuple> {
private Path _outputPath;
private FileSystem _outputFS;
+ private boolean _isIncludeMetadata;
private transient KeepAliveHook _keepAliveHook;
private transient File _localIndexDir;
@@ -59,7 +62,9 @@ public SolrRecordWriter(JobConf conf, String name, Progressable progress) throws
// Get the set of fields we're indexing.
Fields sinkFields = HadoopUtil.deserializeBase64(conf.get(SINK_FIELDS_KEY), conf, Fields.class);
+ // Load optional configuration parameters.
int maxSegments = conf.getInt(MAX_SEGMENTS_KEY, DEFAULT_MAX_SEGMENTS);
+ _isIncludeMetadata = conf.getBoolean(INCLUDE_METADATA_KEY, false);
// Set up local Solr home.
File localSolrHome = SolrSchemeUtil.makeTempSolrHome(localSolrConf, null);
@@ -102,6 +107,13 @@ private void copyToHDFS() throws IOException {
Thread reporterThread = startProgressThread();
try {
+ if (_isIncludeMetadata) {
+ File localMetadataFile = Metadata.writeMetadata(_localIndexDir);
+ Path metadataPath = new Path(_outputPath.getParent(), Metadata.MD5_FILE_NAME);
+ LOGGER.info(String.format("Copying index metadata from %s to %s", _localIndexDir, metadataPath));
+ _outputFS.copyFromLocalFile(true, new Path(localMetadataFile.getAbsolutePath()), metadataPath);
+ }
+
long indexSize = FileUtils.sizeOfDirectory(indexDir);
LOGGER.info(String.format("Copying %d bytes of index from %s to %s", indexSize, _localIndexDir, _outputPath));
_outputFS.copyFromLocalFile(true, new Path(indexDir.getAbsolutePath()), _outputPath);
diff --git a/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrScheme.java b/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrScheme.java
index f8758a5..bee1c2a 100644
--- a/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrScheme.java
+++ b/src/main/java/com/scaleunlimited/cascading/scheme/hadoop/SolrScheme.java
@@ -12,6 +12,8 @@
import org.apache.hadoop.mapred.RecordReader;
import org.xml.sax.SAXException;
+import com.scaleunlimited.cascading.scheme.core.SolrSchemeUtil;
+
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.scheme.Scheme;
@@ -24,23 +26,31 @@
import cascading.tuple.Tuple;
import cascading.util.Util;
-import com.scaleunlimited.cascading.scheme.core.SolrSchemeUtil;
-
@SuppressWarnings("serial")
public class SolrScheme extends Scheme<JobConf, RecordReader<Tuple, Tuple>, OutputCollector<Tuple, Tuple>, Object[], Void> {
-
+
private File _solrConfDir;
private int _maxSegments;
-
+ private boolean _isIncludeMetadata;
+
public SolrScheme(Fields schemeFields, String solrConfDir) throws IOException, ParserConfigurationException, SAXException {
this(schemeFields, solrConfDir, SolrOutputFormat.DEFAULT_MAX_SEGMENTS);
}
public SolrScheme(Fields schemeFields, String solrConfDir, int maxSegments) throws IOException, ParserConfigurationException, SAXException {
+        this(schemeFields, solrConfDir, maxSegments, false); // forward the caller's maxSegments, not DEFAULT_MAX_SEGMENTS
+    }
+
+ public SolrScheme(Fields schemeFields, String solrConfDir, boolean isIncludeMetadata) throws IOException, ParserConfigurationException, SAXException {
+ this(schemeFields, solrConfDir, SolrOutputFormat.DEFAULT_MAX_SEGMENTS, isIncludeMetadata);
+ }
+
+ public SolrScheme(Fields schemeFields, String solrConfDir, int maxSegments, boolean isIncludeMetadata) throws IOException, ParserConfigurationException, SAXException {
super(schemeFields, schemeFields);
_solrConfDir = new File(solrConfDir);
_maxSegments = maxSegments;
+ _isIncludeMetadata = isIncludeMetadata;
SolrSchemeUtil.validate(_solrConfDir, schemeFields);
}
@@ -87,6 +97,7 @@ public void sinkConfInit(FlowProcess flowProcess, Tap conf, SourceCall