Skip to content

Commit

Permalink
Fully working file crusher!
Browse files Browse the repository at this point in the history
  • Loading branch information
edwardcapriolo committed Nov 30, 2011
1 parent f07ac28 commit 6320d46
Show file tree
Hide file tree
Showing 29 changed files with 8,558 additions and 0 deletions.
2 changes: 2 additions & 0 deletions NOTICE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Hadoop Filecrush
Copyright 2010, 2011 m6d Media6degrees
3 changes: 3 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Hadoop Filecrush.

Turn many small files into fewer larger ones. Also change from text to sequence and other compression options in one pass.
188 changes: 188 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.m6d</groupId>
<artifactId>filecrush</artifactId>
<name>M6D App - Filecrush</name>
<version>2.2.2-SNAPSHOT</version>
<description>filecrush utility</description>
<packaging>jar</packaging>
<properties>
<hadoop.version>0.20.2</hadoop.version>
<commons-cli.version>1.2</commons-cli.version>
<commons-logging.version>1.0.4</commons-logging.version>
<commons-lang.version>2.3</commons-lang.version>
<commons-httpclient.version>3.0.1</commons-httpclient.version>
<log4j.version>1.2.13</log4j.version>
<slf4j.version>1.6.1</slf4j.version>
<plexus-utils.version>1.1</plexus-utils.version>
<junit.version>4.8.2</junit.version>
<mockito.version>1.8.5</mockito.version>
<hamcrest.version>1.2</hamcrest.version>
<easymock.version>3.0</easymock.version>
<jetty.version>6.1.14</jetty.version>
</properties>

<build>

<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-eclipse-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<projectNameTemplate>[artifactId]</projectNameTemplate>
<wtpmanifest>true</wtpmanifest>
<wtpapplicationxml>true</wtpapplicationxml>
<wtpversion>1.5</wtpversion>
<additionalBuildcommands>
<buildcommand>org.eclipse.jdt.core.javabuilder</buildcommand>
<buildcommand>org.maven.ide.eclipse.maven2Builder</buildcommand>
</additionalBuildcommands>
<additionalProjectnatures>
<projectnature>org.eclipse.jdt.core.javanature</projectnature>
<projectnature>org.maven.ide.eclipse.maven2Nature</projectnature>
</additionalProjectnatures>
</configuration>
</plugin>
</plugins>
</pluginManagement>

<plugins>

<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.6</source>
<target>1.6</target>
</configuration>
</plugin>

<plugin>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
</archive>
</configuration>
<executions>
<execution>
<id>jar</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>


<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons-logging.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>${commons-httpclient.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>${commons-lang.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.plexus</groupId>
<artifactId>plexus-utils</artifactId>
<version>${plexus-utils.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>${commons-cli.version}</version>
<scope>provided</scope>
</dependency>
<!-- test dependencies -->
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<version>${mockito.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-core</artifactId>
<version>${hamcrest.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest-library</artifactId>
<version>${hamcrest.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.easymock</groupId>
<artifactId>easymock</artifactId>
<version>${easymock.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-test</artifactId>
<version>${hadoop.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
<scope>test</scope>
</dependency>
<!-- Needed to run Hadoop cluster test cases -->
<dependency>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty</artifactId>
<version>${jetty.version}</version>
<scope>test</scope>
</dependency>
<!-- Needed to run Hadoop cluster test cases -->
<dependency>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId>
<version>${jetty.version}</version>
<scope>test</scope>
</dependency>

</dependencies>
</project>
167 changes: 167 additions & 0 deletions src/main/java/com/m6d/filecrush/clean/Clean.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
/*
Copyright 2011 m6d.com
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.m6d.filecrush.clean;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

@SuppressWarnings("deprecation")
public class Clean extends Configured implements Tool {

	/** Configuration key: root directory whose contents will be cleaned. Required. */
	public static final String TARGET_DIR = "clean.target.dir";
	/**
	 * Configuration key: age cutoff in milliseconds. Entries whose modification
	 * time is older than (now - value) are deleted. Mutually exclusive with
	 * {@link #TARGET_EXPR}. (Note: "CUTTOFF" spelling is kept for compatibility.)
	 */
	public static final String CUTTOFF_MILLIS = "clean.cutoff.millis";
	/**
	 * Configuration key: regular expression matched against entry names; matching
	 * entries are deleted. Mutually exclusive with {@link #CUTTOFF_MILLIS}.
	 */
	public static final String TARGET_EXPR = "clean.target.expr";
	/** Configuration key: when true, print what would be deleted instead of deleting. */
	public static final String WARN_MODE = "clean.warn.mode";

	protected FileSystem fs;
	protected Configuration conf;
	// Absolute epoch millis: entries modified before this instant are deletion candidates.
	protected long cutoff;

	public Clean() {
		super();
	}

	/**
	 * Command line entry point. Exits with the code returned by {@link #run(String[])}.
	 */
	public static void main(String[] args) throws Exception {
		Clean clean = new Clean();
		int exitCode = ToolRunner.run(new Configuration(), clean, args);
		System.exit(exitCode);
	}

	/**
	 * Validates configuration, computes the age cutoff if requested, then
	 * recursively cleans the target directory.
	 *
	 * @return 0 on success, a non-zero pre-flight or cleanup error code otherwise
	 */
	@Override
	public int run(String[] args) throws Exception {
		conf = getConf();

		try {
			fs = FileSystem.get(conf);
		} catch (IOException e) {
			// Preserve the cause so the failure reason appears in the stack trace.
			throw new RuntimeException("Could not open filesystem", e);
		}
		int pre = preFlightCheck();
		if (pre != 0) {
			return pre;
		}

		String millis = conf.get(CUTTOFF_MILLIS);
		if (millis != null) {
			// Convert the relative age into an absolute modification-time threshold.
			cutoff = System.currentTimeMillis() - Long.parseLong(millis);
		}

		return cleanup(new Path(conf.get(TARGET_DIR)));
	}

	/**
	 * Deletes the given path (recursively for directories), or only prints it
	 * when {@link #WARN_MODE} is enabled. The target root itself is never deleted.
	 */
	public void warnOrDelete(Path p) throws IOException {
		if (conf.getBoolean(WARN_MODE, false)) {
			System.out.println("DELETE " + p);
		} else if (!p.equals(new Path(conf.get(TARGET_DIR)))) {
			// delete(Path, boolean) replaces the deprecated one-arg delete, which
			// in Hadoop 0.20 delegated to delete(p, true) — same behavior.
			fs.delete(p, true);
		}
	}

	/**
	 * Recursively cleans {@code p}: matching files are deleted first, then
	 * directories that are empty afterwards and also match are deleted.
	 *
	 * @return 0 on success, 7 on IOException; child failures are propagated
	 */
	public int cleanup(Path p) {
		try {
			if (fs.isFile(p) && matchesTarget(p)) {
				warnOrDelete(p);
			}

			if (fs.isDirectory(p)) {
				for (FileStatus stat : fs.listStatus(p)) {
					int rc = cleanup(stat.getPath());
					if (rc != 0) {
						// Bug fix: child return codes were previously ignored, so
						// failures deep in the tree still produced exit code 0.
						return rc;
					}
				}
				// Re-list after recursion: deleting children may have emptied this
				// directory, making it a deletion candidate itself.
				if (fs.listStatus(p).length == 0 && matchesTarget(p)) {
					warnOrDelete(p);
				}
			}
		} catch (IOException e) {
			System.out.println("exception " + e);
			return 7;
		}
		return 0;
	}

	/**
	 * True when the entry matches the configured name expression or is older
	 * than the cutoff. Pre-flight guarantees at most one criterion is set, so
	 * OR-combining cannot cause a double delete.
	 */
	private boolean matchesTarget(Path p) throws IOException {
		String expr = conf.get(TARGET_EXPR);
		if (expr != null && p.getName().matches(expr)) {
			return true;
		}
		return conf.get(CUTTOFF_MILLIS) != null
				&& fs.getFileStatus(p).getModificationTime() < cutoff;
	}

	/**
	 * Validates the job configuration before any deletion happens.
	 *
	 * @return 0 when the configuration is usable, otherwise a distinct non-zero
	 *         code identifying the problem (1-4, 6, 9, 15)
	 */
	public int preFlightCheck() {
		Configuration conf = getConf();
		if (conf.get(TARGET_DIR) == null) {
			System.err.println("You must specify a target.dir");
			return 1;
		}
		if (conf.get(TARGET_DIR).equals("/")) {
			// Refuse to operate on the filesystem root.
			System.err.println("Will not clean / !!!!!!");
			return 2;
		}
		if (fs.getHomeDirectory().equals(new Path(conf.get(TARGET_DIR)))) {
			System.err.println("Will not clean home directory");
			return 3;
		}
		if (conf.get(CUTTOFF_MILLIS) == null && conf.get(TARGET_EXPR) == null) {
			System.err.println("You must specify " + CUTTOFF_MILLIS + " or " + TARGET_EXPR);
			return 4;
		}
		if (conf.get(CUTTOFF_MILLIS) != null && conf.get(TARGET_EXPR) != null) {
			// The two criteria are mutually exclusive by design.
			System.err.println("You can not specify " + CUTTOFF_MILLIS + " and " + TARGET_EXPR);
			return 9;
		}
		if (conf.get(CUTTOFF_MILLIS) != null) {
			try {
				Long.parseLong(conf.get(CUTTOFF_MILLIS));
			} catch (NumberFormatException ex) {
				System.err.println(CUTTOFF_MILLIS + " was specified as " + conf.get(CUTTOFF_MILLIS) + " this is not a long integer");
				return 15;
			}
		}
		try {
			if (!fs.exists(new Path(conf.get(TARGET_DIR)))) {
				// Deliberately non-fatal: a missing target means nothing to clean.
				System.err.println(conf.get(TARGET_DIR) + " does not exist");
			}
		} catch (IOException e) {
			System.err.println("IOEXCEPTION" + e);
			return 6;
		}
		return 0;
	}

}
Loading

0 comments on commit 6320d46

Please sign in to comment.