-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* cache hash digests * add `bitwiseDifference` call
- Loading branch information
1 parent
36a7874
commit 61e1f04
Showing
5 changed files
with
140 additions
and
68 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,17 +2,20 @@ | |
|
||
import java.util.*; | ||
|
||
import org.apache.commons.codec.DecoderException; | ||
import javax.xml.bind.DatatypeConverter; | ||
|
||
import org.apache.commons.codec.binary.Hex; | ||
import org.apache.commons.lang3.ArrayUtils; | ||
import org.apache.commons.lang3.builder.EqualsBuilder; | ||
import org.apache.commons.lang3.builder.HashCodeBuilder; | ||
|
||
/** | ||
* Computes the Nilsimsa hash for the given string. | ||
* @author Albert Weichselbraun <[email protected]> | ||
* <[email protected]> | ||
* | ||
* This class is a translation of the Python implementation by Michael Itz | ||
* to the Java language <http://code.google.com/p/py-nilsimsa>. | ||
* This class is based on the Python implementation by Michael Itz | ||
* <http://code.google.com/p/py-nilsimsa>. | ||
* | ||
* Original C nilsimsa-0.2.4 implementation by cmeclax: | ||
* <http://ixazon.dynip.com/~cmeclax/nilsimsa.html> | ||
|
@@ -34,12 +37,13 @@ | |
*/ | ||
public class Nilsimsa { | ||
|
||
private int count = 0; // num characters seen | ||
private int[] acc = new int[256]; // accumulators for computing the digest | ||
private int[] lastch = new int[4]; // the last four seen characters | ||
private int count = 0; // num characters seen | ||
private int[] acc = new int[256]; // accumulators for computing the digest | ||
private int[] lastch = new int[4]; // the last four seen characters | ||
private byte[] digest = null; // the Nilsimsa digest | ||
|
||
// pre-defined transformation arrays | ||
private static final byte[] TRAN = Nilsimsa._getByteArray( | ||
private static final byte[] TRAN = DatatypeConverter.parseHexBinary( | ||
"02D69E6FF91D04ABD022161FD873A1AC" + | ||
"3B7062961E6E8F399D05144AA6BEAE0E" + | ||
"CFB99C9AC76813E12DA4EB518D646B50" + | ||
|
@@ -61,24 +65,6 @@ public Nilsimsa() { | |
reset(); | ||
} | ||
|
||
/** | ||
* Creates a new Nilsimsa object and feeds the given byte array into it. | ||
* The digest bytes are not returned directly; obtain them from the | ||
* returned object via digest() or hexdigest(). | ||
* @param data the bytes to hash | ||
* @return the new Nilsimsa object, as returned by update(data) | ||
*/ | ||
public static Nilsimsa getHash(byte[] data) { | ||
return new Nilsimsa().update(data); | ||
} | ||
|
||
/** | ||
* Convenience overload of getHash(byte[]) for Strings. | ||
* The String is converted with the no-argument String.getBytes(), i.e. with | ||
* the JVM's default charset, so non-ASCII input may hash differently on | ||
* platforms whose default charset differs. | ||
* @param s the String to hash | ||
* @return the Nilsimsa object returned by getHash(byte[]) for the String's bytes | ||
*/ | ||
public static Nilsimsa getHash(String s) { | ||
return getHash(s.getBytes()); | ||
} | ||
|
||
/** | ||
* Updates the Nilsimsa digest using the given String | ||
* @param s: the String data to consider in the update | ||
|
@@ -110,6 +96,7 @@ public Nilsimsa update(byte[] data) { | |
} | ||
lastch[0] = ch; | ||
} | ||
digest = null; | ||
return this; | ||
} | ||
|
||
|
@@ -124,23 +111,10 @@ public Nilsimsa reset() { | |
count = 0; | ||
Arrays.fill(acc, (byte) 0); | ||
Arrays.fill(lastch, -1); | ||
this.digest = null; | ||
return this; | ||
} | ||
|
||
/* | ||
* Converts the given hexString to a byte array using Hex.decodeHex. | ||
* @param hexString: the hexString to convert | ||
* @return the corresponding byte array, or null if a DecoderException | ||
* is thrown while decoding (the stack trace is printed and the | ||
* exception is not propagated) | ||
*/ | ||
private static byte[] _getByteArray( String hexString ) { | ||
try { | ||
return Hex.decodeHex( hexString.toCharArray()); | ||
} catch (DecoderException e) { | ||
// NOTE(review): the failure is swallowed here; callers receive null. | ||
e.printStackTrace(); | ||
return null; | ||
} | ||
} | ||
|
||
/** | ||
* Accumulator for a transition n between the chars a, b, c | ||
*/ | ||
|
@@ -153,9 +127,12 @@ private int _tran3(int a, int b, int c, int n) { | |
* @return the digest for the current Nilsimsa object. | ||
*/ | ||
public byte[] digest() { | ||
if (digest != null) { | ||
return digest; | ||
} | ||
int total = 0; | ||
int threshold; | ||
byte[] digest = new byte[32]; | ||
digest = new byte[32]; | ||
Arrays.fill(digest, (byte)0); | ||
|
||
if (count == 3) { | ||
|
@@ -174,46 +151,85 @@ public byte[] digest() { | |
} | ||
ArrayUtils.reverse( digest ); | ||
return digest; | ||
} | ||
|
||
/** | ||
* Computes the Nilsimsa digest for the given byte array. | ||
* The object is reset first, so anything passed to update() before this | ||
* call does not contribute to the result. | ||
* @param data: an array of bytes to hash | ||
* @return the Nilsimsa digest, as returned by digest(). | ||
*/ | ||
public byte[] digest(byte[] data) { | ||
reset(); | ||
update(data); | ||
return digest(); | ||
} | ||
|
||
/** | ||
* Creates a new Nilsimsa object and feeds the given byte array into it. | ||
* The digest bytes are not returned directly; obtain them from the | ||
* returned object via digest() or hexdigest(). | ||
* @param data the bytes to hash | ||
* @return the new Nilsimsa object, as returned by update(data) | ||
*/ | ||
public static Nilsimsa getHash(byte[] data) { | ||
return new Nilsimsa().update(data); | ||
} | ||
|
||
/** | ||
* @return a String representation of the current state of | ||
* the Nilsimsa object. | ||
* Computes the Nilsimsa digest for the given String. | ||
* @param s | ||
* @return | ||
*/ | ||
public String hexdigest() { | ||
return Hex.encodeHexString( digest() ); | ||
public static Nilsimsa getHash(String s) { | ||
return getHash(s.getBytes()); | ||
} | ||
|
||
|
||
/** | ||
* Compute the Nilsimsa digest for the given String. | ||
* @param s: the String to hash | ||
* @return the Nilsimsa digest. | ||
*/ | ||
public byte[] digest(String s) { | ||
reset(); | ||
update(s); | ||
return digest(); | ||
return digest(s.getBytes()); | ||
} | ||
|
||
|
||
/** | ||
* Returns the current digest as a hexadecimal String. | ||
* @return the bytes returned by digest(), encoded with | ||
* Hex.encodeHexString. | ||
*/ | ||
public String hexdigest() { | ||
return Hex.encodeHexString(digest()); | ||
} | ||
|
||
/** | ||
* Computes the Nilsimsa hexdigest for the given byte array. | ||
* Calls digest(data) and then returns hexdigest() for the | ||
* resulting state. | ||
* @param data: an array of bytes to hash | ||
* @return the Nilsimsa hexdigest. | ||
*/ | ||
public String hexdigest(byte[] data) { | ||
digest(data); | ||
return hexdigest(); | ||
} | ||
|
||
|
||
/** | ||
* Compute the Nilsimsa hexDigest for the given String. | ||
* @param s: the String to hash | ||
* @return the Nilsimsa hexdigest. | ||
*/ | ||
public String hexdigest(String s) { | ||
return Hex.encodeHexString( digest(s) ); | ||
digest(s); | ||
return hexdigest(); | ||
} | ||
|
||
/** | ||
* Compares a Nilsimsa object to the current one and | ||
* return the number of bits that differ. | ||
* @param cmp: the comparison object | ||
* @return the number of bits in the strings which differ. | ||
* @param cmp: | ||
* the comparison object | ||
* @return | ||
* the number of bits in which the Nilsimsa digests differ. | ||
*/ | ||
public int compare(Nilsimsa cmp) { | ||
public int bitwiseDifference(Nilsimsa cmp) { | ||
int distance = 0; | ||
int h1, h2; | ||
|
||
|
@@ -227,5 +243,32 @@ public int compare(Nilsimsa cmp) { | |
} | ||
return distance; | ||
} | ||
|
||
/** | ||
* Returns a similarity score between -128 and +128 for the Nilsimsa | ||
* digest of the current object and that of cmp. The score is computed | ||
* as 128 minus bitwiseDifference(cmp), so higher values mean more matching bits. | ||
* @param cmp: | ||
* comparison object | ||
* @return | ||
* a value between -128 (no matching bits) and 128 (all bits match; both hashes are equal) | ||
*/ | ||
public int compare(Nilsimsa cmp) { | ||
return 128 - bitwiseDifference(cmp); | ||
} | ||
|
||
/** | ||
* Compares this object with o based on their digests. | ||
* Returns false for null and for objects of a different class, true for | ||
* the same instance, and otherwise the result of an EqualsBuilder that is | ||
* given digest() of both objects. | ||
* @param o the object to compare with | ||
* @return whether o is considered equal to this object | ||
*/ | ||
@Override | ||
public boolean equals(Object o) { | ||
if (o == null) { return false; } | ||
if (o == this) { return true; } | ||
// NOTE(review): the ';' after the closing brace below is an empty statement and can be dropped. | ||
if (o.getClass() != getClass()) { return false; }; | ||
|
||
return new EqualsBuilder() | ||
.append(digest(), ((Nilsimsa)o).digest()).isEquals(); | ||
} | ||
|
||
/** | ||
* Computes the hash code from digest() using a HashCodeBuilder, | ||
* i.e. from the same value that equals() compares. | ||
* @return the hash code built from the current digest | ||
*/ | ||
@Override | ||
public int hashCode() { | ||
return new HashCodeBuilder().append(digest()).toHashCode(); | ||
} | ||
|
||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,21 +3,24 @@ | |
import static org.junit.Assert.*; | ||
|
||
import java.io.*; | ||
import java.net.URISyntaxException; | ||
import java.net.URL; | ||
import java.nio.charset.Charset; | ||
import java.nio.file.Files; | ||
import java.nio.file.Paths; | ||
import java.util.*; | ||
|
||
import org.apache.commons.codec.Charsets; | ||
import org.apache.commons.io.FileUtils; | ||
import org.junit.Test; | ||
|
||
/** | ||
* Test the Nilsimsa algorithm | ||
* @author Albert Weichselbraun | ||
* @author Albert Weichselbraun <[email protected]> | ||
* | ||
*/ | ||
public class NilsimsaTest { | ||
|
||
private final static String CONTENT_ENCODING = "UTF8"; | ||
private final static Charset CONTENT_ENCODING = Charsets.UTF_8; | ||
private final static String[] TEST_DATA = { | ||
"73302df80673894c115249b1f880abb1ec2b09f1c9726e642b690291e636fe6f c", | ||
"67b02df81323816c51019d71da92612dede05cf1cd20fb042b218310e61368ef hmac", | ||
|
@@ -71,14 +74,43 @@ public void differenceTest() { | |
Nilsimsa referenceHash = Nilsimsa.getHash(referenceString); | ||
// System.out.print("{"); | ||
for (int j=0; j<TEST_DATA.length; j++) { | ||
int distance = referenceHash.compare(Nilsimsa.getHash(TEST_DATA[j])); | ||
int distance = referenceHash.bitwiseDifference(Nilsimsa.getHash(TEST_DATA[j])); | ||
// System.out.print(distance + ", "); | ||
assertEquals(REFERENCE_DISTANCE[i][j], distance); | ||
} | ||
// System.out.println("}, "); | ||
} | ||
} | ||
|
||
/** | ||
* Hashes every pair of TEST_DATA entries and checks that equals() and | ||
* hashCode() agree for identical entries and differ for distinct ones. | ||
*/ | ||
@Test | ||
public void equalsAndHashCodeTest() { | ||
Nilsimsa h1, h2; | ||
|
||
// compare the hash of each entry against the hash of every entry (including itself) | ||
for (int i=0; i<TEST_DATA.length; i++) { | ||
h1 = Nilsimsa.getHash(TEST_DATA[i]); | ||
for (int j=0; j<TEST_DATA.length; j++) { | ||
h2 = Nilsimsa.getHash(TEST_DATA[j]); | ||
if (j == i) { | ||
assertEquals(h1, h2); | ||
assertEquals(h1.hashCode(), h2.hashCode()); | ||
} else { | ||
assertNotEquals(h1, h2); | ||
// NOTE(review): the hashCode contract does not require unequal objects to have | ||
// different hash codes; this assertion relies on no collision in TEST_DATA. | ||
assertNotEquals(h1.hashCode(), h2.hashCode()); | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Checks equals() against null, against an object of another class | ||
* (this test instance) and against the hash object itself. | ||
*/ | ||
@Test | ||
public void equalsSpecialCasesTest() { | ||
Nilsimsa h = Nilsimsa.getHash("test"); | ||
// NOTE(review): the next two assertions are identical; one of them is redundant. | ||
assertNotEquals(h, null); | ||
assertNotEquals(h, null); | ||
assertNotEquals(h, this); | ||
assertEquals(h, h); | ||
} | ||
|
||
/** | ||
* compile test mapping | ||
* @return a mapping of file content and the corresponding reference | ||
|
@@ -92,10 +124,9 @@ private static Map<String, String> _readTestDocuments() { | |
testSet = testData.split(" "); | ||
try { | ||
URL resource = NilsimsaTest.class.getClassLoader().getResource("wiki-"+ testSet[1] + ".txt"); | ||
documentContent = FileUtils.readFileToString( | ||
new File( resource.getFile()), CONTENT_ENCODING); | ||
documentContent = new String(Files.readAllBytes(Paths.get(resource.toURI())), CONTENT_ENCODING); | ||
result.put( testSet[0], documentContent); | ||
} catch (IOException e) { | ||
} catch (IOException | URISyntaxException e) { | ||
e.printStackTrace(); | ||
fail("Cannot read corpus."); | ||
} | ||
|