diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..e7aa96ea
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+* text=auto
+*.java text eol=lf
diff --git a/README.md b/README.md
index 66f02051..2ae2cd3f 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,6 @@ Recommended Package imports:
```java
import org.iq80.leveldb.*;
-import static org.iq80.leveldb.impl.Iq80DBFactory.*;
import java.io.*;
```
diff --git a/leveldb-api/src/main/java/org/iq80/leveldb/DB.java b/leveldb-api/src/main/java/org/iq80/leveldb/DB.java
index cd2e5bd5..f77e07a4 100644
--- a/leveldb-api/src/main/java/org/iq80/leveldb/DB.java
+++ b/leveldb-api/src/main/java/org/iq80/leveldb/DB.java
@@ -88,10 +88,17 @@ void suspendCompactions()
void resumeCompactions();
/**
- * Force a compaction of the specified key range.
+ * Compact the underlying storage for the key range [begin, end].
+ * In particular, deleted and overwritten versions are discarded,
+ * and the data is rearranged to reduce the cost of operations
+ * needed to access the data. This operation should typically only
+ * be invoked by users who understand the underlying implementation.
+ *
+ * Call to {@code db.compactRange(null, null);} will compact the
+ * entire database.
*
* @param begin if null then compaction start from the first key
- * @param end if null then compaction ends at the last key
+ * @param end if null then compaction ends at the last key
*/
void compactRange(byte[] begin, byte[] end)
throws DBException;
diff --git a/leveldb-api/src/main/java/org/iq80/leveldb/DBIterator.java b/leveldb-api/src/main/java/org/iq80/leveldb/DBIterator.java
index 6a1ab6fa..4a2d0371 100644
--- a/leveldb-api/src/main/java/org/iq80/leveldb/DBIterator.java
+++ b/leveldb-api/src/main/java/org/iq80/leveldb/DBIterator.java
@@ -62,4 +62,7 @@ public interface DBIterator
* Repositions the iterator so it is at the end of of the Database.
*/
void seekToLast();
+
+ @Override
+ void close();
}
diff --git a/leveldb-api/src/main/java/org/iq80/leveldb/Options.java b/leveldb-api/src/main/java/org/iq80/leveldb/Options.java
index 630753cc..90be4412 100644
--- a/leveldb-api/src/main/java/org/iq80/leveldb/Options.java
+++ b/leveldb-api/src/main/java/org/iq80/leveldb/Options.java
@@ -33,6 +33,30 @@ public class Options
private DBComparator comparator;
private Logger logger;
private long cacheSize;
+ private boolean allowMmapReads = true;
+ private boolean allowMmapWrites = true;
+ private XFilterPolicy filterPolicy;
+
+ public static Options fromOptions(Options options)
+ {
+ final Options options1 = new Options();
+ options1.createIfMissing = options.createIfMissing;
+ options1.errorIfExists = options.errorIfExists;
+ options1.writeBufferSize = options.writeBufferSize;
+ options1.maxOpenFiles = options.maxOpenFiles;
+ options1.blockRestartInterval = options.blockRestartInterval;
+ options1.blockSize = options.blockSize;
+ options1.compressionType = options.compressionType;
+ options1.verifyChecksums = options.verifyChecksums;
+ options1.paranoidChecks = options.paranoidChecks;
+ options1.comparator = options.comparator;
+ options1.logger = options.logger;
+ options1.cacheSize = options.cacheSize;
+ options1.allowMmapReads = options.allowMmapReads;
+ options1.allowMmapWrites = options.allowMmapWrites;
+ options1.filterPolicy = options.filterPolicy;
+ return options1;
+ }
static void checkArgNotNull(Object value, String name)
{
@@ -173,4 +197,43 @@ public Options paranoidChecks(boolean paranoidChecks)
this.paranoidChecks = paranoidChecks;
return this;
}
+
+ public Options allowMmapReads(boolean allowMmapReads)
+ {
+ this.allowMmapReads = allowMmapReads;
+ return this;
+ }
+
+ public boolean allowMmapReads()
+ {
+ return allowMmapReads;
+ }
+
+ public Options allowMmapWrites(boolean allowMmapWrites)
+ {
+ this.allowMmapWrites = allowMmapWrites;
+ return this;
+ }
+
+ public boolean allowMmapWrites()
+ {
+ return allowMmapWrites;
+ }
+
+ /**
+ * Set table filter policy
+ *
+ * @param filterPolicy new filter policy
+ * @return self
+ */
+ public Options filterPolicy(XFilterPolicy filterPolicy)
+ {
+ this.filterPolicy = filterPolicy;
+ return this;
+ }
+
+ public XFilterPolicy filterPolicy()
+ {
+ return filterPolicy;
+ }
}
diff --git a/leveldb-api/src/main/java/org/iq80/leveldb/WriteOptions.java b/leveldb-api/src/main/java/org/iq80/leveldb/WriteOptions.java
index 819c334e..d3eeddd8 100644
--- a/leveldb-api/src/main/java/org/iq80/leveldb/WriteOptions.java
+++ b/leveldb-api/src/main/java/org/iq80/leveldb/WriteOptions.java
@@ -22,6 +22,25 @@ public class WriteOptions
private boolean sync;
private boolean snapshot;
+ /**
+ * If true, the write will be flushed from the operating system
+ * buffer cache (by calling WritableFile::Sync()) before the write
+ * is considered complete. If this flag is true, writes will be
+ * slower.
+ *
+ * If this flag is false, and the machine crashes, some recent
+ * writes may be lost. Note that if it is just the process that
+ * crashes (i.e., the machine does not reboot), no writes will be
+ * lost even if sync==false.
+ *
+ * In other words, a DB write with sync==false has similar
+ * crash semantics as the "write()" system call. A DB write
+ * with sync==true has similar crash semantics to a "write()"
+ * system call followed by "fsync()".
+ *
+     * In the Java implementation, if the process crashes ... (TODO: this sentence is unfinished)
+ * Default: false
+ **/
public boolean sync()
{
return sync;
diff --git a/leveldb-api/src/main/java/org/iq80/leveldb/XFilterPolicy.java b/leveldb-api/src/main/java/org/iq80/leveldb/XFilterPolicy.java
new file mode 100644
index 00000000..67de8ecb
--- /dev/null
+++ b/leveldb-api/src/main/java/org/iq80/leveldb/XFilterPolicy.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb;
+
+/**
+ * A database can be configured with a custom FilterPolicy object.
+ * This object is responsible for creating a small filter from a set
+ * of keys. These filters are stored in leveldb and are consulted
+ * automatically by leveldb to decide whether or not to read some
+ * information from disk. In many cases, a filter can cut down the
+ * number of disk seeks from a handful to a single disk seek per
+ * DB::Get() call.
+ *
+ * Most people will want to use the builtin bloom filter support (see
+ * {@code BloomFilterPolicy}).
+ *
+ * @author Honore Vasconcelos
+ */
+public interface XFilterPolicy
+{
+}
diff --git a/leveldb-benchmark/pom.xml b/leveldb-benchmark/pom.xml
index cb05f04a..a2ffbe3a 100644
--- a/leveldb-benchmark/pom.xml
+++ b/leveldb-benchmark/pom.xml
@@ -30,6 +30,7 @@
${project.parent.basedir}
+ false
diff --git a/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/DbBenchmark.java b/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/DbBenchmark.java
index 7959c9c1..d415fad9 100644
--- a/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/DbBenchmark.java
+++ b/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/DbBenchmark.java
@@ -26,9 +26,10 @@
import org.iq80.leveldb.DBFactory;
import org.iq80.leveldb.DBIterator;
import org.iq80.leveldb.Options;
+import org.iq80.leveldb.ReadOptions;
import org.iq80.leveldb.WriteBatch;
import org.iq80.leveldb.WriteOptions;
-import org.iq80.leveldb.impl.DbImpl;
+import org.iq80.leveldb.table.BloomFilterPolicy;
import org.iq80.leveldb.util.Closeables;
import org.iq80.leveldb.util.FileUtils;
import org.iq80.leveldb.util.PureJavaCrc32C;
@@ -39,21 +40,20 @@
import java.io.File;
import java.io.IOException;
+import java.lang.reflect.Method;
import java.nio.ByteBuffer;
+import java.util.Arrays;
import java.util.Date;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Condition;
+import java.util.concurrent.locks.ReentrantLock;
import static com.google.common.base.Preconditions.checkArgument;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.iq80.leveldb.benchmark.DbBenchmark.DBState.EXISTING;
-import static org.iq80.leveldb.benchmark.DbBenchmark.DBState.FRESH;
-import static org.iq80.leveldb.benchmark.DbBenchmark.Order.RANDOM;
-import static org.iq80.leveldb.benchmark.DbBenchmark.Order.SEQUENTIAL;
-import static org.iq80.leveldb.impl.DbConstants.NUM_LEVELS;
public class DbBenchmark
{
@@ -61,38 +61,18 @@ public class DbBenchmark
private final Integer writeBufferSize;
private final File databaseDir;
private final double compressionRatio;
- private long startTime;
-
- enum Order
- {
- SEQUENTIAL,
- RANDOM
- }
-
- enum DBState
- {
- FRESH,
- EXISTING
- }
+ private final Map flags;
// Cache cache_;
private final List benchmarks;
+ private final int blockCacheSize;
+ private final int bloomFilterBits;
private DB db;
- private final int num;
+ private int num;
private int reads;
- private final int valueSize;
- private int heapCounter;
- private double lastOpFinish;
- private long bytes;
- private String message;
- private String postMessage;
- // private Histogram hist_;
- private final RandomGenerator generator;
- private final Random random;
-
- // State kept for progress messages
- private int done;
- private int nextReport; // When to report next
+ private int valueSize;
+ private WriteOptions writeOptions;
+ private int entriesPerBatch;
private final DBFactory factory;
@@ -101,16 +81,18 @@ public DbBenchmark(Map flags)
{
ClassLoader cl = DbBenchmark.class.getClassLoader();
factory = (DBFactory) cl.loadClass(System.getProperty("leveldb.factory", "org.iq80.leveldb.impl.Iq80DBFactory")).newInstance();
+ this.flags = flags;
benchmarks = (List) flags.get(Flag.benchmarks);
- num = (Integer) flags.get(Flag.num);
- reads = (Integer) (flags.get(Flag.reads) == null ? flags.get(Flag.num) : flags.get(Flag.reads));
- valueSize = (Integer) flags.get(Flag.value_size);
+
writeBufferSize = (Integer) flags.get(Flag.write_buffer_size);
compressionRatio = (Double) flags.get(Flag.compression_ratio);
useExisting = (Boolean) flags.get(Flag.use_existing_db);
- heapCounter = 0;
- bytes = 0;
- random = new Random(301);
+ blockCacheSize = (Integer) flags.get(Flag.cache_size);
+ bloomFilterBits = (Integer) flags.get(Flag.bloom_bits);
+ num = (Integer) flags.get(Flag.num);
+ reads = (Integer) (flags.get(Flag.reads) == null ? flags.get(Flag.num) : flags.get(Flag.reads));
+ valueSize = (Integer) flags.get(Flag.value_size);
+ entriesPerBatch = 1;
databaseDir = new File((String) flags.get(Flag.db));
@@ -124,8 +106,6 @@ public DbBenchmark(Map flags)
if (!useExisting) {
destroyDb();
}
-
- generator = new RandomGenerator(compressionRatio);
}
private void run()
@@ -135,92 +115,216 @@ private void run()
open();
for (String benchmark : benchmarks) {
- start();
+ // Reset parameters that may be overridden below
+ num = (Integer) flags.get(Flag.num);
+ reads = (Integer) (flags.get(Flag.reads) == null ? flags.get(Flag.num) : flags.get(Flag.reads));
+ valueSize = (Integer) flags.get(Flag.value_size);
+ entriesPerBatch = 1;
+ writeOptions = new WriteOptions();
- boolean known = true;
+ boolean freshBb = false;
+ int numThreads = (Integer) flags.get(Flag.threads);
+
+ String method = null;
if (benchmark.equals("fillseq")) {
- write(new WriteOptions(), SEQUENTIAL, FRESH, num, valueSize, 1);
+ freshBb = true;
+ method = "writeSeq";
}
else if (benchmark.equals("fillbatch")) {
- write(new WriteOptions(), SEQUENTIAL, FRESH, num, valueSize, 1000);
+ freshBb = true;
+ entriesPerBatch = 1000;
+ method = "writeSeq";
}
else if (benchmark.equals("fillrandom")) {
- write(new WriteOptions(), RANDOM, FRESH, num, valueSize, 1);
+ freshBb = true;
+ method = "writeRandom";
}
else if (benchmark.equals("overwrite")) {
- write(new WriteOptions(), RANDOM, EXISTING, num, valueSize, 1);
+ freshBb = false;
+ method = "writeRandom";
}
else if (benchmark.equals("fillsync")) {
- write(new WriteOptions().sync(true), RANDOM, FRESH, num / 1000, valueSize, 1);
+ freshBb = true;
+ num /= 1000;
+ writeOptions.sync(true);
+ method = "writeRandom";
}
else if (benchmark.equals("fill100K")) {
- write(new WriteOptions(), RANDOM, FRESH, num / 1000, 100 * 1000, 1);
+ freshBb = true;
+ num /= 1000;
+ valueSize = 100 * 1000;
+ method = "writeRandom";
}
else if (benchmark.equals("readseq")) {
- readSequential();
+ method = "readSequential";
}
else if (benchmark.equals("readreverse")) {
- readReverse();
+ method = "readReverse";
}
else if (benchmark.equals("readrandom")) {
- readRandom();
+ method = "readRandom";
+ }
+ else if (benchmark.equals("seekrandom")) {
+ method = "seekRandom";
}
else if (benchmark.equals("readhot")) {
- readHot();
+ method = "readHot";
}
else if (benchmark.equals("readrandomsmall")) {
- int n = reads;
reads /= 1000;
- readRandom();
- reads = n;
+ method = "readRandom";
+ }
+ else if (benchmark.equals("readwhilewriting")) {
+ numThreads++; // Add extra thread for writing
+ method = "readWhileWriting";
}
else if (benchmark.equals("compact")) {
- compact();
+ method = "compact";
}
else if (benchmark.equals("crc32c")) {
- crc32c(4096, "(4k per op)");
+ method = "crc32c";
}
else if (benchmark.equals("acquireload")) {
- acquireLoad();
+ method = "acquireLoad";
}
else if (benchmark.equals("snappycomp")) {
if (Snappy.available()) {
- snappyCompress();
+ method = "snappyCompress";
}
}
else if (benchmark.equals("snappyuncomp")) {
if (Snappy.available()) {
- snappyUncompressDirectBuffer();
+ method = "snappyUncompressDirectBuffer";
}
}
else if (benchmark.equals("unsnap-array")) {
if (Snappy.available()) {
- snappyUncompressArray();
+ method = "snappyUncompressArray";
}
}
else if (benchmark.equals("unsnap-direct")) {
if (Snappy.available()) {
- snappyUncompressDirectBuffer();
+ method = "snappyUncompressDirectBuffer";
}
}
else if (benchmark.equals("heapprofile")) {
heapProfile();
}
else if (benchmark.equals("stats")) {
- printStats();
+ printStats("leveldb.stats");
}
else {
- known = false;
System.err.println("Unknown benchmark: " + benchmark);
}
- if (known) {
- stop(benchmark);
+            if (freshBb) {
+                if (useExisting) {
+                    System.out.println("skipping (--use_existing_db is true)");
+                    continue; // skip only this benchmark; later ones may not need a fresh DB
+                }
+                db.close();
+                db = null;
+                destroyDb();
+                open();
+            }
+            if (method != null) {
+                try {
+                    runBenchmark(numThreads, benchmark, method);
+                }
+                catch (Exception e) {
+                    System.out.println("Failed to run " + method);
+                    e.printStackTrace();
+                    break; // abort the remaining benchmarks but still reach db.close() after the loop
+                }
+            }
}
+
}
db.close();
}
+ private void runBenchmark(int n, String name, String method) throws Exception
+ {
+        SharedState shared = new SharedState();
+        shared.total = n; // workers signal the condition once numInitialized / numDone reach this count
+ ThreadArg[] arg = new ThreadArg[n];
+ for (int i = 0; i < arg.length; ++i) {
+ arg[i] = new ThreadArg();
+ arg[i].bm = this;
+ arg[i].method = method;
+ arg[i].shared = shared;
+ arg[i].thread = new ThreadState(i);
+ arg[i].thread.shared = shared;
+ startThread(arg[i]);
+ }
+
+ shared.mu.lock();
+ while (shared.numInitialized < n) {
+ shared.cv.await();
+ }
+
+ shared.start = true;
+ shared.cv.signalAll();
+ while (shared.numDone < n) {
+ shared.cv.await();
+ }
+ shared.mu.unlock();
+
+ for (int i = 1; i < n; i++) {
+ arg[0].thread.stats.merge(arg[i].thread.stats);
+ }
+ arg[0].thread.stats.report(name);
+ }
+
+ public void startThread(final ThreadArg arg)
+ {
+ new Thread(new Runnable()
+ {
+ @Override
+ public void run()
+ {
+ SharedState shared = arg.shared;
+ ThreadState thread = arg.thread;
+ shared.mu.lock();
+ try {
+ shared.numInitialized++;
+ if (shared.numInitialized >= shared.total) {
+ shared.cv.signalAll();
+ }
+ while (!shared.start) {
+ shared.cv.awaitUninterruptibly();
+ }
+ }
+ finally {
+ shared.mu.unlock();
+ }
+ try {
+ Method method = arg.bm.getClass().getDeclaredMethod(arg.method, ThreadState.class);
+ method.setAccessible(true);
+ thread.stats.start();
+ method.invoke(arg.bm, thread);
+ }
+ catch (Exception e) {
+ thread.stats.addMessage("ERROR " + e);
+ e.printStackTrace();
+ }
+ finally {
+ thread.stats.stop();
+ }
+
+ shared.mu.lock();
+ try {
+ shared.numDone++;
+ if (shared.numDone >= shared.total) {
+ shared.cv.signalAll();
+ }
+ }
+ finally {
+ shared.mu.unlock();
+ }
+ }
+ }).start();
+ }
+
private void printHeader()
throws IOException
{
@@ -240,10 +344,12 @@ private void printHeader()
System.out.printf("------------------------------------------------\n");
}
+ @SuppressWarnings({"InnerAssignment"})
static void printWarnings()
{
- boolean assertsEnabled = true;
- assert assertsEnabled; // Intentional side effect!!!
+ boolean assertsEnabled = false;
+ // CHECKSTYLE IGNORE check FOR NEXT 1 LINES
+ assert assertsEnabled = true; // Intentional side effect!!!
if (assertsEnabled) {
System.out.printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
}
@@ -302,95 +408,41 @@ private void open()
{
Options options = new Options();
options.createIfMissing(!useExisting);
- // todo block cache
+        if (blockCacheSize >= 0) {
+            options.cacheSize(blockCacheSize);
+        }
+        if (bloomFilterBits >= 0) {
+            options.filterPolicy(new BloomFilterPolicy(bloomFilterBits));
+        }
+        // Negative cache_size / bloom_bits are not applied, so the Options defaults stay in effect.
if (writeBufferSize != null) {
options.writeBufferSize(writeBufferSize);
}
db = factory.open(databaseDir, options);
}
- private void start()
- {
- startTime = System.nanoTime();
- bytes = 0;
- message = null;
- lastOpFinish = startTime;
- // hist.clear();
- done = 0;
- nextReport = 100;
- }
-
- private void stop(String benchmark)
- {
- long endTime = System.nanoTime();
- double elapsedSeconds = 1.0d * (endTime - startTime) / TimeUnit.SECONDS.toNanos(1);
-
- // Pretend at least one op was done in case we are running a benchmark
- // that does nto call FinishedSingleOp().
- if (done < 1) {
- done = 1;
- }
-
- if (bytes > 0) {
- String rate = String.format("%6.1f MB/s", (bytes / 1048576.0) / elapsedSeconds);
- if (message != null) {
- message = rate + " " + message;
- }
- else {
- message = rate;
- }
- }
- else if (message == null) {
- message = "";
- }
-
- System.out.printf("%-12s : %11.5f micros/op;%s%s\n",
- benchmark,
- elapsedSeconds * 1.0e6 / done,
- (message == null ? "" : " "),
- message);
-// if (FLAGS_histogram) {
-// System.out.printf("Microseconds per op:\n%s\n", hist_.ToString().c_str());
-// }
-
- if (postMessage != null) {
- System.out.printf("\n%s\n", postMessage);
- postMessage = null;
- }
-
- }
-
- private void write(WriteOptions writeOptions, Order order, DBState state, int numEntries, int valueSize, int entriesPerBatch)
+ private void write(ThreadState thread, boolean seq)
throws IOException
{
- if (state == FRESH) {
- if (useExisting) {
- message = "skipping (--use_existing_db is true)";
- return;
- }
- db.close();
- db = null;
- destroyDb();
- open();
- start(); // Do not count time taken to destroy/open
+ if (!flags.get(Flag.num).equals(num)) {
+ thread.stats.addMessage(String.format("(%d ops)", num));
}
- if (numEntries != num) {
- message = String.format("(%d ops)", numEntries);
- }
-
- for (int i = 0; i < numEntries; i += entriesPerBatch) {
+ RandomGenerator gen = newGenerator();
+ long bytes = 0;
+ for (int i = 0; i < num; i += entriesPerBatch) {
WriteBatch batch = db.createWriteBatch();
for (int j = 0; j < entriesPerBatch; j++) {
- int k = (order == SEQUENTIAL) ? i + j : random.nextInt(num);
+ int k = seq ? i + j : thread.rand.nextInt(num);
byte[] key = formatNumber(k);
- batch.put(key, generator.generate(valueSize));
+ batch.put(key, gen.generate(valueSize));
bytes += valueSize + key.length;
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
}
db.write(batch, writeOptions);
batch.close();
}
+ thread.stats.addBytes(bytes);
}
public static byte[] formatNumber(long n)
@@ -410,100 +462,123 @@ public static byte[] formatNumber(long n)
return slice;
}
- private void finishedSingleOp()
- {
-// if (histogram) {
-// todo
-// }
- done++;
- if (done >= nextReport) {
- if (nextReport < 1000) {
- nextReport += 100;
- }
- else if (nextReport < 5000) {
- nextReport += 500;
- }
- else if (nextReport < 10000) {
- nextReport += 1000;
- }
- else if (nextReport < 50000) {
- nextReport += 5000;
- }
- else if (nextReport < 100000) {
- nextReport += 10000;
- }
- else if (nextReport < 500000) {
- nextReport += 50000;
- }
- else {
- nextReport += 100000;
- }
- System.out.printf("... finished %d ops%30s\r", done, "");
-
- }
- }
-
- private void readSequential()
+ private void readSequential(ThreadState thread)
{
+ long bytes = 0;
for (int loops = 0; loops < 5; loops++) {
DBIterator iterator = db.iterator();
for (int i = 0; i < reads && iterator.hasNext(); i++) {
Map.Entry entry = iterator.next();
bytes += entry.getKey().length + entry.getValue().length;
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
}
Closeables.closeQuietly(iterator);
}
+ thread.stats.addBytes(bytes);
}
- private void readReverse()
+ private void readReverse(ThreadState thread)
{
//To change body of created methods use File | Settings | File Templates.
}
- private void readRandom()
+ private void readRandom(ThreadState thread)
{
+ int found = 0;
for (int i = 0; i < reads; i++) {
- byte[] key = formatNumber(random.nextInt(num));
+ byte[] key = formatNumber(thread.rand.nextInt(num));
byte[] value = db.get(key);
- if (value == null) {
- throw new NullPointerException(String.format("db.get(%s) is null", new String(key, UTF_8)));
+ if (value != null) {
+ found++;
}
- bytes += key.length + value.length;
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
+ }
+ thread.stats.addMessage(String.format("(%d of %d found)", found, num));
+ }
+
+ private void readMissing(ThreadState thread)
+ {
+
+ for (int i = 0; i < reads; i++) {
+ byte[] key = formatNumber(thread.rand.nextInt(num));
+ db.get(key);
+ thread.stats.finishedSingleOp();
}
}
- private void readHot()
+ private void readHot(ThreadState thread)
{
+ long bytes = 0;
int range = (num + 99) / 100;
for (int i = 0; i < reads; i++) {
- byte[] key = formatNumber(random.nextInt(range));
+ byte[] key = formatNumber(thread.rand.nextInt(range));
byte[] value = db.get(key);
bytes += key.length + value.length;
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
}
+ thread.stats.addBytes(bytes);
}
- private void compact()
- throws IOException
+ private void seekRandom(ThreadState thread) throws IOException
{
- if (db instanceof DbImpl) {
- ((DbImpl) db).compactMemTable();
- for (int level = 0; level < NUM_LEVELS - 1; level++) {
- ((DbImpl) db).compactRange(level, Slices.copiedBuffer("", UTF_8), Slices.copiedBuffer("~", UTF_8));
+ ReadOptions options = new ReadOptions();
+ int found = 0;
+ for (int i = 0; i < reads; i++) {
+ DBIterator iter = db.iterator(options);
+ byte[] key = formatNumber(thread.rand.nextInt(num));
+ iter.seek(key);
+            if (iter.hasNext() && Arrays.equals(iter.next().getKey(), key)) {
+ found++;
}
+ iter.close();
+ thread.stats.finishedSingleOp();
}
+ thread.stats.addMessage(String.format("(%d of %d found)", found, num));
}
- private void crc32c(int blockSize, String message)
+ private void readWhileWriting(ThreadState thread)
{
- // Checksum about 500MB of data total
- byte[] data = new byte[blockSize];
- for (int i = 0; i < data.length; i++) {
- data[i] = 'x';
+ if (thread.tid > 0) {
+ readRandom(thread);
+ }
+ else {
+ // Special thread that keeps writing until other threads are done.
+ RandomGenerator gen = newGenerator();
+ while (true) {
+ thread.shared.mu.lock();
+ try {
+ if (thread.shared.numDone + 1 >= thread.shared.numInitialized) {
+ // Other threads have finished
+ break;
+ }
+ }
+ finally {
+ thread.shared.mu.unlock();
+ }
+ byte[] key = formatNumber(thread.rand.nextInt((Integer) flags.get(Flag.num)));
+ db.put(key, gen.generate(valueSize), writeOptions);
+ }
+
+ // Do not count any of the preceding work/delay in stats.
+ thread.stats.start();
}
+ }
+
+ private void compact(ThreadState thread)
+ throws IOException
+ {
+ db.compactRange(null, null);
+ }
+
+ private void crc32c(final ThreadState thread)
+ {
+ // Checksum about 500MB of data total
+ int blockSize = 4096;
+ String label = "(4K per op)";
+ // Checksum about 500MB of data total
+ byte[] data = new byte[blockSize];
+ Arrays.fill(data, (byte) 'x');
long bytes = 0;
int crc = 0;
@@ -511,26 +586,27 @@ private void crc32c(int blockSize, String message)
PureJavaCrc32C checksum = new PureJavaCrc32C();
checksum.update(data, 0, blockSize);
crc = checksum.getMaskedValue();
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
bytes += blockSize;
}
+ // Print so result is not dead
System.out.printf("... crc=0x%x\r", crc);
- this.bytes = bytes;
- // Print so result is not dead
- this.message = message;
+ thread.stats.addBytes(bytes);
+ thread.stats.addMessage(label);
}
- private void acquireLoad()
+ private void acquireLoad(ThreadState thread)
{
//To change body of created methods use File | Settings | File Templates.
}
- private void snappyCompress()
+ private void snappyCompress(ThreadState thread)
{
- byte[] raw = generator.generate(new Options().blockSize());
+ byte[] raw = newGenerator().generate(new Options().blockSize());
byte[] compressedOutput = new byte[Snappy.maxCompressedLength(raw.length)];
+ long bytes = 0;
long produced = 0;
// attempt to compress the block
@@ -541,20 +617,27 @@ private void snappyCompress()
produced += compressedSize;
}
catch (IOException ignored) {
+ thread.stats.addMessage("(snappy failure)");
throw Throwables.propagate(ignored);
}
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
}
+ thread.stats.addMessage(String.format("(output: %.1f%%)", (produced * 100.0) / bytes));
+ thread.stats.addBytes(bytes);
+ }
- message = String.format("(output: %.1f%%)", (produced * 100.0) / bytes);
+ private RandomGenerator newGenerator()
+ {
+ return new RandomGenerator(compressionRatio);
}
- private void snappyUncompressArray()
+ private void snappyUncompressArray(ThreadState thread)
{
int inputSize = new Options().blockSize();
byte[] compressedOutput = new byte[Snappy.maxCompressedLength(inputSize)];
- byte[] raw = generator.generate(inputSize);
+ byte[] raw = newGenerator().generate(inputSize);
+ long bytes = 0;
int compressedLength;
try {
compressedLength = Snappy.compress(raw, 0, raw.length, compressedOutput, 0);
@@ -569,18 +652,20 @@ private void snappyUncompressArray()
bytes += inputSize;
}
catch (IOException ignored) {
+ thread.stats.addMessage("(snappy failure)");
throw Throwables.propagate(ignored);
}
- finishedSingleOp();
+ thread.stats.finishedSingleOp();
}
+ thread.stats.addBytes(bytes);
}
- private void snappyUncompressDirectBuffer()
+ private void snappyUncompressDirectBuffer(ThreadState thread)
{
int inputSize = new Options().blockSize();
byte[] compressedOutput = new byte[Snappy.maxCompressedLength(inputSize)];
- byte[] raw = generator.generate(inputSize);
+ byte[] raw = newGenerator().generate(inputSize);
int compressedLength;
try {
compressedLength = Snappy.compress(raw, 0, raw.length, compressedOutput, 0);
@@ -593,6 +678,7 @@ private void snappyUncompressDirectBuffer()
ByteBuffer compressedBuffer = ByteBuffer.allocateDirect(compressedLength);
compressedBuffer.put(compressedOutput, 0, compressedLength);
+ long bytes = 0;
// attempt to uncompress the block
while (bytes < 5L * 1024 * 1048576) { // Compress 1G
try {
@@ -603,13 +689,25 @@ private void snappyUncompressDirectBuffer()
bytes += inputSize;
}
catch (IOException ignored) {
+ thread.stats.addMessage("(snappy failure)");
throw Throwables.propagate(ignored);
}
- finishedSingleOp();
+            thread.stats.finishedSingleOp();
}
+        thread.stats.addBytes(bytes); // report the total once, after the loop, as snappyUncompressArray does
}
+ private void writeSeq(ThreadState thread) throws IOException
+ {
+ write(thread, true);
+ }
+
+ private void writeRandom(ThreadState thread) throws IOException
+ {
+ write(thread, false);
+ }
+
private void heapProfile()
{
//To change body of created methods use File | Settings | File Templates.
@@ -622,8 +720,12 @@ private void destroyDb()
FileUtils.deleteRecursively(databaseDir);
}
- private void printStats()
+ private void printStats(String name)
{
+ final String property = db.getProperty(name);
+ if (property != null) {
+ System.out.print(property);
+ }
//To change body of created methods use File | Settings | File Templates.
}
@@ -677,13 +779,10 @@ private enum Flag
// stats -- Print DB stats
// heapprofile -- Dump a heap profile (if supported by this port)
benchmarks(ImmutableList.of(
- "fillseq",
- "fillseq",
"fillseq",
"fillsync",
"fillrandom",
"overwrite",
- "fillseq",
"readrandom",
"readrandom", // Extra run to allow previous compactions to quiesce
"readseq",
@@ -696,121 +795,130 @@ private enum Flag
// "crc32c",
"snappycomp",
"unsnap-array",
- "unsnap-direct"
+ "unsnap-direct",
+ "stats"
// "acquireload"
- ))
- {
- @Override
- public Object parseValue(String value)
- {
- return ImmutableList.copyOf(Splitter.on(",").trimResults().omitEmptyStrings().split(value));
- }
- },
+ )) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return ImmutableList.copyOf(Splitter.on(",").trimResults().omitEmptyStrings().split(value));
+ }
+ },
// Arrange to generate values that shrink to this fraction of
// their original size after compression
- compression_ratio(0.5d)
- {
- @Override
- public Object parseValue(String value)
- {
- return Double.parseDouble(value);
- }
- },
+ compression_ratio(0.5d) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Double.parseDouble(value);
+ }
+ },
// Print histogram of operation timings
- histogram(false)
- {
- @Override
- public Object parseValue(String value)
- {
- return Boolean.parseBoolean(value);
- }
- },
+ histogram(false) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Boolean.parseBoolean(value);
+ }
+ },
// If true, do not destroy the existing database. If you set this
// flag and also specify a benchmark that wants a fresh database, that
// benchmark will fail.
- use_existing_db(false)
- {
- @Override
- public Object parseValue(String value)
- {
- return Boolean.parseBoolean(value);
- }
- },
+ use_existing_db(false) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Boolean.parseBoolean(value);
+ }
+ },
// Number of key/values to place in database
- num(1000000)
- {
- @Override
- public Object parseValue(String value)
- {
- return Integer.parseInt(value);
- }
- },
+ num(1000000) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
// Number of read operations to do. If negative, do FLAGS_num reads.
- reads(null)
- {
- @Override
- public Object parseValue(String value)
- {
- return Integer.parseInt(value);
- }
- },
+ reads(null) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
+
+ // Number of concurrent threads to run.
+ threads(1) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
// Size of each value
- value_size(100)
- {
- @Override
- public Object parseValue(String value)
- {
- return Integer.parseInt(value);
- }
- },
+ value_size(100) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
// Number of bytes to buffer in memtable before compacting
// (initialized to default value by "main")
- write_buffer_size(null)
- {
- @Override
- public Object parseValue(String value)
- {
- return Integer.parseInt(value);
- }
- },
+ write_buffer_size(null) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
// Number of bytes to use as a cache of uncompressed data.
// Negative means use default settings.
- cache_size(-1)
- {
- @Override
- public Object parseValue(String value)
- {
- return Integer.parseInt(value);
- }
- },
+ cache_size(-1) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
+
+ // Bloom filter bits per key.
+ // Negative means use default settings.
+ bloom_bits(-1) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
// Maximum number of files to keep open at the same time (use default if == 0)
- open_files(0)
- {
- @Override
- public Object parseValue(String value)
- {
- return Integer.parseInt(value);
- }
- },
+ open_files(0) {
+ @Override
+ public Object parseValue(String value)
+ {
+ return Integer.parseInt(value);
+ }
+ },
// Use the db with the following name.
- db("/tmp/dbbench")
- {
- @Override
- public Object parseValue(String value)
- {
- return value;
- }
- };
+ db("/tmp/dbbench") {
+ @Override
+ public Object parseValue(String value)
+ {
+ return value;
+ }
+ };
private final Object defaultValue;
@@ -885,4 +993,180 @@ private static Slice generateRandomSlice(Random random, int length)
}
return rawData;
}
+
+ private static class SharedState
+ {
+     // Lock and the condition created from it; see the constructor.
+     ReentrantLock mu;
+     Condition cv;
+     int total;
+     // Each thread goes through: (1) initializing, (2) waiting for others, (3) running, (4) done.
+     int numInitialized;
+     int numDone;
+     boolean start;
+
+     /**
+      * Creates the lock together with the condition bound to it.
+      */
+     public SharedState()
+     {
+         ReentrantLock lock = new ReentrantLock();
+         mu = lock;
+         cv = lock.newCondition();
+     }
+ }
+
+ private class ThreadState
+ {
+     int tid; // thread index: 0..n-1 when running in n threads
+     Random rand; // seeded from the thread index, so every thread gets its own sequence
+     DbBenchmark.Stats stats = new Stats();
+     SharedState shared;
+
+     public ThreadState(int index)
+     {
+         tid = index;
+         rand = new Random(1000 + index);
+     }
+ }
+
+ private class ThreadArg
+ {
+ DbBenchmark bm; // benchmark instance this argument bundle refers to
+ SharedState shared; // NOTE(review): presumably the state shared by all benchmark threads - confirm against the caller
+ ThreadState thread; // NOTE(review): presumably the state owned by the thread itself - confirm against the caller
+ String method; // NOTE(review): presumably the name of the benchmark to run - confirm against the caller
+ }
+
+ private class Stats
+ {
+     // Timing and throughput counters of one benchmark run; merge() folds another instance into this one.
+     long start; // System.nanoTime() reading at which timing began
+     long finish; // System.nanoTime() reading at which timing ended
+     double seconds; // elapsed time, computed by stop()
+     int done; // operations counted by finishedSingleOp()
+     int nextReport; // value of "done" at which the next progress line is printed
+     long bytes;
+     double lastOpFinish; // nanoTime reading taken when the previous operation finished
+     Histogram hist = new Histogram();
+     StringBuilder message = new StringBuilder();
+
+     public Stats()
+     {
+         start();
+     }
+
+     // Resets every counter and restarts the clock.
+     void start()
+     {
+         nextReport = 100;
+         hist.clear();
+         done = 0;
+         bytes = 0;
+         seconds = 0;
+         message.setLength(0);
+         start = System.nanoTime();
+         finish = start;
+         // Must follow the clock read, otherwise the first operation is timed from the stale start value.
+         lastOpFinish = start;
+     }
+
+     // Folds the counters of "other" into this instance.
+     void merge(Stats other)
+     {
+         hist.merge(other.hist);
+         done += other.done;
+         bytes += other.bytes;
+         seconds += other.seconds;
+         start = Math.min(start, other.start);
+         finish = Math.max(finish, other.finish);
+         // Just keep the messages from one thread; copy the text so both instances keep their own buffer.
+         if (message.length() == 0) {
+             message.append(other.message);
+         }
+     }
+
+     void stop()
+     {
+         finish = System.nanoTime();
+         seconds = 1.0d * (finish - start) / TimeUnit.SECONDS.toNanos(1);
+     }
+
+     void addMessage(String msg)
+     {
+         if (message.length() != 0) {
+             message.append(" ");
+         }
+         message.append(msg);
+     }
+
+     // Counts one finished operation, records its latency when histograms are on, and prints progress.
+     void finishedSingleOp()
+     {
+         // Test the flag's value, as report() does; the presence of the key alone does not mean "enabled".
+         if (Boolean.TRUE.equals(flags.get(Flag.histogram))) {
+             double now = System.nanoTime();
+             double micros = (now - lastOpFinish) / 1000.0d;
+             hist.add(micros);
+             if (micros > 20000) {
+                 System.out.printf("long op: %.1f micros%30s\r", micros, "");
+             }
+             lastOpFinish = now;
+         }
+
+         done++;
+         if (done >= nextReport) {
+             if (nextReport < 1000) {
+                 nextReport += 100;
+             }
+             else if (nextReport < 5000) {
+                 nextReport += 500;
+             }
+             else if (nextReport < 10000) {
+                 nextReport += 1000;
+             }
+             else if (nextReport < 50000) {
+                 nextReport += 5000;
+             }
+             else if (nextReport < 100000) {
+                 nextReport += 10000;
+             }
+             else if (nextReport < 500000) {
+                 nextReport += 50000;
+             }
+             else {
+                 nextReport += 100000;
+             }
+             System.out.printf("... finished %d ops%30s\r", done, "");
+         }
+     }
+
+     void addBytes(long n)
+     {
+         bytes += n;
+     }
+
+     // Prints the summary line for benchmark "name", plus the latency histogram when enabled.
+     void report(String name)
+     {
+         // Pretend at least one op was done, for benchmarks that never call finishedSingleOp().
+         if (done < 1) {
+             done = 1;
+         }
+
+         if (bytes > 0) {
+             String rate = String.format("%6.1f MB/s", (bytes / 1048576.0) / seconds);
+             message.insert(0, " ").insert(0, rate);
+         }
+
+         System.out.printf("%-12s : %11.5f micros/op;%s%s\n",
+                 name,
+                 seconds * 1.0e6 / done,
+                 (message.length() == 0 ? "" : " "),
+                 message);
+         if (Boolean.TRUE.equals(flags.get(Flag.histogram))) {
+             System.out.printf("Microseconds per op:\n%s\n", hist.toString());
+         }
+     }
+ }
}
diff --git a/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/Histogram.java b/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/Histogram.java
new file mode 100644
index 00000000..d443524f
--- /dev/null
+++ b/leveldb-benchmark/src/main/java/org/iq80/leveldb/benchmark/Histogram.java
@@ -0,0 +1,178 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.benchmark;
+
+import com.google.common.base.Strings;
+
+public class Histogram
+{
+    // Upper bounds of the buckets: bucket b counts the values in [K_BUCKET_LIMIT[b - 1], K_BUCKET_LIMIT[b]).
+    static final double[] K_BUCKET_LIMIT = {
+            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45,
+            50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450,
+            500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000,
+            3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000,
+            16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000,
+            70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000,
+            250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000,
+            900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
+            3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000,
+            9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000,
+            25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000,
+            70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000,
+            180000000, 200000000, 250000000, 300000000, 350000000, 400000000,
+            450000000, 500000000, 600000000, 700000000, 800000000, 900000000,
+            1000000000, 1200000000, 1400000000, 1600000000, 1800000000, 2000000000,
+            2500000000.0, 3000000000.0, 3500000000.0, 4000000000.0, 4500000000.0,
+            5000000000.0, 6000000000.0, 7000000000.0, 8000000000.0, 9000000000.0,
+            1e200,
+    };
+    private static final int NUM_BUCKETS = K_BUCKET_LIMIT.length;
+    private double min = K_BUCKET_LIMIT[NUM_BUCKETS - 1]; // same as after clear(): the first add() always lowers it
+    private double max;
+    private double num;
+    private double sum;
+    private double sumSquares;
+
+    private double[] doubles = new double[NUM_BUCKETS]; // number of recorded values per bucket
+
+    // Forgets every recorded value.
+    public void clear()
+    {
+        min = K_BUCKET_LIMIT[NUM_BUCKETS - 1];
+        max = 0;
+        num = 0;
+        sum = 0;
+        sumSquares = 0;
+        for (int i = 0; i < NUM_BUCKETS; i++) {
+            doubles[i] = 0;
+        }
+    }
+
+    // Records one value.
+    public void add(double value)
+    {
+        // Linear search is fast enough for our usage in db_bench
+        int b = 0;
+        while (b < NUM_BUCKETS - 1 && K_BUCKET_LIMIT[b] <= value) {
+            b++;
+        }
+        doubles[b] += 1.0;
+        if (min > value) {
+            min = value;
+        }
+        if (max < value) {
+            max = value;
+        }
+        num++;
+        sum += value;
+        sumSquares += (value * value);
+    }
+
+    // Adds everything recorded by "other" to this histogram.
+    public void merge(Histogram other)
+    {
+        min = Math.min(min, other.min);
+        max = Math.max(max, other.max);
+        num += other.num;
+        sum += other.sum;
+        sumSquares += other.sumSquares;
+        for (int b = 0; b < NUM_BUCKETS; b++) {
+            doubles[b] += other.doubles[b];
+        }
+    }
+
+    public double median()
+    {
+        return percentile(50.0);
+    }
+
+    // Estimates the p-th percentile (p in 0..100) by interpolating inside the bucket that contains it.
+    public double percentile(double p)
+    {
+        if (num == 0.0) {
+            // nothing recorded: the interpolation below would divide zero by zero
+            return 0;
+        }
+        double threshold = num * (p / 100.0);
+        double cumulative = 0;
+        for (int b = 0; b < NUM_BUCKETS; b++) {
+            cumulative += doubles[b];
+            // an empty bucket cannot hold the percentile; it can only match here when p <= 0
+            if (doubles[b] > 0 && cumulative >= threshold) {
+                // Scale linearly within this bucket
+                double leftPoint = (b == 0) ? 0 : K_BUCKET_LIMIT[b - 1];
+                double rightPoint = K_BUCKET_LIMIT[b];
+                double leftSum = cumulative - doubles[b];
+                double pos = (threshold - leftSum) / (cumulative - leftSum);
+                double r = leftPoint + (rightPoint - leftPoint) * pos;
+                return Math.min(Math.max(r, min), max);
+            }
+        }
+        return max;
+    }
+
+    public double average()
+    {
+        if (num == 0.0) {
+            return 0;
+        }
+        return sum / num;
+    }
+
+    public double standardDeviation()
+    {
+        if (num == 0.0) {
+            return 0;
+        }
+        double variance = (sumSquares * num - sum * sum) / (num * num);
+        // rounding can push the variance of near-identical values just below zero; sqrt would then yield NaN
+        return Math.sqrt(Math.max(variance, 0.0));
+    }
+
+    @Override
+    public String toString()
+    {
+        StringBuilder r = new StringBuilder();
+        r.append(String.format("Count: %.0f Average: %.4f StdDev: %.2f\n",
+                num, average(), standardDeviation()));
+        r.append(String.format("Min: %.4f Median: %.4f Max: %.4f\n",
+                (num == 0.0 ? 0.0 : min), median(), max));
+        r.append("------------------------------------------------------\n");
+        r.append("left right count % cum % \n");
+        double mult = 100.0 / num;
+        double cumulative = 0;
+        for (int b = 0; b < NUM_BUCKETS; b++) {
+            if (doubles[b] <= 0.0) {
+                continue;
+            }
+            cumulative += doubles[b];
+            r.append(String.format("[ %7.0f, %7.0f ) %7.0f %7.3f%% %7.3f%% ",
+                    ((b == 0) ? 0.0 : K_BUCKET_LIMIT[b - 1]), // left
+                    K_BUCKET_LIMIT[b], // right
+                    doubles[b], // count
+                    mult * doubles[b], // percentage
+                    mult * cumulative)); // cumulative percentage
+            // Add hash marks based on percentage; 20 marks for 100%.
+            int marks = (int) (20 * (doubles[b] / num) + 0.5);
+            r.append(Strings.repeat("#", marks));
+            r.append("\n");
+        }
+        return r.toString();
+    }
+}
diff --git a/leveldb/pom.xml b/leveldb/pom.xml
index e580ec56..7ff9a267 100644
--- a/leveldb/pom.xml
+++ b/leveldb/pom.xml
@@ -111,8 +111,15 @@
org.apache.maven.pluginsmaven-surefire-plugin
-
-
+
+
+ listener
+ org.testng.reporters.VerboseReporter
+
+
+ 1
+ true
+ -Xmx2048m
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Compaction.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Compaction.java
index f0699b7a..eeb6089a 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Compaction.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Compaction.java
@@ -27,9 +27,9 @@
import static org.iq80.leveldb.impl.VersionSet.MAX_GRAND_PARENT_OVERLAP_BYTES;
// A Compaction encapsulates information about a compaction.
-public class Compaction
+public class Compaction implements AutoCloseable
{
- private final Version inputVersion;
+ private Version inputVersion;
private final int level;
// Each compaction reads inputs from "level" and "level+1"
@@ -70,6 +70,7 @@ public Compaction(Version inputVersion, int level, List levelInput
this.grandparents = grandparents;
this.maxOutputFileSize = VersionSet.maxFileSizeForLevel(level);
this.inputs = new List[] {levelInputs, levelUpInputs};
+ inputVersion.retain();
}
public int getLevel()
@@ -196,8 +197,17 @@ public boolean shouldStopBefore(InternalKey internalKey)
}
}
- public List[] getInputs()
+ @Override
+ public void close()
{
- return inputs;
+ if (inputVersion != null) {
+ inputVersion.release();
+ inputVersion = null;
+ }
+ }
+
+ public List input(int which)
+ {
+ return inputs[which];
}
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/DbImpl.java b/leveldb/src/main/java/org/iq80/leveldb/impl/DbImpl.java
old mode 100755
new mode 100644
index d0cafac4..2d7612a1
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/DbImpl.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/DbImpl.java
@@ -17,7 +17,7 @@
*/
package org.iq80.leveldb.impl;
-import com.google.common.base.Throwables;
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.iq80.leveldb.CompressionType;
import org.iq80.leveldb.DB;
@@ -35,28 +35,31 @@
import org.iq80.leveldb.impl.WriteBatchImpl.Handler;
import org.iq80.leveldb.table.BytewiseComparator;
import org.iq80.leveldb.table.CustomUserComparator;
+import org.iq80.leveldb.table.FilterPolicy;
import org.iq80.leveldb.table.TableBuilder;
import org.iq80.leveldb.table.UserComparator;
import org.iq80.leveldb.util.DbIterator;
import org.iq80.leveldb.util.MergingIterator;
+import org.iq80.leveldb.util.SequentialFile;
+import org.iq80.leveldb.util.SequentialFileImpl;
import org.iq80.leveldb.util.Slice;
import org.iq80.leveldb.util.SliceInput;
import org.iq80.leveldb.util.SliceOutput;
import org.iq80.leveldb.util.Slices;
import org.iq80.leveldb.util.Snappy;
+import org.iq80.leveldb.util.UnbufferedWritableFile;
+import org.iq80.leveldb.util.WritableFile;
import java.io.File;
-import java.io.FileInputStream;
import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
-import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Deque;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
-import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@@ -65,13 +68,14 @@
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.util.Objects.requireNonNull;
import static org.iq80.leveldb.impl.DbConstants.L0_SLOWDOWN_WRITES_TRIGGER;
import static org.iq80.leveldb.impl.DbConstants.L0_STOP_WRITES_TRIGGER;
-import static org.iq80.leveldb.impl.DbConstants.NUM_LEVELS;
import static org.iq80.leveldb.impl.SequenceNumber.MAX_SEQUENCE_NUMBER;
import static org.iq80.leveldb.impl.ValueType.DELETION;
import static org.iq80.leveldb.impl.ValueType.VALUE;
@@ -96,11 +100,15 @@ public class DbImpl
private final Condition backgroundCondition = mutex.newCondition();
private final List pendingOutputs = new ArrayList<>(); // todo
+ private final Deque writers = new LinkedList<>();
+ private final SnapshotList snapshots = new SnapshotList(mutex);
+ private final WriteBatchImpl tmpBatch = new WriteBatchImpl();
+ private final Env env;
private LogWriter log;
private MemTable memTable;
- private MemTable immutableMemTable;
+ private volatile MemTable immutableMemTable;
private final InternalKeyComparator internalKeyComparator;
@@ -110,9 +118,12 @@ public class DbImpl
private ManualCompaction manualCompaction;
- public DbImpl(Options options, File databaseDir)
+ private CompactionStats[] stats = new CompactionStats[DbConstants.NUM_LEVELS];
+
+ public DbImpl(Options options, File databaseDir, Env env)
throws IOException
{
+ this.env = env;
requireNonNull(options, "options is null");
requireNonNull(databaseDir, "databaseDir is null");
this.options = options;
@@ -124,6 +135,11 @@ public DbImpl(Options options, File databaseDir)
this.databaseDir = databaseDir;
+ if (this.options.filterPolicy() != null) {
+ checkArgument(this.options.filterPolicy() instanceof FilterPolicy, "Filter policy must implement Java interface FilterPolicy");
+ this.options.filterPolicy(InternalFilterPolicy.convert(this.options.filterPolicy()));
+ }
+
//use custom comparator if set
DBComparator comparator = options.comparator();
UserComparator userComparator;
@@ -154,7 +170,7 @@ public void uncaughtException(Thread t, Throwable e)
// Reserve ten files or so for other uses and give the rest to TableCache.
int tableCacheSize = options.maxOpenFiles() - 10;
- tableCache = new TableCache(databaseDir, tableCacheSize, new InternalUserComparator(internalKeyComparator), options.verifyChecksums());
+ tableCache = new TableCache(databaseDir, tableCacheSize, new InternalUserComparator(internalKeyComparator), options);
// create the version set
@@ -163,6 +179,10 @@ public void uncaughtException(Thread t, Throwable e)
checkArgument(databaseDir.exists(), "Database directory '%s' does not exist and could not be created", databaseDir);
checkArgument(databaseDir.isDirectory(), "Database directory '%s' is not a directory", databaseDir);
+ for (int i = 0; i < DbConstants.NUM_LEVELS; i++) {
+ stats[i] = new CompactionStats();
+ }
+
mutex.lock();
try {
// lock the database dir
@@ -177,7 +197,7 @@ public void uncaughtException(Thread t, Throwable e)
checkArgument(!options.errorIfExists(), "Database '%s' exists and the error if exists option is enabled", databaseDir);
}
- versions = new VersionSet(databaseDir, tableCache, internalKeyComparator);
+ versions = new VersionSet(databaseDir, tableCache, internalKeyComparator, options.allowMmapWrites());
// load (and recover) current version
versions.recover();
@@ -216,11 +236,11 @@ public void uncaughtException(Thread t, Throwable e)
// open transaction log
long logFileNumber = versions.getNextFileNumber();
- this.log = Logs.createLogWriter(new File(databaseDir, Filename.logFileName(logFileNumber)), logFileNumber);
+ this.log = Logs.createLogWriter(new File(databaseDir, Filename.logFileName(logFileNumber)), logFileNumber, options.allowMmapWrites());
edit.setLogNumber(log.getFileNumber());
// apply recovered edits
- versions.logAndApply(edit);
+ versions.logAndApply(edit, mutex);
// cleanup unused files
deleteObsoleteFiles();
@@ -275,6 +295,47 @@ public void close()
public String getProperty(String name)
{
checkBackgroundException();
+ if (!name.startsWith("leveldb.")) {
+ return null;
+ }
+ String key = name.substring("leveldb.".length());
+ mutex.lock();
+ try {
+ Matcher matcher;
+ matcher = Pattern.compile("num-files-at-level(\\d+)")
+ .matcher(key);
+ if (matcher.matches()) {
+ final int level = Integer.valueOf(matcher.group(1));
+ return String.valueOf(versions.numberOfFilesInLevel(level));
+ }
+ matcher = Pattern.compile("stats")
+ .matcher(key);
+ if (matcher.matches()) {
+ final StringBuilder stringBuilder = new StringBuilder();
+ stringBuilder.append(" Compactions\n");
+ stringBuilder.append("Level Files Size(MB) Time(sec) Read(MB) Write(MB)\n");
+ stringBuilder.append("--------------------------------------------------\n");
+ for (int level = 0; level < DbConstants.NUM_LEVELS; level++) {
+ int files = versions.numberOfFilesInLevel(level);
+ if (stats[level].micros > 0 || files > 0) {
+ stringBuilder.append(String.format(
+ "%3d %8d %8.0f %9.0f %8.0f %9.0f\n",
+ level,
+ files,
+ versions.numberOfBytesInLevel(level) / 1048576.0,
+ stats[level].micros / 1e6,
+ stats[level].bytesRead / 1048576.0,
+ stats[level].bytesWritten / 1048576.0));
+ }
+ }
+ return stringBuilder.toString();
+ }
+ //TODO implement sstables
+ //TODO implement approximate-memory-usage
+ }
+ finally {
+ mutex.unlock();
+ }
return null;
}
@@ -338,13 +399,13 @@ public void flushMemTable()
mutex.lock();
try {
// force compaction
- makeRoomForWrite(true);
+ writeInternal(null, new WriteOptions());
// todo bg_error code
while (immutableMemTable != null) {
backgroundCondition.awaitUninterruptibly();
}
-
+ checkBackgroundException();
}
finally {
mutex.unlock();
@@ -354,7 +415,7 @@ public void flushMemTable()
public void compactRange(int level, Slice start, Slice end)
{
checkArgument(level >= 0, "level is negative");
- checkArgument(level + 1 < NUM_LEVELS, "level is greater than or equal to %s", NUM_LEVELS);
+ checkArgument(level + 1 < DbConstants.NUM_LEVELS, "level is greater than or equal to %s", DbConstants.NUM_LEVELS);
requireNonNull(start, "start is null");
requireNonNull(end, "end is null");
@@ -363,7 +424,9 @@ public void compactRange(int level, Slice start, Slice end)
while (this.manualCompaction != null) {
backgroundCondition.awaitUninterruptibly();
}
- ManualCompaction manualCompaction = new ManualCompaction(level, start, end);
+ ManualCompaction manualCompaction = new ManualCompaction(level,
+ new InternalKey(start, SequenceNumber.MAX_SEQUENCE_NUMBER, VALUE),
+ new InternalKey(end, 0, DELETION));
this.manualCompaction = manualCompaction;
maybeScheduleCompaction();
@@ -388,29 +451,16 @@ private void maybeScheduleCompaction()
else if (shuttingDown.get()) {
// DB is being shutdown; no more background compactions
}
+ else if (backgroundException != null) {
+ // Already got an error; no more changes
+ }
else if (immutableMemTable == null &&
manualCompaction == null &&
!versions.needsCompaction()) {
// No work to be done
}
else {
- backgroundCompaction = compactionExecutor.submit(new Callable()
- {
- @Override
- public Void call()
- throws Exception
- {
- try {
- backgroundCall();
- }
- catch (DatabaseShutdownException ignored) {
- }
- catch (Throwable e) {
- backgroundException = e;
- }
- return null;
- }
- });
+ backgroundCompaction = compactionExecutor.submit(this::backgroundCall);
}
}
@@ -423,36 +473,37 @@ public void checkBackgroundException()
}
private void backgroundCall()
- throws IOException
{
mutex.lock();
try {
- if (backgroundCompaction == null) {
- return;
- }
+ checkState(backgroundCompaction != null, "Compaction was not correctly scheduled");
try {
- if (!shuttingDown.get()) {
+ if (!shuttingDown.get() && backgroundException == null) {
backgroundCompaction();
}
}
finally {
backgroundCompaction = null;
}
+ // Previous compaction may have produced too many files in a level,
+ // so reschedule another compaction if needed.
+ maybeScheduleCompaction();
+ }
+ catch (DatabaseShutdownException ignored) {
+ }
+ catch (Throwable throwable) {
+ backgroundException = throwable;
+ if (throwable instanceof Error) {
+ throw (Error) throwable;
+ }
}
finally {
try {
- // Previous compaction may have produced too many files in a level,
- // so reschedule another compaction if needed.
- maybeScheduleCompaction();
+ backgroundCondition.signalAll();
}
finally {
- try {
- backgroundCondition.signalAll();
- }
- finally {
- mutex.unlock();
- }
+ mutex.unlock();
}
}
}
@@ -462,13 +513,20 @@ private void backgroundCompaction()
{
checkState(mutex.isHeldByCurrentThread());
- compactMemTableInternal();
+ if (immutableMemTable != null) {
+ compactMemTable();
+ }
Compaction compaction;
- if (manualCompaction != null) {
- compaction = versions.compactRange(manualCompaction.level,
- new InternalKey(manualCompaction.begin, MAX_SEQUENCE_NUMBER, VALUE),
- new InternalKey(manualCompaction.end, 0, DELETION));
+ InternalKey manualEnd = null;
+ boolean isManual = manualCompaction != null;
+ if (isManual) {
+ ManualCompaction m = this.manualCompaction;
+ compaction = versions.compactRange(m.level, m.begin, m.end);
+ m.done = compaction == null;
+ if (compaction != null) {
+ manualEnd = compaction.input(0, compaction.getLevelInputs().size() - 1).getLargest();
+ }
}
else {
compaction = versions.pickCompaction();
@@ -477,23 +535,35 @@ private void backgroundCompaction()
if (compaction == null) {
// no compaction
}
- else if (manualCompaction == null && compaction.isTrivialMove()) {
+ else if (!isManual && compaction.isTrivialMove()) {
// Move file to next level
checkState(compaction.getLevelInputs().size() == 1);
FileMetaData fileMetaData = compaction.getLevelInputs().get(0);
compaction.getEdit().deleteFile(compaction.getLevel(), fileMetaData.getNumber());
compaction.getEdit().addFile(compaction.getLevel() + 1, fileMetaData);
- versions.logAndApply(compaction.getEdit());
+ versions.logAndApply(compaction.getEdit(), mutex);
// log
}
else {
CompactionState compactionState = new CompactionState(compaction);
doCompactionWork(compactionState);
+ compaction.close(); //release resources
cleanupCompaction(compactionState);
+ deleteObsoleteFiles();
+ }
+ if (compaction != null) {
+ compaction.close();
}
// manual compaction complete
- if (manualCompaction != null) {
+ if (isManual) {
+ ManualCompaction m = manualCompaction;
+ if (backgroundException != null) {
+ m.done = true;
+ }
+ if (!m.done) {
+ m.begin = manualEnd;
+ }
manualCompaction = null;
}
}
@@ -519,10 +589,9 @@ private long recoverLogFile(long fileNumber, VersionEdit edit)
{
checkState(mutex.isHeldByCurrentThread());
File file = new File(databaseDir, Filename.logFileName(fileNumber));
- try (FileInputStream fis = new FileInputStream(file);
- FileChannel channel = fis.getChannel()) {
+ try (SequentialFile in = SequentialFileImpl.open(file);) {
LogMonitor logMonitor = LogMonitors.logMonitor();
- LogReader logReader = new LogReader(channel, logMonitor, true, 0);
+ LogReader logReader = new LogReader(in, logMonitor, true, 0);
// Log(options_.info_log, "Recovering log #%llu", (unsigned long long) log_number);
@@ -583,44 +652,41 @@ public byte[] get(byte[] key, ReadOptions options)
{
checkBackgroundException();
LookupKey lookupKey;
+ LookupResult lookupResult;
mutex.lock();
try {
- SnapshotImpl snapshot = getSnapshot(options);
- lookupKey = new LookupKey(Slices.wrappedBuffer(key), snapshot.getLastSequence());
+ long lastSequence = options.snapshot() != null ?
+ snapshots.getSequenceFrom(options.snapshot()) : versions.getLastSequence();
+ lookupKey = new LookupKey(Slices.wrappedBuffer(key), lastSequence);
// First look in the memtable, then in the immutable memtable (if any).
- LookupResult lookupResult = memTable.get(lookupKey);
- if (lookupResult != null) {
- Slice value = lookupResult.getValue();
- if (value == null) {
- return null;
- }
- return value.getBytes();
- }
- if (immutableMemTable != null) {
- lookupResult = immutableMemTable.get(lookupKey);
- if (lookupResult != null) {
- Slice value = lookupResult.getValue();
- if (value == null) {
- return null;
- }
- return value.getBytes();
- }
- }
- }
- finally {
+ final MemTable memTable = this.memTable;
+ final MemTable immutableMemTable = this.immutableMemTable;
+ final Version current = versions.getCurrent();
+ current.retain();
+ ReadStats readStats = null;
mutex.unlock();
- }
+ try {
+ lookupResult = memTable.get(lookupKey);
+ if (lookupResult == null && immutableMemTable != null) {
+ lookupResult = immutableMemTable.get(lookupKey);
+ }
- // Not in memTables; try live files in level order
- LookupResult lookupResult = versions.get(lookupKey);
+ if (lookupResult == null) {
+ // Not in memTables; try live files in level order
+ readStats = new ReadStats();
+ lookupResult = current.get(lookupKey, readStats);
+ }
- // schedule compaction if necessary
- mutex.lock();
- try {
- if (versions.needsCompaction()) {
+ // schedule compaction if necessary
+ }
+ finally {
+ mutex.lock();
+ }
+ if (readStats != null && current.updateStats(readStats)) {
maybeScheduleCompaction();
}
+ current.release();
}
finally {
mutex.unlock();
@@ -646,21 +712,27 @@ public void put(byte[] key, byte[] value)
public Snapshot put(byte[] key, byte[] value, WriteOptions options)
throws DBException
{
- return writeInternal(new WriteBatchImpl().put(key, value), options);
+ try (WriteBatchImpl writeBatch = new WriteBatchImpl()) {
+ return writeInternal(writeBatch.put(key, value), options);
+ }
}
@Override
public void delete(byte[] key)
throws DBException
{
- writeInternal(new WriteBatchImpl().delete(key), new WriteOptions());
+ try (WriteBatchImpl writeBatch = new WriteBatchImpl()) {
+ writeInternal(writeBatch.delete(key), new WriteOptions());
+ }
}
@Override
public Snapshot delete(byte[] key, WriteOptions options)
throws DBException
{
- return writeInternal(new WriteBatchImpl().delete(key), options);
+ try (WriteBatchImpl writeBatch = new WriteBatchImpl()) {
+ return writeInternal(writeBatch.delete(key), options);
+ }
}
@Override
@@ -677,41 +749,84 @@ public Snapshot write(WriteBatch updates, WriteOptions options)
return writeInternal((WriteBatchImpl) updates, options);
}
- public Snapshot writeInternal(WriteBatchImpl updates, WriteOptions options)
+ public Snapshot writeInternal(WriteBatchImpl myBatch, WriteOptions options)
throws DBException
{
checkBackgroundException();
+ final WriteBatchInternal w = new WriteBatchInternal(myBatch, options.sync(), mutex.newCondition());
mutex.lock();
try {
+ writers.offerLast(w);
+ while (!w.done && writers.peekFirst() != w) {
+ w.await();
+ }
+ if (w.done) {
+ return null;
+ }
long sequenceEnd;
- if (updates.size() != 0) {
- makeRoomForWrite(false);
+ WriteBatchImpl updates = null;
+ ValueHolder lastWriter = new ValueHolder<>(w);
+ // May temporarily unlock and wait.
+ makeRoomForWrite(myBatch == null);
+ if (myBatch != null) {
+ updates = buildBatchGroup(lastWriter);
// Get sequence numbers for this change set
long sequenceBegin = versions.getLastSequence() + 1;
sequenceEnd = sequenceBegin + updates.size() - 1;
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since "w" is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into mem_.
+ // log and memtable are modified by makeRoomForWrite
+ {
+ mutex.unlock();
+ try {
+ // Log write
+ Slice record = writeWriteBatch(updates, sequenceBegin);
+ try {
+ log.addRecord(record, options.sync());
+ }
+ catch (IOException e) {
+ throw new DBException(e);
+ }
+
+ // Update memtable
+ //this.memTable is modified by makeRoomForWrite
+ updates.forEach(new InsertIntoHandler(this.memTable, sequenceBegin));
+ }
+ finally {
+ mutex.lock();
+ }
+ }
+ if (updates == tmpBatch) {
+ tmpBatch.clear();
+ }
// Reserve this sequence in the version set
versions.setLastSequence(sequenceEnd);
+ }
- // Log write
- Slice record = writeWriteBatch(updates, sequenceBegin);
- try {
- log.addRecord(record, options.sync());
+ final WriteBatchInternal lastWriteV = lastWriter.getValue();
+ while (true) {
+ WriteBatchInternal ready = writers.peekFirst();
+ writers.pollFirst();
+ if (ready != w) {
+ ready.done = true;
+ ready.signal();
}
- catch (IOException e) {
- throw Throwables.propagate(e);
+ if (ready == lastWriteV) {
+ break;
}
-
- // Update memtable
- updates.forEach(new InsertIntoHandler(memTable, sequenceBegin));
}
- else {
- sequenceEnd = versions.getLastSequence();
+
+ // Notify new head of write queue
+ if (!writers.isEmpty()) {
+ writers.peekFirst().signal();
}
if (options.snapshot()) {
- return new SnapshotImpl(versions.getCurrent(), sequenceEnd);
+ return snapshots.newSnapshot(versions.getLastSequence());
}
else {
return null;
@@ -722,6 +837,60 @@ public Snapshot writeInternal(WriteBatchImpl updates, WriteOptions options)
}
}
+ /**
+ * Combines the batch of the first queued writer with the batches queued behind it, bounded by a size limit, and returns the batch to write; {@code lastWriter} receives the last writer that was included.
+ * REQUIRES: Writer list must be non-empty and the first writer must have a non-NULL batch
+ */
+ private WriteBatchImpl buildBatchGroup(ValueHolder lastWriter)
+ {
+ checkArgument(!writers.isEmpty(), "A least one writer is required");
+ final WriteBatchInternal first = writers.peekFirst();
+ WriteBatchImpl result = first.batch;
+ checkArgument(result != null, "Batch must be non null");
+
+ int sizeInit;
+ sizeInit = first.batch.getApproximateSize();
+ /*
+ * Allow the group to grow up to a maximum size, but if the
+ * original write is small, limit the growth so we do not slow
+ * down the small write too much.
+ */
+ int maxSize = 1 << 20; // cap used when the first batch is larger than 128 << 10
+ if (sizeInit <= (128 << 10)) {
+ maxSize = sizeInit + (128 << 10);
+ }
+
+ int size = 0; // accumulated approximate size; the loop starts at the head of the queue, so the first batch is counted too
+ lastWriter.setValue(first);
+ for (WriteBatchInternal w : writers) {
+ if (w.sync && !lastWriter.getValue().sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w.batch != null) {
+ size += w.batch.getApproximateSize();
+ if (size > maxSize) {
+ // Do not make batch too big
+ break;
+ }
+
+ // Append to result
+ if (result == first.batch) {
+ // Switch to temporary batch instead of disturbing caller's batch
+ result = tmpBatch;
+ checkState(result.size() == 0, "Temp batch should be clean");
+ result.append(first.batch);
+ }
+ else if (first.batch != w.batch) { // the first batch was already copied when switching to tmpBatch
+ result.append(w.batch);
+ }
+ }
+ lastWriter.setValue(w); // w is now the last writer covered by the group
+ }
+ return result;
+ }
+
@Override
public WriteBatch createWriteBatch()
{
@@ -744,7 +913,7 @@ public SeekingIteratorAdapter iterator(ReadOptions options)
DbIterator rawIterator = internalIterator();
// filter any entries not visible in our snapshot
- SnapshotImpl snapshot = getSnapshot(options);
+ long snapshot = getSnapshot(options);
SnapshotSeekingIterator snapshotIterator = new SnapshotSeekingIterator(rawIterator, snapshot, internalKeyComparator.getUserComparator());
return new SeekingIteratorAdapter(snapshotIterator);
}
@@ -753,18 +922,6 @@ public SeekingIteratorAdapter iterator(ReadOptions options)
}
}
- SeekingIterable internalIterable()
- {
- return new SeekingIterable()
- {
- @Override
- public DbIterator iterator()
- {
- return internalIterator();
- }
- };
- }
-
DbIterator internalIterator()
{
mutex.lock();
@@ -775,7 +932,16 @@ DbIterator internalIterator()
iterator = immutableMemTable.iterator();
}
Version current = versions.getCurrent();
- return new DbIterator(memTable.iterator(), iterator, current.getLevel0Files(), current.getLevelIterators(), internalKeyComparator);
+ current.retain();
+ return new DbIterator(memTable.iterator(), iterator, current.getLevelIterators(), internalKeyComparator, () -> {
+ mutex.lock();
+ try {
+ current.release();
+ }
+ finally {
+ mutex.unlock();
+ }
+ });
}
finally {
mutex.unlock();
@@ -788,22 +954,21 @@ public Snapshot getSnapshot()
checkBackgroundException();
mutex.lock();
try {
- return new SnapshotImpl(versions.getCurrent(), versions.getLastSequence());
+ return snapshots.newSnapshot(versions.getLastSequence());
}
finally {
mutex.unlock();
}
}
- private SnapshotImpl getSnapshot(ReadOptions options)
+ private long getSnapshot(ReadOptions options)
{
- SnapshotImpl snapshot;
+ long snapshot;
if (options.snapshot() != null) {
- snapshot = (SnapshotImpl) options.snapshot();
+ snapshot = snapshots.getSequenceFrom(options.snapshot());
}
else {
- snapshot = new SnapshotImpl(versions.getCurrent(), versions.getLastSequence());
- snapshot.close(); // To avoid holding the snapshot active..
+ snapshot = versions.getLastSequence();
}
return snapshot;
}
@@ -811,17 +976,15 @@ private SnapshotImpl getSnapshot(ReadOptions options)
private void makeRoomForWrite(boolean force)
{
checkState(mutex.isHeldByCurrentThread());
+ checkState(!writers.isEmpty());
boolean allowDelay = !force;
while (true) {
- // todo background processing system need work
-// if (!bg_error_.ok()) {
-// // Yield previous error
-// s = bg_error_;
-// break;
-// } else
- if (allowDelay && versions.numberOfFilesInLevel(0) > L0_SLOWDOWN_WRITES_TRIGGER) {
+ if (backgroundException != null) {
+ throw new DBException("Background exception occurred", backgroundException);
+ }
+ else if (allowDelay && versions.numberOfFilesInLevel(0) > L0_SLOWDOWN_WRITES_TRIGGER) {
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
@@ -866,13 +1029,13 @@ else if (versions.numberOfFilesInLevel(0) >= L0_STOP_WRITES_TRIGGER) {
log.close();
}
catch (IOException e) {
- throw new RuntimeException("Unable to close log file " + log.getFile(), e);
+ throw new RuntimeException("Unable to close log file " + log, e);
}
// open a new log
long logNumber = versions.getNextFileNumber();
try {
- this.log = Logs.createLogWriter(new File(databaseDir, Filename.logFileName(logNumber)), logNumber);
+ this.log = Logs.createLogWriter(new File(databaseDir, Filename.logFileName(logNumber)), logNumber, options.allowMmapWrites());
}
catch (IOException e) {
throw new RuntimeException("Unable to open new log file " +
@@ -891,31 +1054,19 @@ else if (versions.numberOfFilesInLevel(0) >= L0_STOP_WRITES_TRIGGER) {
}
}
- public void compactMemTable()
- throws IOException
- {
- mutex.lock();
- try {
- compactMemTableInternal();
- }
- finally {
- mutex.unlock();
- }
- }
-
- private void compactMemTableInternal()
+ private void compactMemTable()
throws IOException
{
checkState(mutex.isHeldByCurrentThread());
- if (immutableMemTable == null) {
- return;
- }
+ checkState(immutableMemTable != null);
try {
// Save the contents of the memtable as a new Table
VersionEdit edit = new VersionEdit();
Version base = versions.getCurrent();
+ base.retain();
writeLevel0Table(immutableMemTable, edit, base);
+ base.release();
if (shuttingDown.get()) {
throw new DatabaseShutdownException("Database shutdown during memtable compaction");
@@ -924,10 +1075,9 @@ private void compactMemTableInternal()
// Replace immutable memtable with the generated Table
edit.setPreviousLogNumber(0);
edit.setLogNumber(log.getFileNumber()); // Earlier logs no longer needed
- versions.logAndApply(edit);
+ versions.logAndApply(edit, mutex);
immutableMemTable = null;
-
deleteObsoleteFiles();
}
finally {
@@ -938,6 +1088,7 @@ private void compactMemTableInternal()
private void writeLevel0Table(MemTable mem, VersionEdit edit, Version base)
throws IOException
{
+ final long startMicros = env.nowMicros();
checkState(mutex.isHeldByCurrentThread());
// skip empty mem table
@@ -969,6 +1120,7 @@ private void writeLevel0Table(MemTable mem, VersionEdit edit, Version base)
}
edit.addFile(level, meta);
}
+ this.stats[level].Add(env.nowMicros() - startMicros, 0, meta.getFileSize());
}
private FileMetaData buildTable(SeekingIterable data, long fileNumber)
@@ -978,9 +1130,8 @@ private FileMetaData buildTable(SeekingIterable data, long f
try {
InternalKey smallest = null;
InternalKey largest = null;
- FileChannel channel = new FileOutputStream(file).getChannel();
- try {
- TableBuilder tableBuilder = new TableBuilder(options, channel, new InternalUserComparator(internalKeyComparator));
+ try (WritableFile writableFile = UnbufferedWritableFile.open(file)) {
+ TableBuilder tableBuilder = new TableBuilder(options, writableFile, new InternalUserComparator(internalKeyComparator));
for (Entry entry : data) {
// update keys
@@ -994,14 +1145,7 @@ private FileMetaData buildTable(SeekingIterable data, long f
}
tableBuilder.finish();
- }
- finally {
- try {
- channel.force(true);
- }
- finally {
- channel.close();
- }
+ writableFile.force();
}
if (smallest == null) {
@@ -1012,8 +1156,6 @@ private FileMetaData buildTable(SeekingIterable data, long f
// verify table can be opened
tableCache.newIterator(fileMetaData);
- pendingOutputs.remove(fileNumber);
-
return fileMetaData;
}
@@ -1026,13 +1168,14 @@ private FileMetaData buildTable(SeekingIterable data, long f
private void doCompactionWork(CompactionState compactionState)
throws IOException
{
+ final long startMicros = env.nowMicros();
+ long immMicros = 0; // Micros spent doing imm_ compactions
checkState(mutex.isHeldByCurrentThread());
checkArgument(versions.numberOfBytesInLevel(compactionState.getCompaction().getLevel()) > 0);
checkArgument(compactionState.builder == null);
checkArgument(compactionState.outfile == null);
- // todo track snapshots
- compactionState.smallestSnapshot = versions.getLastSequence();
+ compactionState.smallestSnapshot = snapshots.isEmpty() ? versions.getLastSequence() : snapshots.getOldest();
// Release mutex while we're actually doing the compaction work
mutex.unlock();
@@ -1045,14 +1188,17 @@ private void doCompactionWork(CompactionState compactionState)
long lastSequenceForKey = MAX_SEQUENCE_NUMBER;
while (iterator.hasNext() && !shuttingDown.get()) {
// always give priority to compacting the current mem table
- mutex.lock();
- try {
- compactMemTableInternal();
- }
- finally {
- mutex.unlock();
+ if (immutableMemTable != null) {
+ long immStart = env.nowMicros();
+ mutex.lock();
+ try {
+ compactMemTable();
+ }
+ finally {
+ mutex.unlock();
+ }
+ immMicros += (env.nowMicros() - immStart);
}
-
InternalKey key = iterator.peek().getKey();
if (compactionState.compaction.shouldStopBefore(key) && compactionState.builder != null) {
finishCompactionOutputFile(compactionState);
@@ -1123,11 +1269,20 @@ else if (key.getValueType() == DELETION &&
}
}
finally {
+ long micros = env.nowMicros() - startMicros - immMicros;
+ long bytesRead = 0;
+ for (int which = 0; which < 2; which++) {
+ for (int i = 0; i < compactionState.compaction.input(which).size(); i++) {
+ bytesRead += compactionState.compaction.input(which, i).getFileSize();
+ }
+ }
+ long bytesWritten = 0;
+ for (int i = 0; i < compactionState.outputs.size(); i++) {
+ bytesWritten += compactionState.outputs.get(i).getFileSize();
+ }
mutex.lock();
+ this.stats[compactionState.compaction.getLevel() + 1].Add(micros, bytesRead, bytesWritten);
}
-
- // todo port CompactionStats code
-
installCompactionResults(compactionState);
}
@@ -1137,22 +1292,22 @@ private void openCompactionOutputFile(CompactionState compactionState)
requireNonNull(compactionState, "compactionState is null");
checkArgument(compactionState.builder == null, "compactionState builder is not null");
+ long fileNumber;
mutex.lock();
try {
- long fileNumber = versions.getNextFileNumber();
+ fileNumber = versions.getNextFileNumber();
pendingOutputs.add(fileNumber);
compactionState.currentFileNumber = fileNumber;
compactionState.currentFileSize = 0;
compactionState.currentSmallest = null;
compactionState.currentLargest = null;
-
- File file = new File(databaseDir, Filename.tableFileName(fileNumber));
- compactionState.outfile = new FileOutputStream(file).getChannel();
- compactionState.builder = new TableBuilder(options, compactionState.outfile, new InternalUserComparator(internalKeyComparator));
}
finally {
mutex.unlock();
}
+ File file = new File(databaseDir, Filename.tableFileName(fileNumber));
+ compactionState.outfile = UnbufferedWritableFile.open(file);
+ compactionState.builder = new TableBuilder(options, compactionState.outfile, new InternalUserComparator(internalKeyComparator));
}
private void finishCompactionOutputFile(CompactionState compactionState)
@@ -1180,7 +1335,7 @@ private void finishCompactionOutputFile(CompactionState compactionState)
compactionState.builder = null;
- compactionState.outfile.force(true);
+ compactionState.outfile.force();
compactionState.outfile.close();
compactionState.outfile = null;
@@ -1203,25 +1358,21 @@ private void installCompactionResults(CompactionState compact)
pendingOutputs.remove(output.getNumber());
}
- try {
- versions.logAndApply(compact.compaction.getEdit());
- deleteObsoleteFiles();
- }
- catch (IOException e) {
- // Compaction failed for some reason. Simply discard the work and try again later.
-
- // Discard any files we may have created during this failed compaction
- for (FileMetaData output : compact.outputs) {
- File file = new File(databaseDir, Filename.tableFileName(output.getNumber()));
- file.delete();
- }
- compact.outputs.clear();
- }
+ versions.logAndApply(compact.compaction.getEdit(), mutex);
}
+ @VisibleForTesting
int numberOfFilesInLevel(int level)
{
- return versions.getCurrent().numberOfFilesInLevel(level);
+ mutex.lock();
+ Version v;
+ try {
+ v = versions.getCurrent();
+ }
+ finally {
+ mutex.unlock();
+ }
+ return v.numberOfFilesInLevel(level);
}
@Override
@@ -1238,19 +1389,39 @@ public long[] getApproximateSizes(Range... ranges)
public long getApproximateSizes(Range range)
{
- Version v = versions.getCurrent();
+ mutex.lock();
+ Version v;
+ try {
+ v = versions.getCurrent();
+ v.retain();
+ }
+ finally {
+ mutex.unlock();
+ }
InternalKey startKey = new InternalKey(Slices.wrappedBuffer(range.start()), MAX_SEQUENCE_NUMBER, VALUE);
InternalKey limitKey = new InternalKey(Slices.wrappedBuffer(range.limit()), MAX_SEQUENCE_NUMBER, VALUE);
long startOffset = v.getApproximateOffsetOf(startKey);
long limitOffset = v.getApproximateOffsetOf(limitKey);
-
+ mutex.lock();
+ try {
+ v.release();
+ }
+ finally {
+ mutex.unlock();
+ }
return (limitOffset >= startOffset ? limitOffset - startOffset : 0);
}
public long getMaxNextLevelOverlappingBytes()
{
- return versions.getMaxNextLevelOverlappingBytes();
+ mutex.lock();
+ try {
+ return versions.getMaxNextLevelOverlappingBytes();
+ }
+ finally {
+ mutex.unlock();
+ }
}
private static class CompactionState
@@ -1262,7 +1433,7 @@ private static class CompactionState
private long smallestSnapshot;
// State kept for output being generated
- private FileChannel outfile;
+ private WritableFile outfile;
private TableBuilder builder;
// Current file being generated
@@ -1287,10 +1458,11 @@ public Compaction getCompaction()
private static class ManualCompaction
{
private final int level;
- private final Slice begin;
- private final Slice end;
+ private InternalKey begin;
+ private final InternalKey end;
+ private boolean done;
- private ManualCompaction(int level, Slice begin, Slice end)
+ private ManualCompaction(int level, InternalKey begin, InternalKey end)
{
this.level = level;
this.begin = begin;
@@ -1298,6 +1470,29 @@ private ManualCompaction(int level, Slice begin, Slice end)
}
}
+    /**
+     * Per level compaction stats. stats[level] stores the stats for
+     * compactions that produced data for the specified "level".
+     */
+    private static class CompactionStats
+    {
+        long micros;
+        long bytesRead;
+        long bytesWritten;
+
+        CompactionStats()
+        {
+            // Every counter starts out at zero.
+            micros = 0L;
+            bytesRead = 0L;
+            bytesWritten = 0L;
+        }
+
+        /**
+         * Accumulates the figures of one more compaction into the running totals.
+         *
+         * @param micros elapsed time to add, in microseconds
+         * @param bytesRead number of input bytes to add
+         * @param bytesWritten number of output bytes to add
+         */
+        public void Add(long micros, long bytesRead, long bytesWritten)
+        {
+            this.micros = this.micros + micros;
+            this.bytesRead = this.bytesRead + bytesRead;
+            this.bytesWritten = this.bytesWritten + bytesWritten;
+        }
+    }
+
private WriteBatchImpl readWriteBatch(SliceInput record, int updateSize)
throws IOException
{
@@ -1353,31 +1548,6 @@ public void delete(Slice key)
return record.slice(0, sliceOutput.size());
}
- private static class InsertIntoHandler
- implements Handler
- {
- private long sequence;
- private final MemTable memTable;
-
- public InsertIntoHandler(MemTable memTable, long sequenceBegin)
- {
- this.memTable = memTable;
- this.sequence = sequenceBegin;
- }
-
- @Override
- public void put(Slice key, Slice value)
- {
- memTable.add(sequence++, VALUE, key, value);
- }
-
- @Override
- public void delete(Slice key)
- {
- memTable.add(sequence++, DELETION, key, Slices.EMPTY_SLICE);
- }
- }
-
public static class DatabaseShutdownException
extends DBException
{
@@ -1445,6 +1615,102 @@ public void resumeCompactions()
public void compactRange(byte[] begin, byte[] end)
throws DBException
{
- throw new UnsupportedOperationException("Not yet implemented");
+ final Slice smallestUserKey = begin == null ? null : new Slice(begin, 0, begin.length);
+ final Slice largestUserKey = end == null ? null : new Slice(end, 0, end.length);
+ int maxLevelWithFiles = 1;
+ mutex.lock();
+ try {
+ Version base = versions.getCurrent();
+ for (int level = 1; level < DbConstants.NUM_LEVELS; level++) {
+ if (base.overlapInLevel(level, smallestUserKey, largestUserKey)) {
+ maxLevelWithFiles = level;
+ }
+ }
+ }
+ finally {
+ mutex.unlock();
+ }
+ testCompactMemTable(); // TODO: Skip if memtable does not overlap
+ for (int level = 0; level < maxLevelWithFiles; level++) {
+ testCompactRange(level, smallestUserKey, largestUserKey);
+ }
+ }
+
+    /**
+     * Requests a manual compaction of {@code level} over the given user key range
+     * and blocks until it is reported done, the database starts shutting down, or
+     * a background exception is recorded.
+     *
+     * @param level level to compact; {@code level + 1} must still be a valid level
+     * @param begin first user key of the range, or null for an open start
+     * @param end last user key of the range, or null for an open end
+     */
+    @VisibleForTesting
+    void testCompactRange(int level, Slice begin, Slice end)
+            throws DBException
+    {
+        checkArgument(level >= 0);
+        checkArgument(level + 1 < DbConstants.NUM_LEVELS);
+
+        InternalKey beginStorage = null;
+        if (begin != null) {
+            beginStorage = new InternalKey(begin, SequenceNumber.MAX_SEQUENCE_NUMBER, VALUE);
+        }
+        InternalKey endStorage = null;
+        if (end != null) {
+            endStorage = new InternalKey(end, 0, DELETION);
+        }
+        final ManualCompaction request = new ManualCompaction(level, beginStorage, endStorage);
+
+        mutex.lock();
+        try {
+            while (!(request.done || shuttingDown.get() || backgroundException != null)) {
+                if (manualCompaction != null) {
+                    // Running either my compaction or another compaction.
+                    backgroundCondition.awaitUninterruptibly();
+                    continue;
+                }
+                // Idle: install this request and ask for it to be scheduled.
+                manualCompaction = request;
+                maybeScheduleCompaction();
+            }
+            if (manualCompaction == request) {
+                // Cancel my manual compaction since we aborted early for some reason.
+                manualCompaction = null;
+            }
+        }
+        finally {
+            mutex.unlock();
+        }
+    }
+
+    /**
+     * Issues an empty write and then blocks until no immutable memtable is
+     * pending or a background exception has been recorded.
+     *
+     * @throws DBException wrapping the background exception if one was recorded
+     * while an immutable memtable was still pending
+     */
+    @VisibleForTesting
+    public void testCompactMemTable()
+            throws DBException
+    {
+        // NULL batch means just wait for earlier writes to be done
+        writeInternal(null, new WriteOptions());
+
+        // Wait until the compaction completes
+        mutex.lock();
+        try {
+            for (;;) {
+                if (immutableMemTable == null || backgroundException != null) {
+                    break;
+                }
+                backgroundCondition.awaitUninterruptibly();
+            }
+            if (immutableMemTable != null && backgroundException != null) {
+                throw new DBException(backgroundException);
+            }
+        }
+        finally {
+            mutex.unlock();
+        }
+    }
+
+    /**
+     * One entry of the write queue: the batch a caller wants applied, whether that
+     * write must be synced, and the condition the caller parks on while waiting
+     * for its turn.
+     * <p>
+     * Declared static because it never touches the enclosing instance; this also
+     * keeps its own {@code backgroundCondition} from being mistaken for the
+     * enclosing class's field of the same name.
+     */
+    private static class WriteBatchInternal
+    {
+        // May be null; a null batch means "just wait for earlier writes to be done".
+        private final WriteBatchImpl batch;
+        private final boolean sync;
+        // NOTE(review): presumably created from the database mutex, since both
+        // methods below require the owning lock to be held - confirm at the call site.
+        private final Condition backgroundCondition;
+        // Read and written by the enclosing class; starts out false.
+        private boolean done;
+
+        public WriteBatchInternal(WriteBatchImpl batch, boolean sync, Condition backgroundCondition)
+        {
+            this.batch = batch;
+            this.sync = sync;
+            this.backgroundCondition = backgroundCondition;
+        }
+
+        /**
+         * Parks the calling thread on this writer's condition, ignoring interrupts.
+         */
+        public void await()
+        {
+            backgroundCondition.awaitUninterruptibly();
+        }
+
+        /**
+         * Wakes one thread parked on this writer's condition.
+         */
+        public void signal()
+        {
+            backgroundCondition.signal();
+        }
+    }
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Env.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Env.java
new file mode 100644
index 00000000..36989713
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Env.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.impl;
+
+/**
+ * Source of the clock readings used for timing measurements.
+ */
+public interface Env
+{
+ /**
+ * Returns the current clock reading in microseconds. Callers visible in this
+ * change only subtract two readings to obtain an elapsed time.
+ */
+ long nowMicros();
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/EnvImpl.java b/leveldb/src/main/java/org/iq80/leveldb/impl/EnvImpl.java
new file mode 100644
index 00000000..10385917
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/EnvImpl.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.impl;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * {@link Env} backed by the JVM's {@link System#nanoTime()} counter.
+ */
+public class EnvImpl implements Env
+{
+    /**
+     * Returns the current {@link System#nanoTime()} reading converted to microseconds.
+     */
+    @Override
+    public long nowMicros()
+    {
+        final long nanos = System.nanoTime();
+        return TimeUnit.MICROSECONDS.convert(nanos, TimeUnit.NANOSECONDS);
+    }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/FileChannelLogWriter.java b/leveldb/src/main/java/org/iq80/leveldb/impl/FileChannelLogWriter.java
deleted file mode 100644
index 7547cedf..00000000
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/FileChannelLogWriter.java
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.iq80.leveldb.impl;
-
-import org.iq80.leveldb.util.Closeables;
-import org.iq80.leveldb.util.Slice;
-import org.iq80.leveldb.util.SliceInput;
-import org.iq80.leveldb.util.SliceOutput;
-import org.iq80.leveldb.util.Slices;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
-import static java.util.Objects.requireNonNull;
-import static org.iq80.leveldb.impl.LogConstants.BLOCK_SIZE;
-import static org.iq80.leveldb.impl.LogConstants.HEADER_SIZE;
-
-public class FileChannelLogWriter
- implements LogWriter
-{
- private final File file;
- private final long fileNumber;
- private final FileChannel fileChannel;
- private final AtomicBoolean closed = new AtomicBoolean();
-
- /**
- * Current offset in the current block
- */
- private int blockOffset;
-
- public FileChannelLogWriter(File file, long fileNumber)
- throws FileNotFoundException
- {
- requireNonNull(file, "file is null");
- checkArgument(fileNumber >= 0, "fileNumber is negative");
-
- this.file = file;
- this.fileNumber = fileNumber;
- this.fileChannel = new FileOutputStream(file).getChannel();
- }
-
- @Override
- public boolean isClosed()
- {
- return closed.get();
- }
-
- @Override
- public synchronized void close()
- {
- closed.set(true);
-
- // try to forces the log to disk
- try {
- fileChannel.force(true);
- }
- catch (IOException ignored) {
- }
-
- // close the channel
- Closeables.closeQuietly(fileChannel);
- }
-
- @Override
- public synchronized void delete()
- {
- closed.set(true);
-
- // close the channel
- Closeables.closeQuietly(fileChannel);
-
- // try to delete the file
- file.delete();
- }
-
- @Override
- public File getFile()
- {
- return file;
- }
-
- @Override
- public long getFileNumber()
- {
- return fileNumber;
- }
-
- // Writes a stream of chunks such that no chunk is split across a block boundary
- @Override
- public synchronized void addRecord(Slice record, boolean force)
- throws IOException
- {
- checkState(!closed.get(), "Log has been closed");
-
- SliceInput sliceInput = record.input();
-
- // used to track first, middle and last blocks
- boolean begin = true;
-
- // Fragment the record int chunks as necessary and write it. Note that if record
- // is empty, we still want to iterate once to write a single
- // zero-length chunk.
- do {
- int bytesRemainingInBlock = BLOCK_SIZE - blockOffset;
- checkState(bytesRemainingInBlock >= 0);
-
- // Switch to a new block if necessary
- if (bytesRemainingInBlock < HEADER_SIZE) {
- if (bytesRemainingInBlock > 0) {
- // Fill the rest of the block with zeros
- // todo lame... need a better way to write zeros
- fileChannel.write(ByteBuffer.allocate(bytesRemainingInBlock));
- }
- blockOffset = 0;
- bytesRemainingInBlock = BLOCK_SIZE - blockOffset;
- }
-
- // Invariant: we never leave less than HEADER_SIZE bytes available in a block
- int bytesAvailableInBlock = bytesRemainingInBlock - HEADER_SIZE;
- checkState(bytesAvailableInBlock >= 0);
-
- // if there are more bytes in the record then there are available in the block,
- // fragment the record; otherwise write to the end of the record
- boolean end;
- int fragmentLength;
- if (sliceInput.available() > bytesAvailableInBlock) {
- end = false;
- fragmentLength = bytesAvailableInBlock;
- }
- else {
- end = true;
- fragmentLength = sliceInput.available();
- }
-
- // determine block type
- LogChunkType type;
- if (begin && end) {
- type = LogChunkType.FULL;
- }
- else if (begin) {
- type = LogChunkType.FIRST;
- }
- else if (end) {
- type = LogChunkType.LAST;
- }
- else {
- type = LogChunkType.MIDDLE;
- }
-
- // write the chunk
- writeChunk(type, sliceInput.readSlice(fragmentLength));
-
- // we are no longer on the first chunk
- begin = false;
- } while (sliceInput.isReadable());
-
- if (force) {
- fileChannel.force(false);
- }
- }
-
- private void writeChunk(LogChunkType type, Slice slice)
- throws IOException
- {
- checkArgument(slice.length() <= 0xffff, "length %s is larger than two bytes", slice.length());
- checkArgument(blockOffset + HEADER_SIZE <= BLOCK_SIZE);
-
- // create header
- Slice header = newLogRecordHeader(type, slice, slice.length());
-
- // write the header and the payload
- header.getBytes(0, fileChannel, header.length());
- slice.getBytes(0, fileChannel, slice.length());
-
- blockOffset += HEADER_SIZE + slice.length();
- }
-
- private Slice newLogRecordHeader(LogChunkType type, Slice slice, int length)
- {
- int crc = Logs.getChunkChecksum(type.getPersistentId(), slice.getRawArray(), slice.getRawOffset(), length);
-
- // Format the header
- SliceOutput header = Slices.allocate(HEADER_SIZE).output();
- header.writeInt(crc);
- header.writeByte((byte) (length & 0xff));
- header.writeByte((byte) (length >>> 8));
- header.writeByte((byte) (type.getPersistentId()));
-
- return header.slice();
- }
-}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Filename.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Filename.java
index b12ec99a..e8f7d819 100755
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Filename.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Filename.java
@@ -57,6 +57,13 @@ public static String logFileName(long number)
* Return the name of the sstable with the specified number.
*/
public static String tableFileName(long number)
+ {
+ return makeFileName(number, "ldb");
+ }
+ /**
+ * Return the deprecated name of the sstable with the specified number.
+ */
+ public static String sstTableFileName(long number)
{
return makeFileName(number, "sst");
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/InsertIntoHandler.java b/leveldb/src/main/java/org/iq80/leveldb/impl/InsertIntoHandler.java
new file mode 100644
index 00000000..cf777f3e
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/InsertIntoHandler.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.impl;
+
+import org.iq80.leveldb.util.Slice;
+import org.iq80.leveldb.util.Slices;
+
+import static org.iq80.leveldb.impl.ValueType.DELETION;
+import static org.iq80.leveldb.impl.ValueType.VALUE;
+
+/**
+ * Batch handler that replays every operation of a write batch into a
+ * {@link MemTable}, giving each operation the next consecutive sequence number.
+ */
+final class InsertIntoHandler
+        implements WriteBatchImpl.Handler
+{
+    private final MemTable memTable;
+    // Sequence number that the next replayed operation will receive.
+    private long sequence;
+
+    public InsertIntoHandler(MemTable memTable, long sequenceBegin)
+    {
+        this.sequence = sequenceBegin;
+        this.memTable = memTable;
+    }
+
+    @Override
+    public void put(Slice key, Slice value)
+    {
+        // Claim the sequence number before touching the memtable.
+        final long assigned = sequence;
+        sequence = assigned + 1;
+        memTable.add(assigned, VALUE, key, value);
+    }
+
+    @Override
+    public void delete(Slice key)
+    {
+        // A deletion is stored as an entry with an empty value.
+        final long assigned = sequence;
+        sequence = assigned + 1;
+        memTable.add(assigned, DELETION, key, Slices.EMPTY_SLICE);
+    }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/InternalFilterPolicy.java b/leveldb/src/main/java/org/iq80/leveldb/impl/InternalFilterPolicy.java
new file mode 100644
index 00000000..5b92b3c3
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/InternalFilterPolicy.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.impl;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Lists;
+import org.iq80.leveldb.XFilterPolicy;
+import org.iq80.leveldb.util.Slice;
+
+import java.util.List;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+/**
+ * Filter policy wrapper that converts from internal keys to user keys.
+ * <p>
+ * Every key handed to this policy has its trailing 8 bytes removed before it
+ * is passed on to the wrapped user policy.
+ *
+ * @author Honore Vasconcelos
+ */
+final class InternalFilterPolicy implements org.iq80.leveldb.table.FilterPolicy
+{
+    // A method reference to extractUserKey(Slice) needs the parameterized type.
+    private static final Function<Slice, Slice> EXTRACT_USER_KEY = InternalFilterPolicy::extractUserKey;
+    private final org.iq80.leveldb.table.FilterPolicy userPolicy;
+
+    private InternalFilterPolicy(org.iq80.leveldb.table.FilterPolicy userPolicy)
+    {
+        this.userPolicy = userPolicy;
+    }
+
+    /**
+     * Wraps a user supplied policy so that it can be fed internal keys.
+     *
+     * @param policy policy to wrap, may be null
+     * @return null for a null policy, the policy itself if it is already an
+     * {@code InternalFilterPolicy}, otherwise a new wrapper around it
+     * @throws IllegalArgumentException if the policy is non-null and does not
+     * implement {@code org.iq80.leveldb.table.FilterPolicy}
+     */
+    static InternalFilterPolicy convert(XFilterPolicy policy)
+    {
+        checkArgument(policy == null || policy instanceof org.iq80.leveldb.table.FilterPolicy, "Filter policy must implement Java interface FilterPolicy");
+        if (policy == null) {
+            return null;
+        }
+        if (policy instanceof InternalFilterPolicy) {
+            return (InternalFilterPolicy) policy;
+        }
+        return new InternalFilterPolicy((org.iq80.leveldb.table.FilterPolicy) policy);
+    }
+
+    @Override
+    public String name()
+    {
+        return userPolicy.name();
+    }
+
+    // NOTE(review): the element type of keys is not visible here; presumably
+    // these are internal keys as Slices - confirm against FilterPolicy#createFilter
+    // before adding a type argument to the parameter.
+    @Override
+    public byte[] createFilter(final List keys)
+    {
+        //instead of copying all the keys to a shorter form, make it lazy
+        return userPolicy.createFilter(Lists.transform(keys, EXTRACT_USER_KEY));
+    }
+
+    @Override
+    public boolean keyMayMatch(Slice key, Slice filter)
+    {
+        return userPolicy.keyMayMatch(extractUserKey(key), filter);
+    }
+
+    /**
+     * Returns {@code key} without its last 8 bytes.
+     * NOTE(review): presumably those bytes are the internal key trailer - confirm against InternalKey.
+     *
+     * @throws IllegalArgumentException if the key is shorter than 8 bytes
+     */
+    private static Slice extractUserKey(Slice key)
+    {
+        checkArgument(key.length() >= 8);
+        return key.slice(0, key.length() - 8);
+    }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Iq80DBFactory.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Iq80DBFactory.java
index 4a8244ac..400eddc0 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Iq80DBFactory.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Iq80DBFactory.java
@@ -80,7 +80,7 @@ public class Iq80DBFactory
public DB open(File path, Options options)
throws IOException
{
- return new DbImpl(options, path);
+ return new DbImpl(options, path, new EnvImpl());
}
@Override
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/KeyMatchingLookup.java b/leveldb/src/main/java/org/iq80/leveldb/impl/KeyMatchingLookup.java
new file mode 100644
index 00000000..8de14380
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/KeyMatchingLookup.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.iq80.leveldb.impl;
+
+import org.iq80.leveldb.table.KeyValueFunction;
+import org.iq80.leveldb.util.Slice;
+
+import static com.google.common.base.Preconditions.checkState;
+import static org.iq80.leveldb.impl.ValueType.VALUE;
+
+/**
+ * Callback that decides whether a table entry answers a given lookup.
+ * <p>
+ * An entry whose user key equals the lookup's user key yields a deleted result
+ * for a DELETION entry and an ok result carrying the value for a VALUE entry;
+ * any other entry yields {@code null}.
+ *
+ * @author Honore Vasconcelos
+ */
+public class KeyMatchingLookup implements KeyValueFunction
+{
+ private final LookupKey key;
+
+ KeyMatchingLookup(LookupKey key)
+ {
+ this.key = key;
+ }
+
+ /**
+ * @param internalKey1 slice from which the entry's internal key is parsed; must not be null
+ * @param value value stored with that entry
+ * @return the lookup result for a matching entry, or {@code null} when the entry does not answer this lookup
+ * @throws IllegalStateException if {@code internalKey1} is null
+ */
+ @Override
+ public LookupResult apply(Slice internalKey1, Slice value)
+ {
+ // a missing key is reported as corruption
+ checkState(internalKey1 != null, "Corrupt key for %s", key);
+
+ final InternalKey internalKey = new InternalKey(internalKey1);
+
+ // only an entry for the same user key can answer the lookup; deletions are reported too
+ if (key.getUserKey().equals(internalKey.getUserKey())) {
+ if (internalKey.getValueType() == ValueType.DELETION) {
+ return LookupResult.deleted(key);
+ }
+ else if (internalKey.getValueType() == VALUE) {
+ return LookupResult.ok(key, value);
+ }
+ }
+ return null;
+ }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Level.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Level.java
index 3b2b1147..d11c318f 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Level.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Level.java
@@ -17,21 +17,19 @@
*/
package org.iq80.leveldb.impl;
-import com.google.common.collect.Lists;
+import com.google.common.annotations.VisibleForTesting;
import org.iq80.leveldb.table.UserComparator;
-import org.iq80.leveldb.util.InternalTableIterator;
+import org.iq80.leveldb.util.InternalIterator;
+import org.iq80.leveldb.util.Level0Iterator;
import org.iq80.leveldb.util.LevelIterator;
import org.iq80.leveldb.util.Slice;
import java.util.ArrayList;
-import java.util.Collections;
+import java.util.Collection;
import java.util.Comparator;
import java.util.List;
-import java.util.Map.Entry;
import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
-import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Objects.requireNonNull;
import static org.iq80.leveldb.impl.SequenceNumber.MAX_SEQUENCE_NUMBER;
import static org.iq80.leveldb.impl.ValueType.VALUE;
@@ -40,12 +38,13 @@
public class Level
implements SeekingIterable
{
+ private static final Comparator NEWEST_FIRST = Comparator.comparingLong(FileMetaData::getNumber).reversed(); // descending file number; compared as long so a large gap cannot be truncated by an int cast
private final int levelNumber;
private final TableCache tableCache;
private final InternalKeyComparator internalKeyComparator;
private final List files;
- public Level(int levelNumber, List files, TableCache tableCache, InternalKeyComparator internalKeyComparator)
+ public Level(int levelNumber, Collection files, TableCache tableCache, InternalKeyComparator internalKeyComparator)
{
checkArgument(levelNumber >= 0, "levelNumber is negative");
requireNonNull(files, "files is null");
@@ -55,7 +54,6 @@ public Level(int levelNumber, List files, TableCache tableCache, I
this.files = new ArrayList<>(files);
this.tableCache = tableCache;
this.internalKeyComparator = internalKeyComparator;
- checkArgument(levelNumber >= 0, "levelNumber is negative");
this.levelNumber = levelNumber;
}
@@ -70,9 +68,14 @@ public List getFiles()
}
@Override
- public LevelIterator iterator()
+ public InternalIterator iterator()
{
- return createLevelConcatIterator(tableCache, files, internalKeyComparator);
+ if (levelNumber == 0) {
+ return new Level0Iterator(tableCache, files, internalKeyComparator);
+ }
+ else {
+ return createLevelConcatIterator(tableCache, files, internalKeyComparator);
+ }
}
public static LevelIterator createLevelConcatIterator(TableCache tableCache, List files, InternalKeyComparator internalKeyComparator)
@@ -94,10 +97,14 @@ public LookupResult get(LookupKey key, ReadStats readStats)
fileMetaDataList.add(fileMetaData);
}
}
+ if (fileMetaDataList.isEmpty()) {
+ return null;
+ }
+ fileMetaDataList.sort(NEWEST_FIRST);
}
else {
// Binary search to find earliest index whose largest key >= ikey.
- int index = ceilingEntryIndex(Lists.transform(files, FileMetaData::getLargest), key.getInternalKey(), internalKeyComparator);
+ int index = findFile(key.getInternalKey());
// did we find any files that could contain the key?
if (index >= files.size()) {
@@ -127,61 +134,65 @@ public LookupResult get(LookupKey key, ReadStats readStats)
lastFileRead = fileMetaData;
lastFileReadLevel = levelNumber;
- // open the iterator
- InternalTableIterator iterator = tableCache.newIterator(fileMetaData);
-
- // seek to the key
- iterator.seek(key.getInternalKey());
-
- if (iterator.hasNext()) {
- // parse the key in the block
- Entry entry = iterator.next();
- InternalKey internalKey = entry.getKey();
- checkState(internalKey != null, "Corrupt key for %s", key.getUserKey().toString(UTF_8));
-
- // if this is a value key (not a delete) and the keys match, return the value
- if (key.getUserKey().equals(internalKey.getUserKey())) {
- if (internalKey.getValueType() == ValueType.DELETION) {
- return LookupResult.deleted(key);
- }
- else if (internalKey.getValueType() == VALUE) {
- return LookupResult.ok(key, entry.getValue());
- }
- }
+ final LookupResult lookupResult = tableCache.get(key.getInternalKey().encode(), fileMetaData, new KeyMatchingLookup(key));
+ if (lookupResult != null) {
+ return lookupResult;
}
}
return null;
}
- private static int ceilingEntryIndex(List list, T key, Comparator comparator)
+ public boolean someFileOverlapsRange(boolean disjointSortedFiles, Slice smallestUserKey, Slice largestUserKey)
{
- int insertionPoint = Collections.binarySearch(list, key, comparator);
- if (insertionPoint < 0) {
- insertionPoint = -(insertionPoint + 1);
+ UserComparator userComparator = internalKeyComparator.getUserComparator();
+ if (!disjointSortedFiles) {
+ // Need to check against all files
+ for (FileMetaData file : files) {
+ if (afterFile(userComparator, smallestUserKey, file) ||
+ beforeFile(userComparator, largestUserKey, file)) {
+ // No overlap
+ }
+ else {
+ return true; // Overlap
+ }
+ }
+ return false;
+ }
+ int index = 0;
+ if (smallestUserKey != null) {
+ InternalKey smallestInternalKey = new InternalKey(smallestUserKey, MAX_SEQUENCE_NUMBER, VALUE);
+ index = findFile(smallestInternalKey);
+ }
+
+ if (index >= files.size()) {
+ // beginning of range is after all files, so no overlap.
+ return false;
}
- return insertionPoint;
+
+ return !beforeFile(userComparator, largestUserKey, files.get(index));
}
- public boolean someFileOverlapsRange(Slice smallestUserKey, Slice largestUserKey)
+ private boolean beforeFile(UserComparator userComparator, Slice userKey, FileMetaData file)
{
- InternalKey smallestInternalKey = new InternalKey(smallestUserKey, MAX_SEQUENCE_NUMBER, VALUE);
- int index = findFile(smallestInternalKey);
-
- UserComparator userComparator = internalKeyComparator.getUserComparator();
- return ((index < files.size()) &&
- userComparator.compare(largestUserKey, files.get(index).getSmallest().getUserKey()) >= 0);
+ // a null userKey sorts after all keys and is therefore never before the file
+ return (userKey != null &&
+ userComparator.compare(userKey, file.getSmallest().getUserKey()) < 0);
}
- private int findFile(InternalKey targetKey)
+ private boolean afterFile(UserComparator userComparator, Slice userKey, FileMetaData file)
{
- if (files.isEmpty()) {
- return files.size();
- }
+ // a null userKey sorts before all keys and is therefore never after the file
+ return (userKey != null &&
+ userComparator.compare(userKey, file.getLargest().getUserKey()) > 0);
+ }
+ @VisibleForTesting
+ int findFile(InternalKey targetKey)
+ {
// todo replace with Collections.binarySearch
int left = 0;
- int right = files.size() - 1;
+ int right = files.size();
// binary search restart positions to find the restart position immediately before the targetKey
while (left < right) {
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Level0.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Level0.java
deleted file mode 100644
index c1b7c9e7..00000000
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Level0.java
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.iq80.leveldb.impl;
-
-import org.iq80.leveldb.table.UserComparator;
-import org.iq80.leveldb.util.InternalTableIterator;
-import org.iq80.leveldb.util.Level0Iterator;
-import org.iq80.leveldb.util.Slice;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-import java.util.Map.Entry;
-
-import static com.google.common.base.Preconditions.checkState;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static java.util.Objects.requireNonNull;
-import static org.iq80.leveldb.impl.SequenceNumber.MAX_SEQUENCE_NUMBER;
-import static org.iq80.leveldb.impl.ValueType.VALUE;
-
-// todo this class should be immutable
-public class Level0
- implements SeekingIterable
-{
- private final TableCache tableCache;
- private final InternalKeyComparator internalKeyComparator;
- private final List files;
-
- public static final Comparator NEWEST_FIRST = new Comparator()
- {
- @Override
- public int compare(FileMetaData fileMetaData, FileMetaData fileMetaData1)
- {
- return (int) (fileMetaData1.getNumber() - fileMetaData.getNumber());
- }
- };
-
- public Level0(List files, TableCache tableCache, InternalKeyComparator internalKeyComparator)
- {
- requireNonNull(files, "files is null");
- requireNonNull(tableCache, "tableCache is null");
- requireNonNull(internalKeyComparator, "internalKeyComparator is null");
-
- this.files = new ArrayList<>(files);
- this.tableCache = tableCache;
- this.internalKeyComparator = internalKeyComparator;
- }
-
- public int getLevelNumber()
- {
- return 0;
- }
-
- public List getFiles()
- {
- return files;
- }
-
- @Override
- public Level0Iterator iterator()
- {
- return new Level0Iterator(tableCache, files, internalKeyComparator);
- }
-
- public LookupResult get(LookupKey key, ReadStats readStats)
- {
- if (files.isEmpty()) {
- return null;
- }
-
- List fileMetaDataList = new ArrayList<>(files.size());
- for (FileMetaData fileMetaData : files) {
- if (internalKeyComparator.getUserComparator().compare(key.getUserKey(), fileMetaData.getSmallest().getUserKey()) >= 0 &&
- internalKeyComparator.getUserComparator().compare(key.getUserKey(), fileMetaData.getLargest().getUserKey()) <= 0) {
- fileMetaDataList.add(fileMetaData);
- }
- }
-
- Collections.sort(fileMetaDataList, NEWEST_FIRST);
-
- readStats.clear();
- for (FileMetaData fileMetaData : fileMetaDataList) {
- // open the iterator
- InternalTableIterator iterator = tableCache.newIterator(fileMetaData);
-
- // seek to the key
- iterator.seek(key.getInternalKey());
-
- if (iterator.hasNext()) {
- // parse the key in the block
- Entry entry = iterator.next();
- InternalKey internalKey = entry.getKey();
- checkState(internalKey != null, "Corrupt key for %s", key.getUserKey().toString(UTF_8));
-
- // if this is a value key (not a delete) and the keys match, return the value
- if (key.getUserKey().equals(internalKey.getUserKey())) {
- if (internalKey.getValueType() == ValueType.DELETION) {
- return LookupResult.deleted(key);
- }
- else if (internalKey.getValueType() == VALUE) {
- return LookupResult.ok(key, entry.getValue());
- }
- }
- }
-
- if (readStats.getSeekFile() == null) {
- // We have had more than one seek for this read. Charge the first file.
- readStats.setSeekFile(fileMetaData);
- readStats.setSeekFileLevel(0);
- }
- }
-
- return null;
- }
-
- public boolean someFileOverlapsRange(Slice smallestUserKey, Slice largestUserKey)
- {
- InternalKey smallestInternalKey = new InternalKey(smallestUserKey, MAX_SEQUENCE_NUMBER, VALUE);
- int index = findFile(smallestInternalKey);
-
- UserComparator userComparator = internalKeyComparator.getUserComparator();
- return ((index < files.size()) &&
- userComparator.compare(largestUserKey, files.get(index).getSmallest().getUserKey()) >= 0);
- }
-
- private int findFile(InternalKey targetKey)
- {
- if (files.isEmpty()) {
- return files.size();
- }
-
- // todo replace with Collections.binarySearch
- int left = 0;
- int right = files.size() - 1;
-
- // binary search restart positions to find the restart position immediately before the targetKey
- while (left < right) {
- int mid = (left + right) / 2;
-
- if (internalKeyComparator.compare(files.get(mid).getLargest(), targetKey) < 0) {
- // Key at "mid.largest" is < "target". Therefore all
- // files at or before "mid" are uninteresting.
- left = mid + 1;
- }
- else {
- // Key at "mid.largest" is >= "target". Therefore all files
- // after "mid" are uninteresting.
- right = mid;
- }
- }
- return right;
- }
-
- public void addFile(FileMetaData fileMetaData)
- {
- // todo remove mutation
- files.add(fileMetaData);
- }
-
- @Override
- public String toString()
- {
- final StringBuilder sb = new StringBuilder();
- sb.append("Level0");
- sb.append("{files=").append(files);
- sb.append('}');
- return sb.toString();
- }
-}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/LogReader.java b/leveldb/src/main/java/org/iq80/leveldb/impl/LogReader.java
index 85c125cb..736931bf 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/LogReader.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/LogReader.java
@@ -17,6 +17,7 @@
*/
package org.iq80.leveldb.impl;
+import org.iq80.leveldb.util.SequentialFile;
import org.iq80.leveldb.util.DynamicSliceOutput;
import org.iq80.leveldb.util.Slice;
import org.iq80.leveldb.util.SliceInput;
@@ -24,7 +25,6 @@
import org.iq80.leveldb.util.Slices;
import java.io.IOException;
-import java.nio.channels.FileChannel;
import static org.iq80.leveldb.impl.LogChunkType.BAD_CHUNK;
import static org.iq80.leveldb.impl.LogChunkType.EOF;
@@ -37,7 +37,7 @@
public class LogReader
{
- private final FileChannel fileChannel;
+ private final SequentialFile sequentialFile;
private final LogMonitor monitor;
@@ -83,9 +83,9 @@ public class LogReader
*/
private Slice currentChunk = Slices.EMPTY_SLICE;
- public LogReader(FileChannel fileChannel, LogMonitor monitor, boolean verifyChecksums, long initialOffset)
+ public LogReader(SequentialFile sequentialFile, LogMonitor monitor, boolean verifyChecksums, long initialOffset)
{
- this.fileChannel = fileChannel;
+ this.sequentialFile = sequentialFile;
this.monitor = monitor;
this.verifyChecksums = verifyChecksums;
this.initialOffset = initialOffset;
@@ -118,7 +118,7 @@ private boolean skipToInitialBlock()
// Skip to start of first block that can contain the initial record
if (blockStartLocation > 0) {
try {
- fileChannel.position(blockStartLocation);
+ sequentialFile.skip(blockStartLocation);
}
catch (IOException e) {
reportDrop(blockStartLocation, e);
@@ -310,7 +310,7 @@ public boolean readNextBlock()
// read the next full block
while (blockScratch.writableBytes() > 0) {
try {
- int bytesRead = blockScratch.writeBytes(fileChannel, blockScratch.writableBytes());
+ int bytesRead = sequentialFile.read(blockScratch.writableBytes(), blockScratch);
if (bytesRead < 0) {
// no more bytes to read
eof = true;
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/LogWriter.java b/leveldb/src/main/java/org/iq80/leveldb/impl/LogWriter.java
index d61febcb..10ea6ea7 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/LogWriter.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/LogWriter.java
@@ -18,25 +18,204 @@
package org.iq80.leveldb.impl;
import org.iq80.leveldb.util.Slice;
+import org.iq80.leveldb.util.SliceInput;
+import org.iq80.leveldb.util.SliceOutput;
+import org.iq80.leveldb.util.Slices;
+import org.iq80.leveldb.util.WritableFile;
-import java.io.File;
+import java.io.Closeable;
import java.io.IOException;
+import java.util.concurrent.atomic.AtomicBoolean;
-public interface LogWriter
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static java.util.Objects.requireNonNull;
+import static org.iq80.leveldb.impl.LogConstants.BLOCK_SIZE;
+import static org.iq80.leveldb.impl.LogConstants.HEADER_SIZE;
+import static org.iq80.leveldb.impl.Logs.getChunkChecksum;
+
+/**
+ * Appends records to a {@link WritableFile} as a sequence of chunks, each consisting of a
+ * {@code HEADER_SIZE}-byte header followed by its payload. A record that does not fit into
+ * the current {@code BLOCK_SIZE} block is split into several chunks.
+ * <p>
+ * NOTE(review): no method here is synchronized and {@code blockOffset} is a plain field,
+ * so callers presumably serialize access to this writer; confirm.
+ */
+public class LogWriter
+ implements Closeable
{
- boolean isClosed();
+ // all-zero bytes used to pad the tail of a block; the padding is always shorter than HEADER_SIZE
+ private static final byte[] SA = new byte[HEADER_SIZE];
+ private final WritableFile writableFile;
+ private final long fileNumber;
+ private final AtomicBoolean closed = new AtomicBoolean();
+
+ /**
+ * Current offset in the current block
+ */
+ private int blockOffset;
+
+ private LogWriter(long fileNumber, WritableFile file)
+ {
+ requireNonNull(file, "file is null");
+ checkArgument(fileNumber >= 0, "fileNumber is negative");
+ this.fileNumber = fileNumber;
+ this.writableFile = file;
+ }
- void close()
- throws IOException;
+ /**
+ * Creates a writer that appends to {@code writableFile}.
+ *
+ * @param fileNumber number reported by {@link #getFileNumber()}; must not be negative
+ * @param writableFile destination of all appended bytes; must not be null
+ */
+ public static LogWriter createWriter(long fileNumber, WritableFile writableFile)
+ {
+ return new LogWriter(fileNumber, writableFile);
+ }
- void delete()
- throws IOException;
+ /**
+ * Marks this writer as closed and then closes the underlying file.
+ * After this call {@link #addRecord} fails with {@link IllegalStateException}.
+ */
+ @Override
+ public void close()
+ throws IOException
+ {
+ closed.set(true);
+ writableFile.close();
- File getFile();
+ }
- long getFileNumber();
+ public long getFileNumber()
+ {
+ return fileNumber;
+ }
// Writes a stream of chunks such that no chunk is split across a block boundary
- void addRecord(Slice record, boolean force)
- throws IOException;
+ /**
+ * @param record payload to append; an empty record is still written as one zero-length chunk
+ * @param force when true, {@code writableFile.force()} is called after the record has been appended
+ * @throws IOException propagated from the underlying file
+ * @throws IllegalStateException if this writer has been closed
+ */
+ public void addRecord(Slice record, boolean force)
+ throws IOException
+ {
+ checkState(!closed.get(), "Log has been closed");
+
+ SliceInput sliceInput = record.input();
+
+ // used to track first, middle and last blocks
+ boolean begin = true;
+
+ // Fragment the record into chunks as necessary and write it. Note that if record
+ // is empty, we still want to iterate once to write a single
+ // zero-length chunk.
+ do {
+ int bytesRemainingInBlock = BLOCK_SIZE - blockOffset;
+ checkState(bytesRemainingInBlock >= 0);
+
+ // Switch to a new block if necessary
+ if (bytesRemainingInBlock < HEADER_SIZE) {
+ if (bytesRemainingInBlock > 0) {
+ // Fill the rest of the block with zeros
+ // todo lame... need a better way to write zeros
+ writableFile.append(new Slice(SA, 0, bytesRemainingInBlock));
+ }
+ blockOffset = 0;
+ bytesRemainingInBlock = BLOCK_SIZE - blockOffset;
+ }
+
+ // Invariant: we never leave less than HEADER_SIZE bytes available in a block
+ int bytesAvailableInBlock = bytesRemainingInBlock - HEADER_SIZE;
+ checkState(bytesAvailableInBlock >= 0);
+
+ // if there are more bytes in the record than there are available in the block,
+ // fragment the record; otherwise write to the end of the record
+ boolean end;
+ int fragmentLength;
+ if (sliceInput.available() > bytesAvailableInBlock) {
+ end = false;
+ fragmentLength = bytesAvailableInBlock;
+ }
+ else {
+ end = true;
+ fragmentLength = sliceInput.available();
+ }
+
+ // determine block type
+ LogChunkType type;
+ if (begin && end) {
+ type = LogChunkType.FULL;
+ }
+ else if (begin) {
+ type = LogChunkType.FIRST;
+ }
+ else if (end) {
+ type = LogChunkType.LAST;
+ }
+ else {
+ type = LogChunkType.MIDDLE;
+ }
+
+ // write the chunk
+ writeChunk(type, sliceInput.readBytes(fragmentLength));
+
+ // we are no longer on the first chunk
+ begin = false;
+ } while (sliceInput.isReadable());
+
+ if (force) {
+ writableFile.force();
+ }
+ }
+
+ /**
+ * Appends one chunk (header, then payload) and advances {@code blockOffset} by its total size.
+ */
+ private void writeChunk(LogChunkType type, Slice slice)
+ throws IOException
+ {
+ checkArgument(slice.length() <= 0xffff, "length %s is larger than two bytes", slice.length());
+ checkArgument(blockOffset + HEADER_SIZE <= BLOCK_SIZE);
+
+ // create header
+ Slice header = newLogRecordHeader(type, slice, slice.length());
+
+ // write the header and the payload
+ writableFile.append(header);
+ writableFile.append(slice);
+
+ blockOffset += HEADER_SIZE + slice.length();
+ }
+
+ /**
+ * Builds the chunk header: checksum (int), then the length as two bytes with the low byte first,
+ * then the persistent id of the chunk type.
+ */
+ private static Slice newLogRecordHeader(LogChunkType type, Slice slice, int length)
+ {
+ int crc = getChunkChecksum(type.getPersistentId(), slice.getRawArray(), slice.getRawOffset(), length);
+
+ // Format the header
+ Slice header = Slices.allocate(HEADER_SIZE);
+ SliceOutput sliceOutput = header.output();
+ sliceOutput.writeInt(crc);
+ sliceOutput.writeByte((byte) (length & 0xff));
+ sliceOutput.writeByte((byte) (length >>> 8));
+ sliceOutput.writeByte((byte) (type.getPersistentId()));
+
+ return header;
+ }
+
+ @Override
+ public String toString()
+ {
+ return "LogWriter{" +
+ "writableFile=" + writableFile +
+ ", fileNumber=" + fileNumber +
+ '}';
+ }
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Logs.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Logs.java
index 5bf617b8..77996a26 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Logs.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Logs.java
@@ -17,27 +17,25 @@
*/
package org.iq80.leveldb.impl;
+import org.iq80.leveldb.util.MMWritableFile;
import org.iq80.leveldb.util.PureJavaCrc32C;
import org.iq80.leveldb.util.Slice;
+import org.iq80.leveldb.util.UnbufferedWritableFile;
import java.io.File;
import java.io.IOException;
public final class Logs
{
+ private static final int PAGE_SIZE = 1024 * 1024;
+
private Logs()
{
}
- public static LogWriter createLogWriter(File file, long fileNumber)
- throws IOException
+ public static LogWriter createLogWriter(File file, long fileNumber, boolean allowMmapWrites) throws IOException
{
- if (Iq80DBFactory.USE_MMAP) {
- return new MMapLogWriter(file, fileNumber);
- }
- else {
- return new FileChannelLogWriter(file, fileNumber);
- }
+ return LogWriter.createWriter(fileNumber, allowMmapWrites ? MMWritableFile.open(file, PAGE_SIZE) : UnbufferedWritableFile.open(file));
}
public static int getChunkChecksum(int chunkTypeId, Slice slice)
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/MMapLogWriter.java b/leveldb/src/main/java/org/iq80/leveldb/impl/MMapLogWriter.java
deleted file mode 100755
index b42e3ce3..00000000
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/MMapLogWriter.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.iq80.leveldb.impl;
-
-import org.iq80.leveldb.util.ByteBufferSupport;
-import org.iq80.leveldb.util.Closeables;
-import org.iq80.leveldb.util.Slice;
-import org.iq80.leveldb.util.SliceInput;
-import org.iq80.leveldb.util.SliceOutput;
-import org.iq80.leveldb.util.Slices;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.FileChannel.MapMode;
-import java.util.concurrent.atomic.AtomicBoolean;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static com.google.common.base.Preconditions.checkState;
-import static java.util.Objects.requireNonNull;
-import static org.iq80.leveldb.impl.LogConstants.BLOCK_SIZE;
-import static org.iq80.leveldb.impl.LogConstants.HEADER_SIZE;
-import static org.iq80.leveldb.impl.Logs.getChunkChecksum;
-
-public class MMapLogWriter
- implements LogWriter
-{
- private static final int PAGE_SIZE = 1024 * 1024;
-
- private final File file;
- private final long fileNumber;
- private final FileChannel fileChannel;
- private final AtomicBoolean closed = new AtomicBoolean();
- private MappedByteBuffer mappedByteBuffer;
- private long fileOffset;
- /**
- * Current offset in the current block
- */
- private int blockOffset;
-
- public MMapLogWriter(File file, long fileNumber)
- throws IOException
- {
- requireNonNull(file, "file is null");
- checkArgument(fileNumber >= 0, "fileNumber is negative");
- this.file = file;
- this.fileNumber = fileNumber;
- this.fileChannel = new RandomAccessFile(file, "rw").getChannel();
- mappedByteBuffer = fileChannel.map(MapMode.READ_WRITE, 0, PAGE_SIZE);
- }
-
- @Override
- public boolean isClosed()
- {
- return closed.get();
- }
-
- @Override
- public synchronized void close()
- throws IOException
- {
- closed.set(true);
-
- destroyMappedByteBuffer();
-
- if (fileChannel.isOpen()) {
- fileChannel.truncate(fileOffset);
- }
-
- // close the channel
- Closeables.closeQuietly(fileChannel);
- }
-
- @Override
- public synchronized void delete()
- throws IOException
- {
- close();
-
- // try to delete the file
- file.delete();
- }
-
- private void destroyMappedByteBuffer()
- {
- if (mappedByteBuffer != null) {
- fileOffset += mappedByteBuffer.position();
- unmap();
- }
- mappedByteBuffer = null;
- }
-
- @Override
- public File getFile()
- {
- return file;
- }
-
- @Override
- public long getFileNumber()
- {
- return fileNumber;
- }
-
- // Writes a stream of chunks such that no chunk is split across a block boundary
- @Override
- public synchronized void addRecord(Slice record, boolean force)
- throws IOException
- {
- checkState(!closed.get(), "Log has been closed");
-
- SliceInput sliceInput = record.input();
-
- // used to track first, middle and last blocks
- boolean begin = true;
-
- // Fragment the record int chunks as necessary and write it. Note that if record
- // is empty, we still want to iterate once to write a single
- // zero-length chunk.
- do {
- int bytesRemainingInBlock = BLOCK_SIZE - blockOffset;
- checkState(bytesRemainingInBlock >= 0);
-
- // Switch to a new block if necessary
- if (bytesRemainingInBlock < HEADER_SIZE) {
- if (bytesRemainingInBlock > 0) {
- // Fill the rest of the block with zeros
- // todo lame... need a better way to write zeros
- ensureCapacity(bytesRemainingInBlock);
- mappedByteBuffer.put(new byte[bytesRemainingInBlock]);
- }
- blockOffset = 0;
- bytesRemainingInBlock = BLOCK_SIZE - blockOffset;
- }
-
- // Invariant: we never leave less than HEADER_SIZE bytes available in a block
- int bytesAvailableInBlock = bytesRemainingInBlock - HEADER_SIZE;
- checkState(bytesAvailableInBlock >= 0);
-
- // if there are more bytes in the record then there are available in the block,
- // fragment the record; otherwise write to the end of the record
- boolean end;
- int fragmentLength;
- if (sliceInput.available() > bytesAvailableInBlock) {
- end = false;
- fragmentLength = bytesAvailableInBlock;
- }
- else {
- end = true;
- fragmentLength = sliceInput.available();
- }
-
- // determine block type
- LogChunkType type;
- if (begin && end) {
- type = LogChunkType.FULL;
- }
- else if (begin) {
- type = LogChunkType.FIRST;
- }
- else if (end) {
- type = LogChunkType.LAST;
- }
- else {
- type = LogChunkType.MIDDLE;
- }
-
- // write the chunk
- writeChunk(type, sliceInput.readBytes(fragmentLength));
-
- // we are no longer on the first chunk
- begin = false;
- } while (sliceInput.isReadable());
-
- if (force) {
- mappedByteBuffer.force();
- }
- }
-
- private void writeChunk(LogChunkType type, Slice slice)
- throws IOException
- {
- checkArgument(slice.length() <= 0xffff, "length %s is larger than two bytes", slice.length());
- checkArgument(blockOffset + HEADER_SIZE <= BLOCK_SIZE);
-
- // create header
- Slice header = newLogRecordHeader(type, slice);
-
- // write the header and the payload
- ensureCapacity(header.length() + slice.length());
- header.getBytes(0, mappedByteBuffer);
- slice.getBytes(0, mappedByteBuffer);
-
- blockOffset += HEADER_SIZE + slice.length();
- }
-
- private void ensureCapacity(int bytes)
- throws IOException
- {
- if (mappedByteBuffer.remaining() < bytes) {
- // remap
- fileOffset += mappedByteBuffer.position();
- unmap();
-
- mappedByteBuffer = fileChannel.map(MapMode.READ_WRITE, fileOffset, PAGE_SIZE);
- }
- }
-
- private void unmap()
- {
- ByteBufferSupport.unmap(mappedByteBuffer);
- }
-
- private static Slice newLogRecordHeader(LogChunkType type, Slice slice)
- {
- int crc = getChunkChecksum(type.getPersistentId(), slice.getRawArray(), slice.getRawOffset(), slice.length());
-
- // Format the header
- Slice header = Slices.allocate(HEADER_SIZE);
- SliceOutput sliceOutput = header.output();
- sliceOutput.writeInt(crc);
- sliceOutput.writeByte((byte) (slice.length() & 0xff));
- sliceOutput.writeByte((byte) (slice.length() >>> 8));
- sliceOutput.writeByte((byte) (type.getPersistentId()));
-
- return header;
- }
-}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/SeekingIterator.java b/leveldb/src/main/java/org/iq80/leveldb/impl/SeekingIterator.java
index 2d247bf2..9d60ad28 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/SeekingIterator.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/SeekingIterator.java
@@ -30,7 +30,7 @@ public interface SeekingIterator
void seekToFirst();
/**
- * Repositions the iterator so the key of the next BlockElement returned greater than or equal to the specified targetKey.
+ * Repositions the iterator so the key of the next element returned is greater than or equal to the specified targetKey.
*/
void seek(K targetKey);
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotImpl.java b/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotImpl.java
deleted file mode 100644
index 8e6fb368..00000000
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotImpl.java
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.iq80.leveldb.impl;
-
-import org.iq80.leveldb.Snapshot;
-
-import java.util.concurrent.atomic.AtomicBoolean;
-
-public class SnapshotImpl
- implements Snapshot
-{
- private final AtomicBoolean closed = new AtomicBoolean();
- private final Version version;
- private final long lastSequence;
-
- SnapshotImpl(Version version, long lastSequence)
- {
- this.version = version;
- this.lastSequence = lastSequence;
- this.version.retain();
- }
-
- @Override
- public void close()
- {
- // This is an end user API.. he might screw up and close multiple times.
- // but we don't want the version reference count going bad.
- if (closed.compareAndSet(false, true)) {
- this.version.release();
- }
- }
-
- public long getLastSequence()
- {
- return lastSequence;
- }
-
- public Version getVersion()
- {
- return version;
- }
-
- @Override
- public String toString()
- {
- return Long.toString(lastSequence);
- }
-
- @Override
- public boolean equals(Object o)
- {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- SnapshotImpl snapshot = (SnapshotImpl) o;
-
- if (lastSequence != snapshot.lastSequence) {
- return false;
- }
- if (!version.equals(snapshot.version)) {
- return false;
- }
-
- return true;
- }
-
- @Override
- public int hashCode()
- {
- int result = version.hashCode();
- result = 31 * result + (int) (lastSequence ^ (lastSequence >>> 32));
- return result;
- }
-}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotList.java b/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotList.java
new file mode 100644
index 00000000..8fd89ca9
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotList.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.impl;
+
+import org.iq80.leveldb.Snapshot;
+
+import java.util.concurrent.locks.ReentrantLock;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+
+/**
+ * Snapshots are kept in a doubly-linked list in the DB.
+ * Each Snapshot corresponds to a particular sequence number.
+ */
+final class SnapshotList
+{
+ private final ReentrantLock mutex;
+ private final SnapshotNode list;
+
+ /**
+ * Snapshot list where all operations are protected by {@code mutex}.
+ * All {@code mutex} acquisition must be done externally to ensure sequence order.
+ *
+ * @param mutex protect concurrent read/write to this list
+ */
+ public SnapshotList(ReentrantLock mutex)
+ {
+ this.mutex = mutex;
+ this.list = new SnapshotNode(0);
+ this.list.next = this.list;
+ this.list.prev = this.list;
+ }
+
+ /**
+ * Track a new snapshot for {@code sequence}.
+ *
+ * @param sequence most recent version sequence available
+ * @return a new tracked snapshot for {@code sequence}
+ * @throws IllegalStateException if mutex is not held by current thread
+ */
+ public Snapshot newSnapshot(long sequence)
+ {
+ checkState(mutex.isHeldByCurrentThread());
+ SnapshotNode s = new SnapshotNode(sequence);
+ s.next = this.list;
+ s.prev = list.prev;
+ s.prev.next = s;
+ s.next.prev = s;
+ return s;
+ }
+
+ /**
+ * Return {@code true} if list is empty
+ *
+ * @return Return {@code true} if list is empty
+ * @throws IllegalStateException if mutex is not held by current thread
+ */
+ public boolean isEmpty()
+ {
+ checkState(mutex.isHeldByCurrentThread());
+ return list.next == list;
+ }
+
+ /**
+ * Return oldest sequence number of this list
+ *
+ * @return oldest sequence number
+ * @throws IllegalStateException if mutex is not held by current thread or list is empty
+ */
+ public long getOldest()
+ {
+ checkState(mutex.isHeldByCurrentThread());
+ checkState(!isEmpty());
+ return list.next.number;
+ }
+
+ /**
+ * Return sequence corresponding to given snapshot.
+ *
+ * @param snapshot snapshot to read from
+ * @return Return sequence corresponding to given snapshot.
+ * @throws IllegalArgumentException if snapshot concrete type does not come from current list
+ * @throws IllegalStateException if mutex is not held by current thread
+ */
+ public long getSequenceFrom(Snapshot snapshot)
+ {
+ checkArgument(snapshot instanceof SnapshotNode);
+ checkState(mutex.isHeldByCurrentThread());
+ return ((SnapshotNode) snapshot).number;
+ }
+
+ private final class SnapshotNode implements Snapshot
+ {
+ private final long number;
+ private SnapshotNode next;
+ private SnapshotNode prev;
+
+ private SnapshotNode(long number)
+ {
+ this.number = number;
+ }
+
+ @Override
+ public void close()
+ {
+ mutex.lock();
+ try {
+ this.prev.next = this.next;
+ this.next.prev = this.prev;
+ }
+ finally {
+ mutex.unlock();
+ }
+ }
+ }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotSeekingIterator.java b/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotSeekingIterator.java
index ba4649d7..65b51e10 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotSeekingIterator.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/SnapshotSeekingIterator.java
@@ -26,83 +26,106 @@
import java.util.Map.Entry;
public final class SnapshotSeekingIterator
- extends AbstractSeekingIterator
+ extends AbstractSeekingIterator implements AutoCloseable
{
private final DbIterator iterator;
- private final SnapshotImpl snapshot;
+ private final long sequence;
private final Comparator userComparator;
- public SnapshotSeekingIterator(DbIterator iterator, SnapshotImpl snapshot, Comparator userComparator)
+ public SnapshotSeekingIterator(DbIterator iterator, long sequence, Comparator userComparator)
{
this.iterator = iterator;
- this.snapshot = snapshot;
+ this.sequence = sequence;
this.userComparator = userComparator;
- this.snapshot.getVersion().retain();
}
+ @Override
public void close()
{
- this.snapshot.getVersion().release();
+ next = null;
+ iterator.close();
}
@Override
protected void seekToFirstInternal()
{
+ next = null;
iterator.seekToFirst();
- findNextUserEntry(null);
+ findNextUserEntry();
}
@Override
protected void seekInternal(Slice targetKey)
{
- iterator.seek(new InternalKey(targetKey, snapshot.getLastSequence(), ValueType.VALUE));
- findNextUserEntry(null);
+ next = null;
+ iterator.seek(new InternalKey(targetKey, sequence, ValueType.VALUE));
+ findNextUserEntry();
}
@Override
protected Entry getNextElement()
{
- if (!iterator.hasNext()) {
+ if (this.next == null && !iterator.hasNext()) {
return null;
}
-
- Entry next = iterator.next();
-
// find the next user entry after the key we are about to return
- findNextUserEntry(next.getKey().getUserKey());
-
- return Maps.immutableEntry(next.getKey().getUserKey(), next.getValue());
+ findNextUserEntry();
+ if (next != null) {
+ Entry next = this.next;
+ this.next = null;
+ return Maps.immutableEntry(next.getKey().getUserKey(), next.getValue());
+ }
+ return null;
}
- private void findNextUserEntry(Slice deletedKey)
+ Entry next;
+
+ private void findNextUserEntry()
{
+ if (next != null) {
+ return;
+ }
// if there are no more entries, we are done
if (!iterator.hasNext()) {
return;
}
-
- do {
- // Peek the next entry and parse the key
- InternalKey internalKey = iterator.peek().getKey();
-
+ //todo optimize algorithm. we should not do early load when called from #seekX(y)
+ while (iterator.hasNext()) {
+ Entry next = iterator.next();
+ InternalKey key = next.getKey();
// skip entries created after our snapshot
- if (internalKey.getSequenceNumber() > snapshot.getLastSequence()) {
- iterator.next();
+ if (key.getSequenceNumber() > sequence) {
continue;
}
-
- // if the next entry is a deletion, skip all subsequent entries for that key
- if (internalKey.getValueType() == ValueType.DELETION) {
- deletedKey = internalKey.getUserKey();
+ if (key.getValueType() == ValueType.DELETION) {
+ while (iterator.hasNext()) {
+ Entry peek = iterator.peek();
+ if (peek.getKey().getValueType() == ValueType.DELETION) {
+ break; //handled by next loop
+ }
+ else if (peek.getKey().getValueType() == ValueType.VALUE && userComparator.compare(key.getUserKey(), peek.getKey().getUserKey()) == 0) {
+ iterator.next(); // Entry hidden
+ }
+ else {
+ break; //different key
+ }
+ }
}
- else if (internalKey.getValueType() == ValueType.VALUE) {
- // is this value masked by a prior deletion record?
- if (deletedKey == null || userComparator.compare(internalKey.getUserKey(), deletedKey) > 0) {
- return;
+ else if (key.getValueType() == ValueType.VALUE) {
+ while (iterator.hasNext()) {
+ Entry peek = iterator.peek();
+ if (peek.getKey().getValueType() == ValueType.VALUE && userComparator.compare(key.getUserKey(), peek.getKey().getUserKey()) == 0) {
+ iterator.next(); // Entry hidden
+ }
+ else {
+ this.next = next;
+ return;
+ }
}
+ this.next = next;
+ return;
}
- iterator.next();
- } while (iterator.hasNext());
+ }
}
@Override
@@ -110,7 +133,7 @@ public String toString()
{
final StringBuilder sb = new StringBuilder();
sb.append("SnapshotSeekingIterator");
- sb.append("{snapshot=").append(snapshot);
+ sb.append("{sequence=").append(sequence);
sb.append(", iterator=").append(iterator);
sb.append('}');
return sb.toString();
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/TableCache.java b/leveldb/src/main/java/org/iq80/leveldb/impl/TableCache.java
index 34b5055e..f25ff407 100755
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/TableCache.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/TableCache.java
@@ -22,18 +22,24 @@
import com.google.common.cache.LoadingCache;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
-import org.iq80.leveldb.table.FileChannelTable;
-import org.iq80.leveldb.table.MMapTable;
+import org.iq80.leveldb.Options;
+import org.iq80.leveldb.table.BlockHandle;
+import org.iq80.leveldb.table.BlockHandleSliceWeigher;
+import org.iq80.leveldb.table.FilterPolicy;
+import org.iq80.leveldb.table.KeyValueFunction;
import org.iq80.leveldb.table.Table;
import org.iq80.leveldb.table.UserComparator;
+import org.iq80.leveldb.util.Closeables;
+import org.iq80.leveldb.util.UnbufferedRandomInputFile;
import org.iq80.leveldb.util.Finalizer;
import org.iq80.leveldb.util.InternalTableIterator;
+import org.iq80.leveldb.util.LRUCache;
+import org.iq80.leveldb.util.MMRandomInputFile;
+import org.iq80.leveldb.util.RandomInputFile;
import org.iq80.leveldb.util.Slice;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.nio.channels.FileChannel;
import java.util.concurrent.ExecutionException;
import static java.util.Objects.requireNonNull;
@@ -42,11 +48,15 @@ public class TableCache
{
private final LoadingCache cache;
private final Finalizer
finalizer = new Finalizer<>(1);
+ private final LRUCache blockCache;
- public TableCache(final File databaseDir, int tableCacheSize, final UserComparator userComparator, final boolean verifyChecksums)
+ public TableCache(final File databaseDir,
+ int tableCacheSize,
+ final UserComparator userComparator,
+ final Options options)
{
requireNonNull(databaseDir, "databaseName is null");
-
+ blockCache = new LRUCache<>(options.cacheSize() > 0 ? (int) options.cacheSize() : 8 << 20, new BlockHandleSliceWeigher()); //TODO add possibility to disable cache?
cache = CacheBuilder.newBuilder()
.maximumSize(tableCacheSize)
.removalListener(new RemovalListener()
@@ -54,8 +64,11 @@ public TableCache(final File databaseDir, int tableCacheSize, final UserComparat
@Override
public void onRemoval(RemovalNotification notification)
{
- Table table = notification.getValue().getTable();
- finalizer.addCleanup(table, table.closer());
+ final TableAndFile value = notification.getValue();
+ if (value != null) {
+ final Table table = value.getTable();
+ finalizer.addCleanup(table, table.closer());
+ }
}
})
.build(new CacheLoader()
@@ -64,7 +77,7 @@ public void onRemoval(RemovalNotification notification)
public TableAndFile load(Long fileNumber)
throws IOException
{
- return new TableAndFile(databaseDir, fileNumber, userComparator, verifyChecksums);
+ return new TableAndFile(databaseDir, fileNumber, userComparator, options, blockCache);
}
});
}
@@ -79,6 +92,13 @@ public InternalTableIterator newIterator(long number)
return new InternalTableIterator(getTable(number).iterator());
}
+ public T get(Slice key, FileMetaData fileMetaData, KeyValueFunction resultBuilder)
+ {
+ final Table table = getTable(fileMetaData.getNumber());
+ return table.internalGet(key, resultBuilder);
+
+ }
+
public long getApproximateOffsetOf(FileMetaData file, Slice key)
{
return getTable(file.getNumber()).getApproximateOffsetOf(key);
@@ -115,20 +135,41 @@ private static final class TableAndFile
{
private final Table table;
- private TableAndFile(File databaseDir, long fileNumber, UserComparator userComparator, boolean verifyChecksums)
+ private TableAndFile(File databaseDir, long fileNumber, UserComparator userComparator, Options options, LRUCache blockCache)
throws IOException
{
- String tableFileName = Filename.tableFileName(fileNumber);
- File tableFile = new File(databaseDir, tableFileName);
- try (FileInputStream fis = new FileInputStream(tableFile);
- FileChannel fileChannel = fis.getChannel()) {
- if (Iq80DBFactory.USE_MMAP) {
- table = new MMapTable(tableFile.getAbsolutePath(), fileChannel, userComparator, verifyChecksums);
+ final File tableFile = tableFileName(databaseDir, fileNumber);
+ RandomInputFile source = null;
+ try {
+ if (options.allowMmapReads()) {
+ source = MMRandomInputFile.open(tableFile);
}
else {
- table = new FileChannelTable(tableFile.getAbsolutePath(), fileChannel, userComparator, verifyChecksums);
+ source = UnbufferedRandomInputFile.open(tableFile);
+ }
+ final FilterPolicy filterPolicy = (FilterPolicy) options.filterPolicy();
+ table = new Table(source, userComparator,
+ options.verifyChecksums(), blockCache, filterPolicy);
+ }
+ catch (IOException e) {
+ Closeables.closeQuietly(source);
+ throw e;
+ }
+ }
+
+ private File tableFileName(File databaseDir, long fileNumber)
+ {
+ final String tableFileName = Filename.tableFileName(fileNumber);
+ File tableFile = new File(databaseDir, tableFileName);
+ if (!tableFile.canRead()) {
+ // attempt to open older .sst extension
+ final String sstFileName = Filename.sstTableFileName(fileNumber);
+ final File sstPath = new File(databaseDir, sstFileName);
+ if (sstPath.canRead()) {
+ tableFile = sstPath;
}
}
+ return tableFile;
}
public Table getTable()
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/ValueHolder.java b/leveldb/src/main/java/org/iq80/leveldb/impl/ValueHolder.java
new file mode 100644
index 00000000..38299f93
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/ValueHolder.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.iq80.leveldb.impl;
+
+import java.util.Objects;
+
+/**
+ * Mutable holder for a single value, used to emulate C++-style reference (in/out) parameters.
+ */
+final class ValueHolder
+{
+ private V value;
+
+ public ValueHolder(V value)
+ {
+ this.value = value;
+ }
+
+ /**
+ * Setter for property 'value'.
+ *
+ * @param value Value to set for property 'value'.
+ */
+ public void setValue(V value)
+ {
+ this.value = value;
+ }
+
+ /**
+ * Getter for property 'value'.
+ *
+ * @return Value for property 'value'.
+ */
+ public V getValue()
+ {
+ return value;
+ }
+
+ @Override
+ public boolean equals(Object o)
+ {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ ValueHolder> that = (ValueHolder>) o;
+ return Objects.equals(value, that.value);
+ }
+
+ @Override
+ public int hashCode()
+ {
+ return Objects.hash(value);
+ }
+
+ @Override
+ public String toString()
+ {
+ return "ValueHolder{value=" + value + '}';
+ }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/Version.java b/leveldb/src/main/java/org/iq80/leveldb/impl/Version.java
index 818542c5..10c167ac 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/Version.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/Version.java
@@ -22,8 +22,6 @@
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.Multimap;
import org.iq80.leveldb.util.InternalIterator;
-import org.iq80.leveldb.util.InternalTableIterator;
-import org.iq80.leveldb.util.LevelIterator;
import org.iq80.leveldb.util.MergingIterator;
import org.iq80.leveldb.util.Slice;
@@ -35,7 +33,6 @@
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkPositionIndex;
import static com.google.common.collect.Ordering.natural;
-import static java.util.Objects.requireNonNull;
import static org.iq80.leveldb.impl.DbConstants.MAX_MEM_COMPACT_LEVEL;
import static org.iq80.leveldb.impl.DbConstants.NUM_LEVELS;
import static org.iq80.leveldb.impl.SequenceNumber.MAX_SEQUENCE_NUMBER;
@@ -47,7 +44,6 @@ public class Version
{
private final AtomicInteger retained = new AtomicInteger(1);
private final VersionSet versionSet;
- private final Level0 level0;
private final List levels;
// move these mutable fields somewhere else
@@ -60,11 +56,8 @@ public Version(VersionSet versionSet)
{
this.versionSet = versionSet;
checkArgument(NUM_LEVELS > 1, "levels must be at least 2");
-
- this.level0 = new Level0(new ArrayList(), getTableCache(), getInternalKeyComparator());
-
Builder builder = ImmutableList.builder();
- for (int i = 1; i < NUM_LEVELS; i++) {
+ for (int i = 0; i < NUM_LEVELS; i++) {
List files = new ArrayList<>();
builder.add(new Level(i, files, getTableCache(), getInternalKeyComparator()));
}
@@ -72,17 +65,10 @@ public Version(VersionSet versionSet)
}
- public void assertNoOverlappingFiles()
- {
- for (int level = 1; level < NUM_LEVELS; level++) {
- assertNoOverlappingFiles(level);
- }
- }
-
public void assertNoOverlappingFiles(int level)
{
if (level > 0) {
- Collection files = getFiles().asMap().get(level);
+ Collection files = getFiles(level);
if (files != null) {
long previousFileNumber = 0;
InternalKey previousEnd = null;
@@ -111,22 +97,22 @@ public final InternalKeyComparator getInternalKeyComparator()
return versionSet.getInternalKeyComparator();
}
- public synchronized int getCompactionLevel()
+ public int getCompactionLevel()
{
return compactionLevel;
}
- public synchronized void setCompactionLevel(int compactionLevel)
+ public void setCompactionLevel(int compactionLevel)
{
this.compactionLevel = compactionLevel;
}
- public synchronized double getCompactionScore()
+ public double getCompactionScore()
{
return compactionScore;
}
- public synchronized void setCompactionScore(double compactionScore)
+ public void setCompactionScore(double compactionScore)
{
this.compactionScore = compactionScore;
}
@@ -134,24 +120,13 @@ public synchronized void setCompactionScore(double compactionScore)
@Override
public MergingIterator iterator()
{
- Builder builder = ImmutableList.builder();
- builder.add(level0.iterator());
- builder.addAll(getLevelIterators());
- return new MergingIterator(builder.build(), getInternalKeyComparator());
- }
-
- List getLevel0Files()
- {
- Builder builder = ImmutableList.builder();
- for (FileMetaData file : level0.getFiles()) {
- builder.add(getTableCache().newIterator(file));
- }
- return builder.build();
+ ImmutableList it = ImmutableList.copyOf(getLevelIterators());
+ return new MergingIterator(it, getInternalKeyComparator());
}
- List getLevelIterators()
+ List getLevelIterators()
{
- Builder builder = ImmutableList.builder();
+ Builder builder = ImmutableList.builder();
for (Level level : levels) {
if (!level.getFiles().isEmpty()) {
builder.add(level.iterator());
@@ -160,22 +135,18 @@ List getLevelIterators()
return builder.build();
}
- public LookupResult get(LookupKey key)
+ public LookupResult get(LookupKey key, ReadStats readStats)
{
// We can search level-by-level since entries never hop across
// levels. Therefore we are guaranteed that if we find data
// in an smaller level, later levels are irrelevant.
- ReadStats readStats = new ReadStats();
- LookupResult lookupResult = level0.get(key, readStats);
- if (lookupResult == null) {
- for (Level level : levels) {
- lookupResult = level.get(key, readStats);
- if (lookupResult != null) {
- break;
- }
+ LookupResult lookupResult = null;
+ for (Level level : levels) {
+ lookupResult = level.get(key, readStats);
+ if (lookupResult != null) {
+ break;
}
}
- updateStats(readStats.getSeekFileLevel(), readStats.getSeekFile());
return lookupResult;
}
@@ -186,7 +157,7 @@ int pickLevelForMemTableOutput(Slice smallestUserKey, Slice largestUserKey)
// Push to next level if there is no overlap in next level,
// and the #bytes overlapping in the level after that are limited.
InternalKey start = new InternalKey(smallestUserKey, MAX_SEQUENCE_NUMBER, ValueType.VALUE);
- InternalKey limit = new InternalKey(largestUserKey, 0, ValueType.VALUE);
+ InternalKey limit = new InternalKey(largestUserKey, 0, ValueType.DELETION);
while (level < MAX_MEM_COMPACT_LEVEL) {
if (overlapInLevel(level + 1, smallestUserKey, largestUserKey)) {
break;
@@ -204,37 +175,23 @@ int pickLevelForMemTableOutput(Slice smallestUserKey, Slice largestUserKey)
public boolean overlapInLevel(int level, Slice smallestUserKey, Slice largestUserKey)
{
checkPositionIndex(level, levels.size(), "Invalid level");
- requireNonNull(smallestUserKey, "smallestUserKey is null");
- requireNonNull(largestUserKey, "largestUserKey is null");
-
- if (level == 0) {
- return level0.someFileOverlapsRange(smallestUserKey, largestUserKey);
- }
- return levels.get(level - 1).someFileOverlapsRange(smallestUserKey, largestUserKey);
+ return levels.get(level).someFileOverlapsRange(level > 0, smallestUserKey, largestUserKey);
}
public int numberOfLevels()
{
- return levels.size() + 1;
+ return levels.size();
}
public int numberOfFilesInLevel(int level)
{
- if (level == 0) {
- return level0.getFiles().size();
- }
- else {
- return levels.get(level - 1).getFiles().size();
- }
+ return getFiles(level).size();
}
public Multimap getFiles()
{
ImmutableMultimap.Builder builder = ImmutableMultimap.builder();
builder = builder.orderKeysBy(natural());
-
- builder.putAll(0, level0.getFiles());
-
for (Level level : levels) {
builder.putAll(level.getLevelNumber(), level.getFiles());
}
@@ -243,26 +200,19 @@ public Multimap getFiles()
public List getFiles(int level)
{
- if (level == 0) {
- return level0.getFiles();
- }
- else {
- return levels.get(level - 1).getFiles();
- }
+ return levels.get(level).getFiles();
}
public void addFile(int level, FileMetaData fileMetaData)
{
- if (level == 0) {
- level0.addFile(fileMetaData);
- }
- else {
- levels.get(level - 1).addFile(fileMetaData);
- }
+ levels.get(level).addFile(fileMetaData);
}
- private boolean updateStats(int seekFileLevel, FileMetaData seekFile)
+ public boolean updateStats(ReadStats readStats)
{
+ final int seekFileLevel = readStats.getSeekFileLevel();
+ final FileMetaData seekFile = readStats.getSeekFile();
+
if (seekFile == null) {
return false;
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/VersionSet.java b/leveldb/src/main/java/org/iq80/leveldb/impl/VersionSet.java
old mode 100755
new mode 100644
index f554d82d..90ab3e77
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/VersionSet.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/VersionSet.java
@@ -28,12 +28,12 @@
import org.iq80.leveldb.util.InternalIterator;
import org.iq80.leveldb.util.Level0Iterator;
import org.iq80.leveldb.util.MergingIterator;
+import org.iq80.leveldb.util.SequentialFile;
+import org.iq80.leveldb.util.SequentialFileImpl;
import org.iq80.leveldb.util.Slice;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -47,6 +47,7 @@
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.ReentrantLock;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
@@ -77,16 +78,18 @@ public class VersionSet
private final File databaseDir;
private final TableCache tableCache;
private final InternalKeyComparator internalKeyComparator;
+ private final boolean allowMmapWrites;
private LogWriter descriptorLog;
private final Map compactPointers = new TreeMap<>();
- public VersionSet(File databaseDir, TableCache tableCache, InternalKeyComparator internalKeyComparator)
+ public VersionSet(File databaseDir, TableCache tableCache, InternalKeyComparator internalKeyComparator, boolean allowMmapWrites)
throws IOException
{
this.databaseDir = databaseDir;
this.tableCache = tableCache;
this.internalKeyComparator = internalKeyComparator;
+ this.allowMmapWrites = allowMmapWrites;
appendVersion(new Version(this));
initializeIfNeeded();
@@ -104,7 +107,7 @@ private void initializeIfNeeded()
edit.setNextFileNumber(nextFileNumber.get());
edit.setLastSequenceNumber(lastSequence);
- LogWriter log = Logs.createLogWriter(new File(databaseDir, Filename.descriptorFileName(manifestFileNumber)), manifestFileNumber);
+ LogWriter log = Logs.createLogWriter(new File(databaseDir, Filename.descriptorFileName(manifestFileNumber)), manifestFileNumber, allowMmapWrites);
try {
writeSnapshot(log);
log.addRecord(edit.encode(), false);
@@ -141,7 +144,7 @@ private void appendVersion(Version version)
requireNonNull(version, "version is null");
checkArgument(version != current, "version is the current version");
Version previous = current;
- current = version;
+ current = version; //version already retained, create with retained = 1
activeVersions.put(version, new Object());
if (previous != null) {
previous.release();
@@ -204,25 +207,20 @@ public MergingIterator makeInputIterator(Compaction c)
// TODO(opt): use concatenating iterator for level-0 if there is no overlap
List list = new ArrayList<>();
for (int which = 0; which < 2; which++) {
- if (!c.getInputs()[which].isEmpty()) {
+ List files = c.input(which);
+ if (!files.isEmpty()) {
if (c.getLevel() + which == 0) {
- List files = c.getInputs()[which];
list.add(new Level0Iterator(tableCache, files, internalKeyComparator));
}
else {
// Create concatenating iterator for the files from this level
- list.add(Level.createLevelConcatIterator(tableCache, c.getInputs()[which], internalKeyComparator));
+ list.add(Level.createLevelConcatIterator(tableCache, files, internalKeyComparator));
}
}
}
return new MergingIterator(list, internalKeyComparator);
}
- public LookupResult get(LookupKey key)
- {
- return current.get(key);
- }
-
public boolean overlapInLevel(int level, Slice smallestUserKey, Slice largestUserKey)
{
return current.overlapInLevel(level, smallestUserKey, largestUserKey);
@@ -249,7 +247,7 @@ public void setLastSequence(long newLastSequence)
this.lastSequence = newLastSequence;
}
- public void logAndApply(VersionEdit edit)
+ public void logAndApply(VersionEdit edit, ReentrantLock mutex)
throws IOException
{
if (edit.getLogNumber() != null) {
@@ -268,31 +266,38 @@ public void logAndApply(VersionEdit edit)
edit.setLastSequenceNumber(lastSequence);
Version version = new Version(this);
- Builder builder = new Builder(this, current);
- builder.apply(edit);
- builder.saveTo(version);
+ try (Builder builder = new Builder(this, current)) {
+ builder.apply(edit);
+ builder.saveTo(version);
+ }
finalizeVersion(version);
boolean createdNewManifest = false;
+ final long mFileNumber = manifestFileNumber;
try {
// Initialize new descriptor log file if necessary by creating
// a temporary file that contains a snapshot of the current version.
if (descriptorLog == null) {
edit.setNextFileNumber(nextFileNumber.get());
- descriptorLog = Logs.createLogWriter(new File(databaseDir, Filename.descriptorFileName(manifestFileNumber)), manifestFileNumber);
+ descriptorLog = Logs.createLogWriter(new File(databaseDir, Filename.descriptorFileName(mFileNumber)), mFileNumber, allowMmapWrites);
writeSnapshot(descriptorLog);
createdNewManifest = true;
}
-
- // Write new record to MANIFEST log
- Slice record = edit.encode();
- descriptorLog.addRecord(record, true);
-
- // If we just created a new descriptor file, install it by writing a
- // new CURRENT file that points to it.
- if (createdNewManifest) {
- Filename.setCurrentFile(databaseDir, descriptorLog.getFileNumber());
+ mutex.unlock();
+ try {
+ // Write new record to MANIFEST log
+ Slice record = edit.encode();
+ descriptorLog.addRecord(record, true);
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (createdNewManifest) {
+ Filename.setCurrentFile(databaseDir, mFileNumber);
+ }
+ }
+ finally {
+ mutex.lock();
}
}
catch (IOException e) {
@@ -300,7 +305,7 @@ public void logAndApply(VersionEdit edit)
if (createdNewManifest) {
descriptorLog.close();
// todo add delete method to LogWriter
- new File(databaseDir, Filename.logFileName(descriptorLog.getFileNumber())).delete();
+ new File(databaseDir, Filename.logFileName(mFileNumber)).delete();
descriptorLog = null;
}
throw e;
@@ -343,8 +348,7 @@ public void recover()
currentName = currentName.substring(0, currentName.length() - 1);
// open file channel
- try (FileInputStream fis = new FileInputStream(new File(databaseDir, currentName));
- FileChannel fileChannel = fis.getChannel()) {
+ try (SequentialFile in = SequentialFileImpl.open(new File(databaseDir, currentName))) {
// read log edit log
Long nextFileNumber = null;
Long lastSequence = null;
@@ -352,7 +356,7 @@ public void recover()
Long prevLogNumber = null;
Builder builder = new Builder(this, current);
- LogReader reader = new LogReader(fileChannel, throwExceptionMonitor(), true, 0);
+ LogReader reader = new LogReader(in, throwExceptionMonitor(), true, 0);
for (Slice record = reader.readRecord(); record != null; record = reader.readRecord()) {
// read version edit
VersionEdit edit = new VersionEdit(record);
@@ -394,6 +398,7 @@ public void recover()
Version newVersion = new Version(this);
builder.saveTo(newVersion);
+ builder.close();
// Install recovered version
finalizeVersion(newVersion);
@@ -622,20 +627,40 @@ private Compaction setupOtherInputs(int level, List levelInputs)
List getOverlappingInputs(int level, InternalKey begin, InternalKey end)
{
- ImmutableList.Builder files = ImmutableList.builder();
- Slice userBegin = begin.getUserKey();
- Slice userEnd = end.getUserKey();
+ List inputs = new ArrayList<>();
+ Slice userBegin = begin == null ? null : begin.getUserKey();
+ Slice userEnd = end == null ? null : end.getUserKey();
UserComparator userComparator = internalKeyComparator.getUserComparator();
- for (FileMetaData fileMetaData : current.getFiles(level)) {
- if (userComparator.compare(fileMetaData.getLargest().getUserKey(), userBegin) < 0 ||
- userComparator.compare(fileMetaData.getSmallest().getUserKey(), userEnd) > 0) {
- // Either completely before or after range; skip it
+ List filesInLevel = current.getFiles(level);
+ for (int i = 0; i < filesInLevel.size(); i++) {
+ FileMetaData fileMetaData = filesInLevel.get(i);
+ Slice fileStart = fileMetaData.getSmallest().getUserKey();
+ Slice fileLimit = fileMetaData.getLargest().getUserKey();
+ if (begin != null && userComparator.compare(fileLimit, userBegin) < 0) {
+ // fileMetaData is completely before the specified range (a null begin means unbounded); skip it
+ }
+ else if (end != null && userComparator.compare(fileStart, userEnd) > 0) {
+ // fileMetaData is completely after the specified range (a null end means unbounded); skip it
}
else {
- files.add(fileMetaData);
+ inputs.add(fileMetaData);
+ if (level == 0) {
+ // Level-0 files may overlap each other. So check if the newly
+ // added file has expanded the range. If so, restart search.
+ if (begin != null && userComparator.compare(fileStart, userBegin) < 0) {
+ userBegin = fileStart;
+ inputs.clear();
+ i = -1; // the loop increment brings i back to 0, rescanning from the first file
+ }
+ else if (end != null && userComparator.compare(fileLimit, userEnd) > 0) {
+ userEnd = fileLimit;
+ inputs.clear();
+ i = -1; // the loop increment brings i back to 0, rescanning from the first file
+ }
+ }
}
}
- return files.build();
+ return inputs;
}
private Entry getRange(List... inputLists)
@@ -682,7 +707,7 @@ public long getMaxNextLevelOverlappingBytes()
* of edits to a particular state without creating intermediate
* Versions that contain full copies of the intermediate state.
*/
- private static class Builder
+ private static class Builder implements AutoCloseable
{
private final VersionSet versionSet;
private final Version baseVersion;
@@ -692,6 +717,7 @@ private Builder(VersionSet versionSet, Version baseVersion)
{
this.versionSet = versionSet;
this.baseVersion = baseVersion;
+ baseVersion.retain();
levels = new ArrayList<>(baseVersion.numberOfLevels());
for (int i = 0; i < baseVersion.numberOfLevels(); i++) {
@@ -759,7 +785,7 @@ public void saveTo(Version version)
// Merge the set of added files with the set of pre-existing files.
// Drop any deleted files. Store the result in *v.
- Collection baseFiles = baseVersion.getFiles().asMap().get(level);
+ Collection baseFiles = baseVersion.getFiles(level);
if (baseFiles == null) {
baseFiles = ImmutableList.of();
}
@@ -780,7 +806,7 @@ public void saveTo(Version version)
//#ifndef NDEBUG todo
// Make sure there is no overlap in levels > 0
- version.assertNoOverlappingFiles();
+ version.assertNoOverlappingFiles(level);
//#endif
}
}
@@ -809,6 +835,12 @@ private void maybeAddFile(Version version, int level, FileMetaData fileMetaData)
}
}
+ @Override
+ public void close()
+ {
+ baseVersion.release();
+ }
+
private static class FileMetaDataBySmallestKey
implements Comparator
{
diff --git a/leveldb/src/main/java/org/iq80/leveldb/impl/WriteBatchImpl.java b/leveldb/src/main/java/org/iq80/leveldb/impl/WriteBatchImpl.java
index 003a3cad..640aa222 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/impl/WriteBatchImpl.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/impl/WriteBatchImpl.java
@@ -31,7 +31,11 @@
public class WriteBatchImpl
implements WriteBatch
{
+ // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+ private static final int HEADER_SIZE = 12;
+
private final List> batch = new ArrayList<>();
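+ // TODO: fix approximateSize, it is wrong -- presumably because put() adds HEADER_SIZE for every entry although the header exists once per batch (confirm)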
private int approximateSize;
public int getApproximateSize()
@@ -50,7 +54,7 @@ public WriteBatchImpl put(byte[] key, byte[] value)
requireNonNull(key, "key is null");
requireNonNull(value, "value is null");
batch.add(Maps.immutableEntry(Slices.wrappedBuffer(key), Slices.wrappedBuffer(value)));
- approximateSize += 12 + key.length + value.length;
+ approximateSize += HEADER_SIZE + key.length + value.length;
return this;
}
@@ -59,7 +63,7 @@ public WriteBatchImpl put(Slice key, Slice value)
requireNonNull(key, "key is null");
requireNonNull(value, "value is null");
batch.add(Maps.immutableEntry(key, value));
- approximateSize += 12 + key.length() + value.length();
+ approximateSize += HEADER_SIZE + key.length() + value.length();
return this;
}
@@ -99,6 +103,18 @@ public void forEach(Handler handler)
}
}
+ public void append(WriteBatchImpl batch)
+ {
+ this.batch.addAll(batch.batch);
+ this.approximateSize += batch.approximateSize;
+ }
+
+ public void clear()
+ {
+ approximateSize = 0;
+ batch.clear();
+ }
+
public interface Handler
{
void put(Slice key, Slice value);
diff --git a/leveldb/src/test/java/org/iq80/leveldb/table/FileChannelTableTest.java b/leveldb/src/main/java/org/iq80/leveldb/table/BlockHandleSliceWeigher.java
similarity index 67%
rename from leveldb/src/test/java/org/iq80/leveldb/table/FileChannelTableTest.java
rename to leveldb/src/main/java/org/iq80/leveldb/table/BlockHandleSliceWeigher.java
index 99c6021c..92043c39 100644
--- a/leveldb/src/test/java/org/iq80/leveldb/table/FileChannelTableTest.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/BlockHandleSliceWeigher.java
@@ -15,21 +15,22 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
package org.iq80.leveldb.table;
-import org.iq80.leveldb.util.Slice;
+import com.google.common.cache.Weigher;
-import java.io.IOException;
-import java.nio.channels.FileChannel;
-import java.util.Comparator;
+import org.iq80.leveldb.util.Slice;
-public class FileChannelTableTest
- extends TableTest
+/**
+ * @author Honore Vasconcelos
+ */
+public class BlockHandleSliceWeigher implements Weigher
{
@Override
- protected Table createTable(String name, FileChannel fileChannel, Comparator comparator, boolean verifyChecksums)
- throws IOException
+ public int weigh(BlockHandle key, Slice value)
{
- return new FileChannelTable(name, fileChannel, comparator, verifyChecksums);
+ // approximate weight: constant 64 plus the length of the value's whole backing array (which may exceed the slice's own length)
+ return 64 + value.getRawArray().length;
}
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/BlockIterator.java b/leveldb/src/main/java/org/iq80/leveldb/table/BlockIterator.java
index f91dd36f..e9b2d136 100644
--- a/leveldb/src/main/java/org/iq80/leveldb/table/BlockIterator.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/BlockIterator.java
@@ -112,6 +112,16 @@ public void seekToFirst()
}
}
+ public void seekToLast()
+ {
+ if (restartCount > 0) {
+ seekToRestartPosition(restartCount - 1);
+ while (peek() != null) {
+ next();
+ }
+ }
+ }
+
/**
* Repositions the iterator so the key of the next BlockElement returned greater than or equal to the specified targetKey.
*/
@@ -145,7 +155,7 @@ public void seek(Slice targetKey)
// linear search (within restart block) for first key greater than or equal to targetKey
for (seekToRestartPosition(left); nextEntry != null; next()) {
- if (comparator.compare(peek().getKey(), targetKey) >= 0) {
+ if (comparator.compare(nextEntry.getKey(), targetKey) >= 0) {
break;
}
}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/BloomFilterPolicy.java b/leveldb/src/main/java/org/iq80/leveldb/table/BloomFilterPolicy.java
new file mode 100644
index 00000000..da71c87d
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/BloomFilterPolicy.java
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.iq80.leveldb.table;
+
+import org.iq80.leveldb.XFilterPolicy;
+import org.iq80.leveldb.util.Hash;
+import org.iq80.leveldb.util.Slice;
+
+import java.util.List;
+
+/**
+ * BloomFilter policy.
+ * <p>
+ * A filter is a bit array followed by one trailing byte that records the number
+ * of probes used to build it, which lets {@link #keyMayMatch} read filters that
+ * were created with different parameters.
+ *
+ * @author Honore Vasconcelos
+ * @see <a href="https://github.com/google/leveldb/commit/85584d497e7b354853b72f450683d59fcf6b9c5c">original leveldb change</a>
+ */
+public final class BloomFilterPolicy implements org.iq80.leveldb.table.FilterPolicy, XFilterPolicy
+{
+    // Largest probe count a filter may record; bigger values are reserved for other encodings
+    private static final int MAX_PROBES = 30;
+
+    private final int bitsPerKey;
+    private final int k;
+
+    /**
+     * @param bitsPerKey number of filter bits to spend per key; the probe count is
+     *                   derived from it as {@code bitsPerKey * 0.69}, clamped to [1, 30]
+     */
+    public BloomFilterPolicy(final int bitsPerKey)
+    {
+        this.bitsPerKey = bitsPerKey;
+        this.k = Math.max(1, Math.min(MAX_PROBES, (int) (bitsPerKey * 0.69)));
+    }
+
+    @Override
+    public String name()
+    {
+        return "leveldb.BuiltinBloomFilter2";
+    }
+
+    /**
+     * Builds the bloom filter for {@code keys}.
+     *
+     * @param keys keys to add to the filter
+     * @return the filter bits followed by a single byte holding the probe count
+     */
+    @Override
+    public byte[] createFilter(List<Slice> keys)
+    {
+        // Compute bloom filter size (in both bits and bytes)
+        int bits = keys.size() * bitsPerKey;
+
+        // For small n, we can see a very high false positive rate. Fix it
+        // by enforcing a minimum bloom filter length.
+        if (bits < 64) {
+            bits = 64;
+        }
+
+        int bytes = (bits + 7) / 8;
+        bits = bytes * 8;
+
+        final byte[] array = new byte[bytes + 1];
+        array[array.length - 1] = (byte) k; // Remember # of probes in filter
+
+        for (Slice key : keys) {
+            // Use double-hashing to generate a sequence of hash values.
+            // See analysis in [Kirsch,Mitzenmacher 2006].
+            int h = bloomHash(key);
+            int delta = (h >>> 17) | (h << 15); // Rotate right 17 bits
+            for (int j = 0; j < k; j++) {
+                int bitpos = (int) (toLong(h) % bits);
+                array[bitpos / 8] |= (1 << (bitpos % 8));
+                h += delta;
+            }
+        }
+        return array;
+    }
+
+    private static int bloomHash(Slice data)
+    {
+        return Hash.hash(data.getRawArray(), data.getRawOffset(), data.length(), 0xbc9f1d34); //avoid data copy
+    }
+
+    /**
+     * Tests {@code key} against a filter produced by {@link #createFilter(List)}.
+     *
+     * @param key key to test
+     * @param bloomFilter1 filter bits followed by the probe-count byte
+     * @return {@code false} if a probed bit is unset or the filter is shorter than
+     *         two bytes, {@code true} otherwise
+     */
+    @Override
+    public boolean keyMayMatch(Slice key, Slice bloomFilter1)
+    {
+        int len = bloomFilter1.length();
+        byte[] data = bloomFilter1.getRawArray();
+        int offset = bloomFilter1.getRawOffset();
+        if (len < 2) {
+            return false;
+        }
+
+        int bits = (len - 1) * 8;
+
+        // Use the encoded k so that we can read filters generated by
+        // bloom filters created using different parameters.
+        int k = data[offset + len - 1];
+        if (k > MAX_PROBES) {
+            // Reserved for potentially new encodings for short bloom filters.
+            // Consider it a match.
+            return true;
+        }
+
+        int h = bloomHash(key);
+        int delta = (h >>> 17) | (h << 15); // Rotate right 17 bits
+        for (int j = 0; j < k; j++) {
+            int bitpos = (int) (toLong(h) % bits);
+            if ((data[offset + (bitpos / 8)] & (1 << (bitpos % 8))) == 0) {
+                return false;
+            }
+            h += delta;
+        }
+        return true;
+    }
+
+    /**
+     * Convert an unsigned int into a long
+     */
+    private static long toLong(int h)
+    {
+        return h & 0xffffffffL;
+    }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/FileChannelTable.java b/leveldb/src/main/java/org/iq80/leveldb/table/FileChannelTable.java
deleted file mode 100644
index a5723387..00000000
--- a/leveldb/src/main/java/org/iq80/leveldb/table/FileChannelTable.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.iq80.leveldb.table;
-
-import org.iq80.leveldb.util.Slice;
-import org.iq80.leveldb.util.Slices;
-import org.iq80.leveldb.util.Snappy;
-
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
-import java.util.Comparator;
-
-import static org.iq80.leveldb.CompressionType.SNAPPY;
-
-public class FileChannelTable
- extends Table
-{
- public FileChannelTable(String name, FileChannel fileChannel, Comparator comparator, boolean verifyChecksums)
- throws IOException
- {
- super(name, fileChannel, comparator, verifyChecksums);
- }
-
- @Override
- protected Footer init()
- throws IOException
- {
- long size = fileChannel.size();
- ByteBuffer footerData = read(size - Footer.ENCODED_LENGTH, Footer.ENCODED_LENGTH);
- return Footer.readFooter(Slices.copiedBuffer(footerData));
- }
-
- @SuppressWarnings({"AssignmentToStaticFieldFromInstanceMethod", "NonPrivateFieldAccessedInSynchronizedContext"})
- @Override
- protected Block readBlock(BlockHandle blockHandle)
- throws IOException
- {
- // read block trailer
- ByteBuffer trailerData = read(blockHandle.getOffset() + blockHandle.getDataSize(), BlockTrailer.ENCODED_LENGTH);
- BlockTrailer blockTrailer = BlockTrailer.readBlockTrailer(Slices.copiedBuffer(trailerData));
-
-// todo re-enable crc check when ported to support direct buffers
-// // only verify check sums if explicitly asked by the user
-// if (verifyChecksums) {
-// // checksum data and the compression type in the trailer
-// PureJavaCrc32C checksum = new PureJavaCrc32C();
-// checksum.update(data.getRawArray(), data.getRawOffset(), blockHandle.getDataSize() + 1);
-// int actualCrc32c = checksum.getMaskedValue();
-//
-// checkState(blockTrailer.getCrc32c() == actualCrc32c, "Block corrupted: checksum mismatch");
-// }
-
- // decompress data
-
- ByteBuffer uncompressedBuffer = read(blockHandle.getOffset(), blockHandle.getDataSize());
- Slice uncompressedData;
- if (blockTrailer.getCompressionType() == SNAPPY) {
- synchronized (FileChannelTable.class) {
- int uncompressedLength = uncompressedLength(uncompressedBuffer);
- if (uncompressedScratch.capacity() < uncompressedLength) {
- uncompressedScratch = ByteBuffer.allocateDirect(uncompressedLength);
- }
- uncompressedScratch.clear();
-
- Snappy.uncompress(uncompressedBuffer, uncompressedScratch);
- uncompressedData = Slices.copiedBuffer(uncompressedScratch);
- }
- }
- else {
- uncompressedData = Slices.copiedBuffer(uncompressedBuffer);
- }
-
- return new Block(uncompressedData, comparator);
- }
-
- private ByteBuffer read(long offset, int length)
- throws IOException
- {
- ByteBuffer uncompressedBuffer = ByteBuffer.allocate(length);
- fileChannel.read(uncompressedBuffer, offset);
- if (uncompressedBuffer.hasRemaining()) {
- throw new IOException("Could not read all the data");
- }
- uncompressedBuffer.clear();
- return uncompressedBuffer;
- }
-}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/FilterBlockBuilder.java b/leveldb/src/main/java/org/iq80/leveldb/table/FilterBlockBuilder.java
new file mode 100644
index 00000000..00d90bc7
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/FilterBlockBuilder.java
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.iq80.leveldb.table;
+
+import org.iq80.leveldb.util.DynamicSliceOutput;
+import org.iq80.leveldb.util.IntVector;
+import org.iq80.leveldb.util.Slice;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+/**
+ * The filter block stores a sequence of filters, where filter i contains
+ * the output of {@link FilterPolicy#createFilter(List)} on all keys that are stored
+ * in a block whose file offset falls within the range
+ * <pre>
+ * [ i*base ... (i+1)*base-1 ]
+ * </pre>
+ * Currently, "base" is 2KB. So for example, if blocks X and Y start in
+ * the range [ 0KB .. 2KB-1 ], all of the keys in X and Y will be
+ * converted to a filter by calling {@link FilterPolicy#createFilter(List)}, and the
+ * resulting filter will be stored as the first filter in the filter
+ * block.
+ * <p>
+ * The filter block is formatted as follows:
+ * <pre>
+ * [filter 0]
+ * [filter 1]
+ * ...
+ * [filter N-1]
+ *
+ * [offset of filter 0]                  : 4 bytes
+ * [offset of filter 1]                  : 4 bytes
+ * ...
+ * [offset of filter N-1]                : 4 bytes
+ *
+ * [offset of beginning of offset array] : 4 bytes
+ * lg(base)                              : 1 byte
+ * </pre>
+ *
+ * @author Honore Vasconcelos
+ */
+public class FilterBlockBuilder
+{
+    // Generate new filter every 2KB of data
+    private static final byte FILTER_BASE_LG = 11;
+    private static final int FILTER_BASE = 1 << FILTER_BASE_LG;
+
+    private final List<Slice> keys = new ArrayList<>();
+    private final DynamicSliceOutput result = new DynamicSliceOutput(32);
+    private final IntVector filterOffsets = new IntVector(32);
+    private final FilterPolicy policy;
+
+    public FilterBlockBuilder(FilterPolicy policy)
+    {
+        this.policy = policy;
+    }
+
+    /**
+     * Remembers {@code key} for the next filter to be generated.
+     */
+    public void addKey(Slice key)
+    {
+        keys.add(key);
+    }
+
+    /**
+     * Announces that a block starts at {@code blockOffset} and generates the
+     * filters for every filter range that ends before that offset.
+     *
+     * @param blockOffset file offset of the block; must not map to a filter index
+     *                    lower than the number of filters already generated
+     */
+    public void startBlock(long blockOffset)
+    {
+        long filterIndex = blockOffset / FILTER_BASE;
+        checkArgument(filterIndex >= filterOffsets.size());
+        while (filterIndex > filterOffsets.size()) {
+            generateFilter();
+        }
+    }
+
+    /**
+     * Records the offset of a new filter and, if any keys are pending, appends
+     * the filter built from them.
+     */
+    private void generateFilter()
+    {
+        // every filter gets an offset entry, even when it has no keys
+        filterOffsets.add(result.size());
+        if (keys.isEmpty()) {
+            // Fast path if there are no keys for this filter
+            return;
+        }
+        result.writeBytes(policy.createFilter(keys));
+        keys.clear();
+    }
+
+    /**
+     * Generates a filter for any pending keys, then appends the offset array,
+     * the position of that array and lg(base).
+     *
+     * @return the encoded filter block
+     */
+    public Slice finish()
+    {
+        if (!keys.isEmpty()) {
+            generateFilter();
+        }
+        final int arrayOffset = result.size();
+        filterOffsets.write(result);
+        result.writeInt(arrayOffset); //4 bytes
+        result.write(FILTER_BASE_LG); //1 byte
+        return result.slice();
+    }
+
+    /**
+     * @return the name of the underlying filter policy
+     */
+    public String name()
+    {
+        return policy.name();
+    }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/FilterBlockReader.java b/leveldb/src/main/java/org/iq80/leveldb/table/FilterBlockReader.java
new file mode 100644
index 00000000..0bcbabdb
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/FilterBlockReader.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.iq80.leveldb.table;
+
+import org.iq80.leveldb.util.Slice;
+
+/**
+ * Reader for an encoded filter block: a sequence of filters, an array of 4-byte
+ * filter offsets, the 4-byte position of that array and a final lg(base) byte.
+ *
+ * @author Honore Vasconcelos
+ */
+final class FilterBlockReader
+{
+    // trailer = 4-byte offset of the offset array + 1-byte lg(base)
+    private static final int TRAILER_LENGTH = 5;
+
+    private final byte baseLg;
+    private final int num;
+    private final Slice contents;
+    private final int offset;
+    private final FilterPolicy filterPolicy;
+
+    /**
+     * @param filterPolicy policy used to test keys against the individual filters
+     * @param contents encoded filter block; when it is too short or its offset-array
+     *                 position is out of range the reader holds no filters and
+     *                 every key is reported as a potential match
+     */
+    FilterBlockReader(FilterPolicy filterPolicy, Slice contents)
+    {
+        this.filterPolicy = filterPolicy;
+        final int n = contents.length();
+        if (n < TRAILER_LENGTH) {
+            this.baseLg = 0;
+            this.contents = null;
+            this.num = 0;
+            this.offset = 0;
+            return;
+        }
+        baseLg = contents.getByte(n - 1);
+        offset = contents.getInt(n - TRAILER_LENGTH);
+        // A negative value can only come from corrupt data; without this check it
+        // would pass the upper-bound test and be used as an index later on.
+        if (offset < 0 || offset > n - TRAILER_LENGTH) {
+            this.num = 0;
+            this.contents = null;
+            return;
+        }
+        num = (n - TRAILER_LENGTH - offset) / 4;
+        this.contents = contents;
+    }
+
+    /**
+     * Tests whether {@code key} may be present in the block starting at {@code offset1}.
+     *
+     * @param offset1 block offset; {@code offset1 >> baseLg} selects the filter
+     * @param key key to test
+     * @return the filter policy's answer for the selected filter; {@code false} for an empty
+     *         filter that fails the range check; {@code true} in every other case
+     */
+    public boolean keyMayMatch(long offset1, Slice key)
+    {
+        final int index = (int) (offset1 >> baseLg);
+        if (index >= 0 && index < num) {
+            final int start = contents.getInt(this.offset + index * 4);
+            final int limit = contents.getInt(this.offset + index * 4 + 4);
+            if (start >= 0 && start <= limit && limit <= offset) {
+                Slice filter = contents.slice(start, limit - start);
+                return filterPolicy.keyMayMatch(key, filter);
+            }
+            else if (start == limit) {
+                // Empty filters do not match any keys
+                return false;
+            }
+        }
+        return true; // Errors are treated as potential matches
+    }
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/FilterPolicy.java b/leveldb/src/main/java/org/iq80/leveldb/table/FilterPolicy.java
new file mode 100644
index 00000000..dd457f8e
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/FilterPolicy.java
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.iq80.leveldb.table;
+
+import org.iq80.leveldb.util.Slice;
+
+import java.util.List;
+
+/**
+ * A database can be configured with a custom FilterPolicy object.
+ * This object is responsible for creating a small filter from a set
+ * of keys. These filters are stored in leveldb and are consulted
+ * automatically by leveldb to decide whether or not to read some
+ * information from disk. In many cases, a filter can cut down the
+ * number of disk seeks from a handful to a single disk seek per
+ * DB::Get() call.
+ * <p>
+ * Most people will want to use the builtin bloom filter support (see
+ * {@link BloomFilterPolicy}).
+ *
+ * @author Honore Vasconcelos
+ */
+public interface FilterPolicy extends org.iq80.leveldb.XFilterPolicy
+{
+    /**
+     * @return the name of this policy
+     */
+    String name();
+
+    /**
+     * Creates a filter that summarizes keys[0,n-1].
+     *
+     * @param keys keys[0,n-1] contains a list of keys (potentially with duplicates)
+     *             that are ordered according to the user supplied comparator.
+     * @return the encoded filter
+     */
+    byte[] createFilter(List<Slice> keys);
+
+    /**
+     * "filter" contains the data returned by a preceding call to
+     * {@link #createFilter(List)} on this class. This method must return true if
+     * the key was in the list of keys passed to {@link #createFilter(List)}.
+     * This method may return true or false if the key was not on the
+     * list, but it should aim to return false with a high probability.
+     */
+    boolean keyMayMatch(Slice key, Slice filter);
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/KeyValueFunction.java b/leveldb/src/main/java/org/iq80/leveldb/table/KeyValueFunction.java
new file mode 100644
index 00000000..ae9aa919
--- /dev/null
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/KeyValueFunction.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2011 the original author or authors.
+ * See the notice.md file distributed with this work for additional
+ * information regarding copyright ownership.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.iq80.leveldb.table;
+
+import org.iq80.leveldb.util.Slice;
+
+/**
+ * Callback that turns an internal key and its value into a result of
+ * type {@code T}.
+ *
+ * @param <T> type of the value produced by {@link #apply(Slice, Slice)}
+ * @author Honore Vasconcelos
+ */
+public interface KeyValueFunction<T>
+{
+    /**
+     * Function to apply on first entry after seeking in a table.
+     *
+     * @param internalKey internal key
+     * @param value associated value
+     * @return transformed key/value
+     */
+    T apply(Slice internalKey, Slice value);
+}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/MMapTable.java b/leveldb/src/main/java/org/iq80/leveldb/table/MMapTable.java
deleted file mode 100755
index 75b08a3b..00000000
--- a/leveldb/src/main/java/org/iq80/leveldb/table/MMapTable.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (C) 2011 the original author or authors.
- * See the notice.md file distributed with this work for additional
- * information regarding copyright ownership.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.iq80.leveldb.table;
-
-import org.iq80.leveldb.util.ByteBufferSupport;
-import org.iq80.leveldb.util.Closeables;
-import org.iq80.leveldb.util.Slice;
-import org.iq80.leveldb.util.Slices;
-import org.iq80.leveldb.util.Snappy;
-
-import java.io.Closeable;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.ByteOrder;
-import java.nio.MappedByteBuffer;
-import java.nio.channels.FileChannel;
-import java.nio.channels.FileChannel.MapMode;
-import java.util.Comparator;
-import java.util.concurrent.Callable;
-
-import static com.google.common.base.Preconditions.checkArgument;
-import static org.iq80.leveldb.CompressionType.SNAPPY;
-
-public class MMapTable
- extends Table
-{
- private MappedByteBuffer data;
-
- public MMapTable(String name, FileChannel fileChannel, Comparator comparator, boolean verifyChecksums)
- throws IOException
- {
- super(name, fileChannel, comparator, verifyChecksums);
- checkArgument(fileChannel.size() <= Integer.MAX_VALUE, "File must be smaller than %s bytes", Integer.MAX_VALUE);
- }
-
- @Override
- protected Footer init()
- throws IOException
- {
- long size = fileChannel.size();
- data = fileChannel.map(MapMode.READ_ONLY, 0, size);
- Slice footerSlice = Slices.copiedBuffer(data, (int) size - Footer.ENCODED_LENGTH, Footer.ENCODED_LENGTH);
- return Footer.readFooter(footerSlice);
- }
-
- @Override
- public Callable> closer()
- {
- return new Closer(name, fileChannel, data);
- }
-
- private static class Closer
- implements Callable
- {
- private final String name;
- private final Closeable closeable;
- private final MappedByteBuffer data;
-
- public Closer(String name, Closeable closeable, MappedByteBuffer data)
- {
- this.name = name;
- this.closeable = closeable;
- this.data = data;
- }
-
- public Void call()
- {
- ByteBufferSupport.unmap(data);
- Closeables.closeQuietly(closeable);
- return null;
- }
- }
-
- @SuppressWarnings({"NonPrivateFieldAccessedInSynchronizedContext", "AssignmentToStaticFieldFromInstanceMethod"})
- @Override
- protected Block readBlock(BlockHandle blockHandle)
- throws IOException
- {
- // read block trailer
- BlockTrailer blockTrailer = BlockTrailer.readBlockTrailer(Slices.copiedBuffer(this.data,
- (int) blockHandle.getOffset() + blockHandle.getDataSize(),
- BlockTrailer.ENCODED_LENGTH));
-
-// todo re-enable crc check when ported to support direct buffers
-// // only verify check sums if explicitly asked by the user
-// if (verifyChecksums) {
-// // checksum data and the compression type in the trailer
-// PureJavaCrc32C checksum = new PureJavaCrc32C();
-// checksum.update(data.getRawArray(), data.getRawOffset(), blockHandle.getDataSize() + 1);
-// int actualCrc32c = checksum.getMaskedValue();
-//
-// checkState(blockTrailer.getCrc32c() == actualCrc32c, "Block corrupted: checksum mismatch");
-// }
-
- // decompress data
- Slice uncompressedData;
- ByteBuffer uncompressedBuffer = read(this.data, (int) blockHandle.getOffset(), blockHandle.getDataSize());
- if (blockTrailer.getCompressionType() == SNAPPY) {
- synchronized (MMapTable.class) {
- int uncompressedLength = uncompressedLength(uncompressedBuffer);
- if (uncompressedScratch.capacity() < uncompressedLength) {
- uncompressedScratch = ByteBuffer.allocateDirect(uncompressedLength);
- }
- uncompressedScratch.clear();
-
- Snappy.uncompress(uncompressedBuffer, uncompressedScratch);
- uncompressedData = Slices.copiedBuffer(uncompressedScratch);
- }
- }
- else {
- uncompressedData = Slices.copiedBuffer(uncompressedBuffer);
- }
-
- return new Block(uncompressedData, comparator);
- }
-
- public static ByteBuffer read(MappedByteBuffer data, int offset, int length)
- throws IOException
- {
- int newPosition = data.position() + offset;
- ByteBuffer block = (ByteBuffer) data.duplicate().order(ByteOrder.LITTLE_ENDIAN).clear().limit(newPosition + length).position(newPosition);
- return block;
- }
-}
diff --git a/leveldb/src/main/java/org/iq80/leveldb/table/Table.java b/leveldb/src/main/java/org/iq80/leveldb/table/Table.java
index 37ee951c..bff8307c 100755
--- a/leveldb/src/main/java/org/iq80/leveldb/table/Table.java
+++ b/leveldb/src/main/java/org/iq80/leveldb/table/Table.java
@@ -18,53 +18,102 @@
package org.iq80.leveldb.table;
import com.google.common.base.Throwables;
+import com.google.common.cache.CacheLoader;
import org.iq80.leveldb.impl.SeekingIterable;
-import org.iq80.leveldb.util.Closeables;
+import org.iq80.leveldb.util.LRUCache;
+import org.iq80.leveldb.util.RandomInputFile;
import org.iq80.leveldb.util.Slice;
+import org.iq80.leveldb.util.Slices;
+import org.iq80.leveldb.util.Snappy;
import org.iq80.leveldb.util.TableIterator;
import org.iq80.leveldb.util.VariableLengthQuantity;
-import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
-import java.nio.channels.FileChannel;
+import java.nio.charset.Charset;
import java.util.Comparator;
import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutionException;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
+import static org.iq80.leveldb.CompressionType.SNAPPY;
-public abstract class Table
+public final class Table
implements SeekingIterable
{
- protected final String name;
- protected final FileChannel fileChannel;
- protected final Comparator comparator;
- protected final boolean verifyChecksums;
- protected final Block indexBlock;
- protected final BlockHandle metaindexBlockHandle;
-
- public Table(String name, FileChannel fileChannel, Comparator comparator, boolean verifyChecksums)
+ private static final Charset CHARSET = Charset.forName("UTF-8");
+ private final Comparator comparator;
+ private final boolean verifyChecksums;
+ private final Block indexBlock;
+ private final BlockHandle metaindexBlockHandle;
+ private final RandomInputFile source;
+ private final LRUCache.LRUSubCache blockCache;
+ private final FilterBlockReader filter;
+
+ /**
+ * Opens a table stored in {@code source}.
+ * The footer is read from the last {@code Footer.ENCODED_LENGTH} bytes of the source,
+ * the index block is loaded directly (bypassing the block cache), the metaindex handle is
+ * remembered, and the filter block is loaded when a filter policy is supplied.
+ *
+ * @param source file to read the table from, must not be null
+ * @param comparator comparator handed to the blocks of this table, must not be null
+ * @param verifyChecksum stored in the {@code verifyChecksums} field
+ * @param blockCache global block cache from which a per-table sub cache is derived, must not be null
+ * @param filterPolicy policy used to locate the filter block, or null to load no filter
+ * @throws IOException if the source cannot be read
+ * @throws IllegalArgumentException if the source is smaller than {@code Footer.ENCODED_LENGTH} bytes
+ */
+ public Table(RandomInputFile source, Comparator comparator, boolean verifyChecksum, LRUCache blockCache, final FilterPolicy filterPolicy)
throws IOException
{
- requireNonNull(name, "name is null");
- requireNonNull(fileChannel, "fileChannel is null");
- long size = fileChannel.size();
+ this.source = source;
+ this.blockCache = cacheForTable(blockCache);
+ requireNonNull(source, "source is null");
+ long size = source.size();
checkArgument(size >= Footer.ENCODED_LENGTH, "File is corrupt: size must be at least %s bytes", Footer.ENCODED_LENGTH);
requireNonNull(comparator, "comparator is null");
- this.name = name;
- this.fileChannel = fileChannel;
- this.verifyChecksums = verifyChecksums;
+ this.verifyChecksums = verifyChecksum;
this.comparator = comparator;
+ // the footer occupies the last Footer.ENCODED_LENGTH bytes of the file
+ final ByteBuffer footerData = source.read(size - Footer.ENCODED_LENGTH, Footer.ENCODED_LENGTH);
- Footer footer = init();
- indexBlock = readBlock(footer.getIndexBlockHandle());
+ Footer footer = Footer.readFooter(Slices.avoidCopiedBuffer(footerData));
+ indexBlock = new Block(readRawBlock(footer.getIndexBlockHandle()), comparator); //no need for cache
metaindexBlockHandle = footer.getMetaindexBlockHandle();
+ // must run after metaindexBlockHandle is assigned: readMeta reads the block it points to
+ this.filter = readMeta(filterPolicy);
+
}
- protected abstract Footer init()
- throws IOException;
+ /**
+  * Loads the filter block that belongs to {@code filterPolicy}, if the table has one.
+  * <p>
+  * The metaindex block is searched for the key {@code "filter." + filterPolicy.name()};
+  * the value stored under that key is passed to {@code readFilter} as the encoded handle
+  * of the filter block.
+  *
+  * @param filterPolicy policy whose filter should be loaded, or {@code null} to skip filters
+  * @return reader over the filter block, or {@code null} when no policy was given or the
+  * metaindex has no entry with exactly that key
+  * @throws IOException if the metaindex block or the filter block cannot be read
+  */
+ private FilterBlockReader readMeta(FilterPolicy filterPolicy) throws IOException
+ {
+     if (filterPolicy == null) {
+         // without a policy there is nothing to look up, so the metaindex is not read at all
+         return null;
+     }
+
+     final BlockIterator metaEntries = new Block(readRawBlock(metaindexBlockHandle), new BytewiseComparator()).iterator();
+     final Slice filterKey = new Slice(("filter." + filterPolicy.name()).getBytes(CHARSET));
+     metaEntries.seek(filterKey);
+
+     // only an exact key match counts; any other entry the seek lands on is ignored
+     final boolean found = metaEntries.hasNext() && metaEntries.peek().getKey().equals(filterKey);
+     return found ? readFilter(filterPolicy, metaEntries.next().getValue()) : null;
+ }
+
+ /**
+  * Reads the filter block referenced by an encoded block handle.
+  *
+  * @param filterPolicy policy the returned reader is created with
+  * @param filterHandle slice holding the encoded {@code BlockHandle} of the filter block
+  * @return reader over the raw filter block
+  * @throws IOException if the filter block cannot be read
+  */
+ protected FilterBlockReader readFilter(FilterPolicy filterPolicy, Slice filterHandle) throws IOException
+ {
+     final BlockHandle handle = BlockHandle.readBlockHandle(filterHandle.input());
+     // the filter block is read straight from the source, not through the block cache
+     final Slice rawFilter = readRawBlock(handle);
+     return new FilterBlockReader(filterPolicy, rawFilter);
+ }
+
+ /**
+ * Creates a sub cache of the global block cache for this table.
+ * The sub cache is given a loader that reads a block from this table via
+ * {@code readRawBlock}.
+ * NOTE(review): the constructor calls this before most fields are assigned; this is only
+ * safe if the loader is not invoked until construction has finished - confirm against LRUCache.
+ *
+ * @param blockCache global cache, must not be null
+ * @return cache scoped to current table
+ */
+ private LRUCache.LRUSubCache cacheForTable(LRUCache blockCache)
+ {
+ final LRUCache cache = requireNonNull(blockCache, "Block cache should not be null");
+ return cache.subCache(new CacheLoader()
+ {
+ @Override
+ public Slice load(BlockHandle key) throws Exception
+ {
+ return readRawBlock(key);
+ }
+ });
+ }
@Override
public TableIterator iterator()
@@ -72,6 +121,11 @@ public TableIterator iterator()
return new TableIterator(this, indexBlock.iterator());
}
+ /**
+  * Returns the filter block reader loaded for this table.
+  *
+  * @return the reader, or {@code null} if the table was opened without a filter policy
+  * or its metaindex had no matching filter entry
+  */
+ public FilterBlockReader getFilter()
+ {
+     return this.filter;
+ }
+
public Block openBlock(Slice blockEntry)
{
BlockHandle blockHandle = BlockHandle.readBlockHandle(blockEntry.input());
@@ -85,16 +139,78 @@ public Block openBlock(Slice blockEntry)
return dataBlock;
}
- protected static ByteBuffer uncompressedScratch = ByteBuffer.allocateDirect(4 * 1024 * 1024);
-
- protected abstract Block readBlock(BlockHandle blockHandle)
- throws IOException;
+ /**
+  * Obtains the block identified by {@code blockHandle} through the per-table block cache
+  * and wraps it in a {@code Block} using this table's comparator.
+  *
+  * @param blockHandle location of the block within the table
+  * @return the block
+  * @throws IOException if loading fails; an {@code IOException} cause is rethrown as is,
+  * unchecked causes are propagated unchanged, anything else is wrapped
+  */
+ private Block readBlock(BlockHandle blockHandle)
+         throws IOException
+ {
+     final Slice rawBlock;
+     try {
+         rawBlock = blockCache.load(blockHandle);
+     }
+     catch (ExecutionException e) {
+         // unwrap the loader failure so callers see the real cause
+         final Throwable cause = e.getCause();
+         Throwables.propagateIfPossible(cause, IOException.class);
+         throw new IOException(cause);
+     }
+     return new Block(rawBlock, comparator);
+ }
- protected int uncompressedLength(ByteBuffer data)
+ /**
+ * Reads the block identified by {@code blockHandle} directly from the source (the block
+ * cache is not consulted) and returns its uncompressed contents.
+ * The block trailer is read from the bytes that immediately follow the block data.
+ * A block whose trailer says SNAPPY is decompressed; every other compression type is
+ * returned as stored.
+ * Checksum verification is currently commented out, so {@code verifyChecksums} has no
+ * effect in this method.
+ *
+ * @param blockHandle offset and data size of the block within the source
+ * @return the uncompressed block contents
+ * @throws IOException if the source cannot be read
+ */
+ protected Slice readRawBlock(BlockHandle blockHandle)
throws IOException
{
- int length = VariableLengthQuantity.readVariableLengthInt(data.duplicate());
- return length;
+ // read block trailer
+ final ByteBuffer trailerData = source.read(blockHandle.getOffset() + blockHandle.getDataSize(), BlockTrailer.ENCODED_LENGTH);
+ final BlockTrailer blockTrailer = BlockTrailer.readBlockTrailer(Slices.avoidCopiedBuffer(trailerData));
+
+// todo re-enable crc check when ported to support direct buffers
+// // only verify check sums if explicitly asked by the user
+// if (verifyChecksums) {
+// // checksum data and the compression type in the trailer
+// PureJavaCrc32C checksum = new PureJavaCrc32C();
+// checksum.update(data.getRawArray(), data.getRawOffset(), blockHandle.getDataSize() + 1);
+// int actualCrc32c = checksum.getMaskedValue();
+//
+// checkState(blockTrailer.getCrc32c() == actualCrc32c, "Block corrupted: checksum mismatch");
+// }
+
+ // decompress data
+ Slice uncompressedData;
+ // despite its name this buffer holds the block bytes as stored, i.e. still compressed for SNAPPY blocks
+ ByteBuffer uncompressedBuffer = source.read(blockHandle.getOffset(), blockHandle.getDataSize());
+ if (blockTrailer.getCompressionType() == SNAPPY) {
+ int uncompressedLength = uncompressedLength(uncompressedBuffer);
+ // a fresh direct buffer of exactly the announced size is allocated for every block read
+ final ByteBuffer uncompressedScratch = ByteBuffer.allocateDirect(uncompressedLength);
+ Snappy.uncompress(uncompressedBuffer, uncompressedScratch);
+ uncompressedData = Slices.copiedBuffer(uncompressedScratch);
+ }
+ else {
+ // NOTE(review): presumably wraps the buffer without copying, judging by the helper's name - confirm
+ uncompressedData = Slices.avoidCopiedBuffer(uncompressedBuffer);
+ }
+
+ return uncompressedData;
+ }
+
+ /**
+  * Looks up {@code key} in this table and hands the entry found to {@code keyValueFunction}.
+  * <p>
+  * The index block is searched first. If a filter is present and reports that the key cannot
+  * be in the candidate data block, that block is never opened. Otherwise the data block is
+  * opened and searched, and the first entry the iterator yields after the seek is passed to
+  * the function. The entry's key is not compared with {@code key} here, so the function has
+  * to decide whether it is a match.
+  *
+  * @param key key to look up
+  * @param keyValueFunction function applied to the key and value of the entry found
+  * @return the function's result, or {@code null} when the index or data block has no entry
+  * at the seek position or the filter excludes the key
+  */
+ public T internalGet(Slice key, KeyValueFunction keyValueFunction)
+ {
+     final BlockIterator indexIterator = indexBlock.iterator();
+     indexIterator.seek(key);
+     if (!indexIterator.hasNext()) {
+         return null;
+     }
+
+     final Slice handleValue = indexIterator.peek().getValue();
+     // the filter is addressed by the offset of the candidate data block
+     final boolean filteredOut = filter != null
+             && !filter.keyMayMatch(BlockHandle.readBlockHandle(handleValue.input()).getOffset(), key);
+     if (filteredOut) {
+         return null;
+     }
+
+     final BlockIterator blockIterator = openBlock(handleValue).iterator();
+     blockIterator.seek(key);
+     if (!blockIterator.hasNext()) {
+         return null;
+     }
+     final BlockEntry candidate = blockIterator.next();
+     return keyValueFunction.apply(candidate.getKey(), candidate.getValue());
+ }
+
+ /**
+ * Reads the variable-length int found at the current position of {@code data}.
+ * The value is read from a duplicate of the buffer, so the position of {@code data}
+ * itself is not advanced. readRawBlock uses the result as the size of the buffer that
+ * receives the Snappy-uncompressed block.
+ *
+ * @param data buffer positioned at the length prefix
+ * @return the decoded length
+ */
+ private int uncompressedLength(ByteBuffer data)
+ {
+ return VariableLengthQuantity.readVariableLengthInt(data.duplicate());
}
/**
@@ -123,34 +239,31 @@ public long getApproximateOffsetOf(Slice key)
+ /**
+ * Describes this table by its source, comparator and checksum setting.
+ *
+ * @return text of the form {@code Table{source='...', comparator=..., verifyChecksums=...}}
+ */
@Override
public String toString()
{
- StringBuilder sb = new StringBuilder();
- sb.append("Table");
- sb.append("{name='").append(name).append('\'');
- sb.append(", comparator=").append(comparator);
- sb.append(", verifyChecksums=").append(verifyChecksums);
- sb.append('}');
- return sb.toString();
+ return "Table" +
+ "{source='" + source + '\'' +
+ ", comparator=" + comparator +
+ ", verifyChecksums=" + verifyChecksums +
+ '}';
}
+ /**
+ * Returns a callable that wraps this table's {@code source} in a {@code CloseableToCallable}.
+ * NOTE(review): presumably calling it closes the underlying source - confirm against CloseableToCallable.
+ *
+ * @return callable bound to this table's source
+ */
public Callable> closer()
{
- return new Closer(fileChannel);
+ return new CloseableToCallable(source);
}
- private static class Closer
- implements Callable
+ private static class CloseableToCallable implements Callable