Allow user to remove broadcast variables when they are no longer used #771

Open · wants to merge 2 commits into base: master
Changes from 1 commit
25 changes: 22 additions & 3 deletions core/src/main/scala/spark/broadcast/BitTorrentBroadcast.scala
@@ -21,7 +21,8 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
def blockId: String = "broadcast_" + id

MultiTracker.synchronized {
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false)
// Let BlockManagerMaster know that we have the broadcast block, so it can later notify us to remove it.
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true)
rxin (Member) commented:

There might be a performance problem here.

Spark actually uses a broadcast variable to broadcast the JobConf in HadoopRDD, to avoid having it in the task closure (10KB). If every worker has to send a message back to the master, it might slow things down, since every HadoopRDD we create will need to do that ...

Ideally, we should track the memory usage of broadcast variables, but I am not sure what the best way to do this is.

Author commented:

Thank you for reviewing my pull request, Reynold (@rxin). I have followed your advice, as below.

1. Change the names of the methods.
Answer: I renamed them in the last commit as you suggested.

2. Performance issues for HadoopRDD.
Answer: This is a valid concern. In practice, the BlockInfo message that gets transferred is not large. To make this flexible, I added a boolean parameter tellMaster (default true) to broadcast variables. When it is set to false, the broadcast variable is not reported to the master and cannot be removed on the slave machines within a SparkContext; this is suitable for small broadcast variables. Users can choose whichever behavior they want.

3. Track the memory usage of broadcasts.
Answer: This is a good idea, and it could lead to an automatic memory cleaner for broadcast variables. Nevertheless, the purpose of this patch is to give users an API for removing broadcasts, and the two do not conflict. For memory cleanup, the lesson I have learned is that no monitoring mechanism beats letting users clear memory explicitly when they can: GC is not always timely and has overhead, and in this case it is hard to determine automatically whether a broadcast is still needed. On the other hand, leaving large unused broadcast variables in memory is a real problem, and users currently have no way to address it. This patch therefore provides an explicit removal method; it has solved my problem in practice.

Thank you for commenting again. I will keep thinking about the automatic cleaner based on memory-usage tracking that you proposed here; it is an interesting idea.
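A minimal sketch of the usage pattern described in point 2. Note the assumptions: the two-argument sc.broadcast overload taking a tellMaster flag, and the names bigTable/jobConf, are illustrative only; this diff only threads the flag through BlockManager.putSingle.

// Hypothetical usage sketch; the sc.broadcast(value, tellMaster) overload is assumed.
val lookup = sc.broadcast(bigTable)        // reported to the master, removable later
val conf = sc.broadcast(jobConf, false)    // tellMaster = false: never reported,
                                           // no extra messages, but not removable
rdd.map(x => lookup.value(x)).count()
lookup.rm()                                // drop cached copies everywhere
// lookup.rm(true) would additionally clear the source-side data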

}

@transient var arrayOfBlocks: Array[BroadcastBlock] = null
@@ -58,6 +59,23 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
if (!isLocal) {
sendBroadcast()
}

override def rm(toClearSource: Boolean = false) {
logInfo("Remove broadcast variable " + blockId)
SparkEnv.get.blockManager.master.removeBlock(blockId)
SparkEnv.get.blockManager.removeBlock(blockId, false)
if(toClearSource)
clearBlockSource()
}

def clearBlockSource(){
arrayOfBlocks = null
hasBlocksBitVector = null
numCopiesSent = null
listOfSources = null
serveMR = null
guideMR = null
}

def sendBroadcast() {
logInfo("Local host address: " + hostAddress)
@@ -116,7 +134,7 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
private def readObject(in: ObjectInputStream) {
in.defaultReadObject()
MultiTracker.synchronized {
SparkEnv.get.blockManager.getSingle(blockId) match {
SparkEnv.get.blockManager.getSingleLocal(blockId) match {
case Some(x) =>
value_ = x.asInstanceOf[T]

@@ -139,8 +157,9 @@ private[spark] class BitTorrentBroadcast[T](@transient var value_ : T, isLocal:
val receptionSucceeded = receiveBroadcast(id)
if (receptionSucceeded) {
value_ = MultiTracker.unBlockifyObject[T](arrayOfBlocks, totalBytes, totalBlocks)
// Let BlockManagerMaster know that we have the broadcast block, so it can later notify us to remove it.
SparkEnv.get.blockManager.putSingle(
blockId, value_, StorageLevel.MEMORY_AND_DISK, false)
blockId, value_, StorageLevel.MEMORY_AND_DISK, true)
} else {
logError("Reading broadcast variable " + id + " failed")
}
4 changes: 4 additions & 0 deletions core/src/main/scala/spark/broadcast/Broadcast.scala
@@ -12,6 +12,10 @@ abstract class Broadcast[T](private[spark] val id: Long) extends Serializable {
// readObject having to be 'private' in sub-classes.

override def toString = "spark.Broadcast(" + id + ")"

// Remove a Broadcast block from the SparkContext and the Executors that have it.
// Set toClearSource to true to also remove the Broadcast value from its source.
def rm(toClearSource: Boolean)
}

private[spark]
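For orientation, the three overrides of rm elsewhere in this diff all follow the same shape; the following is a paraphrased sketch summarizing the BitTorrent/HTTP/Tree implementations, not new API:

override def rm(toClearSource: Boolean = false) {
  // 1. Ask the master to forget the block and notify the executors holding it.
  SparkEnv.get.blockManager.master.removeBlock(blockId)
  // 2. Drop the local copy without re-reporting the removal (tellMaster = false).
  SparkEnv.get.blockManager.removeBlock(blockId, false)
  // 3. Optionally release implementation-specific source state: in-memory block
  //    arrays for the tree/BitTorrent variants, the on-disk file for HTTP.
  if (toClearSource) { /* implementation-specific cleanup */ }
}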
20 changes: 17 additions & 3 deletions core/src/main/scala/spark/broadcast/HttpBroadcast.scala
@@ -21,24 +21,38 @@ extends Broadcast[T](id) with Logging with Serializable {
def blockId: String = "broadcast_" + id

HttpBroadcast.synchronized {
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false)
// Let BlockManagerMaster know that we have the broadcast block, so it can later notify us to remove it.
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true)
}

if (!isLocal) {
HttpBroadcast.write(id, value_)
}

override def rm(toClearSource: Boolean = false) {
logInfo("Remove broadcast variable " + blockId)
SparkEnv.get.blockManager.master.removeBlock(blockId)
SparkEnv.get.blockManager.removeBlock(blockId, false)
if(toClearSource){
Member commented: Add a space after if.

val path: String = HttpBroadcast.broadcastDir + "/" + "broadcast-" + id
HttpBroadcast.files.internalMap.remove(path)
new File(path).delete()
logInfo("Deleted source broadcast file '" + path + "'")
}
}

// Called by JVM when deserializing an object
private def readObject(in: ObjectInputStream) {
in.defaultReadObject()
HttpBroadcast.synchronized {
SparkEnv.get.blockManager.getSingle(blockId) match {
SparkEnv.get.blockManager.getSingleLocal(blockId) match {
case Some(x) => value_ = x.asInstanceOf[T]
case None => {
logInfo("Started reading broadcast variable " + id)
val start = System.nanoTime
value_ = HttpBroadcast.read[T](id)
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false)
// Let BlockManagerMaster know that we have the broadcast block, so it can later notify us to remove it.
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true)
val time = (System.nanoTime - start) / 1e9
logInfo("Reading broadcast variable " + id + " took " + time + " s")
}
20 changes: 18 additions & 2 deletions core/src/main/scala/spark/broadcast/TreeBroadcast.scala
@@ -18,7 +18,8 @@ extends Broadcast[T](id) with Logging with Serializable {
def blockId = "broadcast_" + id

MultiTracker.synchronized {
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false)
// Let BlockManagerMaster know that we have the broadcast block, so it can later notify us to remove it.
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true)
}

@transient var arrayOfBlocks: Array[BroadcastBlock] = null
@@ -46,6 +47,21 @@ extends Broadcast[T](id) with Logging with Serializable {
if (!isLocal) {
sendBroadcast()
}

override def rm(toClearSource: Boolean = false) {
Member commented: Can we rename this function to remove, and toClearSource to releaseSource?

logInfo("Remove broadcast variable " + blockId)
SparkEnv.get.blockManager.master.removeBlock(blockId)
SparkEnv.get.blockManager.removeBlock(blockId, false)
if(toClearSource)
Member commented: Use

if (toClearSource) {
  clearBlockSource()
}

clearBlockSource()
}

def clearBlockSource(){
arrayOfBlocks = null
listOfSources = null
serveMR = null
guideMR = null
}

def sendBroadcast() {
logInfo("Local host address: " + hostAddress)
@@ -92,7 +108,7 @@ extends Broadcast[T](id) with Logging with Serializable {
private def readObject(in: ObjectInputStream) {
in.defaultReadObject()
MultiTracker.synchronized {
SparkEnv.get.blockManager.getSingle(blockId) match {
SparkEnv.get.blockManager.getSingleLocal(blockId) match {
case Some(x) =>
value_ = x.asInstanceOf[T]

7 changes: 7 additions & 0 deletions core/src/main/scala/spark/storage/BlockManager.scala
@@ -772,6 +772,13 @@ private[spark] class BlockManager(
def getSingle(blockId: String): Option[Any] = {
get(blockId).map(_.next())
}

/**
* Read a block consisting of a single object only from local BlockManager.
*/
def getSingleLocal(blockId: String): Option[Any] = {
getLocal(blockId).map(_.next())
}

/**
* Write a block consisting of a single object.
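A sketch of why the readObject paths above switch from getSingle to getSingleLocal. The rationale is inferred from the surrounding code, not stated in the diff: getSingle may fall through to a remote fetch via the BlockManager, whereas a deserializing broadcast should only check its local store and otherwise re-fetch through the broadcast transport itself.

// Sketch of the intended read path during broadcast deserialization.
SparkEnv.get.blockManager.getSingleLocal(blockId) match {
  case Some(x) => value_ = x.asInstanceOf[T]   // local cache hit, nothing to fetch
  case None =>
    // Cache miss: fetch through the broadcast mechanism (HTTP, tree, or
    // BitTorrent), then putSingle with tellMaster = true as above.
}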
1 change: 1 addition & 0 deletions examples/src/main/scala/spark/examples/BroadcastTest.scala
@@ -26,6 +26,7 @@ object BroadcastTest {
sc.parallelize(1 to 10, slices).foreach {
i => println(barr1.value.size)
}
barr1.rm(true)
}

System.exit(0)
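A variant of the test exercising the milder cleanup path could look like this (sketch; barr2 is a hypothetical second variable, not part of this diff):

val barr2 = sc.broadcast(arr1)
sc.parallelize(1 to 10, slices).foreach(i => println(barr2.value.size))
barr2.rm()   // removes cached copies on the master and executors, but keeps
             // the source data, since toClearSource defaults to false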