-
Notifications
You must be signed in to change notification settings - Fork 385
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allow user to remove broadcast variables when they are no longer used #771
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,24 +21,38 @@ extends Broadcast[T](id) with Logging with Serializable { | |
def blockId: String = "broadcast_" + id | ||
|
||
HttpBroadcast.synchronized { | ||
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false) | ||
//Let BlockManagerMaster know that we have the broadcast block for its latter notification us to remove. | ||
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true) | ||
} | ||
|
||
if (!isLocal) { | ||
HttpBroadcast.write(id, value_) | ||
} | ||
|
||
override def rm(toClearSource: Boolean = false) { | ||
logInfo("Remove broadcast variable " + blockId) | ||
SparkEnv.get.blockManager.master.removeBlock(blockId) | ||
SparkEnv.get.blockManager.removeBlock(blockId, false) | ||
if(toClearSource){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a space after if |
||
val path: String = HttpBroadcast.broadcastDir + "/" + "broadcast-" + id | ||
HttpBroadcast.files.internalMap.remove(path) | ||
new File(path).delete() | ||
logInfo("Deleted source broadcast file '" + path + "'") | ||
} | ||
} | ||
|
||
// Called by JVM when deserializing an object | ||
private def readObject(in: ObjectInputStream) { | ||
in.defaultReadObject() | ||
HttpBroadcast.synchronized { | ||
SparkEnv.get.blockManager.getSingle(blockId) match { | ||
SparkEnv.get.blockManager.getSingleLocal(blockId) match { | ||
case Some(x) => value_ = x.asInstanceOf[T] | ||
case None => { | ||
logInfo("Started reading broadcast variable " + id) | ||
val start = System.nanoTime | ||
value_ = HttpBroadcast.read[T](id) | ||
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false) | ||
//Let BlockManagerMaster know that we have the broadcast block for its latter notification us to remove. | ||
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true) | ||
val time = (System.nanoTime - start) / 1e9 | ||
logInfo("Reading broadcast variable " + id + " took " + time + " s") | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,8 @@ extends Broadcast[T](id) with Logging with Serializable { | |
def blockId = "broadcast_" + id | ||
|
||
MultiTracker.synchronized { | ||
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, false) | ||
//Let BlockManagerMaster know that we have the broadcast block for its latter notification us to remove. | ||
SparkEnv.get.blockManager.putSingle(blockId, value_, StorageLevel.MEMORY_AND_DISK, true) | ||
} | ||
|
||
@transient var arrayOfBlocks: Array[BroadcastBlock] = null | ||
|
@@ -46,6 +47,21 @@ extends Broadcast[T](id) with Logging with Serializable { | |
if (!isLocal) { | ||
sendBroadcast() | ||
} | ||
|
||
override def rm(toClearSource: Boolean = false) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we rename this function to remove, and toClearSource to releaseSource? |
||
logInfo("Remove broadcast variable " + blockId) | ||
SparkEnv.get.blockManager.master.removeBlock(blockId) | ||
SparkEnv.get.blockManager.removeBlock(blockId, false) | ||
if(toClearSource) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use
|
||
clearBlockSource() | ||
} | ||
|
||
def clearBlockSource(){ | ||
arrayOfBlocks = null | ||
listOfSources = null | ||
serveMR = null | ||
guideMR = null | ||
} | ||
|
||
def sendBroadcast() { | ||
logInfo("Local host address: " + hostAddress) | ||
|
@@ -92,7 +108,7 @@ extends Broadcast[T](id) with Logging with Serializable { | |
private def readObject(in: ObjectInputStream) { | ||
in.defaultReadObject() | ||
MultiTracker.synchronized { | ||
SparkEnv.get.blockManager.getSingle(blockId) match { | ||
SparkEnv.get.blockManager.getSingleLocal(blockId) match { | ||
case Some(x) => | ||
value_ = x.asInstanceOf[T] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There might be a performance problem here.
Spark actually uses broadcast variable to broadcast the JobConf in HadoopRDD to avoid having that in the task closure (10KB). If every worker has to send a message back to the master, it might slow things down since every HadoopRDD we create will need to do that ...
Ideally, we should track the memory usage of broadcast variables, but I am not sure what the best way to do this is.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for reviewing my pull request, Reynold. I have followed your advice as below.
1.Change the name of methods.
Answer: I have changed their names in the last commit as you suggested.
2.Performance issues for HadoopRDD.
Answer: This is a good concern. In fact, the message size of a transferred blockinfo is not large. To make it flexible, I added a boolean parameter 'tellMaster (default is true)' for broadcast variables. When set to false, the broadcast variable will not be reported to the master and will not be removed on slave machines in a SparkContext; this is suitable for small broadcast variables. Users can choose the broadcast type as they wish.
3.Track the memory usages of broadcast.
Answer: This is a good idea. It could lead to an automatic memory cleaner for broadcast variables. Nevertheless, the purpose of this patch is to provide a broadcast-removal API to users. These two things do not conflict in essence. For memory cleanup tasks, the lesson I learned is that program-monitoring mechanisms are generally no better than having users clear memory explicitly when possible. GC cannot always run in time and it has overhead costs. Moreover, in this case, it is hard to determine whether a broadcast variable is still needed by users. On the other side, it is an issue to leave large unused broadcast variables in memory, and users have no means to handle that. Therefore, we provide an explicit broadcast-removal method to users. It solves my problem in practice.
Thank you for commenting again. I will continue thinking about the automatic cleaner based on tracking memory usage that you put forward here; it's interesting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for reviewing my pull request, Reynold @rxin . I have followed your advice as below.
1.Change the name of methods.
Answer: I have changed their names in the last commit as you suggested.
2.Performance issues for HadoopRDD.
Answer: This is a good concern. In fact, the message size of a transferred blockinfo is not large. To make it flexible, I added a boolean parameter 'tellMaster (default is true)' for broadcast variables. When set to false, the broadcast variable will not be reported to the master and will not be removed on slave machines in a SparkContext; this is suitable for small broadcast variables. Users can choose the broadcast type as they wish.
3.Track the memory usages of broadcast.
Answer: This is a good idea. It could lead to an automatic memory cleaner for broadcast variables. Nevertheless, the purpose of this patch is to provide a broadcast-removal API to users. These two things do not conflict in essence. For memory cleanup tasks, the lesson I learned is that program-monitoring mechanisms are generally no better than having users clear memory explicitly when possible. GC cannot always run in time and it has overhead costs. Moreover, in this case, it is hard to determine whether a broadcast variable is still needed by users. On the other side, it is an issue to leave large unused broadcast variables in memory, and users have no means to handle that. Therefore, we provide an explicit broadcast-removal method to users. It solves my problem in practice.
Thank you for commenting again. I will continue thinking about the automatic cleaner based on tracking memory usage that you put forward here; it's interesting.