From 5b052eccc91451941d970a9df7198a84ebd02570 Mon Sep 17 00:00:00 2001 From: jjudd Date: Tue, 29 Oct 2024 14:54:38 -0600 Subject: [PATCH] Fix AnnexScalaInstance jar cache misses due to ordering and path comparison problems The jar cache in AnnexScalaInstance is a ConcurrentHashMap that uses a string representation of the compiler classpath (a list of jars) as its key. Problem is that list of jars which is used to create the key is not sorted, so you would sometimes get cache misses for the same classpath because the list had a different ordering. At a large enough scale, this would cause the JVM to run out of CodeHeap and crash. Presumably this was because we'd create up to n! AnnexScalaInstances for every AnnexScalaInstance and then classload for every single one of those instances. This change prevents those cache misses and thus prevents the worker from crashing. This should also fix cache misses with path mapping by comparing the short path, so we don't include the bazel arch/config specific parts of the path. --- .../workers/common/AnnexScalaInstance.scala | 29 +++++++++++++------ .../rules_scala/workers/common/FileUtil.scala | 24 ++++++++++++++- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/main/scala/higherkindness/rules_scala/workers/common/AnnexScalaInstance.scala b/src/main/scala/higherkindness/rules_scala/workers/common/AnnexScalaInstance.scala index 4d3af8e4..660ff306 100644 --- a/src/main/scala/higherkindness/rules_scala/workers/common/AnnexScalaInstance.scala +++ b/src/main/scala/higherkindness/rules_scala/workers/common/AnnexScalaInstance.scala @@ -7,12 +7,12 @@ import java.net.URLClassLoader import java.nio.file.{Files, Path, Paths} import java.util.Properties import java.util.concurrent.ConcurrentHashMap -import scala.collection.immutable.Map +import scala.collection.immutable.TreeMap object AnnexScalaInstance { // See the comment on getAnnexScalaInstance as to why this is necessary - private val instanceCache: ConcurrentHashMap[String, AnnexScalaInstance] = - new ConcurrentHashMap[String, AnnexScalaInstance]() + private val instanceCache: ConcurrentHashMap[Set[Path], AnnexScalaInstance] = + new ConcurrentHashMap[Set[Path], AnnexScalaInstance]() /** * We only need to care about minimizing the number of AnnexScalaInstances we create if things are being run as a @@ -48,7 +48,7 @@ object AnnexScalaInstance { * from a prior compilation in combination with a new ScalaInstance for the current compilation. Or there's some issue * with Scala's classpath caching. * - * If all three caches are enabled (this cache + Zinc claspath + Scala compiler), then the non-determinism seems go + * If all three caches are enabled (this cache + Zinc claspath + Scala compiler), then the non-determinism seems to go * away. * * If this cache is not used, but the Zinc classpath + Scala compiler caches are used, then non-determinsm shows up in @@ -67,11 +67,22 @@ object AnnexScalaInstance { * any different from leaving Zinc's cache enabled. */ private def getAnnexScalaInstance(allJars: Array[File], workDir: Path): AnnexScalaInstance = { - // We need to remove the sandbox prefix from the paths in order to compare them. + // We want to compare short paths to avoid the Bazel sandbox prefix and arch/config specific parts of the path + // We use a set because we don't care about ordering for comparison purposes val mapBuilder = Map.newBuilder[Path, Path] + val keyBuilder = Set.newBuilder[Path] + val absoluteWorkDir = workDir.toAbsolutePath().normalize() allJars.foreach { jar => - val comparableJarPath = jar.toPath().toAbsolutePath().normalize() - mapBuilder.addOne(jar.toPath -> workDir.toAbsolutePath().normalize().relativize(comparableJarPath)) + val absoluteJarPath = jar.toPath().toAbsolutePath().normalize() + + // Remove the arch/config specific parts of the path. + val comparablePath = FileUtil.bazelShortPath( + // Remove the sandbox prefix from the path + absoluteWorkDir.relativize(absoluteJarPath), + replaceExternal = false, + ) + mapBuilder.addOne(jar.toPath -> comparablePath) + keyBuilder.addOne(comparablePath) } val workRequestJarToWorkerJar = mapBuilder.result() @@ -84,10 +95,10 @@ object AnnexScalaInstance { // but that would require hashing the all the classpath jars on every compilation request. I // imagine that would cause a performance hit. // - // I also imagine it is extremeley rare to be mutating the contents of compiler classpath jars + // I also imagine it is extremely rare to be mutating the contents of compiler classpath jars // while keeping the names the same, e.g., generating new scala library jar for scala 2.13.14. // As a result I'm leaving this string based for now. - val key = workRequestJarToWorkerJar.values.mkString(":") + val key = keyBuilder.result() Option(instanceCache.get(key)).getOrElse { // Copy all the jars to the worker's directory because in a sandboxed world the diff --git a/src/main/scala/higherkindness/rules_scala/workers/common/FileUtil.scala b/src/main/scala/higherkindness/rules_scala/workers/common/FileUtil.scala index 2e6f762a..feaefabe 100644 --- a/src/main/scala/higherkindness/rules_scala/workers/common/FileUtil.scala +++ b/src/main/scala/higherkindness/rules_scala/workers/common/FileUtil.scala @@ -4,7 +4,7 @@ package workers.common import scala.annotation.tailrec import java.io.{File, IOException} import java.nio.channels.FileChannel -import java.nio.file.{FileAlreadyExistsException, FileVisitResult, Files, Path, SimpleFileVisitor, StandardCopyOption, StandardOpenOption} +import java.nio.file.{FileAlreadyExistsException, FileVisitResult, Files, Path, Paths, SimpleFileVisitor, StandardCopyOption, StandardOpenOption} import java.nio.file.attribute.BasicFileAttributes import java.util.zip.{ZipEntry, ZipInputStream, ZipOutputStream} @@ -59,6 +59,28 @@ object FileUtil { path.getFileName().toString().stripPrefix("header_").stripPrefix("processed_") } + /** + * Given a Bazel path to the bazel output directory, return a path excluding the bazel configuration specific parts of + * the path. This is analogous to Bazel's File.short_path function. Fair warning: this function is super good enough, + * but is likely not perfect. + */ + def bazelShortPath(path: Path, replaceExternal: Boolean = true): Path = { + val nameCount = path.getNameCount() + + val shortPath = if (path.startsWith("bazel-out") && nameCount >= 4) { + path.subpath(3, nameCount) + } else { + path + } + + // Handle difference between Bazel's external directory being referred to as .. in the short_path + if (replaceExternal && shortPath.subpath(0, 1) == Paths.get("external")) { + Paths.get("..").resolve(shortPath.subpath(1, shortPath.getNameCount() - 1)) + } else { + shortPath + } + } + def copy(source: Path, target: Path) = Files.walkFileTree(source, new CopyFileVisitor(source, target)) def delete(path: Path) = Files.walkFileTree(path, new DeleteFileVisitor)