From 27429a1e828f23ab3c97f64f318c11d984d24f73 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 31 Oct 2024 08:08:48 -0700 Subject: [PATCH] Disable most caching for packfile objects (#854) By default, go-git maintains a 96MB LRU cache of git objects. When an object's contents are loaded, it's stored as a MemoryObject in this cache. This cache provides little benefit for the indexing access pattern, which reads each file only once. In many profiles, we also see a substantial number of allocations from these memory objects. This PR disables caching for most git objects by setting LargeObjectThreshold: 1. go-git still proactively caches packfile objects under 16KB (see smallObjectThreshold in go-git's packfile logic). Follow-up to #852. This change is also gated by the ZOEKT_ENABLE_GOGIT_OPTIMIZATION feature flag. --- gitindex/index.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gitindex/index.go b/gitindex/index.go index c4de1388..4a1af361 100644 --- a/gitindex/index.go +++ b/gitindex/index.go @@ -565,8 +565,7 @@ func indexGitRepo(opts Options, config gitIndexConfig) (bool, error) { // openRepo opens a git repository in a way that's optimized for indexing. // -// It copies the relevant logic from git.PlainOpen, and enables the filesystem KeepDescriptors option. This -// caches the packfile handles, preventing the packfile from being opened then closed on every object access. +// It copies the relevant logic from git.PlainOpen, and tweaks certain filesystem options. func openRepo(repoDir string) (*git.Repository, io.Closer, error) { fs := osfs.New(repoDir) @@ -586,7 +585,12 @@ func openRepo(repoDir string) (*git.Repository, io.Closer, error) { } s := filesystem.NewStorageWithOptions(fs, cache.NewObjectLRUDefault(), filesystem.Options{ + // Cache the packfile handles, preventing the packfile from being opened then closed on every object access KeepDescriptors: true, + // Disable caching for most objects, by setting the threshold to 1 byte. 
This avoids allocating a bunch of + // in-memory objects that are unlikely to be reused, since we only read each file once. Note: go-git still + // proactively caches objects under 16KB (see smallObjectThreshold in packfile logic). + LargeObjectThreshold: 1, }) // Because we're keeping descriptors open, we need to close the storage object when we're done.