Cache optimization (#1090)
* wip.

* variant with a delayed index storing

* index is global now

* adds random deletion

also play with bin_io to get better results

* updated GC.clean: now less than 2 seconds!

* removed unused fields from entry type

* refactoring

* fixes a couple of bugs

1. call upgrade as soon as possible
2. check index file existence before mtime

* renaming, everything seems to be ok now

* preserves config file

* refactoring

* wip

* added lock to GC

* refactored a little

* removed primus_system_config.ml

* gc with lock

* reverted optimization pass

* fixes a couple of bugs

* a couple of stylistic changes

* use integer for overhead cmdline parameter

* refactored a little

* renaming

* removes fileutils from dependencies

* locked cache upgrade as well

* refactoring

* updated testsuite

* rewrote remove

* adds tests

* debug in progress

* few stylistic updates

* refactoring, remove bap_config module

* wip

* uses bap_main for cmdline interface

* updates tests

* better init

* removed old code

* just a final touch

* fixes tests and a sip of refactoring

* fixes commands printing

* fixes typos

* stores overhead as an integer

* updated mktmp directory

* new implementation for mkdtemp

* added uuidm to cache deps

* moved rng initialization to the toplevel
gitoleg authored May 22, 2020
1 parent b020215 commit 140991b
Showing 12 changed files with 709 additions and 316 deletions.
4 changes: 2 additions & 2 deletions oasis/cache
@@ -6,7 +6,7 @@ Library cache_plugin
  Build$: flag(everything) || flag(cache)
  Path: plugins/cache
  FindlibName: bap-plugin-cache
- BuildDepends: bap, core_kernel, regular, ppx_jane, mmap
- InternalModules: Cache_main
+ BuildDepends: bap, bap-main, core_kernel, regular, ppx_jane, mmap, fileutils, uuidm
+ InternalModules: Bap_cache, Bap_cache_gc, Bap_cache_main, Bap_cache_types, Bap_cache_utils
  XMETADescription: provide caching services
  XMETAExtraLines: tags="cache"
6 changes: 6 additions & 0 deletions plugins/cache/.merlin
@@ -1,2 +1,8 @@
  PKG cmdliner
  PKG mmap
+ PKG fileutils
+
+ S .
+ B ../../_build/plugins/cache
+
+ REC
196 changes: 196 additions & 0 deletions plugins/cache/bap_cache.ml
@@ -0,0 +1,196 @@
open Core_kernel
open Regular.Std
open Bap.Std
open Bap_cache_types

include Self ()

module Filename = Caml.Filename
module Sys = Caml.Sys
module Random = Caml.Random
module Utils = Bap_cache_utils

let (//) = Filename.concat

let rng = Random.State.make_self_init ()

module Cfg = struct

  module T = struct
    type t = config [@@deriving bin_io, compare, sexp]
  end

  let default = {
    capacity = 4 * 1024; (* 4 Gb *)
    overhead = 25;
    gc_enabled = true;
  }

  let version = 3
  let config_file = sprintf "config.%d" version
  let cache_data = "data"

  let config_file path = path // config_file
  let data path = path // cache_data

  let gc_threshold c =
    c.capacity + (c.capacity * c.overhead / 100)

  let default_root = ref None

  let set_root dir = default_root := Some dir

  let write path cfg =
    try Utils.binable_to_file (module T) path cfg
    with e ->
      warning "Failed to store config file: %s" (Exn.to_string e)

  let read path =
    try Utils.binable_from_file (module T) path
    with e ->
      warning "Failed to read config file: %s" (Exn.to_string e);
      default

end

include Cfg

let getenv opt = try Some (Sys.getenv opt) with Caml.Not_found -> None

let root () =
  let root = match !default_root with
    | Some dir -> dir // ".cache" // "bap"
    | None -> match getenv "XDG_CACHE_HOME" with
      | Some cache -> cache
      | None -> match getenv "HOME" with
        | None -> Filename.get_temp_dir_name () // "bap" // "cache"
        | Some home -> home // ".cache" // "bap" in
  root

let ensure_dir_exists path =
  try
    Unix.mkdir path 0o700
  with
  | Unix.(Unix_error (EEXIST,_,_)) -> ()
  | exn -> raise exn

let rec mkdir path =
  let par = Filename.dirname path in
  if not (Sys.file_exists par) then mkdir par;
  if not (Sys.file_exists path) then
    ensure_dir_exists path

let dir_exists dir = Sys.file_exists dir && Sys.is_directory dir

let mkdtemp ?(mode=0o0700) ?tmp_dir ?(prefix="") ?(suffix="") () =
  let genname () = Uuidm.v4_gen rng () |> Uuidm.to_string in
  let rec create name =
    let tmp = match tmp_dir with
      | None -> Filename.get_temp_dir_name ()
      | Some tmp -> tmp in
    let path =
      String.concat ~sep:Filename.dir_sep [tmp; prefix; name; suffix] in
    match Unix.mkdir path mode with
    | () -> path
    | exception Unix.Unix_error(Unix.EEXIST,_,_) ->
      genname () |> create in
  genname () |> create

let with_temp_dir path ~f =
  let tmp_dir = mkdtemp ~tmp_dir:path () in
  protect ~f:(fun () ->
      mkdir tmp_dir;
      f tmp_dir)
    ~finally:(fun () ->
        if dir_exists tmp_dir
        then FileUtil.rm ~recurse:true [tmp_dir])

let mkdir_from_tmp ~target ~f path =
  with_temp_dir path
    ~f:(fun tmp_dir ->
        f tmp_dir;
        try Unix.rename tmp_dir target
        with
        (* these errors occur if the destination exists and is not empty *)
        | Unix.(Unix_error (EEXIST,_,_))
        | Unix.(Unix_error (ENOTEMPTY,_,_)) -> ()
        | exn -> raise exn)

let init_cache_dir path =
  mkdir (Cfg.data path);
  Cfg.write (Cfg.config_file path) Cfg.default

let init_cache_dir () =
  let root = root () in
  let data = data root in
  if not (dir_exists data)
  then
    let parent = Filename.dirname root in
    mkdir parent;
    mkdir_from_tmp ~target:root ~f:init_cache_dir parent

let config_file () = config_file @@ root ()
let data () = data @@ root ()
let write_config cfg = Cfg.write (config_file ()) cfg
let read_config () = Cfg.read @@ config_file ()

module Upgrade = struct

  let index_versions = [2;1]
  let index_file v = sprintf "index.%d" v
  let index_files = List.map index_versions ~f:index_file

  let find_index () =
    let files = List.map index_files ~f:(fun x -> root () // x) in
    List.find files ~f:Sys.file_exists

  let get_version path =
    let file = Filename.basename path in
    match String.chop_prefix file "index." with
    | None -> Ok 1
    | Some v ->
      try Ok (int_of_string v)
      with _ ->
        Error (Error.of_string (sprintf "unknown version %s" v))

  let upgrade_from_index_v2 index dst =
    let open Compatibility.V2 in
    let rename from to_ =
      Sys.rename from to_;
      Unix.chmod to_ 0o444 in
    try
      let idx = Utils.binable_from_file (module Compatibility.V2) index in
      Map.iteri idx.entries ~f:(fun ~key ~data:{path} ->
          rename path (dst // Data.Cache.Digest.to_string key))
    with e ->
      warning "can't read entries from index version 2: %s"
        (Exn.to_string e)

  let upgrade_from_index_v2 file =
    mkdir_from_tmp ~target:(data ())
      ~f:(upgrade_from_index_v2 file) (root ())

  let run () = match find_index () with
    | None -> ()
    | Some file ->
      begin
        match get_version file with
        | Ok 2 -> upgrade_from_index_v2 file;
        | _ -> warning "unknown index version"
      end;
      Sys.remove file
end

let size () =
  let path = data () in
  let size =
    Sys.readdir path |>
    Array.fold ~init:0L ~f:(fun s f ->
        try
          let file = path // f in
          Int64.(s + Unix.LargeFile.( (stat file).st_size ))
        with _ -> s) in
  Int64.(to_int_exn (size / 1024L / 1024L))

let init () =
  init_cache_dir ();
  Upgrade.run ()
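
The key trick in init_cache_dir above is that the cache tree is first populated inside a uniquely named temporary directory and then published with a single Unix.rename, which is atomic on POSIX filesystems, so losing the race to another process is harmless. A minimal standalone sketch of that pattern, not part of the commit (publish_atomically is an illustrative name):

(* Sketch: build content in a temp dir, then rename it into place.
   If another process published first, rename fails with EEXIST or
   ENOTEMPTY, and our copy can simply be discarded. *)
let publish_atomically ~temp ~target ~fill =
  fill temp;
  try Unix.rename temp target
  with Unix.Unix_error ((Unix.EEXIST | Unix.ENOTEMPTY), _, _) -> ()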
19 changes: 19 additions & 0 deletions plugins/cache/bap_cache.mli
@@ -0,0 +1,19 @@
(** All operations in this module are atomic. *)

open Bap_cache_types

val init : unit -> unit

val size : unit -> int

val set_root : string -> unit

val root : unit -> string

val data : unit -> string

val gc_threshold : config -> int

val read_config : unit -> config

val write_config : config -> unit
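
For concreteness, with the default configuration from bap_cache.ml (capacity = 4 * 1024, overhead = 25, in the megabyte units implied by the (* 4 Gb *) comment), gc_threshold evaluates to 4096 + 4096 * 25 / 100 = 5120, i.e., collection kicks in only once the cache outgrows its capacity by a quarter. A self-contained sketch of that arithmetic, with the record type restated here purely for illustration:

type config = {
  capacity : int;   (* Mb *)
  overhead : int;   (* percent *)
  gc_enabled : bool;
}

let gc_threshold c = c.capacity + c.capacity * c.overhead / 100

let () =
  let default = {capacity = 4 * 1024; overhead = 25; gc_enabled = true} in
  assert (gc_threshold default = 5120) (* 4096 + 1024 *)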
155 changes: 155 additions & 0 deletions plugins/cache/bap_cache_gc.ml
@@ -0,0 +1,155 @@
(**
   The main goal is to delete files randomly, prioritizing larger
   files, while still giving every file some probability of being
   deleted.

   Notation:
   1. s(i) - the size of the i-th file, where i = 0..m-1, with m being
      the total number of files;
   2. Sum(x(i)) = x(0) + ... + x(m-1) - the sigma operator;
   3. T = Sum(s(i)) - the total size of the cache;
   4. p(i) = s(i)/T - the discrete probability distribution of the
      file sizes in the cache, i.e., the likelihood that a randomly
      chosen file from the cache has size s(i);
   5. F(i) = p(i) + p(i-1) + ... + p(0) - the cumulative discrete
      distribution function (CDF). Using F(i) we can generate a random
      number u in the range 0..1 with a uniform random number
      generator and then find k such that F(k-1) < u <= F(k);
   6. |s| = Sum(p(i) * s(i)) = (1/T) * Sum(s(i)^2) - the expected size
      of a cache entry;
   7. |n| = t/|s| - the expected number of deletions needed to delete
      t bytes; e.g., if we want to delete half of the cache,
      |n| = T^2 / (2*Sum(s(i)^2)).

   Example:
   sizes = {4, 6, 3, 1, 6};
   the total size of the cache is Sum(sizes(i)) = 20;
   the PDF is p(i) = {4/20; 6/20; 3/20; 1/20; 6/20};
   and the CDF is F(i) = {4/20; 10/20; 13/20; 14/20; 20/20}.

   We don't want to use floating point - there would be too many big
   and small numbers, and overflows, and in the end we want an index
   anyway. So we use rational numbers: since formulas 4 and 5 share
   the same denominator (namely T), we can work with the numerators
   alone.

   On the high level, we need to generate a random value u between 0
   and T and find k such that F(k-1) < u <= F(k); the k-th file is
   then our candidate for removal. We repeat the sampling until we get
   |n| files (of course, deleting the same file twice won't free twice
   its size, so we have to keep track of which files we have already
   selected and repeat until we get |n| distinct files).

   Of course, we don't want a linear search for intervals, but we can
   observe that F(i) partitions the range of sizes 0...T into m
   subsets, so we can represent F as a finite mapping, e.g., with our
   example,

   [0,3]   -> 0
   [4,9]   -> 1
   [10,12] -> 2
   [13,13] -> 3
   [14,19] -> 4

   Since the intervals do not intersect, we don't need an interval map
   here; the common Map from Core with its closest_key
   (`Less_or_equal_to`) function is enough. Once we have generated a
   random size u, we ask for the key closest to u and take the
   associated value as the index of the file to delete. E.g., let's
   choose a random value in the range 0...19: if it falls in 0..3 we
   pick the first file, and if it falls in 4..9, say 5, then
   closest_key returns (4,1), so we remove the second file. Thus we
   avoid ugly floats and get the desired distribution with no rounding
   errors.

   Now, after we have selected |n| distinct files, we can shuffle them
   and delete them without worrying that some other process has
   already deleted one of them. All processes use the same sequence of
   pseudorandom files, so they will select approximately the same
   files for deletion.

   Finally, we don't want our recursive selection to depend on |n|, so
   instead of selecting |n| files for removal we select as many files
   as are needed to free the requested size.
*)

open Core_kernel
open Bap.Std

include Self ()

module Cache = Bap_cache
module CDF = Int.Map

type entry = {
  path : string;
  size : int; (* Kb *)
}

let (//) = Filename.concat

let min_entry_size = 4 (* Kb *)

let entry path name =
  try
    let path = path // name in
    let size = Unix.( (stat path).st_size ) / 1024 in
    Some {path; size;}
  with _ -> None

let read_cache path =
  Sys.readdir path |> Array.filter_map ~f:(entry path)

let total_size =
  Array.fold ~init:0 ~f:(fun s e -> s + e.size)

let cdf entries =
  fst @@
  Array.foldi entries ~init:(Map.empty (module Int),0)
    ~f:(fun i (m,prev) e ->
        let f_i = prev + max min_entry_size e.size in
        CDF.add_exn m prev i, f_i)

let select entries total_size size_to_free =
  let cdf = cdf entries in
  let rec loop indexes freed =
    if freed < size_to_free then
      let u = Random.int total_size in
      let (_,i) =
        Option.value_exn (CDF.closest_key cdf `Less_than u) in
      if Set.mem indexes i
      then loop indexes freed
      else loop (Set.add indexes i) (freed + entries.(i).size)
    else indexes in
  loop (Set.empty (module Int)) 0 |> Set.to_array

let remove e =
  try Sys.remove e.path
  with exn ->
    warning "unable to remove entry: %s" (Exn.to_string exn)

let shuffle fs =
  Array.permute ~random_state:(Random.State.make_self_init ()) fs

let to_Kb s = s * 1024

let lower_bound c =
  let open Bap_cache_types in
  to_Kb @@
  max 0
    (c.capacity - (c.capacity * c.overhead / 100))

let shrink ?(by_threshold=false) cfg =
  let entries = read_cache @@ Cache.data () in
  let total = total_size entries in
  let lower_bound = lower_bound cfg in
  let max_size =
    if by_threshold then to_Kb @@ Cache.gc_threshold cfg
    else lower_bound in
  if total > max_size then
    let selected = select entries total (total - lower_bound) in
    shuffle selected;
    Array.iter selected ~f:(fun i -> remove entries.(i))

let clean () =
  Array.iter (read_cache @@ Cache.data ()) ~f:remove
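
The worked example from the comment can be checked directly: with sizes {4; 6; 3; 1; 6} the map keys each file index by the lower bound of its interval, i.e., 0, 4, 10, 13, 14. A small self-contained sketch of that construction (using `Less_or_equal_to` as the comment describes, and omitting the min_entry_size clamp for clarity):

open Core_kernel

let sizes = [|4; 6; 3; 1; 6|]

(* Key each file index by the lower bound of its CDF interval:
   0 -> 0, 4 -> 1, 10 -> 2, 13 -> 3, 14 -> 4. *)
let cdf =
  fst @@ Array.foldi sizes ~init:(Map.empty (module Int), 0)
    ~f:(fun i (m, prev) s -> Map.add_exn m ~key:prev ~data:i, prev + s)

let () =
  (* u = 5 lies in [4,9], so closest_key returns (4,1): the second file. *)
  match Map.closest_key cdf `Less_or_equal_to 5 with
  | Some (4, 1) -> ()
  | _ -> assert false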
5 changes: 5 additions & 0 deletions plugins/cache/bap_cache_gc.mli
@@ -0,0 +1,5 @@
open Bap_cache_types

val shrink : ?by_threshold:bool -> config -> unit

val clean : unit -> unit
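
A hedged sketch of how a caller might drive this interface; the actual call site lives in Bap_cache_main, which is not shown in this excerpt:

open Bap_cache_types

(* Hypothetical driver: initialize the cache, then shrink it back
   toward capacity when GC is enabled. *)
let () =
  Bap_cache.init ();
  let cfg = Bap_cache.read_config () in
  if cfg.gc_enabled then Bap_cache_gc.shrink ~by_threshold:true cfg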