-
Notifications
You must be signed in to change notification settings - Fork 274
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* wip. * variant with a delayed index storing * index is global now * adds random deletion also play with bin_io to get better results * updated GC.clean: now less than 2 seconds! * removed unused fields from entry type * refactoring * fixes a couple of bugs 1. call upgrade as soon as possible 2. check index file existence before mtime * renaming, everything seems to be ok now * preserves config file * refactoring * wip * added lock to GC * refactored a little * removed primus_system_config.ml * gc with lock * reverted optimization pass * fixes a couple of bugs * a couple of stylistic changes * use integer for overhead cmdline parameter * refactored a little * renaming * removes fileutils from dependencies * locked cache upgrade as well * refactoring * updated testsuite * rewrote remove * adds tests * debug in progress * few stylistic updates * refactoring, remove bap_config module * wip * uses bap_main for cmdline interface * updates tests * better init * removed old code * just a final touch * fixes tests and a sip of refactoring * fixes commands printing * fixes typos * stores overhead as an integer * updated mktmp directory * new implementation for mkdtemp * added uuidm to cache deps * moved rng initialization to the toplevel
- Loading branch information
Showing
12 changed files
with
709 additions
and
316 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,8 @@ | ||
PKG cmdliner | ||
PKG mmap | ||
PKG fileutils | ||
|
||
S . | ||
B ../../_build/plugins/cache | ||
|
||
REC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
open Core_kernel | ||
open Regular.Std | ||
open Bap.Std | ||
open Bap_cache_types | ||
|
||
include Self () | ||
|
||
module Filename = Caml.Filename | ||
module Random = Caml.Random | ||
module Utils = Bap_cache_utils | ||
|
||
(* Infix path concatenation, e.g. [dir // file]. *)
let (//) = Filename.concat

(* Global PRNG state used for temporary-directory name generation (see
   [mkdtemp]); self-initialized once at module load so that all calls
   share a single stream. *)
let rng = Random.State.make_self_init ()
|
||
module Cfg = struct

  (* Serializable view of the configuration record. *)
  module T = struct
    type t = config [@@deriving bin_io, compare, sexp]
  end

  let default = {
    capacity = 4 * 1024; (* 4 Gb *)
    (* NOTE(review): 4096 with the "4 Gb" comment implies capacity is
       measured in Mb (matching [size] below, which reports Mb) — confirm. *)
    overhead = 25;       (* percent of capacity tolerated above it *)
    gc_enabled = true;
  }

  (* Bump [version] whenever the serialized format changes, so that a
     stale config file is ignored rather than misread. *)
  let version = 3
  let config_file = sprintf "config.%d" version
  let cache_data = "data"

  (* Shadows the bare file name above with full-path builders. *)
  let config_file path = path // config_file
  let data path = path // cache_data

  (* Cache size above which GC kicks in: capacity plus the allowed
     overhead percentage of it. *)
  let gc_threshold c =
    c.capacity + (c.capacity * c.overhead / 100)

  (* Overridable cache root; see [set_root] and [root]. *)
  let default_root = ref None

  let set_root dir = default_root := Some dir

  (* Best effort: failures to store the config are logged, not raised. *)
  let write path cfg =
    try Utils.binable_to_file (module T) path cfg
    with e ->
      warning "Failed to store config file: %s" (Exn.to_string e)

  (* Falls back to [default] when the file is missing or unreadable. *)
  let read path =
    try Utils.binable_from_file (module T) path
    with e ->
      warning "Failed to read config file: %s" (Exn.to_string e);
      default

end
|
||
include Cfg | ||
|
||
(** [getenv name] looks up the environment variable [name], returning
    [None] instead of raising when it is unbound. *)
let getenv opt =
  match Sys.getenv opt with
  | value -> Some value
  | exception Caml.Not_found -> None
|
||
(** Resolves the cache root directory, in decreasing precedence:
    an explicitly set root (see [set_root]), [$XDG_CACHE_HOME],
    [$HOME/.cache/bap], and finally a path under the system temp
    directory. *)
let root () =
  match !default_root with
  | Some dir -> dir // ".cache" // "bap"
  | None ->
    match getenv "XDG_CACHE_HOME" with
    | Some cache -> cache
    | None ->
      match getenv "HOME" with
      | Some home -> home // ".cache" // "bap"
      | None -> Filename.get_temp_dir_name () // "bap" // "cache"
|
||
(** [ensure_dir_exists path] creates the directory [path] with
    permissions 0o700, treating "already exists" as success.
    Any other [Unix] error (e.g. [EACCES], [ENOENT]) propagates.
    (The former [| exn -> raise exn] arm was a redundant
    catch-and-reraise and has been dropped.) *)
let ensure_dir_exists path =
  try Unix.mkdir path 0o700
  with Unix.(Unix_error (EEXIST,_,_)) -> ()
|
||
(** [mkdir path] creates [path] and any missing parents, like
    [mkdir -p]; concurrent creation of the same directory is tolerated
    via [ensure_dir_exists]. *)
let rec mkdir path =
  let parent = Filename.dirname path in
  if not (Sys.file_exists parent) then mkdir parent;
  if not (Sys.file_exists path) then ensure_dir_exists path

(** [dir_exists dir] is [true] iff [dir] exists and is a directory. *)
let dir_exists dir =
  Sys.file_exists dir && Sys.is_directory dir
|
||
(** [mkdtemp ?mode ?tmp_dir ?prefix ?suffix ()] atomically creates a
    fresh directory with a random (UUID v4) basename and returns its
    path.  [prefix] and [suffix] are glued onto the basename itself.
    Names are re-drawn until [Unix.mkdir] succeeds, so two racing
    processes can never obtain the same directory. *)
let mkdtemp ?(mode=0o0700) ?tmp_dir ?(prefix="") ?(suffix="") () =
  let genname () = Uuidm.v4_gen rng () |> Uuidm.to_string in
  (* hoisted: the parent directory does not change between retries *)
  let tmp = match tmp_dir with
    | None -> Filename.get_temp_dir_name ()
    | Some tmp -> tmp in
  let rec create name =
    (* BUGFIX: the parts used to be joined with [Filename.dir_sep],
       which turned [prefix] and [suffix] into extra path *components*
       (and produced "tmp//name/" with the empty defaults); they must
       be concatenated into a single basename instead. *)
    let path = tmp // (prefix ^ name ^ suffix) in
    match Unix.mkdir path mode with
    | () -> path
    | exception Unix.Unix_error(Unix.EEXIST,_,_) ->
      genname () |> create in
  genname () |> create
|
||
(** [with_temp_dir path ~f] runs [f] on a fresh temporary directory
    created under [path], removing the directory (recursively)
    afterwards whether [f] returns or raises.
    (The former [mkdir tmp_dir] before [f] was redundant — [mkdtemp]
    has already created the directory — and has been dropped.) *)
let with_temp_dir path ~f =
  let tmp_dir = mkdtemp ~tmp_dir:path () in
  protect ~f:(fun () -> f tmp_dir)
    ~finally:(fun () ->
        (* NOTE(review): FileUtil is an external dependency; if dropping
           fileutils is still a goal, replace with a hand-rolled
           recursive removal over Sys.readdir/Unix.rmdir. *)
        if dir_exists tmp_dir
        then FileUtil.rm ~recurse:true [tmp_dir])
|
||
(** [mkdir_from_tmp ~target ~f path] builds a directory's contents in a
    temporary directory under [path] via [f], then atomically renames
    it onto [target].  If a concurrent process won the race and
    [target] already exists non-empty, this process's result is quietly
    discarded, making the operation idempotent across processes.
    (The former [| exn -> raise exn] arm was a redundant
    catch-and-reraise and has been dropped.) *)
let mkdir_from_tmp ~target ~f path =
  with_temp_dir path
    ~f:(fun tmp_dir ->
        f tmp_dir;
        try Unix.rename tmp_dir target
        with
        (* these errors occur if the destination exists and is not empty *)
        | Unix.(Unix_error (EEXIST,_,_))
        | Unix.(Unix_error (ENOTEMPTY,_,_)) -> ())
|
||
(* Populates a fresh cache directory at [path]: the data subdirectory
   plus a default config file.  Used as the builder callback for
   [mkdir_from_tmp]; shadowed below by the zero-argument entry point. *)
let init_cache_dir path =
  mkdir (Cfg.data path);
  Cfg.write (Cfg.config_file path) Cfg.default
|
||
(** Ensures that the cache directory exists and is fully initialized;
    safe to call concurrently from several processes, since the
    population happens in a temp dir that is atomically renamed. *)
let init_cache_dir () =
  let root = root () in
  if not (dir_exists (data root)) then begin
    let parent = Filename.dirname root in
    mkdir parent;
    mkdir_from_tmp ~target:root ~f:init_cache_dir parent
  end
|
||
(* Conveniences specialized to the resolved cache root; these shadow
   the path-taking [config_file] and [data] above. *)
let config_file () = config_file @@ root ()
let data () = data @@ root ()
let write_config cfg = Cfg.write (config_file ()) cfg
let read_config () = Cfg.read @@ config_file ()
|
||
module Upgrade = struct

  (* Legacy single-index layouts we know how to migrate, newest first;
     [find_index] honors this order. *)
  let index_versions = [2;1]
  let index_file v = sprintf "index.%d" v
  let index_files = List.map index_versions ~f:index_file

  (* The first legacy index file present under the cache root, if any. *)
  let find_index () =
    let files = List.map index_files ~f:(fun x -> root () // x) in
    List.find files ~f:Sys.file_exists

  (* Parses the version out of an index file name ("index.N").  A name
     without the "index." prefix is treated as version 1; a non-numeric
     suffix is an error. *)
  let get_version path =
    let file = Filename.basename path in
    match String.chop_prefix file "index." with
    | None -> Ok 1
    | Some v ->
      try Ok (int_of_string v)
      with _ ->
        Error (Error.of_string (sprintf "unknown version %s" v))

  (* Moves every artifact referenced by a v2 index into [dst], renaming
     each to its digest and making it read-only (0o444).  A failure to
     read the index is logged, not raised. *)
  let upgrade_from_index_v2 index dst =
    let open Compatibility.V2 in
    let rename from to_ =
      Sys.rename from to_;
      Unix.chmod to_ 0o444 in
    try
      let idx = Utils.binable_from_file (module Compatibility.V2) index in
      Map.iteri idx.entries ~f:(fun ~key ~data:{path} ->
          rename path (dst // Data.Cache.Digest.to_string key))
    with e ->
      warning "can't read entries from index version 2: %s"
        (Exn.to_string e)

  (* Shadowing wrapper: runs the v2 migration inside a temporary
     directory that is atomically renamed onto the data directory. *)
  let upgrade_from_index_v2 file =
    mkdir_from_tmp ~target:(data ())
      ~f:(upgrade_from_index_v2 file) (root ())

  (* Detects a legacy index, migrates it when we know how, and removes
     the index file so the upgrade runs at most once. *)
  let run () = match find_index () with
    | None -> ()
    | Some file ->
      begin
        match get_version file with
        | Ok 2 -> upgrade_from_index_v2 file;
        | _ -> warning "unknown index version"
      end;
      Sys.remove file
end
|
||
(** Total size of the cached data in Mb, computed by summing the file
    sizes in the data directory; entries that cannot be stat'd are
    skipped. *)
let size () =
  let dir = data () in
  let add_file total name =
    try
      let file = dir // name in
      Int64.(total + Unix.LargeFile.( (stat file).st_size ))
    with _ -> total in
  let bytes = Sys.readdir dir |> Array.fold ~init:0L ~f:add_file in
  Int64.(to_int_exn (bytes / 1024L / 1024L))
|
||
(* Module entry point: creates/initializes the cache directory tree and
   migrates any legacy index layout found there. *)
let init () =
  init_cache_dir ();
  Upgrade.run ()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
(** All operations in this module are atomic. *)

open Bap_cache_types

(** [init ()] ensures that the cache directory exists, is initialized,
    and is upgraded from any legacy on-disk layout. *)
val init : unit -> unit

(** [size ()] is the total size of the cached data, in Mb. *)
val size : unit -> int

(** [set_root dir] overrides the default cache root location. *)
val set_root : string -> unit

(** [root ()] is the resolved cache root directory. *)
val root : unit -> string

(** [data ()] is the directory holding the cached entries. *)
val data : unit -> string

(** [gc_threshold cfg] is the cache size above which GC starts to
    shrink the cache: capacity plus the allowed overhead (same units
    as [cfg.capacity]). *)
val gc_threshold : config -> int

(** [read_config ()] reads the persistent cache configuration, falling
    back to the defaults if it can't be read. *)
val read_config : unit -> config

(** [write_config cfg] stores [cfg] as the persistent configuration;
    failures are logged rather than raised. *)
val write_config : config -> unit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
(** | ||
The main goal is to delete files randomly and prioritizing larger files, | ||
but still giving the probability for all files to be deleted. | ||
Notation: | ||
1. s(i) - the size of i-th file, where i = 0..m-1 with m being the | ||
total number of files; | ||
2. Sum(x(i)) = x(0) + ... x(m-1) - is the sigma operator; | ||
3. T = Sum(s(i)) - the total size of the cache; | ||
4. p(i) = s(i)/T - the discrete probability distribution of the file
sizes in the cache, i.e. the likelihood that a randomly chosen file
from the cache has size s(i).
5. F(i) = p(i) + p(i-1) + ... + p(0) | ||
cumulative discrete distribution function (CDF). | ||
Using F(i) we can generate a random number u in the range 0..1
with a uniform random number generator, and then find such k that
F(k-1) < u <= F(k).
6. |s| = Sum(p(i) * s(i)) = (1/T) * Sum(s(i)^2) - the expected value | ||
of the size of a cache entry | ||
7. |n| = t/|s| - the expected number of deletions that we need to | ||
make to delete t bytes, e.g. if we want to delete half: | ||
|n| = T^2 / (2*Sum(s(i)^2))
Example: | ||
sizes = {4, 6, 3, 1, 6} | ||
the total size of the cache is Sum(sizes(i)) = 20 | ||
the PDF is p(i) = {4/20; 6/20; 3/20; 1/20; 6/20} | ||
and CDF is F(i) = {4/20; 10/20; 13/20; 14/20; 20/20} | ||
We don't want to use floating point: there would be too many big and
small numbers, overflows, and in the end we want an index anyway.
We will use rational numbers instead; since formulas 4. and 5. share
the same denominator (namely T), we can work with the numerators only.
On the high-level, we need to generate a random value between 0 and | ||
T, and find such k that F(k-1) < S <= F(k), the k-th file will be | ||
our candidate for removal. We can repeat sampling until we get |n| | ||
files (of course deleting the same file twice won't free twice of | ||
its size, so we had to keep in mind which files we already selected | ||
and repeat until we get |n| distinct files) | ||
Of course, we don't want to have a linear search for intervals, but | ||
we can see, that F(i) partitions the set of sizes (0...T) into m-1 | ||
subsets, so we can represent F as a finite mapping, e.g., with our | ||
example, | ||
[0,3] -> 0 | ||
[4,9] -> 1 | ||
[10,12] -> 2 | ||
[13,13] -> 3 | ||
[14,19] -> 4 | ||
Since intervals are not intersecting, we don't need to use | ||
Interval_map here, we just need to use the common Map from core | ||
with the closest_key (`Less_or_equal_to`` function. So once we | ||
generated a random size u we call for the closest_key for u and | ||
pick the associated value as the index of the file that we will | ||
delete. E.g., let's choose randomly a value from the range of | ||
0..19: if it is in the range 0..3 we will pick the first file; if
it is in the range 4..9, e.g. 5, then closest_key will return (4,1),
so we will remove the second file. So we managed to get away from
ugly floats and got the desired distribution with no rounding | ||
errors. | ||
Now, after we have selected |n| distinct files we can shuffle them and | ||
delete without worrying that some other process already deleted one | ||
of those files. All the processes are using the same sequence of | ||
pseudorandom files, so they will select approximately equal files | ||
for deletion. | ||
And finally, we don't want to make our recursive selection depend | ||
from |n|, so instead of selecting |n| files for removal we will | ||
select as many files as we need to remove requested size. | ||
*) | ||
|
||
open Core_kernel | ||
open Bap.Std | ||
|
||
include Self () | ||
|
||
module Cache = Bap_cache | ||
module CDF = Int.Map | ||
|
||
(* A cache entry as seen by the GC. *)
type entry = {
  path : string;          (* full path of the cached file *)
  size : int; (* Kb *)
}

(* Infix path concatenation. *)
let (//) = Filename.concat

(* Minimal weight of an entry in the CDF, in Kb: prevents zero-width
   intervals so that every file keeps a non-zero chance of selection. *)
let min_entry_size = 4 (* Kb *)
|
||
(** [entry dir name] describes the file [dir/name] with its size in Kb,
    or [None] if it can't be stat'd (e.g. it was concurrently removed). *)
let entry path name =
  let file = path // name in
  match Unix.( (stat file).st_size ) / 1024 with
  | size -> Some {path = file; size}
  | exception _ -> None
|
||
(** All entries currently present in the cache directory [path]. *)
let read_cache path =
  Array.filter_map (Sys.readdir path) ~f:(entry path)

(** Sum of the entry sizes, in Kb. *)
let total_size =
  Array.fold ~init:0 ~f:(fun acc {size;_} -> acc + size)
|
||
(** [cdf entries] builds the (unnormalized) cumulative distribution of
    the entry sizes as a map from F(i-1) — the total weight, in Kb, of
    entries [0..i-1] — to the index [i].  Every entry weighs at least
    [min_entry_size], so the keys are strictly increasing and the
    half-open intervals [F(i-1), F(i)) partition the whole size range
    (see the header comment). *)
let cdf entries =
  fst @@
  Array.foldi entries ~init:(Map.empty (module Int),0)
    ~f:(fun i (m,prev) e ->
        let f_i = prev + max min_entry_size e.size in
        (* labels spelled out (formerly omitted, relying on fragile
           label-omission for total applications): the key is the
           cumulative weight BEFORE entry [i], the data is the index *)
        CDF.add_exn m ~key:prev ~data:i, f_i)
|
||
(** [select entries total_size size_to_free] picks a set of distinct
    entry indexes whose cumulative size is at least [size_to_free],
    sampling with probability proportional to entry size, as described
    in the header comment.  Returns the indexes in increasing order.
    Requires [size_to_free <= total_size], otherwise the loop cannot
    terminate. *)
let select entries total_size size_to_free =
  let cdf = cdf entries in
  let rec loop indexes freed =
    if freed < size_to_free then
      let u = Random.int total_size in
      (* BUGFIX: the CDF keys are interval *lower* bounds, so the
         correct lookup is the greatest key <= u, i.e.
         [`Less_or_equal_to] (as the header comment specifies).
         [`Less_than] raised via [Option.value_exn] whenever [u = 0]
         (no key strictly below 0) and attributed every exact boundary
         hit to the previous file. *)
      let (_,i) =
        Option.value_exn (CDF.closest_key cdf `Less_or_equal_to u) in
      if Set.mem indexes i
      then loop indexes freed
      else loop (Set.add indexes i) (freed + entries.(i).size)
    else indexes in
  loop (Set.empty (module Int)) 0 |> Set.to_array
|
||
(** Best-effort removal of a cache entry: a failure (e.g. the entry was
    already deleted by a concurrent process) is logged, not raised. *)
let remove e =
  match Sys.remove e.path with
  | () -> ()
  | exception exn ->
    warning "unable to remove entry: %s" (Exn.to_string exn)

(** In-place shuffle of the selected entries, with a per-call
    self-initialized random state. *)
let shuffle fs =
  Array.permute fs ~random_state:(Random.State.make_self_init ())
|
||
(* Scales by 1024; presumably Mb -> Kb, since [capacity] appears to be
   stored in Mb — TODO confirm against Bap_cache_types. *)
let to_Kb s = s * 1024

(* The size, in Kb, down to which the cache is shrunk: the configured
   capacity minus the allowed overhead percentage, clamped at zero. *)
let lower_bound c =
  let open Bap_cache_types in
  to_Kb @@
  max 0
    (c.capacity - (c.capacity * c.overhead / 100))
|
||
(** [shrink ?by_threshold cfg] deletes randomly selected entries until
    the cache size drops to its lower bound.  With [by_threshold] set,
    shrinking triggers only once the size exceeds the GC threshold;
    otherwise any excess over the lower bound triggers it. *)
let shrink ?(by_threshold=false) cfg =
  let entries = read_cache @@ Cache.data () in
  let total = total_size entries in
  let bound = lower_bound cfg in
  let limit =
    if by_threshold
    then to_Kb @@ Cache.gc_threshold cfg
    else bound in
  if total > limit then begin
    let victims = select entries total (total - bound) in
    shuffle victims;
    Array.iter victims ~f:(fun i -> remove entries.(i))
  end
|
||
(** Removes every entry from the cache. *)
let clean () =
  read_cache (Cache.data ()) |> Array.iter ~f:remove
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
open Bap_cache_types

(** [shrink ?by_threshold cfg] reduces the total size of the cache down
    to its lower bound by deleting randomly selected entries, larger
    files being proportionally more likely to be picked.  With
    [by_threshold] set, deletion starts only once the cache size
    exceeds the GC threshold. *)
val shrink : ?by_threshold:bool -> config -> unit

(** [clean ()] removes all entries from the cache. *)
val clean : unit -> unit
Oops, something went wrong.