From c0fc6a9b307f12bcdac47b4ecaafd4b713b9c98e Mon Sep 17 00:00:00 2001 From: liugaoji <757394026@qq.com> Date: Sat, 20 Apr 2024 10:58:14 +0800 Subject: [PATCH] add prophet-zenfs --- .gitignore | 1 + README.md | 201 +------------------- fs/fs_zenfs.cc | 454 +++++++++++++++++++++++++++++++++++++++++--- fs/fs_zenfs.h | 19 +- fs/io_zenfs.cc | 62 +++++- fs/io_zenfs.h | 14 ++ fs/metrics.h | 3 + fs/snapshot.h | 20 +- fs/zbd_zenfs.cc | 427 ++++++++++++++++++++++++++++++++++++++--- fs/zbd_zenfs.h | 119 +++++++++++- fs/zbdlib_zenfs.cc | 8 +- fs/zbdlib_zenfs.h | 5 +- fs/zonefs_zenfs.cc | 5 +- generate-version.sh | 2 +- util/zenfs.cc | 6 +- 15 files changed, 1057 insertions(+), 289 deletions(-) diff --git a/.gitignore b/.gitignore index 7b52ec0..1c20bdc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ util/zenfs +util/zenfs.dbg fs/*.o fs/*.cc.d tests/results diff --git a/README.md b/README.md index a4ce832..eb32dab 100644 --- a/README.md +++ b/README.md @@ -1,200 +1,3 @@ -# ZenFS: RocksDB Storage Backend for ZNS SSDs and SMR HDDs +# Prophet -ZenFS is a file system plugin that utilizes [RocksDB](https://github.com/facebook/rocksdb)'s FileSystem interface to -place files into zones on a raw zoned block device. By separating files into -zones and utilizing write life time hints to co-locate data of similar life -times the system write amplification is greatly reduced compared to -conventional block devices. ZenFS ensures that there is no background -garbage collection in the file system or on the disk, improving performance -in terms of throughput, tail latencies and disk endurance. - -## Community -For help or questions about zenfs usage (e.g. "how do I do X?") see below, on join us on [Matrix](https://app.element.io/#/room/#zonedstorage-general:matrix.org), or on [Slack](https://join.slack.com/t/zonedstorage/shared_invite/zt-uyfut5xe-nKajp9YRnEWqiD4X6RkTFw). - -To report a bug, file a documentation issue, or submit a feature request, please open a GitHub issue. 
- -For release announcements and other discussions, please subscribe to this repository or join us on Matrix or Slack. - -## Dependencies - -ZenFS depends on[ libzbd ](https://github.com/westerndigitalcorporation/libzbd) -and Linux kernel 5.4 or later to perform zone management operations. To use -ZenFS on SSDs with Zoned Namespaces, Linux kernel 5.9 or later is required. -ZenFS works with RocksDB version v6.19.3 or later. - -# Getting started - -## Build - -Download, build and install libzbd. See the libzbd [ README ](https://github.com/westerndigitalcorporation/libzbd/blob/master/README.md) -for instructions. - -Download rocksdb and the zenfs projects: -``` -$ git clone https://github.com/facebook/rocksdb.git -$ cd rocksdb -$ git clone https://github.com/westerndigitalcorporation/zenfs plugin/zenfs -``` - -Build and install rocksdb with zenfs enabled: -``` -$ DEBUG_LEVEL=0 ROCKSDB_PLUGINS=zenfs make -j48 db_bench install -``` - -Build the zenfs utility: -``` -$ pushd -$ cd plugin/zenfs/util -$ make -$ popd -``` - -## Configure the IO Scheduler for the zoned block device - -The IO scheduler must be set to deadline to avoid writes from being reordered. -This must be done every time the zoned name space is enumerated (e.g at boot). - -``` -echo deadline > /sys/class/block//queue/scheduler -``` - -## Creating a ZenFS file system - -Before ZenFS can be used in RocksDB, the file system metadata and superblock must be set up. -This is done with the zenfs utility, using the mkfs command. A ZenFS filesystem can be created -on either a raw zoned block device or on a zonefs filesystem on a zoned block device. 
For a raw -zoned block device, the device is specified using `--zbd=`: - -``` -./plugin/zenfs/util/zenfs mkfs --zbd= --aux_path= -``` - -If using zonefs, the zonefs file system mountpoint is specified instead using `--zonefs=`: - -``` -./plugin/zenfs/util/zenfs mkfs --zonefs= --aux_path= -``` - -In general, all operations of the zenfs utility can target either a raw block device or a zonefs mountpoint. - -When using zonefs, the zonefs volumes should be mounted with the option "explicit-open": - -``` -sudo mount -o explicit-open -``` - -## ZenFS on-disk file formats - -ZenFS Version 1.0.0 and earlier uses version 1 of the on-disk format. -ZenFS Version 2.0.0 introduces breaking on-disk-format changes (inline extents, support for zones larged than 4GB). - -To migrate between different versions of the on-disk file format, use the zenfs backup/restore commands. - -``` -# Backup the disk contents to the host file system using the version of zenfs that was used to store the current database -./plugin/zenfs/util/zenfs backup --path= --zbd= - -# Switch to the new version of ZenFS you want to use (e.g 1.0.2 -> 2.0.0), rebuild and create a new file system -# Remove the current aux folder if needed. -./plugin/zenfs/util/zenfs mkfs --force --zbd= --aux_path= - -# Restore the database files to the new version of the file system -./plugin/zenfs/util/zenfs restore --path= --zbd= - -``` - -Likewise, it is possible to migrate between a raw zoned block device and a zonefs filesystem by using backup on one -and restore on the other. One thing to be aware of is that for a given block device, zonefs will expose one zone less -to zenfs as the zonefs formatting will consume one zone for the zonefs superblock. - -## Testing with db_bench - -To instruct db_bench to use zenfs on a specific zoned block device, the --fs_uri parameter is used. 
-The device name may be used by specifying `--fs_uri=zenfs://dev:` for a raw -block device, `--fs_uri=zenfs://zonefs:` for a zonefs mountpoint or by specifying -a unique identifier for the created file system by specifying `--fs_uri=zenfs://uuid:`. UUIDs -can be listed using `./plugin/zenfs/util/zenfs ls-uuid` - -``` -./db_bench --fs_uri=zenfs://dev: --benchmarks=fillrandom --use_direct_io_for_flush_and_compaction - -``` - -## Performance testing - -If you want to use db_bench for testing zenfs performance, there is a a convenience script -that runs the 'long' and 'quick' performance test sets with a good set of parameters -for the drive. - -`cd tests; ./zenfs_base_performance.sh [ ]` - - -## Crashtesting -To run the crashtesting scripts, Python3 is required. -Crashtesting is done through the modified db_crashtest.py -(original [db_crashtest.py](https://github.com/facebook/rocksdb/blob/main/tools/db_crashtest.py)). -It kills the DB at a random point in time (blackbox) or at predefined places -in the RocksDB code (whitebox) and checks for recovery. -For further reading visit the RocksDB [wiki](https://github.com/facebook/rocksdb/wiki/Stress-test). -However the goal for ZenFS crashtesting is to cover a specified set of -parameters rather than randomized continuous testing. - -The convenience script can be used to run all crashtest sets defined in `tests/crashtest`. -``` -cd tests; ./zenfs_base_crashtest.sh -``` - -## Prometheus Metrics Exporter - -To export performance metrics to Prometheus, do the following: - -Set environment variable ZENFS_EXPORT_PROMETHEUS=y when building to enable -prometheus export of metrics. Exporter will listen on 127.0.0.1:8080. - -**Requires prometheus-cpp-pull == 1.1.0** - -# ZenFS Internals - -## Architecture overview -![zenfs stack](https://user-images.githubusercontent.com/447288/84152469-fa3d6300-aa64-11ea-87c4-8a6653bb9d22.png) - -ZenFS implements the FileSystem API, and stores all data files on to a raw -zoned block device. 
Log and lock files are stored on the default file system -under a configurable directory. Zone management is done through libzbd and -ZenFS io is done through normal pread/pwrite calls. - -## File system implementation - -Files are mapped into into a set of extents: - -* Extents are block-aligned, continuous regions on the block device -* Extents do not span across zones -* A zone may contain more than one extent -* Extents from different files may share zones - -### Reclaim - -ZenFS is exceptionally lazy at current state of implementation and does -not do any garbage collection whatsoever. As files gets deleted, the used -capacity zone counters drops and when it reaches zero, a zone can be reset -and reused. - -### Metadata - -Metadata is stored in a rolling log in the first zones of the block device. - -Each valid meta data zone contains: - -* A superblock with the current sequence number and global file system metadata -* At least one snapshot of all files in the file system -* Incremental file system updates (new files, new extents, deletes, renames etc) - -# Contribution Guide - -ZenFS uses clang-format with Google code style. You may run the following commands -before submitting a PR. - -```bash -clang-format-11 -n -Werror --style=file fs/* util/zenfs.cc # Check for style issues -clang-format-11 -i --style=file fs/* util/zenfs.cc # Auto-fix the style issues -``` +Prophet-ZenFS should build with Prophet-RocksDB. 
The step is shown in [Prophet-RocksDB](https://github.com/asu-idi/prophet-rocksdb/blob/main/README.md) diff --git a/fs/fs_zenfs.cc b/fs/fs_zenfs.cc index fc2cd52..eb1d113 100644 --- a/fs/fs_zenfs.cc +++ b/fs/fs_zenfs.cc @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -25,14 +26,32 @@ #include "metrics_prometheus.h" #endif #include "rocksdb/utilities/object_registry.h" +// #include "rocksdb/io_status.h" +// #include "monitoring/iostats_context_imp.h" +// #include "monitoring/thread_status_util.h" +#include "db/db_impl/db_impl.h" #include "snapshot.h" #include "util/coding.h" #include "util/crc32c.h" +// #include "zbdlib_zenfs.h" #define DEFAULT_ZENV_LOG_PATH "/tmp/" namespace ROCKSDB_NAMESPACE { +extern uint64_t GetIOSTATS(); + +extern bool DoPreCompaction(std::vector file_list, int ENABLE_LIMIT_LEVEL, int MAX_LIFETIME); +extern int get_bg_compaction_scheduled_(); +extern void set_write_amplification(double wp); +extern void set_write_amplification_no_set(double wp); +extern void set_reset_num(int reset_num); +extern void set_allocated_num(int allocated_zone_num); +extern int get_clock(); + +uint64_t write_size_calc; +uint64_t write_size_calc_no_reset; + Status Superblock::DecodeFrom(Slice* input) { if (input->size() != ENCODED_SIZE) { return Status::Corruption("ZenFS Superblock", @@ -48,6 +67,7 @@ Status Superblock::DecodeFrom(Slice* input) { GetFixed32(input, &block_size_); GetFixed32(input, &zone_size_); GetFixed32(input, &nr_zones_); + nr_zones_ = std::min(static_cast(ZoneNumber), nr_zones_); GetFixed32(input, &finish_treshold_); memcpy(&aux_fs_path_, input->data(), sizeof(aux_fs_path_)); input->remove_prefix(sizeof(aux_fs_path_)); @@ -125,10 +145,12 @@ Status Superblock::CompatibleWith(ZonedBlockDevice* zbd) { "Error: block size missmatch"); if (zone_size_ != (zbd->GetZoneSize() / block_size_)) return Status::Corruption("ZenFS Superblock", "Error: zone size missmatch"); - if (nr_zones_ > zbd->GetNrZones()) + if (nr_zones_ > 
zbd->GetNrZones()) { + printf("DEBUG nr_zones=%d zbd->GetNrZones()=%d\n", nr_zones_, + zbd->GetNrZones()); return Status::Corruption("ZenFS Superblock", "Error: nr of zones missmatch"); - + } return Status::OK(); } @@ -269,18 +291,49 @@ ZenFS::~ZenFS() { delete zbd_; } -void ZenFS::GCWorker() { +const int SLEEP_TIME = 1000 * 1000; +int reset_zone_num = 0; +int allocated_zone_num = 0; +int pre_compaction_num; +int precompaction_file_num; +uint64_t total_file_num = 0; +uint64_t total_size = 0; +uint64_t total_extents = 0; +uint64_t GC_num = 0; +int case0 = 0, case1 = 0, case2 = 0, case3 = 0, case4 = 0, case5 = 0; +bool check_gced(std::vector &file_list, std::map &has_migrated) { + for(auto &x: file_list) { + if((!has_migrated.empty()) && has_migrated.find(x) != has_migrated.end()) { + return 1; + } + } + return 0; +} +uint64_t pre_fail_id = -1; +void ZenFS::MyGCWorker() { + uint32_t gc_times = 0; + uint32_t running = 0; + std::map has_migrated; while (run_gc_worker_) { - usleep(1000 * 1000 * 10); - + set_write_amplification(1.0 * write_size_calc / GetIOSTATS()); + set_write_amplification_no_set(1.0 * write_size_calc_no_reset / GetIOSTATS()); + set_reset_num(reset_zone_num); + set_allocated_num(allocated_zone_num); + usleep(SLEEP_TIME); uint64_t non_free = zbd_->GetUsedSpace() + zbd_->GetReclaimableSpace(); uint64_t free = zbd_->GetFreeSpace(); + printf("Used=%ld Rec=%ld Free=%ld FreePercent=%ld\n", zbd_->GetUsedSpace(), zbd_->GetReclaimableSpace(), zbd_->GetFreeSpace(), (100 * free) / (free + non_free)); uint64_t free_percent = (100 * free) / (free + non_free); ZenFSSnapshot snapshot; ZenFSSnapshotOptions options; - - if (free_percent > GC_START_LEVEL) continue; - + if(free_percent > GC_STOP_LEVEL) { + if(running) running = 0; + continue; + } + if (free_percent < GC_START_LEVEL && (!running)) { + running = 1; + } + if(!running) continue; options.zone_ = 1; options.zone_file_ = 1; options.log_garbage_ = 1; @@ -289,16 +342,130 @@ void ZenFS::GCWorker() { uint64_t 
threshold = (100 - GC_SLOPE * (GC_START_LEVEL - free_percent)); std::set migrate_zones_start; + + sort(snapshot.zones_.begin(), snapshot.zones_.end(), + [](ZoneSnapshot& a, ZoneSnapshot& b) { + if (a.capacity == 0 && b.capacity == 0) { + return (100 - 100 * a.used_capacity / a.max_capacity) > + (100 - 100 * b.used_capacity / b.max_capacity); + } else { + return a.capacity < b.capacity; + } + }); + + // uint64_t greedy_zone_id = 0; + std::vector greedy_zone_id; + std::vector used_cap; + std::vector hint_list; + std::vector type_list; + uint64_t migrate_file_num = 0; + uint64_t migrate_size = 0; + std::vector > lifetime_list_v; + std::vector > prediction_lifetime_list_v; + std::vector > hint_num_v; + + printf( + "GC Work Start %d threshold=%ld free_percent=%ld free=%ld non_free=%ld " + "GC_START_LEVEL=%ld\n", + gc_times, threshold, free_percent, free, non_free, GC_START_LEVEL); + uint64_t tot = 0; + + for (const auto& zone : snapshot.zones_) { + std::vector& file_list = zone_file_list[zone.start]; + // std::vector>& file_list_all = zone_file_list_all[zone.start]; + if (zone.capacity == 0 && zone.lifetime_ != 0) { + printf( + "zone: zone_start=%ld zone_id=%ld HINT=%d L=%ld R=%ld capacity=%ld " + "used_capacity=%ld max_capacity=%ld file_list_size=%ld\n", + zone.start, zone.id, zone.lifetime_, zone.min_lifetime, zone.max_lifetime, + zone.capacity / MB, zone.used_capacity / MB, zone.max_capacity / MB, + file_list.size()); + + } + } + + bool PreCompaction = 0; for (const auto& zone : snapshot.zones_) { - if (zone.capacity == 0) { - uint64_t garbage_percent_approx = - 100 - 100 * zone.used_capacity / zone.max_capacity; - if (garbage_percent_approx > threshold && - garbage_percent_approx < 100) { - migrate_zones_start.emplace(zone.start); + std::vector file_list = zone_file_list[zone.start]; + std::vector>& file_list_all = zone_file_list_all[zone.start]; + + if (zone.capacity == 0 && zone.lifetime_ != 0) { + + migrate_zones_start.emplace(zone.start); + + bool control_flag; 
+ // printf("zone.max_capacity=%ld", zone.max_capacity); + if(zone.used_capacity == 0) { + printf("Case0 used capactiy = 0 %d\n", ++case0); + control_flag = 0; + + } else if(ENABLE_CASE1 && get_bg_compaction_scheduled_() == 0) { + printf("Case1 no compaction %d\n", ++case1); + control_flag = 1; + } else if(ENABLE_CASE2 && check_gced(file_list, has_migrated)) { + printf("Case 2 gced before %d\n", ++case2); + control_flag = 1; + } else if(ENABLE_LIMIT_LEVEL) { + printf("Case 5 limit level %d\n", ++case5); + control_flag = 1; + } else if(1.0 * zone.used_capacity / zone.max_capacity <= GC_THRESHOLD){ + printf("Case 3 GC %d %lf\n", ++case3, 1.0 * zone.used_capacity / zone.max_capacity); + control_flag = 0; + } else { + printf("Case 4 Compensation %d %lf\n", ++case4, 1.0 * zone.used_capacity / zone.max_capacity); + control_flag = 1; } + printf("PreCompaction FileList zone_id=%ld pre_zone_id=%ld: ", zone.id, pre_fail_id); + for(auto &x: file_list) printf("%ld ", x); + puts(""); + if(ENABLE_PRECOMPACTION && zone.id != pre_fail_id && zone.used_capacity != 0 && file_list.size() != 0 && control_flag == 1) { + if( DoPreCompaction(file_list, ENABLE_LIMIT_LEVEL, MAX_LIFETIME)) { + pre_compaction_num++; + printf("DoPreCompaction %d zone_id=%ld file_list.size()=%ld\n", pre_compaction_num, zone.id, file_list.size()); + for (auto& x : file_list_all) { + ZoneFile& file = *x; + printf("file_id=%ld lifetime=%ld\n", file.GetID(), file.new_lifetime); + } + puts(""); + migrate_zones_start.emplace(zone.start); + PreCompaction = 1; + Status s = zbd_->ResetTartetUnusedIOZones(zone.id); + if(!s.ok()) { + printf("ERROR: ResetZoneIn PreCompaction"); + } + } else { + printf("DoPreCompaction is False\n"); + migrate_zones_start.emplace(zone.start); + } + pre_fail_id = zone.id; + printf("Update PreCompaction zone id=%ld\n", pre_fail_id); + } + zone_file_list[zone.start].clear(); + zone_file_list_all[zone.start].clear(); + used_cap.emplace_back(zone.used_capacity / MB); + 
greedy_zone_id.emplace_back(zone.id); + lifetime_list_v.emplace_back(zone.lifetime_list); + hint_list.emplace_back(zone.lifetime_); + type_list.emplace_back(zone.lifetime_type); + hint_num_v.emplace_back(zone.hint_num); + prediction_lifetime_list_v.emplace_back(zone.prediction_lifetime_list); + tot++; + if(!PreCompaction) { + + for(auto &x: file_list) { + if((!has_migrated.empty()) && has_migrated.find(x) != has_migrated.end()) { + printf("file=%ld has been migrated in time=%d\n", x, has_migrated[x]); + } else { + has_migrated[x] = gc_times; + } + } + migrate_size += zone.used_capacity; + migrate_file_num += file_list.size(); + } + if (tot == K) break; } } + if(PreCompaction) continue; std::vector migrate_exts; for (auto& ext : snapshot.extents_) { @@ -307,14 +474,100 @@ void ZenFS::GCWorker() { migrate_exts.push_back(&ext); } } + if (migrate_exts.size() > 0) + GC_num++; + if (migrate_zones_start.size() > 0) { + printf("GC Begin %d GC=%ld Compensation=%d precompaction_file_num=%d clock=%d ", ++gc_times, GC_num, pre_compaction_num, precompaction_file_num, get_clock()); + printf( + "total_size=%ld free=%ld drive_io=%ld rocks_io=%ld total_extents=%ld total_file_num=%ld zone_size=%ld" + "reset_zone_num=%d migrate_exts=%ld migrate_file_num=%ld migrate_size=%ld case0=%d case1=%d case2=%d case3=%d case4=%d case5=%d\n", + total_size / MB, zbd_->GetFreeSpace(), write_size_calc_no_reset, GetIOSTATS(), total_extents, total_file_num, migrate_zones_start.size(), + reset_zone_num, migrate_exts.size(), migrate_file_num, migrate_size / MB, case0, case1, case2, case3, case4, case5); + + + for(uint64_t i = 0; i < greedy_zone_id.size(); i++) { + auto &lifetime_list = lifetime_list_v[i]; + auto &prediction_lifetime_list = prediction_lifetime_list_v[i]; + auto &hint_num = hint_num_v[i]; + std::sort(lifetime_list.begin(), lifetime_list.end()); + std::sort(prediction_lifetime_list.begin(), prediction_lifetime_list.end()); + + if (!lifetime_list.empty()) { + printf( + "zone_id=%ld 
diff=%ld HINT=%d type=%d Real_list size=%ld used=%ld min_time=%ld " + "max_time=%ld [", + greedy_zone_id[i], lifetime_list[lifetime_list.size() - 1] - lifetime_list[0], hint_list[i], type_list[i], + lifetime_list.size(), used_cap[i], lifetime_list[0], + lifetime_list[lifetime_list.size() - 1]); + for (auto& x : lifetime_list) { + printf("%ld ", x); + } + printf("]\n"); + } else { + printf("ERROR: lifetime_list is empty\n"); + } + + if(MYMODE == true) { + if (!prediction_lifetime_list.empty()) { + printf( + "Zone_id=%ld diff=%ld HINT=%d type=%d Pred_list size=%ld used=%ld min_lifetime=%ld " + "max_lifetime=%ld[", + greedy_zone_id[i], prediction_lifetime_list[prediction_lifetime_list.size() - 1] - + prediction_lifetime_list[0], hint_list[i], type_list[i], prediction_lifetime_list.size(), used_cap[i], + prediction_lifetime_list[0], + prediction_lifetime_list[prediction_lifetime_list.size() - 1]); + for (auto& x : prediction_lifetime_list) { + printf("%ld ", x); + } + printf("]\n"); + } else { + printf("ERROR: lifetime_list is empty\n"); + } + } else { + printf( + "Zone_id=%ld diff=%ld HINT=%d Pred_list size=%ld used=%ld\n", + greedy_zone_id[i], prediction_lifetime_list[prediction_lifetime_list.size() - 1] - + prediction_lifetime_list[0], hint_list[i], prediction_lifetime_list.size(), used_cap[i]); + for (auto& x : hint_num) { + printf("key=%d value=%d\n", x.first, x.second); + } + printf("["); + for (auto& x : prediction_lifetime_list) { + printf("%ld ", x); + } + printf("]\n"); + } + + + } + + + + } + + if (migrate_exts.size() == 0 && greedy_zone_id.size() != 0) { + for(auto &x: greedy_zone_id) { + IOStatus s; + s = zbd_->ResetTartetUnusedIOZones(x); + if (!s.ok()) { + Error(logger_, "Garbage collection failed"); + } + } + } if (migrate_exts.size() > 0) { IOStatus s; Info(logger_, "Garbage collecting %d extents \n", (int)migrate_exts.size()); - s = MigrateExtents(migrate_exts); + s = GreedyMigrateExtents(migrate_exts, greedy_zone_id); + if (!s.ok()) { Error(logger_, 
"Garbage collection failed"); + printf("GC failed"); + } else { + total_file_num += migrate_file_num; + total_size += migrate_size; + total_extents += migrate_exts.size(); } } } @@ -340,7 +593,6 @@ std::string ZenFS::FormatPathLexically(fs::path filepath) { void ZenFS::LogFiles() { std::map>::iterator it; - uint64_t total_size = 0; Info(logger_, " Files:\n"); for (it = files_.begin(); it != files_.end(); it++) { @@ -577,7 +829,10 @@ IOStatus ZenFS::DeleteFileNoLock(std::string fname, const IOOptions& options, } else { if (zoneFile->GetNrLinks() > 0) return s; /* Mark up the file as deleted so it won't be migrated by GC */ + zoneFile->SetDeleted(); + printf("delete_file_no_lock set_id=%ld is_deleted=%d\n", + zoneFile->GetID(), zoneFile->IsDeleted()); zoneFile.reset(); } } else { @@ -634,6 +889,7 @@ IOStatus ZenFS::NewWritableFile(const std::string& filename, const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* /*dbg*/) { + printf("NewWritableFile Called %s\n", filename.c_str()); std::string fname = FormatPathLexically(filename); Debug(logger_, "New writable file: %s direct: %d\n", fname.c_str(), file_opts.use_direct_writes); @@ -641,6 +897,79 @@ IOStatus ZenFS::NewWritableFile(const std::string& filename, return OpenWritableFile(fname, file_opts, result, nullptr, false); } +//0 prediction lifetime +//1 real lifetime +IOStatus ZenFS::SetFileLifetime(std::string fname, uint64_t lifetime, + int clock, bool flag, int level, std::vector overlap_list) { + global_clock = clock; + const uint64_t MAX = 1e9; + if (lifetime > MAX) { + lifetime = 0; //实际上这个可以走WRITE_LIFETIME_HINT + } + std::string f = FormatPathLexically(fname); + if (files_.find(f) == files_.end()) { + printf("SetFileLifetime Fail file_name=%s fname=%s lifetime=%ld flag=%d\n", + f.c_str(), fname.c_str(), lifetime, flag); + return IOStatus::IOError("Can't find file:" + fname); + } else { + + std::shared_ptr tmp = files_[f]; + uint64_t lifetime_list_size = 0; + if (!flag) { + tmp->new_lifetime = 
lifetime; + tmp->new_type = (level <= SHORT_THE) ? 0 : 1; + tmp->level = level; + if (tmp->GetActiveZone() != NULL) { + printf("ERROR: ZoneFile has actived file_id=%ld zone_id=%ld\n", + tmp->GetID(), tmp->GetActiveZone()->id); + } + for(auto &x: overlap_list) { + std::string name = FormatPathLexically(x); + if(files_.find(name) == files_.end()) { + printf("ERROR: can't find overlap file\n"); + continue; + } + std::shared_ptr overlap_f_ptr = files_[name]; + for (auto* zone : zbd_->get_io_zones()) { + if (zone->id == overlap_f_ptr->zone_id) { + tmp->overlap_zone_list.emplace_back(zone->id); + + } + } + } + + + } else { + + if (tmp->zone_id != 0) { + for (auto* zone : zbd_->get_io_zones()) { + // printf("zone_information zone_id=%ld zone_capacity=%ld + // zone_max_capacity=%ld zone_used_capacity=%ld\n", zone->id, + // zone->capacity_, zone->max_capacity_, zone->used_capacity_.load()); + if (zone->id == tmp->zone_id) { + zone->lifetime_list.emplace_back(lifetime); + lifetime_list_size = zone->lifetime_list.size(); + break; + } + } + if (tmp->GetActiveZone() == NULL) { + printf("Flag == 1 But GetActiveZone() is NULL\n"); + } + } else { + printf("ERROR: Flag = 1 GetActiveZone is NULL\n"); + } + + } + printf( + "SetFileLifetime Success name=%s get_io_zones_size=%ld " + "lifetime_list_size=%ld set_zone_id=%ld set_file_id=%ld lifetime=%ld " + "flag=%d level=%d over_list.size=%ld new_type=%d\n", + f.c_str(), zbd_->get_io_zones().size(), lifetime_list_size, + tmp->zone_id, files_[f]->GetID(), lifetime, flag, level, overlap_list.size(), tmp->new_type); + return IOStatus::OK(); + } +} + IOStatus ZenFS::ReuseWritableFile(const std::string& filename, const std::string& old_filename, const FileOptions& file_opts, @@ -848,11 +1177,12 @@ IOStatus ZenFS::OpenWritableFile(const std::string& filename, std::make_shared(zbd_, next_file_id_++, &metadata_writer_); zoneFile->SetFileModificationTime(time(0)); zoneFile->AddLinkName(fname); - + zoneFile->debug_fname = fname; /* RocksDB does 
not set the right io type(!)*/ if (ends_with(fname, ".log")) { zoneFile->SetIOType(IOType::kWAL); zoneFile->SetSparse(!file_opts.use_direct_writes); + zoneFile->new_type = (SHORT_THE == -1) ? 1 : 0; } else { zoneFile->SetIOType(IOType::kUnknown); } @@ -880,6 +1210,12 @@ IOStatus ZenFS::DeleteFile(const std::string& fname, const IOOptions& options, IOStatus s; Debug(logger_, "DeleteFile: %s \n", fname.c_str()); + std::string f = FormatPathLexically(fname); + printf("delete file called %s %s\n", fname.c_str(), f.c_str()); + + if (files_.find(f) != files_.end() && files_[f].get() != nullptr) { + printf("file_delete_id=%ld\n", files_[f]->GetID()); + } files_mtx_.lock(); s = DeleteFileNoLock(fname, options, dbg); @@ -1479,11 +1815,11 @@ Status ZenFS::Mount(bool readonly) { if (!status.ok()) return status; Info(logger_, " Done"); - if (superblock_->IsGCEnabled()) { - Info(logger_, "Starting garbage collection worker"); - run_gc_worker_ = true; - gc_worker_.reset(new std::thread(&ZenFS::GCWorker, this)); - } + // if (superblock_->IsGCEnabled()) { + Info(logger_, "Starting garbage collection worker"); + run_gc_worker_ = true; + gc_worker_.reset(new std::thread(&ZenFS::MyGCWorker, this)); + //} } LogFiles(); @@ -1566,7 +1902,7 @@ std::map ZenFS::GetWriteLifeTimeHints() { return hint_map; } -#if !defined(NDEBUG) || defined(WITH_TERARKDB) +//#if !defined(NDEBUG) || defined(WITH_TERARKDB) static std::string GetLogFilename(std::string bdev) { std::ostringstream ss; time_t t = time(0); @@ -1578,7 +1914,7 @@ static std::string GetLogFilename(std::string bdev) { return ss.str(); } -#endif +//#endif Status NewZenFS(FileSystem** fs, const std::string& bdevname, std::shared_ptr metrics) { @@ -1596,17 +1932,17 @@ Status NewZenFS(FileSystem** fs, const ZbdBackendType backend_type, // // TODO(guokuankuan@bytedance.com) We need to figure out how to reuse // RocksDB's logger in the future. 
-#if !defined(NDEBUG) || defined(WITH_TERARKDB) + //#if !defined(NDEBUG) || defined(WITH_TERARKDB) s = Env::Default()->NewLogger(GetLogFilename(backend_name), &logger); if (!s.ok()) { fprintf(stderr, "ZenFS: Could not create logger"); } else { logger->SetInfoLogLevel(DEBUG_LEVEL); -#ifdef WITH_TERARKDB + //#ifdef WITH_TERARKDB logger->SetInfoLogLevel(INFO_LEVEL); -#endif + //#endif } -#endif + //#endif ZonedBlockDevice* zbd = new ZonedBlockDevice(backend_name, backend_type, logger, metrics); @@ -1714,16 +2050,42 @@ void ZenFS::GetZenFSSnapshot(ZenFSSnapshot& snapshot, if (options.zone_) { zbd_->GetZoneSnapshot(snapshot.zones_); } + + zone_file_list.clear(); + zone_file_list_all.clear(); if (options.zone_file_) { std::lock_guard file_lock(files_mtx_); + for (const auto& file_it : files_) { ZoneFile& file = *(file_it.second); + // if(file.GetActiveZone() != nullptr) { + + // printf("file.GetActiveZone() is not nullptr address=%ld file_id=%ld + // zone_id=%ld zone_start=%ld\n", + // &file, file.GetID(), file.GetActiveZone()->id, + // file.zone_begin); + // } else { + // if(file.GetZbd() == nullptr) { + // printf("file.GetZbd() is also null"); + // } + // printf("file.GetActiveZone() is nullptr address=%ld file_id=%ld + // is_deleted=%d is_openwr=%d new_lifetime=%ld file_size=%ld\n", &file, + // file.GetID(), file.IsDeleted(), file.IsOpenForWR(), + // file.new_lifetime, file.zone_begin); + // } + /* Skip files open for writing, as extents are being updated */ if (!file.TryAcquireWRLock()) continue; - // file -> extents mapping + zone_file_list[file.zone_begin].emplace_back(file.GetID() - 7); + zone_file_list_all[file.zone_begin].emplace_back(file_it.second); + // printf("file_information migrate_file_id=%ld is_deleted=%d is_openwr=%d + // zone_begin=%ld zone_id=%ld\n", file.GetID(), file.IsDeleted(), + // file.IsOpenForWR(), file.zone_begin, file.zone_id); + // file -> extents mapping snapshot.zone_files_.emplace_back(file); + // extent -> file mapping for (auto* ext : 
file.GetExtents()) { snapshot.extents_.emplace_back(*ext, file.GetFilename()); @@ -1731,6 +2093,8 @@ void ZenFS::GetZenFSSnapshot(ZenFSSnapshot& snapshot, file.ReleaseWRLock(); } + printf("All files number=%ld zone_files_=%ld has_actived_zone_size=%ld\n", + files_.size(), snapshot.zone_files_.size(), zone_file_list.size()); } if (options.trigger_report_) { @@ -1758,7 +2122,30 @@ IOStatus ZenFS::MigrateExtents( for (const auto& it : file_extents) { s = MigrateFileExtents(it.first, it.second); if (!s.ok()) break; - s = zbd_->ResetUnusedIOZones(); + s = zbd_->MyResetUnusedIOZones(); + if (!s.ok()) break; + } + return s; +} +IOStatus ZenFS::GreedyMigrateExtents( + const std::vector& extents, std::vector zone_id) { + IOStatus s; + // Group extents by their filename + std::map> file_extents; + for (auto* ext : extents) { + std::string fname = ext->filename; + // We only migrate SST file extents + if (ends_with(fname, ".sst")) { + file_extents[fname].emplace_back(ext); + } + } + + for (const auto& it : file_extents) { + s = MigrateFileExtents(it.first, it.second); + if (!s.ok()) break; + } + for(auto &x: zone_id) { + s = zbd_ -> ResetTartetUnusedIOZones(x); if (!s.ok()) break; } return s; @@ -1806,9 +2193,14 @@ IOStatus ZenFS::MigrateFileExtents( Zone* target_zone = nullptr; + + + + + // Allocate a new migration zone. 
s = zbd_->TakeMigrateZone(&target_zone, zfile->GetWriteLifeTimeHint(), - ext->length_); + ext->length_, zfile->new_lifetime, zfile->new_type); if (!s.ok()) { continue; } diff --git a/fs/fs_zenfs.h b/fs/fs_zenfs.h index 41abc9d..c63ac6b 100644 --- a/fs/fs_zenfs.h +++ b/fs/fs_zenfs.h @@ -28,6 +28,9 @@ namespace fs = std::filesystem; namespace ROCKSDB_NAMESPACE { +extern int global_clock; + + #if !defined(ROCKSDB_LITE) && defined(OS_LINUX) class ZoneSnapshot; @@ -62,6 +65,7 @@ class Superblock { */ Superblock(ZonedBlockDevice* zbd, std::string aux_fs_path = "", uint32_t finish_threshold = 0, bool enable_gc = false) { + std::string uuid = Env::Default()->GenerateUniqueId(); int uuid_len = std::min(uuid.length(), @@ -76,8 +80,9 @@ class Superblock { block_size_ = zbd->GetBlockSize(); zone_size_ = zbd->GetZoneSize() / block_size_; - nr_zones_ = zbd->GetNrZones(); + nr_zones_ = zbd->GetNrZones(); + printf("DEBUG zbd->GetNrZones() called %d\n", nr_zones_); strncpy(aux_fs_path_, aux_fs_path.c_str(), sizeof(aux_fs_path_) - 1); std::string zenfs_version = ZENFS_VERSION; @@ -138,6 +143,8 @@ class ZenFS : public FileSystemWrapper { std::mutex files_mtx_; std::shared_ptr logger_; std::atomic next_file_id_; + std::map >zone_file_list; //zone_begin -> file_id + std::map>> zone_file_list_all; //zone_begin -> file Zone* cur_meta_zone_ = nullptr; std::unique_ptr meta_log_; @@ -147,7 +154,7 @@ class ZenFS : public FileSystemWrapper { std::shared_ptr GetLogger() { return logger_; } std::unique_ptr gc_worker_ = nullptr; - bool run_gc_worker_ = false; + bool run_gc_worker_ = true; struct ZenFSMetadataWriter : public MetadataWriter { ZenFS* zenFS; @@ -303,6 +310,8 @@ class ZenFS : public FileSystemWrapper { const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) override; + virtual IOStatus SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list); virtual IOStatus ReuseWritableFile(const std::string& fname, 
const std::string& old_fname, const FileOptions& file_opts, @@ -452,16 +461,16 @@ class ZenFS : public FileSystemWrapper { const ZenFSSnapshotOptions& options); IOStatus MigrateExtents(const std::vector& extents); + IOStatus GreedyMigrateExtents(const std::vector& extents, std::vectorzone_id); + IOStatus MigrateFileExtents( const std::string& fname, const std::vector& migrate_exts); private: - const uint64_t GC_START_LEVEL = - 20; /* Enable GC when < 20% free space available */ const uint64_t GC_SLOPE = 3; /* GC agressiveness */ - void GCWorker(); + void MyGCWorker(); }; #endif // !defined(ROCKSDB_LITE) && defined(OS_LINUX) diff --git a/fs/io_zenfs.cc b/fs/io_zenfs.cc index d861662..96f49f7 100644 --- a/fs/io_zenfs.cc +++ b/fs/io_zenfs.cc @@ -18,7 +18,10 @@ #include #include #include - +#include +#include +#include + #include #include #include @@ -29,6 +32,10 @@ namespace ROCKSDB_NAMESPACE { +extern uint64_t write_size_calc; +extern uint64_t write_size_calc_no_reset; + + ZoneExtent::ZoneExtent(uint64_t start, uint64_t length, Zone* zone) : start_(start), length_(length), zone_(zone) {} @@ -261,11 +268,24 @@ void ZoneFile::ClearExtents() { } extents_.clear(); } - +void print_stacktrace() +{ + int size = 16; + void * array[16]; + int stack_num = backtrace(array, size); + char ** stacktrace = backtrace_symbols(array, stack_num); + for (int i = 0; i < stack_num; ++i) + { + printf("%s\n", stacktrace[i]); + } + free(stacktrace); +} IOStatus ZoneFile::CloseActiveZone() { IOStatus s = IOStatus::OK(); if (active_zone_) { bool full = active_zone_->IsFull(); + // print_stacktrace(); + printf("close_active_zone_id=%ld capacity=%ld\n", active_zone_->id, active_zone_->capacity_); s = active_zone_->Close(); ReleaseActiveZone(); if (!s.ok()) { @@ -305,6 +325,8 @@ IOStatus ZoneFile::CloseWR() { s = PersistMetadata(); if (!s.ok()) return s; ReleaseWRLock(); + if(active_zone_ != nullptr) + printf("CloseWR zone_id=%ld predict_list_size=%ld extents_size=%ld\n", active_zone_->id, 
active_zone_->prediction_lifetime_list.size(), extents_.size()); return CloseActiveZone(); } @@ -434,15 +456,29 @@ void ZoneFile::PushExtent() { extent_filepos_ = file_size_; } + +uint64_t max_global_clock = 0; IOStatus ZoneFile::AllocateNewZone() { Zone* zone; - IOStatus s = zbd_->AllocateIOZone(lifetime_, io_type_, &zone); + IOStatus s; + if(new_lifetime == 0) { + new_lifetime = max_global_clock; + } else { + max_global_clock = std::max(max_global_clock, new_lifetime); + } + printf("Begin Allocate file_id=%ld HINT=%d new_lifetime=%ld new_type=%d\n", file_id_, lifetime_, new_lifetime, new_type); + s = zbd_->AllocateIOZone(lifetime_, io_type_, &zone, new_lifetime, new_type, overlap_zone_list, level); //my_allocate_alogortihm if (!s.ok()) return s; if (!zone) { return IOStatus::NoSpace("Zone allocation failure\n"); } SetActiveZone(zone); + zone_begin = zone->start_; + zone_id = zone->id; + printf("Allocate Result file_name=%s file_hint=%d io_zone_number=%ld allocate_file_id=%ld zone_id=%ld zone_hint=%d lifetime=%ld min_lifetime=%ld max_lifetime=%ld zone_left=%ld\n", + debug_fname.c_str(), lifetime_, GetZbd()->GetIOZones().size(), GetID(), GetActiveZone()->id, zone->lifetime_, new_lifetime, zone->min_lifetime, zone->max_lifetime, zone->GetCapacityLeft()); + extent_start_ = active_zone_->wp_; extent_filepos_ = file_size_; @@ -490,6 +526,7 @@ IOStatus ZoneFile::BufferedAppend(char* buffer, uint32_t data_size) { left -= extent_length; if (active_zone_->capacity_ == 0) { + printf("BufferAppend::active_zone_full zone_id=%ld predict_list_size=%ld extents_size=%ld\n", active_zone_->id, active_zone_->prediction_lifetime_list.size(), extents_.size()); s = CloseActiveZone(); if (!s.ok()) { return s; @@ -548,6 +585,7 @@ IOStatus ZoneFile::SparseAppend(char* sparse_buffer, uint32_t data_size) { left -= extent_length; if (active_zone_->capacity_ == 0) { + printf("SparseAppend::active_zone_full zone_id=%ld predict_list_size=%ld extents_size=%ld\n", active_zone_->id, 
active_zone_->prediction_lifetime_list.size(), extents_.size()); s = CloseActiveZone(); if (!s.ok()) { return s; @@ -576,9 +614,11 @@ IOStatus ZoneFile::Append(void* data, int data_size) { } while (left) { - if (active_zone_->capacity_ == 0) { + printf("Before::Append active_zone_id=%ld zone_capactiy=%ld left=%d file_id=%ld file_size=%ld\n", active_zone_->id, active_zone_->capacity_, left, file_id_, file_size_); + if (active_zone_->capacity_ == 0) { //这个地方很重要,既然capacity = 0,肯定就要allocate new zone + PushExtent(); - + printf("Append::active_zone_full_zone_id=%ld predict_list_size=%ld extents_size=%ld\n", active_zone_->id, active_zone_->prediction_lifetime_list.size(), extents_.size()); s = CloseActiveZone(); if (!s.ok()) { return s; @@ -587,7 +627,6 @@ IOStatus ZoneFile::Append(void* data, int data_size) { s = AllocateNewZone(); if (!s.ok()) return s; } - wr_size = left; if (wr_size > active_zone_->capacity_) wr_size = active_zone_->capacity_; @@ -596,6 +635,7 @@ IOStatus ZoneFile::Append(void* data, int data_size) { file_size_ += wr_size; left -= wr_size; + printf("After::Append active_zone_id=%ld zone_capacity=%ld left=%d file_id=%ld file_size=%ld\n", active_zone_->id, active_zone_->capacity_, left, file_id_, file_size_); offset += wr_size; } @@ -699,8 +739,8 @@ IOStatus ZoneFile::Recover() { } void ZoneFile::ReplaceExtentList(std::vector new_list) { - assert(!IsOpenForWR() && new_list.size() > 0); - assert(new_list.size() == extents_.size()); + + assert(IsOpenForWR() && new_list.size() > 0); WriteLock lck(this); extents_ = new_list; @@ -739,7 +779,7 @@ IOStatus ZoneFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint lifetime) { void ZoneFile::ReleaseActiveZone() { assert(active_zone_ != nullptr); - bool ok = active_zone_->Release(); + bool ok = active_zone_->Release(); //release只修改了一个bool变量的值 assert(ok); (void)ok; active_zone_ = nullptr; @@ -749,8 +789,11 @@ void ZoneFile::SetActiveZone(Zone* zone) { assert(active_zone_ == nullptr); assert(zone->IsBusy()); 
active_zone_ = zone; + zone->files_id.emplace_back(GetID()); } +//ZonedWritableFile是返回给Rocksdb的对象 +//其和ZoneFile一一对应 ZonedWritableFile::ZonedWritableFile(ZonedBlockDevice* zbd, bool _buffered, std::shared_ptr zoneFile) { assert(zoneFile->IsOpenForWR()); @@ -1037,6 +1080,7 @@ IOStatus ZonedRandomAccessFile::Read(uint64_t offset, size_t n, return zoneFile_->PositionedRead(offset, n, result, scratch, direct_); } +//逻辑就是先Read出来,然后target_zone.append IOStatus ZoneFile::MigrateData(uint64_t offset, uint32_t length, Zone* target_zone) { uint32_t step = 128 << 10; diff --git a/fs/io_zenfs.h b/fs/io_zenfs.h index 00050fa..b197628 100644 --- a/fs/io_zenfs.h +++ b/fs/io_zenfs.h @@ -49,6 +49,15 @@ class MetadataWriter { }; class ZoneFile { + public: + uint64_t new_lifetime = 0; + int new_type; + int level; + + uint64_t zone_begin; + uint64_t zone_id; + std::string debug_fname; + std::vector overlap_zone_list; private: const uint64_t NO_EXTENT = 0xffffffffffffffff; @@ -94,11 +103,16 @@ class ZoneFile { IOStatus CloseWR(); bool IsOpenForWR(); + + Zone *GetActiveZone() { + return active_zone_; + } IOStatus PersistMetadata(); IOStatus Append(void* buffer, int data_size); IOStatus BufferedAppend(char* data, uint32_t size); IOStatus SparseAppend(char* data, uint32_t size); + IOStatus SetWriteLifeTimeHint(Env::WriteLifeTimeHint lifetime); void SetIOType(IOType io_type); std::string GetFilename(); diff --git a/fs/metrics.h b/fs/metrics.h index c9b6b03..560682c 100644 --- a/fs/metrics.h +++ b/fs/metrics.h @@ -165,6 +165,9 @@ struct ZenFSMetricsLatencyGuard { virtual ~ZenFSMetricsLatencyGuard() { uint64_t end_time_micro_ = GetTime(); + if(end_time_micro_ < begin_time_micro_) { + printf("end_time_micro_=%ld < begin_time_micro_=%ld\n", end_time_micro_, begin_time_micro_); + } assert(end_time_micro_ >= begin_time_micro_); metrics_->ReportLatency(label_, Report(end_time_micro_ - begin_time_micro_)); diff --git a/fs/snapshot.h b/fs/snapshot.h index 91dcaac..e1907f6 100644 --- a/fs/snapshot.h 
+++ b/fs/snapshot.h @@ -50,14 +50,30 @@ class ZoneSnapshot { uint64_t capacity; uint64_t used_capacity; uint64_t max_capacity; - + uint64_t id; + uint64_t min_lifetime; + uint64_t max_lifetime; + std::vector lifetime_list; + std::vector prediction_lifetime_list; + int lifetime_; + std::map hint_num; + int lifetime_type; public: ZoneSnapshot(const Zone& zone) : start(zone.start_), wp(zone.wp_), capacity(zone.capacity_), used_capacity(zone.used_capacity_), - max_capacity(zone.max_capacity_) {} + max_capacity(zone.max_capacity_), + id(zone.id), + min_lifetime(zone.min_lifetime), + max_lifetime(zone.max_lifetime), + lifetime_list(zone.lifetime_list), + prediction_lifetime_list(zone.prediction_lifetime_list), + lifetime_(zone.lifetime_), + hint_num(zone.hint_num), + lifetime_type(zone.lifetime_type) + {} }; class ZoneExtentSnapshot { diff --git a/fs/zbd_zenfs.cc b/fs/zbd_zenfs.cc index df29341..880c619 100644 --- a/fs/zbd_zenfs.cc +++ b/fs/zbd_zenfs.cc @@ -27,6 +27,7 @@ #include #include +#include "fs_zenfs.h" #include "rocksdb/env.h" #include "rocksdb/io_status.h" #include "snapshot.h" @@ -48,6 +49,44 @@ namespace ROCKSDB_NAMESPACE { +static int cnt[111]; +static int cnt_zone_hint[111]; +/* +flag2: SHORT_THE +flag: + +0: in [L, R] +1: x < [L, R] +2: [L, R] < x +3: type = 0 -> type 0 or type = 0 -> type 1 +4: open new zone when get active zone == true +5: open new zone and finish a zone + +*/ +void add_allocation(int flag2, int flag, uint64_t lifetime, int new_type, Zone *zone) { + cnt[flag]++; + printf("allocation_type:flag2=%d flag=%d lifetime=%ld new_type=%d ", flag2, flag, lifetime, new_type); + if(zone != nullptr) + printf("zone_id=%ld zone_l=%ld zone_r=%ld zone_type=%d ", zone->id, zone->min_lifetime, zone->max_lifetime, zone->lifetime_type); + for(int i = 0; i < 7; i++) printf("type%d=%d ", i, cnt[i]); + printf("\n"); +} +void add_allocation_off(int flag, Env::WriteLifeTimeHint lifetime, Zone *zone) { + cnt[flag]++; + printf("allocation_type=%d lifetime=%d ", 
flag, lifetime); + if(zone != nullptr) + printf("zone_id=%ld zone_l=%ld zone_r=%ld ", zone->id, zone->min_lifetime, zone->max_lifetime); + for(int i = 0; i < 4; i++) printf("type%d=%d ", i, cnt[i]); + printf("\n"); + if(MYMODE == false && flag == 3) { + cnt_zone_hint[zone->lifetime_]++; + for(int i = 0; i < 6; i++) { + printf("zone_hint%d=%d\n", i, cnt_zone_hint[i]); + } + } +} + + Zone::Zone(ZonedBlockDevice *zbd, ZonedBlockDeviceBackend *zbd_be, std::unique_ptr &zones, unsigned int idx) : zbd_(zbd), @@ -59,6 +98,7 @@ Zone::Zone(ZonedBlockDevice *zbd, ZonedBlockDeviceBackend *zbd_be, lifetime_ = Env::WLTH_NOT_SET; used_capacity_ = 0; capacity_ = 0; + id = idx; if (zbd_be->ZoneIsWritable(zones, idx)) capacity_ = max_capacity_ - (wp_ - start_); } @@ -80,13 +120,17 @@ void Zone::EncodeJson(std::ostream &json_stream) { json_stream << "}"; } -IOStatus Zone::Reset() { +IOStatus Zone::Reset() { + + printf("Zone Reset zone_id=%ld capacity=%ld\n", id, capacity_); bool offline; uint64_t max_capacity; assert(!IsUsed()); assert(IsBusy()); - + if(CALC_RESET) { + write_size_calc += capacity_; + } IOStatus ios = zbd_be_->Reset(start_, &offline, &max_capacity); if (ios != IOStatus::OK()) return ios; @@ -95,6 +139,7 @@ IOStatus Zone::Reset() { else max_capacity_ = capacity_ = max_capacity; + files_id.clear(); wp_ = start_; lifetime_ = Env::WLTH_NOT_SET; @@ -106,7 +151,9 @@ IOStatus Zone::Finish() { IOStatus ios = zbd_be_->Finish(start_); if (ios != IOStatus::OK()) return ios; - + if(CALC_RESET) { + write_size_calc += capacity_; + } capacity_ = 0; wp_ = start_ + zbd_->GetZoneSize(); @@ -124,6 +171,7 @@ IOStatus Zone::Close() { return IOStatus::OK(); } +// 一个问题是Zone的append只传入了data,那么如何区分这个data属于哪个ZoneFile呢? 
IOStatus Zone::Append(char *data, uint32_t size) { ZenFSMetricsLatencyGuard guard(zbd_->GetMetrics(), ZENFS_ZONE_WRITE_LATENCY, Env::Default()); @@ -272,9 +320,11 @@ IOStatus ZonedBlockDevice::Open(bool readonly, bool exclusive) { } uint64_t ZonedBlockDevice::GetFreeSpace() { + // printf("io_zones_number=%ld\n", io_zones.size()); uint64_t free = 0; for (const auto z : io_zones) { free += z->capacity_; + // printf("z->capacity=%ld\n", z->capacity_); } return free; } @@ -290,7 +340,10 @@ uint64_t ZonedBlockDevice::GetUsedSpace() { uint64_t ZonedBlockDevice::GetReclaimableSpace() { uint64_t reclaimable = 0; for (const auto z : io_zones) { - if (z->IsFull()) reclaimable += (z->max_capacity_ - z->used_capacity_); + if (z->IsFull()) { + reclaimable += (z->max_capacity_ - z->used_capacity_); + // printf("GetReclaimableSpace id=%ld max_cap=%ld used=%ld reclaimable=%ld\n", z->id, z->max_capacity_, z->used_capacity_.load(), reclaimable); + } } return reclaimable; } @@ -334,6 +387,7 @@ void ZonedBlockDevice::LogZoneUsage() { } } +// GC过程 void ZonedBlockDevice::LogGarbageInfo() { // Log zone garbage stats vector. // @@ -345,7 +399,7 @@ void ZonedBlockDevice::LogGarbageInfo() { // We don't need to lock io_zones since we only read data and we don't need // the result to be precise. 
int zone_gc_stat[12] = {0}; - for (auto z : io_zones) { + for (auto z : io_zones) { // 枚举所有的IO zone if (!z->Acquire()) { continue; } @@ -357,8 +411,15 @@ void ZonedBlockDevice::LogGarbageInfo() { } double garbage_rate = - double(z->wp_ - z->start_ - z->used_capacity_) / z->max_capacity_; - assert(garbage_rate > 0); + double(z->wp_ - z->start_ - z->used_capacity_) / + z->max_capacity_; // 这个rate越高,表示未使用的空间越多 + if (garbage_rate < 0) { + printf( + "ERROR:garbage_rate<=0 zone_id=%ld wp=%ld start=%ld " + "used_capacity=%ld max_capacity=%ld\n", + z->id, z->wp_, z->start_, z->used_capacity_.load(), z->max_capacity_); + } + assert(garbage_rate >= 0); int idx = int((garbage_rate + 0.1) * 10); zone_gc_stat[idx]++; @@ -401,7 +462,7 @@ unsigned int GetLifeTimeDiff(Env::WriteLifeTimeHint zone_lifetime, } if (zone_lifetime > file_lifetime) return zone_lifetime - file_lifetime; - if (zone_lifetime == file_lifetime) return LIFETIME_DIFF_COULD_BE_WORSE; + if (zone_lifetime == file_lifetime) return MODIFY_OFF ? 
0 : LIFETIME_DIFF_COULD_BE_WORSE; return LIFETIME_DIFF_NOT_GOOD; } @@ -433,12 +494,45 @@ IOStatus ZonedBlockDevice::AllocateMetaZone(Zone **out_meta_zone) { return IOStatus::NoSpace("Out of metadata zones"); } +// 注意这个api IOStatus ZonedBlockDevice::ResetUnusedIOZones() { + if (DISABLE_RESET == true) return IOStatus::OK(); + for (const auto z : io_zones) { + if (z->Acquire()) { + if (!z->IsEmpty() && !z->IsUsed()) { // used = 0 + printf("Reset zone_id=%ld capacity=%ld used_capacity=%ld HINT=%d level=%d type=%d L=%ld R=%ld\n", + z->id, z->capacity_, z->used_capacity_.load(), z->lifetime_, z->level, z->lifetime_type, z->min_lifetime, z->max_lifetime); + bool full = z->IsFull(); + IOStatus reset_status = z->Reset(); + reset_zone_num++; + z->prediction_lifetime_list.clear(); + z->lifetime_list.clear(); + z->hint_num.clear(); + IOStatus release_status = z->CheckRelease(); + if (!reset_status.ok()) return reset_status; + if (!release_status.ok()) return release_status; + if (!full) PutActiveIOZoneToken(); + } else { + IOStatus release_status = z->CheckRelease(); + if (!release_status.ok()) return release_status; + } + } + } + return IOStatus::OK(); +} + +IOStatus ZonedBlockDevice::MyResetUnusedIOZones() { for (const auto z : io_zones) { if (z->Acquire()) { - if (!z->IsEmpty() && !z->IsUsed()) { + if (!z->IsEmpty() && !z->IsUsed()) { // zone is empty + printf("Reset zone_id=%ld capacity=%ld used_capacity=%ld HINT=%d level=%d type=%d L=%ld R=%ld\n", + z->id, z->capacity_, z->used_capacity_.load(), z->lifetime_, z->level, z->lifetime_type, z->min_lifetime, z->max_lifetime); bool full = z->IsFull(); IOStatus reset_status = z->Reset(); + reset_zone_num++; + z->prediction_lifetime_list.clear(); + z->lifetime_list.clear(); + z->hint_num.clear(); IOStatus release_status = z->CheckRelease(); if (!reset_status.ok()) return reset_status; if (!release_status.ok()) return release_status; @@ -452,6 +546,35 @@ IOStatus ZonedBlockDevice::ResetUnusedIOZones() { return IOStatus::OK(); } 
+IOStatus ZonedBlockDevice::ResetTartetUnusedIOZones(uint64_t id) { + for (const auto z : io_zones) { + if (z->Acquire()) { + if (!z->IsEmpty() && !z->IsUsed() && z->id == id) { + printf("Reset zone_id=%ld padding=%ld start=%ld max_capacity=%ld wp=%ld is_empty()=%d capacity=%ld used_capacity=%ld HINT=%d L=%ld R=%ld\n", + z->id, z->start_ + z->max_capacity_ - z-> wp_, z->start_, z->max_capacity_, z-> wp_, z->IsEmpty(), z->capacity_, z->used_capacity_.load(), z->lifetime_, z->min_lifetime, z->max_lifetime); + bool full = z->IsFull(); + IOStatus reset_status = z->Reset(); + z->prediction_lifetime_list.clear(); + z->lifetime_list.clear(); + z->hint_num.clear(); + reset_zone_num++; + IOStatus release_status = z->CheckRelease(); + if (!reset_status.ok()) return reset_status; + if (!release_status.ok()) return release_status; + if (!full) PutActiveIOZoneToken(); + } else { + IOStatus release_status = z->CheckRelease(); + if(z->id == id) { + printf("ResetTargetIOZoneFail id=%ld IsEmpty()=%d IsUsed=%ld\n", id, z->IsEmpty(), z->used_capacity_.load()); + } + if (!release_status.ok()) return release_status; + } + } + } + + return IOStatus::OK(); +} + void ZonedBlockDevice::WaitForOpenIOZoneToken(bool prioritized) { long allocator_open_limit; @@ -568,7 +691,7 @@ IOStatus ZonedBlockDevice::FinishCheapestIOZone() { Info(logger_, "All non-busy zones are empty or full, skip."); return IOStatus::OK(); } - + printf("finish_victim zone_id=%ld zone_capacity=%ld used=%ld\n", finish_victim->id, finish_victim->capacity_, finish_victim->used_capacity_.load()); s = finish_victim->Finish(); IOStatus release_status = finish_victim->CheckRelease(); @@ -590,21 +713,27 @@ IOStatus ZonedBlockDevice::GetBestOpenZoneMatch( Zone *allocated_zone = nullptr; IOStatus s; - for (const auto z : io_zones) { - if (z->Acquire()) { + for (const auto z : io_zones) { // 枚举所有的IO zone + if (z->Acquire()) { // 获得锁 if ((z->used_capacity_ > 0) && !z->IsFull() && - z->capacity_ >= min_capacity) { + z->capacity_ >= 
min_capacity) { // 如果说zone仍然有剩余空间 unsigned int diff = GetLifeTimeDiff(z->lifetime_, file_lifetime); - if (diff <= best_diff) { - if (allocated_zone != nullptr) { - s = allocated_zone->CheckRelease(); + printf( + "GetBestOpenZoneMatch Off zone_id=%ld " + "min_lifetime=%ld max_lifetime=%ld zone_hint=%d zone_type=%d global_clock=%d\n", + z->id, z->min_lifetime, z->max_lifetime, z->lifetime_, z->lifetime_type, + global_clock); + if (diff <= best_diff) { // 如果要比best_diff小 + if (allocated_zone != + nullptr) { // 如果之前已经有过allocate_zone allocated_zone + s = allocated_zone->CheckRelease(); // check s是否能释放 if (!s.ok()) { IOStatus s_ = z->CheckRelease(); if (!s_.ok()) return s_; return s; } } - allocated_zone = z; + allocated_zone = z; // 更新allocated_zone best_diff = diff; } else { s = z->CheckRelease(); @@ -623,6 +752,117 @@ IOStatus ZonedBlockDevice::GetBestOpenZoneMatch( return IOStatus::OK(); } +int global_clock = 0; +// Allocate只需要传入一个file_lifetime即可 +IOStatus ZonedBlockDevice::GetBestOpenZoneMatch( + uint64_t new_lifetime_, int new_type, Env::WriteLifeTimeHint file_lifetime, + unsigned int *best_diff_out, Zone **zone_out, int flag, int flag2, std::vector overlap_zone_list, + uint32_t min_capacity) { + unsigned int best_diff = LIFETIME_DIFF_NOT_GOOD; + Zone *allocated_zone = nullptr; + IOStatus s; + uint64_t mx = INF; // dis最小的zone + if(new_lifetime_ == 0) { + new_type = ((SHORT_THE == -1) ? 
1 : 0); + } + for (const auto z : io_zones) { + if(z->wp_ != z->start_) { + printf("zone test zone_id=%ld zone_cap=%ld valid=%ld type=%d min_lifetime=%ld max_lifetime=%ld is_busy=%d\n", z->id, z->capacity_ / MB, z->used_capacity_.load() / MB, z->lifetime_type, z->min_lifetime, z->max_lifetime, z->IsBusy()); + } + + if (z->Acquire()) { + if ((z->used_capacity_ > 0) && !z->IsFull() && + z->capacity_ >= min_capacity) { + //new_type: file type + //lifetime_type: zone type; + if( (new_type == 0 && z->lifetime_type == 0 && flag2 == 0) + || + ( + ((new_type == 0 && z->lifetime_type == 1 && flag2 == 1) || (new_type == 1 && z->lifetime_type == 1 && flag2 == 1)) + && + ( + (flag == 0 && (new_lifetime_ >= z->min_lifetime) && (new_lifetime_ <= z->max_lifetime)) || + (flag == 1 && (new_lifetime_ < z->min_lifetime) && (z->min_lifetime - new_lifetime_ < mx)) || + (flag == 2 && (new_lifetime_ > z->max_lifetime) && (new_lifetime_ - z->max_lifetime < mx) && (new_lifetime_ - z->max_lifetime <= MAX_DIFFTIME)) + ) + ) + ) + { + printf( + "GetBestOpenZoneMatch Normal zone_id=%ld cap=%ld new_lifetime_=%ld new_type=%d " + "min_lifetime=%ld max_lifetime=%ld zone_type=%d global_clock=%d flag=%d flag2=%d overlap_list.size()=%ld\n", + z->id, z->capacity_ / MB, new_lifetime_, new_type, z->min_lifetime, z->max_lifetime, z->lifetime_type, + global_clock, flag, flag2, overlap_zone_list.size()); + if(flag == 1 && (new_lifetime_ < z->min_lifetime)) + mx = z->min_lifetime - new_lifetime_; + if(flag == 2 && (new_lifetime_ > z->max_lifetime)) + mx = new_lifetime_ - z->max_lifetime; + if (allocated_zone != nullptr) { // flag == 1 need to find the maximal max_lifetime + s = allocated_zone->CheckRelease(); + if (!s.ok()) { + printf("InRangeButCheckRelease Fail\n"); + IOStatus s_ = z->CheckRelease(); + if (!s_.ok()) return s_; + return s; + } + } + allocated_zone = z; + best_diff = 0; // 把best_diff赋值为一个较小的值 + // if(flag == 0) break; //flag == 0 need to find the first valid zone + } else { + s = 
z->CheckRelease(); + if (!s.ok()) return s; + } + + } else { + s = z->CheckRelease(); + if (!s.ok()) return s; + } + } + } + *best_diff_out = best_diff; + *zone_out = allocated_zone; + // if(overlap_zone_list.size() != 0 && ENABLE_CAZA) { + // for (const auto z : io_zones) { + // if(find(overlap_zone_list.begin(), overlap_zone_list.end(), z->id) == overlap_zone_list.end()) continue; + // if (z->Acquire()) { + // if ((z->used_capacity_ > 0) && !z->IsFull() && + // z->capacity_ >= min_capacity) { + // printf( + // "GetBestOpenZoneMatch Overlap zone_id=%ld new_lifetime_=%ld file_hint=%d new_type=%d " + // "min_lifetime=%ld max_lifetime=%ld zone_type=%d global_clock=%d flag=%d flag2=%d overlap_list.size()=%ld\n", + // z->id, new_lifetime_, file_lifetime, new_type, z->min_lifetime, z->max_lifetime, z->lifetime_type, + // global_clock, flag, flag2, overlap_zone_list.size()); + // if (allocated_zone != nullptr) { // flag == 1 need to find the maximal max_lifetime + // s = allocated_zone->CheckRelease(); + // if (!s.ok()) { + // printf("InRangeButCheckRelease Fail\n"); + // IOStatus s_ = z->CheckRelease(); + // if (!s_.ok()) return s_; + // return s; + // } + // } + // allocated_zone = z; + // best_diff = 0; // 把best_diff赋值为一个较小的值 + // } else { + // s = z->CheckRelease(); + // if (!s.ok()) return s; + // } + // } + // } + // } + // else { + + //} + + return IOStatus::OK(); +} + + +extern int allocated_zone_num; + + +// 当Allocate IO zone失效的时候调用此函数来申请一个新的zone IOStatus ZonedBlockDevice::AllocateEmptyZone(Zone **zone_out) { IOStatus s; Zone *allocated_zone = nullptr; @@ -638,6 +878,12 @@ IOStatus ZonedBlockDevice::AllocateEmptyZone(Zone **zone_out) { } } *zone_out = allocated_zone; + if(allocated_zone != nullptr) { + allocated_zone_num++; + } + new_log_writer(*zone_out); + printf("io_zones number = %ld and zone_out = %d\n", io_zones.size(), + zone_out == nullptr ? 
0 : 1); return IOStatus::OK(); } @@ -680,26 +926,81 @@ IOStatus ZonedBlockDevice::ReleaseMigrateZone(Zone *zone) { IOStatus ZonedBlockDevice::TakeMigrateZone(Zone **out_zone, Env::WriteLifeTimeHint file_lifetime, - uint32_t min_capacity) { + uint32_t min_capacity, uint64_t new_lifetime, int new_type) { std::unique_lock lock(migrate_zone_mtx_); migrate_resource_.wait(lock, [this] { return !migrating_; }); migrating_ = true; unsigned int best_diff = LIFETIME_DIFF_NOT_GOOD; - auto s = - GetBestOpenZoneMatch(file_lifetime, &best_diff, out_zone, min_capacity); + Zone *allocated_zone = nullptr; + IOStatus s; + if (MYMODE == true) { + //level segragation + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 0, 0, std::vector{}, min_capacity); + if(allocated_zone == nullptr) //L <= x <= R + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 0, 1, std::vector{}, min_capacity); + if (allocated_zone == nullptr) //x < L + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 1, 1, std::vector{}, min_capacity); + if (allocated_zone == nullptr) //R < x + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 2, 1, std::vector{}, min_capacity); + *out_zone = allocated_zone; + + } else if(MYMODE == false) { + s = GetBestOpenZoneMatch(file_lifetime, &best_diff, out_zone, min_capacity); + } if (s.ok() && (*out_zone) != nullptr) { + printf("GC Migrate Begin new_lifetime=%ld new_type=%d zone_id=%ld zone_type=%d min_lifetime=%ld max_lifetime=%ld\n", new_lifetime, new_type, (*out_zone)->id, (*out_zone)->lifetime_type, (*out_zone)->min_lifetime, (*out_zone)->max_lifetime); Info(logger_, "TakeMigrateZone: %lu", (*out_zone)->start_); } else { migrating_ = false; - } + printf("ERROR GC fail new_lifetime=%ld new_type=%d min_capactiy=%d\n", new_lifetime, new_type, min_capacity); + } return s; } +void 
ZonedBlockDevice::OpenNewZone(Zone **tmp_zone, Env::WriteLifeTimeHint file_lifetime, uint64_t new_lifetime, int new_type, int level) { + const long long MAX = 1e9; + assert(allocated_zone->IsBusy()); + Zone *allocated_zone = *tmp_zone; + allocated_zone->lifetime_ = file_lifetime; + if (new_lifetime > MAX) new_lifetime = 0; + //allocated_zone->min_lifetime = std::max(static_cast(0), new_lifetime - T); + if(new_lifetime == 0) new_type = ((SHORT_THE == -1) ? 1 : 0); + allocated_zone->lifetime_type = new_type; + allocated_zone->level = level; + //if(new_lifetime < T) + + if(ENABLE_T_SLICE) { + allocated_zone->min_lifetime = new_lifetime / T * T; + allocated_zone->max_lifetime = (new_lifetime / T + 1) * T - 1; + } else { + if(ENABLE_T_RANGE) { + allocated_zone->min_lifetime = (new_lifetime < T ? 0: new_lifetime - T); + } else { + allocated_zone->min_lifetime = new_lifetime; + } + int base = T; + for(int i = 1; i <= level - 3; i++) base = base * MULTI; + allocated_zone->max_lifetime = new_lifetime + base; + } + + + printf("OpenNewZone zone_id=%ld l=%ld r=%ld HINT=%d new_type=%d \n", allocated_zone->id, allocated_zone->min_lifetime, allocated_zone->max_lifetime, file_lifetime, new_type); + + allocated_zone->hint_num[file_lifetime]++; +} IOStatus ZonedBlockDevice::AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, - IOType io_type, Zone **out_zone) { + IOType io_type, Zone **out_zone, + uint64_t new_lifetime, int new_type, std::vector overlap_zone_list, int level) { + + printf("AllocateIOZone::Before t_id=%d deletion_timeitZ=%ld new_type=%d active=%ld max_open_zone=%d\n", gettid(),new_lifetime, new_type, active_io_zones_.load(), max_nr_active_io_zones_); + Zone *allocated_zone = nullptr; unsigned int best_diff = LIFETIME_DIFF_NOT_GOOD; int new_zone = 0; @@ -734,15 +1035,77 @@ IOStatus ZonedBlockDevice::AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, WaitForOpenIOZoneToken(io_type == IOType::kWAL); /* Try to fill an already open zone(with the best life time diff) */ 
- s = GetBestOpenZoneMatch(file_lifetime, &best_diff, &allocated_zone); + if (MYMODE == true) { + //level segragation + if(new_type == 0) { + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 0, 0, std::vector{}); + } + + if(allocated_zone == nullptr) { + //L <= x <= R + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 0, 1, std::vector{}); + + if (allocated_zone == nullptr) { // try again, find the + + if((max_nr_active_io_zones_ - active_io_zones_.load() >= 3) && GetActiveIOZoneTokenIfAvailable()) { + printf("GetBestOpenZone when open active=%ld max_open_zone=%d\n", active_io_zones_.load(), max_nr_active_io_zones_); + s = AllocateEmptyZone(&allocated_zone); + new_zone = true; + if (!s.ok()) { + PutActiveIOZoneToken(); + PutOpenIOZoneToken(); + return s; + } + if (allocated_zone != nullptr) { + OpenNewZone(&allocated_zone, file_lifetime, new_lifetime, new_type, level); + add_allocation(1, 4, new_lifetime, new_type,allocated_zone); + } else { + PutActiveIOZoneToken(); + } + } else { + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 1, 1, std::vector{}); + + if (allocated_zone == nullptr) { // try again, find the + s = GetBestOpenZoneMatch(new_lifetime, new_type, file_lifetime, &best_diff, + &allocated_zone, 2, 1, std::vector{}); + if(allocated_zone != nullptr) { + add_allocation(1, 2, new_lifetime, new_type, allocated_zone); + } + } else { + add_allocation(1, 1, new_lifetime, new_type, allocated_zone); + } + } + } else { + add_allocation(1, 0, new_lifetime, new_type, allocated_zone); + } + } else if(allocated_zone != nullptr) { + add_allocation(0, 3, new_lifetime, new_type,allocated_zone); + } + + + + if(allocated_zone != nullptr) { + best_diff = 0; + } + + } else if(MYMODE == false) { + s = GetBestOpenZoneMatch(file_lifetime, &best_diff, &allocated_zone, 0); + if(allocated_zone != nullptr) { + add_allocation_off(0, 
file_lifetime, allocated_zone); + allocated_zone->hint_num[file_lifetime]++; + } + } + if (!s.ok()) { PutOpenIOZoneToken(); return s; } - // Holding allocated_zone if != nullptr - if (best_diff >= LIFETIME_DIFF_COULD_BE_WORSE) { + bool got_token = GetActiveIOZoneTokenIfAvailable(); /* If we did not get a token, try to use the best match, even if the life @@ -768,7 +1131,9 @@ IOStatus ZonedBlockDevice::AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, /* If we haven't found an open zone to fill, open a new zone */ if (allocated_zone == nullptr) { /* We have to make sure we can open an empty zone */ + printf("allocated_zone == nulptr But we don't need it best_diff=%d active=%ld max_open_zone=%d\n", best_diff, active_io_zones_.load(), max_nr_active_io_zones_); while (!got_token && !GetActiveIOZoneTokenIfAvailable()) { + s = FinishCheapestIOZone(); if (!s.ok()) { PutOpenIOZoneToken(); @@ -777,6 +1142,7 @@ IOStatus ZonedBlockDevice::AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, } s = AllocateEmptyZone(&allocated_zone); + new_zone = true; if (!s.ok()) { PutActiveIOZoneToken(); PutOpenIOZoneToken(); @@ -784,9 +1150,9 @@ IOStatus ZonedBlockDevice::AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, } if (allocated_zone != nullptr) { - assert(allocated_zone->IsBusy()); - allocated_zone->lifetime_ = file_lifetime; - new_zone = true; + OpenNewZone(&allocated_zone, file_lifetime, new_lifetime, new_type, level); + add_allocation(1, 5, new_lifetime, new_type,allocated_zone); + add_allocation_off(5, file_lifetime, allocated_zone); } else { PutActiveIOZoneToken(); } @@ -808,6 +1174,8 @@ IOStatus ZonedBlockDevice::AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, } *out_zone = allocated_zone; + if (allocated_zone != nullptr) + allocated_zone->prediction_lifetime_list.emplace_back(new_lifetime); metrics_->ReportGeneral(ZENFS_OPEN_ZONES_COUNT, open_io_zones_); metrics_->ReportGeneral(ZENFS_ACTIVE_ZONES_COUNT, active_io_zones_); @@ -863,6 +1231,9 @@ void 
ZonedBlockDevice::SetZoneDeferredStatus(IOStatus status) { void ZonedBlockDevice::GetZoneSnapshot(std::vector &snapshot) { for (auto *zone : io_zones) { snapshot.emplace_back(*zone); + // printf("zone_information zone_id=%ld zone_capacity=%ld + // zone_max_capacity=%ld zone_used_capacity=%ld\n", zone->id, + // zone->capacity_, zone->max_capacity_, zone->used_capacity_.load()); } } diff --git a/fs/zbd_zenfs.h b/fs/zbd_zenfs.h index cdc3c7d..e39901a 100644 --- a/fs/zbd_zenfs.h +++ b/fs/zbd_zenfs.h @@ -9,6 +9,7 @@ #include #if !defined(ROCKSDB_LITE) && defined(OS_LINUX) +#include #include #include #include @@ -30,6 +31,47 @@ #include "rocksdb/io_status.h" namespace ROCKSDB_NAMESPACE { +const int INF = 1e9; + +const bool MYMODE = true; //true: Prophet false: LIZA +const int ZoneNumber = 100; + +//full compensation: 1 0 0 +//full gc: 0 1 0 +//gc with compensation: 1 1 3 +const int ENABLE_PRECOMPACTION = 1; +const double GC_THRESHOLD = 1; +const int ENABLE_LIMIT_LEVEL = 3; + +const int T = 100; + +//const uint64_t GC_START_LEVEL = 60; //micro test +//const uint64_t GC_STOP_LEVEL = 75; + +const uint64_t GC_START_LEVEL = 20; //full test +const uint64_t GC_STOP_LEVEL = 45; + +const int SHORT_THE = 2; //SHORT_THRESHOLD of level segragation +const int ENABLE_T_SLICE = 1; //ENABLE rounding +const int ENABLE_SHORT_WITH_TYPE0 = 50; //case2B threshold + +const int MAX_LIFETIME = 1e9; //deprecate +const int MAX_DIFFTIME = INF; //deprecate +const int MULTI = 1;//deprecate +const int ENABLE_CAZA = 0;//deprecate +const int MODIFY_OFF = 0; //deprecate +const int ENABLE_CASE1 = 0; //deprecate +const int ENABLE_CASE2 = 0; //deprecate +const bool DISABLE_RESET = false; //deprecate +const int ENABLE_T_RANGE = 0; //1 means [-T, T] deprecate + + +const int CALC_RESET = 1; //default +const int K = 1; //gc top k default +const int MB = 1024 * 1024; +extern int reset_zone_num; + + class ZonedBlockDevice; class ZonedBlockDeviceBackend; @@ -57,13 +99,21 @@ class Zone { public: explicit 
Zone(ZonedBlockDevice *zbd, ZonedBlockDeviceBackend *zbd_be, std::unique_ptr &zones, unsigned int idx); - + uint64_t id; uint64_t start_; uint64_t capacity_; /* remaining capacity */ uint64_t max_capacity_; uint64_t wp_; Env::WriteLifeTimeHint lifetime_; + uint64_t min_lifetime; + uint64_t max_lifetime; + int lifetime_type; //0 top 1 upper + int level; std::atomic used_capacity_; + std::vector files_id; + std::vector lifetime_list; + std::vector prediction_lifetime_list; + std::map hint_num; IOStatus Reset(); IOStatus Finish(); @@ -92,6 +142,36 @@ class Zone { inline IOStatus CheckRelease(); }; +class LogWriter { /* Holds a FIFO queue of zones; the zone at the front of the queue is the active zone. */ +public: + LogWriter(Zone *zone) : active_zone(zone) { /* seed active_zone so get_current_zone() never reads an uninitialized pointer */ + zone_list.push(zone); + }; + LogWriter() = default; + // std::queue *get_zone_list() { + // return &zone_list; + // }; + // bool try_append(uint64_t lifetime) { + // if(lifetime >= min_lifetime && lifetime <= max_lifetime) { + + // return true; + // } + // return false; + // }; + Zone *get_current_zone() { /* Returns the active zone, advancing to the next queued zone once the active one has no remaining capacity; returns nullptr when no zone is left. */ + if(active_zone == nullptr) return nullptr; /* default-constructed or already exhausted: the queue must not be popped */ + if(active_zone->capacity_ == 0) { + zone_list.pop(); + active_zone = zone_list.empty() ? nullptr : zone_list.front(); + } + return active_zone; + }; +private: + Zone * active_zone = nullptr; /* invariant: nullptr, or equal to zone_list.front() */ + std::queue zone_list; + +}; + class ZonedBlockDeviceBackend { public: uint32_t block_sz_ = 0; @@ -139,9 +219,10 @@ enum class ZbdBackendType { }; class ZonedBlockDevice { - private: + private: + std::vector io_zones; std::unique_ptr zbd_be_; - std::vector io_zones; + std::vector log_writer_list; std::vector meta_zones; time_t start_time_; std::shared_ptr logger_; @@ -154,6 +235,7 @@ class ZonedBlockDevice { /* Protects zone_resuorces_ condition variable, used for notifying changes in open_io_zones_ */ std::mutex zone_resources_mtx_; + std::condition_variable zone_resources_; std::mutex zone_deferred_status_mutex_; IOStatus zone_deferred_status_; @@ -176,13 +258,30 @@ class ZonedBlockDevice { std::shared_ptr metrics = std::make_shared()); virtual ~ZonedBlockDevice(); + void new_log_writer(Zone *zone) { +
log_writer_list.emplace_back(new LogWriter(zone)); /* raw owning pointer; the matching delete is in remove_log_writer() */ + } + std::vector &get_io_zones() { + return io_zones; + } + bool remove_log_writer(LogWriter * log_writer) { /* Erases log_writer from log_writer_list and deletes it; returns false, deleting nothing, when it is not in the list. */ + uint32_t pos = -1; /* wraps to the largest uint32_t value, used below as the not-found sentinel */ + for(uint32_t i = 0; i < log_writer_list.size(); i++) { + if(log_writer_list[i] == log_writer) { + pos = i; /* no break: the last matching index wins */ + } + } + if(pos == static_cast(-1)) return false; + log_writer_list.erase(log_writer_list.begin() + pos); + delete log_writer; + return true; + } IOStatus Open(bool readonly, bool exclusive); Zone *GetIOZone(uint64_t offset); - IOStatus AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, IOType io_type, - Zone **out_zone); + IOStatus AllocateIOZone(Env::WriteLifeTimeHint file_lifetime, IOType io_type, Zone **out_zone, uint64_t new_lifetime, int new_type, std::vector overlap_zone_list, int level); IOStatus AllocateMetaZone(Zone **out_meta_zone); uint64_t GetFreeSpace(); @@ -193,6 +292,8 @@ class ZonedBlockDevice { uint32_t GetBlockSize(); IOStatus ResetUnusedIOZones(); + IOStatus MyResetUnusedIOZones(); + IOStatus ResetTartetUnusedIOZones(uint64_t id); void LogZoneStats(); void LogZoneUsage(); void LogGarbageInfo(); @@ -200,6 +301,7 @@ class ZonedBlockDevice { uint64_t GetZoneSize(); uint32_t GetNrZones(); std::vector GetMetaZones() { return meta_zones; } + std::vector GetIOZones() { return io_zones; } /* returns by value, unlike get_io_zones() which returns a reference */ void SetFinishTreshold(uint32_t threshold) { finish_threshold_ = threshold; } @@ -219,7 +321,7 @@ class ZonedBlockDevice { IOStatus ReleaseMigrateZone(Zone *zone); IOStatus TakeMigrateZone(Zone **out_zone, Env::WriteLifeTimeHint lifetime, - uint32_t min_capacity); + uint32_t min_capacity, uint64_t new_lifetime, int new_type); void AddBytesWritten(uint64_t written) { bytes_written_ += written; }; void AddGCBytesWritten(uint64_t written) { gc_bytes_written_ += written; }; @@ -227,16 +329,19 @@ class ZonedBlockDevice { return bytes_written_.load() - gc_bytes_written_.load(); }; uint64_t GetTotalBytesWritten() { return bytes_written_.load(); }; - private: IOStatus
GetZoneDeferredStatus(); bool GetActiveIOZoneTokenIfAvailable(); void WaitForOpenIOZoneToken(bool prioritized); IOStatus ApplyFinishThreshold(); IOStatus FinishCheapestIOZone(); + void OpenNewZone(Zone **out_zone, Env::WriteLifeTimeHint file_lifetime, uint64_t new_lifetime, int new_type, int level); IOStatus GetBestOpenZoneMatch(Env::WriteLifeTimeHint file_lifetime, unsigned int *best_diff_out, Zone **zone_out, uint32_t min_capacity = 0); + IOStatus GetBestOpenZoneMatch(uint64_t new_lifetime_, int new_type, Env::WriteLifeTimeHint file_lifetime, + unsigned int *best_diff_out, Zone **zone_out, int flag, int flag2, std::vector overlap_list, + uint32_t min_capacity = 0); IOStatus AllocateEmptyZone(Zone **zone_out); }; diff --git a/fs/zbdlib_zenfs.cc b/fs/zbdlib_zenfs.cc index 589c8fc..5fb572f 100644 --- a/fs/zbdlib_zenfs.cc +++ b/fs/zbdlib_zenfs.cc @@ -103,9 +103,10 @@ IOStatus ZbdlibBackend::Open(bool readonly, bool exclusive, block_sz_ = info.pblock_size; zone_sz_ = info.zone_size; - nr_zones_ = info.nr_zones; + nr_zones_ = std::min(static_cast(ZoneNumber), info.nr_zones); /* caps the zone count reported by the device at ZoneNumber */ *max_active_zones = info.max_nr_active_zones; *max_open_zones = info.max_nr_open_zones; + printf("device information block_sz=%d zone_sz=%ld nr_zones=%d mac_active_zones=%d max_open_zones=%d\n", block_sz_, zone_sz_, nr_zones_, *max_active_zones, *max_open_zones); /* NOTE(review): the label mac_active_zones in this message looks like a typo for max_active_zones - confirm before changing the output */ return IOStatus::OK(); } @@ -116,6 +117,7 @@ std::unique_ptr ZbdlibBackend::ListZones() { ret = zbd_list_zones(read_f_, 0, zone_sz_ * nr_zones_, ZBD_RO_ALL, (struct zbd_zone **)&zones, &nr_zones); + nr_zones = std::min(static_cast (ZoneNumber), nr_zones); /* NOTE(review): nr_zones is clamped before ret is checked; confirm zbd_list_zones always writes nr_zones on failure, or move this clamp below the check */ if (ret) { return nullptr; } @@ -125,6 +127,7 @@ std::unique_ptr ZbdlibBackend::ListZones() { return zl; } +// The Reset operation is very important IOStatus ZbdlibBackend::Reset(uint64_t start, bool *offline, uint64_t *max_capacity) { unsigned int report = 1; @@ -172,6 +175,9 @@ int ZbdlibBackend::Write(char *data, uint32_t size, uint64_t
pos) { + write_size_calc += size; /* NOTE(review): counts the requested size before pwrite runs, so failed or short writes are still counted - confirm this is intended */ + write_size_calc_no_reset += size; + return pwrite(write_f_, data, size, pos); } diff --git a/fs/zbdlib_zenfs.h b/fs/zbdlib_zenfs.h index a72d514..a9d6b19 100644 --- a/fs/zbdlib_zenfs.h +++ b/fs/zbdlib_zenfs.h @@ -17,6 +17,9 @@ namespace ROCKSDB_NAMESPACE { +extern uint64_t write_size_calc; /* byte counters incremented in ZbdlibBackend::Write; NOTE(review): plain uint64_t with no synchronization - confirm Write is never called concurrently, or make these std::atomic */ +extern uint64_t write_size_calc_no_reset; + class ZbdlibBackend : public ZonedBlockDeviceBackend { private: std::string filename_; @@ -25,13 +28,13 @@ class ZbdlibBackend : public ZonedBlockDeviceBackend { int write_f_; public: + explicit ZbdlibBackend(std::string bdevname); ~ZbdlibBackend() { zbd_close(read_f_); zbd_close(read_direct_f_); zbd_close(write_f_); } - IOStatus Open(bool readonly, bool exclusive, unsigned int *max_active_zones, unsigned int *max_open_zones); std::unique_ptr ListZones(); diff --git a/fs/zonefs_zenfs.cc b/fs/zonefs_zenfs.cc index c93ab45..463179c 100644 --- a/fs/zonefs_zenfs.cc +++ b/fs/zonefs_zenfs.cc @@ -168,8 +168,9 @@ IOStatus ZoneFsBackend::Open(bool readonly, } // The size of the "seq" directory shows the number of sequential zones - nr_zones_ = zonefs_stat.st_size; - + nr_zones_ = zonefs_stat.st_size; + nr_zones_ = std::min(static_cast(ZoneNumber), nr_zones_); + printf("ZoneFsBackend::Open called nr_zones_=%d\n", nr_zones_); seqdirname += "/0"; int zone_zero_fd = open(seqdirname.c_str(), O_RDONLY); if (zone_zero_fd < 0) { diff --git a/generate-version.sh b/generate-version.sh index 81796c0..cf73871 100755 --- a/generate-version.sh +++ b/generate-version.sh @@ -5,7 +5,7 @@ REPO_ROOT=$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P ) cd $REPO_ROOT # 'git describe --abbrev=7 --dirty' will output a version in that looks like "v0.1.0-12-g3456789-dirty".
-VERSION=$(git describe --abbrev=7 --dirty) +VERSION=$(git describe --abbrev=7 --dirty --always) updateVersionFile () { if [ "${#VERSION}" -gt 63 ]; then diff --git a/util/zenfs.cc b/util/zenfs.cc index 7079b6e..a856a91 100644 --- a/util/zenfs.cc +++ b/util/zenfs.cc @@ -777,7 +777,7 @@ int zenfs_tool_fsinfo() { } // namespace ROCKSDB_NAMESPACE int main(int argc, char **argv) { - gflags::SetUsageMessage( + google::SetUsageMessage( std::string("\nUSAGE:\n") + argv[0] + +" [OPTIONS]...\nCommands: mkfs, list, ls-uuid, " + +"df, backup, restore, dump, fs-info, link, delete, rename, rmdir"); @@ -789,9 +789,9 @@ int main(int argc, char **argv) { return 1; } - gflags::SetVersionString(ZENFS_VERSION); + google::SetVersionString(ZENFS_VERSION); std::string subcmd(argv[1]); - gflags::ParseCommandLineFlags(&argc, &argv, true); + google::ParseCommandLineFlags(&argc, &argv, true); if (FLAGS_zonefs.empty() && FLAGS_zbd.empty() && subcmd != "ls-uuid") { fprintf(