Skip to content

Commit

Permalink
Fix --block-io
Browse files Browse the repository at this point in the history
Due to a bug, when the multi-reader switches from one reader with later
events to one that has earlier events, a block I/O event was discarded
from the reader with the later events. This predictably led to event
loss.

Further, the insert event was switched from block_rq_insert to
block_bio_queue. The reason for this is that block_rq_insert never
matched the amount of data reported by block_rq_issue and
block_rq_complete. I could not find another block_rq_* tracepoint that
accounts for the block I/O request that do not enter through
block_rq_insert. However, block_bio_queue matches the amount of data
that block_rq_issue and block_rq_complete report.

This results in somewhat of a mismatch, as block_bio_* is one level
higher up than block_rq_*. This is most notable in that one
block_bio_queue "struct bio" is split into multiple (usually less than
10) "struct rq" on the block_rq_* level.

I remedy this situation by only writing the first block_rq_insert/issue
event that I encounter (which is the one with the matching sector to the
block_bio_queue event) and discarding the others.

We might want to discuss if using the timestamp of the last
block_rq_* event is the more correct variant.
  • Loading branch information
cvonelm committed Mar 14, 2024
1 parent ccbae16 commit 5c9696f
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 35 deletions.
120 changes: 86 additions & 34 deletions include/lo2s/perf/bio/writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,24 @@ struct __attribute((__packed__)) RecordBlock
char rwbs[8]; // the type of the operation. "R" for read, "W" for write, etc.
};

struct __attribute((__packed__)) RecordBioQueue
{
TracepointSampleType header;
uint16_t common_type; // common fields included in all tracepoints, ignored here
uint8_t common_flag;
uint8_t common_preempt_count;
int32_t common_pid;

uint32_t dev; // the accessed device (as dev_t)
char padding[4]; // padding because of struct alignment
uint64_t sector; // the accessed sector on the device

uint32_t nr_sector; // the number of sector_cache_ written
// 512) for complete: the error code of the operation

char rwbs[8]; // the type of the operation. "R" for read, "W" for write, etc.
};

class Writer
{
public:
Expand All @@ -64,50 +82,71 @@ class Writer

void write(IoReaderIdentity& identity, TracepointSampleType* header)
{
struct RecordBlock* event = (RecordBlock*)header;

otf2::common::io_operation_mode_type mode = otf2::common::io_operation_mode_type::flush;
// TODO: Handle the few io operations that arent either reads or write
if (event->rwbs[0] == 'R')
{
mode = otf2::common::io_operation_mode_type::read;
}
else if (event->rwbs[0] == 'W')
{
mode = otf2::common::io_operation_mode_type::write;
}
else
{
return;
}

// Something seems to be broken about the dev_t dev field returned from the block_rq_*
// tracepoints What works is extracting the major:minor numbers from the dev field with
// bitshifts as documented in block/block_rq_*/format So first convert the dev field into
// major:minor numbers and then use the makedev macro to get an unbroken dev_t.
BlockDevice dev =
BlockDevice::block_device_for(makedev(event->dev >> 20, event->dev & ((1U << 20) - 1)));

otf2::writer::local& writer = trace_.bio_writer(dev);
otf2::definition::io_handle& handle = trace_.block_io_handle(dev);

auto size = event->nr_sector * SECTOR_SIZE;

if (identity.tracepoint == bio_insert_)
if (identity.tracepoint == bio_queue_)
{
struct RecordBioQueue* event = (RecordBioQueue*)header;

otf2::common::io_operation_mode_type mode = otf2::common::io_operation_mode_type::flush;
// TODO: Handle the few io operations that arent either reads or write
if (event->rwbs[0] == 'R')
{
mode = otf2::common::io_operation_mode_type::read;
}
else if (event->rwbs[0] == 'W')
{
mode = otf2::common::io_operation_mode_type::write;
}
else
{
return;
}

BlockDevice dev = block_device_for<RecordBioQueue>(event);

otf2::writer::local& writer = trace_.bio_writer(dev);
otf2::definition::io_handle& handle = trace_.block_io_handle(dev);
auto size = event->nr_sector * SECTOR_SIZE;

sector_cache_[dev][event->sector] = size;
writer << otf2::event::io_operation_begin(
time_converter_(event->header.time), handle, mode,
otf2::common::io_operation_flag_type::non_blocking, size, event->sector);
}
else if (identity.tracepoint == bio_issue_)
{
struct RecordBlock* event = (RecordBlock*)header;

BlockDevice dev = block_device_for<RecordBlock>(event);

if (sector_cache_.count(dev) == 0 || sector_cache_.at(dev).count(event->sector) == 0)
{
return;
}

otf2::writer::local& writer = trace_.bio_writer(dev);
otf2::definition::io_handle& handle = trace_.block_io_handle(dev);

writer << otf2::event::io_operation_issued(time_converter_(event->header.time), handle,
event->sector);
}
else if (identity.tracepoint == bio_complete_)
{
struct RecordBlock* event = (RecordBlock*)header;

BlockDevice dev = block_device_for<RecordBlock>(event);

if (sector_cache_.count(dev) == 0 && sector_cache_[dev].count(event->sector) == 0)
{
return;
}

otf2::writer::local& writer = trace_.bio_writer(dev);
otf2::definition::io_handle& handle = trace_.block_io_handle(dev);

writer << otf2::event::io_operation_complete(time_converter_(event->header.time),
handle, size, event->sector);
handle, sector_cache_[dev][event->sector],
event->sector);
sector_cache_[dev][event->sector] = 0;
}
else
{
Expand All @@ -119,18 +158,31 @@ class Writer
std::vector<tracepoint::EventFormat> get_tracepoints()
{

bio_insert_ = tracepoint::EventFormat("block:block_rq_insert");
bio_queue_ = tracepoint::EventFormat("block:block_bio_queue");
bio_issue_ = tracepoint::EventFormat("block:block_rq_issue");
bio_complete_ = tracepoint::EventFormat("block:block_rq_complete");

return { bio_insert_, bio_issue_, bio_complete_ };
return { bio_queue_, bio_issue_, bio_complete_ };
}

private:
template <class T>
BlockDevice block_device_for(T* event)
{
// Something seems to be broken about the dev_t dev field returned from the block_rq_*
// tracepoints What works is extracting the major:minor numbers from the dev field with
// bitshifts as documented in block/block_rq_*/format So first convert the dev field
// into major:minor numbers and then use the makedev macro to get an unbroken dev_t.
return BlockDevice::block_device_for(
makedev(event->dev >> 20, event->dev & ((1U << 20) - 1)));
}

private:
std::map<BlockDevice, std::map<uint64_t, uint64_t>> sector_cache_;
trace::Trace& trace_;
time::Converter& time_converter_;

tracepoint::EventFormat bio_insert_;
tracepoint::EventFormat bio_queue_;
tracepoint::EventFormat bio_issue_;
tracepoint::EventFormat bio_complete_;
// The unit "sector" is always 512 bit large, regardless of the actual sector size of the device
Expand Down
1 change: 0 additions & 1 deletion include/lo2s/perf/multi_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ class MultiReader
{
state.time = event->time;
earliest_available.push(state);
readers_.at(state.identity).pop();
break;
}

Expand Down

0 comments on commit 5c9696f

Please sign in to comment.