From 9dffa0a3470934d236ec30af418730b4d873d32c Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 1 Mar 2024 18:39:00 -0800 Subject: [PATCH] Add function to create proto from disassembled instructions (#50) This patch adds a new function that refactors out some functionality from the existing proto generation functions to create a proto directly from disassembled instructions. This allows users to do their own disassembly (like for easy access to the LLVM MCInsts) and efficiently create a proto afterwards. --- gematria/datasets/bhive_importer.cc | 27 +++++++++++++-------- gematria/datasets/bhive_importer.h | 25 ++++++++++++------- gematria/datasets/bhive_importer_test.cc | 31 ++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/gematria/datasets/bhive_importer.cc b/gematria/datasets/bhive_importer.cc index 1bdb3424..70cd5bb7 100644 --- a/gematria/datasets/bhive_importer.cc +++ b/gematria/datasets/bhive_importer.cc @@ -63,6 +63,22 @@ BHiveImporter::BHiveImporter(const Canonicalizer* canonicalizer) *target_machine_.getMCAsmInfo(), *target_machine_.getMCInstrInfo(), *target_machine_.getMCRegisterInfo())) {} +BasicBlockProto BHiveImporter::BasicBlockProtoFromInstructions( + llvm::ArrayRef disassembled_instructions, + uint64_t base_address /*= 0*/) { + BasicBlockProto basic_block_proto; + for (const DisassembledInstruction& instruction : disassembled_instructions) { + MachineInstructionProto& machine_instruction = + *basic_block_proto.add_machine_instructions(); + machine_instruction.set_address(instruction.address); + machine_instruction.set_assembly(instruction.assembly); + machine_instruction.set_machine_code(instruction.machine_code); + *basic_block_proto.add_canonicalized_instructions() = ProtoFromInstruction( + canonicalizer_.InstructionFromMCInst(instruction.mc_inst)); + } + return basic_block_proto; +} + absl::StatusOr BHiveImporter::BasicBlockProtoFromMachineCode( llvm::ArrayRef machine_code, uint64_t base_address /*= 
0*/) { BasicBlockProto basic_block_proto; @@ -76,16 +92,7 @@ absl::StatusOr BHiveImporter::BasicBlockProtoFromMachineCode( return LlvmErrorToStatus(std::move(error)); } - for (DisassembledInstruction& instruction : *instructions) { - MachineInstructionProto& machine_instruction = - *basic_block_proto.add_machine_instructions(); - machine_instruction.set_address(instruction.address); - machine_instruction.set_assembly(instruction.assembly); - machine_instruction.set_machine_code(instruction.machine_code); - *basic_block_proto.add_canonicalized_instructions() = ProtoFromInstruction( - canonicalizer_.InstructionFromMCInst(instruction.mc_inst)); - } - return basic_block_proto; + return BasicBlockProtoFromInstructions(*instructions); } absl::StatusOr diff --git a/gematria/datasets/bhive_importer.h b/gematria/datasets/bhive_importer.h index 271b7dc7..776a902f 100644 --- a/gematria/datasets/bhive_importer.h +++ b/gematria/datasets/bhive_importer.h @@ -24,6 +24,7 @@ #include "absl/status/statusor.h" #include "gematria/llvm/canonicalizer.h" +#include "gematria/llvm/disassembler.h" #include "gematria/proto/basic_block.pb.h" #include "gematria/proto/throughput.pb.h" #include "llvm/ADT/ArrayRef.h" @@ -42,15 +43,21 @@ class BHiveImporter { // Does not take ownership of the canonicalizer. explicit BHiveImporter(const Canonicalizer* canonicalizer); - // Creates a basic block from the given block of machine code. `machine_code` - // must contain machine code of the instructions to include in the basic - // block. Expects that the `machine_code.begin()` is the first byte of the - // first instruction, and `machine_code.rbegin()` is the last byte of the last - // instruction. Uses `base_address` as the address of the first instruction; - // the addresses of following instructions are derived from `base_address` and - // the sizes of the instructions that preceded it. - // Returns an error when parts of `machine_code` do not disassemble using the - // provided canonicalizer. 
+ // Creates a basic block from the given disassembled instructions. The + // address, assembly and machine code of each instruction are copied from + // `disassembled_instructions`. NOTE: `base_address` is not read by the + // current implementation. + BasicBlockProto BasicBlockProtoFromInstructions( + llvm::ArrayRef disassembled_instructions, + uint64_t base_address = 0); + + // A version of BasicBlockProtoFromInstructions that first disassembles the + // given block of machine code. `machine_code` must contain machine code + // of the instructions to include in the basic block. Expects that the + // `machine_code.begin()` is the first byte of the first instruction, and + // `machine_code.rbegin()` is the last byte of the last instruction. Returns + // an error when parts of `machine_code` do not disassemble using the provided + // canonicalizer. absl::StatusOr BasicBlockProtoFromMachineCode( llvm::ArrayRef machine_code, uint64_t base_address = 0); diff --git a/gematria/datasets/bhive_importer_test.cc b/gematria/datasets/bhive_importer_test.cc index 2617b8b0..8e2c6744 100644 --- a/gematria/datasets/bhive_importer_test.cc +++ b/gematria/datasets/bhive_importer_test.cc @@ -19,6 +19,7 @@ #include "gematria/llvm/canonicalizer.h" #include "gematria/llvm/llvm_architecture_support.h" #include "gematria/testing/matchers.h" +#include "gematria/utils/string.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -57,6 +58,36 @@ TEST_F(BHiveImporterTest, EmptyBlock) { })pb"))); } +TEST_F(BHiveImporterTest, SingleInstructionHex) { + const auto machine_code_bytes_or_status = ParseHexString("4929d2"); + ASSERT_TRUE(machine_code_bytes_or_status.has_value()); + + std::unique_ptr inst_printer = + x86_llvm_->CreateMCInstPrinter(0); + + llvm::Expected> instructions_or_error = + DisassembleAllInstructions( + x86_llvm_->mc_disassembler(), x86_llvm_->mc_instr_info(), + x86_llvm_->mc_register_info(), x86_llvm_->mc_subtarget_info(), + *inst_printer, 0, 
*machine_code_bytes_or_status); + ASSERT_TRUE(static_cast(instructions_or_error)); + EXPECT_THAT(x86_bhive_importer_->BasicBlockProtoFromInstructions( + *instructions_or_error, 0), + EqualsProto( + R"pb(machine_instructions { + assembly: "\tsubq\t%rdx, %r10" + machine_code: "I)\322" + } + canonicalized_instructions { + mnemonic: "SUB" + llvm_mnemonic: "SUB64rr" + output_operands { register_name: "R10" } + input_operands { register_name: "R10" } + input_operands { register_name: "RDX" } + implicit_output_operands { register_name: "EFLAGS" } + })pb")); +} + TEST_F(BHiveImporterTest, OneInstruction) { EXPECT_THAT(x86_bhive_importer_->ParseBHiveCsvLine( kSourceName, "4929d2,100.000000", 0, 1, 0.5),