From cd8dc0a81cee71ca1946ac31c05e15679fc95a80 Mon Sep 17 00:00:00 2001
From: offtkp
Date: Mon, 16 Sep 2024 20:05:54 +0300
Subject: [PATCH] Patch `EXTRQ`

---
 src/core/cpu_patches.cpp | 143 +++++++++++++++++++++++++++++++-
 src/core/signals.cpp     | 174 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 312 insertions(+), 5 deletions(-)

diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp
index 3f4e4c0df7..a9b97652e8 100644
--- a/src/core/cpu_patches.cpp
+++ b/src/core/cpu_patches.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/decoder.h"
@@ -27,6 +28,18 @@
 
 using namespace Xbyak::util;
 
+// Emits OPCODE on the code generator `c`, using the `v`-prefixed (AVX) form of the instruction
+// when the host CPU reports AVX support and the legacy form otherwise.
+#define MAYBE_AVX(OPCODE, ...)                                                                     \
+    [&] {                                                                                          \
+        Cpu cpu;                                                                                   \
+        if (cpu.has(Cpu::tAVX)) {                                                                  \
+            c.v##OPCODE(__VA_ARGS__);                                                              \
+        } else {                                                                                   \
+            c.OPCODE(__VA_ARGS__);                                                                 \
+        }                                                                                          \
+    }()
+
 namespace Core {
 
 static Xbyak::Reg ZydisToXbyakRegister(const ZydisRegister reg) {
@@ -587,6 +600,123 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe
 
 #endif // __APPLE__
 
+// Patch filter: EXTRQ has to be emulated only when the host CPU lacks SSE4a.
+static bool FilterNoSSE4a(const ZydisDecodedOperand*) {
+    Cpu cpu;
+    return !cpu.has(Cpu::tSSE4a);
+}
+
+// Emits code into `c` that emulates the SSE4a EXTRQ instruction: the low qword of the destination
+// XMM register is shifted right by `index` bits and masked down to `length` bits, where a length
+// of 0 selects all 64 bits. Both values come from the two immediates, or from bits [5:0] and
+// [13:8] of the source XMM register. RFLAGS and all scratch registers are saved and restored.
+static void GenerateEXTRQ(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
+    const bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE &&
+                               operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE;
+
+    ASSERT_MSG(operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER, "operand 0 must be a register");
+
+    const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
+
+    ASSERT_MSG(dst.isXMM(), "operand 0 must be an XMM register");
+
+    // Rebuild the XMM operand from the register index rather than type-punning `dst`.
+    const Xbyak::Xmm xmm_dst(dst.getIdx());
+
+    if (immediateForm) {
+        u8 length = operands[1].imm.value.u & 0x3F;
+        const u8 index = operands[2].imm.value.u & 0x3F;
+        if (length == 0) {
+            length = 64;
+        }
+
+        LOG_DEBUG(Core, "Patching immediate form EXTRQ, length: {}, index: {}", length, index);
+
+        const Xbyak::Reg64 scratch1 = rax;
+        const Xbyak::Reg64 scratch2 = rcx;
+
+        // Set rsp to before red zone and save scratch registers
+        c.lea(rsp, ptr[rsp - 128]);
+        c.pushfq();
+        c.push(scratch1);
+        c.push(scratch2);
+
+        // A full 64-bit field keeps every bit; `1ULL << 64` would be undefined behaviour.
+        const u64 mask = length == 64 ? ~0ULL : (1ULL << length) - 1;
+
+        // Get lower qword from xmm register
+        MAYBE_AVX(movq, scratch1, xmm_dst);
+
+        if (index != 0) {
+            c.shr(scratch1, index);
+        }
+
+        // We need to move mask to a register because we can't use all the possible
+        // immediate values with `and reg, imm32`
+        c.mov(scratch2, mask);
+        c.and_(scratch1, scratch2);
+
+        // Writeback to xmm register, extrq instruction says top 64-bits are undefined so we don't
+        // care to preserve them
+        MAYBE_AVX(movq, xmm_dst, scratch1);
+
+        c.pop(scratch2);
+        c.pop(scratch1);
+        c.popfq();
+        c.lea(rsp, ptr[rsp + 128]);
+    } else {
+        ASSERT_MSG(operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER &&
+                       operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER &&
+                       operands[0].reg.value >= ZYDIS_REGISTER_XMM0 &&
+                       operands[0].reg.value <= ZYDIS_REGISTER_XMM15 &&
+                       operands[1].reg.value >= ZYDIS_REGISTER_XMM0 &&
+                       operands[1].reg.value <= ZYDIS_REGISTER_XMM15,
+                   "Unexpected operand types for EXTRQ instruction");
+
+        const auto src = ZydisToXbyakRegisterOperand(operands[1]);
+
+        ASSERT_MSG(src.isXMM(), "operand 1 must be an XMM register");
+
+        const Xbyak::Xmm xmm_src(src.getIdx());
+
+        const Xbyak::Reg64 scratch1 = rax;
+        const Xbyak::Reg64 scratch2 = rcx;
+        const Xbyak::Reg64 mask = rdx;
+
+        c.lea(rsp, ptr[rsp - 128]);
+        c.pushfq();
+        c.push(scratch1);
+        c.push(scratch2);
+        c.push(mask);
+
+        // Construct the mask out of the length that resides in bottom 6 bits of source xmm.
+        // mask = ~0 >> ((64 - length) & 63): a 64-bit shift only uses the low 6 bits of cl, so
+        // negating the length is enough, and a length of 0 (meaning 64 bits) keeps every bit,
+        // whereas `(1 << length) - 1` would produce an empty mask for it.
+        MAYBE_AVX(movq, scratch1, xmm_src);
+        c.mov(scratch2, scratch1);
+        c.neg(scratch2);
+        c.mov(mask, ~0ULL);
+        c.shr(mask, cl);
+
+        // Get the shift amount and store it in scratch2
+        c.shr(scratch1, 8);
+        c.and_(scratch1, 0x3F);
+        c.mov(scratch2, scratch1); // cl now contains the shift amount
+
+        MAYBE_AVX(movq, scratch1, xmm_dst);
+        c.shr(scratch1, cl);
+        c.and_(scratch1, mask);
+        MAYBE_AVX(movq, xmm_dst, scratch1);
+
+        c.pop(mask);
+        c.pop(scratch2);
+        c.pop(scratch1);
+        c.popfq();
+        c.lea(rsp, ptr[rsp + 128]);
+    }
+}
+
 using PatchFilter = bool (*)(const ZydisDecodedOperand*);
 using InstructionGenerator = void (*)(const ZydisDecodedOperand*, Xbyak::CodeGenerator&);
 struct PatchInfo {
@@ -608,6 +738,8 @@ static const std::unordered_map Patches = {
     {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}},
 #endif
 
+    {ZYDIS_MNEMONIC_EXTRQ, {FilterNoSSE4a, GenerateEXTRQ, true}},
+
 #ifdef __APPLE__
     // Patches for instruction sets not supported by Rosetta 2.
     // BMI1
@@ -671,14 +803,23 @@ static std::pair TryPatch(u8* code, PatchModule* module) {
 
     if (Patches.contains(instruction.mnemonic)) {
         const auto& patch_info = Patches.at(instruction.mnemonic);
+        bool needs_trampoline = patch_info.trampoline;
         if (patch_info.filter(operands)) {
             auto& patch_gen = module->patch_gen;
 
+            if (needs_trampoline && instruction.length < 5) {
+                // Trampoline is needed but instruction is too short to patch.
+                // Should be handled at illegal instruction handler.
+                // This if is for Linux which does some AOT patching,
+                // should be removed if that gets removed.
+                return std::make_pair(false, instruction.length);
+            }
+
             // Reset state and move to current code position.
             patch_gen.reset();
             patch_gen.setSize(code - patch_gen.getCode());
 
-            if (patch_info.trampoline) {
+            if (needs_trampoline) {
                 auto& trampoline_gen = module->trampoline_gen;
                 const auto trampoline_ptr = trampoline_gen.getCurr();
 
diff --git a/src/core/signals.cpp b/src/core/signals.cpp
index df3b28b975..5bf49a5d49 100644
--- a/src/core/signals.cpp
+++ b/src/core/signals.cpp
@@ -12,9 +12,164 @@
 #include
 #ifdef ARCH_X86_64
 #include
+#include
 #endif
 #endif
 
+namespace {
+
+#if defined(ARCH_X86_64)
+
+// Advances the instruction pointer saved in the OS-specific context `ctx` by `length` bytes.
+#if defined(_WIN32)
+#define INCREMENT_RIP(ctx, length) ((CONTEXT*)ctx)->Rip += length
+#elif defined(__APPLE__)
+#define INCREMENT_RIP(ctx, length) ((ucontext_t*)ctx)->uc_mcontext->__ss.__rip += length
+#else
+#define INCREMENT_RIP(ctx, length) ((ucontext_t*)ctx)->uc_mcontext.gregs[REG_RIP] += length
+#endif
+
+// Returns a pointer to the start of XMM register `index` (0-15) as saved in the OS-specific
+// signal/exception context `ctx`.
+void* getXmmPointer(void* ctx, u32 index) {
+#if defined(_WIN32)
+#define CASE(index)                                                                                \
+    case index:                                                                                    \
+        return (void*)(&(((CONTEXT*)ctx)->Xmm##index.Low))
+#elif defined(__APPLE__)
+#define CASE(index)                                                                                \
+    case index:                                                                                    \
+        return (void*)(&((ucontext_t*)ctx)->uc_mcontext->__fs.__fpu_xmm##index)
+#else
+#define CASE(index)                                                                                \
+    case index:                                                                                    \
+        return (void*)(&((ucontext_t*)ctx)->uc_mcontext.fpregs->_xmm[index].element[0])
+#endif
+    switch (index) {
+        CASE(0);
+        CASE(1);
+        CASE(2);
+        CASE(3);
+        CASE(4);
+        CASE(5);
+        CASE(6);
+        CASE(7);
+        CASE(8);
+        CASE(9);
+        CASE(10);
+        CASE(11);
+        CASE(12);
+        CASE(13);
+        CASE(14);
+        CASE(15);
+    default: {
+        UNREACHABLE_MSG("Invalid XMM register index: {}", index);
+        return nullptr;
+    }
+    }
+#undef CASE
+}
+
+// We need to check, before patching, if there's enough space for a relative jump to the trampoline.
+// If there isn't, the instruction must be handled specially in the illegal instruction handler
+// itself.
+bool shouldNotBePatched(void* code_address) {
+    ZydisDecodedInstruction instruction;
+    ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
+    const auto status =
+        Common::Decoder::Instance()->decodeInstruction(instruction, operands, code_address);
+    if (!ZYAN_SUCCESS(status)) {
+        LOG_ERROR(Core, "Failed to decode instruction at: {}", fmt::ptr(code_address));
+        // `instruction` must not be read here; let the regular dispatch path report the fault.
+        return false;
+    }
+
+    // A relative jump to the trampoline needs at least 5 bytes.
+    return instruction.length < 5;
+}
+
+// Emulates, directly on the saved register context `ctx`, the illegal instruction at
+// `code_address` that was too short to be patched with a trampoline jump. Returns true when the
+// instruction was emulated and the saved instruction pointer was advanced past it.
+bool handleIllegalInstruction(void* ctx, void* code_address) {
+    ZydisDecodedInstruction instruction;
+    ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
+    const auto status =
+        Common::Decoder::Instance()->decodeInstruction(instruction, operands, code_address);
+    if (!ZYAN_SUCCESS(status)) {
+        LOG_ERROR(Core, "Failed to decode instruction at: {}", fmt::ptr(code_address));
+        return false;
+    }
+
+    switch (instruction.mnemonic) {
+    case ZYDIS_MNEMONIC_EXTRQ: {
+        const bool immediateForm = operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE &&
+                                   operands[2].type == ZYDIS_OPERAND_TYPE_IMMEDIATE;
+        if (immediateForm) {
+            LOG_ERROR(Core, "EXTRQ immediate form should have been patched at code address: {}",
+                      fmt::ptr(code_address));
+            return false;
+        } else {
+            ASSERT_MSG(operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER &&
+                           operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER &&
+                           operands[0].reg.value >= ZYDIS_REGISTER_XMM0 &&
+                           operands[0].reg.value <= ZYDIS_REGISTER_XMM15 &&
+                           operands[1].reg.value >= ZYDIS_REGISTER_XMM0 &&
+                           operands[1].reg.value <= ZYDIS_REGISTER_XMM15,
+                       "Unexpected operand types for EXTRQ instruction");
+
+            const auto dstIndex = operands[0].reg.value - ZYDIS_REGISTER_XMM0;
+            const auto srcIndex = operands[1].reg.value - ZYDIS_REGISTER_XMM0;
+
+            const auto dst = getXmmPointer(ctx, dstIndex);
+            const auto src = getXmmPointer(ctx, srcIndex);
+
+            u64 lowQWordSrc;
+            memcpy(&lowQWordSrc, src, sizeof(lowQWordSrc));
+
+            u64 lowQWordDst;
+            memcpy(&lowQWordDst, dst, sizeof(lowQWordDst));
+
+            // Bits [5:0] of the source hold the field length; 0 selects all 64 bits.
+            const u64 length = lowQWordSrc & 0x3F;
+            const u64 mask = length == 0 ? ~0ULL : (1ULL << length) - 1;
+
+            // Bits [13:8] of the source hold the right-shift amount.
+            const u64 shift = (lowQWordSrc >> 8) & 0x3F;
+
+            lowQWordDst >>= shift;
+            lowQWordDst &= mask;
+
+            memcpy(dst, &lowQWordDst, sizeof(lowQWordDst));
+
+            INCREMENT_RIP(ctx, instruction.length);
+
+            return true;
+        }
+        break;
+    }
+    default: {
+        LOG_ERROR(Core, "Unhandled illegal instruction at code address {}: {}",
+                  fmt::ptr(code_address), ZydisMnemonicGetString(instruction.mnemonic));
+        return false;
+    }
+    }
+}
+#elif defined(ARCH_ARM64)
+// These functions shouldn't be needed for ARM as it will use a JIT so there's no need to patch
+// instructions. Returning false lets it go through with whatever handler is set up.
+bool shouldNotBePatched(void* code_address) {
+    return false;
+}
+
+bool handleIllegalInstruction(void* ctx, void* code_address) {
+    return false;
+}
+#else
+#error "Unsupported architecture"
+#endif
+} // namespace
+
 namespace Core {
 
 #if defined(_WIN32)
@@ -32,7 +187,11 @@ static LONG WINAPI SignalHandler(EXCEPTION_POINTERS* pExp) noexcept {
             pExp->ExceptionRecord->ExceptionInformation[0] == 1);
         break;
     case EXCEPTION_ILLEGAL_INSTRUCTION:
-        handled = signals->DispatchIllegalInstruction(code_address);
+        if (shouldNotBePatched(code_address)) {
+            handled = handleIllegalInstruction((void*)pExp->ContextRecord, code_address);
+        } else {
+            handled = signals->DispatchIllegalInstruction(code_address);
+        }
         break;
     default:
         break;
@@ -99,9 +258,16 @@ static void SignalHandler(int sig, siginfo_t* info, void* raw_context) {
         }
         break;
     case SIGILL:
-        if (!signals->DispatchIllegalInstruction(code_address)) {
-            UNREACHABLE_MSG("Unhandled illegal instruction at code address {}: {}",
-                            fmt::ptr(code_address), DisassembleInstruction(code_address));
+        if (shouldNotBePatched(code_address)) {
+            if (!handleIllegalInstruction(raw_context, code_address)) {
+                UNREACHABLE_MSG("Unhandled illegal instruction at code address {}: {}",
+                                fmt::ptr(code_address), DisassembleInstruction(code_address));
+            }
+        } else {
+            if (!signals->DispatchIllegalInstruction(code_address)) {
+                UNREACHABLE_MSG("Unhandled illegal instruction at code address {}: {}",
+                                fmt::ptr(code_address), DisassembleInstruction(code_address));
+            }
         }
         break;
     default: