From 409c73f5ab0d16f2a208bbfa97dec9018afd351d Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Fri, 15 Sep 2023 22:24:13 +0200 Subject: [PATCH 1/5] Latte: Rework command processor Widened conditions for accelerated draws Added code for logging buffer contents --- .../HW/Latte/Core/LatteCommandProcessor.cpp | 1057 +++++++++++------ src/Cafe/HW/Latte/Core/LatteOverlay.cpp | 6 +- src/Cafe/HW/Latte/Core/LatteOverlay.h | 2 +- .../HW/Latte/Core/LattePerformanceMonitor.cpp | 8 +- .../HW/Latte/Core/LattePerformanceMonitor.h | 1 + src/Cafe/HW/Latte/ISA/LatteReg.h | 2 +- 6 files changed, 684 insertions(+), 392 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp index 37ce8ff97..60e5935cd 100644 --- a/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp +++ b/src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp @@ -16,9 +16,17 @@ #include "Cafe/CafeSystem.h" +#include + +void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size); + #define CP_TIMER_RECHECK 1024 -//#define FAST_DRAW_LOGGING +//#define LATTE_CP_LOGGING + +typedef uint32be* LatteCMDPtr; +#define LatteReadCMD() ((uint32)*(cmd++)) +#define LatteSkipCMD(_nWords) cmd += (_nWords) uint8* gxRingBufferReadPtr; // currently active read pointer (gx2 ring buffer or display list) uint8* gx2CPParserDisplayListPtr; @@ -31,6 +39,14 @@ void LatteThread_Exit(); class DrawPassContext { + struct CmdQueuePos + { + CmdQueuePos(LatteCMDPtr current, LatteCMDPtr start, LatteCMDPtr end) : current(current), start(start), end(end) {}; + + LatteCMDPtr current; + LatteCMDPtr start; + LatteCMDPtr end; + }; public: bool isWithinDrawPass() const { @@ -54,6 +70,13 @@ class DrawPassContext if (numInstances == 0) return; + /* + if (GetAsyncKeyState('B')) + { + cemuLog_force("[executeDraw] {} Count {} BaseVertex {} BaseInstance {}", m_isFirstDraw?"Init":"Fast", count, baseVertex, baseInstance); + } + */ + if (!isAutoIndex) { 
cemu_assert_debug(physIndices != MPTR_NULL); @@ -66,6 +89,9 @@ class DrawPassContext { g_renderer->draw_execute(baseVertex, baseInstance, numInstances, count, MPTR_NULL, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE::AUTO, m_isFirstDraw); } + performanceMonitor.cycle[performanceMonitor.cycleIndex].drawCallCounter++; + if (!m_isFirstDraw) + performanceMonitor.cycle[performanceMonitor.cycleIndex].fastDrawCallCounter++; m_isFirstDraw = false; m_vertexBufferChanged = false; m_uniformBufferChanged = false; @@ -87,14 +113,33 @@ class DrawPassContext m_uniformBufferChanged = true; } + // command buffer processing position + void PushCurrentCommandQueuePos(LatteCMDPtr current, LatteCMDPtr start, LatteCMDPtr end) + { + m_queuePosStack.emplace_back(current, start, end); + } + + bool PopCurrentCommandQueuePos(LatteCMDPtr& current, LatteCMDPtr& start, LatteCMDPtr& end) + { + if (m_queuePosStack.empty()) + return false; + const auto& it = m_queuePosStack.back(); + current = it.current; + start = it.start; + end = it.end; + m_queuePosStack.pop_back(); + return true; + } + private: bool m_drawPassActive{ false }; bool m_isFirstDraw{false}; bool m_vertexBufferChanged{ false }; bool m_uniformBufferChanged{ false }; + boost::container::small_vector m_queuePosStack; }; -void LatteCP_processCommandBuffer(uint8* cmdBuffer, sint32 cmdSize, DrawPassContext& drawPassCtx); +void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx); /* * Read a U32 from the command buffer @@ -193,10 +238,6 @@ void LatteCP_skipWords(uint32 wordsToSkip) } } -typedef uint32be* LatteCMDPtr; -#define LatteReadCMD() ((uint32)*(cmd++)) -#define LatteSkipCMD(_nWords) cmd += (_nWords) - LatteCMDPtr LatteCP_itSurfaceSync(LatteCMDPtr cmd) { uint32 invalidationFlags = LatteReadCMD(); @@ -215,22 +256,31 @@ LatteCMDPtr LatteCP_itSurfaceSync(LatteCMDPtr cmd) return cmd; } -template -void LatteCP_itIndirectBufferDepr(uint32 nWords) +// called from TCL command queue. 
Executes a memory command buffer +void LatteCP_itIndirectBufferDepr(LatteCMDPtr cmd, uint32 nWords) { cemu_assert_debug(nWords == 3); - - uint32 physicalAddress = readU32(); - uint32 physicalAddressHigh = readU32(); // unused - uint32 sizeInDWords = readU32(); + uint32 physicalAddress = LatteReadCMD(); + uint32 physicalAddressHigh = LatteReadCMD(); // unused + uint32 sizeInDWords = LatteReadCMD(); uint32 displayListSize = sizeInDWords * 4; DrawPassContext drawPassCtx; - LatteCP_processCommandBuffer(memory_getPointerFromPhysicalOffset(physicalAddress), displayListSize, drawPassCtx); + +#ifdef LATTE_CP_LOGGING + if (GetAsyncKeyState('A')) + LatteCP_DebugPrintCmdBuffer(MEMPTR(physicalAddress), displayListSize); +#endif + + uint32be* buf = MEMPTR(physicalAddress).GetPtr(); + drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords); + + LatteCP_processCommandBuffer(drawPassCtx); if (drawPassCtx.isWithinDrawPass()) drawPassCtx.endDrawPass(); } -LatteCMDPtr LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& drawPassCtx) +// pushes the command buffer to the stack +void LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassContext& drawPassCtx) { cemu_assert_debug(nWords == 3); uint32 physicalAddress = LatteReadCMD(); @@ -239,8 +289,8 @@ LatteCMDPtr LatteCP_itIndirectBuffer(LatteCMDPtr cmd, uint32 nWords, DrawPassCon uint32 displayListSize = sizeInDWords * 4; cemu_assert_debug(displayListSize >= 4); - LatteCP_processCommandBuffer(memory_getPointerFromPhysicalOffset(physicalAddress), displayListSize, drawPassCtx); - return cmd; + uint32be* buf = MEMPTR(physicalAddress).GetPtr(); + drawPassCtx.PushCurrentCommandQueuePos(buf, buf, buf + sizeInDWords); } LatteCMDPtr LatteCP_itStreamoutBufferUpdate(LatteCMDPtr cmd, uint32 nWords) @@ -615,8 +665,6 @@ LatteCMDPtr LatteCP_itDrawIndex2(LatteCMDPtr cmd, uint32 nWords, DrawPassContext uint32 count = LatteReadCMD(); uint32 ukn3 = LatteReadCMD(); - 
performanceMonitor.cycle[performanceMonitor.cycleIndex].drawCallCounter++; - LatteGPUState.currentDrawCallTick = GetTickCount(); drawPassCtx.executeDraw(count, false, physIndices); return cmd; @@ -628,8 +676,6 @@ LatteCMDPtr LatteCP_itDrawIndexAuto(LatteCMDPtr cmd, uint32 nWords, DrawPassCont uint32 count = LatteReadCMD(); uint32 ukn = LatteReadCMD(); - performanceMonitor.cycle[performanceMonitor.cycleIndex].drawCallCounter++; - if (LatteGPUState.drawContext.numInstances == 0) return cmd; LatteGPUState.currentDrawCallTick = GetTickCount(); @@ -692,7 +738,6 @@ LatteCMDPtr LatteCP_itDrawImmediate(LatteCMDPtr cmd, uint32 nWords, DrawPassCont // verify packet size if (nWords != (2 + numIndexU32s)) debugBreakpoint(); - performanceMonitor.cycle[performanceMonitor.cycleIndex].drawCallCounter++; uint32 baseVertex = LatteGPUState.contextRegister[mmSQ_VTX_BASE_VTX_LOC]; uint32 baseInstance = LatteGPUState.contextRegister[mmSQ_VTX_START_INST_LOC]; @@ -930,398 +975,682 @@ void LatteCP_dumpCommandBufferError(LatteCMDPtr cmdStart, LatteCMDPtr cmdEnd, La } // any drawcalls issued without changing textures, framebuffers, shader or other complex states can be done quickly without having to reinitialize the entire pipeline state -// we implement this optimization by having an optimized version of LatteCP_processCommandBuffer, called right after drawcalls, which only implements commands that dont interfere with fast drawing. Other commands will cause this function to return to the complex parser -LatteCMDPtr LatteCP_processCommandBuffer_continuousDrawPass(LatteCMDPtr cmd, LatteCMDPtr cmdStart, LatteCMDPtr cmdEnd, DrawPassContext& drawPassCtx) +// we implement this optimization by having a specialized version of LatteCP_processCommandBuffer, called right after drawcalls, which only implements commands that dont interfere with fast drawing. 
Other commands will cause this function to return to the complex and generic parser +void LatteCP_processCommandBuffer_continuousDrawPass(DrawPassContext& drawPassCtx) { cemu_assert_debug(drawPassCtx.isWithinDrawPass()); // quit early if there are parameters set which are generally incompatible with fast drawing if (LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] != 0) { drawPassCtx.endDrawPass(); - return cmd; + return; } // check for other special states? - while (cmd < cmdEnd) + while (true) { - LatteCMDPtr cmdBeforeCommand = cmd; - uint32 itHeader = LatteReadCMD(); - uint32 itHeaderType = (itHeader >> 30) & 3; - if (itHeaderType == 3) + LatteCMDPtr cmd, cmdStart, cmdEnd; + if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd)) { - uint32 itCode = (itHeader >> 8) & 0xFF; - uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1; - switch (itCode) - { - case IT_SET_RESOURCE: // attribute buffers, uniform buffers or texture units - { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords, [&drawPassCtx](uint32 registerStart, uint32 registerEnd) + drawPassCtx.endDrawPass(); + return; + } + + while (cmd < cmdEnd) + { + LatteCMDPtr cmdBeforeCommand = cmd; + uint32 itHeader = LatteReadCMD(); + uint32 itHeaderType = (itHeader >> 30) & 3; + if (itHeaderType == 3) + { + uint32 itCode = (itHeader >> 8) & 0xFF; + uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1; + LatteCMDPtr cmdData = cmd; + cmd += nWords; + switch (itCode) { - if (registerStart >= Latte::REGADDR::SQ_TEX_RESOURCE_WORD_FIRST && registerStart <= Latte::REGADDR::SQ_TEX_RESOURCE_WORD_LAST) - drawPassCtx.endDrawPass(); // texture updates end the current draw sequence - else if (registerStart >= mmSQ_VTX_ATTRIBUTE_BLOCK_START && registerEnd <= mmSQ_VTX_ATTRIBUTE_BLOCK_END) - drawPassCtx.notifyModifiedVertexBuffer(); - else - drawPassCtx.notifyModifiedUniformBuffer(); - }); - if (!drawPassCtx.isWithinDrawPass()) - return cmd; - break; + case IT_SET_RESOURCE: // attribute buffers, uniform buffers or texture units + 
{ + LatteCP_itSetRegistersGeneric(cmdData, nWords, [&drawPassCtx](uint32 registerStart, uint32 registerEnd) + { + if ((registerStart >= Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS && registerStart < (Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS + Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7)) || + (registerStart >= Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS && registerStart < (Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS + Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7)) || + (registerStart >= Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS && registerStart < (Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS + Latte::GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7))) + drawPassCtx.endDrawPass(); // texture updates end the current draw sequence + else if (registerStart >= mmSQ_VTX_ATTRIBUTE_BLOCK_START && registerEnd <= mmSQ_VTX_ATTRIBUTE_BLOCK_END) + drawPassCtx.notifyModifiedVertexBuffer(); + else + drawPassCtx.notifyModifiedUniformBuffer(); + }); + if (!drawPassCtx.isWithinDrawPass()) + { + drawPassCtx.PushCurrentCommandQueuePos(cmd, cmdStart, cmdEnd); + return; + } + break; + } + case IT_SET_ALU_CONST: // uniform register + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + break; + } + case IT_SET_CTL_CONST: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + break; + } + case IT_SET_CONFIG_REG: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + break; + } + case IT_INDEX_TYPE: + { + LatteCP_itIndexType(cmdData, nWords); + break; + } + case IT_NUM_INSTANCES: + { + LatteCP_itNumInstances(cmdData, nWords); + break; + } + case IT_DRAW_INDEX_2: + { + LatteCP_itDrawIndex2(cmdData, nWords, drawPassCtx); + break; + } + case IT_SET_CONTEXT_REG: + { + drawPassCtx.endDrawPass(); + drawPassCtx.PushCurrentCommandQueuePos(cmdBeforeCommand, cmdStart, cmdEnd); + return; + } + case IT_INDIRECT_BUFFER_PRIV: + { + drawPassCtx.PushCurrentCommandQueuePos(cmd, cmdStart, cmdEnd); + LatteCP_itIndirectBuffer(cmdData, nWords, drawPassCtx); + if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, 
cmdEnd)) // switch to sub buffer + cemu_assert_debug(false); + + //if (!drawPassCtx.isWithinDrawPass()) + // return cmdData; + break; + } + default: + // unsupported command for fast draw + drawPassCtx.endDrawPass(); + drawPassCtx.PushCurrentCommandQueuePos(cmdBeforeCommand, cmdStart, cmdEnd); + return; + } } - case IT_SET_ALU_CONST: // uniform register + else if (itHeaderType == 2) { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); - break; + // filler packet } - case IT_SET_CTL_CONST: + else { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); - break; + // unsupported command for fast draw + drawPassCtx.endDrawPass(); + drawPassCtx.PushCurrentCommandQueuePos(cmdBeforeCommand, cmdStart, cmdEnd); + return; } - case IT_SET_CONFIG_REG: - { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); + } + } + if (drawPassCtx.isWithinDrawPass()) + drawPassCtx.endDrawPass(); +} + +void LatteCP_processCommandBuffer(DrawPassContext& drawPassCtx) +{ + while (true) + { + LatteCMDPtr cmd, cmdStart, cmdEnd; + if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd)) + break; + while (cmd < cmdEnd) + { + uint32 itHeader = LatteReadCMD(); + uint32 itHeaderType = (itHeader >> 30) & 3; + if (itHeaderType == 3) + { + uint32 itCode = (itHeader >> 8) & 0xFF; + uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1; + LatteCMDPtr cmdData = cmd; + cmd += nWords; + switch (itCode) + { + case IT_SET_CONTEXT_REG: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + } break; - } - case IT_INDEX_TYPE: - { - cmd = LatteCP_itIndexType(cmd, nWords); + case IT_SET_RESOURCE: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + } break; - } - case IT_NUM_INSTANCES: - { - cmd = LatteCP_itNumInstances(cmd, nWords); + case IT_SET_ALU_CONST: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + } break; - } - case IT_DRAW_INDEX_2: - { -#ifdef FAST_DRAW_LOGGING - if(GetAsyncKeyState('A')) - forceLogRemoveMe_printf("Minimal draw"); -#endif - cmd = LatteCP_itDrawIndex2(cmd, nWords, drawPassCtx); 
+ case IT_SET_CTL_CONST: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + } + break; + case IT_SET_SAMPLER: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + } + break; + case IT_SET_CONFIG_REG: + { + LatteCP_itSetRegistersGeneric(cmdData, nWords); + } + break; + case IT_SET_LOOP_CONST: + { + // todo + } + break; + case IT_SURFACE_SYNC: + { + LatteCP_itSurfaceSync(cmdData); + } + break; + case IT_INDIRECT_BUFFER_PRIV: + { + drawPassCtx.PushCurrentCommandQueuePos(cmd, cmdStart, cmdEnd); + LatteCP_itIndirectBuffer(cmdData, nWords, drawPassCtx); + if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd)) // switch to sub buffer + cemu_assert_debug(false); + } + break; + case IT_STRMOUT_BUFFER_UPDATE: + { + LatteCP_itStreamoutBufferUpdate(cmdData, nWords); + } + break; + case IT_INDEX_TYPE: + { + LatteCP_itIndexType(cmdData, nWords); + } + break; + case IT_NUM_INSTANCES: + { + LatteCP_itNumInstances(cmdData, nWords); + } + break; + case IT_DRAW_INDEX_2: + { + drawPassCtx.beginDrawPass(); + LatteCP_itDrawIndex2(cmdData, nWords, drawPassCtx); + // enter fast draw mode + drawPassCtx.PushCurrentCommandQueuePos(cmd, cmdStart, cmdEnd); + LatteCP_processCommandBuffer_continuousDrawPass(drawPassCtx); + cemu_assert_debug(!drawPassCtx.isWithinDrawPass()); + if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd)) + return; + } break; + case IT_DRAW_INDEX_AUTO: + { + drawPassCtx.beginDrawPass(); + LatteCP_itDrawIndexAuto(cmdData, nWords, drawPassCtx); + // enter fast draw mode + drawPassCtx.PushCurrentCommandQueuePos(cmd, cmdStart, cmdEnd); + LatteCP_processCommandBuffer_continuousDrawPass(drawPassCtx); + cemu_assert_debug(!drawPassCtx.isWithinDrawPass()); + if (!drawPassCtx.PopCurrentCommandQueuePos(cmd, cmdStart, cmdEnd)) + return; + } + break; + case IT_DRAW_INDEX_IMMD: + { + DrawPassContext drawPassCtx; + drawPassCtx.beginDrawPass(); + LatteCP_itDrawImmediate(cmdData, nWords, drawPassCtx); + drawPassCtx.endDrawPass(); + break; + } + case 
IT_WAIT_REG_MEM: + { + LatteCP_itWaitRegMem(cmdData, nWords); + LatteTiming_HandleTimedVsync(); + LatteAsyncCommands_checkAndExecute(); + break; + } + case IT_MEM_WRITE: + { + LatteCP_itMemWrite(cmdData, nWords); + break; + } + case IT_CONTEXT_CONTROL: + { + LatteCP_itContextControl(cmdData, nWords); + break; + } + case IT_MEM_SEMAPHORE: + { + LatteCP_itMemSemaphore(cmdData, nWords); + break; + } + case IT_LOAD_CONFIG_REG: + { + LatteCP_itLoadReg(cmdData, nWords, LATTE_REG_BASE_CONFIG); + break; + } + case IT_LOAD_CONTEXT_REG: + { + LatteCP_itLoadReg(cmdData, nWords, LATTE_REG_BASE_CONTEXT); + break; + } + case IT_LOAD_ALU_CONST: + { + LatteCP_itLoadReg(cmdData, nWords, LATTE_REG_BASE_ALU_CONST); + break; + } + case IT_LOAD_LOOP_CONST: + { + LatteCP_itLoadReg(cmdData, nWords, LATTE_REG_BASE_LOOP_CONST); + break; + } + case IT_LOAD_RESOURCE: + { + LatteCP_itLoadReg(cmdData, nWords, LATTE_REG_BASE_RESOURCE); + break; + } + case IT_LOAD_SAMPLER: + { + LatteCP_itLoadReg(cmdData, nWords, LATTE_REG_BASE_SAMPLER); + break; + } + case IT_SET_PREDICATION: + { + LatteCP_itSetPredication(cmdData, nWords); + break; + } + case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER: + { + LatteCP_itHLECopyColorBufferToScanBuffer(cmdData, nWords); + break; + } + case IT_HLE_TRIGGER_SCANBUFFER_SWAP: + { + LatteCP_itHLESwapScanBuffer(cmdData, nWords); + break; + } + case IT_HLE_WAIT_FOR_FLIP: + { + LatteCP_itHLEWaitForFlip(cmdData, nWords); + break; + } + case IT_HLE_REQUEST_SWAP_BUFFERS: + { + LatteCP_itHLERequestSwapBuffers(cmdData, nWords); + break; + } + case IT_HLE_CLEAR_COLOR_DEPTH_STENCIL: + { + LatteCP_itHLEClearColorDepthStencil(cmdData, nWords); + break; + } + case IT_HLE_COPY_SURFACE_NEW: + { + LatteCP_itHLECopySurfaceNew(cmdData, nWords); + break; + } + case IT_HLE_SAMPLE_TIMER: + { + LatteCP_itHLESampleTimer(cmdData, nWords); + break; + } + case IT_HLE_SPECIAL_STATE: + { + LatteCP_itHLESpecialState(cmdData, nWords); + break; + } + case IT_HLE_BEGIN_OCCLUSION_QUERY: + { + 
LatteCP_itHLEBeginOcclusionQuery(cmdData, nWords); + break; + } + case IT_HLE_END_OCCLUSION_QUERY: + { + LatteCP_itHLEEndOcclusionQuery(cmdData, nWords); + break; + } + case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP: + { + LatteCP_itHLESetRetirementTimestamp(cmdData, nWords); + break; + } + case IT_HLE_BOTTOM_OF_PIPE_CB: + { + LatteCP_itHLEBottomOfPipeCB(cmdData, nWords); + break; + } + case IT_HLE_SYNC_ASYNC_OPERATIONS: + { + LatteTextureReadback_UpdateFinishedTransfers(true); + LatteQuery_UpdateFinishedQueriesForceFinishAll(); + break; + } + default: + debug_printf("Unhandled IT %02x\n", itCode); + cemu_assert_debug(false); + break; + } } - case IT_SET_CONTEXT_REG: + else if (itHeaderType == 2) { -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] Quit due to command IT_SET_CONTEXT_REG Reg: %04x", (uint32)cmd[0] + 0xA000); -#endif - drawPassCtx.endDrawPass(); - return cmdBeforeCommand; + // filler packet + // has no body } - case IT_INDIRECT_BUFFER_PRIV: + else if (itHeaderType == 0) { - cmd = LatteCP_itIndirectBuffer(cmd, nWords, drawPassCtx); - if (!drawPassCtx.isWithinDrawPass()) - return cmd; - break; + uint32 registerBase = (itHeader & 0xFFFF); + uint32 registerCount = ((itHeader >> 16) & 0x3FFF) + 1; + if (registerBase == 0x304A) + { + GX2::__GX2NotifyEvent(GX2::GX2CallbackEventType::TIMESTAMP_TOP); + LatteSkipCMD(registerCount); + } + else if (registerBase == 0x304B) + { + LatteSkipCMD(registerCount); + } + else + { + LatteCP_dumpCommandBufferError(cmdStart, cmdEnd, cmd); + cemu_assert_debug(false); + } } - default: -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] Quit due to command itCode 0x%02x", itCode); -#endif - drawPassCtx.endDrawPass(); - return cmdBeforeCommand; + else + { + debug_printf("invalid itHeaderType %08x\n", itHeaderType); + LatteCP_dumpCommandBufferError(cmdStart, cmdEnd, cmd); + cemu_assert_debug(false); } } - else if (itHeaderType == 2) - { - // filler 
packet - } - else - { -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] Quit due to unsupported headerType 0x%02x", itHeaderType); -#endif - drawPassCtx.endDrawPass(); - return cmdBeforeCommand; - } + cemu_assert_debug(cmd == cmdEnd); } - cemu_assert_debug(drawPassCtx.isWithinDrawPass()); - return cmd; } -void LatteCP_processCommandBuffer(uint8* cmdBuffer, sint32 cmdSize, DrawPassContext& drawPassCtx) +void LatteCP_ProcessRingbuffer() { - LatteCMDPtr cmd = (LatteCMDPtr)cmdBuffer; - LatteCMDPtr cmdStart = (LatteCMDPtr)cmdBuffer; - LatteCMDPtr cmdEnd = (LatteCMDPtr)(cmdBuffer + cmdSize); - - if (drawPassCtx.isWithinDrawPass()) - { - cmd = LatteCP_processCommandBuffer_continuousDrawPass(cmd, cmdStart, cmdEnd, drawPassCtx); - cemu_assert_debug(cmd <= cmdEnd); - if (cmd == cmdEnd) - return; - cemu_assert_debug(!drawPassCtx.isWithinDrawPass()); - } - - while (cmd < cmdEnd) + sint32 timerRecheck = 0; // estimates how much CP processing time has elapsed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called + while (true) { - uint32 itHeader = LatteReadCMD(); + uint32 itHeader = LatteCP_readU32Deprc(); uint32 itHeaderType = (itHeader >> 30) & 3; if (itHeaderType == 3) { uint32 itCode = (itHeader >> 8) & 0xFF; uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1; -#ifdef CEMU_DEBUG_ASSERT - LatteCMDPtr expectedPostCmd = cmd + nWords; -#endif + LatteCP_waitForNWords(nWords); + LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr; + uint8* cmdEnd = gxRingBufferReadPtr + nWords * 4; + gxRingBufferReadPtr = cmdEnd; switch (itCode) { + case IT_SURFACE_SYNC: + { + LatteCP_itSurfaceSync(cmd); + timerRecheck += CP_TIMER_RECHECK / 512; + } + break; case IT_SET_CONTEXT_REG: { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); + LatteCP_itSetRegistersGeneric(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; } break; case IT_SET_RESOURCE: { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); + 
LatteCP_itSetRegistersGeneric(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; } break; case IT_SET_ALU_CONST: { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); + LatteCP_itSetRegistersGeneric(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_SET_CTL_CONST: { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); + LatteCP_itSetRegistersGeneric(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_SET_SAMPLER: { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); + LatteCP_itSetRegistersGeneric(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_SET_CONFIG_REG: { - cmd = LatteCP_itSetRegistersGeneric(cmd, nWords); - } - break; - case IT_SET_LOOP_CONST: - { - LatteSkipCMD(nWords); - // todo - } - break; - case IT_SURFACE_SYNC: - { - cmd = LatteCP_itSurfaceSync(cmd); + LatteCP_itSetRegistersGeneric(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_INDIRECT_BUFFER_PRIV: { - cmd = LatteCP_itIndirectBuffer(cmd, nWords, drawPassCtx); - if (drawPassCtx.isWithinDrawPass()) - { - cmd = LatteCP_processCommandBuffer_continuousDrawPass(cmd, cmdStart, cmdEnd, drawPassCtx); - cemu_assert_debug(cmd <= cmdEnd); - if (cmd == cmdEnd) - return; - cemu_assert_debug(!drawPassCtx.isWithinDrawPass()); - } -#ifdef CEMU_DEBUG_ASSERT - expectedPostCmd = cmd; -#endif + LatteCP_itIndirectBufferDepr(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_STRMOUT_BUFFER_UPDATE: { - cmd = LatteCP_itStreamoutBufferUpdate(cmd, nWords); + LatteCP_itStreamoutBufferUpdate(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_INDEX_TYPE: { - cmd = LatteCP_itIndexType(cmd, nWords); + LatteCP_itIndexType(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 1024; + break; } - break; case IT_NUM_INSTANCES: { - cmd = LatteCP_itNumInstances(cmd, nWords); + LatteCP_itNumInstances(cmd, nWords); + timerRecheck += 
CP_TIMER_RECHECK / 1024; + break; } - break; case IT_DRAW_INDEX_2: { + DrawPassContext drawPassCtx; drawPassCtx.beginDrawPass(); -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] Starting"); -#endif - cmd = LatteCP_itDrawIndex2(cmd, nWords, drawPassCtx); - cmd = LatteCP_processCommandBuffer_continuousDrawPass(cmd, cmdStart, cmdEnd, drawPassCtx); - cemu_assert_debug(cmd == cmdEnd || drawPassCtx.isWithinDrawPass() == false); // draw sequence should have ended if we didn't reach the end of the command buffer -#ifdef CEMU_DEBUG_ASSERT - expectedPostCmd = cmd; -#endif + LatteCP_itDrawIndex2(cmd, nWords, drawPassCtx); + drawPassCtx.endDrawPass(); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_DRAW_INDEX_AUTO: { + DrawPassContext drawPassCtx; drawPassCtx.beginDrawPass(); - cmd = LatteCP_itDrawIndexAuto(cmd, nWords, drawPassCtx); - cmd = LatteCP_processCommandBuffer_continuousDrawPass(cmd, cmdStart, cmdEnd, drawPassCtx); - cemu_assert_debug(cmd == cmdEnd || drawPassCtx.isWithinDrawPass() == false); // draw sequence should have ended if we didn't reach the end of the command buffer -#ifdef CEMU_DEBUG_ASSERT - expectedPostCmd = cmd; -#endif -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] Auto-draw"); -#endif + LatteCP_itDrawIndexAuto(cmd, nWords, drawPassCtx); + drawPassCtx.endDrawPass(); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_DRAW_INDEX_IMMD: { DrawPassContext drawPassCtx; drawPassCtx.beginDrawPass(); - cmd = LatteCP_itDrawImmediate(cmd, nWords, drawPassCtx); + LatteCP_itDrawImmediate(cmd, nWords, drawPassCtx); drawPassCtx.endDrawPass(); + timerRecheck += CP_TIMER_RECHECK / 64; break; } case IT_WAIT_REG_MEM: { - cmd = LatteCP_itWaitRegMem(cmd, nWords); - LatteTiming_HandleTimedVsync(); - LatteAsyncCommands_checkAndExecute(); + LatteCP_itWaitRegMem(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 16; + break; } - break; case 
IT_MEM_WRITE: { - cmd = LatteCP_itMemWrite(cmd, nWords); + LatteCP_itMemWrite(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 128; + break; } - break; case IT_CONTEXT_CONTROL: { - cmd = LatteCP_itContextControl(cmd, nWords); + LatteCP_itContextControl(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 128; + break; } - break; case IT_MEM_SEMAPHORE: { - cmd = LatteCP_itMemSemaphore(cmd, nWords); + LatteCP_itMemSemaphore(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 128; + break; } - break; case IT_LOAD_CONFIG_REG: { - cmd = LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_CONFIG); + LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_CONFIG); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_LOAD_CONTEXT_REG: { - cmd = LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_CONTEXT); + LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_CONTEXT); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_LOAD_ALU_CONST: { - cmd = LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_ALU_CONST); + LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_ALU_CONST); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_LOAD_LOOP_CONST: { - cmd = LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_LOOP_CONST); + LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_LOOP_CONST); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_LOAD_RESOURCE: { - cmd = LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_RESOURCE); + LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_RESOURCE); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_LOAD_SAMPLER: { - cmd = LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_SAMPLER); + LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_SAMPLER); + timerRecheck += CP_TIMER_RECHECK / 64; + break; + } + case IT_SET_LOOP_CONST: + { + // todo + break; } - break; case IT_SET_PREDICATION: { - cmd = LatteCP_itSetPredication(cmd, nWords); + LatteCP_itSetPredication(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; 
case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER: { - cmd = LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords); + LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_HLE_TRIGGER_SCANBUFFER_SWAP: { - cmd = LatteCP_itHLESwapScanBuffer(cmd, nWords); + LatteCP_itHLESwapScanBuffer(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 64; + break; } - break; case IT_HLE_WAIT_FOR_FLIP: { - cmd = LatteCP_itHLEWaitForFlip(cmd, nWords); + LatteCP_itHLEWaitForFlip(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 1; + break; } - break; case IT_HLE_REQUEST_SWAP_BUFFERS: { - cmd = LatteCP_itHLERequestSwapBuffers(cmd, nWords); + LatteCP_itHLERequestSwapBuffers(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 32; + break; } - break; case IT_HLE_CLEAR_COLOR_DEPTH_STENCIL: { - cmd = LatteCP_itHLEClearColorDepthStencil(cmd, nWords); + LatteCP_itHLEClearColorDepthStencil(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 128; + break; } - break; case IT_HLE_COPY_SURFACE_NEW: { - cmd = LatteCP_itHLECopySurfaceNew(cmd, nWords); + LatteCP_itHLECopySurfaceNew(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 128; + break; + } + case IT_HLE_FIFO_WRAP_AROUND: + { + LatteCP_itHLEFifoWrapAround(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_HLE_SAMPLE_TIMER: { - cmd = LatteCP_itHLESampleTimer(cmd, nWords); + LatteCP_itHLESampleTimer(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_HLE_SPECIAL_STATE: { - cmd = LatteCP_itHLESpecialState(cmd, nWords); + LatteCP_itHLESpecialState(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_HLE_BEGIN_OCCLUSION_QUERY: { - cmd = LatteCP_itHLEBeginOcclusionQuery(cmd, nWords); + LatteCP_itHLEBeginOcclusionQuery(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_HLE_END_OCCLUSION_QUERY: { - cmd = LatteCP_itHLEEndOcclusionQuery(cmd, nWords); + 
LatteCP_itHLEEndOcclusionQuery(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP: { - cmd = LatteCP_itHLESetRetirementTimestamp(cmd, nWords); + LatteCP_itHLESetRetirementTimestamp(cmd, nWords); + timerRecheck += CP_TIMER_RECHECK / 512; + break; } - break; case IT_HLE_BOTTOM_OF_PIPE_CB: { - cmd = LatteCP_itHLEBottomOfPipeCB(cmd, nWords); + LatteCP_itHLEBottomOfPipeCB(cmd, nWords); + break; } - break; case IT_HLE_SYNC_ASYNC_OPERATIONS: { - LatteSkipCMD(nWords); + //LatteCP_skipWords(nWords); LatteTextureReadback_UpdateFinishedTransfers(true); LatteQuery_UpdateFinishedQueriesForceFinishAll(); + break; } - break; default: - debug_printf("Unhandled IT %02x\n", itCode); cemu_assert_debug(false); - LatteSkipCMD(nWords); } -#ifdef CEMU_DEBUG_ASSERT - if(cmd != expectedPostCmd) - debug_printf("cmd %016p expectedPostCmd %016p\n", cmd, expectedPostCmd); - cemu_assert_debug(cmd == expectedPostCmd); -#endif } else if (itHeaderType == 2) { - // filler packet - // has no body + // filler packet, skip this + cemu_assert_debug(itHeader == 0x80000000); } else if (itHeaderType == 0) { @@ -1330,341 +1659,300 @@ void LatteCP_processCommandBuffer(uint8* cmdBuffer, sint32 cmdSize, DrawPassCont if (registerBase == 0x304A) { GX2::__GX2NotifyEvent(GX2::GX2CallbackEventType::TIMESTAMP_TOP); - LatteSkipCMD(registerCount); + LatteCP_skipWords(registerCount); } else if (registerBase == 0x304B) { - LatteSkipCMD(registerCount); + LatteCP_skipWords(registerCount); } else { - LatteCP_dumpCommandBufferError(cmdStart, cmdEnd, cmd); cemu_assert_debug(false); } } else { debug_printf("invalid itHeaderType %08x\n", itHeaderType); - LatteCP_dumpCommandBufferError(cmdStart, cmdEnd, cmd); cemu_assert_debug(false); } + if (timerRecheck >= CP_TIMER_RECHECK) + { + LatteTiming_HandleTimedVsync(); + LatteAsyncCommands_checkAndExecute(); + timerRecheck = 0; + } } - cemu_assert_debug(cmd == cmdEnd); } -void LatteCP_ProcessRingbuffer() +#ifdef
LATTE_CP_LOGGING +void LatteCP_DebugPrintCmdBuffer(uint32be* bufferPtr, uint32 size) { - sint32 timerRecheck = 0; // estimates how much CP processing time passed based on the executed commands, if the value exceeds CP_TIMER_RECHECK then _handleTimers() is called - while (true) + uint32be* bufferPtrInitial = bufferPtr; + uint32be* bufferPtrEnd = bufferPtr + (size/4); + while (bufferPtr < bufferPtrEnd) { - uint32 itHeader = LatteCP_readU32Deprc(); + std::string strPrefix = fmt::format("[PM4 Buf {:08x} Offs {:04x}]", MEMPTR<uint32be>(bufferPtr).GetMPTR(), (bufferPtr - bufferPtrInitial) * 4); + uint32 itHeader = *bufferPtr; + bufferPtr++; uint32 itHeaderType = (itHeader >> 30) & 3; if (itHeaderType == 3) { uint32 itCode = (itHeader >> 8) & 0xFF; uint32 nWords = ((itHeader >> 16) & 0x3FFF) + 1; - LatteCP_waitForNWords(nWords); - LatteCMDPtr cmd = (LatteCMDPtr)gxRingBufferReadPtr; - uint8* expectedGxRingBufferReadPtr = gxRingBufferReadPtr + nWords*4; + uint32be* cmdData = bufferPtr; + bufferPtr += nWords; switch (itCode) { case IT_SURFACE_SYNC: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSurfaceSync(cmd); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_SURFACE_SYNC", strPrefix); + break; } - break; case IT_SET_CONTEXT_REG: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetRegistersGeneric(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + std::string regVals; + for (uint32 i = 0; i < std::min<uint32>(nWords - 1, 8); i++) + regVals.append(fmt::format("{:08x} ", cmdData[1 + i].value())); + cemuLog_log(LogType::Force, "{} IT_SET_CONTEXT_REG Reg {:04x} RegValues {}", strPrefix, cmdData[0].value(), regVals); + break; } - break; case IT_SET_RESOURCE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetRegistersGeneric(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + std::string regVals; + for (uint32 i = 0; i < std::min<uint32>(nWords - 1, 8); i++) + regVals.append(fmt::format("{:08x} ", cmdData[1+i].value())); + cemuLog_log(LogType::Force, "{} IT_SET_RESOURCE Reg {:04x}
RegValues {}", strPrefix, cmdData[0].value(), regVals); + break; } - break; case IT_SET_ALU_CONST: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetRegistersGeneric(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_SET_ALU_CONST", strPrefix); break; } case IT_SET_CTL_CONST: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetRegistersGeneric(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_SET_CTL_CONST", strPrefix); break; } case IT_SET_SAMPLER: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetRegistersGeneric(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_SET_SAMPLER", strPrefix); break; } case IT_SET_CONFIG_REG: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetRegistersGeneric(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_SET_CONFIG_REG", strPrefix); break; } case IT_INDIRECT_BUFFER_PRIV: { -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] BEGIN CMD BUFFER"); -#endif - LatteCP_itIndirectBufferDepr(nWords); - timerRecheck += CP_TIMER_RECHECK / 512; -#ifdef FAST_DRAW_LOGGING - if (GetAsyncKeyState('A')) - forceLogRemoveMe_printf("[FAST-DRAW] END CMD BUFFER"); -#endif + if (nWords != 3) + { + cemuLog_log(LogType::Force, "{} IT_INDIRECT_BUFFER_PRIV (malformed!)", strPrefix); + } + else + { + uint32 physicalAddress = cmdData[0]; + uint32 physicalAddressHigh = cmdData[1]; + uint32 sizeInDWords = cmdData[2]; + cemuLog_log(LogType::Force, "{} IT_INDIRECT_BUFFER_PRIV Addr {:08x} Size {:08x}", strPrefix, physicalAddress, sizeInDWords*4); + LatteCP_DebugPrintCmdBuffer(MEMPTR<uint32be>(physicalAddress), sizeInDWords * 4); + } break; } case IT_STRMOUT_BUFFER_UPDATE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itStreamoutBufferUpdate(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_STRMOUT_BUFFER_UPDATE", strPrefix); break; } case
IT_INDEX_TYPE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itIndexType(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 1024; + cemuLog_log(LogType::Force, "{} IT_INDEX_TYPE", strPrefix); break; } case IT_NUM_INSTANCES: { - gxRingBufferReadPtr = (uint8*)LatteCP_itNumInstances(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 1024; + cemuLog_log(LogType::Force, "{} IT_NUM_INSTANCES", strPrefix); break; } case IT_DRAW_INDEX_2: { - DrawPassContext drawPassCtx; - drawPassCtx.beginDrawPass(); - gxRingBufferReadPtr = (uint8*)LatteCP_itDrawIndex2(cmd, nWords, drawPassCtx); - drawPassCtx.endDrawPass(); - timerRecheck += CP_TIMER_RECHECK / 64; + if (nWords != 5) + { + cemuLog_log(LogType::Force, "{} IT_DRAW_INDEX_2 (malformed!)", strPrefix); + } + else + { + uint32 ukn1 = cmdData[0]; + MPTR physIndices = cmdData[1]; + uint32 ukn2 = cmdData[2]; + uint32 count = cmdData[3]; + uint32 ukn3 = cmdData[4]; + cemuLog_log(LogType::Force, "{} IT_DRAW_INDEX_2 | Count {}", strPrefix, count); + } break; } case IT_DRAW_INDEX_AUTO: { - DrawPassContext drawPassCtx; - drawPassCtx.beginDrawPass(); - gxRingBufferReadPtr = (uint8*)LatteCP_itDrawIndexAuto(cmd, nWords, drawPassCtx); - drawPassCtx.endDrawPass(); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_DRAW_INDEX_AUTO", strPrefix); break; } case IT_DRAW_INDEX_IMMD: { - DrawPassContext drawPassCtx; - drawPassCtx.beginDrawPass(); - gxRingBufferReadPtr = (uint8*)LatteCP_itDrawImmediate(cmd, nWords, drawPassCtx); - drawPassCtx.endDrawPass(); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_DRAW_INDEX_IMMD", strPrefix); break; } case IT_WAIT_REG_MEM: { - gxRingBufferReadPtr = (uint8*)LatteCP_itWaitRegMem(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 16; + cemuLog_log(LogType::Force, "{} IT_WAIT_REG_MEM", strPrefix); break; } case IT_MEM_WRITE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itMemWrite(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 128; + cemuLog_log(LogType::Force, 
"{} IT_MEM_WRITE", strPrefix); break; } case IT_CONTEXT_CONTROL: { - gxRingBufferReadPtr = (uint8*)LatteCP_itContextControl(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 128; + cemuLog_log(LogType::Force, "{} IT_CONTEXT_CONTROL", strPrefix); break; } case IT_MEM_SEMAPHORE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itMemSemaphore(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 128; + cemuLog_log(LogType::Force, "{} IT_MEM_SEMAPHORE", strPrefix); break; } case IT_LOAD_CONFIG_REG: { - gxRingBufferReadPtr = (uint8*)LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_CONFIG); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_LOAD_CONFIG_REG", strPrefix); break; } case IT_LOAD_CONTEXT_REG: { - gxRingBufferReadPtr = (uint8*)LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_CONTEXT); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_LOAD_CONTEXT_REG", strPrefix); break; } case IT_LOAD_ALU_CONST: { - gxRingBufferReadPtr = (uint8*)LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_ALU_CONST); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_LOAD_ALU_CONST", strPrefix); break; } case IT_LOAD_LOOP_CONST: { - gxRingBufferReadPtr = (uint8*)LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_LOOP_CONST); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_LOAD_LOOP_CONST", strPrefix); break; } case IT_LOAD_RESOURCE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_RESOURCE); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_LOAD_RESOURCE", strPrefix); break; } case IT_LOAD_SAMPLER: { - gxRingBufferReadPtr = (uint8*)LatteCP_itLoadReg(cmd, nWords, LATTE_REG_BASE_SAMPLER); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_LOAD_SAMPLER", strPrefix); break; } case IT_SET_LOOP_CONST: { - LatteSkipCMD(nWords); - gxRingBufferReadPtr = (uint8*)cmd; - // todo + cemuLog_log(LogType::Force, "{} 
IT_SET_LOOP_CONST", strPrefix); break; } case IT_SET_PREDICATION: { - gxRingBufferReadPtr = (uint8*)LatteCP_itSetPredication(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_SET_PREDICATION", strPrefix); break; } case IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLECopyColorBufferToScanBuffer(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_HLE_COPY_COLORBUFFER_TO_SCANBUFFER", strPrefix); break; } case IT_HLE_TRIGGER_SCANBUFFER_SWAP: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLESwapScanBuffer(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 64; + cemuLog_log(LogType::Force, "{} IT_HLE_TRIGGER_SCANBUFFER_SWAP", strPrefix); break; } case IT_HLE_WAIT_FOR_FLIP: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLEWaitForFlip(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 1; + cemuLog_log(LogType::Force, "{} IT_HLE_WAIT_FOR_FLIP", strPrefix); break; } case IT_HLE_REQUEST_SWAP_BUFFERS: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLERequestSwapBuffers(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 32; + cemuLog_log(LogType::Force, "{} IT_HLE_REQUEST_SWAP_BUFFERS", strPrefix); break; } case IT_HLE_CLEAR_COLOR_DEPTH_STENCIL: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLEClearColorDepthStencil(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 128; + cemuLog_log(LogType::Force, "{} IT_HLE_CLEAR_COLOR_DEPTH_STENCIL", strPrefix); break; } case IT_HLE_COPY_SURFACE_NEW: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLECopySurfaceNew(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 128; + cemuLog_log(LogType::Force, "{} IT_HLE_COPY_SURFACE_NEW", strPrefix); break; } case IT_HLE_FIFO_WRAP_AROUND: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLEFifoWrapAround(cmd, nWords); - expectedGxRingBufferReadPtr = gxRingBufferReadPtr; - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_HLE_FIFO_WRAP_AROUND", strPrefix); break; } case 
IT_HLE_SAMPLE_TIMER: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLESampleTimer(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_HLE_SAMPLE_TIMER", strPrefix); break; } case IT_HLE_SPECIAL_STATE: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLESpecialState(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_HLE_SPECIAL_STATE", strPrefix); break; } case IT_HLE_BEGIN_OCCLUSION_QUERY: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLEBeginOcclusionQuery(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_HLE_BEGIN_OCCLUSION_QUERY", strPrefix); break; } case IT_HLE_END_OCCLUSION_QUERY: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLEEndOcclusionQuery(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_HLE_END_OCCLUSION_QUERY", strPrefix); break; } case IT_HLE_SET_CB_RETIREMENT_TIMESTAMP: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLESetRetirementTimestamp(cmd, nWords); - timerRecheck += CP_TIMER_RECHECK / 512; + cemuLog_log(LogType::Force, "{} IT_HLE_SET_CB_RETIREMENT_TIMESTAMP", strPrefix); break; } case IT_HLE_BOTTOM_OF_PIPE_CB: { - gxRingBufferReadPtr = (uint8*)LatteCP_itHLEBottomOfPipeCB(cmd, nWords); + cemuLog_log(LogType::Force, "{} IT_HLE_BOTTOM_OF_PIPE_CB", strPrefix); break; } case IT_HLE_SYNC_ASYNC_OPERATIONS: { - LatteCP_skipWords(nWords); - LatteTextureReadback_UpdateFinishedTransfers(true); - LatteQuery_UpdateFinishedQueriesForceFinishAll(); + cemuLog_log(LogType::Force, "{} IT_HLE_SYNC_ASYNC_OPERATIONS", strPrefix); break; } default: - cemu_assert_debug(false); + cemuLog_log(LogType::Force, "{} Unsupported operation code", strPrefix); + return; } - cemu_assert_debug(expectedGxRingBufferReadPtr == gxRingBufferReadPtr); } else if (itHeaderType == 2) { - // filler packet, skip this - cemu_assert_debug(itHeader == 0x80000000); + // filler packet } else if (itHeaderType == 0) { uint32 registerBase = 
(itHeader & 0xFFFF); uint32 registerCount = ((itHeader >> 16) & 0x3FFF) + 1; - if (registerBase == 0x304A) - { - GX2::__GX2NotifyEvent(GX2::GX2CallbackEventType::TIMESTAMP_TOP); - LatteCP_skipWords(registerCount); - } - else if (registerBase == 0x304B) - { - LatteCP_skipWords(registerCount); - } - else - { - cemu_assert_debug(false); - } + bufferPtr += registerCount; + cemuLog_log(LogType::Force, "[LatteCP] itType=0 registerBase={:04x}", registerBase); } else { - debug_printf("invalid itHeaderType %08x\n", itHeaderType); - cemu_assert_debug(false); - } - if (timerRecheck >= CP_TIMER_RECHECK) - { - LatteTiming_HandleTimedVsync(); - LatteAsyncCommands_checkAndExecute(); - timerRecheck = 0; + cemuLog_log(LogType::Force, "Invalid itHeaderType {:08x}", itHeaderType); + return; } } } +#endif \ No newline at end of file diff --git a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp index ff5238d54..238f85e80 100644 --- a/src/Cafe/HW/Latte/Core/LatteOverlay.cpp +++ b/src/Cafe/HW/Latte/Core/LatteOverlay.cpp @@ -26,6 +26,7 @@ struct OverlayStats double fps{}; uint32 draw_calls_per_frame{}; + uint32 fast_draw_calls_per_frame{}; float cpu_usage{}; // cemu cpu usage in % std::vector cpu_per_core; // global cpu usage in % per core uint32 ram_usage{}; // ram usage in MB @@ -86,7 +87,7 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio ImGui::Text("FPS: %.2lf", g_state.fps); if (config.overlay.drawcalls) - ImGui::Text("Draws/f: %d", g_state.draw_calls_per_frame); + ImGui::Text("Draws/f: %d (fast: %d)", g_state.draw_calls_per_frame, g_state.fast_draw_calls_per_frame); if (config.overlay.cpu_usage) ImGui::Text("CPU: %.2lf%%", g_state.cpu_usage); @@ -588,13 +589,14 @@ static void UpdateStats_CpuPerCore() } } -void LatteOverlay_updateStats(double fps, sint32 drawcalls) +void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls) { if (GetConfig().overlay.position == ScreenPosition::kDisabled)
return; g_state.fps = fps; g_state.draw_calls_per_frame = drawcalls; + g_state.fast_draw_calls_per_frame = fastDrawcalls; UpdateStats_CemuCpu(); UpdateStats_CpuPerCore(); diff --git a/src/Cafe/HW/Latte/Core/LatteOverlay.h b/src/Cafe/HW/Latte/Core/LatteOverlay.h index e497abb06..824c68b27 100644 --- a/src/Cafe/HW/Latte/Core/LatteOverlay.h +++ b/src/Cafe/HW/Latte/Core/LatteOverlay.h @@ -2,6 +2,6 @@ void LatteOverlay_init(); void LatteOverlay_render(bool pad_view); -void LatteOverlay_updateStats(double fps, sint32 drawcalls); +void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls); void LatteOverlay_pushNotification(const std::string& text, sint32 duration); \ No newline at end of file diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp index 6bbc7ea4d..f27674464 100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp @@ -38,6 +38,7 @@ void LattePerformanceMonitor_frameEnd() uint64 indexDataCached = 0; uint32 frameCounter = 0; uint32 drawCallCounter = 0; + uint32 fastDrawCallCounter = 0; uint32 shaderBindCounter = 0; uint32 recompilerLeaveCount = 0; uint32 threadLeaveCount = 0; @@ -53,6 +54,7 @@ void LattePerformanceMonitor_frameEnd() indexDataCached += performanceMonitor.cycle[i].indexDataCached; frameCounter += performanceMonitor.cycle[i].frameCounter; drawCallCounter += performanceMonitor.cycle[i].drawCallCounter; + fastDrawCallCounter += performanceMonitor.cycle[i].fastDrawCallCounter; shaderBindCounter += performanceMonitor.cycle[i].shaderBindCount; recompilerLeaveCount += performanceMonitor.cycle[i].recompilerLeaveCount; threadLeaveCount += performanceMonitor.cycle[i].threadLeaveCount; @@ -75,7 +77,6 @@ void LattePerformanceMonitor_frameEnd() indexDataUploadPerFrame /= 1024ULL; double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS; - uint32 drawCallsPerFrame = drawCallCounter / 
elapsedFrames; uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames; passedCycles = passedCycles * 1000ULL / totalElapsedTime; uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime); @@ -85,6 +86,7 @@ void LattePerformanceMonitor_frameEnd() // next counter cycle sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES; performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0; + performanceMonitor.cycle[nextCycleIndex].fastDrawCallCounter = 0; performanceMonitor.cycle[nextCycleIndex].frameCounter = 0; performanceMonitor.cycle[nextCycleIndex].shaderBindCount = 0; performanceMonitor.cycle[nextCycleIndex].lastCycleCount = PPCInterpreter_getMainCoreCycleCounter(); @@ -104,12 +106,12 @@ void LattePerformanceMonitor_frameEnd() if (isFirstUpdate) { - LatteOverlay_updateStats(0.0, 0); + LatteOverlay_updateStats(0.0, 0, 0); gui_updateWindowTitles(false, false, 0.0); } else { - LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames); + LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames, fastDrawCallCounter / elapsedFrames); gui_updateWindowTitles(false, false, fps); } } diff --git a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h index 77554e80b..713e094e0 100644 --- a/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h +++ b/src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h @@ -84,6 +84,7 @@ typedef struct uint32 lastUpdate; uint32 frameCounter; uint32 drawCallCounter; + uint32 fastDrawCallCounter; uint32 shaderBindCount; uint64 vertexDataUploaded; // amount of vertex data uploaded to GPU (bytes) uint64 vertexDataCached; // amount of vertex data reused from GPU cache (bytes) diff --git a/src/Cafe/HW/Latte/ISA/LatteReg.h b/src/Cafe/HW/Latte/ISA/LatteReg.h index 7f0cf7c9a..d571dc6e6 100644 --- a/src/Cafe/HW/Latte/ISA/LatteReg.h +++ b/src/Cafe/HW/Latte/ISA/LatteReg.h @@ -484,7 +484,7 @@ namespace Latte 
SQ_TEX_RESOURCE_WORD0_N_GS = 0xE930, SQ_TEX_RESOURCE_WORD_FIRST = SQ_TEX_RESOURCE_WORD0_N_PS, SQ_TEX_RESOURCE_WORD_LAST = (SQ_TEX_RESOURCE_WORD0_N_GS + GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7 - 1), - // there are 54 samplers with 3 registers each. 18 per stage. For stage indices see SAMPLER_BASE_INDEX_* + // there are 54 samplers with 3 registers each. 18 (actually only 16?) per stage. For stage indices see SAMPLER_BASE_INDEX_* SQ_TEX_SAMPLER_WORD0_0 = 0xF000, SQ_TEX_SAMPLER_WORD1_0 = 0xF001, SQ_TEX_SAMPLER_WORD2_0 = 0xF002, From 9d5dc4415f10cfa3576ddfaf7debff8c932e71cf Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 16 Mar 2023 16:26:36 +0100 Subject: [PATCH 2/5] Latte: Lower cmd buffer submission threshold --- src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp index 7987b20e1..c2e0a4f87 100644 --- a/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp +++ b/src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp @@ -2002,7 +2002,7 @@ void VulkanRenderer::SubmitCommandBuffer(VkSemaphore signalSemaphore, VkSemaphor occlusionQuery_notifyBeginCommandBuffer(); m_recordedDrawcalls = 0; - m_submitThreshold = 500; // this used to be 750 before 1.25.5, but more frequent submission is actually better for latency + m_submitThreshold = 300; m_submitOnIdle = false; } From f524133c8d492b26a1708e63b0c58c289ba3974a Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Thu, 16 Mar 2023 16:27:02 +0100 Subject: [PATCH 3/5] Latte: More detailed debug logging for texture readback --- .../HW/Latte/Core/LatteTextureReadback.cpp | 37 ++++++++++++++----- .../HW/Latte/Core/LatteTextureReadbackInfo.h | 1 + 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp 
b/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp index 0483e8eee..a6e865d8b 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp +++ b/src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp @@ -8,10 +8,11 @@ #include "Cafe/HW/Latte/Core/LatteTexture.h" #include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h" -// #define LOG_READBACK_TIME +//#define LOG_READBACK_TIME struct LatteTextureReadbackQueueEntry { + HRTick initiateTime; uint32 lastUpdateDrawcallIndex; LatteTextureView* textureView; }; @@ -22,12 +23,12 @@ std::queue sTextureActiveReadbackQueue; // readbacks void LatteTextureReadback_StartTransfer(LatteTextureView* textureView) { cemuLog_log(LogType::TextureReadback, "[TextureReadback-Start] PhysAddr {:08x} Res {}x{} Fmt {} Slice {} Mip {}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->format, textureView->firstSlice, textureView->firstMip); + HRTick currentTick = HighResolutionTimer().now().getTick(); // create info entry and store in ordered linked list LatteTextureReadbackInfo* readbackInfo = g_renderer->texture_createReadback(textureView); sTextureActiveReadbackQueue.push(readbackInfo); readbackInfo->StartTransfer(); - //debug_printf("[Tex-Readback] %08x %dx%d TM %d FMT %04x\n", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->tileMode, textureView->baseTexture->format); - readbackInfo->transferStartTime = HighResolutionTimer().now().getTick(); + readbackInfo->transferStartTime = currentTick; } /* @@ -41,9 +42,15 @@ bool LatteTextureReadback_Update(bool forceStart) for (size_t i = 0; i < sTextureScheduledReadbacks.size(); i++) { LatteTextureReadbackQueueEntry& entry = sTextureScheduledReadbacks[i]; - uint32 numPassedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex; - if (forceStart || numPassedDrawcalls >= 5) + uint32 numElapsedDrawcalls = 
LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex; + if (forceStart || numElapsedDrawcalls >= 5) { +#ifdef LOG_READBACK_TIME + double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick()); + char initiateElapsedTimeStr[32]; + sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate); + cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no"); +#endif LatteTextureReadback_StartTransfer(entry.textureView); // remove element vectorRemoveByIndex(sTextureScheduledReadbacks, i); @@ -91,6 +98,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView) } // queue LatteTextureReadbackQueueEntry queueEntry; + queueEntry.initiateTime = HighResolutionTimer().now().getTick(); queueEntry.textureView = textureView; queueEntry.lastUpdateDrawcallIndex = LatteGPUState.drawCallCounter; sTextureScheduledReadbacks.emplace_back(queueEntry); @@ -112,6 +120,14 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish) if (!readbackInfo->IsFinished()) { readbackInfo->waitStartTime = HighResolutionTimer().now().getTick(); +#ifdef LOG_READBACK_TIME + if (cemuLog_isLoggingEnabled(LogType::TextureReadback)) + { + double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick()); + forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0); + } +#endif + readbackInfo->forceFinish = true; 
readbackInfo->ForceFinish(); // rerun logic since ->ForceFinish() can recurively call this function and thus modify the queue continue; @@ -125,10 +141,13 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish) } // performance testing #ifdef LOG_READBACK_TIME - HRTick currentTick = HighResolutionTimer().now().getTick(); - double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick); - double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick); - cemuLog_log(LogType::Force, "[Texture-Readback] {:08x} Res {:4}/{:4} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, forceFinish?"yes":"no"); + if (cemuLog_isLoggingEnabled(LogType::TextureReadback)) + { + HRTick currentTick = HighResolutionTimer().now().getTick(); + double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick); + double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick); + forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ?
"yes" : "no"); + } #endif uint8* pixelData = readbackInfo->GetData(); LatteTextureLoader_writeReadbackTextureToMemory(&readbackInfo->hostTextureCopy, 0, 0, pixelData); diff --git a/src/Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h b/src/Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h index 4f3a30692..535e9442f 100644 --- a/src/Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h +++ b/src/Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h @@ -21,6 +21,7 @@ class LatteTextureReadbackInfo HRTick transferStartTime; HRTick waitStartTime; + bool forceFinish{ false }; // set to true if not finished in time for dependent operation // texture info LatteTextureDefinition hostTextureCopy{}; From 7ed37f0b9baefa2c9bd681fba2756a0ed1190a26 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Wed, 22 Mar 2023 13:38:42 +0100 Subject: [PATCH 4/5] Latte: Try to use real colorbuffer texture size if possible --- src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index 3a52f6414..bcf09944b 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -295,6 +295,15 @@ LatteTextureView* LatteMRT::GetColorAttachmentTexture(uint32 index, bool createN uint32 colorBufferHeight = pitchHeight / colorBufferPitch; uint32 colorBufferWidth = colorBufferPitch; + // colorbuffer width/height has to be padded to 8/32 but the actual resolution might be smaller + // use the scissor box as a clue to figure out the original resolution if possible + uint32 scissorBoxWidth = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_X(); + uint32 scissorBoxHeight = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_Y(); + if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth) + colorBufferWidth = scissorBoxWidth; + if (((scissorBoxHeight + 31) & ~31) == colorBufferHeight) + colorBufferHeight =
scissorBoxHeight; + bool colorBufferWasFound = false; sint32 viewFirstMip = 0; // todo From 0454ffa2ada6fa0c9ea525bb2f64bf973c3acd39 Mon Sep 17 00:00:00 2001 From: Exzap <13877693+Exzap@users.noreply.github.com> Date: Tue, 19 Sep 2023 20:25:07 +0200 Subject: [PATCH 5/5] Disable color buffer resolution heuristic for now --- src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp index bcf09944b..060159490 100644 --- a/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp +++ b/src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp @@ -295,14 +295,33 @@ LatteTextureView* LatteMRT::GetColorAttachmentTexture(uint32 index, bool createN uint32 colorBufferHeight = pitchHeight / colorBufferPitch; uint32 colorBufferWidth = colorBufferPitch; - // colorbuffer width/height has to be padded to 8/32 but the actual resolution might be smaller + // colorbuffer width/height has to be padded to 8/32 alignment but the actual resolution might be smaller // use the scissor box as a clue to figure out the original resolution if possible +#if 0 uint32 scissorBoxWidth = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_X(); uint32 scissorBoxHeight = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_Y(); if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth) colorBufferWidth = scissorBoxWidth; if (((scissorBoxHeight + 31) & ~31) == colorBufferHeight) colorBufferHeight = scissorBoxHeight; +#endif + + // log resolution changes if the above heuristic takes effect + // this is useful to find resolutions which need to be updated in gfx pack texture rules +#if 0 + uint32 colorBufferHeight2 = pitchHeight / colorBufferPitch; + static std::unordered_set<uint64> s_foundColorBufferResMappings; + if (colorBufferPitch != colorBufferWidth || colorBufferHeight != colorBufferHeight2) + { + // only log unique, source and dest resolution.
Encode into a key with 16 bits per component + uint64 resHash = (uint64)colorBufferWidth | ((uint64)colorBufferHeight << 16) | ((uint64)colorBufferPitch << 32) | ((uint64)colorBufferHeight2 << 48); + if( !s_foundColorBufferResMappings.contains(resHash) ) + { + s_foundColorBufferResMappings.insert(resHash); + cemuLog_log(LogType::Force, "[COLORBUFFER-DBG] Using res {}x{} instead of {}x{}", colorBufferWidth, colorBufferHeight, colorBufferPitch, colorBufferHeight2); + } + } +#endif bool colorBufferWasFound = false; sint32 viewFirstMip = 0; // todo