Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Latte optimizations and tweaks #706

Merged
merged 5 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,057 changes: 672 additions & 385 deletions src/Cafe/HW/Latte/Core/LatteCommandProcessor.cpp

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions src/Cafe/HW/Latte/Core/LatteOverlay.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ struct OverlayStats

double fps{};
uint32 draw_calls_per_frame{};
uint32 fast_draw_calls_per_frame{};
float cpu_usage{}; // cemu cpu usage in %
std::vector<float> cpu_per_core; // global cpu usage in % per core
uint32 ram_usage{}; // ram usage in MB
Expand Down Expand Up @@ -86,7 +87,7 @@ void LatteOverlay_renderOverlay(ImVec2& position, ImVec2& pivot, sint32 directio
ImGui::Text("FPS: %.2lf", g_state.fps);

if (config.overlay.drawcalls)
ImGui::Text("Draws/f: %d", g_state.draw_calls_per_frame);
ImGui::Text("Draws/f: %d (fast: %d)", g_state.draw_calls_per_frame, g_state.fast_draw_calls_per_frame);

if (config.overlay.cpu_usage)
ImGui::Text("CPU: %.2lf%%", g_state.cpu_usage);
Expand Down Expand Up @@ -588,13 +589,14 @@ static void UpdateStats_CpuPerCore()
}
}

void LatteOverlay_updateStats(double fps, sint32 drawcalls)
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls)
{
if (GetConfig().overlay.position == ScreenPosition::kDisabled)
return;

g_state.fps = fps;
g_state.draw_calls_per_frame = drawcalls;
g_state.fast_draw_calls_per_frame = fastDrawcalls;
UpdateStats_CemuCpu();
UpdateStats_CpuPerCore();

Expand Down
2 changes: 1 addition & 1 deletion src/Cafe/HW/Latte/Core/LatteOverlay.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@

void LatteOverlay_init();
void LatteOverlay_render(bool pad_view);
void LatteOverlay_updateStats(double fps, sint32 drawcalls);
void LatteOverlay_updateStats(double fps, sint32 drawcalls, sint32 fastDrawcalls);

void LatteOverlay_pushNotification(const std::string& text, sint32 duration);
8 changes: 5 additions & 3 deletions src/Cafe/HW/Latte/Core/LattePerformanceMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ void LattePerformanceMonitor_frameEnd()
uint64 indexDataCached = 0;
uint32 frameCounter = 0;
uint32 drawCallCounter = 0;
uint32 fastDrawCallCounter = 0;
uint32 shaderBindCounter = 0;
uint32 recompilerLeaveCount = 0;
uint32 threadLeaveCount = 0;
Expand All @@ -53,6 +54,7 @@ void LattePerformanceMonitor_frameEnd()
indexDataCached += performanceMonitor.cycle[i].indexDataCached;
frameCounter += performanceMonitor.cycle[i].frameCounter;
drawCallCounter += performanceMonitor.cycle[i].drawCallCounter;
fastDrawCallCounter += performanceMonitor.cycle[i].fastDrawCallCounter;
shaderBindCounter += performanceMonitor.cycle[i].shaderBindCount;
recompilerLeaveCount += performanceMonitor.cycle[i].recompilerLeaveCount;
threadLeaveCount += performanceMonitor.cycle[i].threadLeaveCount;
Expand All @@ -75,7 +77,6 @@ void LattePerformanceMonitor_frameEnd()
indexDataUploadPerFrame /= 1024ULL;

double fps = (double)elapsedFrames2S * 1000.0 / (double)totalElapsedTimeFPS;
uint32 drawCallsPerFrame = drawCallCounter / elapsedFrames;
uint32 shaderBindsPerFrame = shaderBindCounter / elapsedFrames;
passedCycles = passedCycles * 1000ULL / totalElapsedTime;
uint32 rlps = (uint32)((uint64)recompilerLeaveCount * 1000ULL / (uint64)totalElapsedTime);
Expand All @@ -85,6 +86,7 @@ void LattePerformanceMonitor_frameEnd()
// next counter cycle
sint32 nextCycleIndex = (performanceMonitor.cycleIndex + 1) % PERFORMANCE_MONITOR_TRACK_CYCLES;
performanceMonitor.cycle[nextCycleIndex].drawCallCounter = 0;
performanceMonitor.cycle[nextCycleIndex].fastDrawCallCounter = 0;
performanceMonitor.cycle[nextCycleIndex].frameCounter = 0;
performanceMonitor.cycle[nextCycleIndex].shaderBindCount = 0;
performanceMonitor.cycle[nextCycleIndex].lastCycleCount = PPCInterpreter_getMainCoreCycleCounter();
Expand All @@ -104,12 +106,12 @@ void LattePerformanceMonitor_frameEnd()

if (isFirstUpdate)
{
LatteOverlay_updateStats(0.0, 0);
LatteOverlay_updateStats(0.0, 0, 0);
gui_updateWindowTitles(false, false, 0.0);
}
else
{
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames);
LatteOverlay_updateStats(fps, drawCallCounter / elapsedFrames, fastDrawCallCounter / elapsedFrames);
gui_updateWindowTitles(false, false, fps);
}
}
Expand Down
1 change: 1 addition & 0 deletions src/Cafe/HW/Latte/Core/LattePerformanceMonitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ typedef struct
uint32 lastUpdate;
uint32 frameCounter;
uint32 drawCallCounter;
uint32 fastDrawCallCounter;
uint32 shaderBindCount;
uint64 vertexDataUploaded; // amount of vertex data uploaded to GPU (bytes)
uint64 vertexDataCached; // amount of vertex data reused from GPU cache (bytes)
Expand Down
28 changes: 28 additions & 0 deletions src/Cafe/HW/Latte/Core/LatteRenderTarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,34 @@ LatteTextureView* LatteMRT::GetColorAttachmentTexture(uint32 index, bool createN
uint32 colorBufferHeight = pitchHeight / colorBufferPitch;
uint32 colorBufferWidth = colorBufferPitch;

// colorbuffer width/height has to be padded to 8/32 alignment but the actual resolution might be smaller
// use the scissor box as a clue to figure out the original resolution if possible
#if 0
uint32 scissorBoxWidth = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_X();
uint32 scissorBoxHeight = LatteGPUState.contextNew.PA_SC_GENERIC_SCISSOR_BR.get_BR_Y();
if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth)
colorBufferWidth = scissorBoxWidth;
Comment on lines +303 to +304
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this is a typo and this was meant instead:

Suggested change
if (((scissorBoxWidth + 7) & ~7) == colorBufferWidth)
colorBufferWidth = scissorBoxWidth;
if (((colorBufferWidth + 7) & ~7) == colorBufferWidth)
colorBufferWidth = scissorBoxWidth;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a typo

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry about the noise then! 😅 Performance is improved and only the graphics pack was to blame for the issue!

if (((colorBufferHeight + 31) & ~31) == colorBufferHeight)
colorBufferHeight = scissorBoxHeight;
#endif

// log resolution changes if the above heuristic takes effect
// this is useful to find resolutions which need to be updated in gfx pack texture rules
#if 0
uint32 colorBufferHeight2 = pitchHeight / colorBufferPitch;
static std::unordered_set<uint64> s_foundColorBufferResMappings;
if (colorBufferPitch != colorBufferWidth || colorBufferHeight != colorBufferHeight2)
{
// only log unique, source and dest resolution. Encode into a key with 16 bits per component
uint64 resHash = (uint64)colorBufferWidth | ((uint64)colorBufferHeight << 16) | ((uint64)colorBufferPitch << 32) | ((uint64)colorBufferHeight2 << 48);
if( !s_foundColorBufferResMappings.contains(resHash) )
{
s_foundColorBufferResMappings.insert(resHash);
cemuLog_log(LogType::Force, "[COLORBUFFER-DBG] Using res {}x{} instead of {}x{}", colorBufferWidth, colorBufferHeight, colorBufferPitch, colorBufferHeight2);
}
}
#endif

bool colorBufferWasFound = false;
sint32 viewFirstMip = 0; // todo

Expand Down
37 changes: 28 additions & 9 deletions src/Cafe/HW/Latte/Core/LatteTextureReadback.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
#include "Cafe/HW/Latte/Core/LatteTexture.h"
#include "Cafe/HW/Latte/Renderer/OpenGL/LatteTextureViewGL.h"

// #define LOG_READBACK_TIME
//#define LOG_READBACK_TIME

// Entry for a texture readback that has been requested but whose transfer has not been started yet.
// Entries are appended to sTextureScheduledReadbacks by LatteTextureReadback_Initate() and removed
// again by LatteTextureReadback_Update() once the transfer is started.
struct LatteTextureReadbackQueueEntry
{
// high-resolution timer tick captured when the entry is queued
// in the visible code this is only read by the LOG_READBACK_TIME diagnostics
HRTick initiateTime;
// value of LatteGPUState.drawCallCounter stored when the entry is queued
// LatteTextureReadback_Update() starts the transfer once at least 5 drawcalls have elapsed since this value (or when forceStart is set)
// NOTE(review): may also be refreshed by code outside the visible hunks - confirm
uint32 lastUpdateDrawcallIndex;
// texture view to read back; handed to LatteTextureReadback_StartTransfer()
LatteTextureView* textureView;
};
Expand All @@ -22,12 +23,12 @@ std::queue<LatteTextureReadbackInfo*> sTextureActiveReadbackQueue; // readbacks
void LatteTextureReadback_StartTransfer(LatteTextureView* textureView)
{
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Start] PhysAddr {:08x} Res {}x{} Fmt {} Slice {} Mip {}", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->format, textureView->firstSlice, textureView->firstMip);
HRTick currentTick = HighResolutionTimer().now().getTick();
// create info entry and store in ordered linked list
LatteTextureReadbackInfo* readbackInfo = g_renderer->texture_createReadback(textureView);
sTextureActiveReadbackQueue.push(readbackInfo);
readbackInfo->StartTransfer();
//debug_printf("[Tex-Readback] %08x %dx%d TM %d FMT %04x\n", textureView->baseTexture->physAddress, textureView->baseTexture->width, textureView->baseTexture->height, textureView->baseTexture->tileMode, textureView->baseTexture->format);
readbackInfo->transferStartTime = HighResolutionTimer().now().getTick();
readbackInfo->transferStartTime = currentTick;
}

/*
Expand All @@ -41,9 +42,15 @@ bool LatteTextureReadback_Update(bool forceStart)
for (size_t i = 0; i < sTextureScheduledReadbacks.size(); i++)
{
LatteTextureReadbackQueueEntry& entry = sTextureScheduledReadbacks[i];
uint32 numPassedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
if (forceStart || numPassedDrawcalls >= 5)
uint32 numElapsedDrawcalls = LatteGPUState.drawCallCounter - entry.lastUpdateDrawcallIndex;
if (forceStart || numElapsedDrawcalls >= 5)
{
#ifdef LOG_READBACK_TIME
double elapsedSecondsSinceInitiate = HighResolutionTimer::getTimeDiff(entry.initiateTime, HighResolutionTimer().now().getTick());
char initiateElapsedTimeStr[32];
sprintf(initiateElapsedTimeStr, "%.4lfms", elapsedSecondsSinceInitiate);
cemuLog_log(LogType::TextureReadback, "[TextureReadback-Update] Starting transfer for {:08x} after {} elapsed drawcalls. Time since initiate: {} Force-start: {}", entry.textureView->baseTexture->physAddress, numElapsedDrawcalls, initiateElapsedTimeStr, forceStart?"yes":"no");
#endif
LatteTextureReadback_StartTransfer(entry.textureView);
// remove element
vectorRemoveByIndex(sTextureScheduledReadbacks, i);
Expand Down Expand Up @@ -91,6 +98,7 @@ void LatteTextureReadback_Initate(LatteTextureView* textureView)
}
// queue
LatteTextureReadbackQueueEntry queueEntry;
queueEntry.initiateTime = HighResolutionTimer().now().getTick();
queueEntry.textureView = textureView;
queueEntry.lastUpdateDrawcallIndex = LatteGPUState.drawCallCounter;
sTextureScheduledReadbacks.emplace_back(queueEntry);
Expand All @@ -112,6 +120,14 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
if (!readbackInfo->IsFinished())
{
readbackInfo->waitStartTime = HighResolutionTimer().now().getTick();
#ifdef LOG_READBACK_TIME
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
{
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, HighResolutionTimer().now().getTick());
forceLog_printf("[Texture-Readback] Force-finish: %08x Res %4d/%4d TM %d FMT %04x Transfer time so far: %.4lfms", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0);
}
#endif
readbackInfo->forceFinish = true;
readbackInfo->ForceFinish();
// rerun logic since ->ForceFinish() can recursively call this function and thus modify the queue
continue;
Expand All @@ -125,10 +141,13 @@ void LatteTextureReadback_UpdateFinishedTransfers(bool forceFinish)
}
// performance testing
#ifdef LOG_READBACK_TIME
HRTick currentTick = HighResolutionTimer().now().getTick();
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
cemuLog_log(LogType::Force, "[Texture-Readback] {:08x} Res {:4}/{:4} TM {} FMT {:04x} ReadbackLatency: {:6.3}ms WaitTime: {:6.3}ms ForcedWait {}", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, forceFinish?"yes":"no");
if (cemuLog_isLoggingEnabled(LogType::TextureReadback))
{
HRTick currentTick = HighResolutionTimer().now().getTick();
double elapsedSecondsTransfer = HighResolutionTimer::getTimeDiff(readbackInfo->transferStartTime, currentTick);
double elapsedSecondsWaiting = HighResolutionTimer::getTimeDiff(readbackInfo->waitStartTime, currentTick);
forceLog_printf("[Texture-Readback] %08x Res %4d/%4d TM %d FMT %04x ReadbackLatency: %6.3lfms WaitTime: %6.3lfms ForcedWait %s", readbackInfo->hostTextureCopy.physAddress, readbackInfo->hostTextureCopy.width, readbackInfo->hostTextureCopy.height, readbackInfo->hostTextureCopy.tileMode, (uint32)readbackInfo->hostTextureCopy.format, elapsedSecondsTransfer * 1000.0, elapsedSecondsWaiting * 1000.0, readbackInfo->forceFinish ? "yes" : "no");
}
#endif
uint8* pixelData = readbackInfo->GetData();
LatteTextureLoader_writeReadbackTextureToMemory(&readbackInfo->hostTextureCopy, 0, 0, pixelData);
Expand Down
1 change: 1 addition & 0 deletions src/Cafe/HW/Latte/Core/LatteTextureReadbackInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class LatteTextureReadbackInfo

HRTick transferStartTime;
HRTick waitStartTime;
bool forceFinish{ false }; // set to true if not finished in time for dependent operation
// texture info
LatteTextureDefinition hostTextureCopy{};

Expand Down
2 changes: 1 addition & 1 deletion src/Cafe/HW/Latte/ISA/LatteReg.h
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ namespace Latte
SQ_TEX_RESOURCE_WORD0_N_GS = 0xE930,
SQ_TEX_RESOURCE_WORD_FIRST = SQ_TEX_RESOURCE_WORD0_N_PS,
SQ_TEX_RESOURCE_WORD_LAST = (SQ_TEX_RESOURCE_WORD0_N_GS + GPU_LIMITS::NUM_TEXTURES_PER_STAGE * 7 - 1),
// there are 54 samplers with 3 registers each. 18 per stage. For stage indices see SAMPLER_BASE_INDEX_*
// there are 54 samplers with 3 registers each. 18 (actually only 16?) per stage. For stage indices see SAMPLER_BASE_INDEX_*
SQ_TEX_SAMPLER_WORD0_0 = 0xF000,
SQ_TEX_SAMPLER_WORD1_0 = 0xF001,
SQ_TEX_SAMPLER_WORD2_0 = 0xF002,
Expand Down
2 changes: 1 addition & 1 deletion src/Cafe/HW/Latte/Renderer/Vulkan/VulkanRenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2002,7 +2002,7 @@ void VulkanRenderer::SubmitCommandBuffer(VkSemaphore signalSemaphore, VkSemaphor
occlusionQuery_notifyBeginCommandBuffer();

m_recordedDrawcalls = 0;
m_submitThreshold = 500; // this used to be 750 before 1.25.5, but more frequent submission is actually better for latency
m_submitThreshold = 300;
m_submitOnIdle = false;
}

Expand Down