From 90d16ae1757223d028d9c9e300adf955029c2ab2 Mon Sep 17 00:00:00 2001 From: doe300 Date: Wed, 2 Feb 2022 17:23:22 +0100 Subject: [PATCH] Improve memory access - Detect more stack-allocated memory addresses, see https://github.com/doe300/VC4CL/issues/106 - Reduce usage of unaligned VPM access --- src/intermediate/Operations.cpp | 9 +++++++++ src/normalization/MemoryMapChecks.cpp | 20 ++++++++++++++------ src/periphery/VPM.cpp | 27 +++++++++++++++++++++++---- 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/src/intermediate/Operations.cpp b/src/intermediate/Operations.cpp index 5f8dc9b8..573976c3 100644 --- a/src/intermediate/Operations.cpp +++ b/src/intermediate/Operations.cpp @@ -238,6 +238,11 @@ qpu_asm::DecoratedInstruction Operation::convertToAsm(const FastMapgetOutput() && addOp->getOutput() == mulOp->getOutput() && + addInstr.getAddOutput() != mulInstr.getMulOutput()) + throw CompilationError(CompilationStep::CODE_GENERATION, + "Shared output for combined instructions is wrongly mapped to different output registers", to_string()); if(addInstr.getAddOutput().file == mulInstr.getMulOutput().file && addInstr.getAddOutput().isGeneralPurpose() && mulInstr.getMulOutput().isGeneralPurpose()) // This also does not work for output to the same register (on a physical register-file), since we cannot diff --git a/src/normalization/MemoryMapChecks.cpp b/src/normalization/MemoryMapChecks.cpp index da942d79..fca041b3 100644 --- a/src/normalization/MemoryMapChecks.cpp +++ b/src/normalization/MemoryMapChecks.cpp @@ -256,6 +256,15 @@ static Optional> checkAllWritersArePhiNo if(excludedInstructions.find(writer) != excludedInstructions.end()) // skip the (possible write) we are currently processing return; + if(auto memoryAccess = dynamic_cast(writer)) + { + if(memoryAccess->op != MemoryOperation::READ) + // the contents are written (or read), not the address itself + return; + if(!memoryAccess->getDestination().hasLocal(local)) + // the contents of the address is read, not the address written + return; + } if(writer->hasDecoration(InstructionDecorations::PHI_NODE)) { if(writer->assertArgument(0).checkLocal()) @@ -402,7 +411,6 @@ static void addConditionalLocalMapping(const Local* loc, const ConditionalMemory static const Local* determineSingleMemoryAreaMapping(MemoryAccessMap& mapping, TypedInstructionWalker it, const Local* local, - const intermediate::MemoryInstruction* memInstr, FastMap, std::pair>>& conditionalWrittenMemoryAccesses, FastSet& conditionalAddressWrites) @@ -449,7 +457,7 @@ static const Local* determineSingleMemoryAreaMapping(MemoryAccessMap& mapping, if(isMemoryOnlyRead(local)) { // global buffer - if(getConstantElementValue(memInstr->getSource()) && memInstr->getNumEntries().hasLiteral(1_lit)) + if(getConstantElementValue(it->getSource()) && it->getNumEntries().hasLiteral(1_lit)) { CPPLOG_LAZY(logging::Level::DEBUG, log << "Constant element of constant buffer '" << local->to_string() @@ -493,7 +501,7 @@ static const Local* determineSingleMemoryAreaMapping(MemoryAccessMap& mapping, mapping[local].fallback = MemoryAccessType::RAM_READ_WRITE_VPM; } } - else if(auto conditonalWrites = checkAllWritersArePhiNodesOrSelections(local, {it.get(), memInstr})) + else if(auto conditonalWrites = checkAllWritersArePhiNodesOrSelections(local, {it.get()})) { // If the local is a PHI result from different memory addresses, we can still map it, if the // source addresses have the same access type. Since the "original" access types of the sources @@ -613,7 +621,7 @@ static const Local* determineSingleMemoryAreaMapping(MemoryAccessMap& mapping, log << "Failed to determine underlying memory area for '" << local->to_string() << "', trying again for its single source: " << sourceLocal->to_string() << logging::endl); auto memoryLocal = determineSingleMemoryAreaMapping( - mapping, it, sourceLocal, memInstr, conditionalWrittenMemoryAccesses, conditionalAddressWrites); + mapping, it, sourceLocal, conditionalWrittenMemoryAccesses, conditionalAddressWrites); // if we returned here, then we managed to (conditionally) map the local to a memory area. So edit the // current local to reference the inner most local if(local->type.getPointerType() && !local->get() && memoryLocal && @@ -629,7 +637,7 @@ static const Local* determineSingleMemoryAreaMapping(MemoryAccessMap& mapping, innerMappingIt->second.accessInstructions.emplace(it, local); const_cast(local)->set(ReferenceData(*memoryLocal, hasOffset ? ANY_ELEMENT : 0)); auto memoryParameter = memoryLocal->as(); - if(memoryParameter && memInstr->op != MemoryOperation::READ && + if(memoryParameter && it->op != MemoryOperation::READ && !has_flag(memoryParameter->decorations, ParameterDecorations::READ_ONLY)) { // Mark the original memory area as being written to. @@ -1007,7 +1015,7 @@ MemoryAccessInfo normalization::determineMemoryAccess(Method& method) } } for(const auto local : memInstr->getMemoryAreas()) - determineSingleMemoryAreaMapping(mapping, typeSafe(it, *memInstr), local, memInstr, + determineSingleMemoryAreaMapping(mapping, typeSafe(it, *memInstr), local, conditionalWrittenMemoryAccesses, conditionalAddressWrites); if(it.has()) allWalkers.emplace(it); diff --git a/src/periphery/VPM.cpp b/src/periphery/VPM.cpp index 93fda79a..ffbce55b 100644 --- a/src/periphery/VPM.cpp +++ b/src/periphery/VPM.cpp @@ -6,6 +6,7 @@ #include "VPM.h" +#include "../Expression.h" #include "../Profiler.h" #include "../intermediate/Helper.h" #include "../intermediate/VectorHelper.h" @@ -587,8 +588,28 @@ static NODISCARD InstructionWalker calculateElementOffsetInVPM(Method& method, I */ static bool isUnalignedMemoryVPMAccess(const Value& offset, DataType elementType) { - return elementType.getVectorWidth() > 1 && - (!offset.getLiteralValue() || (offset.getLiteralValue()->unsignedInt() % elementType.getInMemoryWidth()) != 0); + if(elementType.getVectorWidth() == 1) + return false; + if(auto litOffset = offset.getLiteralValue()) + return litOffset->unsignedInt() % elementType.getInMemoryWidth() != 0; + if(auto writer = offset.getSingleWriter()) + { + if(auto expression = Expression::createRecursiveExpression(*writer, 3)) + { + if(expression->code == OP_SHL && expression->arg1.getLiteralValue() && + (expression->arg1.getLiteralValue()->unsignedInt() % 32) >= + static_cast(std::log2(elementType.getInMemoryWidth()))) + return false; + if(expression->code == Expression::FAKEOP_UMUL && + ((expression->arg0.getLiteralValue() && + expression->arg0.getLiteralValue()->unsignedInt() % elementType.getInMemoryWidth() == 0) || + (expression->arg1.getLiteralValue() && + expression->arg1.getLiteralValue()->unsignedInt() % elementType.getInMemoryWidth() == 0))) + return false; + } + } + // unless we can prove otherwise, assume unaligned + return true; } InstructionWalker VPM::insertCopyRAM(Method& method, InstructionWalker it, const Value& destAddress, @@ -1358,7 +1379,6 @@ NODISCARD static InstructionWalker lowerReadVPM( (load(Literal(genericSetup.value)), InstructionDecorations::VPM_READ_CONFIGURATION); else if(isUnalignedMemoryVPMAccess(internalOffset, dataType)) { - // TODO make sure this block is only used where really really required! // TODO if inAreaOffset guaranteed to lie within one row, skip loading of second?! // TODO 64-bit version /* @@ -1451,7 +1471,6 @@ NODISCARD static InstructionWalker lowerWriteVPM( (load(Literal(genericSetup.value)), InstructionDecorations::VPM_WRITE_CONFIGURATION); else if(isUnalignedMemoryVPMAccess(internalOffset, dataType)) { - // TODO make sure this block is only used where really really required! // TODO if inAreaOffset guaranteed to lie within one row, skip loading of second?! // TODO 64-bit version /*