From f986061d2c39c7b5feba5625b69e8f31c0b1150a Mon Sep 17 00:00:00 2001 From: doe300 Date: Mon, 29 Jun 2020 16:56:55 +0200 Subject: [PATCH] Implements aggressive register fixup step This fixup-step will try to group all scalar (including pointer) locals into group vectors to reduce the register pressure. Also removes the obsolete register-resolver round maximum. Effects: * fixes register association error for #143 * drastically increases number of instructions where applied --- include/config.h | 5 - src/CMakeLists.txt | 4 + src/asm/CodeGenerator.cpp | 6 +- src/asm/RegisterFixes.cpp | 181 ++++++++++++++++++++++++++- src/asm/RegisterFixes.h | 29 +++++ src/main.cpp | 2 - src/normalization/LongOperations.cpp | 2 +- src/tools/options.cpp | 2 - test/test_cases.h | 3 + 9 files changed, 218 insertions(+), 16 deletions(-) diff --git a/include/config.h b/include/config.h index dab406b9..14472a39 100644 --- a/include/config.h +++ b/include/config.h @@ -138,11 +138,6 @@ namespace vc4c */ unsigned replaceNopThreshold = 16; - /* - * Maximum number of rounds the register-checker tries to resolve conflicts - */ - unsigned registerResolverMaxRounds = 6; - /* * Depth of loops whose constants are moved to out side of it * diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f485f00e..e16fcabd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,6 +55,10 @@ if(OPT_FOUND) target_compile_definitions(${VC4C_LIBRARY_NAME} PRIVATE OPT_PATH="${OPT_FOUND}") target_compile_definitions(${VC4C_PROGRAM_NAME} PRIVATE OPT_PATH="${OPT_FOUND}") endif() +if(LLVM_DIS_FOUND) + target_compile_definitions(${VC4C_LIBRARY_NAME} PRIVATE LLVM_DIS_PATH="${LLVM_DIS_FOUND}") + target_compile_definitions(${VC4C_PROGRAM_NAME} PRIVATE LLVM_DIS_PATH="${LLVM_DIS_FOUND}") +endif() # Add all sources include(sources.list) diff --git a/src/asm/CodeGenerator.cpp b/src/asm/CodeGenerator.cpp index 62f67196..8499e17a 100644 --- a/src/asm/CodeGenerator.cpp +++ b/src/asm/CodeGenerator.cpp @@ -94,10 +94,9 @@ 
const FastAccessList& CodeGenerator::generateInstructions( // check and fix possible errors with register-association std::unique_ptr coloredGraph; - std::size_t round = 0; auto stepIt = FIXUP_STEPS.begin(); FixupResult lastResult = FixupResult::FIXES_APPLIED_RECREATE_GRAPH; - while(round < config.additionalOptions.registerResolverMaxRounds && stepIt != FIXUP_STEPS.end()) + while(stepIt != FIXUP_STEPS.end()) { if(!coloredGraph || lastResult == FixupResult::FIXES_APPLIED_RECREATE_GRAPH) { @@ -119,11 +118,10 @@ const FastAccessList& CodeGenerator::generateInstructions( if(lastResult == FixupResult::ALL_FIXED) // all errors were fixed break; - ++round; ++stepIt; } - if(round >= config.additionalOptions.registerResolverMaxRounds || stepIt == FIXUP_STEPS.end()) + if(stepIt == FIXUP_STEPS.end()) { logging::warn() << "Register conflict resolver has exceeded its maximum rounds, there might still be errors!" << logging::endl; diff --git a/src/asm/RegisterFixes.cpp b/src/asm/RegisterFixes.cpp index f3f7bd21..abaf9205 100644 --- a/src/asm/RegisterFixes.cpp +++ b/src/asm/RegisterFixes.cpp @@ -32,7 +32,23 @@ const std::vector> qpu_asm::FIXUP_STEP return coloredGraph.fixErrors() ? FixupResult::ALL_FIXED : FixupResult::FIXES_APPLIED_KEEP_GRAPH; }}, // Try to group pointer parameters into vectors to save registers used - {"Group parameters", groupParameters}}; + {"Group parameters", groupParameters}, + // Try to group any scalar/pointer local into vectors to save registers used + {"Group scalar locals (conversative)", + [](Method& method, const Configuration& config, GraphColoring& coloredGraph) -> FixupResult { + // In the first run, skip e.g. locals which are used inside of (nested) loops + return groupScalarLocals(method, config, coloredGraph, true); + }}, + {"Small rewrites", + [](Method& method, const Configuration& config, GraphColoring& coloredGraph) -> FixupResult { + // same as above + return coloredGraph.fixErrors() ? 
FixupResult::ALL_FIXED : FixupResult::FIXES_APPLIED_KEEP_GRAPH; + }}, + {"Group scalar locals (aggressive)", + [](Method& method, const Configuration& config, GraphColoring& coloredGraph) -> FixupResult { + return groupScalarLocals(method, config, coloredGraph, false); + }}, +}; FixupResult qpu_asm::groupParameters(Method& method, const Configuration& config, const GraphColoring& coloredGraph) { @@ -133,7 +149,7 @@ FixupResult qpu_asm::groupParameters(Method& method, const Configuration& config auto tmp = method.addNewLocal((*paramIt)->type, (*paramIt)->name); readerIt = intermediate::insertVectorExtraction( - readerIt, method, tmpGroup, Value(Literal(groupIndex), TYPE_INT8), tmp); + readerIt, method, tmpGroup, Value(SmallImmediate(groupIndex), TYPE_INT8), tmp); readerIt->replaceLocal(*paramIt, tmp); } @@ -144,3 +160,164 @@ FixupResult qpu_asm::groupParameters(Method& method, const Configuration& config return somethingChanged ? FixupResult::FIXES_APPLIED_RECREATE_GRAPH : FixupResult::NOTHING_FIXED; } + +static std::pair reserveGroupSpace( + Method& method, FastMap>& groups, const Local* loc) +{ + for(auto& group : groups) + { + for(uint8_t i = 0; i < group.second.size(); ++i) + { + if(group.second[i] == nullptr) + { + group.second[i] = loc; + return std::make_pair(group.first, i); + } + } + } + // insert new group + auto group = method.addNewLocal(TYPE_INT32.toVectorType(NATIVE_VECTOR_SIZE), "%local_group").local(); + groups[group].fill(nullptr); + groups[group][0] = loc; + return std::make_pair(group, 0); +} + +FixupResult qpu_asm::groupScalarLocals( + Method& method, const Configuration& config, const GraphColoring& coloredGraph, bool runConservative) +{ + FastMap candidateLocals; + FastMap> localReaders; + for(auto& block : method) + { + FastMap localsWrittenInBlock; + // the value is a list (as in not-unique) on purpose to keep the count correct for i.e. 
combined instructions + FastMap> localsReadInBlock; + for(auto it = block.walk(); !it.isEndOfBlock(); it.nextInBlock()) + { + if(!it.get()) + continue; + it->forUsedLocals( + [&](const Local* loc, LocalUse::Type use, const intermediate::IntermediateInstruction& instr) { + if(loc->type.isLabelType() || (!loc->type.isScalarType() && !loc->type.getPointerType())) + // XXX out of simplicity, for now only handle scalar values + return; + if(has_flag(use, LocalUse::Type::WRITER)) + { + if(!instr.hasConditionalExecution() && loc->getSingleWriter() == &instr && + !instr.hasDecoration(intermediate::InstructionDecorations::PHI_NODE)) + // track only unconditionally written locals with a single writer + // XXX otherwise we would need to write-back the updated result into the group vector + // also don't track phi-writes, since they are always used immediately after the branch + // destination label and therefore don't really have a large usage range + localsWrittenInBlock.emplace(loc, it); + } + if(has_flag(use, LocalUse::Type::READER)) + { + localsReadInBlock[loc].emplace_back(it); + localReaders[loc].emplace(it); + } + }); + } + + for(auto& pair : localsWrittenInBlock) + { + auto readIt = localsReadInBlock.find(pair.first); + if(readIt != localsReadInBlock.end()) + { + if(readIt->second.size() == pair.first->getUsers(LocalUse::Type::READER).size()) + // exclude all locals which have all readers inside the block they are written in - locals which are + // only live inside a single block + // TODO to simplify, could skip all locals which have any readers inside the block they are + // written?! this would reduce the number of locals grouped and therefore the number of registers + // freed, but also the number of instructions inserted + continue; + + if(!readIt->second.empty()) + { + // if the local is read inside this and other blocks, do some optimization + // 1. 
remove all reads inside this block from the tracked reads, so we do not spill/rematerialize + // inside a single block + auto& readers = localReaders[pair.first]; + for(auto& inst : readIt->second) + readers.erase(inst); + // 2. move the position to insert the spilling code after the last read + candidateLocals.emplace(pair.first, readIt->second.back()); + } + } + candidateLocals.emplace(pair); + } + } + + if(runConservative) + { + analysis::LocalUsageRangeAnalysis localUsageRangeAnalysis(&coloredGraph.getLivenessAnalysis()); + localUsageRangeAnalysis(method); + + for(const auto& entry : localUsageRangeAnalysis.getOverallUsages()) + { + auto candIt = candidateLocals.find(entry.local); + if(candIt == candidateLocals.end()) + continue; + if(entry.numLoops > 1) + // skip locals used in nested loops + candidateLocals.erase(candIt); + else if(entry.numAccesses > 8) + // skip locals with too many accesses + candidateLocals.erase(candIt); + } + } + + if(candidateLocals.size() < 4) + // for 0 and 1 locals this does not help anything at all anyway... 
+ return FixupResult::NOTHING_FIXED; + + // map of "original" local -> group local (+ index in group) where it is located in + FastMap> currentlyGroupedLocals; + // map of group local -> contained "original" locals and their positions + FastMap> groups; + + for(auto& entry : candidateLocals) + { + // allocate an element in our spill register + auto pos = reserveGroupSpace(method, groups, entry.first); + currentlyGroupedLocals.emplace(entry.first, pos); + CPPLOG_LAZY(logging::Level::DEBUG, + log << "Grouping local '" << entry.first->to_string() << "' into " << pos.first->to_string() << " at index " + << static_cast(pos.second) << logging::endl); + + // after the local write (or the last read within the writing block), insert the spill code + // we spill a copy to not force the local to an accumulator (since we do full vector rotation) + { + auto spillIt = entry.second.nextInBlock(); + auto tmpValue = assign(spillIt, entry.first->type) = entry.first->createReference(); + spillIt = intermediate::insertVectorInsertion(entry.second.nextInBlock(), method, + pos.first->createReference(), Value(SmallImmediate(pos.second), TYPE_INT8), tmpValue); + if(!spillIt.isEndOfBlock() && spillIt.get() && spillIt->readsLocal(pos.first)) + // if we happen to insert a write just before a read of the same container, insert a NOP to prevent the + // container to be forced to an accumulator + nop(spillIt, intermediate::DelayType::WAIT_REGISTER); + } + + // before all reads (not located in the writing block), insert a dematerialization and use a new temporary + // local in the read + auto& readers = localReaders[entry.first]; + for(auto reader : readers) + { + auto checkIt = reader.copy().previousInBlock(); + if(!checkIt.isEndOfBlock() && checkIt.get() && checkIt->writesLocal(pos.first)) + // if we happen to insert a read just after a write of the same container, insert a NOP to prevent the + // container to be forced to an accumulator + nop(reader, intermediate::DelayType::WAIT_REGISTER); + 
// need to extract from temporary container to not force the group container to an accumulator + auto tmpValue = method.addNewLocal(entry.first->type, entry.first->name); + auto tmpContainer = assign(reader, pos.first->type) = pos.first->createReference(); + reader = intermediate::insertVectorExtraction( + reader, method, tmpContainer, Value(SmallImmediate(pos.second), TYPE_INT8), tmpValue); + if(reader->hasUnpackMode() || reader->readsLiteral() || reader.get()) + // insert a NOP before the actually reading instruction to allow for e.g. unpack-modes + nop(reader, intermediate::DelayType::WAIT_REGISTER); + reader->replaceLocal(entry.first, tmpValue); + } + } + return FixupResult::FIXES_APPLIED_RECREATE_GRAPH; +} diff --git a/src/asm/RegisterFixes.h b/src/asm/RegisterFixes.h index aa89f590..014324f7 100644 --- a/src/asm/RegisterFixes.h +++ b/src/asm/RegisterFixes.h @@ -74,6 +74,35 @@ namespace vc4c * livenesses changed. */ FixupResult groupParameters(Method& method, const Configuration& config, const GraphColoring& coloredGraph); + + /** + * Reduces register pressure by grouping scalar and pointer locals into the elements of vector locals. + * + * NOTE: This modification increases the number of instructions drastically! + * + * Example: + * %in = reg uniform + * %out = reg uniform + * [...] + * %in_addr = %in + * [...] + * %out_addr = %out + * + * is converted to: + * - = xor reg elem_num, 0 (setf) + * %param_group = reg uniform (ifz) + * - = xor reg elem_num, 1 (setf) + * %param_group reg uniform (ifz) + * [...] + * %in_addr = %param_group >> 0 + * [...] + * %out_addr = %param_group >> 1 + * + * Returns whether at least one group of scalar locals was created and therefore instructions and local + * livenesses changed. 
+ */ + FixupResult groupScalarLocals( + Method& method, const Configuration& config, const GraphColoring& coloredGraph, bool runConservative); } // namespace qpu_asm } // namespace vc4c diff --git a/src/main.cpp b/src/main.cpp index e914407f..a38470e4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -60,8 +60,6 @@ static void printHelp() << "\tThe maximum live-range of a local still considered to be mapped to an accumulator" << std::endl; std::cout << "\t--freplace-nop-threshold=" << defaultConfig.additionalOptions.replaceNopThreshold << "\tThe number of instructions to search for a replacement for NOPs" << std::endl; - std::cout << "\t--fregister-resolver-rounds=" << defaultConfig.additionalOptions.registerResolverMaxRounds - << "\tThe maximum number of rows for the register allocator" << std::endl; std::cout << "\t--fmove-constants-depth=" << defaultConfig.additionalOptions.moveConstantsDepth << "\tThe maximum depth of nested loops to move constants out of" << std::endl; std::cout << "\t--foptimization-iterations=" << defaultConfig.additionalOptions.maxOptimizationIterations diff --git a/src/normalization/LongOperations.cpp b/src/normalization/LongOperations.cpp index 0b09a930..4e854925 100644 --- a/src/normalization/LongOperations.cpp +++ b/src/normalization/LongOperations.cpp @@ -384,7 +384,7 @@ void normalization::lowerLongOperation( else if(!move->hasOtherSideEffects(intermediate::SideEffectType::FLAGS) && (move->getSource().type.getScalarBitCount() <= 32 || move->getSource().getConstantValue())) { - assign(it, out->lower->createReference()) = move->getSource(); + assign(it, out->lower->createReference()) = (move->getSource(), move->getCondition()); move->setOutput(out->upper->createReference()); move->setSource(Value(0_lit, move->getSource().type)); } diff --git a/src/tools/options.cpp b/src/tools/options.cpp index 15ee89de..353e1e1d 100644 --- a/src/tools/options.cpp +++ b/src/tools/options.cpp @@ -179,8 +179,6 @@ bool 
tools::parseConfigurationParameter(Configuration& config, const std::string config.additionalOptions.accumulatorThreshold = static_cast(intValue); else if(paramName == "replace-nop-threshold") config.additionalOptions.replaceNopThreshold = static_cast(intValue); - else if(paramName == "register-resolver-rounds") - config.additionalOptions.registerResolverMaxRounds = static_cast(intValue); else if(paramName == "move-constants-depth") config.additionalOptions.moveConstantsDepth = intValue; else if(paramName == "optimization-iterations") diff --git a/test/test_cases.h b/test/test_cases.h index eaf46be1..d54f9634 100644 --- a/test/test_cases.h +++ b/test/test_cases.h @@ -267,10 +267,13 @@ namespace vc4c {toParameter(std::vector{0xFFFFFFFFu}), toParameter(std::vector{0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 11, 12, 13}) }, toConfig(8, 1, 1, 2, 1, 1), maxExecutionCycles), addVector({}, 0, std::vector{7}) ), +#ifndef SPIRV_FRONTEND +// 64-bit constants (that have a high word of neither zero nor one) are not yet implemented for SPIR-V front-end std::make_pair(EmulationData(VC4C_ROOT_PATH "testing/boost-compute/test_functional_popcount.cl", "copy", {toParameter(std::vector(4)), toParameter(std::vector{1, 0, 17, 0, 1, 2, 15, 1}), toScalarParameter(4) }, toConfig(8, 1, 1), maxExecutionCycles), addVector({}, 0, std::vector{1, 2, 2, 5}) ) +#endif }; //TODO NVIDIA/matrixMul, NVIDIA/transpose, OpenCLIPP/Arithmetic, OpenCLIPP/Logic, OpenCLIPP/Thresholding, test_signedness