Commit
Implements aggressive register fixup step
This fixup step tries to group all scalar (including pointer) locals
into group vectors to reduce register pressure (see the sketch below).

Also removes the obsolete register-resolver round maximum.

Effects:
* fixes register association error for #143
* drastically increases the number of instructions where applied
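
A rough sketch of the effect (illustrative names only, written in the same notation as the doc comments in RegisterFixes.h): instead of keeping e.g. %x and %y alive in two separate registers across a long stretch of code, both are packed into one vector local after their writes and extracted again right before their distant reads:

  %x = ...
  %y = ...
  [...]
  ... = %x
  ... = %y

is converted to roughly:

  %x = ...
  %local_group[0] = %x     (vector insertion)
  %y = ...
  %local_group[1] = %y     (vector insertion)
  [...]
  %x' = %local_group >> 0  (vector extraction)
  ... = %x'
  %y' = %local_group >> 1  (vector extraction)
  ... = %y'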
doe300 committed Jun 29, 2020
1 parent 317c450 commit f986061
Showing 9 changed files with 218 additions and 16 deletions.
5 changes: 0 additions & 5 deletions include/config.h
@@ -138,11 +138,6 @@ namespace vc4c
*/
unsigned replaceNopThreshold = 16;

/*
* Maximum number of rounds the register-checker tries to resolve conflicts
*/
unsigned registerResolverMaxRounds = 6;

/*
* Depth of loops whose constants are moved outside of it
*
4 changes: 4 additions & 0 deletions src/CMakeLists.txt
@@ -55,6 +55,10 @@ if(OPT_FOUND)
target_compile_definitions(${VC4C_LIBRARY_NAME} PRIVATE OPT_PATH="${OPT_FOUND}")
target_compile_definitions(${VC4C_PROGRAM_NAME} PRIVATE OPT_PATH="${OPT_FOUND}")
endif()
if(LLVM_DIS_FOUND)
target_compile_definitions(${VC4C_LIBRARY_NAME} PRIVATE LLVM_DIS_PATH="${LLVM_DIS_FOUND}")
target_compile_definitions(${VC4C_PROGRAM_NAME} PRIVATE LLVM_DIS_PATH="${LLVM_DIS_FOUND}")
endif()

# Add all sources
include(sources.list)
6 changes: 2 additions & 4 deletions src/asm/CodeGenerator.cpp
@@ -94,10 +94,9 @@ const FastAccessList<DecoratedInstruction>& CodeGenerator::generateInstructions(

// check and fix possible errors with register-association
std::unique_ptr<GraphColoring> coloredGraph;
std::size_t round = 0;
auto stepIt = FIXUP_STEPS.begin();
FixupResult lastResult = FixupResult::FIXES_APPLIED_RECREATE_GRAPH;
while(round < config.additionalOptions.registerResolverMaxRounds && stepIt != FIXUP_STEPS.end())
while(stepIt != FIXUP_STEPS.end())
{
if(!coloredGraph || lastResult == FixupResult::FIXES_APPLIED_RECREATE_GRAPH)
{
@@ -119,11 +118,10 @@ const FastAccessList<DecoratedInstruction>& CodeGenerator::generateInstructions(
if(lastResult == FixupResult::ALL_FIXED)
// all errors were fixed
break;
++round;
++stepIt;
}

if(round >= config.additionalOptions.registerResolverMaxRounds || stepIt == FIXUP_STEPS.end())
if(stepIt == FIXUP_STEPS.end())
{
logging::warn() << "Register conflict resolver has exceeded its maximum rounds, there might still be errors!"
<< logging::endl;
181 changes: 179 additions & 2 deletions src/asm/RegisterFixes.cpp
@@ -32,7 +32,23 @@ const std::vector<std::pair<std::string, RegisterFixupStep>> qpu_asm::FIXUP_STEP
return coloredGraph.fixErrors() ? FixupResult::ALL_FIXED : FixupResult::FIXES_APPLIED_KEEP_GRAPH;
}},
// Try to group pointer parameters into vectors to save registers used
{"Group parameters", groupParameters}};
{"Group parameters", groupParameters},
// Try to group any scalar/pointer local into vectors to save registers used
{"Group scalar locals (conversative)",
[](Method& method, const Configuration& config, GraphColoring& coloredGraph) -> FixupResult {
// In the first run, skip e.g. locals which are used inside of (nested) loops
return groupScalarLocals(method, config, coloredGraph, true);
}},
{"Small rewrites",
[](Method& method, const Configuration& config, GraphColoring& coloredGraph) -> FixupResult {
// same as above
return coloredGraph.fixErrors() ? FixupResult::ALL_FIXED : FixupResult::FIXES_APPLIED_KEEP_GRAPH;
}},
{"Group scalar locals (aggressive)",
[](Method& method, const Configuration& config, GraphColoring& coloredGraph) -> FixupResult {
return groupScalarLocals(method, config, coloredGraph, false);
}},
};

FixupResult qpu_asm::groupParameters(Method& method, const Configuration& config, const GraphColoring& coloredGraph)
{
@@ -133,7 +149,7 @@ FixupResult qpu_asm::groupParameters(Method& method, const Configuration& config

auto tmp = method.addNewLocal((*paramIt)->type, (*paramIt)->name);
readerIt = intermediate::insertVectorExtraction(
readerIt, method, tmpGroup, Value(Literal(groupIndex), TYPE_INT8), tmp);
readerIt, method, tmpGroup, Value(SmallImmediate(groupIndex), TYPE_INT8), tmp);
readerIt->replaceLocal(*paramIt, tmp);
}

@@ -144,3 +160,164 @@ FixupResult qpu_asm::groupParameters(Method& method, const Configuration& config

return somethingChanged ? FixupResult::FIXES_APPLIED_RECREATE_GRAPH : FixupResult::NOTHING_FIXED;
}

static std::pair<const Local*, uint8_t> reserveGroupSpace(
Method& method, FastMap<const Local*, std::array<const Local*, NATIVE_VECTOR_SIZE>>& groups, const Local* loc)
{
for(auto& group : groups)
{
for(uint8_t i = 0; i < group.second.size(); ++i)
{
if(group.second[i] == nullptr)
{
group.second[i] = loc;
return std::make_pair(group.first, i);
}
}
}
// insert new group
auto group = method.addNewLocal(TYPE_INT32.toVectorType(NATIVE_VECTOR_SIZE), "%local_group").local();
groups[group].fill(nullptr);
groups[group][0] = loc;
return std::make_pair(group, 0);
}

FixupResult qpu_asm::groupScalarLocals(
Method& method, const Configuration& config, const GraphColoring& coloredGraph, bool runConservative)
{
FastMap<const Local*, InstructionWalker> candidateLocals;
FastMap<const Local*, FastSet<InstructionWalker>> localReaders;
for(auto& block : method)
{
FastMap<const Local*, InstructionWalker> localsWrittenInBlock;
// the value is a list (as in not unique) on purpose to keep the count correct for e.g. combined instructions
FastMap<const Local*, FastAccessList<InstructionWalker>> localsReadInBlock;
for(auto it = block.walk(); !it.isEndOfBlock(); it.nextInBlock())
{
if(!it.get())
continue;
it->forUsedLocals(
[&](const Local* loc, LocalUse::Type use, const intermediate::IntermediateInstruction& instr) {
if(loc->type.isLabelType() || (!loc->type.isScalarType() && !loc->type.getPointerType()))
// XXX out of simplicity, for now only handle scalar values
return;
if(has_flag(use, LocalUse::Type::WRITER))
{
if(!instr.hasConditionalExecution() && loc->getSingleWriter() == &instr &&
!instr.hasDecoration(intermediate::InstructionDecorations::PHI_NODE))
// track only unconditionally written locals with a single writer
// XXX otherwise we would need to write-back the updated result into the group vector
// also don't track phi-writes, since they are always used immediately after the branch
// destination label and therefore don't really have a large usage range
localsWrittenInBlock.emplace(loc, it);
}
if(has_flag(use, LocalUse::Type::READER))
{
localsReadInBlock[loc].emplace_back(it);
localReaders[loc].emplace(it);
}
});
}

for(auto& pair : localsWrittenInBlock)
{
auto readIt = localsReadInBlock.find(pair.first);
if(readIt != localsReadInBlock.end())
{
if(readIt->second.size() == pair.first->getUsers(LocalUse::Type::READER).size())
// exclude all locals which have all readers inside the block they are written in - locals which are
// only live inside a single block
// TODO to simplify, could skip all locals which have any readers inside the block they are
// written?! this would reduce the number of locals grouped and therefore the number of registers
// freed, but also the number of instructions inserted
continue;

if(!readIt->second.empty())
{
// if the local is read inside this and other blocks, do some optimization
// 1. remove all reads inside this block from the tracked reads, so we do not spill/rematerialize
// inside a single block
auto& readers = localReaders[pair.first];
for(auto& inst : readIt->second)
readers.erase(inst);
// 2. move the position to insert the spilling code after the last read
candidateLocals.emplace(pair.first, readIt->second.back());
}
}
candidateLocals.emplace(pair);
}
}

if(runConservative)
{
analysis::LocalUsageRangeAnalysis localUsageRangeAnalysis(&coloredGraph.getLivenessAnalysis());
localUsageRangeAnalysis(method);

for(const auto& entry : localUsageRangeAnalysis.getOverallUsages())
{
auto candIt = candidateLocals.find(entry.local);
if(candIt == candidateLocals.end())
continue;
if(entry.numLoops > 1)
// skip locals used in nested loops
candidateLocals.erase(candIt);
else if(entry.numAccesses > 8)
// skip locals with too many accesses
candidateLocals.erase(candIt);
}
}

if(candidateLocals.size() < 4)
// grouping only a few locals does not free enough registers to be worth the extra instructions
return FixupResult::NOTHING_FIXED;

// map of "original" local -> group local (+ index in group) where it is located in
FastMap<const Local*, std::pair<const Local*, uint8_t>> currentlyGroupedLocals;
// map of group local -> contained "original" locals and their positions
FastMap<const Local*, std::array<const Local*, NATIVE_VECTOR_SIZE>> groups;

for(auto& entry : candidateLocals)
{
// allocate an element in our spill register
auto pos = reserveGroupSpace(method, groups, entry.first);
currentlyGroupedLocals.emplace(entry.first, pos);
CPPLOG_LAZY(logging::Level::DEBUG,
log << "Grouping local '" << entry.first->to_string() << "' into " << pos.first->to_string() << " at index "
<< static_cast<unsigned>(pos.second) << logging::endl);

// after the local write (or the last read within the writing block), insert the spill code
// we spill a copy to not force the local to an accumulator (since we do full vector rotation)
{
auto spillIt = entry.second.nextInBlock();
auto tmpValue = assign(spillIt, entry.first->type) = entry.first->createReference();
spillIt = intermediate::insertVectorInsertion(entry.second.nextInBlock(), method,
pos.first->createReference(), Value(SmallImmediate(pos.second), TYPE_INT8), tmpValue);
if(!spillIt.isEndOfBlock() && spillIt.get() && spillIt->readsLocal(pos.first))
// if we happen to insert a write just before a read of the same container, insert a NOP to prevent the
// container from being forced to an accumulator
nop(spillIt, intermediate::DelayType::WAIT_REGISTER);
}

// before all reads (not located in the writing block), insert a dematerialization and use a new temporary
// local in the read
auto& readers = localReaders[entry.first];
for(auto reader : readers)
{
auto checkIt = reader.copy().previousInBlock();
if(!checkIt.isEndOfBlock() && checkIt.get() && checkIt->writesLocal(pos.first))
// if we happen to insert a read just after a write of the same container, insert a NOP to prevent the
// container from being forced to an accumulator
nop(reader, intermediate::DelayType::WAIT_REGISTER);
// need to extract from temporary container to not force the group container to an accumulator
auto tmpValue = method.addNewLocal(entry.first->type, entry.first->name);
auto tmpContainer = assign(reader, pos.first->type) = pos.first->createReference();
reader = intermediate::insertVectorExtraction(
reader, method, tmpContainer, Value(SmallImmediate(pos.second), TYPE_INT8), tmpValue);
if(reader->hasUnpackMode() || reader->readsLiteral() || reader.get<intermediate::VectorRotation>())
// insert a NOP before the actually reading instruction to allow for e.g. unpack-modes
nop(reader, intermediate::DelayType::WAIT_REGISTER);
reader->replaceLocal(entry.first, tmpValue);
}
}
return FixupResult::FIXES_APPLIED_RECREATE_GRAPH;
}
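
To complement the example in RegisterFixes.h, here is a hedged sketch (illustrative names only) of the sequence the code above emits for one grouped scalar local. The value is first copied into a temporary and then inserted into the group vector, and every reader outside the writing block extracts from a temporary copy of the group vector, so neither the original local nor the long-lived group vector is forced into an accumulator by the full vector rotation; a NOP is inserted wherever a write of the group container would otherwise directly neighbor a read of it:

  (after the write of %x, or after its last read inside the writing block)
  %tmp = %x
  %local_group = insert %tmp at element 3      (insertVectorInsertion)
  nop                                          (only if the next instruction reads %local_group)

  (before every reader of %x outside the writing block)
  %tmp_container = %local_group
  %x.1 = %tmp_container >> 3                   (insertVectorExtraction, i.e. vector rotation)
  ... = %x.1                                   (reader now uses %x.1 instead of %x)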
29 changes: 29 additions & 0 deletions src/asm/RegisterFixes.h
@@ -74,6 +74,35 @@ namespace vc4c
* livenesses changed.
*/
FixupResult groupParameters(Method& method, const Configuration& config, const GraphColoring& coloredGraph);

/**
* Reduces register pressure by grouping scalar and pointer locals into the elements of vector locals.
*
* NOTE: This modification increases the number of instructions drastically!
*
* Example:
* %in = reg uniform
* %out = reg uniform
* [...]
* %in_addr = %in
* [...]
* %out_addr = %out
*
* is converted to:
* - = xor reg elem_num, 0 (setf)
* %param_group = reg uniform (ifz)
* - = xor reg elem_num, 1 (setf)
%param_group = reg uniform (ifz)
* [...]
* %in_addr = %param_group >> 0
* [...]
* %out_addr = %param_group >> 1
*
* Returns whether at least one group of scalar locals was created and therefore instructions and local
* livenesses changed.
*/
FixupResult groupScalarLocals(
Method& method, const Configuration& config, const GraphColoring& coloredGraph, bool runConservative);
} // namespace qpu_asm

} // namespace vc4c
2 changes: 0 additions & 2 deletions src/main.cpp
@@ -60,8 +60,6 @@ static void printHelp()
<< "\tThe maximum live-range of a local still considered to be mapped to an accumulator" << std::endl;
std::cout << "\t--freplace-nop-threshold=" << defaultConfig.additionalOptions.replaceNopThreshold
<< "\tThe number of instructions to search for a replacement for NOPs" << std::endl;
std::cout << "\t--fregister-resolver-rounds=" << defaultConfig.additionalOptions.registerResolverMaxRounds
<< "\tThe maximum number of rows for the register allocator" << std::endl;
std::cout << "\t--fmove-constants-depth=" << defaultConfig.additionalOptions.moveConstantsDepth
<< "\tThe maximum depth of nested loops to move constants out of" << std::endl;
std::cout << "\t--foptimization-iterations=" << defaultConfig.additionalOptions.maxOptimizationIterations
2 changes: 1 addition & 1 deletion src/normalization/LongOperations.cpp
@@ -384,7 +384,7 @@ void normalization::lowerLongOperation(
else if(!move->hasOtherSideEffects(intermediate::SideEffectType::FLAGS) &&
(move->getSource().type.getScalarBitCount() <= 32 || move->getSource().getConstantValue()))
{
assign(it, out->lower->createReference()) = move->getSource();
assign(it, out->lower->createReference()) = (move->getSource(), move->getCondition());
move->setOutput(out->upper->createReference());
move->setSource(Value(0_lit, move->getSource().type));
}
2 changes: 0 additions & 2 deletions src/tools/options.cpp
@@ -179,8 +179,6 @@ bool tools::parseConfigurationParameter(Configuration& config, const std::string
config.additionalOptions.accumulatorThreshold = static_cast<unsigned>(intValue);
else if(paramName == "replace-nop-threshold")
config.additionalOptions.replaceNopThreshold = static_cast<unsigned>(intValue);
else if(paramName == "register-resolver-rounds")
config.additionalOptions.registerResolverMaxRounds = static_cast<unsigned>(intValue);
else if(paramName == "move-constants-depth")
config.additionalOptions.moveConstantsDepth = intValue;
else if(paramName == "optimization-iterations")
3 changes: 3 additions & 0 deletions test/test_cases.h
@@ -267,10 +267,13 @@ namespace vc4c
{toParameter(std::vector<unsigned>{0xFFFFFFFFu}), toParameter(std::vector<int>{0, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 11, 11, 12, 13})
}, toConfig(8, 1, 1, 2, 1, 1), maxExecutionCycles), addVector({}, 0, std::vector<unsigned>{7})
),
#ifndef SPIRV_FRONTEND
// 64-bit constants (that have a high word of neither zero nor one) are not yet implemented for SPIR-V front-end
std::make_pair(EmulationData(VC4C_ROOT_PATH "testing/boost-compute/test_functional_popcount.cl", "copy",
{toParameter(std::vector<unsigned>(4)), toParameter(std::vector<unsigned>{1, 0, 17, 0, 1, 2, 15, 1}), toScalarParameter(4)
}, toConfig(8, 1, 1), maxExecutionCycles), addVector({}, 0, std::vector<unsigned>{1, 2, 2, 5})
)
#endif
};

//TODO NVIDIA/matrixMul, NVIDIA/transpose, OpenCLIPP/Arithmetic, OpenCLIPP/Logic, OpenCLIPP/Thresholding, test_signedness
