From a3a266207c7b0d1938801def3b4934cf8f1c92d1 Mon Sep 17 00:00:00 2001 From: jeff Date: Tue, 12 Apr 2022 01:55:24 +0000 Subject: [PATCH 1/3] initialize issueTypes_ --- lib/Scheduler/machine_model.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lib/Scheduler/machine_model.cpp b/lib/Scheduler/machine_model.cpp index 6fd89ae3..52d59230 100644 --- a/lib/Scheduler/machine_model.cpp +++ b/lib/Scheduler/machine_model.cpp @@ -17,6 +17,31 @@ using std::vector; MachineModel::MachineModel(const std::string &modelFile) { SpecsBuffer buf; buf.Load(modelFile.c_str()); + + char buffer[MAX_NAMESIZE]; + + buf.ReadSpec("MODEL_NAME:", buffer); + mdlName_ = buffer; + + issueRate_ = buf.ReadIntSpec("ISSUE_RATE:"); + + int numIssueTypes = buf.ReadIntSpec("ISSUE_TYPE_COUNT:"); + + issueTypes_.resize(numIssueTypes > 0 ? numIssueTypes : 1); + if (numIssueTypes > 0) { + for (size_t j = 0; j < issueTypes_.size(); j++) { + int pieceCnt; + char *strngs[INBUF_MAX_PIECES_PERLINE]; + int lngths[INBUF_MAX_PIECES_PERLINE]; + buf.GetNxtVldLine(pieceCnt, strngs, lngths); + + if (pieceCnt != 2) + llvm::report_fatal_error("Invalid issue type spec", false); + + issueTypes_[j].name = strngs[0]; + issueTypes_[j].slotsCount = atoi(strngs[1]); + } + } } MachineModel::MachineModel(SpecsBuffer &buf) { From f6b6105e90f9ff39ed0873ebdbef3dbd1b73ab9e Mon Sep 17 00:00:00 2001 From: jeff Date: Thu, 12 May 2022 09:34:39 -0700 Subject: [PATCH 2/3] Refactor of MachineModel construction --- include/opt-sched/Scheduler/machine_model.h | 2 ++ lib/Scheduler/machine_model.cpp | 40 +++++---------------- 2 files changed, 11 insertions(+), 31 deletions(-) diff --git a/include/opt-sched/Scheduler/machine_model.h b/include/opt-sched/Scheduler/machine_model.h index 95905749..b4c9753f 100644 --- a/include/opt-sched/Scheduler/machine_model.h +++ b/include/opt-sched/Scheduler/machine_model.h @@ -189,6 +189,8 @@ class MachineModel { std::vector registerTypes_; // A vector of issue types with their 
names and slot counts. std::vector issueTypes_; + + void parseBuffer(SpecsBuffer &buf); }; } // namespace opt_sched diff --git a/lib/Scheduler/machine_model.cpp b/lib/Scheduler/machine_model.cpp index 52d59230..e8029df7 100644 --- a/lib/Scheduler/machine_model.cpp +++ b/lib/Scheduler/machine_model.cpp @@ -14,37 +14,7 @@ using namespace llvm::opt_sched; using std::string; using std::vector; -MachineModel::MachineModel(const std::string &modelFile) { - SpecsBuffer buf; - buf.Load(modelFile.c_str()); - - char buffer[MAX_NAMESIZE]; - - buf.ReadSpec("MODEL_NAME:", buffer); - mdlName_ = buffer; - - issueRate_ = buf.ReadIntSpec("ISSUE_RATE:"); - - int numIssueTypes = buf.ReadIntSpec("ISSUE_TYPE_COUNT:"); - - issueTypes_.resize(numIssueTypes > 0 ? numIssueTypes : 1); - if (numIssueTypes > 0) { - for (size_t j = 0; j < issueTypes_.size(); j++) { - int pieceCnt; - char *strngs[INBUF_MAX_PIECES_PERLINE]; - int lngths[INBUF_MAX_PIECES_PERLINE]; - buf.GetNxtVldLine(pieceCnt, strngs, lngths); - - if (pieceCnt != 2) - llvm::report_fatal_error("Invalid issue type spec", false); - - issueTypes_[j].name = strngs[0]; - issueTypes_[j].slotsCount = atoi(strngs[1]); - } - } -} - -MachineModel::MachineModel(SpecsBuffer &buf) { +void MachineModel::parseBuffer(SpecsBuffer &buf) { char buffer[MAX_NAMESIZE]; buf.ReadSpec("MODEL_NAME:", buffer); @@ -116,6 +86,14 @@ MachineModel::MachineModel(SpecsBuffer &buf) { } } +MachineModel::MachineModel(const std::string &modelFile) { + SpecsBuffer buf; + buf.Load(modelFile.c_str()); + parseBuffer(buf); +} + +MachineModel::MachineModel(SpecsBuffer &buf) { parseBuffer(buf); } + InstType MachineModel::GetInstTypeByName(const string &typeName, const string &prevName) const { string composite = prevName.size() ? 
typeName + "_after_" + prevName : ""; From 4aefcc94fe282f4c6de85b90fb2b89636bb9643f Mon Sep 17 00:00:00 2001 From: jeff Date: Fri, 20 May 2022 16:42:36 -0700 Subject: [PATCH 3/3] Integration with new ROCm --- CMakeLists.txt | 21 +- cmake/superbuild/CMakeLists.txt | 13 +- example/optsched-cfg/hotfuncs.ini | 126 +++++++++ example/optsched-cfg/occupancy_limits.ini | 248 ++++++++++++++++++ example/optsched-cfg/sched.ini | 51 ++-- include/opt-sched/Scheduler/OptSchedTarget.h | 16 +- include/opt-sched/Scheduler/bb_spill.h | 5 +- include/opt-sched/Scheduler/data_dep.h | 8 +- include/opt-sched/Scheduler/defines.h | 12 + include/opt-sched/Scheduler/enumerator.h | 19 +- include/opt-sched/Scheduler/hist_table.h | 2 + include/opt-sched/Scheduler/list_sched.h | 4 + include/opt-sched/Scheduler/lnkd_lst.h | 4 +- include/opt-sched/Scheduler/machine_model.h | 7 +- .../opt-sched/Scheduler/sched_basic_data.h | 11 +- include/opt-sched/Scheduler/sched_region.h | 6 +- include/opt-sched/Scheduler/stats.h | 8 +- lib/CMakeLists.txt | 28 +- lib/Scheduler/aco.cpp | 4 +- lib/Scheduler/bb_spill.cpp | 48 ++-- lib/Scheduler/config.cpp | 4 +- lib/Scheduler/data_dep.cpp | 36 +-- lib/Scheduler/enumerator.cpp | 95 ++++++- lib/Scheduler/graph_trans_ilp.cpp | 2 +- lib/Scheduler/hist_table.cpp | 12 +- lib/Scheduler/list_sched.cpp | 38 ++- lib/Scheduler/machine_model.cpp | 24 +- lib/Scheduler/reg_alloc.cpp | 61 ++++- lib/Scheduler/register.cpp | 126 +++++---- lib/Scheduler/sched_basic_data.cpp | 28 +- lib/Scheduler/sched_region.cpp | 25 +- lib/Scheduler/stats.cpp | 1 + lib/Wrapper/AMDGPU/GCNOptSched.cpp | 14 +- lib/Wrapper/AMDGPU/GCNOptSched.h | 2 +- lib/Wrapper/AMDGPU/GCNOptSchedReg.h | 30 +++ lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp | 146 +++++++++-- lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp | 93 ++++++- lib/Wrapper/OptSchedDDGWrapperBasic.cpp | 64 ++++- lib/Wrapper/OptSchedDDGWrapperBasic.h | 1 + lib/Wrapper/OptSchedGenericTarget.cpp | 21 +- lib/Wrapper/OptSchedMachineWrapper.cpp | 9 +- 
lib/Wrapper/OptSchedMachineWrapper.h | 1 - lib/Wrapper/OptSchedReg.h | 30 +++ lib/Wrapper/OptimizingScheduler.cpp | 177 +++++++++---- lib/Wrapper/OptimizingScheduler.h | 23 +- test/CMakeLists.txt | 2 +- unittests/CMakeLists.txt | 2 +- 47 files changed, 1374 insertions(+), 334 deletions(-) create mode 100644 example/optsched-cfg/occupancy_limits.ini create mode 100644 lib/Wrapper/AMDGPU/GCNOptSchedReg.h create mode 100644 lib/Wrapper/OptSchedReg.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b0eab82..5c1ece74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,9 @@ cmake_minimum_required(VERSION 3.4.3) project(OptSched) -option(OPTSCHED_INCLUDE_TESTS "Generate build targets for the OptSched unit tests." ON) -option(OPTSCHED_ENABLE_AMDGPU "Build the AMDGPU code. Requires that the AMDGPU target is supported." OFF) +option(OPTSCHED_INCLUDE_TESTS "Generate build targets for the OptSched unit tests." OFF) +option(OPTSCHED_ENABLE_AMDGPU "Build the AMDGPU code. Requires that the AMDGPU target is supported." 
ON) + set(OPTSCHED_LIT_ARGS "-sv" CACHE STRING "Arguments to pass to lit") set(OPTSCHED_EXTRA_LINK_LIBRARIES "" CACHE STRING "Extra link_libraries to pass to OptSched, ;-separated") set(OPTSCHED_EXTRA_INCLUDE_DIRS "" CACHE STRING "Extra include_directories to pass to OptSched, ;-separated") @@ -11,18 +12,18 @@ set(OPTSCHED_EXTRA_INCLUDE_DIRS "" CACHE STRING "Extra include_directories to pa # '-DOPTSCHED_EXTRA_DEFINITIONS=-DIS_DEBUG_DEFS_AND_USES;-DIS_DEBUG_DEF_USE_COUNT' set(OPTSCHED_EXTRA_DEFINITIONS "" CACHE STRING "Extra add_definitions to pass to OptSched, ;-separated") -if(TARGET LLVMCodeGen) +if(TARGET LLVMAMDGPUCodeGen OR TARGET LLVMCodeGen OR TARGET LLVMX86CodeGen) set(llvm_subproject TRUE) else() set(llvm_subproject FALSE) endif() +# Not supported if(NOT llvm_subproject) set(llvm_version 6.0) if(OPTSCHED_ENABLE_AMDGPU) set(llvm_version 9.0) endif() - set(OPTSCHED_LLVM_VERSION ${llvm_version} CACHE STRING "The LLVM version to build OptSched with (independent build only)") find_package(LLVM ${OPTSCHED_LLVM_VERSION} REQUIRED CONFIG) @@ -36,8 +37,8 @@ endif() if(OPTSCHED_ENABLE_AMDGPU) if(NOT "AMDGPU" IN_LIST LLVM_ALL_TARGETS) message(FATAL_ERROR "Trying to build the AMDGPU code, but AMDGPU is not supported by this build of LLVM") - elseif(LLVM_VERSION VERSION_LESS 7.0) - message(FATAL_ERROR "OptSched requries LLVM version >= 7.0 to build the AMDGPU scheduler.") + elseif(LLVM_VERSION VERSION_LESS 13.0) + message(FATAL_ERROR "OptSched requries LLVM version >= 13.0 to build the AMDGPU scheduler.") endif() endif() @@ -62,10 +63,6 @@ include_directories( add_definitions(${OPTSCHED_EXTRA_DEFINITIONS}) link_directories(${OPTSCHED_EXTRA_LINK_LIBRARIES}) -if(LLVM_VERSION VERSION_LESS 7.0) - add_definitions(-DLLVM_DEBUG=DEBUG) -endif() - if(NOT llvm_subproject) include(GetLocalLLVM) @@ -98,9 +95,5 @@ if(OPTSCHED_INCLUDE_TESTS) COMMAND ${LLVM_TOOLS_BINARY_DIR}/clang ${CMAKE_CURRENT_SOURCE_DIR}/example/helloworld.cpp -O3 - -fplugin=$ - -mllvm -misched=optsched - -mllvm 
-enable-misched - -mllvm -optsched-cfg=${CMAKE_CURRENT_SOURCE_DIR}/example/optsched-cfg ) endif() diff --git a/cmake/superbuild/CMakeLists.txt b/cmake/superbuild/CMakeLists.txt index 99ad7af3..c96a48a5 100644 --- a/cmake/superbuild/CMakeLists.txt +++ b/cmake/superbuild/CMakeLists.txt @@ -13,6 +13,15 @@ # - LLVM_PARALLEL_LINK_JOBS. # - *_EXTRA_CMAKE_ARGS: Passes these CMake arguments on to the corresponding sub-build. # - The flang builds can be configured to use a custom CMAKE_GENERATOR, separate from the superbuild's generator. + + +############################################ +# +# As of 5/26/2022, the superbuild script is +# no longer guaranteed to work. +# +############################################ + cmake_minimum_required(VERSION 3.7) project(OptSched-SuperBuild) @@ -99,8 +108,4 @@ add_test(NAME OptSched-CompileHelloWorld COMMAND ${OPTSCHEDSUPER_LLVM_INSTALL_PREFIX}/bin/clang ${ROOT_DIR}/example/helloworld.cpp -O3 - -fplugin=${OPTSCHEDSUPER_LLVM_INSTALL_PREFIX}/lib/OptSched.so - -mllvm -misched=optsched - -mllvm -enable-misched - -mllvm -optsched-cfg=${ROOT_DIR}/example/optsched-cfg ) diff --git a/example/optsched-cfg/hotfuncs.ini b/example/optsched-cfg/hotfuncs.ini index f1252adc..f15bfcae 100644 --- a/example/optsched-cfg/hotfuncs.ini +++ b/example/optsched-cfg/hotfuncs.ini @@ -211,3 +211,129 @@ module_big_step_utilities_em_calc_cq_ YES mgau_eval YES vector_gautbl_eval_logs3 YES subvq_mgau_shortlist YES + +# ====================================== +# SPEC CPU2017 (fp rate only) +# ====================================== + +#503.bwaves_r Total 97.51% (95.88% selected) +mat_times_vec_ YES #68.22% +bi_cgstab_block_ YES #12.60% +shell_ YES #10.81% +jacobian_ YES # 4.25% +#flux_ YES # 1.63% + +#507.cactuBSSN_r Total 90.09% +_ZL16ML_BSSN_RHS_BodyPK4_cGHiiPKdS3_S3_PKiS5_iPKPd YES #35.83% +_ZL19ML_BSSN_Advect_BodyPK4_cGHiiPKdS3_S3_PKiS5_iPKPd YES #30.82% +_ZL24ML_BSSN_constraints_BodyPK4_cGHiiPKdS3_S3_PKiS5_iPKPd YES # 8.90% 
+_ZL41ML_BSSN_convertToADMBaseDtLapseShift_BodyPK4_cGHiiPKdS3_S3_PKiS5_iPKPd YES # 8.45% +MoL_LinearCombination YES # 3.27% +_ZL29ML_BSSN_convertToADMBase_BodyPK4_cGHiiPKdS3_S3_PKiS5_iPKPd YES # 2.82% + +#508.namd_r Total 99.34% +_Z22pairlist_from_pairlistddddPK8CompAtomPKtiPtdPd YES #18.81% +_ZN20ComputeNonbondedUtil26calc_pair_energy_fullelectEP9nonbonded YES #13.12% +_ZN20ComputeNonbondedUtil19calc_pair_fullelectEP9nonbonded YES # 9.52% +_ZN20ComputeNonbondedUtil16calc_pair_energyEP9nonbonded YES # 9.35% +_ZN20ComputeNonbondedUtil32calc_pair_energy_merge_fullelectEP9nonbonded YES # 9.11% +_ZN20ComputeNonbondedUtil25calc_pair_merge_fullelectEP9nonbonded YES # 7.00% +_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded YES # 6.98% +_ZN20ComputeNonbondedUtil26calc_self_energy_fullelectEP9nonbonded YES # 5.78% +_ZN20ComputeNonbondedUtil32calc_self_energy_merge_fullelectEP9nonbonded YES # 4.80% +_ZN20ComputeNonbondedUtil16calc_self_energyEP9nonbonded YES # 4.73% +_ZN20ComputeNonbondedUtil19calc_self_fullelectEP9nonbonded YES # 4.11% +_ZN20ComputeNonbondedUtil9calc_selfEP9nonbonded YES # 3.02% +_ZN20ComputeNonbondedUtil25calc_self_merge_fullelectEP9nonbonded YES # 3.01% + +#510.parest_r Total 85.12% (83.38% selected) +_ZNK6dealii9SparseILUIdE5vmultIdEEvRNS_6VectorIT_EERKS5_ YES #29.73% +_ZNK6dealii12SparseMatrixIdE5vmultINS_6VectorIdEES4_EEvRT_RKT0_ YES #25.33% +_ZNK6dealii6VectorIdEmlIdEEdRKNS0_IT_EE YES #13.83% +_ZNK6dealii12SparseMatrixIdE17precondition_SSORIdEEvRNS_6VectorIT_EERKS5_dRKSt6vectorIjSaIjEE YES # 5.94% +_ZN6dealii11SolverGMRESINS_6VectorIdEEE5solveINS_12SparseMatrixIdEENS_9SparseILUIdEEEEvRKT_RS2_RKS2_RKT0_ YES # 3.79% +_ZN6dealii8FESystemILi3ELi3EE10initializeEv YES # 2.66% +_ZN12METomography5Slave5SlaveILi3EE12GlobalMatrix15assemble_matrixERKN6dealii18TriaActiveIteratorINS4_15DoFCellAccessorINS4_10DoFHandlerILi3ELi3EEEEEEERNS0_8internal13AssemblerDataILi3EEE YES # 2.10% +#_ZNK6dealii15SparsityPatternclEjj YES # 1.74% + +#511.povray_r Total 82.24% (78.66% 
selected) +_ZN3povL23All_Plane_IntersectionsEPNS_13Object_StructEPNS_10Ray_StructEPNS_13istack_structE YES #16.55% +_ZN3povL31All_CSG_Intersect_IntersectionsEPNS_13Object_StructEPNS_10Ray_StructEPNS_13istack_structE YES #10.95% +_ZN3povL24All_Sphere_IntersectionsEPNS_13Object_StructEPNS_10Ray_StructEPNS_13istack_structE YES #10.72% +_ZN3pov17Check_And_EnqueueEPNS_21Priority_Queue_StructEPNS_16BBox_Tree_StructEPNS_19Bounding_Box_StructEPNS_14Rayinfo_StructE YES # 8.58% +_ZN3povL12Inside_PlaneEPdPNS_13Object_StructE YES # 4.83% +_ZN3pov12Ray_In_BoundEPNS_10Ray_StructEPNS_13Object_StructE YES # 4.55% +_ZN3pov19Intersect_BBox_TreeEPNS_16BBox_Tree_StructEPNS_10Ray_StructEPNS_10istk_entryEPPNS_13Object_StructEb YES # 4.09% +_ZN3pov6DNoiseEPdS0_ YES # 4.07% +_ZN3povL25All_Quadric_IntersectionsEPNS_13Object_StructEPNS_10Ray_StructEPNS_13istack_structE YES # 3.78% +_ZN3povL14Inside_QuadricEPdPNS_13Object_StructE YES # 2.93% +_ZN3pov13Inside_ObjectEPdPNS_13Object_StructE YES # 2.90% +_ZN3pov20Intersect_Light_TreeEPNS_10Ray_StructEPNS_24Project_Tree_Node_StructEiiPNS_10istk_entryEPPNS_13Object_StructEPNS_19Light_Source_StructE YES # 2.46% +_ZN3pov5NoiseEPdPNS_14Pattern_StructE YES # 2.25% +#_ZN3pov18MInvTransDirectionEPdS0_PNS_16Transform_StructE YES # 1.83% +#_ZN3pov12IntersectionEPNS_10istk_entryEPNS_13Object_StructEPNS_10Ray_StructE YES # 1.75% + +#519.lbm_r +LBM_performStreamCollideTRT YES #99.04% + +#526.blender_r Total 91.73% (84.1% selected) +_ZL9intersectILi1024EEiP8VBVHTreeP5Isect YES #61.79% +RE_rayobject_intersect YES #14.34% +add_radiance YES # 3.95% +ray_ao YES # 2.50% +#zbuffer_sss YES # 1.72% +#traverse_octree YES # 1.72% +#zbuffer_solid YES # 1.57% +#zbuf_part_project YES # 1.52% +#ray_shadow YES # 1.42% +#RE_rayobject_raycast YES # 1.20% + +#527.cam4_r Total 25.90% (excluding libraries) 47.35% (including libraries) +#__fsd_pow_fma3 YES # 8.05% #From libpgmath.so(runtime shared library) +aer_rad_props_aer_rad_props_sw_ YES # 5.66% +#__fsd_exp_fma3 YES # 5.38% 
#From libpgmath.so(runtime shared library) +radsw_radcswmx_ YES # 5.14% +radae_radabs_ YES # 3.42% +zm_conv_ientropy_ YES # 3.28% +#__fd_log_1_avx512 YES # 3.25% #From libpgmath.so(runtime shared library) +radsw_raddedmx_ YES # 3.16% +tracer_data_vert_interp_ YES # 3.10% +#__memset_avx2_unaligned_erms YES # 2.64% #From standard library +radae_trcab_ YES # 2.14% +#__memcmp_avx2_movbe YES # 2.13% #From standard library + +#538.imagick_r Total 96.98% +MorphologyApply YES #45.04% +MeanShiftImage YES #21.48% +SetPixelCacheNexusPixels YES #16.88% +GetVirtualPixelsFromNexus YES # 9.65% +GetOneCacheViewVirtualPixel YES # 3.93% + +#544.nab_r Total 89.04% (excluding libraries) 98.63 (including libraries) +mme34 YES #66.21% +nbond YES # 8.45% +searchkdtree YES # 7.24% +heapsort_pairs YES # 7.14% +#__ieee754_log_fma YES # 4.21% +#__ieee754_exp_fma YES # 3.05% +#exp@@GLIBC_2.29 YES # 2.33% + +#549.fotonik3d_r Total 99.18% +upml_mod_upml_updatee_simple_ YES #26.47% +upml_mod_upml_updateh_ YES #24.04% +material_mod_mat_updatee_ YES #21.26% +update_mod_updateh_ YES #17.24% +power_mod_power_dft_ YES #10.17% + +#554.roms_r.txt Total 80.63% (excluding library) 84.62%(including library) +step2d_mod_step2d_tile_ YES #27.67% +pre_step3d_mod_pre_step3d_tile_ YES #10.74% +lmd_skpp_mod_lmd_skpp_tile_ YES # 7.19% +step3d_t_mod_step3d_t_tile_ YES # 6.54% +rhs3d_mod_rhs3d_tile_ YES # 6.14% +t3dmix_mod_t3dmix2_tile_ YES # 6.05% +step3d_uv_mod_step3d_uv_tile_ YES # 5.93% +#__fsd_exp_fma3 YES # 3.99% #From libpgmath.so(runtime shared library) +rho_eos_mod_rho_eos_tile_ YES # 3.73% +prsgrd_mod_prsgrd_tile_ YES # 3.62% +uv3dmix_mod_uv3dmix2_tile_ YES # 3.02% \ No newline at end of file diff --git a/example/optsched-cfg/occupancy_limits.ini b/example/optsched-cfg/occupancy_limits.ini new file mode 100644 index 00000000..ba5dc580 --- /dev/null +++ b/example/optsched-cfg/occupancy_limits.ini @@ -0,0 +1,248 @@ 
+_ZN7rocprim6detail23histogram_global_kernelILj256ELj8ELj1ELj1EPtjNS0_18sample_to_bin_evenIivEEEEvT3_jjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 6 +_ZN7rocprim6detail23histogram_global_kernelILj256ELj8ELj1ELj1EPijNS0_18sample_to_bin_evenIivEEEEvT3_jjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 5 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EE11custom_typeIddELj256ELj4ELj100EEvPKT0_PS6_ 6 +_ZN7rocprim6detail23histogram_global_kernelILj256ELj2ELj4ELj3EPtjNS0_18sample_to_bin_evenIivEEEEvT3_jjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 4 +_ZN7rocprim6detail23histogram_global_kernelILj256ELj8ELj1ELj1EP6__halfjNS0_18sample_to_bin_evenIivEEEEvT3_jjNS0_11fixed_arrayIPT4_XT2_EEENS7_IT5_XT2_EEENS7_IjXT2_EEE 9 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj256ELj11ELj100EEvPKT0_PS4_ 7 +_ZN7rocprim6detail23histogram_global_kernelILj256ELj8ELj1ELj1EPhjNS0_18sample_to_bin_evenIivEEEEvT3_jjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 4 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE1ELb1ENS0_21default_select_configILj0ENS_5tupleIJjjEEEEENS_12zip_iteratorINS4_IJPjS8_EEEEEPNS_10empty_typeESA_S8_ZNS_34run_length_encode_non_trivial_runsINS_14default_configEPaS8_S8_S8_EE10hipError_tPvRmT0_jT1_T2_T3_P12ihipStream_tbEUlRKS5_E_SB_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvSL_SM_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 6 +_ZN7rocprim6detail23histogram_global_kernelILj256ELj8ELj1ELj1EPajNS0_18sample_to_bin_evenIivEEEEvT3_jjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 5 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE1ELb1ENS0_21default_select_configILj0ENS_5tupleIJjjEEEEENS_12zip_iteratorINS4_IJPjS8_EEEEEPNS_10empty_typeESA_S8_ZNS_34run_length_encode_non_trivial_runsINS_14default_configEPhS8_S8_S8_EE10hipError_tPvRmT0_jT1_T2_T3_P12ihipStream_tbEUlRKS5_E_SB_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvSL_SM_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 9 
+_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj256ELj8ELj256ELj100EEvPKT0_PS4_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj4ELj100EEvPKT0_PS4_ 7 +_ZN7rocprim6detail34scan_and_scatter_carry_outs_kernelILj256ELj2EdPdNS_4plusIdEEEEvPKNS0_9carry_outIT1_EEPKS6_T2_T3_j 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj1ELj100EEvPKT0_PS4_ 10 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj256ELj16ELj256ELj100EEvPKT0_PS4_ 7 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEfLj256ELj1ELj100EEvPKT0_PS4_ 9 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj10ELj4ELb0EPaS2_PNS_10empty_typeES4_EEvT3_T4_T5_T6_jPKjSA_jjjj 9 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj256ELj4ELj256ELj100EEvPKT0_PS4_ 6 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj10ELj4ELb0EPhS2_PNS_10empty_typeES4_EEvT3_T4_T5_T6_jPKjSA_jjjj 7 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj320ELj16ELj320ELj100EEvPKT0_PS4_ 7 +_Z17sort_pairs_kernelIxLj512ELj8ELj10EEvPKT_PS0_ 5 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE2ELb1ENS0_21default_select_configILj0EfEEPfPNS_10empty_typeES5_PjS6_NS0_18inequality_wrapperINS_8equal_toIfEEEENS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 5 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE2ELb1ENS0_21default_select_configILj0EiEEPiPNS_10empty_typeES5_PjS6_NS0_18inequality_wrapperINS_8equal_toIiEEEENS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 4 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj256ELj3ELj256ELj100EEvPKT0_PS4_ 6 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj320ELj4ELj320ELj100EEvPKT0_PS4_ 10 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE15HIP_vector_typeIdLj2EELj256ELj1ELj100EEvPKT0_PS6_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEfLj256ELj4ELj100EEvPKT0_PS4_ 4 
+_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE11custom_typeIddELj256ELj1ELj100EEvPKT0_PS6_ 6 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEyLj320ELj8ELj320ELj100EEvPKT0_PS4_ 9 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE1ELb1ENS0_21default_select_configILj0EiEEPiPNS_10empty_typeES5_PjZ22run_selectop_benchmarkIiEvRN9benchmark5StateEmP12ihipStream_tfEUlRKiE_S6_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE11custom_typeIffELj256ELj1ELj100EEvPKT0_PS6_ 5 +_ZN7rocprim6detail21segmented_sort_kernelINS0_35default_segmented_radix_sort_configILj0Ei11custom_typeIffEEELb0EPiS6_PS4_S7_S6_EEvT1_PNSt15iterator_traitsIS8_E10value_typeET2_T3_PNS9_ISE_E10value_typeET4_bT5_SJ_jjjj 9 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEiLj320ELj16ELj320ELj100EEvPKT0_PS4_ 9 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj8ELj100EEvPKT0_PS4_ 10 +_Z17sort_pairs_kernelIxLj256ELj8ELj10EEvPKT_PS0_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj3ELj100EEvPKT0_PS4_ 7 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEfLj256ELj16ELj100EEvPKT0_PS4_ 7 +_Z17sort_pairs_kernelIiLj512ELj8ELj10EEvPKT_PS0_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE6__halfLj256ELj1ELj100EEvPKT0_PS5_ 9 +_ZN7rocprim6detail23segmented_reduce_kernelINS0_21default_reduce_configILj0E11custom_typeIffEEEPS4_S6_PiS4_NS_4plusIS4_EEEEvT0_T1_T2_SC_T4_T3_ 10 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj2ELj100EEvPKT0_PS4_ 6 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE1ELb1ENS0_21default_select_configILj0EfEEPfPNS_10empty_typeES5_PjZ22run_selectop_benchmarkIfEvRN9benchmark5StateEmP12ihipStream_tfEUlRKfE_S6_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 5 
+_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj10ELj5ELb0EPsS2_PNS_10empty_typeES4_EEvT3_T4_T5_T6_jPKjSA_jjjj 4 +_Z16operation_kernelIiLj128ELj32EL23memory_operation_method0E9operationIL16kernel_operation1EiLj32ELj128EELi0EEvPT_S5_T3_ 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE6__halfLj256ELj3ELj100EEvPKT0_PS5_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE15HIP_vector_typeIdLj2EELj256ELj4ELj100EEvPKT0_PS6_ 10 +_Z6kernelI10flag_tailsiLj256ELj4ELb0ELj100EEvPKT0_PS1_ 6 +_Z26warp_inclusive_scan_kernelI11custom_typeIidELj15ELj100EEvPKT_PS2_ 7 +_Z16operation_kernelIiLj256ELj16EL23memory_operation_method0E9operationIL16kernel_operation1EiLj16ELj256EELi0EEvPT_S5_T3_ 4 +_Z16operation_kernelImLj256ELj4EL23memory_operation_method2E9operationIL16kernel_operation1EmLj4ELj256EELi0EEvPT_S5_T3_ 5 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EE15HIP_vector_typeIdLj2EELj256ELj4ELj100EEvPKT0_PS6_ 6 +_Z16operation_kernelImLj512ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EmLj2ELj512EELi0EEvPT_S5_T3_ 10 +_Z6kernelI20flag_heads_and_tailsxLj256ELj2ELb1ELj100EEvPKT0_PS1_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE11custom_typeIffELj256ELj4ELj100EEvPKT0_PS6_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj64ELj4ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelImLj1024ELj8EL23memory_operation_method1E9operationIL16kernel_operation0EmLj8ELj1024EELi0EEvPT_S5_T3_ 4 +_Z6kernelI10flag_headsiLj256ELj2ELb1ELj100EEvPKT0_PS1_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj64ELj2ELj100EEvPKT0_PS4_ 10 +_Z6kernelI20flag_heads_and_tailsiLj256ELj4ELb0ELj100EEvPKT0_PS1_ 6 +_Z17sort_pairs_kernelIiLj320ELj8ELj10EEvPKT_PS0_ 9 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EEdLj64ELj4ELj100EEvPKT0_PS4_ 10 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj64ELj1ELj100EEvPKT0_PS4_ 8 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EEdLj64ELj2ELj100EEvPKT0_PS4_ 4 
+_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb1ENS0_21default_select_configILj0EhEEPhS5_S5_PjNS_10empty_typeES7_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EEdLj64ELj1ELj100EEvPKT0_PS4_ 5 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb1ENS0_21default_select_configILj0EaEEPaS5_S5_PjNS_10empty_typeES7_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 6 +_Z16operation_kernelIfLj256ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EfLj2ELj256EELi0EEvPT_S5_T3_ 9 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE0EEfLj256ELj4ELj100EEvPKT0_PS4_ 4 +_Z17sort_pairs_kernelIiLj320ELj10EEvPKT_PS0_ 8 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE11custom_typeIffELj256ELj1ELj100EEvPKT0_PS6_ 7 +_Z16operation_kernelIfLj512ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EfLj2ELj512EELi0EEvPT_S5_T3_ 6 +_ZN7rocprim6detail34scan_and_scatter_carry_outs_kernelILj256ELj2E11custom_typeIffEPS3_NS_4plusIS3_EEEEvPKNS0_9carry_outIT1_EEPKS8_T2_T3_j 7 +_Z16operation_kernelIdLj128ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EdLj4ELj128EELi0EEvPT_S5_T3_ 9 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb0ENS0_21default_select_configILj0EaEEPaS5_S5_PjNS_10empty_typeES7_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 5 +_Z17sort_pairs_kernelI6__halfLj256ELj8ELj10EEvPKT_PS1_ 10 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb0ENS0_21default_select_configILj0EhEEPhS5_S5_PjNS_10empty_typeES7_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 7 +_Z6kernelI10flag_headsiLj256ELj4ELb1ELj100EEvPKT0_PS1_ 6 +_Z16operation_kernelIdLj128ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EdLj2ELj128EELi0EEvPT_S5_T3_ 9 
+_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE11custom_typeIddELj256ELj4ELj100EEvPKT0_PS6_ 10 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj10ELj5ELb0EP6__halfS3_S3_S3_EEvT3_T4_T5_T6_jPKjS9_jjjj 4 +_Z6kernelI10flag_tailsiLj256ELj4ELb1ELj100EEvPKT0_PS1_ 4 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb1ENS0_21default_select_configILj0EdEEPdPhS5_PjNS_10empty_typeES8_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE15HIP_vector_typeIdLj2EELj256ELj1ELj100EEvPKT0_PS6_ 6 +_Z17sort_pairs_kernelIhLj320ELj8ELj10EEvPKT_PS0_ 9 +_ZN7rocprim6detail12merge_kernelILj256ELj10EPjP6__halfS4_S4_S4_S4_S4_9half_lessEEvT1_T2_T3_T4_T5_T6_T7_mmT8_ 10 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE15HIP_vector_typeIfLj2EELj256ELj4ELj100EEvPKT0_PS6_ 10 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE6__halfLj256ELj11ELj100EEvPKT0_PS5_ 5 +_Z16operation_kernelIiLj256ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EiLj2ELj256EELi0EEvPT_S5_T3_ 4 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EE15HIP_vector_typeIdLj2EELj256ELj1ELj100EEvPKT0_PS6_ 5 +_ZN7rocprim6detail23histogram_shared_kernelILj256ELj2ELj4ELj3EPhjNS0_18sample_to_bin_evenIivEEEEvT3_jjjjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 6 +_Z17sort_pairs_kernelIaLj320ELj8ELj10EEvPKT_PS0_ 9 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj15ELj6ELb0EPiS2_PfS3_EEvT3_T4_T5_T6_jPKjS9_jjjj 7 +_Z16operation_kernelIfLj256ELj16EL23memory_operation_method0E9operationIL16kernel_operation1EfLj16ELj256EELi0EEvPT_S5_T3_ 4 +_Z18warp_reduce_kernelILb0EiLj37ELj100EEvPKT0_PS0_ 7 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EE15HIP_vector_typeIdLj2EELj256ELj1ELj100EEvPKT0_PS6_ 6 +_Z17sort_pairs_kernelIxLj512ELj10EEvPKT_PS0_ 10 +_Z6kernelI9histogramILN7rocprim25block_histogram_algorithmE1EEiLj320ELj8ELj320ELj100EEvPKT0_PS4_ 8 
+_Z6kernelI10flag_tailsiLj256ELj2ELb1ELj100EEvPKT0_PS1_ 6 +_Z16operation_kernelIdLj512ELj2EL23memory_operation_method0E9operationIL16kernel_operation2EdLj2ELj512EELi0EEvPT_S5_T3_ 5 +_Z6kernelI10flag_headsiLj256ELj3ELb1ELj100EEvPKT0_PS1_ 6 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE1ELb0ENS0_21default_select_configILj0EfEEPfPNS_10empty_typeES5_PjZ16run_if_benchmarkIfEvRN9benchmark5StateEmP12ihipStream_tfEUlRKfE_S6_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 5 +_Z16operation_kernelIdLj128ELj16EL23memory_operation_method0E9operationIL16kernel_operation1EdLj16ELj128EELi0EEvPT_S5_T3_ 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE6__halfLj256ELj16ELj100EEvPKT0_PS5_ 5 +_Z16operation_kernelIiLj256ELj1EL23memory_operation_method0E9operationIL16kernel_operation1EiLj1ELj256EELi0EEvPT_S5_T3_ 4 +_Z16operation_kernelIfLj512ELj8EL23memory_operation_method0E9operationIL16kernel_operation1EfLj8ELj512EELi0EEvPT_S5_T3_ 9 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEfLj256ELj11ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelIiLj256ELj8EL23memory_operation_method0E9operationIL16kernel_operation1EiLj8ELj256EELi0EEvPT_S5_T3_ 7 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE1ELb1ENS0_21default_select_configILj0ENS_5tupleIJjjEEEEENS_12zip_iteratorINS4_IJPjS8_EEEEEPNS_10empty_typeESA_S8_ZNS_34run_length_encode_non_trivial_runsINS_14default_configEPxS8_S8_S8_EE10hipError_tPvRmT0_jT1_T2_T3_P12ihipStream_tbEUlRKS5_E_SB_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvSL_SM_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 6 +_Z16operation_kernelIdLj128ELj8EL23memory_operation_method0E9operationIL16kernel_operation1EdLj8ELj128EELi0EEvPT_S5_T3_ 10 +_Z16operation_kernelIfLj256ELj8EL23memory_operation_method0E9operationIL16kernel_operation1EfLj8ELj256EELi0EEvPT_S5_T3_ 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE0EEaLj256ELj11ELj100EEvPKT0_PS4_ 8 
+_Z16operation_kernelIiLj1024ELj2EL23memory_operation_method0E9operationIL16kernel_operation4EiLj2ELj1024EELi0EEvPT_S5_T3_ 4 +_Z16operation_kernelIiLj128ELj8EL23memory_operation_method2E9operationIL16kernel_operation1EiLj8ELj128EELi0EEvPT_S5_T3_ 6 +_Z6kernelI20flag_heads_and_tailsiLj256ELj3ELb1ELj100EEvPKT0_PS1_ 6 +_Z6kernelI10flag_tailsiLj256ELj3ELb1ELj100EEvPKT0_PS1_ 9 +_Z16operation_kernelImLj1024ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EmLj2ELj1024EELi0EEvPT_S5_T3_ 6 +_Z6kernelI10flag_tails6__halfLj256ELj1ELb0ELj100EEvPKT0_PS2_ 6 +_ZN7rocprim6detail23histogram_shared_kernelILj256ELj2ELj4ELj3EPtjNS0_18sample_to_bin_evenIivEEEEvT3_jjjjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 7 +_Z16operation_kernelIiLj1024ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EiLj4ELj1024EELi0EEvPT_S5_T3_ 6 +_Z17sort_pairs_kernelIiLj64ELj3ELj10EEvPKT_PS0_ 4 +_Z17sort_pairs_kernelIxLj320ELj3ELj10EEvPKT_PS0_ 4 +_Z6kernelI10flag_heads6__halfLj256ELj1ELb0ELj100EEvPKT0_PS2_ 6 +_Z16operation_kernelIfLj1024ELj1EL23memory_operation_method0E9operationIL16kernel_operation1EfLj1ELj1024EELi0EEvPT_S5_T3_ 6 +_ZN7rocprim6detail18block_merge_kernelILj1024EP11custom_typeIiiES4_PS2_IcdES6_NS_4lessIS3_EEEEvT0_T1_T2_T3_mjT4_ 6 +_Z16operation_kernelImLj1024ELj1EL23memory_operation_method2E9operationIL16kernel_operation1EmLj1ELj1024EELi0EEvPT_S5_T3_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE11custom_typeIddELj256ELj1ELj100EEvPKT0_PS6_ 8 +_ZN7rocprim6detail34scan_and_scatter_carry_outs_kernelILj256ELj2EhPhNS_4plusIhEEEEvPKNS0_9carry_outIT1_EEPKS6_T2_T3_j 9 +_Z6kernelI20flag_heads_and_tailsiLj256ELj4ELb1ELj100EEvPKT0_PS1_ 6 +_Z6kernelI10flag_tails6__halfLj256ELj4ELb1ELj100EEvPKT0_PS2_ 4 +_ZN7rocprim6detail34scan_and_scatter_carry_outs_kernelILj256ELj2EaPaNS_4plusIaEEEEvPKNS0_9carry_outIT1_EEPKS6_T2_T3_j 4 +_Z16operation_kernelIiLj512ELj1EL23memory_operation_method0E9operationIL16kernel_operation1EiLj1ELj512EELi0EEvPT_S5_T3_ 4 
+_Z16operation_kernelIdLj256ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EdLj2ELj256EELi0EEvPT_S5_T3_ 9 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj10ELj5ELb0EP6__halfS3_PNS_10empty_typeES5_EEvT3_T4_T5_T6_jPKjSB_jjjj 7 +_Z16operation_kernelIfLj256ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EfLj2ELj256EELi0EEvPT_S5_T3_ 5 +_Z16operation_kernelIiLj1024ELj1EL23memory_operation_method0E9operationIL16kernel_operation1EiLj1ELj1024EELi0EEvPT_S5_T3_ 4 +_Z16operation_kernelIdLj256ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EdLj4ELj256EELi0EEvPT_S5_T3_ 7 +_Z16operation_kernelIdLj512ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EdLj2ELj512EELi0EEvPT_S5_T3_ 5 +_Z16operation_kernelIiLj512ELj8EL23memory_operation_method0E9operationIL16kernel_operation1EiLj8ELj512EELi0EEvPT_S5_T3_ 7 +_Z16operation_kernelIiLj1024ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EiLj2ELj1024EELi0EEvPT_S5_T3_ 5 +_Z16operation_kernelIdLj512ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EdLj4ELj512EELi0EEvPT_S5_T3_ 4 +_Z26warp_inclusive_scan_kernelI11custom_typeIddELj15ELj100EEvPKT_PS2_ 6 +_Z16operation_kernelIiLj256ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EiLj4ELj256EELi0EEvPT_S5_T3_ 6 +_Z16operation_kernelIdLj256ELj8EL23memory_operation_method0E9operationIL16kernel_operation1EdLj8ELj256EELi0EEvPT_S5_T3_ 9 +_Z16operation_kernelIdLj1024ELj2EL23memory_operation_method0E9operationIL16kernel_operation2EdLj2ELj1024EELi0EEvPT_S5_T3_ 6 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE0EEfLj256ELj11ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelIiLj512ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EiLj4ELj512EELi0EEvPT_S5_T3_ 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE15HIP_vector_typeIdLj2EELj256ELj8ELj100EEvPKT0_PS6_ 6 
+_ZN7rocprim6detail23histogram_shared_kernelILj256ELj8ELj1ELj1EPtjNS0_18sample_to_bin_evenIivEEEEvT3_jjjjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 9 +_Z16operation_kernelIfLj512ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EfLj4ELj512EELi0EEvPT_S5_T3_ 6 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEiLj256ELj11ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelIiLj512ELj8EL23memory_operation_method2E9operationIL16kernel_operation1EiLj8ELj512EELi0EEvPT_S5_T3_ 9 +_Z6kernelI10flag_heads6__halfLj256ELj3ELb1ELj100EEvPKT0_PS2_ 6 +_Z6kernelI10flag_tails6__halfLj256ELj3ELb1ELj100EEvPKT0_PS2_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE11custom_typeIffELj256ELj8ELj100EEvPKT0_PS6_ 6 +_Z16operation_kernelIdLj256ELj1EL23memory_operation_method0E9operationIL16kernel_operation1EdLj1ELj256EELi0EEvPT_S5_T3_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE15HIP_vector_typeIfLj2EELj256ELj8ELj100EEvPKT0_PS6_ 10 +_ZN7rocprim6detail18block_merge_kernelILj1024EPxS2_P11custom_typeIddES5_NS_4lessIxEEEEvT0_T1_T2_T3_mjT4_ 5 +_Z16operation_kernelIiLj256ELj4EL23memory_operation_method2E9operationIL16kernel_operation1EiLj4ELj256EELi0EEvPT_S5_T3_ 9 +_Z16operation_kernelIiLj512ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EiLj2ELj512EELi0EEvPT_S5_T3_ 6 +_Z16operation_kernelIfLj256ELj4EL23memory_operation_method0E9operationIL16kernel_operation1EfLj4ELj256EELi0EEvPT_S5_T3_ 5 +_Z16operation_kernelIiLj1024ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EiLj2ELj1024EELi0EEvPT_S5_T3_ 7 +_Z16operation_kernelIiLj256ELj8EL23memory_operation_method2E9operationIL16kernel_operation1EiLj8ELj256EELi0EEvPT_S5_T3_ 4 +_Z16operation_kernelImLj256ELj4EL23memory_operation_method0E9operationIL16kernel_operation2EmLj4ELj256EELi0EEvPT_S5_T3_ 9 +_Z16operation_kernelIiLj1024ELj4EL23memory_operation_method2E9operationIL16kernel_operation1EiLj4ELj1024EELi0EEvPT_S5_T3_ 7 
+_ZN7rocprim6detail18block_merge_kernelILj1024EP11custom_typeIiiES4_PS2_IddES6_NS_4lessIS3_EEEEvT0_T1_T2_T3_mjT4_ 8 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE0EE6__halfLj64ELj11ELj100EEvPKT0_PS5_ 5 +_Z16operation_kernelIiLj256ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EiLj2ELj256EELi0EEvPT_S5_T3_ 4 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb0ENS0_21default_select_configILj0EfEEPfPhS5_PjNS_10empty_typeES8_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 4 +_ZN7rocprim6detail18block_merge_kernelILj1024EP11custom_typeIiiES4_PS2_IxdES6_NS_4lessIS3_EEEEvT0_T1_T2_T3_mjT4_ 6 +_Z16operation_kernelIiLj512ELj4EL23memory_operation_method2E9operationIL16kernel_operation1EiLj4ELj512EELi0EEvPT_S5_T3_ 10 +_Z17sort_pairs_kernelI6__halfLj320ELj8ELj10EEvPKT_PS1_ 10 +_Z6kernelI10flag_tails6__halfLj256ELj2ELb1ELj100EEvPKT0_PS2_ 10 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEiLj64ELj8ELj100EEvPKT0_PS4_ 7 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE0EEiLj256ELj2ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelIfLj128ELj4EL23memory_operation_method2E9operationIL16kernel_operation1EfLj4ELj128EELi0EEvPT_S5_T3_ 8 +_ZN7rocprim6detail19block_reduce_kernelILb1ENS0_21default_reduce_configILj0EaEEaPaS4_aNS_4plusIaEEEEvT2_mT3_T4_T5_ 4 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE0EEfLj64ELj16ELj100EEvPKT0_PS4_ 9 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb0ENS0_21default_select_configILj0EiEEPiPhS5_PjNS_10empty_typeES8_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 6 +_Z6kernelI20flag_heads_and_tailsxLj256ELj8ELb1ELj100EEvPKT0_PS1_ 6 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj15ELj6ELb0EPxS2_PdS3_EEvT3_T4_T5_T6_jPKjS9_jjjj 6 +_ZN7rocprim6detail19block_reduce_kernelILb1ENS0_21default_reduce_configILj0EhEEhPhS4_hNS_4plusIhEEEEvT2_mT3_T4_T5_ 4 
+_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb0ENS0_21default_select_configILj0EdEEPdPhS5_PjNS_10empty_typeES8_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE11custom_typeIffELj256ELj4ELj100EEvPKT0_PS6_ 6 +_ZN7rocprim6detail18block_merge_kernelILj1024EP11custom_typeIddES4_S4_S4_NS_4lessIS3_EEEEvT0_T1_T2_T3_mjT4_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EEiLj64ELj16ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelImLj512ELj1EL23memory_operation_method0E9operationIL16kernel_operation2EmLj1ELj512EELi0EEvPT_S5_T3_ 6 +_Z16operation_kernelImLj1024ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EmLj2ELj1024EELi0EEvPT_S5_T3_ 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEiLj64ELj2ELj100EEvPKT0_PS4_ 8 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj15ELj6ELb0EPxS2_PfS3_EEvT3_T4_T5_T6_jPKjS9_jjjj 4 +_Z16operation_kernelImLj256ELj16EL23memory_operation_method0E9operationIL16kernel_operation1EmLj16ELj256EELi0EEvPT_S5_T3_ 4 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj2ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj8ELj100EEvPKT0_PS4_ 10 +_Z16operation_kernelIfLj1024ELj4EL23memory_operation_method0E9operationIL16kernel_operation2EfLj4ELj1024EELi0EEvPT_S5_T3_ 6 +_Z26warp_exclusive_scan_kernelI11custom_typeIddELj15ELj100EEvPKT_PS2_S2_ 7 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj1ELj100EEvPKT0_PS4_ 9 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE0EEiLj64ELj11ELj100EEvPKT0_PS4_ 8 +_Z16operation_kernelIfLj1024ELj2EL23memory_operation_method0E9operationIL16kernel_operation1EfLj2ELj1024EELi0EEvPT_S5_T3_ 4 +_ZN7rocprim6detail23histogram_shared_kernelILj256ELj8ELj1ELj1EPfjNS0_19sample_to_bin_rangeIfEEEEvT3_jjjjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 6 
+_ZN7rocprim6detail21segmented_sort_kernelINS0_35default_segmented_radix_sort_configILj0ExdEELb0EPxS4_PdS5_PiEEvT1_PNSt15iterator_traitsIS7_E10value_typeET2_T3_PNS8_ISD_E10value_typeET4_bT5_SI_jjjj 6 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEiLj64ELj11ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE0EEfLj256ELj3ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelIiLj1024ELj4EL23memory_operation_method0E9operationIL16kernel_operation2EiLj4ELj1024EELi0EEvPT_S5_T3_ 6 +_Z6kernelI20flag_heads_and_tailsxLj256ELj8ELb0ELj100EEvPKT0_PS1_ 6 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EE15HIP_vector_typeIfLj2EELj256ELj8ELj100EEvPKT0_PS6_ 7 +_ZN7rocprim6detail23histogram_shared_kernelILj256ELj8ELj1ELj1EPijNS0_18sample_to_bin_evenIivEEEEvT3_jjjjNS0_11fixed_arrayIPT4_XT2_EEENS6_IT5_XT2_EEENS6_IjXT2_EEE 9 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj15ELj6ELb0EPxS2_P11custom_typeIffES5_EEvT3_T4_T5_T6_jPKjSB_jjjj 6 +_Z16operation_kernelImLj1024ELj2EL23memory_operation_method0E9operationIL16kernel_operation2EmLj2ELj1024EELi0EEvPT_S5_T3_ 6 +_Z6kernelI10flag_headsxLj256ELj8ELb0ELj100EEvPKT0_PS1_ 6 +_Z16operation_kernelIdLj256ELj16EL23memory_operation_method0E9operationIL16kernel_operation1EdLj16ELj256EELi0EEvPT_S5_T3_ 4 +_Z16operation_kernelImLj128ELj4EL23memory_operation_method0E9operationIL16kernel_operation2EmLj4ELj128EELi0EEvPT_S5_T3_ 6 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj3ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE0EEiLj64ELj11ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EEiLj64ELj11ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj4ELj100EEvPKT0_PS4_ 6 +_Z16operation_kernelImLj128ELj1EL23memory_operation_method0E9operationIL16kernel_operation2EmLj1ELj128EELi0EEvPT_S5_T3_ 6 
+_Z16operation_kernelIfLj128ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EfLj2ELj128EELi0EEvPT_S5_T3_ 7 +_Z16operation_kernelImLj512ELj2EL23memory_operation_method0E9operationIL16kernel_operation2EmLj2ELj512EELi0EEvPT_S5_T3_ 6 +_Z16operation_kernelIiLj128ELj2EL23memory_operation_method2E9operationIL16kernel_operation1EiLj2ELj128EELi0EEvPT_S5_T3_ 4 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEfLj64ELj3ELj100EEvPKT0_PS4_ 10 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj3ELj100EEvPKT0_PS4_ 9 +_Z16operation_kernelImLj256ELj2EL23memory_operation_method0E9operationIL16kernel_operation2EmLj2ELj256EELi0EEvPT_S5_T3_ 6 +_Z6kernelI10flag_headsxLj256ELj8ELb1ELj100EEvPKT0_PS1_ 4 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj64ELj1ELj100EEvPKT0_PS4_ 9 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EE6__halfLj64ELj11ELj100EEvPKT0_PS5_ 5 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE0EEfLj64ELj3ELj100EEvPKT0_PS4_ 5 +_ZN7rocprim6detail16partition_kernelILNS0_13select_methodE0ELb0ENS0_21default_select_configILj0E6__halfEEPS4_PaS6_PjNS_10empty_typeES9_NS0_19lookback_scan_stateIjLb0ELb1EEEEEvT2_T3_T4_T5_mT6_T7_T8_jNS0_16ordered_block_idIjEE 7 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EE6__halfLj64ELj11ELj100EEvPKT0_PS5_ 5 +_ZN7rocprim6detail23sort_and_scatter_kernelILj256ELj15ELj6ELb0EPxS2_P15HIP_vector_typeIfLj2EES5_EEvT3_T4_T5_T6_jPKjSB_jjjj 4 +_Z16operation_kernelImLj128ELj2EL23memory_operation_method0E9operationIL16kernel_operation2EmLj2ELj128EELi0EEvPT_S5_T3_ 6 +_Z28segmented_warp_reduce_kernelIfhLj37ELj100EEvPKT_PT0_PS0_ 5 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE0EE6__halfLj256ELj11ELj100EEvPKT0_PS5_ 7 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EEiLj256ELj11ELj100EEvPKT0_PS4_ 6 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE0EEfLj64ELj11ELj100EEvPKT0_PS4_ 10 
+_Z26warp_inclusive_scan_kernelI11custom_typeIddELj31ELj100EEvPKT_PS2_ 6 +_Z16operation_kernelIdLj512ELj4EL23memory_operation_method0E9operationIL16kernel_operation2EdLj4ELj512EELi0EEvPT_S5_T3_ 6 +_Z26warp_exclusive_scan_kernelI11custom_typeIddELj31ELj100EEvPKT_PS2_S2_ 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE2EEiLj64ELj11ELj100EEvPKT0_PS4_ 7 +_ZN7rocprim6detail23histogram_shared_kernelILj256ELj8ELj1ELj1EP6__halfjNS0_18sample_to_bin_evenIivEEEEvT3_jjjjNS0_11fixed_arrayIPT4_XT2_EEENS7_IT5_XT2_EEENS7_IjXT2_EEE 5 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEiLj64ELj11ELj100EEvPKT0_PS4_ 5 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE1EE6__halfLj256ELj11ELj100EEvPKT0_PS5_ 7 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE1EEfLj256ELj11ELj100EEvPKT0_PS4_ 6 +_Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj11ELj100EEvPKT0_PS4_ 10 +_Z6kernelI14exclusive_scanILN7rocprim20block_scan_algorithmE0EEiLj256ELj11ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE0EEiLj256ELj11ELj100EEvPKT0_PS4_ 9 +_Z6kernelI14inclusive_scanILN7rocprim20block_scan_algorithmE0EEfLj256ELj11ELj100EEvPKT0_PS4_ 6 +_ZN7rocprim6detail23segmented_reduce_kernelINS0_21default_reduce_configILj0EdEEPdS4_PidNS_4plusIdEEEEvT0_T1_T2_SA_T4_T3_ 7 +_ZN7rocprim6detail23segmented_reduce_kernelINS0_21default_reduce_configILj0EiEEPiS4_S4_iNS_4plusIiEEEEvT0_T1_T2_S9_T4_T3_ 5 +_ZN7rocprim6detail23segmented_reduce_kernelINS0_21default_reduce_configILj0EfEEPfS4_PifNS_4plusIfEEEEvT0_T1_T2_SA_T4_T3_ 5 diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini index ed7e87cb..c3f439e2 100644 --- a/example/optsched-cfg/sched.ini +++ b/example/optsched-cfg/sched.ini @@ -12,7 +12,8 @@ PRINT_SPILL_COUNTS YES # First pass minimizes RP and second pass tries to balances RP and ILP. # YES # NO -USE_TWO_PASS NO +USE_TWO_PASS YES + # Order of passes to run delimited by a comma with no space. 
Only enabled with # USE_TWO_PASS YES. Note that if both the regular ILP Pass and the ILP Reduced @@ -40,6 +41,7 @@ LATENCY_MINIMUM 1 # only the sequential list scheduler will be enabled and enumerator disabled. COMPILE_TIME_DATA_PASS NO + # These 3 flags control which schedulers will be used. # Each one can be individually toggled. The heuristic # list scheduler or ACO must be run before the @@ -78,14 +80,14 @@ LENGTH_TIMEOUT 5 # A time limit for the whole region in milliseconds. Defaults to no limit. # Only used when two pass scheduling is enabled. # A time limit for the whole region. -FIRST_PASS_REGION_TIMEOUT 5 +FIRST_PASS_REGION_TIMEOUT 1000 # A time limit for each schedule length. FIRST_PASS_LENGTH_TIMEOUT 5 # A time limit for the second pass in milliseconds. # Only used when two pass scheduling is enabled. # A time limit for the whole region. -SECOND_PASS_REGION_TIMEOUT 5 +SECOND_PASS_REGION_TIMEOUT 1000 # A time limit for each schedule length. SECOND_PASS_LENGTH_TIMEOUT 5 @@ -93,7 +95,7 @@ SECOND_PASS_LENGTH_TIMEOUT 5 # INSTR : multiply the time limits in the above fields by the number of # instructions in the block # BLOCK : use the time limits in the above fields as is -TIMEOUT_PER INSTR +TIMEOUT_PER BLOCK # The maximum number of instructions to use the scheduler for. # Beyond this size, the heuristic scheduler is used. @@ -107,16 +109,16 @@ MAX_REGION_LENGTH 2147483647 # NID: node ID # LLVM: LLVM’s default list scheduler order # Example: LUC_CP_NID -HEURISTIC LUC_CP_NID +HEURISTIC NID # The heuristic used for the enumerator. If the two pass scheduling # approach is enabled, then this value will be used for the first pass. # Same valid values as HEURISTIC. -ENUM_HEURISTIC LUC_CP_NID +ENUM_HEURISTIC NID # The heuuristic used for the enumerator in the second pass in the two-pass scheduling approach. # Same valid values as HEURISTIC. -SECOND_PASS_ENUM_HEURISTIC LUC_CP_NID +SECOND_PASS_ENUM_HEURISTIC NID # The spill cost function to be used. 
Valid values are: # PERP: peak excess reg pressure @@ -126,13 +128,15 @@ SECOND_PASS_ENUM_HEURISTIC LUC_CP_NID # SLIL: sum of live interval lengths for each block # SPILLS: number of spills after running a register allocator (doesn't work with enumerator) # TARGET: use target specific register pressure tracking -SPILL_COST_FUNCTION PERP +SPILL_COST_FUNCTION TARGET + # The spill cost function to be used in the second pass. # All values for SPILL_COST_FUNCTION are valid here. # The value SAME causes the second pass SCF to be the same as the first pass SCF SECOND_PASS_SCF SAME + # The weight of the spill cost in the objective function. This factor # defines the importance of spill cost relative to schedule length. A good # value for this factor should be found experimentally, but is is expected @@ -140,18 +144,17 @@ SECOND_PASS_SCF SAME # making spill cost minimization the primary objective) and smaller on # architectures with in-order execution like SPARC (thus making scheduling # the primary objective). -SPILL_COST_WEIGHT 10000 +SPILL_COST_WEIGHT 1000000 # Precision of latency info: # PRECISE: use precise latencies from the machine_model.cfg file # LLVM: use latencies from LLVM # UNIT: use unit latencies -LATENCY_PRECISION LLVM +LATENCY_PRECISION PRECISE # The scheduler used to find an initial feasible schedule. 
# LIST: List scheduler # SEQ: Sequential list scheduler -# STALLING_LIST: Schedules stalls until instruction with top heuristic score becomes ready HEUR_SCHED_TYPE LIST # What circumstances the ACO dual cost algorithm should be applied @@ -169,6 +172,7 @@ ACO_DUAL_COST_FN_ENABLE OFF ACO_DUAL_COST_FN SLIL ACO2P_DUAL_COST_FN NONE + #use 3-tournament ACO_TOURNAMENT NO @@ -177,7 +181,6 @@ ACO_USE_FIXED_BIAS YES #Fixed number of evaporation ACO_FIXED_BIAS 20 -ACO2P_FIXED_BIAS 20 # 0 to 1, ratio that will use bias ACO_BIAS_RATIO 0.9 @@ -202,12 +205,10 @@ ACO_TRACE NO #ACO_DBG_REGIONS kernel_c2_sdk_0:1|other_region|even_more_regions| ACO_DBG_REGIONS NONE -ACO_DBG_REGIONS_OUT_PATH /home/user/path_to_graph_output_directory/ +ACO_DBG_REGIONS_OUT_PATH ~/path_to_graph_output_directory/ -# Previously the heuristic was raised to the power of the heuristic importance, -# but this has proved to not be useful, and added an expensive pow operation. -# Now a heuristic importance of 0 disables the heuristic. Any other value leaves the -# heuristic enabled. +# The importance of the heuristic in ACO. ACO uses (1/heuristic)^importance, so +# importance of 0 means don't use the heuristic. ACO_HEURISTIC_IMPORTANCE 1 ACO2P_HEURISTIC_IMPORTANCE 1 @@ -241,7 +242,7 @@ BLOCKS_TO_KEEP ALL # REGIONS_TO_SCHEDULE, which is a comma-separated list of # scheduling regions. SCHEDULE_SPECIFIC_REGIONS NO -REGIONS_TO_SCHEDULE fft1D_512:114 +REGIONS_TO_SCHEDULE _Z6kernelI6reduceILN7rocprim22block_reduce_algorithmE1EEdLj256ELj11ELj100EEvPKT0_PS4_:0 # Whether to use suffix concatenation. Disabled automatically if # history domination is disabled. @@ -326,9 +327,23 @@ TREAT_ORDER_DEPS_AS_DATA_DEPS NO # The number of bits in the hash table used in history-based domination. HIST_TABLE_HASH_BITS 16 +TIMEOUT_PER_MEMBLOCK_RATIO 100 + # Whether to dump the DDG for all the regions we schedule. # This is a debugging option. 
DUMP_DDGS NO # Where to dump the DDGs # DDG_DUMP_PATH ~/ddgs + +# Whether or not we want to limit our occupancy to a target below the theoretical max +SHOULD_LIMIT_OCCUPANCY YES + +# What should we use to limit the occupancy? +# HEURISTIC: AMD's memory boundedness heuristic +# FILE: occupancy_limits.ini file +# NONE: do not use limits -- defaults to unlimited +OCCUPANCY_LIMIT_SOURCE FILE + +# The value of occupancy (1-10) at which we no longer try to optimize RP +OCCUPANCY_LIMIT 4 diff --git a/include/opt-sched/Scheduler/OptSchedTarget.h b/include/opt-sched/Scheduler/OptSchedTarget.h index c7e83c7d..28cc4b29 100644 --- a/include/opt-sched/Scheduler/OptSchedTarget.h +++ b/include/opt-sched/Scheduler/OptSchedTarget.h @@ -9,6 +9,7 @@ #define LLVM_OPT_SCHED_TARGET_H #include "opt-sched/Scheduler/OptSchedDDGWrapperBase.h" +#include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/machine_model.h" @@ -36,7 +37,8 @@ class OptSchedTarget { OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, const std::string &RegionID) = 0; - virtual void initRegion(ScheduleDAGInstrs *DAG, MachineModel *MM) = 0; + virtual void initRegion(ScheduleDAGInstrs *DAG, MachineModel *MM, + Config &OccFile) = 0; virtual void finalizeRegion(const InstSchedule *Schedule) = 0; // FIXME: This is a shortcut to doing the proper thing and creating a RP class // that targets can override. It's hard to justify spending the extra time @@ -51,6 +53,10 @@ class OptSchedTarget { // Targets that wish to discard the finalized schedule for any reason can // override this. 
virtual bool shouldKeepSchedule() { return true; } + + virtual void SetOccupancyLimit(int){/*nothing*/}; + virtual void SetShouldLimitOcc(bool){/*nothing*/}; + virtual void SetOccLimitSource(OCC_LIMIT_TYPE){/*nothing*/}; }; template class OptSchedRegistryNode { @@ -76,11 +82,15 @@ template class OptSchedRegistry { FactoryT getFactoryWithName(llvm::StringRef Name) { FactoryT Factory = nullptr; - for (auto I = List; I; I = I->Next) - if (I->Name == Name) { + std::string Match = std::string(Name.data()); + + for (auto I = List; I; I = I->Next) { + std::string Temp = std::string(I->Name.data()); + if (Match.compare(Temp) == 0) { Factory = I->Factory; break; } + } return Factory; } diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h index e6d81bf4..ab69de13 100644 --- a/include/opt-sched/Scheduler/bb_spill.h +++ b/include/opt-sched/Scheduler/bb_spill.h @@ -89,7 +89,7 @@ class BBWithSpill : public SchedRegion { InstCount CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode, InstCount &execCost, bool trackCnflcts); void CmputSchedUprBound_(); - Enumerator *AllocEnumrtr_(Milliseconds timeout); + Enumerator *AllocEnumrtr_(Milliseconds timeout, int timeoutPerMemblock); FUNC_RESULT Enumerate_(Milliseconds startTime, Milliseconds rgnDeadline, Milliseconds lngthDeadline); void SetupForSchdulng_(); @@ -118,7 +118,8 @@ class BBWithSpill : public SchedRegion { SchedPriorities hurstcPrirts, SchedPriorities enumPrirts, bool vrfySched, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, int SCW, SPILL_COST_FUNCTION spillCostFunc, - SchedulerType HeurSchedType, GT_POSITION GraphTransPosition); + SchedulerType HeurSchedType, GT_POSITION GraphTransPosition, + bool isTimeoutPerInst, int TimeoutPerMemblock); ~BBWithSpill(); InstCount CmputExecCostLwrBound(); diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h index da2e6eab..498907f0 100644 --- a/include/opt-sched/Scheduler/data_dep.h 
+++ b/include/opt-sched/Scheduler/data_dep.h @@ -14,6 +14,8 @@ Last Update: Mar. 2011 #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/sched_basic_data.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Support/raw_ostream.h" #include namespace llvm { @@ -291,6 +293,9 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, RegisterFile *getRegFiles() { return RegFiles.get(); } + void setMF_(MachineFunction *MF) { MF_ = MF; } + void printMF() { MF_->print(errs()); } + protected: // TODO(max): Get rid of this. // Number of basic blocks @@ -335,6 +340,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, SmallVector, 0> graphTrans_; MachineModel *machMdl_; + MachineFunction *MF_ = nullptr; bool backTrackEnbl_; @@ -388,7 +394,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase, InstType instType, const char *const opCode, int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, - InstCount fileUB, int blkNum); + InstCount fileUB, int blkNum, const SUnit *SU); FUNC_RESULT FinishNode_(InstCount nodeNum, InstCount edgeCnt = -1); void CreateEdge_(InstCount frmInstNum, InstCount toInstNum, int ltncy, DependenceType depType, bool IsArtificial = false); diff --git a/include/opt-sched/Scheduler/defines.h b/include/opt-sched/Scheduler/defines.h index ba3edcf8..34e31f72 100644 --- a/include/opt-sched/Scheduler/defines.h +++ b/include/opt-sched/Scheduler/defines.h @@ -43,6 +43,18 @@ enum FUNC_RESULT { RES_TIMEOUT = 3 }; +// Which mechanism we are using to limit occupancy. +// Limiting occupancy has been shown to improve exec perf +// for some kernels. +enum OCC_LIMIT_TYPE { + // NONE + OLT_NONE, + // AMD's Heuristic + OLT_HEUR, + // Hardcoded File + OLT_FILE, +}; + } // namespace opt_sched } // namespace llvm diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h index 67d6e7e4..aa8a7e80 100644 --- 
a/include/opt-sched/Scheduler/enumerator.h +++ b/include/opt-sched/Scheduler/enumerator.h @@ -442,6 +442,8 @@ class Enumerator : public ConstrainedScheduler { // Should we ignore ilp and only schedule for register pressure. bool SchedForRPOnly_; + bool BypassLatencyChecking_; + // (Chris): Store the most recent matching hist node when checking for // history domination HistEnumTreeNode *mostRecentMatchingHistNode_ = nullptr; @@ -547,7 +549,7 @@ class Enumerator : public ConstrainedScheduler { InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - InstCount preFxdInstCnt = 0, + int TimeoutPerMemblock, InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~Enumerator(); virtual void Reset(); @@ -561,6 +563,8 @@ class Enumerator : public ConstrainedScheduler { inline bool IsRlxdPrnng(); virtual bool IsCostEnum() = 0; + void printRdyLst(); + // (Chris) inline bool IsSchedForRPOnly() const { return SchedForRPOnly_; } @@ -568,6 +572,8 @@ class Enumerator : public ConstrainedScheduler { FUNC_RESULT FindSchedule(InstSchedule *sched, SchedRegion *rgn) { return RES_ERROR; } + + inline bool bypassLatencyChecking() { return BypassLatencyChecking_; } }; /*****************************************************************************/ @@ -592,7 +598,8 @@ class LengthEnumerator : public Enumerator { InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, - Milliseconds timeout, InstCount preFxdInstCnt = 0, + Milliseconds timeout, int TimeoutPerMemblock, + InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~LengthEnumerator(); void Reset(); @@ -628,7 +635,6 @@ class LengthCostEnumerator : public Enumerator { bool WasObjctvMetFrstPss_(); bool WasObjctvMetScndPss_(); bool BackTrack_(); - InstCount GetBestCost_(); InstCount getBestSpillCost_(); InstCount 
getBestSchedLength_(); void CreateRootNode_(); @@ -648,12 +654,14 @@ class LengthCostEnumerator : public Enumerator { InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, - Milliseconds timeout, SPILL_COST_FUNCTION spillCostFunc, + Milliseconds timeout, int TimeoutPerMemblock, + SPILL_COST_FUNCTION spillCostFunc, InstCount preFxdInstCnt = 0, SchedInstruction *preFxdInsts[] = NULL); virtual ~LengthCostEnumerator(); void Reset(); + InstCount GetBestCost_(); // Given a schedule with some instructions possibly fixed, find a // feasible schedule of the given target length if possible FUNC_RESULT FindFeasibleSchedule(InstSchedule *sched, InstCount trgtLngth, @@ -948,10 +956,9 @@ bool EnumTreeNode::IsLngthFsbl() { return isLngthFsbl_; } /*****************************************************************************/ inline bool Enumerator::WasSolnFound_() { - bool isCmplt = IsSchedComplete_(); assert(crntSched_->GetCrntLngth() <= trgtSchedLngth_); - bool isTrgt = crntSched_->GetCrntLngth() == trgtSchedLngth_; + bool isTrgt = crntSched_->GetCrntLngth() <= trgtSchedLngth_; if (isCmplt && isTrgt) { fsblSchedCnt_++; diff --git a/include/opt-sched/Scheduler/hist_table.h b/include/opt-sched/Scheduler/hist_table.h index ad8aaddb..2620af37 100644 --- a/include/opt-sched/Scheduler/hist_table.h +++ b/include/opt-sched/Scheduler/hist_table.h @@ -53,6 +53,8 @@ class HistEnumTreeNode { SetSuffix(const std::shared_ptr> &suffix); std::vector GetPrefix() const; + inline int getInstNum() { return inst_->GetNum(); } + protected: HistEnumTreeNode *prevNode_; diff --git a/include/opt-sched/Scheduler/list_sched.h b/include/opt-sched/Scheduler/list_sched.h index 03ff4dcc..7607b0f7 100644 --- a/include/opt-sched/Scheduler/list_sched.h +++ b/include/opt-sched/Scheduler/list_sched.h @@ -32,6 +32,10 @@ class ListScheduler : public ConstrainedScheduler { // ready list. 
void UpdtRdyLst_(InstCount cycleNum, int slotNum); + // Check whether the next node ID instruction is ready -- used to collect + // scheduling stats for LLVM generating schedules + bool CheckForInst(int numToPick) const; + // Pick next instruction to be scheduled. Returns NULL if no instructions are // ready. virtual SchedInstruction *PickInst() const; diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h index 9331fb7b..8e641152 100644 --- a/include/opt-sched/Scheduler/lnkd_lst.h +++ b/include/opt-sched/Scheduler/lnkd_lst.h @@ -178,9 +178,9 @@ template std::unique_ptr> makeDynamicOrArenaAllocator(int MaxSize) { if (MaxSize == INVALID_VALUE) - return llvm::make_unique>(); + return std::make_unique>(); else - return llvm::make_unique>(MaxSize); + return std::make_unique>(MaxSize); } template class LinkedList; diff --git a/include/opt-sched/Scheduler/machine_model.h b/include/opt-sched/Scheduler/machine_model.h index b4c9753f..fdf59947 100644 --- a/include/opt-sched/Scheduler/machine_model.h +++ b/include/opt-sched/Scheduler/machine_model.h @@ -10,8 +10,13 @@ Last Update: Mar. 2011 #ifndef OPTSCHED_BASIC_MACHINE_MODEL_H #define OPTSCHED_BASIC_MACHINE_MODEL_H +#include "llvm/ADT/StringRef.h" +// For class ostream. +#include #include "opt-sched/Scheduler/defines.h" +// For class string. #include +// For class vector. #include namespace llvm { @@ -122,7 +127,7 @@ class MachineModel { // Returns the instruction type given the name of the instruction as well // as the name of the previous instruction (used for context-dependent // instructions). 
- InstType GetInstTypeByName(const std::string &typeName, + InstType GetInstTypeByName(llvm::StringRef typeName, const std::string &prevName = "") const; // Return the default instruction type InstType getDefaultInstType() const; diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h index 075acd43..52b469aa 100644 --- a/include/opt-sched/Scheduler/sched_basic_data.h +++ b/include/opt-sched/Scheduler/sched_basic_data.h @@ -13,6 +13,9 @@ Last Update: Sept. 2013 #include "opt-sched/Scheduler/hash_table.h" #include "opt-sched/Scheduler/machine_model.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/Support/raw_ostream.h" #include namespace llvm { @@ -141,7 +144,8 @@ class SchedInstruction : public GraphNode { SchedInstruction(InstCount num, const string &name, InstType instType, const string &opCode, InstCount maxInstCnt, int nodeID, InstCount fileSchedCycle, InstCount fileSchedOrder, - InstCount fileLB, InstCount fileUB, MachineModel *model); + InstCount fileLB, InstCount fileUB, MachineModel *model, + const SUnit *SU); // Deallocates the memory used by the instruction and destroys the object. ~SchedInstruction(); @@ -431,7 +435,12 @@ class SchedInstruction : public GraphNode { friend class SchedRange; + void setMF(MachineFunction *MF) { MF_ = MF; } + void printMF() { MF_->print(errs()); } + protected: + MachineFunction *MF_; + const SUnit *SU_; // The "name" of this instruction. Usually a string indicating its type. string name_; // The mnemonic of this instruction, e.g. "add" or "jmp". diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h index 6398499c..b4e88642 100644 --- a/include/opt-sched/Scheduler/sched_region.h +++ b/include/opt-sched/Scheduler/sched_region.h @@ -17,6 +17,7 @@ Last Update: Jan. 
2020 #include "opt-sched/Scheduler/data_dep.h" // For Enumerator, LengthCostEnumerator, EnumTreeNode and Pruning. #include "opt-sched/Scheduler/enumerator.h" +#include "llvm/CodeGen/MachineFunction.h" namespace llvm { namespace opt_sched { @@ -278,6 +279,8 @@ class SchedRegion { // TODO(max): Document. InstCount crntSlotNum_; + bool instTimeout_; + int TimeoutPerMemblock_; bool needsTransitiveClosure(Milliseconds rgnTimeout) const; // protected accessors: @@ -336,7 +339,8 @@ class SchedRegion { // TODO(max): Document. virtual void CmputSchedUprBound_() = 0; // TODO(max): Document. - virtual Enumerator *AllocEnumrtr_(Milliseconds timeout) = 0; + virtual Enumerator *AllocEnumrtr_(Milliseconds timeout, + int TimeoutPerMemblock) = 0; // Wrapper for the enumerator virtual FUNC_RESULT Enumerate_(Milliseconds startTime, Milliseconds rgnTimeout, diff --git a/include/opt-sched/Scheduler/stats.h b/include/opt-sched/Scheduler/stats.h index 9fc5448a..9b68f99c 100644 --- a/include/opt-sched/Scheduler/stats.h +++ b/include/opt-sched/Scheduler/stats.h @@ -87,13 +87,14 @@ template class NumericStat : public Stat { return *this; } + void Print(std::ostream &out) const { + out << name_ << ": " << value_ << "\n"; + } + protected: // The value tracked by this record. T value_; // Prints the stat to a stream. 
- void Print(std::ostream &out) const { - out << name_ << ": " << value_ << "\n"; - } }; typedef NumericStat IntStat; @@ -351,6 +352,7 @@ extern IntStat relaxedSchedulingInfeasibilityHits; extern IntStat slotCountInfeasibilityHits; extern IntStat forwardLBInfeasibilityHits; extern IntStat backwardLBInfeasibilityHits; +extern IntStat costInfeasibilityHits; extern IntStat invalidSchedules; diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 3e2045b1..3ad4358c 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -1,4 +1,5 @@ -set(OPTSCHED_SRCS Scheduler/aco.cpp +set(OPTSCHED_SRCS + Scheduler/aco.cpp Scheduler/bb_spill.cpp Scheduler/buffers.cpp Scheduler/config.cpp @@ -28,7 +29,7 @@ set(OPTSCHED_SRCS Scheduler/aco.cpp Wrapper/OptSchedGenericTarget.cpp ) -set(OPTSCHED_TARGET_DEPS LLVMCodeGen) +set(OPTSCHED_TARGET_DEPS "") if(OPTSCHED_ENABLE_AMDGPU) list(APPEND OPTSCHED_SRCS @@ -36,27 +37,14 @@ if(OPTSCHED_ENABLE_AMDGPU) Wrapper/AMDGPU/OptSchedGCNTarget.cpp Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp ) - if(TARGET AMDGPUCommonTableGen) + if(TARGET LLVMAMDGPUCodeGen) list(APPEND OPTSCHED_TARGET_DEPS AMDGPUCommonTableGen) endif() - if(llvm_subproject) - include_directories( - ${LLVM_MAIN_SRC_DIR}/lib/Target/AMDGPU - ${LLVM_BINARY_DIR}/lib/Target/AMDGPU - ) - endif() endif() -function(add_optsched_library name) - if(LLVM_VERSION VERSION_LESS 8.0) - add_llvm_loadable_module(${name} OBJECT ${ARGN}) - else() - add_llvm_library(${name} MODULE OBJECT ${ARGN}) - endif() -endfunction() - -add_optsched_library(OptSched +add_llvm_target(OptSched + STATIC ${OPTSCHED_SRCS} - LINK_LIBS ${OPTSCHED_LINK_LIBS} ) -add_dependencies(OptSched ${OPTSCHED_TARGET_DEPS}) +add_dependencies(LLVMOptSched ${OPTSCHED_TARGET_DEPS}) +#add_definitions(${OPTSCHED_EXTRA_DEFINITIONS}) diff --git a/lib/Scheduler/aco.cpp b/lib/Scheduler/aco.cpp index 9d2b8d71..31c486d0 100644 --- a/lib/Scheduler/aco.cpp +++ b/lib/Scheduler/aco.cpp @@ -283,7 +283,7 @@ std::unique_ptr 
ACOScheduler::FindOneSchedule(InstCount TargetRPCost) { SchedInstruction *lastInst = NULL; std::unique_ptr schedule = - llvm::make_unique(machMdl_, dataDepGraph_, true); + std::make_unique(machMdl_, dataDepGraph_, true); InstCount maxPriority = rdyLst_->MaxPriority(); if (maxPriority == 0) maxPriority = 1; // divide by 0 is bad @@ -671,7 +671,7 @@ void PrintSchedule(InstSchedule *schedule) { void ACOScheduler::setInitialSched(InstSchedule *Sched) { if (Sched) { InitialSchedule = - llvm::make_unique(machMdl_, dataDepGraph_, VrfySched_); + std::make_unique(machMdl_, dataDepGraph_, VrfySched_); InitialSchedule->Copy(Sched); } } diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp index f2454aff..93ec5459 100644 --- a/lib/Scheduler/bb_spill.cpp +++ b/lib/Scheduler/bb_spill.cpp @@ -40,7 +40,8 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, bool enblStallEnum, int SCW, SPILL_COST_FUNCTION spillCostFunc, SchedulerType HeurSchedType, - GT_POSITION GraphTransPosition) + GT_POSITION GraphTransPosition, bool isTimeoutPerInst, + int TimeoutPerMemblock) : SchedRegion(OST_->MM, dataDepGraph, rgnNum, sigHashSize, lbAlg, hurstcPrirts, enumPrirts, vrfySched, PruningStrategy, HeurSchedType, spillCostFunc, GraphTransPosition), @@ -74,6 +75,10 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph, schduldEntryInstCnt_ = 0; schduldExitInstCnt_ = 0; schduldInstCnt_ = 0; + + instTimeout_ = isTimeoutPerInst; + TimeoutPerMemblock_ = TimeoutPerMemblock; + Logger::Event("FinishedConstBBInterfacer"); } /****************************************************************************/ @@ -170,7 +175,7 @@ static InstCount ComputeSLILStaticLowerBound(int64_t regTypeCnt_, // already computed it before. 
const auto RegFiles = llvm::makeMutableArrayRef(regFiles_, regTypeCnt_); for (RegisterFile &File : RegFiles) { - for (Register &Reg : File) { + for (llvm::opt_sched::Register &Reg : File) { Reg.resetLiveInterval(); } } @@ -179,7 +184,7 @@ static InstCount ComputeSLILStaticLowerBound(int64_t regTypeCnt_, // and uses for each register. int naiveLowerBound = 0; for (RegisterFile &File : RegFiles) { - for (Register &Reg : File) { + for (llvm::opt_sched::Register &Reg : File) { const auto added_to_interval = [&](const SchedInstruction *instruction) { return Reg.AddToInterval(instruction); }; @@ -204,7 +209,7 @@ static InstCount ComputeSLILStaticLowerBound(int64_t regTypeCnt_, // between the recursive successor list of this instruction and the // recursive predecessors of the dependent instruction. auto recSuccBV = inst->GetRcrsvNghbrBitVector(DIR_FRWRD); - for (Register *def : inst->GetDefs()) { + for (llvm::opt_sched::Register *def : inst->GetDefs()) { for (const auto &dependentInst : def->GetUseList()) { auto recPredBV = const_cast(dependentInst) ->GetRcrsvNghbrBitVector(DIR_BKWRD); @@ -230,14 +235,15 @@ static InstCount ComputeSLILStaticLowerBound(int64_t regTypeCnt_, // based on the instructions that use more than one register (defined by // different instructions). int commonUseLowerBound = closureLowerBound; - std::vector> usedInsts; + std::vector> + usedInsts; for (int i = 0; i < dataDepGraph_->GetInstCnt(); ++i) { const auto &inst = dataDepGraph_->GetInstByIndx(i); // Get a list of instructions that define the registers, in array form. 
usedInsts.clear(); llvm::transform(inst->GetUses(), std::back_inserter(usedInsts), - [&](Register *reg) { + [&](llvm::opt_sched::Register *reg) { assert(reg->GetDefList().size() == 1 && "Number of defs for register is not 1!"); return std::make_pair(*(reg->GetDefList().begin()), reg); @@ -488,10 +494,11 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, physRegNum = use->GetPhysicalNumber(); if (use->IsLive() == false) - llvm::report_fatal_error("Reg " + std::to_string(regNum) + " of type " + - std::to_string(regType) + - " is used without being defined", - false); + llvm::report_fatal_error( + llvm::StringRef("Reg " + std::to_string(regNum) + " of type " + + std::to_string(regType) + + " is used without being defined"), + false); #ifdef IS_DEBUG_REG_PRESSURE Logger::Info("Inst %d uses reg %d of type %d and %d uses", inst->GetNum(), @@ -569,6 +576,7 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst, for (int16_t i = 0; i < regTypeCnt_; i++) { liveRegs = liveRegs_[i].GetWghtedCnt(); + // Set current RP for register type "i" regPressures_[i] = liveRegs; // Update peak RP for register type "i" @@ -780,7 +788,8 @@ void BBWithSpill::FinishOptml_() { } /*****************************************************************************/ -Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) { +Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout, + int timeoutPerMemblock) { bool enblStallEnum = enblStallEnum_; /* if (!dataDepGraph_->IncludesUnpipelined()) { enblStallEnum = false; @@ -789,7 +798,7 @@ Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) { enumrtr_ = new LengthCostEnumerator( dataDepGraph_, machMdl_, schedUprBound_, GetSigHashSize(), GetEnumPriorities(), GetPruningStrategy(), SchedForRPOnly_, enblStallEnum, - timeout, GetSpillCostFunc(), 0, NULL); + timeout, timeoutPerMemblock, GetSpillCostFunc(), 0, NULL); return enumrtr_; } @@ -809,20 +818,21 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds 
startTime, (rgnTimeout == INVALID_VALUE) ? INVALID_VALUE : startTime + rgnTimeout; lngthDeadline = (rgnTimeout == INVALID_VALUE) ? INVALID_VALUE : startTime + lngthTimeout; - assert(lngthDeadline <= rgnDeadline); + // assert(lngthDeadline <= rgnDeadline); + + Milliseconds deadline = instTimeout_ ? lngthDeadline : rgnDeadline; for (trgtLngth = schedLwrBound_; trgtLngth <= schedUprBound_; trgtLngth++) { InitForSchdulng(); Logger::Event("Enumerating", "target_length", trgtLngth); rslt = enumrtr_->FindFeasibleSchedule(enumCrntSched_, trgtLngth, this, - costLwrBound, lngthDeadline); + costLwrBound, deadline); if (rslt == RES_TIMEOUT) timeout = true; HandlEnumrtrRslt_(rslt, trgtLngth); - if (GetBestCost() == 0 || rslt == RES_ERROR || - (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT) || + if (GetBestCost() == 0 || rslt == RES_ERROR || (rslt == RES_TIMEOUT) || (rslt == RES_SUCCESS && IsSecondPass())) { // If doing two pass optsched and on the second pass then terminate if a @@ -852,6 +862,10 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime, lngthDeadline = rgnDeadline; } + stats::positiveDominationHits.Print(cout); + stats::nodeSuperiorityInfeasibilityHits.Print(cout); + stats::costInfeasibilityHits.Print(cout); + #ifdef IS_DEBUG_ITERS stats::iterations.Record(iterCnt); stats::enumerations.Record(enumrtr_->GetSearchCnt()); @@ -1075,11 +1089,13 @@ bool BBWithSpill::ChkCostFsbltyWghtd(InstCount trgtLngth, EnumTreeNode *node, node->SetCostLwrBound(crntCost); node->SetPeakSpillCost(peakSpillCost_); node->SetSpillCostSum(totSpillCost_); + enumCrntSched_->SetSpillCost(crntSpillCost_); node->setSpillCost(TmpSpillCost); node->setSpillCostLwrBound(TmpSpillCost); return true; } + stats::costInfeasibilityHits++; return false; } diff --git a/lib/Scheduler/config.cpp b/lib/Scheduler/config.cpp index db13714b..b96fff11 100644 --- a/lib/Scheduler/config.cpp +++ b/lib/Scheduler/config.cpp @@ -2,6 +2,7 @@ #include "opt-sched/Scheduler/logger.h" #include 
"llvm/Support/ErrorHandling.h" #include +#include #include using namespace llvm::opt_sched; @@ -63,7 +64,8 @@ void Config::Load(std::istream &file) { string Config::GetString(const string &name) const { std::map::const_iterator it = settings.find(name); if (it == settings.end()) { - llvm::report_fatal_error("No value found for setting " + name, false); + llvm::report_fatal_error( + llvm::StringRef("No value found for setting " + name), false); return ""; } else { return it->second; diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp index 1a31ae63..fed07c45 100644 --- a/lib/Scheduler/data_dep.cpp +++ b/lib/Scheduler/data_dep.cpp @@ -11,6 +11,7 @@ #include "opt-sched/Scheduler/relaxed_sched.h" #include "opt-sched/Scheduler/stats.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -197,7 +198,7 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn) entryInstCnt_ = 0; exitInstCnt_ = 0; - RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt()); + RegFiles = std::make_unique(machMdl_->GetRegTypeCnt()); } DataDepGraph::~DataDepGraph() { @@ -605,7 +606,8 @@ FUNC_RESULT DataDepGraph::ParseF2Nodes_(SpecsBuffer *buf, } CreateNode_(nodeNum, instName, instType, opCode, nodeID, fileSchedOrder, - fileSchedCycle, fileInstLwrBound, fileInstUprBound, blkNum); + fileSchedCycle, fileInstLwrBound, fileInstUprBound, blkNum, + nullptr); instCntPerType_[instType]++; stats::instructionTypeCounts.Increment( @@ -823,15 +825,18 @@ FUNC_RESULT DataDepGraph::SkipGraph(SpecsBuffer *buf, bool &endOfFileReached) { return RES_SUCCESS; } -SchedInstruction *DataDepGraph::CreateNode_( - InstCount instNum, const char *const instName, InstType instType, - const char *const opCode, int nodeID, InstCount fileSchedOrder, - InstCount fileSchedCycle, InstCount fileLB, InstCount fileUB, int blkNum) { +SchedInstruction * +DataDepGraph::CreateNode_(InstCount instNum, 
const char *const instName, + InstType instType, const char *const opCode, + int nodeID, InstCount fileSchedOrder, + InstCount fileSchedCycle, InstCount fileLB, + InstCount fileUB, int blkNum, const SUnit *SU) { SchedInstruction *newInstPtr; - newInstPtr = new SchedInstruction(instNum, instName, instType, opCode, - 2 * instCnt_, nodeID, fileSchedOrder, - fileSchedCycle, fileLB, fileUB, machMdl_); + newInstPtr = new SchedInstruction( + instNum, instName, instType, opCode, 2 * instCnt_, nodeID, fileSchedOrder, + fileSchedCycle, fileLB, fileUB, machMdl_, SU); + if (instNum < 0 || instNum >= instCnt_) llvm::report_fatal_error("Invalid instruction number", false); // Logger::Info("Instruction order = %d, instCnt_ = %d", fileSchedOrder, @@ -839,6 +844,7 @@ SchedInstruction *DataDepGraph::CreateNode_( if (fileSchedOrder > maxFileSchedOrder_) maxFileSchedOrder_ = fileSchedOrder; + newInstPtr->setMF(MF_); insts_[instNum] = newInstPtr; return newInstPtr; @@ -1523,15 +1529,15 @@ void DataDepSubGraph::CreateRootAndLeafInsts_() { assert(rootInst_ == NULL && leafInst_ == NULL); - rootInst_ = - new SchedInstruction(INVALID_VALUE, "root", instType, " ", maxInstCnt_, 0, - INVALID_VALUE, INVALID_VALUE, 0, 0, machMdl_); + rootInst_ = new SchedInstruction(INVALID_VALUE, "root", instType, " ", + maxInstCnt_, 0, INVALID_VALUE, INVALID_VALUE, + 0, 0, machMdl_, nullptr); rootInst_->SetIssueType(issuType); - leafInst_ = - new SchedInstruction(INVALID_VALUE, "leaf", instType, " ", maxInstCnt_, 0, - INVALID_VALUE, INVALID_VALUE, 0, 0, machMdl_); + leafInst_ = new SchedInstruction(INVALID_VALUE, "leaf", instType, " ", + maxInstCnt_, 0, INVALID_VALUE, INVALID_VALUE, + 0, 0, machMdl_, nullptr); leafInst_->SetIssueType(issuType); diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp index c2747af5..9b2aa785 100644 --- a/lib/Scheduler/enumerator.cpp +++ b/lib/Scheduler/enumerator.cpp @@ -436,9 +436,11 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl, 
InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, - Milliseconds timeout, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + Milliseconds timeout, int TimeoutPerMemblock, + InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : ConstrainedScheduler(dataDepGraph, machMdl, schedUprBound) { + + Logger::Info("timeout is %d", timeout); memAllocBlkSize_ = (int)timeout / TIMEOUT_TO_MEMBLOCK_RATIO; assert(preFxdInstCnt >= 0); @@ -931,6 +933,7 @@ FUNC_RESULT Enumerator::FindFeasibleSchedule_(InstSchedule *sched, crntNode_, dataDepGraph_); isCrntNodeFsbl = BackTrack_(); } + } else { // All branches from the current node have been explored, and no more // branches that lead to feasible nodes have been found. @@ -960,8 +963,8 @@ FUNC_RESULT Enumerator::FindFeasibleSchedule_(InstSchedule *sched, return fsblSchedCnt_ > 0 ? RES_SUCCESS : RES_FAIL; } /****************************************************************************/ - bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { + InstCount i; bool isEmptyNode; SchedInstruction *inst; @@ -1002,7 +1005,9 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { if (getIsFirstPass()) { return false; } - +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "Out of instructions, stalling"); +#endif // then we only have the option of scheduling a stall assert(isEmptyNode == false || brnchCnt == 1); inst = NULL; @@ -1018,7 +1023,14 @@ bool Enumerator::FindNxtFsblBrnch_(EnumTreeNode *&newNode) { continue; } } else { + inst = rdyLst_->GetNextPriorityInst(); + +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "Probing inst %d", + inst->GetNum()); +#endif + assert(inst != NULL); bool isLegal = ChkInstLglty_(inst); isLngthFsbl = isLegal; @@ -1080,6 +1092,9 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst != NULL) if (inst->GetPreFxdCycle() != 
INVALID_VALUE) if (inst->GetPreFxdCycle() != crntCycleNum_) { +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: prefix fail"); +#endif return false; } @@ -1087,6 +1102,9 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst->GetCrntLwrBound(DIR_FRWRD) > crntCycleNum_) { #ifdef IS_DEBUG_INFSBLTY_TESTS stats::forwardLBInfeasibilityHits++; +#endif +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: LB fail"); #endif return false; } @@ -1094,6 +1112,9 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (inst->GetCrntDeadline() < crntCycleNum_) { #ifdef IS_DEBUG_INFSBLTY_TESTS stats::backwardLBInfeasibilityHits++; +#endif +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: deadline fail"); #endif return false; } @@ -1111,10 +1132,14 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (prune_.nodeSup) { if (inst != NULL) if (crntNode_->WasSprirNodeExmnd(inst)) { -#ifdef IS_DEBUG_INFSBLTY_TESTS stats::nodeSuperiorityInfeasibilityHits++; +#ifdef IS_DEBUG_INFSBLTY_TESTS + // stats::nodeSuperiorityInfeasibilityHits++; #endif isNodeDmntd = true; +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: history fail"); +#endif return false; } } @@ -1131,6 +1156,9 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (!fsbl) { #ifdef IS_DEBUG_INFSBLTY_TESTS stats::slotCountInfeasibilityHits++; +#endif +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: issue slot fail"); #endif return false; } @@ -1142,6 +1170,9 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (fsbl == false) { #ifdef IS_DEBUG_INFSBLTY_TESTS stats::rangeTighteningInfeasibilityHits++; +#endif +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: tightn LB fail"); #endif return false; } 
@@ -1160,13 +1191,16 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, if (WasDmnntSubProbExmnd_(inst, newNode)) { #ifdef IS_DEBUG_INFSBLTY_TESTS stats::historyDominationInfeasibilityHits++; +#endif +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: histDom fail"); #endif return false; } } // Try to find a relaxed schedule for the unscheduled instructions - if (prune_.rlxd) { + if (prune_.rlxd && rgn_->IsSecondPass()) { fsbl = RlxdSchdul_(newNode); state_.rlxSchduld = true; @@ -1175,7 +1209,9 @@ bool Enumerator::ProbeBranch_(SchedInstruction *inst, EnumTreeNode *&newNode, stats::relaxedSchedulingInfeasibilityHits++; #endif isRlxInfsbl = true; - +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: relaxed fail"); +#endif return false; } } @@ -1258,6 +1294,12 @@ void Enumerator::StepFrwrd_(EnumTreeNode *&newNode) { SchedInstruction *instToSchdul = newNode->GetInst(); InstCount instNumToSchdul; +#ifdef IS_DEBUG_SEARCH_ORDER + if (instToSchdul) + Logger::Log((Logger::LOG_LEVEL)4, false, "Stepping forward to inst %d", + instToSchdul->GetNum()); +#endif + CreateNewRdyLst_(); // Let the new node inherit its parent's ready list before we update it newNode->SetRdyLst(rdyLst_); @@ -1480,6 +1522,13 @@ bool Enumerator::BackTrack_() { SchedInstruction *inst = crntNode_->GetInst(); EnumTreeNode *trgtNode = crntNode_->GetParent(); +#ifdef IS_DEBUG_SEARCH_ORDER + if (crntNode_->GetInst()) + Logger::Log((Logger::LOG_LEVEL)4, false, + "Back tracking fron inst %d to inst %d", inst->GetNum(), + trgtNode->GetInstNum()); +#endif + rdyLst_->RemoveLatestSubList(); if (IsHistDom()) { @@ -1563,7 +1612,6 @@ bool Enumerator::WasDmnntSubProbExmnd_(SchedInstruction *, #ifdef IS_DEBUG_SPD stats::signatureMatches++; #endif - if (exNode->DoesMatch(newNode, this)) { if (!mostRecentMatchWasSet) { mostRecentMatchingHistNode_ = @@ -1895,10 +1943,11 @@ LengthEnumerator::LengthEnumerator( DataDepGraph *dataDepGraph, 
MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) + int TimeoutPerMemblock, InstCount preFxdInstCnt, + SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, - preFxdInstCnt, preFxdInsts) { + TimeoutPerMemblock, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); tmpHstryNode_ = new HistEnumTreeNode; } @@ -1982,11 +2031,11 @@ LengthCostEnumerator::LengthCostEnumerator( DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - SPILL_COST_FUNCTION spillCostFunc, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + int TimeoutPerMemblock, SPILL_COST_FUNCTION spillCostFunc, + InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, - preFxdInstCnt, preFxdInsts) { + TimeoutPerMemblock, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); costChkCnt_ = 0; @@ -2059,6 +2108,7 @@ FUNC_RESULT LengthCostEnumerator::FindFeasibleSchedule(InstSchedule *sched, Milliseconds deadline) { rgn_ = rgn; costLwrBound_ = costLwrBound; + BypassLatencyChecking_ = rgn_->IsSecondPass() ? 
false : true; SpillCostLwrBound_ = rgn_->getSpillCostLwrBound(); this->setIsSecondPass(rgn_->IsSecondPass()); @@ -2178,6 +2228,10 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, isFsbl = ChkCostFsblty_(inst, newNode, SuffixRPSpillCost); if (isFsbl == false) { +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: cost fail"); +#endif + // Suffix propogation is currently not enabled for weighted sum if (rgn_->isTwoPassEnabled()) { assert(SuffixRPSpillCost != -1); @@ -2206,6 +2260,9 @@ bool LengthCostEnumerator::ProbeBranch_(SchedInstruction *inst, stats::historyDominationInfeasibilityHits++; #endif rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, parent); +#ifdef IS_DEBUG_SEARCH_ORDER + Logger::Log((Logger::LOG_LEVEL)4, false, "probe: LCE history fail"); +#endif return false; } @@ -2243,6 +2300,7 @@ bool LengthCostEnumerator::ChkCostFsblty_(SchedInstruction *inst, /*****************************************************************************/ bool LengthCostEnumerator::BackTrack_() { + SchedInstruction *inst = crntNode_->GetInst(); rgn_->UnschdulInst(inst, crntCycleNum_, crntSlotNum_, crntNode_->GetParent()); @@ -2338,6 +2396,15 @@ void LengthCostEnumerator::FreeHistNode_(HistEnumTreeNode *histNode) { } /*****************************************************************************/ +void Enumerator::printRdyLst() { + rdyLst_->ResetIterator(); + int sizeOfList = rdyLst_->GetInstCnt(); + Logger::Info("ReadyList Contains: "); + for (int i = 0; i < sizeOfList; i++) { + Logger::Info("%d", rdyLst_->GetNextPriorityInst()->GetNum()); + } + rdyLst_->ResetIterator(); +} void EnumTreeNode::setSuffixRPCostLowerBound(InstCount RPCost) { // Suffix cost should never be negative nor less than the estimated LB if (RPCost < 0) @@ -2349,4 +2416,4 @@ void EnumTreeNode::setSuffixRPCostLowerBound(InstCount RPCost) { InstCount EnumTreeNode::getSuffixRPCostLowerBound() { return MinSuffixRPLowerBound; -} \ No newline at end of file +} diff 
--git a/lib/Scheduler/graph_trans_ilp.cpp b/lib/Scheduler/graph_trans_ilp.cpp index cb8ed2d6..0fd6b2c0 100644 --- a/lib/Scheduler/graph_trans_ilp.cpp +++ b/lib/Scheduler/graph_trans_ilp.cpp @@ -158,7 +158,7 @@ StaticNodeSupILPTrans::DataAlloc::DataAlloc(DataDepGraph &DDG) SuperiorNodesList( createSuperiorNodesList(wrapAs2D(SuperiorArray, DDG.GetNodeCnt()))), AddedEdges(), Stats(), - Data_(llvm::make_unique(Data{ + Data_(std::make_unique(Data{ DDG, wrapAs2D(this->DistanceTable, DDG.GetNodeCnt()), wrapAs2D(this->SuperiorArray, DDG.GetNodeCnt()), diff --git a/lib/Scheduler/hist_table.cpp b/lib/Scheduler/hist_table.cpp index 8584c9ee..1e0e2d8b 100644 --- a/lib/Scheduler/hist_table.cpp +++ b/lib/Scheduler/hist_table.cpp @@ -270,12 +270,13 @@ bool HistEnumTreeNode::DoesDominate_(EnumTreeNode *node, lastInsts[indx] = NULL; } } - } - // If this node is an absolute dominant that dominates any matching node. - if (isAbslutDmnnt) - stats::absoluteDominationHits++; - // PrntPartialSched(enumrtr); + // If this node is an absolute dominant that dominates any matching node. 
+ if (isAbslutDmnnt) + stats::absoluteDominationHits++; + + // PrntPartialSched(enumrtr); + } return true; } @@ -606,6 +607,7 @@ bool CostHistEnumTreeNode::chkCostDmntnForSinglePass(EnumTreeNode *Node, spillCostSum_ % instCnt >= Node->GetSpillCostSum() % instCnt; } } + return ShouldPrune; } diff --git a/lib/Scheduler/list_sched.cpp b/lib/Scheduler/list_sched.cpp index 8c9d854d..d53f2bb2 100644 --- a/lib/Scheduler/list_sched.cpp +++ b/lib/Scheduler/list_sched.cpp @@ -28,6 +28,20 @@ SchedInstruction *ListScheduler::PickInst() const { return inst; } +bool ListScheduler::CheckForInst(int numToPick) const { + SchedInstruction *inst = NULL; + int rdyLstSize = rdyLst_->GetInstCnt(); + for (int i = 0; i < rdyLstSize; i++) { + inst = rdyLst_->GetNextPriorityInst(); + if (inst->GetNum() == numToPick) { + rdyLst_->ResetIterator(); + return true; + } + } + rdyLst_->ResetIterator(); + return false; +} + FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { InstCount rdyLstSize, maxRdyLstSize = 0, avgRdyLstSize = 0, iterCnt = 0; bool isEmptyCycle = true; @@ -37,6 +51,8 @@ FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { Initialize_(); + int numToPick = -1; + int entry, exit; while (!IsSchedComplete_()) { UpdtRdyLst_(crntCycleNum_, crntSlotNum_); rdyLst_->ResetIterator(); @@ -47,7 +63,27 @@ FUNC_RESULT ListScheduler::FindSchedule(InstSchedule *sched, SchedRegion *rgn) { maxRdyLstSize = rdyLstSize; avgRdyLstSize += rdyLstSize; - SchedInstruction *inst = PickInst(); + SchedInstruction *inst; + + // TODO -- extract this into config variable + bool forcedSchedule = false; + + // Force get the schedule in order of best heuristic value (not just best + // available/ready) + + if (forcedSchedule) { + if (numToPick == -1 || CheckForInst(numToPick)) { + inst = PickInst(); + assert(inst); + if (numToPick == -1) + entry = inst->GetNum(); + numToPick += 1; + if (numToPick == entry) + numToPick += 1; + } + } + + inst = PickInst(); 
InstCount instNum; // If the ready list is empty. diff --git a/lib/Scheduler/machine_model.cpp b/lib/Scheduler/machine_model.cpp index e8029df7..935d407e 100644 --- a/lib/Scheduler/machine_model.cpp +++ b/lib/Scheduler/machine_model.cpp @@ -73,9 +73,10 @@ void MachineModel::parseBuffer(SpecsBuffer &buf) { IssueType issuType = GetIssueTypeByName(buffer); if (issuType == INVALID_ISSUE_TYPE) { - llvm::report_fatal_error(std::string("Invalid issue type ") + buffer + - " for inst. type " + it->name, - false); + llvm::report_fatal_error( + llvm::StringRef(std::string("Invalid issue type ") + buffer + + " for inst. type " + it->name), + false); } it->issuType = issuType; @@ -94,13 +95,16 @@ MachineModel::MachineModel(const std::string &modelFile) { MachineModel::MachineModel(SpecsBuffer &buf) { parseBuffer(buf); } -InstType MachineModel::GetInstTypeByName(const string &typeName, +InstType MachineModel::GetInstTypeByName(llvm::StringRef typeName, const string &prevName) const { - string composite = prevName.size() ? typeName + "_after_" + prevName : ""; + string composite = prevName.size() + ? 
typeName.str() + "_after_" + prevName
+                         : "";
   for (size_t i = 0; i < instTypes_.size(); i++) {
     if (instTypes_[i].isCntxtDep && instTypes_[i].name == composite) {
       return (InstType)i;
-    } else if (!instTypes_[i].isCntxtDep && instTypes_[i].name == typeName) {
+    } else if (!instTypes_[i].isCntxtDep &&
+               typeName == instTypes_[i].name) {
       return (InstType)i;
     }
   }
@@ -109,9 +113,17 @@ InstType MachineModel::GetInstTypeByName(const string &typeName,
 }
 int16_t MachineModel::GetRegTypeByName(const char *const regTypeName) const {
+  // Accept "SReg_32"/"VGPR_32" as aliases of "SGPR32"/"VGPR32". Compare the
+  // characters, not the pointers: `regTypeName == "literal"` tests addresses.
+  std::string mapVal;
+  if (llvm::StringRef(regTypeName) == "SReg_32")
+    mapVal = "SGPR32";
+  if (llvm::StringRef(regTypeName) == "VGPR_32")
+    mapVal = "VGPR32";
   int16_t Type = INVALID_VALUE;
   for (size_t i = 0; i < registerTypes_.size(); i++) {
-    if (regTypeName == registerTypes_[i].name) {
+    if (regTypeName == registerTypes_[i].name ||
+        (!mapVal.empty() && mapVal == registerTypes_[i].name)) {
       Type = (int16_t)i;
       break;
     }
   }
diff --git a/lib/Scheduler/reg_alloc.cpp b/lib/Scheduler/reg_alloc.cpp
index 95ab6d64..2626d863 100644
--- a/lib/Scheduler/reg_alloc.cpp
+++ b/lib/Scheduler/reg_alloc.cpp
@@ -33,10 +33,17 @@ void LocalRegAlloc::AllocRegs() {
        i != INVALID_VALUE; i = instSchedule_->GetNxtInst(cycle, slot)) {
     int instNum = i;
     SchedInstruction *inst = dataDepGraph_->GetInstByIndx(instNum);
+#ifdef RA_BUG
+    Logger::Info("\nParsing inst");
+    inst->printMIR();
+#endif
+
     // Skip artificial entry and exit nodes.
if (!strcmp(inst->GetOpCode(), "__optsched_entry") || - !strcmp(inst->GetOpCode(), "__optsched_exit")) + !strcmp(inst->GetOpCode(), "__optsched_exit")) { + Logger::Info("skipping artificial inst"); continue; + } #ifdef IS_DEBUG_REG_ALLOC Logger::Info("REG_ALLOC: Processing instruction %d.", instNum); @@ -46,6 +53,9 @@ void LocalRegAlloc::AllocRegs() { for (Register *use : inst->GetUses()) { int16_t regType = use->GetType(); int virtRegNum = use->GetNum(); +#ifdef RA_BUG + Logger::Info("found use %d", virtRegNum); +#endif RegMap &map = regMaps_[regType][virtRegNum]; #ifdef IS_DEBUG_REG_ALLOC Logger::Info("REG_ALLOC: Processing use for register %d:%d.", regType, @@ -74,6 +84,10 @@ void LocalRegAlloc::AllocRegs() { if (map.nextUses.empty() && map.assignedReg != -1) { int physRegNum = map.assignedReg; +#ifdef RA_BUG + Logger::Info("no more uses for %d, freeing pr %d", virtRegNum, + physRegNum); +#endif assert(physRegs[physRegNum] == virtRegNum); map.assignedReg = -1; map.isDirty = false; @@ -86,6 +100,9 @@ void LocalRegAlloc::AllocRegs() { for (Register *def : inst->GetDefs()) { int16_t regType = def->GetType(); int virtRegNum = def->GetNum(); +#ifdef RA_BUG + Logger::Info("found def %d", virtRegNum); +#endif #ifdef IS_DEBUG_REG_ALLOC Logger::Info("REG_ALLOC: Processing def for register %d:%d.", regType, virtRegNum); @@ -99,6 +116,9 @@ void LocalRegAlloc::AllocRegs() { } void LocalRegAlloc::AllocateReg_(int16_t regType, int virtRegNum) { +#ifdef RA_BUG + Logger::Info("allocating reg for %d", virtRegNum); +#endif std::map ®Maps = regMaps_[regType]; std::stack &free = freeRegs_[regType]; std::vector &physRegs = physRegs_[regType]; @@ -108,6 +128,10 @@ void LocalRegAlloc::AllocateReg_(int16_t regType, int virtRegNum) { physRegNum = free.top(); regMaps[virtRegNum].assignedReg = free.top(); physRegs[physRegNum] = virtRegNum; +#ifdef RA_BUG + Logger::Info("found free phys reg, assigning vr %d to pr %d", virtRegNum, + physRegNum); +#endif free.pop(); } else { // If there are 
no free registers find one to use.
@@ -126,6 +150,12 @@ void LocalRegAlloc::AllocateReg_(int16_t regType, int virtRegNum) {
     }
 
     physRegNum = regMaps[spillCand].assignedReg;
+#ifdef RA_BUG
+    Logger::Info("found spill cand, vr %d, pr %d", spillCand, physRegNum);
+    if (physRegNum == -1)
+      Logger::Info("about to fire assert, spillCand %d, regType %d", spillCand,
+                   regType);
+#endif
     assert(physRegNum != -1);
     regMaps[spillCand].assignedReg = -1;
     regMaps[virtRegNum].assignedReg = physRegNum;
@@ -144,9 +174,26 @@ int LocalRegAlloc::FindSpillCand_(std::map &regMaps,
   int virtRegWithMaxUse = -1;
   for (size_t i = 0; i < physRegs.size(); i++) {
     int virtReg = physRegs[i];
+#ifdef RA_BUG
+    if (virtReg == -1) {
+      Logger::Info("virtReg == - 1");
+      dataDepGraph_->printMF();
+    }
+#endif
     assert(virtReg != -1);
     RegMap &regMap = regMaps[virtReg];
+#ifdef RA_BUG
+    if (regMap.assignedReg != static_cast<int>(i)) {
+      Logger::Info("regMap.assignedReg != i");
+      Logger::Info("virtReg %d, i %d, regMap.assignedReg %d", virtReg,
+                   static_cast<int>(i), regMap.assignedReg);
+      dataDepGraph_->printMF();
+    }
+#endif
+
+    assert(regMap.assignedReg == static_cast<int>(i));
+
 
     // If this register is clean, it can be spilled immediately .
     if (!regMap.isDirty) {
 #ifdef IS_DEBUG_REG_ALLOC
@@ -234,9 +281,15 @@ void LocalRegAlloc::ScanUses_() {
 
 void LocalRegAlloc::AddLiveIn_(SchedInstruction *artificialEntry) {
   // Process live-in regs.
+#ifdef RA_BUG + Logger::Info("Parsing live ins"); +#endif for (Register *def : artificialEntry->GetDefs()) { int16_t regType = def->GetType(); int virtRegNum = def->GetNum(); +#ifdef RA_BUG + Logger::Info("Found live in def vreg %d", virtRegNum); +#endif #ifdef IS_DEBUG_REG_ALLOC Logger::Info("REG_ALLOC: Processing live-in register %d:%d.", regType, virtRegNum); @@ -248,8 +301,12 @@ void LocalRegAlloc::AddLiveIn_(SchedInstruction *artificialEntry) { if (!free.empty()) { physRegNum = free.top(); - regMaps[virtRegNum].assignedReg = free.top(); + regMaps[virtRegNum].assignedReg = physRegNum; physRegs[physRegNum] = virtRegNum; +#ifdef RA_BUG + Logger::Info("found free phs reg, virtReg %d assigned to physReg %d", + virtRegNum, physRegNum); +#endif free.pop(); } else { #ifdef IS_DEBUG_REG_ALLOC diff --git a/lib/Scheduler/register.cpp b/lib/Scheduler/register.cpp index ddab53de..a9fb8946 100644 --- a/lib/Scheduler/register.cpp +++ b/lib/Scheduler/register.cpp @@ -3,78 +3,95 @@ using namespace llvm::opt_sched; -int16_t Register::GetType() const { return type_; } +int16_t llvm::opt_sched::Register::GetType() const { return type_; } -int Register::GetNum() const { return num_; } +int llvm::opt_sched::Register::GetNum() const { return num_; } -int Register::GetWght() const { return wght_; } +int llvm::opt_sched::Register::GetWght() const { return wght_; } -void Register::SetType(int16_t type) { type_ = type; } +void llvm::opt_sched::Register::SetType(int16_t type) { type_ = type; } -void Register::SetNum(int num) { num_ = num; } +void llvm::opt_sched::Register::SetNum(int num) { num_ = num; } -void Register::SetWght(int wght) { wght_ = wght; } +void llvm::opt_sched::Register::SetWght(int wght) { wght_ = wght; } -bool Register::IsPhysical() const { return physicalNumber_ != INVALID_VALUE; } +bool llvm::opt_sched::Register::IsPhysical() const { + return physicalNumber_ != INVALID_VALUE; +} -int Register::GetPhysicalNumber() const { return physicalNumber_; } +int 
llvm::opt_sched::Register::GetPhysicalNumber() const { + return physicalNumber_; +} -void Register::SetPhysicalNumber(int physicalNumber) { +void llvm::opt_sched::Register::SetPhysicalNumber(int physicalNumber) { physicalNumber_ = physicalNumber; } -bool Register::IsLive() const { +bool llvm::opt_sched::Register::IsLive() const { assert(crntUseCnt_ <= useCnt_); return crntUseCnt_ < useCnt_; } -bool Register::IsLiveIn() const { return liveIn_; } +bool llvm::opt_sched::Register::IsLiveIn() const { return liveIn_; } -bool Register::IsLiveOut() const { return liveOut_; } +bool llvm::opt_sched::Register::IsLiveOut() const { return liveOut_; } -void Register::SetIsLiveIn(bool liveIn) { liveIn_ = liveIn; } +void llvm::opt_sched::Register::SetIsLiveIn(bool liveIn) { liveIn_ = liveIn; } -void Register::SetIsLiveOut(bool liveOut) { liveOut_ = liveOut; } +void llvm::opt_sched::Register::SetIsLiveOut(bool liveOut) { + liveOut_ = liveOut; +} -void Register::ResetCrntUseCnt() { crntUseCnt_ = 0; } +void llvm::opt_sched::Register::ResetCrntUseCnt() { crntUseCnt_ = 0; } -void Register::AddUse(const SchedInstruction *inst) { +void llvm::opt_sched::Register::AddUse(const SchedInstruction *inst) { uses_.insert(inst); useCnt_++; } -void Register::AddDef(const SchedInstruction *inst) { +void llvm::opt_sched::Register::AddDef(const SchedInstruction *inst) { defs_.insert(inst); defCnt_++; } -int Register::GetUseCnt() const { return useCnt_; } +int llvm::opt_sched::Register::GetUseCnt() const { return useCnt_; } -const Register::InstSetType &Register::GetUseList() const { return uses_; } +const llvm::opt_sched::Register::InstSetType & +llvm::opt_sched::Register::GetUseList() const { + return uses_; +} -size_t Register::GetSizeOfUseList() const { return uses_.size(); } +size_t llvm::opt_sched::Register::GetSizeOfUseList() const { + return uses_.size(); +} -int Register::GetDefCnt() const { return defCnt_; } +int llvm::opt_sched::Register::GetDefCnt() const { return defCnt_; } -const 
Register::InstSetType &Register::GetDefList() const { return defs_; } +const llvm::opt_sched::Register::InstSetType & +llvm::opt_sched::Register::GetDefList() const { + return defs_; +} -size_t Register::GetSizeOfDefList() const { return defs_.size(); } +size_t llvm::opt_sched::Register::GetSizeOfDefList() const { + return defs_.size(); +} -int Register::GetCrntUseCnt() const { return crntUseCnt_; } +int llvm::opt_sched::Register::GetCrntUseCnt() const { return crntUseCnt_; } -void Register::AddCrntUse() { crntUseCnt_++; } +void llvm::opt_sched::Register::AddCrntUse() { crntUseCnt_++; } -void Register::DelCrntUse() { crntUseCnt_--; } +void llvm::opt_sched::Register::DelCrntUse() { crntUseCnt_--; } -void Register::ResetCrntLngth() { crntLngth_ = 0; } +void llvm::opt_sched::Register::ResetCrntLngth() { crntLngth_ = 0; } -int Register::GetCrntLngth() const { return crntLngth_; } +int llvm::opt_sched::Register::GetCrntLngth() const { return crntLngth_; } -void Register::IncrmntCrntLngth() { crntLngth_++; } +void llvm::opt_sched::Register::IncrmntCrntLngth() { crntLngth_++; } -void Register::DcrmntCrntLngth() { crntLngth_--; } +void llvm::opt_sched::Register::DcrmntCrntLngth() { crntLngth_--; } -Register &Register::operator=(const Register &rhs) { +llvm::opt_sched::Register &llvm::opt_sched::Register:: +operator=(const llvm::opt_sched::Register &rhs) { if (this != &rhs) { num_ = rhs.num_; type_ = rhs.type_; @@ -83,54 +100,65 @@ Register &Register::operator=(const Register &rhs) { return *this; } -void Register::SetupConflicts(int regCnt) { conflicts_.Construct(regCnt); } +void llvm::opt_sched::Register::SetupConflicts(int regCnt) { + conflicts_.Construct(regCnt); +} -void Register::ResetConflicts() { +void llvm::opt_sched::Register::ResetConflicts() { conflicts_.Reset(); isSpillCnddt_ = false; } -void Register::AddConflict(int regNum, bool isSpillCnddt) { +void llvm::opt_sched::Register::AddConflict(int regNum, bool isSpillCnddt) { assert(regNum != num_); assert(regNum 
>= 0); conflicts_.SetBit(regNum, true); isSpillCnddt_ = isSpillCnddt_ || isSpillCnddt; } -int Register::GetConflictCnt() const { return conflicts_.GetOneCnt(); } +int llvm::opt_sched::Register::GetConflictCnt() const { + return conflicts_.GetOneCnt(); +} -bool Register::IsSpillCandidate() const { return isSpillCnddt_; } +bool llvm::opt_sched::Register::IsSpillCandidate() const { + return isSpillCnddt_; +} -bool Register::AddToInterval(const SchedInstruction *inst) { +bool llvm::opt_sched::Register::AddToInterval(const SchedInstruction *inst) { return liveIntervalSet_.insert(inst).second; } -bool Register::IsInInterval(const SchedInstruction *inst) const { +bool llvm::opt_sched::Register::IsInInterval( + const SchedInstruction *inst) const { return liveIntervalSet_.count(inst) != 0; } -const Register::InstSetType &Register::GetLiveInterval() const { +const llvm::opt_sched::Register::InstSetType & +llvm::opt_sched::Register::GetLiveInterval() const { return liveIntervalSet_; } -bool Register::AddToPossibleInterval(const SchedInstruction *inst) { +bool llvm::opt_sched::Register::AddToPossibleInterval( + const SchedInstruction *inst) { return possibleLiveIntervalSet_.insert(inst).second; } -bool Register::IsInPossibleInterval(const SchedInstruction *inst) const { +bool llvm::opt_sched::Register::IsInPossibleInterval( + const SchedInstruction *inst) const { return possibleLiveIntervalSet_.count(inst) != 0; } -const Register::InstSetType &Register::GetPossibleLiveInterval() const { +const llvm::opt_sched::Register::InstSetType & +llvm::opt_sched::Register::GetPossibleLiveInterval() const { return possibleLiveIntervalSet_; } -void Register::resetLiveInterval() { +void llvm::opt_sched::Register::resetLiveInterval() { liveIntervalSet_.clear(); possibleLiveIntervalSet_.clear(); } -Register::Register(int16_t type, int num, int physicalNumber) { +llvm::opt_sched::Register::Register(int16_t type, int num, int physicalNumber) { type_ = type; num_ = num; wght_ = 1; @@ -168,9 
+196,9 @@ void RegisterFile::ResetCrntLngths() { } } -Register *RegisterFile::getNext() { +llvm::opt_sched::Register *RegisterFile::getNext() { size_t RegNum = Regs.size(); - auto Reg = llvm::make_unique(); + auto Reg = std::make_unique(); Reg->SetType(regType_); Reg->SetNum(RegNum); Regs.push_back(std::move(Reg)); @@ -183,14 +211,14 @@ void RegisterFile::SetRegCnt(int regCnt) { Regs.resize(regCnt); for (int i = 0; i < getCount(); i++) { - auto Reg = llvm::make_unique(); + auto Reg = std::make_unique(); Reg->SetType(regType_); Reg->SetNum(i); Regs[i] = std::move(Reg); } } -Register *RegisterFile::GetReg(int num) const { +llvm::opt_sched::Register *RegisterFile::GetReg(int num) const { if (num >= 0 && num < getCount()) { return Regs[num].get(); } else { @@ -198,7 +226,7 @@ Register *RegisterFile::GetReg(int num) const { } } -Register *RegisterFile::FindLiveReg(int physNum) const { +llvm::opt_sched::Register *RegisterFile::FindLiveReg(int physNum) const { for (int i = 0; i < getCount(); i++) { if (Regs[i]->GetPhysicalNumber() == physNum && Regs[i]->IsLive() == true) return Regs[i].get(); diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp index 5ea6aff7..09bba871 100644 --- a/lib/Scheduler/sched_basic_data.cpp +++ b/lib/Scheduler/sched_basic_data.cpp @@ -2,6 +2,7 @@ #include "opt-sched/Scheduler/register.h" #include "opt-sched/Scheduler/stats.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/Support/ErrorHandling.h" #include @@ -34,8 +35,10 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name, InstCount maxInstCnt, int nodeID, InstCount fileSchedOrder, InstCount fileSchedCycle, InstCount fileLB, - InstCount fileUB, MachineModel *model) + InstCount fileUB, MachineModel *model, + const SUnit *SU) : GraphNode(num, maxInstCnt) { + SU_ = SU; // Static data that is computed only once. 
name_ = name; opCode_ = opCode; @@ -281,27 +284,22 @@ bool SchedInstruction::ApplyPreFxng(LinkedList *tightndLst, void SchedInstruction::AddDef(Register *reg) { if (defCnt_ >= MAX_DEFS_PER_INSTR) { - llvm::report_fatal_error("An instruction can't have more than " + - std::to_string(MAX_DEFS_PER_INSTR) + " defs", - false); - } - // Logger::Info("Inst %d defines reg %d of type %d and physNum %d and useCnt - // %d", - // num_, reg->GetNum(), reg->GetType(), reg->GetPhysicalNumber(), - // reg->GetUseCnt()); + llvm::report_fatal_error( + llvm::StringRef("An instruction can't have more than " + + std::to_string(MAX_DEFS_PER_INSTR) + " defs"), + false); + } assert(reg != NULL); defs_[defCnt_++] = reg; } void SchedInstruction::AddUse(Register *reg) { if (useCnt_ >= MAX_USES_PER_INSTR) { - llvm::report_fatal_error("An instruction can't have more than " + - std::to_string(MAX_USES_PER_INSTR) + " uses", - false); + llvm::report_fatal_error( + llvm::StringRef("An instruction can't have more than " + + std::to_string(MAX_USES_PER_INSTR) + " uses"), + false); } - // Logger::Info("Inst %d uses reg %d of type %d and physNum %d and useCnt %d", - // num_, reg->GetNum(), reg->GetType(), reg->GetPhysicalNumber(), - // reg->GetUseCnt()); assert(reg != NULL); uses_[useCnt_++] = reg; } diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp index 4e4595b2..0b67676e 100644 --- a/lib/Scheduler/sched_region.cpp +++ b/lib/Scheduler/sched_region.cpp @@ -43,7 +43,8 @@ static std::string ComputeDDGDumpPath() { // Force the user to set DDG_DUMP_PATH if (Path.empty()) llvm::report_fatal_error( - "DDG_DUMP_PATH must be set if trying to DUMP_DDGS.", false); + llvm::StringRef("DDG_DUMP_PATH must be set if trying to DUMP_DDGS."), + false); // Do some niceness to the input path to produce the actual path. 
llvm::SmallString<32> FixedPath; @@ -51,14 +52,16 @@ static std::string ComputeDDGDumpPath() { fs::real_path(Path, FixedPath, /* expand_tilde = */ true); if (ec) llvm::report_fatal_error( - "Unable to expand DDG_DUMP_PATH. " + ec.message(), false); + llvm::StringRef("Unable to expand DDG_DUMP_PATH. " + ec.message()), + false); Path.assign(FixedPath.begin(), FixedPath.end()); // The path must be a directory, and it must exist. if (!fs::is_directory(Path)) llvm::report_fatal_error( - "DDG_DUMP_PATH is set to a non-existent directory or non-directory " + - Path, + llvm::StringRef("DDG_DUMP_PATH is set to a non-existent directory or " + "non-directory " + + Path), false); // Force the path to be considered a directory. @@ -149,7 +152,7 @@ static bool isBbEnabled(Config &schedIni, Milliseconds rgnTimeout) { static void dumpDDG(DataDepGraph *DDG, llvm::StringRef DDGDumpPath, llvm::StringRef Suffix = "") { - std::string Path = DDGDumpPath; + std::string Path = DDGDumpPath.data(); Path += DDG->GetDagID(); if (!Suffix.empty()) { @@ -233,7 +236,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( if (!HeuristicSchedulerEnabled && !AcoBeforeEnum) { // Abort if ACO and heuristic algorithms are disabled. 
llvm::report_fatal_error( - "Heuristic list scheduler or ACO must be enabled before enumerator.", + llvm::StringRef("Heuristic list scheduler or ACO must be enabled " + "before enumerator."), false); return RES_ERROR; } @@ -295,7 +299,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( rslt = lstSchdulr->FindSchedule(lstSched, this); if (rslt != RES_SUCCESS) { - llvm::report_fatal_error("List scheduling failed", false); + llvm::report_fatal_error(llvm::StringRef("List scheduling failed"), + false); delete lstSchdulr; delete lstSched; return rslt; @@ -444,7 +449,7 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( rslt = runACO(AcoSchedule, lstSched, false); if (rslt != RES_SUCCESS) { - llvm::report_fatal_error("ACO scheduling failed", false); + llvm::report_fatal_error(llvm::StringRef("ACO scheduling failed"), false); if (lstSchdulr) delete lstSchdulr; if (lstSched) @@ -803,7 +808,9 @@ FUNC_RESULT SchedRegion::Optimize_(Milliseconds startTime, enumBestSched_ = AllocNewSched_(); InstCount initCost = bestCost_; - enumrtr = AllocEnumrtr_(lngthTimeout); + + Milliseconds timeout = instTimeout_ ? 
lngthTimeout : rgnTimeout; + enumrtr = AllocEnumrtr_(timeout, TimeoutPerMemblock_); rslt = Enumerate_(startTime, rgnTimeout, lngthTimeout); Milliseconds solutionTime = Utilities::GetProcessorTime() - startTime; diff --git a/lib/Scheduler/stats.cpp b/lib/Scheduler/stats.cpp index 71dba612..7260d1c4 100644 --- a/lib/Scheduler/stats.cpp +++ b/lib/Scheduler/stats.cpp @@ -217,6 +217,7 @@ IntStat IntStat slotCountInfeasibilityHits("Slot count infeasibility hits"); IntStat forwardLBInfeasibilityHits("Forward LB infeasibility hits"); IntStat backwardLBInfeasibilityHits("Backward LB infeasibility hits"); +IntStat costInfeasibilityHits("Cost infeasibility hits"); IntStat invalidSchedules("Invalid schedules"); diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp index 7c3d2993..4fae3193 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp +++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp @@ -5,6 +5,7 @@ //===----------------------------------------------------------------------===// #include "GCNOptSched.h" +#include "AMDGPUExportClustering.h" #include "AMDGPUMacroFusion.h" #include "GCNSchedStrategy.h" #include "SIMachineFunctionInfo.h" @@ -20,19 +21,6 @@ static cl::opt cl::desc("Limit occpancy target using perf hints."), cl::init(false), cl::Hidden); -static ScheduleDAGInstrs *createOptSchedGCN(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = new ScheduleDAGOptSchedGCN( - C, llvm::make_unique(C)); - DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - return DAG; -} - -// Register the machine scheduler. 
-static MachineSchedRegistry - OptSchedMIRegistry("gcn-optsched", "Use the GCN OptSched scheduler.", - createOptSchedGCN); - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) static void getRealRegionPressure(MachineBasicBlock::const_iterator Begin, MachineBasicBlock::const_iterator End, diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h index ff473bd3..8a7db9e9 100644 --- a/lib/Wrapper/AMDGPU/GCNOptSched.h +++ b/lib/Wrapper/AMDGPU/GCNOptSched.h @@ -7,8 +7,8 @@ #ifndef LLVM_GCN_OPT_SCHED_H #define LLVM_GCN_OPT_SCHED_H -#include "../OptimizingScheduler.h" #include "GCNRegPressure.h" +#include "Wrapper/OptimizingScheduler.h" namespace llvm { namespace opt_sched { diff --git a/lib/Wrapper/AMDGPU/GCNOptSchedReg.h b/lib/Wrapper/AMDGPU/GCNOptSchedReg.h new file mode 100644 index 00000000..901c53c3 --- /dev/null +++ b/lib/Wrapper/AMDGPU/GCNOptSchedReg.h @@ -0,0 +1,30 @@ +#ifndef OPT_SCHED_REG +#define OPT_SCHED_REG + +#include "Wrapper/AMDGPU/GCNOptSched.h" +#include "Wrapper/AMDGPU/OptSchedGCNTarget.cpp" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace llvm { +namespace opt_sched { + +// Create OptSched ScheduleDAG. 
+static ScheduleDAGInstrs *createOptSchedGCN(MachineSchedContext *C) { + ScheduleDAGMILive *DAG = new ScheduleDAGOptSchedGCN( + C, std::make_unique(C)); + DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + return DAG; +} + +static MachineSchedRegistry + OptSchedGCNMIRegistry("gcn-optsched", "Use the GCN OptSched scheduler.", + createOptSchedGCN); + +} // namespace opt_sched +} // namespace llvm + +#endif diff --git a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp index 6f18a644..d056ab8e 100644 --- a/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedDDGWrapperGCN.cpp @@ -9,8 +9,11 @@ #include "SIRegisterInfo.h" #include "opt-sched/Scheduler/register.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include #define DEBUG_TYPE "optsched-ddg-wrapper" @@ -29,15 +32,15 @@ namespace { std::unique_ptr createSubRegSet(unsigned Reg, const MachineRegisterInfo &MRI, int16_t Type) { - return llvm::make_unique( - MRI.getMaxLaneMaskForVReg(Reg).getNumLanes(), Type); + unsigned numSubRegs = + SIRegisterInfo::getNumCoveredRegs(MRI.getMaxLaneMaskForVReg(Reg)); + return std::make_unique(numSubRegs, Type); } // Copied from Target/AMDGPU/GCNRegPressure.cpp LaneBitmask getDefRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI) { - assert(MO.isDef() && MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())); + assert(MO.isDef() && MO.isReg() && MO.getReg().isVirtual()); // We don't rely on read-undef flag because in case of tentative schedule // tracking it isn't set correctly yet. 
This works correctly however since @@ -52,8 +55,7 @@ LaneBitmask getDefRegMask(const MachineOperand &MO, LaneBitmask getUsedRegMask(const MachineOperand &MO, const MachineRegisterInfo &MRI, const LiveIntervals &LIS) { - assert(MO.isUse() && MO.isReg() && - TargetRegisterInfo::isVirtualRegister(MO.getReg())); + assert(MO.isUse() && MO.isReg() && MO.getReg().isVirtual()); if (auto SubReg = MO.getSubReg()) return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg); @@ -74,8 +76,26 @@ SmallVector collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { SmallVector Res; - for (const auto &MO : MI.operands()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) + for (ConstMIBundleOperands MIO(MI); MIO.isValid(); ++MIO) { + const MachineOperand MO = *MIO; +#ifdef DEBUG_REG + Logger::Info("processing Op"); + MO.print(errs()); + errs() << "\n"; + + if (!MO.isReg()) { + Logger::Info("Is Not Reg"); + continue; + } + if (!MO.getReg().isVirtual()) + Logger::Info("Is Not VirtReg"); + if (!MO.isUse()) + Logger::Info("Is Not Use"); + if (!MO.readsReg()) + Logger::Info("Is Not Reads Reg"); +#endif + + if (!MO.isReg() || !MO.getReg().isVirtual()) continue; if (!MO.isUse() || !MO.readsReg()) continue; @@ -83,6 +103,14 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, const auto UsedMask = getUsedRegMask(MO, MRI, LIS); auto Reg = MO.getReg(); + +#ifdef DEBUG_REG + Logger::Info("found use"); + Logger::Info("has Reg %u", Reg.id()); + auto maskPrint = PrintLaneMask(UsedMask); + errs() << maskPrint; + errs() << "\n"; +#endif auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) { return RM.RegUnit == Reg; @@ -97,16 +125,45 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, SmallVector collectVirtualRegDefs(const MachineInstr &MI, const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { + const MachineRegisterInfo &MRI, + const 
ScheduleDAGOptSched *DAG) { SmallVector Res; - for (const auto &MO : MI.defs()) { - if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) || - MO.isDead()) + + for (ConstMIBundleOperands MIO(MI); MIO.isValid(); ++MIO) { + const MachineOperand MO = *MIO; +#ifdef DEBUG_REG + Logger::Info("Processing Op"); + MO.print(errs()); + errs() << "\n"; + + if (!MO.isReg()) { + Logger::Info("Is Not Reg"); + continue; + } + if (!MO.getReg().isVirtual()) + Logger::Info("Is Not VirtReg"); + if (!MO.isDef()) + Logger::Info("Is Not Def"); + if (MO.isDead()) + Logger::Info("Is Dead"); +#endif + + if (!MO.isReg() || !MO.getReg().isVirtual() || MO.isDead() || !MO.isDef()) { continue; + } const auto DefMask = getDefRegMask(MO, MRI); auto Reg = MO.getReg(); + +#ifdef DEBUG_REG + Logger::Info("found def"); + Logger::Info("has Reg %u", Reg.id()); + auto maskPrint = PrintLaneMask(DefMask); + errs() << maskPrint; + errs() << "\n"; +#endif + auto I = std::find_if(Res.begin(), Res.end(), [Reg](const RegisterMaskPair &RM) { return RM.RegUnit == Reg; @@ -127,12 +184,14 @@ collectLiveSubRegsAtInstr(const MachineInstr *MI, const LiveIntervals *LIS, SmallVector Res; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - auto Reg = TargetRegisterInfo::index2VirtReg(I); + auto Reg = llvm::Register::index2VirtReg(I); if (!LIS->hasInterval(Reg)) continue; + auto LiveMask = getLiveLaneMask(Reg, SI, *LIS, MRI); - if (LiveMask.any()) + if (LiveMask.any()) { Res.emplace_back(Reg, LiveMask); + } } return Res; } @@ -140,9 +199,11 @@ collectLiveSubRegsAtInstr(const MachineInstr *MI, const LiveIntervals *LIS, } // end anonymous namespace unsigned OptSchedDDGWrapperGCN::getRegKind(unsigned Reg) const { - assert(TargetRegisterInfo::isVirtualRegister(Reg)); + assert(llvm::Register::isVirtualRegister(Reg)); const auto RC = MRI.getRegClass(Reg); auto STI = static_cast(MRI.getTargetRegisterInfo()); + if (STI->isAGPRClass(RC)) + Logger::Info("FOUND AGPR!"); return STI->isSGPRClass(RC) ? 
SGPR32 : VGPR32; } @@ -157,11 +218,16 @@ void OptSchedDDGWrapperGCN::convertRegFiles() { for (const auto &SU : SUnits) { const MachineInstr *MI = SU.getInstr(); +#ifdef DEBUG_REG + Logger::Info("Parsing Inst"); + MI->print(errs()); +#endif + for (const auto &MaskPair : collectVirtualRegUses(*MI, *LIS, MRI)) addSubRegUses(GetInstByIndx(SU.NodeNum), MaskPair.RegUnit, MaskPair.LaneMask); - for (const auto &MaskPair : collectVirtualRegDefs(*MI, *LIS, MRI)) + for (const auto &MaskPair : collectVirtualRegDefs(*MI, *LIS, MRI, DAG)) addSubRegDefs(GetInstByIndx(SU.NodeNum), MaskPair.RegUnit, MaskPair.LaneMask); } @@ -188,16 +254,29 @@ void OptSchedDDGWrapperGCN::convertRegFiles() { void OptSchedDDGWrapperGCN::addSubRegDefs(SchedInstruction *Instr, unsigned Reg, const LaneBitmask &LiveMask, bool LiveIn) { - if (RegionRegs[Reg] == nullptr) + if (RegionRegs[Reg] == nullptr) { RegionRegs[Reg] = createSubRegSet(Reg, MRI, getRegKind(Reg)); + } SubRegSet &SubRegs = *RegionRegs[Reg].get(); RegisterFile &RF = RegFiles[SubRegs.Type]; unsigned Lane = 0; +#ifdef DEBUG_REG + Logger::Info("Processing LLVM Reg %u", Reg); + auto Temp = Reg; +#endif for (auto &ResNo : SubRegs) { - if ((LiveMask.getLane(Lane) & LiveMask).any()) { + if ((LiveMask.getLane(Lane) & LiveMask).any() || + (LiveMask.getLane(Lane + 1) & LiveMask).any()) { + Register *Reg = RF.getNext(); ResNo = Reg->GetNum(); +#ifdef DEBUG_REG + Logger::Info("maps to OptSched Reg %d", Reg->GetNum()); + Logger::Info( + "Adding def for subreg of reg %u (optsched vreg %d, type = %d)", Temp, + ResNo, Reg->GetType()); +#endif Instr->AddDef(Reg); // Weight should always be one since we are only tracking VGPR32 and // SGPR32 @@ -205,23 +284,46 @@ void OptSchedDDGWrapperGCN::addSubRegDefs(SchedInstruction *Instr, unsigned Reg, Reg->AddDef(Instr); Reg->SetIsLiveIn(LiveIn); } - Lane++; + if ((LiveMask.getLane(Lane) & LiveMask).any() != + (LiveMask.getLane(Lane + 1) & LiveMask).any()) { + Logger::Info("found lane mismatch"); + } + Lane += 2; } 
} void OptSchedDDGWrapperGCN::addSubRegUses(SchedInstruction *Instr, unsigned Reg, const LaneBitmask &LiveMask, bool LiveOut) { - SubRegSet &SubRegs = *RegionRegs[Reg].get(); + auto temp = RegionRegs[Reg].get(); + if (temp == nullptr) + DAG->MF.print(errs()); + SubRegSet &SubRegs = *temp; RegisterFile &RF = RegFiles[SubRegs.Type]; unsigned Lane = 0; +#ifdef DEBUG_REG + Logger::Info("Processing LLVM Reg %u", Reg); + auto Temp = Reg; +#endif for (auto &ResNo : SubRegs) { - if ((LiveMask.getLane(Lane) & LiveMask).any()) { + if ((LiveMask.getLane(Lane) & LiveMask).any() || + (LiveMask.getLane(Lane + 1) & LiveMask).any()) { Register *Reg = RF.GetReg(ResNo); +#ifdef DEBUG_REG + Logger::Info("maps to OptSched Reg %d", Reg->GetNum()); + Logger::Info( + "Adding use for subreg of reg %u (optsched vreg %d, type = %d)", Temp, + ResNo, Reg->GetType()); +#endif Instr->AddUse(Reg); Reg->AddUse(Instr); Reg->SetIsLiveOut(LiveOut); } - Lane++; + if ((LiveMask.getLane(Lane) & LiveMask).any() != + (LiveMask.getLane(Lane + 1) & LiveMask).any()) { + Logger::Info("found lane mismatch"); + } + + Lane += 2; } } diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp index aa6be72d..e7b82972 100644 --- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp +++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp @@ -7,6 +7,7 @@ #include "SIMachineFunctionInfo.h" #include "Wrapper/OptSchedMachineWrapper.h" #include "opt-sched/Scheduler/OptSchedTarget.h" +#include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/machine_model.h" @@ -23,6 +24,7 @@ using namespace llvm::opt_sched; // This is necessary because we cannot perfectly predict the number of registers // of each type that will be allocated. 
static const unsigned GPRErrorMargin = 0; +static const unsigned OCCUnlimited = 10; #ifndef NDEBUG static unsigned getOccupancyWeight(unsigned Occupancy) { @@ -66,18 +68,19 @@ class OptSchedGCNTarget : public OptSchedTarget { public: std::unique_ptr createMachineModel(const char *ConfigPath) override { - return llvm::make_unique(ConfigPath); + return std::make_unique(ConfigPath); } std::unique_ptr createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG, OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, const std::string &RegionID) override { - return llvm::make_unique(Context, DAG, MM, - LatencyPrecision, RegionID); + return std::make_unique(Context, DAG, MM, + LatencyPrecision, RegionID); } - void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override; + void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_, + Config &OccFile) override; void finalizeRegion(const InstSchedule *Schedule) override; @@ -90,6 +93,18 @@ class OptSchedGCNTarget : public OptSchedTarget { // Revert scheduing if we decrease occupancy. 
bool shouldKeepSchedule() override; + void SetOccupancyLimit(int OccupancyLimitParam) override { + OccupancyLimit = OccupancyLimitParam; + } + void SetShouldLimitOcc(bool ShouldLimitOccParam) override { + ShouldLimitOcc = ShouldLimitOccParam; + } + void SetOccLimitSource(OCC_LIMIT_TYPE LimitTypeParam) override { + LimitType = LimitTypeParam; + } + + int getOccupancyLimit(Config &OccFile) const; + private: const llvm::MachineFunction *MF; SIMachineFunctionInfo *MFI; @@ -100,20 +115,26 @@ class OptSchedGCNTarget : public OptSchedTarget { unsigned RegionEndingOccupancy; unsigned TargetOccupancy; + // Limiting occupancy has shown to greatly increase the performance of some + // kernels + int OccupancyLimit; + bool ShouldLimitOcc; + OCC_LIMIT_TYPE LimitType; + // Max occupancy with local memory size; unsigned MaxOccLDS; // In RP only (max occupancy) scheduling mode we should try to find // a min-RP schedule without considering perf hints which suggest limiting // occupancy. Returns true if we should consider perf hints. - bool shouldLimitWaves() const; + bool shouldLimitWaves(llvm::SIMachineFunctionInfo *MFI) const; // Find occupancy with spill cost. 
unsigned getOccupancyWithCost(const InstCount Cost) const; }; std::unique_ptr createOptSchedGCNTarget() { - return llvm::make_unique(); + return std::make_unique(); } } // end anonymous namespace @@ -144,7 +165,7 @@ void OptSchedGCNTarget::dumpOccupancyInfo(const InstSchedule *Schedule) const { #endif void OptSchedGCNTarget::initRegion(llvm::ScheduleDAGInstrs *DAG_, - MachineModel *MM_) { + MachineModel *MM_, Config &OccFile) { DAG = static_cast(DAG_); MF = &DAG->MF; MFI = @@ -156,25 +177,68 @@ void OptSchedGCNTarget::initRegion(llvm::ScheduleDAGInstrs *DAG_, GCNDownwardRPTracker RPTracker(*DAG->getLIS()); RPTracker.advance(DAG->begin(), DAG->end(), nullptr); const GCNRegPressure &P = RPTracker.moveMaxPressure(); - RegionStartingOccupancy = - getAdjustedOccupancy(ST, P.getVGPRNum(), P.getSGPRNum(), MaxOccLDS); + RegionStartingOccupancy = getAdjustedOccupancy( + ST, P.getVGPRNum(ST->hasGFX90AInsts()), P.getSGPRNum(), MaxOccLDS); + TargetOccupancy = - shouldLimitWaves() ? MFI->getMinAllowedOccupancy() : MFI->getOccupancy(); + shouldLimitWaves(MFI) ? getOccupancyLimit(OccFile) : MFI->getOccupancy(); + + // Do not attempt to hit a higher occupancy if we are limited by another + // region + if (TargetOccupancy > MFI->getOccupancy()) + TargetOccupancy = MFI->getOccupancy(); + + Logger::Event("TargetOccupancy", "RegionStarting", RegionStartingOccupancy, + "Target", TargetOccupancy); - Logger::Event("TargetOccupancy", "region", RegionStartingOccupancy, "target", - TargetOccupancy); LLVM_DEBUG(dbgs() << "Region starting occupancy is " << RegionStartingOccupancy << "\n" << "Target occupancy is " << TargetOccupancy << "\n"); } -bool OptSchedGCNTarget::shouldLimitWaves() const { +bool OptSchedGCNTarget::shouldLimitWaves( + llvm::SIMachineFunctionInfo *MFI) const { // FIXME: Consider machine model here as well. // FIXME: Return false because perf hints are not currently strong enough to // use as a hard cap. Consider 'OccupancyWeight' heuristic here instead. 
+ // TODO(Jeff): Limiting occupancy has shown to have a huge impact on + // performance. Good heuristics will likely be largely beneficial + + if (ShouldLimitOcc) { + switch (LimitType) { + case OLT_NONE: + return false; + case OLT_HEUR: + return MFI->isMemoryBound() || MFI->needsWaveLimiter(); + case OLT_FILE: + return true; + } + } + return false; } +int OptSchedGCNTarget::getOccupancyLimit(Config &OccFile) const { + switch (LimitType) { + case OLT_NONE: + return OCCUnlimited; + case OLT_HEUR: + return MFI->isMemoryBound() || MFI->needsWaveLimiter() ? 4 : OCCUnlimited; + case OLT_FILE: + std::string functionName = MF->getFunction().getName().data(); + int limit = OccFile.GetInt(functionName, -1); + int AMDHeur = + MFI->isMemoryBound() || MFI->needsWaveLimiter() ? 4 : OCCUnlimited; + if (limit != -1) { + Logger::Event("OccupancyLimits", "File", limit, "AMDHeur", AMDHeur); + } + if (limit == -1) { + limit = OCCUnlimited; + } + return limit; + } +} + unsigned OptSchedGCNTarget::getOccupancyWithCost(const InstCount Cost) const { return TargetOccupancy - Cost; } @@ -224,5 +288,8 @@ namespace opt_sched { OptSchedTargetRegistry OptSchedGCNTargetRegistry("amdgcn", createOptSchedGCNTarget); +OptSchedTargetRegistry OptSchedGCNHSATargetRegistry("amdgcn-amd-amdhsa", + createOptSchedGCNTarget); + } // namespace opt_sched } // namespace llvm diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp index d90ec76e..9e9f0fb3 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include #include @@ -35,7 +36,7 @@ using namespace llvm; using namespace llvm::opt_sched; #ifndef NDEBUG -static Printable printOptSchedReg(const Register *Reg, +static Printable printOptSchedReg(const 
llvm::opt_sched::Register *Reg, const std::string &RegTypeName, int16_t RegTypeNum); #endif @@ -325,7 +326,7 @@ OptSchedDDGWrapperBasic::getRegisterType(unsigned RegUnit) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -static Printable printOptSchedReg(const Register *Reg, +static Printable printOptSchedReg(const llvm::opt_sched::Register *Reg, const std::string &RegTypeName, int16_t RegTypeNum) { return Printable([Reg, &RegTypeName, RegTypeNum](raw_ostream &OS) { @@ -383,7 +384,8 @@ inline void OptSchedDDGWrapperBasic::setupRoot() { RootNum, // fileSchedCycle 0, // fileInstLwrBound 0, // fileInstUprBound - 0); // blkNum + 0, // blkNum + nullptr); // Add edges between root nodes in graph and optsched artificial root. for (size_t i = 0; i < DAG->SUnits.size(); i++) @@ -401,7 +403,8 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() { LeafNum, // fileSchedCycle 0, // fileInstLwrBound 0, // fileInstUprBound - 0); // blkNum + 0, // blkNum + nullptr); // Add edges between leaf nodes in graph and optsched artificial leaf. 
for (size_t i = 0; i < DAG->SUnits.size(); i++) @@ -420,35 +423,63 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, bool IgnoreArtificialEdges) { const MachineInstr *instr = SU.getInstr(); SUnit::const_succ_iterator I, E; +#ifdef PRINT_EDGE + if (!IgnoreRealEdges) { + Logger::Info("\n\n"); + Logger::Info("Scanning dependencies for inst (%d total, inst has %d preds)", + SU.Succs.size(), SU.Preds.size()); + SU.getInstr()->print(errs()); + } +#endif + for (I = SU.Succs.begin(), E = SU.Succs.end(); I != E; ++I) { if (I->getSUnit()->isBoundaryNode()) continue; bool IsArtificial = I->isArtificial() || I->isCluster(); - if (IgnoreArtificialEdges && IsArtificial) + if (IgnoreArtificialEdges && IsArtificial) { continue; - else if (IgnoreRealEdges && !IsArtificial) + } else if (IgnoreRealEdges && !IsArtificial) { continue; + } DependenceType DepType; +#ifdef PRINT_EDGE + Logger::Info("Found dependency between"); + SU.getInstr()->print(errs()); + Logger::Info("And"); + I->getSUnit()->getInstr()->print(errs()); +#endif switch (I->getKind()) { case SDep::Data: DepType = DEP_DATA; +#ifdef PRINT_EDGE + Logger::Info("Data dep on %u", I->getReg()); +#endif break; case SDep::Anti: DepType = DEP_ANTI; +#ifdef PRINT_EDGE + Logger::Info("Anti dep on %u", I->getReg()); +#endif break; case SDep::Output: DepType = DEP_OUTPUT; +#ifdef PRINT_EDGE + Logger::Info("Output dep on %u", I->getReg()); +#endif break; case SDep::Order: DepType = TreatOrderDepsAsDataDeps ? 
DEP_DATA : DEP_OTHER; +#ifdef PRINT_EDGE + Logger::Info("Order dep"); +#endif break; } int16_t Latency; if (ltncyPrcsn_ == LTP_PRECISE) { // get latency from the machine model - const auto &InstName = DAG->TII->getName(instr->getOpcode()); + const auto &InstName = DAG->TII->getName(instr->getOpcode()).data(); const auto &InstType = MM->GetInstTypeByName(InstName); Latency = MM->GetLatency(InstType, DepType); } else if (ltncyPrcsn_ == LTP_ROUGH) { // rough latency = llvm latency @@ -457,9 +488,11 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, // by the specified divisor if (DAG->reducedLatencyPassStarted() && Latency > DAG->getLatencyTarget()) { - const string &InstFromName = DAG->TII->getName(instr->getOpcode()); + const string &InstFromName = + DAG->TII->getName(instr->getOpcode()).data(); const MachineInstr *ToInstr = I->getSUnit()->getInstr(); - const string &InstToName = DAG->TII->getName(ToInstr->getOpcode()); + const string &InstToName = + DAG->TII->getName(ToInstr->getOpcode()).data(); int16_t OldLatency = Latency; Latency /= DAG->getLatencyDivisor(); if (Latency < DAG->getLatencyMinimun()) @@ -472,6 +505,10 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, } else Latency = 1; // unit latency = ignore ilp +#ifdef PRINT_EDGE + Logger::Info("Has latency %d", Latency); +#endif + CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType, IsArtificial); } @@ -480,11 +517,13 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU, void OptSchedDDGWrapperBasic::convertSUnit(const SUnit &SU) { InstType InstType; std::string InstName; - if (SU.isBoundaryNode() || !SU.isInstr()) + + if (SU.isBoundaryNode() || !SU.isInstr()) { return; + } const MachineInstr *MI = SU.getInstr(); - InstName = DAG->TII->getName(MI->getOpcode()); + InstName = DAG->TII->getName(MI->getOpcode()).data(); // Search in the machine model for an instType with this OpCode name InstType = MM->GetInstTypeByName(InstName.c_str()); @@ -505,7 +544,8 @@ void 
OptSchedDDGWrapperBasic::convertSUnit(const SUnit &SU) { SU.NodeNum, // fileSchedCycle 0, // fileInstLwrBound 0, // fileInstUprBound - 0); // blkNum + 0, // blkNum + &SU); } void OptSchedDDGWrapperBasic::discoverBoundaryLiveness(const MachineInstr *MI) { diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h index cb105bdc..5f829ffa 100644 --- a/lib/Wrapper/OptSchedDDGWrapperBasic.h +++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h @@ -11,6 +11,7 @@ #include "OptimizingScheduler.h" #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/graph_trans.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/TargetRegisterInfo.h" diff --git a/lib/Wrapper/OptSchedGenericTarget.cpp b/lib/Wrapper/OptSchedGenericTarget.cpp index 3299d1b1..ba16d21f 100644 --- a/lib/Wrapper/OptSchedGenericTarget.cpp +++ b/lib/Wrapper/OptSchedGenericTarget.cpp @@ -6,6 +6,7 @@ #include "OptSchedDDGWrapperBasic.h" #include "OptSchedMachineWrapper.h" #include "opt-sched/Scheduler/OptSchedTarget.h" +#include "opt-sched/Scheduler/config.h" #include "opt-sched/Scheduler/defines.h" #include "opt-sched/Scheduler/machine_model.h" #include "llvm/ADT/STLExtras.h" @@ -24,29 +25,41 @@ class OptSchedGenericTarget : public OptSchedTarget { public: std::unique_ptr createMachineModel(const char *ConfigPath) override { - return llvm::make_unique(ConfigPath); + return std::make_unique(ConfigPath); } std::unique_ptr createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG, OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision, const std::string &RegionID) override { - return llvm::make_unique( + return std::make_unique( Context, DAG, MM, LatencyPrecision, RegionID); } - void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override { + void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_, + Config &OccFile) override { MM =
MM_; } void finalizeRegion(const InstSchedule *Schedule) override {} // For generic target find total PRP. InstCount getCost(const llvm::SmallVectorImpl &PRP) const override; + + void SetOccupancyLimit(int OccupancyLimitParam) override { /*nothing*/ + ; + } + void SetShouldLimitOcc(bool ShouldLimitOccParam) override { /*nothing*/ + ; + } + void SetOccLimitSource(OCC_LIMIT_TYPE LimitTypeParam) override { /*nothing*/ + ; + } }; } // end anonymous namespace InstCount OptSchedGenericTarget::getCost( const llvm::SmallVectorImpl &PRP) const { + Logger::Info("in generic get cost"); InstCount TotalPRP = 0; for (int16_t T = 0; T < MM->GetRegTypeCnt(); ++T) TotalPRP += PRP[T]; @@ -57,7 +70,7 @@ namespace llvm { namespace opt_sched { std::unique_ptr createOptSchedGenericTarget() { - return llvm::make_unique(); + return std::make_unique(); } OptSchedTargetRegistry diff --git a/lib/Wrapper/OptSchedMachineWrapper.cpp b/lib/Wrapper/OptSchedMachineWrapper.cpp index c22b62af..8c34d70b 100644 --- a/lib/Wrapper/OptSchedMachineWrapper.cpp +++ b/lib/Wrapper/OptSchedMachineWrapper.cpp @@ -7,6 +7,7 @@ Description: A wrapper that convert an LLVM target to an OptSched MachineModel. 
#include "opt-sched/Scheduler/logger.h" #include "opt-sched/Scheduler/machine_model.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -45,13 +46,13 @@ void dumpInstType(InstTypeInfo &instType, MachineModel *mm) { std::unique_ptr createCortexA7MMGenerator(const llvm::ScheduleDAGInstrs *dag, MachineModel *mm) { - return make_unique(dag, mm); + return std::make_unique(dag, mm); } std::unique_ptr createCortexA53MMGenerator(const llvm::ScheduleDAGInstrs *dag, MachineModel *mm) { - return make_unique(dag, mm); + return std::make_unique(dag, mm); } } // end anonymous namespace @@ -183,7 +184,7 @@ IssueType CortexA7MMGenerator::generateIssueType(const InstrStage *E) const { InstType CortexA7MMGenerator::generateInstrType(const MachineInstr *instr) { // Search in the machine model for an instType with this OpCode - const std::string instrName = DAG->TII->getName(instr->getOpcode()); + const std::string instrName = DAG->TII->getName(instr->getOpcode()).data(); const InstType InstType = MM->GetInstTypeByName(instrName); // If the machine model does not have instType with this OpCode name, @@ -242,7 +243,7 @@ void CortexA53MMGenerator::generateProcessorData(std::string *mdlName_, InstType CortexA53MMGenerator::generateInstrType(const llvm::MachineInstr *instr) { // Search in the machine model for an instType with this OpCode - const std::string InstrName = DAG->TII->getName(instr->getOpcode()); + const std::string InstrName = DAG->TII->getName(instr->getOpcode()).data(); const InstType InstrType = MM->GetInstTypeByName(InstrName); // If the machine model does not have instType with this OpCode name, diff --git a/lib/Wrapper/OptSchedMachineWrapper.h b/lib/Wrapper/OptSchedMachineWrapper.h index 927bf163..48cf14b1 100644 --- a/lib/Wrapper/OptSchedMachineWrapper.h +++ b/lib/Wrapper/OptSchedMachineWrapper.h @@ -9,7 +9,6 @@ 
contained in those ini files. #define OPTSCHED_MACHINE_MODEL_WRAPPER_H #include "opt-sched/Scheduler/machine_model.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterClassInfo.h" diff --git a/lib/Wrapper/OptSchedReg.h b/lib/Wrapper/OptSchedReg.h new file mode 100644 index 00000000..378004d4 --- /dev/null +++ b/lib/Wrapper/OptSchedReg.h @@ -0,0 +1,30 @@ +#ifndef OPT_SCHED_REG +#define OPT_SCHED_REG + +#include "OptimizingScheduler.h" +#include "llvm/CodeGen/MachineScheduler.h" + +using namespace llvm; + +namespace llvm { +namespace opt_sched { + +// Create OptSched ScheduleDAG. +static ScheduleDAGInstrs *createOptSched(MachineSchedContext *C) { + ScheduleDAGMILive *DAG = + new ScheduleDAGOptSched(C, std::make_unique(C)); + DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); + // README: if you need the x86 mutations uncomment the next line. + // addMutation(createX86MacroFusionDAGMutation()); + // You also need to add the next line somewhere above this function + //#include "../../../../../llvm/lib/Target/X86/X86MacroFusion.h" + return DAG; +} + +// Register the machine scheduler. 
+static MachineSchedRegistry OptSchedMIRegistry("optsched", + "Use the OptSched scheduler.", + createOptSched); + +} // namespace opt_sched +} // namespace llvm \ No newline at end of file diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp index c6eea6b3..fbf38a95 100644 --- a/lib/Wrapper/OptimizingScheduler.cpp +++ b/lib/Wrapper/OptimizingScheduler.cpp @@ -20,6 +20,7 @@ #include "opt-sched/Scheduler/utilities.h" #include "llvm/ADT/StringMap.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/RegisterClassInfo.h" @@ -33,6 +34,7 @@ #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" #include #include #include @@ -66,22 +68,8 @@ static constexpr const char *DEFAULT_CFGHF_FNAME = "/hotfuncs.ini"; // Default path to the machine model specification file for opt-sched. static constexpr const char *DEFAULT_CFGMM_FNAME = "/machine_model.cfg"; -// Create OptSched ScheduleDAG. -static ScheduleDAGInstrs *createOptSched(MachineSchedContext *C) { - ScheduleDAGMILive *DAG = - new ScheduleDAGOptSched(C, llvm::make_unique(C)); - DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI)); - // README: if you need the x86 mutations uncomment the next line. - // addMutation(createX86MacroFusionDAGMutation()); - // You also need to add the next line somewhere above this function - //#include "../../../../../llvm/lib/Target/X86/X86MacroFusion.h" - return DAG; -} - -// Register the machine scheduler. -static MachineSchedRegistry OptSchedMIRegistry("optsched", - "Use the OptSched scheduler.", - createOptSched); +// Default path to the occupancy limits specification file for opt-sched. +static constexpr const char *DEFAULT_CFGOCL_FNAME = "/occupancy_limits.ini"; // Command line options for opt-sched.
static cl::opt OptSchedCfg( @@ -103,6 +91,10 @@ static cl::opt OptSchedCfgMM( "optsched-cfg-machine-model", cl::Hidden, cl::desc("Path to the machine model specification file for opt-sched.")); +static cl::opt OptSchedCfgOCL( + "optsched-cfg-occupancy-limits", cl::Hidden, + cl::desc("Path to the occupancy limits specification file for opt-sched.")); + static void getRealCfgPathCL(SmallString<128> &Path) { SmallString<128> Tmp = Path; auto EC = sys::fs::real_path(Tmp, Path, true); @@ -113,11 +105,12 @@ static void getRealCfgPathCL(SmallString<128> &Path) { static void reportCfgDirPathError(std::error_code EC, llvm::StringRef OptSchedCfg) { if (OptSchedCfg == DEFAULT_CFG_DIR) - llvm::report_fatal_error(EC.message() + - ": Error searching for the OptSched config " - "directory in the default location: " + - DEFAULT_CFG_DIR, - false); + llvm::report_fatal_error( + llvm::StringRef(EC.message() + + ": Error searching for the OptSched config " + "directory in the default location: " + + DEFAULT_CFG_DIR), + false); else llvm::report_fatal_error(EC.message() + ": " + OptSchedCfg, false); } @@ -174,12 +167,14 @@ static SchedulerType parseListSchedType() { return SCHED_STALLING_LIST; llvm::report_fatal_error( - "Unrecognized option for HEUR_SCHED_TYPE: " + SchedTypeString, false); + llvm::StringRef("Unrecognized option for HEUR_SCHED_TYPE: " + + SchedTypeString), + false); } static std::unique_ptr createStaticNodeSupTrans(DataDepGraph *DataDepGraph, bool IsMultiPass = false) { - return llvm::make_unique(DataDepGraph, IsMultiPass); + return std::make_unique(DataDepGraph, IsMultiPass); } void ScheduleDAGOptSched::addGraphTransformations( @@ -197,13 +192,13 @@ void ScheduleDAGOptSched::addGraphTransformations( if (ILPStaticNodeSup) { GraphTransformations->push_back( - llvm::make_unique(BDDG)); + std::make_unique(BDDG)); } if (OccupancyPreservingILPStaticNodeSup || (OccupancyPreservingILPStaticNodeSup2ndPass && SecondPass)) { GraphTransformations->push_back( - 
llvm::make_unique(BDDG)); + std::make_unique(BDDG)); } } @@ -223,6 +218,8 @@ ScheduleDAGOptSched::ScheduleDAGOptSched( // load hot functions ini file HotFunctions.Load(PathCfgHF.c_str()); + OccupancyLimits.Load(PathCfgOCL.c_str()); + // Load config files for the OptScheduler loadOptSchedConfig(); @@ -235,6 +232,14 @@ ScheduleDAGOptSched::ScheduleDAGOptSched( OptSchedTargetRegistry::Registry.getFactoryWithName("generic"); OST = TargetFactory(); + + if ((strncmp("amdgcn", ArchName.data(), 6) == 0) || + (strncmp("amdgcn-amd-amdhsa", ArchName.data(), 17) == 0)) { + OST->SetOccupancyLimit(OccupancyLimit); + OST->SetShouldLimitOcc(ShouldLimitOccupancy); + OST->SetOccLimitSource(OccupancyLimitSource); + } + MM = OST->createMachineModel(PathCfgMM.c_str()); MM->convertMachineModel(static_cast(*this), RegClassInfo); @@ -303,16 +308,28 @@ void ScheduleDAGOptSched::schedule() { return; } - if (!OptSchedEnabled || !scheduleSpecificRegion(RegionName, schedIni) || - NumRegionInstrs > MaxRegionInstrs) { +#ifdef PRINT_MIR + bool print = false; +#endif + + if (!OptSchedEnabled || !scheduleSpecificRegion(RegionName, schedIni)) { LLVM_DEBUG(dbgs() << "Skipping region " << RegionName << "\n"); ScheduleDAGMILive::schedule(); return; } +#ifdef PRINT_MIR + else { + print = true; + Logger::Info("MIR Before Scheduling"); + C->MF->print(errs()); + } +#endif + // This log output is parsed by scripts. Don't change its format unless you // are prepared to change the relevant scripts as well. 
Logger::Info("********** Opt Scheduling **********"); + Logger::Event("BeginScheduling"); LLVM_DEBUG(dbgs() << "********** Scheduling Region " << RegionName << " **********\n"); LLVM_DEBUG(const auto *MBB = RegionBegin->getParent(); @@ -414,11 +431,12 @@ void ScheduleDAGOptSched::schedule() { SetupLLVMDag(); } - OST->initRegion(this, MM.get()); + OST->initRegion(this, MM.get(), OccupancyLimits); // Convert graph auto DDG = OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName); + // DDG->setMF(C->MF); // In the second pass, ignore artificial edges before running the sequential // heuristic list scheduler. if (SecondPass && EnableMutations) @@ -432,11 +450,12 @@ void ScheduleDAGOptSched::schedule() { addGraphTransformations(BDDG); // create region - auto region = llvm::make_unique( + auto region = std::make_unique( OST.get(), static_cast(DDG.get()), 0, HistTableHashBits, LowerBoundAlgorithm, HeuristicPriorities, EnumPriorities, VerifySchedule, PruningStrategy, SchedForRPOnly, EnumStalls, SCW, SCF, HeurSchedType, - SecondPass ? GraphTransPosition2ndPass : GraphTransPosition); + SecondPass ? GraphTransPosition2ndPass : GraphTransPosition, + IsTimeoutPerInst, TimeoutPerMemblock); bool IsEasy = false; InstCount NormBestCost = 0; @@ -497,9 +516,13 @@ void ScheduleDAGOptSched::schedule() { LLVM_DEBUG(Logger::Info("OptSched succeeded.")); OST->finalizeRegion(Sched); - if (!OST->shouldKeepSchedule()) + if (!OST->shouldKeepSchedule()) { + for (size_t i = 0; i < SUnits.size(); i++) { + SUnit SU = SUnits[i]; + ResetFlags(SU); + } return; - + } // Count simulated spills. 
if (isSimRegAllocEnabled()) { SimulatedSpills += region->GetSimSpills(); @@ -525,6 +548,11 @@ void ScheduleDAGOptSched::schedule() { } } placeDebugValues(); +#ifdef PRINT_MIR + Logger::Info("MIR After Scheduling"); + if (print) + MF.print(errs()); +#endif #ifdef IS_DEBUG_PEAK_PRESSURE Logger::Info("Register pressure after"); @@ -532,6 +560,14 @@ void ScheduleDAGOptSched::schedule() { #endif } +void ScheduleDAGOptSched::ResetFlags(SUnit &SU) { + RegisterOperands RegOpers; + RegOpers.collect(*SU.getInstr(), *TRI, MRI, true, false); + // Adjust liveness and add missing dead+read-undef flags. + auto SlotIdx = LIS->getInstructionIndex(*SU.getInstr()).getRegSlot(); + RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, SU.getInstr()); +} + void ScheduleDAGOptSched::ScheduleNode(SUnit *SU, unsigned CurCycle) { #ifdef IS_DEBUG_CONVERT_LLVM Logger::Info("*** Scheduling [%lu]: ", CurCycle); @@ -539,9 +575,10 @@ void ScheduleDAGOptSched::ScheduleNode(SUnit *SU, unsigned CurCycle) { if (SU) { MachineInstr *instr = SU->getInstr(); // Reset read - undef flags and update them later. 
- for (auto &Op : instr->operands()) - if (Op.isReg() && Op.isDef()) - Op.setIsUndef(false); + for (MIBundleOperands MIO(*instr); MIO.isValid(); ++MIO) { + if (MIO->isReg() && MIO->isDef()) + MIO->setIsUndef(false); + } if (&*CurrentTop == instr) CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom); @@ -662,6 +699,16 @@ void ScheduleDAGOptSched::loadOptSchedConfig() { randomSeed = time(NULL); RandomGen::SetSeed(randomSeed); HeurSchedType = parseListSchedType(); + + TimeoutPerMemblock = schedIni.GetInt("TIMEOUT_PER_MEMBLOCK_RATIO"); + + OccupancyLimit = schedIni.GetInt("OCCUPANCY_LIMIT"); + ShouldLimitOccupancy = schedIni.GetBool("SHOULD_LIMIT_OCCUPANCY"); + + OccupancyLimitSource = OCC_LIMIT_TYPE::OLT_NONE; + if (ShouldLimitOccupancy) + OccupancyLimitSource = + parseOccLimit(schedIni.GetString("OCCUPANCY_LIMIT_SOURCE")); } bool ScheduleDAGOptSched::isOptSchedEnabled() const { @@ -672,16 +719,17 @@ bool ScheduleDAGOptSched::isOptSchedEnabled() const { return true; } else if (optSchedOption == "HOT_ONLY") { // get the name of the function this scheduler was created for - std::string functionName = C->MF->getFunction().getName(); + std::string functionName = C->MF->getFunction().getName().data(); // check the list of hot functions for the name of the current function return HotFunctions.GetBool(functionName, false); } else if (optSchedOption == "NO") { return false; } - llvm::report_fatal_error("Unrecognized option for USE_OPT_SCHED setting: " + - optSchedOption, - false); + llvm::report_fatal_error( + llvm::StringRef("Unrecognized option for USE_OPT_SCHED setting: " + + optSchedOption), + false); } bool ScheduleDAGOptSched::isTwoPassEnabled() const { @@ -694,7 +742,9 @@ bool ScheduleDAGOptSched::isTwoPassEnabled() const { return false; llvm::report_fatal_error( - "Unrecognized option for USE_TWO_PASS setting: " + twoPassOption, false); + llvm::StringRef("Unrecognized option for USE_TWO_PASS setting: " + + twoPassOption), + false); } LATENCY_PRECISION 
ScheduleDAGOptSched::fetchLatencyPrecision() const { @@ -709,7 +759,9 @@ LATENCY_PRECISION ScheduleDAGOptSched::fetchLatencyPrecision() const { } llvm::report_fatal_error( - "Unrecognized option for LATENCY_PRECISION setting: " + lpName, false); + llvm::StringRef("Unrecognized option for LATENCY_PRECISION setting: " + + lpName), + false); } LB_ALG ScheduleDAGOptSched::parseLowerBoundAlgorithm() const { @@ -720,8 +772,9 @@ LB_ALG ScheduleDAGOptSched::parseLowerBoundAlgorithm() const { return LBA_LC; } - llvm::report_fatal_error("Unrecognized option for LB_ALG setting: " + LBalg, - false); + llvm::report_fatal_error( + llvm::StringRef("Unrecognized option for LB_ALG setting: " + LBalg), + false); } // Helper function to find the next substring which is a heuristic name in Str @@ -740,7 +793,8 @@ static LISTSCHED_HEURISTIC GetNextHeuristicName(const std::string &Str, return LSH.HID; } - llvm::report_fatal_error("Unrecognized heuristic used: " + Str, false); + llvm::report_fatal_error( + llvm::StringRef("Unrecognized heuristic used: " + Str), false); } GT_POSITION @@ -800,6 +854,25 @@ SPILL_COST_FUNCTION ScheduleDAGOptSched::parseSpillCostFunc() const { return ParseSCFName(name); } +OCC_LIMIT_TYPE +ScheduleDAGOptSched::parseOccLimit(const std::string Str) { + OCC_LIMIT_TYPE result = OCC_LIMIT_TYPE::OLT_NONE; + + if (Str == "NONE") { + return OCC_LIMIT_TYPE::OLT_NONE; + } else if (Str == "HEURISTIC") { + return OCC_LIMIT_TYPE::OLT_HEUR; + } else if (Str == "FILE") { + return OCC_LIMIT_TYPE::OLT_FILE; + } + + llvm::report_fatal_error( + llvm::StringRef("Unrecognized option for OCCUPANCY_LIMIT_SOURCE setting: " + + Str), + false); + return result; +} + bool ScheduleDAGOptSched::shouldPrintSpills() const { std::string printSpills = SchedulerOptions::getInstance().GetString("PRINT_SPILL_COUNTS"); @@ -808,12 +881,13 @@ bool ScheduleDAGOptSched::shouldPrintSpills() const { } else if (printSpills == "NO") { return false; } else if (printSpills == "HOT_ONLY") { - std::string
functionName = C->MF->getFunction().getName(); + std::string functionName = C->MF->getFunction().getName().data(); return HotFunctions.GetBool(functionName, false); } llvm::report_fatal_error( - "Unrecognized option for PRINT_SPILL_COUNTS setting: " + printSpills, + llvm::StringRef("Unrecognized option for PRINT_SPILL_COUNTS setting: " + + printSpills), false); } @@ -1035,6 +1109,13 @@ void ScheduleDAGOptSched::getRealCfgPaths() { getRealCfgPathCL(PathCfgMM); } + if (OptSchedCfgOCL.empty()) + (PathCfg + DEFAULT_CFGOCL_FNAME).toVector(PathCfgOCL); + else { + PathCfgOCL = OptSchedCfgOCL; + getRealCfgPathCL(PathCfgOCL); + } + // Convert full paths to native fromat. sys::path::native(PathCfgS); sys::path::native(PathCfgHF); @@ -1050,9 +1131,9 @@ printMaskPairs(const SmallVectorImpl &RegPairs, for (const auto &P : RegPairs) { const TargetRegisterClass *RegClass; - if (TRI->isPhysicalRegister(P.RegUnit)) + if (P.RegUnit.isPhysicalRegister(P.RegUnit)) RegClass = TRI->getMinimalPhysRegClass(P.RegUnit); - else if (TRI->isVirtualRegister(P.RegUnit)) + else if (P.RegUnit.isVirtualRegister(P.RegUnit)) RegClass = MRI.getRegClass(P.RegUnit); else RegClass = nullptr; diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h index 0bacea58..88467cfd 100644 --- a/lib/Wrapper/OptimizingScheduler.h +++ b/lib/Wrapper/OptimizingScheduler.h @@ -14,7 +14,6 @@ #include "opt-sched/Scheduler/data_dep.h" #include "opt-sched/Scheduler/graph_trans.h" #include "opt-sched/Scheduler/sched_region.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -61,6 +60,9 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Path to the machine model specification file for opt-sched. 
SmallString<128> PathCfgMM; + // Path to the occupancy limits file for specified kernels using opt-sched + SmallString<128> PathCfgOCL; + // Bool value indicating that the scheduler is in the second // pass. Used for the two pass scheduling approach. bool SecondPass; @@ -80,6 +82,10 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // OptScheduler Config HotFunctions; + // A list of kernels / functions and the occupancy limit that maximizes + // performance + Config OccupancyLimits; + // Struct for setting the pruning strategy Pruning PruningStrategy; @@ -145,6 +151,13 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { int SecondPassRegionTimeout; int SecondPassLengthTimeout; + int TimeoutPerMemblock; + + int OccupancyLimit; + + bool ShouldLimitOccupancy; + OCC_LIMIT_TYPE OccupancyLimitSource; + // How to interpret the timeout value? Timeout per instruction or // timout per block bool IsTimeoutPerInst; @@ -243,6 +256,8 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Get the GT_POSITION static GT_POSITION parseGraphTransPosition(llvm::StringRef Str); + OCC_LIMIT_TYPE parseOccLimit(const std::string Str); + // Return true if the OptScheduler should be enabled for the function this // ScheduleDAG was created for bool isOptSchedEnabled() const; @@ -259,6 +274,9 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Return true if we should print spill count for the current function bool shouldPrintSpills() const; + // Reset the flags (e.g. undef) before reverting scheduling + void ResetFlags(SUnit &SU); + // Add node to llvm schedule void ScheduleNode(SUnit *SU, unsigned CurCycle); @@ -290,6 +308,9 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Schedule the current region using the OptScheduler void schedule() override; + // Calculate OptSched scheduling stats based on ordering of SUnits + void getOptSchedStats(); + // Setup and select schedulers for the two pass scheduling approach.
virtual void initSchedulers(); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a4e376df..aa17f9f7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -16,7 +16,7 @@ configure_lit_site_cfg( ) list(APPEND OPTSCHED_TEST_DEPS - OptSched + LLVMOptSched ) set(OPTSCHED_TEST_PARAMS diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index 38005ba2..3eeb9df0 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -9,7 +9,7 @@ function(add_optsched_unittest test_dirname) endfunction() # All unit test targets depend on OptSched -add_library(UnitTest.OptSched STATIC $) +add_llvm_library(UnitTest.OptSched STATIC $) link_libraries(UnitTest.OptSched) add_subdirectory(Basic)