diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 6831938fd4c6d..df9256e878f30 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -1950,9 +1950,18 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple, if (Args.hasFlag(options::OPT_ftarget_export_symbols, options::OPT_fno_target_export_symbols, false)) BeArgs.push_back("-library-compilation"); - } else if (IsJIT) + // -foffload-fp32-prec-[sqrt/div] + if (Args.hasArg(options::OPT_foffload_fp32_prec_div) || + Args.hasArg(options::OPT_foffload_fp32_prec_sqrt)) + BeArgs.push_back("-ze-fp32-correctly-rounded-divide-sqrt"); + } else if (IsJIT) { // -ftarget-compile-fast JIT Args.AddLastArg(BeArgs, options::OPT_ftarget_compile_fast); + // -foffload-fp32-prec-div JIT + Args.AddLastArg(BeArgs, options::OPT_foffload_fp32_prec_div); + // -foffload-fp32-prec-sqrt JIT + Args.AddLastArg(BeArgs, options::OPT_OPT_foffload_fp32_prec_sqrt); + } if (IsGen) { for (auto [DeviceName, BackendArgStr] : PerDeviceArgs) { CmdArgs.push_back("-device_options"); diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.h b/llvm/include/llvm/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.h new file mode 100644 index 0000000000000..26cd063c5b452 --- /dev/null +++ b/llvm/include/llvm/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.h @@ -0,0 +1,34 @@ +//===-- SYCLSqrtFDivMaxErrorCleanUp.h - SYCLSqrtFDivMaxErrorCleanUp Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Remove llvm.fpbuiltin.[sqrt/fdiv] intrinsics to ensure compatibility with the +// old drivers (that don't support SPV_INTEL_fp_max_error extension). +// The intrinsic functions are removed in case if they are used with standard +// for OpenCL max-error (e.g [3.0/2.5] ULP) and there are no: +// - other llvm.fpbuiltin.* intrinsic functions; +// - fdiv instructions +// - @sqrt builtins (both C and C++-styles)/llvm intrinsic in the module. +//===----------------------------------------------------------------------===// +#ifndef LLVM_SYCL_SQRT_FDIV_MAX_ERROR_CLEAN_UP_H +#define LLVM_SYCL_SQRT_FDIV_MAX_ERROR_CLEAN_UP_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +// FIXME: remove this pass, it's not really needed. +class SYCLSqrtFDivMaxErrorCleanUpPass + : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &); + + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_SYCL_SQRT_FDIV_MAX_ERROR_CLEAN_UP_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 2222c76a1751a..a827697fecaf8 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -151,6 +151,7 @@ #include "llvm/SYCLLowerIR/SYCLJointMatrixTransform.h" #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h" #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h" +#include "llvm/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.h" #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h" #include "llvm/SYCLLowerIR/SpecConstants.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 953a6f620d14a..b408b4de9ad80 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -166,6 +166,7 @@ MODULE_PASS("esimd-remove-host-code", ESIMDRemoveHostCodePass()); MODULE_PASS("esimd-remove-optnone-noinline", ESIMDRemoveOptnoneNoinlinePass()); MODULE_PASS("sycl-conditional-call-on-device", SYCLConditionalCallOnDevicePass()) MODULE_PASS("sycl-joint-matrix-transform", SYCLJointMatrixTransformPass()) +MODULE_PASS("sycl-sqrt-fdiv-max-error-clean-up", SYCLSqrtFDivMaxErrorCleanUpPass()) MODULE_PASS("sycl-propagate-aspects-usage", SYCLPropagateAspectsUsagePass()) MODULE_PASS("sycl-propagate-joint-matrix-usage", SYCLPropagateJointMatrixUsagePass()) MODULE_PASS("sycl-add-opt-level-attribute", SYCLAddOptLevelAttributePass()) diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index 0ce2a91f91a29..baff8e0cff74b 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -67,6 +67,7 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLJointMatrixTransform.cpp SYCLPropagateAspectsUsage.cpp SYCLPropagateJointMatrixUsage.cpp + SYCLSqrtFDivMaxErrorCleanUp.cpp SYCLVirtualFunctionsAnalysis.cpp SYCLUtils.cpp SanitizeDeviceGlobal.cpp diff --git a/llvm/lib/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.cpp b/llvm/lib/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.cpp new file mode 100644 index 0000000000000..f2f7876046404 --- /dev/null +++ b/llvm/lib/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.cpp @@ -0,0 +1,160 @@ +//===- SYCLSqrtFDivMaxErrorCleanUp.cpp - SYCLSqrtFDivMaxErrorCleanUp Pass -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Remove llvm.fpbuiltin.[sqrt/fdiv] intrinsics to ensure compatibility with the +// old drivers (that don't support SPV_INTEL_fp_max_error extension). +// The intrinsic functions are removed in case if they are used with standard +// for OpenCL max-error (e.g [3.0/2.5] ULP) and there are no: +// - other llvm.fpbuiltin.* intrinsic functions; +// - fdiv instructions +// - @sqrt builtins (both C and C++-styles)/llvm intrinsic in the module. +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.h" + +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IRBuilder.h" + +using namespace llvm; + +namespace { +static constexpr char SQRT_ERROR[] = "3.0"; +static constexpr char FDIV_ERROR[] = "2.5"; +} // namespace + +PreservedAnalyses +SYCLSqrtFDivMaxErrorCleanUpPass::run(Module &M, + ModuleAnalysisManager &MAM) { + SmallVector WorkListSqrt; + SmallVector WorkListFDiv; + + // Add all llvm.fpbuiltin.sqrt with 3.0 error and llvm.fpbuiltin.fdiv with + // 2.5 error to the work list to remove them later. If attributes with other + // values or other llvm.fpbuiltin.* intrinsic functions found - abort the + // pass. + for (auto &F : M) { + if (!F.isDeclaration()) + continue; + const auto ID = F.getIntrinsicID(); + if (ID != llvm::Intrinsic::fpbuiltin_sqrt && + ID != llvm::Intrinsic::fpbuiltin_fdiv) + continue; + + for (auto *Use : F.users()) { + auto *II = cast(Use); + if (II && II->getCalledFunction()->getName(). + starts_with("llvm.fpbuiltin")) { + // llvm.fpbuiltin.* intrinsics should always have fpbuiltin-max-error + // attribute, but it's not a concern of the pass, so just do an early + // exit here if the attribute is not attached. + if (!II->getAttributes().hasFnAttr("fpbuiltin-max-error")) + return PreservedAnalyses::none(); + StringRef MaxError = II->getAttributes().getFnAttr( + "fpbuiltin-max-error").getValueAsString(); + + if (ID == llvm::Intrinsic::fpbuiltin_sqrt) { + if (MaxError != SQRT_ERROR) + return PreservedAnalyses::none(); + WorkListSqrt.push_back(II); + } + else if (ID == llvm::Intrinsic::fpbuiltin_fdiv) { + if (MaxError != FDIV_ERROR) + return PreservedAnalyses::none(); + WorkListFDiv.push_back(II); + } else { + // Another llvm.fpbuiltin.* intrinsic was found - the module is + // already not backward compatible. + return PreservedAnalyses::none(); + } + } + } + } + + // No intrinsics at all - do an early exist. + if (WorkListSqrt.empty() && WorkListFDiv.empty()) + return PreservedAnalyses::none(); + + // If @sqrt, @_Z4sqrt*, @llvm.sqrt. or fdiv present in the module - do + // nothing. + for (auto &F : M) { + if (F.isDeclaration()) + continue; + for (auto &BB : F) { + for (auto &II : BB) { + if (auto *CI = dyn_cast(&II)) { + auto *SqrtF = CI->getCalledFunction(); + if (SqrtF->getName() == "sqrt" || + SqrtF->getName().starts_with("_Z4sqrt") || + SqrtF->getIntrinsicID() == llvm::Intrinsic::sqrt) + return PreservedAnalyses::none(); + } + if (auto *FPI = dyn_cast(&II)) { + auto Opcode = FPI->getOpcode(); + if (Opcode == Instruction::FDiv) + return PreservedAnalyses::none(); + } + } + } + } + + // Replace @llvm.fpbuiltin.sqrt call with @llvm.sqrt. llvm-spirv will handle + // it later. + SmallSet DeclToRemove; + for (auto *Sqrt : WorkListSqrt) { + DeclToRemove.insert(Sqrt->getCalledFunction()); + IRBuilder Builder(Sqrt); + Builder.SetInsertPoint(Sqrt); + Type *Ty = Sqrt->getType(); + AttributeList Attrs = Sqrt->getAttributes(); + Function *NewSqrtF = + Intrinsic::getDeclaration(&M, llvm::Intrinsic::sqrt, Ty); + auto *NewSqrt = Builder.CreateCall(NewSqrtF, { Sqrt->getOperand(0) }, + Sqrt->getName()); + + // Copy FP flags, metadata and attributes. Replace old call with a new call. + Attrs = Attrs.removeFnAttribute(Sqrt->getContext(), "fpbuiltin-max-error"); + NewSqrt->setAttributes(Attrs); + NewSqrt->copyMetadata(*Sqrt); + FPMathOperator *FPOp = cast(Sqrt); + FastMathFlags FMF = FPOp->getFastMathFlags(); + NewSqrt->setFastMathFlags(FMF); + Sqrt->replaceAllUsesWith(NewSqrt); + Sqrt->dropAllReferences(); + Sqrt->eraseFromParent(); + } + + // Replace @llvm.fpbuiltin.fdiv call with fdiv. + for (auto *FDiv : WorkListFDiv) { + DeclToRemove.insert(FDiv->getCalledFunction()); + IRBuilder Builder(FDiv); + Builder.SetInsertPoint(FDiv); + Instruction *NewFDiv = + cast(Builder.CreateFDiv( + FDiv->getOperand(0), FDiv->getOperand(1), FDiv->getName())); + + // Copy FP flags and metadata. Replace old call with a new instruction. + cast(NewFDiv)->copyMetadata(*FDiv); + FPMathOperator *FPOp = cast(FDiv); + FastMathFlags FMF = FPOp->getFastMathFlags(); + NewFDiv->setFastMathFlags(FMF); + FDiv->replaceAllUsesWith(NewFDiv); + FDiv->dropAllReferences(); + FDiv->eraseFromParent(); + } + + // Clear old declarations. + for (auto *Decl : DeclToRemove) { + assert(Decl->isDeclaration() && + "attempting to remove a function definition"); + Decl->dropAllReferences(); + Decl->eraseFromParent(); + } + + return PreservedAnalyses::all(); +} diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-max-error-basic.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-max-error-basic.ll new file mode 100644 index 0000000000000..1476f9b71dab3 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-max-error-basic.ll @@ -0,0 +1,68 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt are removed from +; the module. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK-NOT: llvm.fpbuiltin.fdiv.f32 +; CHECK-NOT: llvm.fpbuiltin.sqrt.f32 +; CHECK-NOT: fpbuiltin-max-error + +; CHECK: test_fp_max_error_decoration(float [[F1:[%0-9a-z.]+]], float [[F2:[%0-9a-z.]+]]) +; CHECK: [[V1:[%0-9a-z.]+]] = fdiv float [[F1]], [[F2]] +; CHECK: call float @llvm.sqrt.f32(float [[V1]]) + +; CHECK: test_fp_max_error_decoration_fast(float [[F1:[%0-9a-z.]+]], float [[F2:[%0-9a-z.]+]]) +; CHECK: [[V1:[%0-9a-z.]+]] = fdiv fast float [[F1]], [[F2]] +; CHECK: call fast float @llvm.sqrt.f32(float [[V1]]) + +; CHECK: test_fp_max_error_decoration_debug(float [[F1:[%0-9a-z.]+]], float [[F2:[%0-9a-z.]+]]) +; CHECK: [[V1:[%0-9a-z.]+]] = fdiv float [[F1]], [[F2]], !dbg ![[#Loc1:]] +; CHECK: call float @llvm.sqrt.f32(float [[V1]]), !dbg ![[#Loc2:]] + +; CHECK: [[#Loc1]] = !DILocation(line: 1, column: 1, scope: ![[#]]) +; CHECK: [[#Loc2]] = !DILocation(line: 2, column: 1, scope: ![[#]]) + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + ret void +} + +define void @test_fp_max_error_decoration_fast(float %f1, float %f2) { +entry: + %v1 = call fast float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call fast float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + ret void +} + +define void @test_fp_max_error_decoration_debug(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0, !dbg !7 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1, !dbg !8 + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +attributes #0 = { "fpbuiltin-max-error"="2.5" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp", checksumkind: CSK_MD5, checksum: "2a034da6937f5b9cf6dd2d89127f57fd") +!2 = distinct !DISubprogram(name: "test_fp_max_error_decoration_debug", scope: !1, file: !1, line: 1, type: !3, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0) +!3 = !DISubroutineType(types: !4) +!4 = !{!5, !6, !6} +!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!6 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) +!7 = !DILocation(line: 1, column: 1, scope: !2) +!8 = !DILocation(line: 2, column: 1, scope: !2) +!9 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-error.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-error.ll new file mode 100644 index 0000000000000..260cb0dcc8520 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-error.ll @@ -0,0 +1,25 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt remain if +; non-standart for OpenCL max-error is used. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK: llvm.fpbuiltin.fdiv.f32 +; CHECK: llvm.fpbuiltin.sqrt.f32 +; CHECK: fpbuiltin-max-error + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +attributes #0 = { "fpbuiltin-max-error"="2.0" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-fdiv.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-fdiv.ll new file mode 100644 index 0000000000000..c9e7dcfb7172e --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-fdiv.ll @@ -0,0 +1,26 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt remain if +; fdiv instruction was in the module. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK: llvm.fpbuiltin.fdiv.f32 +; CHECK: llvm.fpbuiltin.sqrt.f32 +; CHECK: fpbuiltin-max-error + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + %v3 = fdiv float %v2, %f2 + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +attributes #0 = { "fpbuiltin-max-error"="2.0" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-other-intrinsic.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-other-intrinsic.ll new file mode 100644 index 0000000000000..3c944041c483b --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-other-intrinsic.ll @@ -0,0 +1,28 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt remain if +; other fpbuiltin intrinsic is used in the module. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK: llvm.fpbuiltin.fdiv.f32 +; CHECK: llvm.fpbuiltin.sqrt.f32 +; CHECK: fpbuiltin-max-error + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + %v3 = call float @llvm.fpbuiltin.exp.f32(float %v2) + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +declare float @llvm.fpbuiltin.exp.f32(float) + +attributes #0 = { "fpbuiltin-max-error"="2.0" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-c-builtin.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-c-builtin.ll new file mode 100644 index 0000000000000..fcbd3a2e20af9 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-c-builtin.ll @@ -0,0 +1,28 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt remain if +; C-style @sqrt builtin was used in the module. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK: llvm.fpbuiltin.fdiv.f32 +; CHECK: llvm.fpbuiltin.sqrt.f32 +; CHECK: fpbuiltin-max-error + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + %v3 = call float @sqrt(float %v2) + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +declare float @sqrt(float) + +attributes #0 = { "fpbuiltin-max-error"="2.0" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-cpp-builtin.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-cpp-builtin.ll new file mode 100644 index 0000000000000..5c7be2a86917b --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-cpp-builtin.ll @@ -0,0 +1,28 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt remain if +; C++-style @sqrt builtin was used in the module. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK: llvm.fpbuiltin.fdiv.f32 +; CHECK: llvm.fpbuiltin.sqrt.f32 +; CHECK: fpbuiltin-max-error + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + %v3 = call float @_Z4sqrtf(float %v2) + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +declare float @_Z4sqrtf(float) + +attributes #0 = { "fpbuiltin-max-error"="2.0" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } diff --git a/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-intrinsic.ll b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-intrinsic.ll new file mode 100644 index 0000000000000..2365e4b82dbc0 --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp/sycl-sqrt-fdiv-negative-sqrt-intrinsic.ll @@ -0,0 +1,26 @@ +; Test checks if @llvm.fpbuiltin.fdiv and @llvm.fpbuiltin.sqrt remain if +; @llvm.sqrt was used in the module. + +; RUN: opt -passes=sycl-sqrt-fdiv-max-error-clean-up < %s -S | FileCheck %s + +; CHECK: llvm.fpbuiltin.fdiv.f32 +; CHECK: llvm.fpbuiltin.sqrt.f32 +; CHECK: fpbuiltin-max-error + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define void @test_fp_max_error_decoration(float %f1, float %f2) { +entry: + %v1 = call float @llvm.fpbuiltin.fdiv.f32(float %f1, float %f2) #0 + %v2 = call float @llvm.fpbuiltin.sqrt.f32(float %v1) #1 + %v3 = call float @llvm.sqrt.f32(float %v2) + ret void +} + +declare float @llvm.fpbuiltin.fdiv.f32(float, float) + +declare float @llvm.fpbuiltin.sqrt.f32(float) + +attributes #0 = { "fpbuiltin-max-error"="2.0" } +attributes #1 = { "fpbuiltin-max-error"="3.0" } diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 3800c5875e44f..90eed512a2b67 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -39,6 +39,7 @@ #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" #include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/SYCLLowerIR/SYCLJointMatrixTransform.h" +#include "llvm/SYCLLowerIR/SYCLSqrtFDivMaxErrorCleanUp.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/SYCLLowerIR/SanitizeDeviceGlobal.h" #include "llvm/SYCLLowerIR/SpecConstants.h" @@ -800,6 +801,11 @@ processInputModule(std::unique_ptr M) { // LLVM IR specification. Modified |= runModulePass(*M); + // Remove llvm.fpbuiltin.[sqrt/fdiv] intrinsic functions if they all have + // max-error attribute with values 3.0 and 2.5 appropriately and no other + // sqrt or fdiv present in the module. + Modified |= runModulePass(*M); + // Do invoke_simd processing before splitting because this: // - saves processing time (the pass is run once, even though on larger IR) // - doing it before SYCL/ESIMD splitting is required for correctness diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index d6602725663ff..35f6e480512aa 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -446,6 +446,22 @@ static void appendCompileOptionsFromImage(std::string &CompileOpts, CompileOpts = NewCompileOpts; OptPos = CompileOpts.find(TargetRegisterAllocMode); } + static const char *FP32PrecDiv = "-foffload-fp32-prec-div"; + if (auto Pos = CompileOpts.find(FP32PrecDiv); + Pos != std::string::npos) { + const char *BackendOption = nullptr; + PlatformImpl->getBackendOption(FP32PrecDiv, &BackendOption); + auto OptLen = strlen(FP32PrecDiv); + CompileOpts.replace(Pos, OptLen, BackendOption); + } + static const char *FP32PrecSqrt = "-foffload-fp32-prec-sqrt"; + if (auto Pos = CompileOpts.find(FP32PrecSqrt); + Pos != std::string::npos) { + const char *BackendOption = nullptr; + PlatformImpl->getBackendOption(FP32PrecSqrt, &BackendOption); + auto OptLen = strlen(FP32PrecSqrt); + CompileOpts.replace(Pos, OptLen, BackendOption); + } } } diff --git a/sycl/test-e2e/KernelAndProgram/fp32-precise-fdiv.cpp b/sycl/test-e2e/KernelAndProgram/fp32-precise-fdiv.cpp new file mode 100644 index 0000000000000..323ad82de2a56 --- /dev/null +++ b/sycl/test-e2e/KernelAndProgram/fp32-precise-fdiv.cpp @@ -0,0 +1,42 @@ +// RUN: %{build} -Wno-error=unused-command-line-argument -foffload-fp32-prec-div -o %t_with.out +// RUN: %{build} -o %t_without.out + +// RUN: env SYCL_UR_TRACE=2 %{run} %t_with.out 2>&1 | FileCheck %if hip || cuda %{ --check-prefix=CHECK-WITHOUT %} %else %{ --check-prefix=CHECK-WITH %} %s +// RUN: env SYCL_UR_TRACE=2 %{run} %t_without.out 2>&1 | FileCheck --implicit-check-not=fp32-correctly-rounded-divide-div + +// CHECK-INTEL-WITH: <--- urProgramBuild +// CHECK-INTEL-WITH-SAME: fp32-correctly-rounded-divide-div + +// CHECK-WITHOUT-NOT: <--- urProgramBuild{{.*}}fp32-correctly-rounded-divide-div{{.*}} -> UR_RESULT_SUCCESS +// CHECK-WITHOUT: <--- urProgramBuild{{.*}} -> UR_RESULT_SUCCESS + +#include + +int main() { + sycl::buffer Buffer(4); + + sycl::queue Queue; + + sycl::range<1> NumOfWorkItems{Buffer.size()}; + + Queue.submit([&](sycl::handler &cgh) { + sycl::accessor Accessor{Buffer, cgh, sycl::write_only}; + cgh.parallel_for(NumOfWorkItems, [=](sycl::id<1> WIid) { + Accessor[WIid] = WIid.get(0); + }); + }); + + sycl::host_accessor HostAccessor{Buffer, sycl::read_only}; + + bool MismatchFound = false; + for (size_t I = 0; I < Buffer.size(); ++I) { + if (HostAccessor[I] != I) { + std::cout << "The result is incorrect for element: " << I + << " , expected: " << I << " , got: " << HostAccessor[I] + << std::endl; + MismatchFound = true; + } + } + + return MismatchFound; +} diff --git a/sycl/test-e2e/KernelAndProgram/fp32-precise-sqrt.cpp b/sycl/test-e2e/KernelAndProgram/fp32-precise-sqrt.cpp new file mode 100644 index 0000000000000..9cfea1df29313 --- /dev/null +++ b/sycl/test-e2e/KernelAndProgram/fp32-precise-sqrt.cpp @@ -0,0 +1,42 @@ +// RUN: %{build} -Wno-error=unused-command-line-argument -foffload-fp32-prec-sqrt -o %t_with.out +// RUN: %{build} -o %t_without.out + +// RUN: env SYCL_UR_TRACE=2 %{run} %t_with.out 2>&1 | FileCheck %if hip || cuda %{ --check-prefix=CHECK-WITHOUT %} %else %{ --check-prefix=CHECK-WITH %} %s +// RUN: env SYCL_UR_TRACE=2 %{run} %t_without.out 2>&1 | FileCheck --implicit-check-not=fp32-correctly-rounded-divide-sqrt + +// CHECK-INTEL-WITH: <--- urProgramBuild +// CHECK-INTEL-WITH-SAME: fp32-correctly-rounded-divide-sqrt + +// CHECK-WITHOUT-NOT: <--- urProgramBuild{{.*}}fp32-correctly-rounded-divide-sqrt{{.*}} -> UR_RESULT_SUCCESS +// CHECK-WITHOUT: <--- urProgramBuild{{.*}} -> UR_RESULT_SUCCESS + +#include + +int main() { + sycl::buffer Buffer(4); + + sycl::queue Queue; + + sycl::range<1> NumOfWorkItems{Buffer.size()}; + + Queue.submit([&](sycl::handler &cgh) { + sycl::accessor Accessor{Buffer, cgh, sycl::write_only}; + cgh.parallel_for(NumOfWorkItems, [=](sycl::id<1> WIid) { + Accessor[WIid] = WIid.get(0); + }); + }); + + sycl::host_accessor HostAccessor{Buffer, sycl::read_only}; + + bool MismatchFound = false; + for (size_t I = 0; I < Buffer.size(); ++I) { + if (HostAccessor[I] != I) { + std::cout << "The result is incorrect for element: " << I + << " , expected: " << I << " , got: " << HostAccessor[I] + << std::endl; + MismatchFound = true; + } + } + + return MismatchFound; +}