[llvm] 884acbb - AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare
Changpeng Fang via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 7 11:47:07 PST 2020
Author: Changpeng Fang
Date: 2020-02-07T11:46:23-08:00
New Revision: 884acbb9e167d5668e43581630239d688edec8ad
URL: https://github.com/llvm/llvm-project/commit/884acbb9e167d5668e43581630239d688edec8ad
DIFF: https://github.com/llvm/llvm-project/commit/884acbb9e167d5668e43581630239d688edec8ad.diff
LOG: AMDGPU: Enhancement on FDIV lowering in AMDGPUCodeGenPrepare
Summary:
The accuracy limit for using rcp is adjusted from 2.5 ulp to 1.0 ulp.
Also, the afn fast-math flag is now used instead of arcp to allow an inaccurate rcp.
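To make the new gating concrete, here is a minimal sketch in IR (the kernel name
and metadata node are illustrative, not part of the patch): with f32 denormals
flushed, a plain 1/x now needs only a !fpmath bound of 1.0 ulp or looser to
become rcp, and the afn flag alone licenses the a*rcp(b) rewrite:

  ; illustrative only
  define amdgpu_kernel void @rcp_sketch(float addrspace(1)* %out, float %a, float %x) {
    ; 1.0 ulp now suffices: may become call float @llvm.amdgcn.rcp.f32(float %x)
    %r = fdiv float 1.0, %x, !fpmath !0
    store volatile float %r, float addrspace(1)* %out
    ; afn (rather than arcp) now allows: rcp followed by fmul afn
    %d = fdiv afn float %a, %x
    store volatile float %d, float addrspace(1)* %out
    ret void
  }
  !0 = !{float 1.000000e+00}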
Reviewers:
arsenm
Differential Revision: https://reviews.llvm.org/D73588
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 76f8d5e8c320..e3cc2a4abdee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -606,24 +606,23 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
return true;
}
-// Perform RCP optimizations:
+// Optimize fdiv with rcp:
//
-// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-// denormals flushed.
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
//
-// a/b -> a*rcp(b) when fast unsafe rcp is legal.
-static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
- IRBuilder<> Builder, MDNode *FPMath, Module *Mod,
- bool HasDenormals, bool NeedHighAccuracy) {
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+ bool RcpIsAccurate, IRBuilder<> Builder,
+ Module *Mod) {
- Type *Ty = Den->getType();
- if (!FastUnsafeRcpLegal && Ty->isFloatTy() &&
- (HasDenormals || NeedHighAccuracy))
+ if (!AllowInaccurateRcp && !RcpIsAccurate)
return nullptr;
+ Type *Ty = Den->getType();
Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, Ty);
if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
- if (FastUnsafeRcpLegal || Ty->isFloatTy() || Ty->isHalfTy()) {
+ if (AllowInaccurateRcp || RcpIsAccurate) {
if (CLHS->isExactlyValue(1.0)) {
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
@@ -648,49 +647,63 @@ static Value *performRCPOpt(Value *Num, Value *Den, bool FastUnsafeRcpLegal,
}
}
- if (FastUnsafeRcpLegal) {
+ if (AllowInaccurateRcp) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
Value *Recip = Builder.CreateCall(Decl, { Den });
- return Builder.CreateFMul(Num, Recip, "", FPMath);
+ return Builder.CreateFMul(Num, Recip);
}
return nullptr;
}
-static bool shouldKeepFDivF32(Value *Num, bool FastUnsafeRcpLegal,
- bool HasDenormals) {
- const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
- if (!CNum)
- return HasDenormals;
+// Optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is preferred.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+ bool HasDenormals, IRBuilder<> Builder,
+ Module *Mod) {
+ // fdiv.fast can achieve 2.5 ULP accuracy.
+ if (ReqdAccuracy < 2.5f)
+ return nullptr;
- if (FastUnsafeRcpLegal)
- return true;
+ // Only have fdiv.fast for f32.
+ Type *Ty = Den->getType();
+ if (!Ty->isFloatTy())
+ return nullptr;
- bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+ bool NumIsOne = false;
+ if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+ if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+ NumIsOne = true;
+ }
- // Reciprocal f32 is handled separately without denormals.
- return HasDenormals ^ IsOne;
-}
+ // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
+ if (HasDenormals && !NumIsOne)
+ return nullptr;
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+ return Builder.CreateCall(Decl, { Num, Den });
+}
-// Optimizations is performed based on fpmath, fast math flags as wells as
-// denormals to lower fdiv using either rcp or fdiv.fast.
+// Optimization is performed based on fpmath, fast-math flags, and denormals
+// to optimize fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
+// allowed with unsafe-fp-math or afn.
//
-// FastUnsafeRcpLegal: We determine whether it is legal to use rcp based on
-// unsafe-fp-math, fast math flags, denormals and fpmath
-// accuracy request.
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
//
-// RCP Optimizations:
-// 1/x -> rcp(x) when fast unsafe rcp is legal or fpmath >= 2.5ULP with
-// denormals flushed.
-// a/b -> a*rcp(b) when fast unsafe rcp is legal.
+// With fdiv.fast:
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
//
-// Use fdiv.fast:
-// a/b -> fdiv.fast(a, b) when RCP optimization is not performed and
-// fpmath >= 2.5ULP with denormals flushed.
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
-// 1/x -> fdiv.fast(1,x) when RCP optimization is not performed and
-// fpmath >= 2.5ULP with denormals.
+// NOTE: rcp is preferred when both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Type *Ty = FDiv.getType()->getScalarType();
@@ -700,19 +713,17 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
return false;
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
- MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
- const bool NeedHighAccuracy = !FPMath || FPOp->getFPAccuracy() < 2.5f;
+ const float ReqdAccuracy = FPOp->getFPAccuracy();
+ // Inaccurate rcp is allowed with unsafe-fp-math or afn.
FastMathFlags FMF = FPOp->getFastMathFlags();
- // Determine whether it is ok to use rcp based on unsafe-fp-math,
- // fast math flags, denormals and accuracy request.
- const bool FastUnsafeRcpLegal = HasUnsafeFPMath || FMF.isFast() ||
- (FMF.allowReciprocal() && ((!HasFP32Denormals && !NeedHighAccuracy)
- || FMF.approxFunc()));
+ const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
- // Use fdiv.fast for only f32, fpmath >= 2.5ULP and rcp is not used.
- const bool UseFDivFast = Ty->isFloatTy() && !NeedHighAccuracy &&
- !FastUnsafeRcpLegal;
+ // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+ // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+ // rcp_f64 is never accurate.
+ const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+ (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(FMF);
@@ -730,31 +741,24 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
Value *NumEltI = Builder.CreateExtractElement(Num, I);
Value *DenEltI = Builder.CreateExtractElement(Den, I);
- Value *NewElt = nullptr;
- if (UseFDivFast && !shouldKeepFDivF32(NumEltI, FastUnsafeRcpLegal,
- HasFP32Denormals)) {
- Function *Decl =
- Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
- NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI }, "", FPMath);
- }
- if (!NewElt) // Try rcp.
- NewElt = performRCPOpt(NumEltI, DenEltI, FastUnsafeRcpLegal, Builder,
- FPMath, Mod, HasFP32Denormals, NeedHighAccuracy);
- if (!NewElt)
- NewElt = Builder.CreateFDiv(NumEltI, DenEltI, "", FPMath);
+ // Try rcp first.
+ Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+ RcpIsAccurate, Builder, Mod);
+ if (!NewElt) // Try fdiv.fast.
+ NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+ HasFP32Denormals, Builder, Mod);
+ if (!NewElt) // Keep the original.
+ NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
- } else { // Scalar.
- if (UseFDivFast && !shouldKeepFDivF32(Num, FastUnsafeRcpLegal,
- HasFP32Denormals)) {
- Function *Decl =
- Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
- NewFDiv = Builder.CreateCall(Decl, { Num, Den }, "", FPMath);
- }
- if (!NewFDiv) { // Try rcp.
- NewFDiv = performRCPOpt(Num, Den, FastUnsafeRcpLegal, Builder, FPMath,
- Mod, HasFP32Denormals, NeedHighAccuracy);
+ } else { // Scalar FDiv.
+ // Try rcp first.
+ NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+ Builder, Mod);
+ if (!NewFDiv) { // Try fdiv.fast.
+ NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+ Builder, Mod);
}
}
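The rewritten visitFDiv now reads as a single fallback chain: try rcp, then
fdiv.fast, else keep the fdiv. A hedged sketch of the middle case (value names
are illustrative; !0 is the 2.5 ulp metadata used by the tests below):

  ; no fast-math flags and a non-constant numerator, so rcp is skipped; the
  ; 2.5 ulp !fpmath bound still admits fdiv.fast (f32, denormals flushed)
  %md.25ulp = fdiv float %a, %b, !fpmath !0
  ; becomes: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
  !0 = !{float 2.500000e+00}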
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7cbdd4982cdb..475a2b8f30da 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7418,19 +7418,12 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool FastUnsafeRcpLegal = DAG.getTarget().Options.UnsafeFPMath ||
- (Flags.hasAllowReciprocal() &&
- ((VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) ||
- VT == MVT::f16 ||
- Flags.hasApproximateFuncs()));
-
- // Do rcp optimization only when fast unsafe rcp is legal here.
- // NOTE: We already performed RCP optimization to insert intrinsics in
- // AMDGPUCodeGenPrepare. Ideally there should have no opportunity here to
- // rcp optimization.
- // However, there are cases like FREM, which is expended into a sequence
- // of instructions including FDIV, which may expose new opportunities.
- if (!FastUnsafeRcpLegal)
+ bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
+ Flags.hasApproximateFuncs();
+
+ // Without !fpmath accuracy information, we can't do more because we don't
+ // know whether rcp is accurate enough to meet the !fpmath requirement.
+ if (!AllowInaccurateRcp)
return SDValue();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
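The NOTE removed above still explains why this DAG-level path is kept: later
expansions can introduce divisions the IR pass never saw. A hedged example
(illustrative values) is frem, whose expansion includes an fdiv:

  ; the fdiv produced when this frem is expanded in the DAG may still be
  ; lowered through rcp here, provided afn or unsafe-fp-math applies
  %r = frem afn float %x, %y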
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 6ead8b9ad13e..3096372394bb 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -12,14 +12,14 @@ define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a,
; CHECK-LABEL: @fdiv_fpmath(
; CHECK: %no.md = fdiv float %a, %b{{$}}
-; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
-; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
-; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
+; CHECK: %md.half.ulp = fdiv float %a, %b
+; CHECK: %md.1ulp = fdiv float %a, %b
+; CHECK: %md.25ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
+; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
; CHECK: %[[FAST_RCP:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
-; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]], !fpmath !0
-; CHECK: %[[ARCP_RCP:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %b)
-; CHECK: arcp.md.25ulp = fmul arcp float %a, %[[ARCP_RCP]], !fpmath !0
+; CHECK: %fast.md.25ulp = fmul fast float %a, %[[FAST_RCP]]
+; CHECK: %[[AFN_RCP:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: afn.md.25ulp = fmul afn float %a, %[[AFN_RCP]]
define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
@@ -39,8 +39,8 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float
%fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
store volatile float %fast.md.25ulp, float addrspace(1)* %out
- %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
- store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+ %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0
+ store volatile float %afn.md.25ulp, float addrspace(1)* %out
ret void
}
@@ -48,9 +48,9 @@ define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float
; CHECK-LABEL: @rcp_fdiv_fpmath(
; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}}
; CHECK: %md.25ulp = call float @llvm.amdgcn.rcp.f32(float %x)
-; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1
-; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x
-; CHECK: %arcp.25ulp = call arcp float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x
+; CHECK: %afn.no.md = call afn float @llvm.amdgcn.rcp.f32(float %x)
+; CHECK: %afn.25ulp = call afn float @llvm.amdgcn.rcp.f32(float %x)
; CHECK: %fast.no.md = call fast float @llvm.amdgcn.rcp.f32(float %x)
; CHECK: %fast.25ulp = call fast float @llvm.amdgcn.rcp.f32(float %x)
define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
@@ -63,11 +63,11 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
%md.half.ulp = fdiv float 1.0, %x, !fpmath !1
store volatile float %md.half.ulp, float addrspace(1)* %out
- %arcp.no.md = fdiv arcp float 1.0, %x
- store volatile float %arcp.no.md, float addrspace(1)* %out
+ %afn.no.md = fdiv afn float 1.0, %x
+ store volatile float %afn.no.md, float addrspace(1)* %out
- %arcp.25ulp = fdiv arcp float 1.0, %x, !fpmath !0
- store volatile float %arcp.25ulp, float addrspace(1)* %out
+ %afn.25ulp = fdiv afn float 1.0, %x, !fpmath !0
+ store volatile float %afn.25ulp, float addrspace(1)* %out
%fast.no.md = fdiv fast float 1.0, %x
store volatile float %fast.no.md, float addrspace(1)* %out
@@ -78,28 +78,6 @@ define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #
ret void
}
-; CHECK-LABEL: @rcp_fdiv_arcp_denormal(
-; CHECK: %arcp.low.accuracy = call arcp float @llvm.amdgcn.fdiv.fast(float 1.000000e+00, float %x), !fpmath !0
-; CHECK: %arcp.high.accuracy = fdiv arcp float 1.000000e+00, %x, !fpmath !2
-; CHECK: %arcp.low.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
-; CHECK: %arcp.high.afn = call arcp afn float @llvm.amdgcn.rcp.f32(float %x)
-define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, float %x) #2 {
-
- %arcp.low.accuracy = fdiv arcp float 1.0, %x, !fpmath !0
- store volatile float %arcp.low.accuracy, float addrspace(1)* %out
-
- %arcp.high.accuracy = fdiv arcp float 1.0, %x, !fpmath !2
- store volatile float %arcp.high.accuracy, float addrspace(1)* %out
-
- %arcp.low.afn = fdiv arcp afn float 1.0, %x, !fpmath !0
- store volatile float %arcp.low.afn, float addrspace(1)* %out
-
- %arcp.high.afn = fdiv arcp afn float 1.0, %x, !fpmath !2
- store volatile float %arcp.high.afn, float addrspace(1)* %out
-
- ret void
-}
-
; CHECK-LABEL: @fdiv_fpmath_vector(
; CHECK: %[[NO_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
; CHECK: %[[NO_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
@@ -113,31 +91,31 @@ define amdgpu_kernel void @rcp_fdiv_arcp_denormal(float addrspace(1)* %out, floa
; CHECK: %[[HALF_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
; CHECK: %[[HALF_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
-; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]], !fpmath !1
+; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float %[[HALF_A0]], %[[HALF_B0]]
; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
; CHECK: %[[HALF_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
; CHECK: %[[HALF_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
-; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]], !fpmath !1
+; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float %[[HALF_A1]], %[[HALF_B1]]
; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
; CHECK: %[[ONE_A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
; CHECK: %[[ONE_B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
-; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]], !fpmath !2
+; CHECK: %[[ONE_FDIV0:[0-9]+]] = fdiv float %[[ONE_A0]], %[[ONE_B0]]
; CHECK: %[[ONE_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ONE_FDIV0]], i64 0
; CHECK: %[[ONE_A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
; CHECK: %[[ONE_B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
-; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]], !fpmath !2
+; CHECK: %[[ONE_FDIV1:[0-9]+]] = fdiv float %[[ONE_A1]], %[[ONE_B1]]
; CHECK: %md.1ulp = insertelement <2 x float> %[[ONE_INS0]], float %[[ONE_FDIV1]], i64 1
; CHECK: store volatile <2 x float> %md.1ulp, <2 x float> addrspace(1)* %out
; CHECK: %[[A0:[0-9]+]] = extractelement <2 x float> %a, i64 0
; CHECK: %[[B0:[0-9]+]] = extractelement <2 x float> %b, i64 0
-; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]]), !fpmath !0
+; CHECK: %[[FDIV0:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A0]], float %[[B0]])
; CHECK: %[[INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FDIV0]], i64 0
; CHECK: %[[A1:[0-9]+]] = extractelement <2 x float> %a, i64 1
; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
-; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
+; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]])
; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
%no.md = fdiv <2 x float> %a, %b
@@ -165,20 +143,20 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
; CHECK: %[[HALF0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]], !fpmath !1
+; CHECK: %[[HALF_FDIV0:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF0]]
; CHECK: %[[HALF_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[HALF_FDIV0]], i64 0
; CHECK: %[[HALF1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]], !fpmath !1
+; CHECK: %[[HALF_FDIV1:[0-9]+]] = fdiv float 1.000000e+00, %[[HALF1]]
; CHECK: %md.half.ulp = insertelement <2 x float> %[[HALF_INS0]], float %[[HALF_FDIV1]], i64 1
; CHECK: store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
-; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]
-; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
-; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO1]]
-; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]])
+; CHECK: %[[AFN_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_NO_FDIV0]], i64 0
+; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]])
+; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_FDIV1]], i64 1
+; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
@@ -188,13 +166,13 @@ define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2
; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_RCP1]], i64 1
; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
-; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]])
-; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0
-; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]])
-; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_RCP1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_250:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]])
+; CHECK: %[[AFN_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_25_RCP0]], i64 0
+; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_25_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]])
+; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_RCP1]], i64 1
+; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0
; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
@@ -210,14 +188,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
%md.half.ulp = fdiv <2 x float> <float 1.0, float 1.0>, %x, !fpmath !1
store volatile <2 x float> %md.half.ulp, <2 x float> addrspace(1)* %out
- %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
- store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+ %afn.no.md = fdiv afn <2 x float> <float 1.0, float 1.0>, %x
+ store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
%fast.no.md = fdiv fast <2 x float> <float 1.0, float 1.0>, %x
store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
- %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
- store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+ %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
+ store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
%fast.25ulp = fdiv fast <2 x float> <float 1.0, float 1.0>, %x, !fpmath !0
store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
@@ -234,13 +212,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
; CHECK: %no.md = insertelement <2 x float> %[[NO_INS0]], float %[[NO_FDIV1]], i64 1
; CHECK: store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
-; CHECK: %[[ARCP_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_NO_FDIV0:[0-9]+]] = fdiv arcp float 1.000000e+00, %[[ARCP_NO0]]
-; CHECK: %[[ARCP_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_NO_FDIV0]], i64 0
-; CHECK: %[[ARCP_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_NO_FDIV1:[0-9]+]] = fdiv arcp float 2.000000e+00, %[[ARCP_NO1]]
-; CHECK: %arcp.no.md = insertelement <2 x float> %[[ARCP_NO_INS0]], float %[[ARCP_NO_FDIV1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_NO_FDIV0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO0]])
+; CHECK: %[[AFN_NO_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_NO_FDIV0]], i64 0
+; CHECK: %[[AFN_NO1:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_NO_FDIV1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_NO1]])
+; CHECK: %[[AFN_NO_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_NO_FDIV1]]
+; CHECK: %afn.no.md = insertelement <2 x float> %[[AFN_NO_INS0]], float %[[AFN_NO_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
; CHECK: %[[FAST_NO0:[0-9]+]] = extractelement <2 x float> %x, i64 0
; CHECK: %[[FAST_NO_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_NO0]])
@@ -251,14 +230,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out
; CHECK: %fast.no.md = insertelement <2 x float> %[[FAST_NO_INS0]], float %[[FAST_NO_MUL1]], i64 1
; CHECK: store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
-; CHECK: %[[ARCP_250:[0-9]+]] = extractelement <2 x float> %x, i64 0
-; CHECK: %[[ARCP_25_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_250]])
-; CHECK: %[[ARCP_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_25_RCP0]], i64 0
-; CHECK: %[[ARCP_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
-; CHECK: %[[ARCP_25_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_251]])
-; CHECK: %[[ARCP_25_MUL1:[0-9]+]] = fmul arcp float 2.000000e+00, %[[ARCP_25_RCP1]]
-; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_25_INS0]], float %[[ARCP_25_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+; CHECK: %[[AFN_250:[0-9]+]] = extractelement <2 x float> %x, i64 0
+; CHECK: %[[AFN_25_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_250]])
+; CHECK: %[[AFN_25_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_25_RCP0]], i64 0
+; CHECK: %[[AFN_251:[0-9]+]] = extractelement <2 x float> %x, i64 1
+; CHECK: %[[AFN_25_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_251]])
+; CHECK: %[[AFN_25_MUL1:[0-9]+]] = fmul afn float 2.000000e+00, %[[AFN_25_RCP1]]
+; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_25_INS0]], float %[[AFN_25_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
; CHECK: %[[FAST_250:[0-9]+]] = extractelement <2 x float> %x, i64 0
; CHECK: %[[FAST_25_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_250]])
@@ -272,14 +251,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
- %arcp.no.md = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x
- store volatile <2 x float> %arcp.no.md, <2 x float> addrspace(1)* %out
+ %afn.no.md = fdiv afn <2 x float> <float 1.0, float 2.0>, %x
+ store volatile <2 x float> %afn.no.md, <2 x float> addrspace(1)* %out
%fast.no.md = fdiv fast <2 x float> <float 1.0, float 2.0>, %x
store volatile <2 x float> %fast.no.md, <2 x float> addrspace(1)* %out
- %arcp.25ulp = fdiv arcp <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
- store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+ %afn.25ulp = fdiv afn <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
+ store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
%fast.25ulp = fdiv fast <2 x float> <float 1.0, float 2.0>, %x, !fpmath !0
store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
@@ -288,34 +267,34 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace
}
; CHECK-LABEL: @rcp_fdiv_fpmath_vector_partial_constant(
-; CHECK: %[[ARCP_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
-; CHECK: %[[ARCP_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
-; CHECK: %[[ARCP_RCP0:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B0]])
-; CHECK: %[[ARCP_MUL0:[0-9]+]] = fmul arcp float %[[ARCP_A0]], %[[ARCP_RCP0]], !fpmath !0
-; CHECK: %[[ARCP_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[ARCP_MUL0]], i64 0
-; CHECK: %[[ARCP_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
-; CHECK: %[[ARCP_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
-; CHECK: %[[ARCP_RCP1:[0-9]+]] = call arcp float @llvm.amdgcn.rcp.f32(float %[[ARCP_B1]])
-; CHECK: %[[ARCP_MUL1:[0-9]+]] = fmul arcp float %[[ARCP_A1]], %[[ARCP_RCP1]], !fpmath !0
-; CHECK: %arcp.25ulp = insertelement <2 x float> %[[ARCP_INS0]], float %[[ARCP_MUL1]], i64 1
-; CHECK: store volatile <2 x float> %arcp.25ulp
+; CHECK: %[[AFN_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
+; CHECK: %[[AFN_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
+; CHECK: %[[AFN_RCP0:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_B0]])
+; CHECK: %[[AFN_MUL0:[0-9]+]] = fmul afn float %[[AFN_A0]], %[[AFN_RCP0]]
+; CHECK: %[[AFN_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[AFN_MUL0]], i64 0
+; CHECK: %[[AFN_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
+; CHECK: %[[AFN_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
+; CHECK: %[[AFN_RCP1:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %[[AFN_B1]])
+; CHECK: %[[AFN_MUL1:[0-9]+]] = fmul afn float %[[AFN_A1]], %[[AFN_RCP1]]
+; CHECK: %afn.25ulp = insertelement <2 x float> %[[AFN_INS0]], float %[[AFN_MUL1]], i64 1
+; CHECK: store volatile <2 x float> %afn.25ulp
; CHECK: %[[FAST_A0:[0-9]+]] = extractelement <2 x float> %x.insert, i64 0
; CHECK: %[[FAST_B0:[0-9]+]] = extractelement <2 x float> %y, i64 0
; CHECK: %[[FAST_RCP0:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B0]])
-; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]], !fpmath !0
+; CHECK: %[[FAST_MUL0:[0-9]+]] = fmul fast float %[[FAST_A0]], %[[FAST_RCP0]]
; CHECK: %[[FAST_INS0:[0-9]+]] = insertelement <2 x float> undef, float %[[FAST_MUL0]], i64 0
; CHECK: %[[FAST_A1:[0-9]+]] = extractelement <2 x float> %x.insert, i64 1
; CHECK: %[[FAST_B1:[0-9]+]] = extractelement <2 x float> %y, i64 1
; CHECK: %[[FAST_RCP1:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %[[FAST_B1]])
-; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]], !fpmath !0
+; CHECK: %[[FAST_MUL1:[0-9]+]] = fmul fast float %[[FAST_A1]], %[[FAST_RCP1]]
; CHECK: %fast.25ulp = insertelement <2 x float> %[[FAST_INS0]], float %[[FAST_MUL1]], i64 1
; CHECK: store volatile <2 x float> %fast.25ulp
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
%x.insert = insertelement <2 x float> %x, float 1.0, i32 0
- %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
- store volatile <2 x float> %arcp.25ulp, <2 x float> addrspace(1)* %out
+ %afn.25ulp = fdiv afn <2 x float> %x.insert, %y, !fpmath !0
+ store volatile <2 x float> %afn.25ulp, <2 x float> addrspace(1)* %out
%fast.25ulp = fdiv fast <2 x float> %x.insert, %y, !fpmath !0
store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
@@ -325,13 +304,14 @@ define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> a
; CHECK-LABEL: @fdiv_fpmath_f32_denormals(
; CHECK: %no.md = fdiv float %a, %b{{$}}
-; CHECK: %md.half.ulp = fdiv float %a, %b, !fpmath !1
-; CHECK: %md.1ulp = fdiv float %a, %b, !fpmath !2
-; CHECK: %md.25ulp = fdiv float %a, %b, !fpmath !0
-; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
+; CHECK: %md.half.ulp = fdiv float %a, %b
+; CHECK: %md.1ulp = fdiv float %a, %b
+; CHECK: %md.25ulp = fdiv float %a, %b
+; CHECK: %md.3ulp = fdiv float %a, %b
; CHECK: %[[RCP_FAST:[0-9]+]] = call fast float @llvm.amdgcn.rcp.f32(float %b)
-; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]], !fpmath !0
-; CHECK: %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
+; CHECK: %fast.md.25ulp = fmul fast float %a, %[[RCP_FAST]]
+; CHECK: %[[RCP_AFN:[0-9]+]] = call afn float @llvm.amdgcn.rcp.f32(float %b)
+; CHECK: %afn.md.25ulp = fmul afn float %a, %[[RCP_AFN]]
define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
@@ -351,8 +331,8 @@ define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, f
%fast.md.25ulp = fdiv fast float %a, %b, !fpmath !0
store volatile float %fast.md.25ulp, float addrspace(1)* %out
- %arcp.md.25ulp = fdiv arcp float %a, %b, !fpmath !0
- store volatile float %arcp.md.25ulp, float addrspace(1)* %out
+ %afn.md.25ulp = fdiv afn float %a, %b, !fpmath !0
+ store volatile float %afn.md.25ulp, float addrspace(1)* %out
ret void
}
@@ -361,11 +341,6 @@ attributes #0 = { nounwind optnone noinline }
attributes #1 = { nounwind }
attributes #2 = { nounwind "target-features"="+fp32-denormals" }
-; CHECK: !0 = !{float 2.500000e+00}
-; CHECK: !1 = !{float 5.000000e-01}
-; CHECK: !2 = !{float 1.000000e+00}
-; CHECK: !3 = !{float 3.000000e+00}
-
!0 = !{float 2.500000e+00}
!1 = !{float 5.000000e-01}
!2 = !{float 1.000000e+00}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 3199c49af4e5..3f61aca713e1 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -63,7 +63,7 @@ entry:
%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%b.val = load volatile half, half addrspace(1)* %gep.b
- %r.val = fdiv half 1.0, %b.val
+ %r.val = fdiv half 1.0, %b.val, !fpmath !0
store half %r.val, half addrspace(1)* %gep.r
ret void
}
@@ -82,25 +82,46 @@ entry:
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%b.val = load volatile half, half addrspace(1)* %gep.b
%b.abs = call half @llvm.fabs.f16(half %b.val)
- %r.val = fdiv half 1.0, %b.abs
+ %r.val = fdiv half 1.0, %b.abs, !fpmath !0
store half %r.val, half addrspace(1)* %gep.r
ret void
}
-; GCN-LABEL: {{^}}v_rcp_f16_arcp:
+; We cannot do 1/b -> rcp_f16(b) unless !fpmath allows an error of at least 1ulp.
+
+; GCN-LABEL: {{^}}reciprocal_f16_rounded:
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}
+; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]]
+; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]]
+; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]]
+; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+ %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+ %b.val = load volatile half, half addrspace(1)* %gep.b
+ %r.val = fdiv half 1.0, %b.val
+ store half %r.val, half addrspace(1)* %gep.r
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_afn:
; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
; GFX8_9_10-NOT: [[VAL]]
; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GFX8_9_10-NOT: [[RESULT]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%b.val = load volatile half, half addrspace(1)* %gep.b
- %r.val = fdiv arcp half 1.0, %b.val
+ %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
store half %r.val, half addrspace(1)* %gep.r
ret void
}
@@ -118,7 +139,7 @@ entry:
%gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%b.val = load volatile half, half addrspace(1)* %gep.b
- %r.val = fdiv half -1.0, %b.val
+ %r.val = fdiv half -1.0, %b.val, !fpmath !0
store half %r.val, half addrspace(1)* %gep.r
ret void
}
@@ -137,7 +158,7 @@ entry:
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%b.val = load volatile half, half addrspace(1)* %gep.b
%b.sqrt = call half @llvm.sqrt.f16(half %b.val)
- %r.val = fdiv half 1.0, %b.sqrt
+ %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
store half %r.val, half addrspace(1)* %gep.r
ret void
}
@@ -157,12 +178,12 @@ entry:
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%b.val = load volatile half, half addrspace(1)* %gep.b
%b.sqrt = call half @llvm.sqrt.f16(half %b.val)
- %r.val = fdiv half -1.0, %b.sqrt
+ %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
store half %r.val, half addrspace(1)* %gep.r
ret void
}
-; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
+; GCN-LABEL: {{^}}v_fdiv_f16_afn:
; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
@@ -170,7 +191,7 @@ entry:
; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -179,7 +200,7 @@ entry:
%gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
%a.val = load volatile half, half addrspace(1)* %gep.a
%b.val = load volatile half, half addrspace(1)* %gep.b
- %r.val = fdiv arcp half %a.val, %b.val
+ %r.val = fdiv afn half %a.val, %b.val
store half %r.val, half addrspace(1)* %gep.r
ret void
}
@@ -206,38 +227,38 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
+; FUNC-LABEL: {{^}}div_afn_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_2_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
- %rcp = fdiv arcp half %x, 2.0
+ %rcp = fdiv afn half %x, 2.0
store half %rcp, half addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}
; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_k_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
- %rcp = fdiv arcp half %x, 10.0
+ %rcp = fdiv afn half %x, 10.0
store half %rcp, half addrspace(1)* %out, align 4
ret void
}
-; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
-; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
+; FUNC-LABEL: {{^}}div_afn_neg_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}
; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
; GFX8_9_10: buffer_store_short [[MUL]]
-define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
- %rcp = fdiv arcp half %x, -10.0
+ %rcp = fdiv afn half %x, -10.0
store half %rcp, half addrspace(1)* %out, align 4
ret void
}
@@ -249,3 +270,5 @@ declare half @llvm.fabs.f16(half) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
+
+!0 = !{float 2.500000e+00}
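Taken together, the f16 tests encode the new 1.0 ulp threshold. A hedged recap
in IR (illustrative; the !1 node is assumed and does not appear in the test
file):

  %a = fdiv half 1.0, %b              ; no !fpmath: keeps the v_div_fixup_f16 sequence
  %c = fdiv half 1.0, %b, !fpmath !1  ; 1.0 ulp allowed: may become @llvm.amdgcn.rcp.f16
  !1 = !{float 1.000000e+00}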