[llvm] af0207f - AMDGPU: Check global FP atomics match default FP mode
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 23 06:07:57 PDT 2020
Author: Matt Arsenault
Date: 2020-09-23T09:07:50-04:00
New Revision: af0207f2bae8578c5283877a786e502ce6e33b14
URL: https://github.com/llvm/llvm-project/commit/af0207f2bae8578c5283877a786e502ce6e33b14
DIFF: https://github.com/llvm/llvm-project/commit/af0207f2bae8578c5283877a786e502ce6e33b14.diff
LOG: AMDGPU: Check global FP atomics match default FP mode
We would always select global FP atomics from atomicrmw fadd, although
they have a hardcoded FP mode.
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 73c65647352f..710014f141b1 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11748,6 +11748,16 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}
+// Global FP atomics have a hardcoded FP mode. Returns true if RMW's function
+// uses it: preserve-sign (flushed) denormals for FP32, IEEE otherwise (v2f16).
+static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
+ auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
+ if (&Flt == &APFloat::IEEEsingle())
+ return DenormMode == DenormalMode::getPreserveSign();
+ return DenormMode == DenormalMode::getIEEE();
+}
+
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
switch (RMW->getOperation()) {
@@ -11766,10 +11776,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+ if (!fpModeMatchesGlobalFPAtomicMode(RMW))
+ return AtomicExpansionKind::CmpXChg;
+
return RMW->use_empty() ? AtomicExpansionKind::None :
AtomicExpansionKind::CmpXChg;
}
+ // DS FP atomics do respect the denormal mode, but the rounding mode is fixed
+ // to round-to-nearest-even.
return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index af54135d1ceb..a54116a79fbe 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -7,39 +7,55 @@
; CAS: global_atomic_cmpswap
; CAS: s_andn2_b64 exec, exec,
; CAS-NEXT: s_cbranch_execnz [[LOOP]]
-define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 {
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ store float %result, float addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_ieee:
+; CAS: [[LOOP:BB[0-9]+_[0-9]+]]
+; CAS: v_add_f32_e32
+; CAS: global_atomic_cmpswap
+; CAS: s_andn2_b64 exec, exec,
+; CAS-NEXT: s_cbranch_execnz [[LOOP]]
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) {
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
store float %result, float addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32:
-; GFX900: [[LOOP:BB[0-9]+_[0-9]+]]
-; GFX900: v_add_f32_e32
-; GFX900: global_atomic_cmpswap
-; GFX900: s_andn2_b64 exec, exec,
-; GFX900-NEXT: s_cbranch_execnz [[LOOP]]
+; GCN: [[LOOP:BB[0-9]+_[0-9]+]]
+; GCN: v_add_f32_e32
+; GCN: global_atomic_cmpswap
+; GCN: s_andn2_b64 exec, exec,
+; GCN-NEXT: s_cbranch_execnz [[LOOP]]
+define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 {
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ ret void
+}
-; GFX908-NOT: v_add_f32
-; GFX908: global_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off
-; GFX908-NOT: s_cbranch_execnz
-define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) {
+; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_ieee:
+; GCN: global_atomic_cmpswap
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) {
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
ret void
}
; Make sure this artificially selects with an incorrect subtarget, but the feature set.
; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_wrong_subtarget:
-define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
store float %result, float addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_wrong_subtarget:
-define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 {
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
ret void
}
-attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" }
+attributes #0 = { "denormal-fp-math-fp32"="preserve-sign,preserve-sign"}
+attributes #1 = { "denormal-fp-math-fp32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" }
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 42ff7aa32b57..78992ac3048e 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -115,8 +115,8 @@ define float @test_atomicrmw_fadd_f32_global(float addrspace(1)* %ptr, float %va
ret float %res
}
-define void @test_atomicrmw_fadd_f32_global_no_use(float addrspace(1)* %ptr, float %value) {
-; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use(
+define void @test_atomicrmw_fadd_f32_global_no_use_ieee(float addrspace(1)* %ptr, float %value) {
+; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee(
; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
; CI: atomicrmw.start:
@@ -133,7 +133,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use(float addrspace(1)* %ptr, flo
; CI: atomicrmw.end:
; CI-NEXT: ret void
;
-; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use(
+; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee(
; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
; GFX9: atomicrmw.start:
@@ -150,7 +150,63 @@ define void @test_atomicrmw_fadd_f32_global_no_use(float addrspace(1)* %ptr, flo
; GFX9: atomicrmw.end:
; GFX9-NEXT: ret void
;
-; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use(
+; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee(
+; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)*
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+ %res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst
+ ret void
+}
+
+define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(float addrspace(1)* %ptr, float %value) #0 {
+; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(
+; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)*
+; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(
+; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)*
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
+; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
+; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst
; GFX908-NEXT: ret void
;
@@ -407,3 +463,4 @@ define double @test_atomicrmw_fadd_f64_local(double addrspace(3)* %ptr, double %
ret double %res
}
+attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
More information about the llvm-commits
mailing list