[llvm] [AMDGPU] Fix scan of atomicFSub in AtomicOptimizer. (PR #66082)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 12 06:08:20 PDT 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Changes:
[D156301](https://reviews.llvm.org/D156301) introduced atomic optimizations for FAdd/FSub. For FSub, the reduction/scan must be performed with an add operation (not sub); the memory location is then updated with the reduced value by a single lane using an atomic sub.
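To make the reasoning concrete, here is a minimal host-side sketch (not part of the patch; all names are illustrative) of why the scan has to add the lane values even though the memory update is a subtraction: atomically subtracting each lane's value is equivalent to a single atomic sub of the sum of those values.

```cpp
// Illustrative sketch only, assuming four active lanes with divergent values.
#include <cstdio>

int main() {
  float Mem = 100.0f;                          // simulated memory location
  const float LaneVals[4] = {1.0f, 2.0f, 3.0f, 4.0f};

  // Unoptimized semantics: every lane performs its own atomic fsub.
  float Naive = Mem;
  for (float V : LaneVals)
    Naive -= V;                                // 100 - (1+2+3+4) = 90

  // Optimized semantics: reduce the lane values with fadd (the ScanOp),
  // then a single lane applies one atomic fsub with the reduced value.
  float Reduced = 0.0f;
  for (float V : LaneVals)
    Reduced += V;                              // fadd-based reduction
  float Optimized = Mem - Reduced;             // single atomic fsub

  std::printf("naive=%f optimized=%f\n", Naive, Optimized); // both print 90
  return 0;
}
```

Using fsub as the scan operation (the pre-patch behavior) would instead compute `v0 - v1 - v2 - ...`, which is not the value the single atomic sub needs.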
--
Patch is 69.85 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66082.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (+9-4)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+165-165)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index b7323bdac40c3bf..f27e57e78c7b4f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -754,8 +754,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
- const AtomicRMWInst::BinOp ScanOp =
- Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+ // For atomic sub, perform the scan with an add operation and allow one lane to subtract the reduced value later.
+ AtomicRMWInst::BinOp ScanOp;
+ if (Op == AtomicRMWInst::Sub) {
+ ScanOp = AtomicRMWInst::Add;
+ } else if (Op == AtomicRMWInst::FSub) {
+ ScanOp = AtomicRMWInst::FAdd;
+ } else {
+ ScanOp = Op;
+ }
if (ScanImpl == ScanOptions::DPP) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
@@ -766,8 +773,6 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
NewV = B.CreateBitCast(NewV, Ty);
V = B.CreateBitCast(V, Ty);
Identity = B.CreateBitCast(Identity, Ty);
- const AtomicRMWInst::BinOp ScanOp =
- Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
if (!NeedResult && ST->hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index 2d9cfd146c6bb73..c0ca6bc99f798c5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -451,7 +451,7 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast float [[OLDVALUEPHI]] to i32
; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.writelane(i32 [[TMP25]], i32 [[TMP21]], i32 [[TMP26]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP28]] = bitcast i32 [[TMP27]] to float
-; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fsub.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP29]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP24]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP30:%.*]] = shl i64 1, [[TMP20]]
; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], -1
; IR-ITERATIVE-NEXT: [[TMP32]] = and i64 [[ACTIVEBITS]], [[TMP31]]
@@ -476,17 +476,17 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP24]], i32 312, i32 15, i32 15, i1 false) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP24]] to i32
; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP26]], i32 63) #[[ATTR8]]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index bfc6a3ba6067ac1..c3e25bdbcd525d2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -193,7 +193,7 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
-; IR-ITERATIVE-NEXT: [[TMP16]] = fsub float [[ACCUMULATOR]], [[TMP15]]
+; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1
; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
@@ -217,17 +217,17 @@ define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #
; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float
; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float
; IR-DPP-NEXT: [[TMP11:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP9]], i32 273, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP12:%.*]] = fsub float [[TMP9]], [[TMP11]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = fadd float [[TMP9]], [[TMP11]]
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP12]], i32 274, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = fadd float [[TMP12]], [[TMP13]]
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP14]], i32 276, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP16:%.*]] = fsub float [[TMP14]], [[TMP15]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = fadd float [[TMP14]], [[TMP15]]
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP16]], i32 280, i32 15, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP18:%.*]] = fsub float [[TMP16]], [[TMP17]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = fadd float [[TMP16]], [[TMP17]]
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP18]], i32 322, i32 10, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP20:%.*]] = fsub float [[TMP18]], [[TMP19]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP18]], [[TMP19]]
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP20]], i32 323, i32 12, i32 15, i1 false)
-; IR-DPP-NEXT: [[TMP22:%.*]] = fsub float [[TMP20]], [[TMP21]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = fadd float [[TMP20]], [[TMP21]]
; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast float [[TMP22]] to i32
; IR-DPP-NEXT: [[TMP24:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP23]], i32 63)
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast i32 [[TMP24]] to float
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index e7c6f3da458c60a..ea2192ccb8b0d2b 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -354,7 +354,7 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast float [[VAL:%.*]] to i32
; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP16]], i32 [[TMP15]]) #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float
-; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fsub.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19]] = call float @llvm.experimental.constrained.fadd.f32(float [[ACCUMULATOR]], float [[TMP18]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = shl i64 1, [[TMP14]]
; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], -1
; IR-ITERATIVE-NEXT: [[TMP22]] = and i64 [[ACTIVEBITS]], [[TMP21]]
@@ -379,17 +379,17 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP10]] to float
; IR-DPP-NEXT: [[TMP12:%.*]] = bitcast i32 [[TMP9]] to float
; IR-DPP-NEXT: [[TMP13:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP11]], i32 273, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP14:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP11]], float [[TMP13]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP14]], i32 274, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP16:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP14]], float [[TMP15]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP16]], i32 276, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP18:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP16]], float [[TMP17]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP18]], i32 280, i32 15, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP18]], float [[TMP19]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP21:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP20]], i32 322, i32 10, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP22:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP20]], float [[TMP21]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP23:%.*]] = call float @llvm.amdgcn.update.dpp.f32(float 0.000000e+00, float [[TMP22]], i32 323, i32 12, i32 15, i1 false) #[[ATTR8]]
-; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fsub.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[TMP22]], float [[TMP23]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
; IR-DPP-NEXT: [[TMP25:%.*]] = bitcast float [[TMP24]] to i32
; IR-DPP-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP25]], i32 63) #[[ATTR8]]
; IR-DPP-NEXT: [[TMP27:%.*]] = bitcast i32 [[TMP26]] to float
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 55a3b877181fe9b..7dcca1160d655a2 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -477,7 +477,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX9-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -547,7 +547,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1064-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -614,7 +614,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
-; GFX1032-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -676,7 +676,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
-; GFX1164-NEXT: v_subrev_f32_e32 v2, s4, v2
+; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -738,7 +738,7 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_subrev_f32_e32 v2, s2, v2
+; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -805,21 +805,21 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX9-DPP-NEXT: s_not_b64 exec, exec
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_nop 0
-; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-DPP-NEXT: s_nop 1
-; GFX9-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX9-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
-; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_nop 1
; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
-; GFX9-DPP-NEXT: v_sub_f32_e32 v3, v3, v4
+; GFX9-DPP-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
@@ -878,19 +878,19 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-DPP-NEXT: s_not_b64 exec, exec
; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX1064-DPP-NEXT: v_subrev_f32_dpp v3, v3, v3 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX1064-DPP-NEXT: v_add_f32_dpp v3, v3, v3 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1...
<truncated>
https://github.com/llvm/llvm-project/pull/66082
More information about the llvm-commits mailing list