[llvm] [AMDGPU] Cleanup bitcast spam in atomic optimizer (PR #96933)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 27 10:18:08 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Vikram Hegde (vikramRH)
<details>
<summary>Changes</summary>
---
Patch is 339.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96933.diff
14 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp (+27-80)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+11-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+19-20)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+56-56)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+80-80)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll (+15-18)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+14-15)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll (+492-676)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll (+66-80)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll (+216-258)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+90-80)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+60-54)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+60-54)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+90-80)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index d7ef6f3c5dc43..cdd1953dca4ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -386,7 +386,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
Value *V,
Value *const Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -402,34 +401,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce within each pair of rows (i.e. 32 lanes).
assert(ST->hasPermLaneX16());
- V = B.CreateBitCast(V, IntNTy);
Value *Permlanex16Call = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- B.CreateBitCast(Permlanex16Call, AtomicTy));
+ V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
if (ST->isWave32()) {
return V;
}
if (ST->hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
- V = B.CreateBitCast(V, IntNTy);
Value *Permlane64Call =
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
- return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- B.CreateBitCast(Permlane64Call, AtomicTy));
+ return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
}
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
- V = B.CreateBitCast(V, IntNTy);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
- return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
- B.CreateBitCast(Lane32, AtomicTy));
+ return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
}
// Use the builder to create an inclusive scan of V across the wavefront, with
@@ -438,8 +431,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
AtomicRMWInst::BinOp Op, Value *V,
Value *Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -470,20 +461,17 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
assert(ST->hasPermLaneX16());
- V = B.CreateBitCast(V, IntNTy);
Value *PermX = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_permlanex16,
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
- Value *UpdateDPPCall =
- B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
- B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
- B.getInt32(0xf), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+ Value *UpdateDPPCall = B.CreateCall(
+ UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+ V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
- V = B.CreateBitCast(V, IntNTy);
Value *const Lane31 = B.CreateIntrinsic(
V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
@@ -491,8 +479,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
- V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
- UpdateDPPCall);
+ V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
}
}
return V;
@@ -503,8 +490,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
Value *Identity) const {
Type *AtomicTy = V->getType();
- Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -514,10 +499,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane = Intrinsic::getDeclaration(
- M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
- Function *WriteLane = Intrinsic::getDeclaration(
- M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
@@ -527,24 +512,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
// Copy the old lane 15 to the new lane 16.
- V = B.CreateCall(
- WriteLane,
- {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
- B.getInt32(16), B.CreateBitCast(V, IntNTy)});
- V = B.CreateBitCast(V, AtomicTy);
+ V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+ B.getInt32(16), V});
+
if (!ST->isWave32()) {
// Copy the old lane 31 to the new lane 32.
- V = B.CreateBitCast(V, IntNTy);
- V = B.CreateCall(WriteLane,
- {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
- B.getInt32(31)}),
- B.getInt32(32), V});
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
// Copy the old lane 47 to the new lane 48.
V = B.CreateCall(
WriteLane,
{B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
- V = B.CreateBitCast(V, AtomicTy);
}
}
@@ -584,24 +564,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
auto *FF1 =
B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
- Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
- auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+ auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
// Get the value required for atomic operation
- V = B.CreateBitCast(V, IntNTy);
Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
{V, LaneIdxInt});
- LaneValue = B.CreateBitCast(LaneValue, Ty);
// Perform writelane if intermediate scan results are required later in the
// kernel computations
Value *OldValue = nullptr;
if (NeedResult) {
- OldValue =
- B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
- {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
- B.CreateBitCast(OldValuePhi, IntNTy)});
- OldValue = B.CreateBitCast(OldValue, Ty);
+ OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+ {Accumulator, LaneIdxInt, OldValuePhi});
OldValuePhi->addIncoming(OldValue, ComputeLoop);
}
@@ -700,10 +674,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
Type *Int32Ty = B.getInt32Ty();
- Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
- auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -758,13 +730,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
if (ScanImpl == ScanOptions::DPP) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- V = B.CreateBitCast(V, IntNTy);
- Identity = B.CreateBitCast(Identity, IntNTy);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
- {V, Identity});
- NewV = B.CreateBitCast(NewV, Ty);
- V = B.CreateBitCast(V, Ty);
- Identity = B.CreateBitCast(Identity, Ty);
+ NewV =
+ B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
if (!NeedResult && ST->hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
@@ -779,10 +746,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// which we will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
assert(TyBitWidth == 32);
- NewV = B.CreateBitCast(NewV, IntNTy);
- NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+ NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
{NewV, LastLaneIdx});
- NewV = B.CreateBitCast(NewV, Ty);
}
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -922,26 +887,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// but have to handle 64-bit broadcasts with two calls to this intrinsic.
Value *BroadcastI = nullptr;
- if (TyBitWidth == 64) {
- Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
- Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
- CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
- Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
- CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
- Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
- Value *const PartialInsert = B.CreateInsertElement(
- PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
- BroadcastI = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
- Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
- BroadcastI =
- B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
- BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
+ if (TyBitWidth == 32 || TyBitWidth == 64) {
+ BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
} else {
llvm_unreachable("Unhandled atomic bit width");
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 21832dc320e42..3f0b86c271538 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -169,30 +169,29 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY8]], [[COPY9]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]]
; GFX90A_GFX940-NEXT: [[V_MBCNT_HI_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_HI_U32_B32_e64 [[COPY10]], [[V_MBCNT_LO_U32_B32_e64_]], implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX90A_GFX940-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY11]], implicit-def dead $scc, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY12]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY13]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY14]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp3:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY15]], [[V_ADD_F32_e64_2]], 280, 15, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_3:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_2]], 0, [[V_MOV_B32_dpp3]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp4:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY16]], [[V_ADD_F32_e64_3]], 322, 10, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, [[V_MOV_B32_dpp4]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]]
+ ; GFX90A_GFX940-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
; GFX90A_GFX940-NEXT: [[V_MOV_B32_dpp5:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY17]], [[V_ADD_F32_e64_4]], 323, 12, 15, 0, implicit $exec
; GFX90A_GFX940-NEXT: [[V_ADD_F32_e64_5:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_4]], 0, [[V_MOV_B32_dpp5]], 0, 0, implicit $mode, implicit $exec
- ; GFX90A_GFX940-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 63
- ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_4]]
+ ; GFX90A_GFX940-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 63
+ ; GFX90A_GFX940-NEXT: [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 [[V_ADD_F32_e64_5]], [[S_MOV_B32_3]]
; GFX90A_GFX940-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[V_READLANE_B32_]]
; GFX90A_GFX940-NEXT: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
; GFX90A_GFX940-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
@@ -200,7 +199,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_BRANCH %bb.3
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.35):
+ ; GFX90A_GFX940-NEXT: bb.3 (%ir-block.31):
; GFX90A_GFX940-NEXT: successors: %bb.4(0x80000000)
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
@@ -211,7 +210,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
; GFX90A_GFX940-NEXT: {{ $}}
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF1]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: {{ $}}
- ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.37):
+ ; GFX90A_GFX940-NEXT: bb.5 (%ir-block.33):
; GFX90A_GFX940-NEXT: SI_END_CF [[SI_IF]], implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A_GFX940-NEXT: S_ENDPGM 0
%ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("wavefront") monotonic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index e48d281f37c9a..676eae1bad85d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -171,35 +171,34 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
; GFX11-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]]
; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GFX11-NEXT: [[V_MBCNT_LO_U32_B32_e64_:%[0-9]+]]:vgpr_32 = V_MBCNT_LO_U32_B32_e64 [[COPY4]], [[COPY5]], implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -2147483648
+ ; GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_SET_INACTIVE_B32_:%[0-9]+]]:vgpr_32 = V_SET_INACTIVE_B32 [[COPY2]], [[COPY6]], implicit-def dead $scc, implicit $exec
- ; GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
- ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY7]], [[V_SET_INACTIVE_B32_]], 273, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_SET_INACTIVE_B32_]], 0, [[V_MOV_B32_dpp]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY8]], [[V_ADD_F32_e64_]], 274, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_MOV_B32_dpp1]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]]
+ ; GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
; GFX11-NEXT: [[V_MOV_B32_dpp2:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY9]], [[V_ADD_F32_e64_1]], 276, 15, 15, 0, implicit $exec
; GFX11-NEXT: [[V_ADD_F32_e64_2:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_1]], 0, [[V_MOV_B32_dpp2]], 0, 0, implicit $mode, implicit $exec
- ; GFX11-NEX...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/96933
More information about the llvm-commits mailing list