[llvm] AMDGPU: Simplify DS atomicrmw fadd handling (PR #89468)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 20 01:25:27 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/89468
>From 2653277e46e65990322b4d958b9b15b730752132 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 19 Apr 2024 16:34:51 +0200
Subject: [PATCH] AMDGPU: Simplify DS atomicrmw fadd handling
DS atomic fadd F32 does respect the denormal mode, so we do not need
to consider the expected FP mode or the amdgpu-unsafe-fp-atomics
attribute. These instructions do not respect the rounding mode, but we
only care about that under strictfp.
This also reveals that the fp-mode-is-flush check has been missing in
the cases that should have considered it alongside
amdgpu-unsafe-fp-atomics.
Finally, this stops special-casing f64 when flushing is enabled, since
flushing is not mandated there and we barely handle that case.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 2 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 +-
llvm/lib/Target/AMDGPU/DSInstructions.td | 8 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 47 ++-
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 38 +-
.../AMDGPU/atomics-hw-remarks-gfx90a.ll | 1 -
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 52 +--
.../AMDGPU/expand-atomic-rmw-fadd.ll | 344 +++++++++++++++---
9 files changed, 342 insertions(+), 155 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 9b09550159993c..5c2c6d4b13c669 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1896,7 +1896,7 @@ def HasVINTERPEncoding : Predicate<"Subtarget->hasVINTERPEncoding()">,
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
-def HasLDSFPAtomicAdd : Predicate<"Subtarget->hasLDSFPAtomicAdd()">,
+def HasLDSFPAtomicAddF32 : Predicate<"Subtarget->hasLDSFPAtomicAddF32()">,
AssemblerPredicate<(all_of FeatureGFX8Insts)>;
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e55d1de01b4fd1..780dfaae11ef3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1624,7 +1624,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
- if (ST.hasLDSFPAtomicAdd()) {
+ if (ST.hasLDSFPAtomicAddF32()) {
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasLdsAtomicAddF64())
Atomic.legalFor({{S64, LocalPtr}});
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 0773ef7f323418..d63f04ab6d4c91 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -443,7 +443,7 @@ defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;
-let SubtargetPredicate = HasLDSFPAtomicAdd in {
+let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
}
@@ -523,7 +523,7 @@ defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32>;
-let SubtargetPredicate = HasLDSFPAtomicAdd in {
+let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32>;
}
defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32>;
@@ -697,7 +697,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
} // let SubtargetPredicate = isGFX8Plus
-let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] in {
+let SubtargetPredicate = HasLDSFPAtomicAddF32, OtherPredicates = [HasDsSrc2Insts] in {
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
}
@@ -1088,7 +1088,7 @@ let SubtargetPredicate = isGFX11Plus in {
defm : DSAtomicCmpXChg_mc<DS_CMPSTORE_RTN_B32, DS_CMPSTORE_B32, i32, "atomic_cmp_swap">;
}
-let SubtargetPredicate = HasLDSFPAtomicAdd in {
+let SubtargetPredicate = HasLDSFPAtomicAddF32 in {
defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd">;
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 8a4a46ce50d1d7..2ca5ae306b11be 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -960,7 +960,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasScalarAtomics;
}
- bool hasLDSFPAtomicAdd() const { return GFX8Insts; }
+ bool hasLDSFPAtomicAddF32() const { return GFX8Insts; }
+ bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; }
/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c4592d43f4f8a8..12b7f05c446309 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15977,6 +15977,8 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}
+#if 0
+// FIXME: This should be checked before unsafe fp atomics are enabled
// Global FP atomic instructions have a hardcoded FP mode and do not support
// FP32 denormals, and only support v2f16 denormals.
static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
@@ -15986,6 +15988,7 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
return DenormMode == DenormalMode::getPreserveSign();
return DenormMode == DenormalMode::getIEEE();
}
+#endif
// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
// floating point atomic instructions. May generate more efficient code,
@@ -16046,8 +16049,31 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::FAdd: {
Type *Ty = RMW->getType();
- if (Ty->isHalfTy())
+ // TODO: Handle REGION_ADDRESS
+ if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+ // DS F32 FP atomics do respect the denormal mode, but the rounding mode
+ // is fixed to round-to-nearest-even.
+ //
+ // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
+ // round-to-nearest-even.
+ //
+ // We ignore the rounding mode problem, even in strictfp. The C++ standard
+ // suggests it is OK if the floating-point mode may not match the calling
+ // thread.
+ if (Ty->isFloatTy()) {
+ return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
+
+ if (Ty->isDoubleTy()) {
+ // Ignores denormal mode, but we don't consider flushing mandatory.
+ return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
+ : AtomicExpansionKind::CmpXChg;
+ }
+
+ // TODO: Handle v2f16/v2bf16 cases for gfx940
return AtomicExpansionKind::CmpXChg;
+ }
if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
return AtomicExpansionKind::CmpXChg;
@@ -16091,7 +16117,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// space. If it is in global address space, we emit the global atomic
// fadd; if it is in shared address space, we emit the LDS atomic fadd.
if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
- Subtarget->hasLDSFPAtomicAdd()) {
+ Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return AtomicExpansionKind::Expand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
@@ -16101,23 +16127,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
- // DS FP atomics do respect the denormal mode, but the rounding mode is
- // fixed to round-to-nearest-even.
- // The only exception is DS_ADD_F64 which never flushes regardless of mode.
- if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
- if (!Ty->isDoubleTy())
- return AtomicExpansionKind::None;
-
- if (fpModeMatchesGlobalFPAtomicMode(RMW))
- return AtomicExpansionKind::None;
-
- return RMW->getFunction()
- ->getFnAttribute("amdgpu-unsafe-fp-atomics")
- .getValueAsString() == "true"
- ? ReportUnsafeHWInst(AtomicExpansionKind::None)
- : AtomicExpansionKind::CmpXChg;
- }
-
return AtomicExpansionKind::CmpXChg;
}
case AtomicRMWInst::FMin:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 1a76f8cf87ffbe..4e94a646f6da5e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -2074,28 +2074,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB67_3
+; GFX90A-NEXT: s_cbranch_execz .LBB67_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v4, s0
-; GFX90A-NEXT: ds_read_b64 v[2:3], v4
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
-; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB67_2
-; GFX90A-NEXT: .LBB67_3:
+; GFX90A-NEXT: .LBB67_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2106,28 +2095,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB67_3
+; GFX940-NEXT: s_cbranch_execz .LBB67_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v4, s0
-; GFX940-NEXT: ds_read_b64 v[2:3], v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
+; GFX940-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB67_2
-; GFX940-NEXT: .LBB67_3:
+; GFX940-NEXT: .LBB67_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
index 001a4e999aee98..b3fb67a7d7e0c9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll
@@ -1,7 +1,6 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \
; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW
-; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope system due to an unsafe request.
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent due to an unsafe request.
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope workgroup due to an unsafe request.
; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope wavefront due to an unsafe request.
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index a948fab8f1c1b9..121fab51024fdd 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -2213,29 +2213,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB72_3
+; GFX90A-NEXT: s_cbranch_execz .LBB72_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s4
-; GFX90A-NEXT: ds_read_b64 v[2:3], v0
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, s4
-; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB72_2
-; GFX90A-NEXT: .LBB72_3:
+; GFX90A-NEXT: .LBB72_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2245,29 +2233,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB72_3
+; GFX940-NEXT: s_cbranch_execz .LBB72_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NEXT: ds_read_b64 v[2:3], v0
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
-; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
+; GFX940-NEXT: v_mov_b32_e32 v2, s0
+; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB72_2
-; GFX940-NEXT: .LBB72_3:
+; GFX940-NEXT: .LBB72_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 337e51f9a912c2..17318b2c62ca8c 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -3670,84 +3670,308 @@ define double @test_atomicrmw_fadd_f64_dyndenorm_global_system_ret__amdgpu_ignor
}
define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(ptr addrspace(3) %ptr, double %value) #5 {
-; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
-; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
-; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
;
%unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic
ret void
}
define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(ptr addrspace(3) %ptr, double %value) #5 {
-; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
-; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
-; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret double [[TMP5]]
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret double [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8
+; GFX90A-NEXT: ret double [[RET]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8
+; GFX940-NEXT: ret double [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
;
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic
ret double %ret
}
define void @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 {
-; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
-; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
-; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret void
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret void
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret void
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret void
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret void
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[UNUSED:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret void
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_noret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret void
;
%unused = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
ret void
}
define double @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(ptr addrspace(3) %ptr, double %value) #5 {
-; ALL-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
-; ALL-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
-; ALL-NEXT: br label [[ATOMICRMW_START:%.*]]
-; ALL: atomicrmw.start:
-; ALL-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; ALL-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; ALL-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
-; ALL-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
-; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; ALL-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; ALL: atomicrmw.end:
-; ALL-NEXT: ret double [[TMP5]]
+; CI-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; CI-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; CI-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CI: atomicrmw.start:
+; CI-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; CI-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; CI-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CI-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CI-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CI: atomicrmw.end:
+; CI-NEXT: ret double [[TMP5]]
+;
+; GFX9-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; GFX9-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX9: atomicrmw.start:
+; GFX9-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX9-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX9-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX9-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX9-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX9: atomicrmw.end:
+; GFX9-NEXT: ret double [[TMP5]]
+;
+; GFX908-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX908: atomicrmw.start:
+; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX908: atomicrmw.end:
+; GFX908-NEXT: ret double [[TMP5]]
+;
+; GFX90A-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; GFX90A-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX90A-NEXT: ret double [[RET]]
+;
+; GFX940-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; GFX940-NEXT: [[RET:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], double [[VALUE:%.*]] monotonic, align 8, !amdgpu.ignore.denormal.mode [[META0]]
+; GFX940-NEXT: ret double [[RET]]
+;
+; GFX11-LABEL: @test_atomicrmw_fadd_f64_dyndenorm_local_system_ret__amdgpu_ignore_denormal_mode(
+; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(3) [[PTR:%.*]], align 8
+; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GFX11: atomicrmw.start:
+; GFX11-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; GFX11-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
+; GFX11-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; GFX11-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] monotonic monotonic, align 8
+; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; GFX11-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
+; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GFX11: atomicrmw.end:
+; GFX11-NEXT: ret double [[TMP5]]
;
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double %value monotonic, !amdgpu.ignore.denormal.mode !0
ret double %ret