[llvm-branch-commits] [llvm] AMDGPU: Handle new atomicrmw metadata for fadd case (PR #96760)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 26 05:41:37 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
This is the most complex atomicrmw support case. Note that we don't have
accurate remarks for all of the cases; I'm planning to fix that in a
later change with more precise wording.

Continue respecting amdgpu-unsafe-fp-atomics until its eventual removal.
This also seems to fix a few cases that were not interpreting
amdgpu-unsafe-fp-atomics appropriately aggressively.
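
As a rough illustration of the mechanism (the function name here is invented; the metadata spellings are the ones used in the updated tests below), the new annotations are attached directly to the atomicrmw instruction:

```llvm
; Minimal sketch: both annotations reference the empty metadata node !0.
; Roughly, !amdgpu.no.fine.grained.memory promises the location is not
; fine-grained host-coherent memory, and !amdgpu.no.remote.memory promises
; it is not accessed by a remote device.
define float @fadd_annotated(ptr addrspace(1) %ptr, float %data) {
  %ret = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
  ret float %ret
}

!0 = !{}
```

With those promises present, shouldExpandAtomicRMWInIR can select the native fadd instruction instead of a cmpxchg expansion loop.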
---
Patch is 1.02 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96760.diff
37 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+69-81)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+61-178)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+420-101)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+262-17)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f64.ll (+110-39)
- (modified) llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll (+51-69)
- (modified) llvm/test/CodeGen/AMDGPU/atomics-hw-remarks-gfx90a.ll (+11-9)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+236-87)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+736-958)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll (+13-50)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+59-156)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f64.ll (+73-28)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd-wrong-subtarget.ll (+5-3)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+746-232)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+78-182)
- (modified) llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll (+53-51)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-agent.ll (+52-676)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f32-system.ll (+182-1186)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-agent.ll (+4-52)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-f64-system.ll (+20-175)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i128.ll (+30-30)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll (+10-22)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll (+45-45)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll (+1471-3143)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-simplify-cfg-CAS-block.ll (+3-3)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll (+33-223)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll (+52-4)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll (+59-201)
- (modified) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll (+13-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 11ebfe7511f7b..f9b5aea144440 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16037,26 +16037,15 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
SNaN, Depth);
}
-#if 0
-// FIXME: This should be checked before unsafe fp atomics are enabled
-// Global FP atomic instructions have a hardcoded FP mode and do not support
-// FP32 denormals, and only support v2f16 denormals.
-static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
- const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
- auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
- if (&Flt == &APFloat::IEEEsingle())
- return DenormMode == DenormalMode::getPreserveSign();
- return DenormMode == DenormalMode::getIEEE();
-}
-#endif
+// On older subtargets, global FP atomic instructions have a hardcoded FP mode
+// and do not support FP32 denormals, and only support v2f16/f64 denormals.
+static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW) {
+ if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
+ return true;
-// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
-// floating point atomic instructions. May generate more efficient code,
-// but may not respect rounding and denormal modes, and may give incorrect
-// results for certain memory destinations.
-bool unsafeFPAtomicsDisabled(Function *F) {
- return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
- "true";
+ const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
+ auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
+ return DenormMode == DenormalMode::getPreserveSign();
}
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
@@ -16185,75 +16174,74 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AtomicExpansionKind::CmpXChg;
}
- if (!AMDGPU::isFlatGlobalAddrSpace(AS) &&
- AS != AMDGPUAS::BUFFER_FAT_POINTER)
- return AtomicExpansionKind::CmpXChg;
-
- if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
- return AtomicExpansionKind::None;
-
- if (AS == AMDGPUAS::FLAT_ADDRESS) {
- // gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
- return AtomicExpansionKind::None;
- } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
- // gfx90a, gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
- return AtomicExpansionKind::None;
-
- // gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
- return AtomicExpansionKind::None;
- } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
- // gfx90a, gfx940, gfx12
- // FIXME: Needs to account for no fine-grained memory
- if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
- return AtomicExpansionKind::None;
-
- // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
- // buffer. gfx12 does have the buffer version.
- if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
- return AtomicExpansionKind::None;
- }
-
- if (unsafeFPAtomicsDisabled(RMW->getFunction()))
- return AtomicExpansionKind::CmpXChg;
-
- // Always expand system scope fp atomics.
- if (HasSystemScope)
+ // LDS atomics respect the denormal mode from the mode register.
+ //
+ // Traditionally f32 global/buffer memory atomics would unconditionally
+ // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
+ // flush.
+ //
+ // On targets with flat atomic fadd, denormals would flush depending on
+ // whether the target address resides in LDS or global memory. We consider
+ // this flat-maybe-flush as will-flush.
+ if (Ty->isFloatTy() &&
+ !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
+ !atomicIgnoresDenormalModeOrFPModeIsFTZ(RMW))
return AtomicExpansionKind::CmpXChg;
- // global and flat atomic fadd f64: gfx90a, gfx940.
- if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
-
- if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- }
+ if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
+ if (AS == AMDGPUAS::FLAT_ADDRESS) {
+ // gfx940, gfx12
+ if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
+ // gfx90a, gfx940, gfx12
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // gfx940, gfx12
+ if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ // gfx90a, gfx940, gfx12
+ if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
+ return AtomicExpansionKind::None;
+
+ // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
+ // buffer. gfx12 does have the buffer version.
+ if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty))
+ return AtomicExpansionKind::None;
+ }
- // flat atomic fadd f32: gfx940, gfx11+.
- if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
- if (Subtarget->hasFlatAtomicFaddF32Inst())
+ // global and flat atomic fadd f64: gfx90a, gfx940.
+ if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
- // If it is in flat address space, and the type is float, we will try to
- // expand it, if the target supports global and lds atomic fadd. The
- // reason we need that is, in the expansion, we emit the check of address
- // space. If it is in global address space, we emit the global atomic
- // fadd; if it is in shared address space, we emit the LDS atomic fadd.
- if (Subtarget->hasLDSFPAtomicAddF32()) {
+ if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return AtomicExpansionKind::Expand;
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return AtomicExpansionKind::Expand;
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
+
+ // flat atomic fadd f32: gfx940, gfx11+.
+ if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
+ if (Subtarget->hasFlatAtomicFaddF32Inst())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+
+ // If it is in flat address space, and the type is float, we will try to
+ // expand it, if the target supports global and lds atomic fadd. The
+ // reason we need that is, in the expansion, we emit the check of
+ // address space. If it is in global address space, we emit the global
+ // atomic fadd; if it is in shared address space, we emit the LDS atomic
+ // fadd.
+ if (Subtarget->hasLDSFPAtomicAddF32()) {
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+ return AtomicExpansionKind::Expand;
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+ return AtomicExpansionKind::Expand;
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
index aa9ebb9226cdd..7ce1c17b7ccfd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -79,7 +79,7 @@ define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(ptr %ptr, float %da
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX11-NEXT: S_ENDPGM 0
- %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -107,10 +107,12 @@ define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(ptr %ptr, float %data
; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr)
; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, float %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret float %ret
}
declare float @llvm.amdgcn.flat.atomic.fadd.f32.p1.f32(ptr, float)
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
index 68d8e3d747b86..869f073f3b1b9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f64.ll
@@ -55,7 +55,7 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
; GFX90A_GFX940-NEXT: S_ENDPGM 0
- %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -78,10 +78,12 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
; GFX90A_GFX940-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]]
; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
- %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic
+ %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !amdgpu.no.fine.grained.memory !0
ret double %ret
}
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
attributes #0 = {"amdgpu-unsafe-fp-atomics"="true" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 632dbd45279fb..710d48be037e0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -34,7 +34,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
- %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+ %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -50,7 +50,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
- %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+ %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -75,7 +75,7 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
+ %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst, !amdgpu.no.remote.memory !0
ret float %ret
}
@@ -235,3 +235,5 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
}
attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 66b22bedaf072..429b045e31c12 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1095,32 +1095,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB39_3
+; GFX90A-NEXT: s_cbranch_execz .LBB39_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
-; GFX90A-NEXT: .LBB39_3:
+; GFX90A-NEXT: .LBB39_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
@@ -1146,7 +1134,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: .LBB39_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1196,7 +1184,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: .LBB40_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1209,32 +1197,20 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB41_3
+; GFX90A-NEXT: s_cbranch_execz .LBB41_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX90A-NEXT: v_mov_b32_e32 v6, 0
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
-; GFX90A-NEXT: .LBB41_3:
+; GFX90A-NEXT: .LBB41_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
@@ -1260,7 +1236,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: .LBB41_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1310,7 +1286,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
+ %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
@@ -1337,26 +1313,13 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v2
-; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat:
@@ -136...
[truncated]
``````````
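To make the denormal handling in the first hunk concrete: with the new atomicIgnoresDenormalModeOrFPModeIsFTZ helper, an f32 fadd on a subtarget without hasMemoryAtomicFaddF32DenormalSupport avoids the CmpXChg fallback in two ways, sketched below (function names are invented; the attribute and metadata spellings match the patch):

```llvm
; Case 1: the function's FP mode already flushes f32 denormals
; (preserve-sign), matching the hardcoded FP mode of the memory atomic.
define void @fadd_ftz_function(ptr addrspace(1) %ptr, float %data) #0 {
  %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
  ret void
}

; Case 2: the individual atomic opts out of denormal-mode concerns with
; !amdgpu.ignore.denormal.mode, regardless of the function's FP mode.
define void @fadd_ignore_denormal(ptr addrspace(1) %ptr, float %data) {
  %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %data syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
  ret void
}

attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
!0 = !{}
```

Absent both conditions, the f32 case falls through to AtomicExpansionKind::CmpXChg as in the hunk above.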
https://github.com/llvm/llvm-project/pull/96760