[llvm] [AMDGPU] Support f64 atomics on gfx1250 (PR #151172)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 08:30:56 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Changpeng Fang (changpeng)
<details>
<summary>Changes</summary>
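- BUFFER/FLAT/GLOBAL_ATOMIC_ADD/MIN/MAX_F64 (on gfx1250 the MIN/MAX forms use the assembly mnemonics `*_atomic_min_num_f64` / `*_atomic_max_num_f64`)
- DS_ADD_F64 and DS_ADD_RTN_F64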
---
Patch is 170.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151172.diff
14 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+1-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/BUFInstructions.td (+11-1)
- (modified) llvm/lib/Target/AMDGPU/DSInstructions.td (+3)
- (modified) llvm/lib/Target/AMDGPU/FLATInstructions.td (+8)
- (modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+790)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+799)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_ds.s (+78)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf.s (+300)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s (+211)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_ds.txt (+33)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vbuffer_mubuf.txt (+90)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt (+69)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a17fb934ff66d..25e1eabb2c293 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2429,7 +2429,7 @@ def HasAtomicFMinFMaxF64FlatInsts :
def HasLdsAtomicAddF64 :
Predicate<"Subtarget->hasLdsAtomicAddF64()">,
- AssemblerPredicate<(any_of FeatureGFX90AInsts)>;
+ AssemblerPredicate<(any_of FeatureGFX90AInsts, FeatureGFX1250Insts)>;
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f9dd900..f16351fac9e2e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1682,7 +1682,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasFlatAtomicFaddF32Inst())
Atomic.legalFor({{S32, FlatPtr}});
- if (ST.hasGFX90AInsts()) {
+ if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
// These are legal with some caveats, and should have undergone expansion in
// the IR in most situations
// TODO: Move atomic expansion into legalizer
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f99e71637f70f..1956a15c57d67 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2489,7 +2489,7 @@ multiclass VBUFFER_MTBUF_Real_gfx12<bits<4> op, string real_name> {
}
//===----------------------------------------------------------------------===//
-// MUBUF - GFX11, GFX12.
+// MUBUF - GFX11, GFX12, GFX1250.
//===----------------------------------------------------------------------===//
// gfx11 instruction that accept both old and new assembler name.
@@ -2600,6 +2600,12 @@ multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op,
def : Mnem_gfx12<gfx11_name, gfx12_name>;
}
+multiclass MUBUF_Real_Atomic_gfx12_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_Atomic_gfx12_impl<op, 0, real_name>,
+ MUBUF_Real_Atomic_gfx12_impl<op, 1, real_name> {
+ def : Mnem_gfx12<get_BUF_ps<NAME>.Mnemonic, real_name>;
+}
+
defm BUFFER_GL0_INV : MUBUF_Real_gfx11<0x02B>;
defm BUFFER_GL1_INV : MUBUF_Real_gfx11<0x02C>;
@@ -2678,6 +2684,10 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12<0x04B, "buffer
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
+defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_gfx12<0x055>;
+defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05b, "buffer_atomic_min_num_f64">;
+defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_gfx12_Renamed<0x05c, "buffer_atomic_max_num_f64">;
+
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 319cc9d1da181..3ff675d6e5e97 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1397,6 +1397,9 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
+defm DS_ADD_F64 : DS_Real_gfx12<0x054>;
+defm DS_ADD_RTN_F64 : DS_Real_gfx12<0x074>;
+
let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 8ede9caead8bc..d5d1074622135 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -3488,6 +3488,14 @@ defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "globa
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
+defm FLAT_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm FLAT_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "flat_atomic_min_num_f64">;
+defm FLAT_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "flat_atomic_max_num_f64">;
+
+defm GLOBAL_ATOMIC_ADD_F64 : VFLAT_Real_Atomics_gfx1250<0x055>;
+defm GLOBAL_ATOMIC_MIN_F64 : VFLAT_Real_Atomics_gfx1250<0x05b, "global_atomic_min_num_f64">;
+defm GLOBAL_ATOMIC_MAX_F64 : VFLAT_Real_Atomics_gfx1250<0x05c, "global_atomic_max_num_f64">;
+
def True16D16Table : GenericTable {
let FilterClass = "True16D16Table";
let CppTypeName = "True16D16Info";
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 10ded0e1d1c3a..5357a375ae5a9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -715,7 +715,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVINTERPEncoding() const { return GFX11Insts && !hasGFX1250Insts(); }
// DS_ADD_F64/DS_ADD_RTN_F64
- bool hasLdsAtomicAddF64() const { return hasGFX90AInsts(); }
+ bool hasLdsAtomicAddF64() const {
+ return hasGFX90AInsts() || hasGFX1250Insts();
+ }
bool hasMultiDwordFlatScratchAddressing() const {
return getGeneration() >= GFX9;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 0e132f130c844..2785b78da99e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefix=GFX90A
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefix=GFX942
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX1250
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
@@ -37,6 +38,17 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -56,6 +68,13 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -92,6 +111,24 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -120,6 +157,17 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -139,6 +187,13 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -175,6 +230,24 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -203,6 +276,17 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -222,6 +306,13 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -258,6 +349,23 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -286,6 +394,17 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
ret void
@@ -305,6 +424,13 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
store double %ret, ptr poison
@@ -341,6 +467,23 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -369,6 +512,17 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
; GFX942-NEXT: v_mov_b32_e32 v2, s8
; GFX942-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
ret void
@@ -388,6 +542,13 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: flat_store_b64 v[0:1], v[0:1] scope:SCOPE_SE
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
store double %ret, ptr poison
@@ -424,6 +585,24 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: global_store_dwordx2 v0, v[2:3], s[8:9]
; GFX942-NEXT: s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1250: ; %bb.0: ; %main_body
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x34
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_mov_b32 s6, 4
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
+; GFX1250-NEXT: v_mov_b32_e32 v2, s10
+; GFX1250-NEXT: buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], s6 offen th:TH_ATOMIC_NT_RETURN
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x44
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
store double %ret, ptr addrspace(1) %out, align 8
@@ -452,6 +631,17 @@ define amdgpu_kernel void @raw_p...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/151172
More information about the llvm-commits mailing list