[llvm] 9dbc968 - [AMDGPU] Fix atomic float max/min intrinsics
Joe Nash via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 18 11:29:15 PDT 2021
Author: Joe Nash
Date: 2021-08-18T14:12:42-04:00
New Revision: 9dbc968ed9fa7b81ba6be27f7f26b79bc813a7bb
URL: https://github.com/llvm/llvm-project/commit/9dbc968ed9fa7b81ba6be27f7f26b79bc813a7bb
DIFF: https://github.com/llvm/llvm-project/commit/9dbc968ed9fa7b81ba6be27f7f26b79bc813a7bb.diff
LOG: [AMDGPU] Fix atomic float max/min intrinsics
Hooked up raw.buffer.atomic.fmin/fmax.f64.
These instructions should be available on GFX6, GFX7, and GFX10.
They were implemented for GFX90a with a different name.
Added intrinsic def for image_atomic_fmin/fmax; the instruction
defs were already there.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D108208
Change-Id: I473f98d28b2afbeeb2c27822d9686b5e86634e2f
Added:
llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
Modified:
llvm/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/lib/Target/AMDGPU/BUFInstructions.td
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 46a7aeb39c9a..c8e55fe0f579 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -684,7 +684,14 @@ class AMDGPUDimAtomicProfile<string opmod,
let IsAtomic = true;
}
-class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim> : AMDGPUDimProfile<"GET_RESINFO", dim> {
+class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
+ list<AMDGPUArg> dataargs>
+ : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
+ let RetTypes = [llvm_anyfloat_ty];
+}
+
+class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
+ : AMDGPUDimProfile<"GET_RESINFO", dim> {
let RetTypes = [llvm_anyfloat_ty];
let DataArgs = [];
let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
@@ -860,17 +867,24 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
- multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs> {
- foreach dim = AMDGPUDims.All in {
- def !strconcat(NAME, "_", dim.Name)
- : AMDGPUImageDimIntrinsic<
- AMDGPUDimAtomicProfile<opmod, dim, dataargs>,
- [], [SDNPMemOperand]>;
- }
+ multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
+ int isFloat = 0> {
+ foreach dim = AMDGPUDims.All in {
+ def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
+ !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
+ AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
+ [], [SDNPMemOperand]>;
+ }
}
- multiclass AMDGPUImageDimAtomic<string opmod> {
- defm "" : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">]>;
+ multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
+ defm ""
+ : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
+ isFloat>;
+ }
+
+ multiclass AMDGPUImageDimFloatAtomic<string opmod> {
+ defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
}
defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
@@ -878,8 +892,10 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
+ defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
+ defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
@@ -1015,8 +1031,10 @@ def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
@@ -1036,10 +1054,6 @@ def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-// gfx90a intrinsics
-def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
-
class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty, bit NoRtn = false> : Intrinsic <
!if(NoRtn, [], [data_ty]),
[!if(NoRtn, data_ty, LLVMMatchType<0>), // vdata(VGPR)
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 5f43aa8388ee..9c8a3464fcc0 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1438,6 +1438,13 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f32, "BUFFER_ATOMIC_FMIN">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f32, "BUFFER_ATOMIC_FMAX">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmin, f64, "BUFFER_ATOMIC_FMIN_X2">;
+ defm : BufferAtomicPatterns<SIbuffer_atomic_fmax, f64, "BUFFER_ATOMIC_FMAX_X2">;
+}
+
class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
(ops node:$src0, node:$src1, node:$src2, node:$src3, node:$src4, node:$src5, node:$src6, node:$src7),
(vt (Op $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7)),
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
new file mode 100644
index 000000000000..e1aa9b009748
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-atomics.ll
@@ -0,0 +1,1245 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030
+
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030
+
+declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
+declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
+
+declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg)
+declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg)
+
+declare float @llvm.amdgcn.image.atomic.fmin.1d.f32.f32(float, i32, <8 x i32>, i32, i32)
+declare float @llvm.amdgcn.image.atomic.fmax.1d.f32.f32(float, i32, <8 x i32>, i32, i32)
+
+
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_noret_f32:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_noret_f32:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_noret_f32:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s0
+; G_SI-NEXT: v_mov_b32_e32 v1, s1
+; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s0
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s1
+; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x1
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen glc
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x1
+; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_noret_f64:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x2
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030-NEXT: v_mov_b32_e32 v2, s6
+; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s2
+; G_SI-NEXT: v_mov_b32_e32 v1, s3
+; G_SI-NEXT: v_mov_b32_e32 v2, s0
+; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT: v_mov_b32_e32 v2, s0
+; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x2
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x2
+; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f32:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: global_store_dword v[0:1], v0, off
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_SI-NEXT: s_mov_b32 s2, -1
+; G_SI-NEXT: s_mov_b32 s3, 0xf000
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX7-NEXT: s_mov_b32 s2, -1
+; G_GFX7-NEXT: s_mov_b32 s3, 0xf000
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: global_store_dword v[0:1], v0, off
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: global_store_dword v[0:1], v0, off
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ store float %ret, float addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f64:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write_b64 v0, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ds_write_b64 v0, v[0:1]
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b64 v0, v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ds_write_b64 v0, v[0:1]
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_SI-NEXT: s_mov_b32 m0, -1
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: ds_write_b64 v0, v[0:1]
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX7-NEXT: s_mov_b32 m0, -1
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ds_write_b64 v0, v[0:1]
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ds_write_b64 v0, v[0:1]
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ds_write_b64 v0, v[0:1]
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ store double %ret, double addrspace(3)* undef
+ ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(3)* %out) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write_b32 v1, v0
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX7-NEXT: v_mov_b32_e32 v1, s0
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ds_write_b32 v1, v0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_nop 0
+; GFX10-NEXT: s_load_dword s0, s[0:1], 0x3c
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b32 v1, v0
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x2
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dword s0, s[0:1], 0x3c
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; GFX1030-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ds_write_b32 v1, v0
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; G_SI-NEXT: s_mov_b32 m0, -1
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s2
+; G_SI-NEXT: v_mov_b32_e32 v1, s3
+; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_SI-NEXT: v_mov_b32_e32 v1, s0
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: ds_write_b32 v1, v0
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0xd
+; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xf
+; G_GFX7-NEXT: s_mov_b32 m0, -1
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s12
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s13
+; G_GFX7-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s2
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ds_write_b32 v1, v0
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x2
+; G_GFX10-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x3c
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s12
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s13
+; G_GFX10-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ds_write_b32 v1, v0
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x2
+; G_GFX1030-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dword s2, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s12
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s13
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s2
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ds_write_b32 v1, v0
+; G_GFX1030-NEXT: s_endpgm
+; GFX1010-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
+main_body:
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+ store float %ret, float addrspace(3)* %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
+; SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write_b64 v3, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ds_write_b64 v3, v[0:1]
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b64 v3, v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ds_write_b64 v3, v[0:1]
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_SI-NEXT: s_mov_b32 m0, -1
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: ds_write_b64 v3, v[0:1]
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX7-NEXT: s_mov_b32 m0, -1
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ds_write_b64 v3, v[0:1]
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ds_write_b64 v3, v[0:1]
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ds_write_b64 v3, v[0:1]
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+ store double %ret, double addrspace(3)* %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_noret_f32:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_noret_f32:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s0
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_noret_f32:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s0
+; G_SI-NEXT: v_mov_b32_e32 v1, s1
+; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s0
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s1
+; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x1
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen glc
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x1
+; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ ret void
+}
+
+; Kernel that issues the f64 raw-buffer fmax atomic and never uses the returned
+; value ("noret"). Every check block below expects a buffer_atomic_fmax_x2 with
+; offen addressing and a zero scalar offset; the G_* blocks additionally expect
+; the glc modifier, while the un-prefixed blocks expect it to be absent.
+; NOTE(review): the i32 argument is named %vindex, but the checks show it being
+; used as the offen address VGPR -- presumably it is really the voffset
+; operand of the raw intrinsic; confirm against the intrinsic definition.
+define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_noret_f64:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GFX7-NEXT: s_load_dword s0, s[0:1], 0xf
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x2
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030-NEXT: v_mov_b32_e32 v2, s6
+; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s2
+; G_SI-NEXT: v_mov_b32_e32 v1, s3
+; G_SI-NEXT: v_mov_b32_e32 v2, s0
+; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_GFX7-NEXT: s_load_dword s0, s[0:1], 0xf
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT: v_mov_b32_e32 v2, s0
+; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x2
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen glc
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x2
+; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ ret void
+}
+
+; Pixel-shader variant of the f32 raw-buffer fmax atomic whose returned value is
+; used ("rtn"): the result is stored through an undef addrspace(1) pointer.
+; All check blocks expect buffer_atomic_fmax with offen and glc, followed by a
+; vmcnt(0) wait before the store of v0.
+define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f32:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: global_store_dword v[0:1], v0, off
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_SI-NEXT: s_mov_b32 s2, -1
+; G_SI-NEXT: s_mov_b32 s3, 0xf000
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX7-NEXT: s_mov_b32 s2, -1
+; G_GFX7-NEXT: s_mov_b32 s3, 0xf000
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: global_store_dword v[0:1], v0, off
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: global_store_dword v[0:1], v0, off
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ store float %ret, float addrspace(1)* undef
+ ret void
+}
+
+; Pixel-shader variant of the f64 raw-buffer fmax atomic whose returned value is
+; used ("rtn"): the result is stored through an undef addrspace(3) pointer.
+; All check blocks expect buffer_atomic_fmax_x2 with offen and glc, a vmcnt(0)
+; wait, and then a ds_write_b64 of the returned register pair.
+define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f64:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write_b64 v0, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ds_write_b64 v0, v[0:1]
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b64 v0, v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ds_write_b64 v0, v[0:1]
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_SI-NEXT: s_mov_b32 m0, -1
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: ds_write_b64 v0, v[0:1]
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX7-NEXT: s_mov_b32 m0, -1
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ds_write_b64 v0, v[0:1]
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ds_write_b64 v0, v[0:1]
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen glc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ds_write_b64 v0, v[0:1]
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
+ store double %ret, double addrspace(3)* undef
+ ret void
+}
+
+; Kernel variant of the returning f32 raw-buffer fmax atomic that passes 4 as
+; the fourth intrinsic argument and 2 as the last one, then stores the result
+; to %out. All check blocks expect the literal 4 right after the resource
+; operand and both the glc and slc modifiers on buffer_atomic_fmax.
+; NOTE(review): presumably the fourth argument is the scalar offset and bit 1
+; of the last argument selects slc -- confirm against the intrinsic definition.
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex, float addrspace(1)* %out) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s8
+; SI-NEXT: v_mov_b32_e32 v1, s9
+; SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT: s_mov_b32 s3, 0xf000
+; GFX7-NEXT: s_mov_b32 s2, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v1, s9
+; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_nop 0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x2
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; GFX1030-NEXT: v_mov_b32_e32 v1, 0
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s2
+; G_SI-NEXT: v_mov_b32_e32 v1, s3
+; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_SI-NEXT: s_mov_b32 s2, -1
+; G_SI-NEXT: s_mov_b32 s3, 0xf000
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX7-NEXT: s_mov_b32 s2, -1
+; G_GFX7-NEXT: s_mov_b32 s3, 0xf000
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x2
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_nop 0
+; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT: v_mov_b32_e32 v1, 0
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x2
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, 0
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: global_store_dword v1, v0, s[0:1]
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+ store float %ret, float addrspace(1)* %out, align 8
+ ret void
+}
+
+; Kernel variant of the returning f64 raw-buffer fmax atomic that passes 4 as
+; the fourth intrinsic argument and 2 as the last one, then stores the result
+; to the addrspace(3) pointer %out. All check blocks expect the literal 4 right
+; after the resource operand and both glc and slc on buffer_atomic_fmax_x2,
+; followed by a ds_write_b64 of the returned register pair.
+; NOTE(review): presumably the fourth argument is the scalar offset and bit 1
+; of the last argument selects slc -- confirm against the intrinsic definition.
+define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inreg %rsrc, double %data, i32 %vindex, double addrspace(3)* %out) {
+; SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; SI-NEXT: s_load_dword s8, s[0:1], 0xf
+; SI-NEXT: s_load_dword s0, s[0:1], 0x10
+; SI-NEXT: s_mov_b32 m0, -1
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: v_mov_b32_e32 v0, s2
+; SI-NEXT: v_mov_b32_e32 v1, s3
+; SI-NEXT: v_mov_b32_e32 v2, s8
+; SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; SI-NEXT: v_mov_b32_e32 v2, s0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: ds_write_b64 v2, v[0:1]
+; SI-NEXT: s_endpgm
+;
+; GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
+; GFX7-NEXT: s_mov_b32 m0, -1
+; GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v0, s2
+; GFX7-NEXT: v_mov_b32_e32 v1, s3
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
+; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX7-NEXT: v_mov_b32_e32 v2, s1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ds_write_b64 v2, v[0:1]
+; GFX7-NEXT: s_endpgm
+;
+; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ds_write_b64 v2, v[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: s_clause 0x2
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; GFX1030-NEXT: v_mov_b32_e32 v2, s6
+; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; GFX1030-NEXT: v_mov_b32_e32 v2, s7
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ds_write_b64 v2, v[0:1]
+; GFX1030-NEXT: s_endpgm
+;
+; G_SI-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_SI-NEXT: s_load_dword s8, s[0:1], 0x10
+; G_SI-NEXT: s_load_dword s0, s[0:1], 0xf
+; G_SI-NEXT: s_mov_b32 m0, -1
+; G_SI-NEXT: s_waitcnt lgkmcnt(0)
+; G_SI-NEXT: v_mov_b32_e32 v0, s2
+; G_SI-NEXT: v_mov_b32_e32 v1, s3
+; G_SI-NEXT: v_mov_b32_e32 v2, s0
+; G_SI-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_SI-NEXT: v_mov_b32_e32 v2, s8
+; G_SI-NEXT: s_waitcnt vmcnt(0)
+; G_SI-NEXT: ds_write_b64 v2, v[0:1]
+; G_SI-NEXT: s_endpgm
+;
+; G_GFX7-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; G_GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
+; G_GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xf
+; G_GFX7-NEXT: s_mov_b32 m0, -1
+; G_GFX7-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX7-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX7-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX7-NEXT: v_mov_b32_e32 v2, s0
+; G_GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX7-NEXT: v_mov_b32_e32 v2, s1
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX7-NEXT: s_endpgm
+;
+; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: s_clause 0x2
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX10-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x3c
+; G_GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s9
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX10-NEXT: s_endpgm
+;
+; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: s_clause 0x2
+; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s7
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ds_write_b64 v2, v[0:1]
+; G_GFX1030-NEXT: s_endpgm
+main_body:
+ %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
+ store double %ret, double addrspace(3)* %out, align 8
+ ret void
+}
+
+; Pixel shader that performs a 1D image fmin atomic on %rsrc at coordinate %s
+; and returns the value produced by the intrinsic. All check blocks expect a
+; single image_atomic_fmin with dmask:0x1, unorm and glc; the GFX10 and GFX1030
+; blocks (with and without the G_ prefix) also expect dim:SQ_RSRC_IMG_1D.
+define amdgpu_ps float @atomic_fmin_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
+; SI-LABEL: atomic_fmin_1d:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: atomic_fmin_1d:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: atomic_fmin_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1030-LABEL: atomic_fmin_1d:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; G_SI-LABEL: atomic_fmin_1d:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; G_SI-NEXT: ; return to shader part epilog
+;
+; G_GFX7-LABEL: atomic_fmin_1d:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ; return to shader part epilog
+;
+; G_GFX10-LABEL: atomic_fmin_1d:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ; return to shader part epilog
+;
+; G_GFX1030-LABEL: atomic_fmin_1d:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: image_atomic_fmin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ; return to shader part epilog
+main_body:
+ %v = call float @llvm.amdgcn.image.atomic.fmin.1d.f32.f32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret float %v
+}
+
+; Pixel shader that performs a 1D image fmax atomic on %rsrc at coordinate %s
+; and returns the value produced by the intrinsic. All check blocks expect a
+; single image_atomic_fmax with dmask:0x1, unorm and glc; the GFX10 and GFX1030
+; blocks (with and without the G_ prefix) also expect dim:SQ_RSRC_IMG_1D.
+define amdgpu_ps float @atomic_fmax_1d(<8 x i32> inreg %rsrc, float %data, i32 %s) {
+; SI-LABEL: atomic_fmax_1d:
+; SI: ; %bb.0: ; %main_body
+; SI-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT: ; return to shader part epilog
+;
+; GFX7-LABEL: atomic_fmax_1d:
+; GFX7: ; %bb.0: ; %main_body
+; GFX7-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: atomic_fmax_1d:
+; GFX10: ; %bb.0: ; %main_body
+; GFX10-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1030-LABEL: atomic_fmax_1d:
+; GFX1030: ; %bb.0: ; %main_body
+; GFX1030-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX1030-NEXT: s_waitcnt vmcnt(0)
+; GFX1030-NEXT: ; return to shader part epilog
+;
+; G_SI-LABEL: atomic_fmax_1d:
+; G_SI: ; %bb.0: ; %main_body
+; G_SI-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; G_SI-NEXT: ; return to shader part epilog
+;
+; G_GFX7-LABEL: atomic_fmax_1d:
+; G_GFX7: ; %bb.0: ; %main_body
+; G_GFX7-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 unorm glc
+; G_GFX7-NEXT: s_waitcnt vmcnt(0)
+; G_GFX7-NEXT: ; return to shader part epilog
+;
+; G_GFX10-LABEL: atomic_fmax_1d:
+; G_GFX10: ; %bb.0: ; %main_body
+; G_GFX10-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX10-NEXT: s_waitcnt vmcnt(0)
+; G_GFX10-NEXT: ; return to shader part epilog
+;
+; G_GFX1030-LABEL: atomic_fmax_1d:
+; G_GFX1030: ; %bb.0: ; %main_body
+; G_GFX1030-NEXT: image_atomic_fmax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
+; G_GFX1030-NEXT: s_waitcnt vmcnt(0)
+; G_GFX1030-NEXT: ; return to shader part epilog
+main_body:
+ %v = call float @llvm.amdgcn.image.atomic.fmax.1d.f32.f32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret float %v
+}
+
More information about the llvm-commits
mailing list