[llvm] [RFC][AMDGPU] Remove old llvm.amdgcn.buffer.* and tbuffer intrinsics (PR #93801)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu May 30 03:55:21 PDT 2024
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/93801
>From 9761328f64685c0906351410f6f5655bbc33de39 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 30 May 2024 11:30:36 +0100
Subject: [PATCH 1/2] [RFC][AMDGPU] Remove old llvm.amdgcn.buffer.* and tbuffer
intrinsics
They have been superseded by llvm.amdgcn.raw.buffer.* and
llvm.amdgcn.struct.buffer.*.
---
llvm/docs/TableGen/BackGuide.rst | 2 +-
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 110 +--
.../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 9 -
.../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 5 -
.../Target/AMDGPU/AMDGPUSearchableTables.td | 12 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 246 +------
llvm/lib/Target/AMDGPU/SIISelLowering.h | 6 +-
.../AMDGPU/llvm.amdgcn.buffer.atomic.ll | 100 ---
.../AMDGPU/buffer-intrinsics-mmo-offsets.ll | 276 +++-----
llvm/test/CodeGen/AMDGPU/buffer-schedule.ll | 44 --
.../AMDGPU/fail-select-buffer-atomic-fadd.ll | 19 -
.../CodeGen/AMDGPU/force-store-sc0-sc1.ll | 141 ----
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 188 ++---
.../CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll | 82 ---
.../AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll | 93 ---
.../CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll | 111 ---
.../AMDGPU/llvm.amdgcn.buffer.atomic.ll | 209 ------
.../AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll | 22 -
.../llvm.amdgcn.buffer.load.format.d16.ll | 55 --
.../AMDGPU/llvm.amdgcn.buffer.load.format.ll | 133 ----
.../CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll | 476 -------------
.../llvm.amdgcn.buffer.store.dwordx3.ll | 20 -
.../llvm.amdgcn.buffer.store.format.d16.ll | 63 --
.../AMDGPU/llvm.amdgcn.buffer.store.format.ll | 104 ---
.../AMDGPU/llvm.amdgcn.buffer.store.ll | 268 -------
.../AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll | 16 -
.../AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll | 14 -
.../AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll | 19 -
.../llvm.amdgcn.struct.tbuffer.store.ll | 4 +-
.../AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll | 55 --
.../llvm.amdgcn.tbuffer.load.dwordx3.ll | 13 -
.../AMDGPU/llvm.amdgcn.tbuffer.load.ll | 109 ---
.../AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll | 76 --
.../llvm.amdgcn.tbuffer.store.dwordx3.ll | 10 -
.../AMDGPU/llvm.amdgcn.tbuffer.store.ll | 110 ---
...mdgcn-demanded-vector-elts-inseltpoison.ll | 661 +-----------------
.../AMDGPU/amdgcn-demanded-vector-elts.ll | 661 +-----------------
.../amdgcn-simplify-image-buffer-stores.ll | 29 -
llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll | 23 -
39 files changed, 155 insertions(+), 4439 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
diff --git a/llvm/docs/TableGen/BackGuide.rst b/llvm/docs/TableGen/BackGuide.rst
index e1413c1c73a79..60677a6dcd627 100644
--- a/llvm/docs/TableGen/BackGuide.rst
+++ b/llvm/docs/TableGen/BackGuide.rst
@@ -761,7 +761,7 @@ over time. The output looks like this.
-------------------- Global Variables (5) --------------------
- AMDGPUBufferIntrinsics = [int_amdgcn_buffer_load_format, ...
+ AMDGPUBufferIntrinsics = [int_amdgcn_s_buffer_load, ...
AMDGPUImageDimAtomicIntrinsics = [int_amdgcn_image_atomic_swap_1d, ...
...
-------------------- Classes (758) --------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d4a8954a4cdac..be4a1b8b92767 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1072,18 +1072,6 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {
-class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
- [data_ty],
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<0>;
-def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
-def int_amdgcn_buffer_load : AMDGPUBufferLoad;
-
// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
@@ -1100,25 +1088,12 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
[IntrNoMem, ImmArg<ArgIndex<2>>]>,
AMDGPURsrcIntrinsic<0>;
-class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
- [],
- [data_ty, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1>;
-def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
-def int_amdgcn_buffer_store : AMDGPUBufferStore;
-
-// New buffer intrinsics with separate raw and struct variants. The raw
+// Buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
-// These new instrinsics also keep the offset and soffset arguments separate as
+// These instrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.
// The versions of these intrinsics that take <4 x i32> arguments are deprecated
@@ -1478,41 +1453,7 @@ def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloa
def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
-// Obsolescent tbuffer intrinsics.
-def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
- [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
- [llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // voffset(VGPR)
- llvm_i32_ty, // soffset(SGPR)
- llvm_i32_ty, // offset(imm)
- llvm_i32_ty, // dfmt(imm)
- llvm_i32_ty, // nfmt(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrReadMem,
- ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
- ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<0>;
-
-def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
- [],
- [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // voffset(VGPR)
- llvm_i32_ty, // soffset(SGPR)
- llvm_i32_ty, // offset(imm)
- llvm_i32_ty, // dfmt(imm)
- llvm_i32_ty, // nfmt(imm)
- llvm_i1_ty, // glc(imm)
- llvm_i1_ty], // slc(imm)
- [IntrWriteMem, ImmArg<ArgIndex<5>>,
- ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
- ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1>;
-
-// New tbuffer intrinsics, with:
+// tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
@@ -1659,51 +1600,6 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
AMDGPURsrcIntrinsic<1>;
-class AMDGPUBufferAtomic : Intrinsic <
- [llvm_anyint_ty],
- [LLVMMatchType<0>, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty], // slc(imm)
- [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1, 0>;
-def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
-def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
- [llvm_i32_ty],
- [llvm_i32_ty, // src(VGPR)
- llvm_i32_ty, // cmp(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty], // slc(imm)
- [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<2, 0>;
-
-def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;
-
-class AMDGPUBufferAtomicFP : Intrinsic <
- [llvm_anyfloat_ty],
- [LLVMMatchType<0>, // vdata(VGPR)
- llvm_v4i32_ty, // rsrc(SGPR)
- llvm_i32_ty, // vindex(VGPR)
- llvm_i32_ty, // offset(SGPR/VGPR/imm)
- llvm_i1_ty], // slc(imm)
- [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1, 0>;
-
-// Legacy form of the intrinsic. raw and struct forms should be preferred.
-def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;
-
class AMDGPURawBufferLoadLDS : Intrinsic <
[],
[llvm_v4i32_ty, // rsrc(SGPR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 1d645002b1fe6..38cc5a9bef969 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -249,63 +249,54 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
default:
return;
- case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
Op = AtomicRMWInst::Add;
break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
Op = AtomicRMWInst::Sub;
break;
- case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
Op = AtomicRMWInst::And;
break;
- case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
Op = AtomicRMWInst::Or;
break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
Op = AtomicRMWInst::Xor;
break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 160a17584ca3a..93bca4402ed23 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1158,12 +1158,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
break;
}
- case Intrinsic::amdgcn_buffer_store_format:
case Intrinsic::amdgcn_raw_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
case Intrinsic::amdgcn_raw_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
- case Intrinsic::amdgcn_tbuffer_store:
case Intrinsic::amdgcn_image_store_1d:
case Intrinsic::amdgcn_image_store_1darray:
case Intrinsic::amdgcn_image_store_2d:
@@ -1376,8 +1374,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
std::function<void(Instruction *, unsigned, APInt, APInt &)>
SimplifyAndSetOp) const {
switch (II.getIntrinsicID()) {
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
@@ -1391,7 +1387,6 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
- case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
default: {
if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 410dc83d45c57..e84d39a2895c8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -256,17 +256,6 @@ def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
def : SourceOfDivergence<int_amdgcn_ds_fadd_v2bf16>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_swap>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_add>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_sub>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umin>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_smax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_umax>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_and>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_or>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_xor>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_sub>;
@@ -339,7 +328,6 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f9948e92862f7..d9b7dc2eb59d2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1280,19 +1280,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
-
- const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
- if (!Vol || !Vol->isZero())
- Info.flags |= MachineMemOperand::MOVolatile;
-
- return true;
- }
case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -8732,43 +8719,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_buffer_load:
- case Intrinsic::amdgcn_buffer_load_format: {
- unsigned Glc = Op.getConstantOperandVal(5);
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
-
- unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
- auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
- M->getMemOperand());
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
- }
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
@@ -8818,35 +8768,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
- case Intrinsic::amdgcn_tbuffer_load: {
- MemSDNode *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
- unsigned Dfmt = Op.getConstantOperandVal(7);
- unsigned Nfmt = Op.getConstantOperandVal(8);
- unsigned Glc = Op.getConstantOperandVal(9);
- unsigned Slc = Op.getConstantOperandVal(10);
- unsigned IdxEn = getIdxEn(Op.getOperand(3));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- SOffset, // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
- };
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
- return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
- Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
- DAG);
- }
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -8901,82 +8822,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
- case Intrinsic::amdgcn_buffer_atomic_swap:
- case Intrinsic::amdgcn_buffer_atomic_add:
- case Intrinsic::amdgcn_buffer_atomic_sub:
- case Intrinsic::amdgcn_buffer_atomic_csub:
- case Intrinsic::amdgcn_buffer_atomic_smin:
- case Intrinsic::amdgcn_buffer_atomic_umin:
- case Intrinsic::amdgcn_buffer_atomic_smax:
- case Intrinsic::amdgcn_buffer_atomic_umax:
- case Intrinsic::amdgcn_buffer_atomic_and:
- case Intrinsic::amdgcn_buffer_atomic_or:
- case Intrinsic::amdgcn_buffer_atomic_xor:
- case Intrinsic::amdgcn_buffer_atomic_fadd: {
- unsigned Slc = Op.getConstantOperandVal(6);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- EVT VT = Op.getValueType();
-
- auto *M = cast<MemSDNode>(Op);
- unsigned Opcode = 0;
-
- switch (IntrID) {
- case Intrinsic::amdgcn_buffer_atomic_swap:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
- break;
- case Intrinsic::amdgcn_buffer_atomic_add:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
- break;
- case Intrinsic::amdgcn_buffer_atomic_sub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_csub:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umin:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
- break;
- case Intrinsic::amdgcn_buffer_atomic_smax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_umax:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
- break;
- case Intrinsic::amdgcn_buffer_atomic_and:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
- break;
- case Intrinsic::amdgcn_buffer_atomic_or:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_xor:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
- break;
- case Intrinsic::amdgcn_buffer_atomic_fadd:
- Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
- break;
- default:
- llvm_unreachable("unhandled atomic opcode");
- }
-
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
@@ -9085,29 +8930,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
- case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(5));
- SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // src
- Op.getOperand(3), // cmp
- Op.getOperand(4), // rsrc
- Op.getOperand(5), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
-
- EVT VT = Op.getValueType();
- auto *M = cast<MemSDNode>(Op);
-
- return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
- Op->getVTList(), Ops, VT, M->getMemOperand());
- }
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -9550,34 +9372,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
};
- case Intrinsic::amdgcn_tbuffer_store: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Dfmt = Op.getConstantOperandVal(8);
- unsigned Nfmt = Op.getConstantOperandVal(9);
- unsigned Glc = Op.getConstantOperandVal(10);
- unsigned Slc = Op.getConstantOperandVal(11);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData, // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Op.getOperand(5), // voffset
- Op.getOperand(6), // soffset
- Op.getOperand(7), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
- AMDGPUISD::TBUFFER_STORE_FORMAT;
- MemSDNode *M = cast<MemSDNode>(Op);
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
case Intrinsic::amdgcn_struct_tbuffer_store:
case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
@@ -9635,42 +9429,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_buffer_store:
- case Intrinsic::amdgcn_buffer_store_format: {
- SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
- if (IsD16)
- VData = handleD16VData(VData, DAG);
- unsigned Glc = Op.getConstantOperandVal(6);
- unsigned Slc = Op.getConstantOperandVal(7);
- unsigned IdxEn = getIdxEn(Op.getOperand(4));
- SDValue Ops[] = {
- Chain,
- VData,
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- SDValue(), // voffset -- will be set by setBufferOffsets
- SDValue(), // soffset -- will be set by setBufferOffsets
- SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
- };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
-
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
- Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
- MemSDNode *M = cast<MemSDNode>(Op);
-
- // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
-
- return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
- M->getMemoryVT(), M->getMemOperand());
- }
-
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
@@ -10076,8 +9834,8 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
return {N0, SDValue(C1, 0)};
}
-// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
-// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
+// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 292b17da93583..61bffe9aa80e5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -255,9 +255,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool shouldExpandVectorDynExt(SDNode *N) const;
private:
- // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
- // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
- // pointed to by Offsets.
+ // Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
+ // the three offsets (voffset, soffset and instoffset) into the SDValue[3]
+ // array pointed to by Offsets.
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, Align Alignment = Align(4)) const;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
index 3db2183845ff2..6d0ca701e47a9 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -1,93 +1,5 @@
; RUN: opt -mtriple amdgcn-mesa-mesa3d -passes='print<uniformity>' -disable-output %s 2>&1 | FileCheck %s
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(
-define float @buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(
-define float @buffer_atomic_add(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(
-define float @buffer_atomic_sub(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(
-define float @buffer_atomic_smin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(
-define float @buffer_atomic_umin(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(
-define float @buffer_atomic_smax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(
-define float @buffer_atomic_umax(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(
-define float @buffer_atomic_and(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(
-define float @buffer_atomic_or(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(
-define float @buffer_atomic_xor(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(
-define float @buffer_atomic_cmpswap(<4 x i32> inreg %rsrc, i32 inreg %data, i32 inreg %cmp) #0 {
-main_body:
- %orig = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
;CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(
define float @raw_buffer_atomic_swap(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
main_body:
@@ -440,18 +352,6 @@ main_body:
ret float %r
}
-declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
-
declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
index f4cd19a2ffa80..87461ec77446d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll
@@ -21,75 +21,37 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub1
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, killed [[COPY4]], %subreg.sub3
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
- ; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 64
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 128
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_2]], 64, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_3]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE2]], [[COPY7]], 128, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 72
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 144
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_4]], 72, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE2]], [[COPY9]], 144, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 80
@@ -107,6 +69,7 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[REG_SEQUENCE2]], [[COPY11]], 160, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
+ ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 88
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 88, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
@@ -123,94 +86,95 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]]
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE3]], [[REG_SEQUENCE2]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 96
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 192
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY14]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN2]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_10]], 96, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_11]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET8]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET7]], [[REG_SEQUENCE2]], [[COPY15]], 192, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 104
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 208
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN2]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET5]], [[REG_SEQUENCE2]], [[S_MOV_B32_12]], 104, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET6]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_13]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY17:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET8]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET7]], [[REG_SEQUENCE2]], [[COPY17]], 208, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 112
; GCN-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 224
; GCN-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY22:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY21]], [[S_LOAD_DWORDX4_IMM]], [[COPY22]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY23]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY24]], [[REG_SEQUENCE2]], [[S_MOV_B32_14]], 112, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY25]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_15]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY27:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN12:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN13:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY26]], [[REG_SEQUENCE2]], [[COPY27]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 224, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 120
; GCN-NEXT: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 240
; GCN-NEXT: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY32:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY31]], [[S_LOAD_DWORDX4_IMM]], [[COPY32]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY33]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY34]], [[REG_SEQUENCE2]], [[S_MOV_B32_16]], 120, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY35]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_17]], 0, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY37:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN12:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN13:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY36]], [[REG_SEQUENCE2]], [[COPY37]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 240, 0, 0, implicit $exec :: (dereferenceable load (s128) from %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY38]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store (s32), align 1, addrspace 8)
@@ -268,113 +232,63 @@ define amdgpu_cs void @mmo_offsets0(ptr addrspace(6) inreg noalias align(16) der
; GCN-NEXT: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE3]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store (s32) on %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY58:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[COPY58]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY59:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY59]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 288
; GCN-NEXT: [[COPY60:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY60]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY61:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY62:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY61]], [[S_LOAD_DWORDX4_IMM]], [[COPY62]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY63:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[COPY63]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY64:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY64]], [[REG_SEQUENCE2]], [[S_MOV_B32_5]], 144, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY65:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN8]], [[COPY65]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_21]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY66:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY67:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN12]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN13]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN9]], [[COPY66]], [[REG_SEQUENCE2]], [[COPY67]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 288, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */
; GCN-NEXT: [[COPY68:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[COPY68]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 152
; GCN-NEXT: [[COPY69:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY69]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[S_MOV_B32_23:%[0-9]+]]:sreg_32 = S_MOV_B32 304
; GCN-NEXT: [[COPY70:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY70]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE4]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY71:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY72:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY71]], [[S_LOAD_DWORDX4_IMM]], [[COPY72]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 8)
; GCN-NEXT: [[COPY73:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[COPY73]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY74:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY74]], [[REG_SEQUENCE2]], [[S_MOV_B32_22]], 152, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY75:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN8]], [[COPY75]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN1]], [[REG_SEQUENCE4]], [[REG_SEQUENCE2]], [[S_MOV_B32_23]], 0, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: [[COPY76:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
; GCN-NEXT: [[COPY77:%[0-9]+]]:sreg_32 = COPY [[COPY]]
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN12]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
- ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN13]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN9]], [[COPY76]], [[REG_SEQUENCE2]], [[COPY77]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN10]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
+ ; GCN-NEXT: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN11]], [[COPY]], [[REG_SEQUENCE2]], [[S_MOV_B32_]], 304, 0, 0, implicit $exec :: (dereferenceable store (s128) into %ir.tmp1, align 1, addrspace 8)
; GCN-NEXT: S_ENDPGM 0
bb.0:
%tmp0 = load <4 x i32>, ptr addrspace(6) %arg0, align 16, !invariant.load !0
%tmp1 = load ptr addrspace(8), ptr addrspace(6) %arg0, align 16, !invariant.load !0
- %buffer0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 16, i1 false, i1 false) #0
- %buffer1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0
- %buffer2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 1, i32 16, i1 false, i1 false) #0
- %buffer3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 16, i1 false, i1 false) #0
-
- ; Insert inline asm to keep the different instruction types from being mixed. This makes the output easier to read.
- call void asm sideeffect "", "" ()
-
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer0, <4 x i32> %tmp0, i32 0, i32 32, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer2, <4 x i32> %tmp0, i32 1, i32 32, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %buffer3, <4 x i32> %tmp0, i32 %arg1, i32 32, i1 false, i1 false) #1
-
- call void asm sideeffect "", "" ()
-
- %buffer_format0 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 48, i1 false, i1 false) #0
- %buffer_format1 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #0
- %buffer_format2 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 1, i32 48, i1 false, i1 false) #0
- %buffer_format3 = call nsz <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp0, i32 %arg1, i32 48, i1 false, i1 false) #0
-
- call void asm sideeffect "", "" ()
-
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format0, <4 x i32> %tmp0, i32 0, i32 64, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format2, <4 x i32> %tmp0, i32 1, i32 64, i1 false, i1 false) #1
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %buffer_format3, <4 x i32> %tmp0, i32 %arg1, i32 64, i1 false, i1 false) #1
-
- call void asm sideeffect "", "" ()
-
- %atomic_add0 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 80, i1 false) #2
- %atomic_add1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
- %atomic_add2 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 1, i32 80, i1 false) #2
- %atomic_add3 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 80, i1 false) #2
-
- call void asm sideeffect "", "" ()
-
- %atomic_cmpswap0 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 96, i1 false) #2
- %atomic_cmpswap1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
- %atomic_cmpswap2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 1, i32 96, i1 false) #2
- %atomic_cmpswap3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %arg1, i32 %arg1, <4 x i32> %tmp0, i32 %arg1, i32 96, i1 false) #2
-
- call void asm sideeffect "", "" ()
-
- %fadd1 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 112, i1 false) #2
- %fadd2 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 0, i32 %arg1, i1 false) #2
- %fadd3 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 1, i32 112, i1 false) #2
- %fadd4 = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %tmp0, i32 %arg1, i32 112, i1 false) #2
-
- call void asm sideeffect "", "" ()
- ; rsrc, offset, soffset, cachepolicy
%raw_buffer0 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 128, i32 0, i32 0) #0
%raw_buffer1 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 64, i32 64, i32 0) #0
%raw_buffer2 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 128, i32 0) #0
@@ -383,7 +297,6 @@ bb.0:
call void asm sideeffect "", "" ()
- ; rsrc, offset, soffset, cachepolicy
%raw_ptr_buffer0 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 128, i32 0, i32 0) #3
%raw_ptr_buffer1 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 64, i32 64, i32 0) #3
%raw_ptr_buffer2 = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %tmp1, i32 0, i32 128, i32 0) #3
@@ -472,7 +385,6 @@ bb.0:
call void asm sideeffect "", "" ()
- ; rsrc, vindex, offset, soffset, cachepolicy
%struct_buffer0 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 224, i32 0, i32 0) #0
%struct_buffer1 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 112, i32 112, i32 0) #0
%struct_buffer2 = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %tmp0, i32 0, i32 0, i32 224, i32 0) #0
@@ -592,13 +504,6 @@ bb.0:
ret void
}
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
-declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #2
-declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #2
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1) #2
declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
declare <4 x float> @llvm.amdgcn.raw.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32) #0
declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32) #2
@@ -629,7 +534,6 @@ declare void @llvm.amdgcn.struct.ptr.buffer.store.format.v4f32(<4 x float>, ptr
attributes #0 = { nounwind readonly }
-attributes #1 = { nounwind writeonly }
attributes #2 = { nounwind }
attributes #3 = { nounwind memory(argmem: read) }
attributes #4 = { nounwind memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
index bdc73e5b99720..727863936096a 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-schedule.ll
@@ -4,46 +4,6 @@
; The buffer_loads and buffer_stores all access the same location. Check they do
; not get reordered by the scheduler.
-; GCN-LABEL: {{^}}_amdgpu_cs_main:
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-; GCN: buffer_load_dword
-; GCN: buffer_store_dword
-
-; Function Attrs: nounwind
-define amdgpu_cs void @_amdgpu_cs_main(<3 x i32> inreg %arg3, <3 x i32> %arg5) {
-.entry:
- %tmp9 = add <3 x i32> %arg3, %arg5
- %tmp10 = extractelement <3 x i32> %tmp9, i32 0
- %tmp11 = shl i32 %tmp10, 2
- %tmp12 = inttoptr i64 undef to ptr addrspace(4)
- %tmp13 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp14 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp13, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp17 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- call void @llvm.amdgcn.buffer.store.f32(float %tmp14, <4 x i32> %tmp17, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp20 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp21 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp22 = fadd reassoc nnan arcp contract float %tmp21, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp22, <4 x i32> %tmp20, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp25 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp26 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp27 = fadd reassoc nnan arcp contract float %tmp26, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp27, <4 x i32> %tmp25, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp30 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp31 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp32 = fadd reassoc nnan arcp contract float %tmp31, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp32, <4 x i32> %tmp30, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp35 = load <4 x i32>, ptr addrspace(4) %tmp12, align 16
- %tmp36 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0
- %tmp37 = fadd reassoc nnan arcp contract float %tmp36, 1.000000e+00
- call void @llvm.amdgcn.buffer.store.f32(float %tmp37, <4 x i32> %tmp35, i32 0, i32 %tmp11, i1 false, i1 false) #0
- ret void
-}
-
; GCN-LABEL: {{^}}test1:
; GCN: buffer_store_dword
; GCN: buffer_load_dword
@@ -84,10 +44,6 @@ define amdgpu_cs void @test1_ptrs_reorderable(ptr addrspace(8) inreg %buf, i32 %
}
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
-
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #3
-
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32) #2
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32) #3
diff --git a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll
deleted file mode 100644
index 8ff78aaccf5a3..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/fail-select-buffer-atomic-fadd.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=tahiti -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=hawaii -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=fiji -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx900 -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1010 -o /dev/null %s 2>&1 | FileCheck -check-prefix=FAIL %s
-
-; Make sure selection of these intrinsics fails on targets that do not
-; have the instruction available.
-; FIXME: Should also really make sure the v2f16 version fails.
-
-; FAIL: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
-define amdgpu_cs void @atomic_fadd(<4 x i32> inreg %arg0) {
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %arg0, i32 0, i32 112, i1 false)
- ret void
-}
-
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1 immarg) #0
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll b/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
deleted file mode 100644
index b35de03203004..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/force-store-sc0-sc1.ll
+++ /dev/null
@@ -1,141 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx941 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,FORCESC0SC1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs -mattr=-forcestoresc1 < %s | FileCheck --check-prefixes=GCN,NOSC0SC1 %s
-
-define amdgpu_kernel void @store_global(ptr addrspace(1) %ptr) {
-; FORCESC0SC1-LABEL: store_global:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_global:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1]
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(1) %ptr, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_flat(ptr addrspace(0) %ptr) {
-; FORCESC0SC1-LABEL: store_flat:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v2, 1.0
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; FORCESC0SC1-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_flat:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v2, 1.0
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; NOSC0SC1-NEXT: flat_store_dword v[0:1], v2
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(0) %ptr, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_lds(ptr addrspace(3) %ptr) {
-; GCN-LABEL: store_lds:
-; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s0, s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, 1.0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: ds_write_b32 v1, v0
-; GCN-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(3) %ptr, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_scratch(ptr addrspace(5) %ptr) {
-; FORCESC0SC1-LABEL: store_scratch:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 1.0
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: scratch_store_dword off, v0, s0 sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_scratch:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dword s0, s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 1.0
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: scratch_store_dword off, v0, s0
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store float 1.000000e+00, ptr addrspace(5) %ptr, align 4
- ret void
-}
-
-define amdgpu_ps void @store_buffer(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-; FORCESC0SC1-LABEL: store_buffer:
-; FORCESC0SC1: ; %bb.0: ; %main_body
-; FORCESC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_buffer:
-; NOSC0SC1: ; %bb.0: ; %main_body
-; NOSC0SC1-NEXT: buffer_store_dword v0, v1, s[0:3], 0 idxen
-; NOSC0SC1-NEXT: s_endpgm
-main_body:
- call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-define amdgpu_kernel void @store_global_atomic(ptr addrspace(1) %ptr) {
-; FORCESC0SC1-LABEL: store_global_atomic:
-; FORCESC0SC1: ; %bb.0: ; %entry
-; FORCESC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; FORCESC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; FORCESC0SC1-NEXT: buffer_wbl2 sc1
-; FORCESC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; FORCESC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
-; FORCESC0SC1-NEXT: s_endpgm
-;
-; NOSC0SC1-LABEL: store_global_atomic:
-; NOSC0SC1: ; %bb.0: ; %entry
-; NOSC0SC1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; NOSC0SC1-NEXT: v_mov_b32_e32 v0, 0
-; NOSC0SC1-NEXT: v_mov_b32_e32 v1, 1.0
-; NOSC0SC1-NEXT: buffer_wbl2 sc1
-; NOSC0SC1-NEXT: s_waitcnt lgkmcnt(0)
-; NOSC0SC1-NEXT: global_store_dword v0, v1, s[0:1] sc1
-; NOSC0SC1-NEXT: s_endpgm
-entry:
- store atomic float 1.000000e+00, ptr addrspace(1) %ptr syncscope("agent-one-as") seq_cst, align 4
- ret void
-}
-
-define amdgpu_kernel void @store_global_atomic_system(ptr addrspace(1) %ptr) {
-; GCN-LABEL: store_global_atomic_system:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: v_mov_b32_e32 v1, 1.0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
-; GCN-NEXT: s_endpgm
- store atomic float 1.000000e+00, ptr addrspace(1) %ptr monotonic, align 4
- ret void
-}
-
-
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 121fab51024fd..0835b43e96a67 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -2,7 +2,6 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
-declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1)
declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg)
@@ -24,89 +23,6 @@ declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
-define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
-; GFX90A-LABEL: buffer_atomic_add_noret_f64:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
-; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: buffer_atomic_add_noret_f64:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, s8
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
-; GFX90A-LABEL: buffer_atomic_add_rtn_f64:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: buffer_atomic_add_rtn_f64:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- store double %ret, ptr undef
- ret void
-}
-
-define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
-; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
-; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
-; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: v_mov_b32_e32 v2, s10
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
-; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
-; GFX90A-NEXT: s_endpgm
-;
-; GFX940-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
-; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
-; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v2, s10
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
-; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
-; GFX940-NEXT: s_endpgm
-main_body:
- %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- store double %ret, ptr addrspace(1) %out, align 8
- ret void
-}
-
define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
; GFX90A: ; %bb.0: ; %main_body
@@ -1186,7 +1102,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB42_3
+; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
@@ -1198,7 +1114,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB42_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
@@ -1210,8 +1126,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB42_2
-; GFX90A-NEXT: .LBB42_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
+; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
@@ -1221,7 +1137,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB42_2
+; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1233,7 +1149,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: .LBB42_2:
+; GFX940-NEXT: .LBB39_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -1248,7 +1164,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB43_2
+; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1259,7 +1175,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB43_2:
+; GFX90A-NEXT: .LBB40_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent:
@@ -1269,7 +1185,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB43_2
+; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1281,7 +1197,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB43_2:
+; GFX940-NEXT: .LBB40_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1296,7 +1212,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB44_3
+; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
@@ -1308,7 +1224,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB44_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
@@ -1320,8 +1236,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_2
-; GFX90A-NEXT: .LBB44_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
+; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
@@ -1331,7 +1247,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB44_2
+; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1343,7 +1259,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: .LBB44_2:
+; GFX940-NEXT: .LBB41_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1358,7 +1274,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB45_2
+; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1369,7 +1285,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
-; GFX90A-NEXT: .LBB45_2:
+; GFX90A-NEXT: .LBB42_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush:
@@ -1379,7 +1295,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB45_2
+; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1391,7 +1307,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB45_2:
+; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1423,7 +1339,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1436,7 +1352,7 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1488,7 +1404,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1501,7 +1417,7 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1568,7 +1484,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB52_3
+; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
@@ -1580,7 +1496,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB52_2: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
@@ -1590,8 +1506,8 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB52_2
-; GFX90A-NEXT: .LBB52_3:
+; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
+; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
@@ -1601,7 +1517,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB52_2
+; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -1613,7 +1529,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: .LBB52_2:
+; GFX940-NEXT: .LBB49_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1629,7 +1545,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1642,7 +1558,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -1700,7 +1616,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1714,7 +1630,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -1740,7 +1656,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1753,7 +1669,7 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1805,7 +1721,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
@@ -1819,7 +1735,7 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v2
@@ -1896,7 +1812,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -1907,7 +1823,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
;
@@ -2123,7 +2039,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB70_2
+; GFX90A-NEXT: s_cbranch_execz .LBB67_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2133,7 +2049,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB70_2:
+; GFX90A-NEXT: .LBB67_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
@@ -2143,7 +2059,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB70_2
+; GFX940-NEXT: s_cbranch_execz .LBB67_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2153,7 +2069,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB70_2:
+; GFX940-NEXT: .LBB67_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2168,7 +2084,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB71_2
+; GFX90A-NEXT: s_cbranch_execz .LBB68_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2178,7 +2094,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB71_2:
+; GFX90A-NEXT: .LBB68_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
@@ -2188,7 +2104,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB71_2
+; GFX940-NEXT: s_cbranch_execz .LBB68_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2198,7 +2114,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB71_2:
+; GFX940-NEXT: .LBB68_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2213,7 +2129,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB72_2
+; GFX90A-NEXT: s_cbranch_execz .LBB69_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2223,7 +2139,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB72_2:
+; GFX90A-NEXT: .LBB69_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
@@ -2233,7 +2149,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB72_2
+; GFX940-NEXT: s_cbranch_execz .LBB69_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
@@ -2243,7 +2159,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: .LBB72_2:
+; GFX940-NEXT: .LBB69_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
deleted file mode 100644
index 2b0584d39a3be..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
+++ /dev/null
@@ -1,82 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS
-
-declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1)
-declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
-define amdgpu_ps void @buffer_atomic_csub_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_no_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen
-define amdgpu_ps void @buffer_atomic_csub_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT_RETURN
-define amdgpu_ps void @buffer_atomic_csub_off4_slc_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_no_rtn:
-; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc
-; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT
-define amdgpu_ps void @buffer_atomic_csub_off4_slc_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 {
-main_body:
- %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_csub_rtn:
-; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN
-define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
-main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_csub_no_rtn:
-; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1]
-define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
-main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn:
-; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN
-define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
-main_body:
- %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn:
-; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4
-define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
-main_body:
- %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
- ret void
-}
-
-attributes #0 = { "target-features"="+atomic-csub-no-rtn-insts" }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
deleted file mode 100644
index 1e1bc2bbbd26a..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.gfx90a.ll
+++ /dev/null
@@ -1,93 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
-; RUN: not --crash llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs 2>&1 | FileCheck %s -check-prefix=GFX908
-
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
-declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-
-; GFX908: LLVM ERROR: Cannot select: {{.+}}: f32,ch = BUFFER_ATOMIC_FADD
-
-; GFX90A-LABEL: {{^}}buffer_atomic_add_f32:
-; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen glc
-define amdgpu_ps float @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}buffer_atomic_add_f32_off4_slc:
-; GFX90A: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 glc slc
-define amdgpu_ps float @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16:
-; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen glc
-define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc:
-; GFX90A: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 glc slc
-define amdgpu_ps <2 x half> @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_add_f32:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc
-define amdgpu_ps float @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_add_f32_off4:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:4 glc
-define amdgpu_ps float @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_add_f32_offneg4:
-; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off offset:-4 glc
-define amdgpu_ps float @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret float %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:4 glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret <2 x half> %ret
-}
-
-; GFX90A-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
-; GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-4 glc
-define amdgpu_ps <2 x half> @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret <2 x half> %ret
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
deleted file mode 100644
index bd07dd137ac49..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
+++ /dev/null
@@ -1,111 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
-
-declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
-declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
-declare float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1), float)
-declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1), <2 x half>)
-declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr, float)
-
-; GCN-LABEL: {{^}}buffer_atomic_add_f32:
-; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_atomic_add_f32(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_add_f32_off4_slc:
-; GCN: buffer_atomic_add_f32 v0, v1, s[0:3], 0 idxen offset:4 slc
-define amdgpu_ps void @buffer_atomic_add_f32_off4_slc(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %ret = call float @llvm.amdgcn.buffer.atomic.fadd.f32(float %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16:
-; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_atomic_pk_add_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_atomic_pk_add_v2f16_off4_slc:
-; GCN: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 idxen offset:4 slc
-define amdgpu_ps void @buffer_atomic_pk_add_v2f16_off4_slc(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_add_f32:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
-define amdgpu_kernel void @global_atomic_add_f32(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_add_f32_off4:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-define amdgpu_kernel void @global_atomic_add_f32_off4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_add_f32_offneg4:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4
-define amdgpu_kernel void @global_atomic_add_f32_offneg4(ptr addrspace(1) %ptr, float %data) {
-main_body:
- %p = getelementptr float, ptr addrspace(1) %ptr, i64 -1
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %p, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16:
-; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_off4:
-; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4
-define amdgpu_kernel void @global_atomic_pk_add_v2f16_off4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}global_atomic_pk_add_v2f16_offneg4:
-; GCN: global_atomic_pk_add_f16 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:-4{{$}}
-define amdgpu_kernel void @global_atomic_pk_add_v2f16_offneg4(ptr addrspace(1) %ptr, <2 x half> %data) {
-main_body:
- %p = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -1
- %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %p, <2 x half> %data)
- ret void
-}
-
-; Make sure this artificially selects with an incorrect subtarget, but
-; the feature set.
-; GCN-LABEL: {{^}}global_atomic_fadd_f32_wrong_subtarget:
-; GCN: global_atomic_add_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}}
-define amdgpu_kernel void @global_atomic_fadd_f32_wrong_subtarget(ptr addrspace(1) %ptr, float %data) #0 {
- %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %ptr, float %data)
- ret void
-}
-
-; GCN-LABEL: {{^}}flat_atomic_fadd_f32_wrong_subtarget:
-; GCN: flat_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
-define amdgpu_kernel void @flat_atomic_fadd_f32_wrong_subtarget(ptr %ptr, float %data) #1 {
- %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
- ret void
-}
-
-attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts"}
-attributes #1 = { "target-cpu"="gfx803" "target-features"="+flat-atomic-fadd-f32-inst"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
deleted file mode 100644
index eed648f167f39..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
+++ /dev/null
@@ -1,209 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
-
-;CHECK-LABEL: {{^}}test1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
-;SICI: v_mov_b32_e32 v1, 0x2000
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
-;CHECK-DAG: s_waitcnt vmcnt(0)
-;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
-;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:4 glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
-define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
-main_body:
- %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %o3 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
- %o4 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
- %ofs.5 = add i32 %voffset, 42
- %o5 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
- %o6 = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
- %unused = call i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %out = bitcast i32 %o6 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test11:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0 glc
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v[1:2], s[0:3], 0 idxen offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], v2, s[0:3], 0 offen offset:42 glc
-;CHECK-DAG: s_waitcnt vmcnt(0)
-;SICI: buffer_atomic_swap_x2 v[3:4], v0, s[0:3], 0 offen glc
-;VI: buffer_atomic_swap_x2 v[3:4], off, s[0:3], [[SOFS]] offset:4 glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_swap_x2 v[3:4], off, s[0:3], 0{{$}}
-define amdgpu_ps float @test11(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
-main_body:
- %o0 = sext i32 %data to i64
- %o1 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o0, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o2 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %o3 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o2, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
- %o4 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o3, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
- %ofs.5 = add i32 %voffset, 42
- %o5 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o4, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
- %o6 = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o5, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
- %unused = call i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64 %o6, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o7 = trunc i64 %o6 to i32
- %out = bitcast i32 %o7 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_add v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_sub v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smin v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umin v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smax v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umax v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_and v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc
-define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %t1 = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t3 = call i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t4 = call i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t5 = call i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t6 = call i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t7 = call i32 @llvm.amdgcn.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t8 = call i32 @llvm.amdgcn.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t9 = call i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %out = bitcast i32 %t9 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_add_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_sub_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smin_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umin_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_smax_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_umax_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_and_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_or_x2 v[0:1], v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_xor_x2 v[0:1], v2, s[0:3], 0 idxen glc
-define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
-main_body:
- %t0 = sext i32 %data to i64
- %t1 = call i64 @llvm.amdgcn.buffer.atomic.add.i64(i64 %t0, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t2 = call i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t3 = call i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64 %t2, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t4 = call i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64 %t3, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t5 = call i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64 %t4, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t6 = call i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64 %t5, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t7 = call i64 @llvm.amdgcn.buffer.atomic.and.i64(i64 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t8 = call i64 @llvm.amdgcn.buffer.atomic.or.i64(i64 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t9 = call i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %t10 = trunc i64 %t9 to i32
- %out = bitcast i32 %t10 to float
- ret float %out
-}
-
-; Ideally, we would teach tablegen & friends that cmpswap only modifies the
-; first vgpr. Since we don't do that yet, the register allocator will have to
-; create copies which we don't bother to track here.
-;
-;CHECK-LABEL: {{^}}test4:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
-;CHECK: s_waitcnt vmcnt(0)
-;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v[2:3], s[0:3], 0 idxen offen glc
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:44 glc
-;CHECK-DAG: s_waitcnt vmcnt(0)
-;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
-;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:4 glc
-define amdgpu_ps float @test4(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
-main_body:
- %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
- %o3 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o2, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %voffset, i1 0)
- %o4 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o3, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i1 0)
- %ofs.5 = add i32 %voffset, 44
- %o5 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o4, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 %ofs.5, i1 0)
- %o6 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o5, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 8192, i1 0)
-
-; Detecting the no-return variant doesn't work right now because of how the
-; intrinsic is replaced by an instruction that feeds into an EXTRACT_SUBREG.
-; Since there probably isn't a reasonable use-case of cmpswap that discards
-; the return value, that seems okay.
-;
-; %unused = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o6, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
- %out = bitcast i32 %o6 to float
- ret float %out
-}
-
-;CHECK-LABEL: {{^}}test7:
-;CHECK: buffer_atomic_add v0,
-define amdgpu_ps float @test7() {
-main_body:
- %v = call i32 @llvm.amdgcn.buffer.atomic.add.i32(i32 1, <4 x i32> undef, i32 0, i32 4, i1 false)
- %v.float = bitcast i32 %v to float
- ret float %v.float
-}
-
-declare i32 @llvm.amdgcn.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i1) #0
-declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.swap.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.add.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.sub.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.smin.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.umin.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.smax.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.umax.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.and.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.or.i64(i64, <4 x i32>, i32, i32, i1) #0
-declare i64 @llvm.amdgcn.buffer.atomic.xor.i64(i64, <4 x i32>, i32, i32, i1) #0
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
index fdbe6db8da14e..659842afad485 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.dwordx3.ll
@@ -1,26 +1,6 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,SI
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,GCNX3
-;CHECK-LABEL: {{^}}buffer_load_format_immoffs_x3:
-;SI: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
-;GCNX3: buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:42
-;CHECK: s_waitcnt
-define amdgpu_ps <3 x float> @buffer_load_format_immoffs_x3(<4 x i32> inreg) {
-main_body:
- %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret <3 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_x3:
-;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
-;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40
-;CHECK: s_waitcnt
-define amdgpu_ps <3 x float> @buffer_load_immoffs_x3(<4 x i32> inreg) {
-main_body:
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
- ret <3 x float> %data
-}
-
;CHECK-LABEL: {{^}}buffer_raw_load_immoffs_x3:
;SI: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
;GCNX3: buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:40
@@ -81,8 +61,6 @@ main_body:
ret <3 x float> %data
}
-declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32>, i32, i32, i32) #0
declare <3 x float> @llvm.amdgcn.struct.buffer.load.format.v3f32(<4 x i32>, i32, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
deleted file mode 100644
index 3d67dfdd674d4..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.d16.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_x:
-; GCN: buffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0
-define amdgpu_ps half @buffer_load_format_d16_x(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call half @llvm.amdgcn.buffer.load.format.f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- ret half %data
-}
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_xy:
-; UNPACKED: buffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: buffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
-define amdgpu_ps half @buffer_load_format_d16_xy(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %elt = extractelement <2 x half> %data, i32 1
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_xyz:
-; UNPACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: buffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-define amdgpu_ps half @buffer_load_format_d16_xyz(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %elt = extractelement <3 x half> %data, i32 2
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}buffer_load_format_d16_xyzw:
-; UNPACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: buffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
-define amdgpu_ps half @buffer_load_format_d16_xyzw(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %elt = extractelement <4 x half> %data, i32 3
- ret half %elt
-}
-
-declare half @llvm.amdgcn.buffer.load.format.f16(<4 x i32>, i32, i32, i1, i1)
-declare <2 x half> @llvm.amdgcn.buffer.load.format.v2f16(<4 x i32>, i32, i32, i1, i1)
-declare <3 x half> @llvm.amdgcn.buffer.load.format.v3f16(<4 x i32>, i32, i32, i1, i1)
-declare <4 x half> @llvm.amdgcn.buffer.load.format.v4f16(<4 x i32>, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
deleted file mode 100644
index 6851302fdcda3..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
+++ /dev/null
@@ -1,133 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
-
-;CHECK-LABEL: {{^}}buffer_load:
-;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
-;CHECK: buffer_load_format_xyzw v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_load_format_xyzw v[8:11], off, s[0:3], 0 slc
-;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
- ret {<4 x float>, <4 x float>, <4 x float>} %r2
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:42
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x1038
-;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
-;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 60 offset:4092
-;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7ffc
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4092
-;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
-;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8ffc
-;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
-main_body:
- %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4152, i1 0, i1 0)
- %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36856, i1 0, i1 0)
- %d.2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36864, i1 0, i1 0)
- %d.3 = fadd <4 x float> %d.0, %d.1
- %data = fadd <4 x float> %d.2, %d.3
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
-;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xffc
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:68
-;VI-NOT: s_mov
-;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:84
-;VI: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
-main_body:
- %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
- %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0)
- %data = fadd <4 x float> %d.0, %d.1
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_idx:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:60
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
-main_body:
- %ofs = add i32 %1, 60
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both:
-;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both_reversed:
-;CHECK: v_mov_b32_e32 v2, v0
-;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x:
-;CHECK: buffer_load_format_x v0, off, s[0:3], 0
-;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- ret float %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_xy:
-;CHECK: buffer_load_format_xy v[0:1], off, s[0:3], 0
-;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- ret <2 x float> %data
-}
-
-declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-
-attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
deleted file mode 100644
index a209dcfe3a7a0..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ /dev/null
@@ -1,476 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI
-
-;CHECK-LABEL: {{^}}buffer_load:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 slc
-;CHECK: s_waitcnt
-define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
- ret {<4 x float>, <4 x float>, <4 x float>} %r2
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs:
-;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:40
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 40, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
-;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
-;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1ffc
-;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_idx:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
-;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:60
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
-main_body:
- %ofs = add i32 %1, 60
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both:
-;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_both_reversed:
-;CHECK: v_mov_b32_e32 v2, v0
-;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
-main_body:
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1:
-;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
-main_body:
- %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
- ret float %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x2:
-;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
-;CHECK: s_waitcnt
-define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
-main_body:
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
- ret <2 x float> %data
-}
-
-;CHECK-LABEL: {{^}}buffer_load_negative_offset:
-;CHECK: v_add_{{[iu]}}32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
-;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
-define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
-main_body:
- %ofs.1 = add i32 %ofs, -16
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
- ret <4 x float> %data
-}
-
-; SI won't merge ds memory operations, because of the signed offset bug, so
-; we only have check lines for VI.
-; CHECK-LABEL: buffer_load_mmo:
-; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps float @buffer_load_mmo(<4 x i32> inreg %rsrc, ptr addrspace(3) %lds) {
-entry:
- store float 0.0, ptr addrspace(3) %lds
- %val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0)
- %tmp2 = getelementptr float, ptr addrspace(3) %lds, i32 4
- store float 0.0, ptr addrspace(3) %tmp2
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
- %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
- %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
- %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1_offen_merged_glc_slc:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
- %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
- %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
- %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x2_offen_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- %r3 = extractelement <2 x float> %vr2, i32 0
- %r4 = extractelement <2 x float> %vr2, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x3_offen_merged:
-;CHECK-NEXT: %bb.
-;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a) {
-main_body:
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x1_offset_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
-main_body:
- %r1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- %r2 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- %r4 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- %r5 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0)
- %r6 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x2_offset_merged:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x2_offset_merged(<4 x i32> inreg %rsrc) {
-main_body:
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- %vr2 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- %r3 = extractelement <2 x float> %vr2, i32 0
- %r4 = extractelement <2 x float> %vr2, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_x3_offset_merged:
-;CHECK-NEXT: %bb.
-;VI-NEXT: buffer_load_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK: s_waitcnt
-define amdgpu_ps void @buffer_load_x3_offset_merged(<4 x i32> inreg %rsrc) {
-main_body:
- %vr1 = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- %r3 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- %r1 = extractelement <2 x float> %vr1, i32 0
- %r2 = extractelement <2 x float> %vr1, i32 1
- call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float undef, i1 true, i1 true)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ubyte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %val = uitofp i8 %tmp to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ushort:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:16
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_u32_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- %tmp2 = zext i16 %tmp to i32
- %val = uitofp i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = sext i8 %tmp to i32
- %val = sitofp i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sshort:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:16
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_cvt_f32_i32_e32 v0, v0
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- %tmp2 = sext i16 %tmp to i32
- %val = sitofp i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ubyte_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = zext i8 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ushort_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = zext i16 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = sext i8 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sshort_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort_bitcast(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = sext i16 %tmp to i32
- %val = bitcast i32 %tmp2 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ubyte_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ubyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = zext i8 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_ushort_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ushort v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_u32_u24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_ushort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = zext i16 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sbyte v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = sext i8 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sshort_mul_bitcast:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_sshort v{{[0-9]}}, v0, s[0:3], 0 idxen offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_mul_i32_i24_e32 v{{[0-9]}}, 0xff, v{{[0-9]}}
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sshort_mul_bitcast(<4 x i32> inreg %rsrc, i32 %idx) {
-main_body:
- %tmp = call i16 @llvm.amdgcn.buffer.load.i16(<4 x i32> %rsrc, i32 %idx, i32 8, i1 0, i1 0)
- %tmp2 = sext i16 %tmp to i32
- %tmp3 = mul i32 %tmp2, 255
- %val = bitcast i32 %tmp3 to float
- ret float %val
-}
-
-;CHECK-LABEL: {{^}}buffer_load_sbyte_type_check:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: buffer_load_ubyte v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-NEXT: s_waitcnt vmcnt(0)
-;CHECK-NEXT: v_bfe_i32 v{{[0-9]}}, v{{[0-9]}}, 0, 5
-;CHECK-NEXT: ; return to shader part epilog
-define amdgpu_ps float @buffer_load_sbyte_type_check(<4 x i32> inreg %rsrc) {
-main_body:
- %tmp = call i8 @llvm.amdgcn.buffer.load.i8(<4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- %tmp2 = zext i8 %tmp to i32
- %tmp3 = shl i32 %tmp2, 27
- %tmp4 = ashr i32 %tmp3, 27
- %val = bitcast i32 %tmp4 to float
- ret float %val
-}
-
-; Make sure a frame index folding doesn't crash on a MUBUF not used
-; for stack access.
-
-; CHECK-LABEL: {{^}}no_fold_fi_imm_soffset:
-; CHECK: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
-; CHECK-NEXT: buffer_load_dword v0, [[FI]], s{{\[[0-9]+:[0-9]+\]}}, 0 idxen
-define amdgpu_ps float @no_fold_fi_imm_soffset(<4 x i32> inreg %rsrc) {
- %alloca = alloca i32, addrspace(5)
- %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32
-
- %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 0, i1 false, i1 false)
- ret float %ret.val
-}
-
-; CHECK-LABEL: {{^}}no_fold_fi_reg_soffset:
-; CHECK-DAG: v_mov_b32_e32 v[[FI:[0-9]+]], 0{{$}}
-; CHECK-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s
-; CHECK: buffer_load_dword v0, v[[[FI]]:[[HI]]
-define amdgpu_ps float @no_fold_fi_reg_soffset(<4 x i32> inreg %rsrc, i32 inreg %soffset) {
- %alloca = alloca i32, addrspace(5)
- %alloca.cast = ptrtoint ptr addrspace(5) %alloca to i32
-
- %ret.val = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %alloca.cast, i32 %soffset, i1 false, i1 false)
- ret float %ret.val
-}
-
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
-declare i8 @llvm.amdgcn.buffer.load.i8(<4 x i32>, i32, i32, i1, i1) #0
-declare i16 @llvm.amdgcn.buffer.load.i16(<4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
-
-attributes #0 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
index 269956f948f50..7723b565d962b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.dwordx3.ll
@@ -1,23 +1,5 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;CHECK-LABEL: {{^}}buffer_store_format_immoffs_x3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_format_immoffs_x3(<4 x i32> inreg, <3 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_immoffs_x3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v3f32(<3 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
;CHECK-LABEL: {{^}}raw_buffer_store_format_immoffs_x3:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:42
@@ -72,8 +54,6 @@ main_body:
ret void
}
-declare void @llvm.amdgcn.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float>, <4 x i32>, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.buffer.store.format.v3f32(<3 x float>, <4 x i32>, i32, i32, i32, i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
deleted file mode 100644
index a8cabdc9b0304..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,UNPACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED %s
-
-; GCN-LABEL: {{^}}buffer_store_format_d16_x:
-; GCN: s_load_dword s[[LO:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
-; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_format_d16_xy:
-
-; UNPACKED: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}}
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; UNPACKED: buffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-
-; PACKED: buffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-define amdgpu_kernel void @buffer_store_format_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
-
-; UNPACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-
-; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-
-; PACKED: buffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
-define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.buffer.store.format.f16(half, <4 x i32>, i32, i32, i1, i1)
-declare void @llvm.amdgcn.buffer.store.format.v2f16(<2 x half>, <4 x i32>, i32, i32, i1, i1)
-declare void @llvm.amdgcn.buffer.store.format.v3f16(<3 x half>, <4 x i32>, i32, i32, i1, i1)
-declare void @llvm.amdgcn.buffer.store.format.v4f16(<4 x half>, <4 x i32>, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
deleted file mode 100644
index 41e2b4d0e5512..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ /dev/null
@@ -1,104 +0,0 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_format_xyzw v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_store_format_xyzw v[8:11], off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_idx:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both_reversed:
-;CHECK: v_mov_b32_e32 v6, v4
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
- ret void
-}
-
-; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
deleted file mode 100644
index 8b18848a62792..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ /dev/null
@@ -1,268 +0,0 @@
-;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
-define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
-define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_idx:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_both_reversed:
-;CHECK: v_mov_b32_e32 v6, v4
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
-define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
- ret void
-}
-
-; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
-main_body:
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
-main_body:
- call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
-define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
-main_body:
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x1_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
-define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- %a3 = add i32 %a, 12
- %a4 = add i32 %a, 16
- %a5 = add i32 %a, 28
- %a6 = add i32 %a, 32
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 %a4, i1 1, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 %a5, i1 1, i1 1)
- call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 %a6, i1 1, i1 1)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x2_offen_merged(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
-define amdgpu_ps void @buffer_store_x3_offen_merged(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3) {
- %a1 = add i32 %a, 28
- %a2 = add i32 %a, 32
- %a3 = add i32 %a, 36
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 %a3, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x3_offen_merged2(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, float %v2) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 12
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offen_merged3:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-define amdgpu_ps void @buffer_store_x3_offen_merged3(<4 x i32> inreg %rsrc, i32 %a, float %v1, <2 x float> %v2) {
- %a1 = add i32 %a, 4
- %a2 = add i32 %a, 8
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 %a1, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 %a2, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
-define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 0, i32 28, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 0, i32 32, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1, <2 x float> %v2) {
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x3_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3) {
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 0, i32 12, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged2:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-define amdgpu_ps void @buffer_store_x3_offset_merged2(<4 x i32> inreg %rsrc, float %v1, <2 x float> %v2) {
- call void @llvm.amdgcn.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 0, i32 4, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_x3_offset_merged3:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx3 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:8
-define amdgpu_ps void @buffer_store_x3_offset_merged3(<4 x i32> inreg %rsrc, <2 x float> %v1, float %v2) {
- call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- call void @llvm.amdgcn.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_byte:
-;CHECK-NOT: s_waitcnt
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 offset:8
-define amdgpu_ps void @buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
-main_body:
- %v2 = fptoui float %v1 to i32
- %v3 = trunc i32 %v2 to i8
- call void @llvm.amdgcn.buffer.store.i8(i8 %v3, <4 x i32> %rsrc, i32 0, i32 8, i1 0, i1 0)
- ret void
-}
-
-;CHECK-LABEL: {{^}}buffer_store_short:
-;CHECK-NOT: s_waitcnt
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 offset:16
-define amdgpu_ps void @buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
-main_body:
- %v2 = fptoui float %v1 to i32
- %v3 = trunc i32 %v2 to i16
- call void @llvm.amdgcn.buffer.store.i16(i16 %v3, <4 x i32> %rsrc, i32 0, i32 16, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.i8(i8, <4 x i32>, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.buffer.store.i16(i16, <4 x i32>, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
deleted file mode 100644
index bd1888b6965a1..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare void @llvm.amdgcn.buffer.wbinvl1() #0
-
-; GCN-LABEL: {{^}}test_buffer_wbinvl1:
-; GCN-NEXT: ; %bb.0:
-; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
-; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
-; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
- call void @llvm.amdgcn.buffer.wbinvl1()
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
deleted file mode 100644
index b937c42e14ed5..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -show-mc-encoding < %s | FileCheck -check-prefix=SI %s
-
-declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0
-
-; SI-LABEL: {{^}}test_buffer_wbinvl1_sc:
-; SI-NEXT: ; %bb.0:
-; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
-; SI-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
- call void @llvm.amdgcn.buffer.wbinvl1.sc()
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
deleted file mode 100644
index 64ab8ecefd490..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0
-
-; GCN-LABEL: {{^}}test_buffer_wbinvl1_vol:
-; GCN-NEXT: ; %bb.0:
-; CI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
-; VI: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
-; GCN: _store_byte
-; GCN-NEXT: s_endpgm
-define amdgpu_kernel void @test_buffer_wbinvl1_vol(ptr addrspace(1) %ptr) #0 {
- call void @llvm.amdgcn.buffer.wbinvl1.vol()
-; This used to crash in hazard recognizer
- store i8 0, ptr addrspace(1) %ptr, align 1
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
index 98ed437581e1a..47b7658f50cc5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.ll
@@ -313,7 +313,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i3
main_body:
%in1 = bitcast <4 x float> %vdata to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 63, i32 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
+ %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i32 0, i32 0)
%data.i = bitcast <4 x float> %data to <4 x i32>
call void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 46, i32 0)
ret void
@@ -620,7 +620,7 @@ declare void @llvm.amdgcn.struct.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32
declare void @llvm.amdgcn.struct.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32) #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
deleted file mode 100644
index 06d66ce9db37d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.d16.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=UNPACKED %s
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-; RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=PACKED %s
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_x:
-; GCN: tbuffer_load_format_d16_x v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-define amdgpu_ps half @tbuffer_load_d16_x(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call half @llvm.amdgcn.tbuffer.load.f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- ret half %data
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_xy:
-; UNPACKED: tbuffer_load_format_d16_xy v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: tbuffer_load_format_d16_xy v[[FULL:[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[FULL]]
-define amdgpu_ps half @tbuffer_load_d16_xy(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %elt = extractelement <2 x half> %data, i32 1
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_xyz:
-; UNPACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: tbuffer_load_format_d16_xyz v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; PACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-define amdgpu_ps half @tbuffer_load_d16_xyz(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %elt = extractelement <3 x half> %data, i32 2
- ret half %elt
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_d16_xyzw:
-; UNPACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; UNPACKED: v_mov_b32_e32 v{{[0-9]+}}, v[[HI]]
-
-; PACKED: tbuffer_load_format_d16_xyzw v[{{[0-9]+}}:[[HI:[0-9]+]]], off, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; PACKED: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v[[HI]]
-define amdgpu_ps half @tbuffer_load_d16_xyzw(<4 x i32> inreg %rsrc) {
-main_body:
- %data = call <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %elt = extractelement <4 x half> %data, i32 3
- ret half %elt
-}
-
-declare half @llvm.amdgcn.tbuffer.load.f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <2 x half> @llvm.amdgcn.tbuffer.load.v2f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <3 x half> @llvm.amdgcn.tbuffer.load.v3f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <4 x half> @llvm.amdgcn.tbuffer.load.v4f16(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
index de929c2893baf..c89c5c5599334 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.dwordx3.ll
@@ -23,18 +23,5 @@ main_body:
ret <3 x float> %vdata.f
}
-
-; GCN-LABEL: {{^}}tbuffer_load_format_immoffs_x3:
-; SI: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
-; GCNX3: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
-define amdgpu_vs <3 x float> @tbuffer_load_format_immoffs_x3(<4 x i32> inreg) {
-main_body:
- %vdata = call <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <3 x i32> %vdata to <3 x float>
- ret <3 x float> %vdata.f
-}
-
declare <3 x i32> @llvm.amdgcn.raw.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32)
declare <3 x i32> @llvm.amdgcn.struct.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32)
-declare <3 x i32> @llvm.amdgcn.tbuffer.load.v3i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
deleted file mode 100644
index eb0dc37849b0f..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}tbuffer_load:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] glc
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM] slc
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_SNORM]
-; GCN: s_waitcnt
-define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0)
- %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1)
- %vdata_f32 = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
- %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
- %r3 = insertvalue {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r2, <4 x float> %vdata_f32, 3
- ret {<4 x float>, <4 x float>, <4 x float>, <4 x float>} %r3
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_immoffs:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offset:42
-define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_immoffs_large
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 61 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] offset:4095
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_SSCALED] offset:73
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] offset:1
-; GCN: s_waitcnt
-define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg, i32 inreg %soffs) {
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 61, i32 4095, i32 15, i32 2, i1 0, i1 0)
- %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 73, i32 14, i32 3, i1 0, i1 0)
- %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 %soffs, i32 1, i32 13, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float>
- %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float>
- %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0
- %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1
- %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2
- ret {<4 x float>, <4 x float>, <4 x float>} %r2
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_idx:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen
-define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_ofs:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen
-define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %voffs) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_ofs_imm:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] offen offset:52
-define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %voffs) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %voffs, i32 0, i32 52, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}tbuffer_load_both:
-; GCN: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] idxen offen
-define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %voffs) {
-main_body:
- %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %voffs, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <4 x i32> %vdata to <4 x float>
- ret <4 x float> %vdata.f
-}
-
-
-; GCN-LABEL: {{^}}buffer_load_xy:
-; GCN: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
-define amdgpu_vs <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
- %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
- %vdata.f = bitcast <2 x i32> %vdata to <2 x float>
- ret <2 x float> %vdata.f
-}
-
-; GCN-LABEL: {{^}}buffer_load_x:
-; GCN: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT]
-define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) {
- %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0)
- %vdata.f = bitcast i32 %vdata to float
- ret float %vdata.f
-}
-
-declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
deleted file mode 100644
index 16d7de698aa03..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,UNPACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx810 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,PACKED %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,PACKED %s
-
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_x:
-; GCN: s_load_dword s[[S_LO:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
-; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_xy:
-; GCN: s_load_dwordx2 s[[[S_DATA:[0-9]+]]:{{[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-; UNPACKED-DAG: s_lshr_b32 [[SHR:s[0-9]+]], s[[S_DATA]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED:s[0-9]+]], s[[S_DATA]], 0xffff{{$}}
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[MASKED]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[SHR]]
-; UNPACKED: tbuffer_store_format_d16_xy v[[[V_LO]]:[[V_HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-
-; PACKED: tbuffer_store_format_d16_xy v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data, i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
-; UNPACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-
-; PACKED-DAG: s_and_b32 [[SHR0:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR0]]
-; PACKED: tbuffer_store_format_d16_xyz v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <3 x half> %data, i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
-; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
-
-; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
-; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
-; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
-
-; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
-; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
-; UNPACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-
-; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
-; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
-; PACKED: tbuffer_store_format_d16_xyzw v[[[LO]]:[[HI]]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
-main_body:
- call void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.tbuffer.store.f16(half, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v3f16(<3 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
-declare void @llvm.amdgcn.tbuffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
index 83b5b0c65c47e..d5cbaddff020b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.dwordx3.ll
@@ -20,16 +20,6 @@ main_body:
ret void
}
-; GCN-LABEL: {{^}}tbuffer_store_immoffs_x3:
-; GCN: tbuffer_store_format_xyz v[0:2], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
-define amdgpu_ps void @tbuffer_store_immoffs_x3(<4 x i32> inreg, <3 x float>) {
-main_body:
- %in1 = bitcast <3 x float> %1 to <3 x i32>
- call void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
- ret void
-}
-
declare void @llvm.amdgcn.raw.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.struct.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32) #0
-declare void @llvm.amdgcn.tbuffer.store.v3i32(<3 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
deleted file mode 100644
index b3a135a35a652..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GCN,VERDE %s
-;RUN: llc -global-isel=0 < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}tbuffer_store:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16_16_16,BUF_NUM_FORMAT_USCALED]
-; GCN: tbuffer_store_format_xyzw v[4:7], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_SSCALED] glc
-; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT] slc
-; GCN: tbuffer_store_format_xyzw v[8:11], off, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_UINT]
-define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
-main_body:
- %in1 = bitcast <4 x float> %1 to <4 x i32>
- %in2 = bitcast <4 x float> %2 to <4 x i32>
- %in3 = bitcast <4 x float> %3 to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0)
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0)
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1)
- call void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_immoffs:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], 0 format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
-define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
-main_body:
- %in1 = bitcast <4 x float> %1 to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}tbuffer_store_scalar_and_imm_offs:
-; GCN: tbuffer_store_format_xyzw v[0:3], off, s[0:3], {{s[0-9]+}} format:[BUF_DATA_FORMAT_16_16,BUF_NUM_FORMAT_FLOAT] offset:42
-define amdgpu_ps void @tbuffer_store_scalar_and_imm_offs(<4 x i32> inreg, <4 x float> %vdata, i32 inreg %soffset) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 %soffset, i32 42, i32 5, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_idx:
-; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 0, i32 0, i32 0, i32 15, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_ofs:
-; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_8_8,BUF_NUM_FORMAT_FLOAT] offen
-define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float> %vdata, i32 %voffset) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %voffset, i32 0, i32 0, i32 3, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_both:
-; GCN: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 format:[BUF_DATA_FORMAT_10_11_11,BUF_NUM_FORMAT_UINT] idxen offen
-define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex, i32 %voffset) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex, i32 %voffset, i32 0, i32 0, i32 6, i32 4, i1 0, i1 0)
- ret void
-}
-
-; Ideally, the register allocator would avoid the wait here
-;
-; GCN-LABEL: {{^}}buffer_store_wait:
-; GCN: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_RESERVED_15,BUF_NUM_FORMAT_SSCALED] idxen
-; VERDE: s_waitcnt expcnt(0)
-; GCN: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen
-; GCN: s_waitcnt vmcnt(0)
-; GCN: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float> %vdata, i32 %vindex.1, i32 %vindex.2, i32 %vindex.3) {
-main_body:
- %in1 = bitcast <4 x float> %vdata to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %vindex.1, i32 0, i32 0, i32 0, i32 15, i32 3, i1 0, i1 0)
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %vindex.2, i32 0, i1 0, i1 0)
- %data.i = bitcast <4 x float> %data to <4 x i32>
- call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %vindex.3, i32 0, i32 0, i32 0, i32 14, i32 2, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_x1:
-; GCN: tbuffer_store_format_x v0, v1, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
-define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %vindex) {
-main_body:
- %data.i = bitcast float %data to i32
- call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 13, i32 7, i1 0, i1 0)
- ret void
-}
-
-; GCN-LABEL: {{^}}buffer_store_x2:
-; GCN: tbuffer_store_format_xy v[0:1], v2, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED] idxen
-define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %vindex) {
-main_body:
- %data.i = bitcast <2 x float> %data to <2 x i32>
- call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
- ret void
-}
-
-declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readonly }
-
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
index 4566865bc7c67..7e9d12241ee36 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll
@@ -1,423 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
-; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <1 x float> %data
-}
-
-define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: ret { float, float } [[INS1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %ins0 = insertvalue { float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
- ret { float, float } %ins1
-}
-
-define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2
-; CHECK-NEXT: ret { float, float, float } [[INS2]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
- %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
- ret { float, float, float } %ins2
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> poison, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 4, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> poison, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> poison, <4 x float> %data, <2 x i32> <i32 5, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> poison, float %elt2, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> <i32 0, i32 5>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath [[META0:![0-9]+]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load.format
-; --------------------------------------------------------------------
-
-define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_format_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
- ret <1 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-; The initial insertion point is at the extractelement
-define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
-; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0
-; CHECK-NEXT: ret double [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <2 x double>
- %var2 = extractelement <2 x double> %var1, i32 0
- ret double %var2
-}
-
-define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <8 x i16>
- %var2 = extractelement <8 x i16> %var1, i32 0
- ret i16 %var2
-}
-
-declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
; --------------------------------------------------------------------
; llvm.amdgcn.raw.buffer.load
; --------------------------------------------------------------------
@@ -665,7 +248,7 @@ define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32
define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0]]
+; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: ret float [[DATA]]
;
%data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
@@ -4496,248 +4079,6 @@ declare <4 x float> @llvm.amdgcn.struct.ptr.tbuffer.load.v4f32(ptr addrspace(8),
declare <4 x i32> @llvm.amdgcn.struct.ptr.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1
-; --------------------------------------------------------------------
-; llvm.amdgcn.tbuffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath [[META0]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
-declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
; --------------------------------------------------------------------
; llvm.amdgcn.image.sample
; --------------------------------------------------------------------
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
index 598175b08315f..af12367b9759c 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll
@@ -1,423 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa %s | FileCheck %s
-; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <1 x float> %data
-}
-
-define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <2 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: ret { float, float } [[INS1]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %ins0 = insertvalue { float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
- ret { float, float } %ins1
-}
-
-define amdgpu_ps { float, float, float } @extract_elt0_elt1_elt2_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_2(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT0:%.*]] = extractelement <3 x float> [[DATA]], i64 0
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 1
-; CHECK-NEXT: [[ELT2:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: [[INS0:%.*]] = insertvalue { float, float, float } undef, float [[ELT0]], 0
-; CHECK-NEXT: [[INS1:%.*]] = insertvalue { float, float, float } [[INS0]], float [[ELT1]], 1
-; CHECK-NEXT: [[INS2:%.*]] = insertvalue { float, float, float } [[INS1]], float [[ELT2]], 2
-; CHECK-NEXT: ret { float, float, float } [[INS2]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt1 = extractelement <4 x float> %data, i32 1
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertvalue { float, float, float } undef, float %elt0, 0
- %ins1 = insertvalue { float, float, float } %ins0, float %elt1, 1
- %ins2 = insertvalue { float, float, float } %ins1, float %elt2, 2
- ret { float, float, float } %ins2
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_3(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_3(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 poison, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> undef, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 4, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_4(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_4(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> undef, float %elt0, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> poison, <4 x float> %data, <2 x i32> <i32 5, i32 1>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32_5(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32_5(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[INS1:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 2, i32 2>
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT: [[RET:%.*]] = fadd <2 x float> [[INS1]], [[SHUF]]
-; CHECK-NEXT: ret <2 x float> [[RET]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt2 = extractelement <4 x float> %data, i32 2
- %ins0 = insertelement <2 x float> undef, float %elt2, i32 0
- %ins1 = insertelement <2 x float> %ins0, float %elt2, i32 1
- %shuf = shufflevector <4 x float> %data, <4 x float> %data, <2 x i32> <i32 0, i32 5>
- %ret = fadd <2 x float> %ins1, %shuf
- ret <2 x float> %ret
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false), !fpmath [[META0:![0-9]+]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
-; --------------------------------------------------------------------
-; llvm.amdgcn.buffer.load.format
-; --------------------------------------------------------------------
-
-define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @buffer_load_format_v1f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 true)
-; CHECK-NEXT: ret <1 x float> [[DATA]]
-;
- %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
- ret <1 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 true, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[IDX:%.*]], i32 [[OFS:%.*]], i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-; The initial insertion point is at the extractelement
-define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[VAR]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[VAR1:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
-; CHECK-NEXT: [[VAR2:%.*]] = extractelement <2 x double> [[VAR1]], i64 0
-; CHECK-NEXT: ret double [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <2 x double>
- %var2 = extractelement <2 x double> %var1, i32 0
- ret double %var2
-}
-
-define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
-; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 [[ARG:%.*]], i32 16, i1 false, i1 false)
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: [[VAR2:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT: ret i16 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
- %var1 = bitcast <4 x float> %var to <8 x i16>
- %var2 = extractelement <8 x i16> %var1, i32 0
- ret i16 %var2
-}
-
-declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
-
; --------------------------------------------------------------------
; llvm.amdgcn.raw.buffer.load
; --------------------------------------------------------------------
@@ -665,7 +248,7 @@ define float @extract0_bitcast_raw_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32
define amdgpu_ps float @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
; CHECK-LABEL: @preserve_metadata_extract_elt0_raw_buffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0]]
+; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[OFS:%.*]], i32 [[SOFS:%.*]], i32 0), !fpmath [[META0:![0-9]+]]
; CHECK-NEXT: ret float [[DATA]]
;
%data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0), !fpmath !0
@@ -4495,248 +4078,6 @@ declare <4 x float> @llvm.amdgcn.struct.ptr.tbuffer.load.v4f32(ptr addrspace(8),
declare <4 x i32> @llvm.amdgcn.struct.ptr.tbuffer.load.v4i32(ptr addrspace(8), i32, i32, i32, i32, i32) #1
-; --------------------------------------------------------------------
-; llvm.amdgcn.tbuffer.load
-; --------------------------------------------------------------------
-
-define amdgpu_ps float @tbuffer_load_f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret float %data
-}
-
-define amdgpu_ps <2 x float> @tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <2 x float> %data
-}
-
-define amdgpu_ps <4 x float> @tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <4 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- ret <4 x float> %data
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <2 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <4 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <4 x float> [[DATA]], i64 3
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <4 x float> %data, i32 3
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <2 x i32> <i32 2, i32 3>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_elt2_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <3 x float> [[DATA]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 1, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt2_elt3_tbuffer_load_v4f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x float> [[DATA]], <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
-; CHECK-NEXT: ret <3 x float> [[SHUF]]
-;
- %data = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <4 x float> %data, <4 x float> poison, <3 x i32> <i32 0, i32 2, i32 3>
- ret <3 x float> %shuf
-}
-
-define amdgpu_ps float @extract_elt0_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt0 = extractelement <3 x float> %data, i32 0
- ret float %elt0
-}
-
-define amdgpu_ps float @extract_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <2 x float> [[DATA]], i64 1
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 1
- ret float %elt1
-}
-
-define amdgpu_ps float @extract_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[ELT1:%.*]] = extractelement <3 x float> [[DATA]], i64 2
-; CHECK-NEXT: ret float [[ELT1]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %elt1 = extractelement <3 x float> %data, i32 2
- ret float %elt1
-}
-
-define amdgpu_ps <2 x float> @extract_elt0_elt1_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt0_elt1_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: ret <2 x float> [[DATA]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 0, i32 1>
- ret <2 x float> %shuf
-}
-
-define amdgpu_ps <2 x float> @extract_elt1_elt2_tbuffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract_elt1_elt2_tbuffer_load_v3f32(
-; CHECK-NEXT: [[DATA:%.*]] = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <3 x float> [[DATA]], <3 x float> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: ret <2 x float> [[SHUF]]
-;
- %data = call <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %shuf = shufflevector <3 x float> %data, <3 x float> poison, <2 x i32> <i32 1, i32 2>
- ret <2 x float> %shuf
-}
-
-define i32 @extract0_bitcast_tbuffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @extract0_bitcast_tbuffer_load_v4f32(
-; CHECK-NEXT: [[VAR:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
-; CHECK-NEXT: [[VAR2:%.*]] = bitcast float [[VAR]] to i32
-; CHECK-NEXT: ret i32 [[VAR2]]
-;
- %var = call <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false)
- %var1 = bitcast <4 x float> %var to <4 x i32>
- %var2 = extractelement <4 x i32> %var1, i32 0
- ret i32 %var2
-}
-
-define amdgpu_ps float @preserve_metadata_extract_elt0_tbuffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %arg0, i32 %arg1) #0 {
-; CHECK-LABEL: @preserve_metadata_extract_elt0_tbuffer_load_v2f32(
-; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.tbuffer.load.f32(<4 x i32> [[RSRC:%.*]], i32 [[ARG0:%.*]], i32 [[ARG1:%.*]], i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath [[META0]]
-; CHECK-NEXT: ret float [[DATA]]
-;
- %data = call <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32> %rsrc, i32 %arg0, i32 %arg1, i32 0, i32 0, i32 14, i32 4, i1 false, i1 false), !fpmath !0
- %elt0 = extractelement <2 x float> %data, i32 0
- ret float %elt0
-}
-
-declare float @llvm.amdgcn.tbuffer.load.f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <1 x float> @llvm.amdgcn.tbuffer.load.v1f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <2 x float> @llvm.amdgcn.tbuffer.load.v2f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <3 x float> @llvm.amdgcn.tbuffer.load.v3f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-declare <4 x float> @llvm.amdgcn.tbuffer.load.v4f32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
-declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #1
-
; --------------------------------------------------------------------
; llvm.amdgcn.image.sample
; --------------------------------------------------------------------
diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
index 9cef4a3c7cc0f..65c89618e20ba 100644
--- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
+++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-simplify-image-buffer-stores.ll
@@ -72,33 +72,6 @@ define amdgpu_ps void @image_store_mip_1d_store_insert_zeros_at_end(<8 x i32> in
ret void
}
-define amdgpu_ps void @buffer_store_format_insert_zeros_at_end(<4 x i32> inreg %a, float %vdata1, i32 %b) {
-; GCN-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GCN-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[VDATA1:%.*]], i64 0
-; GCN-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer
-; GCN-NEXT: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> [[TMP2]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
-; GCN-NEXT: ret void
-;
-; GFX12-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFX12-NEXT: [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
-; GFX12-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
-; GFX12-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
-; GFX12-NEXT: ret void
-;
-; GFXUNKNOWN-LABEL: @buffer_store_format_insert_zeros_at_end(
-; GFXUNKNOWN-NEXT: [[TMP1:%.*]] = insertelement <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, float [[VDATA1:%.*]], i64 0
-; GFXUNKNOWN-NEXT: [[NEWVDATA4:%.*]] = insertelement <4 x float> [[TMP1]], float [[VDATA1]], i64 1
-; GFXUNKNOWN-NEXT: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> [[NEWVDATA4]], <4 x i32> [[A:%.*]], i32 [[B:%.*]], i32 0, i1 false, i1 false)
-; GFXUNKNOWN-NEXT: ret void
-;
- %newvdata1 = insertelement <4 x float> undef, float %vdata1, i32 0
- %newvdata2 = insertelement <4 x float> %newvdata1, float %vdata1, i32 1
- %newvdata3 = insertelement <4 x float> %newvdata2, float 0.0, i32 2
- %newvdata4 = insertelement <4 x float> %newvdata3, float 0.0, i32 3
- call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %newvdata4, <4 x i32> %a, i32 %b, i32 0, i1 0, i1 0)
- ret void
-}
-
define amdgpu_ps void @struct_buffer_store_format_insert_zeros(<4 x i32> inreg %a, float %vdata1, i32 %b) {
; GCN-LABEL: @struct_buffer_store_format_insert_zeros(
; GCN-NEXT: [[TMP1:%.*]] = insertelement <3 x float> <float poison, float 0.000000e+00, float poison>, float [[VDATA1:%.*]], i64 0
@@ -304,11 +277,9 @@ define amdgpu_ps void @struct_tbuffer_store_argument_insert_first(<4 x i32> inre
}
declare void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #2
-declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #2
declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32) #0
declare void @llvm.amdgcn.raw.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32) #0
-declare void @llvm.amdgcn.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) #0
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32, i32, i32, <8 x i32>, i32, i32) #0
declare void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float>, i32, i32, i32, i32, <8 x i32>, i32, i32) #0
diff --git a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
index 7f7790cecb0eb..9ed62d3545f07 100644
--- a/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
+++ b/llvm/test/Verifier/AMDGPU/intrinsic-immarg.ll
@@ -1,19 +1,5 @@
; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)
-define void @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i1 %bool) {
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %bool
- ; CHECK-NEXT: %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false)
- %data0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 %bool, i1 false)
-
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %bool
- ; CHECK-NEXT: %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool)
- %data1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 %bool)
- ret void
-}
-
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32)
define void @raw_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs, i32 %arg) {
; CHECK: immarg operand has non-immediate parameter
@@ -665,12 +651,3 @@ define void @test_mfma_f32_32x32x1f32(float %arg0, float %arg1, <32 x i32> %arg2
ret void
}
-
-declare void @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
-define amdgpu_cs void @test_buffer_atomic_fadd(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %offset, i1 %slc) {
- ; CHECK: immarg operand has non-immediate parameter
- ; CHECK-NEXT: i1 %slc
- ; CHECK-NEXT: call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
- call void @llvm.amdgcn.buffer.atomic.fadd.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %offset, i1 %slc)
- ret void
-}
>From 170ab4f95583adc91a1fd8a02032c6f07fd4d61f Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 30 May 2024 11:55:09 +0100
Subject: [PATCH 2/2] Remove unused function
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d9b7dc2eb59d2..5bb29b1f018f6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8589,12 +8589,6 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
M->getMemOperand());
}
-// Return a value to use for the idxen operand by examining the vindex operand.
-static unsigned getIdxEn(SDValue VIndex) {
- // No need to set idxen if vindex is known to be zero.
- return isNullConstant(VIndex) ? 0 : 1;
-}
-
SDValue
SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
unsigned NewOpcode) const {
More information about the llvm-commits
mailing list